From 77e3e3c5529fa8e69f27bd3a8023586d5d6c64c4 Mon Sep 17 00:00:00 2001 From: Wen Di Date: Mon, 12 Jan 2026 17:23:25 +0800 Subject: [PATCH 1/3] add xnnpack for softmax --- third_party/xla/xla/debug_options_flags.cc | 6 + third_party/xla/xla/service/cpu/BUILD | 35 + third_party/xla/xla/service/cpu/BUILD.orig | 2189 +++++++++++++ .../xla/xla/service/cpu/cpu_compiler.cc | 8 + .../xla/xla/service/cpu/cpu_compiler.cc.orig | 2712 +++++++++++++++++ .../xla/xla/service/cpu/cpu_runtime.cc | 2 + third_party/xla/xla/service/cpu/cpu_runtime.h | 1 + third_party/xla/xla/service/cpu/ir_emitter.cc | 39 + third_party/xla/xla/service/cpu/ir_emitter.h | 1 + .../service/cpu/runtime_symbol_generator.cc | 2 + .../xla/xla/service/cpu/xnnpack_ops.cc | 76 + third_party/xla/xla/service/cpu/xnnpack_ops.h | 36 + .../xla/service/cpu/xnnpack_ops_rewriter.cc | 228 ++ .../xla/service/cpu/xnnpack_ops_rewriter.h | 45 + .../xla/service/cpu/xnnpack_pattern_utils.h | 65 + third_party/xla/xla/xla.proto | 2 + 16 files changed, 5447 insertions(+) create mode 100644 third_party/xla/xla/service/cpu/BUILD.orig create mode 100644 third_party/xla/xla/service/cpu/cpu_compiler.cc.orig create mode 100644 third_party/xla/xla/service/cpu/xnnpack_ops.cc create mode 100644 third_party/xla/xla/service/cpu/xnnpack_ops.h create mode 100644 third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc create mode 100644 third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h create mode 100644 third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc index 33fa90f7e35e9e..7ab70838950d98 100644 --- a/third_party/xla/xla/debug_options_flags.cc +++ b/third_party/xla/xla/debug_options_flags.cc @@ -103,6 +103,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_use_fusion_emitters(true); opts.set_xla_cpu_use_thunk_runtime(true); opts.set_xla_cpu_use_xnnpack(false); + opts.set_xla_cpu_enable_xnnpack(false); // For 
softmax opts.set_xla_cpu_experimental_xnn_graph_fusion_mode( DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED); opts.set_xla_cpu_parallel_codegen_split_count(32); @@ -994,6 +995,11 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_cpu_use_xnnpack), debug_options->xla_cpu_use_xnnpack(), "Use XNNPACK for supported operations.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_enable_xnnpack", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_xnnpack), + debug_options->xla_cpu_enable_xnnpack(), + "Enable XNNPACK ops rewriter.")); flag_list->push_back(tsl::Flag( "xla_cpu_experimental_xnn_graph_fusion_mode", setter_for_xla_cpu_experimental_xnn_graph_fusion_mode, diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD index 90388079ca2fcf..f951a6ac93b626 100644 --- a/third_party/xla/xla/service/cpu/BUILD +++ b/third_party/xla/xla/service/cpu/BUILD @@ -76,6 +76,7 @@ filegroup( "runtime_single_threaded_matmul_s32.cc", "runtime_single_threaded_matmul_u8.cc", "runtime_topk.cc", + "xnnpack_ops.cc", # Multi-threaded support. "runtime_conv2d.cc", "runtime_conv3d.cc", @@ -109,6 +110,7 @@ filegroup( "runtime_single_threaded_fft.h", "runtime_single_threaded_matmul.h", "runtime_topk.h", + "xnnpack_ops.h", # Multi-threaded support. 
"runtime_conv2d.h", "runtime_conv3d.h", @@ -218,6 +220,7 @@ cc_library( ":small_while_loop_hoisting_pass", ":thunk_emitter", ":xla_framework", + ":xnnpack_ops_rewriter", "//xla:cpu_function_runtime", "//xla:debug_options_flags", "//xla:literal", @@ -617,6 +620,7 @@ cc_library( ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", ":runtime_topk", + ":xnnpack_ops", "//xla/service:custom_call_target_registry", "@com_google_absl//absl/functional:any_invocable", "@com_google_absl//absl/strings:string_view", @@ -838,6 +842,8 @@ cc_library( ":onednn_config_proto_cc", ":onednn_memory_util", ":parallel_loop_emitter", + ":xnnpack_ops_rewriter", + ":xnnpack_ops", "//xla:literal", "//xla:literal_util", "//xla:shape_util", @@ -2187,3 +2193,32 @@ xla_cc_test( "@local_tsl//tsl/platform:test", ], ) + +cc_library( + name = "xnnpack_ops_rewriter", + srcs = ["xnnpack_ops_rewriter.cc"], + hdrs = [ + "xnnpack_ops_rewriter.h", + "xnnpack_pattern_utils.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//xla/hlo/ir:hlo", + "//xla:literal_comparison", + "//xla:literal_util", + "//xla:status_macros", + "//xla/hlo/pass:hlo_pass", + "//xla/service:pattern_matcher", + ], +) + +cc_library( + name = "xnnpack_ops", + srcs = ["xnnpack_ops.cc"], + hdrs = ["xnnpack_ops.h"], + visibility = ["//visibility:public"], + deps = [ + "@XNNPACK", + "@com_google_absl//absl/base", + ], +) diff --git a/third_party/xla/xla/service/cpu/BUILD.orig b/third_party/xla/xla/service/cpu/BUILD.orig new file mode 100644 index 00000000000000..90388079ca2fcf --- /dev/null +++ b/third_party/xla/xla/service/cpu/BUILD.orig @@ -0,0 +1,2189 @@ +# Description: +# LLVM-based CPU backend for XLA. 
+ +load("@bazel_skylib//rules:build_test.bzl", "build_test") +load( + "//third_party/compute_library:build_defs.bzl", + "acl_deps", + "if_enable_acl", +) +load( + "//xla:xla.default.bzl", + "xla_cc_binary", + "xla_cc_test", +) +load("//xla/tests:build_defs.bzl", "xla_test") +load("//xla/tsl:tsl.bzl", "internal_visibility", "tf_openmp_copts", "tsl_copts") +load("//xla/tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable") +load("//xla/tsl/mkl:build_defs.bzl", "if_graph_api", "mkl_deps") +load("//xla/tsl/platform:build_config.bzl", "tf_proto_library") +load( + "//xla/tsl/platform:build_config_root.bzl", + "if_llvm_aarch64_available", + "if_llvm_powerpc_available", + "if_llvm_system_z_available", + "if_llvm_x86_available", +) +load("//xla/tsl/platform:rules_cc.bzl", "cc_library") +load(":build_defs.bzl", "runtime_copts") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = internal_visibility([":friends"]), + licenses = ["notice"], +) + +package_group( + name = "friends", + includes = [ + "//xla:friends", + ], +) + +# Filegroup used to collect source files for dependency checking. +filegroup( + name = "c_srcs", + data = glob([ + "**/*.cc", + "**/*.h", + ]), +) + +cc_library( + name = "test_header_helper", + testonly = True, + hdrs = ["test_target_triple_helper.h"], +) + +filegroup( + name = "runtime_srcs", + srcs = [ + # Single-threaded support. 
+ "runtime_custom_call_status.cc", + "runtime_fp16.cc", + "runtime_key_value_sort.cc", + "runtime_pow.cc", + "runtime_single_threaded_conv2d.cc", + "runtime_single_threaded_conv3d.cc", + "runtime_single_threaded_fft.cc", + "runtime_single_threaded_matmul_c128.cc", + "runtime_single_threaded_matmul_c64.cc", + "runtime_single_threaded_matmul_common.h", + "runtime_single_threaded_matmul_f8.cc", + "runtime_single_threaded_matmul_f16.cc", + "runtime_single_threaded_matmul_f32.cc", + "runtime_single_threaded_matmul_f64.cc", + "runtime_single_threaded_matmul_s32.cc", + "runtime_single_threaded_matmul_u8.cc", + "runtime_topk.cc", + # Multi-threaded support. + "runtime_conv2d.cc", + "runtime_conv3d.cc", + "runtime_fft.cc", + "runtime_matmul_c128.cc", + "runtime_matmul_c64.cc", + "runtime_matmul_common.h", + "runtime_matmul_f16.cc", + "runtime_matmul_f32.cc", + "runtime_matmul_f64.cc", + "runtime_matmul_s32.cc", + "runtime_fork_join.cc", + "//xla/backends/cpu/runtime:runtime_srcs", + #"runtime_handle_ffi_call.cc", # TODO(b/338344732): Add "runtime_handle_ffi_call.cc". + ], + visibility = internal_visibility([":friends"]), +) + +filegroup( + name = "runtime_hdrs", + srcs = [ + # XLA Runtime support. + "buffer_desc.h", + # Single-threaded support. + "runtime_custom_call_status.h", + "runtime_fp16.h", + "runtime_key_value_sort.h", + "runtime_pow.h", + "runtime_single_threaded_conv2d.h", + "runtime_single_threaded_conv3d.h", + "runtime_single_threaded_fft.h", + "runtime_single_threaded_matmul.h", + "runtime_topk.h", + # Multi-threaded support. 
+ "runtime_conv2d.h", + "runtime_conv3d.h", + "runtime_fft.h", + "runtime_fork_join.h", + "runtime_lightweight_check.h", + "runtime_matmul.h", + "//xla/backends/cpu/runtime:runtime_hdrs", + #"runtime_handle_ffi_call.h", # TODO(b/338344732): Add "runtime_handle_ffi_call.h" + ], + visibility = internal_visibility([":friends"]), +) + +cc_library( + name = "cpu_xfeed", + srcs = ["cpu_xfeed.cc"], + hdrs = ["cpu_xfeed.h"], + deps = [ + ":cpu_runtime", + "//xla:literal", + "//xla:literal_util", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla/service:hlo_cost_analysis", + "//xla/service:shaped_buffer", + "@com_google_absl//absl/base", + "@com_google_absl//absl/cleanup", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/types:span", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:notification", + ], +) + +cc_library( + name = "cpu_transfer_manager", + srcs = ["cpu_transfer_manager.cc"], + hdrs = ["cpu_transfer_manager.h"], + deps = [ + ":cpu_runtime", + ":cpu_xfeed", + "//xla:literal", + "//xla:literal_util", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/service:compiler", + "//xla/service:generic_transfer_manager", + "//xla/service:transfer_manager", + "//xla/stream_executor:device_memory", + "//xla/stream_executor:platform_manager", + "//xla/stream_executor:stream_executor_h", + "//xla/stream_executor/host:host_platform_id", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/types:span", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + ], + alwayslink = True, # Contains per-platform transfer manager registration +) + +cc_library( + name = "buffer_info_util", + srcs = ["buffer_info_util.cc"], + hdrs = ["buffer_info_util.h"], + deps = [ + "//xla:cpu_function_runtime", + 
"//xla/hlo/ir:hlo", + "//xla/service:buffer_assignment", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "cpu_compiler_pure", + srcs = ["cpu_compiler.cc"], + hdrs = ["cpu_compiler.h"], + copts = tsl_copts(), + deps = [ + ":buffer_info_util", + ":conv_canonicalization", + ":cpu_aot_compilation_result", + ":cpu_executable", + ":cpu_float_support", + ":cpu_instruction_fusion", + ":cpu_layout_assignment", + ":cpu_options", + ":dot_op_emitter", + ":executable_proto_cc", + ":fusion_wrapper", + ":ir_emission_utils", + ":ir_emitter", + ":ir_emitter2", + ":metrics", + ":onednn_contraction_rewriter", + ":onednn_float_support", + ":onednn_ops_rewriter", + ":parallel_task_assignment", + ":runtime_symbol_generator", + ":small_while_loop_hoisting_pass", + ":thunk_emitter", + ":xla_framework", + "//xla:cpu_function_runtime", + "//xla:debug_options_flags", + "//xla:literal", + "//xla:literal_pool", + "//xla:protobuf_util", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla:xla_proto_cc", + "//xla/backends/cpu:constant_allocation", + "//xla/backends/cpu:xnn_fusion", + "//xla/backends/cpu/codegen:compiled_function_library", + "//xla/backends/cpu/codegen:cpu_features", + "//xla/backends/cpu/codegen:execution_engine", + "//xla/backends/cpu/codegen:ir_compiler", + "//xla/backends/cpu/codegen:jit_compiler", + "//xla/backends/cpu/codegen:object_loader", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", + "//xla/backends/cpu/runtime:function_library", + "//xla/backends/cpu/runtime:kernel_thunk", + "//xla/backends/cpu/runtime:thunk", + "//xla/backends/cpu/runtime:thunk_proto_cc_impl", + "//xla/backends/cpu/runtime:thunk_proto_serdes", + "//xla/backends/cpu/transforms:xnn_graph_fusion", + "//xla/hlo/analysis:hlo_ordering", + "//xla/hlo/analysis:indexed_array_analysis", + "//xla/hlo/ir:hlo", + "//xla/hlo/ir:hlo_module_group", + 
"//xla/hlo/pass:hlo_pass", + "//xla/hlo/pass:hlo_pass_pipeline", + "//xla/hlo/transforms:literal_canonicalizer", + "//xla/hlo/transforms:operand_upcaster", + "//xla/hlo/transforms:while_loop_trip_count_annotator", + "//xla/hlo/transforms/expanders:bitcast_dtypes_expander", + "//xla/hlo/transforms/expanders:cholesky_expander", + "//xla/hlo/transforms/expanders:comparison_expander", + "//xla/hlo/transforms/expanders:dot_decomposer", + "//xla/hlo/transforms/expanders:dynamic_index_splitter", + "//xla/hlo/transforms/expanders:eigh_expander", + "//xla/hlo/transforms/expanders:logistic_expander", + "//xla/hlo/transforms/expanders:optimization_barrier_expander", + "//xla/hlo/transforms/expanders:qr_expander", + "//xla/hlo/transforms/expanders:reduce_decomposer", + "//xla/hlo/transforms/expanders:reshape_decomposer", + "//xla/hlo/transforms/expanders:rng_bit_generator_expander", + "//xla/hlo/transforms/expanders:rng_expander", + "//xla/hlo/transforms/expanders:stochastic_convert_decomposer", + "//xla/hlo/transforms/simplifiers:algebraic_simplifier", + "//xla/hlo/transforms/simplifiers:batch_dot_simplification", + "//xla/hlo/transforms/simplifiers:broadcast_canonicalizer", + "//xla/hlo/transforms/simplifiers:conditional_canonicalizer", + "//xla/hlo/transforms/simplifiers:convolution_group_converter", + "//xla/hlo/transforms/simplifiers:dynamic_dimension_simplifier", + "//xla/hlo/transforms/simplifiers:flatten_call_graph", + "//xla/hlo/transforms/simplifiers:float_normalization", + "//xla/hlo/transforms/simplifiers:gather_simplifier", + "//xla/hlo/transforms/simplifiers:hlo_constant_folding", + "//xla/hlo/transforms/simplifiers:hlo_dce", + "//xla/hlo/transforms/simplifiers:hlo_memory_scheduler", + "//xla/hlo/transforms/simplifiers:optimize_input_output_buffer_alias", + "//xla/hlo/transforms/simplifiers:reduce_window_rewriter", + "//xla/hlo/transforms/simplifiers:reshape_mover", + "//xla/hlo/transforms/simplifiers:result_caster", + 
"//xla/hlo/transforms/simplifiers:simplify_fp_conversions", + "//xla/hlo/transforms/simplifiers:slice_sinker", + "//xla/hlo/transforms/simplifiers:sort_simplifier", + "//xla/hlo/transforms/simplifiers:sub_byte_normalization", + "//xla/hlo/transforms/simplifiers:tree_reduction_rewriter", + "//xla/hlo/transforms/simplifiers:tuple_simplifier", + "//xla/hlo/transforms/simplifiers:zero_sized_hlo_elimination", + "//xla/mlir_hlo", + "//xla/mlir_hlo:all_passes", + "//xla/mlir_hlo:transforms_passes", + "//xla/service:all_reduce_promotion", + "//xla/service:outer_dimension_propagation", + "//xla/service:get_outer_batch_value_simplifier", + "//xla/service:all_to_all_decomposer", + "//xla/service:batched_gather_scatter_normalizer", + "//xla/service:batchnorm_expander", + "//xla/service:buffer_assignment", + "//xla/service:call_graph", + "//xla/service:call_inliner", + "//xla/service:change_op_data_type", + "//xla/service:compiler", + "//xla/service:conditional_simplifier", + "//xla/service:conditional_to_select", + "//xla/service:copy_insertion", + "//xla/service:cpu_gpu_shape_verifier", + "//xla/service:dump", + "//xla/service:dynamic_dimension_inference", + "//xla/service:dynamic_padder", + "//xla/service:executable", + "//xla/service:float_support", + "//xla/service:gather_expander", + "//xla/service:hlo_cost_analysis", + "//xla/service:hlo_cse", + "//xla/service:hlo_execution_profile", + "//xla/service:hlo_module_config", + "//xla/service:hlo_profile_printer_data_cc", + "//xla/service:hlo_proto_cc", + "//xla/service:hlo_proto_util", + "//xla/service:hlo_verifier", + "//xla/service:layout_assignment", + "//xla/service:llvm_compiler", + "//xla/service:logical_buffer", + "//xla/service:map_inliner", + "//xla/service:scatter_expander", + "//xla/service:scatter_simplifier", + "//xla/service:select_and_scatter_expander", + "//xla/service:sharding_propagation", + "//xla/service:sharding_remover", + "//xla/service:slow_operation_alarm", + "//xla/service:topk_rewriter", + 
"//xla/service:transpose_folding", + "//xla/service:triangular_solve_expander", + "//xla/service:while_loop_constant_sinking", + "//xla/service:while_loop_invariant_code_motion", + "//xla/service:while_loop_simplifier", + "//xla/service/llvm_ir:llvm_command_line_options", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/spmd:stateful_rng_spmd_partitioner", + "//xla/service/spmd/shardy:shardy_xla_pass", + "//xla/stream_executor:platform", + "//xla/stream_executor:stream_executor_h", + "//xla/stream_executor/host:host_platform_id", + "//xla/tsl/concurrency:async_value", + "//xla/tsl/platform:env", + "//xla/tsl/platform:status", + "//xla/tsl/platform:statusor", + "//xla/tsl/protobuf:error_codes_proto_impl_cc", + "@com_google_absl//absl/base", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/cleanup", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:BitReader", + "@llvm-project//llvm:BitWriter", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Linker", + "@llvm-project//llvm:MC", + "@llvm-project//llvm:Object", + "@llvm-project//llvm:OrcJIT", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//llvm:TargetParser", + "@llvm-project//llvm:TransformUtils", + "@llvm-project//mlir:AffineDialect", + "@llvm-project//mlir:AffineToStandard", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ArithTransforms", + "@llvm-project//mlir:BufferizationTransforms", + "@llvm-project//mlir:BuiltinToLLVMIRTranslation", + 
"@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:LinalgDialect", + "@llvm-project//mlir:LinalgTransforms", + "@llvm-project//mlir:MemRefTransforms", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ReconcileUnrealizedCasts", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:ToLLVMIRTranslation", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + "@llvm-project//mlir:VectorDialect", + "@local_tsl//tsl/platform:casts", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + "@local_tsl//tsl/platform:status", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/platform:threadpool_async_executor", + "@local_tsl//tsl/profiler/lib:traceme", + "@local_tsl//tsl/profiler/lib:traceme_encode", + ] + if_llvm_aarch64_available([ + "@llvm-project//llvm:AArch64CodeGen", # fixdeps: keep + ]) + if_llvm_powerpc_available([ + "@llvm-project//llvm:PowerPCCodeGen", # fixdeps: keep + ]) + if_llvm_system_z_available([ + "@llvm-project//llvm:SystemZCodeGen", # fixdeps: keep + ]) + if_llvm_x86_available([ + "@llvm-project//llvm:X86CodeGen", # fixdeps: keep + ]), +) + +cc_library( + name = "cpu_aot_compilation_result", + srcs = ["cpu_aot_compilation_result.cc"], + hdrs = ["cpu_aot_compilation_result.h"], + deps = [ + ":buffer_info_util", + ":cpu_executable", + ":executable_proto_cc", + "//xla:cpu_function_runtime", + "//xla:util", + "//xla/backends/cpu:constant_allocation", + "//xla/backends/cpu/runtime:function_library", + "//xla/backends/cpu/runtime:thunk", + "//xla/backends/cpu/runtime:thunk_proto_cc", + "//xla/backends/cpu/runtime:thunk_proto_serdes", + "//xla/hlo/ir:hlo", + "//xla/service:buffer_assignment", + "//xla/service:buffer_value", + 
"//xla/service:compiler", + "//xla/service:executable", + "//xla/service:hlo_cost_analysis", + "//xla/service:hlo_module_config", + "//xla/service:hlo_profile_printer_data_cc", + "//xla/service:hlo_proto_cc", + "//xla/stream_executor:platform", + "//xla/stream_executor/host:host_platform_id", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + ], +) + +cc_library( + # The old target name will still be used so that dependencies won't break. + # In the future, dependencies should be cleaned up and relinked to the above + # target if registration is not necesary. + name = "cpu_compiler", + srcs = ["cpu_compiler_registerer.cc"], + hdrs = ["cpu_compiler.h"], + deps = [ + "cpu_compiler_pure", + ":cpu_aot_compilation_result", + ":executable_proto_cc", + "//xla:util", + "//xla/backends/cpu/codegen:ir_compiler", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/hlo/ir:hlo_module_group", + "//xla/service:buffer_assignment", + "//xla/service:compiler", + "//xla/service:executable", + "//xla/service:hlo_cost_analysis", + "//xla/service:hlo_profile_printer_data_cc", + "//xla/service:hlo_proto_cc", + "//xla/service:llvm_compiler", + "//xla/stream_executor:platform", + "//xla/stream_executor:stream_executor_h", + "//xla/stream_executor/host:host_platform_id", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//llvm:TargetParser", + ], + alwayslink = True, # Contains compiler registration +) + +xla_test( + name = "cpu_compiler_test", + srcs = ["cpu_compiler_test.cc"], + backends = [ + "cpu", + ], + tags = [ + "test_migrated_to_hlo_runner_pjrt", + "test_xla_cpu_no_thunks", + ], + deps = [ + "//xla/hlo/testlib:verified_hlo_module", + 
"//xla/tests:hlo_pjrt_test_base", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/lib/monitoring:collected_metrics", + "//xla/tsl/lib/monitoring:collection_registry", + "@com_google_absl//absl/strings:string_view", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/platform:test", + ], +) + +xla_test( + name = "cpu_compiler_internals_test", + srcs = ["cpu_compiler_internals_test.cc"], + backends = [ + "cpu", + ], + deps = [ + "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:verified_hlo_module", + "//xla/service:llvm_compiler", + "//xla/tests:hlo_test_base", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/platform:statusor", + "//xla/tsl/platform:test", + "@com_google_absl//absl/base:nullability", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + ], +) + +xla_test( + name = "cpu_aot_compiler_test", + srcs = ["cpu_aot_compiler_test.cc"], + backends = [ + "cpu", + ], + deps = [ + ":cpu_aot_compilation_result", + ":test_header_helper", + "//xla:literal", + "//xla:literal_util", + "//xla/hlo/ir:hlo", + "//xla/hlo/ir:hlo_module_group", + "//xla/service:compiler", + "//xla/service:executable", + "//xla/service:hlo_runner", + "//xla/service:hlo_runner_interface", + "//xla/stream_executor:platform", + "//xla/stream_executor:platform_manager", + "//xla/tests:hlo_test_base", + "//xla/tests:literal_test_util", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/platform:statusor", + "//xla/tsl/platform:test", + "@com_google_absl//absl/strings:string_view", + ], +) + +tf_proto_library( + name = "executable_proto", + srcs = ["executable.proto"], + protodeps = [ + ":xla_framework_proto", + "//xla/service:hlo_proto", + "//xla:xla_proto", + "//xla/backends/cpu/runtime:thunk_proto", + ], +) + +tf_proto_library( + name = "xla_framework_proto", + srcs = ["xla_framework.proto"], +) + +cc_library( + name = "xla_framework", + hdrs = 
["xla_framework.h"], + deps = [":xla_framework_proto_cc"], +) + +cc_library( + name = "runtime_symbol_generator", + srcs = [ + "runtime_symbol_generator.cc", + "windows_compatibility.cc", + "windows_compatibility.h", + ], + hdrs = ["runtime_symbol_generator.h"], + copts = if_enable_acl(["-DXLA_CPU_USE_ACL=1"]) + tsl_copts(), + deps = [ + ":cpu_runtime", + ":onednn_convolution", + ":onednn_layer_norm", + ":onednn_matmul", + ":onednn_softmax", + ":runtime_conv2d", + ":runtime_conv2d_acl", + ":runtime_conv2d_mkl", + ":runtime_conv3d", + ":runtime_custom_call_status", + ":runtime_fft", + ":runtime_fork_join", + ":runtime_fp16", + ":runtime_handle_ffi_call", + ":runtime_key_value_sort", + ":runtime_matmul", + ":runtime_matmul_acl", + ":runtime_pow", + ":runtime_single_threaded_conv2d", + ":runtime_single_threaded_conv3d", + ":runtime_single_threaded_fft", + ":runtime_single_threaded_matmul", + ":runtime_topk", + "//xla/service:custom_call_target_registry", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:OrcJIT", + "@llvm-project//llvm:OrcShared", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:mlir_c_runner_utils", + "@local_tsl//tsl/platform:logging", + ], +) + +cc_library( + name = "runtime_lightweight_check", + hdrs = ["runtime_lightweight_check.h"], + compatible_with = get_compatible_with_portable(), + copts = runtime_copts(), +) + +cc_library( + name = "runtime_fp16", + srcs = [ + "runtime_fp16.cc", + ], + hdrs = [ + "runtime_fp16.h", + ], + copts = runtime_copts(), + deps = ["@com_google_absl//absl/base:core_headers"], +) + +cc_library( + name = "runtime_pow", + srcs = [ + "runtime_pow.cc", + ], + hdrs = [ + "runtime_pow.h", + ], + copts = runtime_copts(), + deps = ["@com_google_absl//absl/base:core_headers"], +) + +cc_library( + name = "buffer_desc", + hdrs = ["buffer_desc.h"], +) + +cc_library( + name = "cpu_executable", + srcs = 
["cpu_executable.cc"], + hdrs = ["cpu_executable.h"], + deps = [ + ":cpu_runtime", + ":executable_proto_cc", + "//xla:executable_run_options", + "//xla:literal", + "//xla:shape_tree", + "//xla:shape_util", + "//xla:status_macros", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu:constant_allocation", + "//xla/backends/cpu/runtime:buffer_allocations", + "//xla/backends/cpu/runtime:function_library", + "//xla/backends/cpu/runtime:thread_pool_task_runner", + "//xla/backends/cpu/runtime:thunk", + "//xla/backends/cpu/runtime:thunk_executor", + "//xla/hlo/ir:hlo", + "//xla/service:buffer_assignment", + "//xla/service:custom_call_status", + "//xla/service:custom_call_status_internal", + "//xla/service:executable", + "//xla/service:hlo_execution_profile", + "//xla/service:hlo_profile_printer_data_cc", + "//xla/service:hlo_value", + "//xla/service:maybe_owning_device_memory", + "//xla/service:shaped_buffer", + "//xla/service:xla_debug_info_manager", + "//xla/stream_executor:device_memory", + "//xla/stream_executor:device_memory_allocator", + "//xla/stream_executor/host:host_stream", + "//xla/tsl/concurrency:async_value", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:statusor", + ], +) + +cc_library( + name = "elemental_math_emitter", + srcs = ["elemental_math_emitter.cc"], + hdrs = ["elemental_math_emitter.h"], + deps = [ + "//xla:xla_data_proto_cc", + "//xla/service/llvm_ir:math_ops", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + ], 
+) + +cc_library( + name = "ir_emitter2", + srcs = ["ir_emitter2.cc"], + hdrs = ["ir_emitter2.h"], + deps = [ + ":backend_config_proto_cc", + ":dot_op_emitter", + ":elemental_ir_emitter", + ":ir_emitter", + ":parallel_loop_emitter", + "//xla:shape_util", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla:xla_proto_cc", + "//xla/backends/cpu/codegen:fusion_compiler", + "//xla/backends/cpu/codegen:kernel_api_ir_builder", + "//xla/backends/cpu/codegen:symbol_name_util", + "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", + "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitters", + "//xla/hlo/ir:hlo", + "//xla/service:buffer_assignment", + "//xla/service:hlo_module_config", + "//xla/service/llvm_ir:dynamic_update_slice_util", + "//xla/service/llvm_ir:fused_ir_emitter", + "//xla/service/llvm_ir:ir_array", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/llvm_ir:loop_emitter", + "//xla/stream_executor:launch_dim", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/strings:string_view", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Linker", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + ], +) + +xla_cc_test( + name = "ir_emitter_test", + srcs = ["ir_emitter_test.cc"], + deps = [ + ":cpu_compiler", + ":cpu_executable", + ":cpu_options", + ":ir_emitter", + ":ir_function", + ":runtime_symbol_generator", + ":target_machine_features_stub", + "//xla:cpu_function_runtime", + "//xla:shape_util", + "//xla/backends/cpu/codegen:cpu_features", + "//xla/backends/cpu/codegen:execution_engine", + "//xla/backends/cpu/codegen:ir_compiler", 
+ "//xla/backends/cpu/codegen:jit_compiler", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/analysis:hlo_ordering", + "//xla/hlo/ir:hlo", + "//xla/hlo/parser:hlo_parser", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/transforms/simplifiers:hlo_memory_scheduler", + "//xla/service:buffer_assignment", + "//xla/service:buffer_value", + "//xla/service:hlo_module_config", + "//xla/service:logical_buffer", + "//xla/service/llvm_ir:llvm_util", + "//xla/tsl/lib/core:status_test_util", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//mlir:IR", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/platform:test", + ], +) + +cc_library( + name = "ir_emitter", + srcs = ["ir_emitter.cc"], + hdrs = ["ir_emitter.h"], + copts = tsl_copts(), + deps = [ + ":backend_config_proto_cc", + ":cpu_instruction_fusion", + ":cpu_options", + ":cpu_runtime", + ":dot_op_emitter", + ":elemental_ir_emitter", + ":ir_emission_utils", + ":ir_function", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":parallel_loop_emitter", + "//xla:literal", + "//xla:literal_util", + "//xla:shape_util", + "//xla:status_macros", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/service:buffer_assignment", + "//xla/service:collective_ops_utils", + "//xla/service:elemental_ir_emitter", + "//xla/service:hlo_module_config", + "//xla/service:name_uniquer", + "//xla/service/llvm_ir:alias_analysis", + "//xla/service/llvm_ir:buffer_assignment_util", + "//xla/service/llvm_ir:dynamic_update_slice_util", + 
"//xla/service/llvm_ir:fused_ir_emitter", + "//xla/service/llvm_ir:ir_array", + "//xla/service/llvm_ir:ir_builder_mixin", + "//xla/service/llvm_ir:llvm_loop", + "//xla/service/llvm_ir:llvm_type_conversion_util", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/llvm_ir:loop_emitter", + "//xla/service/llvm_ir:tuple_ops", + "//xla/tsl/lib/math:math_util", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:status", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/cleanup", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/meta:type_traits", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:TargetParser", + "@llvm-project//mlir:IR", + ], +) + +cc_library( + name = "target_machine_features_stub", + testonly = 1, + hdrs = ["target_machine_features_stub.h"], + deps = [ + "//xla/backends/cpu/codegen:target_machine_features", + "@llvm-project//llvm:Core", + "@local_tsl//tsl/platform:logging", + ], +) + +cc_library( + name = "ir_function", + srcs = ["ir_function.cc"], + hdrs = ["ir_function.h"], + deps = [ + ":cpu_runtime", + ":ir_emission_utils", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla/service:hlo_module_config", + "//xla/service/llvm_ir:llvm_util", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + ], +) + +cc_library( + name = "parallel_loop_emitter", + srcs = 
["parallel_loop_emitter.cc"], + hdrs = ["parallel_loop_emitter.h"], + deps = [ + ":ir_emission_utils", + "//xla/service/llvm_ir:ir_array", + "//xla/service/llvm_ir:llvm_loop", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/llvm_ir:loop_emitter", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Core", + ], +) + +cc_library( + name = "thunk_emitter", + srcs = ["thunk_emitter.cc"], + hdrs = ["thunk_emitter.h"], + local_defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]), + deps = [ + ":backend_config_proto_cc", + ":dot_op_emitter", + ":ir_emission_utils", + ":ir_emitter2", + "//xla:comparison_util", + "//xla:cpu_function_runtime", + "//xla:shape_util", + "//xla:status_macros", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu:onednn_emitter", + "//xla/backends/cpu:onednn_fusion", + "//xla/backends/cpu:xnn_emitter", + "//xla/backends/cpu:xnn_fusion", + "//xla/backends/cpu/codegen:computation_kernel_emitter", + "//xla/backends/cpu/codegen:fusion_compiler", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/backends/cpu/codegen/dot:dot_kernel_emitter", + "//xla/backends/cpu/codegen/elemental:concatenate_kernel_emitter", + "//xla/backends/cpu/codegen/elemental:elemental_kernel_emitter", + "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitters", + "//xla/backends/cpu/runtime:all_gather_thunk", + "//xla/backends/cpu/runtime:all_reduce_thunk", + "//xla/backends/cpu/runtime:all_to_all_thunk", + "//xla/backends/cpu/runtime:call_thunk", + "//xla/backends/cpu/runtime:collective_permute_thunk", + "//xla/backends/cpu/runtime:collective_thunk", + "//xla/backends/cpu/runtime:conditional_thunk", + "//xla/backends/cpu/runtime:convolution_thunk", + "//xla/backends/cpu/runtime:copy_thunk", + "//xla/backends/cpu/runtime:custom_call_thunk", + "//xla/backends/cpu/runtime:dot_thunk", + "//xla/backends/cpu/runtime:fft_thunk", + 
"//xla/backends/cpu/runtime:infeed_thunk", + "//xla/backends/cpu/runtime:kernel_thunk", + "//xla/backends/cpu/runtime:logical_id_thunk", + "//xla/backends/cpu/runtime:outfeed_thunk", + "//xla/backends/cpu/runtime:reduce_scatter_thunk", + "//xla/backends/cpu/runtime:rng_state_thunk", + "//xla/backends/cpu/runtime:sort_thunk", + "//xla/backends/cpu/runtime:thunk", + "//xla/backends/cpu/runtime:topk_thunk", + "//xla/backends/cpu/runtime:while_thunk", + "//xla/backends/cpu/runtime/onednn:onednn_fusion_thunk", + "//xla/backends/cpu/runtime/xnnpack:xnn_dot_thunk", + "//xla/backends/cpu/runtime/xnnpack:xnn_fusion_thunk", + "//xla/codegen:kernel_definition", + "//xla/codegen:kernel_spec", + "//xla/codegen:llvm_ir_kernel_source", + "//xla/codegen:mlir_kernel_source", + "//xla/hlo/ir:hlo", + "//xla/runtime:resource_use", + "//xla/service:buffer_assignment", + "//xla/service:collective_ops_utils", + "//xla/service:hlo_module_config", + "//xla/service:hlo_proto_cc", + "//xla/service:pattern_matcher", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:logging", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:JITLink", + "@llvm-project//llvm:ir_headers", + "@local_tsl//tsl/platform:casts", + "@local_tsl//tsl/profiler/lib:traceme", + ], +) + +cc_library( + name = "tiled_dot_emitter", + srcs = ["tiled_dot_emitter.cc"], + hdrs = ["tiled_dot_emitter.h"], + deps = [ + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:vector_ir_builder", + "//xla/service:hlo_module_config", + "//xla/service/llvm_ir:kernel_support_library", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/numeric:bits", + "@com_google_absl//absl/strings", + 
"@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + ], +) + +cc_library( + name = "dot_op_emitter", + srcs = ["dot_op_emitter.cc"], + hdrs = [ + "dot_op_emitter.h", + ], + deps = [ + ":backend_config_proto_cc", + ":cpu_options", + ":cpu_runtime", + ":tiled_dot_emitter", + "//xla:shape_util", + "//xla:status_macros", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/service:hlo_module_config", + "//xla/service/llvm_ir:ir_array", + "//xla/service/llvm_ir:kernel_support_library", + "//xla/service/llvm_ir:llvm_loop", + "//xla/service/llvm_ir:llvm_util", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/status", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + ], +) + +build_test( + name = "sample_harness_build_test", + targets = [ + ":sample_harness", + ], +) + +xla_cc_binary( + name = "sample_harness", + srcs = ["sample_harness.cc"], + deps = [ + "//xla:array4d", + "//xla:literal", + "//xla:types", + "//xla:xla_data_proto_cc", + "//xla/client", + "//xla/client:client_library", + "//xla/client:local_client", + "//xla/hlo/builder:xla_builder", + "//xla/hlo/builder:xla_computation", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:str_format", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + ], +) + +cc_library( + name = "cpu_runtime", + srcs = [ + "cpu_runtime.cc", + "xfeed_manager.cc", + ], + hdrs = [ + "cpu_runtime.h", + "xfeed_manager.h", + ], + copts = runtime_copts(), + deps = [ + ":cpu_executable_run_options", + "//xla:executable_run_options", + "//xla:shape_util", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/collectives:cpu_clique_key", + "//xla/backends/cpu/collectives:cpu_cliques", + 
"//xla/backends/cpu/collectives:cpu_collectives", + "//xla/backends/cpu/collectives:in_process_collectives", + "//xla/core/collectives:communicator", + "//xla/core/collectives:rank_id", + "//xla/hlo/parser:hlo_parser", + "//xla/service:collective_ops_utils", + "//xla/service:computation_placer", + "//xla/service:global_device_id", + "//xla/stream_executor:device_memory", + "//xla/stream_executor:stream_executor_h", + "//xla/tsl/concurrency:async_value", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:logging", + "//xla/tsl/platform:status", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_absl//absl/types:span", + "@local_tsl//tsl/profiler/lib:traceme", + ], +) + +cc_library( + name = "runtime_conv2d", + srcs = ["runtime_conv2d.cc"], + hdrs = ["runtime_conv2d.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla/backends/cpu/runtime:convolution_thunk_internal", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", # build_cleaner: keep + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_conv3d", + srcs = ["runtime_conv3d.cc"], + hdrs = ["runtime_conv3d.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla/backends/cpu/runtime:convolution_thunk_internal", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", # build_cleaner: keep 
+ "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_custom_call_status", + srcs = ["runtime_custom_call_status.cc"], + hdrs = ["runtime_custom_call_status.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla/service:custom_call_status_internal", + "@com_google_absl//absl/base:core_headers", + ], +) + +cc_library( + name = "runtime_conv2d_mkl", + srcs = [ + "runtime_conv2d_mkl.cc", + ], + hdrs = ["runtime_conv2d_mkl.h"], + copts = runtime_copts() + tf_openmp_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_conv2d", + ":runtime_single_threaded_conv2d", + "//xla:executable_run_options", + "//xla/tsl/framework/convolution:eigen_helpers", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@eigen_archive//:eigen3", + ] + mkl_deps(), +) + +cc_library( + name = "runtime_fft", + srcs = [ + "runtime_fft.cc", + ], + hdrs = ["runtime_fft.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla:executable_run_options", + "@com_google_absl//absl/base:core_headers", + "@ducc//:fft_wrapper", + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_matmul", + srcs = [ + "runtime_matmul_c128.cc", + "runtime_matmul_c64.cc", + "runtime_matmul_common.h", + "runtime_matmul_f16.cc", + "runtime_matmul_f32.cc", + "runtime_matmul_f64.cc", + "runtime_matmul_s32.cc", + ], + hdrs = ["runtime_matmul.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla/tsl/framework/contraction:eigen_contraction_kernel", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/synchronization", # build_cleaner: keep + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_matmul_acl", + srcs = ["runtime_matmul_acl.cc"], + hdrs = ["runtime_matmul_acl.h"], 
+ copts = tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + ":runtime_matmul", + "//xla:executable_run_options", + "//xla/tsl/platform:dynamic_annotations", + "@com_google_absl//absl/base", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:types", + ] + acl_deps(), +) + +cc_library( + name = "runtime_conv2d_acl", + srcs = [ + "runtime_conv2d_acl.cc", + ], + hdrs = ["runtime_conv2d_acl.h"], + copts = tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_conv2d", + ":runtime_lightweight_check", + ":runtime_single_threaded_conv2d", + "//xla:executable_run_options", + "//xla/tsl/framework/convolution:eigen_helpers", + "//xla/tsl/platform:dynamic_annotations", + "@com_google_absl//absl/base", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:types", + ] + acl_deps(), +) + +cc_library( + name = "runtime_single_threaded_conv2d", + srcs = ["runtime_single_threaded_conv2d.cc"], + hdrs = ["runtime_single_threaded_conv2d.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla/backends/cpu/runtime:convolution_thunk_internal", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", # build_cleaner: keep + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_single_threaded_conv3d", + srcs = ["runtime_single_threaded_conv3d.cc"], + hdrs = ["runtime_single_threaded_conv3d.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla/backends/cpu/runtime:convolution_thunk_internal", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", # build_cleaner: keep + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_single_threaded_fft", + srcs = [ + "runtime_single_threaded_fft.cc", + ], + hdrs = ["runtime_single_threaded_fft.h"], + copts = runtime_copts(), + 
visibility = ["//visibility:public"], + deps = [ + ":runtime_fft", + "@com_google_absl//absl/base:core_headers", + ], +) + +cc_library( + name = "runtime_single_threaded_matmul_impl", + srcs = [ + "runtime_single_threaded_matmul_c128.cc", + "runtime_single_threaded_matmul_c64.cc", + "runtime_single_threaded_matmul_common.h", + "runtime_single_threaded_matmul_f16.cc", + "runtime_single_threaded_matmul_f32.cc", + "runtime_single_threaded_matmul_f64.cc", + "runtime_single_threaded_matmul_f8.cc", + "runtime_single_threaded_matmul_s32.cc", + "runtime_single_threaded_matmul_u8.cc", + ], + hdrs = ["runtime_single_threaded_matmul.h"], + compatible_with = get_compatible_with_portable(), + copts = runtime_copts(), + linkstatic = 1, + visibility = ["//visibility:private"], + deps = [ + "//xla/tsl/framework/contraction:eigen_contraction_kernel_no_mkl", + "@com_google_absl//absl/base:core_headers", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:ml_dtypes", + ], +) + +cc_library( + name = "runtime_single_threaded_matmul", + hdrs = ["runtime_single_threaded_matmul.h"], + compatible_with = get_compatible_with_portable(), + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_single_threaded_matmul_impl", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:ml_dtypes", + ], +) + +cc_library( + name = "runtime_single_threaded_matmul_nomkl", + compatible_with = get_compatible_with_portable(), + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_single_threaded_matmul_impl", + "//xla/tsl/framework/contraction:eigen_contraction_kernel_no_mkl", + "@com_google_absl//absl/base:core_headers", + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_key_value_sort", + srcs = ["runtime_key_value_sort.cc"], + hdrs = ["runtime_key_value_sort.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "@com_google_absl//absl/base:core_headers", + 
"@com_google_absl//absl/base:dynamic_annotations", + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_topk", + srcs = ["runtime_topk.cc"], + hdrs = ["runtime_topk.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "@com_google_absl//absl/base", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + ], +) + +cc_library( + name = "runtime_fork_join", + srcs = ["runtime_fork_join.cc"], + hdrs = ["runtime_fork_join.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla:executable_run_options", + "//xla/service:custom_call_status_internal", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:logging", + ], +) + +cc_library( + name = "runtime_handle_ffi_call", + srcs = ["runtime_handle_ffi_call.cc"], + hdrs = ["runtime_handle_ffi_call.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla:executable_run_options", + "//xla:shape_util", + "//xla:xla_data_proto_cc", + "//xla/ffi:attribute_map", + "//xla/ffi:call_frame", + "//xla/ffi:execution_state", + "//xla/ffi:ffi_api", + "//xla/ffi/api:c_api", + "//xla/service:custom_call_status_public_headers", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@llvm-project//mlir:AsmParser", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +xla_cc_test( + name = "cpu_runtime_test", + srcs = ["cpu_runtime_test.cc"], + 
shard_count = 10, + tags = ["optonly"], + deps = [ + ":cpu_runtime", + ":runtime_custom_call_status", + ":runtime_matmul", + ":runtime_matmul_acl", + ":runtime_single_threaded_matmul", + "//xla:array2d", + "//xla:executable_run_options", + "//xla:types", + "//xla/client:local_client", + "//xla/service:custom_call_status_internal", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/strings:str_format", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:test", + ], +) + +xla_cc_test( + name = "cpu_instruction_fusion_test", + srcs = ["cpu_instruction_fusion_test.cc"], + tags = ["not_run:arm"], + deps = [ + ":cpu_instruction_fusion", + "//xla:literal_util", + "//xla:shape_util", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/utils:hlo_matchers", + "//xla/service:transpose_folding", + "//xla/tests:test_utils", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", + "@local_tsl//tsl/platform:statusor", + ], +) + +xla_cc_test( + name = "xfeed_manager_test", + size = "small", + srcs = ["xfeed_manager_test.cc"], + deps = [ + ":cpu_runtime", + "//xla:shape_util", + "//xla:xla_data_proto_cc", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/lib/core:status_test_util", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status:statusor", + "@com_google_googletest//:gtest", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:test", + ], +) + +cc_library( + name = "cpu_instruction_fusion", + srcs = ["cpu_instruction_fusion.cc"], + hdrs = ["cpu_instruction_fusion.h"], + deps = [ + "//xla:shape_util", + "//xla/hlo/ir:hlo", + "//xla/service:fusion_node_indexing_evaluation", + "//xla/service:instruction_fusion", + "@com_google_absl//absl/algorithm:container", 
+ "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + ], +) + +cc_library( + name = "fusion_wrapper", + srcs = ["fusion_wrapper.cc"], + hdrs = ["fusion_wrapper.h"], + deps = [ + "//xla/codegen/emitters:fusion_wrapper_base", + "//xla/hlo/ir:hlo", + "@com_google_absl//absl/strings:string_view", + ], +) + +xla_cc_test( + name = "fusion_wrapper_test", + srcs = ["fusion_wrapper_test.cc"], + deps = [ + ":fusion_wrapper", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "ir_emission_utils", + srcs = ["ir_emission_utils.cc"], + hdrs = ["ir_emission_utils.h"], + deps = [ + ":cpu_runtime", + "//xla:shape_util", + "//xla:window_util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "@com_google_absl//absl/log:check", + "@llvm-project//llvm:Core", + ], +) + +xla_cc_test( + name = "ir_emission_utils_test", + srcs = ["ir_emission_utils_test.cc"], + deps = [ + ":ir_emission_utils", + ":target_machine_features_stub", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/tests:xla_internal_test_main", + ], +) + +cc_library( + name = "cpu_layout_assignment", + srcs = ["cpu_layout_assignment.cc"], + hdrs = ["cpu_layout_assignment.h"], + deps = [ + ":dot_op_emitter", + ":ir_emission_utils", + "//xla:shape_util", + "//xla:util", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/service:computation_layout", + "//xla/service:layout_assignment", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + 
"@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:status", + ], +) + +xla_cc_test( + name = "cpu_layout_assignment_test", + size = "small", + srcs = ["cpu_layout_assignment_test.cc"], + deps = [ + ":cpu_layout_assignment", + ":target_machine_features_stub", + "//xla:literal", + "//xla:shape_layout", + "//xla:shape_util", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/hlo/testlib:test_helpers", + "//xla/hlo/utils:hlo_matchers", + "//xla/service:computation_layout", + "//xla/tests:test_utils", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/types:span", + "@local_tsl//tsl/platform:status", + ], +) + +cc_library( + name = "conv_canonicalization", + srcs = ["conv_canonicalization.cc"], + hdrs = ["conv_canonicalization.h"], + deps = [ + ":cpu_runtime", + ":ir_emission_utils", + "//xla:permutation_util", + "//xla:shape_util", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@local_tsl//tsl/platform:errors", + ], +) + +xla_cc_test( + name = "conv_canonicalization_test", + srcs = ["conv_canonicalization_test.cc"], + deps = [ + ":conv_canonicalization", + ":target_machine_features_stub", + "//xla:literal_util", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/hlo/testlib:test_helpers", + "//xla/tests:xla_internal_test_main", + ], +) + +cc_library( + name = "parallel_task_assignment", + srcs = 
["parallel_task_assignment.cc"], + hdrs = ["parallel_task_assignment.h"], + deps = [ + ":backend_config_proto_cc", + ":ir_emission_utils", + "//xla:shape_util", + "//xla:util", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + "//xla/service:hlo_cost_analysis", + "//xla/service/llvm_ir:dynamic_update_slice_util", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + "@local_tsl//tsl/platform:status", + ], +) + +xla_cc_test( + name = "parallel_task_assignment_test", + srcs = ["parallel_task_assignment_test.cc"], + deps = [ + ":backend_config_proto_cc", + ":cpu_executable", + ":parallel_task_assignment", + ":target_machine_features_stub", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/service:hlo_cost_analysis", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/status:statusor", + "@local_tsl//tsl/platform:statusor", + ], +) + +cc_library( + name = "cpu_options", + srcs = ["cpu_options.cc"], + hdrs = ["cpu_options.h"], + deps = [ + "//xla/service:hlo_module_config", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "orc_jit_memory_mapper", + srcs = ["orc_jit_memory_mapper.cc"], + hdrs = ["orc_jit_memory_mapper.h"], + deps = [ + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", + "@llvm-project//llvm:ExecutionEngine", + "@local_tsl//tsl/platform:logging", + ], +) + +xla_cc_test( + name = 
"cpu_eigen_tensor_alignment_test", + size = "small", + srcs = ["cpu_eigen_tensor_alignment_test.cc"], + deps = [ + ":ir_emission_utils", + ":target_machine_features_stub", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/tests:xla_internal_test_main", + ], +) + +xla_cc_test( + name = "vectorized_reduce_with_no_vector_registers_test", + size = "small", + srcs = ["vectorized_reduce_with_no_vector_registers_test.cc"], + tags = ["not_run:arm"], + target_compatible_with = ["@platforms//cpu:x86_64"], + deps = [ + ":cpu_compiler", + ":cpu_transfer_manager", + ":test_header_helper", + "//xla:util", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/hlo/ir:hlo_module_group", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/service:compiler", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:MC", + "@llvm-project//llvm:Target", + "@local_tsl//tsl/platform:statusor", + ], +) + +xla_cc_test( + name = "scoped_ir_builder_test", + srcs = ["scoped_ir_builder_test.cc"], + deps = [ + ":cpu_executable", + ":ir_emitter", + ":target_machine_features_stub", + "//xla/hlo/analysis:hlo_ordering", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/service:buffer_assignment", + "//xla/service:buffer_value", + "//xla/service:logical_buffer", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Core", + "@local_tsl//tsl/platform:test", + ], +) + +tf_proto_library( + name = "onednn_config_proto", + srcs = ["onednn_config.proto"], +) + +tf_proto_library( + name = "backend_config_proto", + srcs = ["backend_config.proto"], + protodeps = [ + ":onednn_config_proto", + ], +) + +cc_library( + name = "onednn_util", + srcs = ["onednn_util.cc"], + 
hdrs = [ + "onednn_util.h", + "//xla/tsl/util:onednn_util_hdrs", + ], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/tsl/platform:env", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_memory_util", + srcs = ["onednn_memory_util.cc"], + hdrs = ["onednn_memory_util.h"], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:literal", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/service/llvm_ir:ir_array", + "//xla/service/llvm_ir:ir_builder_mixin", + "//xla/service/llvm_ir:llvm_util", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:TargetParser", + "@llvm-project//mlir:IR", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_matmul", + srcs = ["onednn_matmul.cc"], + hdrs = ["onednn_matmul.h"], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":onednn_util", + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla:shape_util", + "//xla/hlo/ir:hlo", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + 
"@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_convolution", + srcs = ["onednn_convolution.cc"], + hdrs = ["onednn_convolution.h"], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":onednn_util", + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla:shape_util", + "//xla/hlo/ir:hlo", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_layer_norm", + srcs = ["onednn_layer_norm.cc"], + hdrs = [ + "onednn_layer_norm.h", + "//xla/tsl/util:onednn_util_hdrs", + ], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla/tsl/platform:env", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_softmax", + srcs = ["onednn_softmax.cc"], + hdrs = [ + "onednn_softmax.h", + "//xla/tsl/util:onednn_util_hdrs", + ], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla/tsl/platform:env", + "@com_google_absl//absl/base:core_headers", + 
"@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_pattern_utils", + hdrs = ["onednn_pattern_utils.h"], + visibility = ["//visibility:public"], + deps = [ + ":onednn_util", + "//xla/hlo/ir:hlo", + "//xla/service:pattern_matcher", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_contraction_rewriter", + srcs = ["onednn_contraction_rewriter.cc"], + hdrs = [ + "onednn_contraction_rewriter.h", + "onednn_convolution.h", + "onednn_matmul.h", + "//xla/tsl/util:onednn_util_hdrs", + ], + copts = tsl_copts(), + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_convolution", + ":onednn_matmul", + ":onednn_memory_util", + ":onednn_pattern_utils", + ":onednn_util", + "//xla:executable_run_options", + "//xla:shape_util", + "//xla:status_macros", + "//xla:xla_data_proto_cc", + "//xla/hlo/evaluator:hlo_evaluator", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + "//xla/service:hlo_cost_analysis", + "//xla/service:hlo_creation_utils", + "//xla/service:pattern_matcher", + "//xla/tsl/platform:env", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_ops_rewriter", + srcs = ["onednn_ops_rewriter.cc"], + hdrs = ["onednn_ops_rewriter.h"], + copts = tsl_copts(), + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":onednn_pattern_utils", + ":onednn_util", + "//xla:literal_comparison", + "//xla:literal_util", + "//xla:status_macros", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + 
"//xla/service:hlo_creation_utils", + "//xla/service:pattern_matcher", + "@com_google_absl//absl/algorithm:container", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_float_support", + srcs = ["onednn_float_support.cc"], + hdrs = ["onednn_float_support.h"], + copts = tsl_copts(), + deps = [ + ":onednn_contraction_rewriter", + "//xla/service:float_support", + ], +) + +cc_library( + name = "cpu_float_support", + hdrs = ["cpu_float_support.h"], + copts = tsl_copts(), + deps = [ + "//xla/backends/cpu:xnn_fusion", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/service:float_support", + ], +) + +xla_cc_test( + name = "cpu_float_support_test", + srcs = ["cpu_float_support_test.cc"], + deps = [ + ":cpu_float_support", + "//xla:shape_util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/backends/cpu/codegen:target_machine_test_base", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:verified_hlo_module", + "//xla/hlo/transforms/simplifiers:float_normalization", + "//xla/service:hlo_module_config", + "//xla/tsl/platform:statusor", + "//xla/tsl/platform:test", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + ], +) + +cc_library( + name = "cpu_symbol_repository", + hdrs = ["cpu_symbol_repository.h"], + deps = [ + "//xla:xla_proto_cc", + "//xla/service:symbol_repository", + ], +) + +cc_library( + name = "cpu_executable_run_options", + hdrs = ["cpu_executable_run_options.h"], + deps = ["//xla/backends/cpu/collectives:cpu_collectives"], +) + +cc_library( + name = "metrics", + srcs = ["metrics.cc"], + hdrs = ["metrics.h"], + deps = [ + "//xla/tsl/lib/monitoring:counter", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@local_tsl//tsl/platform:stacktrace", + "@local_tsl//tsl/profiler/lib:traceme", + ], +) + +cc_library( + name = 
"elemental_ir_emitter", + srcs = ["elemental_ir_emitter.cc"], + hdrs = ["elemental_ir_emitter.h"], + deps = [ + ":elemental_math_emitter", + "//xla/hlo/ir:hlo", + "//xla/service:elemental_ir_emitter", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:ir_headers", + ], +) + +cc_library( + name = "small_while_loop_hoisting_pass", + srcs = ["small_while_loop_hoisting_pass.cc"], + hdrs = ["small_while_loop_hoisting_pass.h"], + deps = [ + ":cpu_executable", + "//xla:util", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + "//xla/service:collective_ops_utils", + "//xla/service:hlo_cost_analysis", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + ], +) + +xla_cc_test( + name = "small_while_loop_hoisting_pass_test", + srcs = ["small_while_loop_hoisting_pass_test.cc"], + deps = [ + ":backend_config_proto_cc", + ":small_while_loop_hoisting_pass", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + ], +) + +xla_cc_test( + name = "metrics_test", + srcs = ["metrics_test.cc"], + deps = [ + ":metrics", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/lib/monitoring:collected_metrics", + "//xla/tsl/lib/monitoring:collection_registry", + "@local_tsl//tsl/platform:test", + ], +) diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc 
b/third_party/xla/xla/service/cpu/cpu_compiler.cc index 9ba0085b24d372..4a1402c6934cba 100644 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc @@ -236,6 +236,8 @@ limitations under the License. #include "tsl/profiler/lib/traceme.h" #include "tsl/profiler/lib/traceme_encode.h" +#include "xnnpack_ops_rewriter.h" + #ifdef TF_LLVM_X86_AVAILABLE #include "llvm/TargetParser/X86TargetParser.h" #endif @@ -591,6 +593,12 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( }; pipeline.AddPass(upcaster_filter); + // For softmax, rewrite to custom calls with XNNPACK targets. + bool enable_xnnpack = + xla::GetDebugOptionsFromFlags().xla_cpu_enable_xnnpack(); + if (enable_xnnpack) + pipeline.AddPass(); + // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig b/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig new file mode 100644 index 00000000000000..9ba0085b24d372 --- /dev/null +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig @@ -0,0 +1,2712 @@ +/* Copyright 2017 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "xla/service/cpu/cpu_compiler.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// IWYU pragma: no_include "llvm/Config/Disassemblers.def.inc" +// IWYU pragma: no_include "llvm/Config/Targets.def.inc" + +#include "absl/cleanup/cleanup.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/memory/memory.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Linker/Linker.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBufferRef.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/SplitModule.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include 
"mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Transforms/DialectConversion.h" +#include "xla/backends/cpu/codegen/cpu_features.h" +#include "xla/backends/cpu/codegen/emitters/cpu_fusion_emitter_config.h" +#include "xla/backends/cpu/codegen/execution_engine.h" +#include "xla/backends/cpu/codegen/ir_compiler.h" +#include "xla/backends/cpu/codegen/jit_compiler.h" +#include "xla/backends/cpu/codegen/object_loader.h" +#include "xla/backends/cpu/codegen/target_machine_features.h" +#include "xla/backends/cpu/constant_allocation.h" +#include "xla/backends/cpu/runtime/function_library.h" +#include "xla/backends/cpu/runtime/thunk.h" +#include "xla/backends/cpu/runtime/thunk.pb.h" +#include "xla/backends/cpu/runtime/thunk_proto_serdes.h" +#include "xla/backends/cpu/transforms/xnn_graph_fusion.h" +#include "xla/backends/cpu/xnn_fusion.h" +#include "xla/cpu_function_runtime.h" +#include "xla/hlo/analysis/hlo_ordering.h" +#include "xla/hlo/analysis/indexed_array_analysis.h" +#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h" +#include "xla/hlo/ir/hlo_casting_utils.h" +#include "xla/hlo/ir/hlo_computation.h" +#include "xla/hlo/ir/hlo_instruction.h" +#include "xla/hlo/ir/hlo_instructions.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/ir/hlo_module_group.h" +#include "xla/hlo/ir/hlo_opcode.h" +#include "xla/hlo/ir/hlo_schedule.h" +#include "xla/hlo/pass/hlo_pass_fix.h" +#include "xla/hlo/pass/hlo_pass_pipeline.h" +#include "xla/hlo/transforms/expanders/bitcast_dtypes_expander.h" +#include "xla/hlo/transforms/expanders/cholesky_expander.h" +#include "xla/hlo/transforms/expanders/comparison_expander.h" +#include "xla/hlo/transforms/expanders/dot_decomposer.h" +#include 
"xla/hlo/transforms/expanders/dynamic_index_splitter.h" +#include "xla/hlo/transforms/expanders/eigh_expander.h" +#include "xla/hlo/transforms/expanders/logistic_expander.h" +#include "xla/hlo/transforms/expanders/optimization_barrier_expander.h" +#include "xla/hlo/transforms/expanders/qr_expander.h" +#include "xla/hlo/transforms/expanders/reduce_decomposer.h" +#include "xla/hlo/transforms/expanders/reshape_decomposer.h" +#include "xla/hlo/transforms/expanders/rng_bit_generator_expander.h" +#include "xla/hlo/transforms/expanders/rng_expander.h" +#include "xla/hlo/transforms/expanders/stochastic_convert_decomposer.h" +#include "xla/hlo/transforms/literal_canonicalizer.h" +#include "xla/hlo/transforms/operand_upcaster.h" +#include "xla/hlo/transforms/simplifiers/algebraic_simplifier.h" +#include "xla/hlo/transforms/simplifiers/batch_dot_simplification.h" +#include "xla/hlo/transforms/simplifiers/broadcast_canonicalizer.h" +#include "xla/hlo/transforms/simplifiers/conditional_canonicalizer.h" +#include "xla/hlo/transforms/simplifiers/convolution_group_converter.h" +#include "xla/hlo/transforms/simplifiers/dynamic_dimension_simplifier.h" +#include "xla/hlo/transforms/simplifiers/flatten_call_graph.h" +#include "xla/hlo/transforms/simplifiers/float_normalization.h" +#include "xla/hlo/transforms/simplifiers/gather_simplifier.h" +#include "xla/hlo/transforms/simplifiers/hlo_constant_folding.h" +#include "xla/hlo/transforms/simplifiers/hlo_dce.h" +#include "xla/hlo/transforms/simplifiers/hlo_memory_scheduler.h" +#include "xla/hlo/transforms/simplifiers/optimize_input_output_buffer_alias.h" +#include "xla/hlo/transforms/simplifiers/reduce_window_rewriter.h" +#include "xla/hlo/transforms/simplifiers/reshape_mover.h" +#include "xla/hlo/transforms/simplifiers/result_caster.h" +#include "xla/hlo/transforms/simplifiers/sort_simplifier.h" +#include "xla/hlo/transforms/simplifiers/sub_byte_normalization.h" +#include "xla/hlo/transforms/simplifiers/tree_reduction_rewriter.h" 
+#include "xla/hlo/transforms/simplifiers/tuple_simplifier.h" +#include "xla/hlo/transforms/simplifiers/zero_sized_hlo_elimination.h" +#include "xla/hlo/transforms/while_loop_trip_count_annotator.h" +#include "xla/literal_pool.h" +#include "xla/map_util.h" +#include "xla/mlir_hlo/transforms/passes.h" +#include "xla/service/all_reduce_promotion.h" +#include "xla/service/outer_dimension_propagation.h" +#include "xla/service/get_outer_batch_value_simplifier.h" +#include "xla/service/all_to_all_decomposer.h" +#include "xla/service/batched_gather_scatter_normalizer.h" +#include "xla/service/batchnorm_expander.h" +#include "xla/service/buffer_assignment.h" +#include "xla/service/call_graph.h" +#include "xla/service/call_inliner.h" +#include "xla/service/change_op_data_type.h" +#include "xla/service/compiler.h" +#include "xla/service/conditional_simplifier.h" +#include "xla/service/conditional_to_select.h" +#include "xla/service/copy_insertion.h" +#include "xla/service/cpu/buffer_info_util.h" +#include "xla/service/cpu/conv_canonicalization.h" +#include "xla/service/cpu/cpu_aot_compilation_result.h" +#include "xla/service/cpu/cpu_executable.h" +#include "xla/service/cpu/cpu_float_support.h" +#include "xla/service/cpu/cpu_instruction_fusion.h" +#include "xla/service/cpu/cpu_layout_assignment.h" +#include "xla/service/cpu/cpu_options.h" +#include "xla/service/cpu/dot_op_emitter.h" +#include "xla/service/cpu/executable.pb.h" +#include "xla/service/cpu/fusion_wrapper.h" +#include "xla/service/cpu/ir_emitter.h" +#include "xla/service/cpu/ir_emitter2.h" +#include "xla/service/cpu/metrics.h" +#include "xla/service/cpu/parallel_task_assignment.h" +#include "xla/service/cpu/runtime_symbol_generator.h" +#include "xla/service/cpu/small_while_loop_hoisting_pass.h" +#include "xla/service/cpu/thunk_emitter.h" +#include "xla/service/cpu_gpu_shape_verifier.h" +#include "xla/service/dump.h" +#include "xla/service/dynamic_dimension_inference.h" +#include "xla/service/dynamic_padder.h" 
+#include "xla/service/executable.h" +#include "xla/service/float_support.h" +#include "xla/service/gather_expander.h" +#include "xla/service/hlo.pb.h" +#include "xla/service/hlo_cost_analysis.h" +#include "xla/service/hlo_cse.h" +#include "xla/service/hlo_execution_profile.h" +#include "xla/service/hlo_module_config.h" +#include "xla/service/hlo_profile_printer_data.pb.h" +#include "xla/service/hlo_verifier.h" +#include "xla/service/layout_assignment.h" +#include "xla/service/llvm_compiler.h" +#include "xla/service/llvm_ir/llvm_command_line_options.h" +#include "xla/service/llvm_ir/llvm_util.h" +#include "xla/service/logical_buffer.h" +#include "xla/service/map_inliner.h" +#include "xla/service/scatter_expander.h" +#include "xla/service/scatter_simplifier.h" +#include "xla/service/select_and_scatter_expander.h" +#include "xla/service/sharding_propagation.h" +#include "xla/service/sharding_remover.h" +#include "xla/service/slow_operation_alarm.h" +#include "xla/service/spmd/shardy/shardy_xla_pass.h" +#include "xla/service/spmd/stateful_rng_spmd_partitioner.h" +#include "xla/service/topk_rewriter.h" +#include "xla/service/transpose_folding.h" +#include "xla/service/triangular_solve_expander.h" +#include "xla/service/while_loop_constant_sinking.h" +#include "xla/service/while_loop_invariant_code_motion.h" +#include "xla/service/while_loop_simplifier.h" +#include "xla/shape.h" +#include "xla/shape_util.h" +#include "xla/status_macros.h" +#include "xla/stream_executor/host/host_platform_id.h" +#include "xla/stream_executor/platform.h" +#include "xla/stream_executor/stream_executor.h" +#include "xla/tsl/platform/env.h" +#include "xla/tsl/platform/status.h" +#include "xla/tsl/platform/statusor.h" +#include "xla/tsl/platform/threadpool.h" +#include "xla/util.h" +#include "xla/xla.pb.h" +#include "xla/xla_data.pb.h" +#include "tsl/platform/casts.h" +#include "tsl/platform/cpu_info.h" +#include "tsl/platform/logging.h" // IWYU pragma: keep +#include 
"tsl/profiler/lib/traceme.h" +#include "tsl/profiler/lib/traceme_encode.h" + +#ifdef TF_LLVM_X86_AVAILABLE +#include "llvm/TargetParser/X86TargetParser.h" +#endif + +#if defined(INTEL_MKL) +#include "xla/hlo/transforms/simplifiers/simplify_fp_conversions.h" +#include "xla/service/cpu/onednn_contraction_rewriter.h" +#include "xla/service/cpu/onednn_float_support.h" +#include "xla/service/cpu/onednn_ops_rewriter.h" +#endif + +namespace xla { +namespace { + +using tsl::profiler::TraceMe; +using tsl::profiler::TraceMeEncode; + +// A module identifier (prefix) for emitted LLVM modules. +static constexpr absl::string_view kXlaModuleIdentifier = "__compute_module"; + +// Returns a global (per-process) thread pool for XLA CPU compilation tasks. +static tsl::thread::ThreadPool* GetCompilationThreadPool() { + // LLVM compilation has a lot of memory-bound pointer chasing and not + // so much CPU-bound work. Based on profiling a few examples, 32 threads seems + // to be enough to achieve maximum parallel compilation speedup. + static constexpr int kMaxCompilationThreads = 32; + + // On Mac OS the default stack size is 512KiB, this is too small for compiling + // reasonably sized programs + tsl::ThreadOptions thread_options; + thread_options.stack_size = 4 * 1024 * 1024; // 4 MB + + static auto* const thread_pool = new tsl::thread::ThreadPool( + tsl::Env::Default(), thread_options, "xla-cpu-llvm-codegen", + std::min(kMaxCompilationThreads, tsl::port::MaxParallelism())); + return thread_pool; +} + +// Returns task runner that uses the global compilation thread pool. +static cpu::JitCompiler::TaskRunner GetCompilationTaskRunner() { + return [](cpu::JitCompiler::Task task) { + GetCompilationThreadPool()->Schedule(std::move(task)); + }; +} + +// For each computation in the module, determines whether that computation +// calls a custom-call function, either directly or indirectly (e.g. because it +// calls another computation that does). 
+absl::flat_hash_map +ModuleComputationsTransitivelyContainCustomCall(const HloModule& module) { + absl::flat_hash_map custom_call_map; + std::unique_ptr call_graph = CallGraph::Build(&module); + + // Can never fail because we always return an OK status from the visitor. + TF_CHECK_OK(call_graph->VisitNodes([&custom_call_map]( + const CallGraphNode& node) { + const HloComputation* computation = node.computation(); + + for (const HloInstruction* instruction : computation->instructions()) { + // The computation contains a custom-call instruction directly. + if (DynCast(instruction)) { + custom_call_map[computation] = true; + return absl::OkStatus(); + } + // The computation calls something that contains a custom-call + // instruction (directly or indirectly). This lookup relies on the call + // graph traversing callees before callers, so that the map is always + // populated for all callees at this point. + for (const HloComputation* callee : instruction->called_computations()) { + bool callee_contains_custom_call = FindOrDie(custom_call_map, callee); + if (callee_contains_custom_call) { + custom_call_map[computation] = true; + return absl::OkStatus(); + } + } + } + + custom_call_map[computation] = false; + return absl::OkStatus(); + })); + + return custom_call_map; +} + +} // namespace + +namespace cpu { + +CpuCompiler::CpuCompiler() { + // Initialize LLVM the first time the CpuCompiler is initialized. 
+ static bool llvm_initialized = []() { + InitializeLLVMTarget(); + return true; + }(); + (void)llvm_initialized; +} + +absl::StatusOr>> CpuCompiler::Compile( + std::unique_ptr module_group, + std::vector> stream_execs, + const CompileOptions& options) { + for (const std::vector& se_vector : stream_execs) { + if (se_vector.size() != 1) { + return Unimplemented( + "Model partitioning not implemented for the CPU compiler"); + } + } + return LLVMCompiler::Compile(std::move(module_group), stream_execs, options); +} + +/* static */ void CpuCompiler::InitializeLLVMTarget() { + // Initialize LLVM's MC layer for the native target. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); +} + +namespace { + +// This visitor records which HLO instructions should have profiling information +// recorded. +class CollectProfileCandidates : public DfsHloVisitorWithDefault { + public: + static absl::StatusOr> + GetCandidatesForComputation( + const HloComputation& computation, + const absl::flat_hash_map& + assigned_indices) { + absl::flat_hash_map hlo_to_profile_idx; + CollectProfileCandidates profile_candidates_for_computation( + &hlo_to_profile_idx, assigned_indices); + TF_RETURN_IF_ERROR(computation.Accept(&profile_candidates_for_computation)); + return hlo_to_profile_idx; + } + + private: + CollectProfileCandidates( + absl::flat_hash_map* hlo_to_profile_idx, + const absl::flat_hash_map& + assigned_indices) + : hlo_to_profile_idx_(hlo_to_profile_idx), + assigned_indices_(assigned_indices) {} + + absl::Status DefaultAction(HloInstruction* hlo_instruction) override { + hlo_to_profile_idx_->insert( + {hlo_instruction, FindOrDie(assigned_indices_, hlo_instruction)}); + return absl::OkStatus(); + } + + absl::Status HandleCall(HloInstruction* call) override { + TF_RETURN_IF_ERROR(DefaultAction(call)); + CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_, + assigned_indices_); + TF_RETURN_IF_ERROR(call->to_apply()->Accept(&candidates_for_call)); + 
return absl::OkStatus(); + } + // Recurse into "conditional" so we can profile inside of it. + absl::Status HandleConditional(HloInstruction* conditional) override { + TF_RETURN_IF_ERROR(DefaultAction(conditional)); + + CollectProfileCandidates candidates_for_true(hlo_to_profile_idx_, + assigned_indices_); + TF_RETURN_IF_ERROR( + conditional->true_computation()->Accept(&candidates_for_true)); + + CollectProfileCandidates candidates_for_false(hlo_to_profile_idx_, + assigned_indices_); + TF_RETURN_IF_ERROR( + conditional->false_computation()->Accept(&candidates_for_false)); + + return absl::OkStatus(); + } + + // Skip constants, there is nothing to profile. + absl::Status HandleConstant(HloInstruction*) override { + return absl::OkStatus(); + } + // Skip parameters, they are a simple load. + absl::Status HandleParameter(HloInstruction*) override { + return absl::OkStatus(); + } + // It is important to recurse for "while" or else we risk overly coarse + // profiling information. + absl::Status HandleWhile(HloInstruction* xla_while) override { + TF_RETURN_IF_ERROR(DefaultAction(xla_while)); + + CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_, + assigned_indices_); + TF_RETURN_IF_ERROR( + xla_while->while_condition()->Accept(&candidates_for_condition)); + + CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_, + assigned_indices_); + TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&candidates_for_body)); + + return absl::OkStatus(); + } + + absl::flat_hash_map* hlo_to_profile_idx_; + const absl::flat_hash_map& assigned_indices_; +}; + +// Adds the HloVerifier for CPU to the given pipeline. 
+void AddHloVerifier(HloPassPipeline* pipeline, HloVerifierOpts&& opts = {}, + bool debug_only = false) { + auto verifier_metadata = + std::make_unique(std::move(opts)); + + if (debug_only) { + pipeline->AddInvariantCheckerDebug( + std::move(verifier_metadata), "hlo verifier (debug)"); + } else { + pipeline->AddInvariantChecker(std::move(verifier_metadata), + "hlo verifier"); + } +} + +std::unique_ptr> CreateSimplificationPipeline( + absl::string_view name, HloModule* module, bool is_fusion_emitters) { + // Run the following passes to a fixed point. + auto pipeline = + std::make_unique>(std::string(name)); + AddHloVerifier(pipeline.get(), HloVerifierOpts{}, + /*debug_only=*/true); + + AlgebraicSimplifierOptions options; + options.set_enable_dot_strength_reduction(false); + // "slow" minmax means we propagate nan. + options.set_minmax_propagate_nan( + !module->config().debug_options().xla_cpu_enable_fast_min_max()); + options.set_supports_non_canonical_dots(false); + options.set_executing_on_cpu(true); + pipeline->AddPass(options); + pipeline->AddPass(); + pipeline->AddPass(); + pipeline->AddPass(GatherExpander::kEliminateSimpleGathers); + if (is_fusion_emitters) { + // Conversion to MLIR only works with simplified gathers. + pipeline->AddPass(); + } + + // Needs to happen after algebraic simplifier. + // pipeline->AddPass(); + + // BatchNormExpander can create zero-sized ops, so zero-sized HLO + // elimination has to come after that pass. + pipeline->AddPass(); + + pipeline->AddPass(); + pipeline->AddPass(); + pipeline->AddPass(); + pipeline->AddPass(); + + // TODO(b/134075051): Re-enable after b/134075051 is fixed. + // pipeline->AddPass(); + + pipeline->AddPass(); + pipeline->AddPass(); + pipeline->AddPass( + options::FoldAllConstants(module->config()) + ? 
HloConstantFolding::Level::kAggressive + : HloConstantFolding::Level::kDefault); + pipeline->AddPass(); + + return pipeline; +} + +} // namespace + +absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( + HloModule* module, bool is_aot_compile, + TargetMachineFeatures* target_machine_features) { + const int64_t num_partitions = module->config().num_partitions(); + const bool is_thunk_runtime = + module->config().debug_options().xla_cpu_use_thunk_runtime(); + const bool is_fusion_emitters = + is_thunk_runtime && + module->config().debug_options().xla_cpu_use_fusion_emitters(); + bool use_shardy_partitioner = module->config().use_shardy_partitioner(); + if (num_partitions > 1) { + if (!module->config().use_spmd_partitioning()) { + return InvalidArgument( + "num_partitions=%d but SPMD partitioning not enabled.", + num_partitions); + } + HloPassPipeline spmd_pipeline("spmd-partitioner"); + // Run some IR cleanup passes before running the SPMD partitioning + // passes. + AddHloVerifier(&spmd_pipeline); + spmd_pipeline.AddPass(); + spmd_pipeline.AddPass(); + spmd_pipeline.AddPass(); + spmd_pipeline.AddPass(); + if (use_shardy_partitioner) { + spmd_pipeline.AddPass(); + } else { + spmd_pipeline.AddPass( + /*is_spmd=*/true, /*propagate_metadata=*/false, + module->config().allow_spmd_sharding_propagation_to_output(), + module->config().allow_spmd_sharding_propagation_to_parameters()); + } + spmd_pipeline.AddPass( + num_partitions, module->config().replica_count()); + TF_RETURN_IF_ERROR(spmd_pipeline.Run(module).status()); + } else { + HloPassPipeline sharding_removal_pipeline("sharding-removal"); + AddHloVerifier(&sharding_removal_pipeline); + // Remove redundant sharding ops when partition_count == 1. + sharding_removal_pipeline.AddPass(); + // Run ShardyXLA without propagation, which enforces use-tuple-args. 
+ if (use_shardy_partitioner) { + sharding_removal_pipeline.AddPass( + /*runSdyShardingPropagation=*/false); + } + sharding_removal_pipeline.AddPass(); + TF_RETURN_IF_ERROR(sharding_removal_pipeline.Run(module).status()); + } + + { + // SubbytePacker must be run before the rest of the pipeline since it + // modifies the layout of the entry computation inputs/outputs, which is + // passed to LayoutAssignment. + HloPassPipeline subbyte_packer_pipeline("SubbytePacker pipeline"); + subbyte_packer_pipeline.AddPass( + SubByteNormalization::SET_ELEMENT_SIZE); + TF_RETURN_IF_ERROR(subbyte_packer_pipeline.Run(module).status()); + } + HloPassPipeline pipeline("HLO passes through layout assignment"); + AddHloVerifier(&pipeline); + pipeline.AddPass(); + pipeline.AddPass(); + + // If XNNPACK is enabled, we only need to upcast dots that XnnDotThunk does + // not support. `upcaster_filter` returns false if the instruction shouldn't + // be processed. + // TODO(b/406806134): Stop calling XNNPACK from regular Dot thunks. All XNN + // Dots should be wrapped in an `__xnn_fusion` fusion region and processed in + // `XnnFusionThunk`. + bool xnnpack_enabled = module->config().debug_options().xla_cpu_use_xnnpack(); + auto call_library_for_dot = [&](const HloInstruction& instr) { + if (!xnnpack_enabled) return false; + DotImplementationStrategy strategy = GetDotImplementationStrategy( + module->config(), instr, *target_machine_features, + /*allow_runtime_calls=*/true); + return strategy == DotImplementationStrategy::kEigen; + }; + HloPredicate upcaster_filter = [&](const HloInstruction* instr) { + if (!call_library_for_dot(*instr)) return true; + return !IsXnnDotSupported(instr->dot_dimension_numbers(), + instr->operand(0)->shape(), + instr->operand(1)->shape(), instr->shape(), + target_machine_features) + .value_or(false); + }; + pipeline.AddPass(upcaster_filter); + + // Expand random number generation. 
+ pipeline.AddPass(); + pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); + + // Remove zero-sized HLO from the input so that other passes don't have to + // handle it. + pipeline.AddPass(); + + pipeline.AddPass(); + + pipeline.AddPass(); + pipeline.AddPass(); + + // The TopkDecomposer generates a compare op with type=TOTALORDER and must + // run before the ComparisonExpander which rewrites such comparisons. + pipeline.AddPass([&](const HloInstruction* instr) { + return instr->opcode() == HloOpcode::kTopK; + }); + + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + + // Inline computations with a single call site. + pipeline.AddPass(/*single_call_site=*/true); + pipeline.AddPass(); + pipeline.AddPass(); + + // Rewrite to custom calls with target as oneDNN library calls. +#if defined(INTEL_MKL) + // AOT compiled code runs in single thread. + if (!is_aot_compile && !is_thunk_runtime) { + // Placing OneDnnOpsRewriter here to match the flax patterns + // TODO: Decide where would be the appropriate place for this pass to make + // it more generic + // TODO - intel: Name of the pass might seem redundant as oneDnnRewriter, + // but in future plan to rename oneDNNrewriter to specific to onednn matmul + pipeline.AddPass(); + } +#endif // INTEL_MKL + + // Promote BF16 all-reduce to F32. + const std::pair ar_promoted_types[] = { + {BF16, F32}}; + pipeline.AddPass(ar_promoted_types); + // Convert BF16 and F8 operations to F32 and F16 respectively so that the CPU + // backend can support BF16/F8 operations without directly implementing a + // BF16/F8 lowering for most ops. 
+ CpuFloatSupport bf16_support(BF16, call_library_for_dot, + target_machine_features); +#if defined(INTEL_MKL) + OneDnnFloatSupport onednn_bf16_support(BF16); + if (!is_aot_compile && !is_thunk_runtime) { + pipeline.AddPass(&onednn_bf16_support); + } else { + pipeline.AddPass(&bf16_support); + } +#else + pipeline.AddPass(&bf16_support); +#endif + FloatSupport f8e5m2_support(F8E5M2, F16); + pipeline.AddPass(&f8e5m2_support); + FloatSupport f8e4m3_support(F8E4M3, F16); + pipeline.AddPass(&f8e4m3_support); + FloatSupport f8e4m3fn_support(F8E4M3FN, F16); + pipeline.AddPass(&f8e4m3fn_support); + FloatSupport f8e4m3b11fnuz_support(F8E4M3B11FNUZ, F16); + pipeline.AddPass(&f8e4m3b11fnuz_support); + FloatSupport f8e5m2fnuz_support(F8E5M2FNUZ, F16); + pipeline.AddPass(&f8e5m2fnuz_support); + FloatSupport f8e4m3fnuz_support(F8E4M3FNUZ, F16); + pipeline.AddPass(&f8e4m3fnuz_support); + FloatSupport f8e3m4_support(F8E3M4, F16); + pipeline.AddPass(&f8e3m4_support); + FloatSupport s4_support(S4, S8); + pipeline.AddPass(&s4_support); + FloatSupport u4_support(U4, U8); + pipeline.AddPass(&u4_support); + FloatSupport f4e2m1fn_support(F4E2M1FN, F16); + pipeline.AddPass(&f4e2m1fn_support); + FloatSupport f8e8m0fnu_support(F8E8M0FNU, F32); + pipeline.AddPass(&f8e8m0fnu_support); + // After canonicalization, there may be more batch dots that can be + // simplified. + pipeline.AddPass(); + auto cost_model = [](HloInstruction* conv) { + // We need a cost model for CPUs. Currently, do nothing. 
+ return false; + }; + pipeline.AddPass( + /*should_expand=*/[](HloInstruction* conv) { return true; }, cost_model, + /*convert_batch_groups_only=*/true); + auto feature_group_should_expand = [](HloInstruction* conv) { + switch (conv->shape().element_type()) { + case F16: + case F32: + return false; + default: + return true; + } + }; + pipeline.AddPass( + feature_group_should_expand, cost_model, + /*convert_batch_groups_only=*/false); + pipeline.AddPass( + /*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, + /*rewrite_grad_op=*/true); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + + if (module->config() + .debug_options() + .xla_reduce_window_rewrite_base_length() != 0) { + pipeline.AddPass>( + module->config() + .debug_options() + .xla_reduce_window_rewrite_base_length()); + } + auto dynamic_padder_options = DynamicPadderOptions(); + // TODO(pgavin): ShapeChecks were never implemented correctly by the dynamic + // padder. The mode defaults to kIgnore, and it was not overridden for nested + // computations (such as while bodies or conditional branches), and so cases + // that could not be proven would still be accepted even with compile-time + // checks enabled. Recent changes to the DynamicPadder correctly + // override the mode. However, some models have started to rely on the check + // being ignored, and they would be broken if it is enforced. + dynamic_padder_options.shape_check_mode = + DynamicDimensionInference::ShapeCheckMode::kIgnore; + pipeline.AddPass(dynamic_padder_options); + + pipeline.AddPass(target_machine_features); + + // Run fp16 dots/convs in fp32 and then downcast the result to fp16. + // Justification: + // + // - This is significantly faster on our CPUs today than true fp16. + // - It's numerically more accurate. (Granted, this is not always + // desirable, thus the ability to disable this functionality.) 
+ // - It matches more closely the GPU's behavior on fp16 dot/conv, where + // accumulation happens in f32. + if (!module->config().debug_options().xla_cpu_strict_dot_conv_math()) { + pipeline.AddPass( + F16, F32, HloPredicateIsOp); + } + + pipeline.AddPass(CreateSimplificationPipeline("simplification", module, + is_fusion_emitters)); + + // Scatter expander is sandwiched between two simplification pipelines to + // enable constant folding with the original scatter instructions (which is + // more efficient than with the expanded version) but then to also ensure that + // the resulting while loops are simplified. + pipeline.AddPass(); + if (is_fusion_emitters) { + pipeline.AddPass( + ScatterExpander::kEliminateSimpleScatters); + pipeline.AddPass(); + } + if (!is_fusion_emitters || !kFusionEmitterScatterEnabled) { + pipeline.AddPass(ScatterExpander::kEliminateAllScatters); + } + + pipeline.AddPass(CreateSimplificationPipeline( + "post_scatter_expansion_simplification", module, is_fusion_emitters)); + + pipeline.AddPass(); + + pipeline.AddPass([](const HloSortInstruction* sort, int64_t) { + return sort->operand(0)->shape().element_type() == F32; + }); + pipeline.AddPass(); + pipeline.AddPass( + [&](const HloInstruction& dot, int64_t operand) -> absl::StatusOr { + if (DotImplementationCanHandleTranspose(dot, *target_machine_features, + /*allow_runtime_calls=*/true)) { + return TransposeFolding::IsRowColumnTransposeDotOperand(dot, operand); + } + return false; + }, + TransposeFolding::NeverFoldTranspose); + pipeline.AddPass(/*is_layout_sensitive=*/false); + + pipeline.AddPass(); + pipeline.AddPass(); + + // Annotate while loops with statically known trip counts, so that at run time + // we can avoid running the loop condition computations. + pipeline.AddPass(); + + // Layout assignment uses alias analysis, which requires the call graph to be + // flattened. 
+ pipeline.AddPass(); + ChannelLayoutConstraints layout_constraints; + pipeline.AddPass( + module->mutable_entry_computation_layout(), target_machine_features, + &layout_constraints); + // Run SubByteNormalization because CpuLayoutAssignment may modify a + // Layout's element_size_in_bits field. + pipeline.AddPass( + SubByteNormalization::SET_ELEMENT_SIZE); + + // Finally canonicalize all literals larger than 1024 bytes in the module to + // reuse the same literal across multiple HLO modules. + pipeline.AddPass(LiteralPool::Default(), + /*min_size_bytes=*/1024); + + return pipeline.Run(module).status(); +} + +absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn( + HloModule* module, bool is_aot_compile, + TargetMachineFeatures* target_machine_features, + const CompileOptions& compile_options) { + const auto& debug_options = module->config().debug_options(); + const bool is_thunk_runtime = debug_options.xla_cpu_use_thunk_runtime(); + const bool is_fusion_emitters = + is_thunk_runtime && debug_options.xla_cpu_use_fusion_emitters(); + HloPassPipeline pipeline("HLO passes after layout assignment"); + + { + HloPassPipeline normalization_pipeline("hlo normalization"); + normalization_pipeline.AddPass(); + normalization_pipeline.AddPass(); + normalization_pipeline.AddPass(); + TF_RETURN_IF_ERROR(normalization_pipeline.Run(module).status()); + } + + // After layout assignment, use a layout-sensitive verifier. + pipeline.AddPass("after layout assignment"); + AddHloVerifier(&pipeline, HloVerifierOpts{}.MakeLayoutSensitive(), + /*debug_only=*/true); + + pipeline.AddPass(); + + const int max_parallelism = + module->config().intra_op_parallelism_threads() > 0 + ? module->config().intra_op_parallelism_threads() + : tsl::port::NumSchedulableCPUs(); + +#if defined(INTEL_MKL) + // AOT compiled code runs in single thread. + if (!is_aot_compile && !is_thunk_runtime) { + // Run SimplifyFPConversions pass to simplify the BF16 pattern and make it + // easier to match. 
+ // Remove `f32 -> bf16 -> f32` casts inserted by bf16 normalization. + if (debug_options.xla_allow_excess_precision()) { + pipeline.AddPass(); + } + pipeline.AddPass(max_parallelism, + compile_options.thread_pool); + // Run SimplifyFPConversions pass again to remove redundant Convert ops + // that may exist as a result of running OneDnnContractionRewriter pass. + if (debug_options.xla_allow_excess_precision()) { + pipeline.AddPass(); + } + } +#endif // INTEL_MKL + + if (module->config() + .debug_options() + .xla_cpu_experimental_xnn_graph_fusion_mode() != + DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED) { + pipeline.AddPass(); + } + + // Add a fusion pass now that layout assignment is done. + pipeline.AddPass(); + if (is_fusion_emitters) { + pipeline.AddPass(); + } + + // The LayoutAssignment pass may leave behind kCopy instructions which are + // duplicate or NOPs, so remove them with algebraic simplification and CSE. + // Run this to a fixed point. + [&pipeline = pipeline.AddPass>( + "simplification after layout assignment"), + &module] { + AddHloVerifier( + &pipeline, + HloVerifierOpts{}.MakeLayoutSensitive().WithInstructionCanChangeLayout( + LayoutAssignment::InstructionCanChangeLayout), + /*debug_only=*/true); + AlgebraicSimplifierOptions options; + options.set_is_layout_sensitive(true); + options.set_supports_non_canonical_dots(false); + options.set_enable_dot_strength_reduction(false); + // "slow" minmax means we propagate nan. + options.set_minmax_propagate_nan( + !module->config().debug_options().xla_cpu_enable_fast_min_max()); + options.set_executing_on_cpu(true); + pipeline.AddPass(options); + pipeline.AddPass(); + pipeline.AddPass(/*is_layout_sensitive=*/true); + }(); + + // Outline ops in the entry computation into calls to subcomputations. + if (!is_aot_compile) { + // Run ParallelTaskAssigner to assign parallel tasks to HLOs in module. 
+ // Note this is not run for AOT because it would bring in thread pool + // and thread synchronization dependencies which would likely increase + // binary size (and most AOT applications are single-threaded). + // TODO(b/29630486) Support multi-threaded AOT. + pipeline.AddPass( + max_parallelism, ShapeSizeBytesFunction(), target_machine_features); + } + // Copy insertion should be performed immediately before IR emission to + // avoid inserting unnecessary copies (later pass adds an instruction which + // materializes the value) or missing a necessary copy (later pass removes + // an instruction which materializes a value). DCE must be run immediately + // before (and sometime after) copy insertion, to avoid dead code from + // interfering with the rewrites. + pipeline.AddPass(); + pipeline.AddPass(true); + + // If enabled we'll use more precise region based analysis for copy removal. + if (debug_options.xla_cpu_copy_insertion_use_region_analysis()) { + pipeline.AddPass( + /*can_share_buffer=*/nullptr, + /*use_region_based_live_range_analysis=*/-1); + } else { + pipeline.AddPass(); + } + + // The hoisting of small while loops is only useful in the context of the + // thunk runtime. 
+ if (module->config().debug_options().xla_cpu_use_thunk_runtime()) { + TF_ASSIGN_OR_RETURN( + int64_t byte_threshold, + xla::cpu::options::SmallWhileLoopByteThreshold(module->config())); + pipeline.AddPass(byte_threshold); + } + + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + return pipeline.Run(module).status(); +} + +absl::Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, + llvm::TargetMachine* target_machine, + const CompileOptions& compile_options) { + TargetMachineFeatures target_machine_features(target_machine); + TF_RETURN_IF_ERROR(RunHloPassesThroughLayoutAssn(module, is_aot_compile, + &target_machine_features)); + + return RunHloPassesAfterLayoutAssn(module, is_aot_compile, + &target_machine_features, compile_options); +} + +namespace { + +// Align buffers to XLA:CPU minimal alignment. +int64_t memory_alignment(LogicalBuffer::Color) { + return cpu_function_runtime::MinAlign(); +} + +llvm::TargetOptions CompilerTargetOptions( + const HloModuleConfig& module_config) { + llvm::TargetOptions target_options; + // Always allow FMA fusion. This increases precision instead of decreasing it. + target_options.AllowFPOpFusion = llvm::FPOpFusion::Fast; + return target_options; +} + +std::pair GetIRModuleHooks( + const HloModule& hlo_module, + const LLVMCompiler::ModuleHook& user_pre_optimization_hook, + const LLVMCompiler::ModuleHook& user_post_optimization_hook) { + // Create the IR hooks. If applicable, each IR hook does the following: + // + // * Calls the user supplied module hook. + // * Writes out the IR to a file in the output directory designated by + // --xla_dump_to + const HloModule* hlo_module_ptr = &hlo_module; + auto hook = [user_pre_optimization_hook, user_post_optimization_hook, + hlo_module_ptr](bool optimized, + const llvm::Module& llvm_module) { + const auto& user_hook = + !optimized ? 
user_pre_optimization_hook : user_post_optimization_hook; + if (user_hook) { + user_hook(llvm_module); + } + + // Include LLVM module identifier suffix in case `llvm_module` is just a + // part of the original LLVM module constructed by the XLA. + absl::string_view id = llvm_module.getModuleIdentifier(); + size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); + llvm_ir::DumpIrIfEnabled(*hlo_module_ptr, llvm_module, optimized, + /*filename_suffix=*/id.substr(pos)); + }; + return {[hook](const llvm::Module& llvm_module) { + return hook(/*optimized=*/false, llvm_module); + }, + [hook](const llvm::Module& llvm_module) { + return hook(/*optimized=*/true, llvm_module); + }}; +} + +absl::Status VerifyLlvmModule(const llvm::Module& llvm_module) { + XLA_SCOPED_LOGGING_TIMER("CpuCompiler - Running LLVM verifier"); + + std::string err; + llvm::raw_string_ostream err_stream(err); + + // verifyModule() returns true if the module is broken. + TF_RET_CHECK(!llvm::verifyModule(llvm_module, &err_stream)) + << "Invalid LLVM IR before optimizations:\n" + << err_stream.str() + << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. " + "Rerun with --xla_dump_to to get the IR. "; + return absl::OkStatus(); +} + +absl::Status CreateHloProfilingArtifacts( + const HloModule& module, + absl::flat_hash_map* + instruction_to_profile_idx, + absl::flat_hash_map* + computation_to_profile_idx, + std::unique_ptr* hlo_profile_index_map, + std::unique_ptr* hlo_profile_printer_data) { + *hlo_profile_index_map = std::make_unique(module); + const HloComputation& entry_computation = *module.entry_computation(); + + TF_ASSIGN_OR_RETURN( + *instruction_to_profile_idx, + CollectProfileCandidates::GetCandidatesForComputation( + entry_computation, + (*hlo_profile_index_map)->instruction_to_profile_idx())); + + auto shape_size_bytes = [](const Shape& shape) { + // On the cpu, opaques are pointers. 
+ if (shape.IsOpaque()) { + return static_cast(sizeof(void*)); + } + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + }; + + HloCostAnalysis cost_analysis(shape_size_bytes); + TF_RETURN_IF_ERROR(entry_computation.Accept(&cost_analysis)); + *hlo_profile_printer_data = CreateHloProfilePrinterData( + **hlo_profile_index_map, cost_analysis, entry_computation.name()); + *computation_to_profile_idx = + (*hlo_profile_index_map)->computation_to_profile_idx(); + + return absl::OkStatus(); +} + +} // namespace + +absl::StatusOr> CpuCompiler::RunHloPasses( + std::unique_ptr module, se::StreamExecutor* /*stream_exec*/, + const CompileOptions& options) { + auto& config = module->config(); + + TF_ASSIGN_OR_RETURN( + std::unique_ptr jit_target_machine, + IrCompiler::InferTargetMachine( + CompilerTargetOptions(config), IrCompiler::GetCodeGenOptLevel(config), + CpuFeatureFromString(config.debug_options().xla_cpu_max_isa()))); + + TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false, + jit_target_machine.get(), + /*compile_options=*/options)); + return std::move(module); +} + +namespace { + +static void DumpModuleToFile(const llvm::Module& llvm_module, + const llvm::object::ObjectFile& obj_file, + const HloModule& hlo_module) { + absl::string_view id = llvm_module.getModuleIdentifier(); + size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); + auto get_file_suffix = [&]() { + std::vector parts = {"obj-file"}; + parts.reserve(3); + absl::string_view middle_name = id.substr(pos); + if (!middle_name.empty()) { + parts.push_back(middle_name); + } + parts.push_back("o"); + return absl::StrJoin(parts, "."); + }; + DumpToFileInDir( + hlo_module, /*file_prefix=*/"", get_file_suffix(), + absl::string_view(obj_file.getData().data(), obj_file.getData().size())); +} + +// Post-compilation callback functor for use by SimpleOrcJIT. +// +// Dumps machine code if dumping is enabled for the module. 
+static std::function +CreateOrcJITPostCompilationHook(const HloModule* hlo_module, + std::vector* obj_files) { + return [=](const llvm::Module& llvm_module, + const llvm::object::ObjectFile& obj_file) { + if (obj_files) obj_files->push_back(obj_file.getData().str()); + + if (DumpingEnabledForHloModule(*hlo_module)) { + DumpModuleToFile(llvm_module, obj_file, *hlo_module); + } + }; +} + +struct ComputationToEmit { + HloComputation* computation; + + // Are we emitting this computation with fast-math reassociation enabled? + // We enable reassociation for reductions because it has a significant + // performance impact. + bool allow_reassociation; + + bool operator==(const ComputationToEmit& other) const { + return computation == other.computation && + allow_reassociation == other.allow_reassociation; + } + + template + friend H AbslHashValue(H h, const ComputationToEmit& c) { + return H::combine(std::move(h), c.computation, c.allow_reassociation); + } +}; + +std::vector SubcomputationEmissionOrder( + HloComputation* root) { + absl::flat_hash_set visited; + std::vector postorder; + + // agenda of (node, leave) pairs. 
+ std::stack> agenda; + agenda.emplace(ComputationToEmit{root, false}, false); + while (!agenda.empty()) { + ComputationToEmit c; + bool leave; + std::tie(c, leave) = agenda.top(); + agenda.pop(); + + if (leave) { + postorder.push_back(c); + continue; + } + + if (visited.insert(c).second) { + agenda.emplace(c, true); + for (auto* instruction : c.computation->instructions()) { + bool allow_reassociation = + instruction->opcode() == HloOpcode::kAllReduce || + instruction->opcode() == HloOpcode::kReduce || + instruction->opcode() == HloOpcode::kReduceWindow; + auto cc = absl::MakeSpan(instruction->called_computations()); + for (auto it = cc.rbegin(); it != cc.rend(); ++it) { + HloComputation* called_computation = *it; + ComputationToEmit callee{ + called_computation, c.allow_reassociation || allow_reassociation}; + if (!visited.contains(callee)) { + agenda.emplace(callee, false); + } + } + } + } + } + DCHECK(!postorder.empty() && postorder.back().computation == root); + postorder.pop_back(); + return postorder; +} + +} // namespace + +// Removes unused globals and function declarations from the LLVM module. +// +// After splitting LLVM module into multiple parts, we end up with unused +// symbols in each part: external globals and function declarations. We don't +// support linking across modules added to SimpleOrcJIT, and we don't need it, +// because we never construct LLVM IR that might require cross-module linking, +// so we can just remove unused symbols from each part. 
+static void RemoveUnusedSymbols(llvm::Module& module) { + llvm::SmallVector unused_globals; + llvm::SmallVector unused_functions; + + for (llvm::GlobalVariable& gv : module.globals()) { + if (gv.use_empty()) unused_globals.push_back(&gv); + } + for (llvm::Function& f : module.functions()) { + if (f.isDeclaration() && f.use_empty()) unused_functions.push_back(&f); + } + + for (auto* gv : unused_globals) { + module.eraseGlobalVariable(gv); + } + for (auto* f : unused_functions) { + f->eraseFromParent(); + } +} + +// Clones a ThreadSafeModule from the given LLVM module in a new LLVM context. +// +// To enable parallel compilation, each LLVM module has to be owned by a +// separate LLVM context. We take each part of the original module after a +// split, and clone it into a new LLVM context. +static llvm::orc::ThreadSafeModule CloneAsThreadSafeModule( + int64_t part, std::unique_ptr module) { + TraceMe trace([&] { + return TraceMeEncode("CpuCompiler::CloneAsThreadSafeModule", + {{"part", part}}); + }); + + // There is no way to clone a module from one context to another, so we need + // to serialize the module to bitcode and parse it back into the new context. + llvm::SmallString<0> bc; + llvm::raw_svector_ostream bcos(bc); + llvm::WriteBitcodeToFile(*module, bcos); + + // Parse module back into its own LLVM context. + auto clone_context = std::make_unique(); + auto clone_module = llvm::parseBitcodeFile( + llvm::MemoryBufferRef( + llvm::StringRef(bc.data(), bc.size()), + absl::StrFormat("%s_part_%02d", kXlaModuleIdentifier, part)), + *clone_context); + + return llvm::orc::ThreadSafeModule(std::move(*clone_module), + std::move(clone_context)); +} + +namespace { +// Compiled symbols (kernels and comparators) from a single LLVM module part. +struct CompiledSymbolsPart { + std::vector kernels; + std::vector comparators; +}; +} // namespace + +// Collect IrEmitter2 symbols that got into the LLVM module part. 
We issue +// compilation tasks in parallel, and to maximize concurrency we don't issue +// separate compilation tasks that compile symbols from the same module. +static CompiledSymbolsPart CollectCompiledSymbolsPart( + const IrEmitter2& ir_emitter, const llvm::Module& module) { + CompiledSymbolsPart syms; + + auto find_kernel = + [&](llvm::StringRef name) -> std::optional { + for (auto& k : ir_emitter.kernels()) { + if (k.name == name) return k; + } + return std::nullopt; + }; + + auto find_comparator = + [&](llvm::StringRef name) -> std::optional { + for (auto& c : ir_emitter.comparators()) { + if (c.name == name) return c; + } + return std::nullopt; + }; + + for (auto& f : module.functions()) { + if (auto kernel = find_kernel(f.getName())) { + syms.kernels.push_back(*kernel); + } + if (auto comparator = find_comparator(f.getName())) { + syms.comparators.push_back(*comparator); + } + } + + return syms; +} + +// If LLVM module has large constants constructed from literals, we don't want +// to split it, because it will cause us to copy large constants across module +// parts. We should not be storing large constants in LLVM IR in a first place, +// but while we do that, we have to be extra-careful, or it leads to extremely +// long compilation times, OOMs and timeouts. +// +// TODO(b/361800465): Figure out how to avoid putting large constants into +// LLVM IR in the first place. +static bool HasLargeConstants(llvm::Module& module) { + static constexpr int kMaxConstantSize = 10000; + for (auto& g : module.globals()) { + if (!g.hasInitializer()) { + continue; + } + + llvm::Constant* initializer = g.getInitializer(); + if (auto* arr = llvm::dyn_cast(initializer->getType())) { + if (arr->getNumElements() > kMaxConstantSize) return true; + } + } + return false; +} + +inline void VlogMaxIsa(absl::string_view max_cpu_isa) { + if (VLOG_IS_ON(1) && !max_cpu_isa.empty()) { + if (tsl::port::IsX86CPU()) { + VLOG(1) << "`xla_cpu_max_isa` is set. 
Will not use features newer than: " + << max_cpu_isa; + } else { + VLOG(1) << "`xla_cpu_max_isa` is set to `" << max_cpu_isa + << "`. This flag is not supported on non-x86 CPUs yet."; + } + } +} + +// We keep HloProto in the CpuExecutable, but we don't need to keep literals +// payload in it as we use it only for debugging and memory analysis. +static void StripPayloadFromLiteralProto(HloProto& proto) { + auto* module = proto.mutable_hlo_module(); + for (auto& computation : *module->mutable_computations()) { + for (auto& instruction : *computation.mutable_instructions()) { + // We only keep literal shape to correctly estimate memory usage of the + // HLO module, but we don't need the actual literal data. + if (instruction.has_literal()) { + LiteralProto literal; + *literal.mutable_shape() = instruction.literal().shape(); + *instruction.mutable_literal() = std::move(literal); + } + } + } +} + +// Extracts the given set of kernels from the original module. +// Returns a new module with the extracted kernels. +static absl::StatusOr> ExtractKernelsFromModule( + llvm::Module* original_module, + absl::flat_hash_set kernels) { + // Clone into a new module, only keeping definitions of the relevant kernels. + auto should_clone_definition = [&kernels](const llvm::GlobalValue* gv) { + if (auto* func = llvm::dyn_cast(gv)) { + return kernels.contains(func->getName()); + } + return false; + }; + llvm::ValueToValueMapTy vmap; + std::unique_ptr module = + llvm::CloneModule(*original_module, vmap, should_clone_definition); + + // Erase the cloned symbols from the original module. 
+ for (const auto& kernel_name : kernels) { + llvm::Function* to_be_removed = original_module->getFunction(kernel_name); + if (to_be_removed == nullptr) { + return Internal("Cannot remove kernel %s: cannot be found in module %s", + kernel_name, original_module->getName()); + } + to_be_removed->eraseFromParent(); + } + return module; +} + +static void AddXlaBackendExtraOptionsAsModuleFlag( + llvm::Module* llvm_module, llvm::StringRef backend_extra_options) { + auto* options_mdstring = + llvm::MDString::get(llvm_module->getContext(), backend_extra_options); + llvm_module->addModuleFlag(llvm::Module::Error, "xla_backend_extra_options", + options_mdstring); +} + +absl::StatusOr> +CpuCompiler::CompileCpuExecutable(std::unique_ptr module) { + TraceMe trace([&] { + return TraceMeEncode("CpuCompiler::CompileCpuExecutable", + {{"name", module->name()}}); + }); + + ModuleHook pre_optimization_ir_hook; + ModuleHook post_optimization_ir_hook; + std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = + GetIRModuleHooks(*module, user_pre_optimization_hook_, + user_post_optimization_hook_); + + // Compile must be thread-safe so create a new LLVM context for the module. + mlir::MLIRContext mlir_context; + auto llvm_context = std::make_unique(); + auto llvm_module = + std::make_unique(kXlaModuleIdentifier, *llvm_context); + + const DebugOptions& debug_options = module->config().debug_options(); + + // We collect compiled object files (machine code) so we can export + // CpuExecutable to an AOT compilation result. + std::vector obj_files; + + // We split LLVM module and distribute it across separate DyLibs to enable + // parallel compilation at run time. + size_t parallel_codegen_split_count = + debug_options.xla_cpu_parallel_codegen_split_count(); + VlogMaxIsa(debug_options.xla_cpu_max_isa()); + + const HloModuleConfig& config = module->config(); + + // Options for compiling LLVM IR to machine code. 
+ IrCompiler::Options ir_compiler_options{ + /*optimization_level=*/IrCompiler::GetCodeGenOptLevel(config), + /*optimize_for_size=*/options::OptimizeForSizeRequested(config), + /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), + /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(config), + /*disable_expensive_passes=*/ + debug_options.xla_llvm_disable_expensive_passes(), + /*slp_vectorizer_disabled=*/options::SlpVectorizerDisabled(config), + /*disable_loop_unrolling=*/options::DisableLoopUnrolling(config), + }; + + // Compiler hooks to intercept compiled LLVM IR modules. + IrCompiler::CompilationHooks ir_compiler_hooks{ + pre_optimization_ir_hook, + post_optimization_ir_hook, + CreateOrcJITPostCompilationHook(module.get(), &obj_files), + }; + + // Definition generator to link with XLA:CPU host runtime symbols. + ExecutionEngine::DefinitionGenerator definition_generator = + [](const llvm::DataLayout& data_layout) { + return std::make_unique(data_layout); + }; + + // Options for orchestrating the JIT compilation process. 
+ JitCompiler::Options jit_compiler_options{ + /*num_dylibs=*/parallel_codegen_split_count, + /*definition_generator=*/std::move(definition_generator), + }; + + std::unique_ptr ir_compiler = IrCompiler::Create( + CompilerTargetOptions(module->config()), std::move(ir_compiler_options), + std::move(ir_compiler_hooks)); + + TF_ASSIGN_OR_RETURN( + JitCompiler jit_compiler, + JitCompiler::Create(std::move(jit_compiler_options), + std::move(ir_compiler), GetCompilationTaskRunner())); + + HloComputation* entry_computation = module->entry_computation(); + absl::flat_hash_map + instruction_to_profile_idx; + absl::flat_hash_map + computation_to_profile_idx; + std::unique_ptr hlo_profile_index_map; + std::unique_ptr hlo_profile_printer_data; + if (module->config().hlo_profiling_enabled()) { + TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( + *module, &instruction_to_profile_idx, &computation_to_profile_idx, + &hlo_profile_index_map, &hlo_profile_printer_data)); + } + + // Cache these flags here since we'll want to access them after the module's + // ownership is std::moved. + const bool embed_ir_in_executable = + debug_options.xla_embed_ir_in_executable(); + + TF_ASSIGN_OR_RETURN(HloSchedule schedule, CreateHloSchedule(*module)); + TF_RETURN_IF_ERROR(module->set_schedule(schedule)); + + TF_ASSIGN_OR_RETURN(std::unique_ptr assignment, + CreateBufferAssignment(*module)); + DumpHloModuleIfEnabled(*module, *assignment, + absl::StrCat("cpu_", kAfterOptimizationsDumpName)); + + // Dump computation proto state and buffer assignment for + // GetCompiledMemoryStats results. 
+ auto with_hlo_proto = [&](std::unique_ptr cpu_executable) { + auto hlo_proto = std::make_unique(); + *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); + *hlo_proto->mutable_buffer_assignment() = + cpu_executable->buffer_assignment().ToProto(); + StripPayloadFromLiteralProto(*hlo_proto); + cpu_executable->set_hlo_proto(std::move(hlo_proto)); + return cpu_executable; + }; + + TargetMachineFeatures target_machine_features(jit_compiler.target_machine()); + + // TODO(ezhulenev): Once we fully migrate to Thunks current IrEmitter should + // be renamed to NestedIrEmitter and be used only for emitting nested (aka + // thread local or embedded) computations (reductions, maps, etc.). + + // (Nested) IrEmitter is responsible for building LLVM module with functions + // for all HLO computations. In thunk execution mode we only build LLVM + // functions for embedded computations (e.g. reduction computations) and all + // high-level operations (fusions, elementwise, etc.) are lowered to kernel + // functions (which are also LLVM functions, but use a HostKernel ABI). + IrEmitter nested_ir_emitter( + &mlir_context, *module, *assignment, llvm_module.get(), + std::move(instruction_to_profile_idx), + std::move(computation_to_profile_idx), + ModuleComputationsTransitivelyContainCustomCall(*module), + &target_machine_features, +#ifdef MEMORY_SANITIZER + /*emit_code_for_msan=*/true +#else + /*emit_code_for_msan=*/false +#endif + ); + + // If we use Thunk runtime then instead of emitting LLVM function for the + // entry computation we emit a sequence of thunks that implement the + // computation as a sequence of interpreted commands. + if (module->config().debug_options().xla_cpu_use_thunk_runtime()) { + // The thunk runtime manages large constants, therefore we only emit + // small ones. 
+ TF_RETURN_IF_ERROR(nested_ir_emitter.EmitSmallConstantGlobals()); + + // IR emitter is responsible for building LLVM module with host kernels for + // corresponding HLO instructions (fusions, elemental instructions, etc.). + IrEmitter2 ir_emitter2(*module, llvm_module.get(), &nested_ir_emitter); + + // Thunk emitter is responsible for building a Thunk sequence that will + // resolved kernels in the compiled LLVM module and execute them together + // with Thunks implemented as library calls (e.g. oneDNN or Eigen). + ThunkEmitter thunk_emitter(ir_emitter2, *assignment, + target_machine_features, module->config()); + TF_ASSIGN_OR_RETURN(ThunkSequence thunks, + thunk_emitter.EmitEntryComputation(*module)); + + std::string ir_module_string; + if (embed_ir_in_executable) { + std::string emitter2_ir = llvm_ir::DumpToString(llvm_module.get()); + + auto thunk_kernel_fmt = [](std::string* out, + const ThunkEmitter::EmittedKernel& kernel) { + absl::StrAppend( + out, llvm_ir::DumpToString(kernel.module.getModuleUnlocked())); + }; + std::string thunks_ir = + absl::StrJoin(thunk_emitter.kernels(), "\n", thunk_kernel_fmt); + + ir_module_string = absl::StrCat(emitter2_ir, "\n", thunks_ir); + } + + TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); + for (const auto& [name, module] : thunk_emitter.kernels()) { + TF_RETURN_IF_ERROR(VerifyLlvmModule(*module.getModuleUnlocked())); + } + + // Some kernels have to be compiled separately because they have + // extra backend options. 
+ int num_extra_functions = 0; + using BackendOptions = llvm::StringRef; + using Kernel = llvm::StringRef; + absl::flat_hash_map> + backend_extra_options_to_kernels; + for (const auto& k : ir_emitter2.kernels()) { + if (k.backend_extra_options.empty()) continue; + auto [_, inserted] = + backend_extra_options_to_kernels[k.backend_extra_options].insert( + k.name); + CHECK(inserted) << "Kernel " << k.name << " is not unique"; + num_extra_functions++; + } + const int num_extra_parts = backend_extra_options_to_kernels.size(); + // We assign one dylib to each set of kernels that have the same extra + // backend options. We do this because we work under the assumption that + // very few kernels will set extra options, and if they do, the options are + // likely to be identical. + if (num_extra_parts >= parallel_codegen_split_count) { + return Internal( + "Too many extra compilation parts due to non-default options (%d). " + "Consider reducing this number or increasing " + "parallel_codegen_split_count (%d)", + num_extra_parts, parallel_codegen_split_count); + } + + // We define the number of module parts based on the total number of + // compiled functions (kernels and comparators) that are called from thunks, + // and the maximum number of parts that we want to split the module into. + size_t num_compiled_functions = ir_emitter2.kernels().size() + + ir_emitter2.comparators().size() + + thunk_emitter.kernels().size(); + size_t num_default_parts = + std::min(num_compiled_functions - num_extra_functions, + parallel_codegen_split_count - num_extra_parts); + + // JIT compile the LLVM IR module to in-memory machine code. We split the + // module into `num_jit_dylibs` parts to allow parallel compilation. In + // practice, all of the kernel functions are independent and don't call each + // other, so we can compile each individual part in parallel. 
We split + // module preserving locals, which should guarantee that all thread local + // computations end up in the same module with the corresponding kernel. + + // Collect all compiled symbols grouped by LLVM module part, so that we can + // issue compile tasks in parallel without any interference. + std::vector compiled_parts; + + VLOG(2) << "Compile LLVM module with " << ir_emitter2.kernels().size() + << " kernels and " << ir_emitter2.comparators().size() + << " comparators"; + + int dylib_index = 0; + auto add_jit_module = [&](std::unique_ptr llvm_module_part) { + // Collect symbols that are compiled in this LLVM module part. + RemoveUnusedSymbols(*llvm_module_part); + compiled_parts.push_back( + CollectCompiledSymbolsPart(ir_emitter2, *llvm_module_part)); + + std::string dump = llvm_ir::DumpToString(llvm_module_part.get()); + VLOG(5) << "Adding compilation module:\n" << dump; + + // Clone LLVM module part into its own thread safe context. + auto tsm = + CloneAsThreadSafeModule(dylib_index, std::move(llvm_module_part)); + TF_CHECK_OK(jit_compiler.AddModule(std::move(tsm), dylib_index++)); + }; + + // If there are extra parts, compile them first, since we must + // remove the affected kernels from the LLVM module. 
+ if (num_extra_parts > 0) { + TraceMe trace([&] { + return TraceMeEncode("CompileExtraKernels", + {{"num_extra_parts", num_extra_parts}}); + }); + for (const auto& [backend_extra_options, kernels] : + backend_extra_options_to_kernels) { + TF_ASSIGN_OR_RETURN( + std::unique_ptr new_module, + ExtractKernelsFromModule(llvm_module.get(), kernels)); + AddXlaBackendExtraOptionsAsModuleFlag(new_module.get(), + backend_extra_options); + add_jit_module(std::move(new_module)); + } + } + + if (HasLargeConstants(*llvm_module)) { + VLOG(3) << "Skip parallel compilation due to large constants"; + num_default_parts = 1; + } + + if (num_default_parts > 1) { + VLOG(3) << "Split LLVM module into " << num_default_parts + << " parts before codegen to enable parallel compilation" + << " (max split count: " << parallel_codegen_split_count << ")"; + + TraceMe trace([&] { + return TraceMeEncode("SplitModule", + {{"num_default_parts", num_default_parts}}); + }); + + llvm::SplitModule(*llvm_module, num_default_parts, add_jit_module, + /*PreserveLocals=*/true, /*RoundRobin=*/true); + // Free resources used by the original LLVM module. + llvm_module.reset(); + llvm_context.reset(); + + } else { + VLOG(3) << "Compile LLVM module without splitting (max split count: " + << parallel_codegen_split_count << ")"; + compiled_parts.push_back( + CollectCompiledSymbolsPart(ir_emitter2, *llvm_module)); + TF_CHECK_OK(jit_compiler.AddModule(llvm::orc::ThreadSafeModule( + std::move(llvm_module), std::move(llvm_context)))); + } + + // Collect compiled symbols from all LLVM module parts. + std::vector compiled_symbols; + + absl::flat_hash_map + symbol_type_id_to_function_type_id; + + VLOG(3) << "Adding " << thunk_emitter.kernels().size() + << " kernels to the JIT compiler"; + // Make sure we use all the "default" modules for maximum parallelism. + int num_default_so_far = dylib_index - num_extra_parts; + int kernel_dylib_index = + num_default_so_far < num_default_parts ? 
num_default_so_far : 0; + for (auto& [name, module] : thunk_emitter.kernels()) { + compiled_symbols.push_back( + FunctionLibrary::Sym(name)); + symbol_type_id_to_function_type_id.emplace( + compiled_symbols.back().type_id, SymbolProto::KERNEL); + TF_CHECK_OK(jit_compiler.AddModule(std::move(module), + num_extra_parts + kernel_dylib_index)); + // Simply roundrobin the default kernel dylibs + kernel_dylib_index = (kernel_dylib_index + 1) % num_default_parts; + } + + for (const CompiledSymbolsPart& part : compiled_parts) { + for (const IrEmitter2::KernelInfo& kernel : part.kernels) { + compiled_symbols.push_back( + FunctionLibrary::Sym(kernel.name)); + symbol_type_id_to_function_type_id.emplace( + compiled_symbols.back().type_id, SymbolProto::KERNEL); + } + for (const IrEmitter2::ComparatorInfo& comparator : part.comparators) { + compiled_symbols.push_back( + FunctionLibrary::Sym(comparator.name)); + symbol_type_id_to_function_type_id.emplace( + compiled_symbols.back().type_id, SymbolProto::COMPARATOR); + } + } + + VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols"; + + TraceMe trace_codegen([&] { + return TraceMeEncode( + "Codegen", {{"num_default_parts", num_default_parts}, + {"num_extra_parts", num_extra_parts}, + {"num_compiled_functions", num_compiled_functions}}); + }); + + TF_ASSIGN_OR_RETURN(std::unique_ptr function_library, + std::move(jit_compiler).Compile(compiled_symbols)); + + // Create constant allocations from the buffer assignment. + TF_ASSIGN_OR_RETURN(std::vector constants, + CreateConstantAllocations(*assignment)); + + TF_ASSIGN_OR_RETURN( + auto cpu_executable, + CpuExecutable::Create(std::move(function_library), + std::move(assignment), std::move(module), + std::move(thunks), std::move(constants), + std::move(hlo_profile_printer_data), + std::move(hlo_profile_index_map))); + + // Save object files to be able to export them to AOT compilation + // result. 
+ cpu_executable->set_obj_files(std::move(obj_files)); + + // Save compiled symbols to be able to export them to AOT compilation + // result. + cpu_executable->set_compiled_symbols(std::move(compiled_symbols)); + + // Save mapping between symbol type id and function type id to be able to + // export them to AOT compilation result. + cpu_executable->set_symbol_type_id_to_function_type_id( + symbol_type_id_to_function_type_id); + + if (embed_ir_in_executable) { + cpu_executable->set_ir_module_string(ir_module_string); + } + + return with_hlo_proto(std::move(cpu_executable)); + } + + TF_RETURN_IF_ERROR(nested_ir_emitter.EmitAllConstantGlobals()); + + // Each computation is a single function. Emit all embedded computations + // before the entry computation. The order of computations returned from + // SubcomputationEmissionOrder guarantees that a called computation occurs + // before a caller computation. + for (ComputationToEmit subcomputation : + SubcomputationEmissionOrder(entry_computation)) { + if (subcomputation.computation->IsFusionComputation()) { + continue; + } + TF_RETURN_IF_ERROR( + nested_ir_emitter + .EmitComputation( + subcomputation.computation, subcomputation.computation->name(), + /*is_top_level_computation=*/false, + schedule.sequence(subcomputation.computation).instructions(), + subcomputation.allow_reassociation) + .status()); + } + absl::string_view function_name_prefix = entry_computation->name().empty() + ? 
"__compute" + : entry_computation->name(); + TF_ASSIGN_OR_RETURN(llvm::Function * entry_function, + nested_ir_emitter.EmitComputation( + entry_computation, function_name_prefix, + /*is_top_level_computation=*/true, + schedule.sequence(entry_computation).instructions(), + /*allow_reassociation=*/false)); + + std::string ir_module_string; + if (embed_ir_in_executable) { + ir_module_string = llvm_ir::DumpToString(llvm_module.get()); + } + + TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); + + // Save entry function name before destroying LLVM module. + std::string entry_function_name = entry_function->getName().str(); + + // JIT compile the LLVM IR module to in-memory machine code. + llvm::orc::ThreadSafeModule thread_safe_module(std::move(llvm_module), + std::move(llvm_context)); + TF_RETURN_IF_ERROR(jit_compiler.AddModule(std::move(thread_safe_module))); + + using ComputeFn = std::remove_pointer_t; + TF_ASSIGN_OR_RETURN( + std::unique_ptr function_library, + std::move(jit_compiler) + .Compile({FunctionLibrary::Sym(entry_function_name)})); + + TF_ASSIGN_OR_RETURN( + auto cpu_executable, + CpuExecutable::Create(std::move(function_library), std::move(assignment), + std::move(module), entry_function_name, + std::move(hlo_profile_printer_data), + std::move(hlo_profile_index_map))); + + cpu_executable->set_obj_files(std::move(obj_files)); + + if (embed_ir_in_executable) { + cpu_executable->set_ir_module_string(ir_module_string); + } + + return with_hlo_proto(std::move(cpu_executable)); +} + +absl::StatusOr> CpuCompiler::RunBackend( + std::unique_ptr module, + [[maybe_unused]] se::StreamExecutor* stream_exec, + const CompileOptions& options) { + TraceMe trace([&] { + return TraceMeEncode("CpuCompiler::RunBackend", {{"name", module->name()}}); + }); + + VLOG(1) << "Compiling: " << module->name(); + RecordCpuCompilerStacktrace(); + XLA_SCOPED_LOGGING_TIMER( + absl::StrFormat("Compiling [%s] for CPU using JIT", module->name())); + std::string slow_compilation_msg = + 
absl::StrCat("Compiling module ", module->name()); + auto slow_compile_alarm = SlowCompilationAlarm(slow_compilation_msg); + auto llvm_options = llvm_ir::ExtractXlaBackendExtraOptions( + module->config().debug_options().xla_backend_extra_options()); + llvm_ir::LLVMCommandLineOptionsLock llvm_lock(llvm_options); + + std::unique_ptr cpu_executable; + TF_ASSIGN_OR_RETURN(cpu_executable, CompileCpuExecutable(std::move(module))); + + cpu_executable->set_debug_info( + cpu_executable->buffer_assignment().StatsString( + /*report_total_fragmentation=*/true)); + VLOG(1) << "Compilation finished"; + return std::unique_ptr(std::move(cpu_executable)); +} + +absl::StatusOr>> +CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, + const AotCompilationOptions& aot_options) { + TF_RET_CHECK(!module_group->empty()); + std::vector> modules = + module_group->ConsumeModules(); + + auto llvm_options = llvm_ir::ExtractXlaBackendExtraOptions( + modules[0]->config().debug_options().xla_backend_extra_options()); + VlogMaxIsa(modules[0]->config().debug_options().xla_cpu_max_isa()); + llvm_ir::LLVMCommandLineOptionsLock llvm_lock(llvm_options); + + // We can pass just one llvm::TargetOptions when we compile the LLVM module, + // so we bail if the configs have conflicting flags. At the moment, the only + // flags that need to be consistent are for fast-math. + for (const auto& fn_and_name : + {std::make_pair(&DebugOptions::xla_cpu_enable_fast_math, + "xla_cpu_enable_fast_math"), + std::make_pair(&DebugOptions::xla_cpu_fast_math_honor_infs, + "xla_cpu_fast_math_honor_infs"), + std::make_pair(&DebugOptions::xla_cpu_fast_math_honor_nans, + "xla_cpu_fast_math_honor_nans")}) { + // This only works because each of the method pointers above returns a + // bool. Otherwise we'd have to do some template magic. 
+ const auto& field_method_ptr = fn_and_name.first; + const auto& field_name = fn_and_name.second; + bool first_module_val = + (modules[0]->config().debug_options().*field_method_ptr)(); + for (int64_t i = 0; i < modules.size(); ++i) { + bool cur_module_val = + (modules[i]->config().debug_options().*field_method_ptr)(); + if (first_module_val != cur_module_val) { + return InvalidArgument( + "All HLO module configs must have the same value for %s, but " + "module 0 and %d have different values (%d vs %d).", + field_name, i, first_module_val, cur_module_val); + } + } + } + + if (aot_options.PlatformId() != se::host::kHostPlatformId) { + return InvalidArgument("Incompatible AOT compilation platform"); + } + const CpuAotCompilationOptions& options = + static_cast(aot_options); + llvm::Triple triple(llvm::Triple::normalize(options.triple())); + std::string error; + const llvm::Target* target = + llvm::TargetRegistry::lookupTarget(triple.getTriple(), error); + if (target == nullptr) { + return Internal("TargetRegistry::lookupTarget failed: %s", error); + } + + llvm::Reloc::Model reloc_model = llvm::Reloc::Static; + llvm::PICLevel::Level pic_level = llvm::PICLevel::NotPIC; + llvm::PIELevel::Level pie_level = llvm::PIELevel::Default; + switch (options.relocation_model()) { + case CpuAotCompilationOptions::RelocationModel::Static: + reloc_model = llvm::Reloc::Static; + pic_level = llvm::PICLevel::NotPIC; + pie_level = llvm::PIELevel::Default; + break; + case CpuAotCompilationOptions::RelocationModel::SmallPic: + reloc_model = llvm::Reloc::PIC_; + pic_level = llvm::PICLevel::SmallPIC; + pie_level = llvm::PIELevel::Default; + break; + case CpuAotCompilationOptions::RelocationModel::BigPic: + reloc_model = llvm::Reloc::PIC_; + pic_level = llvm::PICLevel::BigPIC; + pie_level = llvm::PIELevel::Default; + break; + case CpuAotCompilationOptions::RelocationModel::SmallPie: + reloc_model = llvm::Reloc::PIC_; + pic_level = llvm::PICLevel::SmallPIC; + pie_level = 
llvm::PIELevel::Small; + break; + case CpuAotCompilationOptions::RelocationModel::BigPie: + reloc_model = llvm::Reloc::PIC_; + pic_level = llvm::PICLevel::BigPIC; + pie_level = llvm::PIELevel::Large; + break; + } + llvm::CodeGenOptLevel opt_level = + IrCompiler::GetCodeGenOptLevel(modules[0]->config()); + llvm::TargetOptions target_options = + CompilerTargetOptions(modules[0]->config()); + auto target_machine_builder = [&]() { + return absl::WrapUnique(target->createTargetMachine( + triple.getTriple(), options.cpu_name(), options.features(), + target_options, reloc_model, std::nullopt, opt_level)); + }; + + std::unique_ptr target_machine = + target_machine_builder(); + + // Compile must be thread-safe so create a new LLVM context for the module. + mlir::MLIRContext mlir_context; + llvm::LLVMContext llvm_context; + + std::vector> results; + for (auto& hlo_module : modules) { + VLOG(1) << "Compiling ahead-of-time: " << hlo_module->name(); + if (hlo_module->has_schedule()) { + continue; + } + + TF_RETURN_IF_ERROR(RunHloPasses(hlo_module.get(), /*is_aot_compile=*/true, + target_machine.get(), + /*dummy*/ CompileOptions{})); + + if (hlo_module->config().debug_options().xla_cpu_use_thunk_runtime()) { + TF_ASSIGN_OR_RETURN(results.emplace_back(), + CompileAheadOfTimeThunks( + std::move(hlo_module), target_machine_builder, + options, triple, pic_level, pie_level)); + } else { + TF_ASSIGN_OR_RETURN(results.emplace_back(), + CompileAheadOfTimeLegacy( + std::move(hlo_module), target_machine_builder, + options, triple, pic_level, pie_level)); + } + } + + VLOG(1) << "Compilation finished"; + return std::move(results); +} + +absl::StatusOr> +CpuCompiler::CompileAheadOfTimeLegacy( + std::unique_ptr module, + IrCompiler::TargetMachineBuilder target_machine_builder, + const CpuAotCompilationOptions& aot_options, const llvm::Triple& triple, + const llvm::PICLevel::Level& pic_level, + const llvm::PIELevel::Level& pie_level) { + TF_ASSIGN_OR_RETURN(HloSchedule schedule, + 
ScheduleModule(module.get(), BufferSizeBytesFunction())); + + // Run buffer analysis on the HLO graph. This analysis figures out which + // temporary buffers are required to run the computation. + TF_ASSIGN_OR_RETURN( + std::unique_ptr assignment, + BufferAssigner::Run(module.get(), + std::make_unique(schedule), + BufferSizeBytesFunction(), memory_alignment, + /*allocate_buffers_for_constants=*/true)); + // BufferAssignment::ToString() includes a header, so no need for us to + // print one ourselves. + if (DumpingEnabledForHloModule(*module)) { + DumpToFileInDirOrStdout(*module, "", "buffer_assignment", + assignment->ToString()); + } + DumpHloModuleIfEnabled(*module, *assignment, + absl::StrCat("cpu_", kAfterOptimizationsDumpName)); + + absl::flat_hash_map + instruction_to_profile_idx; + absl::flat_hash_map + computation_to_profile_idx; + std::unique_ptr hlo_profile_index_map; + std::unique_ptr hlo_profile_printer_data; + + if (module->config().hlo_profiling_enabled()) { + TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( + *module, &instruction_to_profile_idx, &computation_to_profile_idx, + &hlo_profile_index_map, &hlo_profile_printer_data)); + } + + TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, + target_machine_builder()); + TargetMachineFeatures target_machine_features(target_machine.get()); + std::vector buffer_infos = + CreateBufferInfosFromBufferAssignment(*module, *assignment); + HloComputation* computation = module->entry_computation(); + + // Compile must be thread-safe so create a new LLVM context for the module. 
+ mlir::MLIRContext mlir_context; + auto llvm_context = std::make_unique(); + + // Set required information before emitting IR + auto llvm_module = + std::make_unique(kXlaModuleIdentifier, *llvm_context); + llvm_module->setDataLayout(target_machine->createDataLayout()); + llvm_module->setTargetTriple(triple); + if (pic_level != llvm::PICLevel::NotPIC) { + llvm_module->setPICLevel(pic_level); + } + if (pie_level != llvm::PIELevel::Default) { + llvm_module->setPIELevel(pie_level); + } + IrEmitter ir_emitter(&mlir_context, *module, *assignment, llvm_module.get(), + std::move(instruction_to_profile_idx), + std::move(computation_to_profile_idx), + ModuleComputationsTransitivelyContainCustomCall(*module), + &target_machine_features, + // TODO(b/66051036): Run full msan for AOT. + /*emit_code_for_msan=*/false); + + TF_RETURN_IF_ERROR(ir_emitter.EmitAllConstantGlobals()); + + for (ComputationToEmit subcomputation : + SubcomputationEmissionOrder(computation)) { + if (subcomputation.computation->IsFusionComputation()) { + continue; + } + TF_RETURN_IF_ERROR( + ir_emitter + .EmitComputation( + subcomputation.computation, subcomputation.computation->name(), + /*is_top_level_computation=*/false, + schedule.sequence(subcomputation.computation).instructions(), + subcomputation.allow_reassociation) + .status()); + } + const std::string& entry_point_name = aot_options.entry_point_name(); + TF_ASSIGN_OR_RETURN( + llvm::Function * entry_function, + ir_emitter.EmitComputation(computation, entry_point_name, + /*is_top_level_computation=*/true, + schedule.sequence(computation).instructions(), + /*allow_reassociation=*/false)); + + CHECK(entry_function->getName() == entry_point_name); + + ModuleHook pre_optimization_ir_hook; + ModuleHook post_optimization_ir_hook; + std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = + GetIRModuleHooks(*module, user_pre_optimization_hook_, + user_post_optimization_hook_); + + // Run the LLVM verifier over the unoptimized LLVM IR. 
If it fails, run + // the pre-optimization IR dump hook before returning. + { + absl::Status verify_status = VerifyLlvmModule(*llvm_module); + if (!verify_status.ok() && pre_optimization_ir_hook) { + pre_optimization_ir_hook(*llvm_module); + } + TF_RETURN_IF_ERROR(verify_status); + } + + auto post_codegen_hook = [&](const llvm::Module& llvm_module, + const llvm::object::ObjectFile& obj_file) { + if (!DumpingEnabledForHloModule(*module)) { + return; + } + DumpModuleToFile(llvm_module, obj_file, *module); + }; + + DebugOptions debug_options = module->config().debug_options(); + IrCompiler::Options ir_compiler_options = { + /*optimization_level=*/target_machine->getOptLevel(), + /*optimize_for_size=*/ + options::OptimizeForSizeRequested(module->config()), + /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), + /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(module->config()), + /*disable_expensive_passes=*/ + debug_options.xla_llvm_disable_expensive_passes(), + /*disable_slp_vectorizer=*/ + options::SlpVectorizerDisabled(module->config()), + /*disable_loop_unrolling=*/ + options::DisableLoopUnrolling(module->config()), + /*dfsan_enabled=*/aot_options.sanitize_dataflow(), + /*dfsan_abilists_enabled=*/aot_options.sanitize_abilists_dataflow()}; + + IrCompiler::CompilationHooks ir_compiler_hooks = { + pre_optimization_ir_hook, + post_optimization_ir_hook, + post_codegen_hook, + }; + + IrCompiler ir_compiler(std::move(target_machine_builder), + std::move(ir_compiler_options), + std::move(ir_compiler_hooks)); + + std::unique_ptr object_file = + cantFail(ir_compiler(*llvm_module)); + ObjectFileData object_file_data(object_file->getBufferStart(), + object_file->getBufferEnd()); + + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, + assignment->GetUniqueTopLevelOutputSlice()); + + return std::make_unique( + std::move(object_file_data), std::move(buffer_infos), + result_slice.index(), std::move(module), + std::move(hlo_profile_printer_data)); 
+} + +absl::StatusOr> +CpuCompiler::CompileAheadOfTimeThunks( + std::unique_ptr module, + IrCompiler::TargetMachineBuilder target_machine_builder, + const CpuAotCompilationOptions& aot_options, const llvm::Triple& triple, + const llvm::PICLevel::Level& pic_level, + const llvm::PIELevel::Level& pie_level) { + TraceMe trace([&] { + return TraceMeEncode("CpuCompiler::CompileAheadOfTimeThunks", + {{"name", module->name()}}); + }); + // Compile must be thread-safe so create a new LLVM context for the module. + mlir::MLIRContext mlir_context; + auto llvm_context = std::make_unique(); + + const DebugOptions& debug_options = module->config().debug_options(); + + TF_ASSIGN_OR_RETURN(HloSchedule schedule, CreateHloSchedule(*module)); + TF_RETURN_IF_ERROR(module->set_schedule(schedule)); + + TF_ASSIGN_OR_RETURN(std::unique_ptr assignment, + CreateBufferAssignment(*module)); + DumpHloModuleIfEnabled(*module, *assignment, + absl::StrCat("cpu_aot_", kAfterOptimizationsDumpName)); + + // TODO profiling related, probably delete this + absl::flat_hash_map + instruction_to_profile_idx; + absl::flat_hash_map + computation_to_profile_idx; + std::unique_ptr hlo_profile_index_map; + std::unique_ptr hlo_profile_printer_data; + if (module->config().hlo_profiling_enabled()) { + TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( + *module, &instruction_to_profile_idx, &computation_to_profile_idx, + &hlo_profile_index_map, &hlo_profile_printer_data)); + } + // probably delete this end + + TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, + target_machine_builder()); + TargetMachineFeatures target_machine_features(target_machine.get()); + + auto llvm_module = + std::make_unique(kXlaModuleIdentifier, *llvm_context); + + llvm_module->setDataLayout(target_machine->createDataLayout()); + llvm_module->setTargetTriple(triple); + if (pic_level != llvm::PICLevel::NotPIC) { + llvm_module->setPICLevel(pic_level); + } + if (pie_level != llvm::PIELevel::Default) { + llvm_module->setPIELevel(pie_level); + 
} + + // Emitting part + // TODO(ezhulenev): Once we fully migrate to Thunks current IrEmitter should + // be renamed to NestedIrEmitter and be used only for emitting nested (aka + // thread local or embedded) computations (reductions, maps, etc.). + + // (Nested) IrEmitter is responsible for building LLVM module with functions + // for all HLO computations. In thunk execution mode we only build LLVM + // functions for embedded computations (e.g. reduction computations) and all + // high-level operations (fusions, elementwise, etc.) are lowered to kernel + // functions (which are also LLVM functions, but use a HostKernel ABI). + IrEmitter nested_ir_emitter( + &mlir_context, *module, *assignment, llvm_module.get(), + std::move(instruction_to_profile_idx), + std::move(computation_to_profile_idx), + ModuleComputationsTransitivelyContainCustomCall(*module), + &target_machine_features, + // TODO(b/66051036): Run full msan for AOT. + /*emit_code_for_msan=*/false); + + // The thunk runtime manages large constants, therefore we only emit + // small ones. + TF_RETURN_IF_ERROR(nested_ir_emitter.EmitSmallConstantGlobals()); + + // IR emitter is responsible for building LLVM module with host kernels for + // corresponding HLO instructions (fusions, elemental instructions, etc.). + IrEmitter2 ir_emitter2(*module, llvm_module.get(), &nested_ir_emitter); + + // Thunk emitter is responsible for building a Thunk sequence that will + // resolved kernels in the compiled LLVM module and execute them together + // with Thunks implemented as library calls (e.g. oneDNN or Eigen). + ThunkEmitter thunk_emitter(ir_emitter2, *assignment, target_machine_features, + module->config()); + TF_ASSIGN_OR_RETURN(ThunkSequence thunks, + thunk_emitter.EmitEntryComputation(*module)); + + // Cache these flags here since we'll want to access them after the module's + // ownership is std::moved. 
+ const bool embed_ir_in_executable = + debug_options.xla_embed_ir_in_executable(); + + std::string ir_module_string; + if (embed_ir_in_executable) { + std::string emitter2_ir = llvm_ir::DumpToString(llvm_module.get()); + + auto thunk_kernel_fmt = [](std::string* out, + const ThunkEmitter::EmittedKernel& kernel) { + absl::StrAppend(out, + llvm_ir::DumpToString(kernel.module.getModuleUnlocked())); + }; + std::string thunks_ir = + absl::StrJoin(thunk_emitter.kernels(), "\n", thunk_kernel_fmt); + + ir_module_string = absl::StrCat(emitter2_ir, "\n", thunks_ir); + } + + TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); + for (const auto& [name, module] : thunk_emitter.kernels()) { + TF_RETURN_IF_ERROR(VerifyLlvmModule(*module.getModuleUnlocked())); + } + + // Compilation part + ModuleHook pre_optimization_ir_hook; + ModuleHook post_optimization_ir_hook; + std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = + GetIRModuleHooks(*module, user_pre_optimization_hook_, + user_post_optimization_hook_); + + std::vector obj_files; + auto post_codegen_hook = [&](const llvm::Module& llvm_module, + const llvm::object::ObjectFile& obj_file) { + obj_files.push_back(obj_file.getData().str()); + if (!DumpingEnabledForHloModule(*module)) { + return; + } + absl::string_view id = llvm_module.getModuleIdentifier(); + size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); + DumpToFileInDir( + *module, /*file_prefix=*/"", + /*file_suffix=*/absl::StrCat("obj-file.", id.substr(pos), ".o"), + absl::string_view(obj_file.getData().data(), + obj_file.getData().size())); + }; + + IrCompiler::Options ir_compiler_options = { + /*optimization_level=*/target_machine->getOptLevel(), + /*optimize_for_size=*/ + options::OptimizeForSizeRequested(module->config()), + /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), + /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(module->config()), + /*disable_expensive_passes=*/ + 
module->config().debug_options().xla_llvm_disable_expensive_passes(), + /*disable_slp_vectorizer=*/ + options::SlpVectorizerDisabled(module->config()), + /*disable_loop_unrolling=*/ + options::DisableLoopUnrolling(module->config()), + /*dfsan_enabled=*/aot_options.sanitize_dataflow(), + /*dfsan_abilists_enabled=*/aot_options.sanitize_abilists_dataflow()}; + + IrCompiler::CompilationHooks ir_compiler_hooks = { + pre_optimization_ir_hook, + post_optimization_ir_hook, + post_codegen_hook, + }; + + IrCompiler ir_compiler(std::move(target_machine_builder), + std::move(ir_compiler_options), + std::move(ir_compiler_hooks)); + + // For simplicity no parallel compilation is used. + std::vector compiled_parts; + compiled_parts.push_back( + CollectCompiledSymbolsPart(ir_emitter2, *llvm_module)); + + // Collect compiled symbols from all LLVM module parts. + std::vector compiled_symbols; + + absl::flat_hash_map + symbol_type_id_to_function_type_id; + + VLOG(3) << "Compiling " << thunk_emitter.kernels().size() + << " thunk kernels."; + + // We have to clone the LLVM module into a local context to be able to link + // it with the other modules. This enables us to have one object file for all + // the kernels. + auto copy_llvm_module_to_local_context = + [&llvm_context](llvm::Module& module) { + // There is no way to clone a module from one context to another, so we + // need to serialize the module to bitcode and parse it back into the + // new context. + llvm::SmallString<0> bc; + llvm::raw_svector_ostream bcos(bc); + llvm::WriteBitcodeToFile(module, bcos); + + // Parse module back into its own LLVM context. 
+ auto clone_module = llvm::parseBitcodeFile( + llvm::MemoryBufferRef(llvm::StringRef(bc.data(), bc.size()), + absl::StrFormat("%s_cloned_to_local_context", + kXlaModuleIdentifier)), + *llvm_context); + + return clone_module; + }; + + llvm::Linker linker(*llvm_module); + + for (auto& [name, module] : thunk_emitter.kernels()) { + compiled_symbols.push_back( + FunctionLibrary::Sym(name)); + symbol_type_id_to_function_type_id.emplace(compiled_symbols.back().type_id, + SymbolProto::KERNEL); + auto cloned_module = + copy_llvm_module_to_local_context(*module.getModuleUnlocked()); + if (!cloned_module) { + return Internal("Failed to clone LLVM module."); + } + // Match data layouts to avoid warning messages. + cloned_module->get()->setDataLayout(llvm_module->getDataLayout()); + linker.linkInModule(std::move(cloned_module.get())); + } + + cantFail(ir_compiler(*llvm_module)); + + for (const CompiledSymbolsPart& part : compiled_parts) { + for (const IrEmitter2::KernelInfo& kernel : part.kernels) { + compiled_symbols.push_back( + FunctionLibrary::Sym(kernel.name)); + symbol_type_id_to_function_type_id.emplace( + compiled_symbols.back().type_id, SymbolProto::KERNEL); + } + for (const IrEmitter2::ComparatorInfo& comparator : part.comparators) { + compiled_symbols.push_back( + FunctionLibrary::Sym(comparator.name)); + symbol_type_id_to_function_type_id.emplace( + compiled_symbols.back().type_id, SymbolProto::COMPARATOR); + } + } + + VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols"; + + // Create constant allocations from the buffer assignment. + TF_ASSIGN_OR_RETURN(std::vector constants, + CreateConstantAllocations(*assignment)); + + TF_ASSIGN_OR_RETURN( + auto cpu_executable, + CpuExecutable::Create( + /*function_library=*/nullptr, // NOTE: We don't need to generate a + // function library as the only purpose + // of this executable is to get + // exported. 
+ std::move(assignment), std::move(module), std::move(thunks), + std::move(constants), std::move(hlo_profile_printer_data), + std::move(hlo_profile_index_map))); + + // Save compiled symbols to be able to export them to AOT compilation + // result. + cpu_executable->set_compiled_symbols(std::move(compiled_symbols)); + + // Save mapping between symbol type id and function type id to be able to + // export them to AOT compilation result. + cpu_executable->set_symbol_type_id_to_function_type_id( + symbol_type_id_to_function_type_id); + + if (embed_ir_in_executable) { + cpu_executable->set_ir_module_string(ir_module_string); + } + + // Dump computation proto state and buffer assignment for + // GetCompiledMemoryStats results. + auto with_hlo_proto = [&](std::unique_ptr cpu_executable) { + auto hlo_proto = std::make_unique(); + *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); + *hlo_proto->mutable_buffer_assignment() = + cpu_executable->buffer_assignment().ToProto(); + StripPayloadFromLiteralProto(*hlo_proto); + cpu_executable->set_hlo_proto(std::move(hlo_proto)); + return cpu_executable; + }; + + cpu_executable = with_hlo_proto(std::move(cpu_executable)); + + const ThunkSequence& thunk_sequence = + cpu_executable->thunks().thunk_sequence(); + + std::unique_ptr executable_hlo_profile_printer_data = + cpu_executable->module().config().hlo_profiling_enabled() + ? 
std::make_unique( + cpu_executable->hlo_profile_printer_data()) + : nullptr; + + return CpuAotCompilationResultThunks::Create( + &cpu_executable->module(), &cpu_executable->buffer_assignment(), + cpu_executable->module_name(), std::move(obj_files), + cpu_executable->get_compiled_symbols_proto(), thunk_sequence, + std::move(*cpu_executable).consume_function_library().release(), + std::move(executable_hlo_profile_printer_data)); +} + +se::Platform::Id CpuCompiler::PlatformId() const { + return se::host::kHostPlatformId; +} + +HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const { + return CpuExecutable::ShapeSizeBytes; +} + +namespace { + +// TODO(basioli): This should be removed once new runtime is implemented, and +// CpuAotCompilationResult will be the only implementation of +// AotCompilationResult. This is still used as it allows us to `Export` and +// subsequently load both runtimes. + +// This is a result of exporting JIT compiled +// CpuExecutable to AOT compilation result that can be saved on disk and shipped +// over the wire. 
+class CpuExecutableAotCompilationResult : public AotCompilationResult { + public: + static absl::StatusOr> + Create(const HloModule* hlo_module, const BufferAssignment* buffer_assignment, + absl::string_view function_name, std::vector obj_files, + std::vector symbols, const ThunkSequence* thunks, + CompilationResultProto::ObjFileKind obj_file_kind) { + std::optional thunk_proto; + + if (thunks != nullptr) { + ThunkSequenceSerDesProtobuf thunk_sequence_serdes( + &buffer_assignment->Allocations()); + TF_ASSIGN_OR_RETURN(thunk_proto, thunk_sequence_serdes.ToProto(*thunks)); + } + + return absl::WrapUnique(new CpuExecutableAotCompilationResult( + hlo_module, buffer_assignment, function_name, std::move(obj_files), + std::move(symbols), thunk_proto, obj_file_kind)); + } + + absl::StatusOr SerializeAsString() const override { + return proto_.SerializeAsString(); + } + + static absl::StatusOr> + FromString(const std::string& serialized) { + CompilationResultProto proto; + if (!proto.ParseFromString(serialized)) { + return Internal( + "Failed to parse serialized CpuExecutableAotCompilationResult."); + } + + TF_ASSIGN_OR_RETURN( + std::unique_ptr module, + HloModule::CreateFromProtoWithConfig(proto.hlo_module())); + + return std::unique_ptr( + new CpuExecutableAotCompilationResult(proto, std::move(module))); + } + + absl::StatusOr> LoadExecutable( + Compiler* compiler, + const se::StreamExecutor* stream_exec) const&& override; + + const HloModule* optimized_module() const override { return module_.get(); } + + std::unique_ptr consume_optimized_module() override { + return std::move(module_); + } + + private: + CpuExecutableAotCompilationResult( + const HloModule* hlo_module, const BufferAssignment* buffer_assignment, + absl::string_view function_name, std::vector obj_files, + std::vector symbols, + const std::optional& thunks, + CompilationResultProto::ObjFileKind obj_file_kind) { + *proto_.mutable_hlo_module()->mutable_hlo_module() = hlo_module->ToProto(); + 
*proto_.mutable_hlo_module()->mutable_config() = + hlo_module->config().ToProto(); + *proto_.mutable_buffer_assignment() = buffer_assignment->ToProto(); + proto_.set_entry_function_name(std::string(function_name)); + for (std::string& obj_file : obj_files) { + proto_.add_obj_files(std::move(obj_file)); + } + + for (const auto& symbol : symbols) { + auto* symbol_proto = proto_.add_compiled_symbols(); + *symbol_proto = symbol; + } + proto_.set_obj_files_kind(obj_file_kind); + module_ = hlo_module->Clone(); + + if (thunks.has_value()) { + ThunkSequenceSerDesProtobuf thunk_sequence_serdes( + &buffer_assignment->Allocations()); + *proto_.mutable_thunk_sequence() = *thunks; + } + } + + explicit CpuExecutableAotCompilationResult(CompilationResultProto proto, + std::unique_ptr module) + : proto_(std::move(proto)), module_(std::move(module)) {} + + CompilationResultProto proto_; + std::unique_ptr module_; +}; + +} // namespace + +absl::StatusOr> +CpuExecutableAotCompilationResult::LoadExecutable( + Compiler* compiler, const se::StreamExecutor* stream_exec) const&& { + // Recreate HloModule from proto. + TF_ASSIGN_OR_RETURN( + std::unique_ptr module, + HloModule::CreateFromProtoWithConfig(proto_.hlo_module())); + + VLOG(2) << "Load XLA:CPU executable for module: " << module->name(); + + // Recreate BufferAssignment from proto. + TF_ASSIGN_OR_RETURN( + std::unique_ptr buffer_assignment, + BufferAssignment::FromProto(proto_.buffer_assignment(), module.get(), + compiler->BufferSizeBytesFunction(), + /*can_share_buffer=*/nullptr)); + + const DebugOptions& debug_options = module->config().debug_options(); + VlogMaxIsa(debug_options.xla_cpu_max_isa()); + const HloModuleConfig& config = module->config(); + + // Infer target machine from the current host CPU. 
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<llvm::TargetMachine> target_machine,
+      IrCompiler::InferTargetMachine(
+          std::move(CompilerTargetOptions(module->config())),
+          IrCompiler::GetCodeGenOptLevel(config),
+          CpuFeatureFromString(debug_options.xla_cpu_max_isa())));
+
+  // Definition generator to link with XLA:CPU host runtime symbols.
+  ExecutionEngine::DefinitionGenerator definition_generator =
+      [](const llvm::DataLayout& data_layout) {
+        return std::make_unique<RuntimeSymbolGenerator>(data_layout);
+      };
+
+  ObjectLoader object_loader(/*num_dylibs=*/1,
+                             target_machine->createDataLayout(),
+                             definition_generator);
+
+  for (size_t i = 0; i < object_loader.num_dylibs(); ++i) {
+    object_loader.dylib(i).value()->addGenerator(
+        std::make_unique<RuntimeSymbolGenerator>(
+            target_machine->createDataLayout()));
+  }
+
+  // We might have an XLA:CPU executable that has only runtime thunks and
+  // doesn't have any corresponding object files, and it's absolutely fine.
+  VLOG(2) << "Load XLA:CPU executable from " << proto_.obj_files_size()
+          << " object files; entry_function_name="
+          << proto_.entry_function_name();
+
+  size_t obj_file_index = 0;
+  for (auto& obj_file : proto_.obj_files()) {
+    llvm::StringRef data(obj_file.data(), obj_file.size());
+    TF_RETURN_IF_ERROR(
+        object_loader.AddObjFile(llvm::MemoryBuffer::getMemBuffer(
+            data, absl::StrCat(proto_.entry_function_name(), "_",
+                               obj_file_index++))));
+  }
+
+  std::unique_ptr<CpuExecutable> cpu_executable;
+
+  if (proto_.obj_files_kind() == CompilationResultProto::KERNELS) {
+    ThunkSequenceSerDesProtobuf thunk_sequence_serdes(
+        &buffer_assignment->Allocations());
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<ThunkSequence> thunks,
+        thunk_sequence_serdes.FromProto(proto_.thunk_sequence()));
+
+    VLOG(3) << "Loaded " << thunks->size() << " thunks.";
+
+    std::vector<FunctionLibrary::Symbol> compiled_symbols;
+
+    for (const auto& symbol_proto : proto_.compiled_symbols()) {
+      switch (symbol_proto.function_type_id()) {
+        case SymbolProto::KERNEL:
+          compiled_symbols.push_back(
+              FunctionLibrary::Sym<FunctionLibrary::Kernel>(
+                  symbol_proto.name()));
+          break;
+        case SymbolProto::COMPARATOR:
+          compiled_symbols.push_back(
+              FunctionLibrary::Sym<FunctionLibrary::Comparator>(
+                  symbol_proto.name()));
+          break;
+        default:
+          return Internal(
+              "Unknown function type id %s",
+              SymbolProto_FunctionTypeId_Name(symbol_proto.function_type_id()));
+      }
+    }
+
+    VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols";
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<FunctionLibrary> function_library,
+                        std::move(object_loader).Load(compiled_symbols));
+
+    // Create constant allocations from the buffer assignment.
+    TF_ASSIGN_OR_RETURN(std::vector<ConstantAllocation> constants,
+                        CreateConstantAllocations(*buffer_assignment));
+
+    TF_ASSIGN_OR_RETURN(
+        cpu_executable,
+        CpuExecutable::Create(std::move(function_library),
+                              std::move(buffer_assignment), std::move(module),
+                              std::move(*thunks), std::move(constants), nullptr,
+                              nullptr));
+
+  } else if (proto_.obj_files_kind() == CompilationResultProto::CLASSIC) {
+    // Create a "classic" CPU executable.
+    using ComputeFn = std::remove_pointer_t<CpuExecutable::ComputeFunctionType>;
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<FunctionLibrary> function_library,
+                        std::move(object_loader)
+                            .Load({FunctionLibrary::Sym<ComputeFn>(
+                                proto_.entry_function_name())}));
+
+    TF_ASSIGN_OR_RETURN(
+        cpu_executable,
+        CpuExecutable::Create(std::move(function_library),
+                              std::move(buffer_assignment), std::move(module),
+                              proto_.entry_function_name(), nullptr, nullptr));
+
+  } else {
+    return Internal("Unknown obj file kind");
+  }
+
+  // Dump computation proto state and buffer assignment for
+  // GetCompiledMemoryStats results.
+ auto hlo_proto = std::make_unique(); + *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); + *hlo_proto->mutable_buffer_assignment() = + cpu_executable->buffer_assignment().ToProto(); + cpu_executable->set_hlo_proto(std::move(hlo_proto)); + + return cpu_executable; +} + +absl::StatusOr> CpuCompiler::Export( + Executable* executable) const { + auto* cpu_executable = tensorflow::down_cast(executable); + if (!cpu_executable) + return Internal("Could not downcast Executable to CpuExecutable"); + + // Export object files for all dylibs. + std::vector obj_files; + for (const auto& obj_file : cpu_executable->obj_files()) { + obj_files.push_back(std::string(obj_file)); + } + + auto kind = cpu_executable->has_thunks() ? CompilationResultProto::KERNELS + : CompilationResultProto::CLASSIC; + const ThunkSequence* thunk_sequence = + cpu_executable->has_thunks() ? &cpu_executable->thunks().thunk_sequence() + : nullptr; + + std::vector compiled_symbols = + cpu_executable->get_compiled_symbols_proto(); + + return CpuExecutableAotCompilationResult::Create( + &cpu_executable->module(), &cpu_executable->buffer_assignment(), + cpu_executable->module_name(), std::move(obj_files), + std::move(compiled_symbols), thunk_sequence, kind); +} + +absl::StatusOr> +CpuCompiler::LoadAotCompilationResult( + const std::string& serialized_aot_result) { + return CpuExecutableAotCompilationResult::FromString(serialized_aot_result); +} + +absl::StatusOr CpuCompiler::CreateHloSchedule( + const HloModule& hlo_module) const { + // Select a memory scheduler optimized for concurrency vs minimal memory. + auto scheduler = + hlo_module.config() + .debug_options() + .xla_cpu_enable_concurrency_optimized_scheduler() + ? std::unique_ptr( + std::make_unique(BufferSizeBytesFunction())) + : std::make_unique(BufferSizeBytesFunction()); + + // Select an order for emitting the HLO instructions for each + // computation. 
Using this sequence enables tighter buffer liveness analysis + // and reduced memory usage (as compared to using `DependencyHloOrdering`). + return ScheduleModule(&hlo_module, *scheduler); +} + +absl::StatusOr> +CpuCompiler::CreateBufferAssignment(const HloModule& module) const { + // Run buffer allocation on the HLO graph. + return BufferAssigner::Run( + &module, std::make_unique(module.schedule()), + BufferSizeBytesFunction(), memory_alignment, + /*allocate_buffers_for_constants=*/true); +} + +} // namespace cpu +} // namespace xla diff --git a/third_party/xla/xla/service/cpu/cpu_runtime.cc b/third_party/xla/xla/service/cpu/cpu_runtime.cc index 7caf9c43b1119b..1f8e9291f84c32 100644 --- a/third_party/xla/xla/service/cpu/cpu_runtime.cc +++ b/third_party/xla/xla/service/cpu/cpu_runtime.cc @@ -197,6 +197,8 @@ extern const char* const kOneDnnMatMulReorderSymbolName = "__xla_cpu_runtime_OneDnnMatMulReorder"; extern const char* const kHandleFfiCallSymbolName = "__xla_cpu_runtime_HandleFfiCall"; +extern const char* const kXnnPackSoftMaxNDSymbolName = + "__xla_cpu_runtime_XnnPackSoftMaxND"; namespace { diff --git a/third_party/xla/xla/service/cpu/cpu_runtime.h b/third_party/xla/xla/service/cpu/cpu_runtime.h index 71e27ea600ee28..31c7f9d0d86ef5 100644 --- a/third_party/xla/xla/service/cpu/cpu_runtime.h +++ b/third_party/xla/xla/service/cpu/cpu_runtime.h @@ -97,6 +97,7 @@ extern const char* const kOneDnnLayerNormSymbolName; extern const char* const kOneDnnConvolutionSymbolName; extern const char* const kOneDnnMatMulReorderSymbolName; extern const char* const kHandleFfiCallSymbolName; +extern const char* const kXnnPackSoftMaxNDSymbolName; // All symbol names for XLA CPU runtime functions need to start with this // prefix. 
diff --git a/third_party/xla/xla/service/cpu/ir_emitter.cc b/third_party/xla/xla/service/cpu/ir_emitter.cc index feca6552d243f8..2bd5d7278b07c5 100644 --- a/third_party/xla/xla/service/cpu/ir_emitter.cc +++ b/third_party/xla/xla/service/cpu/ir_emitter.cc @@ -110,6 +110,9 @@ limitations under the License. #include "xla/util.h" #include "xla/xla_data.pb.h" +#include "xnnpack_ops.h" +#include "xnnpack_ops_rewriter.h" + #if defined(INTEL_MKL) #include "xla/service/cpu/onednn_memory_util.h" #endif @@ -2463,6 +2466,39 @@ absl::Status IrEmitter::HandleTopK(HloInstruction* hlo) { return absl::OkStatus(); } +absl::Status IrEmitter::HandleXnnPackSoftMax(HloInstruction* hlo) { + const HloInstruction* input = hlo->operand(0); + Shape shape = input->shape(); + + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + TF_RET_CHECK(input->shape().element_type() == F32); + TF_RET_CHECK(shape.dimensions().size() >= 2); + + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice input_values_slice, + assignment_.GetUniqueSlice(hlo->operand(0), {})); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_values_slice, + assignment_.GetUniqueSlice(hlo, {})); + + llvm::Value* values_ptr = EmitBufferPointer(input_values_slice, shape); + llvm::Value* out_values_ptr = EmitBufferPointer(out_values_slice, shape); + + // Flatten the batches into a single dimension. + int channels = shape.dimensions(shape.dimensions().size() - 1); + int batch_size = 1; + for (int i = 0; i < shape.dimensions().size() - 1; i++) + batch_size = batch_size * shape.dimensions(i); + + EmitCallToFunc(runtime::kXnnPackSoftMaxNDSymbolName, + {/*run_options=*/GetExecutableRunOptionsArgument(), + /*input*/ values_ptr, + /*output*/ out_values_ptr, + /*batch_size*/ b()->getInt64(batch_size), + /*channels*/ b()->getInt64(channels)}, + b()->getVoidTy()); + + return absl::OkStatus(); +} + #if defined(INTEL_MKL) // Emits operands alloca vector for oneDNN custom calls. 
@@ -2815,6 +2851,9 @@ absl::Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { if (custom_call->custom_call_target() == "TopK") { return HandleTopK(custom_call); } + if (custom_call->custom_call_target() == kCustomCallXnnPackSoftMax) { + return HandleXnnPackSoftMax(custom_call); + } #if defined(INTEL_MKL) if (custom_call->custom_call_target() == "__onednn$matmul") { return HandleOneDnnMatMulCalls(custom_call, diff --git a/third_party/xla/xla/service/cpu/ir_emitter.h b/third_party/xla/xla/service/cpu/ir_emitter.h index 40f54d2f4bff97..b3d47d41c6ca69 100644 --- a/third_party/xla/xla/service/cpu/ir_emitter.h +++ b/third_party/xla/xla/service/cpu/ir_emitter.h @@ -336,6 +336,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, absl::Status HandleTopK(HloInstruction* hlo) override; absl::Status HandleAllReduceSingleReplica(HloInstruction* crs); absl::Status HandleAllReduceMultipleReplica(HloInstruction* crs); + absl::Status HandleXnnPackSoftMax(HloInstruction* hlo); #if defined(INTEL_MKL) std::vector EmitOneDnnOperandsAlloca(HloInstruction* custom_call, llvm::Value*& args_val, diff --git a/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc b/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc index 87aca6c386751a..64e5970c8f04a4 100644 --- a/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc +++ b/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc @@ -56,6 +56,7 @@ limitations under the License. 
#include "xla/service/cpu/runtime_single_threaded_matmul.h" #include "xla/service/cpu/runtime_topk.h" #include "xla/service/cpu/windows_compatibility.h" +#include "xla/service/cpu/xnnpack_ops.h" #include "xla/service/custom_call_target_registry.h" #include "tsl/platform/logging.h" @@ -209,6 +210,7 @@ static bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(TracingStart); REGISTER_CPU_RUNTIME_SYMBOL(TracingEnd); REGISTER_CPU_RUNTIME_SYMBOL(HandleFfiCall); + REGISTER_CPU_RUNTIME_SYMBOL(XnnPackSoftMaxND); #if defined(INTEL_MKL) REGISTER_CPU_RUNTIME_SYMBOL(OneDnnMatMul); REGISTER_CPU_RUNTIME_SYMBOL(OneDnnSoftmax); diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops.cc b/third_party/xla/xla/service/cpu/xnnpack_ops.cc new file mode 100644 index 00000000000000..902086924f0fdf --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_ops.cc @@ -0,0 +1,76 @@ +/* Original Copyright: Copyright (c) Facebook, Inc. and its affiliates. +This source code is licensed under the BSD-style license found in the +LICENSE file in the root directory of this source tree. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#define XNN_LOG_LEVEL 4
+#include <iostream>
+#include "xnnpack.h"
+#include "absl/base/attributes.h"
+
+namespace xla {
+namespace cpu {
+
+extern "C" {
+ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_XnnPackSoftMaxND(
+    const void* run_options_ptr, void* in, void* out, int64_t batch_size,
+    int64_t channels) {
+  // NB: run_options_ptr is ignored.
+  float* input = (float*)in;
+  float* output = (float*)out;
+
+  xnn_status status = xnn_initialize(nullptr /* allocator */);
+  if (status != xnn_status_success) {
+    std::cout << "failed to initialize XNNPACK";
+    return;
+  }
+
+  xnn_operator_t softmax_op = nullptr;
+  status = xnn_create_softmax_nc_f32(0 /* flags */, &softmax_op);
+  if (status != xnn_status_success || softmax_op == nullptr) {
+    std::cout << "failed to create SoftMax operator\n";
+    return;
+  }
+
+  status = xnn_reshape_softmax_nc_f32(softmax_op, channels, /* channels */
+                                      channels /* input stride */,
+                                      channels /* output stride */, batch_size,
+                                      /*threadpool=*/nullptr);
+  if (status != xnn_status_success) {
+    std::cout << "failed to reshape SoftMax operator";
+    return;
+  }
+
+  status = xnn_setup_softmax_nc_f32(softmax_op, input, output);
+  if (status != xnn_status_success) {
+    std::cout << "failed to setup SoftMax operator";
+    return;
+  }
+
+  status = xnn_run_operator(softmax_op, /*threadpool=*/nullptr);
+  if (status != xnn_status_success) {
+    std::cout << "failed to run SoftMax operator";
+    return;
+  }
+
+  xnn_delete_operator(softmax_op);
+
+  xnn_deinitialize();
+}
+
+}  // extern "C"
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops.h b/third_party/xla/xla/service/cpu/xnnpack_ops.h
new file mode 100644
index 00000000000000..c3811f641a9f4c
--- /dev/null
+++ b/third_party/xla/xla/service/cpu/xnnpack_ops.h
@@ -0,0 +1,36 @@
+/* Referenced & Modified External Open Source Code:
+Source URL:
https://github.com/openxla/xla/pull/7540/files +Original Copyright: 2023 The TensorFlow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_CPU_XNNPACK_OPS_H_ +#define XLA_SERVICE_CPU_XNNPACK_OPS_H_ + +namespace xla { +namespace cpu { + +extern "C" { + +extern void __xla_cpu_runtime_XnnPackSoftMaxND(const void* run_options_ptr, + void* in, void* out, + int64_t batch_size, + int64_t channels); + +} // extern "C" + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_XNNPACK_OPS_H_ diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc new file mode 100644 index 00000000000000..a3a5f1827d0da8 --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc @@ -0,0 +1,228 @@ +/* +Referenced & Modified External Open Source Code: +Original Copyright: 2023 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "xnnpack_ops_rewriter.h" + +#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h" +#include "xla/literal_comparison.h" +#include "xla/literal_util.h" +#include "xnnpack_pattern_utils.h" +#include "xla/status_macros.h" + +namespace xla { +namespace cpu { + +extern const char* const kCustomCallXnnPackSoftMax = "__xnnpack$softmax"; + +namespace { +namespace m = match; +namespace pu = ::xla::cpu::xnnpack_pattern_utils_internal; + +bool IsNegInfConstScalar(const HloInstruction* const_instr) { + if (const_instr->opcode() != HloOpcode::kConstant) { + return false; + } + if (!ShapeUtil::IsEffectiveScalar(const_instr->shape())) { + return false; + } + auto value = LiteralUtil::GetFirstScalarLiteral(const_instr->literal()); + return literal_comparison::Equal( + value, LiteralUtil::MinValue(const_instr->shape().element_type())) + .ok(); +} + +bool IsMaxReducerComputation(const HloComputation* comp) { + if (comp->root_instruction()->opcode() != HloOpcode::kMaximum) { + return false; + } + auto max_instr = comp->root_instruction(); + const HloInstruction* p0 = comp->parameter_instruction(0); + const HloInstruction* p1 = comp->parameter_instruction(1); + const HloInstruction* max_p0 = max_instr->operand(0); + const HloInstruction* max_p1 = max_instr->operand(1); + return (max_p0 == p0 && max_p1 == p1) || (max_p1 == p0 && max_p0 == p1); +} + +// Pattern to match any of Maximum(Reduce_max(...), -inf) or Reduce_max(...). 
+auto MaxReduce(HloInstruction** instr) {
+  auto is_valid_reduce_max = [](const HloInstruction* reduce) {
+    HloComputation* reducer = reduce->to_apply();
+    return IsMaxReducerComputation(reducer) &&
+           (reduce->dimensions().size() == 1) &&
+           (reduce->operand(1)->opcode() == HloOpcode::kConstant) &&
+           IsNegInfConstScalar(reduce->operand(1));
+  };
+
+  return m::AnyOf<HloInstruction>(
+      m::Maximum().WithBinaryOperandsAnyOrder(
+          m::Reduce(instr).WithPredicate(is_valid_reduce_max).WithOneUse(),
+          pu::OptionalBroadcast(
+              m::Constant().WithPredicate(IsNegInfConstScalar))),
+      m::Reduce(instr).WithPredicate(is_valid_reduce_max).WithOneUse());
+}
+
+// Matches the softmax pattern with divide instruction as root node.
+// Here we pass 'instr' as root node and return the producer HloInstruction.
+// The axis on which softmax is applied is stored in 'axis'.
+std::optional<HloInstruction*> MatchSoftmax(HloInstruction* instr, int* axis) {
+  //
+  //      producer
+  //      |   \
+  //      |  reduce_max or max(reduce_max)
+  //      |     |
+  //      |  reshape
+  //      |     |
+  //      |  broadcast
+  //      |     |
+  //      |  reshape
+  //      |     |
+  //      |  broadcast
+  //      |   /
+  //      subtract
+  //      |
+  //      exponential
+  //      |   \
+  //      |  Convert(optional)
+  //      |     |
+  //      |  reduce_sum
+  //      |     |
+  //      |  Convert(optional)
+  //      |     |
+  //      |  reshape
+  //      |     |
+  //      |  Convert(optional)
+  //      |     |
+  //      |  broadcast
+  //      |     |
+  //      |  reshape
+  //      |     |
+  //      |  broadcast
+  //      |   /
+  //      divide  // (instr parameter)
+  //
+
+  // This matcher covers the most common SoftMax patterns we have encountered
+  // in real-life models.
+ HloInstruction* left_exponential; + HloInstruction* right_exponential; + HloInstruction* left_producer; + HloInstruction* reduce_sum; + HloInstruction* reduce_max; + HloInstruction* reduce_instr; + + // Lower diamond + if (!Match(instr, + m::Divide( + m::Exp(&left_exponential, m::Op()), + m::Broadcast(m::Reshape(m::Broadcast( + pu::OptionalConvert(m::Reshape(pu::OptionalConvert( + m::Reduce(&reduce_sum, + pu::OptionalConvert( + m::Exp(&right_exponential, m::Op())), + m::ConstantScalar(0)) + .WithPredicate([](const HloInstruction* reduce) { + HloComputation* reducer = reduce->to_apply(); + return (reducer->root_instruction()->opcode() == + HloOpcode::kAdd && + reduce->dimensions().size() == 1); + }) + .WithOneUse()))))))))) { + return std::nullopt; + } + + if (left_exponential != right_exponential || + left_exponential->user_count() != 2) { + return std::nullopt; + } + + // Upper diamond + if (!Match(left_exponential->mutable_operand(0), + m::Subtract(m::Op(&left_producer), + m::Broadcast(m::Reshape(m::Broadcast( + m::Reshape(m::Op(&reduce_instr))))) + .WithOneUse()) + .WithOneUse())) { + return std::nullopt; + } + + // Match the reduce max. 
+  if (!Match(reduce_instr, MaxReduce(&reduce_max))) {
+    return std::nullopt;
+  }
+
+  if (left_producer != reduce_max->operand(0) ||
+      left_producer->user_count() != 2) {
+    return std::nullopt;
+  }
+
+  if (reduce_sum->dimensions()[0] != reduce_max->dimensions()[0]) {
+    return std::nullopt;
+  }
+
+  *axis = reduce_sum->dimensions()[0];
+
+  return left_producer;
+}
+
+}  // namespace
+
+class XnnPackOpsRewriterVisitor : public DfsHloRewriteVisitor {
+ public:
+  absl::Status HandleDivide(HloInstruction* divide_instr) override {
+    if (divide_instr->HasControlDependencies()) {
+      return absl::OkStatus();
+    }
+    if (!pu::IsSupportedType(divide_instr->shape().element_type())) {
+      return absl::OkStatus();
+    }
+    int axis = -1;
+    std::optional<HloInstruction*> producer = MatchSoftmax(divide_instr, &axis);
+    if (producer == std::nullopt) {
+      return absl::OkStatus();
+    }
+
+    const Shape& output_shape = divide_instr->shape();
+    int softmax_dims = output_shape.dimensions().size();
+    if (softmax_dims < 2) {
+      XLA_VLOG_LINES(3, "Found SoftMax with " + std::to_string(softmax_dims) +
+                            " dims, which is not supported\n");
+      return absl::OkStatus();
+    }
+
+    HloInstruction* softmax_call =
+        divide_instr->AddInstruction(HloInstruction::CreateCustomCall(
+            output_shape, {producer.value()}, kCustomCallXnnPackSoftMax));
+    TF_RETURN_IF_ERROR(ReplaceInstruction(divide_instr, softmax_call));
+
+    return absl::OkStatus();
+  }
+};
+
+absl::StatusOr<bool> XnnPackOpsRewriter::Run(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  XLA_VLOG_LINES(3,
+                 "XnnPackOpsRewriter::Run(), before:\n" + module->ToString());
+  XnnPackOpsRewriterVisitor visitor;
+  TF_ASSIGN_OR_RETURN(auto result,
+                      visitor.RunOnModule(module, execution_threads));
+  XLA_VLOG_LINES(3, "XnnPackOpsRewriter::Run(), after:\n" + module->ToString());
+  return result;
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h new file
mode 100644 index 00000000000000..2bdc58965c96dc --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h @@ -0,0 +1,45 @@ +/* Referenced & Modified External Open Source Code: +Original Copyright: 2023 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_CPU_XNNPACK_OPS_REWRITER_H_ +#define XLA_SERVICE_CPU_XNNPACK_OPS_REWRITER_H_ + +#include + +#include "absl/algorithm/container.h" +#include "xla/hlo/ir/hlo_instructions.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/pass/hlo_pass_interface.h" + +namespace xla { +namespace cpu { + +extern const char* const kCustomCallXnnPackSoftMax; + +class XnnPackOpsRewriter : public HloModulePass { + public: + absl::string_view name() const override { return "xnnpack-ops-rewriter"; } + + using HloPassInterface::Run; + absl::StatusOr Run( + HloModule* module, + const absl::flat_hash_set& execution_threads) override; +}; + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_XNNPACK_OPS_REWRITER_H_ diff --git a/third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h b/third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h new file mode 100644 index 00000000000000..1ea52de3695def --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h @@ -0,0 +1,65 @@ +/* +Referenced & Modified External Open Source Code: +Original Copyright: 2024 The OpenXLA Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_CPU_XNNPACK_PATTERN_UTILS_H_
+#define XLA_SERVICE_CPU_XNNPACK_PATTERN_UTILS_H_
+
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/pattern_matcher.h"
+
+namespace xla {
+namespace cpu {
+
+namespace xnnpack_pattern_utils_internal {
+namespace m = match;
+
+template <typename Pattern>
+auto OptionalConvert(Pattern pattern) {
+  return m::AnyOf<HloInstruction>(m::Convert(pattern), std::move(pattern));
+}
+
+template <typename Pattern>
+auto OptionalBroadcast(Pattern pattern) {
+  return m::AnyOf<HloInstruction>(m::Broadcast(pattern), std::move(pattern));
+}
+
+// Simplified from upstream XLA.
+inline bool IsSupportedType(xla::PrimitiveType dtype) { return dtype == F32; }
+
+template <typename Pattern>
+inline auto SupportedConvert(Pattern pattern) {
+  auto supported_convert = [](const HloInstruction* instr) -> bool {
+    return IsSupportedType(instr->shape().element_type()) &&
+           IsSupportedType(instr->operand(0)->shape().element_type());
+  };
+  return m::Convert(pattern).WithPredicate(supported_convert);
+}
+
+template <typename Pattern>
+inline auto SupportedConvert(HloInstruction** convert, Pattern pattern) {
+  auto supported_convert = [](const HloInstruction* instr) -> bool {
+    return IsSupportedType(instr->shape().element_type()) &&
+           IsSupportedType(instr->operand(0)->shape().element_type());
+  };
+  return m::Convert(convert, pattern).WithPredicate(supported_convert);
+}
+}  // namespace xnnpack_pattern_utils_internal
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // XLA_SERVICE_CPU_XNNPACK_PATTERN_UTILS_H_
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index ca8ba0553bd56a..854eed7235720a 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -222,6 +222,8 @@ message DebugOptions {
   // When true, XLA:CPU uses XNNPACK to execute supported operations.
   bool xla_cpu_use_xnnpack = 359;
 
+  bool xla_cpu_enable_xnnpack = 389;
+
   // Enabling this will enable optimizations that ignore the possibility of NaN.
bool xla_enable_fast_math = 335; From 912c9afbf83ab83575ab68a25b1c833e025e358c Mon Sep 17 00:00:00 2001 From: Wen Di Date: Mon, 12 Jan 2026 17:33:37 +0800 Subject: [PATCH 2/3] add kernel selector --- tensorflow/workspace2.bzl | 2 + third_party/xla/third_party/openblas/BUILD | 0 .../xla/third_party/openblas/openblas.BUILD | 17 + .../xla/third_party/openblas/workspace.bzl | 10 + third_party/xla/workspace2.bzl | 2 + third_party/xla/xla/debug_options_flags.cc | 10 +- third_party/xla/xla/service/cpu/BUILD | 82 +- third_party/xla/xla/service/cpu/BUILD.orig | 35 + .../xla/xla/service/cpu/cpu_compiler.cc | 11 +- .../xla/xla/service/cpu/cpu_compiler.cc.orig | 8 + .../xla/xla/service/cpu/cpu_runtime.cc | 46 + third_party/xla/xla/service/cpu/cpu_runtime.h | 29 + third_party/xla/xla/service/cpu/ir_emitter.cc | 186 +- third_party/xla/xla/service/cpu/ir_emitter.h | 3 + .../xla/xla/service/cpu/kernel_selector.cc | 423 ++ .../xla/xla/service/cpu/kernel_selector.h | 191 + .../cpu/kernel_selector_ops_rewriter.cc | 658 +++ .../cpu/kernel_selector_ops_rewriter.h | 42 + .../service/cpu/runtime_symbol_generator.cc | 22 + .../xla/service/cpu/xnnpack_ops_rewriter.cc | 4 +- .../xla/service/cpu/xnnpack_ops_rewriter.h | 2 - third_party/xla/xla/service/libs/BUILD | 17 + .../xla/service/libs/libblas_mlir/Makefile | 52 + .../libs/libblas_mlir/include/MemrefHelpers.h | 10 + .../service/libs/libblas_mlir/include/cblas.h | 11 + .../kernels/sbatch_matmul_3d_nn_mlir.s | 4079 ++++++++++++++++ .../kernels/sbatch_matmul_3d_nt_mlir.s | 2987 ++++++++++++ .../kernels/sbatch_matmul_4d_nn_mlir.s | 4171 +++++++++++++++++ .../kernels/sbatch_matmul_4d_nt_mlir.s | 3208 +++++++++++++ .../kernels/sgemm_nn_alpha1_beta1_mlir.s | 4104 ++++++++++++++++ .../kernels/sgemv_n_alpha1_beta1_mlir.s | 709 +++ .../libblas_mlir/src/sbatch_matmul_3d.cpp | 46 + .../libblas_mlir/src/sbatch_matmul_4d.cpp | 49 + .../service/libs/libblas_mlir/src/sgemm.cpp | 43 + .../service/libs/libblas_mlir/src/sgemv.cpp | 43 + 
third_party/xla/xla/xla.proto | 3 +- 36 files changed, 21295 insertions(+), 20 deletions(-) create mode 100644 third_party/xla/third_party/openblas/BUILD create mode 100644 third_party/xla/third_party/openblas/openblas.BUILD create mode 100644 third_party/xla/third_party/openblas/workspace.bzl create mode 100644 third_party/xla/xla/service/cpu/kernel_selector.cc create mode 100644 third_party/xla/xla/service/cpu/kernel_selector.h create mode 100644 third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc create mode 100644 third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h create mode 100644 third_party/xla/xla/service/libs/BUILD create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/Makefile create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl index 85edecae8c67a6..2b1aa738a475cd 100644 --- a/tensorflow/workspace2.bzl +++ 
b/tensorflow/workspace2.bzl @@ -29,6 +29,7 @@ load("@local_xla//third_party/nvshmem:workspace.bzl", nvshmem = "repo") load("@local_xla//third_party/pybind11_abseil:workspace.bzl", pybind11_abseil = "repo") load("@local_xla//third_party/pybind11_bazel:workspace.bzl", pybind11_bazel = "repo") load("@local_xla//third_party/robin_map:workspace.bzl", robin_map = "repo") +load("@local_xla//third_party/openblas:workspace.bzl", openblas = "repo") load("@rules_jvm_external//:defs.bzl", "maven_install") load("@tf_runtime//:dependencies.bzl", "tfrt_dependencies") load("//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl", "def_file_filter_configure") @@ -100,6 +101,7 @@ def _initialize_third_party(): tensorrt() nvshmem() triton() + openblas() # copybara: tsl vendor diff --git a/third_party/xla/third_party/openblas/BUILD b/third_party/xla/third_party/openblas/BUILD new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/third_party/xla/third_party/openblas/openblas.BUILD b/third_party/xla/third_party/openblas/openblas.BUILD new file mode 100644 index 00000000000000..6d36eec9e6b0d7 --- /dev/null +++ b/third_party/xla/third_party/openblas/openblas.BUILD @@ -0,0 +1,17 @@ +genrule( + name = "build_openblas", + srcs = glob(["**"], exclude = ["*.a"]), + outs = ["libopenblas.a"], + cmd = """ + cd $$(dirname $(location //:README.md)) && \ + make NO_SHARED=1 ONLY_CBLAS=1 TARGET=ARMV8 ARCH=arm64 && \ + cd - && \ + cp $$(dirname $(location //:README.md))/libopenblas_*.a $@ + """, +) + +cc_import( + name = "openblas", + static_library = "libopenblas.a", + visibility = ["//visibility:public"], +) diff --git a/third_party/xla/third_party/openblas/workspace.bzl b/third_party/xla/third_party/openblas/workspace.bzl new file mode 100644 index 00000000000000..6728207dbfe58f --- /dev/null +++ b/third_party/xla/third_party/openblas/workspace.bzl @@ -0,0 +1,10 @@ +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") + +def repo(): + tf_http_archive( + name 
= "openblas",
+        strip_prefix = "OpenBLAS-8795fc7985635de1ecf674b87e2008a15097ffab",
+        sha256 = "38240eee1b29e2bde47ebb5d61160207dc68668a54cac62c076bb5032013b1eb",
+        urls = tf_mirror_urls("https://github.com/OpenMathLib/OpenBLAS/archive/8795fc7985635de1ecf674b87e2008a15097ffab.tar.gz"),
+        build_file = "//third_party/openblas:openblas.BUILD",
+    )
diff --git a/third_party/xla/workspace2.bzl b/third_party/xla/workspace2.bzl
index 345f1931c68e47..cc2013365b40c8 100644
--- a/third_party/xla/workspace2.bzl
+++ b/third_party/xla/workspace2.bzl
@@ -18,6 +18,7 @@ load("//third_party/shardy:workspace.bzl", shardy = "repo")
 load("//third_party/stablehlo:workspace.bzl", stablehlo = "repo")
 load("//third_party/triton:workspace.bzl", triton = "repo")
 load("//third_party/uv:workspace.bzl", uv = "repo")
+load("//third_party/openblas:workspace.bzl", openblas = "repo")
 
 def _initialize_third_party():
     """ Load third party repositories.  See above load() statements. """
@@ -31,6 +32,7 @@ def _initialize_third_party():
     stablehlo()
     triton()
     uv()
+    openblas()
 
 # Define all external repositories required by TensorFlow
 def _tf_repositories():
diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index 7ab70838950d98..7792ab22f7f929 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -100,10 +100,11 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
 #ifdef XLA_CPU_USE_ACL
   opts.set_xla_cpu_use_acl(true);
 #endif
-  opts.set_xla_cpu_use_fusion_emitters(true);
-  opts.set_xla_cpu_use_thunk_runtime(true);
+  opts.set_xla_cpu_use_fusion_emitters(false);
+  opts.set_xla_cpu_use_thunk_runtime(false);
   opts.set_xla_cpu_use_xnnpack(false);
   opts.set_xla_cpu_enable_xnnpack(false);  // For softmax
+  opts.set_xla_cpu_use_kernel_selector(false);
   opts.set_xla_cpu_experimental_xnn_graph_fusion_mode(
       DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED);
   opts.set_xla_cpu_parallel_codegen_split_count(32);
@@ -1000,6 +1001,11 @@ void
MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_cpu_enable_xnnpack), debug_options->xla_cpu_enable_xnnpack(), "Enable XNNPACK ops rewriter.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_use_kernel_selector", + bool_setter_for(&DebugOptions::set_xla_cpu_use_kernel_selector), + debug_options->xla_cpu_use_kernel_selector(), + "Replace dot operations with custom calls to BLAS libraries.")); flag_list->push_back(tsl::Flag( "xla_cpu_experimental_xnn_graph_fusion_mode", setter_for_xla_cpu_experimental_xnn_graph_fusion_mode, diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD index f951a6ac93b626..bc46d88d626fa0 100644 --- a/third_party/xla/xla/service/cpu/BUILD +++ b/third_party/xla/xla/service/cpu/BUILD @@ -89,6 +89,7 @@ filegroup( "runtime_matmul_f64.cc", "runtime_matmul_s32.cc", "runtime_fork_join.cc", + "kernel_selector.cc", "//xla/backends/cpu/runtime:runtime_srcs", #"runtime_handle_ffi_call.cc", # TODO(b/338344732): Add "runtime_handle_ffi_call.cc".
], @@ -118,6 +119,7 @@ filegroup( "runtime_fork_join.h", "runtime_lightweight_check.h", "runtime_matmul.h", + "kernel_selector.h", "//xla/backends/cpu/runtime:runtime_hdrs", #"runtime_handle_ffi_call.h", # TODO(b/338344732): Add "runtime_handle_ffi_call.h" ], @@ -195,7 +197,11 @@ cc_library( name = "cpu_compiler_pure", srcs = ["cpu_compiler.cc"], hdrs = ["cpu_compiler.h"], - copts = tsl_copts(), + copts = tsl_copts() + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), deps = [ ":buffer_info_util", ":conv_canonicalization", @@ -221,6 +227,7 @@ cc_library( ":thunk_emitter", ":xla_framework", ":xnnpack_ops_rewriter", + ":kernel_selector_ops_rewriter", "//xla:cpu_function_runtime", "//xla:debug_options_flags", "//xla:literal", @@ -420,7 +427,21 @@ cc_library( "@llvm-project//llvm:SystemZCodeGen", # fixdeps: keep ]) + if_llvm_x86_available([ "@llvm-project//llvm:X86CodeGen", # fixdeps: keep - ]), + ]) + select({ + ":enable_blas_mlir": [":libmlir"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), +) + +config_setting( + name = "enable_blas_mlir", + define_values = {"ENABLE_BLAS_MLIR": "true"}, +) + +config_setting( + name = "disable_blas_mlir", + define_values = {"ENABLE_BLAS_MLIR": "false"}, ) cc_library( @@ -595,7 +616,11 @@ cc_library( "windows_compatibility.h", ], hdrs = ["runtime_symbol_generator.h"], - copts = if_enable_acl(["-DXLA_CPU_USE_ACL=1"]) + tsl_copts(), + copts = if_enable_acl(["-DXLA_CPU_USE_ACL=1"]) + tsl_copts() + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), deps = [ ":cpu_runtime", ":onednn_convolution", @@ -621,6 +646,7 @@ cc_library( ":runtime_single_threaded_matmul", ":runtime_topk", ":xnnpack_ops", + ":kernel_selector", "//xla/service:custom_call_target_registry", "@com_google_absl//absl/functional:any_invocable", "@com_google_absl//absl/strings:string_view", @@ -842,8 +868,6 @@ 
cc_library( ":onednn_config_proto_cc", ":onednn_memory_util", ":parallel_loop_emitter", - ":xnnpack_ops_rewriter", - ":xnnpack_ops", "//xla:literal", "//xla:literal_util", "//xla:shape_util", @@ -1108,7 +1132,11 @@ cc_library( "cpu_runtime.h", "xfeed_manager.h", ], - copts = runtime_copts(), + copts = runtime_copts() + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), deps = [ ":cpu_executable_run_options", "//xla:executable_run_options", @@ -2201,6 +2229,7 @@ cc_library( "xnnpack_ops_rewriter.h", "xnnpack_pattern_utils.h", ], + copts = ["-O3"], visibility = ["//visibility:public"], deps = [ "//xla/hlo/ir:hlo", @@ -2216,9 +2245,50 @@ cc_library( name = "xnnpack_ops", srcs = ["xnnpack_ops.cc"], hdrs = ["xnnpack_ops.h"], + copts = ["-O3"], visibility = ["//visibility:public"], deps = [ "@XNNPACK", "@com_google_absl//absl/base", ], ) + +cc_library( + name = "kernel_selector", + srcs = ["kernel_selector.cc"], + hdrs = ["kernel_selector.h"], + copts = ["-O3"] + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:executable_run_options", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:blocking_counter", + "@openblas//:openblas", + ], +) + +cc_library( + name = "kernel_selector_ops_rewriter", + srcs = ["kernel_selector_ops_rewriter.cc"], + hdrs = ["kernel_selector_ops_rewriter.h"], + copts = ["-O3"], + visibility = ["//visibility:public"], + deps = [ + ":cpu_runtime", + "//xla/hlo/ir:hlo", + "//xla:literal_util", + "//xla/hlo/pass:hlo_pass", + ], +) + +cc_import( + name = "libmlir", + visibility = ["//visibility:public"], + shared_library = "//xla/service/libs:libblas_mlir.so", + system_provided = 0 +) diff --git a/third_party/xla/xla/service/cpu/BUILD.orig b/third_party/xla/xla/service/cpu/BUILD.orig index 90388079ca2fcf..f951a6ac93b626 
100644 --- a/third_party/xla/xla/service/cpu/BUILD.orig +++ b/third_party/xla/xla/service/cpu/BUILD.orig @@ -76,6 +76,7 @@ filegroup( "runtime_single_threaded_matmul_s32.cc", "runtime_single_threaded_matmul_u8.cc", "runtime_topk.cc", + "xnnpack_ops.cc", # Multi-threaded support. "runtime_conv2d.cc", "runtime_conv3d.cc", @@ -109,6 +110,7 @@ filegroup( "runtime_single_threaded_fft.h", "runtime_single_threaded_matmul.h", "runtime_topk.h", + "xnnpack_ops.h", # Multi-threaded support. "runtime_conv2d.h", "runtime_conv3d.h", @@ -218,6 +220,7 @@ cc_library( ":small_while_loop_hoisting_pass", ":thunk_emitter", ":xla_framework", + ":xnnpack_ops_rewriter", "//xla:cpu_function_runtime", "//xla:debug_options_flags", "//xla:literal", @@ -617,6 +620,7 @@ cc_library( ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", ":runtime_topk", + ":xnnpack_ops", "//xla/service:custom_call_target_registry", "@com_google_absl//absl/functional:any_invocable", "@com_google_absl//absl/strings:string_view", @@ -838,6 +842,8 @@ cc_library( ":onednn_config_proto_cc", ":onednn_memory_util", ":parallel_loop_emitter", + ":xnnpack_ops_rewriter", + ":xnnpack_ops", "//xla:literal", "//xla:literal_util", "//xla:shape_util", @@ -2187,3 +2193,32 @@ xla_cc_test( "@local_tsl//tsl/platform:test", ], ) + +cc_library( + name = "xnnpack_ops_rewriter", + srcs = ["xnnpack_ops_rewriter.cc"], + hdrs = [ + "xnnpack_ops_rewriter.h", + "xnnpack_pattern_utils.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//xla/hlo/ir:hlo", + "//xla:literal_comparison", + "//xla:literal_util", + "//xla:status_macros", + "//xla/hlo/pass:hlo_pass", + "//xla/service:pattern_matcher", + ], +) + +cc_library( + name = "xnnpack_ops", + srcs = ["xnnpack_ops.cc"], + hdrs = ["xnnpack_ops.h"], + visibility = ["//visibility:public"], + deps = [ + "@XNNPACK", + "@com_google_absl//absl/base", + ], +) diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc index 
4a1402c6934cba..c6d02568dfb9e4 100644 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc @@ -183,6 +183,8 @@ limitations under the License. #include "xla/service/cpu/runtime_symbol_generator.h" #include "xla/service/cpu/small_while_loop_hoisting_pass.h" #include "xla/service/cpu/thunk_emitter.h" +#include "xla/service/cpu/xnnpack_ops_rewriter.h" +#include "xla/service/cpu/kernel_selector_ops_rewriter.h" #include "xla/service/cpu_gpu_shape_verifier.h" #include "xla/service/dump.h" #include "xla/service/dynamic_dimension_inference.h" @@ -236,8 +238,6 @@ limitations under the License. #include "tsl/profiler/lib/traceme.h" #include "tsl/profiler/lib/traceme_encode.h" -#include "xnnpack_ops_rewriter.h" - #ifdef TF_LLVM_X86_AVAILABLE #include "llvm/TargetParser/X86TargetParser.h" #endif @@ -599,6 +599,13 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( if (enable_xnnpack) pipeline.AddPass(); + bool use_kernel_selector = + xla::GetDebugOptionsFromFlags().xla_cpu_use_kernel_selector(); + if (use_kernel_selector) { + // This pass rewrites hlo.dot into custom calls. + pipeline.AddPass(); + } + // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig b/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig index 9ba0085b24d372..4a1402c6934cba 100644 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig @@ -236,6 +236,8 @@ limitations under the License. #include "tsl/profiler/lib/traceme.h" #include "tsl/profiler/lib/traceme_encode.h" +#include "xnnpack_ops_rewriter.h" + #ifdef TF_LLVM_X86_AVAILABLE #include "llvm/TargetParser/X86TargetParser.h" #endif @@ -591,6 +593,12 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( }; pipeline.AddPass(upcaster_filter); + // For softmax, rewrite to custom calls with XNNPACK targets. 
+ bool enable_xnnpack = + xla::GetDebugOptionsFromFlags().xla_cpu_enable_xnnpack(); + if (enable_xnnpack) + pipeline.AddPass(); + // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); diff --git a/third_party/xla/xla/service/cpu/cpu_runtime.cc b/third_party/xla/xla/service/cpu/cpu_runtime.cc index 1f8e9291f84c32..5b66495798d800 100644 --- a/third_party/xla/xla/service/cpu/cpu_runtime.cc +++ b/third_party/xla/xla/service/cpu/cpu_runtime.cc @@ -199,6 +199,52 @@ extern const char* const kHandleFfiCallSymbolName = "__xla_cpu_runtime_HandleFfiCall"; extern const char* const kXnnPackSoftMaxNDSymbolName = "__xla_cpu_runtime_XnnPackSoftMaxND"; +extern const char* const kArgMax3DParallelSymbolName = + "__xla_cpu_runtime_ArgMax3DParallel"; +extern const char* const kArgMax3DSequentialSymbolName = + "__xla_cpu_runtime_ArgMax3DSequential"; +extern const char* const kKernelSelectorGEMVSymbolName = + "__xla_cpu_runtime_KernelSelectorGEMV"; +extern const char* const kKernelSelectorGEMMSequentialSymbolName = + "__xla_cpu_runtime_KernelSelectorGEMMSequential"; +extern const char* const kKernelSelectorGEMMParallelSymbolName = + "__xla_cpu_runtime_KernelSelectorGEMMParallel"; +extern const char* const kKernelSelectorBatch3DSequentialSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch3DSequential"; +extern const char* const kKernelSelectorBatch3DParallelSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch3DParallel"; +#ifdef ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMVMLIRSymbolName = + "__xla_cpu_runtime_KernelSelectorGEMVMLIR"; +#endif // ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorBatch4DSequentialSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch4DSequential"; +extern const char* const kKernelSelectorBatch4DParallelSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch4DParallel"; +#ifdef ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMMMLIRSymbolName = + 
"__xla_cpu_runtime_KernelSelectorGEMMMLIR"; +extern const char* const kKernelSelectorBatch3DMLIRSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch3DMLIR"; +extern const char* const kKernelSelectorBatch4DMLIRSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch4DMLIR"; +#endif // ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMVEmptySymbolName = + "__xla_cpu_runtime_KernelSelectorGEMVEmpty"; +extern const char* const kKernelSelectorGEMMEmptySymbolName = + "__xla_cpu_runtime_KernelSelectorGEMMEmpty"; +extern const char* const kKernelSelectorBatch3DEmptySymbolName = + "__xla_cpu_runtime_KernelSelectorBatch3DEmpty"; +extern const char* const kKernelSelectorBatch4DEmptySymbolName = + "__xla_cpu_runtime_KernelSelectorBatch4DEmpty"; +extern const char* const kArgMax3DEmptySymbolName = + "__xla_cpu_runtime_ArgMax3DEmpty"; +extern const char* const kKernelSelectorOperationGEMV = "GEMV"; +extern const char* const kKernelSelectorOperationGEMM = "GEMM"; +extern const char* const kKernelSelectorOperationBATCH3D = "BATCH3D"; +extern const char* const kKernelSelectorOperationBATCH4D = "BATCH4D"; +extern const char* const kKernelSelectorOperationARGMAX = "ARGMAX"; +extern const char* const kCustomCallKernelSelector = "KernelSelector"; namespace { diff --git a/third_party/xla/xla/service/cpu/cpu_runtime.h b/third_party/xla/xla/service/cpu/cpu_runtime.h index 31c7f9d0d86ef5..4469a468a2ff5c 100644 --- a/third_party/xla/xla/service/cpu/cpu_runtime.h +++ b/third_party/xla/xla/service/cpu/cpu_runtime.h @@ -98,6 +98,35 @@ extern const char* const kOneDnnConvolutionSymbolName; extern const char* const kOneDnnMatMulReorderSymbolName; extern const char* const kHandleFfiCallSymbolName; extern const char* const kXnnPackSoftMaxNDSymbolName; +extern const char* const kArgMax3DParallelSymbolName; +extern const char* const kArgMax3DSequentialSymbolName; +extern const char* const kKernelSelectorGEMVSymbolName; +extern const char* const kKernelSelectorGEMMSequentialSymbolName; 
+extern const char* const kKernelSelectorGEMMParallelSymbolName; +extern const char* const kKernelSelectorBatch3DSequentialSymbolName; +extern const char* const kKernelSelectorBatch3DParallelSymbolName; +extern const char* const kKernelSelectorBatch4DSequentialSymbolName; +extern const char* const kKernelSelectorBatch4DParallelSymbolName; +#ifdef ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMVMLIRSymbolName; +extern const char* const kKernelSelectorGEMMMLIRSymbolName; +extern const char* const kKernelSelectorBatch3DMLIRSymbolName; +extern const char* const kKernelSelectorBatch4DMLIRSymbolName; +#endif // ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMVEmptySymbolName; +extern const char* const kKernelSelectorGEMMEmptySymbolName; +extern const char* const kKernelSelectorBatch3DEmptySymbolName; +extern const char* const kKernelSelectorBatch4DEmptySymbolName; +extern const char* const kArgMax3DEmptySymbolName; + +// Kernel selector operation names. +extern const char* const kKernelSelectorOperationGEMV; +extern const char* const kKernelSelectorOperationGEMM; +extern const char* const kKernelSelectorOperationBATCH3D; +extern const char* const kKernelSelectorOperationBATCH4D; +extern const char* const kKernelSelectorOperationARGMAX; + +extern const char* const kCustomCallKernelSelector; // All symbol names for XLA CPU runtime functions need to start with this // prefix. diff --git a/third_party/xla/xla/service/cpu/ir_emitter.cc b/third_party/xla/xla/service/cpu/ir_emitter.cc index 2bd5d7278b07c5..f99308bcd6104f 100644 --- a/third_party/xla/xla/service/cpu/ir_emitter.cc +++ b/third_party/xla/xla/service/cpu/ir_emitter.cc @@ -110,9 +110,6 @@ limitations under the License. 
#include "xla/util.h" #include "xla/xla_data.pb.h" -#include "xnnpack_ops.h" -#include "xnnpack_ops_rewriter.h" - #if defined(INTEL_MKL) #include "xla/service/cpu/onednn_memory_util.h" #endif @@ -2499,6 +2496,184 @@ absl::Status IrEmitter::HandleXnnPackSoftMax(HloInstruction* hlo) { return absl::OkStatus(); } +absl::Status IrEmitter::HandleKernelSelectorArgMax(HloInstruction* hlo) { + OpMetadata metadata = hlo->metadata(); + + const HloInstruction* in1 = hlo->operand(0); + const HloInstruction* in2 = hlo->operand(1); + const HloInstruction* in3 = hlo->operand(2); + const HloInstruction* in4 = hlo->operand(3); + + Shape shape = in1->shape(); + TF_RET_CHECK(shape.dimensions().size() == 3); + + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice input1_slice, + assignment_.GetUniqueSlice(in1, {})); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice input2_slice, + assignment_.GetUniqueSlice(in2, {})); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_values_slice, + assignment_.GetUniqueSlice(hlo, {0})); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_indices_slice, + assignment_.GetUniqueSlice(hlo, {1})); + + llvm::Value* values1_ptr = EmitBufferPointer(input1_slice, in1->shape()); + llvm::Value* values2_ptr = EmitBufferPointer(input2_slice, in2->shape()); + llvm::Value* out_values_ptr = + EmitBufferPointer(out_values_slice, hlo->shape().tuple_shapes(0)); + llvm::Value* out_indices_ptr = + EmitBufferPointer(out_indices_slice, hlo->shape().tuple_shapes(1)); + + float cst1_val = in3->literal().Get({}); + llvm::Constant* cst1 = llvm::ConstantFP::get(b()->getFloatTy(), cst1_val); + + EmitCallToFunc( + metadata.op_name(), + {/*run_options=*/GetExecutableRunOptionsArgument(), + /*B*/ b()->getInt64(shape.dimensions(0)), + /*M*/ b()->getInt64(shape.dimensions(1)), + /*N*/ b()->getInt64(shape.dimensions(2)), + /*invals*/ BitCast(values1_ptr, b()->getInt32Ty()->getPointerTo()), + /*inidxs*/ BitCast(values2_ptr, 
b()->getInt32Ty()->getPointerTo()), + /*init_value*/ cst1, + /*init_idx*/ b()->getInt32(in4->literal().Get({})), + /*outvals*/ BitCast(out_values_ptr, b()->getFloatTy()->getPointerTo()), + /*outidxs*/ BitCast(out_indices_ptr, b()->getInt32Ty()->getPointerTo())}, + b()->getVoidTy()); + + llvm_ir::EmitTuple(GetIrArrayFor(hlo), {out_values_ptr, out_indices_ptr}, + b()); + return absl::OkStatus(); +} + +absl::Status IrEmitter::HandleKernelSelectorBlas(HloInstruction* custom_call) { + OpMetadata metadata = custom_call->metadata(); + + bool isGEMV = (metadata.op_type() == runtime::kKernelSelectorOperationGEMV); + bool isGEMM = (metadata.op_type() == runtime::kKernelSelectorOperationGEMM); + bool isBATCHMATMUL3D = + (metadata.op_type() == runtime::kKernelSelectorOperationBATCH3D); + bool isBATCHMATMUL4D = + (metadata.op_type() == runtime::kKernelSelectorOperationBATCH4D); + bool isBATCHMATMUL = isBATCHMATMUL3D | isBATCHMATMUL4D; + + int operand = 0; + std::vector arguments; + + // | arguments | + // | gemm | batch3d | batch4d | gemv | + // ----------------------------------------- + // | trA | trA | trA | trA | + // | trB | trB | trB | | + // | A | A | A | A | + // | B | B | B | X | + // | | | Q | | + // | | P | P | | + // | M | M | M | M | + // | N | N | N | N | + // | K | K | K | | + // | alpha | | | alpha | + // | beta | | | beta | + + arguments.push_back(/*run_options=*/GetExecutableRunOptionsArgument()); + + // trA + HloInstruction const* trA = custom_call->operand(operand++); + bool tranA = trA->literal().Get({}); + arguments.push_back(b()->getInt1(tranA)); + + if (isGEMM || isBATCHMATMUL) { + // trB + HloInstruction const* trB = custom_call->operand(operand++); + bool tranB = trB->literal().Get({}); + arguments.push_back(b()->getInt1(tranB)); + } + + // A + HloInstruction const* A = custom_call->operand(operand++); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice a_slice, + assignment_.GetUniqueSlice(A, {})); + llvm::Value* A_ptr = EmitBufferPointer(a_slice, 
A->shape()); + arguments.push_back(A_ptr); + + // B (or X in GEMV) + HloInstruction const* B = custom_call->operand(operand++); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice b_slice, + assignment_.GetUniqueSlice(B, {})); + llvm::Value* B_ptr = EmitBufferPointer(b_slice, B->shape()); + arguments.push_back(B_ptr); + + if (isBATCHMATMUL) { + // Q + if (isBATCHMATMUL4D) { + HloInstruction const* Q = custom_call->operand(operand++); + int q = Q->literal().Get({}); + arguments.push_back(b()->getInt32(q)); + } + + // P + HloInstruction const* P = custom_call->operand(operand++); + int p = P->literal().Get({}); + arguments.push_back(b()->getInt32(p)); + } + + // M + HloInstruction const* M = custom_call->operand(operand++); + int m = M->literal().Get({}); + arguments.push_back(b()->getInt32(m)); + + // N + HloInstruction const* N = custom_call->operand(operand++); + int n = N->literal().Get({}); + arguments.push_back(b()->getInt32(n)); + + if (isGEMM || isBATCHMATMUL) { + // K + HloInstruction const* K = custom_call->operand(operand++); + int k = K->literal().Get({}); + arguments.push_back(b()->getInt32(k)); + } + + float beta = 0.0; + if (isGEMM || isGEMV) { + // Alpha + HloInstruction const* Alpha = custom_call->operand(operand++); + float alpha = Alpha->literal().Get({}); + llvm::Constant* alphaConst = llvm::ConstantFP::get(b()->getFloatTy(), alpha); + arguments.push_back(alphaConst); + + // Beta + HloInstruction const* Beta = custom_call->operand(operand++); + beta = Beta->literal().Get({}); + llvm::Constant* betaConst = llvm::ConstantFP::get(b()->getFloatTy(), beta); + arguments.push_back(betaConst); + } + + // C (or Y in GEMV) + HloInstruction const* C = custom_call; + + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice c_slice, + assignment_.GetUniqueSlice(C, {})); + llvm::Value* C_ptr = EmitBufferPointer(c_slice, C->shape()); + arguments.push_back(C_ptr); + + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call)); + + EmitCallToFunc(metadata.op_name(), 
arguments, b()->getVoidTy()); + + return absl::OkStatus(); +} + +absl::Status IrEmitter::HandleKernelSelector(HloInstruction* custom_call) { + OpMetadata metadata = custom_call->metadata(); + + if (metadata.op_type() == runtime::kKernelSelectorOperationARGMAX) + return HandleKernelSelectorArgMax(custom_call); + else + return HandleKernelSelectorBlas(custom_call); +} + #if defined(INTEL_MKL) // Emits operands alloca vector for oneDNN custom calls. @@ -2851,9 +3026,12 @@ absl::Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { if (custom_call->custom_call_target() == "TopK") { return HandleTopK(custom_call); } - if (custom_call->custom_call_target() == kCustomCallXnnPackSoftMax) { + if (custom_call->custom_call_target() == "__xnnpack$softmax") { return HandleXnnPackSoftMax(custom_call); } + if (custom_call->custom_call_target() == runtime::kCustomCallKernelSelector) { + return HandleKernelSelector(custom_call); + } #if defined(INTEL_MKL) if (custom_call->custom_call_target() == "__onednn$matmul") { return HandleOneDnnMatMulCalls(custom_call, diff --git a/third_party/xla/xla/service/cpu/ir_emitter.h b/third_party/xla/xla/service/cpu/ir_emitter.h index b3d47d41c6ca69..9d668325d1618b 100644 --- a/third_party/xla/xla/service/cpu/ir_emitter.h +++ b/third_party/xla/xla/service/cpu/ir_emitter.h @@ -337,6 +337,9 @@ class IrEmitter : public DfsHloVisitorWithDefault, absl::Status HandleAllReduceSingleReplica(HloInstruction* crs); absl::Status HandleAllReduceMultipleReplica(HloInstruction* crs); absl::Status HandleXnnPackSoftMax(HloInstruction* hlo); + absl::Status HandleKernelSelector(HloInstruction* hlo); + absl::Status HandleKernelSelectorBlas(HloInstruction* hlo); + absl::Status HandleKernelSelectorArgMax(HloInstruction* hlo); #if defined(INTEL_MKL) std::vector EmitOneDnnOperandsAlloca(HloInstruction* custom_call, llvm::Value*& args_val, diff --git a/third_party/xla/xla/service/cpu/kernel_selector.cc b/third_party/xla/xla/service/cpu/kernel_selector.cc new 
file mode 100644 index 00000000000000..0ba46ab5989c44 --- /dev/null +++ b/third_party/xla/xla/service/cpu/kernel_selector.cc @@ -0,0 +1,423 @@ +/* Copyright 2025 Huawei. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "kernel_selector.h" + +#define EIGEN_USE_THREADS + +#include + +#include "tsl/platform/blocking_counter.h" +#include "unsupported/Eigen/CXX11/Tensor" +#include "xla/executable_run_options.h" +#include "xla/service/cpu/runtime_lightweight_check.h" + +namespace xla { +namespace cpu { + +// TODO: Need to test handling trA, trB +void __xla_cpu_runtime_KernelSelectorGEMMSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int M, int N, int K, float alpha, float beta, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? 
K : N; + int ldc = N; + + cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +// TODO: Need to test handling trA, trB +void __xla_cpu_runtime_KernelSelectorGEMMParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int M, int N, int K, float alpha, float beta, float* C) { + const xla::ExecutableRunOptions* run_options = + static_cast(run_options_ptr); + XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); + const Eigen::ThreadPoolDevice* thread_pool = + (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool()); + Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool(); + + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + float beta_v = beta; + if (beta == 0.0) { + beta_v = 1.0; + memset(C, 0.0, M * N * sizeof(float)); + } + + int njobs = eigen_interface_->NumThreads(); + + int sqrt_jobs = (int)sqrt(njobs); + + tsl::BlockingCounter bc(njobs); + + // TODO: Look at a more flexible way to distribute computation amongst + // threads. + for (int i = 0; i < sqrt_jobs; i++) { + for (int j = 0; j < sqrt_jobs; j++) { + int M_tile = M / sqrt_jobs; + int N_tile = N / sqrt_jobs; + + int M_start = i * M_tile; + int N_start = j * N_tile; + + int M_len = (i == sqrt_jobs - 1) ? (M - M_start) : M_tile; + int N_len = (j == sqrt_jobs - 1) ? 
(N - N_start) : N_tile; + + eigen_interface_->Schedule([=, &bc]() { + cblas_sgemm(Order, TransA, TransB, M_len, N_len, K, alpha, + &A[M_start * lda], lda, &B[N_start], ldb, beta_v, + &C[M_start * ldc + N_start], ldc); + bc.DecrementCount(); + }); + } + } + bc.Wait(); +} + +void __xla_cpu_runtime_KernelSelectorBatch3DSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int P, int M, int N, int K, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + float alpha = 1.0; + float beta = 0.0; + + for (int i = 0; i < P; ++i) { + cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, &A[i * M * K], lda, + &B[i * K * N], ldb, beta, &C[i * M * N], ldc); + } +} + +void __xla_cpu_runtime_KernelSelectorBatch3DParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int P, int M, int N, int K, float* C) { + const xla::ExecutableRunOptions* run_options = + static_cast(run_options_ptr); + XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); + const Eigen::ThreadPoolDevice* thread_pool = + (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool()); + Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool(); + + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + float alpha = 1.0; + float beta = 0.0; + + int njobs = eigen_interface_->NumThreads(); + + int num_batches = P; + + tsl::BlockingCounter bc(num_batches < njobs ? 
num_batches : njobs); + + // parallelize batches + int PB = (num_batches) / njobs; + int rem = (num_batches) % njobs; + + // TODO: Need to test handling trA + for (int batchIdx = 0, threadIdx = 0; batchIdx < num_batches; threadIdx++) { + int adjPB = PB + (threadIdx < rem ? 1 : 0); + + eigen_interface_->Schedule([=, &bc]() { + for (int i = 0; i < adjPB; i++) { + const float* AA = &A[(batchIdx + i) * M * K]; + const float* BB = &B[(batchIdx + i) * K * N]; + float* CC = &C[(batchIdx + i) * M * N]; + cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, AA, lda, BB, ldb, + beta, CC, ldc); + } + bc.DecrementCount(); + }); + + batchIdx += adjPB; + } + bc.Wait(); +} + +void __xla_cpu_runtime_KernelSelectorGEMV(const void* run_options_ptr, bool trA, + const float* A, const float* X, int M, + int N, float alpha, float beta, + float* Y) { + int lda = trA ? M : N; + int incX = 1; + int incY = 1; + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + cblas_sgemv(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY); +} + +#ifdef ENABLE_BLAS_MLIR +void __xla_cpu_runtime_KernelSelectorGEMMMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int M, int N, int K, float alpha, + float beta, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + float beta_v = beta; + if (beta == 0.0) { + beta_v = 1.0; + memset(C, 0.0, M * N * sizeof(float)); + } + + cblas_sgemm_mlir(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta_v, C, ldc); +} + +void __xla_cpu_runtime_KernelSelectorBatch3DMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int P, int M, int N, int K, + float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? 
CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + cblas_sbatch_matmul_mlir(Order, TransA, TransB, P, M, N, K, A, lda, B, ldb, C, + ldc); +} + +void __xla_cpu_runtime_KernelSelectorBatch4DMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int Q, int P, int M, int N, + int K, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + cblas_sbatch_matmul_4d_mlir(Order, TransA, TransB, Q, P, M, N, K, A, lda, B, + ldb, C, ldc); +} +#endif // ENABLE_BLAS_MLIR + +void __xla_cpu_runtime_KernelSelectorBatch4DSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, int N, int K, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + float alpha = 1.0; + float beta = 0.0; + + for (int i = 0; i < Q * P; ++i) { + cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, &A[i * M * K], lda, + &B[i * K * N], ldb, beta, &C[i * M * N], ldc); + } +} + +void __xla_cpu_runtime_KernelSelectorBatch4DParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, int N, int K, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? 
K : N; + int ldc = N; + + float alpha = 1.0; + float beta = 0.0; + + const xla::ExecutableRunOptions* run_options = + static_cast(run_options_ptr); + XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); + const Eigen::ThreadPoolDevice* thread_pool = + (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool()); + Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool(); + + int njobs = eigen_interface_->NumThreads(); + + int num_batches = P * Q; + + tsl::BlockingCounter bc(num_batches < njobs ? num_batches : njobs); + + // parallelize batches + int PB = (num_batches) / njobs; + int rem = (num_batches) % njobs; + + // TODO: Need to test handling trA + for (int batchIdx = 0, threadIdx = 0; batchIdx < num_batches; threadIdx++) { + int adjPB = PB + (threadIdx < rem ? 1 : 0); + + eigen_interface_->Schedule([=, &bc]() { + for (int i = 0; i < adjPB; i++) { + const float* AA = &A[(batchIdx + i) * M * K]; + const float* BB = &B[(batchIdx + i) * K * N]; + float* CC = &C[(batchIdx + i) * M * N]; + cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, AA, lda, BB, ldb, + beta, CC, ldc); + } + bc.DecrementCount(); + }); + + batchIdx += adjPB; + } + bc.Wait(); +} + +#ifdef ENABLE_BLAS_MLIR +void __xla_cpu_runtime_KernelSelectorGEMVMLIR(const void* run_options_ptr, + bool trA, const float* A, + const float* X, int M, int N, + float alpha, float beta, + float* Y) { + int lda = trA ? M : N; + int incX = 1; + int incY = 1; + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? 
CblasTrans : CblasNoTrans; + + cblas_sgemv_mlir(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY); +} +#endif // ENABLE_BLAS_MLIR + +void __xla_cpu_runtime_ArgMaxTask(size_t out_idx, int N, float* invals, + int32_t* inidxs, float init_value, + int32_t init_idx, float* outvals, + int32_t* outidxs) { + float maxval = init_value; + int32_t maxidx = init_idx; + size_t idx = (out_idx)*N; + + for (int i = 0; i < N; i++) { + float val = invals[idx]; + int32_t idx_val = inidxs[idx]; + + if (val >= maxval) { + maxval = val; + maxidx = idx_val; + } + + idx++; + } + + outvals[out_idx] = maxval; + outidxs[out_idx] = maxidx; +} + +void __xla_cpu_runtime_ArgMax3DParallel(const void* run_options_ptr, int B, + int M, int N, float* invals, + int32_t* inidxs, float init_value, + int32_t init_idx, float* outvals, + int32_t* outidxs) { + const xla::ExecutableRunOptions* run_options = + static_cast(run_options_ptr); + XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); + const Eigen::ThreadPoolDevice* thread_pool = + (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool()); + Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool(); + + int BM = B * M; + int num_threads = eigen_interface_->NumThreads(); + const int block_size = (BM + num_threads - 1) / num_threads; + tsl::BlockingCounter bc(num_threads); + + for (size_t t = 0; t < num_threads; t++) { + size_t start = t * block_size; + size_t end = std::min((t + 1) * block_size, BM); + + eigen_interface_->ScheduleWithHint( + [=, &bc]() { + for (size_t bm = start; bm < end; bm++) { + __xla_cpu_runtime_ArgMaxTask(bm, N, invals, inidxs, init_value, + init_idx, outvals, outidxs); + } + bc.DecrementCount(); + }, + t, t + 1); + } + + bc.Wait(); +} + +void __xla_cpu_runtime_ArgMax3DSequential(const void* run_options_ptr, int B, + int M, int N, float* invals, + int32_t* inidxs, float init_value, + int32_t init_idx, float* outvals, + int32_t* outidxs) { + // NB: run_options_ptr is ignored in the 
sequential version. + for (int b = 0; b < B; b++) { + for (int m = 0; m < M; m++) { + size_t out_idx = b * M + m; + __xla_cpu_runtime_ArgMaxTask(out_idx, N, invals, inidxs, init_value, + init_idx, outvals, outidxs); + } + } +} + +void __xla_cpu_runtime_ArgMax3DEmpty(const void* run_options_ptr, int B, int M, + int N, float* invals, int32_t* inidxs, + float init_value, int32_t init_idx, + float* outvals, int32_t* outidxs) {} + +void __xla_cpu_runtime_KernelSelectorGEMVEmpty(const void* run_options_ptr, + bool trA, const float* A, + const float* X, int M, int N, + float alpha, float beta, + float* Y) {} + +void __xla_cpu_runtime_KernelSelectorGEMMEmpty(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int m, int n, int k, float alpha, + float beta, float* C) {} + +void __xla_cpu_runtime_KernelSelectorBatch3DEmpty(const void* run_options_ptr, + bool trA, bool trB, + const float* A, + const float* B, int P, int M, + int N, int K, float* C) {} + +void __xla_cpu_runtime_KernelSelectorBatch4DEmpty( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, int N, int K, float* C) {} + +} // namespace cpu +} // namespace xla diff --git a/third_party/xla/xla/service/cpu/kernel_selector.h b/third_party/xla/xla/service/cpu/kernel_selector.h new file mode 100644 index 00000000000000..beb64d033f6b99 --- /dev/null +++ b/third_party/xla/xla/service/cpu/kernel_selector.h @@ -0,0 +1,191 @@ +/* Copyright 2025 Huawei. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef XLA_SERVICE_CPU_KERNEL_SELECTOR_H_ +#define XLA_SERVICE_CPU_KERNEL_SELECTOR_H_ +#include + +namespace xla { +namespace cpu { + +#ifndef OPENBLAS_CONST +#define OPENBLAS_CONST const +#endif + +typedef enum CBLAS_ORDER { + CblasRowMajor = 101, + CblasColMajor = 102 +} CBLAS_ORDER; + +typedef enum CBLAS_TRANSPOSE { + CblasNoTrans = 111, + CblasTrans = 112, + CblasConjTrans = 113, + CblasConjNoTrans = 114 +} CBLAS_TRANSPOSE; + +typedef int blasint; +typedef CBLAS_ORDER CBLAS_LAYOUT; + +extern "C" { + +// BLAS interface +extern void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, + OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, + OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, + OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, + OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, + OPENBLAS_CONST float* A, OPENBLAS_CONST blasint lda, + OPENBLAS_CONST float* B, OPENBLAS_CONST blasint ldb, + OPENBLAS_CONST float beta, float* C, + OPENBLAS_CONST blasint ldc); + +extern void cblas_sgemv(OPENBLAS_CONST enum CBLAS_ORDER order, + OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, + OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, + OPENBLAS_CONST float alpha, OPENBLAS_CONST float* a, + OPENBLAS_CONST blasint lda, OPENBLAS_CONST float* x, + OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, + float* y, OPENBLAS_CONST blasint incy); + +#ifdef ENABLE_BLAS_MLIR +// MLIR LIB +extern void cblas_sbatch_matmul_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const blasint P, const blasint M, + const blasint N, const blasint K, const float* A, const blasint lda, + const float* B, const blasint ldb, float* C, const blasint ldc); + +extern void cblas_sbatch_matmul_4d_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, 
+ const enum CBLAS_TRANSPOSE TransB, const blasint Q, const blasint P, + const blasint M, const blasint N, const blasint K, const float* A, + const blasint lda, const float* B, const blasint ldb, float* C, + const blasint ldc); + +extern void cblas_sgemm_mlir(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const blasint M, + const blasint N, const blasint K, + const float alpha, const float* A, + const blasint lda, const float* B, + const blasint ldb, const float beta, float* C, + const blasint ldc); + +extern void cblas_sgemv_mlir(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const blasint M, + const blasint N, const float alpha, const float* A, + const blasint lda, const float* X, + const blasint incX, const float beta, float* Y, + const blasint incY); +#endif // ENABLE_BLAS_MLIR +} // extern "C" + +void __xla_cpu_runtime_KernelSelectorGEMMSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int M, int N, int K, float alpha, float beta, float* C); + +void __xla_cpu_runtime_KernelSelectorGEMMParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int m, int n, int k, float alpha, float beta, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch3DSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int P, int M, int N, int K, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch3DParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int P, int M, int N, int K, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch4DSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, int N, int K, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch4DParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, 
int N, int K, float* C); + +void __xla_cpu_runtime_KernelSelectorGEMV(const void* run_options_ptr, bool trA, + const float* A, const float* X, int M, + int N, float alpha, float beta, + float* Y); + +#ifdef ENABLE_BLAS_MLIR +void __xla_cpu_runtime_KernelSelectorGEMMMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int m, int n, int k, float alpha, + float beta, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch3DMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int P, int M, int N, int K, + float* C); + +void __xla_cpu_runtime_KernelSelectorBatch4DMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int Q, int P, int M, int N, + int K, float* C); + +void __xla_cpu_runtime_KernelSelectorGEMVMLIR(const void* run_options_ptr, + bool trA, const float* A, + const float* X, int M, int N, + float alpha, float beta, + float* Y); +#endif // ENABLE_BLAS_MLIR + +void __xla_cpu_runtime_ArgMax3DParallel(const void* run_options_ptr, int B, + int M, int N, float* invals, + int32_t* inidxs, float init_value, + int32_t init_idx, float* outvals, + int32_t* outidxs); +void __xla_cpu_runtime_ArgMax3DSequential(const void* run_options_ptr, int B, + int M, int N, float* invals, + int32_t* inidxs, float init_value, + int32_t init_idx, float* outvals, + int32_t* outidxs); + +void __xla_cpu_runtime_ArgMax3DEmpty(const void* run_options_ptr, int B, int M, + int N, float* invals, int32_t* inidxs, + float init_value, int32_t init_idx, + float* outvals, int32_t* outidxs); + +void __xla_cpu_runtime_KernelSelectorGEMVEmpty(const void* run_options_ptr, + bool trA, const float* A, + const float* X, int M, int N, + float alpha, float beta, + float* Y); + +void __xla_cpu_runtime_KernelSelectorGEMMEmpty(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int m, int n, int k, float alpha, + float beta, float* C); + +void 
__xla_cpu_runtime_KernelSelectorBatch3DEmpty(const void* run_options_ptr, + bool trA, bool trB, + const float* A, + const float* B, int P, int M, + int N, int K, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch4DEmpty( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, int N, int K, float* C); + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_KERNEL_SELECTOR_H_ diff --git a/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc new file mode 100644 index 00000000000000..79868054c13ed1 --- /dev/null +++ b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc @@ -0,0 +1,658 @@ +/* Copyright 2025 Huawei. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "kernel_selector_ops_rewriter.h" + +#include +#include +#include +#include + +#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h" +#include "xla/hlo/ir/hlo_casting_utils.h" +#include "xla/literal_util.h" +#include "xla/service/cpu/cpu_runtime.h" + +namespace xla { +namespace cpu { + +// Uncomment to get printed information about the sizes and the call selected. 
+#define PRINT_DEBUG + +#ifdef PRINT_DEBUG +#include +#define DEBUG(x) std::cerr << x << "\n"; +#else +#define DEBUG(x) \ + do { \ + } while (0); +#endif + +enum Operation { NONE, GEMV, GEMM, BATCH_MATMUL_3D, BATCH_MATMUL_4D }; +enum KernelType { kGEMV, kGEMM, kBATCH3D, kBATCH4D, kARGMAX }; + +using Range = std::pair; +using RangeSet = std::vector; + +Range maxRange = {0, INT_MAX}; + +class IntervalMap { + using TypedRange = std::pair; + std::map m_map; + + public: + void insert(KernelType kTy, RangeSet& ranges, std::string& value) { + m_map[{kTy, ranges}] = value; + } + + bool lookup(KernelType kTy, std::vector& keys, std::string& outValue, + bool& fallback) const { + fallback = false; + for (const auto& entry : m_map) { + TypedRange typedRange = entry.first; + std::string value = entry.second; + if (typedRange.first != kTy) continue; + + const RangeSet& ranges = typedRange.second; + if (ranges.size() != keys.size()) continue; + + bool match = true; + for (size_t i = 0; i < ranges.size(); ++i) { + if (keys[i] < ranges[i].first || keys[i] > ranges[i].second) { + match = false; + break; + } + if (ranges[i] == maxRange) { + fallback = true; + } + } + + if (match) { + outValue = value; + return true; + } + } + return false; + } + + void print() const { + for (const auto& entry : m_map) { + TypedRange typedRange = entry.first; + std::string value = entry.second; + int kTy = typedRange.first; + const RangeSet& ranges = typedRange.second; + + DEBUG("[" << kTy << "]("); + for (const auto& range : ranges) { + DEBUG("[" << range.first << ":" << range.second << "] "); + } + DEBUG(") -> " << value << "\n"); + } + } + + void clear() { m_map.clear(); } +}; + +struct ParsedData { + std::string kernelName; + RangeSet sizes; + std::string functionName; + bool isValid; +}; + +std::map kernelStringToType = {{"gemv", kGEMV}, + {"gemm", kGEMM}, + {"batch3d", kBATCH3D}, + {"batch4d", kBATCH4D}, + {"argmax", kARGMAX}}; +std::map kernelTypeToString; // filled automatically. 
+ +std::map kernelTypeToSizeRank = { + {kGEMV, 2}, {kGEMM, 3}, {kARGMAX, 3}, {kBATCH3D, 4}, {kBATCH4D, 5}}; + +int parseInt(const std::string& str) { + if (str == "*") return maxRange.second; + + int size = std::stoi(str); + if (size < 0) { + LOG(ERROR) << "Found invalid size: " << size; + return -1; + } + + return size; +} + +Range parseRange(const std::string& str) { + size_t colonPos = str.find(':'); + + if (str == "*") { + return maxRange; + } + + // For non-range strings like "1" we create a range {1,1} + if (colonPos == std::string::npos) { + int value = parseInt(str); + return {value, value}; + } + + auto left = str.substr(0, colonPos); + auto right = str.substr(colonPos + 1); + + int start = parseInt(left); + int end = parseInt(right); + + assert(start <= end); + + return {start, end}; +} + +// Parses line from the mapping file which look like [kernel](size1,size2,...) +// -> symbol +ParsedData parseLine(std::string& line) { + // Remove all whitespace from the line first. + line.erase(std::remove_if(line.begin(), line.end(), ::isspace), line.end()); + // A range looks like 23:29 or 12:* + std::string range = R"(\d+:(?:\d+|\*))"; + // An element is either a number, a *, or a range + std::string element = R"((?:\d+|\*|)" + range + R"())"; + // Sizes is a list of elements in parentheses + std::string sizes = R"(\(((?:)" + element + R"(,)*)" + element + R"()\))"; + std::regex pattern(R"(^\[(.+)\])" + sizes + R"(->(.+))"); + + std::smatch matches; + + ParsedData data; + data.isValid = false; + + if (std::regex_match(line, matches, pattern)) { + data.kernelName = matches[1]; + std::stringstream ss(matches[2]); + std::string token; + + while (std::getline(ss, token, ',')) { + auto range = parseRange(token); + if (range.first == -1 || range.second == -1) return data; + data.sizes.push_back(range); + } + data.functionName = matches[3]; + data.isValid = true; + } else { + XLA_VLOG_LINES(3, "KernelSelectorOpsRewriter::parseLine() : No match.\n"); + } + + return data; 
+} + +IntervalMap sizesToSymbol; + +const char* kernel_map_file = std::getenv("KERNEL_MAP_FILE"); + +void fill_map_from_file(const char* map_file, IntervalMap& map) { + if (!map_file) { + XLA_VLOG_LINES(3, "NO MAP FILE\n"); + return; + } + + std::ifstream file(map_file); + if (!file.is_open()) { + std::string file_name(map_file); + XLA_VLOG_LINES(3, + "KernelSelectorOpsRewriter::fill_map_from_file() : Cannot " + "open file. \n"); + return; + } + + // Clear the map to prevent conflicts and unexpected + // behaviour due to default pre-filled values. + map.clear(); + + std::string line; + int lineno = 1; + while (std::getline(file, line)) { + // If the file we are reading has Windows line endings, make sure + // we remove the `\r` before processing the regex, otherwise it will + // not match. + if (!line.empty() && line.back() == '\r') { + line.pop_back(); + } + + ParsedData data = parseLine(line); + if (!data.isValid) { + LOG(ERROR) << "Regex did not match on line " << lineno; + } else { + if (kernelStringToType.find(data.kernelName) == + kernelStringToType.end()) { + LOG(ERROR) << data.kernelName << " is not a valid kernel type"; + return; + } + + KernelType kTy = kernelStringToType[data.kernelName]; + int expectedRank = kernelTypeToSizeRank[kTy]; + + // Fallback case (i.e. 
lines like [gemm](*) -> symbol): store in the map + // the correct amount of "infinite" ranges: + if (data.sizes.size() == 1 && data.sizes[0] == maxRange) { + data.sizes.assign(expectedRank, maxRange); + } + + if (data.sizes.size() != expectedRank) { + LOG(ERROR) << data.kernelName + << " expected to have an input size of rank " << expectedRank + << ", but got " << data.sizes.size() << "(line " << lineno + << ")"; + } else { + map.insert(kTy, data.sizes, data.functionName); + } + } + lineno++; + } + + return; +} + +class KernelSelectorOpsRewriterVisitor : public DfsHloRewriteVisitor { + private: + void printDebugMessage(KernelType kTy, std::vector sizes) { + std::string debug_msg = "{"; + for (size_t i = 0; i < sizes.size(); ++i) { + debug_msg += std::to_string(sizes[i]); + if (i != sizes.size() - 1) { + debug_msg += ", "; + } + } + debug_msg += + "} -> Is not on the map and a fallback was not specified. The " + + kernelTypeToString[kTy] + " will not be replaced."; + + DEBUG(debug_msg); + } + + std::string GetKernelSelectorFunction(KernelType kTy, std::vector sizes, + bool& fallback) { + std::string fun_name; + bool found = sizesToSymbol.lookup(kTy, sizes, fun_name, fallback); + fallback = false; + + if (!found) { +#ifdef PRINT_DEBUG + printDebugMessage(kTy, sizes); +#endif + } + return fun_name; + } + + Operation getOperation(HloInstruction* instr) { + if (auto* dot = DynCast(instr)) { + auto batch_dims = dot->dot_dimension_numbers().lhs_batch_dimensions(); + auto dims = dot->shape().dimensions(); + if (batch_dims.size() == 1) { + return Operation::BATCH_MATMUL_3D; + } + if (batch_dims.size() == 2) { + return Operation::BATCH_MATMUL_4D; + } + if (dims.size() == 1) { + return Operation::GEMV; + } + if (batch_dims.empty()) { + return Operation::GEMM; + } + } + return Operation::NONE; + } + + template + HloInstruction* makeConstant(HloInstruction* op, T value) { + auto litteral = LiteralUtil::CreateR0(value); + return op->AddInstruction( + 
HloInstruction::CreateConstant(std::move(litteral))); + } + +#ifdef PRINT_DEBUG + std::map, std::string> AllocatedGemmSizes; + std::map, std::string> AllocatedGemvSizes; + std::map, std::string> AllocatedBatchMatmul3DSizes; + std::map, std::string> AllocatedBatchMatmul4DSizes; + std::map, std::string> AllocatedArgMax3DSizes; +#endif + + public: + absl::Status HandleDot(HloInstruction* dot) override { + Operation operation = getOperation(dot); + if (operation == Operation::NONE) { + return absl::OkStatus(); + } + bool fallbackSelected; + + // Collect all the operands for the CustomCall + switch (operation) { + case GEMM: { + KernelType kTy = kGEMM; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + auto rhs_contracting_dims = dnums.rhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + assert(rhs_contracting_dims.size() == 1); + + HloInstruction* trA = makeConstant(dot, lhs_contracting_dims[0] == 0); + HloInstruction* trB = makeConstant(dot, rhs_contracting_dims[0] == 1); + + HloInstruction* alpha = makeConstant(dot, (float)1.0); + HloInstruction* beta = makeConstant(dot, (float)0.0); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* B = dot->operands()[1]; + + int m = dot->shape().dimensions(0); + HloInstruction* M = makeConstant(dot, m); + + int n = dot->shape().dimensions(1); + HloInstruction* N = makeConstant(dot, n); + + int k = A->shape().dimensions(lhs_contracting_dims[0]); + HloInstruction* K = makeConstant(dot, k); + + std::string fun_name = + GetKernelSelectorFunction(kTy, {m, n, k}, fallbackSelected); + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedGemmSizes.find({m, n, k}) == AllocatedGemmSizes.end()) { + AllocatedGemmSizes[{m, n, k}] = fun_name; + DEBUG("{m: " << m << ", n: " << n << ", k: " << k << "} -> " + << fun_name << (fallbackSelected ? 
" (fallback)" : "")); + } +#endif + + std::vector operands = {trA, trB, A, B, M, + N, K, alpha, beta}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationGEMM); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + case GEMV: { + KernelType kTy = kGEMV; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + + bool is_trA = lhs_contracting_dims[0] == 0; + HloInstruction* trA = makeConstant(dot, is_trA); + + HloInstruction* alpha = makeConstant(dot, (float)1.0); + HloInstruction* beta = makeConstant(dot, (float)0.0); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* X = dot->operands()[1]; + + int m = A->shape().dimensions(is_trA ? 1 : 0); + HloInstruction* M = makeConstant(dot, m); + + int n = A->shape().dimensions(is_trA ? 0 : 1); + HloInstruction* N = makeConstant(dot, n); + + std::string fun_name = + GetKernelSelectorFunction(kTy, {m, n}, fallbackSelected); + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedGemvSizes.find({m, n}) == AllocatedGemvSizes.end()) { + AllocatedGemvSizes[{m, n}] = fun_name; + DEBUG("{m: " << m << ", n: " << n << "} -> " << fun_name + << (fallbackSelected ? 
" (fallback)" : "")); + } +#endif + + std::vector operands = {trA, A, X, M, N, alpha, beta}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationGEMV); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + case BATCH_MATMUL_3D: { + KernelType kTy = kBATCH3D; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + auto rhs_contracting_dims = dnums.rhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + assert(rhs_contracting_dims.size() == 1); + + HloInstruction* trA = makeConstant(dot, lhs_contracting_dims[0] == 1); + HloInstruction* trB = makeConstant(dot, rhs_contracting_dims[0] == 2); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* B = dot->operands()[1]; + + int p = dot->shape().dimensions(0); + HloInstruction* P = makeConstant(dot, p); + + int num_batch_dims = dnums.lhs_batch_dimensions_size(); + + int m = dot->shape().dimensions(num_batch_dims); + HloInstruction* M = makeConstant(dot, m); + + int n = dot->shape().dimensions(num_batch_dims + 1); + HloInstruction* N = makeConstant(dot, n); + + int k = A->shape().dimensions(lhs_contracting_dims[0]); + HloInstruction* K = makeConstant(dot, k); + + std::string fun_name = + GetKernelSelectorFunction(kTy, {p, m, n, k}, fallbackSelected); + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedBatchMatmul3DSizes.find({p, m, n, k}) == + AllocatedBatchMatmul3DSizes.end()) { + AllocatedBatchMatmul3DSizes[{p, m, n, k}] = fun_name; + DEBUG("{p: " << p << ", m: " << m << ", n: " << n << ", k: " << k + << "} -> " << fun_name + << (fallbackSelected ? 
" (fallback)" : "")); + } +#endif + + std::vector operands = {trA, trB, A, B, P, M, N, K}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationBATCH3D); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + case BATCH_MATMUL_4D: { + KernelType kTy = kBATCH4D; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + auto rhs_contracting_dims = dnums.rhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + assert(rhs_contracting_dims.size() == 1); + + HloInstruction* trA = makeConstant(dot, lhs_contracting_dims[0] == 2); + HloInstruction* trB = makeConstant(dot, rhs_contracting_dims[0] == 3); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* B = dot->operands()[1]; + + int q = dot->shape().dimensions(0); + HloInstruction* Q = makeConstant(dot, q); + + int p = dot->shape().dimensions(1); + HloInstruction* P = makeConstant(dot, p); + + int num_batch_dims = dnums.lhs_batch_dimensions_size(); + + int m = dot->shape().dimensions(num_batch_dims); + HloInstruction* M = makeConstant(dot, m); + + int n = dot->shape().dimensions(num_batch_dims + 1); + HloInstruction* N = makeConstant(dot, n); + + int k = A->shape().dimensions(lhs_contracting_dims[0]); + HloInstruction* K = makeConstant(dot, k); + + std::string fun_name = GetKernelSelectorFunction(kTy, {q, p, m, n, k}, fallbackSelected); + + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedBatchMatmul4DSizes.find({q, p, m, n, k}) == + AllocatedBatchMatmul4DSizes.end()) { + AllocatedBatchMatmul4DSizes[{q, p, m, n, k}] = fun_name; + DEBUG("{q: " << q << ", p: " << p << ", 
m: " << m << ", n: " << n + << ", k: " << k << "} -> " << fun_name + << (fallbackSelected ? " (fallback)" : "")); + } +#endif + + std::vector operands = {trA, trB, A, B, Q, P, M, N, K}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationBATCH4D); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + default: + DEBUG("No library funcion was selected."); + return absl::OkStatus(); + } + + return absl::OkStatus(); + } + + absl::Status HandleReduce(HloInstruction* reduce) override { + bool fallbackSelected; + std::string op_type = reduce->metadata().op_type(); + // TODO: Is this reliable way to check for ArgMax? + // Works for BERT but its unclear if this is the proper way. + if (op_type != "ArgMax") { + return absl::OkStatus(); + } + + auto reduceOpr = reduce->operands(); + // The ArgMax pattern we support has exactly 4 operands. + if (reduceOpr.size() != 4) { + return absl::OkStatus(); + } + + // We currently only support 3D ArgMax. + auto dims = reduceOpr[0]->shape().dimensions(); + if (dims.size() != 3) { + return absl::OkStatus(); + } + + KernelType kTy = kARGMAX; + int b = dims[0]; + int m = dims[1]; + int n = dims[2]; + + std::string fun_name = GetKernelSelectorFunction(kTy, {b, m, n}, fallbackSelected); + + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedArgMax3DSizes.find({b, m, n}) == + AllocatedArgMax3DSizes.end()) { + AllocatedArgMax3DSizes[{b, m, n}] = fun_name; + DEBUG("{b: " << b << ", m: " << m << ", n: " << n << "} -> " << fun_name + << (fallbackSelected ? 
" (fallback)" : "")); + } +#endif + + std::vector operands; + for (int i = 0; i < 4; i++) operands.push_back(reduceOpr[i]); + + HloInstruction* kernel_selector_call = + reduce->AddInstruction(HloInstruction::CreateCustomCall( + reduce->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = reduce->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationARGMAX); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(reduce, kernel_selector_call)); + + return absl::OkStatus(); + } +}; // namespace cpu + +absl::StatusOr KernelSelectorOpsRewriter::Run( + HloModule* module, + const absl::flat_hash_set& execution_threads) { + XLA_VLOG_LINES( + 3, "KernelSelectorOpsRewriter::Run(), before:\n" + module->ToString()); + + if (!kernel_map_file) { + LOG(INFO) << "KERNEL_MAP_FILE is not set. The kernel selector will not " + "run.\n Check xla/service/cpu/example_kernel_map.txt for an " + "example of kernel map file"; + return absl::OkStatus(); + } + + // Build the reverse map. + for (const auto& pair : kernelStringToType) { + kernelTypeToString[pair.second] = pair.first; + } + + fill_map_from_file(kernel_map_file, sizesToSymbol); + + KernelSelectorOpsRewriterVisitor visitor; + TF_ASSIGN_OR_RETURN(auto result, + visitor.RunOnModule(module, execution_threads)); + XLA_VLOG_LINES( + 3, "KernelSelectorOpsRewriter::Run(), after:\n" + module->ToString()); + return result; +} + +} // namespace cpu +} // namespace xla diff --git a/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h new file mode 100644 index 00000000000000..36714cfdf315b3 --- /dev/null +++ b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h @@ -0,0 +1,42 @@ +/* Copyright 2025 Huawei. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_CPU_KERNEL_SELECTOR_OPS_REWRITER_H_ +#define XLA_SERVICE_CPU_KERNEL_SELECTOR_OPS_REWRITER_H_ + +#include "xla/hlo/ir/hlo_instructions.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/pass/hlo_pass_interface.h" + +namespace xla { +namespace cpu { + +// This pass rewrites hlo.dot into custom calls. +class KernelSelectorOpsRewriter : public HloModulePass { + public: + absl::string_view name() const override { + return "kernel-selector-ops-rewriter"; + } + + using HloPassInterface::Run; + absl::StatusOr Run( + HloModule* module, + const absl::flat_hash_set& execution_threads) override; +}; + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_KERNEL_SELECTOR_OPS_REWRITER_H_ diff --git a/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc b/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc index 64e5970c8f04a4..fd9479f35fff82 100644 --- a/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc +++ b/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include #include +#include #include "absl/functional/any_invocable.h" #include "absl/strings/string_view.h" @@ -57,6 +58,7 @@ limitations under the License. 
#include "xla/service/cpu/runtime_topk.h" #include "xla/service/cpu/windows_compatibility.h" #include "xla/service/cpu/xnnpack_ops.h" +#include "xla/service/cpu/kernel_selector.h" #include "xla/service/custom_call_target_registry.h" #include "tsl/platform/logging.h" @@ -211,6 +213,26 @@ static bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(TracingEnd); REGISTER_CPU_RUNTIME_SYMBOL(HandleFfiCall); REGISTER_CPU_RUNTIME_SYMBOL(XnnPackSoftMaxND); + REGISTER_CPU_RUNTIME_SYMBOL(ArgMax3DParallel); + REGISTER_CPU_RUNTIME_SYMBOL(ArgMax3DSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMParallel); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DParallel); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DParallel); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMV); +#ifdef ENABLE_BLAS_MLIR + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMMLIR); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DMLIR); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DMLIR); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMVMLIR); +#endif // ENABLE_BLAS_MLIR + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMVEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(ArgMax3DEmpty); #if defined(INTEL_MKL) REGISTER_CPU_RUNTIME_SYMBOL(OneDnnMatMul); REGISTER_CPU_RUNTIME_SYMBOL(OneDnnSoftmax); diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc index a3a5f1827d0da8..4687473caf3ac7 100644 --- a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc +++ b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc @@ -26,8 +26,6 @@ limitations under the License. 
namespace xla { namespace cpu { -extern const char* const kCustomCallXnnPackSoftMax = "__xnnpack$softmax"; - namespace { namespace m = match; namespace pu = ::xla::cpu::xnnpack_pattern_utils_internal; @@ -205,7 +203,7 @@ class XnnPackOpsRewriterVisitor : public DfsHloRewriteVisitor { HloInstruction* softmax_call = divide_instr->AddInstruction(HloInstruction::CreateCustomCall( - output_shape, {producer.value()}, kCustomCallXnnPackSoftMax)); + output_shape, {producer.value()}, "__xnnpack$softmax")); TF_RETURN_IF_ERROR(ReplaceInstruction(divide_instr, softmax_call)); return absl::OkStatus(); diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h index 2bdc58965c96dc..f1cd18769d1704 100644 --- a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h +++ b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h @@ -27,8 +27,6 @@ limitations under the License. namespace xla { namespace cpu { -extern const char* const kCustomCallXnnPackSoftMax; - class XnnPackOpsRewriter : public HloModulePass { public: absl::string_view name() const override { return "xnnpack-ops-rewriter"; } diff --git a/third_party/xla/xla/service/libs/BUILD b/third_party/xla/xla/service/libs/BUILD new file mode 100644 index 00000000000000..c9435fb4686cf4 --- /dev/null +++ b/third_party/xla/xla/service/libs/BUILD @@ -0,0 +1,17 @@ +cc_binary( + name = "libblas_mlir.so", + srcs = ["libblas_mlir/src/sgemm.cpp", + "libblas_mlir/src/sgemv.cpp", + "libblas_mlir/src/sbatch_matmul_3d.cpp", + "libblas_mlir/src/sbatch_matmul_4d.cpp", + "libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s", + "libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s", + "libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s", + "libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s", + "libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s", + "libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s"], + linkshared = True, + linkstatic = False, + includes = ["libblas_mlir/include"], + visibility = 
["//visibility:public"], +) diff --git a/third_party/xla/xla/service/libs/libblas_mlir/Makefile b/third_party/xla/xla/service/libs/libblas_mlir/Makefile new file mode 100644 index 00000000000000..941f9062f20211 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/Makefile @@ -0,0 +1,52 @@ +# List of source files +SRCS := sgemm.cpp sgemv.cpp sbatch_matmul_3d.cpp sbatch_matmul_4d.cpp +KERNELS_DIR := kernels +KERNEL_SRCS := $(wildcard $(KERNELS_DIR)/*.s) + +# Source directory +SRC_DIR := src + +# Output directory +BUILD := build + +# Compiler and flags +CC := gcc +CFLAGS := -S -I include -O3 +ASFLAGS := -c -O3 +LDFLAGS := -shared + +# Full paths +SRC_PATHS := $(SRCS:%=$(SRC_DIR)/%) +ASM := $(SRCS:%.cpp=$(BUILD)/%.s) +OBJS := $(SRCS:%.cpp=$(BUILD)/%.o) +KERNEL_OBJS := $(KERNEL_SRCS:$(KERNELS_DIR)/%.s=$(BUILD)/%.o) + +# All object files +ALL_OBJS := $(OBJS) $(KERNEL_OBJS) + +# Default target +all: $(BUILD) $(ASM) $(ALL_OBJS) $(BUILD)/libblas_mlir.so + +# Create build directory +$(BUILD): + @mkdir -p $(BUILD) + +# Compile each .cpp file to .s in build/ +$(BUILD)/%.s: $(SRC_DIR)/%.cpp + @$(CC) $(CFLAGS) $< -o $@ + +# Assemble .s to .o +$(BUILD)/%.o: $(BUILD)/%.s + @$(CC) $(ASFLAGS) $< -o $@ + +# Assemble kernels .s to .o +$(BUILD)/%.o: $(KERNELS_DIR)/%.s | $(BUILD) + @$(CC) $(ASFLAGS) $< -o $@ + +# Link .o files into lib.so +$(BUILD)/libblas_mlir.so: $(ALL_OBJS) + @$(CC) $(LDFLAGS) -o $@ $^ + +# Clean target +clean: + @rm -rf $(BUILD) \ No newline at end of file diff --git a/third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h b/third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h new file mode 100644 index 00000000000000..6d4fab5e34f49c --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h @@ -0,0 +1,10 @@ +#ifndef MEMREF_HELPERS_H_ +#define MEMREF_HELPERS_H_ + +#define Memref_1D_Args(NAME, M, S) NAME, NAME, 0, M, S +#define Memref_2D_Args(NAME, M, N, LD) NAME, NAME, 0, M, N, LD, 1 
+#define Memref_3D_Args(NAME, B, M, N, LD) NAME, NAME, 0, B, M, N, M *LD, LD, 1 +#define Memref_4D_Args(NAME, B1, B2, M, N, LD) \ + NAME, NAME, 0, B1, B2, M, N, B2 *M *LD, M *LD, LD, 1 + +#endif \ No newline at end of file diff --git a/third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h b/third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h new file mode 100644 index 00000000000000..4f7c410ec9bb3b --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h @@ -0,0 +1,11 @@ +typedef int BLASINT; + +typedef enum CBLAS_ORDER { + CblasRowMajor = 101, + CblasColMajor = 102 +} CBLAS_ORDER; + +typedef enum CBLAS_TRANSPOSE { + CblasNoTrans = 111, + CblasTrans = 112, +} CBLAS_TRANSPOSE; diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s new file mode 100644 index 00000000000000..38d54d0f69c54c --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s @@ -0,0 +1,4079 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_3d_nn_mlir // -- Begin function sbatch_matmul_3d_nn_mlir + .p2align 4 + .type sbatch_matmul_3d_nn_mlir,@function +sbatch_matmul_3d_nn_mlir: // @sbatch_matmul_3d_nn_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! 
// 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #1040 + .cfi_def_cfa_offset 1200 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x4, #0 + ldr x13, [sp, #1248] + ldr x29, [sp, #1336] + lsl x23, x5, #6 + cinv x8, x4, lt + ldr x20, [sp, #1264] + ldr x26, [sp, #1216] + add x0, x23, #64 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + mov x19, x7 + str x6, [sp, #760] // 8-byte Folded Spill + mov x21, x5 + stp x13, x3, [sp, #144] // 16-byte Folded Spill + mov x27, x2 + str x1, [sp, #720] // 8-byte Folded Spill + asr x9, x9, #1 + str x4, [sp, #744] // 8-byte Folded Spill + cinv x28, x9, lt + cmp x8, #0 + ldr x9, [sp, #1256] + csel x8, x10, x8, lt + cmp x4, #0 + ldr x10, [sp, #1328] + asr x8, x8, #2 + cinv x24, x8, lt + cmp x13, #0 + cinv x8, x13, lt + str x9, [sp, #752] // 8-byte Folded Spill + add x9, x8, x8, lsr #63 + str x10, [sp, #736] // 8-byte Folded Spill + add x10, x8, #15 + add x11, x8, #7 + add x12, x8, #3 + asr x9, x9, #1 + cinv x14, x9, lt + ldr x9, [sp, #1296] + cmp x8, #0 + str x14, [sp, #1000] // 8-byte Folded Spill + str x9, [sp, #696] // 8-byte Folded Spill + ldr x9, [sp, #1288] + str x9, [sp, #688] // 8-byte 
Folded Spill + csel x9, x10, x8, lt + csel x10, x11, x8, lt + csel x8, x12, x8, lt + cmp x13, #0 + asr x9, x9, #4 + asr x8, x8, #2 + asr x10, x10, #3 + cinv x11, x9, lt + ldr x9, [sp, #1224] + cinv x25, x8, lt + cinv x10, x10, lt + lsl x8, x25, #2 + str x11, [sp, #1016] // 8-byte Folded Spill + str x10, [sp, #1008] // 8-byte Folded Spill + str x8, [sp, #600] // 8-byte Folded Spill + lsl x8, x14, #1 + str x8, [sp, #648] // 8-byte Folded Spill + str x9, [sp, #712] // 8-byte Folded Spill + lsl x9, x11, #4 + str x9, [sp, #832] // 8-byte Folded Spill + lsl x9, x10, #3 + str x9, [sp, #768] // 8-byte Folded Spill + bl malloc + lsl x8, x24, #2 + negs x9, x21 + add x10, x19, x19, lsl #1 + mov w12, #1 // =0x1 + str x8, [sp, #1024] // 8-byte Folded Spill + lsl x8, x28, #1 + and x9, x9, #0x3 + str x27, [sp, #704] // 8-byte Folded Spill + str x8, [sp, #920] // 8-byte Folded Spill + add x8, x0, #63 + lsl x27, x27, #2 + lsl x5, x21, #2 + and x22, x8, #0xffffffffffffffc0 + and x8, x21, #0x3 + bfi x12, x24, #2, #62 + mul x17, x19, x12 + csneg x6, x8, x9, mi + lsl x8, x10, #2 + mul x18, x28, x19 + add x12, x5, x27 + lsl x15, x6, #2 + str x8, [sp, #1032] // 8-byte Folded Spill + mul x16, x24, x19 + lsl x2, x16, #4 + sub x8, x5, x15 + lsl x3, x17, #2 + stp x5, x8, [sp, #96] // 16-byte Folded Spill + lsl x4, x20, #2 + sub x8, x12, x15 + sub x12, x22, x6, lsl #6 + mov x13, x20 + str x8, [sp, #904] // 8-byte Folded Spill + add x10, x4, x20 + lsl x11, x20, #5 + lsl x20, x19, #2 + add x8, x12, x23 + lsl x9, x13, #4 + sub x28, x11, x4 + str x0, [sp, #16] // 8-byte Folded Spill + str x8, [sp, #552] // 8-byte Folded Spill + add x8, x27, x18, lsl #3 + lsl x10, x10, #2 + str x13, [sp, #728] // 8-byte Folded Spill + str xzr, [sp, #184] // 8-byte Folded Spill + str xzr, [sp, #776] // 8-byte Folded Spill + add x12, x8, x5 + str x8, [sp, #888] // 8-byte Folded Spill + sub x8, x12, x15 + ldr x12, [sp, #104] // 8-byte Folded Reload + str x26, [sp, #680] // 8-byte Folded Spill + str x4, [sp, #824] // 
8-byte Folded Spill + str x8, [sp, #896] // 8-byte Folded Spill + add x8, x3, x27 + add x14, x8, x5 + str x8, [sp, #992] // 8-byte Folded Spill + sub x8, x14, x15 + sub x14, x21, x6 + str x8, [sp, #880] // 8-byte Folded Spill + add x8, x2, x27 + add x5, x8, x5 + str x8, [sp, #912] // 8-byte Folded Spill + sub x8, x5, x15 + add x15, x21, x20 + ldr x5, [sp, #720] // 8-byte Folded Reload + str x8, [sp, #872] // 8-byte Folded Spill + ldr x8, [sp, #1016] // 8-byte Folded Reload + sub x1, x15, x6 + add x15, x21, x16, lsl #2 + lsl x16, x25, #4 + str x9, [sp, #1016] // 8-byte Folded Spill + sub x15, x15, x6 + lsl x15, x15, #2 + lsl x7, x8, #6 + ldr x8, [sp, #1008] // 8-byte Folded Reload + str x15, [sp, #576] // 8-byte Folded Spill + add x15, x21, x17 + sub x15, x15, x6 + lsl x15, x15, #2 + str x15, [sp, #568] // 8-byte Folded Spill + add x15, x21, x18, lsl #1 + lsl x17, x8, #5 + ldr x8, [sp, #1000] // 8-byte Folded Reload + sub x15, x15, x6 + lsl x18, x15, #2 + lsl x15, x8, #3 + ldr x8, [sp, #712] // 8-byte Folded Reload + lsl x8, x8, #2 + add x23, x11, x8 + str x8, [sp, #864] // 8-byte Folded Spill + add x23, x26, x23 + str x23, [sp, #984] // 8-byte Folded Spill + add x23, x9, x8 + add x23, x26, x23 + str x23, [sp, #976] // 8-byte Folded Spill + add x23, x4, x8 + add x23, x26, x23 + str x23, [sp, #968] // 8-byte Folded Spill + lsl x23, x13, #3 + add x24, x23, x8 + add x24, x26, x24 + str x24, [sp, #960] // 8-byte Folded Spill + add x24, x13, x13, lsl #1 + lsl x25, x24, #3 + lsl x30, x24, #2 + add x24, x26, x8 + add x0, x24, x28 + str x0, [sp, #952] // 8-byte Folded Spill + add x0, x24, x25 + str x0, [sp, #944] // 8-byte Folded Spill + add x0, x24, x10 + str x0, [sp, #936] // 8-byte Folded Spill + add x0, x24, x30 + str x0, [sp, #928] // 8-byte Folded Spill + add x0, x12, #4 + ldr x12, [sp, #904] // 8-byte Folded Reload + str x0, [sp, #512] // 8-byte Folded Spill + madd x24, x13, x0, x8 + add x0, x12, #4 + str x0, [sp, #672] // 8-byte Folded Spill + mul x0, x13, x14 + add 
x24, x26, x24 + add x0, x8, x0, lsl #2 + lsl x8, x19, #4 + str x8, [sp, #1008] // 8-byte Folded Spill + add x12, x26, x0 + add x0, x8, x27 + add x0, x0, x5 + add x8, x0, #32 + add x0, x27, x1, lsl #2 + add x1, x26, x4 + str x8, [sp, #816] // 8-byte Folded Spill + add x0, x0, x5 + add x8, x0, #4 + str x8, [sp, #808] // 8-byte Folded Spill + add x8, x5, x3 + add x3, x26, x11 + add x11, x26, x23 + add x23, x24, x7 + str x8, [sp, #624] // 8-byte Folded Spill + add x8, x5, x2 + add x2, x26, x28 + add x0, x3, x7 + str x8, [sp, #616] // 8-byte Folded Spill + ldr x8, [sp, #888] // 8-byte Folded Reload + str x0, [sp, #504] // 8-byte Folded Spill + add x0, x2, x7 + str x0, [sp, #496] // 8-byte Folded Spill + add x13, x8, x5 + add x8, x13, #32 + add x13, x26, x10 + str x8, [sp, #640] // 8-byte Folded Spill + add x8, x18, #4 + add x18, x26, x25 + str x8, [sp, #560] // 8-byte Folded Spill + ldr x8, [sp, #896] // 8-byte Folded Reload + add x0, x18, x7 + str x0, [sp, #488] // 8-byte Folded Spill + add x0, x13, x7 + str x0, [sp, #480] // 8-byte Folded Spill + add x0, x26, x9 + add x9, x0, x7 + add x8, x5, x8 + str x9, [sp, #472] // 8-byte Folded Spill + add x9, x1, x7 + str x8, [sp, #632] // 8-byte Folded Spill + add x8, x26, x30 + str x9, [sp, #464] // 8-byte Folded Spill + add x9, x11, x7 + str x9, [sp, #456] // 8-byte Folded Spill + add x9, x8, x7 + str x9, [sp, #448] // 8-byte Folded Spill + add x9, x12, x7 + str x9, [sp, #440] // 8-byte Folded Spill + ldr x9, [sp, #880] // 8-byte Folded Reload + add x9, x5, x9 + str x9, [sp, #544] // 8-byte Folded Spill + ldr x9, [sp, #872] // 8-byte Folded Reload + add x9, x5, x9 + str x9, [sp, #656] // 8-byte Folded Spill + add x9, x3, x17 + str x9, [sp, #432] // 8-byte Folded Spill + add x9, x2, x17 + str x9, [sp, #424] // 8-byte Folded Spill + add x9, x18, x17 + str x9, [sp, #416] // 8-byte Folded Spill + add x9, x13, x17 + str x9, [sp, #408] // 8-byte Folded Spill + add x9, x0, x17 + str x9, [sp, #400] // 8-byte Folded Spill + add x9, 
x1, x17 + str x9, [sp, #392] // 8-byte Folded Spill + add x9, x11, x17 + str x9, [sp, #384] // 8-byte Folded Spill + add x9, x8, x17 + str x9, [sp, #376] // 8-byte Folded Spill + add x9, x24, x17 + str x9, [sp, #368] // 8-byte Folded Spill + add x9, x12, x17 + lsl x17, x21, #3 + str x9, [sp, #360] // 8-byte Folded Spill + add x9, x3, x16 + str x17, [sp, #72] // 8-byte Folded Spill + str x9, [sp, #352] // 8-byte Folded Spill + add x9, x2, x16 + str x9, [sp, #344] // 8-byte Folded Spill + add x9, x18, x16 + str x9, [sp, #336] // 8-byte Folded Spill + add x9, x13, x16 + str x9, [sp, #328] // 8-byte Folded Spill + add x9, x0, x16 + str x9, [sp, #320] // 8-byte Folded Spill + add x9, x1, x16 + str x9, [sp, #312] // 8-byte Folded Spill + add x9, x11, x16 + add x11, x11, x15 + str x9, [sp, #304] // 8-byte Folded Spill + add x9, x8, x16 + add x8, x8, x15 + str x9, [sp, #296] // 8-byte Folded Spill + add x9, x24, x16 + str x8, [sp, #216] // 8-byte Folded Spill + add x8, x24, x15 + str x9, [sp, #288] // 8-byte Folded Spill + add x9, x12, x16 + lsl x16, x21, #4 + str x8, [sp, #208] // 8-byte Folded Spill + str x9, [sp, #280] // 8-byte Folded Spill + lsl x9, x21, #5 + sub x7, x16, x6, lsl #4 + sub x10, x9, x6, lsl #5 + sub x6, x17, x6, lsl #3 + mov x17, x12 + add x12, x18, x15 + stp x16, x9, [sp, #80] // 16-byte Folded Spill + lsl x9, x19, #3 + add x8, x17, x15 + str x12, [sp, #256] // 8-byte Folded Spill + add x12, x13, x15 + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x16, x5, x9 + add x9, x9, x27 + str x14, [sp, #1032] // 8-byte Folded Spill + str x8, [sp, #200] // 8-byte Folded Spill + ldr x8, [sp, #992] // 8-byte Folded Reload + str x12, [sp, #248] // 8-byte Folded Spill + str x16, [sp, #800] // 8-byte Folded Spill + add x16, x3, x15 + add x9, x5, x9 + ldr x3, [sp, #776] // 8-byte Folded Reload + stp x6, x10, [sp, #56] // 16-byte Folded Spill + str x16, [sp, #272] // 8-byte Folded Spill + add x16, x2, x15 + str x9, [sp, #592] // 8-byte Folded Spill + add x9, x20, 
x27 + mov x2, x23 + sub x23, x14, #4 + str x16, [sp, #264] // 8-byte Folded Spill + ldr x16, [sp, #184] // 8-byte Folded Reload + add x9, x5, x9 + add x12, x5, x13 + str x9, [sp, #584] // 8-byte Folded Spill + mov x9, x24 + add x8, x5, x8 + str x12, [sp, #792] // 8-byte Folded Spill + add x12, x5, x27 + str x8, [sp, #536] // 8-byte Folded Spill + ldr x8, [sp, #912] // 8-byte Folded Reload + add x13, x12, x13 + str x13, [sp, #608] // 8-byte Folded Spill + add x13, x0, x15 + str x13, [sp, #240] // 8-byte Folded Spill + add x13, x1, x15 + stp x11, x13, [sp, #224] // 16-byte Folded Spill + add x8, x5, x8 + str x8, [sp, #528] // 8-byte Folded Spill + sub x8, x14, #3 + str x8, [sp, #912] // 8-byte Folded Spill + sub x8, x14, #2 + str x8, [sp, #904] // 8-byte Folded Spill + sub x8, x14, #1 + str x8, [sp, #896] // 8-byte Folded Spill + ldr x8, [sp, #752] // 8-byte Folded Reload + lsl x11, x8, #2 + ldr x8, [sp, #760] // 8-byte Folded Reload + lsl x8, x8, #2 + stp x8, x11, [sp, #128] // 16-byte Folded Spill + add x8, x5, x20 + add x11, x10, #32 + str x8, [sp, #784] // 8-byte Folded Spill + add x8, x22, #128 + str x8, [sp, #664] // 8-byte Folded Spill + add x8, x22, #256 + str x8, [sp, #1000] // 8-byte Folded Spill + ldr x8, [sp, #552] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #992] // 8-byte Folded Spill + add x8, x7, #16 + stp x8, x11, [sp, #40] // 16-byte Folded Spill + add x8, x6, #8 + stp x7, x8, [sp, #24] // 16-byte Folded Spill + b .LBB0_4 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_4 Depth=1 + str s0, [x24, x9, lsl #2] +.LBB0_2: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #120] // 8-byte Folded Reload + bl free +.LBB0_3: // %.backedge53 + // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #800] // 8-byte Folded Reload + ldp x11, x10, [sp, #128] // 16-byte Folded Reload + add x8, x8, x11 + ldr x5, [sp, #872] // 8-byte Folded Reload + ldp x9, x16, [sp, #176] // 16-byte Folded Reload + ldp x3, x17, [sp, #160] // 16-byte Folded Reload + ldr x12, [sp, 
#880] // 8-byte Folded Reload + ldr x2, [sp, #192] // 8-byte Folded Reload + add x5, x5, x11 + add x16, x16, x10 + add x9, x9, x10 + add x17, x17, x10 + add x12, x12, x11 + add x2, x2, x10 + str x8, [sp, #800] // 8-byte Folded Spill + ldr x8, [sp, #784] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #784] // 8-byte Folded Spill + ldr x8, [sp, #792] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #792] // 8-byte Folded Spill + ldr x8, [sp, #816] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #816] // 8-byte Folded Spill + ldr x8, [sp, #808] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #808] // 8-byte Folded Spill + ldr x8, [sp, #624] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #624] // 8-byte Folded Spill + ldr x8, [sp, #616] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #616] // 8-byte Folded Spill + ldr x8, [sp, #640] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #640] // 8-byte Folded Spill + ldr x8, [sp, #632] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #632] // 8-byte Folded Spill + ldr x8, [sp, #504] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #504] // 8-byte Folded Spill + ldr x8, [sp, #496] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #496] // 8-byte Folded Spill + ldr x8, [sp, #488] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #488] // 8-byte Folded Spill + ldr x8, [sp, #480] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #480] // 8-byte Folded Spill + ldr x8, [sp, #472] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #472] // 8-byte Folded Spill + ldr x8, [sp, #464] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #464] // 8-byte Folded Spill + ldr x8, [sp, #456] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #456] // 8-byte Folded Spill + ldr x8, [sp, #448] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #448] // 8-byte Folded Spill + ldr x8, [sp, #440] // 8-byte Folded Reload + add x8, 
x8, x10 + str x8, [sp, #440] // 8-byte Folded Spill + ldr x8, [sp, #544] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #544] // 8-byte Folded Spill + ldr x8, [sp, #656] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #656] // 8-byte Folded Spill + ldr x8, [sp, #432] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #432] // 8-byte Folded Spill + ldr x8, [sp, #424] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #424] // 8-byte Folded Spill + ldr x8, [sp, #416] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #416] // 8-byte Folded Spill + ldr x8, [sp, #408] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #408] // 8-byte Folded Spill + ldr x8, [sp, #400] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #400] // 8-byte Folded Spill + ldr x8, [sp, #392] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #392] // 8-byte Folded Spill + ldr x8, [sp, #384] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #384] // 8-byte Folded Spill + ldr x8, [sp, #376] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #376] // 8-byte Folded Spill + ldr x8, [sp, #368] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #368] // 8-byte Folded Spill + ldr x8, [sp, #360] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #360] // 8-byte Folded Spill + ldr x8, [sp, #352] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #352] // 8-byte Folded Spill + ldr x8, [sp, #344] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #344] // 8-byte Folded Spill + ldr x8, [sp, #336] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #336] // 8-byte Folded Spill + ldr x8, [sp, #328] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #328] // 8-byte Folded Spill + ldr x8, [sp, #320] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #320] // 8-byte Folded Spill + ldr x8, [sp, #312] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #312] // 8-byte Folded Spill + ldr x8, [sp, #304] 
// 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #304] // 8-byte Folded Spill + ldr x8, [sp, #296] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #296] // 8-byte Folded Spill + ldr x8, [sp, #288] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #288] // 8-byte Folded Spill + ldr x8, [sp, #280] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #280] // 8-byte Folded Spill + ldr x8, [sp, #592] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #592] // 8-byte Folded Spill + ldr x8, [sp, #584] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #584] // 8-byte Folded Spill + ldr x8, [sp, #272] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #272] // 8-byte Folded Spill + ldr x8, [sp, #264] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #264] // 8-byte Folded Spill + ldr x8, [sp, #256] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #256] // 8-byte Folded Spill + ldr x8, [sp, #248] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #248] // 8-byte Folded Spill + ldr x8, [sp, #608] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #608] // 8-byte Folded Spill + ldr x8, [sp, #240] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #240] // 8-byte Folded Spill + ldr x8, [sp, #232] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #232] // 8-byte Folded Spill + ldr x8, [sp, #224] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #224] // 8-byte Folded Spill + ldr x8, [sp, #216] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #216] // 8-byte Folded Spill + ldr x8, [sp, #208] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #208] // 8-byte Folded Spill + ldr x8, [sp, #200] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #200] // 8-byte Folded Spill + ldr x8, [sp, #536] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #536] // 8-byte Folded Spill + ldr x8, [sp, #528] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #528] // 8-byte 
Folded Spill +.LBB0_4: // =>This Loop Header: Depth=1 + // Child Loop BB0_8 Depth 2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_15 Depth 3 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_24 Depth 3 + // Child Loop BB0_28 Depth 3 + // Child Loop BB0_30 Depth 3 + // Child Loop BB0_36 Depth 2 + // Child Loop BB0_39 Depth 2 + // Child Loop BB0_42 Depth 2 + // Child Loop BB0_44 Depth 3 + // Child Loop BB0_46 Depth 3 + // Child Loop BB0_49 Depth 2 + // Child Loop BB0_51 Depth 2 + // Child Loop BB0_55 Depth 2 + // Child Loop BB0_57 Depth 2 + // Child Loop BB0_61 Depth 2 + // Child Loop BB0_64 Depth 2 + // Child Loop BB0_67 Depth 2 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_74 Depth 2 + // Child Loop BB0_76 Depth 2 + // Child Loop BB0_80 Depth 2 + // Child Loop BB0_82 Depth 2 + // Child Loop BB0_86 Depth 2 + // Child Loop BB0_89 Depth 2 + // Child Loop BB0_92 Depth 2 + // Child Loop BB0_94 Depth 3 + // Child Loop BB0_96 Depth 3 + // Child Loop BB0_99 Depth 2 + // Child Loop BB0_101 Depth 2 + // Child Loop BB0_105 Depth 2 + // Child Loop BB0_107 Depth 2 + // Child Loop BB0_111 Depth 2 + // Child Loop BB0_114 Depth 2 + // Child Loop BB0_117 Depth 2 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + // Child Loop BB0_124 Depth 2 + // Child Loop BB0_126 Depth 2 + // Child Loop BB0_130 Depth 2 + // Child Loop BB0_132 Depth 2 + ldr x8, [sp, #152] // 8-byte Folded Reload + cmp x3, x8 + b.ge .LBB0_133 +// %bb.5: // in Loop: Header=BB0_4 Depth=1 + stp x16, x2, [sp, #184] // 16-byte Folded Spill + add x8, x3, #1 + ldr x2, [sp, #832] // 8-byte Folded Reload + mov x4, x16 + str x3, [sp, #776] // 8-byte Folded Spill + ldr x3, [sp, #768] // 8-byte Folded Reload + str x12, [sp, #880] // 8-byte Folded Spill + mov x12, xzr + stp x8, x17, [sp, #160] // 16-byte Folded Spill + mov x8, x17 + str x9, [sp, #176] // 8-byte Folded Spill + str 
x5, [sp, #872] // 8-byte Folded Spill + b .LBB0_8 + .p2align 2 +.LBB0_6: // in Loop: Header=BB0_8 Depth=2 + stp q3, q2, [x11] + stp q1, q0, [x11, #32] +.LBB0_7: // %.backedge + // in Loop: Header=BB0_8 Depth=2 + ldr x12, [sp, #856] // 8-byte Folded Reload + add x4, x4, #64 + add x9, x9, #64 + add x8, x8, #64 +.LBB0_8: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_15 Depth 3 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_24 Depth 3 + // Child Loop BB0_28 Depth 3 + // Child Loop BB0_30 Depth 3 + cmp x12, x2 + b.ge .LBB0_31 +// %bb.9: // in Loop: Header=BB0_8 Depth=2 + add x10, x12, #16 + ldr x11, [sp, #688] // 8-byte Folded Reload + ldr x2, [sp, #776] // 8-byte Folded Reload + mov x3, x4 + str x10, [sp, #856] // 8-byte Folded Spill + ldr x10, [sp, #696] // 8-byte Folded Reload + mov x17, xzr + add x13, x11, x10, lsl #2 + ldr x10, [sp, #736] // 8-byte Folded Reload + ldr x11, [sp, #752] // 8-byte Folded Reload + ldr x1, [sp, #680] // 8-byte Folded Reload + str x4, [sp, #888] // 8-byte Folded Spill + mov x4, x5 + ldr x5, [sp, #784] // 8-byte Folded Reload + ldr x6, [sp, #800] // 8-byte Folded Reload + mul x14, x2, x10 + lsl x10, x29, #1 + mul x16, x2, x11 + add x11, x10, x29 + add x15, x14, x12 + str x16, [sp, #848] // 8-byte Folded Spill + add x16, x16, x12 + add x0, x15, x29 + add x10, x15, x10 + add x18, x13, x15, lsl #2 + add x11, x15, x11 + add x15, x13, x0, lsl #2 + add x10, x13, x10, lsl #2 + ldr x0, [sp, #712] // 8-byte Folded Reload + add x11, x13, x11, lsl #2 + ldp q6, q4, [x18, #32] + ldp q1, q0, [x18] + ldr x18, [sp, #664] // 8-byte Folded Reload + add x0, x1, x0, lsl #2 + ldp q19, q17, [x10, #32] + ldp q22, q20, [x10] + ldr x10, [sp, #760] // 8-byte Folded Reload + ldp q3, q2, [x15, #32] + str x0, [sp, #840] // 8-byte Folded Spill + ldp q7, q5, [x15] + ldp q18, q16, [x11, #32] + ldp q23, q21, 
[x11] + ldr x11, [sp, #704] // 8-byte Folded Reload + ldr x15, [sp, #720] // 8-byte Folded Reload + add x0, x0, x16, lsl #2 + mul x16, x2, x10 + add x15, x15, x11, lsl #2 + ldr x2, [sp, #792] // 8-byte Folded Reload + ldp q29, q28, [x0, #32] + ldp q30, q31, [x0] + lsl x10, x16, #2 + ldr q26, [x15, x10] + add x10, x16, x19 + lsl x10, x10, #2 + ldr q25, [x15, x10] + add x10, x16, x19, lsl #1 + lsl x10, x10, #2 + ldr q24, [x15, x10] + add x10, x2, x27 + cmp xzr, x23 + prfm pldl1keep, [x10, #16] + ldr q27, [x10] + b.ge .LBB0_11 + .p2align 2 +.LBB0_10: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x1, [sp, #936] // 8-byte Folded Reload + ldr x24, [sp, #968] // 8-byte Folded Reload + fmla v1.4s, v30.4s, v26.s[0] + fmla v0.4s, v31.4s, v26.s[0] + ldr x0, [sp, #984] // 8-byte Folded Reload + fmla v6.4s, v29.4s, v26.s[0] + fmla v4.4s, v28.4s, v26.s[0] + add x11, x6, x27 + fmla v7.4s, v30.4s, v25.s[0] + fmla v5.4s, v31.4s, v25.s[0] + stp q30, q31, [x18, #-128] + fmla v3.4s, v29.4s, v25.s[0] + fmla v2.4s, v28.4s, v25.s[0] + stp q29, q28, [x18, #-96] + add x1, x1, x3 + add x24, x24, x3 + add x26, x0, x3 + ldr x0, [sp, #952] // 8-byte Folded Reload + fmla v17.4s, v28.4s, v24.s[0] + fmla v19.4s, v29.4s, v24.s[0] + fmla v22.4s, v30.4s, v24.s[0] + fmla v20.4s, v31.4s, v24.s[0] + fmla v16.4s, v28.4s, v27.s[0] + fmla v18.4s, v29.4s, v27.s[0] + prfm pldl1keep, [x1] + ldr x1, [sp, #944] // 8-byte Folded Reload + fmla v21.4s, v31.4s, v27.s[0] + fmla v23.4s, v30.4s, v27.s[0] + ldp q28, q29, [x24, #32] + ldp q30, q31, [x24] + ldr x24, [sp, #960] // 8-byte Folded Reload + add x0, x0, x3 + add x1, x1, x3 + add x24, x24, x3 + fmla v4.4s, v29.4s, v26.s[1] + fmla v0.4s, v31.4s, v26.s[1] + fmla v6.4s, v28.4s, v26.s[1] + fmla v1.4s, v30.4s, v26.s[1] + fmla v2.4s, v29.4s, v25.s[1] + fmla v3.4s, v28.4s, v25.s[1] + fmla v5.4s, v31.4s, v25.s[1] + stp q30, q31, [x18, #-64] + fmla v7.4s, v30.4s, v25.s[1] + stp q28, q29, [x18, #-32] + fmla 
v20.4s, v31.4s, v24.s[1] + fmla v22.4s, v30.4s, v24.s[1] + fmla v19.4s, v28.4s, v24.s[1] + fmla v23.4s, v30.4s, v27.s[1] + prfm pldl1keep, [x1] + ldr x1, [sp, #928] // 8-byte Folded Reload + fmla v21.4s, v31.4s, v27.s[1] + ldp q31, q30, [x24, #32] + fmla v17.4s, v29.4s, v24.s[1] + fmla v18.4s, v28.4s, v27.s[1] + fmla v16.4s, v29.4s, v27.s[1] + ldp q29, q28, [x24] + add x1, x1, x3 + fmla v4.4s, v30.4s, v26.s[2] + fmla v2.4s, v30.4s, v25.s[2] + add x10, x5, x27 + fmla v17.4s, v30.4s, v24.s[2] + stp q29, q28, [x18] + fmla v16.4s, v30.4s, v27.s[2] + stp q31, q30, [x18, #32] + prfm pldl1keep, [x0] + ldr x0, [sp, #976] // 8-byte Folded Reload + ldp q30, q9, [x1] + ldp q10, q8, [x1, #32] + fmla v6.4s, v31.4s, v26.s[2] + fmla v3.4s, v31.4s, v25.s[2] + fmla v19.4s, v31.4s, v24.s[2] + fmla v1.4s, v29.4s, v26.s[2] + fmla v0.4s, v28.4s, v26.s[2] + fmla v7.4s, v29.4s, v25.s[2] + fmla v5.4s, v28.4s, v25.s[2] + fmla v22.4s, v29.4s, v24.s[2] + fmla v20.4s, v28.4s, v24.s[2] + fmla v23.4s, v29.4s, v27.s[2] + add x28, x4, x27 + add x7, x11, #32 + add x30, x10, #32 + add x25, x28, #32 + add x0, x0, x3 + fmla v18.4s, v31.4s, v27.s[2] + fmla v21.4s, v28.4s, v27.s[2] + stp q30, q9, [x18, #64] + fmla v4.4s, v8.4s, v26.s[3] + stp q10, q8, [x18, #96] + prfm pldl1keep, [x26] + fmla v6.4s, v10.4s, v26.s[3] + fmla v0.4s, v9.4s, v26.s[3] + fmla v1.4s, v30.4s, v26.s[3] + fmla v2.4s, v8.4s, v25.s[3] + fmla v3.4s, v10.4s, v25.s[3] + fmla v5.4s, v9.4s, v25.s[3] + fmla v7.4s, v30.4s, v25.s[3] + ldp q29, q28, [x0, #32] + fmla v20.4s, v9.4s, v24.s[3] + fmla v22.4s, v30.4s, v24.s[3] + add x17, x17, #4 + add x6, x6, #16 + fmla v19.4s, v10.4s, v24.s[3] + fmla v17.4s, v8.4s, v24.s[3] + add x5, x5, #16 + add x4, x4, #16 + fmla v23.4s, v30.4s, v27.s[3] + ldp q30, q31, [x0] + prfm pldl1keep, [x25] + ldr q26, [x28, #16] + prfm pldl1keep, [x30] + ldr q25, [x10, #16] + prfm pldl1keep, [x7] + ldr q24, [x11, #16] + ldr x10, [sp, #1016] // 8-byte Folded Reload + fmla v21.4s, v9.4s, v27.s[3] + fmla v18.4s, v10.4s, 
v27.s[3] + fmla v16.4s, v8.4s, v27.s[3] + add x3, x3, x10 + add x2, x2, #16 + add x18, x18, #256 + add x10, x2, x27 + cmp x17, x23 + prfm pldl1keep, [x10, #16] + ldr q27, [x10] + b.lt .LBB0_10 +.LBB0_11: // in Loop: Header=BB0_8 Depth=2 + ldr x0, [sp, #728] // 8-byte Folded Reload + ldr x17, [sp, #912] // 8-byte Folded Reload + add x11, x22, x23, lsl #6 + fmla v1.4s, v30.4s, v26.s[0] + ldr x3, [sp, #848] // 8-byte Folded Reload + ldr x4, [sp, #840] // 8-byte Folded Reload + fmla v0.4s, v31.4s, v26.s[0] + fmla v6.4s, v29.4s, v26.s[0] + ldr x1, [sp, #904] // 8-byte Folded Reload + stp q30, q31, [x11] + fmla v4.4s, v28.4s, v26.s[0] + stp q29, q28, [x11, #32] + fmla v2.4s, v28.4s, v25.s[0] + fmla v7.4s, v30.4s, v25.s[0] + madd x10, x17, x0, x3 + madd x18, x1, x0, x3 + fmla v5.4s, v31.4s, v25.s[0] + fmla v3.4s, v29.4s, v25.s[0] + fmla v17.4s, v28.4s, v24.s[0] + fmla v19.4s, v29.4s, v24.s[0] + fmla v20.4s, v31.4s, v24.s[0] + fmla v22.4s, v30.4s, v24.s[0] + fmla v16.4s, v28.4s, v27.s[0] + fmla v18.4s, v29.4s, v27.s[0] + ldr x11, [sp, #896] // 8-byte Folded Reload + add x17, x22, x17, lsl #6 + fmla v21.4s, v31.4s, v27.s[0] + fmla v23.4s, v30.4s, v27.s[0] + ldr x5, [sp, #824] // 8-byte Folded Reload + ldr x6, [sp, #872] // 8-byte Folded Reload + add x10, x10, x12 + mov x2, xzr + add x10, x4, x10, lsl #2 + ldp q28, q29, [x10] + ldp q30, q31, [x10, #32] + add x10, x18, x12 + add x18, x22, x1, lsl #6 + add x10, x4, x10, lsl #2 + fmla v4.4s, v31.4s, v26.s[1] + fmla v0.4s, v29.4s, v26.s[1] + fmla v5.4s, v29.4s, v25.s[1] + fmla v2.4s, v31.4s, v25.s[1] + fmla v20.4s, v29.4s, v24.s[1] + fmla v17.4s, v31.4s, v24.s[1] + fmla v21.4s, v29.4s, v27.s[1] + fmla v16.4s, v31.4s, v27.s[1] + fmla v6.4s, v30.4s, v26.s[1] + stp q28, q29, [x17] + stp q30, q31, [x17, #32] + fmla v1.4s, v28.4s, v26.s[1] + fmla v3.4s, v30.4s, v25.s[1] + fmla v7.4s, v28.4s, v25.s[1] + fmla v22.4s, v28.4s, v24.s[1] + fmla v19.4s, v30.4s, v24.s[1] + fmla v23.4s, v28.4s, v27.s[1] + fmla v18.4s, v30.4s, v27.s[1] + ldp 
q29, q28, [x10, #32] + ldp q31, q30, [x10] + madd x10, x11, x0, x3 + add x0, x22, x11, lsl #6 + ldr x11, [sp, #1032] // 8-byte Folded Reload + add x10, x10, x12 + fmla v0.4s, v30.4s, v26.s[2] + fmla v4.4s, v28.4s, v26.s[2] + fmla v2.4s, v28.4s, v25.s[2] + fmla v5.4s, v30.4s, v25.s[2] + fmla v17.4s, v28.4s, v24.s[2] + fmla v20.4s, v30.4s, v24.s[2] + fmla v16.4s, v28.4s, v27.s[2] + fmla v21.4s, v30.4s, v27.s[2] + add x10, x4, x10, lsl #2 + stp q31, q30, [x18] + fmla v1.4s, v31.4s, v26.s[2] + stp q29, q28, [x18, #32] + fmla v6.4s, v29.4s, v26.s[2] + fmla v7.4s, v31.4s, v25.s[2] + fmla v3.4s, v29.4s, v25.s[2] + fmla v19.4s, v29.4s, v24.s[2] + fmla v22.4s, v31.4s, v24.s[2] + fmla v18.4s, v29.4s, v27.s[2] + fmla v23.4s, v31.4s, v27.s[2] + ldp q28, q29, [x10] + fmla v0.4s, v29.4s, v26.s[3] + ldp q30, q31, [x10, #32] + fmla v4.4s, v31.4s, v26.s[3] + fmla v5.4s, v29.4s, v25.s[3] + fmla v2.4s, v31.4s, v25.s[3] + fmla v20.4s, v29.4s, v24.s[3] + fmla v17.4s, v31.4s, v24.s[3] + fmla v21.4s, v29.4s, v27.s[3] + ldr x10, [sp, #672] // 8-byte Folded Reload + fmla v16.4s, v31.4s, v27.s[3] + fmla v6.4s, v30.4s, v26.s[3] + fmla v1.4s, v28.4s, v26.s[3] + fmla v3.4s, v30.4s, v25.s[3] + fmla v7.4s, v28.4s, v25.s[3] + stp q28, q29, [x0] + stp q30, q31, [x0, #32] + fmla v22.4s, v28.4s, v24.s[3] + fmla v19.4s, v30.4s, v24.s[3] + fmla v23.4s, v28.4s, v27.s[3] + fmla v18.4s, v30.4s, v27.s[3] + cmp x11, x21 + b.ge .LBB0_13 + .p2align 2 +.LBB0_12: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x1, x6, x10 + add x10, x10, #4 + add x3, x1, x20 + prfm pldl1keep, [x1] + ldur s24, [x1, #-4] + add x1, x9, x2 + add x4, x3, x20 + prfm pldl1keep, [x3] + ldur s25, [x3, #-4] + add x3, x8, x2 + add x2, x2, x5 + prfm pldl1keep, [x4] + ldur s26, [x4, #-4] + add x4, x4, x20 + prfm pldl1keep, [x4] + ldur s27, [x4, #-4] + prfm pldl1keep, [x1] + ldp q28, q29, [x3, #32] + add x1, x22, x11, lsl #6 + ldp q30, q31, [x3] + add x11, x11, #1 + fmla v4.4s, 
v29.4s, v24.s[0] + fmla v0.4s, v31.4s, v24.s[0] + fmla v5.4s, v31.4s, v25.s[0] + fmla v2.4s, v29.4s, v25.s[0] + fmla v20.4s, v31.4s, v26.s[0] + fmla v17.4s, v29.4s, v26.s[0] + fmla v6.4s, v28.4s, v24.s[0] + fmla v1.4s, v30.4s, v24.s[0] + fmla v3.4s, v28.4s, v25.s[0] + fmla v7.4s, v30.4s, v25.s[0] + fmla v22.4s, v30.4s, v26.s[0] + fmla v19.4s, v28.4s, v26.s[0] + fmla v23.4s, v30.4s, v27.s[0] + fmla v21.4s, v31.4s, v27.s[0] + fmla v18.4s, v28.4s, v27.s[0] + stp q30, q31, [x1] + stp q28, q29, [x1, #32] + fmla v16.4s, v29.4s, v27.s[0] + cmp x11, x21 + b.lt .LBB0_12 +.LBB0_13: // %.preheader + // in Loop: Header=BB0_8 Depth=2 + ldr x1, [sp, #808] // 8-byte Folded Reload + ldr x11, [sp, #816] // 8-byte Folded Reload + mov x10, xzr + mov w5, #1 // =0x1 + mov w6, #2 // =0x2 + mov w4, #3 // =0x3 + mov w3, #4 // =0x4 + b .LBB0_15 + .p2align 2 +.LBB0_14: // %.loopexit + // in Loop: Header=BB0_15 Depth=3 + ldr x10, [sp, #1008] // 8-byte Folded Reload + add x11, x11, x10 + add x1, x1, x10 + mov x10, x3 + mov x3, x7 +.LBB0_15: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_19 Depth 4 + madd x10, x10, x29, x14 + add x10, x10, x12 + madd x2, x5, x29, x14 + madd x5, x6, x29, x14 + add x2, x2, x12 + add x5, x5, x12 + add x10, x13, x10, lsl #2 + stp q1, q0, [x10] + stp q6, q4, [x10, #32] + add x10, x13, x2, lsl #2 + add x2, x13, x5, lsl #2 + stp q7, q5, [x10] + stp q3, q2, [x10, #32] + madd x10, x4, x29, x14 + add x10, x10, x12 + stp q22, q20, [x2] + stp q19, q17, [x2, #32] + ldr x2, [sp, #1024] // 8-byte Folded Reload + cmp x3, x2 + add x10, x13, x10, lsl #2 + stp q23, q21, [x10] + stp q18, q16, [x10, #32] + b.ge .LBB0_20 +// %bb.16: // in Loop: Header=BB0_15 Depth=3 + madd x10, x3, x29, x14 + add x4, x3, #3 + add x5, x3, #1 + add x6, x3, #2 + madd x2, x5, x29, x14 + ldp q28, q29, [x22, #32] + mov x30, xzr + madd x24, x6, x29, x14 + ldp q30, q31, [x22] + add x7, x3, #4 + add x10, x10, 
x12 + add x10, x13, x10, lsl #2 + add x2, x2, x12 + add x2, x13, x2, lsl #2 + ldp q6, q4, [x10, #32] + ldp q1, q0, [x10] + madd x10, x4, x29, x14 + ldp q3, q2, [x2, #32] + add x10, x10, x12 + ldp q7, q5, [x2] + add x2, x24, x12 + add x2, x13, x2, lsl #2 + ldp q19, q17, [x2, #32] + ldp q22, q20, [x2] + mov x2, x11 + add x10, x13, x10, lsl #2 + ldp q18, q16, [x10, #32] + ldp q23, q21, [x10] + madd x10, x3, x19, x16 + lsl x10, x10, #2 + ldr q27, [x15, x10] + madd x10, x5, x19, x16 + lsl x10, x10, #2 + ldr q26, [x15, x10] + madd x10, x6, x19, x16 + lsl x10, x10, #2 + ldr q25, [x15, x10] + madd x10, x4, x19, x16 + lsl x10, x10, #2 + ldr q24, [x15, x10] + ldr x10, [sp, #1000] // 8-byte Folded Reload + fmla v4.4s, v29.4s, v27.s[0] + cmp xzr, x23 + b.ge .LBB0_18 + .p2align 2 +.LBB0_17: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // Parent Loop BB0_15 Depth=3 + // => This Inner Loop Header: Depth=4 + add x28, x10, #64 + fmla v6.4s, v28.4s, v27.s[0] + fmla v1.4s, v30.4s, v27.s[0] + add x24, x10, #128 + prfm pldl1keep, [x28] + ldp q9, q8, [x10, #-160] + fmla v0.4s, v31.4s, v27.s[0] + ldp q12, q15, [x10, #-192] + fmla v2.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v5.4s, v31.4s, v26.s[0] + fmla v7.4s, v30.4s, v26.s[0] + fmla v17.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x24] + fmla v19.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + ldp q11, q10, [x10, #-128] + fmla v22.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + ldp q13, q14, [x10, #-96] + fmla v18.4s, v28.4s, v24.s[0] + fmla v21.4s, v31.4s, v24.s[0] + add x26, x10, #192 + prfm pldl1keep, [x26] + fmla v23.4s, v30.4s, v24.s[0] + fmla v0.4s, v15.4s, v27.s[1] + add x25, x10, #256 + add x30, x30, #4 + fmla v1.4s, v12.4s, v27.s[1] + fmla v6.4s, v9.4s, v27.s[1] + fmla v4.4s, v8.4s, v27.s[1] + fmla v7.4s, v12.4s, v26.s[1] + fmla v5.4s, v15.4s, v26.s[1] + fmla v3.4s, v9.4s, v26.s[1] + fmla v2.4s, v8.4s, v26.s[1] + fmla v22.4s, v12.4s, v25.s[1] + fmla v20.4s, v15.4s, v25.s[1] + fmla 
v19.4s, v9.4s, v25.s[1] + fmla v17.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v21.4s, v15.4s, v24.s[1] + ldp q15, q12, [x10, #-64] + fmla v18.4s, v9.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q9, q8, [x10, #-32] + prfm pldl1keep, [x25] + ldp q28, q29, [x10, #32] + ldp q30, q31, [x10] + add x10, x2, x20 + prfm pldl1keep, [x2] + fmla v4.4s, v14.4s, v27.s[2] + fmla v6.4s, v13.4s, v27.s[2] + fmla v1.4s, v11.4s, v27.s[2] + fmla v0.4s, v10.4s, v27.s[2] + fmla v2.4s, v14.4s, v26.s[2] + fmla v3.4s, v13.4s, v26.s[2] + fmla v5.4s, v10.4s, v26.s[2] + fmla v7.4s, v11.4s, v26.s[2] + fmla v17.4s, v14.4s, v25.s[2] + fmla v19.4s, v13.4s, v25.s[2] + fmla v20.4s, v10.4s, v25.s[2] + fmla v22.4s, v11.4s, v25.s[2] + fmla v16.4s, v14.4s, v24.s[2] + fmla v18.4s, v13.4s, v24.s[2] + fmla v21.4s, v10.4s, v24.s[2] + fmla v23.4s, v11.4s, v24.s[2] + fmla v0.4s, v12.4s, v27.s[3] + fmla v1.4s, v15.4s, v27.s[3] + fmla v6.4s, v9.4s, v27.s[3] + fmla v4.4s, v8.4s, v27.s[3] + ldur q27, [x2, #-16] + prfm pldl1keep, [x10] + add x2, x2, #16 + fmla v7.4s, v15.4s, v26.s[3] + fmla v5.4s, v12.4s, v26.s[3] + fmla v3.4s, v9.4s, v26.s[3] + fmla v2.4s, v8.4s, v26.s[3] + ldur q26, [x10, #-16] + add x10, x10, x20 + add x24, x10, x20 + prfm pldl1keep, [x10] + fmla v22.4s, v15.4s, v25.s[3] + fmla v20.4s, v12.4s, v25.s[3] + fmla v19.4s, v9.4s, v25.s[3] + fmla v17.4s, v8.4s, v25.s[3] + ldur q25, [x10, #-16] + prfm pldl1keep, [x24] + mov x10, x25 + fmla v23.4s, v15.4s, v24.s[3] + fmla v21.4s, v12.4s, v24.s[3] + fmla v18.4s, v9.4s, v24.s[3] + fmla v16.4s, v8.4s, v24.s[3] + ldur q24, [x24, #-16] + fmla v4.4s, v29.4s, v27.s[0] + cmp x30, x23 + b.lt .LBB0_17 +.LBB0_18: // in Loop: Header=BB0_15 Depth=3 + ldp q10, q8, [x17, #32] + ldp q12, q11, [x17] + fmla v6.4s, v28.4s, v27.s[0] + fmla v0.4s, v31.4s, v27.s[0] + fmla v1.4s, v30.4s, v27.s[0] + fmla v2.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v5.4s, v31.4s, v26.s[0] + ldp q9, q13, [x18, #32] + fmla v7.4s, v30.4s, v26.s[0] + fmla 
v17.4s, v29.4s, v25.s[0] + ldr x2, [sp, #992] // 8-byte Folded Reload + ldr x25, [sp, #1032] // 8-byte Folded Reload + fmla v19.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + mov x10, x1 + fmla v22.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + fmla v18.4s, v28.4s, v24.s[0] + fmla v21.4s, v31.4s, v24.s[0] + fmla v23.4s, v30.4s, v24.s[0] + ldp q29, q30, [x18] + ldp q31, q28, [x0, #32] + fmla v1.4s, v12.4s, v27.s[1] + fmla v0.4s, v11.4s, v27.s[1] + fmla v6.4s, v10.4s, v27.s[1] + fmla v4.4s, v8.4s, v27.s[1] + fmla v7.4s, v12.4s, v26.s[1] + fmla v5.4s, v11.4s, v26.s[1] + fmla v3.4s, v10.4s, v26.s[1] + fmla v2.4s, v8.4s, v26.s[1] + fmla v22.4s, v12.4s, v25.s[1] + fmla v20.4s, v11.4s, v25.s[1] + fmla v19.4s, v10.4s, v25.s[1] + fmla v17.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v21.4s, v11.4s, v24.s[1] + fmla v18.4s, v10.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q10, q8, [x0] + fmla v4.4s, v13.4s, v27.s[2] + fmla v6.4s, v9.4s, v27.s[2] + fmla v0.4s, v30.4s, v27.s[2] + fmla v1.4s, v29.4s, v27.s[2] + fmla v2.4s, v13.4s, v26.s[2] + fmla v3.4s, v9.4s, v26.s[2] + fmla v5.4s, v30.4s, v26.s[2] + fmla v7.4s, v29.4s, v26.s[2] + fmla v17.4s, v13.4s, v25.s[2] + fmla v19.4s, v9.4s, v25.s[2] + fmla v20.4s, v30.4s, v25.s[2] + fmla v22.4s, v29.4s, v25.s[2] + fmla v16.4s, v13.4s, v24.s[2] + fmla v18.4s, v9.4s, v24.s[2] + fmla v21.4s, v30.4s, v24.s[2] + fmla v23.4s, v29.4s, v24.s[2] + fmla v1.4s, v10.4s, v27.s[3] + fmla v0.4s, v8.4s, v27.s[3] + fmla v6.4s, v31.4s, v27.s[3] + fmla v4.4s, v28.4s, v27.s[3] + fmla v7.4s, v10.4s, v26.s[3] + fmla v5.4s, v8.4s, v26.s[3] + fmla v3.4s, v31.4s, v26.s[3] + fmla v2.4s, v28.4s, v26.s[3] + fmla v22.4s, v10.4s, v25.s[3] + fmla v20.4s, v8.4s, v25.s[3] + fmla v19.4s, v31.4s, v25.s[3] + fmla v17.4s, v28.4s, v25.s[3] + fmla v23.4s, v10.4s, v24.s[3] + fmla v21.4s, v8.4s, v24.s[3] + fmla v18.4s, v31.4s, v24.s[3] + fmla v16.4s, v28.4s, v24.s[3] + cmp x25, x21 + b.ge .LBB0_14 + .p2align 2 +.LBB0_19: // Parent Loop BB0_4 
Depth=1 + // Parent Loop BB0_8 Depth=2 + // Parent Loop BB0_15 Depth=3 + // => This Inner Loop Header: Depth=4 + add x24, x10, x20 + prfm pldl1keep, [x10] + ldur s24, [x10, #-4] + add x25, x25, #1 + prfm pldl1keep, [x24] + ldur s25, [x24, #-4] + add x24, x24, x20 + add x10, x10, #4 + prfm pldl1keep, [x24] + ldur s26, [x24, #-4] + add x24, x24, x20 + prfm pldl1keep, [x24] + ldur s27, [x24, #-4] + prfm pldl1keep, [x2] + ldp q28, q29, [x2, #-32] + fmla v4.4s, v29.4s, v24.s[0] + ldp q30, q31, [x2, #-64] + fmla v0.4s, v31.4s, v24.s[0] + fmla v5.4s, v31.4s, v25.s[0] + fmla v2.4s, v29.4s, v25.s[0] + fmla v20.4s, v31.4s, v26.s[0] + fmla v17.4s, v29.4s, v26.s[0] + add x2, x2, #64 + fmla v6.4s, v28.4s, v24.s[0] + fmla v1.4s, v30.4s, v24.s[0] + fmla v3.4s, v28.4s, v25.s[0] + fmla v7.4s, v30.4s, v25.s[0] + fmla v22.4s, v30.4s, v26.s[0] + fmla v19.4s, v28.4s, v26.s[0] + fmla v23.4s, v30.4s, v27.s[0] + fmla v21.4s, v31.4s, v27.s[0] + fmla v18.4s, v28.4s, v27.s[0] + fmla v16.4s, v29.4s, v27.s[0] + cmp x25, x21 + b.lt .LBB0_19 + b .LBB0_14 + .p2align 2 +.LBB0_20: // in Loop: Header=BB0_8 Depth=2 + ldr x10, [sp, #1024] // 8-byte Folded Reload + ldr x11, [sp, #920] // 8-byte Folded Reload + cmp x10, x11 + b.ge .LBB0_26 +// %bb.21: // in Loop: Header=BB0_8 Depth=2 + ldr x3, [sp, #1024] // 8-byte Folded Reload + ldp q18, q19, [x22, #32] + mov x10, xzr + ldp q20, q21, [x22] + ldr x4, [sp, #1000] // 8-byte Folded Reload + madd x11, x3, x29, x14 + add x11, x11, x12 + add x1, x13, x11, lsl #2 + add x11, x3, #1 + madd x3, x3, x19, x16 + madd x2, x11, x29, x14 + madd x11, x11, x19, x16 + ldp q1, q0, [x1, #32] + ldp q4, q2, [x1] + lsl x3, x3, #2 + add x2, x2, x12 + lsl x11, x11, #2 + ldr q17, [x15, x3] + ldr x3, [sp, #624] // 8-byte Folded Reload + add x2, x13, x2, lsl #2 + ldr q16, [x15, x11] + ldr x11, [sp, #616] // 8-byte Folded Reload + ldp q5, q3, [x2, #32] + ldp q7, q6, [x2] + cmp xzr, x23 + b.ge .LBB0_23 + .p2align 2 +.LBB0_22: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 
Depth=2 + // => This Inner Loop Header: Depth=3 + add x30, x4, #64 + fmla v0.4s, v19.4s, v17.s[0] + fmla v1.4s, v18.4s, v17.s[0] + add x28, x4, #128 + prfm pldl1keep, [x30] + ldp q23, q22, [x4, #-160] + fmla v4.4s, v20.4s, v17.s[0] + ldp q24, q25, [x4, #-192] + fmla v2.4s, v21.4s, v17.s[0] + fmla v3.4s, v19.4s, v16.s[0] + fmla v5.4s, v18.4s, v16.s[0] + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + prfm pldl1keep, [x28] + ldp q19, q18, [x4, #-128] + ldp q20, q21, [x4, #-96] + fmla v2.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + fmla v6.4s, v25.4s, v16.s[1] + fmla v3.4s, v22.4s, v16.s[1] + fmla v4.4s, v24.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + add x24, x4, #192 + prfm pldl1keep, [x24] + fmla v7.4s, v24.4s, v16.s[1] + fmla v5.4s, v23.4s, v16.s[1] + ldp q23, q22, [x4, #-32] + ldp q24, q25, [x4, #-64] + add x6, x3, x27 + add x25, x11, x27 + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v18.4s, v17.s[2] + fmla v3.4s, v21.4s, v16.s[2] + fmla v6.4s, v18.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v4.4s, v19.4s, v17.s[2] + add x5, x4, #256 + add x7, x6, #32 + fmla v5.4s, v20.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + add x26, x25, #32 + prfm pldl1keep, [x26] + add x10, x10, #4 + add x3, x3, #16 + add x11, x11, #16 + fmla v2.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v25.4s, v16.s[3] + fmla v3.4s, v22.4s, v16.s[3] + fmla v4.4s, v24.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + ldr q17, [x25, #16] + prfm pldl1keep, [x7] + fmla v7.4s, v24.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + ldr q16, [x6, #16] + prfm pldl1keep, [x5] + ldp q18, q19, [x4, #32] + ldp q20, q21, [x4] + mov x4, x5 + cmp x10, x23 + b.lt .LBB0_22 +.LBB0_23: // in Loop: Header=BB0_8 Depth=2 + ldp q23, q22, [x17, #32] + ldp q25, q24, [x17] + fmla v0.4s, v19.4s, v17.s[0] + fmla v1.4s, v18.4s, v17.s[0] + fmla v2.4s, v21.4s, v17.s[0] + fmla v4.4s, v20.4s, v17.s[0] + fmla v3.4s, v19.4s, v16.s[0] + fmla v5.4s, v18.4s, v16.s[0] + ldp q18, q19, [x18] + fmla 
v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x18, #32] + fmla v2.4s, v24.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + ldr x10, [sp, #880] // 8-byte Folded Reload + ldr x11, [sp, #992] // 8-byte Folded Reload + fmla v4.4s, v25.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + ldr x3, [sp, #1032] // 8-byte Folded Reload + ldr x6, [sp, #576] // 8-byte Folded Reload + fmla v7.4s, v25.4s, v16.s[1] + fmla v6.4s, v24.4s, v16.s[1] + ldp q25, q24, [x0] + fmla v5.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v16.s[1] + ldp q23, q22, [x0, #32] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v19.4s, v17.s[2] + ldr x7, [sp, #568] // 8-byte Folded Reload + fmla v3.4s, v21.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v4.4s, v18.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v6.4s, v19.4s, v16.s[2] + fmla v7.4s, v18.4s, v16.s[2] + fmla v2.4s, v24.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v24.4s, v16.s[3] + fmla v3.4s, v22.4s, v16.s[3] + fmla v4.4s, v25.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + fmla v7.4s, v25.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + cmp x3, x21 + b.ge .LBB0_25 + .p2align 2 +.LBB0_24: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x10, x7 + add x5, x10, x6 + add x3, x3, #1 + add x5, x5, #4 + add x4, x4, #4 + prfm pldl1keep, [x5] + ldr s16, [x10, x6] + prfm pldl1keep, [x4] + ldr s17, [x10, x7] + prfm pldl1keep, [x11] + ldp q18, q19, [x11, #-64] + ldp q20, q21, [x11, #-32] + add x11, x11, #64 + add x10, x10, #4 + fmla v0.4s, v21.4s, v16.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v2.4s, v19.4s, v16.s[0] + fmla v4.4s, v18.4s, v16.s[0] + fmla v7.4s, v18.4s, v17.s[0] + fmla v6.4s, v19.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + fmla v3.4s, v21.4s, v17.s[0] + cmp x3, x21 + b.lt .LBB0_24 +.LBB0_25: // in Loop: Header=BB0_8 Depth=2 + stp q4, q2, [x1] + stp q1, q0, [x1, #32] + stp q7, q6, [x2] + stp q5, q3, [x2, #32] +.LBB0_26: // in Loop: Header=BB0_8 
Depth=2 + ldr x10, [sp, #744] // 8-byte Folded Reload + ldr x11, [sp, #920] // 8-byte Folded Reload + cmp x11, x10 + ldr x2, [sp, #832] // 8-byte Folded Reload + ldr x3, [sp, #768] // 8-byte Folded Reload + ldr x5, [sp, #872] // 8-byte Folded Reload + ldr x4, [sp, #888] // 8-byte Folded Reload + b.ge .LBB0_7 +// %bb.27: // in Loop: Header=BB0_8 Depth=2 + ldr x1, [sp, #920] // 8-byte Folded Reload + ldp q7, q16, [x22, #32] + mov x10, xzr + ldp q6, q5, [x22] + madd x11, x1, x29, x14 + add x11, x11, x12 + madd x12, x1, x19, x16 + add x11, x13, x11, lsl #2 + ldr x13, [sp, #1000] // 8-byte Folded Reload + lsl x12, x12, #2 + ldp q1, q0, [x11, #32] + ldp q3, q2, [x11] + ldr q4, [x15, x12] + ldr x12, [sp, #640] // 8-byte Folded Reload + cmp xzr, x23 + b.ge .LBB0_29 + .p2align 2 +.LBB0_28: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x1, x13, #64 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + add x16, x13, #128 + prfm pldl1keep, [x1] + ldp q18, q17, [x13, #-160] + fmla v3.4s, v6.4s, v4.s[0] + ldp q19, q20, [x13, #-192] + fmla v2.4s, v5.4s, v4.s[0] + prfm pldl1keep, [x16] + ldp q6, q5, [x13, #-128] + ldp q7, q16, [x13, #-96] + add x15, x13, #192 + prfm pldl1keep, [x15] + add x14, x13, #256 + add x10, x10, #4 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, [x13, #-32] + ldp q19, q20, [x13, #-64] + prfm pldl1keep, [x12] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v5.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldur q4, [x12, #-16] + prfm pldl1keep, [x14] + add x12, x12, #16 + ldp q7, q16, [x13, #32] + ldp q6, q5, [x13] + mov x13, x14 + cmp x10, x23 + b.lt .LBB0_28 +.LBB0_29: // in Loop: Header=BB0_8 Depth=2 + ldp q18, q17, [x17, #32] + ldp q20, q19, [x17] + fmla v0.4s, 
v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v2.4s, v5.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q5, q6, [x18] + ldp q7, q16, [x18, #32] + ldr x12, [sp, #560] // 8-byte Folded Reload + ldr x16, [sp, #552] // 8-byte Folded Reload + ldr x17, [sp, #632] // 8-byte Folded Reload + ldr x18, [sp, #880] // 8-byte Folded Reload + fmla v2.4s, v19.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + mov x10, xzr + mov w13, #64 // =0x40 + fmla v3.4s, v20.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, [x0, #32] + ldp q20, q19, [x0] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v5.4s, v4.s[2] + fmla v2.4s, v19.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v20.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldr x14, [sp, #1032] // 8-byte Folded Reload + add x14, x14, xzr + cmp x14, x21 + b.ge .LBB0_6 + .p2align 2 +.LBB0_30: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x15, x18, x12 + add x14, x16, x13 + add x13, x13, #64 + prfm pldl1keep, [x15] + add x15, x16, x10, lsl #6 + ldr s4, [x17, x10, lsl #2] + prfm pldl1keep, [x14] + add x10, x10, #1 + add x12, x12, #4 + ldp q5, q6, [x15] + ldp q7, q16, [x15, #32] + fmla v0.4s, v16.4s, v4.s[0] + fmla v2.4s, v6.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v3.4s, v5.4s, v4.s[0] + ldr x14, [sp, #1032] // 8-byte Folded Reload + add x14, x14, x10 + cmp x14, x21 + b.lt .LBB0_30 + b .LBB0_6 + .p2align 2 +.LBB0_31: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #696] // 8-byte Folded Reload + ldr x9, [sp, #688] // 8-byte Folded Reload + cmp x2, x3 + add x24, x9, x8, lsl #2 + lsl x8, x29, #1 + ldr x9, [sp, #680] // 8-byte Folded Reload + str x8, [sp, #520] // 8-byte Folded Spill + ldr x8, [sp, #712] // 8-byte Folded Reload + str x24, [sp, #856] // 8-byte Folded Spill + add x8, x9, x8, lsl #2 + ldr x9, [sp, #720] // 8-byte Folded Reload + str x8, [sp, #840] // 8-byte Folded Spill + ldr x8, [sp, #704] // 
8-byte Folded Reload + add x8, x9, x8, lsl #2 + str x8, [sp, #888] // 8-byte Folded Spill + b.lt .LBB0_35 +// %bb.32: // in Loop: Header=BB0_4 Depth=1 + ldr x4, [sp, #600] // 8-byte Folded Reload + cmp x3, x4 + b.lt .LBB0_60 +.LBB0_33: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #648] // 8-byte Folded Reload + cmp x4, x8 + b.lt .LBB0_85 +.LBB0_34: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #144] // 8-byte Folded Reload + ldr x9, [sp, #648] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_3 + b .LBB0_110 + .p2align 2 +.LBB0_35: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #88] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #736] // 8-byte Folded Reload + ldr x13, [sp, #776] // 8-byte Folded Reload + mov x11, xzr + mul x9, x13, x8 + ldr x8, [sp, #760] // 8-byte Folded Reload + ldr x2, [sp, #520] // 8-byte Folded Reload + add x12, x2, x29 + ldr x6, [sp, #784] // 8-byte Folded Reload + ldp x3, x4, [sp, #496] // 16-byte Folded Reload + ldr x7, [sp, #800] // 8-byte Folded Reload + mul x10, x13, x8 + ldr x8, [sp, #752] // 8-byte Folded Reload + mul x13, x13, x8 + add x8, x0, #63 + add x17, x10, x19 + lsl x16, x10, #2 + str x10, [sp, #848] // 8-byte Folded Spill + and x25, x8, #0xffffffffffffffc0 + ldr x8, [sp, #832] // 8-byte Folded Reload + stp x13, x0, [sp, #112] // 16-byte Folded Spill + add x14, x9, x8 + add x15, x13, x8 + ldr x8, [sp, #888] // 8-byte Folded Reload + add x12, x14, x12 + add x18, x24, x14, lsl #2 + add x1, x14, x29 + add x2, x14, x2 + lsl x14, x17, #2 + add x12, x24, x12, lsl #2 + add x17, x24, x2, lsl #2 + ldp q1, q0, [x18] + ldr x18, [sp, #792] // 8-byte Folded Reload + ldp q7, q5, [x12] + add x12, x10, x19, lsl #1 + lsl x12, x12, #2 + ldr q16, [x8, x16] + ldr q17, [x8, x14] + add x16, x24, x1, lsl #2 + ldp q6, q3, [x16] + ldp q4, q2, [x17] + ldp x16, x17, [sp, #464] // 16-byte Folded Reload + ldp x1, x2, [sp, #480] // 16-byte Folded Reload + ldr q18, [x8, x12] + ldr x8, [sp, #840] // 8-byte Folded Reload + add x12, 
x25, #64 + add x5, x8, x15, lsl #2 + ldp x14, x15, [sp, #448] // 16-byte Folded Reload + ldp q21, q20, [x5] + ldr x5, [sp, #872] // 8-byte Folded Reload + .p2align 2 +.LBB0_36: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x24, x18, x27 + fmla v1.4s, v21.4s, v16.s[0] + fmla v0.4s, v20.4s, v16.s[0] + cmp x11, x23 + prfm pldl1keep, [x24, #16] + ldr q19, [x24] + b.ge .LBB0_38 +// %bb.37: // in Loop: Header=BB0_36 Depth=2 + ldr x8, [sp, #864] // 8-byte Folded Reload + mov x10, x25 + fmla v6.4s, v21.4s, v17.s[0] + fmla v3.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v18.s[0] + fmla v2.4s, v20.4s, v18.s[0] + stp q21, q20, [x12, #-64] + fmla v7.4s, v21.4s, v19.s[0] + fmla v5.4s, v20.4s, v19.s[0] + add x26, x6, x27 + add x28, x5, x27 + add x0, x26, #32 + add x11, x11, #4 + add x6, x6, #16 + add x5, x5, #16 + add x24, x1, x8 + add x25, x16, x8 + add x13, x14, x8 + add x30, x4, x8 + prfm pldl1keep, [x24] + ldp q20, q21, [x25] + add x24, x2, x8 + add x25, x15, x8 + add x18, x18, #16 + fmla v0.4s, v21.4s, v16.s[1] + fmla v3.4s, v21.4s, v17.s[1] + fmla v2.4s, v21.4s, v18.s[1] + fmla v5.4s, v21.4s, v19.s[1] + fmla v1.4s, v20.4s, v16.s[1] + fmla v6.4s, v20.4s, v17.s[1] + fmla v4.4s, v20.4s, v18.s[1] + fmla v7.4s, v20.4s, v19.s[1] + stp q20, q21, [x12, #-32] + prfm pldl1keep, [x24] + ldp q21, q20, [x25] + add x24, x3, x8 + add x25, x7, x27 + add x7, x7, #16 + fmla v0.4s, v20.4s, v16.s[2] + fmla v3.4s, v20.4s, v17.s[2] + fmla v2.4s, v20.4s, v18.s[2] + fmla v5.4s, v20.4s, v19.s[2] + fmla v1.4s, v21.4s, v16.s[2] + fmla v6.4s, v21.4s, v17.s[2] + fmla v4.4s, v21.4s, v18.s[2] + fmla v7.4s, v21.4s, v19.s[2] + stp q21, q20, [x12] + prfm pldl1keep, [x24] + ldp q20, q21, [x13] + add x13, x17, x8 + add x24, x25, #32 + add x8, x28, #32 + fmla v0.4s, v21.4s, v16.s[3] + fmla v3.4s, v21.4s, v17.s[3] + fmla v2.4s, v21.4s, v18.s[3] + fmla v5.4s, v21.4s, v19.s[3] + fmla v1.4s, v20.4s, v16.s[3] + fmla v6.4s, v20.4s, v17.s[3] + fmla v4.4s, v20.4s, v18.s[3] + fmla v7.4s, 
v20.4s, v19.s[3] + stp q20, q21, [x12, #32] + prfm pldl1keep, [x30] + ldp q21, q20, [x13] + prfm pldl1keep, [x8] + ldr q16, [x28, #16] + prfm pldl1keep, [x0] + ldr q17, [x26, #16] + prfm pldl1keep, [x24] + ldr x8, [sp, #1016] // 8-byte Folded Reload + ldr q18, [x25, #16] + mov x25, x10 + add x12, x12, #128 + add x4, x4, x8 + add x3, x3, x8 + add x2, x2, x8 + add x1, x1, x8 + add x17, x17, x8 + add x16, x16, x8 + add x15, x15, x8 + add x14, x14, x8 + b .LBB0_36 + .p2align 2 +.LBB0_38: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #728] // 8-byte Folded Reload + ldr x11, [sp, #912] // 8-byte Folded Reload + add x8, x25, x23, lsl #5 + fmla v6.4s, v21.4s, v17.s[0] + ldr x10, [sp, #112] // 8-byte Folded Reload + ldr x4, [sp, #832] // 8-byte Folded Reload + fmla v3.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v18.s[0] + stp q21, q20, [x8] + fmla v2.4s, v20.4s, v18.s[0] + fmla v5.4s, v20.4s, v19.s[0] + ldr x16, [sp, #840] // 8-byte Folded Reload + ldr x12, [sp, #904] // 8-byte Folded Reload + fmla v7.4s, v21.4s, v19.s[0] + mov x14, xzr + madd x8, x11, x13, x10 + ldr x15, [sp, #896] // 8-byte Folded Reload + add x11, x25, x11, lsl #5 + ldr x0, [sp, #824] // 8-byte Folded Reload + ldr x1, [sp, #192] // 8-byte Folded Reload + ldr x2, [sp, #440] // 8-byte Folded Reload + add x8, x8, x4 + ldr x24, [sp, #856] // 8-byte Folded Reload + add x8, x16, x8, lsl #2 + ldp q20, q21, [x8] + madd x8, x12, x13, x10 + add x12, x25, x12, lsl #5 + add x8, x8, x4 + fmla v0.4s, v21.4s, v16.s[1] + fmla v3.4s, v21.4s, v17.s[1] + fmla v2.4s, v21.4s, v18.s[1] + fmla v5.4s, v21.4s, v19.s[1] + add x8, x16, x8, lsl #2 + fmla v1.4s, v20.4s, v16.s[1] + stp q20, q21, [x11] + fmla v6.4s, v20.4s, v17.s[1] + fmla v4.4s, v20.4s, v18.s[1] + fmla v7.4s, v20.4s, v19.s[1] + ldp q21, q20, [x8] + madd x8, x15, x13, x10 + add x13, x25, x15, lsl #5 + ldr x15, [sp, #512] // 8-byte Folded Reload + add x8, x8, x4 + fmla v0.4s, v20.4s, v16.s[2] + fmla v3.4s, v20.4s, v17.s[2] + fmla v2.4s, v20.4s, v18.s[2] + fmla v5.4s, 
v20.4s, v19.s[2] + add x8, x16, x8, lsl #2 + stp q21, q20, [x12] + fmla v1.4s, v21.4s, v16.s[2] + fmla v6.4s, v21.4s, v17.s[2] + fmla v4.4s, v21.4s, v18.s[2] + fmla v7.4s, v21.4s, v19.s[2] + ldr x16, [sp, #1032] // 8-byte Folded Reload + ldp q20, q21, [x8] + fmla v0.4s, v21.4s, v16.s[3] + fmla v3.4s, v21.4s, v17.s[3] + fmla v2.4s, v21.4s, v18.s[3] + fmla v5.4s, v21.4s, v19.s[3] + fmla v1.4s, v20.4s, v16.s[3] + fmla v6.4s, v20.4s, v17.s[3] + fmla v4.4s, v20.4s, v18.s[3] + fmla v7.4s, v20.4s, v19.s[3] + stp q20, q21, [x13] + ldr x17, [sp, #880] // 8-byte Folded Reload + cmp x16, x21 + b.ge .LBB0_40 + .p2align 2 +.LBB0_39: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x17, x17, x15 + add x8, x1, x14 + add x18, x25, x16, lsl #5 + add x16, x16, #1 + prfm pldl1keep, [x17] + ldur s16, [x17, #-4] + add x17, x17, x20 + add x15, x15, #4 + prfm pldl1keep, [x17] + ldur s17, [x17, #-4] + add x17, x17, x20 + prfm pldl1keep, [x17] + ldur s18, [x17, #-4] + add x17, x17, x20 + prfm pldl1keep, [x17] + ldur s19, [x17, #-4] + add x17, x2, x14 + prfm pldl1keep, [x8] + add x14, x14, x0 + ldp q20, q21, [x17] + fmla v0.4s, v21.4s, v16.s[0] + fmla v3.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, v18.s[0] + fmla v5.4s, v21.4s, v19.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + stp q20, q21, [x18] + ldr x17, [sp, #880] // 8-byte Folded Reload + cmp x16, x21 + b.lt .LBB0_39 +.LBB0_40: // %.preheader52 + // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #48] // 8-byte Folded Reload + ldr x16, [sp, #808] // 8-byte Folded Reload + mov x5, xzr + add x14, x25, #128 + ldr x17, [sp, #816] // 8-byte Folded Reload + mov w2, #1 // =0x1 + mov w3, #2 // =0x2 + mov w1, #3 // =0x3 + mov w18, #4 // =0x4 + add x15, x25, x8 + b .LBB0_42 + .p2align 2 +.LBB0_41: // %.loopexit48 + // in Loop: Header=BB0_42 Depth=2 + ldr x8, [sp, #1008] // 8-byte Folded Reload + mov x5, x18 + mov x18, x4 + ldr x4, [sp, 
#832] // 8-byte Folded Reload + add x17, x17, x8 + add x16, x16, x8 +.LBB0_42: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_44 Depth 3 + // Child Loop BB0_46 Depth 3 + madd x8, x5, x29, x9 + add x8, x8, x4 + madd x0, x2, x29, x9 + madd x2, x3, x29, x9 + add x0, x0, x4 + add x8, x24, x8, lsl #2 + add x0, x24, x0, lsl #2 + stp q1, q0, [x8] + madd x8, x1, x29, x9 + add x8, x8, x4 + stp q6, q3, [x0] + add x0, x2, x4 + add x0, x24, x0, lsl #2 + stp q4, q2, [x0] + add x8, x24, x8, lsl #2 + stp q7, q5, [x8] + ldr x8, [sp, #1024] // 8-byte Folded Reload + cmp x18, x8 + b.ge .LBB0_47 +// %bb.43: // in Loop: Header=BB0_42 Depth=2 + madd x8, x18, x29, x9 + add x2, x18, #1 + add x1, x18, #3 + add x3, x18, #2 + madd x0, x2, x29, x9 + mov x7, x4 + ldr x10, [sp, #848] // 8-byte Folded Reload + mov x5, xzr + madd x6, x3, x29, x9 + ldp q20, q21, [x25] + add x8, x8, x4 + add x8, x24, x8, lsl #2 + add x0, x0, x4 + add x4, x18, #4 + add x0, x24, x0, lsl #2 + ldp q1, q0, [x8] + madd x8, x1, x29, x9 + add x8, x8, x7 + ldp q6, q3, [x0] + add x0, x6, x7 + add x0, x24, x0, lsl #2 + ldp q4, q2, [x0] + ldr x0, [sp, #888] // 8-byte Folded Reload + mov x6, x14 + mov x7, x17 + add x8, x24, x8, lsl #2 + ldp q7, q5, [x8] + madd x8, x18, x19, x10 + lsl x8, x8, #2 + ldr q19, [x0, x8] + madd x8, x2, x19, x10 + lsl x8, x8, #2 + ldr q18, [x0, x8] + madd x8, x3, x19, x10 + lsl x8, x8, #2 + ldr q17, [x0, x8] + madd x8, x1, x19, x10 + lsl x8, x8, #2 + ldr q16, [x0, x8] + cmp xzr, x23 + b.ge .LBB0_45 + .p2align 2 +.LBB0_44: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_42 Depth=2 + // => This Inner Loop Header: Depth=3 + add x8, x6, #32 + fmla v1.4s, v20.4s, v19.s[0] + fmla v0.4s, v21.4s, v19.s[0] + add x5, x5, #4 + prfm pldl1keep, [x8] + ldp q22, q23, [x6, #-96] + fmla v3.4s, v21.4s, v18.s[0] + fmla v6.4s, v20.4s, v18.s[0] + fmla v2.4s, v21.4s, v17.s[0] + fmla v4.4s, v20.4s, v17.s[0] + add x8, x6, #96 + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] 
+ ldp q21, q20, [x6, #-64] + prfm pldl1keep, [x8] + add x8, x7, x20 + add x0, x8, x20 + fmla v0.4s, v23.4s, v19.s[1] + fmla v3.4s, v23.4s, v18.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v5.4s, v23.4s, v16.s[1] + fmla v1.4s, v22.4s, v19.s[1] + fmla v6.4s, v22.4s, v18.s[1] + fmla v4.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v0.4s, v20.4s, v19.s[2] + ldp q22, q23, [x6, #-32] + fmla v3.4s, v20.4s, v18.s[2] + fmla v2.4s, v20.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v1.4s, v21.4s, v19.s[2] + fmla v6.4s, v21.4s, v18.s[2] + fmla v4.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x6], #128 + prfm pldl1keep, [x7] + fmla v0.4s, v23.4s, v19.s[3] + fmla v3.4s, v23.4s, v18.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v5.4s, v23.4s, v16.s[3] + fmla v1.4s, v22.4s, v19.s[3] + ldur q19, [x7, #-16] + prfm pldl1keep, [x8] + fmla v6.4s, v22.4s, v18.s[3] + ldur q18, [x8, #-16] + add x8, x0, x20 + prfm pldl1keep, [x0] + add x7, x7, #16 + fmla v4.4s, v22.4s, v17.s[3] + ldur q17, [x0, #-16] + prfm pldl1keep, [x8] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x8, #-16] + cmp x5, x23 + b.lt .LBB0_44 +.LBB0_45: // in Loop: Header=BB0_42 Depth=2 + ldp q23, q22, [x11] + fmla v0.4s, v21.4s, v19.s[0] + fmla v1.4s, v20.4s, v19.s[0] + fmla v3.4s, v21.4s, v18.s[0] + fmla v6.4s, v20.4s, v18.s[0] + ldr x7, [sp, #1032] // 8-byte Folded Reload + mov x5, x16 + fmla v2.4s, v21.4s, v17.s[0] + fmla v4.4s, v20.4s, v17.s[0] + mov x6, x15 + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x12] + fmla v0.4s, v22.4s, v19.s[1] + fmla v3.4s, v22.4s, v18.s[1] + fmla v2.4s, v22.4s, v17.s[1] + fmla v5.4s, v22.4s, v16.s[1] + fmla v1.4s, v23.4s, v19.s[1] + fmla v6.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v7.4s, v23.4s, v16.s[1] + fmla v0.4s, v21.4s, v19.s[2] + ldp q23, q22, [x13] + fmla v3.4s, v21.4s, v18.s[2] + fmla v2.4s, v21.4s, v17.s[2] + fmla v5.4s, v21.4s, v16.s[2] + fmla v1.4s, v20.4s, v19.s[2] + fmla v6.4s, v20.4s, 
v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v7.4s, v20.4s, v16.s[2] + fmla v0.4s, v22.4s, v19.s[3] + fmla v3.4s, v22.4s, v18.s[3] + fmla v2.4s, v22.4s, v17.s[3] + fmla v5.4s, v22.4s, v16.s[3] + fmla v1.4s, v23.4s, v19.s[3] + fmla v6.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v7.4s, v23.4s, v16.s[3] + cmp x7, x21 + b.ge .LBB0_41 + .p2align 2 +.LBB0_46: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_42 Depth=2 + // => This Inner Loop Header: Depth=3 + add x8, x5, x20 + prfm pldl1keep, [x5] + ldur s16, [x5, #-4] + add x7, x7, #1 + prfm pldl1keep, [x8] + ldur s17, [x8, #-4] + add x8, x8, x20 + add x5, x5, #4 + prfm pldl1keep, [x8] + ldur s18, [x8, #-4] + add x8, x8, x20 + prfm pldl1keep, [x8] + ldur s19, [x8, #-4] + prfm pldl1keep, [x6] + ldp q20, q21, [x6, #-32] + add x6, x6, #32 + fmla v0.4s, v21.4s, v16.s[0] + fmla v3.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + fmla v5.4s, v21.4s, v19.s[0] + cmp x7, x21 + b.lt .LBB0_46 + b .LBB0_41 + .p2align 2 +.LBB0_47: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #1024] // 8-byte Folded Reload + ldr x15, [sp, #920] // 8-byte Folded Reload + cmp x8, x15 + b.ge .LBB0_53 +// %bb.48: // in Loop: Header=BB0_4 Depth=1 + ldr x18, [sp, #1024] // 8-byte Folded Reload + ldr x10, [sp, #848] // 8-byte Folded Reload + mov x17, xzr + madd x8, x18, x29, x9 + ldr x0, [sp, #888] // 8-byte Folded Reload + ldp q6, q7, [x25] + ldr x1, [sp, #616] // 8-byte Folded Reload + ldr x2, [sp, #624] // 8-byte Folded Reload + add x8, x8, x4 + add x15, x24, x8, lsl #2 + add x8, x18, #1 + madd x18, x18, x19, x10 + madd x16, x8, x29, x9 + madd x8, x8, x19, x10 + lsl x18, x18, #2 + ldp q1, q0, [x15] + add x16, x16, x4 + lsl x8, x8, #2 + ldr q5, [x0, x18] + mov x18, x14 + add x16, x24, x16, lsl #2 + ldr q4, [x0, x8] + ldp q3, q2, [x16] + cmp xzr, x23 + b.ge .LBB0_50 + .p2align 2 +.LBB0_49: // Parent Loop 
BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x6, x18, #32 + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + add x5, x18, #96 + prfm pldl1keep, [x6] + ldp q16, q17, [x18, #-96] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x18, #-64] + prfm pldl1keep, [x5] + add x8, x2, x27 + add x3, x1, x27 + add x0, x8, #32 + add x4, x3, #32 + add x17, x17, #4 + add x2, x2, #16 + add x1, x1, #16 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v6.4s, v5.s[2] + ldp q16, q17, [x18, #-32] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + ldp q6, q7, [x18], #128 + prfm pldl1keep, [x4] + ldr x4, [sp, #832] // 8-byte Folded Reload + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + fmla v1.4s, v16.4s, v5.s[3] + ldr q5, [x3, #16] + prfm pldl1keep, [x0] + fmla v3.4s, v16.4s, v4.s[3] + ldr q4, [x8, #16] + cmp x17, x23 + b.lt .LBB0_49 +.LBB0_50: // in Loop: Header=BB0_4 Depth=1 + ldp q17, q16, [x11] + fmla v0.4s, v7.4s, v5.s[0] + fmla v1.4s, v6.4s, v5.s[0] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q6, q7, [x12] + ldr x8, [sp, #64] // 8-byte Folded Reload + ldr x2, [sp, #1032] // 8-byte Folded Reload + mov x17, xzr + mov x18, xzr + fmla v0.4s, v16.4s, v5.s[1] + fmla v2.4s, v16.4s, v4.s[1] + add x1, x25, x8 + fmla v1.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldp q17, q16, [x13] + fmla v0.4s, v7.4s, v5.s[2] + fmla v2.4s, v7.4s, v4.s[2] + fmla v1.4s, v6.4s, v5.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v0.4s, v16.4s, v5.s[3] + fmla v2.4s, v16.4s, v4.s[3] + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + cmp x2, x21 + b.ge .LBB0_52 + .p2align 2 +.LBB0_51: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x5, [sp, #544] // 8-byte Folded Reload + ldr x6, [sp, #656] // 8-byte Folded Reload + add x8, x1, x18, lsl #3 + add x2, x2, #1 + add x8, x8, #32 
+ add x0, x5, x18 + add x3, x6, x18 + add x0, x0, #4 + add x3, x3, #4 + prfm pldl1keep, [x3] + ldr s4, [x6, x18] + prfm pldl1keep, [x0] + ldr s5, [x5, x18] + add x0, x1, x17 + prfm pldl1keep, [x8] + add x18, x18, #4 + add x17, x17, #32 + ldp q6, q7, [x0] + fmla v0.4s, v7.4s, v4.s[0] + fmla v1.4s, v6.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + cmp x2, x21 + b.lt .LBB0_51 +.LBB0_52: // in Loop: Header=BB0_4 Depth=1 + stp q1, q0, [x15] + stp q3, q2, [x16] +.LBB0_53: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #744] // 8-byte Folded Reload + ldr x15, [sp, #920] // 8-byte Folded Reload + cmp x15, x8 + b.ge .LBB0_59 +// %bb.54: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #920] // 8-byte Folded Reload + ldr x10, [sp, #888] // 8-byte Folded Reload + mov x15, xzr + madd x8, x16, x29, x9 + ldp q4, q3, [x25] + ldr x17, [sp, #632] // 8-byte Folded Reload + add x8, x8, x4 + add x9, x24, x8, lsl #2 + ldr x8, [sp, #848] // 8-byte Folded Reload + ldp q1, q0, [x9] + madd x8, x16, x19, x8 + lsl x8, x8, #2 + ldr q2, [x10, x8] + ldr x10, [sp, #640] // 8-byte Folded Reload + cmp xzr, x23 + b.ge .LBB0_56 + .p2align 2 +.LBB0_55: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x16, x14, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x8, x14, #96 + prfm pldl1keep, [x16] + ldp q5, q6, [x14, #-96] + add x15, x15, #4 + ldp q4, q3, [x14, #-64] + prfm pldl1keep, [x8] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x14, #-32] + prfm pldl1keep, [x10] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x10, #-16] + ldp q4, q3, [x14], #128 + add x10, x10, #16 + cmp x15, x23 + b.lt .LBB0_55 +.LBB0_56: // in Loop: Header=BB0_4 Depth=1 + ldp q6, q5, [x11] + fmla v0.4s, v3.4s, v2.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldp q3, q4, [x12] + ldr x8, [sp, #64] // 8-byte Folded Reload + ldr x11, [sp, #1032] // 8-byte Folded 
Reload + mov x10, xzr + mov x14, xzr + fmla v0.4s, v5.4s, v2.s[1] + add x8, x25, x8 + fmla v1.4s, v6.4s, v2.s[1] + ldp q6, q5, [x13] + fmla v0.4s, v4.4s, v2.s[2] + fmla v1.4s, v3.4s, v2.s[2] + fmla v0.4s, v5.4s, v2.s[3] + fmla v1.4s, v6.4s, v2.s[3] + cmp x11, x21 + b.ge .LBB0_58 + .p2align 2 +.LBB0_57: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x14, lsl #3 + add x13, x17, x14 + add x11, x11, #1 + add x13, x13, #4 + add x12, x12, #32 + prfm pldl1keep, [x13] + ldr s2, [x17, x14] + add x13, x8, x10 + add x14, x14, #4 + add x10, x10, #32 + prfm pldl1keep, [x12] + ldp q3, q4, [x13] + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v3.4s, v2.s[0] + cmp x11, x21 + b.lt .LBB0_57 +.LBB0_58: // in Loop: Header=BB0_4 Depth=1 + stp q1, q0, [x9] +.LBB0_59: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #120] // 8-byte Folded Reload + bl free + ldr x3, [sp, #768] // 8-byte Folded Reload + ldr x4, [sp, #600] // 8-byte Folded Reload + cmp x3, x4 + b.ge .LBB0_33 +.LBB0_60: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #80] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #736] // 8-byte Folded Reload + ldr x11, [sp, #776] // 8-byte Folded Reload + mov x12, xzr + mul x9, x11, x8 + ldr x8, [sp, #752] // 8-byte Folded Reload + ldr x10, [sp, #768] // 8-byte Folded Reload + ldr x18, [sp, #520] // 8-byte Folded Reload + add x13, x18, x29 + ldp x1, x2, [sp, #408] // 16-byte Folded Reload + ldp x3, x4, [sp, #424] // 16-byte Folded Reload + ldr x5, [sp, #872] // 8-byte Folded Reload + ldr x6, [sp, #784] // 8-byte Folded Reload + ldr x7, [sp, #800] // 8-byte Folded Reload + mul x15, x11, x8 + add x14, x9, x10 + add x8, x0, #63 + lsl x16, x14, #2 + add x17, x14, x29 + add x18, x14, x18 + add x13, x14, x13 + and x8, x8, #0xffffffffffffffc0 + lsl x13, x13, #2 + ldr q0, [x24, x16] + lsl x16, x18, #2 + ldr x18, [sp, #792] // 8-byte Folded Reload + ldr q3, [x24, x13] + ldr q1, [x24, x16] + stp x15, x0, [sp, #112] // 16-byte Folded Spill + 
add x15, x15, x10 + ldr x10, [sp, #760] // 8-byte Folded Reload + lsl x14, x15, #2 + lsl x15, x17, #2 + ldp x16, x17, [sp, #392] // 16-byte Folded Reload + mul x11, x11, x10 + ldr x10, [sp, #840] // 8-byte Folded Reload + ldr q2, [x24, x15] + lsl x13, x11, #2 + ldr q7, [x10, x14] + ldr x10, [sp, #888] // 8-byte Folded Reload + str x11, [sp, #848] // 8-byte Folded Spill + ldp x14, x15, [sp, #376] // 16-byte Folded Reload + ldr q4, [x10, x13] + add x13, x11, x19 + lsl x13, x13, #2 + ldr q5, [x10, x13] + add x13, x11, x19, lsl #1 + lsl x13, x13, #2 + ldr q6, [x10, x13] + orr x13, x8, #0x20 + .p2align 2 +.LBB0_61: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x24, x18, x27 + fmla v0.4s, v7.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + cmp x12, x23 + prfm pldl1keep, [x24, #16] + ldr q16, [x24] + b.ge .LBB0_63 +// %bb.62: // in Loop: Header=BB0_61 Depth=2 + ldr x10, [sp, #864] // 8-byte Folded Reload + stur q7, [x13, #-32] + fmla v1.4s, v7.4s, v6.s[0] + fmla v3.4s, v7.4s, v16.s[0] + add x25, x6, x27 + add x26, x5, x27 + add x11, x25, #32 + add x0, x26, #32 + add x12, x12, #4 + add x6, x6, #16 + add x5, x5, #16 + add x18, x18, #16 + add x24, x1, x10 + add x28, x4, x10 + prfm pldl1keep, [x24] + add x24, x2, x10 + ldr q17, [x16, x10] + stur q17, [x13, #-16] + prfm pldl1keep, [x24] + ldr q18, [x15, x10] + add x24, x3, x10 + fmla v0.4s, v17.4s, v4.s[1] + fmla v2.4s, v17.4s, v5.s[1] + fmla v1.4s, v17.4s, v6.s[1] + fmla v3.4s, v17.4s, v16.s[1] + str q18, [x13] + prfm pldl1keep, [x24] + ldr q19, [x14, x10] + fmla v0.4s, v18.4s, v4.s[2] + fmla v2.4s, v18.4s, v5.s[2] + fmla v1.4s, v18.4s, v6.s[2] + add x24, x7, x27 + fmla v3.4s, v18.4s, v16.s[2] + add x7, x7, #16 + add x30, x24, #32 + str q19, [x13, #16] + prfm pldl1keep, [x28] + ldr q7, [x17, x10] + fmla v0.4s, v19.4s, v4.s[3] + fmla v2.4s, v19.4s, v5.s[3] + fmla v1.4s, v19.4s, v6.s[3] + prfm pldl1keep, [x0] + ldr q4, [x26, #16] + prfm pldl1keep, [x11] + ldr q5, [x25, #16] + prfm pldl1keep, [x30] + ldr 
x10, [sp, #1016] // 8-byte Folded Reload + ldr q6, [x24, #16] + fmla v3.4s, v19.4s, v16.s[3] + add x13, x13, #64 + add x4, x4, x10 + add x3, x3, x10 + add x2, x2, x10 + add x1, x1, x10 + add x17, x17, x10 + add x16, x16, x10 + add x15, x15, x10 + add x14, x14, x10 + b .LBB0_61 + .p2align 2 +.LBB0_63: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #728] // 8-byte Folded Reload + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v1.4s, v7.4s, v6.s[0] + fmla v3.4s, v7.4s, v16.s[0] + ldr x15, [sp, #112] // 8-byte Folded Reload + ldr x5, [sp, #768] // 8-byte Folded Reload + mov x12, xzr + madd x11, x10, x13, x15 + ldr x14, [sp, #840] // 8-byte Folded Reload + str q7, [x8, x23, lsl #4] + ldr x6, [sp, #576] // 8-byte Folded Reload + ldr x7, [sp, #568] // 8-byte Folded Reload + ldr x17, [sp, #880] // 8-byte Folded Reload + ldr x24, [sp, #856] // 8-byte Folded Reload + add x11, x11, x5 + lsl x11, x11, #2 + ldr q7, [x14, x11] + fmla v0.4s, v7.4s, v4.s[1] + str q7, [x8, x10, lsl #4] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v2.4s, v7.4s, v5.s[1] + fmla v1.4s, v7.4s, v6.s[1] + fmla v3.4s, v7.4s, v16.s[1] + madd x11, x10, x13, x15 + add x11, x11, x5 + lsl x11, x11, #2 + ldr q17, [x14, x11] + fmla v0.4s, v17.4s, v4.s[2] + str q17, [x8, x10, lsl #4] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v2.4s, v17.4s, v5.s[2] + fmla v1.4s, v17.4s, v6.s[2] + fmla v3.4s, v17.4s, v16.s[2] + madd x11, x10, x13, x15 + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x11, x11, x5 + lsl x11, x11, #2 + ldr q7, [x14, x11] + ldr x11, [sp, #512] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[3] + fmla v2.4s, v7.4s, v5.s[3] + fmla v1.4s, v7.4s, v6.s[3] + fmla v3.4s, v7.4s, v16.s[3] + str q7, [x8, x10, lsl #4] + ldp x16, x10, [sp, #360] // 16-byte Folded Reload + cmp x13, x21 + b.ge .LBB0_65 + .p2align 2 +.LBB0_64: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x15, x17, x11 + add x14, x10, x12 + add x11, x11, #4 + prfm pldl1keep, [x15] + ldur s4, 
[x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s5, [x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s6, [x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s7, [x15, #-4] + prfm pldl1keep, [x14] + ldr x14, [sp, #824] // 8-byte Folded Reload + ldr q16, [x16, x12] + add x12, x12, x14 + fmla v0.4s, v16.4s, v4.s[0] + str q16, [x8, x13, lsl #4] + add x13, x13, #1 + fmla v2.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x13, x21 + b.lt .LBB0_64 +.LBB0_65: // %.preheader51 + // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #40] // 8-byte Folded Reload + ldr x13, [sp, #808] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #48 + ldr x14, [sp, #816] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w18, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x10 + b .LBB0_67 + .p2align 2 +.LBB0_66: // %.loopexit47 + // in Loop: Header=BB0_67 Depth=2 + ldr x10, [sp, #1008] // 8-byte Folded Reload + mov x2, x15 + mov x15, x1 + add x14, x14, x10 + add x13, x13, x10 +.LBB0_67: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + madd x0, x2, x29, x9 + add x0, x0, x5 + madd x16, x16, x29, x9 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x16, x16, x5 + add x17, x17, x5 + lsl x0, x0, #2 + lsl x16, x16, #2 + lsl x17, x17, #2 + str q0, [x24, x0] + str q2, [x24, x16] + add x16, x18, x5 + lsl x16, x16, #2 + str q1, [x24, x17] + str q3, [x24, x16] + ldr x16, [sp, #1024] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_72 +// %bb.68: // in Loop: Header=BB0_67 Depth=2 + madd x0, x15, x29, x9 + add x17, x15, #2 + add x18, x15, #3 + ldr x10, [sp, #888] // 8-byte Folded Reload + madd x3, x17, x29, x9 + add x16, x15, #1 + ldr q16, [x8] + mov x2, xzr + madd x1, x16, x29, x9 + mov x4, x14 + add x0, x0, x5 + lsl x0, x0, #2 + add x3, x3, x5 + add x1, x1, x5 + ldr q0, [x24, x0] + madd x0, x18, x29, x9 + lsl x3, x3, 
#2 + lsl x1, x1, #2 + add x0, x0, x5 + ldr q1, [x24, x3] + ldr x3, [sp, #848] // 8-byte Folded Reload + lsl x0, x0, #2 + ldr q2, [x24, x1] + add x1, x15, #4 + ldr q3, [x24, x0] + madd x0, x15, x19, x3 + lsl x0, x0, #2 + ldr q7, [x10, x0] + madd x0, x16, x19, x3 + lsl x0, x0, #2 + ldr q6, [x10, x0] + madd x0, x17, x19, x3 + lsl x0, x0, #2 + ldr q5, [x10, x0] + madd x0, x18, x19, x3 + mov x3, x11 + lsl x0, x0, #2 + ldr q4, [x10, x0] + cmp xzr, x23 + b.ge .LBB0_70 + .p2align 2 +.LBB0_69: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_67 Depth=2 + // => This Inner Loop Header: Depth=3 + add x0, x3, #32 + fmla v0.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + add x2, x2, #4 + fmla v1.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x0] + add x0, x4, x20 + ldp q16, q17, [x3, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v2.4s, v16.4s, v6.s[1] + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v2.4s, v17.4s, v6.s[2] + fmla v1.4s, v17.4s, v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x3], #64 + prfm pldl1keep, [x4] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x0] + fmla v2.4s, v17.4s, v6.s[3] + ldur q6, [x0, #-16] + add x0, x0, x20 + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x0] + ldur q5, [x0, #-16] + add x0, x0, x20 + prfm pldl1keep, [x0] + ldur q4, [x0, #-16] + cmp x2, x23 + b.lt .LBB0_69 +.LBB0_70: // in Loop: Header=BB0_67 Depth=2 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v0.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + ldr x4, [sp, #1032] // 8-byte Folded Reload + fmla v1.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + mov x2, x13 + mov x3, x12 + ldr q17, [x8, x10, lsl #4] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v0.4s, v17.4s, v7.s[1] + ldr q16, [x8, x10, lsl #4] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v2.4s, v17.4s, v6.s[1] + fmla v1.4s, v17.4s, v5.s[1] + fmla 
v3.4s, v17.4s, v4.s[1] + ldr q18, [x8, x10, lsl #4] + fmla v0.4s, v16.4s, v7.s[2] + fmla v2.4s, v16.4s, v6.s[2] + fmla v1.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v2.4s, v18.4s, v6.s[3] + fmla v1.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x4, x21 + b.ge .LBB0_66 + .p2align 2 +.LBB0_71: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_67 Depth=2 + // => This Inner Loop Header: Depth=3 + add x0, x2, x20 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x0] + ldur s5, [x0, #-4] + add x0, x0, x20 + add x2, x2, #4 + prfm pldl1keep, [x0] + ldur s6, [x0, #-4] + add x0, x0, x20 + prfm pldl1keep, [x0] + ldur s7, [x0, #-4] + prfm pldl1keep, [x3] + ldur q16, [x3, #-16] + add x3, x3, #16 + fmla v0.4s, v16.4s, v4.s[0] + fmla v2.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x4, x21 + b.lt .LBB0_71 + b .LBB0_66 + .p2align 2 +.LBB0_72: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #1024] // 8-byte Folded Reload + ldr x14, [sp, #920] // 8-byte Folded Reload + cmp x13, x14 + b.ge .LBB0_78 +// %bb.73: // in Loop: Header=BB0_4 Depth=1 + ldr x17, [sp, #1024] // 8-byte Folded Reload + ldr x18, [sp, #848] // 8-byte Folded Reload + mov x15, xzr + add x16, x17, #1 + madd x13, x17, x29, x9 + madd x17, x17, x19, x18 + ldr x10, [sp, #888] // 8-byte Folded Reload + ldr q4, [x8] + madd x14, x16, x29, x9 + madd x16, x16, x19, x18 + ldr x18, [sp, #624] // 8-byte Folded Reload + add x13, x13, x5 + lsl x17, x17, #2 + add x14, x14, x5 + add x13, x24, x13, lsl #2 + lsl x16, x16, #2 + ldr q3, [x10, x17] + ldr x17, [sp, #616] // 8-byte Folded Reload + add x14, x24, x14, lsl #2 + ldr q2, [x10, x16] + mov x16, x11 + ldr q0, [x13] + ldr q1, [x14] + cmp xzr, x23 + b.ge .LBB0_75 + .p2align 2 +.LBB0_74: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x4, x16, #32 + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + add x0, x18, x27 + prfm 
pldl1keep, [x4] + ldp q4, q5, [x16, #-32] + add x2, x17, x27 + add x1, x0, #32 + add x3, x2, #32 + add x15, x15, #4 + add x18, x18, #16 + add x17, x17, #16 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x16], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x0, #16] + cmp x15, x23 + b.lt .LBB0_74 +.LBB0_75: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldr x15, [sp, #880] // 8-byte Folded Reload + ldr x16, [sp, #1032] // 8-byte Folded Reload + ldr q5, [x8, x10, lsl #4] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr q4, [x8, x10, lsl #4] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x10, lsl #4] + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x16, x21 + b.ge .LBB0_77 + .p2align 2 +.LBB0_76: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x17, x15, x7 + add x18, x15, x6 + add x16, x16, #1 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x15, x6] + prfm pldl1keep, [x17] + ldr s3, [x15, x7] + prfm pldl1keep, [x12] + ldur q4, [x12, #-16] + add x12, x12, #16 + add x15, x15, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + cmp x16, x21 + b.lt .LBB0_76 +.LBB0_77: // in Loop: Header=BB0_4 Depth=1 + str q0, [x13] + str q1, [x14] +.LBB0_78: // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #744] // 8-byte Folded Reload + ldr x13, [sp, #920] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_84 +// %bb.79: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #920] // 8-byte Folded Reload + ldr x10, [sp, #848] // 8-byte Folded Reload + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x19, x10 + ldr 
x13, [sp, #888] // 8-byte Folded Reload + ldr q2, [x8] + ldr x14, [sp, #632] // 8-byte Folded Reload + ldr x15, [sp, #880] // 8-byte Folded Reload + add x9, x9, x5 + lsl x10, x10, #2 + add x9, x24, x9, lsl #2 + ldr q1, [x13, x10] + ldr x10, [sp, #640] // 8-byte Folded Reload + ldr q0, [x9] + cmp xzr, x23 + b.ge .LBB0_81 + .p2align 2 +.LBB0_80: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x11, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x11, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x11], #64 + prfm pldl1keep, [x10] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x23 + b.lt .LBB0_80 +.LBB0_81: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #912] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + mov x10, xzr + mov w12, #16 // =0x10 + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #904] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x11, lsl #4] + ldr x11, [sp, #896] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #24] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #560] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[3] + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x13, x13, xzr + cmp x13, x21 + b.ge .LBB0_83 + .p2align 2 +.LBB0_82: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x15, x11 + add x11, x11, #4 + prfm pldl1keep, [x13] + add x13, x8, x12 + ldr s1, [x14, x10, lsl #2] + add x12, x12, #16 + prfm pldl1keep, [x13] + ldr q2, [x8, x10, lsl #4] + add x10, x10, #1 + fmla v0.4s, v2.4s, v1.s[0] + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x13, x13, x10 + cmp x13, x21 + b.lt .LBB0_82 +.LBB0_83: // in Loop: Header=BB0_4 Depth=1 + str q0, [x9] +.LBB0_84: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #120] // 8-byte Folded Reload + bl free + ldr x4, [sp, #600] // 8-byte Folded Reload + 
ldr x8, [sp, #648] // 8-byte Folded Reload + cmp x4, x8 + b.ge .LBB0_34 +.LBB0_85: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #72] // 8-byte Folded Reload + mov x24, x4 + add x0, x8, #64 + bl malloc + ldr x8, [sp, #736] // 8-byte Folded Reload + ldr x11, [sp, #776] // 8-byte Folded Reload + mov x12, xzr + mul x9, x11, x8 + ldr x8, [sp, #752] // 8-byte Folded Reload + ldr x18, [sp, #520] // 8-byte Folded Reload + str x0, [sp, #848] // 8-byte Folded Spill + add x13, x18, x29 + ldp x1, x2, [sp, #328] // 16-byte Folded Reload + ldp x3, x4, [sp, #344] // 16-byte Folded Reload + ldr x5, [sp, #872] // 8-byte Folded Reload + ldr x6, [sp, #784] // 8-byte Folded Reload + ldr x7, [sp, #800] // 8-byte Folded Reload + mul x10, x11, x8 + add x8, x0, #63 + add x14, x9, x24 + lsl x16, x14, #2 + add x17, x14, x29 + add x18, x14, x18 + add x13, x14, x13 + and x8, x8, #0xffffffffffffffc0 + lsl x13, x13, #2 + str x10, [sp, #112] // 8-byte Folded Spill + add x15, x10, x24 + ldr x10, [sp, #760] // 8-byte Folded Reload + lsl x14, x15, #2 + lsl x15, x17, #2 + mul x0, x11, x10 + ldr x10, [sp, #856] // 8-byte Folded Reload + ldr x11, [sp, #840] // 8-byte Folded Reload + ldr d0, [x10, x16] + lsl x16, x18, #2 + ldr x18, [sp, #792] // 8-byte Folded Reload + ldr d2, [x10, x15] + ldr d3, [x10, x13] + lsl x13, x0, #2 + ldr d7, [x11, x14] + str x0, [sp, #120] // 8-byte Folded Spill + ldp x14, x15, [sp, #296] // 16-byte Folded Reload + ldr d1, [x10, x16] + ldr x10, [sp, #888] // 8-byte Folded Reload + ldp x16, x17, [sp, #312] // 16-byte Folded Reload + ldr q4, [x10, x13] + add x13, x0, x19 + lsl x13, x13, #2 + ldr q5, [x10, x13] + add x13, x0, x19, lsl #1 + lsl x13, x13, #2 + ldr q6, [x10, x13] + orr x13, x8, #0x10 + .p2align 2 +.LBB0_86: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x24, x18, x27 + fmla v0.2s, v7.2s, v4.s[0] + fmla v2.2s, v7.2s, v5.s[0] + cmp x12, x23 + prfm pldl1keep, [x24, #16] + ldr q16, [x24] + b.ge .LBB0_88 +// %bb.87: // in Loop: 
Header=BB0_86 Depth=2 + ldr x10, [sp, #864] // 8-byte Folded Reload + stur d7, [x13, #-16] + fmla v1.2s, v7.2s, v6.s[0] + fmla v3.2s, v7.2s, v16.s[0] + add x25, x6, x27 + add x26, x5, x27 + add x11, x25, #32 + add x0, x26, #32 + add x12, x12, #4 + add x6, x6, #16 + add x5, x5, #16 + add x18, x18, #16 + add x24, x1, x10 + add x28, x4, x10 + prfm pldl1keep, [x24] + add x24, x2, x10 + ldr d17, [x16, x10] + stur d17, [x13, #-8] + prfm pldl1keep, [x24] + ldr d18, [x15, x10] + add x24, x3, x10 + fmla v0.2s, v17.2s, v4.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v1.2s, v17.2s, v6.s[1] + fmla v3.2s, v17.2s, v16.s[1] + str d18, [x13] + prfm pldl1keep, [x24] + ldr d19, [x14, x10] + fmla v0.2s, v18.2s, v4.s[2] + fmla v2.2s, v18.2s, v5.s[2] + fmla v1.2s, v18.2s, v6.s[2] + add x24, x7, x27 + fmla v3.2s, v18.2s, v16.s[2] + add x7, x7, #16 + add x30, x24, #32 + str d19, [x13, #8] + prfm pldl1keep, [x28] + ldr d7, [x17, x10] + fmla v0.2s, v19.2s, v4.s[3] + fmla v2.2s, v19.2s, v5.s[3] + fmla v1.2s, v19.2s, v6.s[3] + prfm pldl1keep, [x0] + ldr q4, [x26, #16] + prfm pldl1keep, [x11] + ldr q5, [x25, #16] + prfm pldl1keep, [x30] + ldr x10, [sp, #1016] // 8-byte Folded Reload + ldr q6, [x24, #16] + fmla v3.2s, v19.2s, v16.s[3] + add x13, x13, #32 + add x4, x4, x10 + add x3, x3, x10 + add x2, x2, x10 + add x1, x1, x10 + add x17, x17, x10 + add x16, x16, x10 + add x15, x15, x10 + add x14, x14, x10 + b .LBB0_86 + .p2align 2 +.LBB0_88: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #728] // 8-byte Folded Reload + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v6.s[0] + fmla v3.2s, v7.2s, v16.s[0] + ldp x15, x6, [sp, #112] // 16-byte Folded Reload + ldr x4, [sp, #600] // 8-byte Folded Reload + mov x12, xzr + madd x11, x10, x13, x15 + ldr x14, [sp, #840] // 8-byte Folded Reload + str d7, [x8, x23, lsl #3] + ldr x16, [sp, #824] // 8-byte Folded Reload + ldr x18, [sp, #880] // 8-byte Folded Reload + ldr x5, [sp, #856] // 8-byte Folded Reload + add x11, x11, x4 + lsl x11, x11, 
#2 + ldr d7, [x14, x11] + fmla v0.2s, v7.2s, v4.s[1] + str d7, [x8, x10, lsl #3] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v3.2s, v7.2s, v16.s[1] + madd x11, x10, x13, x15 + add x11, x11, x4 + lsl x11, x11, #2 + ldr d17, [x14, x11] + fmla v0.2s, v17.2s, v4.s[2] + str d17, [x8, x10, lsl #3] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v2.2s, v17.2s, v5.s[2] + fmla v1.2s, v17.2s, v6.s[2] + fmla v3.2s, v17.2s, v16.s[2] + madd x11, x10, x13, x15 + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x11, x11, x4 + lsl x11, x11, #2 + ldr d7, [x14, x11] + ldr x11, [sp, #512] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[3] + fmla v2.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v3.2s, v7.2s, v16.s[3] + str d7, [x8, x10, lsl #3] + ldp x17, x10, [sp, #280] // 16-byte Folded Reload + cmp x13, x21 + b.ge .LBB0_90 + .p2align 2 +.LBB0_89: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x15, x18, x11 + add x14, x10, x12 + add x11, x11, #4 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s5, [x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s6, [x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s7, [x15, #-4] + prfm pldl1keep, [x14] + ldr d16, [x17, x12] + add x12, x12, x16 + fmla v0.2s, v16.2s, v4.s[0] + str d16, [x8, x13, lsl #3] + add x13, x13, #1 + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x13, x21 + b.lt .LBB0_89 +.LBB0_90: // %.preheader50 + // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #32] // 8-byte Folded Reload + ldr x13, [sp, #808] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #24 + ldr x14, [sp, #816] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w18, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x10 + b .LBB0_92 + .p2align 2 +.LBB0_91: // %.loopexit46 + // in Loop: Header=BB0_92 Depth=2 + ldr 
x10, [sp, #1008] // 8-byte Folded Reload + ldr x4, [sp, #600] // 8-byte Folded Reload + mov x2, x15 + mov x15, x1 + add x14, x14, x10 + add x13, x13, x10 +.LBB0_92: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_94 Depth 3 + // Child Loop BB0_96 Depth 3 + madd x0, x2, x29, x9 + add x0, x0, x4 + madd x16, x16, x29, x9 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x16, x16, x4 + add x17, x17, x4 + lsl x0, x0, #2 + lsl x16, x16, #2 + lsl x17, x17, #2 + str d0, [x5, x0] + str d2, [x5, x16] + add x16, x18, x4 + lsl x16, x16, #2 + str d1, [x5, x17] + str d3, [x5, x16] + ldr x16, [sp, #1024] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_97 +// %bb.93: // in Loop: Header=BB0_92 Depth=2 + madd x0, x15, x29, x9 + add x18, x15, #3 + ldr x10, [sp, #888] // 8-byte Folded Reload + add x16, x15, #1 + add x17, x15, #2 + madd x1, x16, x29, x9 + ldr d16, [x8] + mov x2, xzr + madd x3, x17, x29, x9 + add x0, x0, x4 + lsl x0, x0, #2 + add x1, x1, x4 + add x3, x3, x4 + ldr d0, [x5, x0] + madd x0, x18, x29, x9 + lsl x1, x1, #2 + lsl x3, x3, #2 + ldr d2, [x5, x1] + ldr d1, [x5, x3] + add x1, x15, #4 + mov x3, x11 + add x0, x0, x4 + mov x4, x14 + lsl x0, x0, #2 + ldr d3, [x5, x0] + madd x0, x15, x19, x6 + lsl x0, x0, #2 + ldr q7, [x10, x0] + madd x0, x16, x19, x6 + lsl x0, x0, #2 + ldr q6, [x10, x0] + madd x0, x17, x19, x6 + lsl x0, x0, #2 + ldr q5, [x10, x0] + madd x0, x18, x19, x6 + lsl x0, x0, #2 + ldr q4, [x10, x0] + cmp xzr, x23 + b.ge .LBB0_95 + .p2align 2 +.LBB0_94: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_92 Depth=2 + // => This Inner Loop Header: Depth=3 + add x0, x3, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + add x2, x2, #4 + fmla v1.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x0] + add x0, x4, x20 + ldp d16, d17, [x3, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] 
+ fmla v2.2s, v17.2s, v6.s[2] + fmla v1.2s, v17.2s, v5.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x3], #32 + prfm pldl1keep, [x4] + fmla v0.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x0] + fmla v2.2s, v17.2s, v6.s[3] + ldur q6, [x0, #-16] + add x0, x0, x20 + fmla v1.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x0] + ldur q5, [x0, #-16] + add x0, x0, x20 + prfm pldl1keep, [x0] + ldur q4, [x0, #-16] + cmp x2, x23 + b.lt .LBB0_94 +.LBB0_95: // in Loop: Header=BB0_92 Depth=2 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + ldr x4, [sp, #1032] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + mov x2, x13 + mov x3, x12 + ldr d17, [x8, x10, lsl #3] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v0.2s, v17.2s, v7.s[1] + ldr d16, [x8, x10, lsl #3] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v2.2s, v17.2s, v6.s[1] + fmla v1.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + ldr d18, [x8, x10, lsl #3] + fmla v0.2s, v16.2s, v7.s[2] + fmla v2.2s, v16.2s, v6.s[2] + fmla v1.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v2.2s, v18.2s, v6.s[3] + fmla v1.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x4, x21 + b.ge .LBB0_91 + .p2align 2 +.LBB0_96: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_92 Depth=2 + // => This Inner Loop Header: Depth=3 + add x0, x2, x20 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x0] + ldur s5, [x0, #-4] + add x0, x0, x20 + add x2, x2, #4 + prfm pldl1keep, [x0] + ldur s6, [x0, #-4] + add x0, x0, x20 + prfm pldl1keep, [x0] + ldur s7, [x0, #-4] + prfm pldl1keep, [x3] + ldur d16, [x3, #-8] + add x3, x3, #8 + fmla v0.2s, v16.2s, v4.s[0] + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x4, x21 + b.lt .LBB0_96 + b .LBB0_91 + .p2align 2 +.LBB0_97: // in 
Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #920] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_103 +// %bb.98: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #1024] // 8-byte Folded Reload + ldr x10, [sp, #888] // 8-byte Folded Reload + mov x14, xzr + add x15, x16, #1 + madd x12, x16, x29, x9 + madd x16, x16, x19, x6 + ldr d4, [x8] + ldr x17, [sp, #624] // 8-byte Folded Reload + madd x13, x15, x29, x9 + madd x15, x15, x19, x6 + add x12, x12, x4 + lsl x16, x16, #2 + add x13, x13, x4 + add x12, x5, x12, lsl #2 + lsl x15, x15, #2 + add x13, x5, x13, lsl #2 + ldr q3, [x10, x16] + ldr q2, [x10, x15] + ldr x16, [sp, #616] // 8-byte Folded Reload + mov x15, x11 + ldr d0, [x12] + ldr d1, [x13] + cmp xzr, x23 + b.ge .LBB0_100 + .p2align 2 +.LBB0_99: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x3, x15, #16 + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + add x18, x17, x27 + prfm pldl1keep, [x3] + ldp d4, d5, [x15, #-16] + add x1, x16, x27 + add x0, x18, #32 + add x2, x1, #32 + add x14, x14, #4 + add x17, x17, #16 + add x16, x16, #16 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, v5.2s, v2.s[2] + ldp d5, d4, [x15], #32 + prfm pldl1keep, [x2] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x1, #16] + prfm pldl1keep, [x0] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x18, #16] + cmp x14, x23 + b.lt .LBB0_99 +.LBB0_100: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr x1, [sp, #544] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + ldr d5, [x8, x10, lsl #3] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr d4, [x8, x10, lsl #3] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x10, lsl #3] + ldr x10, [sp, #56] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, 
v4.2s, v2.s[2] + add x16, x8, x10 + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + ldr x10, [sp, #1032] // 8-byte Folded Reload + add x17, x10, xzr + cmp x17, x21 + b.ge .LBB0_102 + .p2align 2 +.LBB0_101: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x10, [sp, #656] // 8-byte Folded Reload + add x17, x16, x15, lsl #3 + add x18, x1, x14 + add x18, x18, #4 + add x17, x17, #8 + add x0, x10, x14 + add x14, x14, #4 + add x0, x0, #4 + prfm pldl1keep, [x0] + ldr s2, [x10, x15, lsl #2] + prfm pldl1keep, [x18] + ldr s3, [x1, x15, lsl #2] + prfm pldl1keep, [x17] + ldr d4, [x16, x15, lsl #3] + add x15, x15, #1 + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + ldr x10, [sp, #1032] // 8-byte Folded Reload + add x17, x10, x15 + cmp x17, x21 + b.lt .LBB0_101 +.LBB0_102: // in Loop: Header=BB0_4 Depth=1 + str d0, [x12] + str d1, [x13] +.LBB0_103: // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #744] // 8-byte Folded Reload + ldr x13, [sp, #920] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_109 +// %bb.104: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #920] // 8-byte Folded Reload + ldr d2, [x8] + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x19, x6 + ldr x13, [sp, #888] // 8-byte Folded Reload + ldr x14, [sp, #632] // 8-byte Folded Reload + ldr x15, [sp, #880] // 8-byte Folded Reload + add x9, x9, x4 + lsl x10, x10, #2 + add x9, x5, x9, lsl #2 + ldr d0, [x9] + ldr q1, [x13, x10] + ldr x10, [sp, #640] // 8-byte Folded Reload + cmp xzr, x23 + b.ge .LBB0_106 + .p2align 2 +.LBB0_105: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x11, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp d2, d3, [x11, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x11], #32 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x23 + b.lt .LBB0_105 +.LBB0_106: // in Loop: Header=BB0_4 
Depth=1 + ldr x11, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[0] + mov x10, xzr + ldr d3, [x8, x11, lsl #3] + ldr x11, [sp, #904] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + ldr d4, [x8, x11, lsl #3] + ldr x11, [sp, #896] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[2] + ldr d2, [x8, x11, lsl #3] + ldr x11, [sp, #56] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #560] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[3] + ldr x12, [sp, #1032] // 8-byte Folded Reload + add x12, x12, xzr + cmp x12, x21 + b.ge .LBB0_108 + .p2align 2 +.LBB0_107: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x10, lsl #3 + add x13, x15, x11 + add x11, x11, #4 + prfm pldl1keep, [x13] + ldr s1, [x14, x10, lsl #2] + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr d2, [x8, x10, lsl #3] + add x10, x10, #1 + fmla v0.2s, v2.2s, v1.s[0] + ldr x12, [sp, #1032] // 8-byte Folded Reload + add x12, x12, x10 + cmp x12, x21 + b.lt .LBB0_107 +.LBB0_108: // in Loop: Header=BB0_4 Depth=1 + str d0, [x9] +.LBB0_109: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #848] // 8-byte Folded Reload + bl free + ldr x8, [sp, #144] // 8-byte Folded Reload + ldr x9, [sp, #648] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_3 +.LBB0_110: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #96] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #736] // 8-byte Folded Reload + ldr x17, [sp, #776] // 8-byte Folded Reload + add x10, x0, #63 + mov x12, xzr + ldr x15, [sp, #520] // 8-byte Folded Reload + ldr x16, [sp, #648] // 8-byte Folded Reload + mov x13, xzr + mul x9, x17, x8 + ldr x24, [sp, #856] // 8-byte Folded Reload + add x8, x15, x29 + ldp x1, x2, [sp, #248] // 16-byte Folded Reload + ldp x3, x4, [sp, #264] // 16-byte Folded Reload + str x0, [sp, #120] // 8-byte Folded Spill + add x11, x9, x16 + add x8, x11, x8 + add x14, x11, x29 + add x15, x11, x15 + ldr s2, [x24, x11, lsl #2] + ldr x11, [sp, #840] // 8-byte 
Folded Reload + ldr s0, [x24, x8, lsl #2] + and x8, x10, #0xffffffffffffffc0 + ldr x10, [sp, #752] // 8-byte Folded Reload + ldr s3, [x24, x14, lsl #2] + ldr s1, [x24, x15, lsl #2] + mul x10, x17, x10 + str x10, [sp, #520] // 8-byte Folded Spill + add x10, x10, x16 + ldp x15, x16, [sp, #216] // 16-byte Folded Reload + ldr s7, [x11, x10, lsl #2] + ldr x10, [sp, #760] // 8-byte Folded Reload + mul x11, x17, x10 + ldr x10, [sp, #888] // 8-byte Folded Reload + lsl x14, x11, #2 + ldp x17, x18, [sp, #232] // 16-byte Folded Reload + str x11, [sp, #776] // 8-byte Folded Spill + ldr q4, [x10, x14] + add x14, x11, x19 + lsl x14, x14, #2 + ldr q5, [x10, x14] + add x14, x11, x19, lsl #1 + lsl x14, x14, #2 + ldr q6, [x10, x14] + orr x10, x8, #0xc + str x10, [sp, #848] // 8-byte Folded Spill + .p2align 2 +.LBB0_111: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x10, [sp, #608] // 8-byte Folded Reload + ext v20.16b, v4.16b, v4.16b, #8 + cmp x13, x23 + ext v19.16b, v5.16b, v5.16b, #8 + add x5, x10, x12 + prfm pldl1keep, [x5, #16] + ldr q16, [x5] + ext v18.16b, v6.16b, v6.16b, #8 + ext v17.16b, v16.16b, v16.16b, #8 + b.ge .LBB0_113 +// %bb.112: // in Loop: Header=BB0_111 Depth=2 + ldr x10, [sp, #592] // 8-byte Folded Reload + ldr x14, [sp, #864] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v4.2s + fmla v3.2s, v7.2s, v5.2s + fmla v1.2s, v7.2s, v6.2s + fmla v0.2s, v7.2s, v16.2s + add x13, x13, #4 + add x5, x10, x12 + ldr x10, [sp, #584] // 8-byte Folded Reload + add x0, x1, x14 + add x11, x2, x14 + add x24, x3, x14 + add x30, x4, x14 + add x6, x5, #32 + add x7, x10, x12 + ldr x10, [sp, #880] // 8-byte Folded Reload + add x25, x7, #32 + add x26, x10, x12 + ldr x10, [sp, #848] // 8-byte Folded Reload + add x28, x26, #32 + add x10, x10, x12 + add x12, x12, #16 + stur s7, [x10, #-12] + prfm pldl1keep, [x0] + ldr s7, [x17, x14] + fmla v2.2s, v7.2s, v4.s[1] + fmla v3.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v0.2s, v7.2s, v16.s[1] + stur s7, [x10, 
#-8] + prfm pldl1keep, [x11] + ldr s7, [x16, x14] + fmla v1.2s, v7.2s, v18.2s + stur s7, [x10, #-4] + prfm pldl1keep, [x24] + ldr s18, [x15, x14] + fmla v2.2s, v7.2s, v20.2s + fmla v3.2s, v7.2s, v19.2s + ldr x24, [sp, #856] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v17.2s + str s18, [x10] + prfm pldl1keep, [x30] + ldr s7, [x18, x14] + fmla v2.2s, v18.2s, v4.s[3] + fmla v3.2s, v18.2s, v5.s[3] + fmla v1.2s, v18.2s, v6.s[3] + prfm pldl1keep, [x28] + ldr q4, [x26, #16] + prfm pldl1keep, [x25] + ldr q5, [x7, #16] + prfm pldl1keep, [x6] + ldr x10, [sp, #1016] // 8-byte Folded Reload + ldr q6, [x5, #16] + fmla v0.2s, v18.2s, v16.s[3] + add x4, x4, x10 + add x3, x3, x10 + add x2, x2, x10 + add x1, x1, x10 + add x18, x18, x10 + add x17, x17, x10 + add x16, x16, x10 + add x15, x15, x10 + b .LBB0_111 + .p2align 2 +.LBB0_113: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #728] // 8-byte Folded Reload + ldr x13, [sp, #912] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v4.2s + fmla v3.2s, v7.2s, v5.2s + ldr x15, [sp, #520] // 8-byte Folded Reload + ldr x4, [sp, #648] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v6.2s + fmla v0.2s, v7.2s, v16.2s + ldr x14, [sp, #840] // 8-byte Folded Reload + str s7, [x8, x23, lsl #2] + mov x12, xzr + ldr x16, [sp, #880] // 8-byte Folded Reload + ldp x18, x17, [sp, #200] // 16-byte Folded Reload + madd x10, x13, x11, x15 + ldr x5, [sp, #776] // 8-byte Folded Reload + add x10, x10, x4 + ldr s7, [x14, x10, lsl #2] + str s7, [x8, x13, lsl #2] + ldr x13, [sp, #904] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v4.s[1] + fmla v3.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v0.2s, v7.2s, v16.s[1] + madd x10, x13, x11, x15 + add x10, x10, x4 + ldr s7, [x14, x10, lsl #2] + fmla v2.2s, v7.2s, v20.2s + str s7, [x8, x13, lsl #2] + ldr x13, [sp, #896] // 8-byte Folded Reload + fmla v3.2s, v7.2s, v19.2s + fmla v1.2s, v7.2s, v18.2s + fmla v0.2s, v7.2s, v17.2s + madd x10, x13, x11, x15 + ldr x11, [sp, #512] // 8-byte Folded Reload + ldr x15, [sp, 
#824] // 8-byte Folded Reload + add x10, x10, x4 + ldr s7, [x14, x10, lsl #2] + fmla v2.2s, v7.2s, v4.s[3] + fmla v3.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v0.2s, v7.2s, v16.s[3] + str s7, [x8, x13, lsl #2] + ldr x13, [sp, #1032] // 8-byte Folded Reload + cmp x13, x21 + b.ge .LBB0_115 + .p2align 2 +.LBB0_114: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x14, x16, x11 + add x10, x17, x12 + add x11, x11, #4 + prfm pldl1keep, [x14] + ldur s4, [x14, #-4] + add x14, x14, x20 + prfm pldl1keep, [x14] + ldur s5, [x14, #-4] + add x14, x14, x20 + prfm pldl1keep, [x14] + ldur s6, [x14, #-4] + add x14, x14, x20 + prfm pldl1keep, [x14] + ldur s7, [x14, #-4] + prfm pldl1keep, [x10] + ldr s16, [x18, x12] + add x12, x12, x15 + fmla v2.2s, v16.2s, v4.2s + str s16, [x8, x13, lsl #2] + add x13, x13, #1 + fmla v3.2s, v16.2s, v5.2s + fmla v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x13, x21 + b.lt .LBB0_114 +.LBB0_115: // %.preheader49 + // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #512] // 8-byte Folded Reload + ldr x13, [sp, #808] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #12 + ldr x14, [sp, #816] // 8-byte Folded Reload + mov w17, #1 // =0x1 + mov w18, #2 // =0x2 + mov w16, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x10 + b .LBB0_117 + .p2align 2 +.LBB0_116: // %.loopexit45 + // in Loop: Header=BB0_117 Depth=2 + ldr x10, [sp, #1008] // 8-byte Folded Reload + ldr x4, [sp, #648] // 8-byte Folded Reload + mov x2, x15 + mov x15, x1 + add x14, x14, x10 + add x13, x13, x10 +.LBB0_117: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + madd x10, x2, x29, x9 + add x10, x10, x4 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x17, x17, x4 + str s2, [x24, x10, lsl #2] + madd x10, x16, x29, x9 + add x16, x18, x4 + str s3, [x24, x17, lsl #2] + add x10, x10, x4 + str s1, [x24, x16, lsl #2] + str s0, [x24, x10, lsl #2] + ldr x10, [sp, 
#1024] // 8-byte Folded Reload + cmp x15, x10 + b.ge .LBB0_122 +// %bb.118: // in Loop: Header=BB0_117 Depth=2 + add x17, x15, #1 + madd x10, x15, x29, x9 + add x18, x15, #2 + add x16, x15, #3 + madd x0, x17, x29, x9 + ldr s16, [x8] + mov x2, xzr + add x10, x10, x4 + madd x1, x18, x29, x9 + madd x3, x16, x29, x9 + add x0, x0, x4 + add x1, x1, x4 + ldr s2, [x24, x10, lsl #2] + madd x10, x15, x19, x5 + add x3, x3, x4 + mov x4, x14 + ldr s3, [x24, x0, lsl #2] + ldr x0, [sp, #888] // 8-byte Folded Reload + ldr s0, [x24, x3, lsl #2] + ldr s1, [x24, x1, lsl #2] + add x1, x15, #4 + mov x3, x11 + lsl x10, x10, #2 + ldr q7, [x0, x10] + madd x10, x17, x19, x5 + lsl x10, x10, #2 + ldr q6, [x0, x10] + madd x10, x18, x19, x5 + lsl x10, x10, #2 + ldr q5, [x0, x10] + madd x10, x16, x19, x5 + lsl x10, x10, #2 + ldr q4, [x0, x10] + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x23 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_120 + .p2align 2 +.LBB0_119: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_117 Depth=2 + // => This Inner Loop Header: Depth=3 + add x10, x3, #8 + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + add x2, x2, #4 + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x10] + add x10, x4, x20 + ldp s16, s21, [x3, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v2.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v2.2s, v21.2s, v20.2s + ldp s17, s16, [x3], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v1.2s, v21.2s, v18.2s + prfm pldl1keep, [x4] + fmla v2.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x10] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x10, #-16] + add x10, x10, x20 + fmla v1.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x10] + ldur q5, [x10, #-16] + add x10, x10, x20 + prfm pldl1keep, [x10] + ldur q4, [x10, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + 
cmp x2, x23 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_119 +.LBB0_120: // in Loop: Header=BB0_117 Depth=2 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + ldr x4, [sp, #1032] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + mov x2, x13 + mov x3, x12 + ldr s21, [x8, x10, lsl #2] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v2.2s, v21.2s, v7.s[1] + ldr s16, [x8, x10, lsl #2] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v3.2s, v21.2s, v6.s[1] + fmla v1.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + ldr s22, [x8, x10, lsl #2] + fmla v2.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v1.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v2.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v1.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x4, x21 + b.ge .LBB0_116 + .p2align 2 +.LBB0_121: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_117 Depth=2 + // => This Inner Loop Header: Depth=3 + add x10, x2, x20 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x10] + ldur s5, [x10, #-4] + add x10, x10, x20 + add x2, x2, #4 + prfm pldl1keep, [x10] + ldur s6, [x10, #-4] + add x10, x10, x20 + prfm pldl1keep, [x10] + ldur s7, [x10, #-4] + prfm pldl1keep, [x3] + ldur s16, [x3, #-4] + add x3, x3, #4 + fmla v2.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x4, x21 + b.lt .LBB0_121 + b .LBB0_116 + .p2align 2 +.LBB0_122: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #1024] // 8-byte Folded Reload + ldr x12, [sp, #920] // 8-byte Folded Reload + cmp x10, x12 + ldr x2, [sp, #544] // 8-byte Folded Reload + b.ge .LBB0_128 +// %bb.123: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #1024] // 8-byte Folded Reload + ldr x16, [sp, #888] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + ldr 
s4, [x8] + madd x12, x13, x19, x5 + add x10, x13, #1 + lsl x12, x12, #2 + ldr q3, [x16, x12] + madd x12, x10, x19, x5 + madd x10, x10, x29, x9 + lsl x12, x12, #2 + ldr q2, [x16, x12] + madd x12, x13, x29, x9 + add x13, x10, x4 + ldr s0, [x24, x13, lsl #2] + add x12, x12, x4 + ldr s1, [x24, x12, lsl #2] + ext v6.16b, v3.16b, v3.16b, #8 + cmp xzr, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.ge .LBB0_125 + .p2align 2 +.LBB0_124: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x0, x8, x14 + ldr x10, [sp, #536] // 8-byte Folded Reload + ldr x17, [sp, #528] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v3.2s + add x1, x0, #20 + fmla v0.2s, v4.2s, v2.2s + add x15, x15, #4 + prfm pldl1keep, [x1] + ldp s4, s7, [x0, #4] + add x10, x10, x14 + add x17, x17, x14 + add x14, x14, #16 + add x16, x10, #32 + add x18, x17, #32 + fmla v0.2s, v4.2s, v2.s[1] + fmla v1.2s, v4.2s, v3.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x0, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v3.s[3] + ldr q3, [x17, #16] + prfm pldl1keep, [x16] + fmla v0.2s, v5.2s, v2.s[3] + ldr q2, [x10, #16] + ext v6.16b, v3.16b, v3.16b, #8 + cmp x15, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.lt .LBB0_124 +.LBB0_125: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v3.2s + fmla v0.2s, v4.2s, v2.2s + ldr x16, [sp, #1032] // 8-byte Folded Reload + mov x14, xzr + ldr s7, [x8, x10, lsl #2] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr s4, [x8, x10, lsl #2] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v3.s[1] + fmla v0.2s, v7.2s, v2.s[1] + ldr s7, [x8, x10, lsl #2] + ldr x10, [sp, #104] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + add x15, x8, x10 + fmla v1.2s, v7.2s, v3.s[3] + fmla v0.2s, v7.2s, v2.s[3] + cmp x16, x21 + b.ge .LBB0_127 + .p2align 2 +.LBB0_126: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x0, [sp, #656] // 8-byte 
Folded Reload + add x10, x15, x14 + add x17, x2, x14 + add x16, x16, #1 + add x10, x10, #4 + add x17, x17, #4 + add x18, x0, x14 + add x18, x18, #4 + prfm pldl1keep, [x18] + prfm pldl1keep, [x17] + ldr s2, [x0, x14] + prfm pldl1keep, [x10] + ldr s3, [x15, x14] + fmla v1.2s, v3.2s, v2.2s + ldr s2, [x2, x14] + add x14, x14, #4 + fmla v0.2s, v3.2s, v2.2s + cmp x16, x21 + b.lt .LBB0_126 +.LBB0_127: // in Loop: Header=BB0_4 Depth=1 + str s1, [x24, x12, lsl #2] + str s0, [x24, x13, lsl #2] +.LBB0_128: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #744] // 8-byte Folded Reload + ldr x12, [sp, #920] // 8-byte Folded Reload + cmp x12, x10 + b.ge .LBB0_2 +// %bb.129: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #920] // 8-byte Folded Reload + ldr x13, [sp, #888] // 8-byte Folded Reload + mov x12, xzr + madd x9, x10, x29, x9 + madd x10, x10, x19, x5 + ldr s2, [x8] + ldr x14, [sp, #632] // 8-byte Folded Reload + add x9, x9, x4 + lsl x10, x10, #2 + ldr s0, [x24, x9, lsl #2] + ldr q1, [x13, x10] + ldr x10, [sp, #640] // 8-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x23 + b.ge .LBB0_131 + .p2align 2 +.LBB0_130: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x11, #8 + fmla v0.2s, v2.2s, v1.2s + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x11, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x11], #16 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x12, x23 + b.lt .LBB0_130 +.LBB0_131: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.2s + mov x10, xzr + ldr s4, [x8, x11, lsl #2] + ldr x11, [sp, #904] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s5, [x8, x11, lsl #2] + ldr x11, [sp, #896] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.2s + ldr s2, [x8, x11, lsl #2] + ldr x11, [sp, #104] // 8-byte Folded Reload + add x8, x8, x11 + ldr 
x11, [sp, #1032] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[3] + cmp x11, x21 + b.ge .LBB0_1 + .p2align 2 +.LBB0_132: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x10 + add x13, x14, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + prfm pldl1keep, [x12] + ldr s1, [x8, x10] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x21 + b.lt .LBB0_132 + b .LBB0_1 +.LBB0_133: + ldr x0, [sp, #16] // 8-byte Folded Reload + bl free + add sp, sp, #1040 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_3d_nn_mlir, .Lfunc_end0-sbatch_matmul_3d_nn_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s new file mode 100644 index 00000000000000..a70650bb6207e2 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s @@ -0,0 +1,2987 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_3d_nt_mlir // -- Begin function sbatch_matmul_3d_nt_mlir + .p2align 4 + .type sbatch_matmul_3d_nt_mlir,@function +sbatch_matmul_3d_nt_mlir: // @sbatch_matmul_3d_nt_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! 
// 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #512 + .cfi_def_cfa_offset 672 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x4, #0 + ldr x13, [sp, #712] + ldr x14, [sp, #768] + mov x19, x7 + cinv x8, x4, lt + ldr x12, [sp, #760] + ldr x28, [sp, #808] + mov x21, x5 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + ldr x23, [sp, #728] + ldr x27, [sp, #736] + str x6, [sp, #448] // 8-byte Folded Spill + stp x13, x3, [sp, #136] // 16-byte Folded Spill + mov x26, x2 + mov x25, x1 + asr x9, x9, #1 + stp x12, x14, [sp, #328] // 16-byte Folded Spill + cinv x22, x9, lt + cmp x8, #0 + csel x8, x10, x8, lt + cmp x4, #0 + ldr x10, [sp, #800] + asr x8, x8, #2 + cinv x24, x8, lt + cmp x13, #0 + cinv x8, x13, lt + add x9, x8, x8, lsr #63 + stp x10, x4, [sp, #360] // 16-byte Folded Spill + add x10, x8, #15 + add x11, x8, #7 + add x12, x8, #3 + asr x9, x9, #1 + cinv x14, x9, lt + cmp x8, #0 + csel x9, x10, x8, lt + csel x10, x11, x8, lt + ldr x11, [sp, #696] + csel x8, x12, x8, lt + cmp x13, #0 + asr x9, x9, #4 + asr x10, x10, #3 + asr x8, x8, #2 + cinv x9, x9, lt + cinv x10, x10, lt + cinv x29, x8, lt + lsl x8, x14, #1 + stp x9, x10, 
[sp, #456] // 16-byte Folded Spill + lsl x9, x9, #4 + str x8, [sp, #168] // 8-byte Folded Spill + lsl x8, x5, #6 + lsl x20, x29, #2 + stp x11, x14, [sp, #488] // 16-byte Folded Spill + ldr x11, [sp, #688] + str x9, [sp, #416] // 8-byte Folded Spill + lsl x9, x10, #3 + add x0, x8, #64 + stp x8, x11, [sp, #472] // 16-byte Folded Spill + str x9, [sp, #280] // 8-byte Folded Spill + bl malloc + lsl x8, x24, #2 + mov x12, x22 + lsl x10, x23, #2 + mul x11, x24, x19 + str x8, [sp, #504] // 8-byte Folded Spill + lsl x8, x22, #1 + and x9, x21, #0x3 + lsl x22, x19, #2 + str x8, [sp, #440] // 8-byte Folded Spill + negs x8, x21 + str x10, [sp, #128] // 8-byte Folded Spill + lsl x10, x27, #6 + mul x13, x12, x19 + str x0, [sp, #16] // 8-byte Folded Spill + add x12, x0, #63 + and x8, x8, #0x3 + ldp x0, x18, [sp, #480] // 16-byte Folded Reload + str x10, [sp, #320] // 8-byte Folded Spill + mov w10, #1 // =0x1 + add x14, x21, x22 + csneg x8, x9, x8, mi + lsl x2, x26, #2 + bfi x10, x24, #2, #62 + sub x14, x14, x8 + and x23, x12, #0xffffffffffffffc0 + mul x12, x19, x10 + lsl x24, x19, #4 + add x14, x2, x14, lsl #2 + lsl x9, x11, #4 + add x11, x21, x11, lsl #2 + add x18, x0, x18, lsl #2 + add x0, x24, x2 + add x0, x0, x25 + add x14, x14, x25 + sub x11, x11, x8 + add x0, x0, #32 + add x14, x14, #4 + lsl x17, x12, #2 + add x12, x21, x12 + lsl x11, x11, #2 + stp x14, x0, [sp, #384] // 16-byte Folded Spill + ldr x0, [sp, #472] // 8-byte Folded Reload + add x16, x21, x13, lsl #1 + str x11, [sp, #264] // 8-byte Folded Spill + sub x11, x12, x8 + ldr x10, [sp, #456] // 8-byte Folded Reload + lsl x11, x11, #2 + sub x14, x23, x8, lsl #6 + str x11, [sp, #256] // 8-byte Folded Spill + sub x11, x16, x8 + mov w15, #1 // =0x1 + lsl x16, x11, #2 + add x13, x2, x13, lsl #3 + bfi x15, x29, #2, #62 + add x16, x16, #4 + add x0, x14, x0 + add x14, x25, x17 + lsl x4, x21, #2 + mul x10, x10, x27 + str x14, [sp, #296] // 8-byte Folded Spill + add x14, x25, x9 + mul x12, x29, x27 + mul x15, x27, x15 + str x14, 
[sp, #288] // 8-byte Folded Spill + add x14, x13, x25 + str x16, [sp, #272] // 8-byte Folded Spill + add x13, x13, x4 + lsl x16, x8, #2 + add x14, x14, #32 + sub x13, x13, x16 + ldr x11, [sp, #464] // 8-byte Folded Reload + str x14, [sp, #304] // 8-byte Folded Spill + ldr x14, [sp, #496] // 8-byte Folded Reload + add x13, x25, x13 + add x3, x18, #4 + str x13, [sp, #376] // 8-byte Folded Spill + add x13, x17, x2 + add x17, x18, x15, lsl #2 + add x18, x18, x12, lsl #4 + add x12, x13, x4 + add x10, x3, x10, lsl #6 + mul x11, x11, x27 + str x10, [sp, #232] // 8-byte Folded Spill + sub x12, x12, x16 + add x9, x9, x2 + mov x15, x0 + lsl x0, x21, #3 + mul x14, x14, x27 + add x10, x25, x12 + add x12, x25, x13 + add x13, x25, x9 + str x10, [sp, #240] // 8-byte Folded Spill + add x10, x9, x4 + lsl x9, x21, #5 + str x26, [sp, #352] // 8-byte Folded Spill + sub x10, x10, x16 + lsl x26, x27, #2 + sub x27, x21, x8 + str x0, [sp, #80] // 8-byte Folded Spill + add x10, x25, x10 + sub x0, x0, x8, lsl #3 + str x2, [sp, #456] // 8-byte Folded Spill + add x2, x25, x2 + str x10, [sp, #312] // 8-byte Folded Spill + add x10, x3, x11, lsl #5 + sub x11, x4, x16 + lsl x16, x21, #4 + add x14, x3, x14, lsl #3 + stp x16, x9, [sp, #88] // 16-byte Folded Spill + sub x9, x9, x8, lsl #5 + sub x16, x16, x8, lsl #4 + ldr x8, [sp, #448] // 8-byte Folded Reload + str x14, [sp, #224] // 8-byte Folded Spill + sub x14, x27, #3 + mov x1, x20 + str x14, [sp, #496] // 8-byte Folded Spill + sub x14, x27, #2 + stp x0, x9, [sp, #64] // 16-byte Folded Spill + add x9, x9, #32 + str x14, [sp, #488] // 8-byte Folded Spill + sub x14, x27, #1 + str x9, [sp, #56] // 8-byte Folded Spill + add x9, x16, #16 + str x14, [sp, #480] // 8-byte Folded Spill + lsl x14, x8, #2 + str x9, [sp, #48] // 8-byte Folded Spill + add x9, x0, #8 + stp x4, x14, [sp, #112] // 16-byte Folded Spill + add x14, x2, #4 + mov x20, xzr + sub x29, x27, #4 + str x14, [sp, #400] // 8-byte Folded Spill + add x14, x23, #256 + str x9, [sp, #40] // 
8-byte Folded Spill + add x9, x11, #4 + str x14, [sp, #472] // 8-byte Folded Spill + add x14, x15, #64 + str x25, [sp, #344] // 8-byte Folded Spill + str x14, [sp, #464] // 8-byte Folded Spill + str x11, [sp, #104] // 8-byte Folded Spill + stp x16, x9, [sp, #24] // 16-byte Folded Spill + str x1, [sp, #176] // 8-byte Folded Spill + str x15, [sp, #248] // 8-byte Folded Spill + b .LBB0_4 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_4 Depth=1 + str s0, [x15, x9, lsl #2] +.LBB0_2: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, [sp, #448] // 8-byte Folded Reload +.LBB0_3: // %.backedge28 + // in Loop: Header=BB0_4 Depth=1 + ldp x14, x9, [sp, #120] // 16-byte Folded Reload + ldp x10, x2, [sp, #400] // 16-byte Folded Reload + ldp x20, x3, [sp, #152] // 16-byte Folded Reload + ldr x17, [sp, #216] // 8-byte Folded Reload + add x10, x10, x14 + ldp x13, x12, [sp, #184] // 16-byte Folded Reload + add x3, x3, x9 + add x2, x2, x14 + add x17, x17, x9 + add x12, x12, x14 + add x13, x13, x14 + str x10, [sp, #400] // 8-byte Folded Spill + ldr x10, [sp, #392] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #392] // 8-byte Folded Spill + ldr x10, [sp, #384] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #384] // 8-byte Folded Spill + ldr x10, [sp, #296] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #296] // 8-byte Folded Spill + ldr x10, [sp, #288] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #288] // 8-byte Folded Spill + ldr x10, [sp, #304] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #304] // 8-byte Folded Spill + ldr x10, [sp, #376] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #376] // 8-byte Folded Spill + ldp x11, x10, [sp, #224] // 16-byte Folded Reload + add x10, x10, x9 + add x11, x11, x9 + stp x11, x10, [sp, #224] // 16-byte Folded Spill + ldr x10, [sp, #240] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #240] // 8-byte Folded Spill + ldr x10, [sp, #312] // 8-byte Folded 
Reload + add x10, x10, x14 + str x10, [sp, #312] // 8-byte Folded Spill + ldp x10, x18, [sp, #200] // 16-byte Folded Reload + add x10, x10, x9 + add x18, x18, x9 +.LBB0_4: // =>This Loop Header: Depth=1 + // Child Loop BB0_8 Depth 2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_13 Depth 3 + // Child Loop BB0_15 Depth 4 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_20 Depth 3 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_26 Depth 3 + // Child Loop BB0_28 Depth 3 + // Child Loop BB0_34 Depth 2 + // Child Loop BB0_37 Depth 2 + // Child Loop BB0_39 Depth 3 + // Child Loop BB0_41 Depth 3 + // Child Loop BB0_44 Depth 2 + // Child Loop BB0_46 Depth 2 + // Child Loop BB0_50 Depth 2 + // Child Loop BB0_52 Depth 2 + // Child Loop BB0_56 Depth 2 + // Child Loop BB0_59 Depth 2 + // Child Loop BB0_61 Depth 3 + // Child Loop BB0_63 Depth 3 + // Child Loop BB0_66 Depth 2 + // Child Loop BB0_68 Depth 2 + // Child Loop BB0_72 Depth 2 + // Child Loop BB0_74 Depth 2 + // Child Loop BB0_78 Depth 2 + // Child Loop BB0_81 Depth 2 + // Child Loop BB0_83 Depth 3 + // Child Loop BB0_85 Depth 3 + // Child Loop BB0_88 Depth 2 + // Child Loop BB0_90 Depth 2 + // Child Loop BB0_94 Depth 2 + // Child Loop BB0_96 Depth 2 + // Child Loop BB0_100 Depth 2 + // Child Loop BB0_103 Depth 2 + // Child Loop BB0_105 Depth 3 + // Child Loop BB0_107 Depth 3 + // Child Loop BB0_110 Depth 2 + // Child Loop BB0_112 Depth 2 + // Child Loop BB0_116 Depth 2 + // Child Loop BB0_118 Depth 2 + ldr x9, [sp, #144] // 8-byte Folded Reload + cmp x20, x9 + b.ge .LBB0_119 +// %bb.5: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #416] // 8-byte Folded Reload + ldr x30, [sp, #280] // 8-byte Folded Reload + add x9, x20, #1 + stp x10, x18, [sp, #200] // 16-byte Folded Spill + mov x10, xzr + str x2, [sp, #408] // 8-byte Folded Spill + stp x13, x12, [sp, #184] // 16-byte Folded Spill + str x17, [sp, #216] // 8-byte Folded Spill + stp x9, x3, [sp, #152] // 16-byte Folded Spill + b .LBB0_8 + .p2align 2 +.LBB0_6: 
// in Loop: Header=BB0_8 Depth=2 + ldr x8, [sp, #448] // 8-byte Folded Reload + stp q3, q2, [x10] + stp q1, q0, [x10, #32] +.LBB0_7: // %.backedge + // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #320] // 8-byte Folded Reload + ldp x10, x3, [sp, #424] // 16-byte Folded Reload + add x3, x3, x9 +.LBB0_8: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_13 Depth 3 + // Child Loop BB0_15 Depth 4 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_20 Depth 3 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_26 Depth 3 + // Child Loop BB0_28 Depth 3 + ldp x11, x9, [sp, #344] // 16-byte Folded Reload + ldr x16, [sp, #400] // 8-byte Folded Reload + cmp x10, x0 + add x25, x11, x9, lsl #2 + b.ge .LBB0_29 +// %bb.9: // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #360] // 8-byte Folded Reload + mov x13, xzr + mul x12, x20, x9 + add x14, x12, x10 + ldp x11, x9, [sp, #328] // 16-byte Folded Reload + add x11, x11, x9, lsl #2 + add x15, x14, x28 + add x15, x11, x15, lsl #2 + add x9, x11, x14, lsl #2 + ldp q3, q1, [x15, #32] + ldp q5, q4, [x15] + lsl x15, x28, #1 + ldp q16, q6, [x9, #32] + ldp q2, q0, [x9] + add x9, x14, x15 + add x15, x15, x28 + add x14, x14, x15 + add x9, x11, x9, lsl #2 + mov x15, x3 + add x14, x11, x14, lsl #2 + ldp q17, q7, [x9, #32] + ldp q20, q18, [x9] + add x9, x10, #16 + str x9, [sp, #424] // 8-byte Folded Spill + ldp q21, q19, [x14, #32] + ldp q23, q22, [x14] + mov x14, x16 + cmp xzr, x21 + b.ge .LBB0_11 + .p2align 2 +.LBB0_10: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x16, x14, x22 + prfm pldl1keep, [x14] + ldur s27, [x14, #-4] + add x14, x14, #4 + add x17, x16, x22 + prfm pldl1keep, [x16] + ldur s28, [x16, #-4] + add x16, x15, x26 + add x18, x17, x22 + prfm pldl1keep, [x17] + ldur s26, [x17, #-4] + sub x17, x16, #4 + prfm pldl1keep, [x18] + ldur s25, [x18, #-4] + add x18, x16, x26 + prfm pldl1keep, [x15] + ldur s24, [x15, 
#-4] + add x15, x15, #4 + prfm pldl1keep, [x16] + sub x16, x18, #4 + prfm pldl1keep, [x18] + ld1 { v24.s }[1], [x17] + add x17, x18, x26 + prfm pldl1keep, [x17] + ld1 { v24.s }[2], [x16] + add x16, x17, x26 + sub x17, x17, #4 + prfm pldl1keep, [x16] + ldur s29, [x16, #-4] + add x16, x16, x26 + sub x18, x16, #4 + add x0, x16, x26 + ld1 { v24.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v29.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x26 + prfm pldl1keep, [x17] + fmla v2.4s, v24.4s, v27.s[0] + ld1 { v29.s }[2], [x16] + add x16, x17, x26 + sub x17, x17, #4 + fmla v5.4s, v24.4s, v28.s[0] + fmla v20.4s, v24.4s, v26.s[0] + fmla v23.4s, v24.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s30, [x16, #-4] + add x16, x16, x26 + sub x18, x16, #4 + add x0, x16, x26 + ld1 { v29.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v30.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x26 + prfm pldl1keep, [x17] + ld1 { v30.s }[2], [x16] + add x16, x17, x26 + sub x17, x17, #4 + fmla v0.4s, v29.4s, v27.s[0] + fmla v4.4s, v29.4s, v28.s[0] + fmla v18.4s, v29.4s, v26.s[0] + fmla v22.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s31, [x16, #-4] + add x16, x16, x26 + sub x18, x16, #4 + add x0, x16, x26 + ld1 { v30.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v31.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x26 + prfm pldl1keep, [x17] + fmla v16.4s, v30.4s, v27.s[0] + ld1 { v31.s }[2], [x16] + sub x16, x17, #4 + fmla v3.4s, v30.4s, v28.s[0] + fmla v17.4s, v30.4s, v26.s[0] + fmla v21.4s, v30.4s, v25.s[0] + ld1 { v31.s }[3], [x16] + add x16, x23, x13, lsl #6 + add x13, x13, #1 + stp q24, q29, [x16] + fmla v6.4s, v31.4s, v27.s[0] + fmla v1.4s, v31.4s, v28.s[0] + fmla v7.4s, v31.4s, v26.s[0] + fmla v19.4s, v31.4s, v25.s[0] + stp q30, q31, [x16, #32] + cmp x13, x21 + b.lt .LBB0_10 +.LBB0_11: // %.preheader + // in Loop: Header=BB0_8 Depth=2 + ldp x13, x14, [sp, #384] // 16-byte Folded Reload + str x3, [sp, #432] // 8-byte 
Folded Spill + mov x1, xzr + mov w17, #1 // =0x1 + mov w18, #2 // =0x2 + mov w16, #3 // =0x3 + mov w15, #4 // =0x4 + b .LBB0_13 + .p2align 2 +.LBB0_12: // %.loopexit + // in Loop: Header=BB0_13 Depth=3 + add x14, x14, x24 + add x13, x13, x24 + mov x1, x15 + mov x15, x0 +.LBB0_13: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_15 Depth 4 + // Child Loop BB0_17 Depth 4 + madd x0, x1, x28, x12 + ldr x9, [sp, #504] // 8-byte Folded Reload + add x0, x0, x10 + madd x17, x17, x28, x12 + madd x18, x18, x28, x12 + madd x16, x16, x28, x12 + add x17, x17, x10 + add x18, x18, x10 + add x16, x16, x10 + cmp x15, x9 + add x0, x11, x0, lsl #2 + add x17, x11, x17, lsl #2 + stp q2, q0, [x0] + add x18, x11, x18, lsl #2 + add x16, x11, x16, lsl #2 + stp q16, q6, [x0, #32] + stp q5, q4, [x17] + stp q3, q1, [x17, #32] + stp q20, q18, [x18] + stp q17, q7, [x18, #32] + stp q23, q22, [x16] + stp q21, q19, [x16, #32] + b.ge .LBB0_18 +// %bb.14: // in Loop: Header=BB0_13 Depth=3 + add x17, x15, #1 + add x16, x15, #3 + mul x2, x20, x8 + add x18, x15, #2 + madd x3, x17, x28, x12 + ldp q28, q29, [x23, #32] + mov x1, xzr + madd x0, x15, x28, x12 + ldp q30, q31, [x23] + add x0, x0, x10 + add x3, x3, x10 + add x0, x11, x0, lsl #2 + add x3, x11, x3, lsl #2 + ldp q16, q6, [x0, #32] + ldp q2, q0, [x0] + madd x0, x18, x28, x12 + ldp q3, q1, [x3, #32] + add x0, x0, x10 + ldp q5, q4, [x3] + madd x3, x16, x28, x12 + add x3, x3, x10 + add x0, x11, x0, lsl #2 + add x3, x11, x3, lsl #2 + ldp q17, q7, [x0, #32] + ldp q20, q18, [x0] + add x0, x15, #4 + ldp q21, q19, [x3, #32] + ldp q23, q22, [x3] + madd x3, x15, x19, x2 + lsl x3, x3, #2 + ldr q27, [x25, x3] + madd x3, x17, x19, x2 + lsl x3, x3, #2 + ldr q26, [x25, x3] + madd x3, x18, x19, x2 + madd x2, x16, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q25, [x25, x3] + ldr q24, [x25, x2] + ldr x3, [sp, #472] // 8-byte Folded Reload + mov x2, x14 + fmla v6.4s, v29.4s, v27.s[0] + cmp xzr, x29 
+ b.ge .LBB0_16 + .p2align 2 +.LBB0_15: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // Parent Loop BB0_13 Depth=3 + // => This Inner Loop Header: Depth=4 + add x7, x3, #64 + fmla v16.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + add x6, x3, #128 + prfm pldl1keep, [x7] + ldp q9, q8, [x3, #-160] + fmla v0.4s, v31.4s, v27.s[0] + ldp q12, q15, [x3, #-192] + fmla v1.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x6] + fmla v17.4s, v28.4s, v25.s[0] + fmla v18.4s, v31.4s, v25.s[0] + ldp q11, q10, [x3, #-128] + fmla v20.4s, v30.4s, v25.s[0] + fmla v19.4s, v29.4s, v24.s[0] + ldp q13, q14, [x3, #-96] + fmla v21.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + add x5, x3, #192 + prfm pldl1keep, [x5] + fmla v23.4s, v30.4s, v24.s[0] + fmla v0.4s, v15.4s, v27.s[1] + add x4, x3, #256 + add x1, x1, #4 + fmla v2.4s, v12.4s, v27.s[1] + fmla v16.4s, v9.4s, v27.s[1] + fmla v6.4s, v8.4s, v27.s[1] + fmla v5.4s, v12.4s, v26.s[1] + fmla v4.4s, v15.4s, v26.s[1] + fmla v3.4s, v9.4s, v26.s[1] + fmla v1.4s, v8.4s, v26.s[1] + fmla v20.4s, v12.4s, v25.s[1] + fmla v18.4s, v15.4s, v25.s[1] + fmla v17.4s, v9.4s, v25.s[1] + fmla v7.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v22.4s, v15.4s, v24.s[1] + ldp q15, q12, [x3, #-64] + fmla v21.4s, v9.4s, v24.s[1] + fmla v19.4s, v8.4s, v24.s[1] + ldp q9, q8, [x3, #-32] + prfm pldl1keep, [x4] + ldp q28, q29, [x3, #32] + ldp q30, q31, [x3] + add x3, x2, x22 + prfm pldl1keep, [x2] + fmla v6.4s, v14.4s, v27.s[2] + fmla v16.4s, v13.4s, v27.s[2] + fmla v2.4s, v11.4s, v27.s[2] + fmla v0.4s, v10.4s, v27.s[2] + fmla v1.4s, v14.4s, v26.s[2] + fmla v3.4s, v13.4s, v26.s[2] + fmla v4.4s, v10.4s, v26.s[2] + fmla v5.4s, v11.4s, v26.s[2] + fmla v7.4s, v14.4s, v25.s[2] + fmla v17.4s, v13.4s, v25.s[2] + fmla v18.4s, v10.4s, v25.s[2] + fmla v20.4s, v11.4s, v25.s[2] + fmla v19.4s, v14.4s, v24.s[2] + fmla v21.4s, 
v13.4s, v24.s[2] + fmla v22.4s, v10.4s, v24.s[2] + fmla v23.4s, v11.4s, v24.s[2] + fmla v0.4s, v12.4s, v27.s[3] + fmla v2.4s, v15.4s, v27.s[3] + fmla v16.4s, v9.4s, v27.s[3] + fmla v6.4s, v8.4s, v27.s[3] + ldur q27, [x2, #-16] + prfm pldl1keep, [x3] + add x2, x2, #16 + fmla v5.4s, v15.4s, v26.s[3] + fmla v4.4s, v12.4s, v26.s[3] + fmla v3.4s, v9.4s, v26.s[3] + fmla v1.4s, v8.4s, v26.s[3] + ldur q26, [x3, #-16] + add x3, x3, x22 + add x5, x3, x22 + prfm pldl1keep, [x3] + fmla v20.4s, v15.4s, v25.s[3] + fmla v18.4s, v12.4s, v25.s[3] + fmla v17.4s, v9.4s, v25.s[3] + fmla v7.4s, v8.4s, v25.s[3] + ldur q25, [x3, #-16] + prfm pldl1keep, [x5] + mov x3, x4 + fmla v23.4s, v15.4s, v24.s[3] + fmla v22.4s, v12.4s, v24.s[3] + fmla v21.4s, v9.4s, v24.s[3] + fmla v19.4s, v8.4s, v24.s[3] + ldur q24, [x5, #-16] + fmla v6.4s, v29.4s, v27.s[0] + cmp x1, x29 + b.lt .LBB0_15 +.LBB0_16: // in Loop: Header=BB0_13 Depth=3 + ldr x9, [sp, #496] // 8-byte Folded Reload + fmla v16.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + mov x2, x13 + fmla v0.4s, v31.4s, v27.s[0] + fmla v1.4s, v29.4s, v26.s[0] + mov x3, x27 + add x1, x23, x9, lsl #6 + ldr x9, [sp, #488] // 8-byte Folded Reload + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + fmla v17.4s, v28.4s, v25.s[0] + fmla v18.4s, v31.4s, v25.s[0] + fmla v20.4s, v30.4s, v25.s[0] + ldp q10, q9, [x1, #32] + ldp q11, q12, [x1] + fmla v19.4s, v29.4s, v24.s[0] + fmla v21.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + add x1, x23, x9, lsl #6 + fmla v23.4s, v30.4s, v24.s[0] + ldr x9, [sp, #480] // 8-byte Folded Reload + ldp q29, q30, [x1] + ldp q8, q13, [x1, #32] + fmla v0.4s, v12.4s, v27.s[1] + fmla v6.4s, v9.4s, v27.s[1] + fmla v4.4s, v12.4s, v26.s[1] + fmla v1.4s, v9.4s, v26.s[1] + fmla v18.4s, v12.4s, v25.s[1] + fmla v7.4s, v9.4s, v25.s[1] + fmla v22.4s, v12.4s, v24.s[1] + add x1, x23, x9, lsl #6 + fmla v2.4s, v11.4s, v27.s[1] + fmla v16.4s, v10.4s, v27.s[1] + 
fmla v5.4s, v11.4s, v26.s[1] + fmla v3.4s, v10.4s, v26.s[1] + ldp q31, q28, [x1, #32] + fmla v20.4s, v11.4s, v25.s[1] + fmla v17.4s, v10.4s, v25.s[1] + fmla v23.4s, v11.4s, v24.s[1] + fmla v21.4s, v10.4s, v24.s[1] + fmla v19.4s, v9.4s, v24.s[1] + ldp q9, q10, [x1] + ldr x1, [sp, #464] // 8-byte Folded Reload + fmla v6.4s, v13.4s, v27.s[2] + fmla v0.4s, v30.4s, v27.s[2] + fmla v1.4s, v13.4s, v26.s[2] + fmla v4.4s, v30.4s, v26.s[2] + fmla v7.4s, v13.4s, v25.s[2] + fmla v18.4s, v30.4s, v25.s[2] + fmla v19.4s, v13.4s, v24.s[2] + fmla v22.4s, v30.4s, v24.s[2] + fmla v16.4s, v8.4s, v27.s[2] + fmla v2.4s, v29.4s, v27.s[2] + fmla v3.4s, v8.4s, v26.s[2] + fmla v5.4s, v29.4s, v26.s[2] + fmla v17.4s, v8.4s, v25.s[2] + fmla v20.4s, v29.4s, v25.s[2] + fmla v21.4s, v8.4s, v24.s[2] + fmla v23.4s, v29.4s, v24.s[2] + fmla v0.4s, v10.4s, v27.s[3] + fmla v6.4s, v28.4s, v27.s[3] + fmla v4.4s, v10.4s, v26.s[3] + fmla v1.4s, v28.4s, v26.s[3] + fmla v18.4s, v10.4s, v25.s[3] + fmla v7.4s, v28.4s, v25.s[3] + fmla v22.4s, v10.4s, v24.s[3] + fmla v19.4s, v28.4s, v24.s[3] + fmla v2.4s, v9.4s, v27.s[3] + fmla v16.4s, v31.4s, v27.s[3] + fmla v5.4s, v9.4s, v26.s[3] + fmla v3.4s, v31.4s, v26.s[3] + fmla v20.4s, v9.4s, v25.s[3] + fmla v17.4s, v31.4s, v25.s[3] + fmla v23.4s, v9.4s, v24.s[3] + fmla v21.4s, v31.4s, v24.s[3] + cmp x27, x21 + b.ge .LBB0_12 + .p2align 2 +.LBB0_17: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // Parent Loop BB0_13 Depth=3 + // => This Inner Loop Header: Depth=4 + prfm pldl1keep, [x1] + ldp q24, q25, [x1, #-64] + add x4, x2, x22 + ldp q26, q27, [x1, #-32] + prfm pldl1keep, [x2] + add x3, x3, #1 + ldur s28, [x2, #-4] + prfm pldl1keep, [x4] + add x2, x2, #4 + add x1, x1, #64 + ldur s29, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + fmla v6.4s, v27.4s, v28.s[0] + ldur s30, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + fmla v16.4s, v26.4s, v28.s[0] + fmla v0.4s, v25.4s, v28.s[0] + fmla v2.4s, v24.4s, v28.s[0] + ldur s28, [x4, #-4] + fmla v4.4s, 
v25.4s, v29.s[0] + fmla v5.4s, v24.4s, v29.s[0] + fmla v3.4s, v26.4s, v29.s[0] + fmla v1.4s, v27.4s, v29.s[0] + fmla v20.4s, v24.4s, v30.s[0] + fmla v18.4s, v25.4s, v30.s[0] + fmla v17.4s, v26.4s, v30.s[0] + fmla v7.4s, v27.4s, v30.s[0] + fmla v23.4s, v24.4s, v28.s[0] + fmla v22.4s, v25.4s, v28.s[0] + fmla v21.4s, v26.4s, v28.s[0] + fmla v19.4s, v27.4s, v28.s[0] + cmp x3, x21 + b.lt .LBB0_17 + b .LBB0_12 + .p2align 2 +.LBB0_18: // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #504] // 8-byte Folded Reload + ldr x13, [sp, #440] // 8-byte Folded Reload + cmp x9, x13 + ldr x9, [sp, #496] // 8-byte Folded Reload + add x15, x23, x9, lsl #6 + ldr x9, [sp, #488] // 8-byte Folded Reload + add x14, x23, x9, lsl #6 + ldr x9, [sp, #480] // 8-byte Folded Reload + add x13, x23, x9, lsl #6 + b.ge .LBB0_24 +// %bb.19: // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #504] // 8-byte Folded Reload + add x17, x12, x10 + ldp q18, q19, [x23, #32] + ldp q20, q21, [x23] + mov x18, xzr + add x0, x9, #1 + mul x16, x9, x28 + mul x2, x9, x19 + madd x1, x0, x28, x12 + add x16, x17, x16 + add x17, x11, x16, lsl #2 + add x16, x1, x10 + mul x1, x20, x8 + madd x0, x0, x19, x1 + add x16, x11, x16, lsl #2 + ldp q2, q0, [x17, #32] + add x2, x1, x2 + ldp q6, q4, [x17] + ldp q3, q1, [x16, #32] + ldp q7, q5, [x16] + lsl x2, x2, #2 + lsl x0, x0, #2 + ldr q17, [x25, x2] + ldr q16, [x25, x0] + ldp x0, x1, [sp, #288] // 16-byte Folded Reload + ldr x2, [sp, #472] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_21 + .p2align 2 +.LBB0_20: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x8, [sp, #456] // 8-byte Folded Reload + fmla v0.4s, v19.4s, v17.s[0] + fmla v2.4s, v18.4s, v17.s[0] + add x9, x2, #128 + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v17.s[0] + add x30, x2, #192 + add x3, x2, #256 + fmla v1.4s, v19.4s, v16.s[0] + fmla v3.4s, v18.4s, v16.s[0] + add x18, x18, #4 + add x4, x1, x8 + add x6, x0, x8 + add x8, x2, #64 + fmla v5.4s, 
v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + add x1, x1, #16 + add x0, x0, #16 + prfm pldl1keep, [x8] + add x5, x4, #32 + ldp q23, q22, [x2, #-160] + ldp q24, q25, [x2, #-192] + prfm pldl1keep, [x9] + ldp q19, q18, [x2, #-128] + add x7, x6, #32 + ldp q20, q21, [x2, #-96] + prfm pldl1keep, [x30] + fmla v4.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + fmla v5.4s, v25.4s, v16.s[1] + fmla v1.4s, v22.4s, v16.s[1] + fmla v6.4s, v24.4s, v17.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v3.4s, v23.4s, v16.s[1] + fmla v0.4s, v21.4s, v17.s[2] + ldp q23, q22, [x2, #-32] + ldp q24, q25, [x2, #-64] + fmla v4.4s, v18.4s, v17.s[2] + fmla v1.4s, v21.4s, v16.s[2] + fmla v5.4s, v18.4s, v16.s[2] + prfm pldl1keep, [x7] + fmla v2.4s, v20.4s, v17.s[2] + fmla v6.4s, v19.4s, v17.s[2] + fmla v3.4s, v20.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v4.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v5.4s, v25.4s, v16.s[3] + fmla v1.4s, v22.4s, v16.s[3] + fmla v6.4s, v24.4s, v17.s[3] + fmla v2.4s, v23.4s, v17.s[3] + ldr q17, [x6, #16] + prfm pldl1keep, [x5] + fmla v7.4s, v24.4s, v16.s[3] + fmla v3.4s, v23.4s, v16.s[3] + ldr q16, [x4, #16] + prfm pldl1keep, [x3] + ldp q18, q19, [x2, #32] + ldp q20, q21, [x2] + mov x2, x3 + cmp x18, x29 + b.lt .LBB0_20 +.LBB0_21: // in Loop: Header=BB0_8 Depth=2 + ldp q23, q22, [x15, #32] + ldp q24, q25, [x15] + fmla v0.4s, v19.4s, v17.s[0] + fmla v2.4s, v18.4s, v17.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v17.s[0] + fmla v1.4s, v19.4s, v16.s[0] + fmla v3.4s, v18.4s, v16.s[0] + ldp q19, q18, [x14] + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x14, #32] + fmla v4.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + ldr x18, [sp, #464] // 8-byte Folded Reload + ldr x0, [sp, #408] // 8-byte Folded Reload + fmla v6.4s, v24.4s, v17.s[1] + fmla v2.4s, v23.4s, v17.s[1] + ldr x30, [sp, #280] // 8-byte Folded Reload + mov x1, x27 + fmla v7.4s, v24.4s, v16.s[1] + 
fmla v5.4s, v25.4s, v16.s[1] + ldp q24, q25, [x13] + fmla v3.4s, v23.4s, v16.s[1] + fmla v1.4s, v22.4s, v16.s[1] + ldp q23, q22, [x13, #32] + fmla v0.4s, v21.4s, v17.s[2] + fmla v4.4s, v18.4s, v17.s[2] + ldp x3, x2, [sp, #256] // 16-byte Folded Reload + fmla v2.4s, v20.4s, v17.s[2] + fmla v6.4s, v19.4s, v17.s[2] + fmla v1.4s, v21.4s, v16.s[2] + fmla v3.4s, v20.4s, v16.s[2] + fmla v5.4s, v18.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v4.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v5.4s, v25.4s, v16.s[3] + fmla v1.4s, v22.4s, v16.s[3] + fmla v6.4s, v24.4s, v17.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v7.4s, v24.4s, v16.s[3] + fmla v3.4s, v23.4s, v16.s[3] + cmp x27, x21 + b.ge .LBB0_23 + .p2align 2 +.LBB0_22: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x8, x0, x3 + add x9, x0, x2 + prfm pldl1keep, [x18] + add x1, x1, #1 + add x8, x8, #4 + add x9, x9, #4 + ldp q16, q17, [x18, #-64] + ldp q18, q19, [x18, #-32] + prfm pldl1keep, [x9] + add x18, x18, #64 + ldr s20, [x0, x2] + prfm pldl1keep, [x8] + fmla v0.4s, v19.4s, v20.s[0] + ldr s21, [x0, x3] + fmla v2.4s, v18.4s, v20.s[0] + fmla v4.4s, v17.4s, v20.s[0] + fmla v6.4s, v16.4s, v20.s[0] + fmla v5.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + fmla v3.4s, v18.4s, v21.s[0] + fmla v1.4s, v19.4s, v21.s[0] + add x0, x0, #4 + cmp x1, x21 + b.lt .LBB0_22 +.LBB0_23: // in Loop: Header=BB0_8 Depth=2 + ldr x8, [sp, #448] // 8-byte Folded Reload + stp q6, q4, [x17] + stp q2, q0, [x17, #32] + stp q7, q5, [x16] + stp q3, q1, [x16, #32] +.LBB0_24: // in Loop: Header=BB0_8 Depth=2 + ldp x9, x1, [sp, #368] // 16-byte Folded Reload + ldr x16, [sp, #440] // 8-byte Folded Reload + cmp x16, x9 + ldr x0, [sp, #416] // 8-byte Folded Reload + b.ge .LBB0_7 +// %bb.25: // in Loop: Header=BB0_8 Depth=2 + mov x17, x8 + add x8, x12, x10 + ldr x12, [sp, #440] // 8-byte Folded Reload + ldr x2, [sp, #408] // 8-byte Folded Reload + ldp q7, q16, [x23, #32] + 
ldp q6, q5, [x23] + mov x16, xzr + mul x9, x12, x28 + add x8, x8, x9 + add x10, x11, x8, lsl #2 + mul x8, x12, x19 + ldr x11, [sp, #304] // 8-byte Folded Reload + ldr x12, [sp, #472] // 8-byte Folded Reload + madd x8, x20, x17, x8 + ldp q1, q0, [x10, #32] + ldp q3, q2, [x10] + lsl x8, x8, #2 + ldr q4, [x25, x8] + cmp xzr, x29 + b.ge .LBB0_27 + .p2align 2 +.LBB0_26: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x18, x12, #64 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + add x9, x12, #128 + prfm pldl1keep, [x18] + ldp q18, q17, [x12, #-160] + fmla v3.4s, v6.4s, v4.s[0] + ldp q19, q20, [x12, #-192] + fmla v2.4s, v5.4s, v4.s[0] + prfm pldl1keep, [x9] + ldp q6, q5, [x12, #-128] + ldp q7, q16, [x12, #-96] + add x8, x12, #192 + prfm pldl1keep, [x8] + add x17, x12, #256 + add x16, x16, #4 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, [x12, #-32] + ldp q19, q20, [x12, #-64] + prfm pldl1keep, [x11] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v5.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldur q4, [x11, #-16] + prfm pldl1keep, [x17] + add x11, x11, #16 + ldp q7, q16, [x12, #32] + ldp q6, q5, [x12] + mov x12, x17 + cmp x16, x29 + b.lt .LBB0_26 +.LBB0_27: // in Loop: Header=BB0_8 Depth=2 + ldp q18, q17, [x15, #32] + ldp q19, q20, [x15] + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + fmla v2.4s, v5.4s, v4.s[0] + ldp q6, q5, [x14] + ldp q7, q16, [x14, #32] + ldr x15, [sp, #248] // 8-byte Folded Reload + mov x11, xzr + mov w12, #64 // =0x40 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + fmla v0.4s, v16.4s, v4.s[2] + ldp q18, q17, [x13, #32] + ldp q19, q20, 
[x13] + fmla v2.4s, v5.4s, v4.s[2] + ldr x13, [sp, #272] // 8-byte Folded Reload + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + add x8, x27, xzr + cmp x8, x21 + b.ge .LBB0_6 + .p2align 2 +.LBB0_28: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x14, x15, x11, lsl #6 + add x8, x2, x13 + add x9, x15, x12 + add x13, x13, #4 + prfm pldl1keep, [x9] + add x12, x12, #64 + ldp q4, q5, [x14] + ldp q6, q7, [x14, #32] + prfm pldl1keep, [x8] + ldr s16, [x1, x11, lsl #2] + add x11, x11, #1 + fmla v0.4s, v7.4s, v16.s[0] + fmla v1.4s, v6.4s, v16.s[0] + fmla v2.4s, v5.4s, v16.s[0] + fmla v3.4s, v4.4s, v16.s[0] + add x8, x27, x11 + cmp x8, x21 + b.lt .LBB0_28 + b .LBB0_6 + .p2align 2 +.LBB0_29: // in Loop: Header=BB0_4 Depth=1 + ldp x10, x9, [sp, #328] // 16-byte Folded Reload + cmp x0, x30 + add x11, x10, x9, lsl #2 + lsl x9, x28, #1 + stp x9, x11, [sp, #424] // 16-byte Folded Spill + b.lt .LBB0_33 +// %bb.30: // in Loop: Header=BB0_4 Depth=1 + ldr x1, [sp, #176] // 8-byte Folded Reload + cmp x30, x1 + b.lt .LBB0_55 +.LBB0_31: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #168] // 8-byte Folded Reload + cmp x1, x10 + b.lt .LBB0_77 +.LBB0_32: // in Loop: Header=BB0_4 Depth=1 + ldr x9, [sp, #136] // 8-byte Folded Reload + cmp x10, x9 + b.ge .LBB0_3 + b .LBB0_99 + .p2align 2 +.LBB0_33: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #96] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x6, [sp, #432] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x15, [sp, #232] // 8-byte Folded Reload + ldr x16, [sp, #400] // 8-byte Folded Reload + mul x9, x20, x8 + ldp x8, x13, [sp, #416] // 16-byte Folded Reload + add x8, x9, x8 + add x12, x6, x8, lsl #2 + ldp q3, q2, [x12] + add x12, x8, x28 + add x12, x6, x12, lsl #2 + ldp q1, q0, [x12] 
+ add x12, x8, x13 + add x12, x6, x12, lsl #2 + ldp q5, q4, [x12] + add x12, x13, x28 + add x8, x8, x12 + add x8, x6, x8, lsl #2 + ldp q7, q6, [x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x21 + b.ge .LBB0_35 + .p2align 2 +.LBB0_34: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x16, x10 + add x12, x15, x10 + add x10, x10, #4 + prfm pldl1keep, [x13] + ldur s16, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s17, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s18, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s20, [x13, #-4] + prfm pldl1keep, [x12] + ldur s19, [x12, #-4] + add x12, x12, x26 + sub x13, x12, #4 + prfm pldl1keep, [x12] + add x12, x12, x26 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x26 + ld1 { v19.s }[1], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x26 + ld1 { v19.s }[2], [x14] + prfm pldl1keep, [x12] + ldur s21, [x12, #-4] + add x12, x12, x26 + ld1 { v19.s }[3], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x26 + prfm pldl1keep, [x12] + ld1 { v21.s }[1], [x13] + sub x14, x12, #4 + add x12, x12, x26 + prfm pldl1keep, [x12] + sub x12, x12, #4 + fmla v3.4s, v19.4s, v16.s[0] + fmla v1.4s, v19.4s, v17.s[0] + fmla v5.4s, v19.4s, v18.s[0] + fmla v7.4s, v19.4s, v20.s[0] + ld1 { v21.s }[2], [x14] + ld1 { v21.s }[3], [x12] + add x12, x8, x11, lsl #5 + add x11, x11, #1 + fmla v2.4s, v21.4s, v16.s[0] + fmla v0.4s, v21.4s, v17.s[0] + fmla v4.4s, v21.4s, v18.s[0] + fmla v6.4s, v21.4s, v20.s[0] + stp q19, q21, [x12] + cmp x11, x21 + b.lt .LBB0_34 +.LBB0_35: // %.preheader27 + // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #56] // 8-byte Folded Reload + ldp x15, x16, [sp, #384] // 16-byte Folded Reload + mov x11, xzr + add x10, x8, #128 + mov w18, #1 // =0x1 + mov w2, #2 // =0x2 + mov w1, #3 // =0x3 + mov w17, #4 // =0x4 + add x14, x8, x12 + b .LBB0_37 + .p2align 2 +.LBB0_36: // %.loopexit23 + // in Loop: 
Header=BB0_37 Depth=2 + add x16, x16, x24 + add x15, x15, x24 + mov x11, x17 + mov x17, x3 +.LBB0_37: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_39 Depth 3 + // Child Loop BB0_41 Depth 3 + madd x11, x11, x28, x9 + ldr x7, [sp, #416] // 8-byte Folded Reload + add x11, x11, x7 + madd x12, x18, x28, x9 + madd x13, x2, x28, x9 + add x12, x12, x7 + add x13, x13, x7 + add x11, x6, x11, lsl #2 + add x12, x6, x12, lsl #2 + stp q3, q2, [x11] + madd x11, x1, x28, x9 + stp q1, q0, [x12] + add x12, x6, x13, lsl #2 + add x11, x11, x7 + stp q5, q4, [x12] + add x11, x6, x11, lsl #2 + stp q7, q6, [x11] + ldr x11, [sp, #504] // 8-byte Folded Reload + cmp x17, x11 + ldr x11, [sp, #496] // 8-byte Folded Reload + add x13, x8, x11, lsl #5 + ldr x11, [sp, #488] // 8-byte Folded Reload + add x12, x8, x11, lsl #5 + ldr x11, [sp, #480] // 8-byte Folded Reload + add x11, x8, x11, lsl #5 + b.ge .LBB0_42 +// %bb.38: // in Loop: Header=BB0_37 Depth=2 + madd x5, x17, x28, x9 + add x18, x17, #1 + mov x30, x6 + add x2, x17, #2 + madd x6, x18, x28, x9 + add x1, x17, #3 + ldp q20, q21, [x8] + mov x4, xzr + add x3, x17, #4 + add x5, x5, x7 + add x5, x30, x5, lsl #2 + add x6, x6, x7 + add x6, x30, x6, lsl #2 + ldp q3, q2, [x5] + madd x5, x2, x28, x9 + ldp q1, q0, [x6] + madd x6, x1, x28, x9 + add x5, x5, x7 + add x5, x30, x5, lsl #2 + add x6, x6, x7 + add x6, x30, x6, lsl #2 + ldp q5, q4, [x5] + ldr x5, [sp, #448] // 8-byte Folded Reload + mul x5, x20, x5 + ldp q7, q6, [x6] + madd x6, x17, x19, x5 + lsl x6, x6, #2 + ldr q19, [x25, x6] + madd x6, x18, x19, x5 + lsl x6, x6, #2 + ldr q18, [x25, x6] + madd x6, x2, x19, x5 + madd x5, x1, x19, x5 + lsl x6, x6, #2 + lsl x5, x5, #2 + ldr q17, [x25, x6] + ldr q16, [x25, x5] + mov x5, x10 + mov x6, x16 + cmp xzr, x29 + b.ge .LBB0_40 + .p2align 2 +.LBB0_39: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_37 Depth=2 + // => This Inner Loop Header: Depth=3 + add x7, x5, #32 + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, 
v21.4s, v19.s[0] + add x4, x4, #4 + prfm pldl1keep, [x7] + ldp q22, q23, [x5, #-96] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + add x7, x5, #96 + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x5, #-64] + prfm pldl1keep, [x7] + add x7, x6, x22 + add x30, x7, x22 + fmla v2.4s, v23.4s, v19.s[1] + fmla v0.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v2.4s, v20.4s, v19.s[2] + ldp q22, q23, [x5, #-32] + fmla v0.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x5], #128 + prfm pldl1keep, [x6] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + ldur q19, [x6, #-16] + prfm pldl1keep, [x7] + fmla v1.4s, v22.4s, v18.s[3] + ldur q18, [x7, #-16] + add x7, x30, x22 + prfm pldl1keep, [x30] + add x6, x6, #16 + fmla v5.4s, v22.4s, v17.s[3] + ldur q17, [x30, #-16] + prfm pldl1keep, [x7] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x7, #-16] + cmp x4, x29 + b.lt .LBB0_39 +.LBB0_40: // in Loop: Header=BB0_37 Depth=2 + ldp q22, q23, [x13] + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v19.s[0] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + ldr x6, [sp, #432] // 8-byte Folded Reload + mov x13, x27 + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x12] + mov x12, x15 + fmla v2.4s, v23.4s, v19.s[1] + fmla v0.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla 
v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v2.4s, v20.4s, v19.s[2] + ldp q22, q23, [x11] + fmla v0.4s, v20.4s, v18.s[2] + mov x11, x14 + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + fmla v1.4s, v22.4s, v18.s[3] + fmla v5.4s, v22.4s, v17.s[3] + fmla v7.4s, v22.4s, v16.s[3] + cmp x27, x21 + b.ge .LBB0_36 + .p2align 2 +.LBB0_41: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_37 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x12, x22 + prfm pldl1keep, [x11] + ldp q16, q17, [x11, #-32] + prfm pldl1keep, [x12] + ldur s18, [x12, #-4] + add x13, x13, #1 + add x12, x12, #4 + prfm pldl1keep, [x4] + ldur s19, [x4, #-4] + add x4, x4, x22 + add x11, x11, #32 + prfm pldl1keep, [x4] + ldur s20, [x4, #-4] + add x4, x4, x22 + fmla v2.4s, v17.4s, v18.s[0] + prfm pldl1keep, [x4] + ldur s21, [x4, #-4] + fmla v3.4s, v16.4s, v18.s[0] + fmla v0.4s, v17.4s, v19.s[0] + fmla v1.4s, v16.4s, v19.s[0] + fmla v4.4s, v17.4s, v20.s[0] + fmla v5.4s, v16.4s, v20.s[0] + fmla v6.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + cmp x13, x21 + b.lt .LBB0_41 + b .LBB0_36 + .p2align 2 +.LBB0_42: // in Loop: Header=BB0_4 Depth=1 + ldr x14, [sp, #504] // 8-byte Folded Reload + ldr x15, [sp, #440] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_48 +// %bb.43: // in Loop: Header=BB0_4 Depth=1 + ldr x1, [sp, #504] // 8-byte Folded Reload + ldr x18, [sp, #416] // 8-byte Folded Reload + mov x16, xzr + mul x14, x1, x28 + add x17, x1, #1 + mul x1, x1, x19 + ldp q6, q7, [x8] + madd x15, x17, x28, x9 + add x14, x9, x14 + add x14, x14, x18 + add x15, x15, x18 + ldr x18, [sp, #448] // 8-byte Folded Reload + add x14, x6, x14, lsl #2 + add x15, x6, x15, lsl 
#2 + mul x18, x20, x18 + ldp q1, q0, [x14] + ldp q3, q2, [x15] + madd x17, x17, x19, x18 + add x1, x18, x1 + lsl x1, x1, #2 + lsl x17, x17, #2 + ldr q5, [x25, x1] + ldr q4, [x25, x17] + ldp x18, x1, [sp, #288] // 16-byte Folded Reload + mov x17, x10 + cmp xzr, x29 + b.ge .LBB0_45 + .p2align 2 +.LBB0_44: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x7, x17, #32 + ldr x4, [sp, #456] // 8-byte Folded Reload + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + prfm pldl1keep, [x7] + ldp q16, q17, [x17, #-96] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x17, #-64] + add x6, x17, #96 + prfm pldl1keep, [x6] + add x16, x16, #4 + add x2, x1, x4 + add x4, x18, x4 + add x1, x1, #16 + add x18, x18, #16 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + add x3, x2, #32 + add x5, x4, #32 + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + ldp q16, q17, [x17, #-32] + fmla v0.4s, v6.4s, v5.s[2] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + ldp q6, q7, [x17], #128 + prfm pldl1keep, [x5] + fmla v1.4s, v16.4s, v5.s[3] + ldr q5, [x4, #16] + prfm pldl1keep, [x3] + fmla v3.4s, v16.4s, v4.s[3] + ldr q4, [x2, #16] + cmp x16, x29 + b.lt .LBB0_44 +.LBB0_45: // in Loop: Header=BB0_4 Depth=1 + ldp q16, q17, [x13] + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x12] + ldr x18, [sp, #72] // 8-byte Folded Reload + mov x16, xzr + mov x17, xzr + mov x1, x27 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + add x18, x8, x18 + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + ldp q16, q17, [x11] + fmla v0.4s, v6.4s, v5.s[2] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + fmla v1.4s, v16.4s, v5.s[3] + fmla v3.4s, 
v16.4s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_47 + .p2align 2 +.LBB0_46: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x6, [sp, #240] // 8-byte Folded Reload + ldr x7, [sp, #312] // 8-byte Folded Reload + add x4, x18, x17, lsl #3 + add x5, x18, x16 + add x1, x1, #1 + add x16, x16, #32 + add x4, x4, #32 + prfm pldl1keep, [x4] + ldp q4, q5, [x5] + add x2, x6, x17 + add x3, x7, x17 + add x2, x2, #4 + add x3, x3, #4 + prfm pldl1keep, [x3] + ldr s6, [x7, x17] + prfm pldl1keep, [x2] + fmla v0.4s, v5.4s, v6.s[0] + ldr s7, [x6, x17] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v5.4s, v7.s[0] + fmla v3.4s, v4.4s, v7.s[0] + add x17, x17, #4 + cmp x1, x21 + b.lt .LBB0_46 +.LBB0_47: // in Loop: Header=BB0_4 Depth=1 + ldr x6, [sp, #432] // 8-byte Folded Reload + stp q1, q0, [x14] + stp q3, q2, [x15] +.LBB0_48: // in Loop: Header=BB0_4 Depth=1 + ldr x14, [sp, #368] // 8-byte Folded Reload + ldr x15, [sp, #440] // 8-byte Folded Reload + cmp x15, x14 + b.ge .LBB0_54 +// %bb.49: // in Loop: Header=BB0_4 Depth=1 + ldp x17, x16, [sp, #440] // 16-byte Folded Reload + ldp q4, q3, [x8] + ldr x18, [sp, #376] // 8-byte Folded Reload + mov x14, xzr + mul x15, x17, x28 + add x9, x9, x15 + ldr x15, [sp, #416] // 8-byte Folded Reload + add x9, x9, x15 + mul x15, x17, x19 + madd x15, x20, x16, x15 + add x9, x6, x9, lsl #2 + ldp q1, q0, [x9] + lsl x15, x15, #2 + ldr q2, [x25, x15] + ldr x15, [sp, #304] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_51 + .p2align 2 +.LBB0_50: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x17, x10, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x16, x10, #96 + prfm pldl1keep, [x17] + ldp q5, q6, [x10, #-96] + add x14, x14, #4 + ldp q4, q3, [x10, #-64] + prfm pldl1keep, [x16] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x10, #-32] + prfm pldl1keep, [x15] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, 
v5.4s, v2.s[3] + ldur q2, [x15, #-16] + ldp q4, q3, [x10], #128 + add x15, x15, #16 + cmp x14, x29 + b.lt .LBB0_50 +.LBB0_51: // in Loop: Header=BB0_4 Depth=1 + ldp q5, q6, [x13] + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + ldp q4, q3, [x12] + mov x10, xzr + mov x14, xzr + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x11] + ldr x11, [sp, #72] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + add x8, x8, x11 + mov x11, x27 + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x27, x21 + b.ge .LBB0_53 + .p2align 2 +.LBB0_52: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x8, x14, lsl #3 + add x12, x18, x14 + add x15, x8, x10 + add x11, x11, #1 + add x12, x12, #4 + add x10, x10, #32 + add x13, x13, #32 + prfm pldl1keep, [x13] + ldp q2, q3, [x15] + prfm pldl1keep, [x12] + ldr s4, [x18, x14] + add x14, x14, #4 + fmla v0.4s, v3.4s, v4.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x11, x21 + b.lt .LBB0_52 +.LBB0_53: // in Loop: Header=BB0_4 Depth=1 + stp q1, q0, [x9] +.LBB0_54: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, [sp, #448] // 8-byte Folded Reload + ldr x30, [sp, #280] // 8-byte Folded Reload + ldr x1, [sp, #176] // 8-byte Folded Reload + cmp x30, x1 + b.ge .LBB0_31 +.LBB0_55: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #88] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x5, [sp, #280] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldp x13, x6, [sp, #424] // 16-byte Folded Reload + ldr x15, [sp, #200] // 8-byte Folded Reload + mul x9, x20, x8 + ldr x16, [sp, #400] // 8-byte Folded Reload + add x8, x9, x5 + lsl x12, x8, #2 + ldr q0, [x6, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr q1, [x6, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr q2, [x6, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr q3, [x6, x8] + add x8, x0, #63 + and x8, x8, 
#0xffffffffffffffc0 + cmp xzr, x21 + b.ge .LBB0_57 + .p2align 2 +.LBB0_56: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x16, x10 + add x12, x15, x10 + add x10, x10, #4 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, x26 + sub x13, x12, #4 + prfm pldl1keep, [x12] + add x12, x12, x26 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x26 + ld1 { v16.s }[1], [x13] + prfm pldl1keep, [x12] + sub x12, x12, #4 + ld1 { v16.s }[2], [x14] + ld1 { v16.s }[3], [x12] + str q16, [x8, x11, lsl #4] + add x11, x11, #1 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x11, x21 + b.lt .LBB0_56 +.LBB0_57: // %.preheader26 + // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #48] // 8-byte Folded Reload + ldp x12, x13, [sp, #384] // 16-byte Folded Reload + mov x1, xzr + add x10, x8, #48 + mov w15, #1 // =0x1 + mov w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_59 + .p2align 2 +.LBB0_58: // %.loopexit22 + // in Loop: Header=BB0_59 Depth=2 + add x13, x13, x24 + add x12, x12, x24 + mov x1, x14 + mov x14, x18 +.LBB0_59: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_61 Depth 3 + // Child Loop BB0_63 Depth 3 + madd x18, x1, x28, x9 + add x18, x18, x5 + madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x5 + add x16, x16, x5 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str q0, [x6, x18] + str q1, [x6, x15] + add x15, x17, x5 + lsl x15, x15, #2 + str q2, [x6, x16] + str q3, [x6, x15] + ldr x15, [sp, #504] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_64 +// %bb.60: // in Loop: Header=BB0_59 
Depth=2 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x16, x14, #2 + add x17, x14, #3 + madd x3, x15, x28, x9 + ldr q16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x5 + lsl x2, x2, #2 + add x3, x3, x5 + lsl x3, x3, #2 + ldr q0, [x6, x2] + madd x2, x16, x28, x9 + add x2, x2, x5 + ldr q1, [x6, x3] + madd x3, x17, x28, x9 + lsl x2, x2, #2 + ldr q2, [x6, x2] + add x2, x3, x5 + lsl x2, x2, #2 + ldr q3, [x6, x2] + ldr x2, [sp, #448] // 8-byte Folded Reload + mul x2, x20, x2 + madd x3, x14, x19, x2 + lsl x3, x3, #2 + ldr q7, [x25, x3] + madd x3, x15, x19, x2 + lsl x3, x3, #2 + ldr q6, [x25, x3] + madd x3, x16, x19, x2 + madd x2, x17, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x25, x3] + ldr q4, [x25, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x29 + b.ge .LBB0_62 + .p2align 2 +.LBB0_61: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_59 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, #32 + fmla v0.4s, v16.4s, v7.s[0] + fmla v1.4s, v16.4s, v6.s[0] + add x1, x1, #4 + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x22 + ldp q16, q17, [x2, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v1.4s, v16.4s, v6.s[1] + fmla v2.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v1.4s, v17.4s, v6.s[2] + fmla v2.4s, v17.4s, v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x2], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.4s, v17.4s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x22 + fmla v2.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x29 + b.lt .LBB0_61 +.LBB0_62: // in Loop: Header=BB0_59 Depth=2 + ldp x1, x2, [sp, #488] // 16-byte Folded Reload + fmla v0.4s, v16.4s, v7.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, 
v4.s[0] + mov x3, x27 + ldr q17, [x8, x2, lsl #4] + ldr q16, [x8, x1, lsl #4] + ldr x1, [sp, #480] // 8-byte Folded Reload + mov x2, x12 + ldr q18, [x8, x1, lsl #4] + mov x1, x11 + fmla v0.4s, v17.4s, v7.s[1] + fmla v1.4s, v17.4s, v6.s[1] + fmla v2.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + fmla v0.4s, v16.4s, v7.s[2] + fmla v1.4s, v16.4s, v6.s[2] + fmla v2.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v1.4s, v18.4s, v6.s[3] + fmla v2.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_58 + .p2align 2 +.LBB0_63: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_59 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, x22 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #16 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x22 + fmla v0.4s, v4.4s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v4.4s, v7.s[0] + fmla v3.4s, v4.4s, v16.s[0] + cmp x3, x21 + b.lt .LBB0_63 + b .LBB0_58 + .p2align 2 +.LBB0_64: // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #504] // 8-byte Folded Reload + ldr x13, [sp, #440] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_70 +// %bb.65: // in Loop: Header=BB0_4 Depth=1 + ldr x17, [sp, #504] // 8-byte Folded Reload + ldr x16, [sp, #448] // 8-byte Folded Reload + mov x14, xzr + mul x12, x17, x28 + add x15, x17, #1 + mul x16, x20, x16 + mul x17, x17, x19 + ldr q4, [x8] + madd x13, x15, x28, x9 + madd x15, x15, x19, x16 + add x12, x9, x12 + add x17, x16, x17 + add x12, x12, x5 + add x13, x13, x5 + lsl x17, x17, #2 + lsl x15, x15, #2 + add x12, x6, x12, lsl #2 + add x13, x6, x13, lsl #2 + ldr q3, [x25, x17] + ldr q2, [x25, x15] + ldp x16, x17, [sp, #288] // 16-byte Folded Reload + mov x15, x10 + ldr q0, [x12] + ldr q1, [x13] + cmp xzr, x29 + b.ge 
.LBB0_67 + .p2align 2 +.LBB0_66: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x4, x15, #32 + ldr x2, [sp, #456] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + prfm pldl1keep, [x4] + ldp q4, q5, [x15, #-32] + add x14, x14, #4 + add x18, x17, x2 + add x2, x16, x2 + add x17, x17, #16 + add x16, x16, #16 + add x1, x18, #32 + add x3, x2, #32 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x15], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x18, #16] + cmp x14, x29 + b.lt .LBB0_66 +.LBB0_67: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x15, [sp, #488] // 16-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldp x1, x18, [sp, #256] // 16-byte Folded Reload + ldr q5, [x8, x15, lsl #4] + ldr q4, [x8, x14, lsl #4] + ldr x14, [sp, #480] // 8-byte Folded Reload + mov x15, x27 + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x14, lsl #4] + ldr x14, [sp, #408] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x27, x21 + b.ge .LBB0_69 + .p2align 2 +.LBB0_68: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x16, x14, x1 + add x17, x14, x18 + prfm pldl1keep, [x11] + ldur q2, [x11, #-16] + add x16, x16, #4 + add x17, x17, #4 + add x15, x15, #1 + add x11, x11, #16 + prfm pldl1keep, [x17] + ldr s3, [x14, x18] + prfm pldl1keep, [x16] + ldr s4, [x14, x1] + add x14, x14, #4 + fmla v0.4s, v2.4s, v3.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x15, x21 + b.lt .LBB0_68 +.LBB0_69: // in Loop: Header=BB0_4 Depth=1 + str q0, [x12] + str q1, [x13] +.LBB0_70: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #368] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x12, 
x11 + b.ge .LBB0_76 +// %bb.71: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x13, [sp, #440] // 16-byte Folded Reload + ldr q2, [x8] + mov x11, xzr + mul x12, x14, x28 + add x9, x9, x12 + mul x12, x14, x19 + ldr x14, [sp, #376] // 8-byte Folded Reload + madd x12, x20, x13, x12 + add x9, x9, x5 + add x9, x6, x9, lsl #2 + ldr q0, [x9] + lsl x12, x12, #2 + ldr q1, [x25, x12] + ldr x12, [sp, #304] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_73 + .p2align 2 +.LBB0_72: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x10, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x10, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x10], #64 + prfm pldl1keep, [x12] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + cmp x11, x29 + b.lt .LBB0_72 +.LBB0_73: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #496] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + ldr x12, [sp, #272] // 8-byte Folded Reload + mov x10, xzr + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #488] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x11, lsl #4] + ldr x11, [sp, #480] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #24] // 8-byte Folded Reload + add x8, x8, x11 + mov w11, #16 // =0x10 + fmla v0.4s, v3.4s, v1.s[3] + add x13, x27, xzr + cmp x13, x21 + b.ge .LBB0_75 + .p2align 2 +.LBB0_74: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x8, x11 + add x11, x11, #16 + prfm pldl1keep, [x13] + ldr x13, [sp, #408] // 8-byte Folded Reload + ldr q1, [x8, x10, lsl #4] + add x13, x13, x12 + add x12, x12, #4 + prfm pldl1keep, [x13] + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.4s, v1.4s, v2.s[0] + add x13, x27, x10 + cmp x13, x21 + b.lt .LBB0_74 +.LBB0_75: // in Loop: Header=BB0_4 Depth=1 + str q0, [x9] +.LBB0_76: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, 
[sp, #448] // 8-byte Folded Reload + ldp x10, x1, [sp, #168] // 16-byte Folded Reload + cmp x1, x10 + b.ge .LBB0_32 +.LBB0_77: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #80] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x5, [sp, #176] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldp x13, x6, [sp, #424] // 16-byte Folded Reload + ldp x17, x16, [sp, #208] // 16-byte Folded Reload + ldr x18, [sp, #400] // 8-byte Folded Reload + mul x9, x20, x8 + add x8, x9, x5 + lsl x12, x8, #2 + ldr d0, [x6, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr d1, [x6, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr d2, [x6, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr d3, [x6, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x21 + b.ge .LBB0_79 + .p2align 2 +.LBB0_78: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x15, x18, x10 + add x12, x16, x10 + add x14, x17, x10 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x15, x15, x22 + add x13, x12, #4 + add x14, x14, #4 + prfm pldl1keep, [x15] + ldur s5, [x15, #-4] + add x15, x15, x22 + prfm pldl1keep, [x15] + ldur s6, [x15, #-4] + add x15, x15, x22 + prfm pldl1keep, [x15] + ldur s7, [x15, #-4] + prfm pldl1keep, [x14] + prfm pldl1keep, [x13] + ldr s16, [x17, x10] + add x10, x10, #4 + ld1 { v16.s }[1], [x12] + fmla v0.2s, v16.2s, v4.s[0] + str d16, [x8, x11, lsl #3] + add x11, x11, #1 + fmla v1.2s, v16.2s, v5.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x11, x21 + b.lt .LBB0_78 +.LBB0_79: // %.preheader25 + // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #40] // 8-byte Folded Reload + ldp x12, x13, [sp, #384] // 16-byte Folded Reload + mov x1, xzr + add x10, x8, #24 + mov w15, #1 // =0x1 + mov w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_81 + .p2align 2 +.LBB0_80: // %.loopexit21 + // in Loop: Header=BB0_81 Depth=2 + add x13, 
x13, x24 + add x12, x12, x24 + mov x1, x14 + mov x14, x18 +.LBB0_81: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_83 Depth 3 + // Child Loop BB0_85 Depth 3 + madd x18, x1, x28, x9 + add x18, x18, x5 + madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x5 + add x16, x16, x5 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str d0, [x6, x18] + str d1, [x6, x15] + add x15, x17, x5 + lsl x15, x15, #2 + str d2, [x6, x16] + str d3, [x6, x15] + ldr x15, [sp, #504] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_86 +// %bb.82: // in Loop: Header=BB0_81 Depth=2 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x16, x14, #2 + add x17, x14, #3 + madd x3, x15, x28, x9 + ldr d16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x5 + lsl x2, x2, #2 + add x3, x3, x5 + lsl x3, x3, #2 + ldr d0, [x6, x2] + madd x2, x16, x28, x9 + add x2, x2, x5 + ldr d1, [x6, x3] + madd x3, x17, x28, x9 + lsl x2, x2, #2 + ldr d2, [x6, x2] + add x2, x3, x5 + lsl x2, x2, #2 + ldr d3, [x6, x2] + ldr x2, [sp, #448] // 8-byte Folded Reload + mul x2, x20, x2 + madd x3, x14, x19, x2 + lsl x3, x3, #2 + ldr q7, [x25, x3] + madd x3, x15, x19, x2 + lsl x3, x3, #2 + ldr q6, [x25, x3] + madd x3, x16, x19, x2 + madd x2, x17, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x25, x3] + ldr q4, [x25, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x29 + b.ge .LBB0_84 + .p2align 2 +.LBB0_83: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_81 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + add x1, x1, #4 + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x22 + ldp d16, d17, [x2, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v1.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] + fmla v1.2s, v17.2s, v6.s[2] + fmla v2.2s, v17.2s, v5.s[2] + fmla v3.2s, 
v17.2s, v4.s[2] + ldp d17, d16, [x2], #32 + prfm pldl1keep, [x3] + fmla v0.2s, v17.2s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x22 + fmla v2.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x29 + b.lt .LBB0_83 +.LBB0_84: // in Loop: Header=BB0_81 Depth=2 + ldp x1, x2, [sp, #488] // 16-byte Folded Reload + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + mov x3, x27 + ldr d17, [x8, x2, lsl #3] + ldr d16, [x8, x1, lsl #3] + ldr x1, [sp, #480] // 8-byte Folded Reload + mov x2, x12 + ldr d18, [x8, x1, lsl #3] + mov x1, x11 + fmla v0.2s, v17.2s, v7.s[1] + fmla v1.2s, v17.2s, v6.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + fmla v0.2s, v16.2s, v7.s[2] + fmla v1.2s, v16.2s, v6.s[2] + fmla v2.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v1.2s, v18.2s, v6.s[3] + fmla v2.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_80 + .p2align 2 +.LBB0_85: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_81 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, x22 + prfm pldl1keep, [x1] + ldur d4, [x1, #-8] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #8 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x22 + fmla v0.2s, v4.2s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.2s, v4.2s, v6.s[0] + fmla v2.2s, v4.2s, v7.s[0] + fmla v3.2s, v4.2s, v16.s[0] + cmp x3, x21 + b.lt .LBB0_85 + b .LBB0_80 + .p2align 2 +.LBB0_86: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #504] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_92 +// 
%bb.87: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #504] // 8-byte Folded Reload + ldr x15, [sp, #448] // 8-byte Folded Reload + mov x13, xzr + mul x11, x16, x28 + add x14, x16, #1 + mul x15, x20, x15 + mul x16, x16, x19 + ldr d4, [x8] + madd x12, x14, x28, x9 + madd x14, x14, x19, x15 + add x11, x9, x11 + add x16, x15, x16 + add x11, x11, x5 + add x12, x12, x5 + lsl x16, x16, #2 + lsl x14, x14, #2 + add x11, x6, x11, lsl #2 + add x12, x6, x12, lsl #2 + ldr q3, [x25, x16] + ldr q2, [x25, x14] + ldp x15, x16, [sp, #288] // 16-byte Folded Reload + mov x14, x10 + ldr d0, [x11] + ldr d1, [x12] + cmp xzr, x29 + b.ge .LBB0_89 + .p2align 2 +.LBB0_88: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x3, x14, #16 + ldr x1, [sp, #456] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + prfm pldl1keep, [x3] + ldp d4, d5, [x14, #-16] + add x13, x13, #4 + add x17, x16, x1 + add x1, x15, x1 + add x16, x16, #16 + add x15, x15, #16 + add x18, x17, #32 + add x2, x1, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, v5.2s, v2.s[2] + ldp d5, d4, [x14], #32 + prfm pldl1keep, [x2] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x1, #16] + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + cmp x13, x29 + b.lt .LBB0_88 +.LBB0_89: // in Loop: Header=BB0_4 Depth=1 + ldr x15, [sp, #496] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr x1, [sp, #240] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #488] // 8-byte Folded Reload + ldr d4, [x8, x15, lsl #3] + ldr x15, [sp, #480] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #64] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, v4.2s, v2.s[2] + add x15, x8, x15 + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + add x16, 
x27, xzr + cmp x16, x21 + b.ge .LBB0_91 + .p2align 2 +.LBB0_90: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x2, [sp, #312] // 8-byte Folded Reload + add x18, x15, x14, lsl #3 + add x16, x1, x13 + add x16, x16, #4 + add x18, x18, #8 + prfm pldl1keep, [x18] + ldr d2, [x15, x14, lsl #3] + add x17, x2, x13 + add x13, x13, #4 + add x17, x17, #4 + prfm pldl1keep, [x17] + ldr s3, [x2, x14, lsl #2] + prfm pldl1keep, [x16] + fmla v0.2s, v2.2s, v3.s[0] + ldr s4, [x1, x14, lsl #2] + fmla v1.2s, v2.2s, v4.s[0] + add x14, x14, #1 + add x16, x27, x14 + cmp x16, x21 + b.lt .LBB0_90 +.LBB0_91: // in Loop: Header=BB0_4 Depth=1 + str d0, [x11] + str d1, [x12] +.LBB0_92: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #368] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_98 +// %bb.93: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x13, [sp, #440] // 16-byte Folded Reload + ldr d2, [x8] + mov x11, xzr + mul x12, x14, x28 + add x9, x9, x12 + mul x12, x14, x19 + ldr x14, [sp, #376] // 8-byte Folded Reload + madd x12, x20, x13, x12 + add x9, x9, x5 + add x9, x6, x9, lsl #2 + ldr d0, [x9] + lsl x12, x12, #2 + ldr q1, [x25, x12] + ldr x12, [sp, #304] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_95 + .p2align 2 +.LBB0_94: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x10, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp d2, d3, [x10, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x10], #32 + prfm pldl1keep, [x12] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + cmp x11, x29 + b.lt .LBB0_94 +.LBB0_95: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #496] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[0] + mov x10, xzr + ldr d3, [x8, x11, lsl #3] + ldr x11, [sp, #488] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + ldr d4, [x8, x11, lsl #3] + ldr x11, [sp, #480] // 8-byte 
Folded Reload + fmla v0.2s, v4.2s, v1.s[2] + ldr d2, [x8, x11, lsl #3] + ldr x11, [sp, #64] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #272] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[3] + add x12, x27, xzr + cmp x12, x21 + b.ge .LBB0_97 + .p2align 2 +.LBB0_96: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x10, lsl #3 + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr x12, [sp, #408] // 8-byte Folded Reload + ldr d1, [x8, x10, lsl #3] + add x12, x12, x11 + add x11, x11, #4 + prfm pldl1keep, [x12] + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.2s, v1.2s, v2.s[0] + add x12, x27, x10 + cmp x12, x21 + b.lt .LBB0_96 +.LBB0_97: // in Loop: Header=BB0_4 Depth=1 + str d0, [x9] +.LBB0_98: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, [sp, #448] // 8-byte Folded Reload + ldr x10, [sp, #168] // 8-byte Folded Reload + ldr x9, [sp, #136] // 8-byte Folded Reload + cmp x10, x9 + b.ge .LBB0_3 +.LBB0_99: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #112] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x5, [sp, #168] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldp x13, x14, [sp, #424] // 16-byte Folded Reload + mul x9, x20, x8 + add x12, x9, x5 + add x8, x12, x13 + add x13, x13, x28 + ldr s2, [x14, x12, lsl #2] + add x13, x12, x13 + ldr s1, [x14, x8, lsl #2] + add x8, x0, #63 + ldr s0, [x14, x13, lsl #2] + add x13, x12, x28 + ldr x12, [sp, #224] // 8-byte Folded Reload + and x8, x8, #0xffffffffffffffc0 + ldr s3, [x14, x13, lsl #2] + ldr x14, [sp, #400] // 8-byte Folded Reload + cmp xzr, x21 + b.ge .LBB0_101 + .p2align 2 +.LBB0_100: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x14, x10 + add x11, x11, #1 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x22 + 
prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, #4 + fmla v2.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x8, x10] + add x10, x10, #4 + cmp x11, x21 + b.lt .LBB0_100 +.LBB0_101: // %.preheader24 + // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #32] // 8-byte Folded Reload + ldp x12, x13, [sp, #384] // 16-byte Folded Reload + mov x1, xzr + ldp x7, x6, [sp, #184] // 16-byte Folded Reload + add x10, x8, #12 + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w15, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_103 + .p2align 2 +.LBB0_102: // %.loopexit20 + // in Loop: Header=BB0_103 Depth=2 + add x13, x13, x24 + add x12, x12, x24 + mov x1, x14 + mov x14, x18 +.LBB0_103: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_105 Depth 3 + // Child Loop BB0_107 Depth 3 + madd x18, x1, x28, x9 + ldr x30, [sp, #432] // 8-byte Folded Reload + add x18, x18, x5 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + madd x15, x15, x28, x9 + add x16, x16, x5 + add x15, x15, x5 + str s2, [x30, x18, lsl #2] + str s3, [x30, x16, lsl #2] + add x16, x17, x5 + str s1, [x30, x16, lsl #2] + str s0, [x30, x15, lsl #2] + ldr x15, [sp, #504] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_108 +// %bb.104: // in Loop: Header=BB0_103 Depth=2 + madd x2, x14, x28, x9 + add x15, x14, #3 + add x16, x14, #1 + add x17, x14, #2 + madd x3, x16, x28, x9 + ldr s16, [x8] + mov x1, xzr + add x18, x14, #4 + madd x4, x17, x28, x9 + add x2, x2, x5 + ldr s2, [x30, x2, lsl #2] + madd x2, x15, x28, x9 + add x3, x3, x5 + add x4, x4, x5 + add x2, x2, x5 + ldr s3, [x30, x3, lsl #2] + ldr s1, [x30, x4, lsl #2] + ldr s0, [x30, x2, lsl #2] + ldr x2, [sp, #448] // 8-byte Folded Reload + mul x2, x20, x2 + madd x3, x14, x19, x2 + lsl x3, x3, #2 + ldr q7, [x25, x3] + madd x3, x16, x19, x2 + lsl x3, x3, #2 + ldr q6, [x25, x3] + madd x3, x17, x19, x2 
+ madd x2, x15, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x25, x3] + ldr q4, [x25, x2] + mov x2, x10 + mov x3, x13 + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x29 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_106 + .p2align 2 +.LBB0_105: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_103 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, #8 + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + add x1, x1, #4 + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x4] + add x4, x3, x22 + ldp s16, s21, [x2, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v2.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v2.2s, v21.2s, v20.2s + ldp s17, s16, [x2], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v1.2s, v21.2s, v18.2s + prfm pldl1keep, [x3] + fmla v2.2s, v17.2s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x22 + fmla v1.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + cmp x1, x29 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_105 +.LBB0_106: // in Loop: Header=BB0_103 Depth=2 + ldp x1, x2, [sp, #488] // 16-byte Folded Reload + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + mov x3, x27 + ldr s21, [x8, x2, lsl #2] + ldr s16, [x8, x1, lsl #2] + ldr x1, [sp, #480] // 8-byte Folded Reload + mov x2, x12 + ldr s22, [x8, x1, lsl #2] + mov x1, x11 + fmla v2.2s, v21.2s, v7.s[1] + fmla v3.2s, v21.2s, v6.s[1] + fmla v1.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + fmla v2.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v1.2s, v16.2s, 
v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v2.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v1.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_102 + .p2align 2 +.LBB0_107: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_103 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, x22 + prfm pldl1keep, [x1] + ldur s4, [x1, #-4] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #4 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x22 + fmla v2.2s, v4.2s, v5.2s + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v3.2s, v4.2s, v6.2s + fmla v1.2s, v4.2s, v7.2s + fmla v0.2s, v4.2s, v16.2s + cmp x3, x21 + b.lt .LBB0_107 + b .LBB0_102 + .p2align 2 +.LBB0_108: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #504] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_114 +// %bb.109: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #504] // 8-byte Folded Reload + ldr x12, [sp, #448] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr s4, [x8] + mul x12, x20, x12 + mul x15, x16, x19 + mul x11, x16, x28 + add x15, x12, x15 + add x11, x9, x11 + lsl x15, x15, #2 + add x11, x11, x5 + ldr q2, [x25, x15] + add x15, x16, #1 + madd x16, x15, x19, x12 + madd x12, x15, x28, x9 + ldr x15, [sp, #432] // 8-byte Folded Reload + add x12, x12, x5 + ldr s1, [x15, x11, lsl #2] + ldr s0, [x15, x12, lsl #2] + lsl x15, x16, #2 + ldr q3, [x25, x15] + ext v6.16b, v2.16b, v2.16b, #8 + cmp xzr, x29 + ext v5.16b, v3.16b, v3.16b, #8 + b.ge .LBB0_111 + .p2align 2 +.LBB0_110: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x1, x8, x13 + fmla v1.2s, v4.2s, v2.2s + fmla v0.2s, v4.2s, v3.2s + add x15, x6, x13 + add x2, x1, #20 + add x17, x7, x13 + add x16, x15, #32 + add x18, x17, #32 + prfm pldl1keep, [x2] + ldp s4, s7, [x1, #4] + add x14, x14, #4 + add x13, x13, #16 + 
fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x1, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + prfm pldl1keep, [x16] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x15, #16] + ext v6.16b, v2.16b, v2.16b, #8 + cmp x14, x29 + ext v5.16b, v3.16b, v3.16b, #8 + b.lt .LBB0_110 +.LBB0_111: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x15, [sp, #488] // 16-byte Folded Reload + fmla v1.2s, v4.2s, v2.2s + fmla v0.2s, v4.2s, v3.2s + ldr x1, [sp, #240] // 8-byte Folded Reload + ldr x3, [sp, #432] // 8-byte Folded Reload + mov x13, xzr + ldr s7, [x8, x15, lsl #2] + ldr s4, [x8, x14, lsl #2] + ldr x14, [sp, #480] // 8-byte Folded Reload + mov x15, x27 + fmla v1.2s, v7.2s, v2.s[1] + fmla v0.2s, v7.2s, v3.s[1] + ldr s7, [x8, x14, lsl #2] + ldr x14, [sp, #104] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + add x14, x8, x14 + fmla v1.2s, v7.2s, v2.s[3] + fmla v0.2s, v7.2s, v3.s[3] + cmp x27, x21 + b.ge .LBB0_113 + .p2align 2 +.LBB0_112: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x2, [sp, #312] // 8-byte Folded Reload + add x16, x1, x13 + add x18, x14, x13 + add x15, x15, #1 + add x16, x16, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + add x17, x2, x13 + ldr s2, [x14, x13] + add x17, x17, #4 + prfm pldl1keep, [x17] + prfm pldl1keep, [x16] + ldr s3, [x2, x13] + fmla v1.2s, v2.2s, v3.2s + ldr s3, [x1, x13] + add x13, x13, #4 + fmla v0.2s, v2.2s, v3.2s + cmp x15, x21 + b.lt .LBB0_112 +.LBB0_113: // in Loop: Header=BB0_4 Depth=1 + str s1, [x3, x11, lsl #2] + str s0, [x3, x12, lsl #2] +.LBB0_114: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #368] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_2 +// %bb.115: // in Loop: Header=BB0_4 Depth=1 + ldp x15, x13, [sp, #432] // 16-byte Folded Reload + ldr s2, [x8] + mov x11, xzr + ldr x14, [sp, #376] // 8-byte Folded 
Reload + mul x12, x13, x28 + add x9, x9, x12 + mul x12, x13, x19 + ldr x13, [sp, #448] // 8-byte Folded Reload + add x9, x9, x5 + ldr s0, [x15, x9, lsl #2] + madd x12, x20, x13, x12 + lsl x12, x12, #2 + ldr q1, [x25, x12] + ldr x12, [sp, #304] // 8-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x29 + b.ge .LBB0_117 + .p2align 2 +.LBB0_116: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x10, #8 + fmla v0.2s, v2.2s, v1.2s + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x10, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x10], #16 + prfm pldl1keep, [x12] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x11, x29 + b.lt .LBB0_116 +.LBB0_117: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #496] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.2s + mov x10, xzr + ldr s4, [x8, x11, lsl #2] + ldr x11, [sp, #488] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s5, [x8, x11, lsl #2] + ldr x11, [sp, #480] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.2s + ldr s2, [x8, x11, lsl #2] + ldr x11, [sp, #104] // 8-byte Folded Reload + add x8, x8, x11 + mov x11, x27 + fmla v0.2s, v2.2s, v1.s[3] + cmp x27, x21 + b.ge .LBB0_1 + .p2align 2 +.LBB0_118: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x14, x10 + add x13, x8, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + ldr s1, [x8, x10] + prfm pldl1keep, [x12] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x21 + b.lt .LBB0_118 + b .LBB0_1 +.LBB0_119: + ldr x0, [sp, #16] // 8-byte Folded Reload + bl free + add sp, sp, #512 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + 
ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_3d_nt_mlir, .Lfunc_end0-sbatch_matmul_3d_nt_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s new file mode 100644 index 00000000000000..96e02991c200d9 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s @@ -0,0 +1,4171 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_4d_nn_mlir // -- Begin function sbatch_matmul_4d_nn_mlir + .p2align 4 + .type sbatch_matmul_4d_nn_mlir,@function +sbatch_matmul_4d_nn_mlir: // @sbatch_matmul_4d_nn_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! 
// 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #1312 + .cfi_def_cfa_offset 1472 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x5, #0 + ldr x13, [sp, #1544] + ldr x29, [sp, #1656] + mov x20, x6 + cinv x8, x5, lt + ldr x23, [sp, #1568] + ldr x27, [sp, #1512] + mov x21, x1 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + ldr x24, [sp, #1504] + ldr x28, [sp, #1480] + str x7, [sp, #1056] // 8-byte Folded Spill + str x4, [sp, #520] // 8-byte Folded Spill + asr x9, x9, #1 + str x3, [sp, #40] // 8-byte Folded Spill + str x2, [sp, #960] // 8-byte Folded Spill + cinv x9, x9, lt + cmp x8, #0 + str x5, [sp, #1024] // 8-byte Folded Spill + str x13, [sp, #504] // 8-byte Folded Spill + csel x8, x10, x8, lt + str x9, [sp, #1280] // 8-byte Folded Spill + ldr x9, [sp, #1552] + cmp x5, #0 + ldr x10, [sp, #1600] + asr x8, x8, #2 + cinv x22, x8, lt + cmp x13, #0 + cinv x8, x13, lt + str x9, [sp, #1048] // 8-byte Folded Spill + ldr x9, [sp, #1648] + str x10, [sp, #944] // 8-byte Folded Spill + ldr x10, [sp, #1592] + add x11, x8, #7 + add x12, x8, #3 + str x9, [sp, #1016] // 8-byte Folded Spill + ldr x9, [sp, #1640] + str x10, [sp, #936] // 
8-byte Folded Spill + add x10, x8, #15 + str x9, [sp, #1008] // 8-byte Folded Spill + add x9, x8, x8, lsr #63 + asr x9, x9, #1 + cinv x14, x9, lt + ldr x9, [sp, #1560] + cmp x8, #0 + str x14, [sp, #1272] // 8-byte Folded Spill + str x9, [sp, #1040] // 8-byte Folded Spill + csel x9, x10, x8, lt + csel x10, x11, x8, lt + csel x8, x12, x8, lt + cmp x13, #0 + asr x9, x9, #4 + asr x8, x8, #2 + asr x10, x10, #3 + cinv x19, x9, lt + cinv x9, x8, lt + cinv x25, x10, lt + lsl x8, x19, #4 + str x9, [sp, #1264] // 8-byte Folded Spill + lsl x26, x25, #3 + str x8, [sp, #1104] // 8-byte Folded Spill + ldr x8, [sp, #1472] + str x8, [sp, #1032] // 8-byte Folded Spill + lsl x8, x9, #2 + str x8, [sp, #592] // 8-byte Folded Spill + lsl x8, x14, #1 + str x8, [sp, #584] // 8-byte Folded Spill + lsl x8, x6, #6 + add x0, x8, #64 + str x8, [sp, #1288] // 8-byte Folded Spill + bl malloc + ldr x16, [sp, #1280] // 8-byte Folded Reload + lsl x8, x22, #2 + mov x30, x26 + str x27, [sp, #928] // 8-byte Folded Spill + str x8, [sp, #1296] // 8-byte Folded Spill + add x8, x0, #63 + lsl x12, x27, #2 + lsl x27, x28, #2 + and x26, x8, #0xffffffffffffffc0 + lsl x6, x20, #2 + str x0, [sp, #16] // 8-byte Folded Spill + mov w14, #20 // =0x14 + madd x14, x23, x14, x12 + mov w11, #12 // =0xc + str x6, [sp, #480] // 8-byte Folded Spill + mov w17, #28 // =0x1c + lsl x9, x16, #1 + mul x3, x16, x28 + madd x17, x23, x17, x12 + mov w15, #24 // =0x18 + str x9, [sp, #1128] // 8-byte Folded Spill + negs x9, x20 + madd x15, x23, x15, x12 + str x23, [sp, #1000] // 8-byte Folded Spill + and x8, x9, #0x3 + and x9, x20, #0x3 + str x24, [sp, #920] // 8-byte Folded Spill + lsl x25, x25, #5 + csneg x7, x9, x8, mi + ldr x8, [sp, #960] // 8-byte Folded Reload + add x9, x20, x27 + lsl x19, x19, #6 + lsl x18, x7, #2 + sub x4, x26, x7, lsl #6 + stp xzr, xzr, [sp, #264] // 16-byte Folded Spill + mov x13, xzr + str x21, [sp, #952] // 8-byte Folded Spill + stp x25, x19, [sp, #464] // 16-byte Folded Spill + lsl x10, x8, #2 + sub x8, 
x6, x18 + str x30, [sp, #1080] // 8-byte Folded Spill + str x8, [sp, #512] // 8-byte Folded Spill + sub x8, x9, x7 + mov w9, #1 // =0x1 + add x2, x10, x28, lsl #3 + bfi x9, x22, #2, #62 + str x8, [sp, #1304] // 8-byte Folded Spill + mul x8, x22, x28 + mul x1, x28, x9 + add x9, x20, x3, lsl #1 + sub x9, x9, x7 + str x9, [sp, #1280] // 8-byte Folded Spill + ldr x9, [sp, #1264] // 8-byte Folded Reload + add x22, x10, x8, lsl #4 + add x5, x22, x6 + lsl x0, x9, #4 + ldr x9, [sp, #1272] // 8-byte Folded Reload + str x0, [sp, #456] // 8-byte Folded Spill + lsl x16, x9, #3 + add x9, x21, x2 + add x2, x6, x10 + str x9, [sp, #1224] // 8-byte Folded Spill + sub x9, x2, x18 + add x2, x24, x14 + madd x14, x28, x11, x10 + madd x11, x23, x11, x12 + str x9, [sp, #1272] // 8-byte Folded Spill + ldr x9, [sp, #1288] // 8-byte Folded Reload + add x9, x4, x9 + str x9, [sp, #848] // 8-byte Folded Spill + add x9, x10, x1, lsl #2 + add x1, x10, x3, lsl #3 + add x3, x24, x15 + stp x2, x3, [sp, #232] // 16-byte Folded Spill + add x4, x9, x6 + add x6, x1, x6 + str x9, [sp, #1256] // 8-byte Folded Spill + sub x9, x20, x7 + sub x8, x4, x18 + add x4, x24, x17 + str x8, [sp, #1264] // 8-byte Folded Spill + sub x8, x5, x18 + str x8, [sp, #1248] // 8-byte Folded Spill + sub x8, x6, x18 + add x6, x12, x23, lsl #5 + str x8, [sp, #1216] // 8-byte Folded Spill + add x8, x21, x14 + lsl x14, x23, #4 + str x14, [sp, #1240] // 8-byte Folded Spill + add x14, x14, x12 + str x8, [sp, #1288] // 8-byte Folded Spill + ldr x8, [sp, #512] // 8-byte Folded Reload + add x6, x24, x6 + add x18, x24, x14 + lsl x14, x23, #2 + stp x4, x6, [sp, #248] // 16-byte Folded Spill + str x14, [sp, #992] // 8-byte Folded Spill + add x14, x14, x12 + add x17, x24, x14 + add x14, x12, x23, lsl #3 + add x8, x8, #4 + stp x17, x18, [sp, #216] // 16-byte Folded Spill + add x15, x24, x14 + add x14, x24, x11 + mul x11, x23, x9 + str x8, [sp, #448] // 8-byte Folded Spill + stp x14, x15, [sp, #200] // 16-byte Folded Spill + add x11, x12, 
x11, lsl #2 + madd x12, x23, x8, x12 + ldr x8, [sp, #1272] // 8-byte Folded Reload + add x23, x24, x11 + add x5, x24, x12 + add x12, x8, x21 + ldr x8, [sp, #1304] // 8-byte Folded Reload + add x24, x21, x10 + str x9, [sp, #1304] // 8-byte Folded Spill + add x12, x12, #4 + str x5, [sp, #192] // 8-byte Folded Spill + add x5, x5, x16 + stp x23, x12, [sp, #176] // 16-byte Folded Spill + add x11, x10, x8, lsl #2 + ldr x8, [sp, #1280] // 8-byte Folded Reload + add x12, x10, x8, lsl #2 + add x10, x10, x21 + lsl x8, x28, #4 + add x10, x8, x10 + str x8, [sp, #1232] // 8-byte Folded Spill + add x8, x10, #32 + add x10, x11, x21 + ldr x11, [sp, #848] // 8-byte Folded Reload + str x8, [sp, #168] // 8-byte Folded Spill + add x8, x10, #4 + ldr x10, [sp, #1264] // 8-byte Folded Reload + str x8, [sp, #160] // 8-byte Folded Spill + ldr x8, [sp, #1256] // 8-byte Folded Reload + add x10, x21, x10 + str x10, [sp, #1168] // 8-byte Folded Spill + ldr x10, [sp, #1248] // 8-byte Folded Reload + add x8, x21, x8 + str x8, [sp, #1184] // 8-byte Folded Spill + add x8, x21, x22 + sub x22, x9, #4 + str x8, [sp, #1176] // 8-byte Folded Spill + add x10, x21, x10 + str x10, [sp, #1160] // 8-byte Folded Spill + add x10, x1, x21 + lsl x1, x20, #3 + add x10, x10, #32 + str x1, [sp, #424] // 8-byte Folded Spill + sub x1, x1, x7, lsl #3 + str x10, [sp, #152] // 8-byte Folded Spill + add x10, x12, x21 + lsl x12, x20, #4 + add x10, x10, #4 + str x10, [sp, #144] // 8-byte Folded Spill + ldr x10, [sp, #1216] // 8-byte Folded Reload + str x24, [sp, #1216] // 8-byte Folded Spill + add x10, x21, x10 + str x10, [sp, #136] // 8-byte Folded Spill + lsl x10, x20, #5 + stp x12, x10, [sp, #432] // 16-byte Folded Spill + sub x10, x10, x7, lsl #5 + sub x12, x12, x7, lsl #4 + add x7, x6, x16 + stp x7, x1, [sp, #400] // 16-byte Folded Spill + add x7, x4, x16 + str x7, [sp, #392] // 8-byte Folded Spill + add x7, x3, x16 + str x10, [sp, #416] // 8-byte Folded Spill + str x12, [sp, #280] // 8-byte Folded Spill + str x7, 
[sp, #384] // 8-byte Folded Spill + add x7, x2, x16 + str x7, [sp, #840] // 8-byte Folded Spill + add x7, x18, x16 + str x7, [sp, #832] // 8-byte Folded Spill + add x7, x17, x16 + str x7, [sp, #824] // 8-byte Folded Spill + add x7, x15, x16 + str x7, [sp, #816] // 8-byte Folded Spill + add x7, x14, x16 + add x16, x23, x16 + stp x16, x5, [sp, #120] // 16-byte Folded Spill + sub x16, x9, #3 + str x7, [sp, #808] // 8-byte Folded Spill + str x16, [sp, #984] // 8-byte Folded Spill + sub x16, x9, #2 + sub x9, x9, #1 + str x16, [sp, #976] // 8-byte Folded Spill + add x16, x26, #128 + str x9, [sp, #968] // 8-byte Folded Spill + ldr x9, [sp, #1048] // 8-byte Folded Reload + str x16, [sp, #912] // 8-byte Folded Spill + add x16, x26, #256 + str x16, [sp, #1200] // 8-byte Folded Spill + add x16, x11, #64 + add x11, x10, #32 + add x10, x6, x25 + stp x10, x11, [sp, #328] // 16-byte Folded Spill + add x10, x4, x25 + add x11, x12, #16 + str x16, [sp, #1192] // 8-byte Folded Spill + add x16, x6, x19 + ldr x12, [sp, #1288] // 8-byte Folded Reload + str x10, [sp, #320] // 8-byte Folded Spill + add x10, x3, x25 + lsl x9, x9, #2 + str x16, [sp, #376] // 8-byte Folded Spill + add x16, x4, x19 + str x10, [sp, #776] // 8-byte Folded Spill + add x10, x2, x25 + str x9, [sp, #32] // 8-byte Folded Spill + ldr x9, [sp, #1040] // 8-byte Folded Reload + str x16, [sp, #368] // 8-byte Folded Spill + add x16, x3, x19 + str x10, [sp, #768] // 8-byte Folded Spill + add x10, x18, x25 + str x16, [sp, #360] // 8-byte Folded Spill + add x16, x2, x19 + str x10, [sp, #760] // 8-byte Folded Spill + add x10, x17, x25 + str x16, [sp, #352] // 8-byte Folded Spill + add x16, x18, x19 + stp x24, x12, [sp, #104] // 16-byte Folded Spill + str x10, [sp, #752] // 8-byte Folded Spill + add x10, x15, x25 + str x16, [sp, #344] // 8-byte Folded Spill + add x16, x17, x19 + str x10, [sp, #744] // 8-byte Folded Spill + add x10, x14, x25 + lsl x9, x9, #2 + str x16, [sp, #800] // 8-byte Folded Spill + add x16, x15, x19 + str 
x10, [sp, #736] // 8-byte Folded Spill + add x10, x6, x0 + str x9, [sp, #496] // 8-byte Folded Spill + ldr x9, [sp, #1056] // 8-byte Folded Reload + str x16, [sp, #792] // 8-byte Folded Spill + add x16, x14, x19 + stp x10, x11, [sp, #304] // 16-byte Folded Spill + add x10, x4, x0 + ldr x11, [sp, #1224] // 8-byte Folded Reload + str x16, [sp, #784] // 8-byte Folded Spill + str x10, [sp, #296] // 8-byte Folded Spill + add x10, x3, x0 + str x10, [sp, #728] // 8-byte Folded Spill + add x10, x2, x0 + str x10, [sp, #720] // 8-byte Folded Spill + add x10, x18, x0 + lsl x9, x9, #2 + str x10, [sp, #712] // 8-byte Folded Spill + add x10, x17, x0 + str x9, [sp, #24] // 8-byte Folded Spill + ldr x9, [sp, #1032] // 8-byte Folded Reload + str x10, [sp, #704] // 8-byte Folded Spill + add x10, x15, x0 + str x10, [sp, #696] // 8-byte Folded Spill + add x10, x14, x0 + str x10, [sp, #688] // 8-byte Folded Spill + add x10, x1, #8 + str x10, [sp, #288] // 8-byte Folded Spill + mov x10, x8 + ldr x8, [sp, #1184] // 8-byte Folded Reload + lsl x9, x9, #2 + str x9, [sp, #488] // 8-byte Folded Spill + add x9, x24, x27 + str x9, [sp, #1208] // 8-byte Folded Spill + str x9, [sp, #96] // 8-byte Folded Spill + mov x9, x8 + ldr x8, [sp, #1160] // 8-byte Folded Reload + stp x8, x11, [sp, #80] // 16-byte Folded Spill + ldr x8, [sp, #1168] // 8-byte Folded Reload + str x8, [sp, #72] // 8-byte Folded Spill + b .LBB0_2 + .p2align 2 +.LBB0_1: // %.loopexit68 + // in Loop: Header=BB0_2 Depth=1 + ldr x8, [sp, #264] // 8-byte Folded Reload + ldp x10, x9, [sp, #24] // 16-byte Folded Reload + add x8, x8, x10 + ldr x13, [sp, #48] // 8-byte Folded Reload + str x8, [sp, #264] // 8-byte Folded Spill + ldr x8, [sp, #256] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #256] // 8-byte Folded Spill + ldr x8, [sp, #248] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #248] // 8-byte Folded Spill + ldr x8, [sp, #240] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #240] // 8-byte Folded Spill 
+ ldr x8, [sp, #232] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #232] // 8-byte Folded Spill + ldr x8, [sp, #224] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #224] // 8-byte Folded Spill + ldr x8, [sp, #216] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #216] // 8-byte Folded Spill + ldr x8, [sp, #208] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #208] // 8-byte Folded Spill + ldr x8, [sp, #200] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #200] // 8-byte Folded Spill + ldr x8, [sp, #192] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #192] // 8-byte Folded Spill + ldr x8, [sp, #184] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #184] // 8-byte Folded Spill + ldr x8, [sp, #176] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #176] // 8-byte Folded Spill + ldr x8, [sp, #168] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #168] // 8-byte Folded Spill + ldr x8, [sp, #160] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #160] // 8-byte Folded Spill + ldr x8, [sp, #152] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #152] // 8-byte Folded Spill + ldr x8, [sp, #144] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #144] // 8-byte Folded Spill + ldr x8, [sp, #136] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #136] // 8-byte Folded Spill + ldr x8, [sp, #272] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #272] // 8-byte Folded Spill + ldr x8, [sp, #72] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #72] // 8-byte Folded Spill + ldr x8, [sp, #80] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #80] // 8-byte Folded Spill + ldr x8, [sp, #88] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #88] // 8-byte Folded Spill + ldr x8, [sp, #96] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #96] // 8-byte Folded Spill + ldr x8, [sp, #104] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #104] // 
8-byte Folded Spill + ldr x8, [sp, #112] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #112] // 8-byte Folded Spill + ldr x8, [sp, #128] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #128] // 8-byte Folded Spill + ldr x8, [sp, #120] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #120] // 8-byte Folded Spill + ldp x9, x8, [sp, #56] // 16-byte Folded Reload + add x9, x9, x10 + add x10, x8, x10 +.LBB0_2: // =>This Loop Header: Depth=1 + // Child Loop BB0_7 Depth 2 + // Child Loop BB0_11 Depth 3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_21 Depth 5 + // Child Loop BB0_23 Depth 5 + // Child Loop BB0_26 Depth 4 + // Child Loop BB0_28 Depth 4 + // Child Loop BB0_32 Depth 4 + // Child Loop BB0_34 Depth 4 + // Child Loop BB0_40 Depth 3 + // Child Loop BB0_43 Depth 3 + // Child Loop BB0_46 Depth 3 + // Child Loop BB0_48 Depth 4 + // Child Loop BB0_50 Depth 4 + // Child Loop BB0_53 Depth 3 + // Child Loop BB0_55 Depth 3 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_61 Depth 3 + // Child Loop BB0_65 Depth 3 + // Child Loop BB0_68 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_73 Depth 4 + // Child Loop BB0_75 Depth 4 + // Child Loop BB0_78 Depth 3 + // Child Loop BB0_80 Depth 3 + // Child Loop BB0_84 Depth 3 + // Child Loop BB0_86 Depth 3 + // Child Loop BB0_90 Depth 3 + // Child Loop BB0_93 Depth 3 + // Child Loop BB0_96 Depth 3 + // Child Loop BB0_98 Depth 4 + // Child Loop BB0_100 Depth 4 + // Child Loop BB0_103 Depth 3 + // Child Loop BB0_105 Depth 3 + // Child Loop BB0_109 Depth 3 + // Child Loop BB0_111 Depth 3 + // Child Loop BB0_115 Depth 3 + // Child Loop BB0_118 Depth 3 + // Child Loop BB0_121 Depth 3 + // Child Loop BB0_123 Depth 4 + // Child Loop BB0_125 Depth 4 + // Child Loop BB0_128 Depth 3 + // Child Loop BB0_130 Depth 3 + // Child Loop BB0_134 Depth 3 + // Child Loop BB0_136 Depth 3 + ldr x8, [sp, #40] // 8-byte Folded Reload + cmp x13, x8 + 
b.ge .LBB0_137 +// %bb.3: // in Loop: Header=BB0_2 Depth=1 + add x8, x13, #1 + str x10, [sp, #64] // 8-byte Folded Spill + ldp x15, x16, [sp, #216] // 16-byte Folded Reload + stp x8, x9, [sp, #48] // 16-byte Folded Spill + mov x19, xzr + str x10, [sp, #616] // 8-byte Folded Spill + str x9, [sp, #608] // 8-byte Folded Spill + ldp x10, x8, [sp, #120] // 16-byte Folded Reload + str x13, [sp, #1064] // 8-byte Folded Spill + str x8, [sp, #600] // 8-byte Folded Spill + ldp x8, x9, [sp, #104] // 16-byte Folded Reload + ldp x13, x14, [sp, #200] // 16-byte Folded Reload + str x9, [sp, #888] // 8-byte Folded Spill + str x8, [sp, #872] // 8-byte Folded Spill + ldp x8, x9, [sp, #88] // 16-byte Folded Reload + str x9, [sp, #864] // 8-byte Folded Spill + str x8, [sp, #856] // 8-byte Folded Spill + ldp x8, x9, [sp, #72] // 16-byte Folded Reload + str x8, [sp, #680] // 8-byte Folded Spill + ldr x8, [sp, #272] // 8-byte Folded Reload + str x9, [sp, #648] // 8-byte Folded Spill + str x8, [sp, #656] // 8-byte Folded Spill + ldp x12, x8, [sp, #136] // 16-byte Folded Reload + str x8, [sp, #880] // 8-byte Folded Spill + ldp x9, x8, [sp, #152] // 16-byte Folded Reload + str x9, [sp, #904] // 8-byte Folded Spill + str x8, [sp, #1096] // 8-byte Folded Spill + ldp x9, x8, [sp, #168] // 16-byte Folded Reload + str x9, [sp, #1088] // 8-byte Folded Spill + str x8, [sp, #672] // 8-byte Folded Spill + ldp x9, x8, [sp, #184] // 16-byte Folded Reload + str x8, [sp, #664] // 8-byte Folded Spill + str x9, [sp, #1152] // 8-byte Folded Spill + ldp x17, x8, [sp, #232] // 16-byte Folded Reload + str x8, [sp, #640] // 8-byte Folded Spill + ldp x9, x8, [sp, #248] // 16-byte Folded Reload + str x8, [sp, #624] // 8-byte Folded Spill + ldr x8, [sp, #264] // 8-byte Folded Reload + str x9, [sp, #632] // 8-byte Folded Spill + str x8, [sp, #1120] // 8-byte Folded Spill + b .LBB0_7 + .p2align 2 +.LBB0_4: // in Loop: Header=BB0_7 Depth=2 + str s0, [x15, x9, lsl #2] +.LBB0_5: // in Loop: Header=BB0_7 Depth=2 + bl 
free + ldr x30, [sp, #1080] // 8-byte Folded Reload +.LBB0_6: // %.backedge69 + // in Loop: Header=BB0_7 Depth=2 + ldr x9, [sp, #488] // 8-byte Folded Reload + ldr x8, [sp, #1120] // 8-byte Folded Reload + add x8, x8, x9 + ldr x10, [sp, #624] // 8-byte Folded Reload + ldr x17, [sp, #536] // 8-byte Folded Reload + ldr x16, [sp, #544] // 8-byte Folded Reload + ldr x15, [sp, #552] // 8-byte Folded Reload + ldr x14, [sp, #560] // 8-byte Folded Reload + ldr x13, [sp, #568] // 8-byte Folded Reload + ldr x12, [sp, #896] // 8-byte Folded Reload + ldr x19, [sp, #528] // 8-byte Folded Reload + add x12, x12, x9 + str x8, [sp, #1120] // 8-byte Folded Spill + ldr x8, [sp, #496] // 8-byte Folded Reload + add x10, x10, x8 + add x17, x17, x8 + add x16, x16, x8 + add x15, x15, x8 + add x14, x14, x8 + add x13, x13, x8 + str x10, [sp, #624] // 8-byte Folded Spill + ldr x10, [sp, #632] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #632] // 8-byte Folded Spill + ldr x10, [sp, #640] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #640] // 8-byte Folded Spill + ldr x10, [sp, #664] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #664] // 8-byte Folded Spill + ldr x10, [sp, #1152] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #1152] // 8-byte Folded Spill + ldr x10, [sp, #672] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #672] // 8-byte Folded Spill + ldr x10, [sp, #1088] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #1088] // 8-byte Folded Spill + ldr x10, [sp, #1096] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #1096] // 8-byte Folded Spill + ldr x10, [sp, #904] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #904] // 8-byte Folded Spill + ldr x10, [sp, #880] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #880] // 8-byte Folded Spill + ldr x10, [sp, #656] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #656] // 8-byte Folded Spill + ldr x10, [sp, #680] // 8-byte 
Folded Reload + add x10, x10, x9 + str x10, [sp, #680] // 8-byte Folded Spill + ldr x10, [sp, #648] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #648] // 8-byte Folded Spill + ldr x10, [sp, #856] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #856] // 8-byte Folded Spill + ldr x10, [sp, #864] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #864] // 8-byte Folded Spill + ldr x10, [sp, #872] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #872] // 8-byte Folded Spill + ldr x10, [sp, #888] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #888] // 8-byte Folded Spill + ldr x10, [sp, #600] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #600] // 8-byte Folded Spill + ldr x10, [sp, #576] // 8-byte Folded Reload + add x10, x10, x8 + ldr x8, [sp, #608] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #608] // 8-byte Folded Spill + ldr x8, [sp, #616] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #616] // 8-byte Folded Spill +.LBB0_7: // Parent Loop BB0_2 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_11 Depth 3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_21 Depth 5 + // Child Loop BB0_23 Depth 5 + // Child Loop BB0_26 Depth 4 + // Child Loop BB0_28 Depth 4 + // Child Loop BB0_32 Depth 4 + // Child Loop BB0_34 Depth 4 + // Child Loop BB0_40 Depth 3 + // Child Loop BB0_43 Depth 3 + // Child Loop BB0_46 Depth 3 + // Child Loop BB0_48 Depth 4 + // Child Loop BB0_50 Depth 4 + // Child Loop BB0_53 Depth 3 + // Child Loop BB0_55 Depth 3 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_61 Depth 3 + // Child Loop BB0_65 Depth 3 + // Child Loop BB0_68 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_73 Depth 4 + // Child Loop BB0_75 Depth 4 + // Child Loop BB0_78 Depth 3 + // Child Loop BB0_80 Depth 3 + // Child Loop BB0_84 Depth 3 + // Child Loop BB0_86 Depth 3 + // Child Loop BB0_90 Depth 3 + // Child 
Loop BB0_93 Depth 3 + // Child Loop BB0_96 Depth 3 + // Child Loop BB0_98 Depth 4 + // Child Loop BB0_100 Depth 4 + // Child Loop BB0_103 Depth 3 + // Child Loop BB0_105 Depth 3 + // Child Loop BB0_109 Depth 3 + // Child Loop BB0_111 Depth 3 + // Child Loop BB0_115 Depth 3 + // Child Loop BB0_118 Depth 3 + // Child Loop BB0_121 Depth 3 + // Child Loop BB0_123 Depth 4 + // Child Loop BB0_125 Depth 4 + // Child Loop BB0_128 Depth 3 + // Child Loop BB0_130 Depth 3 + // Child Loop BB0_134 Depth 3 + // Child Loop BB0_136 Depth 3 + ldr x8, [sp, #520] // 8-byte Folded Reload + cmp x19, x8 + b.ge .LBB0_1 +// %bb.8: // in Loop: Header=BB0_7 Depth=2 + add x8, x19, #1 + str x15, [sp, #552] // 8-byte Folded Spill + mov x0, xzr + str x15, [sp, #1264] // 8-byte Folded Spill + ldr x15, [sp, #640] // 8-byte Folded Reload + str x8, [sp, #528] // 8-byte Folded Spill + ldr x8, [sp, #672] // 8-byte Folded Reload + str x16, [sp, #544] // 8-byte Folded Spill + str x16, [sp, #1256] // 8-byte Folded Spill + ldr x16, [sp, #632] // 8-byte Folded Reload + str x17, [sp, #536] // 8-byte Folded Spill + str x17, [sp, #1248] // 8-byte Folded Spill + ldr x17, [sp, #624] // 8-byte Folded Reload + str x12, [sp, #896] // 8-byte Folded Spill + str x10, [sp, #576] // 8-byte Folded Spill + str x13, [sp, #568] // 8-byte Folded Spill + str x13, [sp, #1280] // 8-byte Folded Spill + str x14, [sp, #560] // 8-byte Folded Spill + str x14, [sp, #1272] // 8-byte Folded Spill + str x19, [sp, #1072] // 8-byte Folded Spill + str x8, [sp, #1144] // 8-byte Folded Spill + ldr x8, [sp, #664] // 8-byte Folded Reload + str x8, [sp, #1136] // 8-byte Folded Spill + b .LBB0_11 + .p2align 2 +.LBB0_9: // in Loop: Header=BB0_11 Depth=3 + stp q3, q2, [x8] + stp q1, q0, [x8, #32] +.LBB0_10: // %.backedge + // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #1248] // 8-byte Folded Reload + ldr x0, [sp, #1112] // 8-byte Folded Reload + add x17, x17, #64 + add x16, x16, #64 + add x15, x15, #64 + add x8, x8, #64 + str x8, [sp, #1248] 
// 8-byte Folded Spill + ldr x8, [sp, #1256] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1256] // 8-byte Folded Spill + ldr x8, [sp, #1264] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1264] // 8-byte Folded Spill + ldr x8, [sp, #1272] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1272] // 8-byte Folded Spill + ldr x8, [sp, #1280] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1280] // 8-byte Folded Spill + ldr x8, [sp, #1136] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1136] // 8-byte Folded Spill + ldr x8, [sp, #1144] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1144] // 8-byte Folded Spill +.LBB0_11: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_21 Depth 5 + // Child Loop BB0_23 Depth 5 + // Child Loop BB0_26 Depth 4 + // Child Loop BB0_28 Depth 4 + // Child Loop BB0_32 Depth 4 + // Child Loop BB0_34 Depth 4 + ldr x8, [sp, #1104] // 8-byte Folded Reload + cmp x0, x8 + b.ge .LBB0_35 +// %bb.12: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #944] // 8-byte Folded Reload + ldr x9, [sp, #936] // 8-byte Folded Reload + mov x5, xzr + mov x7, xzr + ldr x10, [sp, #928] // 8-byte Folded Reload + ldr x11, [sp, #920] // 8-byte Folded Reload + add x1, x9, x8, lsl #2 + add x8, x0, #16 + add x14, x11, x10, lsl #2 + ldr x11, [sp, #1008] // 8-byte Folded Reload + ldr x12, [sp, #1064] // 8-byte Folded Reload + ldr x10, [sp, #1040] // 8-byte Folded Reload + lsl x9, x29, #1 + str x8, [sp, #1112] // 8-byte Folded Spill + ldr x8, [sp, #1016] // 8-byte Folded Reload + mul x10, x19, x10 + mul x8, x19, x8 + madd x2, x12, x11, x8 + ldr x11, [sp, #1048] // 8-byte Folded Reload + add x8, x9, x29 + madd x6, x12, x11, x10 + add x10, x2, x0 + add x11, x1, x10, lsl #2 + add x8, x10, x8 + add x9, x10, x9 + add x10, x10, x29 + add x8, x1, x8, lsl #2 + add 
x9, x1, x9, lsl #2 + add x10, x1, x10, lsl #2 + ldp q4, q3, [x11, #32] + ldp q1, q0, [x11] + add x11, x6, x0 + ldp q18, q16, [x8, #32] + ldp q23, q21, [x8] + ldp q19, q17, [x9, #32] + ldp q22, q20, [x9] + ldr x9, [sp, #1056] // 8-byte Folded Reload + ldp q5, q2, [x10, #32] + add x8, x14, x11, lsl #2 + ldp q7, q6, [x10] + ldr x10, [sp, #952] // 8-byte Folded Reload + ldp q29, q28, [x8, #32] + ldp q31, q30, [x8] + ldr x8, [sp, #1032] // 8-byte Folded Reload + mul x8, x19, x8 + ldr x19, [sp, #1120] // 8-byte Folded Reload + madd x4, x12, x9, x8 + ldr x9, [sp, #960] // 8-byte Folded Reload + add x3, x10, x9, lsl #2 + lsl x8, x4, #2 + ldr q26, [x3, x8] + add x8, x4, x28 + lsl x8, x8, #2 + ldr q25, [x3, x8] + add x8, x4, x28, lsl #1 + lsl x8, x8, #2 + ldr q24, [x3, x8] + ldr x8, [sp, #912] // 8-byte Folded Reload + .p2align 2 +.LBB0_13: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr x9, [sp, #1288] // 8-byte Folded Reload + fmla v1.4s, v31.4s, v26.s[0] + fmla v0.4s, v30.4s, v26.s[0] + cmp x7, x22 + add x9, x9, x19 + prfm pldl1keep, [x9, #16] + ldr q27, [x9] + b.ge .LBB0_15 +// %bb.14: // in Loop: Header=BB0_13 Depth=4 + ldr x11, [sp, #1216] // 8-byte Folded Reload + ldr x12, [sp, #1264] // 8-byte Folded Reload + fmla v4.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + ldr x9, [sp, #1224] // 8-byte Folded Reload + ldr x10, [sp, #1208] // 8-byte Folded Reload + fmla v7.4s, v31.4s, v25.s[0] + fmla v6.4s, v30.4s, v25.s[0] + fmla v5.4s, v29.4s, v25.s[0] + fmla v2.4s, v28.4s, v25.s[0] + stp q31, q30, [x8, #-128] + fmla v17.4s, v28.4s, v24.s[0] + fmla v19.4s, v29.4s, v24.s[0] + stp q29, q28, [x8, #-96] + add x21, x11, x19 + ldr x11, [sp, #1248] // 8-byte Folded Reload + add x12, x12, x5 + fmla v22.4s, v31.4s, v24.s[0] + fmla v20.4s, v30.4s, v24.s[0] + fmla v16.4s, v28.4s, v27.s[0] + fmla v18.4s, v29.4s, v27.s[0] + add x30, x16, x5 + fmla v21.4s, v30.4s, v27.s[0] + fmla v23.4s, 
v31.4s, v27.s[0] + add x9, x9, x19 + add x10, x10, x19 + add x23, x9, #32 + add x24, x10, #32 + add x25, x21, #32 + add x7, x7, #4 + add x11, x11, x5 + add x19, x19, #16 + prfm pldl1keep, [x11] + ldr x13, [sp, #1272] // 8-byte Folded Reload + ldp q28, q29, [x12, #32] + ldp q30, q31, [x12] + add x12, x15, x5 + add x11, x17, x5 + add x18, x13, x5 + fmla v3.4s, v29.4s, v26.s[1] + fmla v0.4s, v31.4s, v26.s[1] + fmla v2.4s, v29.4s, v25.s[1] + fmla v6.4s, v31.4s, v25.s[1] + fmla v20.4s, v31.4s, v24.s[1] + fmla v17.4s, v29.4s, v24.s[1] + fmla v21.4s, v31.4s, v27.s[1] + fmla v16.4s, v29.4s, v27.s[1] + fmla v4.4s, v28.4s, v26.s[1] + fmla v1.4s, v30.4s, v26.s[1] + fmla v5.4s, v28.4s, v25.s[1] + fmla v7.4s, v30.4s, v25.s[1] + fmla v22.4s, v30.4s, v24.s[1] + stp q28, q29, [x8, #-32] + fmla v19.4s, v28.4s, v24.s[1] + fmla v23.4s, v30.4s, v27.s[1] + fmla v18.4s, v28.4s, v27.s[1] + stp q30, q31, [x8, #-64] + prfm pldl1keep, [x12] + ldp q29, q28, [x18, #32] + ldp q31, q30, [x18] + ldr x12, [sp, #1280] // 8-byte Folded Reload + add x12, x12, x5 + fmla v0.4s, v30.4s, v26.s[2] + fmla v3.4s, v28.4s, v26.s[2] + fmla v2.4s, v28.4s, v25.s[2] + fmla v17.4s, v28.4s, v24.s[2] + fmla v6.4s, v30.4s, v25.s[2] + fmla v20.4s, v30.4s, v24.s[2] + stp q31, q30, [x8] + stp q29, q28, [x8, #32] + prfm pldl1keep, [x30] + ldp q8, q9, [x12] + fmla v1.4s, v31.4s, v26.s[2] + ldp q10, q11, [x12, #32] + ldr x12, [sp, #1256] // 8-byte Folded Reload + fmla v4.4s, v29.4s, v26.s[2] + fmla v5.4s, v29.4s, v25.s[2] + fmla v19.4s, v29.4s, v24.s[2] + fmla v7.4s, v31.4s, v25.s[2] + fmla v22.4s, v31.4s, v24.s[2] + add x12, x12, x5 + fmla v16.4s, v28.4s, v27.s[2] + fmla v18.4s, v29.4s, v27.s[2] + fmla v21.4s, v30.4s, v27.s[2] + fmla v23.4s, v31.4s, v27.s[2] + fmla v3.4s, v11.4s, v26.s[3] + fmla v4.4s, v10.4s, v26.s[3] + fmla v0.4s, v9.4s, v26.s[3] + stp q8, q9, [x8, #64] + stp q10, q11, [x8, #96] + prfm pldl1keep, [x11] + fmla v1.4s, v8.4s, v26.s[3] + fmla v2.4s, v11.4s, v25.s[3] + fmla v5.4s, v10.4s, v25.s[3] + fmla 
v6.4s, v9.4s, v25.s[3] + fmla v7.4s, v8.4s, v25.s[3] + ldp q29, q28, [x12, #32] + fmla v20.4s, v9.4s, v24.s[3] + fmla v22.4s, v8.4s, v24.s[3] + fmla v19.4s, v10.4s, v24.s[3] + fmla v17.4s, v11.4s, v24.s[3] + ldp q31, q30, [x12] + prfm pldl1keep, [x25] + ldr q26, [x21, #16] + prfm pldl1keep, [x24] + ldr q25, [x10, #16] + prfm pldl1keep, [x23] + ldr q24, [x9, #16] + ldr x9, [sp, #1240] // 8-byte Folded Reload + fmla v23.4s, v8.4s, v27.s[3] + fmla v21.4s, v9.4s, v27.s[3] + fmla v18.4s, v10.4s, v27.s[3] + fmla v16.4s, v11.4s, v27.s[3] + add x5, x5, x9 + add x8, x8, #256 + b .LBB0_13 + .p2align 2 +.LBB0_15: // in Loop: Header=BB0_11 Depth=3 + ldr x11, [sp, #1000] // 8-byte Folded Reload + ldr x12, [sp, #984] // 8-byte Folded Reload + add x9, x26, x22, lsl #6 + fmla v4.4s, v29.4s, v26.s[0] + ldr x13, [sp, #976] // 8-byte Folded Reload + fmla v3.4s, v28.4s, v26.s[0] + fmla v2.4s, v28.4s, v25.s[0] + fmla v7.4s, v31.4s, v25.s[0] + stp q31, q30, [x9] + stp q29, q28, [x9, #32] + fmla v6.4s, v30.4s, v25.s[0] + fmla v5.4s, v29.4s, v25.s[0] + fmla v17.4s, v28.4s, v24.s[0] + fmla v19.4s, v29.4s, v24.s[0] + madd x8, x12, x11, x6 + madd x10, x13, x11, x6 + fmla v20.4s, v30.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + fmla v16.4s, v28.4s, v27.s[0] + fmla v18.4s, v29.4s, v27.s[0] + add x5, x26, x12, lsl #6 + ldr x12, [sp, #968] // 8-byte Folded Reload + fmla v21.4s, v30.4s, v27.s[0] + fmla v23.4s, v31.4s, v27.s[0] + ldr x18, [sp, #1136] // 8-byte Folded Reload + mov x19, xzr + add x8, x8, x0 + add x9, x10, x0 + add x7, x26, x12, lsl #6 + add x8, x14, x8, lsl #2 + add x9, x14, x9, lsl #2 + ldp q28, q29, [x8] + fmla v0.4s, v29.4s, v26.s[1] + ldp q30, q31, [x8, #32] + madd x8, x12, x11, x6 + fmla v3.4s, v31.4s, v26.s[1] + fmla v6.4s, v29.4s, v25.s[1] + fmla v2.4s, v31.4s, v25.s[1] + fmla v20.4s, v29.4s, v24.s[1] + fmla v17.4s, v31.4s, v24.s[1] + fmla v21.4s, v29.4s, v27.s[1] + fmla v16.4s, v31.4s, v27.s[1] + add x6, x26, x13, lsl #6 + add x8, x8, x0 + ldr x13, [sp, #992] // 8-byte 
Folded Reload + stp q28, q29, [x5] + fmla v4.4s, v30.4s, v26.s[1] + stp q30, q31, [x5, #32] + fmla v1.4s, v28.4s, v26.s[1] + fmla v5.4s, v30.4s, v25.s[1] + fmla v7.4s, v28.4s, v25.s[1] + fmla v22.4s, v28.4s, v24.s[1] + fmla v19.4s, v30.4s, v24.s[1] + fmla v23.4s, v28.4s, v27.s[1] + fmla v18.4s, v30.4s, v27.s[1] + add x8, x14, x8, lsl #2 + ldr x14, [sp, #1144] // 8-byte Folded Reload + ldp q29, q28, [x9, #32] + ldp q31, q30, [x9] + ldr x9, [sp, #1304] // 8-byte Folded Reload + fmla v0.4s, v30.4s, v26.s[2] + fmla v3.4s, v28.4s, v26.s[2] + fmla v2.4s, v28.4s, v25.s[2] + fmla v6.4s, v30.4s, v25.s[2] + fmla v17.4s, v28.4s, v24.s[2] + fmla v20.4s, v30.4s, v24.s[2] + fmla v16.4s, v28.4s, v27.s[2] + fmla v21.4s, v30.4s, v27.s[2] + stp q31, q30, [x6] + fmla v1.4s, v31.4s, v26.s[2] + stp q29, q28, [x6, #32] + fmla v4.4s, v29.4s, v26.s[2] + fmla v7.4s, v31.4s, v25.s[2] + fmla v5.4s, v29.4s, v25.s[2] + fmla v19.4s, v29.4s, v24.s[2] + fmla v22.4s, v31.4s, v24.s[2] + fmla v18.4s, v29.4s, v27.s[2] + fmla v23.4s, v31.4s, v27.s[2] + ldp q28, q29, [x8] + fmla v0.4s, v29.4s, v26.s[3] + ldp q30, q31, [x8, #32] + fmla v3.4s, v31.4s, v26.s[3] + fmla v6.4s, v29.4s, v25.s[3] + fmla v2.4s, v31.4s, v25.s[3] + fmla v20.4s, v29.4s, v24.s[3] + fmla v17.4s, v31.4s, v24.s[3] + fmla v21.4s, v29.4s, v27.s[3] + ldr x8, [sp, #1152] // 8-byte Folded Reload + fmla v16.4s, v31.4s, v27.s[3] + fmla v4.4s, v30.4s, v26.s[3] + fmla v1.4s, v28.4s, v26.s[3] + fmla v5.4s, v30.4s, v25.s[3] + fmla v7.4s, v28.4s, v25.s[3] + stp q28, q29, [x7] + stp q30, q31, [x7, #32] + fmla v22.4s, v28.4s, v24.s[3] + fmla v19.4s, v30.4s, v24.s[3] + fmla v23.4s, v28.4s, v27.s[3] + fmla v18.4s, v30.4s, v27.s[3] + cmp x9, x20 + b.ge .LBB0_17 + .p2align 2 +.LBB0_16: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x10, x8, x27 + add x11, x18, x19 + prfm pldl1keep, [x8] + ldur s24, [x8, #-4] + add x12, x10, x27 + prfm pldl1keep, [x10] + ldur s25, 
[x10, #-4] + add x10, x14, x19 + prfm pldl1keep, [x12] + ldur s26, [x12, #-4] + add x12, x12, x27 + add x19, x19, x13 + prfm pldl1keep, [x12] + ldur s27, [x12, #-4] + add x8, x8, #4 + prfm pldl1keep, [x11] + ldp q28, q29, [x10, #32] + fmla v3.4s, v29.4s, v24.s[0] + fmla v2.4s, v29.4s, v25.s[0] + ldp q30, q31, [x10] + add x10, x26, x9, lsl #6 + fmla v0.4s, v31.4s, v24.s[0] + fmla v6.4s, v31.4s, v25.s[0] + add x9, x9, #1 + fmla v20.4s, v31.4s, v26.s[0] + fmla v17.4s, v29.4s, v26.s[0] + fmla v4.4s, v28.4s, v24.s[0] + fmla v1.4s, v30.4s, v24.s[0] + fmla v5.4s, v28.4s, v25.s[0] + fmla v7.4s, v30.4s, v25.s[0] + fmla v22.4s, v30.4s, v26.s[0] + fmla v19.4s, v28.4s, v26.s[0] + fmla v23.4s, v30.4s, v27.s[0] + fmla v21.4s, v31.4s, v27.s[0] + fmla v18.4s, v28.4s, v27.s[0] + stp q30, q31, [x10] + fmla v16.4s, v29.4s, v27.s[0] + stp q28, q29, [x10, #32] + cmp x9, x20 + b.lt .LBB0_16 +.LBB0_17: // %.preheader + // in Loop: Header=BB0_11 Depth=3 + ldr x30, [sp, #1096] // 8-byte Folded Reload + ldr x9, [sp, #1088] // 8-byte Folded Reload + mov x8, xzr + mov w19, #1 // =0x1 + mov w25, #2 // =0x2 + mov w23, #3 // =0x3 + mov w24, #4 // =0x4 + b .LBB0_19 + .p2align 2 +.LBB0_18: // %.loopexit + // in Loop: Header=BB0_19 Depth=4 + ldr x8, [sp, #1232] // 8-byte Folded Reload + add x9, x9, x8 + add x30, x30, x8 + mov x8, x24 + mov x24, x21 +.LBB0_19: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Loop Header: Depth=4 + // Child Loop BB0_21 Depth 5 + // Child Loop BB0_23 Depth 5 + madd x8, x8, x29, x2 + add x8, x8, x0 + madd x10, x19, x29, x2 + madd x11, x25, x29, x2 + add x10, x10, x0 + add x11, x11, x0 + add x8, x1, x8, lsl #2 + stp q1, q0, [x8] + stp q4, q3, [x8, #32] + add x8, x1, x10, lsl #2 + add x10, x1, x11, lsl #2 + stp q7, q6, [x8] + stp q5, q2, [x8, #32] + madd x8, x23, x29, x2 + add x8, x8, x0 + stp q22, q20, [x10] + stp q19, q17, [x10, #32] + ldr x10, [sp, #1296] // 8-byte Folded Reload + cmp x24, x10 + add x8, x1, x8, 
lsl #2 + stp q23, q21, [x8] + stp q18, q16, [x8, #32] + b.ge .LBB0_24 +// %bb.20: // in Loop: Header=BB0_19 Depth=4 + madd x10, x24, x29, x2 + add x23, x24, #3 + add x19, x24, #1 + add x25, x24, #2 + madd x11, x19, x29, x2 + ldp q28, q29, [x26, #32] + mov x8, xzr + madd x12, x25, x29, x2 + ldp q30, q31, [x26] + add x21, x24, #4 + mov x18, x9 + add x10, x10, x0 + add x10, x1, x10, lsl #2 + add x11, x11, x0 + add x11, x1, x11, lsl #2 + ldp q4, q3, [x10, #32] + ldp q1, q0, [x10] + madd x10, x23, x29, x2 + add x10, x10, x0 + ldp q5, q2, [x11, #32] + ldp q7, q6, [x11] + add x11, x12, x0 + add x11, x1, x11, lsl #2 + ldp q19, q17, [x11, #32] + ldp q22, q20, [x11] + add x10, x1, x10, lsl #2 + ldp q18, q16, [x10, #32] + ldp q23, q21, [x10] + madd x10, x24, x28, x4 + lsl x10, x10, #2 + ldr q27, [x3, x10] + madd x10, x19, x28, x4 + lsl x10, x10, #2 + ldr q26, [x3, x10] + madd x10, x25, x28, x4 + lsl x10, x10, #2 + ldr q25, [x3, x10] + madd x10, x23, x28, x4 + lsl x10, x10, #2 + ldr q24, [x3, x10] + ldr x10, [sp, #1200] // 8-byte Folded Reload + fmla v3.4s, v29.4s, v27.s[0] + cmp xzr, x22 + b.ge .LBB0_22 + .p2align 2 +.LBB0_21: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // Parent Loop BB0_19 Depth=4 + // => This Inner Loop Header: Depth=5 + add x14, x10, #64 + fmla v4.4s, v28.4s, v27.s[0] + fmla v1.4s, v30.4s, v27.s[0] + add x13, x10, #128 + prfm pldl1keep, [x14] + ldp q9, q8, [x10, #-160] + fmla v0.4s, v31.4s, v27.s[0] + ldp q12, q15, [x10, #-192] + fmla v2.4s, v29.4s, v26.s[0] + fmla v5.4s, v28.4s, v26.s[0] + fmla v6.4s, v31.4s, v26.s[0] + fmla v7.4s, v30.4s, v26.s[0] + fmla v17.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x13] + fmla v19.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + ldp q11, q10, [x10, #-128] + fmla v22.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + ldp q13, q14, [x10, #-96] + fmla v18.4s, v28.4s, v24.s[0] + fmla v21.4s, v31.4s, v24.s[0] + add x12, x10, #192 + prfm pldl1keep, [x12] + fmla 
v23.4s, v30.4s, v24.s[0] + fmla v0.4s, v15.4s, v27.s[1] + add x11, x10, #256 + add x8, x8, #4 + fmla v1.4s, v12.4s, v27.s[1] + fmla v4.4s, v9.4s, v27.s[1] + fmla v3.4s, v8.4s, v27.s[1] + fmla v7.4s, v12.4s, v26.s[1] + fmla v6.4s, v15.4s, v26.s[1] + fmla v5.4s, v9.4s, v26.s[1] + fmla v2.4s, v8.4s, v26.s[1] + fmla v22.4s, v12.4s, v25.s[1] + fmla v20.4s, v15.4s, v25.s[1] + fmla v19.4s, v9.4s, v25.s[1] + fmla v17.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v21.4s, v15.4s, v24.s[1] + ldp q15, q12, [x10, #-64] + fmla v18.4s, v9.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q9, q8, [x10, #-32] + prfm pldl1keep, [x11] + ldp q28, q29, [x10, #32] + ldp q30, q31, [x10] + add x10, x18, x27 + prfm pldl1keep, [x18] + fmla v3.4s, v14.4s, v27.s[2] + fmla v4.4s, v13.4s, v27.s[2] + fmla v1.4s, v11.4s, v27.s[2] + fmla v0.4s, v10.4s, v27.s[2] + fmla v2.4s, v14.4s, v26.s[2] + fmla v5.4s, v13.4s, v26.s[2] + fmla v6.4s, v10.4s, v26.s[2] + fmla v7.4s, v11.4s, v26.s[2] + fmla v17.4s, v14.4s, v25.s[2] + fmla v19.4s, v13.4s, v25.s[2] + fmla v20.4s, v10.4s, v25.s[2] + fmla v22.4s, v11.4s, v25.s[2] + fmla v16.4s, v14.4s, v24.s[2] + fmla v18.4s, v13.4s, v24.s[2] + fmla v21.4s, v10.4s, v24.s[2] + fmla v23.4s, v11.4s, v24.s[2] + fmla v0.4s, v12.4s, v27.s[3] + fmla v1.4s, v15.4s, v27.s[3] + fmla v4.4s, v9.4s, v27.s[3] + fmla v3.4s, v8.4s, v27.s[3] + ldur q27, [x18, #-16] + prfm pldl1keep, [x10] + add x18, x18, #16 + fmla v7.4s, v15.4s, v26.s[3] + fmla v6.4s, v12.4s, v26.s[3] + fmla v5.4s, v9.4s, v26.s[3] + fmla v2.4s, v8.4s, v26.s[3] + ldur q26, [x10, #-16] + add x10, x10, x27 + add x12, x10, x27 + prfm pldl1keep, [x10] + fmla v22.4s, v15.4s, v25.s[3] + fmla v20.4s, v12.4s, v25.s[3] + fmla v19.4s, v9.4s, v25.s[3] + fmla v17.4s, v8.4s, v25.s[3] + ldur q25, [x10, #-16] + prfm pldl1keep, [x12] + mov x10, x11 + fmla v23.4s, v15.4s, v24.s[3] + fmla v21.4s, v12.4s, v24.s[3] + fmla v18.4s, v9.4s, v24.s[3] + fmla v16.4s, v8.4s, v24.s[3] + ldur q24, [x12, #-16] + fmla v3.4s, v29.4s, 
v27.s[0] + cmp x8, x22 + b.lt .LBB0_21 +.LBB0_22: // in Loop: Header=BB0_19 Depth=4 + ldp q10, q8, [x5, #32] + ldp q12, q11, [x5] + fmla v4.4s, v28.4s, v27.s[0] + fmla v0.4s, v31.4s, v27.s[0] + fmla v1.4s, v30.4s, v27.s[0] + fmla v2.4s, v29.4s, v26.s[0] + fmla v5.4s, v28.4s, v26.s[0] + fmla v6.4s, v31.4s, v26.s[0] + ldp q9, q13, [x6, #32] + fmla v7.4s, v30.4s, v26.s[0] + fmla v17.4s, v29.4s, v25.s[0] + ldr x10, [sp, #1192] // 8-byte Folded Reload + ldr x11, [sp, #1304] // 8-byte Folded Reload + fmla v19.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + mov x8, x30 + fmla v22.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + fmla v18.4s, v28.4s, v24.s[0] + fmla v21.4s, v31.4s, v24.s[0] + fmla v23.4s, v30.4s, v24.s[0] + ldp q29, q30, [x6] + ldp q31, q28, [x7, #32] + fmla v1.4s, v12.4s, v27.s[1] + fmla v0.4s, v11.4s, v27.s[1] + fmla v4.4s, v10.4s, v27.s[1] + fmla v3.4s, v8.4s, v27.s[1] + fmla v7.4s, v12.4s, v26.s[1] + fmla v6.4s, v11.4s, v26.s[1] + fmla v5.4s, v10.4s, v26.s[1] + fmla v2.4s, v8.4s, v26.s[1] + fmla v22.4s, v12.4s, v25.s[1] + fmla v20.4s, v11.4s, v25.s[1] + fmla v19.4s, v10.4s, v25.s[1] + fmla v17.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v21.4s, v11.4s, v24.s[1] + fmla v18.4s, v10.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q10, q8, [x7] + fmla v3.4s, v13.4s, v27.s[2] + fmla v4.4s, v9.4s, v27.s[2] + fmla v0.4s, v30.4s, v27.s[2] + fmla v1.4s, v29.4s, v27.s[2] + fmla v2.4s, v13.4s, v26.s[2] + fmla v5.4s, v9.4s, v26.s[2] + fmla v6.4s, v30.4s, v26.s[2] + fmla v7.4s, v29.4s, v26.s[2] + fmla v17.4s, v13.4s, v25.s[2] + fmla v19.4s, v9.4s, v25.s[2] + fmla v20.4s, v30.4s, v25.s[2] + fmla v22.4s, v29.4s, v25.s[2] + fmla v16.4s, v13.4s, v24.s[2] + fmla v18.4s, v9.4s, v24.s[2] + fmla v21.4s, v30.4s, v24.s[2] + fmla v23.4s, v29.4s, v24.s[2] + fmla v1.4s, v10.4s, v27.s[3] + fmla v0.4s, v8.4s, v27.s[3] + fmla v4.4s, v31.4s, v27.s[3] + fmla v3.4s, v28.4s, v27.s[3] + fmla v7.4s, v10.4s, v26.s[3] + fmla v6.4s, v8.4s, v26.s[3] + fmla 
v5.4s, v31.4s, v26.s[3] + fmla v2.4s, v28.4s, v26.s[3] + fmla v22.4s, v10.4s, v25.s[3] + fmla v20.4s, v8.4s, v25.s[3] + fmla v19.4s, v31.4s, v25.s[3] + fmla v17.4s, v28.4s, v25.s[3] + fmla v23.4s, v10.4s, v24.s[3] + fmla v21.4s, v8.4s, v24.s[3] + fmla v18.4s, v31.4s, v24.s[3] + fmla v16.4s, v28.4s, v24.s[3] + cmp x11, x20 + b.ge .LBB0_18 + .p2align 2 +.LBB0_23: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // Parent Loop BB0_19 Depth=4 + // => This Inner Loop Header: Depth=5 + add x12, x8, x27 + prfm pldl1keep, [x8] + ldur s24, [x8, #-4] + add x11, x11, #1 + prfm pldl1keep, [x12] + ldur s25, [x12, #-4] + add x12, x12, x27 + add x8, x8, #4 + prfm pldl1keep, [x12] + ldur s26, [x12, #-4] + add x12, x12, x27 + prfm pldl1keep, [x12] + ldur s27, [x12, #-4] + prfm pldl1keep, [x10] + ldp q28, q29, [x10, #-32] + fmla v3.4s, v29.4s, v24.s[0] + ldp q30, q31, [x10, #-64] + fmla v0.4s, v31.4s, v24.s[0] + fmla v6.4s, v31.4s, v25.s[0] + fmla v2.4s, v29.4s, v25.s[0] + fmla v20.4s, v31.4s, v26.s[0] + fmla v17.4s, v29.4s, v26.s[0] + add x10, x10, #64 + fmla v4.4s, v28.4s, v24.s[0] + fmla v1.4s, v30.4s, v24.s[0] + fmla v5.4s, v28.4s, v25.s[0] + fmla v7.4s, v30.4s, v25.s[0] + fmla v22.4s, v30.4s, v26.s[0] + fmla v19.4s, v28.4s, v26.s[0] + fmla v23.4s, v30.4s, v27.s[0] + fmla v21.4s, v31.4s, v27.s[0] + fmla v18.4s, v28.4s, v27.s[0] + fmla v16.4s, v29.4s, v27.s[0] + cmp x11, x20 + b.lt .LBB0_23 + b .LBB0_18 + .p2align 2 +.LBB0_24: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #1296] // 8-byte Folded Reload + ldr x9, [sp, #1128] // 8-byte Folded Reload + cmp x8, x9 + ldr x30, [sp, #1080] // 8-byte Folded Reload + b.ge .LBB0_30 +// %bb.25: // in Loop: Header=BB0_11 Depth=3 + ldr x12, [sp, #1296] // 8-byte Folded Reload + ldp q20, q21, [x26, #32] + mov x9, xzr + ldp q18, q19, [x26] + add x10, x12, #1 + madd x8, x12, x29, x2 + madd x11, x10, x29, x2 + madd x10, x10, x28, x4 + add x8, x8, x0 + add x11, x11, x0 + add x8, x1, x8, lsl #2 + lsl 
x10, x10, #2 + add x18, x1, x11, lsl #2 + madd x11, x12, x28, x4 + ldr q16, [x3, x10] + ldr x10, [sp, #1120] // 8-byte Folded Reload + lsl x11, x11, #2 + ldp q1, q0, [x8, #32] + ldp q4, q2, [x8] + ldp q5, q3, [x18, #32] + ldp q7, q6, [x18] + ldr q17, [x3, x11] + ldr x11, [sp, #1200] // 8-byte Folded Reload + cmp xzr, x22 + b.ge .LBB0_27 + .p2align 2 +.LBB0_26: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr x12, [sp, #1184] // 8-byte Folded Reload + add x25, x11, #64 + fmla v0.4s, v21.4s, v17.s[0] + fmla v1.4s, v20.4s, v17.s[0] + fmla v4.4s, v18.4s, v17.s[0] + fmla v2.4s, v19.4s, v17.s[0] + add x14, x11, #128 + add x13, x11, #192 + fmla v3.4s, v21.4s, v16.s[0] + fmla v5.4s, v20.4s, v16.s[0] + add x19, x11, #256 + add x9, x9, #4 + fmla v6.4s, v19.4s, v16.s[0] + fmla v7.4s, v18.4s, v16.s[0] + add x21, x12, x10 + ldr x12, [sp, #1176] // 8-byte Folded Reload + prfm pldl1keep, [x25] + ldp q23, q22, [x11, #-160] + add x23, x21, #32 + ldp q24, q25, [x11, #-192] + prfm pldl1keep, [x14] + fmla v2.4s, v25.4s, v17.s[1] + ldp q19, q18, [x11, #-128] + ldp q20, q21, [x11, #-96] + fmla v0.4s, v22.4s, v17.s[1] + fmla v6.4s, v25.4s, v16.s[1] + fmla v3.4s, v22.4s, v16.s[1] + prfm pldl1keep, [x13] + add x24, x12, x10 + add x10, x10, #16 + fmla v4.4s, v24.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v5.4s, v23.4s, v16.s[1] + ldp q23, q22, [x11, #-32] + ldp q24, q25, [x11, #-64] + add x12, x24, #32 + prfm pldl1keep, [x12] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v18.4s, v17.s[2] + fmla v3.4s, v21.4s, v16.s[2] + fmla v6.4s, v18.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v4.4s, v19.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v2.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v25.4s, v16.s[3] + fmla v3.4s, v22.4s, v16.s[3] + fmla v4.4s, v24.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + ldr q17, 
[x24, #16] + prfm pldl1keep, [x23] + fmla v7.4s, v24.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + ldr q16, [x21, #16] + prfm pldl1keep, [x19] + ldp q20, q21, [x11, #32] + ldp q18, q19, [x11] + mov x11, x19 + cmp x9, x22 + b.lt .LBB0_26 +.LBB0_27: // in Loop: Header=BB0_11 Depth=3 + ldp q23, q22, [x5, #32] + ldp q25, q24, [x5] + fmla v0.4s, v21.4s, v17.s[0] + fmla v1.4s, v20.4s, v17.s[0] + fmla v2.4s, v19.4s, v17.s[0] + fmla v4.4s, v18.4s, v17.s[0] + fmla v3.4s, v21.4s, v16.s[0] + fmla v5.4s, v20.4s, v16.s[0] + ldp q20, q21, [x6, #32] + fmla v6.4s, v19.4s, v16.s[0] + fmla v7.4s, v18.4s, v16.s[0] + ldp q18, q19, [x6] + fmla v2.4s, v24.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + ldr x9, [sp, #1120] // 8-byte Folded Reload + ldr x10, [sp, #1192] // 8-byte Folded Reload + fmla v4.4s, v25.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + ldr x11, [sp, #1304] // 8-byte Folded Reload + fmla v7.4s, v25.4s, v16.s[1] + fmla v6.4s, v24.4s, v16.s[1] + ldp q25, q24, [x7] + fmla v5.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v16.s[1] + ldp q23, q22, [x7, #32] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v19.4s, v17.s[2] + fmla v3.4s, v21.4s, v16.s[2] + fmla v6.4s, v19.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v4.4s, v18.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v7.4s, v18.4s, v16.s[2] + fmla v2.4s, v24.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v24.4s, v16.s[3] + fmla v3.4s, v22.4s, v16.s[3] + fmla v4.4s, v25.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + fmla v7.4s, v25.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + cmp x11, x20 + b.ge .LBB0_29 + .p2align 2 +.LBB0_28: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr x14, [sp, #1168] // 8-byte Folded Reload + ldr x19, [sp, #1160] // 8-byte Folded Reload + add x11, x11, #1 + add x12, x14, x9 + add x13, x19, x9 + add x13, x13, #4 + add x12, x12, #4 + prfm pldl1keep, [x13] + ldr s16, [x19, x9] + prfm pldl1keep, [x12] 
+ ldr s17, [x14, x9] + prfm pldl1keep, [x10] + add x9, x9, #4 + ldp q18, q19, [x10, #-64] + ldp q20, q21, [x10, #-32] + add x10, x10, #64 + fmla v0.4s, v21.4s, v16.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v2.4s, v19.4s, v16.s[0] + fmla v4.4s, v18.4s, v16.s[0] + fmla v7.4s, v18.4s, v17.s[0] + fmla v6.4s, v19.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + fmla v3.4s, v21.4s, v17.s[0] + cmp x11, x20 + b.lt .LBB0_28 +.LBB0_29: // in Loop: Header=BB0_11 Depth=3 + stp q4, q2, [x8] + stp q1, q0, [x8, #32] + stp q7, q6, [x18] + stp q5, q3, [x18, #32] +.LBB0_30: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #1024] // 8-byte Folded Reload + ldr x9, [sp, #1128] // 8-byte Folded Reload + cmp x9, x8 + ldr x19, [sp, #1072] // 8-byte Folded Reload + b.ge .LBB0_10 +// %bb.31: // in Loop: Header=BB0_11 Depth=3 + ldr x10, [sp, #1128] // 8-byte Folded Reload + ldp q7, q16, [x26, #32] + mov x9, xzr + ldp q6, q5, [x26] + ldr x11, [sp, #1200] // 8-byte Folded Reload + madd x8, x10, x29, x2 + madd x10, x10, x28, x4 + add x8, x8, x0 + lsl x10, x10, #2 + ldr x0, [sp, #848] // 8-byte Folded Reload + add x8, x1, x8, lsl #2 + ldr q4, [x3, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr x1, [sp, #896] // 8-byte Folded Reload + ldp q1, q0, [x8, #32] + ldp q3, q2, [x8] + cmp xzr, x22 + b.ge .LBB0_33 + .p2align 2 +.LBB0_32: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x18, x11, #64 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + add x14, x11, #128 + prfm pldl1keep, [x18] + ldp q18, q17, [x11, #-160] + fmla v3.4s, v6.4s, v4.s[0] + ldp q19, q20, [x11, #-192] + fmla v2.4s, v5.4s, v4.s[0] + prfm pldl1keep, [x14] + ldp q6, q5, [x11, #-128] + ldp q7, q16, [x11, #-96] + add x13, x11, #192 + prfm pldl1keep, [x13] + add x12, x11, #256 + add x9, x9, #4 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, 
[x11, #-32] + ldp q19, q20, [x11, #-64] + prfm pldl1keep, [x10] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v5.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldur q4, [x10, #-16] + prfm pldl1keep, [x12] + add x10, x10, #16 + ldp q7, q16, [x11, #32] + ldp q6, q5, [x11] + mov x11, x12 + cmp x9, x22 + b.lt .LBB0_32 +.LBB0_33: // in Loop: Header=BB0_11 Depth=3 + ldp q18, q17, [x5, #32] + ldp q20, q19, [x5] + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v2.4s, v5.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q5, q6, [x6] + ldp q7, q16, [x6, #32] + ldr x10, [sp, #880] // 8-byte Folded Reload + mov x9, xzr + mov w11, #64 // =0x40 + fmla v2.4s, v19.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v20.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + fmla v0.4s, v16.4s, v4.s[2] + ldp q18, q17, [x7, #32] + ldp q20, q19, [x7] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v5.4s, v4.s[2] + fmla v2.4s, v19.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v20.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldr x12, [sp, #1304] // 8-byte Folded Reload + add x12, x12, xzr + cmp x12, x20 + b.ge .LBB0_9 + .p2align 2 +.LBB0_34: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x13, x0, x9, lsl #6 + add x12, x0, x11 + prfm pldl1keep, [x10] + add x11, x11, #64 + ldr s4, [x1, x9, lsl #2] + prfm pldl1keep, [x12] + add x9, x9, #1 + ldp q5, q6, [x13] + ldp q7, q16, [x13, #32] + add x10, x10, #4 + fmla v0.4s, v16.4s, v4.s[0] + fmla v2.4s, v6.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v3.4s, v5.4s, v4.s[0] + ldr x12, [sp, #1304] // 8-byte Folded Reload + add x12, x12, x9 + cmp x12, x20 + b.lt .LBB0_34 + b .LBB0_9 + .p2align 2 +.LBB0_35: // in Loop: Header=BB0_7 Depth=2 + cmp x8, x30 + ldr x8, [sp, #944] // 
8-byte Folded Reload + ldr x9, [sp, #936] // 8-byte Folded Reload + add x8, x9, x8, lsl #2 + ldr x9, [sp, #920] // 8-byte Folded Reload + str x8, [sp, #1272] // 8-byte Folded Spill + lsl x8, x29, #1 + str x8, [sp, #1256] // 8-byte Folded Spill + ldr x8, [sp, #928] // 8-byte Folded Reload + add x8, x9, x8, lsl #2 + ldr x9, [sp, #952] // 8-byte Folded Reload + str x8, [sp, #1264] // 8-byte Folded Spill + ldr x8, [sp, #960] // 8-byte Folded Reload + add x8, x9, x8, lsl #2 + str x8, [sp, #1280] // 8-byte Folded Spill + b.lt .LBB0_39 +// %bb.36: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #592] // 8-byte Folded Reload + cmp x30, x8 + b.lt .LBB0_64 +.LBB0_37: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #592] // 8-byte Folded Reload + ldr x9, [sp, #584] // 8-byte Folded Reload + cmp x8, x9 + b.lt .LBB0_89 +.LBB0_38: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #504] // 8-byte Folded Reload + ldr x9, [sp, #584] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 + b .LBB0_114 + .p2align 2 +.LBB0_39: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #440] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x9, [sp, #1040] // 8-byte Folded Reload + ldr x8, [sp, #1016] // 8-byte Folded Reload + add x14, x0, #63 + mov x11, xzr + ldr x17, [sp, #1064] // 8-byte Folded Reload + ldr x1, [sp, #1256] // 8-byte Folded Reload + mul x12, x19, x9 + ldr x9, [sp, #1008] // 8-byte Folded Reload + mul x8, x19, x8 + ldr x16, [sp, #1104] // 8-byte Folded Reload + ldr x18, [sp, #1272] // 8-byte Folded Reload + add x10, x1, x29 + ldr x6, [sp, #992] // 8-byte Folded Reload + ldr x7, [sp, #976] // 8-byte Folded Reload + ldp x23, x21, [sp, #368] // 16-byte Folded Reload + madd x9, x17, x9, x8 + ldr x8, [sp, #1048] // 8-byte Folded Reload + ldp x25, x24, [sp, #352] // 16-byte Folded Reload + ldr x30, [sp, #344] // 8-byte Folded Reload + madd x13, x17, x8, x12 + add x12, x9, x16 + and x8, x14, #0xffffffffffffffc0 + add x10, x12, x10 + add x14, x18, x12, lsl #2 + add x15, x12, x29 + 
add x12, x12, x1 + add x10, x18, x10, lsl #2 + add x15, x18, x15, lsl #2 + add x12, x18, x12, lsl #2 + ldp q1, q0, [x14] + ldr x14, [sp, #1056] // 8-byte Folded Reload + ldp q7, q5, [x10] + ldr x10, [sp, #1032] // 8-byte Folded Reload + mul x10, x19, x10 + ldp q6, q3, [x15] + ldr x15, [sp, #1280] // 8-byte Folded Reload + ldr x19, [sp, #968] // 8-byte Folded Reload + ldp q4, q2, [x12] + add x12, x13, x16 + madd x10, x17, x14, x10 + lsl x14, x10, #2 + ldr q18, [x15, x14] + add x14, x10, x28 + lsl x14, x14, #2 + ldr q17, [x15, x14] + add x14, x10, x28, lsl #1 + lsl x14, x14, #2 + ldr q16, [x15, x14] + ldr x14, [sp, #1264] // 8-byte Folded Reload + ldr x15, [sp, #1120] // 8-byte Folded Reload + add x12, x14, x12, lsl #2 + ldr x14, [sp, #656] // 8-byte Folded Reload + ldp q21, q20, [x12] + add x12, x8, #64 + .p2align 2 +.LBB0_40: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #1288] // 8-byte Folded Reload + fmla v1.4s, v21.4s, v18.s[0] + fmla v0.4s, v20.4s, v18.s[0] + cmp x11, x22 + add x16, x16, x15 + prfm pldl1keep, [x16, #16] + ldr q19, [x16] + b.ge .LBB0_42 +// %bb.41: // in Loop: Header=BB0_40 Depth=3 + ldr x17, [sp, #800] // 8-byte Folded Reload + add x16, x25, x14 + fmla v6.4s, v21.4s, v17.s[0] + fmla v3.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v16.s[0] + fmla v2.4s, v20.4s, v16.s[0] + stp q21, q20, [x12, #-64] + fmla v7.4s, v21.4s, v19.s[0] + fmla v5.4s, v20.4s, v19.s[0] + prfm pldl1keep, [x16] + add x16, x24, x14 + add x1, x21, x14 + add x5, x30, x14 + add x11, x11, #4 + add x17, x17, x14 + ldp q20, q21, [x17] + ldr x17, [sp, #792] // 8-byte Folded Reload + add x17, x17, x14 + fmla v0.4s, v21.4s, v18.s[1] + fmla v3.4s, v21.4s, v17.s[1] + fmla v2.4s, v21.4s, v16.s[1] + fmla v5.4s, v21.4s, v19.s[1] + fmla v1.4s, v20.4s, v18.s[1] + fmla v6.4s, v20.4s, v17.s[1] + fmla v4.4s, v20.4s, v16.s[1] + fmla v7.4s, v20.4s, v19.s[1] + stp q20, q21, [x12, #-32] + prfm pldl1keep, [x16] + add x16, x23, x14 
+ ldp q21, q20, [x17] + ldr x17, [sp, #784] // 8-byte Folded Reload + add x17, x17, x14 + fmla v0.4s, v20.4s, v18.s[2] + fmla v3.4s, v20.4s, v17.s[2] + fmla v2.4s, v20.4s, v16.s[2] + fmla v5.4s, v20.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v6.4s, v21.4s, v17.s[2] + fmla v4.4s, v21.4s, v16.s[2] + fmla v7.4s, v21.4s, v19.s[2] + stp q21, q20, [x12] + prfm pldl1keep, [x16] + ldr x16, [sp, #1224] // 8-byte Folded Reload + ldp q20, q21, [x17] + ldr x17, [sp, #1208] // 8-byte Folded Reload + ldr x18, [sp, #1216] // 8-byte Folded Reload + add x16, x16, x15 + add x17, x17, x15 + add x18, x18, x15 + fmla v0.4s, v21.4s, v18.s[3] + fmla v3.4s, v21.4s, v17.s[3] + fmla v2.4s, v21.4s, v16.s[3] + fmla v5.4s, v21.4s, v19.s[3] + add x15, x15, #16 + add x2, x16, #32 + add x3, x17, #32 + add x4, x18, #32 + fmla v1.4s, v20.4s, v18.s[3] + fmla v6.4s, v20.4s, v17.s[3] + fmla v4.4s, v20.4s, v16.s[3] + fmla v7.4s, v20.4s, v19.s[3] + stp q20, q21, [x12, #32] + prfm pldl1keep, [x1] + add x12, x12, #128 + ldp q21, q20, [x5] + prfm pldl1keep, [x4] + ldr q18, [x18, #16] + prfm pldl1keep, [x3] + ldr q17, [x17, #16] + prfm pldl1keep, [x2] + ldr q16, [x16, #16] + ldr x16, [sp, #1240] // 8-byte Folded Reload + add x14, x14, x16 + b .LBB0_40 + .p2align 2 +.LBB0_42: // in Loop: Header=BB0_7 Depth=2 + ldr x16, [sp, #1000] // 8-byte Folded Reload + ldr x12, [sp, #984] // 8-byte Folded Reload + add x11, x8, x22, lsl #5 + fmla v6.4s, v21.4s, v17.s[0] + ldr x17, [sp, #1104] // 8-byte Folded Reload + ldr x18, [sp, #1264] // 8-byte Folded Reload + fmla v3.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v16.s[0] + stp q21, q20, [x11] + fmla v2.4s, v20.4s, v16.s[0] + fmla v5.4s, v20.4s, v19.s[0] + fmla v7.4s, v21.4s, v19.s[0] + ldr x23, [sp, #648] // 8-byte Folded Reload + ldr x24, [sp, #680] // 8-byte Folded Reload + mov x14, xzr + madd x11, x12, x16, x13 + ldr x25, [sp, #1272] // 8-byte Folded Reload + mov x15, xzr + add x11, x11, x17 + add x11, x18, x11, lsl #2 + ldp q20, q21, [x11] + add x11, x8, 
x12, lsl #5 + madd x12, x7, x16, x13 + madd x13, x19, x16, x13 + ldr x16, [sp, #664] // 8-byte Folded Reload + add x12, x12, x17 + fmla v0.4s, v21.4s, v18.s[1] + fmla v3.4s, v21.4s, v17.s[1] + fmla v2.4s, v21.4s, v16.s[1] + fmla v5.4s, v21.4s, v19.s[1] + add x13, x13, x17 + ldr x17, [sp, #472] // 8-byte Folded Reload + add x12, x18, x12, lsl #2 + fmla v1.4s, v20.4s, v18.s[1] + stp q20, q21, [x11] + fmla v6.4s, v20.4s, v17.s[1] + fmla v4.4s, v20.4s, v16.s[1] + fmla v7.4s, v20.4s, v19.s[1] + add x13, x18, x13, lsl #2 + ldr x18, [sp, #672] // 8-byte Folded Reload + add x16, x16, x17 + ldp q21, q20, [x12] + add x12, x8, x7, lsl #5 + add x17, x18, x17 + ldr x18, [sp, #1304] // 8-byte Folded Reload + fmla v0.4s, v20.4s, v18.s[2] + fmla v3.4s, v20.4s, v17.s[2] + fmla v2.4s, v20.4s, v16.s[2] + fmla v5.4s, v20.4s, v19.s[2] + stp q21, q20, [x12] + fmla v1.4s, v21.4s, v18.s[2] + fmla v6.4s, v21.4s, v17.s[2] + fmla v4.4s, v21.4s, v16.s[2] + fmla v7.4s, v21.4s, v19.s[2] + ldp q20, q21, [x13] + add x13, x8, x19, lsl #5 + fmla v0.4s, v21.4s, v18.s[3] + fmla v3.4s, v21.4s, v17.s[3] + fmla v2.4s, v21.4s, v16.s[3] + fmla v5.4s, v21.4s, v19.s[3] + fmla v1.4s, v20.4s, v18.s[3] + fmla v6.4s, v20.4s, v17.s[3] + fmla v4.4s, v20.4s, v16.s[3] + fmla v7.4s, v20.4s, v19.s[3] + stp q20, q21, [x13] + cmp x18, x20 + b.ge .LBB0_44 + .p2align 2 +.LBB0_43: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x2, [sp, #1152] // 8-byte Folded Reload + add x1, x16, x15 + add x3, x8, x18, lsl #5 + add x18, x18, #1 + add x2, x2, x14 + add x14, x14, #4 + prfm pldl1keep, [x2] + ldur s16, [x2, #-4] + add x2, x2, x27 + prfm pldl1keep, [x2] + ldur s17, [x2, #-4] + add x2, x2, x27 + prfm pldl1keep, [x2] + ldur s18, [x2, #-4] + add x2, x2, x27 + prfm pldl1keep, [x2] + ldur s19, [x2, #-4] + add x2, x17, x15 + prfm pldl1keep, [x1] + add x15, x15, x6 + ldp q20, q21, [x2] + fmla v0.4s, v21.4s, v16.s[0] + fmla v3.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, 
v18.s[0] + fmla v5.4s, v21.4s, v19.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + stp q20, q21, [x3] + cmp x18, x20 + b.lt .LBB0_43 +.LBB0_44: // %.preheader67 + // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #336] // 8-byte Folded Reload + ldr x16, [sp, #1096] // 8-byte Folded Reload + mov x5, xzr + add x14, x8, #128 + ldr x17, [sp, #1088] // 8-byte Folded Reload + mov w2, #1 // =0x1 + mov w3, #2 // =0x2 + mov w1, #3 // =0x3 + mov w18, #4 // =0x4 + add x15, x8, x15 + b .LBB0_46 + .p2align 2 +.LBB0_45: // %.loopexit63 + // in Loop: Header=BB0_46 Depth=3 + ldr x5, [sp, #1232] // 8-byte Folded Reload + add x17, x17, x5 + add x16, x16, x5 + mov x5, x18 + mov x18, x4 +.LBB0_46: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_48 Depth 4 + // Child Loop BB0_50 Depth 4 + madd x4, x5, x29, x9 + ldr x21, [sp, #1104] // 8-byte Folded Reload + add x4, x4, x21 + madd x2, x2, x29, x9 + madd x3, x3, x29, x9 + madd x1, x1, x29, x9 + add x2, x2, x21 + add x1, x1, x21 + add x4, x25, x4, lsl #2 + add x2, x25, x2, lsl #2 + stp q1, q0, [x4] + stp q6, q3, [x2] + add x2, x3, x21 + add x1, x25, x1, lsl #2 + add x2, x25, x2, lsl #2 + stp q4, q2, [x2] + stp q7, q5, [x1] + ldr x1, [sp, #1296] // 8-byte Folded Reload + cmp x18, x1 + b.ge .LBB0_51 +// %bb.47: // in Loop: Header=BB0_46 Depth=3 + madd x4, x18, x29, x9 + add x2, x18, #1 + add x3, x18, #2 + add x1, x18, #3 + madd x6, x2, x29, x9 + ldp q20, q21, [x8] + mov x5, xzr + madd x7, x3, x29, x9 + add x4, x4, x21 + madd x19, x1, x29, x9 + add x4, x25, x4, lsl #2 + ldp q1, q0, [x4] + add x4, x6, x21 + add x6, x7, x21 + add x7, x19, x21 + add x6, x25, x6, lsl #2 + add x7, x25, x7, lsl #2 + add x4, x25, x4, lsl #2 + ldp q4, q2, [x6] + madd x6, x18, x28, x10 + lsl x6, x6, #2 + ldp q7, q5, [x7] + ldr x7, [sp, #1280] // 8-byte Folded Reload + ldp q6, q3, [x4] + add x4, x18, #4 + ldr q19, [x7, x6] 
+ madd x6, x2, x28, x10 + lsl x6, x6, #2 + ldr q18, [x7, x6] + madd x6, x3, x28, x10 + lsl x6, x6, #2 + ldr q17, [x7, x6] + madd x6, x1, x28, x10 + lsl x6, x6, #2 + ldr q16, [x7, x6] + mov x6, x14 + mov x7, x17 + cmp xzr, x22 + b.ge .LBB0_49 + .p2align 2 +.LBB0_48: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_46 Depth=3 + // => This Inner Loop Header: Depth=4 + add x19, x6, #32 + fmla v1.4s, v20.4s, v19.s[0] + fmla v0.4s, v21.4s, v19.s[0] + add x5, x5, #4 + prfm pldl1keep, [x19] + ldp q22, q23, [x6, #-96] + fmla v3.4s, v21.4s, v18.s[0] + fmla v6.4s, v20.4s, v18.s[0] + fmla v2.4s, v21.4s, v17.s[0] + fmla v4.4s, v20.4s, v17.s[0] + add x19, x6, #96 + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x6, #-64] + prfm pldl1keep, [x19] + add x19, x7, x27 + add x21, x19, x27 + fmla v0.4s, v23.4s, v19.s[1] + fmla v3.4s, v23.4s, v18.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v5.4s, v23.4s, v16.s[1] + fmla v1.4s, v22.4s, v19.s[1] + fmla v6.4s, v22.4s, v18.s[1] + fmla v4.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v0.4s, v20.4s, v19.s[2] + ldp q22, q23, [x6, #-32] + fmla v3.4s, v20.4s, v18.s[2] + fmla v2.4s, v20.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v1.4s, v21.4s, v19.s[2] + fmla v6.4s, v21.4s, v18.s[2] + fmla v4.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x6], #128 + prfm pldl1keep, [x7] + fmla v0.4s, v23.4s, v19.s[3] + fmla v3.4s, v23.4s, v18.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v5.4s, v23.4s, v16.s[3] + fmla v1.4s, v22.4s, v19.s[3] + ldur q19, [x7, #-16] + prfm pldl1keep, [x19] + fmla v6.4s, v22.4s, v18.s[3] + ldur q18, [x19, #-16] + add x19, x21, x27 + prfm pldl1keep, [x21] + add x7, x7, #16 + fmla v4.4s, v22.4s, v17.s[3] + ldur q17, [x21, #-16] + prfm pldl1keep, [x19] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x19, #-16] + cmp x5, x22 + b.lt .LBB0_48 +.LBB0_49: // in Loop: Header=BB0_46 Depth=3 + ldp q23, q22, [x11] + fmla v0.4s, v21.4s, v19.s[0] + 
fmla v1.4s, v20.4s, v19.s[0] + fmla v3.4s, v21.4s, v18.s[0] + fmla v6.4s, v20.4s, v18.s[0] + ldr x7, [sp, #1304] // 8-byte Folded Reload + mov x5, x16 + fmla v2.4s, v21.4s, v17.s[0] + fmla v4.4s, v20.4s, v17.s[0] + mov x6, x15 + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x12] + fmla v0.4s, v22.4s, v19.s[1] + fmla v3.4s, v22.4s, v18.s[1] + fmla v2.4s, v22.4s, v17.s[1] + fmla v5.4s, v22.4s, v16.s[1] + fmla v1.4s, v23.4s, v19.s[1] + fmla v6.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v7.4s, v23.4s, v16.s[1] + fmla v0.4s, v21.4s, v19.s[2] + ldp q23, q22, [x13] + fmla v3.4s, v21.4s, v18.s[2] + fmla v2.4s, v21.4s, v17.s[2] + fmla v5.4s, v21.4s, v16.s[2] + fmla v1.4s, v20.4s, v19.s[2] + fmla v6.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v7.4s, v20.4s, v16.s[2] + fmla v0.4s, v22.4s, v19.s[3] + fmla v3.4s, v22.4s, v18.s[3] + fmla v2.4s, v22.4s, v17.s[3] + fmla v5.4s, v22.4s, v16.s[3] + fmla v1.4s, v23.4s, v19.s[3] + fmla v6.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v7.4s, v23.4s, v16.s[3] + cmp x7, x20 + b.ge .LBB0_45 + .p2align 2 +.LBB0_50: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_46 Depth=3 + // => This Inner Loop Header: Depth=4 + add x19, x5, x27 + prfm pldl1keep, [x5] + ldur s16, [x5, #-4] + add x7, x7, #1 + prfm pldl1keep, [x19] + ldur s17, [x19, #-4] + add x19, x19, x27 + add x5, x5, #4 + prfm pldl1keep, [x19] + ldur s18, [x19, #-4] + add x19, x19, x27 + prfm pldl1keep, [x19] + ldur s19, [x19, #-4] + prfm pldl1keep, [x6] + ldp q20, q21, [x6, #-32] + add x6, x6, #32 + fmla v0.4s, v21.4s, v16.s[0] + fmla v3.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + fmla v5.4s, v21.4s, v19.s[0] + cmp x7, x20 + b.lt .LBB0_50 + b .LBB0_45 + .p2align 2 +.LBB0_51: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1296] // 8-byte Folded 
Reload + ldr x16, [sp, #1128] // 8-byte Folded Reload + cmp x15, x16 + ldr x19, [sp, #1072] // 8-byte Folded Reload + b.ge .LBB0_57 +// %bb.52: // in Loop: Header=BB0_7 Depth=2 + ldr x2, [sp, #1296] // 8-byte Folded Reload + ldr x1, [sp, #1104] // 8-byte Folded Reload + mov x17, xzr + add x18, x2, #1 + madd x15, x2, x29, x9 + ldp q6, q7, [x8] + madd x16, x18, x29, x9 + madd x18, x18, x28, x10 + add x15, x15, x1 + add x16, x16, x1 + madd x1, x2, x28, x10 + ldr x2, [sp, #1280] // 8-byte Folded Reload + add x15, x25, x15, lsl #2 + lsl x18, x18, #2 + add x16, x25, x16, lsl #2 + ldp q1, q0, [x15] + ldp q3, q2, [x16] + lsl x1, x1, #2 + ldr q4, [x2, x18] + mov x18, x14 + ldr q5, [x2, x1] + ldr x1, [sp, #1120] // 8-byte Folded Reload + cmp xzr, x22 + b.ge .LBB0_54 + .p2align 2 +.LBB0_53: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x7, x18, #32 + ldr x2, [sp, #1184] // 8-byte Folded Reload + ldr x4, [sp, #1176] // 8-byte Folded Reload + fmla v1.4s, v6.4s, v5.s[0] + prfm pldl1keep, [x7] + ldp q16, q17, [x18, #-96] + fmla v0.4s, v7.4s, v5.s[0] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x18, #-64] + add x6, x18, #96 + prfm pldl1keep, [x6] + add x17, x17, #4 + add x2, x2, x1 + add x4, x4, x1 + add x1, x1, #16 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + add x3, x2, #32 + add x5, x4, #32 + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + ldp q16, q17, [x18, #-32] + fmla v0.4s, v6.4s, v5.s[2] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + ldp q6, q7, [x18], #128 + prfm pldl1keep, [x5] + fmla v1.4s, v16.4s, v5.s[3] + ldr q5, [x4, #16] + prfm pldl1keep, [x3] + fmla v3.4s, v16.4s, v4.s[3] + ldr q4, [x2, #16] + cmp x17, x22 + b.lt .LBB0_53 +.LBB0_54: // in Loop: Header=BB0_7 Depth=2 + ldp q17, q16, [x11] + fmla v0.4s, v7.4s, v5.s[0] + fmla v1.4s, v6.4s, v5.s[0] + 
fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q6, q7, [x12] + ldr x1, [sp, #416] // 8-byte Folded Reload + ldr x2, [sp, #1304] // 8-byte Folded Reload + mov x17, xzr + mov x18, xzr + fmla v0.4s, v16.4s, v5.s[1] + fmla v2.4s, v16.4s, v4.s[1] + add x1, x8, x1 + fmla v1.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldp q17, q16, [x13] + fmla v0.4s, v7.4s, v5.s[2] + fmla v2.4s, v7.4s, v4.s[2] + fmla v1.4s, v6.4s, v5.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v0.4s, v16.4s, v5.s[3] + fmla v2.4s, v16.4s, v4.s[3] + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + cmp x2, x20 + b.ge .LBB0_56 + .p2align 2 +.LBB0_55: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x3, x1, x18, lsl #3 + add x4, x24, x18 + add x5, x23, x18 + add x2, x2, #1 + add x4, x4, #4 + add x5, x5, #4 + add x3, x3, #32 + prfm pldl1keep, [x5] + ldr s4, [x23, x18] + prfm pldl1keep, [x4] + add x4, x1, x17 + ldr s5, [x24, x18] + prfm pldl1keep, [x3] + add x18, x18, #4 + ldp q6, q7, [x4] + add x17, x17, #32 + fmla v0.4s, v7.4s, v4.s[0] + fmla v1.4s, v6.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + cmp x2, x20 + b.lt .LBB0_55 +.LBB0_56: // in Loop: Header=BB0_7 Depth=2 + stp q1, q0, [x15] + stp q3, q2, [x16] +.LBB0_57: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1024] // 8-byte Folded Reload + ldr x16, [sp, #1128] // 8-byte Folded Reload + cmp x16, x15 + b.ge .LBB0_63 +// %bb.58: // in Loop: Header=BB0_7 Depth=2 + ldr x17, [sp, #1128] // 8-byte Folded Reload + ldr x16, [sp, #1104] // 8-byte Folded Reload + mov x15, xzr + madd x9, x17, x29, x9 + madd x10, x17, x28, x10 + ldp q4, q3, [x8] + ldr x18, [sp, #896] // 8-byte Folded Reload + add x9, x9, x16 + ldr x16, [sp, #1280] // 8-byte Folded Reload + lsl x10, x10, #2 + add x9, x25, x9, lsl #2 + ldp q1, q0, [x9] + ldr q2, [x16, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + cmp xzr, x22 + b.ge .LBB0_60 + .p2align 2 +.LBB0_59: // Parent Loop 
BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x17, x14, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x16, x14, #96 + prfm pldl1keep, [x17] + ldp q5, q6, [x14, #-96] + add x15, x15, #4 + ldp q4, q3, [x14, #-64] + prfm pldl1keep, [x16] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x14, #-32] + prfm pldl1keep, [x10] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x10, #-16] + ldp q4, q3, [x14], #128 + add x10, x10, #16 + cmp x15, x22 + b.lt .LBB0_59 +.LBB0_60: // in Loop: Header=BB0_7 Depth=2 + ldp q6, q5, [x11] + fmla v0.4s, v3.4s, v2.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldp q3, q4, [x12] + ldr x11, [sp, #416] // 8-byte Folded Reload + mov x10, xzr + mov x14, xzr + fmla v0.4s, v5.4s, v2.s[1] + add x8, x8, x11 + ldr x11, [sp, #1304] // 8-byte Folded Reload + fmla v1.4s, v6.4s, v2.s[1] + ldp q6, q5, [x13] + fmla v0.4s, v4.4s, v2.s[2] + fmla v1.4s, v3.4s, v2.s[2] + fmla v0.4s, v5.4s, v2.s[3] + fmla v1.4s, v6.4s, v2.s[3] + cmp x11, x20 + b.ge .LBB0_62 + .p2align 2 +.LBB0_61: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x14, lsl #3 + add x13, x18, x14 + add x11, x11, #1 + add x13, x13, #4 + add x12, x12, #32 + prfm pldl1keep, [x13] + ldr s2, [x18, x14] + add x13, x8, x10 + add x14, x14, #4 + add x10, x10, #32 + prfm pldl1keep, [x12] + ldp q3, q4, [x13] + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v3.4s, v2.s[0] + cmp x11, x20 + b.lt .LBB0_61 +.LBB0_62: // in Loop: Header=BB0_7 Depth=2 + stp q1, q0, [x9] +.LBB0_63: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x30, [sp, #1080] // 8-byte Folded Reload + ldr x8, [sp, #592] // 8-byte Folded Reload + cmp x30, x8 + b.ge .LBB0_37 +.LBB0_64: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #432] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #1016] // 8-byte Folded 
Reload + ldr x9, [sp, #1008] // 8-byte Folded Reload + mov x16, x19 + mov x12, xzr + ldr x15, [sp, #1064] // 8-byte Folded Reload + ldr x19, [sp, #1080] // 8-byte Folded Reload + mul x8, x16, x8 + ldr x14, [sp, #1256] // 8-byte Folded Reload + ldr x17, [sp, #1272] // 8-byte Folded Reload + add x10, x14, x29 + ldr x21, [sp, #992] // 8-byte Folded Reload + ldr x23, [sp, #976] // 8-byte Folded Reload + ldr x24, [sp, #968] // 8-byte Folded Reload + ldp x30, x25, [sp, #320] // 16-byte Folded Reload + madd x9, x15, x9, x8 + add x8, x9, x19 + add x13, x8, x29 + lsl x11, x8, #2 + add x14, x8, x14 + add x8, x8, x10 + lsl x10, x13, #2 + ldr q0, [x17, x11] + lsl x11, x14, #2 + ldr x13, [sp, #1264] // 8-byte Folded Reload + ldr x14, [sp, #1280] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr q2, [x17, x10] + ldr x10, [sp, #1040] // 8-byte Folded Reload + mul x10, x16, x10 + ldr q1, [x17, x11] + ldr x11, [sp, #1048] // 8-byte Folded Reload + madd x11, x15, x11, x10 + ldr q3, [x17, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + add x10, x11, x19 + lsl x10, x10, #2 + ldr q7, [x13, x10] + ldr x10, [sp, #1032] // 8-byte Folded Reload + ldr x13, [sp, #1056] // 8-byte Folded Reload + mul x10, x16, x10 + madd x10, x15, x13, x10 + ldr x15, [sp, #1120] // 8-byte Folded Reload + lsl x13, x10, #2 + ldr q4, [x14, x13] + add x13, x10, x28 + lsl x13, x13, #2 + ldr q5, [x14, x13] + add x13, x10, x28, lsl #1 + lsl x13, x13, #2 + ldr q6, [x14, x13] + ldr x14, [sp, #656] // 8-byte Folded Reload + orr x13, x8, #0x20 + .p2align 2 +.LBB0_65: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #1288] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + cmp x12, x22 + add x16, x16, x15 + prfm pldl1keep, [x16, #16] + ldr q16, [x16] + b.ge .LBB0_67 +// %bb.66: // in Loop: Header=BB0_65 Depth=3 + ldr x7, [sp, #768] // 8-byte Folded Reload + ldr x16, [sp, #1224] // 8-byte Folded Reload + fmla v1.4s, 
v7.4s, v6.s[0] + fmla v3.4s, v7.4s, v16.s[0] + ldr x18, [sp, #1208] // 8-byte Folded Reload + ldr x2, [sp, #1216] // 8-byte Folded Reload + add x5, x30, x14 + add x4, x25, x14 + ldr x6, [sp, #776] // 8-byte Folded Reload + stur q7, [x13, #-32] + add x12, x12, #4 + add x7, x7, x14 + add x6, x6, x14 + add x16, x16, x15 + add x18, x18, x15 + add x2, x2, x15 + add x15, x15, #16 + prfm pldl1keep, [x7] + ldr x7, [sp, #752] // 8-byte Folded Reload + add x17, x16, #32 + add x1, x18, #32 + add x3, x2, #32 + ldr q7, [x7, x14] + stur q7, [x13, #-16] + prfm pldl1keep, [x6] + ldr x6, [sp, #744] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[1] + fmla v2.4s, v7.4s, v5.s[1] + fmla v1.4s, v7.4s, v6.s[1] + fmla v3.4s, v7.4s, v16.s[1] + ldr q7, [x6, x14] + str q7, [x13] + prfm pldl1keep, [x5] + ldr x5, [sp, #736] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[2] + fmla v2.4s, v7.4s, v5.s[2] + fmla v1.4s, v7.4s, v6.s[2] + fmla v3.4s, v7.4s, v16.s[2] + ldr q7, [x5, x14] + str q7, [x13, #16] + prfm pldl1keep, [x4] + ldr x4, [sp, #760] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[3] + fmla v2.4s, v7.4s, v5.s[3] + fmla v1.4s, v7.4s, v6.s[3] + fmla v3.4s, v7.4s, v16.s[3] + add x13, x13, #64 + ldr q7, [x4, x14] + prfm pldl1keep, [x3] + ldr q4, [x2, #16] + prfm pldl1keep, [x1] + ldr q5, [x18, #16] + prfm pldl1keep, [x17] + ldr q6, [x16, #16] + ldr x16, [sp, #1240] // 8-byte Folded Reload + add x14, x14, x16 + b .LBB0_65 + .p2align 2 +.LBB0_67: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1000] // 8-byte Folded Reload + ldr x6, [sp, #984] // 8-byte Folded Reload + fmla v1.4s, v7.4s, v6.s[0] + fmla v3.4s, v7.4s, v16.s[0] + ldr x16, [sp, #1264] // 8-byte Folded Reload + str q7, [x8, x22, lsl #4] + mov x12, xzr + ldr x7, [sp, #1272] // 8-byte Folded Reload + mov x13, xzr + madd x14, x6, x15, x11 + add x14, x14, x19 + lsl x14, x14, #2 + ldr q17, [x16, x14] + madd x14, x23, x15, x11 + madd x11, x24, x15, x11 + ldr x15, [sp, #672] // 8-byte Folded Reload + add x14, x14, x19 + add 
x11, x11, x19 + lsl x14, x14, #2 + lsl x11, x11, #2 + str q17, [x8, x6, lsl #4] + fmla v0.4s, v17.4s, v4.s[1] + fmla v2.4s, v17.4s, v5.s[1] + fmla v1.4s, v17.4s, v6.s[1] + fmla v3.4s, v17.4s, v16.s[1] + ldr q7, [x16, x14] + ldr x14, [sp, #464] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[2] + str q7, [x8, x23, lsl #4] + fmla v2.4s, v7.4s, v5.s[2] + fmla v1.4s, v7.4s, v6.s[2] + fmla v3.4s, v7.4s, v16.s[2] + ldr q7, [x16, x11] + ldr x11, [sp, #664] // 8-byte Folded Reload + add x11, x11, x14 + add x14, x15, x14 + ldr x15, [sp, #1304] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[3] + fmla v2.4s, v7.4s, v5.s[3] + fmla v1.4s, v7.4s, v6.s[3] + fmla v3.4s, v7.4s, v16.s[3] + str q7, [x8, x24, lsl #4] + cmp x15, x20 + b.ge .LBB0_69 + .p2align 2 +.LBB0_68: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x17, [sp, #1152] // 8-byte Folded Reload + add x16, x11, x13 + add x17, x17, x12 + add x12, x12, #4 + prfm pldl1keep, [x17] + ldur s4, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s5, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x16] + ldr q16, [x14, x13] + add x13, x13, x21 + fmla v0.4s, v16.4s, v4.s[0] + str q16, [x8, x15, lsl #4] + add x15, x15, #1 + fmla v2.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x15, x20 + b.lt .LBB0_68 +.LBB0_69: // %.preheader66 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #312] // 8-byte Folded Reload + ldr x13, [sp, #1096] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #48 + ldr x14, [sp, #1088] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w18, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x12 + b .LBB0_71 + .p2align 2 +.LBB0_70: // %.loopexit62 + // in Loop: Header=BB0_71 Depth=3 + ldr x2, [sp, #1232] // 8-byte Folded Reload + add x14, x14, x2 + add x13, x13, x2 + 
mov x2, x15 + mov x15, x1 +.LBB0_71: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_73 Depth 4 + // Child Loop BB0_75 Depth 4 + madd x1, x2, x29, x9 + add x1, x1, x19 + madd x16, x16, x29, x9 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x16, x16, x19 + add x17, x17, x19 + lsl x1, x1, #2 + lsl x16, x16, #2 + lsl x17, x17, #2 + str q0, [x7, x1] + str q2, [x7, x16] + add x16, x18, x19 + lsl x16, x16, #2 + str q1, [x7, x17] + str q3, [x7, x16] + ldr x16, [sp, #1296] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_76 +// %bb.72: // in Loop: Header=BB0_71 Depth=3 + add x16, x15, #1 + add x17, x15, #2 + madd x1, x15, x29, x9 + add x18, x15, #3 + madd x3, x16, x29, x9 + ldr q16, [x8] + mov x2, xzr + add x1, x1, x19 + madd x4, x17, x29, x9 + add x3, x3, x19 + add x4, x4, x19 + lsl x1, x1, #2 + lsl x3, x3, #2 + lsl x4, x4, #2 + ldr q0, [x7, x1] + madd x1, x18, x29, x9 + ldr q2, [x7, x3] + madd x3, x15, x28, x10 + ldr q1, [x7, x4] + ldr x4, [sp, #1280] // 8-byte Folded Reload + add x1, x1, x19 + lsl x3, x3, #2 + lsl x1, x1, #2 + ldr q7, [x4, x3] + madd x3, x16, x28, x10 + ldr q3, [x7, x1] + add x1, x15, #4 + lsl x3, x3, #2 + ldr q6, [x4, x3] + madd x3, x17, x28, x10 + lsl x3, x3, #2 + ldr q5, [x4, x3] + madd x3, x18, x28, x10 + lsl x3, x3, #2 + ldr q4, [x4, x3] + mov x3, x11 + mov x4, x14 + cmp xzr, x22 + b.ge .LBB0_74 + .p2align 2 +.LBB0_73: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_71 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x3, #32 + fmla v0.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + add x2, x2, #4 + fmla v1.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x5] + add x5, x4, x27 + ldp q16, q17, [x3, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v2.4s, v16.4s, v6.s[1] + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v2.4s, v17.4s, v6.s[2] + fmla v1.4s, v17.4s, 
v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x3], #64 + prfm pldl1keep, [x4] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x5] + fmla v2.4s, v17.4s, v6.s[3] + ldur q6, [x5, #-16] + add x5, x5, x27 + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x5] + ldur q5, [x5, #-16] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur q4, [x5, #-16] + cmp x2, x22 + b.lt .LBB0_73 +.LBB0_74: // in Loop: Header=BB0_71 Depth=3 + ldr q17, [x8, x6, lsl #4] + fmla v0.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + ldr q16, [x8, x23, lsl #4] + ldr q18, [x8, x24, lsl #4] + ldr x4, [sp, #1304] // 8-byte Folded Reload + mov x2, x13 + mov x3, x12 + fmla v0.4s, v17.4s, v7.s[1] + fmla v2.4s, v17.4s, v6.s[1] + fmla v1.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + fmla v0.4s, v16.4s, v7.s[2] + fmla v2.4s, v16.4s, v6.s[2] + fmla v1.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v2.4s, v18.4s, v6.s[3] + fmla v1.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x4, x20 + b.ge .LBB0_70 + .p2align 2 +.LBB0_75: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_71 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x2, x27 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x5] + ldur s5, [x5, #-4] + add x5, x5, x27 + add x2, x2, #4 + prfm pldl1keep, [x5] + ldur s6, [x5, #-4] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur s7, [x5, #-4] + prfm pldl1keep, [x3] + ldur q16, [x3, #-16] + add x3, x3, #16 + fmla v0.4s, v16.4s, v4.s[0] + fmla v2.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x4, x20 + b.lt .LBB0_75 + b .LBB0_70 + .p2align 2 +.LBB0_76: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1296] // 8-byte Folded Reload + ldr x14, [sp, #1128] // 8-byte Folded Reload + cmp x13, x14 + b.ge .LBB0_82 +// 
%bb.77: // in Loop: Header=BB0_7 Depth=2 + ldr x17, [sp, #1296] // 8-byte Folded Reload + ldr x18, [sp, #1280] // 8-byte Folded Reload + mov x15, xzr + add x16, x17, #1 + madd x13, x17, x29, x9 + madd x17, x17, x28, x10 + ldr q4, [x8] + madd x14, x16, x29, x9 + madd x16, x16, x28, x10 + add x13, x13, x19 + lsl x17, x17, #2 + add x14, x14, x19 + add x13, x7, x13, lsl #2 + lsl x16, x16, #2 + ldr q3, [x18, x17] + ldr x17, [sp, #1120] // 8-byte Folded Reload + add x14, x7, x14, lsl #2 + ldr q2, [x18, x16] + mov x16, x11 + ldr q0, [x13] + ldr q1, [x14] + cmp xzr, x22 + b.ge .LBB0_79 + .p2align 2 +.LBB0_78: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x16, #32 + ldr x18, [sp, #1184] // 8-byte Folded Reload + ldr x2, [sp, #1176] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + prfm pldl1keep, [x4] + fmla v1.4s, v4.4s, v2.s[0] + ldp q4, q5, [x16, #-32] + add x15, x15, #4 + add x18, x18, x17 + add x2, x2, x17 + add x17, x17, #16 + add x1, x18, #32 + add x3, x2, #32 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x16], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x18, #16] + cmp x15, x22 + b.lt .LBB0_78 +.LBB0_79: // in Loop: Header=BB0_7 Depth=2 + ldr q5, [x8, x6, lsl #4] + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldr q4, [x8, x23, lsl #4] + ldr x15, [sp, #1120] // 8-byte Folded Reload + ldr x16, [sp, #1304] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x24, lsl #4] + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x16, x20 + b.ge .LBB0_81 + .p2align 2 +.LBB0_80: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x1, [sp, #1168] // 8-byte 
Folded Reload + ldr x2, [sp, #1160] // 8-byte Folded Reload + add x16, x16, #1 + add x17, x1, x15 + add x18, x2, x15 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x2, x15] + prfm pldl1keep, [x17] + ldr s3, [x1, x15] + prfm pldl1keep, [x12] + ldur q4, [x12, #-16] + add x12, x12, #16 + add x15, x15, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + cmp x16, x20 + b.lt .LBB0_80 +.LBB0_81: // in Loop: Header=BB0_7 Depth=2 + str q0, [x13] + str q1, [x14] +.LBB0_82: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_88 +// %bb.83: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1128] // 8-byte Folded Reload + ldr q2, [x8] + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x28, x10 + ldr x13, [sp, #1280] // 8-byte Folded Reload + ldr x14, [sp, #896] // 8-byte Folded Reload + add x9, x9, x19 + lsl x10, x10, #2 + add x9, x7, x9, lsl #2 + ldr q1, [x13, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr q0, [x9] + cmp xzr, x22 + b.ge .LBB0_85 + .p2align 2 +.LBB0_84: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x11, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x11, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x11], #64 + prfm pldl1keep, [x10] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x22 + b.lt .LBB0_84 +.LBB0_85: // in Loop: Header=BB0_7 Depth=2 + ldr q3, [x8, x6, lsl #4] + fmla v0.4s, v2.4s, v1.s[0] + ldr x11, [sp, #280] // 8-byte Folded Reload + ldr q2, [x8, x23, lsl #4] + mov x10, xzr + mov w12, #16 // =0x10 + fmla v0.4s, v3.4s, v1.s[1] + ldr q3, [x8, x24, lsl #4] + add x8, x8, x11 + ldr x11, [sp, #880] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + fmla v0.4s, v3.4s, v1.s[3] + ldr x13, [sp, #1304] // 8-byte Folded Reload + add 
x13, x13, xzr + cmp x13, x20 + b.ge .LBB0_87 + .p2align 2 +.LBB0_86: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x8, x12 + prfm pldl1keep, [x11] + ldr s1, [x14, x10, lsl #2] + prfm pldl1keep, [x13] + ldr q2, [x8, x10, lsl #4] + add x10, x10, #1 + add x12, x12, #16 + add x11, x11, #4 + fmla v0.4s, v2.4s, v1.s[0] + ldr x13, [sp, #1304] // 8-byte Folded Reload + add x13, x13, x10 + cmp x13, x20 + b.lt .LBB0_86 +.LBB0_87: // in Loop: Header=BB0_7 Depth=2 + str q0, [x9] +.LBB0_88: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x30, [sp, #1080] // 8-byte Folded Reload + ldr x19, [sp, #1072] // 8-byte Folded Reload + ldr x8, [sp, #592] // 8-byte Folded Reload + ldr x9, [sp, #584] // 8-byte Folded Reload + cmp x8, x9 + b.ge .LBB0_38 +.LBB0_89: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #424] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #1016] // 8-byte Folded Reload + ldr x9, [sp, #1008] // 8-byte Folded Reload + mov x16, x19 + mov x12, xzr + ldr x15, [sp, #1064] // 8-byte Folded Reload + ldr x14, [sp, #1256] // 8-byte Folded Reload + mul x8, x19, x8 + ldr x19, [sp, #592] // 8-byte Folded Reload + ldr x17, [sp, #1272] // 8-byte Folded Reload + add x10, x14, x29 + ldr x21, [sp, #992] // 8-byte Folded Reload + ldr x23, [sp, #976] // 8-byte Folded Reload + ldr x24, [sp, #968] // 8-byte Folded Reload + ldp x30, x25, [sp, #296] // 16-byte Folded Reload + madd x9, x15, x9, x8 + add x8, x9, x19 + add x13, x8, x29 + lsl x11, x8, #2 + add x14, x8, x14 + add x8, x8, x10 + lsl x10, x13, #2 + ldr d0, [x17, x11] + lsl x11, x14, #2 + ldr x13, [sp, #1264] // 8-byte Folded Reload + ldr x14, [sp, #1280] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr d2, [x17, x10] + ldr x10, [sp, #1040] // 8-byte Folded Reload + mul x10, x16, x10 + ldr d1, [x17, x11] + ldr x11, [sp, #1048] // 8-byte Folded Reload + madd x11, x15, x11, x10 + ldr d3, [x17, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 
+ add x10, x11, x19 + lsl x10, x10, #2 + ldr d7, [x13, x10] + ldr x10, [sp, #1032] // 8-byte Folded Reload + ldr x13, [sp, #1056] // 8-byte Folded Reload + mul x10, x16, x10 + madd x10, x15, x13, x10 + ldr x15, [sp, #1120] // 8-byte Folded Reload + lsl x13, x10, #2 + ldr q4, [x14, x13] + add x13, x10, x28 + lsl x13, x13, #2 + ldr q5, [x14, x13] + add x13, x10, x28, lsl #1 + lsl x13, x13, #2 + ldr q6, [x14, x13] + ldr x14, [sp, #656] // 8-byte Folded Reload + orr x13, x8, #0x10 + .p2align 2 +.LBB0_90: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #1288] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[0] + fmla v2.2s, v7.2s, v5.s[0] + cmp x12, x22 + add x16, x16, x15 + prfm pldl1keep, [x16, #16] + ldr q16, [x16] + b.ge .LBB0_92 +// %bb.91: // in Loop: Header=BB0_90 Depth=3 + ldr x7, [sp, #720] // 8-byte Folded Reload + ldr x16, [sp, #1224] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v6.s[0] + fmla v3.2s, v7.2s, v16.s[0] + ldr x18, [sp, #1208] // 8-byte Folded Reload + ldr x2, [sp, #1216] // 8-byte Folded Reload + add x5, x30, x14 + add x4, x25, x14 + ldr x6, [sp, #728] // 8-byte Folded Reload + stur d7, [x13, #-16] + add x12, x12, #4 + add x7, x7, x14 + add x6, x6, x14 + add x16, x16, x15 + add x18, x18, x15 + add x2, x2, x15 + add x15, x15, #16 + prfm pldl1keep, [x7] + ldr x7, [sp, #704] // 8-byte Folded Reload + add x17, x16, #32 + add x1, x18, #32 + add x3, x2, #32 + ldr d7, [x7, x14] + stur d7, [x13, #-8] + prfm pldl1keep, [x6] + ldr x6, [sp, #696] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[1] + fmla v2.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v3.2s, v7.2s, v16.s[1] + ldr d7, [x6, x14] + str d7, [x13] + prfm pldl1keep, [x5] + ldr x5, [sp, #688] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[2] + fmla v2.2s, v7.2s, v5.s[2] + fmla v1.2s, v7.2s, v6.s[2] + fmla v3.2s, v7.2s, v16.s[2] + ldr d7, [x5, x14] + str d7, [x13, #8] + prfm pldl1keep, [x4] + ldr x4, [sp, #712] // 
8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[3] + fmla v2.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v3.2s, v7.2s, v16.s[3] + add x13, x13, #32 + ldr d7, [x4, x14] + prfm pldl1keep, [x3] + ldr q4, [x2, #16] + prfm pldl1keep, [x1] + ldr q5, [x18, #16] + prfm pldl1keep, [x17] + ldr q6, [x16, #16] + ldr x16, [sp, #1240] // 8-byte Folded Reload + add x14, x14, x16 + b .LBB0_90 + .p2align 2 +.LBB0_92: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1000] // 8-byte Folded Reload + ldr x6, [sp, #984] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v6.s[0] + fmla v3.2s, v7.2s, v16.s[0] + ldr x16, [sp, #1264] // 8-byte Folded Reload + str d7, [x8, x22, lsl #3] + mov x12, xzr + ldr x7, [sp, #648] // 8-byte Folded Reload + ldr x25, [sp, #680] // 8-byte Folded Reload + mov x13, xzr + madd x14, x6, x15, x11 + ldr x30, [sp, #1272] // 8-byte Folded Reload + add x14, x14, x19 + lsl x14, x14, #2 + ldr d17, [x16, x14] + madd x14, x23, x15, x11 + madd x11, x24, x15, x11 + ldr x15, [sp, #672] // 8-byte Folded Reload + add x14, x14, x19 + add x11, x11, x19 + lsl x14, x14, #2 + lsl x11, x11, #2 + str d17, [x8, x6, lsl #3] + fmla v0.2s, v17.2s, v4.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v1.2s, v17.2s, v6.s[1] + fmla v3.2s, v17.2s, v16.s[1] + ldr d7, [x16, x14] + ldr x14, [sp, #456] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[2] + str d7, [x8, x23, lsl #3] + fmla v2.2s, v7.2s, v5.s[2] + fmla v1.2s, v7.2s, v6.s[2] + fmla v3.2s, v7.2s, v16.s[2] + ldr d7, [x16, x11] + ldr x11, [sp, #664] // 8-byte Folded Reload + add x11, x11, x14 + add x14, x15, x14 + ldr x15, [sp, #1304] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[3] + fmla v2.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v3.2s, v7.2s, v16.s[3] + str d7, [x8, x24, lsl #3] + cmp x15, x20 + b.ge .LBB0_94 + .p2align 2 +.LBB0_93: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x17, [sp, #1152] // 8-byte Folded Reload + add x16, x11, x13 + add x17, 
x17, x12 + add x12, x12, #4 + prfm pldl1keep, [x17] + ldur s4, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s5, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x16] + ldr d16, [x14, x13] + add x13, x13, x21 + fmla v0.2s, v16.2s, v4.s[0] + str d16, [x8, x15, lsl #3] + add x15, x15, #1 + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x15, x20 + b.lt .LBB0_93 +.LBB0_94: // %.preheader65 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #288] // 8-byte Folded Reload + ldr x13, [sp, #1096] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #24 + ldr x14, [sp, #1088] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w18, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x12 + b .LBB0_96 + .p2align 2 +.LBB0_95: // %.loopexit61 + // in Loop: Header=BB0_96 Depth=3 + ldr x2, [sp, #1232] // 8-byte Folded Reload + add x14, x14, x2 + add x13, x13, x2 + mov x2, x15 + mov x15, x1 +.LBB0_96: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_98 Depth 4 + // Child Loop BB0_100 Depth 4 + madd x1, x2, x29, x9 + add x1, x1, x19 + madd x16, x16, x29, x9 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x16, x16, x19 + add x17, x17, x19 + lsl x1, x1, #2 + lsl x16, x16, #2 + lsl x17, x17, #2 + str d0, [x30, x1] + str d2, [x30, x16] + add x16, x18, x19 + lsl x16, x16, #2 + str d1, [x30, x17] + str d3, [x30, x16] + ldr x16, [sp, #1296] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_101 +// %bb.97: // in Loop: Header=BB0_96 Depth=3 + add x16, x15, #1 + add x17, x15, #2 + madd x1, x15, x29, x9 + add x18, x15, #3 + madd x3, x16, x29, x9 + ldr d16, [x8] + mov x2, xzr + add x1, x1, x19 + madd x4, x17, x29, x9 + add x3, x3, x19 + add x4, x4, x19 + lsl x1, x1, #2 + lsl x3, x3, #2 + lsl x4, x4, #2 + ldr d0, [x30, x1] + madd 
x1, x18, x29, x9 + ldr d2, [x30, x3] + madd x3, x15, x28, x10 + ldr d1, [x30, x4] + ldr x4, [sp, #1280] // 8-byte Folded Reload + add x1, x1, x19 + lsl x3, x3, #2 + lsl x1, x1, #2 + ldr q7, [x4, x3] + madd x3, x16, x28, x10 + ldr d3, [x30, x1] + add x1, x15, #4 + lsl x3, x3, #2 + ldr q6, [x4, x3] + madd x3, x17, x28, x10 + lsl x3, x3, #2 + ldr q5, [x4, x3] + madd x3, x18, x28, x10 + lsl x3, x3, #2 + ldr q4, [x4, x3] + mov x3, x11 + mov x4, x14 + cmp xzr, x22 + b.ge .LBB0_99 + .p2align 2 +.LBB0_98: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_96 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x3, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + add x2, x2, #4 + fmla v1.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x5] + add x5, x4, x27 + ldp d16, d17, [x3, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] + fmla v2.2s, v17.2s, v6.s[2] + fmla v1.2s, v17.2s, v5.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x3], #32 + prfm pldl1keep, [x4] + fmla v0.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x5] + fmla v2.2s, v17.2s, v6.s[3] + ldur q6, [x5, #-16] + add x5, x5, x27 + fmla v1.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x5] + ldur q5, [x5, #-16] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur q4, [x5, #-16] + cmp x2, x22 + b.lt .LBB0_98 +.LBB0_99: // in Loop: Header=BB0_96 Depth=3 + ldr d17, [x8, x6, lsl #3] + fmla v0.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v1.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + ldr d16, [x8, x23, lsl #3] + ldr d18, [x8, x24, lsl #3] + ldr x4, [sp, #1304] // 8-byte Folded Reload + mov x2, x13 + mov x3, x12 + fmla v0.2s, v17.2s, v7.s[1] + fmla v2.2s, v17.2s, v6.s[1] + fmla v1.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + fmla v0.2s, v16.2s, v7.s[2] + fmla v2.2s, 
v16.2s, v6.s[2] + fmla v1.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v2.2s, v18.2s, v6.s[3] + fmla v1.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x4, x20 + b.ge .LBB0_95 + .p2align 2 +.LBB0_100: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_96 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x2, x27 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x5] + ldur s5, [x5, #-4] + add x5, x5, x27 + add x2, x2, #4 + prfm pldl1keep, [x5] + ldur s6, [x5, #-4] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur s7, [x5, #-4] + prfm pldl1keep, [x3] + ldur d16, [x3, #-8] + add x3, x3, #8 + fmla v0.2s, v16.2s, v4.s[0] + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x4, x20 + b.lt .LBB0_100 + b .LBB0_95 + .p2align 2 +.LBB0_101: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1296] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_107 +// %bb.102: // in Loop: Header=BB0_7 Depth=2 + ldr x16, [sp, #1296] // 8-byte Folded Reload + ldr x17, [sp, #1280] // 8-byte Folded Reload + mov x14, xzr + add x15, x16, #1 + madd x12, x16, x29, x9 + madd x16, x16, x28, x10 + ldr d4, [x8] + madd x13, x15, x29, x9 + madd x15, x15, x28, x10 + add x12, x12, x19 + lsl x16, x16, #2 + add x13, x13, x19 + add x12, x30, x12, lsl #2 + lsl x15, x15, #2 + ldr q3, [x17, x16] + ldr x16, [sp, #1120] // 8-byte Folded Reload + add x13, x30, x13, lsl #2 + ldr q2, [x17, x15] + mov x15, x11 + ldr d0, [x12] + ldr d1, [x13] + cmp xzr, x22 + b.ge .LBB0_104 + .p2align 2 +.LBB0_103: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x3, x15, #16 + ldr x17, [sp, #1184] // 8-byte Folded Reload + ldr x1, [sp, #1176] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + prfm pldl1keep, [x3] + fmla v1.2s, v4.2s, v2.s[0] + ldp d4, d5, [x15, #-16] + add 
x14, x14, #4 + add x17, x17, x16 + add x1, x1, x16 + add x16, x16, #16 + add x18, x17, #32 + add x2, x1, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, v5.2s, v2.s[2] + ldp d5, d4, [x15], #32 + prfm pldl1keep, [x2] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x1, #16] + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + cmp x14, x22 + b.lt .LBB0_103 +.LBB0_104: // in Loop: Header=BB0_7 Depth=2 + ldr d5, [x8, x6, lsl #3] + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr d4, [x8, x23, lsl #3] + ldr x16, [sp, #408] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + add x16, x8, x16 + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x24, lsl #3] + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, v4.2s, v2.s[2] + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + ldr x17, [sp, #1304] // 8-byte Folded Reload + add x17, x17, xzr + cmp x17, x20 + b.ge .LBB0_106 + .p2align 2 +.LBB0_105: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x17, x16, x15, lsl #3 + add x18, x25, x14 + add x1, x7, x14 + add x14, x14, #4 + add x1, x1, #4 + add x18, x18, #4 + add x17, x17, #8 + prfm pldl1keep, [x1] + ldr s2, [x7, x15, lsl #2] + prfm pldl1keep, [x18] + ldr s3, [x25, x15, lsl #2] + prfm pldl1keep, [x17] + ldr d4, [x16, x15, lsl #3] + add x15, x15, #1 + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + ldr x17, [sp, #1304] // 8-byte Folded Reload + add x17, x17, x15 + cmp x17, x20 + b.lt .LBB0_105 +.LBB0_106: // in Loop: Header=BB0_7 Depth=2 + str d0, [x12] + str d1, [x13] +.LBB0_107: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_113 +// %bb.108: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1128] // 8-byte Folded Reload + ldr d2, [x8] + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x28, 
x10 + ldr x13, [sp, #1280] // 8-byte Folded Reload + ldr x14, [sp, #896] // 8-byte Folded Reload + add x9, x9, x19 + lsl x10, x10, #2 + add x9, x30, x9, lsl #2 + ldr q1, [x13, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr d0, [x9] + cmp xzr, x22 + b.ge .LBB0_110 + .p2align 2 +.LBB0_109: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x11, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp d2, d3, [x11, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x11], #32 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x22 + b.lt .LBB0_109 +.LBB0_110: // in Loop: Header=BB0_7 Depth=2 + ldr d3, [x8, x6, lsl #3] + fmla v0.2s, v2.2s, v1.s[0] + ldr x11, [sp, #408] // 8-byte Folded Reload + ldr d4, [x8, x23, lsl #3] + ldr d2, [x8, x24, lsl #3] + mov x10, xzr + add x8, x8, x11 + ldr x11, [sp, #880] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + fmla v0.2s, v4.2s, v1.s[2] + fmla v0.2s, v2.2s, v1.s[3] + ldr x12, [sp, #1304] // 8-byte Folded Reload + add x12, x12, xzr + cmp x12, x20 + b.ge .LBB0_112 + .p2align 2 +.LBB0_111: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x10, lsl #3 + prfm pldl1keep, [x11] + ldr s1, [x14, x10, lsl #2] + add x11, x11, #4 + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr d2, [x8, x10, lsl #3] + add x10, x10, #1 + fmla v0.2s, v2.2s, v1.s[0] + ldr x12, [sp, #1304] // 8-byte Folded Reload + add x12, x12, x10 + cmp x12, x20 + b.lt .LBB0_111 +.LBB0_112: // in Loop: Header=BB0_7 Depth=2 + str d0, [x9] +.LBB0_113: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x30, [sp, #1080] // 8-byte Folded Reload + ldr x19, [sp, #1072] // 8-byte Folded Reload + ldr x8, [sp, #504] // 8-byte Folded Reload + ldr x9, [sp, #584] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 +.LBB0_114: // in Loop: 
Header=BB0_7 Depth=2 + ldr x8, [sp, #480] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #1016] // 8-byte Folded Reload + ldr x9, [sp, #1008] // 8-byte Folded Reload + add x10, x0, #63 + mov x12, xzr + ldr x15, [sp, #1064] // 8-byte Folded Reload + ldr x21, [sp, #584] // 8-byte Folded Reload + mov x13, xzr + mul x8, x19, x8 + ldr x14, [sp, #1256] // 8-byte Folded Reload + ldr x16, [sp, #1272] // 8-byte Folded Reload + ldr x23, [sp, #992] // 8-byte Folded Reload + ldr x30, [sp, #384] // 8-byte Folded Reload + ldp x25, x24, [sp, #392] // 16-byte Folded Reload + madd x9, x15, x9, x8 + add x8, x9, x21 + add x11, x8, x14 + add x14, x14, x29 + ldr s1, [x16, x8, lsl #2] + add x14, x8, x14 + add x8, x8, x29 + ldr s2, [x16, x11, lsl #2] + ldr x11, [sp, #1048] // 8-byte Folded Reload + ldr s3, [x16, x8, lsl #2] + ldr x8, [sp, #1040] // 8-byte Folded Reload + ldr s0, [x16, x14, lsl #2] + ldr x14, [sp, #1264] // 8-byte Folded Reload + mul x8, x19, x8 + madd x11, x15, x11, x8 + add x8, x11, x21 + ldr s16, [x14, x8, lsl #2] + and x8, x10, #0xffffffffffffffc0 + ldr x10, [sp, #1032] // 8-byte Folded Reload + ldr x14, [sp, #1056] // 8-byte Folded Reload + mul x10, x19, x10 + madd x10, x15, x14, x10 + ldr x15, [sp, #1280] // 8-byte Folded Reload + lsl x14, x10, #2 + ldr q4, [x15, x14] + add x14, x10, x28 + lsl x14, x14, #2 + ldr q5, [x15, x14] + add x14, x10, x28, lsl #1 + lsl x14, x14, #2 + ldr q6, [x15, x14] + ldr x15, [sp, #656] // 8-byte Folded Reload + orr x14, x8, #0xc + .p2align 2 +.LBB0_115: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #888] // 8-byte Folded Reload + ext v20.16b, v4.16b, v4.16b, #8 + cmp x13, x22 + ext v19.16b, v5.16b, v5.16b, #8 + add x16, x16, x12 + prfm pldl1keep, [x16, #16] + ldr q7, [x16] + ext v18.16b, v6.16b, v6.16b, #8 + ext v17.16b, v7.16b, v7.16b, #8 + b.ge .LBB0_117 +// %bb.116: // in Loop: Header=BB0_115 Depth=3 + ldr x7, [sp, #840] // 8-byte Folded 
Reload + add x19, x14, x12 + ldr x16, [sp, #856] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.2s + ldr x18, [sp, #864] // 8-byte Folded Reload + ldr x2, [sp, #872] // 8-byte Folded Reload + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + stur s16, [x19, #-12] + fmla v0.2s, v16.2s, v7.2s + add x6, x30, x15 + add x5, x25, x15 + add x4, x24, x15 + add x13, x13, #4 + add x7, x7, x15 + add x16, x16, x12 + add x18, x18, x12 + add x2, x2, x12 + add x12, x12, #16 + prfm pldl1keep, [x7] + ldr x7, [sp, #824] // 8-byte Folded Reload + add x17, x16, #32 + add x1, x18, #32 + add x3, x2, #32 + ldr s16, [x7, x15] + stur s16, [x19, #-8] + prfm pldl1keep, [x6] + ldr x6, [sp, #816] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.s[1] + fmla v3.2s, v16.2s, v5.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v0.2s, v16.2s, v7.s[1] + ldr s16, [x6, x15] + stur s16, [x19, #-4] + prfm pldl1keep, [x5] + ldr x5, [sp, #808] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + ldr s16, [x5, x15] + str s16, [x19] + prfm pldl1keep, [x4] + ldr x4, [sp, #832] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.s[3] + fmla v3.2s, v16.2s, v5.s[3] + fmla v2.2s, v16.2s, v6.s[3] + fmla v0.2s, v16.2s, v7.s[3] + ldr s16, [x4, x15] + prfm pldl1keep, [x3] + ldr q4, [x2, #16] + prfm pldl1keep, [x1] + ldr q5, [x18, #16] + prfm pldl1keep, [x17] + ldr q6, [x16, #16] + ldr x16, [sp, #1240] // 8-byte Folded Reload + add x15, x15, x16 + b .LBB0_115 + .p2align 2 +.LBB0_117: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1000] // 8-byte Folded Reload + ldr x6, [sp, #984] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + ldr x16, [sp, #1264] // 8-byte Folded Reload + ldr x7, [sp, #976] // 8-byte Folded Reload + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x8, x22, lsl #2] + ldr x19, [sp, #968] // 8-byte Folded Reload + mov x12, xzr + ldr x24, [sp, #616] // 8-byte Folded 
Reload + ldr x25, [sp, #608] // 8-byte Folded Reload + mov x13, xzr + madd x14, x6, x15, x11 + ldr x17, [sp, #576] // 8-byte Folded Reload + ldr x18, [sp, #600] // 8-byte Folded Reload + add x14, x14, x21 + ldr x30, [sp, #648] // 8-byte Folded Reload + ldr s16, [x16, x14, lsl #2] + madd x14, x7, x15, x11 + madd x11, x19, x15, x11 + add x14, x14, x21 + add x11, x11, x21 + str s16, [x8, x6, lsl #2] + fmla v1.2s, v16.2s, v4.s[1] + fmla v3.2s, v16.2s, v5.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v0.2s, v16.2s, v7.s[1] + ldr s16, [x16, x14, lsl #2] + ldr x14, [sp, #1304] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v20.2s + str s16, [x8, x7, lsl #2] + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + ldr s16, [x16, x11, lsl #2] + ldr x11, [sp, #512] // 8-byte Folded Reload + add x11, x8, x11 + fmla v1.2s, v16.2s, v4.s[3] + fmla v3.2s, v16.2s, v5.s[3] + fmla v2.2s, v16.2s, v6.s[3] + fmla v0.2s, v16.2s, v7.s[3] + str s16, [x8, x19, lsl #2] + cmp x14, x20 + b.ge .LBB0_119 + .p2align 2 +.LBB0_118: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #1152] // 8-byte Folded Reload + add x15, x18, x13 + add x14, x14, #1 + add x16, x16, x12 + prfm pldl1keep, [x16] + ldur s4, [x16, #-4] + add x16, x16, x27 + prfm pldl1keep, [x16] + ldur s5, [x16, #-4] + add x16, x16, x27 + prfm pldl1keep, [x16] + ldur s6, [x16, #-4] + add x16, x16, x27 + prfm pldl1keep, [x16] + ldur s7, [x16, #-4] + prfm pldl1keep, [x15] + ldr s16, [x17, x13] + add x13, x13, x23 + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x11, x12] + add x12, x12, #4 + cmp x14, x20 + b.lt .LBB0_118 +.LBB0_119: // %.preheader64 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #448] // 8-byte Folded Reload + ldr x13, [sp, #1096] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #12 + ldr x14, [sp, #1088] // 8-byte Folded Reload + mov w17, #1 // 
=0x1 + mov w18, #2 // =0x2 + mov w16, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x12 + b .LBB0_121 + .p2align 2 +.LBB0_120: // %.loopexit60 + // in Loop: Header=BB0_121 Depth=3 + ldr x2, [sp, #1232] // 8-byte Folded Reload + add x14, x14, x2 + add x13, x13, x2 + mov x2, x15 + mov x15, x1 +.LBB0_121: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_123 Depth 4 + // Child Loop BB0_125 Depth 4 + madd x1, x2, x29, x9 + ldr x23, [sp, #1272] // 8-byte Folded Reload + add x1, x1, x21 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + madd x16, x16, x29, x9 + add x17, x17, x21 + add x16, x16, x21 + str s1, [x23, x1, lsl #2] + str s3, [x23, x17, lsl #2] + add x17, x18, x21 + str s2, [x23, x17, lsl #2] + str s0, [x23, x16, lsl #2] + ldr x16, [sp, #1296] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_126 +// %bb.122: // in Loop: Header=BB0_121 Depth=3 + add x17, x15, #1 + add x18, x15, #2 + add x16, x15, #3 + madd x1, x15, x29, x9 + madd x3, x17, x29, x9 + ldr s16, [x8] + mov x2, xzr + add x1, x1, x21 + madd x4, x18, x29, x9 + madd x5, x16, x29, x9 + add x3, x3, x21 + add x4, x4, x21 + add x5, x5, x21 + ldr s1, [x23, x1, lsl #2] + add x1, x15, #4 + ldr s3, [x23, x3, lsl #2] + madd x3, x15, x28, x10 + lsl x3, x3, #2 + ldr s2, [x23, x4, lsl #2] + ldr x4, [sp, #1280] // 8-byte Folded Reload + ldr s0, [x23, x5, lsl #2] + ldr q7, [x4, x3] + madd x3, x17, x28, x10 + lsl x3, x3, #2 + ldr q6, [x4, x3] + madd x3, x18, x28, x10 + lsl x3, x3, #2 + ldr q5, [x4, x3] + madd x3, x16, x28, x10 + lsl x3, x3, #2 + ldr q4, [x4, x3] + mov x3, x11 + mov x4, x14 + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x22 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_124 + .p2align 2 +.LBB0_123: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_121 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x3, #8 + fmla v1.2s, v16.2s, 
v7.2s + fmla v3.2s, v16.2s, v6.2s + add x2, x2, #4 + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x5] + add x5, x4, x27 + ldp s16, s21, [x3, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v1.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v1.2s, v21.2s, v20.2s + ldp s17, s16, [x3], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v2.2s, v21.2s, v18.2s + prfm pldl1keep, [x4] + fmla v1.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x5] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x5, #-16] + add x5, x5, x27 + fmla v2.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x5] + ldur q5, [x5, #-16] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur q4, [x5, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + cmp x2, x22 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_123 +.LBB0_124: // in Loop: Header=BB0_121 Depth=3 + ldr s21, [x8, x6, lsl #2] + fmla v1.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + ldr s16, [x8, x7, lsl #2] + ldr s22, [x8, x19, lsl #2] + ldr x4, [sp, #1304] // 8-byte Folded Reload + mov x2, x13 + mov x3, x12 + fmla v1.2s, v21.2s, v7.s[1] + fmla v3.2s, v21.2s, v6.s[1] + fmla v2.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + fmla v1.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v1.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v2.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x4, x20 + b.ge .LBB0_120 + .p2align 2 +.LBB0_125: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_121 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x2, x27 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x5] + ldur s5, [x5, #-4] + add x5, x5, x27 + add x2, x2, #4 + prfm pldl1keep, 
[x5] + ldur s6, [x5, #-4] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur s7, [x5, #-4] + prfm pldl1keep, [x3] + ldur s16, [x3, #-4] + add x3, x3, #4 + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x4, x20 + b.lt .LBB0_125 + b .LBB0_120 + .p2align 2 +.LBB0_126: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1296] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_132 +// %bb.127: // in Loop: Header=BB0_7 Depth=2 + ldr x16, [sp, #1296] // 8-byte Folded Reload + ldr x17, [sp, #1280] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + ldr s4, [x8] + madd x12, x16, x28, x10 + add x13, x16, #1 + lsl x12, x12, #2 + ldr q3, [x17, x12] + madd x12, x13, x28, x10 + madd x13, x13, x29, x9 + lsl x12, x12, #2 + add x13, x13, x21 + ldr q2, [x17, x12] + madd x12, x16, x29, x9 + ldr x16, [sp, #1272] // 8-byte Folded Reload + add x12, x12, x21 + ldr s0, [x16, x13, lsl #2] + ldr s1, [x16, x12, lsl #2] + ext v6.16b, v3.16b, v3.16b, #8 + cmp xzr, x22 + ext v5.16b, v2.16b, v2.16b, #8 + b.ge .LBB0_129 + .p2align 2 +.LBB0_128: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x2, x8, x14 + fmla v1.2s, v4.2s, v3.2s + fmla v0.2s, v4.2s, v2.2s + add x16, x25, x14 + add x3, x2, #20 + add x18, x24, x14 + add x17, x16, #32 + add x1, x18, #32 + prfm pldl1keep, [x3] + ldp s4, s7, [x2, #4] + add x15, x15, #4 + add x14, x14, #16 + fmla v0.2s, v4.2s, v2.s[1] + fmla v1.2s, v4.2s, v3.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x2, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x1] + fmla v1.2s, v5.2s, v3.s[3] + ldr q3, [x18, #16] + prfm pldl1keep, [x17] + fmla v0.2s, v5.2s, v2.s[3] + ldr q2, [x16, #16] + ext v6.16b, v3.16b, v3.16b, #8 + cmp x15, x22 + ext v5.16b, v2.16b, v2.16b, #8 + b.lt .LBB0_128 +.LBB0_129: // in Loop: Header=BB0_7 Depth=2 + ldr s7, [x8, x6, lsl #2] + fmla v1.2s, v4.2s, v3.2s + fmla v0.2s, v4.2s, 
v2.2s + ldr s4, [x8, x7, lsl #2] + ldr x15, [sp, #512] // 8-byte Folded Reload + ldr x16, [sp, #1304] // 8-byte Folded Reload + mov x14, xzr + add x15, x8, x15 + fmla v1.2s, v7.2s, v3.s[1] + fmla v0.2s, v7.2s, v2.s[1] + ldr s7, [x8, x19, lsl #2] + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + fmla v1.2s, v7.2s, v3.s[3] + fmla v0.2s, v7.2s, v2.s[3] + cmp x16, x20 + b.ge .LBB0_131 + .p2align 2 +.LBB0_130: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x2, [sp, #680] // 8-byte Folded Reload + add x17, x15, x14 + add x1, x30, x14 + add x16, x16, #1 + add x17, x17, #4 + add x1, x1, #4 + prfm pldl1keep, [x1] + add x18, x2, x14 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x30, x14] + prfm pldl1keep, [x17] + ldr s3, [x15, x14] + fmla v1.2s, v3.2s, v2.2s + ldr s2, [x2, x14] + add x14, x14, #4 + fmla v0.2s, v3.2s, v2.2s + cmp x16, x20 + b.lt .LBB0_130 +.LBB0_131: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #1272] // 8-byte Folded Reload + str s1, [x14, x12, lsl #2] + str s0, [x14, x13, lsl #2] +.LBB0_132: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_5 +// %bb.133: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1128] // 8-byte Folded Reload + ldr x15, [sp, #1272] // 8-byte Folded Reload + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x28, x10 + ldr x13, [sp, #1280] // 8-byte Folded Reload + ldr s2, [x8] + ldr x14, [sp, #896] // 8-byte Folded Reload + add x9, x9, x21 + lsl x10, x10, #2 + ldr s0, [x15, x9, lsl #2] + ldr q1, [x13, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x22 + b.ge .LBB0_135 + .p2align 2 +.LBB0_134: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x11, #8 + fmla v0.2s, v2.2s, v1.2s + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x11, #-8] + 
fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x11], #16 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x12, x22 + b.lt .LBB0_134 +.LBB0_135: // in Loop: Header=BB0_7 Depth=2 + ldr s4, [x8, x6, lsl #2] + fmla v0.2s, v2.2s, v1.2s + ldr x11, [sp, #512] // 8-byte Folded Reload + ldr s5, [x8, x7, lsl #2] + ldr s2, [x8, x19, lsl #2] + mov x10, xzr + add x8, x8, x11 + ldr x11, [sp, #1304] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + fmla v0.2s, v5.2s, v3.2s + fmla v0.2s, v2.2s, v1.s[3] + cmp x11, x20 + b.ge .LBB0_4 + .p2align 2 +.LBB0_136: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x10 + add x13, x14, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + prfm pldl1keep, [x12] + ldr s1, [x8, x10] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x20 + b.lt .LBB0_136 + b .LBB0_4 +.LBB0_137: + ldr x0, [sp, #16] // 8-byte Folded Reload + bl free + add sp, sp, #1312 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_4d_nn_mlir, .Lfunc_end0-sbatch_matmul_4d_nn_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s new file mode 100644 
index 00000000000000..89f885cbd35df1 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s @@ -0,0 +1,3208 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_4d_nt_mlir // -- Begin function sbatch_matmul_4d_nt_mlir + .p2align 4 + .type sbatch_matmul_4d_nt_mlir,@function +sbatch_matmul_4d_nt_mlir: // @sbatch_matmul_4d_nt_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! // 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #688 + .cfi_def_cfa_offset 848 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x5, #0 + ldr x13, [sp, #912] + ldr x14, [sp, #848] + mov x20, x6 + cinv x8, x5, lt + ldr x28, [sp, #1032] + ldr x22, [sp, #856] + mov x27, x2 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + ldr x25, [sp, #944] + str x7, [sp, #664] // 8-byte Folded Spill + stp x13, x4, [sp, #296] // 16-byte Folded Spill + str x3, [sp, #32] // 8-byte Folded Spill + mov x19, x1 + asr x9, x9, #1 + str x14, [sp, #656] // 8-byte Folded Spill + str x5, [sp, #528] // 8-byte Folded Spill + cinv x21, x9, lt + ldr x9, [sp, #1024] + cmp x8, #0 + csel x8, x10, x8, lt + ldr x10, [sp, #976] + 
cmp x5, #0 + asr x8, x8, #2 + cinv x29, x8, lt + cmp x13, #0 + str x9, [sp, #520] // 8-byte Folded Spill + ldr x9, [sp, #1016] + cinv x8, x13, lt + add x11, x8, #7 + add x12, x8, #3 + str x9, [sp, #512] // 8-byte Folded Spill + ldr x9, [sp, #968] + stp x9, x10, [sp, #480] // 16-byte Folded Spill + add x9, x8, x8, lsr #63 + add x10, x8, #15 + asr x9, x9, #1 + cinv x14, x9, lt + cmp x8, #0 + csel x9, x10, x8, lt + csel x10, x11, x8, lt + ldr x11, [sp, #888] + csel x8, x12, x8, lt + cmp x13, #0 + str x14, [sp, #616] // 8-byte Folded Spill + asr x9, x9, #4 + asr x10, x10, #3 + asr x8, x8, #2 + cinv x24, x9, lt + cinv x26, x10, lt + cinv x23, x8, lt + lsl x8, x24, #4 + str x11, [sp, #672] // 8-byte Folded Spill + ldr x11, [sp, #880] + str x8, [sp, #568] // 8-byte Folded Spill + lsl x8, x23, #2 + str x11, [sp, #648] // 8-byte Folded Spill + ldr x11, [sp, #936] + str x11, [sp, #632] // 8-byte Folded Spill + ldr x11, [sp, #928] + str x11, [sp, #624] // 8-byte Folded Spill + lsl x11, x26, #3 + stp x8, x11, [sp, #440] // 16-byte Folded Spill + lsl x8, x14, #1 + str x8, [sp, #432] // 8-byte Folded Spill + lsl x8, x6, #6 + add x0, x8, #64 + str x8, [sp, #640] // 8-byte Folded Spill + bl malloc + add x12, x0, #63 + mul x9, x24, x25 + ldr x1, [sp, #672] // 8-byte Folded Reload + ldr x2, [sp, #648] // 8-byte Folded Reload + and x24, x12, #0xffffffffffffffc0 + ldr x12, [sp, #624] // 8-byte Folded Reload + mul x15, x21, x22 + lsl x8, x29, #2 + str x8, [sp, #680] // 8-byte Folded Spill + lsl x8, x21, #1 + mov w11, #1 // =0x1 + str x8, [sp, #592] // 8-byte Folded Spill + negs x8, x20 + bfi x11, x29, #2, #62 + and x10, x20, #0x3 + lsl x21, x22, #2 + str x0, [sp, #8] // 8-byte Folded Spill + mul x18, x22, x11 + and x8, x8, #0x3 + add x11, x20, x15, lsl #1 + lsl x12, x12, #2 + lsl x0, x27, #2 + mov w14, #1 // =0x1 + add x1, x2, x1, lsl #2 + str x12, [sp, #24] // 8-byte Folded Spill + ldr x12, [sp, #632] // 8-byte Folded Reload + csneg x8, x10, x8, mi + mul x10, x26, x25 + bfi x14, x23, 
#2, #62 + add x2, x0, x19 + mul x16, x25, x14 + add x4, x1, #4 + add x5, x2, #4 + add x9, x4, x9, lsl #6 + mul x17, x29, x22 + sub x29, x20, x8 + mul x13, x23, x25 + lsl x23, x22, #4 + add x2, x2, x23 + add x2, x2, #32 + lsl x12, x12, #2 + str x2, [sp, #152] // 8-byte Folded Spill + sub x2, x24, x8, lsl #6 + ldr x6, [sp, #640] // 8-byte Folded Reload + str x12, [sp, #288] // 8-byte Folded Spill + lsl x12, x25, #6 + add x18, x0, x18, lsl #2 + str x27, [sp, #504] // 8-byte Folded Spill + str x12, [sp, #472] // 8-byte Folded Spill + add x12, x20, x21 + add x17, x0, x17, lsl #4 + lsl x27, x25, #2 + sub x14, x12, x8 + sub x12, x11, x8 + add x15, x0, x15, lsl #3 + lsl x0, x8, #2 + add x12, x5, x12, lsl #2 + ldr x11, [sp, #616] // 8-byte Folded Reload + add x2, x2, x6 + add x6, x19, x18 + add x16, x1, x16, lsl #2 + add x7, x19, x17 + str xzr, [sp, #176] // 8-byte Folded Spill + mov x3, xzr + stp x9, x12, [sp, #96] // 16-byte Folded Spill + add x9, x4, x10, lsl #5 + lsl x10, x20, #4 + lsl x12, x20, #3 + add x13, x1, x13, lsl #4 + stp x13, x16, [sp, #136] // 16-byte Folded Spill + add x13, x15, x19 + add x13, x13, #32 + str x9, [sp, #88] // 8-byte Folded Spill + lsl x9, x20, #5 + mul x11, x11, x25 + lsl x25, x20, #2 + stp x10, x9, [sp, #248] // 16-byte Folded Spill + sub x9, x9, x8, lsl #5 + str x13, [sp, #128] // 8-byte Folded Spill + add x13, x15, x25 + sub x10, x10, x8, lsl #4 + sub x13, x13, x0 + add x18, x18, x25 + add x17, x17, x25 + stp x9, x12, [sp, #232] // 16-byte Folded Spill + sub x8, x12, x8, lsl #3 + sub x12, x29, #3 + add x13, x19, x13 + str x12, [sp, #648] // 8-byte Folded Spill + sub x12, x29, #2 + add x14, x5, x14, lsl #2 + sub x18, x18, x0 + str x12, [sp, #640] // 8-byte Folded Spill + sub x12, x29, #1 + sub x17, x17, x0 + stp x13, x14, [sp, #112] // 16-byte Folded Spill + str x12, [sp, #632] // 8-byte Folded Spill + ldr x12, [sp, #664] // 8-byte Folded Reload + sub x13, x25, x0 + ldr x0, [sp, #568] // 8-byte Folded Reload + add x9, x9, #32 + add x11, x4, 
x11, lsl #3 + stp x9, x8, [sp, #216] // 16-byte Folded Spill + str x10, [sp, #184] // 8-byte Folded Spill + add x10, x10, #16 + add x8, x8, #8 + add x18, x19, x18 + add x17, x19, x17 + stp x8, x10, [sp, #200] // 16-byte Folded Spill + add x8, x13, #4 + str x19, [sp, #496] // 8-byte Folded Spill + stp x13, x25, [sp, #264] // 16-byte Folded Spill + lsl x12, x12, #2 + stp x5, x4, [sp, #160] // 16-byte Folded Spill + str x18, [sp, #376] // 8-byte Folded Spill + sub x19, x29, #4 + str x12, [sp, #16] // 8-byte Folded Spill + ldr x12, [sp, #656] // 8-byte Folded Reload + mov x9, x11 + str x8, [sp, #192] // 8-byte Folded Spill + str x7, [sp, #600] // 8-byte Folded Spill + str x6, [sp, #608] // 8-byte Folded Spill + lsl x12, x12, #2 + stp x17, x6, [sp, #72] // 16-byte Folded Spill + stp x2, x17, [sp, #360] // 16-byte Folded Spill + str x18, [sp, #64] // 8-byte Folded Spill + str x12, [sp, #280] // 8-byte Folded Spill + add x12, x24, #256 + str x12, [sp, #624] // 8-byte Folded Spill + add x12, x2, #64 + str x12, [sp, #616] // 8-byte Folded Spill + b .LBB0_2 + .p2align 2 +.LBB0_1: // %.loopexit40 + // in Loop: Header=BB0_2 Depth=1 + ldp x10, x9, [sp, #16] // 16-byte Folded Reload + ldr x8, [sp, #168] // 8-byte Folded Reload + add x8, x8, x9 + ldr x3, [sp, #40] // 8-byte Folded Reload + str x8, [sp, #168] // 8-byte Folded Spill + ldr x8, [sp, #160] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #160] // 8-byte Folded Spill + ldr x8, [sp, #152] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #152] // 8-byte Folded Spill + ldr x8, [sp, #120] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #120] // 8-byte Folded Spill + ldr x8, [sp, #176] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #176] // 8-byte Folded Spill + ldr x8, [sp, #128] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #128] // 8-byte Folded Spill + ldr x8, [sp, #104] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #104] // 8-byte Folded Spill + ldr x8, [sp, 
#112] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #112] // 8-byte Folded Spill + ldr x8, [sp, #96] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #96] // 8-byte Folded Spill + ldr x8, [sp, #64] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #64] // 8-byte Folded Spill + ldr x8, [sp, #72] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #72] // 8-byte Folded Spill + ldr x8, [sp, #88] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #88] // 8-byte Folded Spill + ldr x8, [sp, #144] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #144] // 8-byte Folded Spill + ldr x8, [sp, #136] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #136] // 8-byte Folded Spill + ldp x7, x8, [sp, #48] // 16-byte Folded Reload + add x9, x8, x9 + ldr x8, [sp, #80] // 8-byte Folded Reload + add x7, x7, x10 + add x8, x8, x10 + str x8, [sp, #80] // 8-byte Folded Spill +.LBB0_2: // =>This Loop Header: Depth=1 + // Child Loop BB0_7 Depth 2 + // Child Loop BB0_11 Depth 3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + // Child Loop BB0_23 Depth 4 + // Child Loop BB0_25 Depth 4 + // Child Loop BB0_29 Depth 4 + // Child Loop BB0_31 Depth 4 + // Child Loop BB0_37 Depth 3 + // Child Loop BB0_40 Depth 3 + // Child Loop BB0_42 Depth 4 + // Child Loop BB0_44 Depth 4 + // Child Loop BB0_47 Depth 3 + // Child Loop BB0_49 Depth 3 + // Child Loop BB0_53 Depth 3 + // Child Loop BB0_55 Depth 3 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_62 Depth 3 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_75 Depth 3 + // Child Loop BB0_77 Depth 3 + // Child Loop BB0_81 Depth 3 + // Child Loop BB0_84 Depth 3 + // Child Loop BB0_86 Depth 4 + // Child Loop BB0_88 Depth 4 + // Child Loop BB0_91 Depth 3 + // Child Loop BB0_93 Depth 3 + // Child Loop BB0_97 Depth 3 + // Child Loop BB0_99 
Depth 3 + // Child Loop BB0_103 Depth 3 + // Child Loop BB0_106 Depth 3 + // Child Loop BB0_108 Depth 4 + // Child Loop BB0_110 Depth 4 + // Child Loop BB0_113 Depth 3 + // Child Loop BB0_115 Depth 3 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + ldr x8, [sp, #32] // 8-byte Folded Reload + cmp x3, x8 + b.ge .LBB0_122 +// %bb.3: // in Loop: Header=BB0_2 Depth=1 + add x8, x3, #1 + str x9, [sp, #56] // 8-byte Folded Spill + mov x25, xzr + str x9, [sp, #328] // 8-byte Folded Spill + stp x8, x7, [sp, #40] // 16-byte Folded Spill + ldr x8, [sp, #80] // 8-byte Folded Reload + str x3, [sp, #672] // 8-byte Folded Spill + stp x8, x7, [sp, #336] // 16-byte Folded Spill + ldp x9, x8, [sp, #136] // 16-byte Folded Reload + stp x8, x9, [sp, #400] // 16-byte Folded Spill + ldr x9, [sp, #88] // 8-byte Folded Reload + ldp x11, x10, [sp, #64] // 16-byte Folded Reload + str x10, [sp, #352] // 8-byte Folded Spill + ldp x10, x8, [sp, #96] // 16-byte Folded Reload + stp x8, x11, [sp, #416] // 16-byte Folded Spill + ldr x8, [sp, #128] // 8-byte Folded Reload + stp x10, x9, [sp, #384] // 16-byte Folded Spill + str x8, [sp, #456] // 8-byte Folded Spill + ldp x12, x8, [sp, #168] // 16-byte Folded Reload + str x8, [sp, #536] // 8-byte Folded Spill + ldp x11, x8, [sp, #112] // 16-byte Folded Reload + str x8, [sp, #552] // 8-byte Folded Spill + ldp x8, x16, [sp, #152] // 16-byte Folded Reload + str x8, [sp, #544] // 8-byte Folded Spill + b .LBB0_7 + .p2align 2 +.LBB0_4: // in Loop: Header=BB0_7 Depth=2 + str s0, [x6, x9, lsl #2] +.LBB0_5: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x16, [sp, #560] // 8-byte Folded Reload +.LBB0_6: // %.backedge41 + // in Loop: Header=BB0_7 Depth=2 + ldp x9, x8, [sp, #280] // 16-byte Folded Reload + ldr x10, [sp, #544] // 8-byte Folded Reload + add x10, x10, x9 + ldp x25, x12, [sp, #312] // 16-byte Folded Reload + add x12, x12, x8 + ldr x0, [sp, #568] // 8-byte Folded Reload + add x16, x16, x9 + str x10, [sp, #544] // 8-byte Folded Spill 
+ ldr x10, [sp, #552] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #552] // 8-byte Folded Spill + ldr x10, [sp, #536] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #536] // 8-byte Folded Spill + ldp x10, x11, [sp, #456] // 16-byte Folded Reload + add x10, x10, x9 + add x11, x11, x9 + str x10, [sp, #456] // 8-byte Folded Spill + ldr x10, [sp, #416] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #416] // 8-byte Folded Spill + ldr x10, [sp, #384] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #384] // 8-byte Folded Spill + ldr x10, [sp, #424] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #424] // 8-byte Folded Spill + ldr x10, [sp, #352] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #352] // 8-byte Folded Spill + ldr x10, [sp, #392] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #392] // 8-byte Folded Spill + ldr x10, [sp, #400] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #400] // 8-byte Folded Spill + ldr x10, [sp, #408] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #408] // 8-byte Folded Spill + ldr x10, [sp, #328] // 8-byte Folded Reload + add x10, x10, x8 + ldr x8, [sp, #336] // 8-byte Folded Reload + add x8, x8, x9 + stp x10, x8, [sp, #328] // 16-byte Folded Spill + ldr x8, [sp, #344] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #344] // 8-byte Folded Spill +.LBB0_7: // Parent Loop BB0_2 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_11 Depth 3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + // Child Loop BB0_23 Depth 4 + // Child Loop BB0_25 Depth 4 + // Child Loop BB0_29 Depth 4 + // Child Loop BB0_31 Depth 4 + // Child Loop BB0_37 Depth 3 + // Child Loop BB0_40 Depth 3 + // Child Loop BB0_42 Depth 4 + // Child Loop BB0_44 Depth 4 + // Child Loop BB0_47 Depth 3 + // Child Loop BB0_49 Depth 3 + // Child Loop BB0_53 Depth 3 + // Child Loop BB0_55 
Depth 3 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_62 Depth 3 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_75 Depth 3 + // Child Loop BB0_77 Depth 3 + // Child Loop BB0_81 Depth 3 + // Child Loop BB0_84 Depth 3 + // Child Loop BB0_86 Depth 4 + // Child Loop BB0_88 Depth 4 + // Child Loop BB0_91 Depth 3 + // Child Loop BB0_93 Depth 3 + // Child Loop BB0_97 Depth 3 + // Child Loop BB0_99 Depth 3 + // Child Loop BB0_103 Depth 3 + // Child Loop BB0_106 Depth 3 + // Child Loop BB0_108 Depth 4 + // Child Loop BB0_110 Depth 4 + // Child Loop BB0_113 Depth 3 + // Child Loop BB0_115 Depth 3 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + ldr x8, [sp, #304] // 8-byte Folded Reload + cmp x25, x8 + b.ge .LBB0_1 +// %bb.8: // in Loop: Header=BB0_7 Depth=2 + mov x10, xzr + add x8, x25, #1 + mov x1, x12 + str x11, [sp, #464] // 8-byte Folded Spill + stp x8, x12, [sp, #312] // 16-byte Folded Spill + str x16, [sp, #560] // 8-byte Folded Spill + b .LBB0_11 + .p2align 2 +.LBB0_9: // in Loop: Header=BB0_11 Depth=3 + stp q3, q2, [x10] + stp q1, q0, [x10, #32] +.LBB0_10: // %.backedge + // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #472] // 8-byte Folded Reload + ldr x1, [sp, #584] // 8-byte Folded Reload + add x1, x1, x8 + ldr x10, [sp, #576] // 8-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload +.LBB0_11: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + // Child Loop BB0_23 Depth 4 + // Child Loop BB0_25 Depth 4 + // Child Loop BB0_29 Depth 4 + // Child Loop BB0_31 Depth 4 + ldp x9, x8, [sp, #496] // 16-byte Folded Reload + cmp x10, x0 + add x26, x9, x8, lsl #2 + b.ge .LBB0_32 +// %bb.12: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr 
x11, [sp, #672] // 8-byte Folded Reload + mov x13, xzr + mul x9, x25, x8 + ldr x8, [sp, #512] // 8-byte Folded Reload + madd x12, x11, x8, x9 + ldp x9, x8, [sp, #480] // 16-byte Folded Reload + add x11, x9, x8, lsl #2 + add x14, x12, x10 + add x8, x10, #16 + add x15, x14, x28 + str x8, [sp, #576] // 8-byte Folded Spill + add x15, x11, x15, lsl #2 + add x9, x11, x14, lsl #2 + ldp q3, q1, [x15, #32] + ldp q5, q4, [x15] + lsl x15, x28, #1 + ldp q17, q6, [x9, #32] + ldp q2, q0, [x9] + add x9, x14, x15 + add x15, x15, x28 + add x14, x14, x15 + add x9, x11, x9, lsl #2 + mov x15, x1 + add x14, x11, x14, lsl #2 + ldp q18, q7, [x9, #32] + ldp q21, q20, [x9] + ldp q19, q16, [x14, #32] + ldp q23, q22, [x14] + mov x14, x16 + cmp xzr, x20 + b.ge .LBB0_14 + .p2align 2 +.LBB0_13: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x16, x14, x21 + prfm pldl1keep, [x14] + ldur s27, [x14, #-4] + add x14, x14, #4 + add x17, x16, x21 + prfm pldl1keep, [x16] + ldur s28, [x16, #-4] + add x16, x15, x27 + add x18, x17, x21 + prfm pldl1keep, [x17] + ldur s26, [x17, #-4] + sub x17, x16, #4 + prfm pldl1keep, [x18] + ldur s25, [x18, #-4] + add x18, x16, x27 + prfm pldl1keep, [x15] + ldur s24, [x15, #-4] + add x15, x15, #4 + prfm pldl1keep, [x16] + sub x16, x18, #4 + prfm pldl1keep, [x18] + ld1 { v24.s }[1], [x17] + add x17, x18, x27 + prfm pldl1keep, [x17] + ld1 { v24.s }[2], [x16] + add x16, x17, x27 + sub x17, x17, #4 + prfm pldl1keep, [x16] + ldur s29, [x16, #-4] + add x16, x16, x27 + sub x18, x16, #4 + add x0, x16, x27 + ld1 { v24.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v29.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x27 + prfm pldl1keep, [x17] + fmla v2.4s, v24.4s, v27.s[0] + ld1 { v29.s }[2], [x16] + add x16, x17, x27 + sub x17, x17, #4 + fmla v5.4s, v24.4s, v28.s[0] + fmla v21.4s, v24.4s, v26.s[0] + fmla v23.4s, v24.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s30, [x16, 
#-4] + add x16, x16, x27 + sub x18, x16, #4 + add x0, x16, x27 + ld1 { v29.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v30.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x27 + prfm pldl1keep, [x17] + ld1 { v30.s }[2], [x16] + add x16, x17, x27 + sub x17, x17, #4 + fmla v0.4s, v29.4s, v27.s[0] + fmla v4.4s, v29.4s, v28.s[0] + fmla v20.4s, v29.4s, v26.s[0] + fmla v22.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s31, [x16, #-4] + add x16, x16, x27 + sub x18, x16, #4 + add x0, x16, x27 + ld1 { v30.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v31.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x27 + prfm pldl1keep, [x17] + fmla v17.4s, v30.4s, v27.s[0] + ld1 { v31.s }[2], [x16] + sub x16, x17, #4 + fmla v3.4s, v30.4s, v28.s[0] + fmla v18.4s, v30.4s, v26.s[0] + fmla v19.4s, v30.4s, v25.s[0] + ld1 { v31.s }[3], [x16] + add x16, x24, x13, lsl #6 + add x13, x13, #1 + stp q24, q29, [x16] + fmla v6.4s, v31.4s, v27.s[0] + fmla v1.4s, v31.4s, v28.s[0] + fmla v7.4s, v31.4s, v26.s[0] + fmla v16.4s, v31.4s, v25.s[0] + stp q30, q31, [x16, #32] + cmp x13, x20 + b.lt .LBB0_13 +.LBB0_14: // %.preheader + // in Loop: Header=BB0_11 Depth=3 + ldr x16, [sp, #552] // 8-byte Folded Reload + ldr x17, [sp, #544] // 8-byte Folded Reload + mov x13, xzr + mov w2, #2 // =0x2 + str x1, [sp, #584] // 8-byte Folded Spill + mov w1, #1 // =0x1 + mov w0, #3 // =0x3 + mov w18, #4 // =0x4 + b .LBB0_16 + .p2align 2 +.LBB0_15: // %.loopexit + // in Loop: Header=BB0_16 Depth=4 + add x17, x17, x23 + add x16, x16, x23 + mov x13, x18 + mov x18, x3 +.LBB0_16: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Loop Header: Depth=4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + ldr x8, [sp, #680] // 8-byte Folded Reload + madd x13, x13, x28, x12 + cmp x18, x8 + madd x14, x1, x28, x12 + madd x15, x2, x28, x12 + ldr x8, [sp, #648] // 8-byte Folded Reload + add x13, x13, x10 + add x14, x14, 
x10 + add x15, x15, x10 + add x13, x11, x13, lsl #2 + stp q2, q0, [x13] + stp q17, q6, [x13, #32] + add x13, x11, x14, lsl #2 + add x14, x11, x15, lsl #2 + add x15, x24, x8, lsl #6 + ldr x8, [sp, #640] // 8-byte Folded Reload + stp q5, q4, [x13] + stp q3, q1, [x13, #32] + madd x13, x0, x28, x12 + add x13, x13, x10 + stp q21, q20, [x14] + stp q18, q7, [x14, #32] + add x14, x24, x8, lsl #6 + ldr x8, [sp, #632] // 8-byte Folded Reload + add x0, x11, x13, lsl #2 + add x13, x24, x8, lsl #6 + stp q23, q22, [x0] + stp q19, q16, [x0, #32] + b.ge .LBB0_21 +// %bb.17: // in Loop: Header=BB0_16 Depth=4 + madd x5, x18, x28, x12 + add x1, x18, #1 + add x2, x18, #2 + add x0, x18, #3 + ldr x8, [sp, #656] // 8-byte Folded Reload + ldr x9, [sp, #672] // 8-byte Folded Reload + ldp q28, q29, [x24, #32] + ldp q30, q31, [x24] + mov x4, xzr + add x3, x18, #4 + add x5, x5, x10 + mul x6, x25, x8 + ldr x8, [sp, #664] // 8-byte Folded Reload + add x5, x11, x5, lsl #2 + ldp q17, q6, [x5, #32] + ldp q2, q0, [x5] + madd x5, x1, x28, x12 + add x5, x5, x10 + add x5, x11, x5, lsl #2 + ldp q3, q1, [x5, #32] + ldp q5, q4, [x5] + madd x5, x2, x28, x12 + add x5, x5, x10 + add x5, x11, x5, lsl #2 + ldp q18, q7, [x5, #32] + ldp q21, q20, [x5] + madd x5, x0, x28, x12 + add x5, x5, x10 + add x5, x11, x5, lsl #2 + ldp q19, q16, [x5, #32] + ldp q23, q22, [x5] + madd x5, x9, x8, x6 + madd x6, x18, x22, x5 + lsl x6, x6, #2 + ldr q27, [x26, x6] + madd x6, x1, x22, x5 + lsl x6, x6, #2 + ldr q26, [x26, x6] + madd x6, x2, x22, x5 + madd x5, x0, x22, x5 + lsl x6, x6, #2 + lsl x5, x5, #2 + ldr q25, [x26, x6] + ldr q24, [x26, x5] + ldr x6, [sp, #624] // 8-byte Folded Reload + mov x5, x17 + fmla v6.4s, v29.4s, v27.s[0] + cmp xzr, x19 + b.ge .LBB0_19 + .p2align 2 +.LBB0_18: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // Parent Loop BB0_16 Depth=4 + // => This Inner Loop Header: Depth=5 + add x8, x6, #64 + fmla v17.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + 
add x9, x6, #128 + prfm pldl1keep, [x8] + ldp q9, q8, [x6, #-160] + fmla v0.4s, v31.4s, v27.s[0] + ldp q12, q15, [x6, #-192] + fmla v1.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x9] + fmla v18.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + ldp q11, q10, [x6, #-128] + fmla v21.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + ldp q13, q14, [x6, #-96] + fmla v19.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + add x30, x6, #192 + prfm pldl1keep, [x30] + fmla v23.4s, v30.4s, v24.s[0] + fmla v0.4s, v15.4s, v27.s[1] + add x7, x6, #256 + add x8, x5, x21 + fmla v2.4s, v12.4s, v27.s[1] + fmla v17.4s, v9.4s, v27.s[1] + add x4, x4, #4 + fmla v6.4s, v8.4s, v27.s[1] + fmla v5.4s, v12.4s, v26.s[1] + fmla v4.4s, v15.4s, v26.s[1] + fmla v3.4s, v9.4s, v26.s[1] + fmla v1.4s, v8.4s, v26.s[1] + fmla v21.4s, v12.4s, v25.s[1] + fmla v20.4s, v15.4s, v25.s[1] + fmla v18.4s, v9.4s, v25.s[1] + fmla v7.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v22.4s, v15.4s, v24.s[1] + ldp q15, q12, [x6, #-64] + fmla v19.4s, v9.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q9, q8, [x6, #-32] + prfm pldl1keep, [x7] + ldp q28, q29, [x6, #32] + fmla v6.4s, v14.4s, v27.s[2] + ldp q30, q31, [x6] + prfm pldl1keep, [x5] + mov x6, x7 + fmla v17.4s, v13.4s, v27.s[2] + fmla v2.4s, v11.4s, v27.s[2] + fmla v0.4s, v10.4s, v27.s[2] + fmla v1.4s, v14.4s, v26.s[2] + fmla v3.4s, v13.4s, v26.s[2] + fmla v4.4s, v10.4s, v26.s[2] + fmla v5.4s, v11.4s, v26.s[2] + fmla v7.4s, v14.4s, v25.s[2] + fmla v18.4s, v13.4s, v25.s[2] + fmla v20.4s, v10.4s, v25.s[2] + fmla v21.4s, v11.4s, v25.s[2] + fmla v16.4s, v14.4s, v24.s[2] + fmla v19.4s, v13.4s, v24.s[2] + fmla v22.4s, v10.4s, v24.s[2] + fmla v23.4s, v11.4s, v24.s[2] + fmla v0.4s, v12.4s, v27.s[3] + fmla v2.4s, v15.4s, v27.s[3] + fmla v17.4s, v9.4s, v27.s[3] + fmla v6.4s, v8.4s, v27.s[3] + ldur q27, [x5, #-16] + prfm 
pldl1keep, [x8] + add x5, x5, #16 + fmla v5.4s, v15.4s, v26.s[3] + fmla v4.4s, v12.4s, v26.s[3] + fmla v3.4s, v9.4s, v26.s[3] + fmla v1.4s, v8.4s, v26.s[3] + ldur q26, [x8, #-16] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v21.4s, v15.4s, v25.s[3] + fmla v20.4s, v12.4s, v25.s[3] + fmla v18.4s, v9.4s, v25.s[3] + fmla v7.4s, v8.4s, v25.s[3] + ldur q25, [x8, #-16] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v23.4s, v15.4s, v24.s[3] + fmla v22.4s, v12.4s, v24.s[3] + fmla v19.4s, v9.4s, v24.s[3] + fmla v16.4s, v8.4s, v24.s[3] + ldur q24, [x8, #-16] + fmla v6.4s, v29.4s, v27.s[0] + cmp x4, x19 + b.lt .LBB0_18 +.LBB0_19: // in Loop: Header=BB0_16 Depth=4 + ldp q10, q8, [x15, #32] + ldp q11, q12, [x15] + fmla v17.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + fmla v0.4s, v31.4s, v27.s[0] + fmla v1.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + ldp q9, q13, [x14, #32] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + mov x15, x29 + fmla v18.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + fmla v21.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + fmla v19.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + ldp q31, q28, [x13, #32] + fmla v23.4s, v30.4s, v24.s[0] + ldp q29, q30, [x14] + mov x14, x16 + fmla v0.4s, v12.4s, v27.s[1] + fmla v2.4s, v11.4s, v27.s[1] + fmla v17.4s, v10.4s, v27.s[1] + fmla v6.4s, v8.4s, v27.s[1] + fmla v5.4s, v11.4s, v26.s[1] + fmla v4.4s, v12.4s, v26.s[1] + fmla v3.4s, v10.4s, v26.s[1] + fmla v1.4s, v8.4s, v26.s[1] + fmla v21.4s, v11.4s, v25.s[1] + fmla v20.4s, v12.4s, v25.s[1] + fmla v18.4s, v10.4s, v25.s[1] + fmla v7.4s, v8.4s, v25.s[1] + fmla v23.4s, v11.4s, v24.s[1] + fmla v22.4s, v12.4s, v24.s[1] + fmla v19.4s, v10.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + fmla v6.4s, v13.4s, v27.s[2] + ldp q8, q10, [x13] + ldr x13, [sp, #616] // 8-byte Folded Reload + fmla v17.4s, v9.4s, v27.s[2] + fmla v2.4s, v29.4s, v27.s[2] + fmla v0.4s, v30.4s, v27.s[2] + fmla v1.4s, 
v13.4s, v26.s[2] + fmla v3.4s, v9.4s, v26.s[2] + fmla v4.4s, v30.4s, v26.s[2] + fmla v5.4s, v29.4s, v26.s[2] + fmla v7.4s, v13.4s, v25.s[2] + fmla v18.4s, v9.4s, v25.s[2] + fmla v20.4s, v30.4s, v25.s[2] + fmla v21.4s, v29.4s, v25.s[2] + fmla v16.4s, v13.4s, v24.s[2] + fmla v19.4s, v9.4s, v24.s[2] + fmla v22.4s, v30.4s, v24.s[2] + fmla v23.4s, v29.4s, v24.s[2] + fmla v0.4s, v10.4s, v27.s[3] + fmla v2.4s, v8.4s, v27.s[3] + fmla v17.4s, v31.4s, v27.s[3] + fmla v6.4s, v28.4s, v27.s[3] + fmla v5.4s, v8.4s, v26.s[3] + fmla v4.4s, v10.4s, v26.s[3] + fmla v3.4s, v31.4s, v26.s[3] + fmla v1.4s, v28.4s, v26.s[3] + fmla v21.4s, v8.4s, v25.s[3] + fmla v20.4s, v10.4s, v25.s[3] + fmla v18.4s, v31.4s, v25.s[3] + fmla v7.4s, v28.4s, v25.s[3] + fmla v23.4s, v8.4s, v24.s[3] + fmla v22.4s, v10.4s, v24.s[3] + fmla v19.4s, v31.4s, v24.s[3] + fmla v16.4s, v28.4s, v24.s[3] + cmp x29, x20 + b.ge .LBB0_15 + .p2align 2 +.LBB0_20: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // Parent Loop BB0_16 Depth=4 + // => This Inner Loop Header: Depth=5 + prfm pldl1keep, [x13] + ldp q24, q25, [x13, #-64] + add x8, x14, x21 + ldp q26, q27, [x13, #-32] + prfm pldl1keep, [x14] + add x15, x15, #1 + ldur s28, [x14, #-4] + prfm pldl1keep, [x8] + add x14, x14, #4 + add x13, x13, #64 + ldur s29, [x8, #-4] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v6.4s, v27.4s, v28.s[0] + ldur s30, [x8, #-4] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v17.4s, v26.4s, v28.s[0] + fmla v0.4s, v25.4s, v28.s[0] + fmla v2.4s, v24.4s, v28.s[0] + ldur s28, [x8, #-4] + fmla v4.4s, v25.4s, v29.s[0] + fmla v5.4s, v24.4s, v29.s[0] + fmla v3.4s, v26.4s, v29.s[0] + fmla v1.4s, v27.4s, v29.s[0] + fmla v21.4s, v24.4s, v30.s[0] + fmla v20.4s, v25.4s, v30.s[0] + fmla v18.4s, v26.4s, v30.s[0] + fmla v7.4s, v27.4s, v30.s[0] + fmla v23.4s, v24.4s, v28.s[0] + fmla v22.4s, v25.4s, v28.s[0] + fmla v19.4s, v26.4s, v28.s[0] + fmla v16.4s, v27.4s, v28.s[0] + cmp x15, x20 + b.lt .LBB0_20 + b 
.LBB0_15 + .p2align 2 +.LBB0_21: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #680] // 8-byte Folded Reload + ldr x9, [sp, #592] // 8-byte Folded Reload + cmp x8, x9 + b.ge .LBB0_27 +// %bb.22: // in Loop: Header=BB0_11 Depth=3 + ldr x9, [sp, #656] // 8-byte Folded Reload + ldr x0, [sp, #680] // 8-byte Folded Reload + mov x18, xzr + mul x9, x25, x9 + ldr x17, [sp, #664] // 8-byte Folded Reload + ldr x1, [sp, #672] // 8-byte Folded Reload + madd x8, x0, x28, x12 + madd x9, x1, x17, x9 + ldp q20, q21, [x24, #32] + ldp q18, q19, [x24] + ldr x1, [sp, #624] // 8-byte Folded Reload + madd x17, x0, x22, x9 + add x8, x8, x10 + add x16, x11, x8, lsl #2 + add x8, x0, #1 + ldr x0, [sp, #536] // 8-byte Folded Reload + lsl x17, x17, #2 + ldr q17, [x26, x17] + madd x17, x8, x28, x12 + madd x8, x8, x22, x9 + ldp q1, q0, [x16, #32] + ldp q3, q2, [x16] + add x17, x17, x10 + lsl x8, x8, #2 + add x17, x11, x17, lsl #2 + ldr q16, [x26, x8] + ldp q5, q4, [x17, #32] + ldp q7, q6, [x17] + cmp xzr, x19 + b.ge .LBB0_24 + .p2align 2 +.LBB0_23: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr x8, [sp, #608] // 8-byte Folded Reload + add x7, x1, #64 + fmla v0.4s, v21.4s, v17.s[0] + fmla v1.4s, v20.4s, v17.s[0] + fmla v3.4s, v18.4s, v17.s[0] + fmla v2.4s, v19.4s, v17.s[0] + add x9, x1, #128 + add x2, x1, #256 + fmla v4.4s, v21.4s, v16.s[0] + fmla v5.4s, v20.4s, v16.s[0] + add x18, x18, #4 + add x3, x8, x0 + ldr x8, [sp, #600] // 8-byte Folded Reload + prfm pldl1keep, [x7] + ldp q23, q22, [x1, #-160] + ldp q24, q25, [x1, #-192] + fmla v6.4s, v19.4s, v16.s[0] + fmla v7.4s, v18.4s, v16.s[0] + prfm pldl1keep, [x9] + ldp q19, q18, [x1, #-128] + add x4, x3, #32 + ldp q20, q21, [x1, #-96] + fmla v2.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + fmla v6.4s, v25.4s, v16.s[1] + fmla v4.4s, v22.4s, v16.s[1] + add x5, x8, x0 + add x8, x1, #192 + add x0, x0, #16 + fmla v3.4s, v24.4s, v17.s[1] + fmla v1.4s, 
v23.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v5.4s, v23.4s, v16.s[1] + prfm pldl1keep, [x8] + ldp q23, q22, [x1, #-32] + ldp q24, q25, [x1, #-64] + add x6, x5, #32 + prfm pldl1keep, [x6] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v18.4s, v17.s[2] + fmla v4.4s, v21.4s, v16.s[2] + fmla v6.4s, v18.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v3.4s, v19.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v2.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v25.4s, v16.s[3] + fmla v4.4s, v22.4s, v16.s[3] + fmla v3.4s, v24.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + ldr q17, [x5, #16] + prfm pldl1keep, [x4] + fmla v7.4s, v24.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + ldr q16, [x3, #16] + prfm pldl1keep, [x2] + ldp q20, q21, [x1, #32] + ldp q18, q19, [x1] + mov x1, x2 + cmp x18, x19 + b.lt .LBB0_23 +.LBB0_24: // in Loop: Header=BB0_11 Depth=3 + ldp q23, q22, [x15, #32] + ldp q24, q25, [x15] + fmla v0.4s, v21.4s, v17.s[0] + fmla v1.4s, v20.4s, v17.s[0] + fmla v3.4s, v18.4s, v17.s[0] + fmla v2.4s, v19.4s, v17.s[0] + fmla v4.4s, v21.4s, v16.s[0] + fmla v5.4s, v20.4s, v16.s[0] + ldp q20, q21, [x14, #32] + fmla v6.4s, v19.4s, v16.s[0] + fmla v7.4s, v18.4s, v16.s[0] + ldp q19, q18, [x14] + fmla v2.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + ldr x18, [sp, #616] // 8-byte Folded Reload + ldr x0, [sp, #536] // 8-byte Folded Reload + fmla v3.4s, v24.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + ldp x3, x2, [sp, #368] // 16-byte Folded Reload + fmla v7.4s, v24.4s, v16.s[1] + fmla v6.4s, v25.4s, v16.s[1] + ldp q24, q25, [x13] + fmla v5.4s, v23.4s, v16.s[1] + fmla v4.4s, v22.4s, v16.s[1] + ldp q23, q22, [x13, #32] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v18.4s, v17.s[2] + mov x1, x29 + fmla v4.4s, v21.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v3.4s, v19.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v6.4s, v18.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v2.4s, v25.4s, v17.s[3] + 
fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v25.4s, v16.s[3] + fmla v4.4s, v22.4s, v16.s[3] + fmla v3.4s, v24.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + fmla v7.4s, v24.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + cmp x29, x20 + b.ge .LBB0_26 + .p2align 2 +.LBB0_25: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x8, x2, x0 + add x9, x3, x0 + prfm pldl1keep, [x18] + add x1, x1, #1 + add x8, x8, #4 + add x9, x9, #4 + ldp q16, q17, [x18, #-64] + ldp q18, q19, [x18, #-32] + prfm pldl1keep, [x9] + add x18, x18, #64 + ldr s20, [x3, x0] + prfm pldl1keep, [x8] + fmla v0.4s, v19.4s, v20.s[0] + ldr s21, [x2, x0] + fmla v1.4s, v18.4s, v20.s[0] + fmla v2.4s, v17.4s, v20.s[0] + fmla v3.4s, v16.4s, v20.s[0] + fmla v6.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + fmla v5.4s, v18.4s, v21.s[0] + fmla v4.4s, v19.4s, v21.s[0] + add x0, x0, #4 + cmp x1, x20 + b.lt .LBB0_25 +.LBB0_26: // in Loop: Header=BB0_11 Depth=3 + stp q3, q2, [x16] + stp q1, q0, [x16, #32] + stp q7, q6, [x17] + stp q5, q4, [x17, #32] +.LBB0_27: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #528] // 8-byte Folded Reload + ldr x9, [sp, #592] // 8-byte Folded Reload + cmp x9, x8 + ldr x0, [sp, #568] // 8-byte Folded Reload + b.ge .LBB0_10 +// %bb.28: // in Loop: Header=BB0_11 Depth=3 + ldr x17, [sp, #592] // 8-byte Folded Reload + ldr x9, [sp, #664] // 8-byte Folded Reload + mov x16, xzr + madd x8, x17, x28, x12 + ldp q7, q16, [x24, #32] + ldp q6, q5, [x24] + ldr x12, [sp, #624] // 8-byte Folded Reload + ldr x1, [sp, #360] // 8-byte Folded Reload + add x8, x8, x10 + add x10, x11, x8, lsl #2 + ldr x8, [sp, #656] // 8-byte Folded Reload + ldr x11, [sp, #672] // 8-byte Folded Reload + ldp q1, q0, [x10, #32] + ldp q3, q2, [x10] + mul x8, x25, x8 + madd x8, x11, x9, x8 + ldr x11, [sp, #456] // 8-byte Folded Reload + madd x8, x17, x22, x8 + lsl x8, x8, #2 + ldr q4, [x26, x8] + cmp xzr, x19 + b.ge .LBB0_30 + .p2align 2 
+.LBB0_29: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x18, x12, #64 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + add x9, x12, #128 + prfm pldl1keep, [x18] + ldp q18, q17, [x12, #-160] + fmla v3.4s, v6.4s, v4.s[0] + ldp q19, q20, [x12, #-192] + fmla v2.4s, v5.4s, v4.s[0] + prfm pldl1keep, [x9] + ldp q6, q5, [x12, #-128] + ldp q7, q16, [x12, #-96] + add x8, x12, #192 + prfm pldl1keep, [x8] + add x17, x12, #256 + add x16, x16, #4 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, [x12, #-32] + ldp q19, q20, [x12, #-64] + prfm pldl1keep, [x11] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v5.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldur q4, [x11, #-16] + prfm pldl1keep, [x17] + add x11, x11, #16 + ldp q7, q16, [x12, #32] + ldp q6, q5, [x12] + mov x12, x17 + cmp x16, x19 + b.lt .LBB0_29 +.LBB0_30: // in Loop: Header=BB0_11 Depth=3 + ldp q18, q17, [x15, #32] + ldp q19, q20, [x15] + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + fmla v2.4s, v5.4s, v4.s[0] + ldp q6, q5, [x14] + ldp q7, q16, [x14, #32] + ldr x14, [sp, #464] // 8-byte Folded Reload + mov x11, xzr + mov w12, #64 // =0x40 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + fmla v0.4s, v16.4s, v4.s[2] + ldp q18, q17, [x13, #32] + ldp q19, q20, [x13] + fmla v2.4s, v5.4s, v4.s[2] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + add x8, x29, xzr + cmp x8, x20 + b.ge .LBB0_9 + .p2align 2 
+.LBB0_31: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x9, x1, x11, lsl #6 + add x8, x1, x12 + add x12, x12, #64 + prfm pldl1keep, [x8] + ldp q4, q5, [x9] + ldp q6, q7, [x9, #32] + prfm pldl1keep, [x13] + ldr s16, [x14, x11, lsl #2] + add x11, x11, #1 + add x13, x13, #4 + fmla v0.4s, v7.4s, v16.s[0] + fmla v1.4s, v6.4s, v16.s[0] + fmla v2.4s, v5.4s, v16.s[0] + fmla v3.4s, v4.4s, v16.s[0] + add x8, x29, x11 + cmp x8, x20 + b.lt .LBB0_31 + b .LBB0_9 + .p2align 2 +.LBB0_32: // in Loop: Header=BB0_7 Depth=2 + ldp x9, x8, [sp, #480] // 16-byte Folded Reload + ldr x10, [sp, #448] // 8-byte Folded Reload + add x8, x9, x8, lsl #2 + cmp x0, x10 + str x8, [sp, #584] // 8-byte Folded Spill + lsl x8, x28, #1 + str x8, [sp, #576] // 8-byte Folded Spill + b.lt .LBB0_36 +// %bb.33: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #440] // 8-byte Folded Reload + cmp x10, x8 + b.lt .LBB0_58 +.LBB0_34: // in Loop: Header=BB0_7 Depth=2 + ldr x9, [sp, #432] // 8-byte Folded Reload + cmp x8, x9 + b.lt .LBB0_80 +.LBB0_35: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #296] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 + b .LBB0_102 + .p2align 2 +.LBB0_36: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #256] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x13, [sp, #584] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x14, [sp, #576] // 8-byte Folded Reload + ldr x15, [sp, #560] // 8-byte Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #568] // 8-byte Folded Reload + add x8, x9, x8 + add x12, x13, x8, lsl #2 + ldp q3, q2, [x12] + add x12, x8, x28 + add x12, x13, x12, lsl #2 + ldp q1, q0, [x12] + add x12, x8, x14 + add x12, x13, x12, lsl #2 + ldp q5, q4, [x12] + add x12, x14, x28 + add x8, x8, x12 + add x8, 
x13, x8, lsl #2 + ldp q7, q6, [x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_38 + .p2align 2 +.LBB0_37: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x12, [sp, #384] // 8-byte Folded Reload + add x13, x15, x10 + prfm pldl1keep, [x13] + ldur s16, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s17, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s18, [x13, #-4] + add x13, x13, x21 + add x12, x12, x10 + prfm pldl1keep, [x13] + ldur s20, [x13, #-4] + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s19, [x12, #-4] + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x27 + ld1 { v19.s }[1], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + ld1 { v19.s }[2], [x14] + prfm pldl1keep, [x12] + ldur s21, [x12, #-4] + add x12, x12, x27 + ld1 { v19.s }[3], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + ld1 { v21.s }[1], [x13] + sub x14, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x12, x12, #4 + fmla v3.4s, v19.4s, v16.s[0] + fmla v1.4s, v19.4s, v17.s[0] + fmla v5.4s, v19.4s, v18.s[0] + fmla v7.4s, v19.4s, v20.s[0] + ld1 { v21.s }[2], [x14] + ld1 { v21.s }[3], [x12] + add x12, x8, x11, lsl #5 + add x11, x11, #1 + fmla v2.4s, v21.4s, v16.s[0] + fmla v0.4s, v21.4s, v17.s[0] + fmla v4.4s, v21.4s, v18.s[0] + fmla v6.4s, v21.4s, v20.s[0] + stp q19, q21, [x12] + cmp x11, x20 + b.lt .LBB0_37 +.LBB0_38: // %.preheader39 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #216] // 8-byte Folded Reload + ldr x15, [sp, #552] // 8-byte Folded Reload + mov x11, xzr + add x10, x8, #128 + ldr x16, [sp, #544] // 8-byte Folded Reload + mov w18, #1 // =0x1 + mov w2, #2 // =0x2 + mov w1, #3 // =0x3 + mov w17, #4 // =0x4 + add x14, x8, x12 + b .LBB0_40 + .p2align 2 +.LBB0_39: // %.loopexit35 + // in Loop: 
Header=BB0_40 Depth=3 + add x16, x16, x23 + add x15, x15, x23 + mov x11, x17 + mov x17, x3 +.LBB0_40: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_42 Depth 4 + // Child Loop BB0_44 Depth 4 + madd x11, x11, x28, x9 + ldr x7, [sp, #568] // 8-byte Folded Reload + ldr x30, [sp, #584] // 8-byte Folded Reload + add x11, x11, x7 + madd x12, x18, x28, x9 + madd x13, x2, x28, x9 + add x12, x12, x7 + add x13, x13, x7 + add x11, x30, x11, lsl #2 + add x12, x30, x12, lsl #2 + stp q3, q2, [x11] + madd x11, x1, x28, x9 + stp q1, q0, [x12] + add x12, x30, x13, lsl #2 + stp q5, q4, [x12] + add x11, x11, x7 + add x11, x30, x11, lsl #2 + stp q7, q6, [x11] + ldr x11, [sp, #680] // 8-byte Folded Reload + cmp x17, x11 + ldr x11, [sp, #648] // 8-byte Folded Reload + add x13, x8, x11, lsl #5 + ldr x11, [sp, #640] // 8-byte Folded Reload + add x12, x8, x11, lsl #5 + ldr x11, [sp, #632] // 8-byte Folded Reload + add x11, x8, x11, lsl #5 + b.ge .LBB0_45 +// %bb.41: // in Loop: Header=BB0_40 Depth=3 + madd x5, x17, x28, x9 + add x18, x17, #1 + add x2, x17, #2 + add x1, x17, #3 + madd x6, x18, x28, x9 + ldp q20, q21, [x8] + mov x4, xzr + add x3, x17, #4 + add x5, x5, x7 + add x5, x30, x5, lsl #2 + add x6, x6, x7 + add x6, x30, x6, lsl #2 + ldp q3, q2, [x5] + madd x5, x2, x28, x9 + ldp q1, q0, [x6] + madd x6, x1, x28, x9 + add x5, x5, x7 + add x6, x6, x7 + add x5, x30, x5, lsl #2 + ldr x7, [sp, #672] // 8-byte Folded Reload + add x6, x30, x6, lsl #2 + ldp q5, q4, [x5] + ldr x5, [sp, #656] // 8-byte Folded Reload + mul x5, x25, x5 + ldp q7, q6, [x6] + ldr x6, [sp, #664] // 8-byte Folded Reload + madd x5, x7, x6, x5 + madd x6, x17, x22, x5 + lsl x6, x6, #2 + ldr q19, [x26, x6] + madd x6, x18, x22, x5 + lsl x6, x6, #2 + ldr q18, [x26, x6] + madd x6, x2, x22, x5 + madd x5, x1, x22, x5 + lsl x6, x6, #2 + lsl x5, x5, #2 + ldr q17, [x26, x6] + ldr q16, [x26, x5] + mov x5, x10 + mov x6, x16 + cmp xzr, x19 + b.ge .LBB0_43 + .p2align 2 
+.LBB0_42: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_40 Depth=3 + // => This Inner Loop Header: Depth=4 + add x7, x5, #32 + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v19.s[0] + add x4, x4, #4 + prfm pldl1keep, [x7] + ldp q22, q23, [x5, #-96] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + add x7, x5, #96 + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x5, #-64] + prfm pldl1keep, [x7] + add x7, x6, x21 + add x30, x7, x21 + fmla v2.4s, v23.4s, v19.s[1] + fmla v0.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v2.4s, v20.4s, v19.s[2] + ldp q22, q23, [x5, #-32] + fmla v0.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x5], #128 + prfm pldl1keep, [x6] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + ldur q19, [x6, #-16] + prfm pldl1keep, [x7] + fmla v1.4s, v22.4s, v18.s[3] + ldur q18, [x7, #-16] + add x7, x30, x21 + prfm pldl1keep, [x30] + add x6, x6, #16 + fmla v5.4s, v22.4s, v17.s[3] + ldur q17, [x30, #-16] + prfm pldl1keep, [x7] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x7, #-16] + cmp x4, x19 + b.lt .LBB0_42 +.LBB0_43: // in Loop: Header=BB0_40 Depth=3 + ldp q22, q23, [x13] + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v19.s[0] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + mov x13, x29 + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + fmla v2.4s, v23.4s, v19.s[1] + ldp 
q21, q20, [x12] + fmla v0.4s, v23.4s, v18.s[1] + mov x12, x15 + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + ldp q22, q23, [x11] + mov x11, x14 + fmla v2.4s, v20.4s, v19.s[2] + fmla v0.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + fmla v1.4s, v22.4s, v18.s[3] + fmla v5.4s, v22.4s, v17.s[3] + fmla v7.4s, v22.4s, v16.s[3] + cmp x29, x20 + b.ge .LBB0_39 + .p2align 2 +.LBB0_44: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_40 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x12, x21 + prfm pldl1keep, [x11] + ldp q16, q17, [x11, #-32] + prfm pldl1keep, [x12] + ldur s18, [x12, #-4] + add x13, x13, #1 + add x12, x12, #4 + prfm pldl1keep, [x4] + ldur s19, [x4, #-4] + add x4, x4, x21 + add x11, x11, #32 + prfm pldl1keep, [x4] + ldur s20, [x4, #-4] + add x4, x4, x21 + fmla v2.4s, v17.4s, v18.s[0] + prfm pldl1keep, [x4] + ldur s21, [x4, #-4] + fmla v3.4s, v16.4s, v18.s[0] + fmla v0.4s, v17.4s, v19.s[0] + fmla v1.4s, v16.4s, v19.s[0] + fmla v4.4s, v17.4s, v20.s[0] + fmla v5.4s, v16.4s, v20.s[0] + fmla v6.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + cmp x13, x20 + b.lt .LBB0_44 + b .LBB0_39 + .p2align 2 +.LBB0_45: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #680] // 8-byte Folded Reload + ldr x15, [sp, #592] // 8-byte Folded Reload + mov x7, x30 + cmp x14, x15 + b.ge .LBB0_51 +// %bb.46: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #656] // 8-byte Folded Reload + ldr x18, [sp, #664] // 8-byte Folded Reload + mov x16, xzr + mul x15, x25, x15 + ldr x3, [sp, #672] // 
8-byte Folded Reload + ldr x2, [sp, #680] // 8-byte Folded Reload + add x17, x2, #1 + madd x14, x2, x28, x9 + ldr x1, [sp, #568] // 8-byte Folded Reload + ldp q6, q7, [x8] + madd x18, x3, x18, x15 + madd x15, x2, x22, x18 + add x14, x14, x1 + add x14, x7, x14, lsl #2 + lsl x15, x15, #2 + ldr q4, [x26, x15] + madd x15, x17, x28, x9 + madd x17, x17, x22, x18 + ldp q1, q0, [x14] + ldr x18, [sp, #536] // 8-byte Folded Reload + add x15, x15, x1 + lsl x17, x17, #2 + add x15, x7, x15, lsl #2 + ldr q5, [x26, x17] + mov x17, x10 + ldp q3, q2, [x15] + cmp xzr, x19 + b.ge .LBB0_48 + .p2align 2 +.LBB0_47: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x6, x17, #32 + ldr x1, [sp, #608] // 8-byte Folded Reload + ldr x3, [sp, #600] // 8-byte Folded Reload + fmla v1.4s, v6.4s, v4.s[0] + prfm pldl1keep, [x6] + ldp q16, q17, [x17, #-96] + fmla v0.4s, v7.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + ldp q7, q6, [x17, #-64] + add x5, x17, #96 + prfm pldl1keep, [x5] + add x16, x16, #4 + add x1, x1, x18 + add x3, x3, x18 + add x18, x18, #16 + fmla v0.4s, v17.4s, v4.s[1] + fmla v2.4s, v17.4s, v5.s[1] + add x2, x1, #32 + add x4, x3, #32 + fmla v1.4s, v16.4s, v4.s[1] + fmla v3.4s, v16.4s, v5.s[1] + ldp q16, q17, [x17, #-32] + fmla v0.4s, v6.4s, v4.s[2] + fmla v2.4s, v6.4s, v5.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v7.4s, v5.s[2] + fmla v0.4s, v17.4s, v4.s[3] + fmla v2.4s, v17.4s, v5.s[3] + ldp q6, q7, [x17], #128 + prfm pldl1keep, [x4] + fmla v1.4s, v16.4s, v4.s[3] + ldr q4, [x3, #16] + prfm pldl1keep, [x2] + fmla v3.4s, v16.4s, v5.s[3] + ldr q5, [x1, #16] + cmp x16, x19 + b.lt .LBB0_47 +.LBB0_48: // in Loop: Header=BB0_7 Depth=2 + ldp q16, q17, [x13] + fmla v1.4s, v6.4s, v4.s[0] + fmla v0.4s, v7.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + ldp q7, q6, [x12] + ldr x18, [sp, #232] // 8-byte Folded Reload + mov x16, xzr + mov x17, xzr + mov x1, x29 + fmla v0.4s, v17.4s, 
v4.s[1] + fmla v2.4s, v17.4s, v5.s[1] + add x18, x8, x18 + fmla v1.4s, v16.4s, v4.s[1] + fmla v3.4s, v16.4s, v5.s[1] + ldp q16, q17, [x11] + fmla v0.4s, v6.4s, v4.s[2] + fmla v2.4s, v6.4s, v5.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v7.4s, v5.s[2] + fmla v0.4s, v17.4s, v4.s[3] + fmla v2.4s, v17.4s, v5.s[3] + fmla v1.4s, v16.4s, v4.s[3] + fmla v3.4s, v16.4s, v5.s[3] + cmp x29, x20 + b.ge .LBB0_50 + .p2align 2 +.LBB0_49: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x2, [sp, #424] // 8-byte Folded Reload + ldr x6, [sp, #352] // 8-byte Folded Reload + add x4, x18, x17, lsl #3 + add x5, x18, x16 + add x1, x1, #1 + add x16, x16, #32 + add x4, x4, #32 + prfm pldl1keep, [x4] + ldp q4, q5, [x5] + add x2, x2, x17 + add x3, x6, x17 + add x2, x2, #4 + add x3, x3, #4 + prfm pldl1keep, [x3] + ldr s6, [x6, x17] + prfm pldl1keep, [x2] + ldr x2, [sp, #424] // 8-byte Folded Reload + ldr s7, [x2, x17] + add x17, x17, #4 + fmla v0.4s, v5.4s, v6.s[0] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v5.4s, v7.s[0] + fmla v3.4s, v4.4s, v7.s[0] + cmp x1, x20 + b.lt .LBB0_49 +.LBB0_50: // in Loop: Header=BB0_7 Depth=2 + stp q1, q0, [x14] + stp q3, q2, [x15] +.LBB0_51: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #528] // 8-byte Folded Reload + ldr x15, [sp, #592] // 8-byte Folded Reload + cmp x15, x14 + b.ge .LBB0_57 +// %bb.52: // in Loop: Header=BB0_7 Depth=2 + ldr x17, [sp, #592] // 8-byte Folded Reload + ldr x15, [sp, #568] // 8-byte Folded Reload + mov x14, xzr + madd x9, x17, x28, x9 + ldr x16, [sp, #664] // 8-byte Folded Reload + ldr x18, [sp, #672] // 8-byte Folded Reload + ldp q4, q3, [x8] + add x9, x9, x15 + ldr x15, [sp, #656] // 8-byte Folded Reload + add x9, x7, x9, lsl #2 + mul x15, x25, x15 + ldp q1, q0, [x9] + madd x15, x18, x16, x15 + madd x15, x17, x22, x15 + lsl x15, x15, #2 + ldr q2, [x26, x15] + ldr x15, [sp, #456] // 8-byte Folded Reload + cmp xzr, x19 + b.ge .LBB0_54 + .p2align 2 +.LBB0_53: // Parent 
Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x17, x10, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x16, x10, #96 + prfm pldl1keep, [x17] + ldp q5, q6, [x10, #-96] + add x14, x14, #4 + ldp q4, q3, [x10, #-64] + prfm pldl1keep, [x16] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x10, #-32] + prfm pldl1keep, [x15] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x15, #-16] + ldp q4, q3, [x10], #128 + add x15, x15, #16 + cmp x14, x19 + b.lt .LBB0_53 +.LBB0_54: // in Loop: Header=BB0_7 Depth=2 + ldp q5, q6, [x13] + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + ldp q4, q3, [x12] + mov x10, xzr + mov x14, xzr + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x11] + ldr x11, [sp, #232] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + add x8, x8, x11 + mov x11, x29 + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x29, x20 + b.ge .LBB0_56 + .p2align 2 +.LBB0_55: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #464] // 8-byte Folded Reload + add x13, x8, x14, lsl #3 + add x15, x8, x10 + add x11, x11, #1 + add x10, x10, #32 + add x13, x13, #32 + prfm pldl1keep, [x13] + add x12, x16, x14 + ldp q2, q3, [x15] + add x12, x12, #4 + prfm pldl1keep, [x12] + ldr s4, [x16, x14] + add x14, x14, #4 + fmla v0.4s, v3.4s, v4.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x11, x20 + b.lt .LBB0_55 +.LBB0_56: // in Loop: Header=BB0_7 Depth=2 + stp q1, q0, [x9] +.LBB0_57: // in Loop: Header=BB0_7 Depth=2 + bl free + ldp x8, x10, [sp, #440] // 16-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload + cmp x10, x8 + b.ge .LBB0_34 +.LBB0_58: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #248] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, 
#520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x7, [sp, #584] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x13, [sp, #576] // 8-byte Folded Reload + ldr x15, [sp, #560] // 8-byte Folded Reload + ldp x6, x5, [sp, #368] // 16-byte Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #448] // 8-byte Folded Reload + add x8, x9, x8 + lsl x12, x8, #2 + ldr q0, [x7, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr q1, [x7, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr q2, [x7, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr q3, [x7, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_60 + .p2align 2 +.LBB0_59: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x12, [sp, #392] // 8-byte Folded Reload + add x13, x15, x10 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x21 + add x12, x12, x10 + prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x27 + ld1 { v16.s }[1], [x13] + prfm pldl1keep, [x12] + sub x12, x12, #4 + ld1 { v16.s }[2], [x14] + ld1 { v16.s }[3], [x12] + str q16, [x8, x11, lsl #4] + add x11, x11, #1 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x11, x20 + b.lt .LBB0_59 +.LBB0_60: // %.preheader38 + // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #208] // 8-byte Folded Reload + ldr x12, [sp, #552] // 8-byte Folded Reload + mov x1, xzr + add x10, x8, #48 + ldr x13, [sp, #544] // 8-byte Folded Reload + mov w15, #1 // =0x1 + mov 
w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_62 + .p2align 2 +.LBB0_61: // %.loopexit34 + // in Loop: Header=BB0_62 Depth=3 + add x13, x13, x23 + add x12, x12, x23 + mov x1, x14 + mov x14, x18 +.LBB0_62: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + madd x18, x1, x28, x9 + ldr x4, [sp, #448] // 8-byte Folded Reload + add x18, x18, x4 + madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x4 + add x16, x16, x4 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str q0, [x7, x18] + str q1, [x7, x15] + add x15, x17, x4 + str q2, [x7, x16] + lsl x15, x15, #2 + str q3, [x7, x15] + ldr x15, [sp, #680] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_67 +// %bb.63: // in Loop: Header=BB0_62 Depth=3 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x17, x14, #3 + add x16, x14, #2 + madd x3, x16, x28, x9 + ldr q16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x4 + lsl x2, x2, #2 + add x3, x3, x4 + lsl x3, x3, #2 + ldr q0, [x7, x2] + madd x2, x15, x28, x9 + add x2, x2, x4 + ldr q2, [x7, x3] + ldr x3, [sp, #664] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr q1, [x7, x2] + madd x2, x17, x28, x9 + add x2, x2, x4 + ldr x4, [sp, #672] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr q3, [x7, x2] + ldr x2, [sp, #656] // 8-byte Folded Reload + mul x2, x25, x2 + madd x2, x4, x3, x2 + madd x3, x14, x22, x2 + lsl x3, x3, #2 + ldr q7, [x26, x3] + madd x3, x15, x22, x2 + lsl x3, x3, #2 + ldr q6, [x26, x3] + madd x3, x16, x22, x2 + madd x2, x17, x22, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x26, x3] + ldr q4, [x26, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x19 + b.ge .LBB0_65 + .p2align 2 +.LBB0_64: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_62 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, #32 + fmla v0.4s, v16.4s, v7.s[0] 
+ fmla v1.4s, v16.4s, v6.s[0] + add x1, x1, #4 + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x21 + ldp q16, q17, [x2, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v1.4s, v16.4s, v6.s[1] + fmla v2.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v1.4s, v17.4s, v6.s[2] + fmla v2.4s, v17.4s, v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x2], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.4s, v17.4s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x21 + fmla v2.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x19 + b.lt .LBB0_64 +.LBB0_65: // in Loop: Header=BB0_62 Depth=3 + ldr x1, [sp, #648] // 8-byte Folded Reload + fmla v0.4s, v16.4s, v7.s[0] + fmla v1.4s, v16.4s, v6.s[0] + mov x2, x12 + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + mov x3, x29 + ldr q17, [x8, x1, lsl #4] + ldr x1, [sp, #640] // 8-byte Folded Reload + fmla v0.4s, v17.4s, v7.s[1] + ldr q16, [x8, x1, lsl #4] + ldr x1, [sp, #632] // 8-byte Folded Reload + fmla v1.4s, v17.4s, v6.s[1] + fmla v2.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldr q18, [x8, x1, lsl #4] + mov x1, x11 + fmla v0.4s, v16.4s, v7.s[2] + fmla v1.4s, v16.4s, v6.s[2] + fmla v2.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v1.4s, v18.4s, v6.s[3] + fmla v2.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x29, x20 + b.ge .LBB0_61 + .p2align 2 +.LBB0_66: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_62 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, x21 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #16 + prfm pldl1keep, [x4] + ldur s6, [x4, 
#-4] + add x4, x4, x21 + fmla v0.4s, v4.4s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v4.4s, v7.s[0] + fmla v3.4s, v4.4s, v16.s[0] + cmp x3, x20 + b.lt .LBB0_66 + b .LBB0_61 + .p2align 2 +.LBB0_67: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #680] // 8-byte Folded Reload + ldr x13, [sp, #592] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_73 +// %bb.68: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #656] // 8-byte Folded Reload + ldr x15, [sp, #664] // 8-byte Folded Reload + mov x14, xzr + mul x13, x25, x13 + ldr x18, [sp, #672] // 8-byte Folded Reload + ldr x16, [sp, #680] // 8-byte Folded Reload + madd x12, x16, x28, x9 + ldr x17, [sp, #448] // 8-byte Folded Reload + ldr q4, [x8] + madd x15, x18, x15, x13 + madd x13, x16, x22, x15 + add x16, x16, #1 + madd x15, x16, x22, x15 + add x12, x12, x17 + add x12, x7, x12, lsl #2 + lsl x13, x13, #2 + ldr q2, [x26, x13] + madd x13, x16, x28, x9 + lsl x15, x15, #2 + ldr q0, [x12] + ldr x16, [sp, #536] // 8-byte Folded Reload + ldr q3, [x26, x15] + mov x15, x10 + add x13, x13, x17 + add x13, x7, x13, lsl #2 + ldr q1, [x13] + cmp xzr, x19 + b.ge .LBB0_70 + .p2align 2 +.LBB0_69: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x3, x15, #32 + ldr x17, [sp, #608] // 8-byte Folded Reload + ldr x1, [sp, #600] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v2.s[0] + prfm pldl1keep, [x3] + fmla v1.4s, v4.4s, v3.s[0] + ldp q4, q5, [x15, #-32] + add x14, x14, #4 + add x17, x17, x16 + add x1, x1, x16 + add x16, x16, #16 + add x18, x17, #32 + add x2, x1, #32 + fmla v0.4s, v4.4s, v2.s[1] + fmla v1.4s, v4.4s, v3.s[1] + fmla v0.4s, v5.4s, v2.s[2] + fmla v1.4s, v5.4s, v3.s[2] + ldp q5, q4, [x15], #64 + prfm pldl1keep, [x2] + fmla v0.4s, v5.4s, v2.s[3] + ldr q2, [x1, #16] + prfm pldl1keep, [x18] + fmla v1.4s, v5.4s, v3.s[3] + ldr q3, [x17, #16] + cmp x14, x19 + 
b.lt .LBB0_69 +.LBB0_70: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #648] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + mov x15, x29 + ldr q5, [x8, x14, lsl #4] + ldr x14, [sp, #640] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v2.s[1] + ldr q4, [x8, x14, lsl #4] + ldr x14, [sp, #632] // 8-byte Folded Reload + fmla v1.4s, v5.4s, v3.s[1] + ldr q5, [x8, x14, lsl #4] + ldr x14, [sp, #536] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v2.s[2] + fmla v1.4s, v4.4s, v3.s[2] + fmla v0.4s, v5.4s, v2.s[3] + fmla v1.4s, v5.4s, v3.s[3] + cmp x29, x20 + b.ge .LBB0_72 + .p2align 2 +.LBB0_71: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x16, x5, x14 + add x17, x6, x14 + prfm pldl1keep, [x11] + ldur q2, [x11, #-16] + add x16, x16, #4 + add x17, x17, #4 + add x15, x15, #1 + add x11, x11, #16 + prfm pldl1keep, [x17] + ldr s3, [x6, x14] + prfm pldl1keep, [x16] + ldr s4, [x5, x14] + add x14, x14, #4 + fmla v0.4s, v2.4s, v3.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x15, x20 + b.lt .LBB0_71 +.LBB0_72: // in Loop: Header=BB0_7 Depth=2 + str q0, [x12] + str q1, [x13] +.LBB0_73: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #528] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_79 +// %bb.74: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #592] // 8-byte Folded Reload + ldr x12, [sp, #448] // 8-byte Folded Reload + mov x11, xzr + madd x9, x14, x28, x9 + ldr x13, [sp, #664] // 8-byte Folded Reload + ldr x15, [sp, #672] // 8-byte Folded Reload + ldr q2, [x8] + add x9, x9, x12 + ldr x12, [sp, #656] // 8-byte Folded Reload + add x9, x7, x9, lsl #2 + ldr q0, [x9] + mul x12, x25, x12 + madd x12, x15, x13, x12 + madd x12, x14, x22, x12 + lsl x12, x12, #2 + ldr q1, [x26, x12] + ldp x12, x14, [sp, #456] // 16-byte Folded Reload + cmp xzr, x19 + b.ge .LBB0_76 + .p2align 2 +.LBB0_75: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // 
=> This Inner Loop Header: Depth=3 + add x13, x10, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x10, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x10], #64 + prfm pldl1keep, [x12] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + cmp x11, x19 + b.lt .LBB0_75 +.LBB0_76: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #648] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + ldr x12, [sp, #416] // 8-byte Folded Reload + mov x10, xzr + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #640] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x11, lsl #4] + ldr x11, [sp, #632] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #184] // 8-byte Folded Reload + add x8, x8, x11 + mov w11, #16 // =0x10 + fmla v0.4s, v3.4s, v1.s[3] + add x13, x29, xzr + cmp x13, x20 + b.ge .LBB0_78 + .p2align 2 +.LBB0_77: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x8, x11 + add x11, x11, #16 + prfm pldl1keep, [x13] + ldr q1, [x8, x10, lsl #4] + prfm pldl1keep, [x12] + add x12, x12, #4 + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.4s, v1.4s, v2.s[0] + add x13, x29, x10 + cmp x13, x20 + b.lt .LBB0_77 +.LBB0_78: // in Loop: Header=BB0_7 Depth=2 + str q0, [x9] +.LBB0_79: // in Loop: Header=BB0_7 Depth=2 + bl free + ldp x9, x8, [sp, #432] // 16-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload + cmp x8, x9 + b.ge .LBB0_35 +.LBB0_80: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #240] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x5, [sp, #584] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x13, [sp, #576] // 8-byte Folded Reload + ldr x17, [sp, #560] // 8-byte 
Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #440] // 8-byte Folded Reload + add x8, x9, x8 + lsl x12, x8, #2 + ldr d0, [x5, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr d1, [x5, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr d2, [x5, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr d3, [x5, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_82 + .p2align 2 +.LBB0_81: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldp x12, x16, [sp, #400] // 16-byte Folded Reload + add x15, x17, x10 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x15, x15, x21 + prfm pldl1keep, [x15] + ldur s5, [x15, #-4] + add x15, x15, x21 + prfm pldl1keep, [x15] + ldur s6, [x15, #-4] + add x15, x15, x21 + add x12, x12, x10 + add x14, x16, x10 + prfm pldl1keep, [x15] + ldur s7, [x15, #-4] + add x13, x12, #4 + add x14, x14, #4 + prfm pldl1keep, [x14] + prfm pldl1keep, [x13] + ldr s16, [x16, x10] + add x10, x10, #4 + ld1 { v16.s }[1], [x12] + str d16, [x8, x11, lsl #3] + add x11, x11, #1 + fmla v0.2s, v16.2s, v4.s[0] + fmla v1.2s, v16.2s, v5.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x11, x20 + b.lt .LBB0_81 +.LBB0_82: // %.preheader37 + // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #200] // 8-byte Folded Reload + ldr x12, [sp, #552] // 8-byte Folded Reload + mov x1, xzr + add x10, x8, #24 + ldr x13, [sp, #544] // 8-byte Folded Reload + mov w15, #1 // =0x1 + mov w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_84 + .p2align 2 +.LBB0_83: // %.loopexit33 + // in Loop: Header=BB0_84 Depth=3 + add x13, x13, x23 + add x12, x12, x23 + mov x1, x14 + mov x14, x18 +.LBB0_84: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_86 Depth 4 + // Child Loop BB0_88 Depth 4 + madd x18, x1, x28, x9 + ldr x4, [sp, #440] // 8-byte Folded Reload + add x18, x18, x4 + 
madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x4 + add x16, x16, x4 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str d0, [x5, x18] + str d1, [x5, x15] + add x15, x17, x4 + str d2, [x5, x16] + lsl x15, x15, #2 + str d3, [x5, x15] + ldr x15, [sp, #680] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_89 +// %bb.85: // in Loop: Header=BB0_84 Depth=3 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x17, x14, #3 + add x16, x14, #2 + madd x3, x16, x28, x9 + ldr d16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x4 + lsl x2, x2, #2 + add x3, x3, x4 + lsl x3, x3, #2 + ldr d0, [x5, x2] + madd x2, x15, x28, x9 + add x2, x2, x4 + ldr d2, [x5, x3] + ldr x3, [sp, #664] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr d1, [x5, x2] + madd x2, x17, x28, x9 + add x2, x2, x4 + ldr x4, [sp, #672] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr d3, [x5, x2] + ldr x2, [sp, #656] // 8-byte Folded Reload + mul x2, x25, x2 + madd x2, x4, x3, x2 + madd x3, x14, x22, x2 + lsl x3, x3, #2 + ldr q7, [x26, x3] + madd x3, x15, x22, x2 + lsl x3, x3, #2 + ldr q6, [x26, x3] + madd x3, x16, x22, x2 + madd x2, x17, x22, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x26, x3] + ldr q4, [x26, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x19 + b.ge .LBB0_87 + .p2align 2 +.LBB0_86: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_84 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + add x1, x1, #4 + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x21 + ldp d16, d17, [x2, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v1.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] + fmla v1.2s, v17.2s, v6.s[2] + fmla v2.2s, v17.2s, v5.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x2], #32 + prfm pldl1keep, [x3] + fmla v0.2s, v17.2s, v7.s[3] 
+ ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x21 + fmla v2.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x19 + b.lt .LBB0_86 +.LBB0_87: // in Loop: Header=BB0_84 Depth=3 + ldr x1, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + mov x2, x12 + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + mov x3, x29 + ldr d17, [x8, x1, lsl #3] + ldr x1, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v17.2s, v7.s[1] + ldr d16, [x8, x1, lsl #3] + ldr x1, [sp, #632] // 8-byte Folded Reload + fmla v1.2s, v17.2s, v6.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + ldr d18, [x8, x1, lsl #3] + mov x1, x11 + fmla v0.2s, v16.2s, v7.s[2] + fmla v1.2s, v16.2s, v6.s[2] + fmla v2.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v1.2s, v18.2s, v6.s[3] + fmla v2.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x29, x20 + b.ge .LBB0_83 + .p2align 2 +.LBB0_88: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_84 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, x21 + prfm pldl1keep, [x1] + ldur d4, [x1, #-8] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #8 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x21 + fmla v0.2s, v4.2s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.2s, v4.2s, v6.s[0] + fmla v2.2s, v4.2s, v7.s[0] + fmla v3.2s, v4.2s, v16.s[0] + cmp x3, x20 + b.lt .LBB0_88 + b .LBB0_83 + .p2align 2 +.LBB0_89: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #680] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_95 +// %bb.90: // in Loop: 
Header=BB0_7 Depth=2 + ldr x12, [sp, #656] // 8-byte Folded Reload + ldr x14, [sp, #664] // 8-byte Folded Reload + mov x13, xzr + mul x12, x25, x12 + ldr x17, [sp, #672] // 8-byte Folded Reload + ldr x15, [sp, #680] // 8-byte Folded Reload + madd x11, x15, x28, x9 + ldr x16, [sp, #440] // 8-byte Folded Reload + ldr d4, [x8] + madd x14, x17, x14, x12 + madd x12, x15, x22, x14 + add x15, x15, #1 + madd x14, x15, x22, x14 + add x11, x11, x16 + add x11, x5, x11, lsl #2 + lsl x12, x12, #2 + ldr q2, [x26, x12] + madd x12, x15, x28, x9 + lsl x14, x14, #2 + ldr d0, [x11] + ldr x15, [sp, #536] // 8-byte Folded Reload + ldr q3, [x26, x14] + mov x14, x10 + add x12, x12, x16 + add x12, x5, x12, lsl #2 + ldr d1, [x12] + cmp xzr, x19 + b.ge .LBB0_92 + .p2align 2 +.LBB0_91: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x2, x14, #16 + ldr x16, [sp, #608] // 8-byte Folded Reload + ldr x18, [sp, #600] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[0] + prfm pldl1keep, [x2] + fmla v1.2s, v4.2s, v3.s[0] + ldp d4, d5, [x14, #-16] + add x13, x13, #4 + add x16, x16, x15 + add x18, x18, x15 + add x15, x15, #16 + add x17, x16, #32 + add x1, x18, #32 + fmla v0.2s, v4.2s, v2.s[1] + fmla v1.2s, v4.2s, v3.s[1] + fmla v0.2s, v5.2s, v2.s[2] + fmla v1.2s, v5.2s, v3.s[2] + ldp d5, d4, [x14], #32 + prfm pldl1keep, [x1] + fmla v0.2s, v5.2s, v2.s[3] + ldr q2, [x18, #16] + prfm pldl1keep, [x17] + fmla v1.2s, v5.2s, v3.s[3] + ldr q3, [x16, #16] + cmp x13, x19 + b.lt .LBB0_91 +.LBB0_92: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + ldr x1, [sp, #352] // 8-byte Folded Reload + ldr x2, [sp, #424] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v2.s[1] + ldr d4, [x8, x15, lsl #3] + ldr x15, [sp, #632] // 8-byte Folded Reload + fmla v1.2s, v5.2s, v3.s[1] 
+ ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #224] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[2] + fmla v1.2s, v4.2s, v3.s[2] + add x15, x8, x15 + fmla v0.2s, v5.2s, v2.s[3] + fmla v1.2s, v5.2s, v3.s[3] + add x16, x29, xzr + cmp x16, x20 + b.ge .LBB0_94 + .p2align 2 +.LBB0_93: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x18, x15, x14, lsl #3 + add x16, x2, x13 + add x17, x1, x13 + add x13, x13, #4 + add x17, x17, #4 + add x16, x16, #4 + add x18, x18, #8 + prfm pldl1keep, [x18] + ldr d2, [x15, x14, lsl #3] + prfm pldl1keep, [x17] + ldr s3, [x1, x14, lsl #2] + prfm pldl1keep, [x16] + fmla v0.2s, v2.2s, v3.s[0] + ldr s4, [x2, x14, lsl #2] + fmla v1.2s, v2.2s, v4.s[0] + add x14, x14, #1 + add x16, x29, x14 + cmp x16, x20 + b.lt .LBB0_93 +.LBB0_94: // in Loop: Header=BB0_7 Depth=2 + str d0, [x11] + str d1, [x12] +.LBB0_95: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #528] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_101 +// %bb.96: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #592] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + mov x11, xzr + madd x9, x14, x28, x9 + ldr x13, [sp, #664] // 8-byte Folded Reload + ldr x15, [sp, #672] // 8-byte Folded Reload + ldr d1, [x8] + add x9, x9, x12 + ldr x12, [sp, #656] // 8-byte Folded Reload + add x9, x5, x9, lsl #2 + ldr d0, [x9] + mul x12, x25, x12 + madd x12, x15, x13, x12 + madd x12, x14, x22, x12 + lsl x12, x12, #2 + ldr q2, [x26, x12] + ldp x12, x14, [sp, #456] // 16-byte Folded Reload + cmp xzr, x19 + b.ge .LBB0_98 + .p2align 2 +.LBB0_97: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x10, #16 + fmla v0.2s, v1.2s, v2.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp d1, d3, [x10, #-16] + fmla v0.2s, v1.2s, v2.s[1] + fmla v0.2s, v3.2s, v2.s[2] + ldp d3, d1, [x10], #32 + prfm pldl1keep, [x12] + fmla v0.2s, 
v3.2s, v2.s[3] + ldur q2, [x12, #-16] + add x12, x12, #16 + cmp x11, x19 + b.lt .LBB0_97 +.LBB0_98: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v1.2s, v2.s[0] + mov x10, xzr + ldr d3, [x8, x11, lsl #3] + ldr x11, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v2.s[1] + ldr d4, [x8, x11, lsl #3] + ldr x11, [sp, #632] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[2] + ldr d1, [x8, x11, lsl #3] + ldr x11, [sp, #224] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v1.2s, v2.s[3] + add x12, x29, xzr + cmp x12, x20 + b.ge .LBB0_100 + .p2align 2 +.LBB0_99: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x10, lsl #3 + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr d1, [x8, x10, lsl #3] + prfm pldl1keep, [x11] + add x11, x11, #4 + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.2s, v1.2s, v2.s[0] + add x12, x29, x10 + cmp x12, x20 + b.lt .LBB0_99 +.LBB0_100: // in Loop: Header=BB0_7 Depth=2 + str d0, [x9] +.LBB0_101: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x9, [sp, #432] // 8-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload + ldr x8, [sp, #296] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 +.LBB0_102: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #272] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x13, [sp, #576] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x6, [sp, #584] // 8-byte Folded Reload + ldr x14, [sp, #560] // 8-byte Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #432] // 8-byte Folded Reload + add x12, x9, x8 + add x8, x13, x28 + add x8, x12, x8 + add x13, x12, x13 + ldr s1, [x6, x12, lsl #2] + add x12, x12, x28 + ldr s0, [x6, x8, lsl #2] + ldr s2, [x6, x13, 
lsl #2] + ldr s3, [x6, x12, lsl #2] + ldr x12, [sp, #328] // 8-byte Folded Reload + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_104 + .p2align 2 +.LBB0_103: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x14, x10 + add x11, x11, #1 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, #4 + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x8, x10] + add x10, x10, #4 + cmp x11, x20 + b.lt .LBB0_103 +.LBB0_104: // %.preheader36 + // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #192] // 8-byte Folded Reload + ldr x12, [sp, #552] // 8-byte Folded Reload + mov x1, xzr + add x10, x8, #12 + ldr x13, [sp, #544] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w15, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_106 + .p2align 2 +.LBB0_105: // %.loopexit32 + // in Loop: Header=BB0_106 Depth=3 + add x13, x13, x23 + add x12, x12, x23 + mov x1, x14 + mov x14, x18 +.LBB0_106: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_108 Depth 4 + // Child Loop BB0_110 Depth 4 + madd x18, x1, x28, x9 + ldr x5, [sp, #432] // 8-byte Folded Reload + add x18, x18, x5 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + madd x15, x15, x28, x9 + add x16, x16, x5 + add x15, x15, x5 + str s1, [x6, x18, lsl #2] + str s3, [x6, x16, lsl #2] + add x16, x17, x5 + str s2, [x6, x16, lsl #2] + str s0, [x6, x15, lsl #2] + ldr x15, [sp, #680] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_111 +// %bb.107: // in Loop: Header=BB0_106 Depth=3 + madd x2, x14, x28, x9 + add x15, x14, #3 + 
add x16, x14, #1 + add x17, x14, #2 + madd x3, x16, x28, x9 + ldr s16, [x8] + mov x1, xzr + add x18, x14, #4 + madd x4, x17, x28, x9 + add x2, x2, x5 + ldr s1, [x6, x2, lsl #2] + madd x2, x15, x28, x9 + add x4, x4, x5 + ldr s2, [x6, x4, lsl #2] + ldr x4, [sp, #672] // 8-byte Folded Reload + add x2, x2, x5 + ldr s0, [x6, x2, lsl #2] + add x2, x3, x5 + ldr x3, [sp, #664] // 8-byte Folded Reload + ldr s3, [x6, x2, lsl #2] + ldr x2, [sp, #656] // 8-byte Folded Reload + mul x2, x25, x2 + madd x2, x4, x3, x2 + madd x3, x14, x22, x2 + lsl x3, x3, #2 + ldr q7, [x26, x3] + madd x3, x16, x22, x2 + lsl x3, x3, #2 + ldr q6, [x26, x3] + madd x3, x17, x22, x2 + madd x2, x15, x22, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x26, x3] + ldr q4, [x26, x2] + mov x2, x10 + mov x3, x13 + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x19 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_109 + .p2align 2 +.LBB0_108: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_106 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, #8 + fmla v1.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + add x1, x1, #4 + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x4] + add x4, x3, x21 + ldp s16, s21, [x2, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v1.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v1.2s, v21.2s, v20.2s + ldp s17, s16, [x2], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v2.2s, v21.2s, v18.2s + prfm pldl1keep, [x3] + fmla v1.2s, v17.2s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x21 + fmla v2.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + cmp x1, x19 + ext v19.16b, 
v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_108 +.LBB0_109: // in Loop: Header=BB0_106 Depth=3 + ldr x1, [sp, #648] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + mov x2, x12 + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + mov x3, x29 + ldr s21, [x8, x1, lsl #2] + ldr x1, [sp, #640] // 8-byte Folded Reload + fmla v1.2s, v21.2s, v7.s[1] + ldr s16, [x8, x1, lsl #2] + ldr x1, [sp, #632] // 8-byte Folded Reload + fmla v3.2s, v21.2s, v6.s[1] + fmla v2.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + ldr s22, [x8, x1, lsl #2] + mov x1, x11 + fmla v1.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v1.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v2.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x29, x20 + b.ge .LBB0_105 + .p2align 2 +.LBB0_110: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_106 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, x21 + prfm pldl1keep, [x1] + ldur s4, [x1, #-4] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #4 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x21 + fmla v1.2s, v4.2s, v5.2s + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v3.2s, v4.2s, v6.2s + fmla v2.2s, v4.2s, v7.2s + fmla v0.2s, v4.2s, v16.2s + cmp x3, x20 + b.lt .LBB0_110 + b .LBB0_105 + .p2align 2 +.LBB0_111: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #680] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_117 +// %bb.112: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #656] // 8-byte Folded Reload + ldr x15, [sp, #664] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr x18, [sp, #672] // 8-byte Folded Reload + ldr x16, [sp, #680] // 8-byte Folded Reload + mul 
x12, x25, x12 + madd x11, x16, x28, x9 + ldr x17, [sp, #432] // 8-byte Folded Reload + ldr s4, [x8] + madd x12, x18, x15, x12 + madd x15, x16, x22, x12 + add x11, x11, x17 + ldr s1, [x6, x11, lsl #2] + lsl x15, x15, #2 + ldr q2, [x26, x15] + add x15, x16, #1 + madd x16, x15, x22, x12 + madd x12, x15, x28, x9 + add x12, x12, x17 + lsl x15, x16, #2 + ldr s0, [x6, x12, lsl #2] + ldr q3, [x26, x15] + ext v6.16b, v2.16b, v2.16b, #8 + cmp xzr, x19 + ext v5.16b, v3.16b, v3.16b, #8 + b.ge .LBB0_114 + .p2align 2 +.LBB0_113: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x1, x8, x13 + ldp x15, x17, [sp, #336] // 16-byte Folded Reload + fmla v1.2s, v4.2s, v2.2s + add x2, x1, #20 + fmla v0.2s, v4.2s, v3.2s + add x14, x14, #4 + prfm pldl1keep, [x2] + ldp s4, s7, [x1, #4] + add x15, x15, x13 + add x17, x17, x13 + add x13, x13, #16 + add x16, x15, #32 + add x18, x17, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x1, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + prfm pldl1keep, [x16] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x15, #16] + ext v6.16b, v2.16b, v2.16b, #8 + cmp x14, x19 + ext v5.16b, v3.16b, v3.16b, #8 + b.lt .LBB0_113 +.LBB0_114: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #648] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v2.2s + fmla v0.2s, v4.2s, v3.2s + ldr x1, [sp, #352] // 8-byte Folded Reload + ldr x2, [sp, #424] // 8-byte Folded Reload + mov x13, xzr + mov x15, x29 + ldr s7, [x8, x14, lsl #2] + ldr x14, [sp, #640] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v2.s[1] + ldr s4, [x8, x14, lsl #2] + ldr x14, [sp, #632] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v3.s[1] + ldr s7, [x8, x14, lsl #2] + ldr x14, [sp, #264] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + add x14, x8, x14 + fmla v1.2s, v7.2s, v2.s[3] + fmla v0.2s, v7.2s, v3.s[3] + cmp x29, 
x20 + b.ge .LBB0_116 + .p2align 2 +.LBB0_115: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x16, x2, x13 + add x17, x1, x13 + add x18, x14, x13 + add x15, x15, #1 + add x16, x16, #4 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x14, x13] + prfm pldl1keep, [x17] + prfm pldl1keep, [x16] + ldr s3, [x1, x13] + fmla v1.2s, v2.2s, v3.2s + ldr s3, [x2, x13] + add x13, x13, #4 + fmla v0.2s, v2.2s, v3.2s + cmp x15, x20 + b.lt .LBB0_115 +.LBB0_116: // in Loop: Header=BB0_7 Depth=2 + str s1, [x6, x11, lsl #2] + str s0, [x6, x12, lsl #2] +.LBB0_117: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #528] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_5 +// %bb.118: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #592] // 8-byte Folded Reload + ldr x12, [sp, #432] // 8-byte Folded Reload + mov x11, xzr + madd x9, x14, x28, x9 + ldr x13, [sp, #664] // 8-byte Folded Reload + ldr x15, [sp, #672] // 8-byte Folded Reload + ldr s2, [x8] + add x9, x9, x12 + ldr x12, [sp, #656] // 8-byte Folded Reload + ldr s0, [x6, x9, lsl #2] + mul x12, x25, x12 + madd x12, x15, x13, x12 + madd x12, x14, x22, x12 + lsl x12, x12, #2 + ldr q1, [x26, x12] + ldp x12, x14, [sp, #456] // 16-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x19 + b.ge .LBB0_120 + .p2align 2 +.LBB0_119: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x10, #8 + fmla v0.2s, v2.2s, v1.2s + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x10, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x10], #16 + prfm pldl1keep, [x12] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x11, x19 + b.lt .LBB0_119 +.LBB0_120: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.2s + mov x10, 
xzr + ldr s4, [x8, x11, lsl #2] + ldr x11, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s5, [x8, x11, lsl #2] + ldr x11, [sp, #632] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.2s + ldr s2, [x8, x11, lsl #2] + ldr x11, [sp, #264] // 8-byte Folded Reload + add x8, x8, x11 + mov x11, x29 + fmla v0.2s, v2.2s, v1.s[3] + cmp x29, x20 + b.ge .LBB0_4 + .p2align 2 +.LBB0_121: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x14, x10 + add x13, x8, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + ldr s1, [x8, x10] + prfm pldl1keep, [x12] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x20 + b.lt .LBB0_121 + b .LBB0_4 +.LBB0_122: + ldr x0, [sp, #8] // 8-byte Folded Reload + bl free + add sp, sp, #688 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_4d_nt_mlir, .Lfunc_end0-sbatch_matmul_4d_nt_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s new file mode 100644 index 00000000000000..efa5087d8c2dfe --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s @@ -0,0 +1,4104 @@ + .text + .file "LLVMDialectModule" + .globl sgemm_nn_alpha1_beta1_mlir // -- Begin function 
sgemm_nn_alpha1_beta1_mlir + .p2align 4 + .type sgemm_nn_alpha1_beta1_mlir,@function +sgemm_nn_alpha1_beta1_mlir: // @sgemm_nn_alpha1_beta1_mlir + .cfi_startproc +// %bb.0: + str d12, [sp, #-144]! // 8-byte Folded Spill + stp d11, d10, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #48] // 16-byte Folded Spill + stp x28, x27, [sp, #64] // 16-byte Folded Spill + stp x26, x25, [sp, #80] // 16-byte Folded Spill + stp x24, x23, [sp, #96] // 16-byte Folded Spill + stp x22, x21, [sp, #112] // 16-byte Folded Spill + stp x20, x19, [sp, #128] // 16-byte Folded Spill + stp d9, d8, [sp, #32] // 16-byte Folded Spill + sub sp, sp, #512 + .cfi_def_cfa_offset 656 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -144 + cmp x3, #0 + ldr x29, [sp, #688] + ldr x20, [sp, #656] + mov x22, x5 + cinv x8, x3, lt + ldr x26, [sp, #664] + ldr x27, [sp, #744] + mov x19, x4 + add x10, x8, x8, lsr #63 + add x9, x8, #7 + str x2, [sp, #320] // 8-byte Folded Spill + mov x25, x1 + str x3, [sp, #288] // 8-byte Folded Spill + asr x10, x10, #1 + cinv x23, x10, lt + cmp x8, #0 + add x10, x8, #3 + csel x9, x9, x8, lt + csel x8, x10, x8, lt + cmp x3, #0 + asr x9, x9, #3 + asr x8, x8, #2 + cinv x24, x9, lt + ldr x9, [sp, #680] + cinv x21, x8, lt + cmp x9, #0 + str x9, [sp, #128] // 8-byte Folded Spill + cinv x10, x9, lt + add x8, x10, #7 + cmp x10, #0 + str x10, [sp, #112] // 8-byte Folded Spill + csel x8, x8, x10, lt + cmp x9, #0 + ldr x10, [sp, #712] + ldr x9, [sp, #720] + asr x8, x8, #3 + cinv x8, x8, lt + str x8, [sp, #16] // 8-byte Folded Spill + lsl x8, x8, #3 + str x8, [sp, #384] // 8-byte Folded Spill + lsl x8, x4, #5 + stp x9, x10, [sp, #256] // 16-byte 
Folded Spill + add x0, x8, #64 + str x8, [sp, #504] // 8-byte Folded Spill + bl malloc + lsl x8, x24, #3 + mul x3, x24, x22 + mov w9, #1 // =0x1 + add x12, x0, #63 + str x8, [sp, #520] // 8-byte Folded Spill + lsl x8, x21, #2 + bfi x9, x21, #2, #62 + and x11, x19, #0x3 + str x8, [sp, #336] // 8-byte Folded Spill + lsl x8, x23, #1 + mul x9, x22, x9 + str x0, [sp, #56] // 8-byte Folded Spill + str x8, [sp, #312] // 8-byte Folded Spill + negs x8, x19 + mul x0, x23, x22 + mul x2, x21, x22 + and x13, x8, #0x3 + and x8, x12, #0xffffffffffffffc0 + ldr x12, [sp, #320] // 8-byte Folded Reload + lsl x5, x19, #2 + csneg x18, x11, x13, mi + add x11, x19, x3, lsl #3 + add x16, x19, x0, lsl #1 + lsl x28, x22, #2 + lsl x6, x18, #2 + add x10, x19, x22, lsl #3 + lsl x4, x22, #5 + mov w15, #28 // =0x1c + sub x23, x11, x18 + add x11, x19, x9 + add x9, x25, x9, lsl #2 + sub x13, x4, x28 + lsl x24, x12, #2 + sub x11, x11, x18 + add x14, x19, x2, lsl #2 + sub x12, x5, x6 + str x9, [sp, #168] // 8-byte Folded Spill + add x9, x24, x11, lsl #2 + lsl x21, x26, #2 + add x17, x12, #4 + madd x1, x29, x15, x20 + sub x15, x16, x18 + add x16, x24, x25 + stp x13, x12, [sp, #64] // 16-byte Folded Spill + sub x12, x10, x18 + sub x10, x14, x18 + madd x14, x29, x17, x21 + add x16, x13, x16 + add x9, x25, x9 + add x16, x16, #16 + str x9, [sp, #480] // 8-byte Folded Spill + add x9, x24, x10, lsl #2 + str x16, [sp, #240] // 8-byte Folded Spill + mov w16, #16 // =0x10 + sub x30, x19, x18 + sub x13, x16, x13 + add x9, x25, x9 + mul x17, x29, x30 + str x13, [sp, #232] // 8-byte Folded Spill + add x13, x20, x14 + stp x13, x9, [sp, #464] // 16-byte Folded Spill + add x9, x24, x15, lsl #2 + ldr x13, [sp, #504] // 8-byte Folded Reload + add x14, x5, x24 + sub x14, x14, x6 + add x16, x21, x17, lsl #2 + add x14, x14, x25 + add x9, x9, x25 + sub x10, x30, #3 + stp x17, x18, [sp, #96] // 16-byte Folded Spill + add x11, x20, x29, lsl #5 + add x9, x9, #4 + str x26, [sp, #280] // 8-byte Folded Spill + lsl x26, x29, #4 
+ stp x20, x29, [sp, #296] // 16-byte Folded Spill + str x9, [sp, #152] // 8-byte Folded Spill + sub x9, x30, #2 + sub x17, x13, x18, lsl #5 + add x13, x14, #4 + add x14, x24, x12, lsl #2 + add x12, x4, x24 + stp x10, x9, [sp, #400] // 16-byte Folded Spill + sub x9, x30, #1 + str x13, [sp, #224] // 8-byte Folded Spill + add x13, x20, x16 + add x12, x12, x25 + str x9, [sp, #416] // 8-byte Folded Spill + mov w9, #20 // =0x14 + str x13, [sp, #456] // 8-byte Folded Spill + add x13, x12, #32 + add x12, x14, x25 + madd x9, x29, x9, x20 + add x12, x12, #4 + stp x12, x13, [sp, #208] // 16-byte Folded Spill + add x12, x24, x3, lsl #5 + add x13, x24, x23, lsl #2 + stp x6, x14, [sp, #80] // 16-byte Folded Spill + mov w10, #24 // =0x18 + lsl x18, x29, #2 + add x12, x12, x25 + madd x6, x29, x10, x20 + add x10, x20, x18 + str x25, [sp, #328] // 8-byte Folded Spill + stp x9, x11, [sp, #440] // 16-byte Folded Spill + add x11, x20, x29, lsl #3 + add x14, x12, #32 + add x12, x13, x25 + add x12, x12, #4 + add x9, x20, x26 + mov x7, xzr + str x5, [sp, #120] // 8-byte Folded Spill + str x11, [sp, #432] // 8-byte Folded Spill + add x11, x8, #64 + stp x12, x14, [sp, #176] // 16-byte Folded Spill + add x12, x17, #32 + str x11, [sp, #192] // 8-byte Folded Spill + mov w11, #12 // =0xc + add x13, x8, x17 + sub x23, x30, #4 + madd x11, x29, x11, x20 + mov x20, x1 + mov x1, x9 + add x9, x8, x12 + stp x9, x4, [sp, #488] // 16-byte Folded Spill + add x9, x24, x0, lsl #3 + add x5, x8, #128 + stp x2, x0, [sp, #40] // 16-byte Folded Spill + mov x0, x10 + stp x9, x3, [sp, #24] // 16-byte Folded Spill + str x12, [sp, #160] // 8-byte Folded Spill + add x9, x25, x9 + str x30, [sp, #504] // 8-byte Folded Spill + str x24, [sp, #344] // 8-byte Folded Spill + str x9, [sp, #144] // 8-byte Folded Spill + add x9, x9, #32 + str x11, [sp, #424] // 8-byte Folded Spill + str x18, [sp, #272] // 8-byte Folded Spill + str x9, [sp, #136] // 8-byte Folded Spill + add x9, x25, x2, lsl #4 + ldr x25, [sp, #384] // 8-byte 
Folded Reload + str x13, [sp, #200] // 8-byte Folded Spill + str x9, [sp, #248] // 8-byte Folded Spill + b .LBB0_3 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_3 Depth=1 + stp q1, q0, [x10] +.LBB0_2: // %.backedge + // in Loop: Header=BB0_3 Depth=1 + ldp x9, x11, [sp, #440] // 16-byte Folded Reload + ldr x20, [sp, #376] // 8-byte Folded Reload + add x6, x6, #32 + add x1, x1, #32 + ldp x7, x0, [sp, #352] // 16-byte Folded Reload + ldr x30, [sp, #504] // 8-byte Folded Reload + add x10, x11, #32 + add x20, x20, #32 + add x0, x0, #32 + add x9, x9, #32 + stp x9, x10, [sp, #440] // 16-byte Folded Spill + ldp x9, x11, [sp, #424] // 16-byte Folded Reload + add x10, x11, #32 + add x9, x9, #32 + stp x9, x10, [sp, #424] // 16-byte Folded Spill + ldp x9, x11, [sp, #456] // 16-byte Folded Reload + add x10, x11, #32 + add x9, x9, #32 + stp x9, x10, [sp, #456] // 16-byte Folded Spill +.LBB0_3: // =>This Loop Header: Depth=1 + // Child Loop BB0_5 Depth 2 + // Child Loop BB0_7 Depth 2 + // Child Loop BB0_10 Depth 2 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_14 Depth 3 + // Child Loop BB0_19 Depth 2 + // Child Loop BB0_21 Depth 2 + // Child Loop BB0_24 Depth 2 + // Child Loop BB0_26 Depth 2 + // Child Loop BB0_29 Depth 2 + // Child Loop BB0_31 Depth 2 + cmp x7, x25 + b.ge .LBB0_32 +// %bb.4: // in Loop: Header=BB0_3 Depth=1 + add x10, x7, #8 + add x12, x7, x27, lsl #1 + ldr x17, [sp, #192] // 8-byte Folded Reload + mov x9, xzr + str x10, [sp, #352] // 8-byte Folded Spill + ldp x11, x10, [sp, #256] // 16-byte Folded Reload + stp x0, x1, [sp, #360] // 16-byte Folded Spill + ldp x14, x3, [sp, #232] // 16-byte Folded Reload + ldp x4, x24, [sp, #440] // 16-byte Folded Reload + str x6, [sp, #392] // 8-byte Folded Spill + str x20, [sp, #376] // 8-byte Folded Spill + add x2, x10, x11, lsl #2 + add x11, x27, x7 + lsl x10, x7, #2 + add x11, x2, x11, lsl #2 + add x15, x2, x12, lsl #2 + add x12, x12, x27 + add x13, x2, x10 + add x12, x2, x12, lsl #2 + ldp q1, q0, [x13] + ldp q3, q2, 
[x11] + add x11, x7, x27, lsl #2 + ldp q6, q5, [x15] + ldp x15, x18, [sp, #424] // 16-byte Folded Reload + ldp q7, q4, [x12] + add x12, x2, x11, lsl #2 + add x11, x11, x27 + add x11, x2, x11, lsl #2 + ldp q17, q16, [x12] + ldp x13, x12, [sp, #320] // 16-byte Folded Reload + ldp q19, q18, [x11] + mov w11, #6 // =0x6 + madd x11, x27, x11, x7 + add x16, x12, x13, lsl #2 + lsl x12, x22, #3 + add x11, x2, x11, lsl #2 + ldr q25, [x16, x12] + ldr x12, [sp, #280] // 8-byte Folded Reload + ldr q26, [x16, x28] + ldr q28, [x16, x22, lsl #4] + ldr q30, [x16] + ldp q21, q20, [x11] + mov w11, #12 // =0xc + mul x11, x22, x11 + ldr q27, [x16, x11] + sub x11, x7, x27 + add x11, x11, x27, lsl #3 + add x11, x2, x11, lsl #2 + ldp q23, q22, [x11] + ldr x11, [sp, #296] // 8-byte Folded Reload + add x11, x11, x12, lsl #2 + mov w12, #20 // =0x14 + mul x12, x22, x12 + add x10, x11, x10 + ldp q8, q9, [x10] + ldr q24, [x16, x12] + mov w12, #24 // =0x18 + mul x12, x22, x12 + ldr q29, [x16, x12] + prfm pldl1keep, [x3] + ldur q31, [x3, #-16] + cmp xzr, x23 + b.ge .LBB0_6 + .p2align 2 +.LBB0_5: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x4, x21 + add x29, x0, x21 + fmla v1.4s, v8.4s, v30.s[0] + fmla v0.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v26.s[0] + fmla v2.4s, v9.4s, v26.s[0] + stp q8, q9, [x17, #-64] + fmla v6.4s, v8.4s, v25.s[0] + fmla v5.4s, v9.4s, v25.s[0] + prfm pldl1keep, [x12] + add x25, x6, x21 + fmla v7.4s, v8.4s, v27.s[0] + fmla v4.4s, v9.4s, v27.s[0] + add x12, x18, x21 + add x13, x20, x21 + fmla v16.4s, v9.4s, v28.s[0] + fmla v17.4s, v8.4s, v28.s[0] + add x10, x24, x21 + add x9, x9, #4 + fmla v18.4s, v9.4s, v24.s[0] + fmla v19.4s, v8.4s, v24.s[0] + add x24, x24, x26 + add x20, x20, x26 + fmla v20.4s, v9.4s, v29.s[0] + fmla v21.4s, v8.4s, v29.s[0] + add x6, x6, x26 + add x4, x4, x26 + fmla v22.4s, v9.4s, v31.s[0] + fmla v23.4s, v8.4s, v31.s[0] + ldp q8, q9, [x29] + fmla v0.4s, v9.4s, v30.s[1] + fmla v1.4s, v8.4s, v30.s[1] + stp q8, q9, [x17, 
#-32] + prfm pldl1keep, [x25] + fmla v2.4s, v9.4s, v26.s[1] + fmla v3.4s, v8.4s, v26.s[1] + add x0, x0, x26 + fmla v5.4s, v9.4s, v25.s[1] + fmla v6.4s, v8.4s, v25.s[1] + add x18, x18, x26 + fmla v4.4s, v9.4s, v27.s[1] + fmla v7.4s, v8.4s, v27.s[1] + fmla v17.4s, v8.4s, v28.s[1] + fmla v16.4s, v9.4s, v28.s[1] + fmla v19.4s, v8.4s, v24.s[1] + fmla v18.4s, v9.4s, v24.s[1] + fmla v21.4s, v8.4s, v29.s[1] + fmla v20.4s, v9.4s, v29.s[1] + fmla v23.4s, v8.4s, v31.s[1] + fmla v22.4s, v9.4s, v31.s[1] + ldp q9, q8, [x12] + add x12, x15, x21 + stp q9, q8, [x17] + prfm pldl1keep, [x13] + add x15, x15, x26 + ldp q11, q10, [x12] + add x12, x1, x21 + add x1, x1, x26 + fmla v1.4s, v9.4s, v30.s[2] + fmla v0.4s, v8.4s, v30.s[2] + stp q11, q10, [x17, #32] + fmla v3.4s, v9.4s, v26.s[2] + fmla v2.4s, v8.4s, v26.s[2] + prfm pldl1keep, [x10] + add x10, x3, x14 + fmla v6.4s, v9.4s, v25.s[2] + fmla v5.4s, v8.4s, v25.s[2] + add x3, x3, #16 + add x17, x17, #128 + fmla v7.4s, v9.4s, v27.s[2] + fmla v4.4s, v8.4s, v27.s[2] + fmla v16.4s, v8.4s, v28.s[2] + fmla v17.4s, v9.4s, v28.s[2] + fmla v18.4s, v8.4s, v24.s[2] + fmla v19.4s, v9.4s, v24.s[2] + fmla v20.4s, v8.4s, v29.s[2] + fmla v21.4s, v9.4s, v29.s[2] + fmla v22.4s, v8.4s, v31.s[2] + fmla v23.4s, v9.4s, v31.s[2] + ldp q8, q9, [x12] + prfm pldl1keep, [x10] + fmla v0.4s, v10.4s, v30.s[3] + fmla v1.4s, v11.4s, v30.s[3] + ldur q30, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v2.4s, v10.4s, v26.s[3] + fmla v3.4s, v11.4s, v26.s[3] + ldur q26, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v5.4s, v10.4s, v25.s[3] + fmla v6.4s, v11.4s, v25.s[3] + ldur q25, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v4.4s, v10.4s, v27.s[3] + fmla v7.4s, v11.4s, v27.s[3] + ldur q27, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v17.4s, v11.4s, v28.s[3] + fmla v16.4s, v10.4s, v28.s[3] + ldur q28, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v19.4s, v11.4s, v24.s[3] + fmla v18.4s, 
v10.4s, v24.s[3] + ldur q24, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v21.4s, v11.4s, v29.s[3] + fmla v20.4s, v10.4s, v29.s[3] + ldur q29, [x10, #-16] + fmla v23.4s, v11.4s, v31.s[3] + fmla v22.4s, v10.4s, v31.s[3] + prfm pldl1keep, [x3] + ldur q31, [x3, #-16] + cmp x9, x23 + b.lt .LBB0_5 +.LBB0_6: // in Loop: Header=BB0_3 Depth=1 + ldp x13, x12, [sp, #400] // 16-byte Folded Reload + ldr x14, [sp, #304] // 8-byte Folded Reload + add x10, x8, x23, lsl #5 + fmla v1.4s, v8.4s, v30.s[0] + fmla v0.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v26.s[0] + fmla v2.4s, v9.4s, v26.s[0] + stp q8, q9, [x10] + fmla v6.4s, v8.4s, v25.s[0] + fmla v5.4s, v9.4s, v25.s[0] + fmla v4.4s, v9.4s, v27.s[0] + fmla v7.4s, v8.4s, v27.s[0] + ldp x1, x18, [sp, #456] // 16-byte Folded Reload + madd x9, x13, x14, x7 + fmla v16.4s, v9.4s, v28.s[0] + fmla v17.4s, v8.4s, v28.s[0] + madd x10, x12, x14, x7 + fmla v18.4s, v9.4s, v24.s[0] + fmla v19.4s, v8.4s, v24.s[0] + add x0, x8, x12, lsl #5 + ldr x12, [sp, #416] // 8-byte Folded Reload + fmla v20.4s, v9.4s, v29.s[0] + fmla v21.4s, v8.4s, v29.s[0] + mov x15, xzr + add x9, x11, x9, lsl #2 + fmla v22.4s, v9.4s, v31.s[0] + fmla v23.4s, v8.4s, v31.s[0] + add x10, x11, x10, lsl #2 + ldp q8, q9, [x9] + add x9, x8, x13, lsl #5 + fmla v0.4s, v9.4s, v30.s[1] + fmla v2.4s, v9.4s, v26.s[1] + fmla v5.4s, v9.4s, v25.s[1] + fmla v4.4s, v9.4s, v27.s[1] + fmla v16.4s, v9.4s, v28.s[1] + fmla v18.4s, v9.4s, v24.s[1] + fmla v20.4s, v9.4s, v29.s[1] + fmla v22.4s, v9.4s, v31.s[1] + fmla v1.4s, v8.4s, v30.s[1] + stp q8, q9, [x9] + fmla v3.4s, v8.4s, v26.s[1] + fmla v6.4s, v8.4s, v25.s[1] + fmla v7.4s, v8.4s, v27.s[1] + fmla v17.4s, v8.4s, v28.s[1] + fmla v19.4s, v8.4s, v24.s[1] + fmla v21.4s, v8.4s, v29.s[1] + fmla v23.4s, v8.4s, v31.s[1] + ldp q9, q8, [x10] + madd x10, x12, x14, x7 + ldr x14, [sp, #272] // 8-byte Folded Reload + add x10, x11, x10, lsl #2 + fmla v0.4s, v8.4s, v30.s[2] + fmla v2.4s, v8.4s, v26.s[2] + fmla v5.4s, v8.4s, v25.s[2] + fmla 
v4.4s, v8.4s, v27.s[2] + fmla v16.4s, v8.4s, v28.s[2] + fmla v18.4s, v8.4s, v24.s[2] + fmla v20.4s, v8.4s, v29.s[2] + fmla v22.4s, v8.4s, v31.s[2] + mov x11, x30 + add x30, x8, x12, lsl #5 + stp q9, q8, [x0] + fmla v1.4s, v9.4s, v30.s[2] + fmla v3.4s, v9.4s, v26.s[2] + fmla v6.4s, v9.4s, v25.s[2] + fmla v7.4s, v9.4s, v27.s[2] + fmla v17.4s, v9.4s, v28.s[2] + fmla v19.4s, v9.4s, v24.s[2] + fmla v21.4s, v9.4s, v29.s[2] + fmla v23.4s, v9.4s, v31.s[2] + ldp q8, q9, [x10] + ldr x10, [sp, #224] // 8-byte Folded Reload + stp q8, q9, [x30] + fmla v0.4s, v9.4s, v30.s[3] + fmla v1.4s, v8.4s, v30.s[3] + fmla v2.4s, v9.4s, v26.s[3] + fmla v3.4s, v8.4s, v26.s[3] + fmla v5.4s, v9.4s, v25.s[3] + fmla v6.4s, v8.4s, v25.s[3] + fmla v7.4s, v8.4s, v27.s[3] + fmla v4.4s, v9.4s, v27.s[3] + fmla v17.4s, v8.4s, v28.s[3] + fmla v16.4s, v9.4s, v28.s[3] + fmla v19.4s, v8.4s, v24.s[3] + fmla v18.4s, v9.4s, v24.s[3] + fmla v21.4s, v8.4s, v29.s[3] + fmla v20.4s, v9.4s, v29.s[3] + fmla v23.4s, v8.4s, v31.s[3] + fmla v22.4s, v9.4s, v31.s[3] + cmp x11, x19 + b.ge .LBB0_8 + .p2align 2 +.LBB0_7: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x10, x28 + prfm pldl1keep, [x10] + ldur s24, [x10, #-4] + add x10, x10, #4 + add x13, x12, x28 + prfm pldl1keep, [x12] + ldur s25, [x12, #-4] + add x17, x13, x28 + prfm pldl1keep, [x13] + ldur s26, [x13, #-4] + add x12, x17, x28 + prfm pldl1keep, [x17] + ldur s27, [x17, #-4] + add x13, x12, x28 + prfm pldl1keep, [x12] + ldur s28, [x12, #-4] + add x12, x18, x15 + add x17, x13, x28 + prfm pldl1keep, [x13] + ldur s29, [x13, #-4] + add x13, x1, x15 + add x15, x15, x14 + prfm pldl1keep, [x17] + ldur s30, [x17, #-4] + add x17, x17, x28 + prfm pldl1keep, [x17] + ldur s31, [x17, #-4] + prfm pldl1keep, [x12] + add x12, x8, x11, lsl #5 + add x11, x11, #1 + ldp q8, q9, [x13] + fmla v0.4s, v9.4s, v24.s[0] + fmla v2.4s, v9.4s, v25.s[0] + fmla v5.4s, v9.4s, v26.s[0] + fmla v4.4s, v9.4s, v27.s[0] + fmla v16.4s, v9.4s, v28.s[0] + fmla v18.4s, 
v9.4s, v29.s[0] + fmla v20.4s, v9.4s, v30.s[0] + fmla v1.4s, v8.4s, v24.s[0] + fmla v3.4s, v8.4s, v25.s[0] + fmla v6.4s, v8.4s, v26.s[0] + fmla v7.4s, v8.4s, v27.s[0] + fmla v17.4s, v8.4s, v28.s[0] + fmla v19.4s, v8.4s, v29.s[0] + fmla v21.4s, v8.4s, v30.s[0] + fmla v23.4s, v8.4s, v31.s[0] + fmla v22.4s, v9.4s, v31.s[0] + stp q8, q9, [x12] + cmp x11, x19 + b.lt .LBB0_7 +.LBB0_8: // %.preheader29 + // in Loop: Header=BB0_3 Depth=1 + ldp x18, x13, [sp, #208] // 16-byte Folded Reload + mov x10, xzr + mov w6, #1 // =0x1 + mov w24, #2 // =0x2 + mov w20, #3 // =0x3 + mov w29, #4 // =0x4 + mov w15, #5 // =0x5 + mov w11, #6 // =0x6 + mov w25, #7 // =0x7 + mov w1, #8 // =0x8 + b .LBB0_10 + .p2align 2 +.LBB0_9: // %.loopexit28 + // in Loop: Header=BB0_10 Depth=2 + ldr x10, [sp, #496] // 8-byte Folded Reload + add x13, x13, x10 + add x18, x18, x10 + mov x10, x1 + mov x1, x3 +.LBB0_10: // Parent Loop BB0_3 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_14 Depth 3 + madd x10, x10, x27, x7 + add x10, x2, x10, lsl #2 + stp q1, q0, [x10] + madd x10, x6, x27, x7 + add x10, x2, x10, lsl #2 + stp q3, q2, [x10] + madd x10, x24, x27, x7 + add x10, x2, x10, lsl #2 + stp q6, q5, [x10] + madd x10, x20, x27, x7 + add x10, x2, x10, lsl #2 + stp q7, q4, [x10] + madd x10, x29, x27, x7 + add x10, x2, x10, lsl #2 + stp q17, q16, [x10] + madd x10, x15, x27, x7 + add x10, x2, x10, lsl #2 + stp q19, q18, [x10] + madd x10, x11, x27, x7 + ldr x11, [sp, #520] // 8-byte Folded Reload + add x10, x2, x10, lsl #2 + cmp x1, x11 + stp q21, q20, [x10] + madd x10, x25, x27, x7 + add x10, x2, x10, lsl #2 + stp q23, q22, [x10] + b.ge .LBB0_15 +// %bb.11: // in Loop: Header=BB0_10 Depth=2 + madd x10, x1, x27, x7 + add x20, x1, #3 + add x29, x1, #4 + add x6, x1, #1 + madd x15, x20, x27, x7 + add x25, x1, #7 + add x24, x1, #2 + mov x4, xzr + madd x11, x6, x27, x7 + ldp q8, q9, [x8] + add x3, x1, #8 + madd x12, x24, x27, x7 + mov x17, x13 + add x10, x2, x10, lsl #2 + 
add x15, x2, x15, lsl #2 + ldp q1, q0, [x10] + madd x10, x29, x27, x7 + add x11, x2, x11, lsl #2 + ldp q7, q4, [x15] + add x15, x1, #5 + add x12, x2, x12, lsl #2 + ldp q3, q2, [x11] + add x11, x1, #6 + add x10, x2, x10, lsl #2 + ldp q6, q5, [x12] + ldp q17, q16, [x10] + madd x10, x15, x27, x7 + add x10, x2, x10, lsl #2 + ldp q19, q18, [x10] + madd x10, x11, x27, x7 + add x10, x2, x10, lsl #2 + ldp q21, q20, [x10] + madd x10, x25, x27, x7 + add x10, x2, x10, lsl #2 + ldp q23, q22, [x10] + mul x10, x1, x22 + lsl x10, x10, #2 + ldr q31, [x16, x10] + mul x10, x6, x22 + lsl x10, x10, #2 + ldr q30, [x16, x10] + mul x10, x24, x22 + lsl x10, x10, #2 + ldr q29, [x16, x10] + mul x10, x20, x22 + lsl x10, x10, #2 + ldr q28, [x16, x10] + mul x10, x29, x22 + lsl x10, x10, #2 + ldr q27, [x16, x10] + mul x10, x15, x22 + lsl x10, x10, #2 + ldr q26, [x16, x10] + mul x10, x11, x22 + lsl x10, x10, #2 + ldr q25, [x16, x10] + mul x10, x25, x22 + lsl x10, x10, #2 + ldr q24, [x16, x10] + mov x10, x5 + cmp xzr, x23 + b.ge .LBB0_13 + .p2align 2 +.LBB0_12: // Parent Loop BB0_3 Depth=1 + // Parent Loop BB0_10 Depth=2 + // => This Inner Loop Header: Depth=3 + add x14, x10, #32 + fmla v1.4s, v8.4s, v31.s[0] + fmla v0.4s, v9.4s, v31.s[0] + add x12, x10, #96 + fmla v2.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v30.s[0] + prfm pldl1keep, [x14] + add x4, x4, #4 + fmla v5.4s, v9.4s, v29.s[0] + fmla v6.4s, v8.4s, v29.s[0] + fmla v4.4s, v9.4s, v28.s[0] + fmla v7.4s, v8.4s, v28.s[0] + fmla v16.4s, v9.4s, v27.s[0] + fmla v17.4s, v8.4s, v27.s[0] + fmla v18.4s, v9.4s, v26.s[0] + fmla v19.4s, v8.4s, v26.s[0] + fmla v20.4s, v9.4s, v25.s[0] + fmla v21.4s, v8.4s, v25.s[0] + fmla v22.4s, v9.4s, v24.s[0] + fmla v23.4s, v8.4s, v24.s[0] + ldp q8, q9, [x10, #-96] + fmla v0.4s, v9.4s, v31.s[1] + fmla v2.4s, v9.4s, v30.s[1] + fmla v1.4s, v8.4s, v31.s[1] + fmla v3.4s, v8.4s, v30.s[1] + fmla v6.4s, v8.4s, v29.s[1] + fmla v5.4s, v9.4s, v29.s[1] + fmla v7.4s, v8.4s, v28.s[1] + fmla v4.4s, v9.4s, v28.s[1] + fmla v17.4s, 
v8.4s, v27.s[1] + fmla v16.4s, v9.4s, v27.s[1] + fmla v19.4s, v8.4s, v26.s[1] + fmla v18.4s, v9.4s, v26.s[1] + fmla v21.4s, v8.4s, v25.s[1] + fmla v20.4s, v9.4s, v25.s[1] + fmla v23.4s, v8.4s, v24.s[1] + fmla v22.4s, v9.4s, v24.s[1] + ldp q9, q8, [x10, #-64] + prfm pldl1keep, [x12] + ldp q11, q10, [x10, #-32] + add x12, x17, x28 + fmla v1.4s, v9.4s, v31.s[2] + fmla v0.4s, v8.4s, v31.s[2] + fmla v2.4s, v8.4s, v30.s[2] + fmla v3.4s, v9.4s, v30.s[2] + fmla v5.4s, v8.4s, v29.s[2] + fmla v6.4s, v9.4s, v29.s[2] + fmla v4.4s, v8.4s, v28.s[2] + fmla v7.4s, v9.4s, v28.s[2] + fmla v16.4s, v8.4s, v27.s[2] + fmla v17.4s, v9.4s, v27.s[2] + fmla v18.4s, v8.4s, v26.s[2] + fmla v19.4s, v9.4s, v26.s[2] + fmla v20.4s, v8.4s, v25.s[2] + fmla v21.4s, v9.4s, v25.s[2] + fmla v22.4s, v8.4s, v24.s[2] + fmla v23.4s, v9.4s, v24.s[2] + ldp q8, q9, [x10], #128 + prfm pldl1keep, [x17] + fmla v0.4s, v10.4s, v31.s[3] + fmla v1.4s, v11.4s, v31.s[3] + ldur q31, [x17, #-16] + prfm pldl1keep, [x12] + add x17, x17, #16 + fmla v3.4s, v11.4s, v30.s[3] + fmla v2.4s, v10.4s, v30.s[3] + ldur q30, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v6.4s, v11.4s, v29.s[3] + fmla v5.4s, v10.4s, v29.s[3] + ldur q29, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v7.4s, v11.4s, v28.s[3] + fmla v4.4s, v10.4s, v28.s[3] + ldur q28, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v17.4s, v11.4s, v27.s[3] + fmla v16.4s, v10.4s, v27.s[3] + ldur q27, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v19.4s, v11.4s, v26.s[3] + fmla v18.4s, v10.4s, v26.s[3] + ldur q26, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v21.4s, v11.4s, v25.s[3] + fmla v20.4s, v10.4s, v25.s[3] + ldur q25, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v23.4s, v11.4s, v24.s[3] + fmla v22.4s, v10.4s, v24.s[3] + ldur q24, [x12, #-16] + cmp x4, x23 + b.lt .LBB0_12 +.LBB0_13: // in Loop: Header=BB0_10 Depth=2 + ldp q11, q10, [x9] + fmla v0.4s, v9.4s, v31.s[0] 
+ fmla v1.4s, v8.4s, v31.s[0] + fmla v2.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v30.s[0] + ldr x17, [sp, #488] // 8-byte Folded Reload + ldr x4, [sp, #504] // 8-byte Folded Reload + fmla v5.4s, v9.4s, v29.s[0] + fmla v6.4s, v8.4s, v29.s[0] + mov x10, x18 + fmla v4.4s, v9.4s, v28.s[0] + fmla v7.4s, v8.4s, v28.s[0] + fmla v16.4s, v9.4s, v27.s[0] + fmla v17.4s, v8.4s, v27.s[0] + fmla v18.4s, v9.4s, v26.s[0] + fmla v19.4s, v8.4s, v26.s[0] + fmla v20.4s, v9.4s, v25.s[0] + fmla v21.4s, v8.4s, v25.s[0] + fmla v22.4s, v9.4s, v24.s[0] + ldp q9, q12, [x0] + fmla v23.4s, v8.4s, v24.s[0] + fmla v1.4s, v11.4s, v31.s[1] + fmla v0.4s, v10.4s, v31.s[1] + fmla v3.4s, v11.4s, v30.s[1] + fmla v2.4s, v10.4s, v30.s[1] + fmla v6.4s, v11.4s, v29.s[1] + fmla v5.4s, v10.4s, v29.s[1] + fmla v7.4s, v11.4s, v28.s[1] + fmla v4.4s, v10.4s, v28.s[1] + fmla v17.4s, v11.4s, v27.s[1] + fmla v16.4s, v10.4s, v27.s[1] + fmla v19.4s, v11.4s, v26.s[1] + fmla v18.4s, v10.4s, v26.s[1] + fmla v21.4s, v11.4s, v25.s[1] + fmla v20.4s, v10.4s, v25.s[1] + fmla v23.4s, v11.4s, v24.s[1] + fmla v22.4s, v10.4s, v24.s[1] + fmla v0.4s, v12.4s, v31.s[2] + ldp q10, q8, [x30] + fmla v1.4s, v9.4s, v31.s[2] + fmla v2.4s, v12.4s, v30.s[2] + fmla v3.4s, v9.4s, v30.s[2] + fmla v5.4s, v12.4s, v29.s[2] + fmla v6.4s, v9.4s, v29.s[2] + fmla v4.4s, v12.4s, v28.s[2] + fmla v7.4s, v9.4s, v28.s[2] + fmla v16.4s, v12.4s, v27.s[2] + fmla v17.4s, v9.4s, v27.s[2] + fmla v18.4s, v12.4s, v26.s[2] + fmla v19.4s, v9.4s, v26.s[2] + fmla v20.4s, v12.4s, v25.s[2] + fmla v21.4s, v9.4s, v25.s[2] + fmla v22.4s, v12.4s, v24.s[2] + fmla v23.4s, v9.4s, v24.s[2] + fmla v1.4s, v10.4s, v31.s[3] + fmla v0.4s, v8.4s, v31.s[3] + fmla v3.4s, v10.4s, v30.s[3] + fmla v2.4s, v8.4s, v30.s[3] + fmla v6.4s, v10.4s, v29.s[3] + fmla v5.4s, v8.4s, v29.s[3] + fmla v7.4s, v10.4s, v28.s[3] + fmla v4.4s, v8.4s, v28.s[3] + fmla v17.4s, v10.4s, v27.s[3] + fmla v16.4s, v8.4s, v27.s[3] + fmla v19.4s, v10.4s, v26.s[3] + fmla v18.4s, v8.4s, v26.s[3] + fmla v21.4s, v10.4s, 
v25.s[3] + fmla v20.4s, v8.4s, v25.s[3] + fmla v23.4s, v10.4s, v24.s[3] + fmla v22.4s, v8.4s, v24.s[3] + cmp x4, x19 + b.ge .LBB0_9 + .p2align 2 +.LBB0_14: // Parent Loop BB0_3 Depth=1 + // Parent Loop BB0_10 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x10, x28 + prfm pldl1keep, [x10] + ldur s24, [x10, #-4] + add x4, x4, #1 + prfm pldl1keep, [x12] + ldur s25, [x12, #-4] + add x12, x12, x28 + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s26, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s27, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s28, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s29, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s30, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s31, [x12, #-4] + prfm pldl1keep, [x17] + ldp q8, q9, [x17, #-32] + add x17, x17, #32 + fmla v0.4s, v9.4s, v24.s[0] + fmla v2.4s, v9.4s, v25.s[0] + fmla v5.4s, v9.4s, v26.s[0] + fmla v4.4s, v9.4s, v27.s[0] + fmla v16.4s, v9.4s, v28.s[0] + fmla v18.4s, v9.4s, v29.s[0] + fmla v20.4s, v9.4s, v30.s[0] + fmla v1.4s, v8.4s, v24.s[0] + fmla v3.4s, v8.4s, v25.s[0] + fmla v6.4s, v8.4s, v26.s[0] + fmla v7.4s, v8.4s, v27.s[0] + fmla v17.4s, v8.4s, v28.s[0] + fmla v19.4s, v8.4s, v29.s[0] + fmla v21.4s, v8.4s, v30.s[0] + fmla v23.4s, v8.4s, v31.s[0] + fmla v22.4s, v9.4s, v31.s[0] + cmp x4, x19 + b.lt .LBB0_14 + b .LBB0_9 + .p2align 2 +.LBB0_15: // in Loop: Header=BB0_3 Depth=1 + ldp x17, x24, [sp, #336] // 16-byte Folded Reload + ldr x20, [sp, #312] // 8-byte Folded Reload + cmp x11, x17 + ldp x25, x6, [sp, #384] // 16-byte Folded Reload + ldr x29, [sp, #200] // 8-byte Folded Reload + b.lt .LBB0_18 +// %bb.16: // in Loop: Header=BB0_3 Depth=1 + cmp x17, x20 + b.lt .LBB0_23 +.LBB0_17: // in Loop: Header=BB0_3 Depth=1 + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x1, [sp, #368] // 8-byte Folded Reload + cmp x20, x10 + b.ge .LBB0_2 + b .LBB0_28 + .p2align 2 +.LBB0_18: // in Loop: Header=BB0_3 
Depth=1 + ldr x15, [sp, #520] // 8-byte Folded Reload + ldp q20, q21, [x8] + mov x10, xzr + add x12, x15, #1 + add x13, x15, #2 + mul x11, x15, x27 + add x14, x15, #3 + mul x15, x15, x22 + madd x18, x12, x27, x7 + mul x12, x12, x22 + madd x1, x13, x27, x7 + lsl x3, x15, #2 + add x11, x11, x7 + lsl x12, x12, #2 + add x15, x2, x18, lsl #2 + madd x18, x14, x27, x7 + add x17, x2, x11, lsl #2 + add x11, x2, x1, lsl #2 + ldr x1, [sp, #184] // 8-byte Folded Reload + ldr q18, [x16, x3] + ldr q19, [x16, x12] + mul x12, x13, x22 + mov x13, x5 + ldp q3, q0, [x17] + ldp q4, q1, [x15] + ldp q5, q2, [x11] + add x18, x2, x18, lsl #2 + lsl x12, x12, #2 + ldr q17, [x16, x12] + mul x12, x14, x22 + ldp q7, q6, [x18] + lsl x12, x12, #2 + ldr q16, [x16, x12] + cmp xzr, x23 + b.ge .LBB0_20 + .p2align 2 +.LBB0_19: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x13, #32 + fmla v3.4s, v20.4s, v18.s[0] + fmla v0.4s, v21.4s, v18.s[0] + add x10, x10, #4 + prfm pldl1keep, [x12] + ldp q22, q23, [x13, #-96] + fmla v1.4s, v21.4s, v19.s[0] + fmla v4.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + add x12, x13, #96 + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x13, #-64] + prfm pldl1keep, [x12] + add x12, x1, x28 + add x14, x12, x28 + fmla v0.4s, v23.4s, v18.s[1] + fmla v1.4s, v23.4s, v19.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v18.s[1] + fmla v4.4s, v22.4s, v19.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v0.4s, v20.4s, v18.s[2] + ldp q22, q23, [x13, #-32] + fmla v1.4s, v20.4s, v19.s[2] + fmla v2.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v18.s[2] + fmla v4.4s, v21.4s, v19.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x13], #128 + prfm pldl1keep, [x1] + fmla v0.4s, v23.4s, v18.s[3] + fmla v1.4s, v23.4s, v19.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla 
v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v18.s[3] + ldur q18, [x1, #-16] + prfm pldl1keep, [x12] + fmla v4.4s, v22.4s, v19.s[3] + ldur q19, [x12, #-16] + add x12, x14, x28 + prfm pldl1keep, [x14] + add x1, x1, #16 + fmla v5.4s, v22.4s, v17.s[3] + ldur q17, [x14, #-16] + prfm pldl1keep, [x12] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x12, #-16] + cmp x10, x23 + b.lt .LBB0_19 +.LBB0_20: // in Loop: Header=BB0_3 Depth=1 + ldp q23, q22, [x9] + fmla v0.4s, v21.4s, v18.s[0] + fmla v3.4s, v20.4s, v18.s[0] + fmla v1.4s, v21.4s, v19.s[0] + fmla v4.4s, v20.4s, v19.s[0] + ldr x10, [sp, #176] // 8-byte Folded Reload + ldr x13, [sp, #488] // 8-byte Folded Reload + fmla v2.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + ldr x1, [sp, #504] // 8-byte Folded Reload + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x0] + fmla v0.4s, v22.4s, v18.s[1] + fmla v1.4s, v22.4s, v19.s[1] + fmla v2.4s, v22.4s, v17.s[1] + fmla v6.4s, v22.4s, v16.s[1] + fmla v3.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v19.s[1] + fmla v5.4s, v23.4s, v17.s[1] + fmla v7.4s, v23.4s, v16.s[1] + fmla v0.4s, v21.4s, v18.s[2] + ldp q23, q22, [x30] + fmla v1.4s, v21.4s, v19.s[2] + fmla v2.4s, v21.4s, v17.s[2] + fmla v6.4s, v21.4s, v16.s[2] + fmla v3.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v19.s[2] + fmla v5.4s, v20.4s, v17.s[2] + fmla v7.4s, v20.4s, v16.s[2] + fmla v0.4s, v22.4s, v18.s[3] + fmla v1.4s, v22.4s, v19.s[3] + fmla v2.4s, v22.4s, v17.s[3] + fmla v6.4s, v22.4s, v16.s[3] + fmla v3.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v19.s[3] + fmla v5.4s, v23.4s, v17.s[3] + fmla v7.4s, v23.4s, v16.s[3] + cmp x1, x19 + b.ge .LBB0_22 + .p2align 2 +.LBB0_21: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x10, x28 + prfm pldl1keep, [x10] + ldur s16, [x10, #-4] + add x1, x1, #1 + prfm pldl1keep, [x12] + ldur s17, [x12, #-4] + add x12, x12, x28 + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s18, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, 
[x12] + ldur s19, [x12, #-4] + prfm pldl1keep, [x13] + ldp q20, q21, [x13, #-32] + add x13, x13, #32 + fmla v0.4s, v21.4s, v16.s[0] + fmla v1.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, v18.s[0] + fmla v3.4s, v20.4s, v16.s[0] + fmla v4.4s, v20.4s, v17.s[0] + fmla v5.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + fmla v6.4s, v21.4s, v19.s[0] + cmp x1, x19 + b.lt .LBB0_21 +.LBB0_22: // in Loop: Header=BB0_3 Depth=1 + stp q3, q0, [x17] + ldr x17, [sp, #336] // 8-byte Folded Reload + stp q4, q1, [x15] + stp q5, q2, [x11] + stp q7, q6, [x18] + cmp x17, x20 + b.ge .LBB0_17 +.LBB0_23: // in Loop: Header=BB0_3 Depth=1 + mul x10, x17, x27 + add x12, x17, #1 + ldp q6, q7, [x8] + madd x11, x12, x27, x7 + ldr x18, [sp, #168] // 8-byte Folded Reload + mov x13, xzr + mov x15, x5 + mul x14, x17, x22 + ldr x17, [sp, #248] // 8-byte Folded Reload + add x10, x10, x7 + mul x12, x12, x22 + lsl x14, x14, #2 + add x10, x2, x10, lsl #2 + add x11, x2, x11, lsl #2 + lsl x12, x12, #2 + ldr q5, [x16, x14] + ldr q4, [x16, x12] + ldp q1, q0, [x10] + ldp q3, q2, [x11] + cmp xzr, x23 + b.ge .LBB0_25 + .p2align 2 +.LBB0_24: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x6, x15, #32 + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + add x4, x15, #96 + prfm pldl1keep, [x6] + ldp q16, q17, [x15, #-96] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x15, #-64] + prfm pldl1keep, [x4] + add x12, x18, x24 + add x1, x17, x24 + add x14, x12, #32 + add x3, x1, #32 + add x13, x13, #4 + add x18, x18, #16 + add x17, x17, #16 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v6.4s, v5.s[2] + ldp q16, q17, [x15, #-32] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + ldp q6, q7, [x15], #128 + prfm pldl1keep, [x3] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + fmla v1.4s, v16.4s, v5.s[3] + ldr q5, [x1, #16] + 
prfm pldl1keep, [x14] + fmla v3.4s, v16.4s, v4.s[3] + ldr q4, [x12, #16] + cmp x13, x23 + b.lt .LBB0_24 +.LBB0_25: // in Loop: Header=BB0_3 Depth=1 + ldp q17, q16, [x9] + fmla v0.4s, v7.4s, v5.s[0] + fmla v1.4s, v6.4s, v5.s[0] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q6, q7, [x0] + ldr x12, [sp, #504] // 8-byte Folded Reload + ldr x6, [sp, #392] // 8-byte Folded Reload + mov x13, xzr + mov x15, xzr + fmla v0.4s, v16.4s, v5.s[1] + fmla v2.4s, v16.4s, v4.s[1] + fmla v1.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldp q17, q16, [x30] + fmla v0.4s, v7.4s, v5.s[2] + fmla v2.4s, v7.4s, v4.s[2] + fmla v1.4s, v6.4s, v5.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v0.4s, v16.4s, v5.s[3] + fmla v2.4s, v16.4s, v4.s[3] + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + cmp x12, x19 + b.ge .LBB0_27 + .p2align 2 +.LBB0_26: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + ldp x3, x1, [sp, #472] // 16-byte Folded Reload + add x14, x29, x15, lsl #3 + add x12, x12, #1 + add x14, x14, #32 + add x17, x1, x15 + add x18, x3, x15 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s4, [x3, x15] + prfm pldl1keep, [x17] + ldr s5, [x1, x15] + add x17, x29, x13 + prfm pldl1keep, [x14] + add x15, x15, #4 + add x13, x13, #32 + ldp q6, q7, [x17] + fmla v0.4s, v7.4s, v4.s[0] + fmla v1.4s, v6.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + cmp x12, x19 + b.lt .LBB0_26 +.LBB0_27: // in Loop: Header=BB0_3 Depth=1 + stp q1, q0, [x10] + stp q3, q2, [x11] + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x1, [sp, #368] // 8-byte Folded Reload + cmp x20, x10 + b.ge .LBB0_2 +.LBB0_28: // in Loop: Header=BB0_3 Depth=1 + mul x10, x20, x27 + ldp q4, q3, [x8] + ldr x13, [sp, #136] // 8-byte Folded Reload + mul x12, x20, x22 + mov x11, xzr + add x10, x10, x7 + lsl x12, x12, #2 + add x10, x2, x10, lsl #2 + ldr q2, [x16, x12] + mov x12, x5 + ldp q1, q0, [x10] + cmp xzr, x23 + b.ge .LBB0_30 + .p2align 2 
+.LBB0_29: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x15, x12, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x14, x12, #96 + prfm pldl1keep, [x15] + ldp q5, q6, [x12, #-96] + add x11, x11, #4 + ldp q4, q3, [x12, #-64] + prfm pldl1keep, [x14] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x12, #-32] + prfm pldl1keep, [x13] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x13, #-16] + ldp q4, q3, [x12], #128 + add x13, x13, #16 + cmp x11, x23 + b.lt .LBB0_29 +.LBB0_30: // in Loop: Header=BB0_3 Depth=1 + ldp q6, q5, [x9] + fmla v0.4s, v3.4s, v2.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldp q3, q4, [x0] + ldp x9, x11, [sp, #152] // 16-byte Folded Reload + ldr x12, [sp, #504] // 8-byte Folded Reload + ldr x15, [sp, #144] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v2.s[1] + fmla v1.4s, v6.4s, v2.s[1] + ldp q6, q5, [x30] + fmla v0.4s, v4.4s, v2.s[2] + fmla v1.4s, v3.4s, v2.s[2] + fmla v0.4s, v5.4s, v2.s[3] + fmla v1.4s, v6.4s, v2.s[3] + cmp x12, x19 + b.ge .LBB0_1 + .p2align 2 +.LBB0_31: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x14, x8, x12, lsl #5 + add x13, x8, x11 + prfm pldl1keep, [x9] + add x11, x11, #32 + ldr s2, [x15, x12, lsl #2] + prfm pldl1keep, [x13] + add x12, x12, #1 + ldp q3, q4, [x14] + add x9, x9, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v3.4s, v2.s[0] + cmp x12, x19 + b.lt .LBB0_31 + b .LBB0_1 +.LBB0_32: + ldr x0, [sp, #56] // 8-byte Folded Reload + bl free + ldr x9, [sp, #112] // 8-byte Folded Reload + lsl x20, x22, #3 + str x20, [sp, #472] // 8-byte Folded Spill + add x8, x9, #3 + cmp x9, #0 + csel x8, x8, x9, lt + ldr x9, [sp, #128] // 8-byte Folded Reload + asr x8, x8, #2 + cmp x9, #0 + cinv x29, x8, lt + lsl x8, x29, #2 + str x8, [sp, #488] // 8-byte Folded Spill + cmp x25, x8 + ldp x9, x8, [sp, #256] // 16-byte Folded Reload + add x24, x8, x9, lsl #2 + b.ge 
.LBB0_63 +// %bb.33: + lsl x8, x19, #4 + str x29, [sp, #480] // 8-byte Folded Spill + add x0, x8, #64 + str x8, [sp, #464] // 8-byte Folded Spill + bl malloc + add x8, x25, x27, lsl #1 + add x10, x27, x25 + ldp x6, x5, [sp, #296] // 16-byte Folded Reload + lsl x10, x10, #2 + add x13, x25, x27, lsl #2 + add x11, x0, #63 + ldr x18, [sp, #328] // 8-byte Folded Reload + ldr q2, [x24, x10] + lsl x10, x8, #2 + add x8, x8, x27 + ldr q1, [x24, x10] + lsl x10, x13, #2 + lsl x8, x8, #2 + ldr q3, [x24, x10] + mov w10, #6 // =0x6 + ldr x1, [sp, #344] // 8-byte Folded Reload + ldr q4, [x24, x8] + mul x8, x27, x10 + mov w4, #12 // =0xc + add x13, x13, x27 + lsl x10, x13, #2 + lsl x9, x25, #2 + mov w16, #20 // =0x14 + ldr q5, [x24, x10] + add x10, x6, x21 + mov w15, #24 // =0x18 + ldr q0, [x24, x9] + ldr q23, [x10, x9] + add x8, x8, x25 + add x9, x18, x1 + mul x13, x22, x15 + ldr x7, [sp, #64] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr q22, [x9, x13] + mov w13, #28 // =0x1c + ldr q6, [x24, x8] + sub x8, x25, x27 + ldr q16, [x9] + add x8, x8, x27, lsl #3 + ldr q17, [x9, x28] + ldr x29, [sp, #504] // 8-byte Folded Reload + ldr q18, [x9, x20] + ldr q20, [x9, x22, lsl #4] + lsl x8, x8, #2 + ldr x30, [sp, #104] // 8-byte Folded Reload + mov x12, xzr + ldr q7, [x24, x8] + and x8, x11, #0xffffffffffffffc0 + mul x11, x22, x4 + orr x3, x8, #0x20 + ldr q19, [x9, x11] + mul x11, x22, x16 + ldr q21, [x9, x11] + ldr x11, [sp, #16] // 8-byte Folded Reload + lsl x11, x11, #5 + madd x17, x5, x13, x11 + add x14, x11, x5, lsl #5 + madd x15, x5, x15, x11 + madd x16, x5, x16, x11 + madd x4, x5, x4, x11 + add x2, x11, x5, lsl #3 + add x13, x6, x14 + add x2, x6, x2 + add x14, x6, x17 + add x17, x1, x18 + add x1, x11, x5, lsl #2 + add x18, x11, x26 + mov w5, #16 // =0x10 + add x15, x6, x15 + add x16, x6, x16 + add x4, x6, x4 + add x17, x7, x17 + add x18, x6, x18 + sub x5, x5, x7 + add x17, x17, #16 + add x1, x6, x1 + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v0.4s, v23.4s, v16.s[0] + 
fmla v2.4s, v23.4s, v17.s[0] + cmp xzr, x23 + b.ge .LBB0_35 + .p2align 2 +.LBB0_34: // =>This Inner Loop Header: Depth=1 + add x6, x16, x21 + stur q23, [x3, #-32] + fmla v1.4s, v23.4s, v18.s[0] + fmla v4.4s, v23.4s, v19.s[0] + prfm pldl1keep, [x6] + ldr q25, [x1, x21] + fmla v3.4s, v23.4s, v20.s[0] + fmla v5.4s, v23.4s, v21.s[0] + fmla v6.4s, v23.4s, v22.s[0] + fmla v7.4s, v23.4s, v24.s[0] + add x6, x15, x21 + add x7, x17, x5 + add x20, x7, x28 + add x25, x20, x28 + add x12, x12, #4 + add x15, x15, x26 + add x16, x16, x26 + add x17, x17, #16 + add x1, x1, x26 + stur q25, [x3, #-16] + prfm pldl1keep, [x6] + ldr q23, [x2, x21] + fmla v0.4s, v25.4s, v16.s[1] + fmla v2.4s, v25.4s, v17.s[1] + fmla v1.4s, v25.4s, v18.s[1] + fmla v4.4s, v25.4s, v19.s[1] + fmla v3.4s, v25.4s, v20.s[1] + fmla v5.4s, v25.4s, v21.s[1] + fmla v6.4s, v25.4s, v22.s[1] + fmla v7.4s, v25.4s, v24.s[1] + add x6, x14, x21 + add x14, x14, x26 + add x2, x2, x26 + fmla v0.4s, v23.4s, v16.s[2] + fmla v2.4s, v23.4s, v17.s[2] + fmla v1.4s, v23.4s, v18.s[2] + fmla v4.4s, v23.4s, v19.s[2] + fmla v3.4s, v23.4s, v20.s[2] + fmla v5.4s, v23.4s, v21.s[2] + fmla v6.4s, v23.4s, v22.s[2] + fmla v7.4s, v23.4s, v24.s[2] + str q23, [x3] + prfm pldl1keep, [x6] + ldr q23, [x4, x21] + add x6, x13, x21 + add x13, x13, x26 + add x4, x4, x26 + str q23, [x3, #16] + prfm pldl1keep, [x6] + add x6, x25, x28 + fmla v0.4s, v23.4s, v16.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v1.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v19.s[3] + fmla v3.4s, v23.4s, v20.s[3] + fmla v5.4s, v23.4s, v21.s[3] + fmla v6.4s, v23.4s, v22.s[3] + fmla v7.4s, v23.4s, v24.s[3] + ldr q23, [x18, x21] + prfm pldl1keep, [x7] + ldur q16, [x7, #-16] + prfm pldl1keep, [x20] + ldur q17, [x20, #-16] + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + ldr x25, [sp, #384] // 8-byte Folded Reload + add x18, x18, x26 + add x3, x3, #64 + prfm pldl1keep, [x6] + ldur q19, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q20, [x6, #-16] + add x6, x6, x28 + prfm 
pldl1keep, [x6] + ldur q21, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q22, [x6, #-16] + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v0.4s, v23.4s, v16.s[0] + fmla v2.4s, v23.4s, v17.s[0] + cmp x12, x23 + b.lt .LBB0_34 +.LBB0_35: + ldp x13, x14, [sp, #400] // 16-byte Folded Reload + ldr x15, [sp, #304] // 8-byte Folded Reload + fmla v1.4s, v23.4s, v18.s[0] + str q23, [x8, x23, lsl #4] + fmla v4.4s, v23.4s, v19.s[0] + fmla v3.4s, v23.4s, v20.s[0] + fmla v5.4s, v23.4s, v21.s[0] + fmla v6.4s, v23.4s, v22.s[0] + fmla v7.4s, v23.4s, v24.s[0] + ldr x7, [sp, #520] // 8-byte Folded Reload + mul x12, x13, x15 + add x12, x12, x25 + lsl x12, x12, #2 + ldr q23, [x10, x12] + mul x12, x14, x15 + add x12, x12, x25 + lsl x12, x12, #2 + str q23, [x8, x13, lsl #4] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v0.4s, v23.4s, v16.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v1.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v19.s[1] + fmla v3.4s, v23.4s, v20.s[1] + fmla v5.4s, v23.4s, v21.s[1] + fmla v6.4s, v23.4s, v22.s[1] + fmla v7.4s, v23.4s, v24.s[1] + ldr q23, [x10, x12] + madd x12, x13, x15, x25 + fmla v0.4s, v23.4s, v16.s[2] + str q23, [x8, x14, lsl #4] + fmla v2.4s, v23.4s, v17.s[2] + fmla v1.4s, v23.4s, v18.s[2] + fmla v4.4s, v23.4s, v19.s[2] + fmla v3.4s, v23.4s, v20.s[2] + fmla v5.4s, v23.4s, v21.s[2] + fmla v6.4s, v23.4s, v22.s[2] + fmla v7.4s, v23.4s, v24.s[2] + mov x14, x29 + lsl x12, x12, #2 + ldr q23, [x10, x12] + ldr x10, [sp, #72] // 8-byte Folded Reload + add x12, x10, #4 + ldp x17, x10, [sp, #272] // 16-byte Folded Reload + str q23, [x8, x13, lsl #4] + ldr x13, [sp, #120] // 8-byte Folded Reload + fmla v0.4s, v23.4s, v16.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v1.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v19.s[3] + fmla v3.4s, v23.4s, v20.s[3] + fmla v5.4s, v23.4s, v21.s[3] + fmla v6.4s, v23.4s, v22.s[3] + fmla v7.4s, v23.4s, v24.s[3] + add x10, x11, x10, lsl #2 + ldr x11, [sp, #296] // 8-byte Folded Reload + add x10, x11, x10 + ldr 
x11, [sp, #320] // 8-byte Folded Reload + add x11, x13, x11, lsl #2 + ldr x13, [sp, #80] // 8-byte Folded Reload + sub x11, x11, x13 + ldr x13, [sp, #328] // 8-byte Folded Reload + add x13, x11, x13 + mul x11, x15, x12 + add x12, x13, #4 + ldr x13, [sp, #96] // 8-byte Folded Reload + lsl x13, x13, #2 + cmp x29, x19 + b.ge .LBB0_37 + .p2align 2 +.LBB0_36: // =>This Inner Loop Header: Depth=1 + add x16, x12, x28 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x15, x10, x11 + prfm pldl1keep, [x16] + ldur s17, [x16, #-4] + add x16, x16, x28 + add x12, x12, #4 + prfm pldl1keep, [x16] + ldur s18, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s19, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s20, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s21, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s22, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s23, [x16, #-4] + prfm pldl1keep, [x15] + ldr q24, [x10, x13] + add x10, x10, x17 + fmla v0.4s, v24.4s, v16.s[0] + str q24, [x8, x14, lsl #4] + add x14, x14, #1 + fmla v2.4s, v24.4s, v17.s[0] + fmla v1.4s, v24.4s, v18.s[0] + fmla v4.4s, v24.4s, v19.s[0] + fmla v3.4s, v24.4s, v20.s[0] + fmla v5.4s, v24.4s, v21.s[0] + fmla v6.4s, v24.4s, v22.s[0] + fmla v7.4s, v24.4s, v23.s[0] + cmp x14, x19 + b.lt .LBB0_36 +.LBB0_37: // %.preheader27 + ldr x10, [sp, #344] // 8-byte Folded Reload + ldr x11, [sp, #496] // 8-byte Folded Reload + mov x6, xzr + mov w4, #7 // =0x7 + ldr x15, [sp, #328] // 8-byte Folded Reload + ldr x12, [sp, #88] // 8-byte Folded Reload + mov w3, #6 // =0x6 + mov w16, #5 // =0x5 + mov w1, #4 // =0x4 + mov w17, #3 // =0x3 + mov w18, #2 // =0x2 + mov w2, #1 // =0x1 + add x13, x11, x10 + sub x10, x8, x30, lsl #4 + add x14, x12, x15 + add x11, x8, #48 + mov w12, #8 // =0x8 + add x15, x13, x15 + add x13, x14, #4 + add x10, x10, x19, lsl #4 + add x14, x15, #32 + add x15, x10, #16 + b .LBB0_39 + .p2align 2 +.LBB0_38: // %.loopexit26 + // in Loop: 
Header=BB0_39 Depth=1 + ldr x6, [sp, #496] // 8-byte Folded Reload + ldr x7, [sp, #520] // 8-byte Folded Reload + add x14, x14, x6 + ldr x25, [sp, #384] // 8-byte Folded Reload + add x13, x13, x6 + mov x6, x12 + mov x12, x5 +.LBB0_39: // =>This Loop Header: Depth=1 + // Child Loop BB0_41 Depth 2 + // Child Loop BB0_43 Depth 2 + madd x5, x6, x27, x25 + cmp x12, x7 + lsl x5, x5, #2 + madd x2, x2, x27, x25 + madd x18, x18, x27, x25 + madd x17, x17, x27, x25 + madd x1, x1, x27, x25 + lsl x2, x2, #2 + lsl x18, x18, #2 + lsl x17, x17, #2 + lsl x1, x1, #2 + madd x16, x16, x27, x25 + lsl x16, x16, #2 + str q0, [x24, x5] + str q2, [x24, x2] + str q1, [x24, x18] + str q4, [x24, x17] + str q3, [x24, x1] + str q5, [x24, x16] + madd x16, x3, x27, x25 + lsl x16, x16, #2 + str q6, [x24, x16] + madd x16, x4, x27, x25 + lsl x16, x16, #2 + str q7, [x24, x16] + b.ge .LBB0_44 +// %bb.40: // in Loop: Header=BB0_39 Depth=1 + add x17, x12, #3 + add x2, x12, #1 + add x18, x12, #2 + mul x3, x12, x27 + mul x7, x17, x27 + add x1, x12, #4 + add x16, x12, #5 + ldr q24, [x8] + mul x4, x2, x27 + mov x6, xzr + add x3, x3, x25 + mul x5, x18, x27 + mul x20, x1, x27 + add x7, x7, x25 + lsl x3, x3, #2 + add x4, x4, x25 + add x5, x5, x25 + add x20, x20, x25 + lsl x7, x7, #2 + lsl x4, x4, #2 + ldr q0, [x24, x3] + mul x3, x16, x27 + lsl x5, x5, #2 + lsl x20, x20, #2 + ldr q4, [x24, x7] + mul x7, x12, x22 + ldr q2, [x24, x4] + ldr q1, [x24, x5] + ldr q3, [x24, x20] + mov x20, x14 + add x3, x3, x25 + lsl x7, x7, #2 + lsl x3, x3, #2 + ldr q23, [x9, x7] + mul x7, x2, x22 + ldr q5, [x24, x3] + add x3, x12, #6 + mul x4, x3, x27 + lsl x7, x7, #2 + ldr q22, [x9, x7] + mul x7, x18, x22 + add x4, x4, x25 + lsl x4, x4, #2 + lsl x7, x7, #2 + ldr q6, [x24, x4] + add x4, x12, #7 + mul x5, x4, x27 + ldr q21, [x9, x7] + mul x7, x17, x22 + add x5, x5, x25 + lsl x5, x5, #2 + lsl x7, x7, #2 + ldr q7, [x24, x5] + add x5, x12, #8 + ldr q20, [x9, x7] + mul x7, x1, x22 + lsl x7, x7, #2 + ldr q19, [x9, x7] + mul x7, x16, x22 + 
lsl x7, x7, #2 + ldr q18, [x9, x7] + mul x7, x3, x22 + lsl x7, x7, #2 + ldr q17, [x9, x7] + mul x7, x4, x22 + lsl x7, x7, #2 + ldr q16, [x9, x7] + mov x7, x11 + cmp xzr, x23 + b.ge .LBB0_42 + .p2align 2 +.LBB0_41: // Parent Loop BB0_39 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x7, #32 + fmla v0.4s, v24.4s, v23.s[0] + fmla v2.4s, v24.4s, v22.s[0] + add x6, x6, #4 + fmla v1.4s, v24.4s, v21.s[0] + fmla v4.4s, v24.4s, v20.s[0] + prfm pldl1keep, [x25] + add x25, x20, x28 + fmla v3.4s, v24.4s, v19.s[0] + fmla v5.4s, v24.4s, v18.s[0] + fmla v6.4s, v24.4s, v17.s[0] + fmla v7.4s, v24.4s, v16.s[0] + ldp q24, q25, [x7, #-32] + fmla v0.4s, v24.4s, v23.s[1] + fmla v2.4s, v24.4s, v22.s[1] + fmla v1.4s, v24.4s, v21.s[1] + fmla v4.4s, v24.4s, v20.s[1] + fmla v3.4s, v24.4s, v19.s[1] + fmla v5.4s, v24.4s, v18.s[1] + fmla v6.4s, v24.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v0.4s, v25.4s, v23.s[2] + fmla v2.4s, v25.4s, v22.s[2] + ldp q26, q24, [x7], #64 + fmla v1.4s, v25.4s, v21.s[2] + fmla v4.4s, v25.4s, v20.s[2] + fmla v3.4s, v25.4s, v19.s[2] + prfm pldl1keep, [x20] + fmla v5.4s, v25.4s, v18.s[2] + fmla v6.4s, v25.4s, v17.s[2] + fmla v7.4s, v25.4s, v16.s[2] + fmla v0.4s, v26.4s, v23.s[3] + ldur q23, [x20, #-16] + prfm pldl1keep, [x25] + fmla v2.4s, v26.4s, v22.s[3] + ldur q22, [x25, #-16] + add x25, x25, x28 + fmla v1.4s, v26.4s, v21.s[3] + fmla v4.4s, v26.4s, v20.s[3] + fmla v3.4s, v26.4s, v19.s[3] + fmla v5.4s, v26.4s, v18.s[3] + add x20, x20, #16 + prfm pldl1keep, [x25] + ldur q21, [x25, #-16] + add x25, x25, x28 + fmla v6.4s, v26.4s, v17.s[3] + fmla v7.4s, v26.4s, v16.s[3] + prfm pldl1keep, [x25] + ldur q20, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q19, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q17, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q16, [x25, #-16] + cmp x6, x23 + b.lt .LBB0_41 +.LBB0_42: // in Loop: Header=BB0_39 
Depth=1 + ldp x7, x6, [sp, #400] // 16-byte Folded Reload + fmla v0.4s, v24.4s, v23.s[0] + fmla v2.4s, v24.4s, v22.s[0] + fmla v1.4s, v24.4s, v21.s[0] + fmla v4.4s, v24.4s, v20.s[0] + mov x20, x29 + fmla v3.4s, v24.4s, v19.s[0] + fmla v5.4s, v24.4s, v18.s[0] + ldr q25, [x8, x7, lsl #4] + fmla v6.4s, v24.4s, v17.s[0] + fmla v7.4s, v24.4s, v16.s[0] + ldr q24, [x8, x6, lsl #4] + ldr x6, [sp, #416] // 8-byte Folded Reload + mov x7, x15 + ldr q26, [x8, x6, lsl #4] + mov x6, x13 + fmla v0.4s, v25.4s, v23.s[1] + fmla v2.4s, v25.4s, v22.s[1] + fmla v1.4s, v25.4s, v21.s[1] + fmla v4.4s, v25.4s, v20.s[1] + fmla v3.4s, v25.4s, v19.s[1] + fmla v5.4s, v25.4s, v18.s[1] + fmla v6.4s, v25.4s, v17.s[1] + fmla v7.4s, v25.4s, v16.s[1] + fmla v0.4s, v24.4s, v23.s[2] + fmla v2.4s, v24.4s, v22.s[2] + fmla v1.4s, v24.4s, v21.s[2] + fmla v4.4s, v24.4s, v20.s[2] + fmla v3.4s, v24.4s, v19.s[2] + fmla v5.4s, v24.4s, v18.s[2] + fmla v6.4s, v24.4s, v17.s[2] + fmla v7.4s, v24.4s, v16.s[2] + fmla v0.4s, v26.4s, v23.s[3] + fmla v2.4s, v26.4s, v22.s[3] + fmla v1.4s, v26.4s, v21.s[3] + fmla v4.4s, v26.4s, v20.s[3] + fmla v3.4s, v26.4s, v19.s[3] + fmla v5.4s, v26.4s, v18.s[3] + fmla v6.4s, v26.4s, v17.s[3] + fmla v7.4s, v26.4s, v16.s[3] + cmp x29, x19 + b.ge .LBB0_38 + .p2align 2 +.LBB0_43: // Parent Loop BB0_39 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x6, x28 + prfm pldl1keep, [x6] + ldur s16, [x6, #-4] + add x20, x20, #1 + prfm pldl1keep, [x25] + ldur s17, [x25, #-4] + add x25, x25, x28 + add x6, x6, #4 + prfm pldl1keep, [x25] + ldur s18, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s19, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s20, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s21, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s22, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s23, [x25, #-4] + prfm pldl1keep, [x7] + ldur q24, [x7, #-16] + add x7, x7, #16 + fmla v0.4s, v24.4s, v16.s[0] + fmla 
v2.4s, v24.4s, v17.s[0] + fmla v1.4s, v24.4s, v18.s[0] + fmla v4.4s, v24.4s, v19.s[0] + fmla v3.4s, v24.4s, v20.s[0] + fmla v5.4s, v24.4s, v21.s[0] + fmla v6.4s, v24.4s, v22.s[0] + fmla v7.4s, v24.4s, v23.s[0] + cmp x20, x19 + b.lt .LBB0_43 + b .LBB0_38 +.LBB0_44: + ldr x13, [sp, #336] // 8-byte Folded Reload + cmp x7, x13 + b.lt .LBB0_47 +// %bb.45: + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.lt .LBB0_52 +.LBB0_46: + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.lt .LBB0_57 + b .LBB0_62 +.LBB0_47: + add x18, x7, #1 + add x1, x7, #2 + add x2, x7, #3 + mul x11, x7, x27 + mul x12, x18, x27 + mov x16, xzr + add x11, x11, x25 + mul x18, x18, x22 + mul x13, x1, x27 + mul x14, x2, x27 + lsl x18, x18, #2 + mul x15, x7, x22 + add x12, x12, x25 + add x13, x13, x25 + add x14, x14, x25 + lsl x17, x15, #2 + ldr q5, [x9, x17] + mov x17, x8 + add x11, x24, x11, lsl #2 + ldr q7, [x9, x18] + ldr q16, [x17], #48 + ldr q0, [x11] + mul x18, x1, x22 + lsl x18, x18, #2 + add x12, x24, x12, lsl #2 + add x13, x24, x13, lsl #2 + add x14, x24, x14, lsl #2 + ldr q1, [x12] + ldr q2, [x13] + ldr q3, [x14] + ldr q6, [x9, x18] + mul x18, x2, x22 + ldp x2, x1, [sp, #320] // 16-byte Folded Reload + lsl x18, x18, #2 + ldr q4, [x9, x18] + ldr x18, [sp, #32] // 8-byte Folded Reload + lsl x18, x18, #5 + add x18, x18, x2, lsl #2 + add x18, x18, x1 + add x18, x18, #32 + cmp xzr, x23 + b.ge .LBB0_49 + .p2align 2 +.LBB0_48: // =>This Inner Loop Header: Depth=1 + add x1, x17, #32 + fmla v0.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v7.s[0] + add x16, x16, #4 + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x1] + add x1, x18, x28 + ldp q16, q17, [x17, #-32] + fmla v0.4s, v16.4s, v5.s[1] + fmla v1.4s, v16.4s, v7.s[1] + fmla v2.4s, v16.4s, v6.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v5.s[2] + fmla v1.4s, v17.4s, v7.s[2] + fmla v2.4s, v17.4s, v6.s[2] + fmla v3.4s, v17.4s, v4.s[2] + 
ldp q17, q16, [x17], #64 + prfm pldl1keep, [x18] + fmla v0.4s, v17.4s, v5.s[3] + ldur q5, [x18, #-16] + prfm pldl1keep, [x1] + fmla v1.4s, v17.4s, v7.s[3] + ldur q7, [x1, #-16] + add x1, x1, x28 + fmla v2.4s, v17.4s, v6.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x18, x18, #16 + prfm pldl1keep, [x1] + ldur q6, [x1, #-16] + add x1, x1, x28 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + cmp x16, x23 + b.lt .LBB0_48 +.LBB0_49: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v0.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v4.s[0] + add x15, x19, x15 + sub x15, x15, x30 + add x10, x10, #16 + ldr q17, [x8, x17, lsl #4] + fmla v0.4s, v17.4s, v5.s[1] + ldr q16, [x8, x16, lsl #4] + ldr x16, [sp, #416] // 8-byte Folded Reload + fmla v1.4s, v17.4s, v7.s[1] + fmla v2.4s, v17.4s, v6.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldr q18, [x8, x16, lsl #4] + ldr x16, [sp, #344] // 8-byte Folded Reload + add x15, x16, x15, lsl #2 + ldr x16, [sp, #328] // 8-byte Folded Reload + fmla v0.4s, v16.4s, v5.s[2] + fmla v1.4s, v16.4s, v7.s[2] + fmla v2.4s, v16.4s, v6.s[2] + fmla v3.4s, v16.4s, v4.s[2] + add x15, x15, x16 + mov x16, x29 + add x15, x15, #4 + fmla v0.4s, v18.4s, v5.s[3] + fmla v1.4s, v18.4s, v7.s[3] + fmla v2.4s, v18.4s, v6.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x29, x19 + b.ge .LBB0_51 + .p2align 2 +.LBB0_50: // =>This Inner Loop Header: Depth=1 + add x17, x15, x28 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x16, x16, #1 + prfm pldl1keep, [x17] + ldur s5, [x17, #-4] + add x17, x17, x28 + add x15, x15, #4 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x28 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x10] + ldur q16, [x10, #-16] + add x10, x10, #16 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x16, x19 + b.lt .LBB0_50 +.LBB0_51: + str q0, [x11] + str q1, [x12] + str q2, [x13] + ldr x13, [sp, #336] 
// 8-byte Folded Reload + str q3, [x14] + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.ge .LBB0_46 +.LBB0_52: + mul x14, x13, x22 + add x12, x13, #1 + ldr x18, [sp, #328] // 8-byte Folded Reload + mov x16, x8 + mul x10, x13, x27 + ldr q4, [x16], #48 + mov x15, xzr + mul x11, x12, x27 + lsl x13, x14, #2 + add x10, x10, x25 + add x11, x11, x25 + ldr q3, [x9, x13] + mul x13, x12, x22 + add x10, x24, x10, lsl #2 + add x11, x24, x11, lsl #2 + ldr q0, [x10] + ldr q1, [x11] + lsl x17, x13, #2 + ldr q2, [x9, x17] + add x17, x18, x17 + ldr x18, [sp, #248] // 8-byte Folded Reload + cmp xzr, x23 + b.ge .LBB0_54 + .p2align 2 +.LBB0_53: // =>This Inner Loop Header: Depth=1 + add x5, x16, #32 + ldr x3, [sp, #344] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + prfm pldl1keep, [x5] + ldp q4, q5, [x16, #-32] + add x15, x15, #4 + add x1, x17, x3 + add x3, x18, x3 + add x17, x17, #16 + add x18, x18, #16 + add x2, x1, #32 + add x4, x3, #32 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x16], #64 + prfm pldl1keep, [x4] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x3, #16] + prfm pldl1keep, [x2] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x1, #16] + cmp x15, x23 + b.lt .LBB0_53 +.LBB0_54: + ldr x15, [sp, #400] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldr x16, [sp, #344] // 8-byte Folded Reload + add x13, x19, x13 + ldr x17, [sp, #328] // 8-byte Folded Reload + sub x13, x13, x30 + add x14, x19, x14 + mul x12, x22, x12 + ldr x18, [sp, #40] // 8-byte Folded Reload + ldr q5, [x8, x15, lsl #4] + ldr x15, [sp, #408] // 8-byte Folded Reload + add x13, x16, x13, lsl #2 + add x12, x16, x12, lsl #2 + add x12, x17, x12 + ldr q4, [x8, x15, lsl #4] + ldr x15, [sp, #416] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x15, lsl #4] + sub x15, x14, x30 + add x14, x13, x17 + ldr x13, 
[sp, #464] // 8-byte Folded Reload + add x15, x16, x15, lsl #2 + add x16, x16, x18, lsl #4 + add x14, x14, #4 + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + add x15, x15, x17 + add x16, x17, x16 + mov x17, x29 + sub x13, x13, x30, lsl #4 + add x15, x15, #4 + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + add x13, x13, #16 + cmp x29, x19 + b.ge .LBB0_56 + .p2align 2 +.LBB0_55: // =>This Inner Loop Header: Depth=1 + add x18, x8, x13 + prfm pldl1keep, [x15] + ldr s2, [x16, x17, lsl #2] + prfm pldl1keep, [x14] + ldr s3, [x12, x17, lsl #2] + add x13, x13, #16 + prfm pldl1keep, [x18] + ldr q4, [x8, x17, lsl #4] + add x17, x17, #1 + add x14, x14, #4 + add x15, x15, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + cmp x17, x19 + b.lt .LBB0_55 +.LBB0_56: + str q0, [x10] + str q1, [x11] + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.ge .LBB0_62 +.LBB0_57: + ldr x11, [sp, #312] // 8-byte Folded Reload + mov x13, x8 + mov x12, xzr + mul x10, x11, x27 + mul x11, x11, x22 + ldr q2, [x13], #48 + lsl x14, x11, #2 + add x10, x10, x25 + ldr q1, [x9, x14] + ldr x9, [sp, #48] // 8-byte Folded Reload + ldp x15, x14, [sp, #320] // 16-byte Folded Reload + add x10, x24, x10, lsl #2 + ldr q0, [x10] + lsl x9, x9, #3 + add x9, x9, x15, lsl #2 + add x9, x9, x14 + add x9, x9, #32 + cmp xzr, x23 + b.ge .LBB0_59 + .p2align 2 +.LBB0_58: // =>This Inner Loop Header: Depth=1 + add x14, x13, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x14] + ldp q2, q3, [x13, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x13], #64 + prfm pldl1keep, [x9] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x9, #-16] + add x9, x9, #16 + cmp x12, x23 + b.lt .LBB0_58 +.LBB0_59: + ldr x9, [sp, #400] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + ldr x12, [sp, #328] // 8-byte Folded Reload + ldr x13, [sp, #24] // 8-byte Folded Reload + ldr q3, [x8, x9, lsl #4] + ldr 
x9, [sp, #408] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x9, lsl #4] + ldr x9, [sp, #416] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q4, [x8, x9, lsl #4] + add x9, x19, x11 + ldr x11, [sp, #344] // 8-byte Folded Reload + sub x9, x9, x30 + add x9, x11, x9, lsl #2 + add x11, x9, x12 + ldr x9, [sp, #464] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v1.s[3] + add x12, x12, x13 + mov x13, x29 + add x11, x11, #4 + sub x9, x9, x30, lsl #4 + add x9, x9, #16 + cmp x29, x19 + b.ge .LBB0_61 + .p2align 2 +.LBB0_60: // =>This Inner Loop Header: Depth=1 + add x14, x8, x9 + prfm pldl1keep, [x11] + ldr s1, [x12, x13, lsl #2] + prfm pldl1keep, [x14] + ldr q2, [x8, x13, lsl #4] + add x13, x13, #1 + add x9, x9, #16 + add x11, x11, #4 + fmla v0.4s, v2.4s, v1.s[0] + cmp x13, x19 + b.lt .LBB0_60 +.LBB0_61: + str q0, [x10] +.LBB0_62: + bl free + ldp x20, x29, [sp, #472] // 16-byte Folded Reload +.LBB0_63: + ldr x8, [sp, #112] // 8-byte Folded Reload + ldr x9, [sp, #128] // 8-byte Folded Reload + add x8, x8, x8, lsr #63 + ldr x25, [sp, #488] // 8-byte Folded Reload + cmp x9, #0 + asr x8, x8, #1 + cinv x8, x8, lt + str x8, [sp, #464] // 8-byte Folded Spill + lsl x8, x8, #1 + cmp x25, x8 + str x8, [sp, #480] // 8-byte Folded Spill + b.ge .LBB0_94 +// %bb.64: + lsl x8, x19, #3 + add x0, x8, #64 + bl malloc + add x8, x27, x25 + add x10, x25, x27, lsl #1 + ldp x6, x5, [sp, #296] // 16-byte Folded Reload + add x11, x25, x27, lsl #2 + lsl x8, x8, #2 + ldr x18, [sp, #328] // 8-byte Folded Reload + ldr x1, [sp, #344] // 8-byte Folded Reload + ldr d5, [x24, x8] + lsl x8, x10, #2 + mov w4, #12 // =0xc + ldr d0, [x24, x8] + lsl x8, x11, #2 + add x10, x10, x27 + ldr d3, [x24, x8] + add x8, x11, x27 + mul x11, x22, x4 + lsl x8, x8, #2 + lsl x10, x10, #2 + lsl x9, x25, #2 + add x13, x0, #63 + ldr d4, [x24, x8] + mov w8, #6 // =0x6 + mov w16, #20 // =0x14 + madd x8, x27, x8, x25 + ldr d2, [x24, x10] + add x10, x6, x21 + ldr d1, [x24, x9] + ldr d23, [x10, x9] + 
add x9, x18, x1 + mov w15, #24 // =0x18 + ldr q19, [x9, x11] + mul x11, x22, x16 + ldr q17, [x9, x28] + ldr x7, [sp, #64] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr q18, [x9, x20] + ldr q16, [x9] + ldr d6, [x24, x8] + sub x8, x25, x27 + ldr x30, [sp, #104] // 8-byte Folded Reload + add x8, x8, x27, lsl #3 + ldr q21, [x9, x11] + lsl x11, x29, #4 + ldr q20, [x9, x22, lsl #4] + add x14, x11, x5, lsl #5 + madd x16, x5, x16, x11 + lsl x8, x8, #2 + madd x4, x5, x4, x11 + ldr x29, [sp, #504] // 8-byte Folded Reload + add x2, x11, x5, lsl #3 + ldr d7, [x24, x8] + and x8, x13, #0xffffffffffffffc0 + mul x13, x22, x15 + madd x15, x5, x15, x11 + mov x12, xzr + add x16, x6, x16 + add x2, x6, x2 + orr x3, x8, #0x10 + add x4, x6, x4 + ldr q22, [x9, x13] + mov w13, #28 // =0x1c + add x15, x6, x15 + madd x17, x5, x13, x11 + add x13, x6, x14 + add x14, x6, x17 + add x17, x1, x18 + add x1, x11, x5, lsl #2 + add x18, x6, x26 + mov w5, #16 // =0x10 + add x17, x7, x17 + add x18, x18, x11 + sub x5, x5, x7 + add x17, x17, #16 + add x1, x6, x1 + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v1.2s, v23.2s, v16.s[0] + fmla v5.2s, v23.2s, v17.s[0] + cmp xzr, x23 + b.ge .LBB0_66 + .p2align 2 +.LBB0_65: // =>This Inner Loop Header: Depth=1 + add x6, x16, x21 + stur d23, [x3, #-16] + fmla v0.2s, v23.2s, v18.s[0] + fmla v2.2s, v23.2s, v19.s[0] + prfm pldl1keep, [x6] + ldr d25, [x1, x21] + fmla v3.2s, v23.2s, v20.s[0] + fmla v4.2s, v23.2s, v21.s[0] + fmla v6.2s, v23.2s, v22.s[0] + fmla v7.2s, v23.2s, v24.s[0] + add x6, x15, x21 + add x7, x17, x5 + add x20, x7, x28 + add x25, x20, x28 + add x12, x12, #4 + add x15, x15, x26 + add x16, x16, x26 + add x17, x17, #16 + add x1, x1, x26 + stur d25, [x3, #-8] + prfm pldl1keep, [x6] + ldr d23, [x2, x21] + fmla v1.2s, v25.2s, v16.s[1] + fmla v5.2s, v25.2s, v17.s[1] + fmla v0.2s, v25.2s, v18.s[1] + fmla v2.2s, v25.2s, v19.s[1] + fmla v3.2s, v25.2s, v20.s[1] + fmla v4.2s, v25.2s, v21.s[1] + fmla v6.2s, v25.2s, v22.s[1] + fmla v7.2s, v25.2s, v24.s[1] 
+ add x6, x14, x21 + add x14, x14, x26 + add x2, x2, x26 + fmla v1.2s, v23.2s, v16.s[2] + fmla v5.2s, v23.2s, v17.s[2] + fmla v0.2s, v23.2s, v18.s[2] + fmla v2.2s, v23.2s, v19.s[2] + fmla v3.2s, v23.2s, v20.s[2] + fmla v4.2s, v23.2s, v21.s[2] + fmla v6.2s, v23.2s, v22.s[2] + fmla v7.2s, v23.2s, v24.s[2] + str d23, [x3] + prfm pldl1keep, [x6] + ldr d23, [x4, x21] + add x6, x13, x21 + add x13, x13, x26 + add x4, x4, x26 + str d23, [x3, #8] + prfm pldl1keep, [x6] + add x6, x25, x28 + fmla v1.2s, v23.2s, v16.s[3] + fmla v5.2s, v23.2s, v17.s[3] + fmla v0.2s, v23.2s, v18.s[3] + fmla v2.2s, v23.2s, v19.s[3] + fmla v3.2s, v23.2s, v20.s[3] + fmla v4.2s, v23.2s, v21.s[3] + fmla v6.2s, v23.2s, v22.s[3] + fmla v7.2s, v23.2s, v24.s[3] + ldr d23, [x18, x21] + prfm pldl1keep, [x7] + ldur q16, [x7, #-16] + prfm pldl1keep, [x20] + ldur q17, [x20, #-16] + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + add x18, x18, x26 + add x3, x3, #32 + prfm pldl1keep, [x6] + ldur q19, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q20, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q21, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q22, [x6, #-16] + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v1.2s, v23.2s, v16.s[0] + fmla v5.2s, v23.2s, v17.s[0] + cmp x12, x23 + b.lt .LBB0_65 +.LBB0_66: + ldp x13, x14, [sp, #400] // 16-byte Folded Reload + ldr x15, [sp, #304] // 8-byte Folded Reload + fmla v0.2s, v23.2s, v18.s[0] + ldr x20, [sp, #488] // 8-byte Folded Reload + str d23, [x8, x23, lsl #3] + fmla v2.2s, v23.2s, v19.s[0] + fmla v3.2s, v23.2s, v20.s[0] + fmla v4.2s, v23.2s, v21.s[0] + fmla v6.2s, v23.2s, v22.s[0] + fmla v7.2s, v23.2s, v24.s[0] + ldr x7, [sp, #520] // 8-byte Folded Reload + madd x12, x13, x15, x20 + lsl x12, x12, #2 + ldr d23, [x10, x12] + madd x12, x14, x15, x20 + lsl x12, x12, #2 + str d23, [x8, x13, lsl #3] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v1.2s, v23.2s, v16.s[1] + fmla v5.2s, v23.2s, v17.s[1] + fmla v0.2s, v23.2s, 
v18.s[1] + fmla v2.2s, v23.2s, v19.s[1] + fmla v3.2s, v23.2s, v20.s[1] + fmla v4.2s, v23.2s, v21.s[1] + fmla v6.2s, v23.2s, v22.s[1] + fmla v7.2s, v23.2s, v24.s[1] + ldr d23, [x10, x12] + madd x12, x13, x15, x20 + fmla v1.2s, v23.2s, v16.s[2] + str d23, [x8, x14, lsl #3] + fmla v5.2s, v23.2s, v17.s[2] + fmla v0.2s, v23.2s, v18.s[2] + fmla v2.2s, v23.2s, v19.s[2] + fmla v3.2s, v23.2s, v20.s[2] + fmla v4.2s, v23.2s, v21.s[2] + fmla v6.2s, v23.2s, v22.s[2] + fmla v7.2s, v23.2s, v24.s[2] + mov x14, x29 + lsl x12, x12, #2 + ldr d23, [x10, x12] + ldr x10, [sp, #72] // 8-byte Folded Reload + add x12, x10, #4 + ldp x17, x10, [sp, #272] // 16-byte Folded Reload + str d23, [x8, x13, lsl #3] + ldr x13, [sp, #120] // 8-byte Folded Reload + fmla v1.2s, v23.2s, v16.s[3] + fmla v5.2s, v23.2s, v17.s[3] + fmla v0.2s, v23.2s, v18.s[3] + fmla v2.2s, v23.2s, v19.s[3] + fmla v3.2s, v23.2s, v20.s[3] + fmla v4.2s, v23.2s, v21.s[3] + fmla v6.2s, v23.2s, v22.s[3] + fmla v7.2s, v23.2s, v24.s[3] + add x10, x11, x10, lsl #2 + ldr x11, [sp, #296] // 8-byte Folded Reload + add x10, x11, x10 + ldr x11, [sp, #320] // 8-byte Folded Reload + add x11, x13, x11, lsl #2 + ldr x13, [sp, #80] // 8-byte Folded Reload + sub x11, x11, x13 + ldr x13, [sp, #328] // 8-byte Folded Reload + add x13, x11, x13 + mul x11, x15, x12 + add x12, x13, #4 + ldr x13, [sp, #96] // 8-byte Folded Reload + lsl x13, x13, #2 + cmp x29, x19 + b.ge .LBB0_68 + .p2align 2 +.LBB0_67: // =>This Inner Loop Header: Depth=1 + add x16, x12, x28 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x15, x10, x11 + prfm pldl1keep, [x16] + ldur s17, [x16, #-4] + add x16, x16, x28 + add x12, x12, #4 + prfm pldl1keep, [x16] + ldur s18, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s19, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s20, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s21, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s22, [x16, #-4] + add x16, x16, x28 + prfm 
pldl1keep, [x16] + ldur s23, [x16, #-4] + prfm pldl1keep, [x15] + ldr d24, [x10, x13] + add x10, x10, x17 + fmla v1.2s, v24.2s, v16.s[0] + str d24, [x8, x14, lsl #3] + add x14, x14, #1 + fmla v5.2s, v24.2s, v17.s[0] + fmla v0.2s, v24.2s, v18.s[0] + fmla v2.2s, v24.2s, v19.s[0] + fmla v3.2s, v24.2s, v20.s[0] + fmla v4.2s, v24.2s, v21.s[0] + fmla v6.2s, v24.2s, v22.s[0] + fmla v7.2s, v24.2s, v23.s[0] + cmp x14, x19 + b.lt .LBB0_67 +.LBB0_68: // %.preheader25 + ldr x10, [sp, #344] // 8-byte Folded Reload + ldr x11, [sp, #496] // 8-byte Folded Reload + mov x6, xzr + mov w3, #7 // =0x7 + ldr x15, [sp, #328] // 8-byte Folded Reload + ldr x12, [sp, #88] // 8-byte Folded Reload + mov w2, #6 // =0x6 + mov w16, #5 // =0x5 + mov w18, #4 // =0x4 + mov w17, #3 // =0x3 + mov w1, #2 // =0x2 + mov w4, #1 // =0x1 + add x13, x11, x10 + sub x10, x8, x30, lsl #3 + add x14, x12, x15 + add x11, x8, #24 + mov w12, #8 // =0x8 + add x15, x13, x15 + add x13, x14, #4 + add x10, x10, x19, lsl #3 + add x14, x15, #32 + add x15, x10, #8 + b .LBB0_70 + .p2align 2 +.LBB0_69: // %.loopexit24 + // in Loop: Header=BB0_70 Depth=1 + ldp x20, x6, [sp, #488] // 16-byte Folded Reload + ldr x7, [sp, #520] // 8-byte Folded Reload + add x14, x14, x6 + add x13, x13, x6 + mov x6, x12 + mov x12, x5 +.LBB0_70: // =>This Loop Header: Depth=1 + // Child Loop BB0_72 Depth 2 + // Child Loop BB0_74 Depth 2 + madd x5, x6, x27, x20 + cmp x12, x7 + lsl x5, x5, #2 + madd x4, x4, x27, x20 + madd x1, x1, x27, x20 + madd x17, x17, x27, x20 + madd x18, x18, x27, x20 + lsl x4, x4, #2 + lsl x1, x1, #2 + lsl x17, x17, #2 + lsl x18, x18, #2 + madd x16, x16, x27, x20 + lsl x16, x16, #2 + str d1, [x24, x5] + str d5, [x24, x4] + str d0, [x24, x1] + str d2, [x24, x17] + str d3, [x24, x18] + str d4, [x24, x16] + madd x16, x2, x27, x20 + lsl x16, x16, #2 + str d6, [x24, x16] + madd x16, x3, x27, x20 + lsl x16, x16, #2 + str d7, [x24, x16] + b.ge .LBB0_75 +// %bb.71: // in Loop: Header=BB0_70 Depth=1 + add x17, x12, #3 + add x4, x12, 
#1 + add x1, x12, #2 + madd x2, x12, x27, x20 + madd x7, x17, x27, x20 + add x18, x12, #4 + add x16, x12, #5 + mov x25, x20 + madd x3, x4, x27, x20 + ldr d24, [x8] + mov x6, xzr + lsl x2, x2, #2 + madd x5, x1, x27, x20 + madd x20, x18, x27, x20 + lsl x7, x7, #2 + lsl x3, x3, #2 + ldr d1, [x24, x2] + madd x2, x16, x27, x25 + lsl x5, x5, #2 + lsl x20, x20, #2 + ldr d2, [x24, x7] + mul x7, x12, x22 + ldr d5, [x24, x3] + ldr d0, [x24, x5] + ldr d3, [x24, x20] + mov x20, x14 + lsl x2, x2, #2 + lsl x7, x7, #2 + ldr d4, [x24, x2] + add x2, x12, #6 + ldr q23, [x9, x7] + mul x7, x4, x22 + madd x3, x2, x27, x25 + lsl x7, x7, #2 + lsl x3, x3, #2 + ldr q22, [x9, x7] + mul x7, x1, x22 + ldr d6, [x24, x3] + add x3, x12, #7 + madd x5, x3, x27, x25 + lsl x7, x7, #2 + ldr q21, [x9, x7] + mul x7, x17, x22 + lsl x5, x5, #2 + ldr d7, [x24, x5] + add x5, x12, #8 + lsl x7, x7, #2 + ldr q20, [x9, x7] + mul x7, x18, x22 + lsl x7, x7, #2 + ldr q19, [x9, x7] + mul x7, x16, x22 + lsl x7, x7, #2 + ldr q18, [x9, x7] + mul x7, x2, x22 + lsl x7, x7, #2 + ldr q17, [x9, x7] + mul x7, x3, x22 + lsl x7, x7, #2 + ldr q16, [x9, x7] + mov x7, x11 + cmp xzr, x23 + b.ge .LBB0_73 + .p2align 2 +.LBB0_72: // Parent Loop BB0_70 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x7, #16 + fmla v1.2s, v24.2s, v23.s[0] + fmla v5.2s, v24.2s, v22.s[0] + add x6, x6, #4 + fmla v0.2s, v24.2s, v21.s[0] + fmla v2.2s, v24.2s, v20.s[0] + prfm pldl1keep, [x25] + add x25, x20, x28 + fmla v3.2s, v24.2s, v19.s[0] + fmla v4.2s, v24.2s, v18.s[0] + fmla v6.2s, v24.2s, v17.s[0] + fmla v7.2s, v24.2s, v16.s[0] + ldp d24, d25, [x7, #-16] + fmla v1.2s, v24.2s, v23.s[1] + fmla v5.2s, v24.2s, v22.s[1] + fmla v0.2s, v24.2s, v21.s[1] + fmla v2.2s, v24.2s, v20.s[1] + fmla v3.2s, v24.2s, v19.s[1] + fmla v4.2s, v24.2s, v18.s[1] + fmla v6.2s, v24.2s, v17.s[1] + fmla v7.2s, v24.2s, v16.s[1] + fmla v1.2s, v25.2s, v23.s[2] + fmla v5.2s, v25.2s, v22.s[2] + ldp d26, d24, [x7], #32 + fmla v0.2s, v25.2s, v21.s[2] + fmla v2.2s, v25.2s, 
v20.s[2] + fmla v3.2s, v25.2s, v19.s[2] + prfm pldl1keep, [x20] + fmla v4.2s, v25.2s, v18.s[2] + fmla v6.2s, v25.2s, v17.s[2] + fmla v7.2s, v25.2s, v16.s[2] + fmla v1.2s, v26.2s, v23.s[3] + ldur q23, [x20, #-16] + prfm pldl1keep, [x25] + fmla v5.2s, v26.2s, v22.s[3] + ldur q22, [x25, #-16] + add x25, x25, x28 + fmla v0.2s, v26.2s, v21.s[3] + fmla v2.2s, v26.2s, v20.s[3] + fmla v3.2s, v26.2s, v19.s[3] + fmla v4.2s, v26.2s, v18.s[3] + add x20, x20, #16 + prfm pldl1keep, [x25] + ldur q21, [x25, #-16] + add x25, x25, x28 + fmla v6.2s, v26.2s, v17.s[3] + fmla v7.2s, v26.2s, v16.s[3] + prfm pldl1keep, [x25] + ldur q20, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q19, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q17, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q16, [x25, #-16] + cmp x6, x23 + b.lt .LBB0_72 +.LBB0_73: // in Loop: Header=BB0_70 Depth=1 + ldp x7, x6, [sp, #400] // 16-byte Folded Reload + fmla v1.2s, v24.2s, v23.s[0] + fmla v5.2s, v24.2s, v22.s[0] + fmla v0.2s, v24.2s, v21.s[0] + fmla v2.2s, v24.2s, v20.s[0] + mov x20, x29 + fmla v3.2s, v24.2s, v19.s[0] + fmla v4.2s, v24.2s, v18.s[0] + ldr d25, [x8, x7, lsl #3] + fmla v6.2s, v24.2s, v17.s[0] + fmla v7.2s, v24.2s, v16.s[0] + ldr d24, [x8, x6, lsl #3] + ldr x6, [sp, #416] // 8-byte Folded Reload + mov x7, x15 + ldr d26, [x8, x6, lsl #3] + mov x6, x13 + fmla v1.2s, v25.2s, v23.s[1] + fmla v5.2s, v25.2s, v22.s[1] + fmla v0.2s, v25.2s, v21.s[1] + fmla v2.2s, v25.2s, v20.s[1] + fmla v3.2s, v25.2s, v19.s[1] + fmla v4.2s, v25.2s, v18.s[1] + fmla v6.2s, v25.2s, v17.s[1] + fmla v7.2s, v25.2s, v16.s[1] + fmla v1.2s, v24.2s, v23.s[2] + fmla v5.2s, v24.2s, v22.s[2] + fmla v0.2s, v24.2s, v21.s[2] + fmla v2.2s, v24.2s, v20.s[2] + fmla v3.2s, v24.2s, v19.s[2] + fmla v4.2s, v24.2s, v18.s[2] + fmla v6.2s, v24.2s, v17.s[2] + fmla v7.2s, v24.2s, v16.s[2] + fmla v1.2s, v26.2s, v23.s[3] + fmla v5.2s, v26.2s, 
v22.s[3] + fmla v0.2s, v26.2s, v21.s[3] + fmla v2.2s, v26.2s, v20.s[3] + fmla v3.2s, v26.2s, v19.s[3] + fmla v4.2s, v26.2s, v18.s[3] + fmla v6.2s, v26.2s, v17.s[3] + fmla v7.2s, v26.2s, v16.s[3] + cmp x29, x19 + b.ge .LBB0_69 + .p2align 2 +.LBB0_74: // Parent Loop BB0_70 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x6, x28 + prfm pldl1keep, [x6] + ldur s16, [x6, #-4] + add x20, x20, #1 + prfm pldl1keep, [x25] + ldur s17, [x25, #-4] + add x25, x25, x28 + add x6, x6, #4 + prfm pldl1keep, [x25] + ldur s18, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s19, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s20, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s21, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s22, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s23, [x25, #-4] + prfm pldl1keep, [x7] + ldur d24, [x7, #-8] + add x7, x7, #8 + fmla v1.2s, v24.2s, v16.s[0] + fmla v5.2s, v24.2s, v17.s[0] + fmla v0.2s, v24.2s, v18.s[0] + fmla v2.2s, v24.2s, v19.s[0] + fmla v3.2s, v24.2s, v20.s[0] + fmla v4.2s, v24.2s, v21.s[0] + fmla v6.2s, v24.2s, v22.s[0] + fmla v7.2s, v24.2s, v23.s[0] + cmp x20, x19 + b.lt .LBB0_74 + b .LBB0_69 +.LBB0_75: + ldr x13, [sp, #336] // 8-byte Folded Reload + cmp x7, x13 + b.lt .LBB0_78 +// %bb.76: + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.lt .LBB0_83 +.LBB0_77: + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.lt .LBB0_88 + b .LBB0_93 +.LBB0_78: + add x18, x7, #1 + add x1, x7, #2 + mul x15, x7, x22 + add x2, x7, #3 + madd x12, x18, x27, x20 + mov x17, x8 + ldr d16, [x17], #24 + mul x18, x18, x22 + mov x16, xzr + lsl x14, x15, #2 + mul x11, x7, x27 + madd x13, x1, x27, x20 + add x11, x11, x20 + lsl x18, x18, #2 + add x11, x24, x11, lsl #2 + ldr q5, [x9, x14] + ldr q7, [x9, x18] + mul x18, x1, x22 + ldr d0, [x11] + madd x14, x2, x27, x20 + lsl x18, x18, #2 + add x12, x24, x12, lsl 
#2 + add x13, x24, x13, lsl #2 + add x14, x24, x14, lsl #2 + ldr d1, [x12] + ldr d2, [x13] + ldr q6, [x9, x18] + mul x18, x2, x22 + ldp x2, x1, [sp, #320] // 16-byte Folded Reload + ldr d3, [x14] + lsl x18, x18, #2 + ldr q4, [x9, x18] + ldr x18, [sp, #32] // 8-byte Folded Reload + lsl x18, x18, #5 + add x18, x18, x2, lsl #2 + add x18, x18, x1 + add x18, x18, #32 + cmp xzr, x23 + b.ge .LBB0_80 + .p2align 2 +.LBB0_79: // =>This Inner Loop Header: Depth=1 + add x1, x17, #16 + fmla v0.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v7.s[0] + add x16, x16, #4 + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x1] + add x1, x18, x28 + ldp d16, d17, [x17, #-16] + fmla v0.2s, v16.2s, v5.s[1] + fmla v1.2s, v16.2s, v7.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v5.s[2] + fmla v1.2s, v17.2s, v7.s[2] + fmla v2.2s, v17.2s, v6.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x17], #32 + prfm pldl1keep, [x18] + fmla v0.2s, v17.2s, v5.s[3] + ldur q5, [x18, #-16] + prfm pldl1keep, [x1] + fmla v1.2s, v17.2s, v7.s[3] + ldur q7, [x1, #-16] + add x1, x1, x28 + fmla v2.2s, v17.2s, v6.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x18, x18, #16 + prfm pldl1keep, [x1] + ldur q6, [x1, #-16] + add x1, x1, x28 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + cmp x16, x23 + b.lt .LBB0_79 +.LBB0_80: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v0.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v4.s[0] + add x15, x19, x15 + sub x15, x15, x30 + add x10, x10, #8 + ldr d17, [x8, x17, lsl #3] + fmla v0.2s, v17.2s, v5.s[1] + ldr d16, [x8, x16, lsl #3] + ldr x16, [sp, #416] // 8-byte Folded Reload + fmla v1.2s, v17.2s, v7.s[1] + fmla v2.2s, v17.2s, v6.s[1] + fmla v3.2s, v17.2s, v4.s[1] + ldr d18, [x8, x16, lsl #3] + ldr x16, [sp, #344] // 8-byte Folded Reload + add x15, x16, x15, lsl #2 + ldr x16, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v16.2s, v5.s[2] + fmla v1.2s, 
v16.2s, v7.s[2] + fmla v2.2s, v16.2s, v6.s[2] + fmla v3.2s, v16.2s, v4.s[2] + add x15, x15, x16 + mov x16, x29 + add x15, x15, #4 + fmla v0.2s, v18.2s, v5.s[3] + fmla v1.2s, v18.2s, v7.s[3] + fmla v2.2s, v18.2s, v6.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x29, x19 + b.ge .LBB0_82 + .p2align 2 +.LBB0_81: // =>This Inner Loop Header: Depth=1 + add x17, x15, x28 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x16, x16, #1 + prfm pldl1keep, [x17] + ldur s5, [x17, #-4] + add x17, x17, x28 + add x15, x15, #4 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x28 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x10] + ldur d16, [x10, #-8] + add x10, x10, #8 + fmla v0.2s, v16.2s, v4.s[0] + fmla v1.2s, v16.2s, v5.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x16, x19 + b.lt .LBB0_81 +.LBB0_82: + str d0, [x11] + str d1, [x12] + str d2, [x13] + ldr x13, [sp, #336] // 8-byte Folded Reload + str d3, [x14] + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.ge .LBB0_77 +.LBB0_83: + mul x10, x13, x27 + add x12, x13, #1 + mov x16, x8 + ldr x18, [sp, #328] // 8-byte Folded Reload + mul x13, x13, x22 + ldr d4, [x16], #24 + mov x15, xzr + madd x11, x12, x27, x20 + lsl x14, x13, #2 + add x10, x10, x20 + add x10, x24, x10, lsl #2 + ldr q3, [x9, x14] + mul x14, x12, x22 + add x11, x24, x11, lsl #2 + ldr d0, [x10] + ldr d1, [x11] + lsl x17, x14, #2 + ldr q2, [x9, x17] + add x17, x18, x17 + cmp xzr, x23 + b.ge .LBB0_85 + .p2align 2 +.LBB0_84: // =>This Inner Loop Header: Depth=1 + add x4, x16, #16 + ldr x2, [sp, #344] // 8-byte Folded Reload + ldr x5, [sp, #248] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + prfm pldl1keep, [x4] + fmla v1.2s, v4.2s, v2.s[0] + ldp d4, d5, [x16, #-16] + add x15, x15, #4 + add x18, x17, x2 + add x2, x5, x2 + add x5, x5, #16 + add x17, x17, #16 + add x1, x18, #32 + add x3, x2, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, 
v5.2s, v2.s[2] + ldp d5, d4, [x16], #32 + prfm pldl1keep, [x3] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x18, #16] + str x5, [sp, #248] // 8-byte Folded Spill + cmp x15, x23 + b.lt .LBB0_84 +.LBB0_85: + ldr x15, [sp, #400] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr x17, [sp, #344] // 8-byte Folded Reload + add x13, x19, x13 + mul x12, x22, x12 + ldr x16, [sp, #328] // 8-byte Folded Reload + add x12, x17, x12, lsl #2 + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #408] // 8-byte Folded Reload + add x12, x16, x12 + ldr d4, [x8, x15, lsl #3] + ldr x15, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x15, lsl #3] + sub x15, x13, x30 + add x13, x19, x14 + add x14, x17, x15, lsl #2 + ldr x15, [sp, #40] // 8-byte Folded Reload + sub x13, x13, x30 + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, v4.2s, v2.s[2] + add x13, x17, x13, lsl #2 + add x14, x14, x16 + add x13, x13, x16 + add x14, x14, #4 + add x15, x17, x15, lsl #4 + add x13, x13, #4 + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + add x15, x16, x15 + mov x16, x29 + cmp x29, x19 + b.ge .LBB0_87 + .p2align 2 +.LBB0_86: // =>This Inner Loop Header: Depth=1 + add x17, x8, x16, lsl #3 + prfm pldl1keep, [x14] + ldr s2, [x15, x16, lsl #2] + prfm pldl1keep, [x13] + ldr s3, [x12, x16, lsl #2] + add x13, x13, #4 + add x17, x17, #8 + add x14, x14, #4 + prfm pldl1keep, [x17] + ldr d4, [x8, x16, lsl #3] + add x16, x16, #1 + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + cmp x16, x19 + b.lt .LBB0_86 +.LBB0_87: + str d0, [x10] + str d1, [x11] + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.ge .LBB0_93 +.LBB0_88: + ldr x11, [sp, #312] // 8-byte Folded Reload + mov x13, x8 + mov x12, xzr + mul x10, x11, x27 + mul x11, x11, x22 + ldr d2, [x13], #24 + lsl x14, x11, #2 + add x10, x10, 
x20 + ldr q1, [x9, x14] + ldr x9, [sp, #48] // 8-byte Folded Reload + ldp x15, x14, [sp, #320] // 16-byte Folded Reload + add x10, x24, x10, lsl #2 + ldr d0, [x10] + lsl x9, x9, #3 + add x9, x9, x15, lsl #2 + add x9, x9, x14 + add x9, x9, #32 + cmp xzr, x23 + b.ge .LBB0_90 + .p2align 2 +.LBB0_89: // =>This Inner Loop Header: Depth=1 + add x14, x13, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x14] + ldp d2, d3, [x13, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x13], #32 + prfm pldl1keep, [x9] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x9, #-16] + add x9, x9, #16 + cmp x12, x23 + b.lt .LBB0_89 +.LBB0_90: + ldr x9, [sp, #400] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[0] + ldr x12, [sp, #24] // 8-byte Folded Reload + ldr d3, [x8, x9, lsl #3] + ldr x9, [sp, #408] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + ldr d2, [x8, x9, lsl #3] + ldr x9, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[2] + ldr d3, [x8, x9, lsl #3] + add x9, x19, x11 + ldr x11, [sp, #344] // 8-byte Folded Reload + sub x9, x9, x30 + add x9, x11, x9, lsl #2 + ldr x11, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[3] + add x9, x9, x11 + add x11, x11, x12 + mov x12, x29 + add x9, x9, #4 + cmp x29, x19 + b.ge .LBB0_92 + .p2align 2 +.LBB0_91: // =>This Inner Loop Header: Depth=1 + add x13, x8, x12, lsl #3 + prfm pldl1keep, [x9] + ldr s1, [x11, x12, lsl #2] + add x9, x9, #4 + add x13, x13, #8 + prfm pldl1keep, [x13] + ldr d2, [x8, x12, lsl #3] + add x12, x12, #1 + fmla v0.2s, v2.2s, v1.s[0] + cmp x12, x19 + b.lt .LBB0_91 +.LBB0_92: + str d0, [x10] +.LBB0_93: + bl free + ldr x20, [sp, #472] // 8-byte Folded Reload +.LBB0_94: + ldr x8, [sp, #128] // 8-byte Folded Reload + ldr x25, [sp, #480] // 8-byte Folded Reload + cmp x25, x8 + b.ge .LBB0_126 +// %bb.95: + ldr x8, [sp, #120] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + add x10, x25, x27, lsl #2 + ldr x18, [sp, #328] // 8-byte Folded Reload + 
ldr x1, [sp, #344] // 8-byte Folded Reload + mov w5, #12 // =0xc + add x9, x25, x27, lsl #1 + sub x13, x25, x27 + mov w11, #6 // =0x6 + add x8, x27, x25 + add x16, x10, x27 + ldr s2, [x24, x10, lsl #2] + mul x10, x22, x5 + add x15, x9, x27 + ldr s5, [x24, x9, lsl #2] + add x13, x13, x27, lsl #3 + ldr s3, [x24, x16, lsl #2] + add x9, x18, x1 + mov w16, #20 // =0x14 + ldr s4, [x24, x15, lsl #2] + mov w15, #24 // =0x18 + madd x11, x27, x11, x25 + ldr q19, [x9, x10] + mul x10, x22, x16 + add x14, x0, #63 + ldr s0, [x24, x13, lsl #2] + mul x13, x22, x15 + ldr x4, [sp, #64] // 8-byte Folded Reload + ldp x7, x6, [sp, #296] // 16-byte Folded Reload + ldr q22, [x9, x13] + mov w13, #28 // =0x1c + ldr s1, [x24, x11, lsl #2] + add x11, x7, x21 + ldr q21, [x9, x10] + ldr x10, [sp, #464] // 8-byte Folded Reload + ldr q16, [x9] + ldr s7, [x24, x8, lsl #2] + ldr s6, [x24, x25, lsl #2] + ldr s23, [x11, x25, lsl #2] + ldr q17, [x9, x28] + ldr q18, [x9, x20] + ldr q20, [x9, x22, lsl #4] + lsl x10, x10, #3 + and x8, x14, #0xffffffffffffffc0 + ldr x29, [sp, #504] // 8-byte Folded Reload + ldr x30, [sp, #104] // 8-byte Folded Reload + madd x17, x6, x13, x10 + add x14, x10, x6, lsl #5 + madd x15, x6, x15, x10 + madd x16, x6, x16, x10 + madd x5, x6, x5, x10 + add x2, x10, x6, lsl #3 + mov w3, #16 // =0x10 + mov x12, xzr + add x13, x7, x14 + add x2, x7, x2 + sub x3, x3, x4 + add x14, x7, x17 + add x17, x1, x18 + add x1, x10, x6, lsl #2 + add x18, x26, x10 + add x15, x7, x15 + add x16, x7, x16 + add x5, x7, x5 + add x17, x4, x17 + add x18, x7, x18 + orr x4, x8, #0x8 + add x17, x17, #16 + add x1, x7, x1 + .p2align 2 +.LBB0_96: // =>This Inner Loop Header: Depth=1 + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + ext v31.16b, v16.16b, v16.16b, #8 + ext v8.16b, v17.16b, v17.16b, #8 + cmp x12, x23 + ext v30.16b, v18.16b, v18.16b, #8 + ext v29.16b, v19.16b, v19.16b, #8 + ext v28.16b, v20.16b, v20.16b, #8 + ext v27.16b, v21.16b, v21.16b, #8 + ext v26.16b, v22.16b, v22.16b, #8 + ext v25.16b, 
v24.16b, v24.16b, #8 + b.ge .LBB0_98 +// %bb.97: // in Loop: Header=BB0_96 Depth=1 + add x6, x16, x21 + stur s23, [x4, #-8] + fmla v4.2s, v23.2s, v19.2s + fmla v6.2s, v23.2s, v16.2s + prfm pldl1keep, [x6] + ldr s9, [x1, x21] + add x6, x15, x21 + fmla v7.2s, v23.2s, v17.2s + fmla v5.2s, v23.2s, v18.2s + fmla v2.2s, v23.2s, v20.2s + fmla v3.2s, v23.2s, v21.2s + fmla v1.2s, v23.2s, v22.2s + fmla v0.2s, v23.2s, v24.2s + add x7, x14, x21 + add x20, x17, x3 + add x25, x20, x28 + add x12, x12, #4 + add x14, x14, x26 + add x15, x15, x26 + add x16, x16, x26 + add x17, x17, #16 + add x1, x1, x26 + stur s9, [x4, #-4] + prfm pldl1keep, [x6] + ldr s23, [x2, x21] + fmla v4.2s, v9.2s, v19.s[1] + fmla v6.2s, v9.2s, v16.s[1] + fmla v7.2s, v9.2s, v17.s[1] + fmla v5.2s, v9.2s, v18.s[1] + fmla v2.2s, v9.2s, v20.s[1] + fmla v3.2s, v9.2s, v21.s[1] + fmla v1.2s, v9.2s, v22.s[1] + fmla v0.2s, v9.2s, v24.s[1] + add x6, x13, x21 + add x13, x13, x26 + add x2, x2, x26 + str s23, [x4] + prfm pldl1keep, [x7] + fmla v4.2s, v23.2s, v29.2s + ldr s29, [x5, x21] + fmla v6.2s, v23.2s, v31.2s + fmla v7.2s, v23.2s, v8.2s + fmla v5.2s, v23.2s, v30.2s + fmla v2.2s, v23.2s, v28.2s + add x7, x25, x28 + fmla v3.2s, v23.2s, v27.2s + fmla v1.2s, v23.2s, v26.2s + fmla v0.2s, v23.2s, v25.2s + add x5, x5, x26 + str s29, [x4, #4] + prfm pldl1keep, [x6] + add x6, x7, x28 + fmla v6.2s, v29.2s, v16.s[3] + fmla v7.2s, v29.2s, v17.s[3] + fmla v5.2s, v29.2s, v18.s[3] + fmla v4.2s, v29.2s, v19.s[3] + fmla v2.2s, v29.2s, v20.s[3] + ldr s23, [x18, x21] + prfm pldl1keep, [x20] + ldur q16, [x20, #-16] + prfm pldl1keep, [x25] + ldur q17, [x25, #-16] + prfm pldl1keep, [x7] + ldur q18, [x7, #-16] + fmla v3.2s, v29.2s, v21.s[3] + fmla v1.2s, v29.2s, v22.s[3] + fmla v0.2s, v29.2s, v24.s[3] + add x18, x18, x26 + add x4, x4, #16 + prfm pldl1keep, [x6] + ldur q19, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q20, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q21, [x6, #-16] + add x6, x6, x28 + prfm 
pldl1keep, [x6] + ldur q22, [x6, #-16] + b .LBB0_96 +.LBB0_98: + ldp x13, x14, [sp, #400] // 16-byte Folded Reload + ldr x15, [sp, #304] // 8-byte Folded Reload + fmla v6.2s, v23.2s, v16.2s + ldr x25, [sp, #480] // 8-byte Folded Reload + str s23, [x8, x23, lsl #2] + fmla v7.2s, v23.2s, v17.2s + fmla v5.2s, v23.2s, v18.2s + fmla v4.2s, v23.2s, v19.2s + fmla v2.2s, v23.2s, v20.2s + fmla v3.2s, v23.2s, v21.2s + fmla v1.2s, v23.2s, v22.2s + fmla v0.2s, v23.2s, v24.2s + ldr x7, [sp, #520] // 8-byte Folded Reload + ldr x16, [sp, #96] // 8-byte Folded Reload + madd x12, x13, x15, x25 + ldr s23, [x11, x12, lsl #2] + madd x12, x14, x15, x25 + str s23, [x8, x13, lsl #2] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v6.2s, v23.2s, v16.s[1] + fmla v7.2s, v23.2s, v17.s[1] + fmla v5.2s, v23.2s, v18.s[1] + fmla v4.2s, v23.2s, v19.s[1] + fmla v2.2s, v23.2s, v20.s[1] + fmla v3.2s, v23.2s, v21.s[1] + fmla v1.2s, v23.2s, v22.s[1] + fmla v0.2s, v23.2s, v24.s[1] + ldr s23, [x11, x12, lsl #2] + madd x12, x13, x15, x25 + fmla v6.2s, v23.2s, v31.2s + str s23, [x8, x14, lsl #2] + fmla v7.2s, v23.2s, v8.2s + fmla v5.2s, v23.2s, v30.2s + fmla v4.2s, v23.2s, v29.2s + fmla v2.2s, v23.2s, v28.2s + fmla v3.2s, v23.2s, v27.2s + fmla v1.2s, v23.2s, v26.2s + fmla v0.2s, v23.2s, v25.2s + ldr s31, [x11, x12, lsl #2] + ldr x11, [sp, #72] // 8-byte Folded Reload + ldp x17, x12, [sp, #272] // 16-byte Folded Reload + add x10, x10, x12, lsl #2 + ldr x12, [sp, #296] // 8-byte Folded Reload + add x11, x11, #4 + mul x11, x15, x11 + str s31, [x8, x13, lsl #2] + ldr x13, [sp, #120] // 8-byte Folded Reload + fmla v6.2s, v31.2s, v16.s[3] + fmla v7.2s, v31.2s, v17.s[3] + fmla v5.2s, v31.2s, v18.s[3] + fmla v4.2s, v31.2s, v19.s[3] + fmla v2.2s, v31.2s, v20.s[3] + fmla v3.2s, v31.2s, v21.s[3] + fmla v1.2s, v31.2s, v22.s[3] + fmla v0.2s, v31.2s, v24.s[3] + add x10, x12, x10 + ldr x12, [sp, #320] // 8-byte Folded Reload + add x12, x13, x12, lsl #2 + ldr x13, [sp, #80] // 8-byte Folded Reload + sub x12, x12, 
x13 + ldr x13, [sp, #328] // 8-byte Folded Reload + add x12, x12, x13 + mov x13, x29 + add x12, x12, #4 + cmp x29, x19 + b.ge .LBB0_100 + .p2align 2 +.LBB0_99: // =>This Inner Loop Header: Depth=1 + add x15, x12, x28 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x14, x10, x11 + prfm pldl1keep, [x15] + ldur s17, [x15, #-4] + add x15, x15, x28 + add x12, x12, #4 + prfm pldl1keep, [x15] + ldur s18, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s19, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s20, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s21, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s22, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s23, [x15, #-4] + prfm pldl1keep, [x14] + ldr s24, [x10, x16, lsl #2] + add x10, x10, x17 + fmla v6.2s, v24.2s, v16.2s + str s24, [x8, x13, lsl #2] + add x13, x13, #1 + fmla v7.2s, v24.2s, v17.2s + fmla v5.2s, v24.2s, v18.2s + fmla v4.2s, v24.2s, v19.2s + fmla v2.2s, v24.2s, v20.2s + fmla v3.2s, v24.2s, v21.2s + fmla v1.2s, v24.2s, v22.2s + fmla v0.2s, v24.2s, v23.2s + cmp x13, x19 + b.lt .LBB0_99 +.LBB0_100: // %.preheader + ldr x10, [sp, #344] // 8-byte Folded Reload + ldr x11, [sp, #496] // 8-byte Folded Reload + mov x6, xzr + mov w16, #7 // =0x7 + ldr x15, [sp, #328] // 8-byte Folded Reload + ldr x12, [sp, #88] // 8-byte Folded Reload + mov w17, #6 // =0x6 + mov w18, #5 // =0x5 + mov w1, #4 // =0x4 + mov w2, #3 // =0x3 + mov w3, #2 // =0x2 + mov w4, #1 // =0x1 + add x13, x11, x10 + sub x10, x8, x30, lsl #2 + add x14, x12, x15 + add x11, x8, #12 + mov w12, #8 // =0x8 + add x15, x13, x15 + add x13, x14, #4 + add x10, x10, x19, lsl #2 + add x14, x15, #32 + add x15, x10, #4 + b .LBB0_102 + .p2align 2 +.LBB0_101: // %.loopexit + // in Loop: Header=BB0_102 Depth=1 + ldr x6, [sp, #496] // 8-byte Folded Reload + ldr x7, [sp, #520] // 8-byte Folded Reload + add x14, x14, x6 + add x13, x13, x6 + mov x6, x12 + mov x12, x5 +.LBB0_102: // =>This Loop 
Header: Depth=1 + // Child Loop BB0_104 Depth 2 + // Child Loop BB0_106 Depth 2 + madd x5, x6, x27, x25 + cmp x12, x7 + str s6, [x24, x5, lsl #2] + madd x4, x4, x27, x25 + madd x3, x3, x27, x25 + madd x2, x2, x27, x25 + madd x1, x1, x27, x25 + str s7, [x24, x4, lsl #2] + str s5, [x24, x3, lsl #2] + str s4, [x24, x2, lsl #2] + str s2, [x24, x1, lsl #2] + madd x18, x18, x27, x25 + str s3, [x24, x18, lsl #2] + madd x17, x17, x27, x25 + str s1, [x24, x17, lsl #2] + madd x16, x16, x27, x25 + str s0, [x24, x16, lsl #2] + b.ge .LBB0_107 +// %bb.103: // in Loop: Header=BB0_102 Depth=1 + madd x3, x12, x27, x25 + add x2, x12, #3 + add x18, x12, #5 + add x1, x12, #4 + madd x4, x2, x27, x25 + add x17, x12, #6 + add x16, x12, #7 + ldr s24, [x8] + madd x7, x18, x27, x25 + mov x6, xzr + ldr s6, [x24, x3, lsl #2] + add x3, x12, #2 + madd x5, x1, x27, x25 + madd x20, x17, x27, x25 + madd x21, x16, x27, x25 + ldr s3, [x24, x7, lsl #2] + ldr s4, [x24, x4, lsl #2] + ldr s2, [x24, x5, lsl #2] + ldr s0, [x24, x21, lsl #2] + madd x4, x3, x27, x25 + ldr s1, [x24, x20, lsl #2] + mov x20, x14 + mul x7, x12, x22 + ldr s5, [x24, x4, lsl #2] + add x4, x12, #1 + lsl x7, x7, #2 + ldr q23, [x9, x7] + mul x7, x4, x22 + madd x5, x4, x27, x25 + lsl x7, x7, #2 + ldr s7, [x24, x5, lsl #2] + add x5, x12, #8 + ldr q22, [x9, x7] + mul x7, x3, x22 + lsl x7, x7, #2 + ldr q21, [x9, x7] + mul x7, x2, x22 + lsl x7, x7, #2 + ldr q20, [x9, x7] + mul x7, x1, x22 + lsl x7, x7, #2 + ldr q19, [x9, x7] + mul x7, x18, x22 + lsl x7, x7, #2 + ldr q18, [x9, x7] + mul x7, x17, x22 + lsl x7, x7, #2 + ldr q17, [x9, x7] + mul x7, x16, x22 + lsl x7, x7, #2 + ldr q16, [x9, x7] + mov x7, x11 + fmla v6.2s, v24.2s, v23.2s + cmp xzr, x23 + b.ge .LBB0_105 + .p2align 2 +.LBB0_104: // Parent Loop BB0_102 Depth=1 + // => This Inner Loop Header: Depth=2 + add x21, x7, #8 + fmla v4.2s, v24.2s, v20.2s + fmla v7.2s, v24.2s, v22.2s + add x6, x6, #4 + prfm pldl1keep, [x21] + ldp s27, s25, [x7, #-8] + fmla v5.2s, v24.2s, v21.2s + fmla 
v2.2s, v24.2s, v19.2s + fmla v3.2s, v24.2s, v18.2s + fmla v1.2s, v24.2s, v17.2s + add x21, x20, x28 + ext v28.16b, v20.16b, v20.16b, #8 + fmla v0.2s, v24.2s, v16.2s + fmla v4.2s, v27.2s, v20.s[1] + fmla v6.2s, v27.2s, v23.s[1] + fmla v7.2s, v27.2s, v22.s[1] + fmla v5.2s, v27.2s, v21.s[1] + fmla v2.2s, v27.2s, v19.s[1] + ldp s26, s24, [x7], #16 + prfm pldl1keep, [x20] + fmla v3.2s, v27.2s, v18.s[1] + fmla v1.2s, v27.2s, v17.s[1] + fmla v0.2s, v27.2s, v16.s[1] + fmla v4.2s, v25.2s, v28.2s + ext v30.16b, v23.16b, v23.16b, #8 + ext v31.16b, v22.16b, v22.16b, #8 + fmla v6.2s, v25.2s, v30.2s + fmla v7.2s, v25.2s, v31.2s + fmla v6.2s, v26.2s, v23.s[3] + ldur q23, [x20, #-16] + ext v29.16b, v21.16b, v21.16b, #8 + ext v28.16b, v19.16b, v19.16b, #8 + fmla v5.2s, v25.2s, v29.2s + prfm pldl1keep, [x21] + add x20, x20, #16 + fmla v2.2s, v25.2s, v28.2s + fmla v7.2s, v26.2s, v22.s[3] + ldur q22, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + fmla v5.2s, v26.2s, v21.s[3] + ldur q21, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + ext v28.16b, v18.16b, v18.16b, #8 + fmla v3.2s, v25.2s, v28.2s + fmla v4.2s, v26.2s, v20.s[3] + ldur q20, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + fmla v2.2s, v26.2s, v19.s[3] + ldur q19, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + fmla v3.2s, v26.2s, v18.s[3] + ldur q18, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + ext v28.16b, v17.16b, v17.16b, #8 + fmla v1.2s, v25.2s, v28.2s + fmla v1.2s, v26.2s, v17.s[3] + ldur q17, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + ext v27.16b, v16.16b, v16.16b, #8 + fmla v0.2s, v25.2s, v27.2s + fmla v0.2s, v26.2s, v16.s[3] + ldur q16, [x21, #-16] + fmla v6.2s, v24.2s, v23.2s + cmp x6, x23 + b.lt .LBB0_104 +.LBB0_105: // in Loop: Header=BB0_102 Depth=1 + ldp x7, x6, [sp, #400] // 16-byte Folded Reload + fmla v7.2s, v24.2s, v22.2s + fmla v4.2s, v24.2s, v20.2s + fmla v5.2s, v24.2s, v21.2s + fmla v2.2s, v24.2s, v19.2s + mov x20, x29 + fmla v3.2s, 
v24.2s, v18.2s + fmla v1.2s, v24.2s, v17.2s + ldr s26, [x8, x7, lsl #2] + fmla v0.2s, v24.2s, v16.2s + ldr s27, [x8, x6, lsl #2] + ext v24.16b, v23.16b, v23.16b, #8 + ldr x6, [sp, #416] // 8-byte Folded Reload + mov x7, x15 + ldr s25, [x8, x6, lsl #2] + mov x6, x13 + fmla v6.2s, v26.2s, v23.s[1] + fmla v7.2s, v26.2s, v22.s[1] + fmla v4.2s, v26.2s, v20.s[1] + fmla v2.2s, v26.2s, v19.s[1] + fmla v5.2s, v26.2s, v21.s[1] + fmla v3.2s, v26.2s, v18.s[1] + fmla v1.2s, v26.2s, v17.s[1] + fmla v0.2s, v26.2s, v16.s[1] + ext v26.16b, v21.16b, v21.16b, #8 + fmla v6.2s, v27.2s, v24.2s + ext v24.16b, v22.16b, v22.16b, #8 + fmla v5.2s, v27.2s, v26.2s + fmla v7.2s, v27.2s, v24.2s + ext v24.16b, v20.16b, v20.16b, #8 + ext v26.16b, v17.16b, v17.16b, #8 + fmla v1.2s, v27.2s, v26.2s + fmla v4.2s, v27.2s, v24.2s + ext v24.16b, v19.16b, v19.16b, #8 + fmla v6.2s, v25.2s, v23.s[3] + fmla v5.2s, v25.2s, v21.s[3] + fmla v2.2s, v27.2s, v24.2s + fmla v7.2s, v25.2s, v22.s[3] + ext v24.16b, v18.16b, v18.16b, #8 + fmla v1.2s, v25.2s, v17.s[3] + fmla v3.2s, v27.2s, v24.2s + ext v24.16b, v16.16b, v16.16b, #8 + fmla v4.2s, v25.2s, v20.s[3] + fmla v0.2s, v27.2s, v24.2s + fmla v2.2s, v25.2s, v19.s[3] + fmla v3.2s, v25.2s, v18.s[3] + fmla v0.2s, v25.2s, v16.s[3] + cmp x29, x19 + b.ge .LBB0_101 + .p2align 2 +.LBB0_106: // Parent Loop BB0_102 Depth=1 + // => This Inner Loop Header: Depth=2 + add x21, x6, x28 + prfm pldl1keep, [x6] + ldur s16, [x6, #-4] + add x20, x20, #1 + prfm pldl1keep, [x21] + ldur s17, [x21, #-4] + add x21, x21, x28 + add x6, x6, #4 + prfm pldl1keep, [x21] + ldur s18, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s19, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s20, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s21, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s22, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s23, [x21, #-4] + prfm pldl1keep, [x7] + ldur s24, [x7, #-4] + add x7, x7, #4 + fmla v6.2s, 
v24.2s, v16.2s + fmla v7.2s, v24.2s, v17.2s + fmla v5.2s, v24.2s, v18.2s + fmla v4.2s, v24.2s, v19.2s + fmla v2.2s, v24.2s, v20.2s + fmla v3.2s, v24.2s, v21.2s + fmla v1.2s, v24.2s, v22.2s + fmla v0.2s, v24.2s, v23.2s + cmp x20, x19 + b.lt .LBB0_106 + b .LBB0_101 +.LBB0_107: + ldr x13, [sp, #336] // 8-byte Folded Reload + cmp x7, x13 + b.lt .LBB0_110 +// %bb.108: + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x13, x11 + b.lt .LBB0_115 +.LBB0_109: + ldr x11, [sp, #288] // 8-byte Folded Reload + ldr x12, [sp, #312] // 8-byte Folded Reload + cmp x12, x11 + b.lt .LBB0_120 + b .LBB0_125 +.LBB0_110: + add x18, x7, #1 + add x1, x7, #2 + add x2, x7, #3 + mul x14, x7, x27 + madd x13, x18, x27, x25 + mov x16, xzr + add x14, x14, x25 + mul x18, x18, x22 + mul x15, x7, x22 + madd x12, x1, x27, x25 + lsl x17, x15, #2 + lsl x18, x18, #2 + madd x11, x2, x27, x25 + ldr s2, [x24, x14, lsl #2] + ldr s0, [x24, x11, lsl #2] + ldr s1, [x24, x12, lsl #2] + ldr s3, [x24, x13, lsl #2] + ldr q6, [x9, x17] + ldr q7, [x9, x18] + mul x18, x1, x22 + mov x17, x8 + ldr s16, [x17], #12 + lsl x18, x18, #2 + ldr q5, [x9, x18] + mul x18, x2, x22 + lsl x18, x18, #2 + ldp x2, x1, [sp, #320] // 16-byte Folded Reload + ldr q4, [x9, x18] + ldr x18, [sp, #32] // 8-byte Folded Reload + lsl x18, x18, #5 + add x18, x18, x2, lsl #2 + add x18, x18, x1 + add x18, x18, #32 + ext v20.16b, v6.16b, v6.16b, #8 + cmp xzr, x23 + ext v19.16b, v7.16b, v7.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_112 + .p2align 2 +.LBB0_111: // =>This Inner Loop Header: Depth=1 + add x1, x17, #8 + fmla v2.2s, v16.2s, v6.2s + fmla v3.2s, v16.2s, v7.2s + add x16, x16, #4 + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x1] + add x1, x18, x28 + ldp s16, s21, [x17, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v3.2s, v16.2s, v7.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v2.2s, v21.2s, v20.2s + ldp s17, s16, 
[x17], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v1.2s, v21.2s, v18.2s + prfm pldl1keep, [x18] + fmla v2.2s, v17.2s, v6.s[3] + ldur q6, [x18, #-16] + prfm pldl1keep, [x1] + fmla v3.2s, v17.2s, v7.s[3] + ldur q7, [x1, #-16] + add x1, x1, x28 + fmla v1.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x18, x18, #16 + prfm pldl1keep, [x1] + ldur q5, [x1, #-16] + add x1, x1, x28 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + ext v20.16b, v6.16b, v6.16b, #8 + cmp x16, x23 + ext v19.16b, v7.16b, v7.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_111 +.LBB0_112: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v2.2s, v16.2s, v6.2s + fmla v3.2s, v16.2s, v7.2s + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + add x15, x19, x15 + ldr s21, [x8, x17, lsl #2] + ldr s16, [x8, x16, lsl #2] + ldr x16, [sp, #416] // 8-byte Folded Reload + ldr x17, [sp, #344] // 8-byte Folded Reload + ldr s22, [x8, x16, lsl #2] + sub x16, x15, x30 + add x15, x10, #4 + add x16, x17, x16, lsl #2 + ldr x17, [sp, #328] // 8-byte Folded Reload + fmla v2.2s, v21.2s, v6.s[1] + fmla v3.2s, v21.2s, v7.s[1] + fmla v1.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + add x16, x16, x17 + mov x17, x29 + fmla v2.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v1.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + add x16, x16, #4 + fmla v2.2s, v22.2s, v6.s[3] + fmla v3.2s, v22.2s, v7.s[3] + fmla v1.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x29, x19 + b.ge .LBB0_114 + .p2align 2 +.LBB0_113: // =>This Inner Loop Header: Depth=1 + add x18, x16, x28 + prfm pldl1keep, [x16] + ldur s4, [x16, #-4] + add x17, x17, #1 + prfm pldl1keep, [x18] + ldur s5, [x18, #-4] + add x18, x18, x28 + add x16, x16, #4 + prfm pldl1keep, [x18] + ldur s6, [x18, #-4] + add x18, x18, x28 + prfm pldl1keep, [x18] + ldur s7, [x18, #-4] + prfm pldl1keep, [x15] + ldur s16, [x15, #-4] + add x15, x15, #4 + fmla v2.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla 
v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x17, x19 + b.lt .LBB0_113 +.LBB0_114: + str s2, [x24, x14, lsl #2] + str s3, [x24, x13, lsl #2] + ldr x13, [sp, #336] // 8-byte Folded Reload + str s1, [x24, x12, lsl #2] + str s0, [x24, x11, lsl #2] + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x13, x11 + b.ge .LBB0_109 +.LBB0_115: + mul x11, x13, x27 + add x14, x13, #1 + ldr x1, [sp, #328] // 8-byte Folded Reload + ldr x2, [sp, #344] // 8-byte Folded Reload + mul x13, x13, x22 + ldr s4, [x8] + mov x15, xzr + mov x16, xzr + madd x12, x14, x27, x25 + lsl x17, x13, #2 + mul x14, x14, x22 + add x18, x1, x2 + add x11, x11, x25 + ldr q3, [x9, x17] + lsl x17, x14, #2 + ldr s0, [x24, x11, lsl #2] + ldr s1, [x24, x12, lsl #2] + ldr q2, [x9, x17] + add x17, x18, x17 + ldr x18, [sp, #40] // 8-byte Folded Reload + add x18, x2, x18, lsl #4 + add x18, x1, x18 + ext v6.16b, v3.16b, v3.16b, #8 + cmp xzr, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.ge .LBB0_117 + .p2align 2 +.LBB0_116: // =>This Inner Loop Header: Depth=1 + add x5, x8, x15 + fmla v0.2s, v4.2s, v3.2s + fmla v1.2s, v4.2s, v2.2s + add x1, x17, x15 + add x6, x5, #20 + add x3, x18, x15 + add x2, x1, #32 + add x4, x3, #32 + prfm pldl1keep, [x6] + ldp s4, s7, [x5, #4] + add x16, x16, #4 + add x15, x15, #16 + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v7.2s, v5.2s + ldp s5, s4, [x5, #12] + fmla v0.2s, v7.2s, v6.2s + prfm pldl1keep, [x4] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x3, #16] + prfm pldl1keep, [x2] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x1, #16] + ext v6.16b, v3.16b, v3.16b, #8 + cmp x16, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.lt .LBB0_116 +.LBB0_117: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v0.2s, v4.2s, v3.2s + fmla v1.2s, v4.2s, v2.2s + add x13, x19, x13 + mov x15, xzr + ldr s7, [x8, x17, lsl #2] + ldr s4, [x8, x16, lsl #2] + ldr x16, [sp, #416] // 8-byte Folded Reload + ldr x17, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v3.s[1] + fmla 
v1.2s, v7.2s, v2.s[1] + ldr s7, [x8, x16, lsl #2] + sub x16, x13, x30 + add x13, x19, x14 + ldr x14, [sp, #344] // 8-byte Folded Reload + sub x13, x13, x30 + fmla v0.2s, v4.2s, v6.2s + fmla v1.2s, v4.2s, v5.2s + add x13, x14, x13, lsl #2 + add x14, x14, x16, lsl #2 + mov x16, x29 + add x13, x17, x13 + add x14, x17, x14 + fmla v0.2s, v7.2s, v3.s[3] + fmla v1.2s, v7.2s, v2.s[3] + cmp x29, x19 + b.ge .LBB0_119 + .p2align 2 +.LBB0_118: // =>This Inner Loop Header: Depth=1 + add x17, x10, x15 + add x18, x13, x15 + add x1, x14, x15 + add x16, x16, #1 + add x17, x17, #4 + add x18, x18, #4 + add x1, x1, #4 + prfm pldl1keep, [x1] + prfm pldl1keep, [x18] + ldr s2, [x14, x15] + prfm pldl1keep, [x17] + ldr s3, [x10, x15] + fmla v0.2s, v3.2s, v2.2s + ldr s2, [x13, x15] + add x15, x15, #4 + fmla v1.2s, v3.2s, v2.2s + cmp x16, x19 + b.lt .LBB0_118 +.LBB0_119: + str s0, [x24, x11, lsl #2] + str s1, [x24, x12, lsl #2] + ldr x11, [sp, #288] // 8-byte Folded Reload + ldr x12, [sp, #312] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_125 +.LBB0_120: + mul x11, x12, x27 + mov x14, x8 + mov x13, xzr + add x11, x11, x25 + mul x12, x12, x22 + lsl x15, x12, #2 + ldr s2, [x14], #12 + ldr q1, [x9, x15] + ldr x9, [sp, #48] // 8-byte Folded Reload + ldr x15, [sp, #320] // 8-byte Folded Reload + ldr s0, [x24, x11, lsl #2] + lsl x9, x9, #3 + add x9, x9, x15, lsl #2 + ldr x15, [sp, #328] // 8-byte Folded Reload + add x9, x9, x15 + add x9, x9, #32 + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x23 + b.ge .LBB0_122 + .p2align 2 +.LBB0_121: // =>This Inner Loop Header: Depth=1 + add x15, x14, #8 + fmla v0.2s, v2.2s, v1.2s + add x13, x13, #4 + prfm pldl1keep, [x15] + ldp s2, s4, [x14, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x14], #16 + prfm pldl1keep, [x9] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x9, #-16] + add x9, x9, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x13, x23 + b.lt .LBB0_121 +.LBB0_122: + ldp x14, x13, [sp, #400] // 16-byte Folded Reload + fmla 
v0.2s, v2.2s, v1.2s + mov x9, xzr + ldr s4, [x8, x14, lsl #2] + ldr s2, [x8, x13, lsl #2] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s4, [x8, x13, lsl #2] + add x8, x19, x12 + ldr x12, [sp, #344] // 8-byte Folded Reload + sub x8, x8, x30 + fmla v0.2s, v2.2s, v3.2s + add x8, x12, x8, lsl #2 + ldr x12, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[3] + add x8, x12, x8 + cmp x29, x19 + b.ge .LBB0_124 + .p2align 2 +.LBB0_123: // =>This Inner Loop Header: Depth=1 + add x12, x10, x9 + add x13, x8, x9 + add x29, x29, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + prfm pldl1keep, [x12] + ldr s1, [x10, x9] + ldr s2, [x8, x9] + add x9, x9, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x29, x19 + b.lt .LBB0_123 +.LBB0_124: + str s0, [x24, x11, lsl #2] +.LBB0_125: + bl free +.LBB0_126: + add sp, sp, #512 + ldp d9, d8, [sp, #32] // 16-byte Folded Reload + ldp d11, d10, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #128] // 16-byte Folded Reload + ldp x22, x21, [sp, #112] // 16-byte Folded Reload + ldp x24, x23, [sp, #96] // 16-byte Folded Reload + ldp x26, x25, [sp, #80] // 16-byte Folded Reload + ldp x28, x27, [sp, #64] // 16-byte Folded Reload + ldp x29, x30, [sp, #48] // 16-byte Folded Reload + ldr d12, [sp], #144 // 8-byte Folded Reload + ret +.Lfunc_end0: + .size sgemm_nn_alpha1_beta1_mlir, .Lfunc_end0-sgemm_nn_alpha1_beta1_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s new file mode 100644 index 00000000000000..ffd32ba76066c8 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s @@ -0,0 +1,709 @@ + .text + .file "LLVMDialectModule" + .globl sgemv_n_alpha1_beta1_mlir // -- Begin function sgemv_n_alpha1_beta1_mlir + .p2align 4 + .type 
sgemv_n_alpha1_beta1_mlir,@function +sgemv_n_alpha1_beta1_mlir: // @sgemv_n_alpha1_beta1_mlir + .cfi_startproc +// %bb.0: + sub sp, sp, #112 + stp x29, x30, [sp, #16] // 16-byte Folded Spill + stp x28, x27, [sp, #32] // 16-byte Folded Spill + stp x26, x25, [sp, #48] // 16-byte Folded Spill + stp x24, x23, [sp, #64] // 16-byte Folded Spill + stp x22, x21, [sp, #80] // 16-byte Folded Spill + stp x20, x19, [sp, #96] // 16-byte Folded Spill + .cfi_def_cfa_offset 112 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + cmp x4, #0 + ldr x9, [sp, #112] + lsl x7, x5, #2 + mov x27, xzr + cinv x8, x4, lt + add x0, x1, #448 + lsl x2, x5, #4 + add x10, x8, x8, lsr #63 + add x11, x8, #3 + add x12, x8, #7 + add x19, x7, #448 + asr x10, x10, #1 + mov x15, x9 + cinv x10, x10, lt + cmp x8, #0 + csel x11, x11, x8, lt + csel x8, x12, x8, lt + cmp x4, #0 + asr x11, x11, #2 + asr x8, x8, #3 + cinv x12, x11, lt + cinv x14, x8, lt + cmp x3, #0 + lsl x11, x10, #3 + cinv x6, x3, lt + lsl x13, x12, #4 + lsl x16, x14, #5 + add x21, x9, x11 + lsl x12, x12, #2 + lsl x14, x14, #3 + add x8, x6, #3 + cmp x6, #0 + csel x8, x8, x6, lt + cmp x3, #0 + asr x8, x8, #2 + cinv x8, x8, lt + stp x8, x9, [sp] // 16-byte Folded Spill + lsl x17, x8, #2 + add x8, x11, x1 + add x20, x8, #72 + lsl x8, x10, #1 + add x10, x13, #128 + add x22, x1, x10 + add x23, x9, x10 + add x10, x16, #256 + add x24, x1, x10 + add x25, x9, x10 + ldr x10, [sp, #152] + b .LBB0_2 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_2 Depth=1 + mov s5, v0.s[2] + fadd s2, s2, s0 + add x0, x0, x2 + add x20, x20, x2 + add x24, x24, x2 + add x22, x22, x2 + fadd s3, s3, s5 + mov s5, v0.s[1] + mov s0, v0.s[3] + fadd s4, s4, s5 + fadd s0, s1, s0 + mov v2.s[1], v4.s[0] + mov v2.s[2], v3.s[0] + mov v2.s[3], v0.s[0] + str q2, [x26] 
+.LBB0_2: // =>This Loop Header: Depth=1 + // Child Loop BB0_4 Depth 2 + // Child Loop BB0_6 Depth 2 + // Child Loop BB0_8 Depth 2 + // Child Loop BB0_10 Depth 2 + cmp x27, x17 + b.ge .LBB0_11 +// %bb.3: // in Loop: Header=BB0_2 Depth=1 + add x26, x10, x27, lsl #2 + movi v4.2d, #0000000000000000 + movi v3.2d, #0000000000000000 + movi v5.2d, #0000000000000000 + movi v0.2d, #0000000000000000 + mov x28, x0 + mov x29, xzr + mov x30, x15 + ldr q1, [x26] + movi v7.2d, #0000000000000000 + movi v16.2d, #0000000000000000 + add x27, x27, #4 + movi v2.2d, #0000000000000000 + movi v6.2d, #0000000000000000 + cmp xzr, x14 + b.ge .LBB0_5 + .p2align 2 +.LBB0_4: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + sub x18, x28, #448 + prfm pldl1keep, [x28] + add x28, x28, #32 + add x29, x29, #8 + ldp q18, q17, [x18] + add x18, x18, x19 + prfm pldl1keep, [x18] + sub x18, x18, #448 + add x9, x18, x19 + ldp q20, q19, [x18] + prfm pldl1keep, [x9] + sub x9, x9, #448 + add x18, x9, x19 + ldp q22, q21, [x9] + add x9, x30, #448 + prfm pldl1keep, [x18] + ldp q23, q24, [x18, #-448]! 
+ prfm pldl1keep, [x9] + ldp q26, q25, [x30], #32 + fmla v6.4s, v21.4s, v25.4s + fmla v3.4s, v19.4s, v25.4s + fmla v7.4s, v24.4s, v25.4s + fmla v0.4s, v17.4s, v25.4s + fmla v5.4s, v22.4s, v26.4s + fmla v4.4s, v20.4s, v26.4s + fmla v2.4s, v18.4s, v26.4s + fmla v16.4s, v23.4s, v26.4s + cmp x29, x14 + b.lt .LBB0_4 +.LBB0_5: // in Loop: Header=BB0_2 Depth=1 + mov s17, v1.s[3] + mov s18, v16.s[1] + mov x28, x25 + mov x29, x14 + mov x30, x24 + fadd s17, s17, s16 + fadd s17, s17, s18 + mov s18, v16.s[2] + mov s16, v16.s[3] + fadd s17, s17, s18 + fadd s16, s17, s16 + mov s17, v7.s[1] + fadd s16, s16, s7 + fadd s16, s16, s17 + mov s17, v7.s[2] + mov s7, v7.s[3] + fadd s16, s16, s17 + mov s17, v5.s[1] + fadd s7, s16, s7 + mov s16, v1.s[2] + fadd s16, s16, s5 + fadd s16, s16, s17 + mov s17, v5.s[2] + mov s5, v5.s[3] + fadd s16, s16, s17 + fadd s5, s16, s5 + mov s16, v6.s[1] + fadd s5, s5, s6 + fadd s5, s5, s16 + mov s16, v6.s[2] + mov s6, v6.s[3] + fadd s5, s5, s16 + mov s16, v4.s[1] + fadd s5, s5, s6 + mov s6, v1.s[1] + fadd s1, s1, s2 + fadd s6, s6, s4 + fadd s6, s6, s16 + mov s16, v4.s[2] + mov s4, v4.s[3] + fadd s6, s6, s16 + fadd s4, s6, s4 + mov s6, v3.s[1] + fadd s4, s4, s3 + fadd s4, s4, s6 + mov s6, v3.s[2] + mov s3, v3.s[3] + fadd s4, s4, s6 + fadd s3, s4, s3 + mov s4, v2.s[1] + fadd s1, s1, s4 + mov s4, v2.s[2] + mov s2, v2.s[3] + fadd s1, s1, s4 + movi v4.2d, #0000000000000000 + fadd s1, s1, s2 + mov s2, v0.s[1] + fadd s1, s1, s0 + fadd s1, s1, s2 + mov s2, v0.s[2] + mov s0, v0.s[3] + fadd s1, s1, s2 + movi v2.2d, #0000000000000000 + fadd s0, s1, s0 + movi v1.2d, #0000000000000000 + mov v0.s[1], v3.s[0] + movi v3.2d, #0000000000000000 + mov v0.s[2], v5.s[0] + mov v0.s[3], v7.s[0] + str q0, [x26] + cmp x14, x12 + b.ge .LBB0_7 + .p2align 2 +.LBB0_6: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + add x9, x30, x7 + prfm pldl1keep, [x30] + ldur q5, [x30, #-256] + add x30, x30, #16 + add x18, x9, x7 + prfm pldl1keep, [x9] + ldur q6, [x9, #-256] + 
add x29, x29, #4 + add x9, x18, x7 + prfm pldl1keep, [x18] + ldur q7, [x18, #-256] + prfm pldl1keep, [x9] + ldur q16, [x9, #-256] + prfm pldl1keep, [x28] + ldur q17, [x28, #-256] + add x28, x28, #16 + fmla v3.4s, v7.4s, v17.4s + fmla v1.4s, v16.4s, v17.4s + fmla v4.4s, v6.4s, v17.4s + fmla v2.4s, v5.4s, v17.4s + cmp x29, x12 + b.lt .LBB0_6 +.LBB0_7: // in Loop: Header=BB0_2 Depth=1 + mov s5, v0.s[1] + mov s6, v4.s[1] + mov x28, x23 + mov x29, x22 + mov s7, v1.s[1] + mov x30, x12 + fadd s5, s5, s4 + fadd s5, s5, s6 + mov s6, v4.s[2] + mov s4, v4.s[3] + fadd s5, s5, s6 + fadd s6, s0, s2 + fadd s4, s5, s4 + mov s5, v2.s[1] + fadd s5, s6, s5 + mov s6, v2.s[2] + mov s2, v2.s[3] + fadd s5, s5, s6 + movi d6, #0000000000000000 + fadd s2, s5, s2 + mov s5, v3.s[1] + mov v2.s[1], v4.s[0] + mov s4, v0.s[2] + mov s0, v0.s[3] + fadd s4, s4, s3 + fadd s0, s0, s1 + fadd s4, s4, s5 + mov s5, v3.s[2] + fadd s0, s0, s7 + mov s7, v1.s[2] + mov s3, v3.s[3] + mov s1, v1.s[3] + fadd s4, s4, s5 + fadd s0, s0, s7 + movi d5, #0000000000000000 + fadd s3, s4, s3 + fadd s0, s0, s1 + movi d4, #0000000000000000 + mov v2.s[2], v3.s[0] + movi d3, #0000000000000000 + mov v2.s[3], v0.s[0] + str q2, [x26] + cmp x12, x8 + b.ge .LBB0_9 + .p2align 2 +.LBB0_8: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + add x9, x29, x7 + prfm pldl1keep, [x29] + ldur d0, [x29, #-128] + add x29, x29, #8 + add x18, x9, x7 + prfm pldl1keep, [x9] + ldur d1, [x9, #-128] + add x30, x30, #2 + add x9, x18, x7 + prfm pldl1keep, [x18] + ldur d7, [x18, #-128] + prfm pldl1keep, [x9] + ldur d16, [x9, #-128] + prfm pldl1keep, [x28] + ldur d17, [x28, #-128] + add x28, x28, #8 + fmla v5.2s, v7.2s, v17.2s + fmla v6.2s, v16.2s, v17.2s + fmla v4.2s, v1.2s, v17.2s + fmla v3.2s, v0.2s, v17.2s + cmp x30, x8 + b.lt .LBB0_8 +.LBB0_9: // in Loop: Header=BB0_2 Depth=1 + mov s0, v2.s[3] + mov s1, v6.s[1] + mov x28, x21 + mov x29, x20 + mov x30, x8 + fadd s0, s0, s6 + mov s6, v2.s[2] + fadd s6, s6, s5 + fadd s1, s0, s1 + 
mov s0, v2.s[1] + fadd s2, s2, s3 + fadd s0, s0, s4 + mov s4, v4.s[1] + fadd s4, s0, s4 + mov s0, v5.s[1] + fadd s5, s6, s0 + mov s0, v3.s[1] + movi d3, #0000000000000000 + fadd s0, s2, s0 + movi d2, #0000000000000000 + mov v0.s[1], v4.s[0] + movi d4, #0000000000000000 + mov v0.s[2], v5.s[0] + mov v0.s[3], v1.s[0] + movi d1, #0000000000000000 + str q0, [x26] + cmp x8, x4 + b.ge .LBB0_1 + .p2align 2 +.LBB0_10: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + add x9, x29, x7 + prfm pldl1keep, [x29] + ldur s5, [x29, #-72] + add x29, x29, #4 + add x18, x9, x7 + prfm pldl1keep, [x9] + ldur s6, [x9, #-72] + add x30, x30, #1 + prfm pldl1keep, [x18] + add x9, x18, x7 + ldur s7, [x18, #-72] + add x18, x28, #72 + prfm pldl1keep, [x9] + ldur s16, [x9, #-72] + prfm pldl1keep, [x18] + ldr s17, [x28], #4 + fmul s7, s7, s17 + fmul s6, s6, s17 + fmul s5, s5, s17 + fadd v3.2s, v3.2s, v7.2s + fmul s7, s16, s17 + fadd v4.2s, v4.2s, v6.2s + fadd v2.2s, v2.2s, v5.2s + fadd v1.2s, v1.2s, v7.2s + cmp x30, x4 + b.lt .LBB0_10 + b .LBB0_1 +.LBB0_11: + add x9, x6, x6, lsr #63 + cmp x3, #0 + asr x9, x9, #1 + cinv x2, x9, lt + lsl x0, x2, #1 + cmp x17, x0 + b.ge .LBB0_21 +// %bb.12: + ldr x9, [sp] // 8-byte Folded Reload + movi v4.2d, #0000000000000000 + movi v2.2d, #0000000000000000 + mov x7, x15 + movi v3.2d, #0000000000000000 + movi v0.2d, #0000000000000000 + mov x19, xzr + mul x9, x9, x5 + add x18, x1, x9, lsl #4 + add x9, x17, #1 + add x17, x10, x17, lsl #2 + mul x9, x5, x9 + ldr d1, [x17] + mov x20, x18 + add x6, x1, x9, lsl #2 + mov x21, x6 + cmp xzr, x14 + b.ge .LBB0_14 + .p2align 2 +.LBB0_13: // =>This Inner Loop Header: Depth=1 + add x9, x20, #736 + add x19, x19, #8 + prfm pldl1keep, [x9] + add x9, x21, #736 + ldp q6, q5, [x20], #32 + prfm pldl1keep, [x9] + add x9, x7, #736 + ldp q16, q7, [x21], #32 + prfm pldl1keep, [x9] + ldr q17, [x7, #16] + fmla v0.4s, v5.4s, v17.4s + fmla v3.4s, v7.4s, v17.4s + ldr q5, [x7], #32 + fmla v4.4s, v16.4s, v5.4s + fmla v2.4s, 
v6.4s, v5.4s + cmp x19, x14 + b.lt .LBB0_13 +.LBB0_14: + mov s5, v1.s[1] + mov s6, v4.s[1] + mov x7, x18 + mov x19, x6 + fadd s1, s1, s2 + mov x20, x14 + mov x21, x15 + fadd s5, s5, s4 + fadd s5, s5, s6 + mov s6, v4.s[2] + mov s4, v4.s[3] + fadd s5, s5, s6 + fadd s4, s5, s4 + mov s5, v3.s[1] + fadd s4, s4, s3 + fadd s4, s4, s5 + mov s5, v3.s[2] + mov s3, v3.s[3] + fadd s4, s4, s5 + fadd s3, s4, s3 + mov s4, v2.s[1] + fadd s1, s1, s4 + mov s4, v2.s[2] + mov s2, v2.s[3] + fadd s1, s1, s4 + fadd s1, s1, s2 + mov s2, v0.s[1] + fadd s1, s1, s0 + fadd s1, s1, s2 + mov s2, v0.s[2] + mov s0, v0.s[3] + fadd s1, s1, s2 + movi v2.2d, #0000000000000000 + fadd s0, s1, s0 + movi v1.2d, #0000000000000000 + mov v0.s[1], v3.s[0] + str d0, [x17] + cmp x14, x12 + b.ge .LBB0_16 + .p2align 2 +.LBB0_15: // =>This Inner Loop Header: Depth=1 + add x9, x7, x16 + add x20, x20, #4 + add x9, x9, #432 + prfm pldl1keep, [x9] + add x9, x19, x16 + ldr q3, [x7, x16] + add x7, x7, #16 + add x9, x9, #432 + prfm pldl1keep, [x9] + add x9, x21, x16 + add x9, x9, #432 + ldr q4, [x19, x16] + add x19, x19, #16 + prfm pldl1keep, [x9] + ldr q5, [x21, x16] + add x21, x21, #16 + fmla v2.4s, v4.4s, v5.4s + fmla v1.4s, v3.4s, v5.4s + cmp x20, x12 + b.lt .LBB0_15 +.LBB0_16: + mov s3, v0.s[1] + mov s4, v2.s[1] + mov x7, x18 + mov x19, x6 + fadd s0, s0, s1 + mov x20, x15 + mov x21, x12 + fadd s3, s3, s2 + fadd s3, s3, s4 + mov s4, v2.s[2] + mov s2, v2.s[3] + fadd s3, s3, s4 + fadd s2, s3, s2 + mov s3, v1.s[1] + fadd s0, s0, s3 + mov s3, v1.s[2] + mov s1, v1.s[3] + fadd s0, s0, s3 + fadd s0, s0, s1 + movi d1, #0000000000000000 + mov v0.s[1], v2.s[0] + movi d2, #0000000000000000 + str d0, [x17] + cmp x12, x8 + b.ge .LBB0_18 + .p2align 2 +.LBB0_17: // =>This Inner Loop Header: Depth=1 + add x9, x7, x13 + add x21, x21, #2 + add x9, x9, #216 + prfm pldl1keep, [x9] + add x9, x19, x13 + ldr d3, [x7, x13] + add x7, x7, #8 + add x9, x9, #216 + prfm pldl1keep, [x9] + add x9, x20, x13 + add x9, x9, #216 + ldr d4, [x19, x13] 
+ add x19, x19, #8 + prfm pldl1keep, [x9] + ldr d5, [x20, x13] + add x20, x20, #8 + fmla v2.2s, v4.2s, v5.2s + fmla v1.2s, v3.2s, v5.2s + cmp x21, x8 + b.lt .LBB0_17 +.LBB0_18: + mov s3, v0.s[1] + fadd s0, s0, s1 + mov x7, x15 + mov x19, x8 + fadd s3, s3, s2 + mov s2, v2.s[1] + fadd s2, s3, s2 + mov s3, v1.s[1] + movi d1, #0000000000000000 + fadd s0, s0, s3 + mov v0.s[1], v2.s[0] + movi d2, #0000000000000000 + str d0, [x17] + cmp x8, x4 + b.ge .LBB0_20 + .p2align 2 +.LBB0_19: // =>This Inner Loop Header: Depth=1 + add x9, x18, x11 + add x19, x19, #1 + add x9, x9, #128 + prfm pldl1keep, [x9] + add x9, x6, x11 + ldr s3, [x18, x11] + add x18, x18, #4 + add x9, x9, #128 + prfm pldl1keep, [x9] + add x9, x7, x11 + add x9, x9, #128 + ldr s4, [x6, x11] + add x6, x6, #4 + prfm pldl1keep, [x9] + ldr s5, [x7, x11] + add x7, x7, #4 + fmul s4, s4, s5 + fmul s3, s3, s5 + fadd v1.2s, v1.2s, v4.2s + fadd v2.2s, v2.2s, v3.2s + cmp x19, x4 + b.lt .LBB0_19 +.LBB0_20: + mov s3, v0.s[1] + fadd s0, s2, s0 + fadd s1, s1, s3 + mov v0.s[1], v1.s[0] + str d0, [x17] +.LBB0_21: + cmp x0, x3 + b.ge .LBB0_31 +// %bb.22: + mul x17, x2, x5 + ldr s2, [x10, x0, lsl #2] + movi v0.2d, #0000000000000000 + movi v1.2d, #0000000000000000 + mov x18, xzr + add x2, x1, x17, lsl #3 + cmp xzr, x14 + b.ge .LBB0_24 + .p2align 2 +.LBB0_23: // =>This Inner Loop Header: Depth=1 + add x9, x2, #1152 + add x18, x18, #8 + prfm pldl1keep, [x9] + add x9, x15, #1152 + ldp q3, q4, [x2], #32 + prfm pldl1keep, [x9] + ldr q5, [x15] + fmla v1.4s, v3.4s, v5.4s + ldr q3, [x15, #16] + add x15, x15, #32 + fmla v0.4s, v4.4s, v3.4s + cmp x18, x14 + b.lt .LBB0_23 +.LBB0_24: + fadd s2, s2, s1 + mov s3, v1.s[1] + ldr x18, [sp, #8] // 8-byte Folded Reload + add x9, x16, x17, lsl #3 + add x15, x1, x9 + add x16, x18, x16 + fadd s2, s2, s3 + mov s3, v1.s[2] + mov s1, v1.s[3] + fadd s2, s2, s3 + fadd s1, s2, s1 + mov s2, v0.s[1] + fadd s1, s1, s0 + fadd s1, s1, s2 + mov s2, v0.s[2] + mov s0, v0.s[3] + fadd s1, s1, s2 + fadd s0, s1, s0 + 
movi v1.2d, #0000000000000000 + str s0, [x10, x0, lsl #2] + cmp x14, x12 + b.ge .LBB0_26 + .p2align 2 +.LBB0_25: // =>This Inner Loop Header: Depth=1 + add x9, x15, #672 + add x14, x14, #4 + prfm pldl1keep, [x9] + add x9, x16, #672 + ldr q2, [x15], #16 + prfm pldl1keep, [x9] + ldr q3, [x16], #16 + fmla v1.4s, v2.4s, v3.4s + cmp x14, x12 + b.lt .LBB0_25 +.LBB0_26: + fadd s0, s0, s1 + mov s2, v1.s[1] + add x9, x13, x17, lsl #3 + add x13, x18, x13 + add x14, x1, x9 + fadd s0, s0, s2 + mov s2, v1.s[2] + mov s1, v1.s[3] + fadd s0, s0, s2 + fadd s0, s0, s1 + movi d1, #0000000000000000 + str s0, [x10, x0, lsl #2] + cmp x12, x8 + b.ge .LBB0_28 + .p2align 2 +.LBB0_27: // =>This Inner Loop Header: Depth=1 + add x9, x14, #336 + add x12, x12, #2 + prfm pldl1keep, [x9] + add x9, x13, #336 + ldr d2, [x14], #8 + prfm pldl1keep, [x9] + ldr d3, [x13], #8 + fmla v1.2s, v2.2s, v3.2s + cmp x12, x8 + b.lt .LBB0_27 +.LBB0_28: + fadd s0, s0, s1 + mov s2, v1.s[1] + add x9, x11, x17, lsl #3 + add x12, x1, x9 + movi d1, #0000000000000000 + add x9, x18, x11 + fadd s0, s0, s2 + str s0, [x10, x0, lsl #2] + cmp x8, x4 + b.ge .LBB0_30 + .p2align 2 +.LBB0_29: // =>This Inner Loop Header: Depth=1 + add x11, x12, #200 + add x8, x8, #1 + prfm pldl1keep, [x11] + add x11, x9, #200 + ldr s2, [x12], #4 + prfm pldl1keep, [x11] + ldr s3, [x9], #4 + fmul s2, s2, s3 + fadd v1.2s, v1.2s, v2.2s + cmp x8, x4 + b.lt .LBB0_29 +.LBB0_30: + fadd s0, s1, s0 + str s0, [x10, x0, lsl #2] +.LBB0_31: + ldp x20, x19, [sp, #96] // 16-byte Folded Reload + ldp x22, x21, [sp, #80] // 16-byte Folded Reload + ldp x24, x23, [sp, #64] // 16-byte Folded Reload + ldp x26, x25, [sp, #48] // 16-byte Folded Reload + ldp x28, x27, [sp, #32] // 16-byte Folded Reload + ldp x29, x30, [sp, #16] // 16-byte Folded Reload + add sp, sp, #112 + ret +.Lfunc_end0: + .size sgemv_n_alpha1_beta1_mlir, .Lfunc_end0-sgemv_n_alpha1_beta1_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git 
a/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp new file mode 100644 index 00000000000000..c65157a12444ae --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp @@ -0,0 +1,46 @@ +#include +#include + +#include +#include +#include + +extern "C" void sbatch_matmul_3d_nn_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t); + +extern "C" void sbatch_matmul_3d_nt_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t); + +// C interface +extern "C" void cblas_sbatch_matmul_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const BLASINT BATCH, const BLASINT M, + const BLASINT N, const BLASINT K, const float *A, const BLASINT LDA, + const float *B, const BLASINT LDB, float *C, const BLASINT LDC) { + + // For the mini lib we only have nn,nt + assert(Order == CblasRowMajor); + assert(TransA == CblasNoTrans); + + memset(C, 0, BATCH * M * N * sizeof(float)); + + if (TransB == CblasTrans) { + sbatch_matmul_3d_nt_mlir(/* A */ Memref_3D_Args(A, BATCH, M, K, LDA), + /* B */ Memref_3D_Args(B, BATCH, N, K, LDB), + /* C */ Memref_3D_Args(C, BATCH, M, N, LDC)); + } else { + sbatch_matmul_3d_nn_mlir(/* A */ Memref_3D_Args(A, BATCH, M, K, LDA), + /* B */ Memref_3D_Args(B, BATCH, K, N, LDB), + /* C */ Memref_3D_Args(C, BATCH, M, N, LDC)); + } +} diff --git 
a/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp new file mode 100644 index 00000000000000..f92e217d3a1693 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp @@ -0,0 +1,49 @@ +#include +#include + +#include +#include +#include + +extern "C" void sbatch_matmul_4d_nn_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t); + +extern "C" void sbatch_matmul_4d_nt_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t); + +// C interface +extern "C" void cblas_sbatch_matmul_4d_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const BLASINT BATCH1, + const BLASINT BATCH2, const BLASINT M, const BLASINT N, const BLASINT K, + const float *A, const BLASINT LDA, const float *B, const BLASINT LDB, + float *C, const BLASINT LDC) { + + // For the mini lib we only have nn,nt + assert(Order == CblasRowMajor); + assert(TransA == CblasNoTrans); + + memset(C, 0, BATCH1 * BATCH2 * M * N * sizeof(float)); + + if (TransB == CblasTrans) { + sbatch_matmul_4d_nt_mlir( + /* A */ Memref_4D_Args(A, BATCH1, BATCH2, M, K, LDA), + /* B */ Memref_4D_Args(B, BATCH1, BATCH2, N, K, LDB), + /* C */ Memref_4D_Args(C, BATCH1, BATCH2, M, N, LDC)); + } else { + sbatch_matmul_4d_nn_mlir( + /* A */ 
Memref_4D_Args(A, BATCH1, BATCH2, M, K, LDA), + /* B */ Memref_4D_Args(B, BATCH1, BATCH2, K, N, LDB), + /* C */ Memref_4D_Args(C, BATCH1, BATCH2, M, N, LDC)); + } +} diff --git a/third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp new file mode 100644 index 00000000000000..b51efca3f51b71 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp @@ -0,0 +1,43 @@ +#include +#include + +#include +#include + +#include + +extern "C" void sgemm_nn_alpha1_beta1_mlir( + /* alpha */ float, + /* beta */ float, + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t); + +// C interface +extern "C" void cblas_sgemm_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const BLASINT M, const BLASINT N, + const BLASINT K, const float alpha, const float *A, const BLASINT LDA, + const float *B, const BLASINT LDB, const float beta, float *C, + const BLASINT LDC) { + // For the mini lib we only have nn, alpha=1, beta=1 or beta=0. 
+ assert(Order == CblasRowMajor); + assert(TransA == CblasNoTrans); + assert(TransB == CblasNoTrans); + assert(alpha == 1.0); + assert(beta == 1.0 || beta == 0.0); + + // This is faster + if (beta == 0.0) { + memset(C, 0, M * N * sizeof(float)); + } + + // Call MLIR kernel + sgemm_nn_alpha1_beta1_mlir(/* alpha */ 1.0, + /* beta */ 1.0, + /* A */ Memref_2D_Args(A, M, K, LDA), + /* B */ Memref_2D_Args(B, K, N, LDB), + /* C */ Memref_2D_Args(C, M, N, LDC)); +} diff --git a/third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp new file mode 100644 index 00000000000000..4ee3441735218a --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp @@ -0,0 +1,43 @@ +#include +#include +#include + +#include +#include + +extern "C" void sgemv_n_alpha1_beta1_mlir(/* alpha */ float, + /* beta */ float, + /* A */ const float *, const float *, + int64_t, int64_t, int64_t, int64_t, + int64_t, + /* X */ const float *, const float *, + int64_t, int64_t, int64_t, + /* Y */ float *, float *, int64_t, + int64_t, int64_t); + +// C interface +extern "C" void cblas_sgemv_mlir(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, + const BLASINT M, const BLASINT N, + const float alpha, const float *A, + const BLASINT LDA, const float *X, + const BLASINT INCX, const float beta, float *Y, + const BLASINT INCY) { + // For the mini lib we only have nn, alpha=1, beta=0 or beta=1. 
+ assert(TransA == CblasNoTrans); + assert(Order == CblasRowMajor); + assert(alpha == 1.0); + assert(beta == 1.0 || beta == 0.0); + + // This is faster + if (beta == 0.0) { + memset(Y, 0, M * sizeof(float)); + } + + // Call MLIR kernel + sgemv_n_alpha1_beta1_mlir(/* alpha */ 1.0, + /* beta */ 1.0, + /* A */ Memref_2D_Args(A, M, N, LDA), + /* X */ Memref_1D_Args(X, N, INCX), + /* Y */ Memref_1D_Args(Y, M, INCY)); +} diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto index 854eed7235720a..7d2a49a59c2f75 100644 --- a/third_party/xla/xla/xla.proto +++ b/third_party/xla/xla/xla.proto @@ -223,6 +223,7 @@ message DebugOptions { bool xla_cpu_use_xnnpack = 359; bool xla_cpu_enable_xnnpack = 389; + bool xla_cpu_use_kernel_selector = 390; // Enabling this will enable optimizations that ignore the possibility of NaN. bool xla_enable_fast_math = 335; @@ -1210,7 +1211,7 @@ message DebugOptions { // Note: when adding a new flag, please add it to one of the hardware-specific // or hardware-agnostic sections at the top of this proto message. - // Next id: 389 + // Next id: 391 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. 
From e5dedb2a04bbbec73a03e852e424fe6cf405f31e Mon Sep 17 00:00:00 2001 From: Wen Di Date: Mon, 12 Jan 2026 17:38:20 +0800 Subject: [PATCH 3/3] add env to set cpu instructions fusion not duplicate --- .../xla/third_party/openblas/workspace.bzl | 4 +- third_party/xla/xla/debug_options_flags.cc | 4 +- third_party/xla/xla/service/cpu/BUILD.orig | 2224 -------------- .../xla/xla/service/cpu/cpu_compiler.cc | 19 +- .../xla/xla/service/cpu/cpu_compiler.cc.orig | 2720 ----------------- .../xla/service/cpu/cpu_instruction_fusion.h | 4 +- 6 files changed, 17 insertions(+), 4958 deletions(-) delete mode 100644 third_party/xla/xla/service/cpu/BUILD.orig delete mode 100644 third_party/xla/xla/service/cpu/cpu_compiler.cc.orig diff --git a/third_party/xla/third_party/openblas/workspace.bzl b/third_party/xla/third_party/openblas/workspace.bzl index 6728207dbfe58f..74367fa1a8801d 100644 --- a/third_party/xla/third_party/openblas/workspace.bzl +++ b/third_party/xla/third_party/openblas/workspace.bzl @@ -3,8 +3,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") def repo(): tf_http_archive( name = "openblas", - strip_prefix = "OpenBLAS-0.3.29", - sha256 = "38240eee1b29e2bde47ebb5d61160207dc68668a54cac62c076bb5032013b1eb", + strip_prefix = "OpenBLAS-8795fc7985635de1ecf674b87e2008a15097ffab", + sha256 = "f5ff825b3a82417d47c2ba97606ce8a5d868f863e555025f5d4112e6dfd62e2f", urls = tf_mirror_urls("https://github.com/OpenMathLib/OpenBLAS/archive/8795fc7985635de1ecf674b87e2008a15097ffab.tar.gz"), build_file = "//third_party/openblas:openblas.BUILD", ) diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc index 7792ab22f7f929..6cdfcfa8241c7c 100644 --- a/third_party/xla/xla/debug_options_flags.cc +++ b/third_party/xla/xla/debug_options_flags.cc @@ -100,8 +100,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { #ifdef XLA_CPU_USE_ACL opts.set_xla_cpu_use_acl(true); #endif - opts.set_xla_cpu_use_fusion_emitters(false); - 
opts.set_xla_cpu_use_thunk_runtime(false); + opts.set_xla_cpu_use_fusion_emitters(true); + opts.set_xla_cpu_use_thunk_runtime(true); opts.set_xla_cpu_use_xnnpack(false); opts.set_xla_cpu_enable_xnnpack(false); // For softmax opts.set_xla_cpu_use_kernel_selector(false); diff --git a/third_party/xla/xla/service/cpu/BUILD.orig b/third_party/xla/xla/service/cpu/BUILD.orig deleted file mode 100644 index f951a6ac93b626..00000000000000 --- a/third_party/xla/xla/service/cpu/BUILD.orig +++ /dev/null @@ -1,2224 +0,0 @@ -# Description: -# LLVM-based CPU backend for XLA. - -load("@bazel_skylib//rules:build_test.bzl", "build_test") -load( - "//third_party/compute_library:build_defs.bzl", - "acl_deps", - "if_enable_acl", -) -load( - "//xla:xla.default.bzl", - "xla_cc_binary", - "xla_cc_test", -) -load("//xla/tests:build_defs.bzl", "xla_test") -load("//xla/tsl:tsl.bzl", "internal_visibility", "tf_openmp_copts", "tsl_copts") -load("//xla/tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable") -load("//xla/tsl/mkl:build_defs.bzl", "if_graph_api", "mkl_deps") -load("//xla/tsl/platform:build_config.bzl", "tf_proto_library") -load( - "//xla/tsl/platform:build_config_root.bzl", - "if_llvm_aarch64_available", - "if_llvm_powerpc_available", - "if_llvm_system_z_available", - "if_llvm_x86_available", -) -load("//xla/tsl/platform:rules_cc.bzl", "cc_library") -load(":build_defs.bzl", "runtime_copts") - -package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], - default_visibility = internal_visibility([":friends"]), - licenses = ["notice"], -) - -package_group( - name = "friends", - includes = [ - "//xla:friends", - ], -) - -# Filegroup used to collect source files for dependency checking. 
-filegroup( - name = "c_srcs", - data = glob([ - "**/*.cc", - "**/*.h", - ]), -) - -cc_library( - name = "test_header_helper", - testonly = True, - hdrs = ["test_target_triple_helper.h"], -) - -filegroup( - name = "runtime_srcs", - srcs = [ - # Single-threaded support. - "runtime_custom_call_status.cc", - "runtime_fp16.cc", - "runtime_key_value_sort.cc", - "runtime_pow.cc", - "runtime_single_threaded_conv2d.cc", - "runtime_single_threaded_conv3d.cc", - "runtime_single_threaded_fft.cc", - "runtime_single_threaded_matmul_c128.cc", - "runtime_single_threaded_matmul_c64.cc", - "runtime_single_threaded_matmul_common.h", - "runtime_single_threaded_matmul_f8.cc", - "runtime_single_threaded_matmul_f16.cc", - "runtime_single_threaded_matmul_f32.cc", - "runtime_single_threaded_matmul_f64.cc", - "runtime_single_threaded_matmul_s32.cc", - "runtime_single_threaded_matmul_u8.cc", - "runtime_topk.cc", - "xnnpack_ops.cc", - # Multi-threaded support. - "runtime_conv2d.cc", - "runtime_conv3d.cc", - "runtime_fft.cc", - "runtime_matmul_c128.cc", - "runtime_matmul_c64.cc", - "runtime_matmul_common.h", - "runtime_matmul_f16.cc", - "runtime_matmul_f32.cc", - "runtime_matmul_f64.cc", - "runtime_matmul_s32.cc", - "runtime_fork_join.cc", - "//xla/backends/cpu/runtime:runtime_srcs", - #"runtime_handle_ffi_call.cc", # TODO(b/338344732): Add "runtime_handle_ffi_call.cc". - ], - visibility = internal_visibility([":friends"]), -) - -filegroup( - name = "runtime_hdrs", - srcs = [ - # XLA Runtime support. - "buffer_desc.h", - # Single-threaded support. - "runtime_custom_call_status.h", - "runtime_fp16.h", - "runtime_key_value_sort.h", - "runtime_pow.h", - "runtime_single_threaded_conv2d.h", - "runtime_single_threaded_conv3d.h", - "runtime_single_threaded_fft.h", - "runtime_single_threaded_matmul.h", - "runtime_topk.h", - "xnnpack_ops.h", - # Multi-threaded support. 
- "runtime_conv2d.h", - "runtime_conv3d.h", - "runtime_fft.h", - "runtime_fork_join.h", - "runtime_lightweight_check.h", - "runtime_matmul.h", - "//xla/backends/cpu/runtime:runtime_hdrs", - #"runtime_handle_ffi_call.h", # TODO(b/338344732): Add "runtime_handle_ffi_call.h" - ], - visibility = internal_visibility([":friends"]), -) - -cc_library( - name = "cpu_xfeed", - srcs = ["cpu_xfeed.cc"], - hdrs = ["cpu_xfeed.h"], - deps = [ - ":cpu_runtime", - "//xla:literal", - "//xla:literal_util", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla/service:hlo_cost_analysis", - "//xla/service:shaped_buffer", - "@com_google_absl//absl/base", - "@com_google_absl//absl/cleanup", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/types:span", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:notification", - ], -) - -cc_library( - name = "cpu_transfer_manager", - srcs = ["cpu_transfer_manager.cc"], - hdrs = ["cpu_transfer_manager.h"], - deps = [ - ":cpu_runtime", - ":cpu_xfeed", - "//xla:literal", - "//xla:literal_util", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/service:compiler", - "//xla/service:generic_transfer_manager", - "//xla/service:transfer_manager", - "//xla/stream_executor:device_memory", - "//xla/stream_executor:platform_manager", - "//xla/stream_executor:stream_executor_h", - "//xla/stream_executor/host:host_platform_id", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/types:span", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - ], - alwayslink = True, # Contains per-platform transfer manager registration -) - -cc_library( - name = "buffer_info_util", - srcs = ["buffer_info_util.cc"], - hdrs = ["buffer_info_util.h"], - deps = [ - "//xla:cpu_function_runtime", - 
"//xla/hlo/ir:hlo", - "//xla/service:buffer_assignment", - "@com_google_absl//absl/types:span", - ], -) - -cc_library( - name = "cpu_compiler_pure", - srcs = ["cpu_compiler.cc"], - hdrs = ["cpu_compiler.h"], - copts = tsl_copts(), - deps = [ - ":buffer_info_util", - ":conv_canonicalization", - ":cpu_aot_compilation_result", - ":cpu_executable", - ":cpu_float_support", - ":cpu_instruction_fusion", - ":cpu_layout_assignment", - ":cpu_options", - ":dot_op_emitter", - ":executable_proto_cc", - ":fusion_wrapper", - ":ir_emission_utils", - ":ir_emitter", - ":ir_emitter2", - ":metrics", - ":onednn_contraction_rewriter", - ":onednn_float_support", - ":onednn_ops_rewriter", - ":parallel_task_assignment", - ":runtime_symbol_generator", - ":small_while_loop_hoisting_pass", - ":thunk_emitter", - ":xla_framework", - ":xnnpack_ops_rewriter", - "//xla:cpu_function_runtime", - "//xla:debug_options_flags", - "//xla:literal", - "//xla:literal_pool", - "//xla:protobuf_util", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla:xla_proto_cc", - "//xla/backends/cpu:constant_allocation", - "//xla/backends/cpu:xnn_fusion", - "//xla/backends/cpu/codegen:compiled_function_library", - "//xla/backends/cpu/codegen:cpu_features", - "//xla/backends/cpu/codegen:execution_engine", - "//xla/backends/cpu/codegen:ir_compiler", - "//xla/backends/cpu/codegen:jit_compiler", - "//xla/backends/cpu/codegen:object_loader", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", - "//xla/backends/cpu/runtime:function_library", - "//xla/backends/cpu/runtime:kernel_thunk", - "//xla/backends/cpu/runtime:thunk", - "//xla/backends/cpu/runtime:thunk_proto_cc_impl", - "//xla/backends/cpu/runtime:thunk_proto_serdes", - "//xla/backends/cpu/transforms:xnn_graph_fusion", - "//xla/hlo/analysis:hlo_ordering", - "//xla/hlo/analysis:indexed_array_analysis", - "//xla/hlo/ir:hlo", - 
"//xla/hlo/ir:hlo_module_group", - "//xla/hlo/pass:hlo_pass", - "//xla/hlo/pass:hlo_pass_pipeline", - "//xla/hlo/transforms:literal_canonicalizer", - "//xla/hlo/transforms:operand_upcaster", - "//xla/hlo/transforms:while_loop_trip_count_annotator", - "//xla/hlo/transforms/expanders:bitcast_dtypes_expander", - "//xla/hlo/transforms/expanders:cholesky_expander", - "//xla/hlo/transforms/expanders:comparison_expander", - "//xla/hlo/transforms/expanders:dot_decomposer", - "//xla/hlo/transforms/expanders:dynamic_index_splitter", - "//xla/hlo/transforms/expanders:eigh_expander", - "//xla/hlo/transforms/expanders:logistic_expander", - "//xla/hlo/transforms/expanders:optimization_barrier_expander", - "//xla/hlo/transforms/expanders:qr_expander", - "//xla/hlo/transforms/expanders:reduce_decomposer", - "//xla/hlo/transforms/expanders:reshape_decomposer", - "//xla/hlo/transforms/expanders:rng_bit_generator_expander", - "//xla/hlo/transforms/expanders:rng_expander", - "//xla/hlo/transforms/expanders:stochastic_convert_decomposer", - "//xla/hlo/transforms/simplifiers:algebraic_simplifier", - "//xla/hlo/transforms/simplifiers:batch_dot_simplification", - "//xla/hlo/transforms/simplifiers:broadcast_canonicalizer", - "//xla/hlo/transforms/simplifiers:conditional_canonicalizer", - "//xla/hlo/transforms/simplifiers:convolution_group_converter", - "//xla/hlo/transforms/simplifiers:dynamic_dimension_simplifier", - "//xla/hlo/transforms/simplifiers:flatten_call_graph", - "//xla/hlo/transforms/simplifiers:float_normalization", - "//xla/hlo/transforms/simplifiers:gather_simplifier", - "//xla/hlo/transforms/simplifiers:hlo_constant_folding", - "//xla/hlo/transforms/simplifiers:hlo_dce", - "//xla/hlo/transforms/simplifiers:hlo_memory_scheduler", - "//xla/hlo/transforms/simplifiers:optimize_input_output_buffer_alias", - "//xla/hlo/transforms/simplifiers:reduce_window_rewriter", - "//xla/hlo/transforms/simplifiers:reshape_mover", - "//xla/hlo/transforms/simplifiers:result_caster", - 
"//xla/hlo/transforms/simplifiers:simplify_fp_conversions", - "//xla/hlo/transforms/simplifiers:slice_sinker", - "//xla/hlo/transforms/simplifiers:sort_simplifier", - "//xla/hlo/transforms/simplifiers:sub_byte_normalization", - "//xla/hlo/transforms/simplifiers:tree_reduction_rewriter", - "//xla/hlo/transforms/simplifiers:tuple_simplifier", - "//xla/hlo/transforms/simplifiers:zero_sized_hlo_elimination", - "//xla/mlir_hlo", - "//xla/mlir_hlo:all_passes", - "//xla/mlir_hlo:transforms_passes", - "//xla/service:all_reduce_promotion", - "//xla/service:outer_dimension_propagation", - "//xla/service:get_outer_batch_value_simplifier", - "//xla/service:all_to_all_decomposer", - "//xla/service:batched_gather_scatter_normalizer", - "//xla/service:batchnorm_expander", - "//xla/service:buffer_assignment", - "//xla/service:call_graph", - "//xla/service:call_inliner", - "//xla/service:change_op_data_type", - "//xla/service:compiler", - "//xla/service:conditional_simplifier", - "//xla/service:conditional_to_select", - "//xla/service:copy_insertion", - "//xla/service:cpu_gpu_shape_verifier", - "//xla/service:dump", - "//xla/service:dynamic_dimension_inference", - "//xla/service:dynamic_padder", - "//xla/service:executable", - "//xla/service:float_support", - "//xla/service:gather_expander", - "//xla/service:hlo_cost_analysis", - "//xla/service:hlo_cse", - "//xla/service:hlo_execution_profile", - "//xla/service:hlo_module_config", - "//xla/service:hlo_profile_printer_data_cc", - "//xla/service:hlo_proto_cc", - "//xla/service:hlo_proto_util", - "//xla/service:hlo_verifier", - "//xla/service:layout_assignment", - "//xla/service:llvm_compiler", - "//xla/service:logical_buffer", - "//xla/service:map_inliner", - "//xla/service:scatter_expander", - "//xla/service:scatter_simplifier", - "//xla/service:select_and_scatter_expander", - "//xla/service:sharding_propagation", - "//xla/service:sharding_remover", - "//xla/service:slow_operation_alarm", - "//xla/service:topk_rewriter", - 
"//xla/service:transpose_folding", - "//xla/service:triangular_solve_expander", - "//xla/service:while_loop_constant_sinking", - "//xla/service:while_loop_invariant_code_motion", - "//xla/service:while_loop_simplifier", - "//xla/service/llvm_ir:llvm_command_line_options", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/spmd:stateful_rng_spmd_partitioner", - "//xla/service/spmd/shardy:shardy_xla_pass", - "//xla/stream_executor:platform", - "//xla/stream_executor:stream_executor_h", - "//xla/stream_executor/host:host_platform_id", - "//xla/tsl/concurrency:async_value", - "//xla/tsl/platform:env", - "//xla/tsl/platform:status", - "//xla/tsl/platform:statusor", - "//xla/tsl/protobuf:error_codes_proto_impl_cc", - "@com_google_absl//absl/base", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/cleanup", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/functional:any_invocable", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:BitReader", - "@llvm-project//llvm:BitWriter", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Linker", - "@llvm-project//llvm:MC", - "@llvm-project//llvm:Object", - "@llvm-project//llvm:OrcJIT", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:Target", - "@llvm-project//llvm:TargetParser", - "@llvm-project//llvm:TransformUtils", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineToStandard", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:ArithTransforms", - "@llvm-project//mlir:BufferizationTransforms", - "@llvm-project//mlir:BuiltinToLLVMIRTranslation", - 
"@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:LLVMToLLVMIRTranslation", - "@llvm-project//mlir:LinalgDialect", - "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:MemRefTransforms", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:ReconcileUnrealizedCasts", - "@llvm-project//mlir:SCFDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:ToLLVMIRTranslation", - "@llvm-project//mlir:TransformUtils", - "@llvm-project//mlir:Transforms", - "@llvm-project//mlir:VectorDialect", - "@local_tsl//tsl/platform:casts", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - "@local_tsl//tsl/platform:status", - "@local_tsl//tsl/platform:statusor", - "@local_tsl//tsl/platform:threadpool_async_executor", - "@local_tsl//tsl/profiler/lib:traceme", - "@local_tsl//tsl/profiler/lib:traceme_encode", - ] + if_llvm_aarch64_available([ - "@llvm-project//llvm:AArch64CodeGen", # fixdeps: keep - ]) + if_llvm_powerpc_available([ - "@llvm-project//llvm:PowerPCCodeGen", # fixdeps: keep - ]) + if_llvm_system_z_available([ - "@llvm-project//llvm:SystemZCodeGen", # fixdeps: keep - ]) + if_llvm_x86_available([ - "@llvm-project//llvm:X86CodeGen", # fixdeps: keep - ]), -) - -cc_library( - name = "cpu_aot_compilation_result", - srcs = ["cpu_aot_compilation_result.cc"], - hdrs = ["cpu_aot_compilation_result.h"], - deps = [ - ":buffer_info_util", - ":cpu_executable", - ":executable_proto_cc", - "//xla:cpu_function_runtime", - "//xla:util", - "//xla/backends/cpu:constant_allocation", - "//xla/backends/cpu/runtime:function_library", - "//xla/backends/cpu/runtime:thunk", - "//xla/backends/cpu/runtime:thunk_proto_cc", - "//xla/backends/cpu/runtime:thunk_proto_serdes", - "//xla/hlo/ir:hlo", - "//xla/service:buffer_assignment", - "//xla/service:buffer_value", - 
"//xla/service:compiler", - "//xla/service:executable", - "//xla/service:hlo_cost_analysis", - "//xla/service:hlo_module_config", - "//xla/service:hlo_profile_printer_data_cc", - "//xla/service:hlo_proto_cc", - "//xla/stream_executor:platform", - "//xla/stream_executor/host:host_platform_id", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - ], -) - -cc_library( - # The old target name will still be used so that dependencies won't break. - # In the future, dependencies should be cleaned up and relinked to the above - # target if registration is not necesary. - name = "cpu_compiler", - srcs = ["cpu_compiler_registerer.cc"], - hdrs = ["cpu_compiler.h"], - deps = [ - "cpu_compiler_pure", - ":cpu_aot_compilation_result", - ":executable_proto_cc", - "//xla:util", - "//xla/backends/cpu/codegen:ir_compiler", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/hlo/ir:hlo_module_group", - "//xla/service:buffer_assignment", - "//xla/service:compiler", - "//xla/service:executable", - "//xla/service:hlo_cost_analysis", - "//xla/service:hlo_profile_printer_data_cc", - "//xla/service:hlo_proto_cc", - "//xla/service:llvm_compiler", - "//xla/stream_executor:platform", - "//xla/stream_executor:stream_executor_h", - "//xla/stream_executor/host:host_platform_id", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:Target", - "@llvm-project//llvm:TargetParser", - ], - alwayslink = True, # Contains compiler registration -) - -xla_test( - name = "cpu_compiler_test", - srcs = ["cpu_compiler_test.cc"], - backends = [ - "cpu", - ], - tags = [ - "test_migrated_to_hlo_runner_pjrt", - "test_xla_cpu_no_thunks", - ], - deps = [ - "//xla/hlo/testlib:verified_hlo_module", - 
"//xla/tests:hlo_pjrt_test_base", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/lib/monitoring:collected_metrics", - "//xla/tsl/lib/monitoring:collection_registry", - "@com_google_absl//absl/strings:string_view", - "@local_tsl//tsl/platform:statusor", - "@local_tsl//tsl/platform:test", - ], -) - -xla_test( - name = "cpu_compiler_internals_test", - srcs = ["cpu_compiler_internals_test.cc"], - backends = [ - "cpu", - ], - deps = [ - "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:verified_hlo_module", - "//xla/service:llvm_compiler", - "//xla/tests:hlo_test_base", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/platform:statusor", - "//xla/tsl/platform:test", - "@com_google_absl//absl/base:nullability", - "@com_google_absl//absl/strings:string_view", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - ], -) - -xla_test( - name = "cpu_aot_compiler_test", - srcs = ["cpu_aot_compiler_test.cc"], - backends = [ - "cpu", - ], - deps = [ - ":cpu_aot_compilation_result", - ":test_header_helper", - "//xla:literal", - "//xla:literal_util", - "//xla/hlo/ir:hlo", - "//xla/hlo/ir:hlo_module_group", - "//xla/service:compiler", - "//xla/service:executable", - "//xla/service:hlo_runner", - "//xla/service:hlo_runner_interface", - "//xla/stream_executor:platform", - "//xla/stream_executor:platform_manager", - "//xla/tests:hlo_test_base", - "//xla/tests:literal_test_util", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/platform:statusor", - "//xla/tsl/platform:test", - "@com_google_absl//absl/strings:string_view", - ], -) - -tf_proto_library( - name = "executable_proto", - srcs = ["executable.proto"], - protodeps = [ - ":xla_framework_proto", - "//xla/service:hlo_proto", - "//xla:xla_proto", - "//xla/backends/cpu/runtime:thunk_proto", - ], -) - -tf_proto_library( - name = "xla_framework_proto", - srcs = ["xla_framework.proto"], -) - -cc_library( - name = "xla_framework", - hdrs = 
["xla_framework.h"], - deps = [":xla_framework_proto_cc"], -) - -cc_library( - name = "runtime_symbol_generator", - srcs = [ - "runtime_symbol_generator.cc", - "windows_compatibility.cc", - "windows_compatibility.h", - ], - hdrs = ["runtime_symbol_generator.h"], - copts = if_enable_acl(["-DXLA_CPU_USE_ACL=1"]) + tsl_copts(), - deps = [ - ":cpu_runtime", - ":onednn_convolution", - ":onednn_layer_norm", - ":onednn_matmul", - ":onednn_softmax", - ":runtime_conv2d", - ":runtime_conv2d_acl", - ":runtime_conv2d_mkl", - ":runtime_conv3d", - ":runtime_custom_call_status", - ":runtime_fft", - ":runtime_fork_join", - ":runtime_fp16", - ":runtime_handle_ffi_call", - ":runtime_key_value_sort", - ":runtime_matmul", - ":runtime_matmul_acl", - ":runtime_pow", - ":runtime_single_threaded_conv2d", - ":runtime_single_threaded_conv3d", - ":runtime_single_threaded_fft", - ":runtime_single_threaded_matmul", - ":runtime_topk", - ":xnnpack_ops", - "//xla/service:custom_call_target_registry", - "@com_google_absl//absl/functional:any_invocable", - "@com_google_absl//absl/strings:string_view", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:OrcJIT", - "@llvm-project//llvm:OrcShared", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:mlir_c_runner_utils", - "@local_tsl//tsl/platform:logging", - ], -) - -cc_library( - name = "runtime_lightweight_check", - hdrs = ["runtime_lightweight_check.h"], - compatible_with = get_compatible_with_portable(), - copts = runtime_copts(), -) - -cc_library( - name = "runtime_fp16", - srcs = [ - "runtime_fp16.cc", - ], - hdrs = [ - "runtime_fp16.h", - ], - copts = runtime_copts(), - deps = ["@com_google_absl//absl/base:core_headers"], -) - -cc_library( - name = "runtime_pow", - srcs = [ - "runtime_pow.cc", - ], - hdrs = [ - "runtime_pow.h", - ], - copts = runtime_copts(), - deps = ["@com_google_absl//absl/base:core_headers"], -) - -cc_library( - name = "buffer_desc", - hdrs = ["buffer_desc.h"], -) - -cc_library( - name = "cpu_executable", - srcs = 
["cpu_executable.cc"], - hdrs = ["cpu_executable.h"], - deps = [ - ":cpu_runtime", - ":executable_proto_cc", - "//xla:executable_run_options", - "//xla:literal", - "//xla:shape_tree", - "//xla:shape_util", - "//xla:status_macros", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu:constant_allocation", - "//xla/backends/cpu/runtime:buffer_allocations", - "//xla/backends/cpu/runtime:function_library", - "//xla/backends/cpu/runtime:thread_pool_task_runner", - "//xla/backends/cpu/runtime:thunk", - "//xla/backends/cpu/runtime:thunk_executor", - "//xla/hlo/ir:hlo", - "//xla/service:buffer_assignment", - "//xla/service:custom_call_status", - "//xla/service:custom_call_status_internal", - "//xla/service:executable", - "//xla/service:hlo_execution_profile", - "//xla/service:hlo_profile_printer_data_cc", - "//xla/service:hlo_value", - "//xla/service:maybe_owning_device_memory", - "//xla/service:shaped_buffer", - "//xla/service:xla_debug_info_manager", - "//xla/stream_executor:device_memory", - "//xla/stream_executor:device_memory_allocator", - "//xla/stream_executor/host:host_stream", - "//xla/tsl/concurrency:async_value", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:statusor", - ], -) - -cc_library( - name = "elemental_math_emitter", - srcs = ["elemental_math_emitter.cc"], - hdrs = ["elemental_math_emitter.h"], - deps = [ - "//xla:xla_data_proto_cc", - "//xla/service/llvm_ir:math_ops", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - ], 
-) - -cc_library( - name = "ir_emitter2", - srcs = ["ir_emitter2.cc"], - hdrs = ["ir_emitter2.h"], - deps = [ - ":backend_config_proto_cc", - ":dot_op_emitter", - ":elemental_ir_emitter", - ":ir_emitter", - ":parallel_loop_emitter", - "//xla:shape_util", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla:xla_proto_cc", - "//xla/backends/cpu/codegen:fusion_compiler", - "//xla/backends/cpu/codegen:kernel_api_ir_builder", - "//xla/backends/cpu/codegen:symbol_name_util", - "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", - "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitters", - "//xla/hlo/ir:hlo", - "//xla/service:buffer_assignment", - "//xla/service:hlo_module_config", - "//xla/service/llvm_ir:dynamic_update_slice_util", - "//xla/service/llvm_ir:fused_ir_emitter", - "//xla/service/llvm_ir:ir_array", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/llvm_ir:loop_emitter", - "//xla/stream_executor:launch_dim", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/strings:string_view", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Linker", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - ], -) - -xla_cc_test( - name = "ir_emitter_test", - srcs = ["ir_emitter_test.cc"], - deps = [ - ":cpu_compiler", - ":cpu_executable", - ":cpu_options", - ":ir_emitter", - ":ir_function", - ":runtime_symbol_generator", - ":target_machine_features_stub", - "//xla:cpu_function_runtime", - "//xla:shape_util", - "//xla/backends/cpu/codegen:cpu_features", - "//xla/backends/cpu/codegen:execution_engine", - "//xla/backends/cpu/codegen:ir_compiler", 
- "//xla/backends/cpu/codegen:jit_compiler", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/analysis:hlo_ordering", - "//xla/hlo/ir:hlo", - "//xla/hlo/parser:hlo_parser", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/transforms/simplifiers:hlo_memory_scheduler", - "//xla/service:buffer_assignment", - "//xla/service:buffer_value", - "//xla/service:hlo_module_config", - "//xla/service:logical_buffer", - "//xla/service/llvm_ir:llvm_util", - "//xla/tsl/lib/core:status_test_util", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:Target", - "@llvm-project//mlir:IR", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:statusor", - "@local_tsl//tsl/platform:test", - ], -) - -cc_library( - name = "ir_emitter", - srcs = ["ir_emitter.cc"], - hdrs = ["ir_emitter.h"], - copts = tsl_copts(), - deps = [ - ":backend_config_proto_cc", - ":cpu_instruction_fusion", - ":cpu_options", - ":cpu_runtime", - ":dot_op_emitter", - ":elemental_ir_emitter", - ":ir_emission_utils", - ":ir_function", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":parallel_loop_emitter", - ":xnnpack_ops_rewriter", - ":xnnpack_ops", - "//xla:literal", - "//xla:literal_util", - "//xla:shape_util", - "//xla:status_macros", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/service:buffer_assignment", - "//xla/service:collective_ops_utils", - "//xla/service:elemental_ir_emitter", - "//xla/service:hlo_module_config", - "//xla/service:name_uniquer", - "//xla/service/llvm_ir:alias_analysis", - "//xla/service/llvm_ir:buffer_assignment_util", - 
"//xla/service/llvm_ir:dynamic_update_slice_util", - "//xla/service/llvm_ir:fused_ir_emitter", - "//xla/service/llvm_ir:ir_array", - "//xla/service/llvm_ir:ir_builder_mixin", - "//xla/service/llvm_ir:llvm_loop", - "//xla/service/llvm_ir:llvm_type_conversion_util", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/llvm_ir:loop_emitter", - "//xla/service/llvm_ir:tuple_ops", - "//xla/tsl/lib/math:math_util", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:status", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/cleanup", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/meta:type_traits", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:TargetParser", - "@llvm-project//mlir:IR", - ], -) - -cc_library( - name = "target_machine_features_stub", - testonly = 1, - hdrs = ["target_machine_features_stub.h"], - deps = [ - "//xla/backends/cpu/codegen:target_machine_features", - "@llvm-project//llvm:Core", - "@local_tsl//tsl/platform:logging", - ], -) - -cc_library( - name = "ir_function", - srcs = ["ir_function.cc"], - hdrs = ["ir_function.h"], - deps = [ - ":cpu_runtime", - ":ir_emission_utils", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla/service:hlo_module_config", - "//xla/service/llvm_ir:llvm_util", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - ], -) - -cc_library( - 
name = "parallel_loop_emitter", - srcs = ["parallel_loop_emitter.cc"], - hdrs = ["parallel_loop_emitter.h"], - deps = [ - ":ir_emission_utils", - "//xla/service/llvm_ir:ir_array", - "//xla/service/llvm_ir:llvm_loop", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/llvm_ir:loop_emitter", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/strings:string_view", - "@llvm-project//llvm:Core", - ], -) - -cc_library( - name = "thunk_emitter", - srcs = ["thunk_emitter.cc"], - hdrs = ["thunk_emitter.h"], - local_defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]), - deps = [ - ":backend_config_proto_cc", - ":dot_op_emitter", - ":ir_emission_utils", - ":ir_emitter2", - "//xla:comparison_util", - "//xla:cpu_function_runtime", - "//xla:shape_util", - "//xla:status_macros", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu:onednn_emitter", - "//xla/backends/cpu:onednn_fusion", - "//xla/backends/cpu:xnn_emitter", - "//xla/backends/cpu:xnn_fusion", - "//xla/backends/cpu/codegen:computation_kernel_emitter", - "//xla/backends/cpu/codegen:fusion_compiler", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/backends/cpu/codegen/dot:dot_kernel_emitter", - "//xla/backends/cpu/codegen/elemental:concatenate_kernel_emitter", - "//xla/backends/cpu/codegen/elemental:elemental_kernel_emitter", - "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitters", - "//xla/backends/cpu/runtime:all_gather_thunk", - "//xla/backends/cpu/runtime:all_reduce_thunk", - "//xla/backends/cpu/runtime:all_to_all_thunk", - "//xla/backends/cpu/runtime:call_thunk", - "//xla/backends/cpu/runtime:collective_permute_thunk", - "//xla/backends/cpu/runtime:collective_thunk", - "//xla/backends/cpu/runtime:conditional_thunk", - "//xla/backends/cpu/runtime:convolution_thunk", - "//xla/backends/cpu/runtime:copy_thunk", - "//xla/backends/cpu/runtime:custom_call_thunk", - "//xla/backends/cpu/runtime:dot_thunk", - 
"//xla/backends/cpu/runtime:fft_thunk", - "//xla/backends/cpu/runtime:infeed_thunk", - "//xla/backends/cpu/runtime:kernel_thunk", - "//xla/backends/cpu/runtime:logical_id_thunk", - "//xla/backends/cpu/runtime:outfeed_thunk", - "//xla/backends/cpu/runtime:reduce_scatter_thunk", - "//xla/backends/cpu/runtime:rng_state_thunk", - "//xla/backends/cpu/runtime:sort_thunk", - "//xla/backends/cpu/runtime:thunk", - "//xla/backends/cpu/runtime:topk_thunk", - "//xla/backends/cpu/runtime:while_thunk", - "//xla/backends/cpu/runtime/onednn:onednn_fusion_thunk", - "//xla/backends/cpu/runtime/xnnpack:xnn_dot_thunk", - "//xla/backends/cpu/runtime/xnnpack:xnn_fusion_thunk", - "//xla/codegen:kernel_definition", - "//xla/codegen:kernel_spec", - "//xla/codegen:llvm_ir_kernel_source", - "//xla/codegen:mlir_kernel_source", - "//xla/hlo/ir:hlo", - "//xla/runtime:resource_use", - "//xla/service:buffer_assignment", - "//xla/service:collective_ops_utils", - "//xla/service:hlo_module_config", - "//xla/service:hlo_proto_cc", - "//xla/service:pattern_matcher", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:logging", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:JITLink", - "@llvm-project//llvm:ir_headers", - "@local_tsl//tsl/platform:casts", - "@local_tsl//tsl/profiler/lib:traceme", - ], -) - -cc_library( - name = "tiled_dot_emitter", - srcs = ["tiled_dot_emitter.cc"], - hdrs = ["tiled_dot_emitter.h"], - deps = [ - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:vector_ir_builder", - "//xla/service:hlo_module_config", - "//xla/service/llvm_ir:kernel_support_library", - "@com_google_absl//absl/log:check", - 
"@com_google_absl//absl/numeric:bits", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - ], -) - -cc_library( - name = "dot_op_emitter", - srcs = ["dot_op_emitter.cc"], - hdrs = [ - "dot_op_emitter.h", - ], - deps = [ - ":backend_config_proto_cc", - ":cpu_options", - ":cpu_runtime", - ":tiled_dot_emitter", - "//xla:shape_util", - "//xla:status_macros", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/service:hlo_module_config", - "//xla/service/llvm_ir:ir_array", - "//xla/service/llvm_ir:kernel_support_library", - "//xla/service/llvm_ir:llvm_loop", - "//xla/service/llvm_ir:llvm_util", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/status", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - ], -) - -build_test( - name = "sample_harness_build_test", - targets = [ - ":sample_harness", - ], -) - -xla_cc_binary( - name = "sample_harness", - srcs = ["sample_harness.cc"], - deps = [ - "//xla:array4d", - "//xla:literal", - "//xla:types", - "//xla:xla_data_proto_cc", - "//xla/client", - "//xla/client:client_library", - "//xla/client:local_client", - "//xla/hlo/builder:xla_builder", - "//xla/hlo/builder:xla_computation", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:str_format", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - ], -) - -cc_library( - name = "cpu_runtime", - srcs = [ - "cpu_runtime.cc", - "xfeed_manager.cc", - ], - hdrs = [ - "cpu_runtime.h", - "xfeed_manager.h", - ], - copts = runtime_copts(), - deps = [ - ":cpu_executable_run_options", - "//xla:executable_run_options", - "//xla:shape_util", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/collectives:cpu_clique_key", - 
"//xla/backends/cpu/collectives:cpu_cliques", - "//xla/backends/cpu/collectives:cpu_collectives", - "//xla/backends/cpu/collectives:in_process_collectives", - "//xla/core/collectives:communicator", - "//xla/core/collectives:rank_id", - "//xla/hlo/parser:hlo_parser", - "//xla/service:collective_ops_utils", - "//xla/service:computation_placer", - "//xla/service:global_device_id", - "//xla/stream_executor:device_memory", - "//xla/stream_executor:stream_executor_h", - "//xla/tsl/concurrency:async_value", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:logging", - "//xla/tsl/platform:status", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/time", - "@com_google_absl//absl/types:span", - "@local_tsl//tsl/profiler/lib:traceme", - ], -) - -cc_library( - name = "runtime_conv2d", - srcs = ["runtime_conv2d.cc"], - hdrs = ["runtime_conv2d.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla/backends/cpu/runtime:convolution_thunk_internal", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", # build_cleaner: keep - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_conv3d", - srcs = ["runtime_conv3d.cc"], - hdrs = ["runtime_conv3d.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla/backends/cpu/runtime:convolution_thunk_internal", - "@com_google_absl//absl/base:core_headers", - 
"@com_google_absl//absl/synchronization", # build_cleaner: keep - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_custom_call_status", - srcs = ["runtime_custom_call_status.cc"], - hdrs = ["runtime_custom_call_status.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla/service:custom_call_status_internal", - "@com_google_absl//absl/base:core_headers", - ], -) - -cc_library( - name = "runtime_conv2d_mkl", - srcs = [ - "runtime_conv2d_mkl.cc", - ], - hdrs = ["runtime_conv2d_mkl.h"], - copts = runtime_copts() + tf_openmp_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_conv2d", - ":runtime_single_threaded_conv2d", - "//xla:executable_run_options", - "//xla/tsl/framework/convolution:eigen_helpers", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@eigen_archive//:eigen3", - ] + mkl_deps(), -) - -cc_library( - name = "runtime_fft", - srcs = [ - "runtime_fft.cc", - ], - hdrs = ["runtime_fft.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla:executable_run_options", - "@com_google_absl//absl/base:core_headers", - "@ducc//:fft_wrapper", - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_matmul", - srcs = [ - "runtime_matmul_c128.cc", - "runtime_matmul_c64.cc", - "runtime_matmul_common.h", - "runtime_matmul_f16.cc", - "runtime_matmul_f32.cc", - "runtime_matmul_f64.cc", - "runtime_matmul_s32.cc", - ], - hdrs = ["runtime_matmul.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla/tsl/framework/contraction:eigen_contraction_kernel", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/synchronization", # build_cleaner: keep - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_matmul_acl", - srcs 
= ["runtime_matmul_acl.cc"], - hdrs = ["runtime_matmul_acl.h"], - copts = tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_lightweight_check", - ":runtime_matmul", - "//xla:executable_run_options", - "//xla/tsl/platform:dynamic_annotations", - "@com_google_absl//absl/base", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:types", - ] + acl_deps(), -) - -cc_library( - name = "runtime_conv2d_acl", - srcs = [ - "runtime_conv2d_acl.cc", - ], - hdrs = ["runtime_conv2d_acl.h"], - copts = tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_conv2d", - ":runtime_lightweight_check", - ":runtime_single_threaded_conv2d", - "//xla:executable_run_options", - "//xla/tsl/framework/convolution:eigen_helpers", - "//xla/tsl/platform:dynamic_annotations", - "@com_google_absl//absl/base", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:types", - ] + acl_deps(), -) - -cc_library( - name = "runtime_single_threaded_conv2d", - srcs = ["runtime_single_threaded_conv2d.cc"], - hdrs = ["runtime_single_threaded_conv2d.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla/backends/cpu/runtime:convolution_thunk_internal", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", # build_cleaner: keep - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_single_threaded_conv3d", - srcs = ["runtime_single_threaded_conv3d.cc"], - hdrs = ["runtime_single_threaded_conv3d.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla/backends/cpu/runtime:convolution_thunk_internal", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", # build_cleaner: keep - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_single_threaded_fft", - srcs = [ - "runtime_single_threaded_fft.cc", - ], - hdrs = 
["runtime_single_threaded_fft.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_fft", - "@com_google_absl//absl/base:core_headers", - ], -) - -cc_library( - name = "runtime_single_threaded_matmul_impl", - srcs = [ - "runtime_single_threaded_matmul_c128.cc", - "runtime_single_threaded_matmul_c64.cc", - "runtime_single_threaded_matmul_common.h", - "runtime_single_threaded_matmul_f16.cc", - "runtime_single_threaded_matmul_f32.cc", - "runtime_single_threaded_matmul_f64.cc", - "runtime_single_threaded_matmul_f8.cc", - "runtime_single_threaded_matmul_s32.cc", - "runtime_single_threaded_matmul_u8.cc", - ], - hdrs = ["runtime_single_threaded_matmul.h"], - compatible_with = get_compatible_with_portable(), - copts = runtime_copts(), - linkstatic = 1, - visibility = ["//visibility:private"], - deps = [ - "//xla/tsl/framework/contraction:eigen_contraction_kernel_no_mkl", - "@com_google_absl//absl/base:core_headers", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:ml_dtypes", - ], -) - -cc_library( - name = "runtime_single_threaded_matmul", - hdrs = ["runtime_single_threaded_matmul.h"], - compatible_with = get_compatible_with_portable(), - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_single_threaded_matmul_impl", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:ml_dtypes", - ], -) - -cc_library( - name = "runtime_single_threaded_matmul_nomkl", - compatible_with = get_compatible_with_portable(), - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_single_threaded_matmul_impl", - "//xla/tsl/framework/contraction:eigen_contraction_kernel_no_mkl", - "@com_google_absl//absl/base:core_headers", - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_key_value_sort", - srcs = ["runtime_key_value_sort.cc"], - hdrs = ["runtime_key_value_sort.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - 
"@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_topk", - srcs = ["runtime_topk.cc"], - hdrs = ["runtime_topk.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "@com_google_absl//absl/base", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - ], -) - -cc_library( - name = "runtime_fork_join", - srcs = ["runtime_fork_join.cc"], - hdrs = ["runtime_fork_join.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla:executable_run_options", - "//xla/service:custom_call_status_internal", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:logging", - ], -) - -cc_library( - name = "runtime_handle_ffi_call", - srcs = ["runtime_handle_ffi_call.cc"], - hdrs = ["runtime_handle_ffi_call.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla:executable_run_options", - "//xla:shape_util", - "//xla:xla_data_proto_cc", - "//xla/ffi:attribute_map", - "//xla/ffi:call_frame", - "//xla/ffi:execution_state", - "//xla/ffi:ffi_api", - "//xla/ffi/api:c_api", - "//xla/service:custom_call_status_public_headers", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@llvm-project//mlir:AsmParser", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Support", - ], -) - -xla_cc_test( - name = 
"cpu_runtime_test", - srcs = ["cpu_runtime_test.cc"], - shard_count = 10, - tags = ["optonly"], - deps = [ - ":cpu_runtime", - ":runtime_custom_call_status", - ":runtime_matmul", - ":runtime_matmul_acl", - ":runtime_single_threaded_matmul", - "//xla:array2d", - "//xla:executable_run_options", - "//xla:types", - "//xla/client:local_client", - "//xla/service:custom_call_status_internal", - "//xla/tests:xla_internal_test_main", - "@com_google_absl//absl/strings:str_format", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:test", - ], -) - -xla_cc_test( - name = "cpu_instruction_fusion_test", - srcs = ["cpu_instruction_fusion_test.cc"], - tags = ["not_run:arm"], - deps = [ - ":cpu_instruction_fusion", - "//xla:literal_util", - "//xla:shape_util", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/utils:hlo_matchers", - "//xla/service:transpose_folding", - "//xla/tests:test_utils", - "//xla/tests:xla_internal_test_main", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@com_google_googletest//:gtest", - "@local_tsl//tsl/platform:statusor", - ], -) - -xla_cc_test( - name = "xfeed_manager_test", - size = "small", - srcs = ["xfeed_manager_test.cc"], - deps = [ - ":cpu_runtime", - "//xla:shape_util", - "//xla:xla_data_proto_cc", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/lib/core:status_test_util", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status:statusor", - "@com_google_googletest//:gtest", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:test", - ], -) - -cc_library( - name = "cpu_instruction_fusion", - srcs = ["cpu_instruction_fusion.cc"], - hdrs = ["cpu_instruction_fusion.h"], - deps = [ - "//xla:shape_util", - "//xla/hlo/ir:hlo", - "//xla/service:fusion_node_indexing_evaluation", - 
"//xla/service:instruction_fusion", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/log", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - ], -) - -cc_library( - name = "fusion_wrapper", - srcs = ["fusion_wrapper.cc"], - hdrs = ["fusion_wrapper.h"], - deps = [ - "//xla/codegen/emitters:fusion_wrapper_base", - "//xla/hlo/ir:hlo", - "@com_google_absl//absl/strings:string_view", - ], -) - -xla_cc_test( - name = "fusion_wrapper_test", - srcs = ["fusion_wrapper_test.cc"], - deps = [ - ":fusion_wrapper", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest", - ], -) - -cc_library( - name = "ir_emission_utils", - srcs = ["ir_emission_utils.cc"], - hdrs = ["ir_emission_utils.h"], - deps = [ - ":cpu_runtime", - "//xla:shape_util", - "//xla:window_util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "@com_google_absl//absl/log:check", - "@llvm-project//llvm:Core", - ], -) - -xla_cc_test( - name = "ir_emission_utils_test", - srcs = ["ir_emission_utils_test.cc"], - deps = [ - ":ir_emission_utils", - ":target_machine_features_stub", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/tests:xla_internal_test_main", - ], -) - -cc_library( - name = "cpu_layout_assignment", - srcs = ["cpu_layout_assignment.cc"], - hdrs = ["cpu_layout_assignment.h"], - deps = [ - ":dot_op_emitter", - ":ir_emission_utils", - "//xla:shape_util", - "//xla:util", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/service:computation_layout", - "//xla/service:layout_assignment", - 
"@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:status", - ], -) - -xla_cc_test( - name = "cpu_layout_assignment_test", - size = "small", - srcs = ["cpu_layout_assignment_test.cc"], - deps = [ - ":cpu_layout_assignment", - ":target_machine_features_stub", - "//xla:literal", - "//xla:shape_layout", - "//xla:shape_util", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/hlo/testlib:test_helpers", - "//xla/hlo/utils:hlo_matchers", - "//xla/service:computation_layout", - "//xla/tests:test_utils", - "//xla/tests:xla_internal_test_main", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/types:span", - "@local_tsl//tsl/platform:status", - ], -) - -cc_library( - name = "conv_canonicalization", - srcs = ["conv_canonicalization.cc"], - hdrs = ["conv_canonicalization.h"], - deps = [ - ":cpu_runtime", - ":ir_emission_utils", - "//xla:permutation_util", - "//xla:shape_util", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/hlo/pass:hlo_pass", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@local_tsl//tsl/platform:errors", - ], -) - -xla_cc_test( - name = "conv_canonicalization_test", - srcs = ["conv_canonicalization_test.cc"], - deps = [ - ":conv_canonicalization", - ":target_machine_features_stub", - "//xla:literal_util", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/hlo/testlib:test_helpers", - "//xla/tests:xla_internal_test_main", 
- ], -) - -cc_library( - name = "parallel_task_assignment", - srcs = ["parallel_task_assignment.cc"], - hdrs = ["parallel_task_assignment.h"], - deps = [ - ":backend_config_proto_cc", - ":ir_emission_utils", - "//xla:shape_util", - "//xla:util", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/hlo/pass:hlo_pass", - "//xla/service:hlo_cost_analysis", - "//xla/service/llvm_ir:dynamic_update_slice_util", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - "@local_tsl//tsl/platform:status", - ], -) - -xla_cc_test( - name = "parallel_task_assignment_test", - srcs = ["parallel_task_assignment_test.cc"], - deps = [ - ":backend_config_proto_cc", - ":cpu_executable", - ":parallel_task_assignment", - ":target_machine_features_stub", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/service:hlo_cost_analysis", - "//xla/tests:xla_internal_test_main", - "@com_google_absl//absl/status:statusor", - "@local_tsl//tsl/platform:statusor", - ], -) - -cc_library( - name = "cpu_options", - srcs = ["cpu_options.cc"], - hdrs = ["cpu_options.h"], - deps = [ - "//xla/service:hlo_module_config", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - ], -) - -cc_library( - name = "orc_jit_memory_mapper", - srcs = ["orc_jit_memory_mapper.cc"], - hdrs = ["orc_jit_memory_mapper.h"], - deps = [ - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", - "@llvm-project//llvm:ExecutionEngine", - 
"@local_tsl//tsl/platform:logging", - ], -) - -xla_cc_test( - name = "cpu_eigen_tensor_alignment_test", - size = "small", - srcs = ["cpu_eigen_tensor_alignment_test.cc"], - deps = [ - ":ir_emission_utils", - ":target_machine_features_stub", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/tests:xla_internal_test_main", - ], -) - -xla_cc_test( - name = "vectorized_reduce_with_no_vector_registers_test", - size = "small", - srcs = ["vectorized_reduce_with_no_vector_registers_test.cc"], - tags = ["not_run:arm"], - target_compatible_with = ["@platforms//cpu:x86_64"], - deps = [ - ":cpu_compiler", - ":cpu_transfer_manager", - ":test_header_helper", - "//xla:util", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/hlo/ir:hlo_module_group", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/service:compiler", - "//xla/tests:xla_internal_test_main", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:MC", - "@llvm-project//llvm:Target", - "@local_tsl//tsl/platform:statusor", - ], -) - -xla_cc_test( - name = "scoped_ir_builder_test", - srcs = ["scoped_ir_builder_test.cc"], - deps = [ - ":cpu_executable", - ":ir_emitter", - ":target_machine_features_stub", - "//xla/hlo/analysis:hlo_ordering", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/service:buffer_assignment", - "//xla/service:buffer_value", - "//xla/service:logical_buffer", - "@com_google_googletest//:gtest_main", - "@llvm-project//llvm:Core", - "@local_tsl//tsl/platform:test", - ], -) - -tf_proto_library( - name = "onednn_config_proto", - srcs = ["onednn_config.proto"], -) - -tf_proto_library( - name = "backend_config_proto", - srcs = ["backend_config.proto"], - protodeps = [ - ":onednn_config_proto", - ], -) - 
-cc_library( - name = "onednn_util", - srcs = ["onednn_util.cc"], - hdrs = [ - "onednn_util.h", - "//xla/tsl/util:onednn_util_hdrs", - ], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/tsl/platform:env", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_memory_util", - srcs = ["onednn_memory_util.cc"], - hdrs = ["onednn_memory_util.h"], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_lightweight_check", - "//xla:literal", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/service/llvm_ir:ir_array", - "//xla/service/llvm_ir:ir_builder_mixin", - "//xla/service/llvm_ir:llvm_util", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:TargetParser", - "@llvm-project//mlir:IR", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_matmul", - srcs = ["onednn_matmul.cc"], - hdrs = ["onednn_matmul.h"], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":onednn_util", - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla:shape_util", - "//xla/hlo/ir:hlo", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", 
- "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_convolution", - srcs = ["onednn_convolution.cc"], - hdrs = ["onednn_convolution.h"], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":onednn_util", - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla:shape_util", - "//xla/hlo/ir:hlo", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_layer_norm", - srcs = ["onednn_layer_norm.cc"], - hdrs = [ - "onednn_layer_norm.h", - "//xla/tsl/util:onednn_util_hdrs", - ], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla/tsl/platform:env", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_softmax", - srcs = ["onednn_softmax.cc"], - hdrs = [ - "onednn_softmax.h", - "//xla/tsl/util:onednn_util_hdrs", - ], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla/tsl/platform:env", - 
"@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_pattern_utils", - hdrs = ["onednn_pattern_utils.h"], - visibility = ["//visibility:public"], - deps = [ - ":onednn_util", - "//xla/hlo/ir:hlo", - "//xla/service:pattern_matcher", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_contraction_rewriter", - srcs = ["onednn_contraction_rewriter.cc"], - hdrs = [ - "onednn_contraction_rewriter.h", - "onednn_convolution.h", - "onednn_matmul.h", - "//xla/tsl/util:onednn_util_hdrs", - ], - copts = tsl_copts(), - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_convolution", - ":onednn_matmul", - ":onednn_memory_util", - ":onednn_pattern_utils", - ":onednn_util", - "//xla:executable_run_options", - "//xla:shape_util", - "//xla:status_macros", - "//xla:xla_data_proto_cc", - "//xla/hlo/evaluator:hlo_evaluator", - "//xla/hlo/ir:hlo", - "//xla/hlo/pass:hlo_pass", - "//xla/service:hlo_cost_analysis", - "//xla/service:hlo_creation_utils", - "//xla/service:pattern_matcher", - "//xla/tsl/platform:env", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_ops_rewriter", - srcs = ["onednn_ops_rewriter.cc"], - hdrs = ["onednn_ops_rewriter.h"], - copts = tsl_copts(), - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":onednn_pattern_utils", - ":onednn_util", - "//xla:literal_comparison", - "//xla:literal_util", - "//xla:status_macros", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - 
"//xla/hlo/pass:hlo_pass", - "//xla/service:hlo_creation_utils", - "//xla/service:pattern_matcher", - "@com_google_absl//absl/algorithm:container", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_float_support", - srcs = ["onednn_float_support.cc"], - hdrs = ["onednn_float_support.h"], - copts = tsl_copts(), - deps = [ - ":onednn_contraction_rewriter", - "//xla/service:float_support", - ], -) - -cc_library( - name = "cpu_float_support", - hdrs = ["cpu_float_support.h"], - copts = tsl_copts(), - deps = [ - "//xla/backends/cpu:xnn_fusion", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/service:float_support", - ], -) - -xla_cc_test( - name = "cpu_float_support_test", - srcs = ["cpu_float_support_test.cc"], - deps = [ - ":cpu_float_support", - "//xla:shape_util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/backends/cpu/codegen:target_machine_test_base", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:verified_hlo_module", - "//xla/hlo/transforms/simplifiers:float_normalization", - "//xla/service:hlo_module_config", - "//xla/tsl/platform:statusor", - "//xla/tsl/platform:test", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest_main", - ], -) - -cc_library( - name = "cpu_symbol_repository", - hdrs = ["cpu_symbol_repository.h"], - deps = [ - "//xla:xla_proto_cc", - "//xla/service:symbol_repository", - ], -) - -cc_library( - name = "cpu_executable_run_options", - hdrs = ["cpu_executable_run_options.h"], - deps = ["//xla/backends/cpu/collectives:cpu_collectives"], -) - -cc_library( - name = "metrics", - srcs = ["metrics.cc"], - hdrs = ["metrics.h"], - deps = [ - "//xla/tsl/lib/monitoring:counter", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:string_view", - "@local_tsl//tsl/platform:stacktrace", - "@local_tsl//tsl/profiler/lib:traceme", - ], -) - 
-cc_library( - name = "elemental_ir_emitter", - srcs = ["elemental_ir_emitter.cc"], - hdrs = ["elemental_ir_emitter.h"], - deps = [ - ":elemental_math_emitter", - "//xla/hlo/ir:hlo", - "//xla/service:elemental_ir_emitter", - "@com_google_absl//absl/functional:any_invocable", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:ir_headers", - ], -) - -cc_library( - name = "small_while_loop_hoisting_pass", - srcs = ["small_while_loop_hoisting_pass.cc"], - hdrs = ["small_while_loop_hoisting_pass.h"], - deps = [ - ":cpu_executable", - "//xla:util", - "//xla/hlo/ir:hlo", - "//xla/hlo/pass:hlo_pass", - "//xla/service:collective_ops_utils", - "//xla/service:hlo_cost_analysis", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:string_view", - ], -) - -xla_cc_test( - name = "small_while_loop_hoisting_pass_test", - srcs = ["small_while_loop_hoisting_pass_test.cc"], - deps = [ - ":backend_config_proto_cc", - ":small_while_loop_hoisting_pass", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest_main", - ], -) - -xla_cc_test( - name = "metrics_test", - srcs = ["metrics_test.cc"], - deps = [ - ":metrics", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/lib/monitoring:collected_metrics", - "//xla/tsl/lib/monitoring:collection_registry", - "@local_tsl//tsl/platform:test", - ], -) - -cc_library( - name = "xnnpack_ops_rewriter", - srcs = 
["xnnpack_ops_rewriter.cc"], - hdrs = [ - "xnnpack_ops_rewriter.h", - "xnnpack_pattern_utils.h", - ], - visibility = ["//visibility:public"], - deps = [ - "//xla/hlo/ir:hlo", - "//xla:literal_comparison", - "//xla:literal_util", - "//xla:status_macros", - "//xla/hlo/pass:hlo_pass", - "//xla/service:pattern_matcher", - ], -) - -cc_library( - name = "xnnpack_ops", - srcs = ["xnnpack_ops.cc"], - hdrs = ["xnnpack_ops.h"], - visibility = ["//visibility:public"], - deps = [ - "@XNNPACK", - "@com_google_absl//absl/base", - ], -) diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc index c6d02568dfb9e4..9f2e6f5e6210d5 100644 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc @@ -599,13 +599,6 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( if (enable_xnnpack) pipeline.AddPass(); - bool use_kernel_selector = - xla::GetDebugOptionsFromFlags().xla_cpu_use_kernel_selector(); - if (use_kernel_selector) { - // This pass rewrites hlo.dot into custom calls. - pipeline.AddPass(); - } - // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); @@ -846,6 +839,13 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn( pipeline.AddPass(); + bool use_kernel_selector = + xla::GetDebugOptionsFromFlags().xla_cpu_use_kernel_selector(); + if (use_kernel_selector) { + // This pass rewrites hlo.dot into custom calls. + pipeline.AddPass(); + } + const int max_parallelism = module->config().intra_op_parallelism_threads() > 0 ? module->config().intra_op_parallelism_threads() @@ -878,7 +878,10 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn( } // Add a fusion pass now that layout assignment is done. 
- pipeline.AddPass(); + if (getenv("SET_CPU_INS_FUSION_NOT_DUPLICATE") != NULL) + pipeline.AddPass(/*may_duplicate=*/false); + else + pipeline.AddPass(/*may_duplicate=*/true); if (is_fusion_emitters) { pipeline.AddPass(); } diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig b/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig deleted file mode 100644 index 4a1402c6934cba..00000000000000 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig +++ /dev/null @@ -1,2720 +0,0 @@ -/* Copyright 2017 The OpenXLA Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "xla/service/cpu/cpu_compiler.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// IWYU pragma: no_include "llvm/Config/Disassemblers.def.inc" -// IWYU pragma: no_include "llvm/Config/Targets.def.inc" - -#include "absl/cleanup/cleanup.h" -#include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" -#include "absl/log/check.h" -#include "absl/log/log.h" -#include "absl/memory/memory.h" -#include "absl/status/status.h" -#include "absl/status/statusor.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_format.h" -#include "absl/strings/str_join.h" -#include "absl/strings/string_view.h" -#include "absl/types/span.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Bitcode/BitcodeReader.h" -#include "llvm/Bitcode/BitcodeWriter.h" -#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Mangler.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Linker/Linker.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Object/ObjectFile.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/MemoryBufferRef.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/TargetParser/Host.h" -#include "llvm/TargetParser/Triple.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/SplitModule.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include 
"mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/LLVM.h" -#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" -#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" -#include "mlir/Target/LLVMIR/Export.h" -#include "mlir/Transforms/DialectConversion.h" -#include "xla/backends/cpu/codegen/cpu_features.h" -#include "xla/backends/cpu/codegen/emitters/cpu_fusion_emitter_config.h" -#include "xla/backends/cpu/codegen/execution_engine.h" -#include "xla/backends/cpu/codegen/ir_compiler.h" -#include "xla/backends/cpu/codegen/jit_compiler.h" -#include "xla/backends/cpu/codegen/object_loader.h" -#include "xla/backends/cpu/codegen/target_machine_features.h" -#include "xla/backends/cpu/constant_allocation.h" -#include "xla/backends/cpu/runtime/function_library.h" -#include "xla/backends/cpu/runtime/thunk.h" -#include "xla/backends/cpu/runtime/thunk.pb.h" -#include "xla/backends/cpu/runtime/thunk_proto_serdes.h" -#include "xla/backends/cpu/transforms/xnn_graph_fusion.h" -#include "xla/backends/cpu/xnn_fusion.h" -#include "xla/cpu_function_runtime.h" -#include "xla/hlo/analysis/hlo_ordering.h" -#include "xla/hlo/analysis/indexed_array_analysis.h" -#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h" -#include "xla/hlo/ir/hlo_casting_utils.h" -#include "xla/hlo/ir/hlo_computation.h" -#include "xla/hlo/ir/hlo_instruction.h" -#include "xla/hlo/ir/hlo_instructions.h" -#include "xla/hlo/ir/hlo_module.h" -#include "xla/hlo/ir/hlo_module_group.h" -#include "xla/hlo/ir/hlo_opcode.h" -#include "xla/hlo/ir/hlo_schedule.h" -#include "xla/hlo/pass/hlo_pass_fix.h" -#include "xla/hlo/pass/hlo_pass_pipeline.h" -#include "xla/hlo/transforms/expanders/bitcast_dtypes_expander.h" -#include "xla/hlo/transforms/expanders/cholesky_expander.h" -#include "xla/hlo/transforms/expanders/comparison_expander.h" -#include "xla/hlo/transforms/expanders/dot_decomposer.h" -#include 
"xla/hlo/transforms/expanders/dynamic_index_splitter.h" -#include "xla/hlo/transforms/expanders/eigh_expander.h" -#include "xla/hlo/transforms/expanders/logistic_expander.h" -#include "xla/hlo/transforms/expanders/optimization_barrier_expander.h" -#include "xla/hlo/transforms/expanders/qr_expander.h" -#include "xla/hlo/transforms/expanders/reduce_decomposer.h" -#include "xla/hlo/transforms/expanders/reshape_decomposer.h" -#include "xla/hlo/transforms/expanders/rng_bit_generator_expander.h" -#include "xla/hlo/transforms/expanders/rng_expander.h" -#include "xla/hlo/transforms/expanders/stochastic_convert_decomposer.h" -#include "xla/hlo/transforms/literal_canonicalizer.h" -#include "xla/hlo/transforms/operand_upcaster.h" -#include "xla/hlo/transforms/simplifiers/algebraic_simplifier.h" -#include "xla/hlo/transforms/simplifiers/batch_dot_simplification.h" -#include "xla/hlo/transforms/simplifiers/broadcast_canonicalizer.h" -#include "xla/hlo/transforms/simplifiers/conditional_canonicalizer.h" -#include "xla/hlo/transforms/simplifiers/convolution_group_converter.h" -#include "xla/hlo/transforms/simplifiers/dynamic_dimension_simplifier.h" -#include "xla/hlo/transforms/simplifiers/flatten_call_graph.h" -#include "xla/hlo/transforms/simplifiers/float_normalization.h" -#include "xla/hlo/transforms/simplifiers/gather_simplifier.h" -#include "xla/hlo/transforms/simplifiers/hlo_constant_folding.h" -#include "xla/hlo/transforms/simplifiers/hlo_dce.h" -#include "xla/hlo/transforms/simplifiers/hlo_memory_scheduler.h" -#include "xla/hlo/transforms/simplifiers/optimize_input_output_buffer_alias.h" -#include "xla/hlo/transforms/simplifiers/reduce_window_rewriter.h" -#include "xla/hlo/transforms/simplifiers/reshape_mover.h" -#include "xla/hlo/transforms/simplifiers/result_caster.h" -#include "xla/hlo/transforms/simplifiers/sort_simplifier.h" -#include "xla/hlo/transforms/simplifiers/sub_byte_normalization.h" -#include "xla/hlo/transforms/simplifiers/tree_reduction_rewriter.h" 
-#include "xla/hlo/transforms/simplifiers/tuple_simplifier.h" -#include "xla/hlo/transforms/simplifiers/zero_sized_hlo_elimination.h" -#include "xla/hlo/transforms/while_loop_trip_count_annotator.h" -#include "xla/literal_pool.h" -#include "xla/map_util.h" -#include "xla/mlir_hlo/transforms/passes.h" -#include "xla/service/all_reduce_promotion.h" -#include "xla/service/outer_dimension_propagation.h" -#include "xla/service/get_outer_batch_value_simplifier.h" -#include "xla/service/all_to_all_decomposer.h" -#include "xla/service/batched_gather_scatter_normalizer.h" -#include "xla/service/batchnorm_expander.h" -#include "xla/service/buffer_assignment.h" -#include "xla/service/call_graph.h" -#include "xla/service/call_inliner.h" -#include "xla/service/change_op_data_type.h" -#include "xla/service/compiler.h" -#include "xla/service/conditional_simplifier.h" -#include "xla/service/conditional_to_select.h" -#include "xla/service/copy_insertion.h" -#include "xla/service/cpu/buffer_info_util.h" -#include "xla/service/cpu/conv_canonicalization.h" -#include "xla/service/cpu/cpu_aot_compilation_result.h" -#include "xla/service/cpu/cpu_executable.h" -#include "xla/service/cpu/cpu_float_support.h" -#include "xla/service/cpu/cpu_instruction_fusion.h" -#include "xla/service/cpu/cpu_layout_assignment.h" -#include "xla/service/cpu/cpu_options.h" -#include "xla/service/cpu/dot_op_emitter.h" -#include "xla/service/cpu/executable.pb.h" -#include "xla/service/cpu/fusion_wrapper.h" -#include "xla/service/cpu/ir_emitter.h" -#include "xla/service/cpu/ir_emitter2.h" -#include "xla/service/cpu/metrics.h" -#include "xla/service/cpu/parallel_task_assignment.h" -#include "xla/service/cpu/runtime_symbol_generator.h" -#include "xla/service/cpu/small_while_loop_hoisting_pass.h" -#include "xla/service/cpu/thunk_emitter.h" -#include "xla/service/cpu_gpu_shape_verifier.h" -#include "xla/service/dump.h" -#include "xla/service/dynamic_dimension_inference.h" -#include "xla/service/dynamic_padder.h" 
-#include "xla/service/executable.h" -#include "xla/service/float_support.h" -#include "xla/service/gather_expander.h" -#include "xla/service/hlo.pb.h" -#include "xla/service/hlo_cost_analysis.h" -#include "xla/service/hlo_cse.h" -#include "xla/service/hlo_execution_profile.h" -#include "xla/service/hlo_module_config.h" -#include "xla/service/hlo_profile_printer_data.pb.h" -#include "xla/service/hlo_verifier.h" -#include "xla/service/layout_assignment.h" -#include "xla/service/llvm_compiler.h" -#include "xla/service/llvm_ir/llvm_command_line_options.h" -#include "xla/service/llvm_ir/llvm_util.h" -#include "xla/service/logical_buffer.h" -#include "xla/service/map_inliner.h" -#include "xla/service/scatter_expander.h" -#include "xla/service/scatter_simplifier.h" -#include "xla/service/select_and_scatter_expander.h" -#include "xla/service/sharding_propagation.h" -#include "xla/service/sharding_remover.h" -#include "xla/service/slow_operation_alarm.h" -#include "xla/service/spmd/shardy/shardy_xla_pass.h" -#include "xla/service/spmd/stateful_rng_spmd_partitioner.h" -#include "xla/service/topk_rewriter.h" -#include "xla/service/transpose_folding.h" -#include "xla/service/triangular_solve_expander.h" -#include "xla/service/while_loop_constant_sinking.h" -#include "xla/service/while_loop_invariant_code_motion.h" -#include "xla/service/while_loop_simplifier.h" -#include "xla/shape.h" -#include "xla/shape_util.h" -#include "xla/status_macros.h" -#include "xla/stream_executor/host/host_platform_id.h" -#include "xla/stream_executor/platform.h" -#include "xla/stream_executor/stream_executor.h" -#include "xla/tsl/platform/env.h" -#include "xla/tsl/platform/status.h" -#include "xla/tsl/platform/statusor.h" -#include "xla/tsl/platform/threadpool.h" -#include "xla/util.h" -#include "xla/xla.pb.h" -#include "xla/xla_data.pb.h" -#include "tsl/platform/casts.h" -#include "tsl/platform/cpu_info.h" -#include "tsl/platform/logging.h" // IWYU pragma: keep -#include 
"tsl/profiler/lib/traceme.h" -#include "tsl/profiler/lib/traceme_encode.h" - -#include "xnnpack_ops_rewriter.h" - -#ifdef TF_LLVM_X86_AVAILABLE -#include "llvm/TargetParser/X86TargetParser.h" -#endif - -#if defined(INTEL_MKL) -#include "xla/hlo/transforms/simplifiers/simplify_fp_conversions.h" -#include "xla/service/cpu/onednn_contraction_rewriter.h" -#include "xla/service/cpu/onednn_float_support.h" -#include "xla/service/cpu/onednn_ops_rewriter.h" -#endif - -namespace xla { -namespace { - -using tsl::profiler::TraceMe; -using tsl::profiler::TraceMeEncode; - -// A module identifier (prefix) for emitted LLVM modules. -static constexpr absl::string_view kXlaModuleIdentifier = "__compute_module"; - -// Returns a global (per-process) thread pool for XLA CPU compilation tasks. -static tsl::thread::ThreadPool* GetCompilationThreadPool() { - // LLVM compilation has a lot of memory-bound pointer chasing and not - // so much CPU-bound work. Based on profiling a few examples, 32 threads seems - // to be enough to achieve maximum parallel compilation speedup. - static constexpr int kMaxCompilationThreads = 32; - - // On Mac OS the default stack size is 512KiB, this is too small for compiling - // reasonably sized programs - tsl::ThreadOptions thread_options; - thread_options.stack_size = 4 * 1024 * 1024; // 4 MB - - static auto* const thread_pool = new tsl::thread::ThreadPool( - tsl::Env::Default(), thread_options, "xla-cpu-llvm-codegen", - std::min(kMaxCompilationThreads, tsl::port::MaxParallelism())); - return thread_pool; -} - -// Returns task runner that uses the global compilation thread pool. -static cpu::JitCompiler::TaskRunner GetCompilationTaskRunner() { - return [](cpu::JitCompiler::Task task) { - GetCompilationThreadPool()->Schedule(std::move(task)); - }; -} - -// For each computation in the module, determines whether that computation -// calls a custom-call function, either directly or indirectly (e.g. because it -// calls another computation that does). 
-absl::flat_hash_map -ModuleComputationsTransitivelyContainCustomCall(const HloModule& module) { - absl::flat_hash_map custom_call_map; - std::unique_ptr call_graph = CallGraph::Build(&module); - - // Can never fail because we always return an OK status from the visitor. - TF_CHECK_OK(call_graph->VisitNodes([&custom_call_map]( - const CallGraphNode& node) { - const HloComputation* computation = node.computation(); - - for (const HloInstruction* instruction : computation->instructions()) { - // The computation contains a custom-call instruction directly. - if (DynCast(instruction)) { - custom_call_map[computation] = true; - return absl::OkStatus(); - } - // The computation calls something that contains a custom-call - // instruction (directly or indirectly). This lookup relies on the call - // graph traversing callees before callers, so that the map is always - // populated for all callees at this point. - for (const HloComputation* callee : instruction->called_computations()) { - bool callee_contains_custom_call = FindOrDie(custom_call_map, callee); - if (callee_contains_custom_call) { - custom_call_map[computation] = true; - return absl::OkStatus(); - } - } - } - - custom_call_map[computation] = false; - return absl::OkStatus(); - })); - - return custom_call_map; -} - -} // namespace - -namespace cpu { - -CpuCompiler::CpuCompiler() { - // Initialize LLVM the first time the CpuCompiler is initialized. 
- static bool llvm_initialized = []() { - InitializeLLVMTarget(); - return true; - }(); - (void)llvm_initialized; -} - -absl::StatusOr>> CpuCompiler::Compile( - std::unique_ptr module_group, - std::vector> stream_execs, - const CompileOptions& options) { - for (const std::vector& se_vector : stream_execs) { - if (se_vector.size() != 1) { - return Unimplemented( - "Model partitioning not implemented for the CPU compiler"); - } - } - return LLVMCompiler::Compile(std::move(module_group), stream_execs, options); -} - -/* static */ void CpuCompiler::InitializeLLVMTarget() { - // Initialize LLVM's MC layer for the native target. - llvm::InitializeNativeTarget(); - llvm::InitializeNativeTargetAsmPrinter(); -} - -namespace { - -// This visitor records which HLO instructions should have profiling information -// recorded. -class CollectProfileCandidates : public DfsHloVisitorWithDefault { - public: - static absl::StatusOr> - GetCandidatesForComputation( - const HloComputation& computation, - const absl::flat_hash_map& - assigned_indices) { - absl::flat_hash_map hlo_to_profile_idx; - CollectProfileCandidates profile_candidates_for_computation( - &hlo_to_profile_idx, assigned_indices); - TF_RETURN_IF_ERROR(computation.Accept(&profile_candidates_for_computation)); - return hlo_to_profile_idx; - } - - private: - CollectProfileCandidates( - absl::flat_hash_map* hlo_to_profile_idx, - const absl::flat_hash_map& - assigned_indices) - : hlo_to_profile_idx_(hlo_to_profile_idx), - assigned_indices_(assigned_indices) {} - - absl::Status DefaultAction(HloInstruction* hlo_instruction) override { - hlo_to_profile_idx_->insert( - {hlo_instruction, FindOrDie(assigned_indices_, hlo_instruction)}); - return absl::OkStatus(); - } - - absl::Status HandleCall(HloInstruction* call) override { - TF_RETURN_IF_ERROR(DefaultAction(call)); - CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_, - assigned_indices_); - TF_RETURN_IF_ERROR(call->to_apply()->Accept(&candidates_for_call)); - 
return absl::OkStatus(); - } - // Recurse into "conditional" so we can profile inside of it. - absl::Status HandleConditional(HloInstruction* conditional) override { - TF_RETURN_IF_ERROR(DefaultAction(conditional)); - - CollectProfileCandidates candidates_for_true(hlo_to_profile_idx_, - assigned_indices_); - TF_RETURN_IF_ERROR( - conditional->true_computation()->Accept(&candidates_for_true)); - - CollectProfileCandidates candidates_for_false(hlo_to_profile_idx_, - assigned_indices_); - TF_RETURN_IF_ERROR( - conditional->false_computation()->Accept(&candidates_for_false)); - - return absl::OkStatus(); - } - - // Skip constants, there is nothing to profile. - absl::Status HandleConstant(HloInstruction*) override { - return absl::OkStatus(); - } - // Skip parameters, they are a simple load. - absl::Status HandleParameter(HloInstruction*) override { - return absl::OkStatus(); - } - // It is important to recurse for "while" or else we risk overly coarse - // profiling information. - absl::Status HandleWhile(HloInstruction* xla_while) override { - TF_RETURN_IF_ERROR(DefaultAction(xla_while)); - - CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_, - assigned_indices_); - TF_RETURN_IF_ERROR( - xla_while->while_condition()->Accept(&candidates_for_condition)); - - CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_, - assigned_indices_); - TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&candidates_for_body)); - - return absl::OkStatus(); - } - - absl::flat_hash_map* hlo_to_profile_idx_; - const absl::flat_hash_map& assigned_indices_; -}; - -// Adds the HloVerifier for CPU to the given pipeline. 
-void AddHloVerifier(HloPassPipeline* pipeline, HloVerifierOpts&& opts = {}, - bool debug_only = false) { - auto verifier_metadata = - std::make_unique(std::move(opts)); - - if (debug_only) { - pipeline->AddInvariantCheckerDebug( - std::move(verifier_metadata), "hlo verifier (debug)"); - } else { - pipeline->AddInvariantChecker(std::move(verifier_metadata), - "hlo verifier"); - } -} - -std::unique_ptr> CreateSimplificationPipeline( - absl::string_view name, HloModule* module, bool is_fusion_emitters) { - // Run the following passes to a fixed point. - auto pipeline = - std::make_unique>(std::string(name)); - AddHloVerifier(pipeline.get(), HloVerifierOpts{}, - /*debug_only=*/true); - - AlgebraicSimplifierOptions options; - options.set_enable_dot_strength_reduction(false); - // "slow" minmax means we propagate nan. - options.set_minmax_propagate_nan( - !module->config().debug_options().xla_cpu_enable_fast_min_max()); - options.set_supports_non_canonical_dots(false); - options.set_executing_on_cpu(true); - pipeline->AddPass(options); - pipeline->AddPass(); - pipeline->AddPass(); - pipeline->AddPass(GatherExpander::kEliminateSimpleGathers); - if (is_fusion_emitters) { - // Conversion to MLIR only works with simplified gathers. - pipeline->AddPass(); - } - - // Needs to happen after algebraic simplifier. - // pipeline->AddPass(); - - // BatchNormExpander can create zero-sized ops, so zero-sized HLO - // elimination has to come after that pass. - pipeline->AddPass(); - - pipeline->AddPass(); - pipeline->AddPass(); - pipeline->AddPass(); - pipeline->AddPass(); - - // TODO(b/134075051): Re-enable after b/134075051 is fixed. - // pipeline->AddPass(); - - pipeline->AddPass(); - pipeline->AddPass(); - pipeline->AddPass( - options::FoldAllConstants(module->config()) - ? 
HloConstantFolding::Level::kAggressive - : HloConstantFolding::Level::kDefault); - pipeline->AddPass(); - - return pipeline; -} - -} // namespace - -absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( - HloModule* module, bool is_aot_compile, - TargetMachineFeatures* target_machine_features) { - const int64_t num_partitions = module->config().num_partitions(); - const bool is_thunk_runtime = - module->config().debug_options().xla_cpu_use_thunk_runtime(); - const bool is_fusion_emitters = - is_thunk_runtime && - module->config().debug_options().xla_cpu_use_fusion_emitters(); - bool use_shardy_partitioner = module->config().use_shardy_partitioner(); - if (num_partitions > 1) { - if (!module->config().use_spmd_partitioning()) { - return InvalidArgument( - "num_partitions=%d but SPMD partitioning not enabled.", - num_partitions); - } - HloPassPipeline spmd_pipeline("spmd-partitioner"); - // Run some IR cleanup passes before running the SPMD partitioning - // passes. - AddHloVerifier(&spmd_pipeline); - spmd_pipeline.AddPass(); - spmd_pipeline.AddPass(); - spmd_pipeline.AddPass(); - spmd_pipeline.AddPass(); - if (use_shardy_partitioner) { - spmd_pipeline.AddPass(); - } else { - spmd_pipeline.AddPass( - /*is_spmd=*/true, /*propagate_metadata=*/false, - module->config().allow_spmd_sharding_propagation_to_output(), - module->config().allow_spmd_sharding_propagation_to_parameters()); - } - spmd_pipeline.AddPass( - num_partitions, module->config().replica_count()); - TF_RETURN_IF_ERROR(spmd_pipeline.Run(module).status()); - } else { - HloPassPipeline sharding_removal_pipeline("sharding-removal"); - AddHloVerifier(&sharding_removal_pipeline); - // Remove redundant sharding ops when partition_count == 1. - sharding_removal_pipeline.AddPass(); - // Run ShardyXLA without propagation, which enforces use-tuple-args. 
- if (use_shardy_partitioner) { - sharding_removal_pipeline.AddPass( - /*runSdyShardingPropagation=*/false); - } - sharding_removal_pipeline.AddPass(); - TF_RETURN_IF_ERROR(sharding_removal_pipeline.Run(module).status()); - } - - { - // SubbytePacker must be run before the rest of the pipeline since it - // modifies the layout of the entry computation inputs/outputs, which is - // passed to LayoutAssignment. - HloPassPipeline subbyte_packer_pipeline("SubbytePacker pipeline"); - subbyte_packer_pipeline.AddPass( - SubByteNormalization::SET_ELEMENT_SIZE); - TF_RETURN_IF_ERROR(subbyte_packer_pipeline.Run(module).status()); - } - HloPassPipeline pipeline("HLO passes through layout assignment"); - AddHloVerifier(&pipeline); - pipeline.AddPass(); - pipeline.AddPass(); - - // If XNNPACK is enabled, we only need to upcast dots that XnnDotThunk does - // not support. `upcaster_filter` returns false if the instruction shouldn't - // be processed. - // TODO(b/406806134): Stop calling XNNPACK from regular Dot thunks. All XNN - // Dots should be wrapped in an `__xnn_fusion` fusion region and processed in - // `XnnFusionThunk`. - bool xnnpack_enabled = module->config().debug_options().xla_cpu_use_xnnpack(); - auto call_library_for_dot = [&](const HloInstruction& instr) { - if (!xnnpack_enabled) return false; - DotImplementationStrategy strategy = GetDotImplementationStrategy( - module->config(), instr, *target_machine_features, - /*allow_runtime_calls=*/true); - return strategy == DotImplementationStrategy::kEigen; - }; - HloPredicate upcaster_filter = [&](const HloInstruction* instr) { - if (!call_library_for_dot(*instr)) return true; - return !IsXnnDotSupported(instr->dot_dimension_numbers(), - instr->operand(0)->shape(), - instr->operand(1)->shape(), instr->shape(), - target_machine_features) - .value_or(false); - }; - pipeline.AddPass(upcaster_filter); - - // For softmax, rewrite to custom calls with XNNPACK targets. 
- bool enable_xnnpack = - xla::GetDebugOptionsFromFlags().xla_cpu_enable_xnnpack(); - if (enable_xnnpack) - pipeline.AddPass(); - - // Expand random number generation. - pipeline.AddPass(); - pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); - - // Remove zero-sized HLO from the input so that other passes don't have to - // handle it. - pipeline.AddPass(); - - pipeline.AddPass(); - - pipeline.AddPass(); - pipeline.AddPass(); - - // The TopkDecomposer generates a compare op with type=TOTALORDER and must - // run before the ComparisonExpander which rewrites such comparisons. - pipeline.AddPass([&](const HloInstruction* instr) { - return instr->opcode() == HloOpcode::kTopK; - }); - - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - - // Inline computations with a single call site. - pipeline.AddPass(/*single_call_site=*/true); - pipeline.AddPass(); - pipeline.AddPass(); - - // Rewrite to custom calls with target as oneDNN library calls. -#if defined(INTEL_MKL) - // AOT compiled code runs in single thread. - if (!is_aot_compile && !is_thunk_runtime) { - // Placing OneDnnOpsRewriter here to match the flax patterns - // TODO: Decide where would be the appropriate place for this pass to make - // it more generic - // TODO - intel: Name of the pass might seem redundant as oneDnnRewriter, - // but in future plan to rename oneDNNrewriter to specific to onednn matmul - pipeline.AddPass(); - } -#endif // INTEL_MKL - - // Promote BF16 all-reduce to F32. - const std::pair ar_promoted_types[] = { - {BF16, F32}}; - pipeline.AddPass(ar_promoted_types); - // Convert BF16 and F8 operations to F32 and F16 respectively so that the CPU - // backend can support BF16/F8 operations without directly implementing a - // BF16/F8 lowering for most ops. 
- CpuFloatSupport bf16_support(BF16, call_library_for_dot, - target_machine_features); -#if defined(INTEL_MKL) - OneDnnFloatSupport onednn_bf16_support(BF16); - if (!is_aot_compile && !is_thunk_runtime) { - pipeline.AddPass(&onednn_bf16_support); - } else { - pipeline.AddPass(&bf16_support); - } -#else - pipeline.AddPass(&bf16_support); -#endif - FloatSupport f8e5m2_support(F8E5M2, F16); - pipeline.AddPass(&f8e5m2_support); - FloatSupport f8e4m3_support(F8E4M3, F16); - pipeline.AddPass(&f8e4m3_support); - FloatSupport f8e4m3fn_support(F8E4M3FN, F16); - pipeline.AddPass(&f8e4m3fn_support); - FloatSupport f8e4m3b11fnuz_support(F8E4M3B11FNUZ, F16); - pipeline.AddPass(&f8e4m3b11fnuz_support); - FloatSupport f8e5m2fnuz_support(F8E5M2FNUZ, F16); - pipeline.AddPass(&f8e5m2fnuz_support); - FloatSupport f8e4m3fnuz_support(F8E4M3FNUZ, F16); - pipeline.AddPass(&f8e4m3fnuz_support); - FloatSupport f8e3m4_support(F8E3M4, F16); - pipeline.AddPass(&f8e3m4_support); - FloatSupport s4_support(S4, S8); - pipeline.AddPass(&s4_support); - FloatSupport u4_support(U4, U8); - pipeline.AddPass(&u4_support); - FloatSupport f4e2m1fn_support(F4E2M1FN, F16); - pipeline.AddPass(&f4e2m1fn_support); - FloatSupport f8e8m0fnu_support(F8E8M0FNU, F32); - pipeline.AddPass(&f8e8m0fnu_support); - // After canonicalization, there may be more batch dots that can be - // simplified. - pipeline.AddPass(); - auto cost_model = [](HloInstruction* conv) { - // We need a cost model for CPUs. Currently, do nothing. 
- return false; - }; - pipeline.AddPass( - /*should_expand=*/[](HloInstruction* conv) { return true; }, cost_model, - /*convert_batch_groups_only=*/true); - auto feature_group_should_expand = [](HloInstruction* conv) { - switch (conv->shape().element_type()) { - case F16: - case F32: - return false; - default: - return true; - } - }; - pipeline.AddPass( - feature_group_should_expand, cost_model, - /*convert_batch_groups_only=*/false); - pipeline.AddPass( - /*rewrite_training_op=*/true, - /*rewrite_inference_op=*/true, - /*rewrite_grad_op=*/true); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - - if (module->config() - .debug_options() - .xla_reduce_window_rewrite_base_length() != 0) { - pipeline.AddPass>( - module->config() - .debug_options() - .xla_reduce_window_rewrite_base_length()); - } - auto dynamic_padder_options = DynamicPadderOptions(); - // TODO(pgavin): ShapeChecks were never implemented correctly by the dynamic - // padder. The mode defaults to kIgnore, and it was not overridden for nested - // computations (such as while bodies or conditional branches), and so cases - // that could not be proven would still be accepted even with compile-time - // checks enabled. Recent changes to the DynamicPadder correctly - // override the mode. However, some models have started to rely on the check - // being ignored, and they would be broken if it is enforced. - dynamic_padder_options.shape_check_mode = - DynamicDimensionInference::ShapeCheckMode::kIgnore; - pipeline.AddPass(dynamic_padder_options); - - pipeline.AddPass(target_machine_features); - - // Run fp16 dots/convs in fp32 and then downcast the result to fp16. - // Justification: - // - // - This is significantly faster on our CPUs today than true fp16. - // - It's numerically more accurate. (Granted, this is not always - // desirable, thus the ability to disable this functionality.) 
- // - It matches more closely the GPU's behavior on fp16 dot/conv, where - // accumulation happens in f32. - if (!module->config().debug_options().xla_cpu_strict_dot_conv_math()) { - pipeline.AddPass( - F16, F32, HloPredicateIsOp); - } - - pipeline.AddPass(CreateSimplificationPipeline("simplification", module, - is_fusion_emitters)); - - // Scatter expander is sandwiched between two simplification pipelines to - // enable constant folding with the original scatter instructions (which is - // more efficient than with the expanded version) but then to also ensure that - // the resulting while loops are simplified. - pipeline.AddPass(); - if (is_fusion_emitters) { - pipeline.AddPass( - ScatterExpander::kEliminateSimpleScatters); - pipeline.AddPass(); - } - if (!is_fusion_emitters || !kFusionEmitterScatterEnabled) { - pipeline.AddPass(ScatterExpander::kEliminateAllScatters); - } - - pipeline.AddPass(CreateSimplificationPipeline( - "post_scatter_expansion_simplification", module, is_fusion_emitters)); - - pipeline.AddPass(); - - pipeline.AddPass([](const HloSortInstruction* sort, int64_t) { - return sort->operand(0)->shape().element_type() == F32; - }); - pipeline.AddPass(); - pipeline.AddPass( - [&](const HloInstruction& dot, int64_t operand) -> absl::StatusOr { - if (DotImplementationCanHandleTranspose(dot, *target_machine_features, - /*allow_runtime_calls=*/true)) { - return TransposeFolding::IsRowColumnTransposeDotOperand(dot, operand); - } - return false; - }, - TransposeFolding::NeverFoldTranspose); - pipeline.AddPass(/*is_layout_sensitive=*/false); - - pipeline.AddPass(); - pipeline.AddPass(); - - // Annotate while loops with statically known trip counts, so that at run time - // we can avoid running the loop condition computations. - pipeline.AddPass(); - - // Layout assignment uses alias analysis, which requires the call graph to be - // flattened. 
- pipeline.AddPass(); - ChannelLayoutConstraints layout_constraints; - pipeline.AddPass( - module->mutable_entry_computation_layout(), target_machine_features, - &layout_constraints); - // Run SubByteNormalization because CpuLayoutAssignment may modify a - // Layout's element_size_in_bits field. - pipeline.AddPass( - SubByteNormalization::SET_ELEMENT_SIZE); - - // Finally canonicalize all literals larger than 1024 bytes in the module to - // reuse the same literal across multiple HLO modules. - pipeline.AddPass(LiteralPool::Default(), - /*min_size_bytes=*/1024); - - return pipeline.Run(module).status(); -} - -absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn( - HloModule* module, bool is_aot_compile, - TargetMachineFeatures* target_machine_features, - const CompileOptions& compile_options) { - const auto& debug_options = module->config().debug_options(); - const bool is_thunk_runtime = debug_options.xla_cpu_use_thunk_runtime(); - const bool is_fusion_emitters = - is_thunk_runtime && debug_options.xla_cpu_use_fusion_emitters(); - HloPassPipeline pipeline("HLO passes after layout assignment"); - - { - HloPassPipeline normalization_pipeline("hlo normalization"); - normalization_pipeline.AddPass(); - normalization_pipeline.AddPass(); - normalization_pipeline.AddPass(); - TF_RETURN_IF_ERROR(normalization_pipeline.Run(module).status()); - } - - // After layout assignment, use a layout-sensitive verifier. - pipeline.AddPass("after layout assignment"); - AddHloVerifier(&pipeline, HloVerifierOpts{}.MakeLayoutSensitive(), - /*debug_only=*/true); - - pipeline.AddPass(); - - const int max_parallelism = - module->config().intra_op_parallelism_threads() > 0 - ? module->config().intra_op_parallelism_threads() - : tsl::port::NumSchedulableCPUs(); - -#if defined(INTEL_MKL) - // AOT compiled code runs in single thread. - if (!is_aot_compile && !is_thunk_runtime) { - // Run SimplifyFPConversions pass to simplify the BF16 pattern and make it - // easier to match. 
- // Remove `f32 -> bf16 -> f32` casts inserted by bf16 normalization. - if (debug_options.xla_allow_excess_precision()) { - pipeline.AddPass(); - } - pipeline.AddPass(max_parallelism, - compile_options.thread_pool); - // Run SimplifyFPConversions pass again to remove redundant Convert ops - // that may exist as a result of running OneDnnContractionRewriter pass. - if (debug_options.xla_allow_excess_precision()) { - pipeline.AddPass(); - } - } -#endif // INTEL_MKL - - if (module->config() - .debug_options() - .xla_cpu_experimental_xnn_graph_fusion_mode() != - DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED) { - pipeline.AddPass(); - } - - // Add a fusion pass now that layout assignment is done. - pipeline.AddPass(); - if (is_fusion_emitters) { - pipeline.AddPass(); - } - - // The LayoutAssignment pass may leave behind kCopy instructions which are - // duplicate or NOPs, so remove them with algebraic simplification and CSE. - // Run this to a fixed point. - [&pipeline = pipeline.AddPass>( - "simplification after layout assignment"), - &module] { - AddHloVerifier( - &pipeline, - HloVerifierOpts{}.MakeLayoutSensitive().WithInstructionCanChangeLayout( - LayoutAssignment::InstructionCanChangeLayout), - /*debug_only=*/true); - AlgebraicSimplifierOptions options; - options.set_is_layout_sensitive(true); - options.set_supports_non_canonical_dots(false); - options.set_enable_dot_strength_reduction(false); - // "slow" minmax means we propagate nan. - options.set_minmax_propagate_nan( - !module->config().debug_options().xla_cpu_enable_fast_min_max()); - options.set_executing_on_cpu(true); - pipeline.AddPass(options); - pipeline.AddPass(); - pipeline.AddPass(/*is_layout_sensitive=*/true); - }(); - - // Outline ops in the entry computation into calls to subcomputations. - if (!is_aot_compile) { - // Run ParallelTaskAssigner to assign parallel tasks to HLOs in module. 
- // Note this is not run for AOT because it would bring in thread pool - // and thread synchronization dependencies which would likely increase - // binary size (and most AOT applications are single-threaded). - // TODO(b/29630486) Support multi-threaded AOT. - pipeline.AddPass( - max_parallelism, ShapeSizeBytesFunction(), target_machine_features); - } - // Copy insertion should be performed immediately before IR emission to - // avoid inserting unnecessary copies (later pass adds an instruction which - // materializes the value) or missing a necessary copy (later pass removes - // an instruction which materializes a value). DCE must be run immediately - // before (and sometime after) copy insertion, to avoid dead code from - // interfering with the rewrites. - pipeline.AddPass(); - pipeline.AddPass(true); - - // If enabled we'll use more precise region based analysis for copy removal. - if (debug_options.xla_cpu_copy_insertion_use_region_analysis()) { - pipeline.AddPass( - /*can_share_buffer=*/nullptr, - /*use_region_based_live_range_analysis=*/-1); - } else { - pipeline.AddPass(); - } - - // The hoisting of small while loops is only useful in the context of the - // thunk runtime. 
- if (module->config().debug_options().xla_cpu_use_thunk_runtime()) { - TF_ASSIGN_OR_RETURN( - int64_t byte_threshold, - xla::cpu::options::SmallWhileLoopByteThreshold(module->config())); - pipeline.AddPass(byte_threshold); - } - - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - return pipeline.Run(module).status(); -} - -absl::Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, - llvm::TargetMachine* target_machine, - const CompileOptions& compile_options) { - TargetMachineFeatures target_machine_features(target_machine); - TF_RETURN_IF_ERROR(RunHloPassesThroughLayoutAssn(module, is_aot_compile, - &target_machine_features)); - - return RunHloPassesAfterLayoutAssn(module, is_aot_compile, - &target_machine_features, compile_options); -} - -namespace { - -// Align buffers to XLA:CPU minimal alignment. -int64_t memory_alignment(LogicalBuffer::Color) { - return cpu_function_runtime::MinAlign(); -} - -llvm::TargetOptions CompilerTargetOptions( - const HloModuleConfig& module_config) { - llvm::TargetOptions target_options; - // Always allow FMA fusion. This increases precision instead of decreasing it. - target_options.AllowFPOpFusion = llvm::FPOpFusion::Fast; - return target_options; -} - -std::pair GetIRModuleHooks( - const HloModule& hlo_module, - const LLVMCompiler::ModuleHook& user_pre_optimization_hook, - const LLVMCompiler::ModuleHook& user_post_optimization_hook) { - // Create the IR hooks. If applicable, each IR hook does the following: - // - // * Calls the user supplied module hook. - // * Writes out the IR to a file in the output directory designated by - // --xla_dump_to - const HloModule* hlo_module_ptr = &hlo_module; - auto hook = [user_pre_optimization_hook, user_post_optimization_hook, - hlo_module_ptr](bool optimized, - const llvm::Module& llvm_module) { - const auto& user_hook = - !optimized ? 
user_pre_optimization_hook : user_post_optimization_hook; - if (user_hook) { - user_hook(llvm_module); - } - - // Include LLVM module identifier suffix in case `llvm_module` is just a - // part of the original LLVM module constructed by the XLA. - absl::string_view id = llvm_module.getModuleIdentifier(); - size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); - llvm_ir::DumpIrIfEnabled(*hlo_module_ptr, llvm_module, optimized, - /*filename_suffix=*/id.substr(pos)); - }; - return {[hook](const llvm::Module& llvm_module) { - return hook(/*optimized=*/false, llvm_module); - }, - [hook](const llvm::Module& llvm_module) { - return hook(/*optimized=*/true, llvm_module); - }}; -} - -absl::Status VerifyLlvmModule(const llvm::Module& llvm_module) { - XLA_SCOPED_LOGGING_TIMER("CpuCompiler - Running LLVM verifier"); - - std::string err; - llvm::raw_string_ostream err_stream(err); - - // verifyModule() returns true if the module is broken. - TF_RET_CHECK(!llvm::verifyModule(llvm_module, &err_stream)) - << "Invalid LLVM IR before optimizations:\n" - << err_stream.str() - << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. " - "Rerun with --xla_dump_to to get the IR. "; - return absl::OkStatus(); -} - -absl::Status CreateHloProfilingArtifacts( - const HloModule& module, - absl::flat_hash_map* - instruction_to_profile_idx, - absl::flat_hash_map* - computation_to_profile_idx, - std::unique_ptr* hlo_profile_index_map, - std::unique_ptr* hlo_profile_printer_data) { - *hlo_profile_index_map = std::make_unique(module); - const HloComputation& entry_computation = *module.entry_computation(); - - TF_ASSIGN_OR_RETURN( - *instruction_to_profile_idx, - CollectProfileCandidates::GetCandidatesForComputation( - entry_computation, - (*hlo_profile_index_map)->instruction_to_profile_idx())); - - auto shape_size_bytes = [](const Shape& shape) { - // On the cpu, opaques are pointers. 
- if (shape.IsOpaque()) { - return static_cast(sizeof(void*)); - } - return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); - }; - - HloCostAnalysis cost_analysis(shape_size_bytes); - TF_RETURN_IF_ERROR(entry_computation.Accept(&cost_analysis)); - *hlo_profile_printer_data = CreateHloProfilePrinterData( - **hlo_profile_index_map, cost_analysis, entry_computation.name()); - *computation_to_profile_idx = - (*hlo_profile_index_map)->computation_to_profile_idx(); - - return absl::OkStatus(); -} - -} // namespace - -absl::StatusOr> CpuCompiler::RunHloPasses( - std::unique_ptr module, se::StreamExecutor* /*stream_exec*/, - const CompileOptions& options) { - auto& config = module->config(); - - TF_ASSIGN_OR_RETURN( - std::unique_ptr jit_target_machine, - IrCompiler::InferTargetMachine( - CompilerTargetOptions(config), IrCompiler::GetCodeGenOptLevel(config), - CpuFeatureFromString(config.debug_options().xla_cpu_max_isa()))); - - TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false, - jit_target_machine.get(), - /*compile_options=*/options)); - return std::move(module); -} - -namespace { - -static void DumpModuleToFile(const llvm::Module& llvm_module, - const llvm::object::ObjectFile& obj_file, - const HloModule& hlo_module) { - absl::string_view id = llvm_module.getModuleIdentifier(); - size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); - auto get_file_suffix = [&]() { - std::vector parts = {"obj-file"}; - parts.reserve(3); - absl::string_view middle_name = id.substr(pos); - if (!middle_name.empty()) { - parts.push_back(middle_name); - } - parts.push_back("o"); - return absl::StrJoin(parts, "."); - }; - DumpToFileInDir( - hlo_module, /*file_prefix=*/"", get_file_suffix(), - absl::string_view(obj_file.getData().data(), obj_file.getData().size())); -} - -// Post-compilation callback functor for use by SimpleOrcJIT. -// -// Dumps machine code if dumping is enabled for the module. 
-static std::function -CreateOrcJITPostCompilationHook(const HloModule* hlo_module, - std::vector* obj_files) { - return [=](const llvm::Module& llvm_module, - const llvm::object::ObjectFile& obj_file) { - if (obj_files) obj_files->push_back(obj_file.getData().str()); - - if (DumpingEnabledForHloModule(*hlo_module)) { - DumpModuleToFile(llvm_module, obj_file, *hlo_module); - } - }; -} - -struct ComputationToEmit { - HloComputation* computation; - - // Are we emitting this computation with fast-math reassociation enabled? - // We enable reassociation for reductions because it has a significant - // performance impact. - bool allow_reassociation; - - bool operator==(const ComputationToEmit& other) const { - return computation == other.computation && - allow_reassociation == other.allow_reassociation; - } - - template - friend H AbslHashValue(H h, const ComputationToEmit& c) { - return H::combine(std::move(h), c.computation, c.allow_reassociation); - } -}; - -std::vector SubcomputationEmissionOrder( - HloComputation* root) { - absl::flat_hash_set visited; - std::vector postorder; - - // agenda of (node, leave) pairs. 
- std::stack> agenda; - agenda.emplace(ComputationToEmit{root, false}, false); - while (!agenda.empty()) { - ComputationToEmit c; - bool leave; - std::tie(c, leave) = agenda.top(); - agenda.pop(); - - if (leave) { - postorder.push_back(c); - continue; - } - - if (visited.insert(c).second) { - agenda.emplace(c, true); - for (auto* instruction : c.computation->instructions()) { - bool allow_reassociation = - instruction->opcode() == HloOpcode::kAllReduce || - instruction->opcode() == HloOpcode::kReduce || - instruction->opcode() == HloOpcode::kReduceWindow; - auto cc = absl::MakeSpan(instruction->called_computations()); - for (auto it = cc.rbegin(); it != cc.rend(); ++it) { - HloComputation* called_computation = *it; - ComputationToEmit callee{ - called_computation, c.allow_reassociation || allow_reassociation}; - if (!visited.contains(callee)) { - agenda.emplace(callee, false); - } - } - } - } - } - DCHECK(!postorder.empty() && postorder.back().computation == root); - postorder.pop_back(); - return postorder; -} - -} // namespace - -// Removes unused globals and function declarations from the LLVM module. -// -// After splitting LLVM module into multiple parts, we end up with unused -// symbols in each part: external globals and function declarations. We don't -// support linking across modules added to SimpleOrcJIT, and we don't need it, -// because we never construct LLVM IR that might require cross-module linking, -// so we can just remove unused symbols from each part. 
-static void RemoveUnusedSymbols(llvm::Module& module) { - llvm::SmallVector unused_globals; - llvm::SmallVector unused_functions; - - for (llvm::GlobalVariable& gv : module.globals()) { - if (gv.use_empty()) unused_globals.push_back(&gv); - } - for (llvm::Function& f : module.functions()) { - if (f.isDeclaration() && f.use_empty()) unused_functions.push_back(&f); - } - - for (auto* gv : unused_globals) { - module.eraseGlobalVariable(gv); - } - for (auto* f : unused_functions) { - f->eraseFromParent(); - } -} - -// Clones a ThreadSafeModule from the given LLVM module in a new LLVM context. -// -// To enable parallel compilation, each LLVM module has to be owned by a -// separate LLVM context. We take each part of the original module after a -// split, and clone it into a new LLVM context. -static llvm::orc::ThreadSafeModule CloneAsThreadSafeModule( - int64_t part, std::unique_ptr module) { - TraceMe trace([&] { - return TraceMeEncode("CpuCompiler::CloneAsThreadSafeModule", - {{"part", part}}); - }); - - // There is no way to clone a module from one context to another, so we need - // to serialize the module to bitcode and parse it back into the new context. - llvm::SmallString<0> bc; - llvm::raw_svector_ostream bcos(bc); - llvm::WriteBitcodeToFile(*module, bcos); - - // Parse module back into its own LLVM context. - auto clone_context = std::make_unique(); - auto clone_module = llvm::parseBitcodeFile( - llvm::MemoryBufferRef( - llvm::StringRef(bc.data(), bc.size()), - absl::StrFormat("%s_part_%02d", kXlaModuleIdentifier, part)), - *clone_context); - - return llvm::orc::ThreadSafeModule(std::move(*clone_module), - std::move(clone_context)); -} - -namespace { -// Compiled symbols (kernels and comparators) from a single LLVM module part. -struct CompiledSymbolsPart { - std::vector kernels; - std::vector comparators; -}; -} // namespace - -// Collect IrEmitter2 symbols that got into the LLVM module part. 
We issue -// compilation tasks in parallel, and to maximize concurrency we don't issue -// separate compilation tasks that compile symbols from the same module. -static CompiledSymbolsPart CollectCompiledSymbolsPart( - const IrEmitter2& ir_emitter, const llvm::Module& module) { - CompiledSymbolsPart syms; - - auto find_kernel = - [&](llvm::StringRef name) -> std::optional { - for (auto& k : ir_emitter.kernels()) { - if (k.name == name) return k; - } - return std::nullopt; - }; - - auto find_comparator = - [&](llvm::StringRef name) -> std::optional { - for (auto& c : ir_emitter.comparators()) { - if (c.name == name) return c; - } - return std::nullopt; - }; - - for (auto& f : module.functions()) { - if (auto kernel = find_kernel(f.getName())) { - syms.kernels.push_back(*kernel); - } - if (auto comparator = find_comparator(f.getName())) { - syms.comparators.push_back(*comparator); - } - } - - return syms; -} - -// If LLVM module has large constants constructed from literals, we don't want -// to split it, because it will cause us to copy large constants across module -// parts. We should not be storing large constants in LLVM IR in a first place, -// but while we do that, we have to be extra-careful, or it leads to extremely -// long compilation times, OOMs and timeouts. -// -// TODO(b/361800465): Figure out how to avoid putting large constants into -// LLVM IR in the first place. -static bool HasLargeConstants(llvm::Module& module) { - static constexpr int kMaxConstantSize = 10000; - for (auto& g : module.globals()) { - if (!g.hasInitializer()) { - continue; - } - - llvm::Constant* initializer = g.getInitializer(); - if (auto* arr = llvm::dyn_cast(initializer->getType())) { - if (arr->getNumElements() > kMaxConstantSize) return true; - } - } - return false; -} - -inline void VlogMaxIsa(absl::string_view max_cpu_isa) { - if (VLOG_IS_ON(1) && !max_cpu_isa.empty()) { - if (tsl::port::IsX86CPU()) { - VLOG(1) << "`xla_cpu_max_isa` is set. 
Will not use features newer than: " - << max_cpu_isa; - } else { - VLOG(1) << "`xla_cpu_max_isa` is set to `" << max_cpu_isa - << "`. This flag is not supported on non-x86 CPUs yet."; - } - } -} - -// We keep HloProto in the CpuExecutable, but we don't need to keep literals -// payload in it as we use it only for debugging and memory analysis. -static void StripPayloadFromLiteralProto(HloProto& proto) { - auto* module = proto.mutable_hlo_module(); - for (auto& computation : *module->mutable_computations()) { - for (auto& instruction : *computation.mutable_instructions()) { - // We only keep literal shape to correctly estimate memory usage of the - // HLO module, but we don't need the actual literal data. - if (instruction.has_literal()) { - LiteralProto literal; - *literal.mutable_shape() = instruction.literal().shape(); - *instruction.mutable_literal() = std::move(literal); - } - } - } -} - -// Extracts the given set of kernels from the original module. -// Returns a new module with the extracted kernels. -static absl::StatusOr> ExtractKernelsFromModule( - llvm::Module* original_module, - absl::flat_hash_set kernels) { - // Clone into a new module, only keeping definitions of the relevant kernels. - auto should_clone_definition = [&kernels](const llvm::GlobalValue* gv) { - if (auto* func = llvm::dyn_cast(gv)) { - return kernels.contains(func->getName()); - } - return false; - }; - llvm::ValueToValueMapTy vmap; - std::unique_ptr module = - llvm::CloneModule(*original_module, vmap, should_clone_definition); - - // Erase the cloned symbols from the original module. 
- for (const auto& kernel_name : kernels) { - llvm::Function* to_be_removed = original_module->getFunction(kernel_name); - if (to_be_removed == nullptr) { - return Internal("Cannot remove kernel %s: cannot be found in module %s", - kernel_name, original_module->getName()); - } - to_be_removed->eraseFromParent(); - } - return module; -} - -static void AddXlaBackendExtraOptionsAsModuleFlag( - llvm::Module* llvm_module, llvm::StringRef backend_extra_options) { - auto* options_mdstring = - llvm::MDString::get(llvm_module->getContext(), backend_extra_options); - llvm_module->addModuleFlag(llvm::Module::Error, "xla_backend_extra_options", - options_mdstring); -} - -absl::StatusOr> -CpuCompiler::CompileCpuExecutable(std::unique_ptr module) { - TraceMe trace([&] { - return TraceMeEncode("CpuCompiler::CompileCpuExecutable", - {{"name", module->name()}}); - }); - - ModuleHook pre_optimization_ir_hook; - ModuleHook post_optimization_ir_hook; - std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = - GetIRModuleHooks(*module, user_pre_optimization_hook_, - user_post_optimization_hook_); - - // Compile must be thread-safe so create a new LLVM context for the module. - mlir::MLIRContext mlir_context; - auto llvm_context = std::make_unique(); - auto llvm_module = - std::make_unique(kXlaModuleIdentifier, *llvm_context); - - const DebugOptions& debug_options = module->config().debug_options(); - - // We collect compiled object files (machine code) so we can export - // CpuExecutable to an AOT compilation result. - std::vector obj_files; - - // We split LLVM module and distribute it across separate DyLibs to enable - // parallel compilation at run time. - size_t parallel_codegen_split_count = - debug_options.xla_cpu_parallel_codegen_split_count(); - VlogMaxIsa(debug_options.xla_cpu_max_isa()); - - const HloModuleConfig& config = module->config(); - - // Options for compiling LLVM IR to machine code. 
- IrCompiler::Options ir_compiler_options{ - /*optimization_level=*/IrCompiler::GetCodeGenOptLevel(config), - /*optimize_for_size=*/options::OptimizeForSizeRequested(config), - /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), - /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(config), - /*disable_expensive_passes=*/ - debug_options.xla_llvm_disable_expensive_passes(), - /*slp_vectorizer_disabled=*/options::SlpVectorizerDisabled(config), - /*disable_loop_unrolling=*/options::DisableLoopUnrolling(config), - }; - - // Compiler hooks to intercept compiled LLVM IR modules. - IrCompiler::CompilationHooks ir_compiler_hooks{ - pre_optimization_ir_hook, - post_optimization_ir_hook, - CreateOrcJITPostCompilationHook(module.get(), &obj_files), - }; - - // Definition generator to link with XLA:CPU host runtime symbols. - ExecutionEngine::DefinitionGenerator definition_generator = - [](const llvm::DataLayout& data_layout) { - return std::make_unique(data_layout); - }; - - // Options for orchestrating the JIT compilation process. 
- JitCompiler::Options jit_compiler_options{ - /*num_dylibs=*/parallel_codegen_split_count, - /*definition_generator=*/std::move(definition_generator), - }; - - std::unique_ptr ir_compiler = IrCompiler::Create( - CompilerTargetOptions(module->config()), std::move(ir_compiler_options), - std::move(ir_compiler_hooks)); - - TF_ASSIGN_OR_RETURN( - JitCompiler jit_compiler, - JitCompiler::Create(std::move(jit_compiler_options), - std::move(ir_compiler), GetCompilationTaskRunner())); - - HloComputation* entry_computation = module->entry_computation(); - absl::flat_hash_map - instruction_to_profile_idx; - absl::flat_hash_map - computation_to_profile_idx; - std::unique_ptr hlo_profile_index_map; - std::unique_ptr hlo_profile_printer_data; - if (module->config().hlo_profiling_enabled()) { - TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( - *module, &instruction_to_profile_idx, &computation_to_profile_idx, - &hlo_profile_index_map, &hlo_profile_printer_data)); - } - - // Cache these flags here since we'll want to access them after the module's - // ownership is std::moved. - const bool embed_ir_in_executable = - debug_options.xla_embed_ir_in_executable(); - - TF_ASSIGN_OR_RETURN(HloSchedule schedule, CreateHloSchedule(*module)); - TF_RETURN_IF_ERROR(module->set_schedule(schedule)); - - TF_ASSIGN_OR_RETURN(std::unique_ptr assignment, - CreateBufferAssignment(*module)); - DumpHloModuleIfEnabled(*module, *assignment, - absl::StrCat("cpu_", kAfterOptimizationsDumpName)); - - // Dump computation proto state and buffer assignment for - // GetCompiledMemoryStats results. 
- auto with_hlo_proto = [&](std::unique_ptr cpu_executable) { - auto hlo_proto = std::make_unique(); - *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); - *hlo_proto->mutable_buffer_assignment() = - cpu_executable->buffer_assignment().ToProto(); - StripPayloadFromLiteralProto(*hlo_proto); - cpu_executable->set_hlo_proto(std::move(hlo_proto)); - return cpu_executable; - }; - - TargetMachineFeatures target_machine_features(jit_compiler.target_machine()); - - // TODO(ezhulenev): Once we fully migrate to Thunks current IrEmitter should - // be renamed to NestedIrEmitter and be used only for emitting nested (aka - // thread local or embedded) computations (reductions, maps, etc.). - - // (Nested) IrEmitter is responsible for building LLVM module with functions - // for all HLO computations. In thunk execution mode we only build LLVM - // functions for embedded computations (e.g. reduction computations) and all - // high-level operations (fusions, elementwise, etc.) are lowered to kernel - // functions (which are also LLVM functions, but use a HostKernel ABI). - IrEmitter nested_ir_emitter( - &mlir_context, *module, *assignment, llvm_module.get(), - std::move(instruction_to_profile_idx), - std::move(computation_to_profile_idx), - ModuleComputationsTransitivelyContainCustomCall(*module), - &target_machine_features, -#ifdef MEMORY_SANITIZER - /*emit_code_for_msan=*/true -#else - /*emit_code_for_msan=*/false -#endif - ); - - // If we use Thunk runtime then instead of emitting LLVM function for the - // entry computation we emit a sequence of thunks that implement the - // computation as a sequence of interpreted commands. - if (module->config().debug_options().xla_cpu_use_thunk_runtime()) { - // The thunk runtime manages large constants, therefore we only emit - // small ones. 
- TF_RETURN_IF_ERROR(nested_ir_emitter.EmitSmallConstantGlobals()); - - // IR emitter is responsible for building LLVM module with host kernels for - // corresponding HLO instructions (fusions, elemental instructions, etc.). - IrEmitter2 ir_emitter2(*module, llvm_module.get(), &nested_ir_emitter); - - // Thunk emitter is responsible for building a Thunk sequence that will - // resolved kernels in the compiled LLVM module and execute them together - // with Thunks implemented as library calls (e.g. oneDNN or Eigen). - ThunkEmitter thunk_emitter(ir_emitter2, *assignment, - target_machine_features, module->config()); - TF_ASSIGN_OR_RETURN(ThunkSequence thunks, - thunk_emitter.EmitEntryComputation(*module)); - - std::string ir_module_string; - if (embed_ir_in_executable) { - std::string emitter2_ir = llvm_ir::DumpToString(llvm_module.get()); - - auto thunk_kernel_fmt = [](std::string* out, - const ThunkEmitter::EmittedKernel& kernel) { - absl::StrAppend( - out, llvm_ir::DumpToString(kernel.module.getModuleUnlocked())); - }; - std::string thunks_ir = - absl::StrJoin(thunk_emitter.kernels(), "\n", thunk_kernel_fmt); - - ir_module_string = absl::StrCat(emitter2_ir, "\n", thunks_ir); - } - - TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); - for (const auto& [name, module] : thunk_emitter.kernels()) { - TF_RETURN_IF_ERROR(VerifyLlvmModule(*module.getModuleUnlocked())); - } - - // Some kernels have to be compiled separately because they have - // extra backend options. 
- int num_extra_functions = 0; - using BackendOptions = llvm::StringRef; - using Kernel = llvm::StringRef; - absl::flat_hash_map> - backend_extra_options_to_kernels; - for (const auto& k : ir_emitter2.kernels()) { - if (k.backend_extra_options.empty()) continue; - auto [_, inserted] = - backend_extra_options_to_kernels[k.backend_extra_options].insert( - k.name); - CHECK(inserted) << "Kernel " << k.name << " is not unique"; - num_extra_functions++; - } - const int num_extra_parts = backend_extra_options_to_kernels.size(); - // We assign one dylib to each set of kernels that have the same extra - // backend options. We do this because we work under the assumption that - // very few kernels will set extra options, and if they do, the options are - // likely to be identical. - if (num_extra_parts >= parallel_codegen_split_count) { - return Internal( - "Too many extra compilation parts due to non-default options (%d). " - "Consider reducing this number or increasing " - "parallel_codegen_split_count (%d)", - num_extra_parts, parallel_codegen_split_count); - } - - // We define the number of module parts based on the total number of - // compiled functions (kernels and comparators) that are called from thunks, - // and the maximum number of parts that we want to split the module into. - size_t num_compiled_functions = ir_emitter2.kernels().size() + - ir_emitter2.comparators().size() + - thunk_emitter.kernels().size(); - size_t num_default_parts = - std::min(num_compiled_functions - num_extra_functions, - parallel_codegen_split_count - num_extra_parts); - - // JIT compile the LLVM IR module to in-memory machine code. We split the - // module into `num_jit_dylibs` parts to allow parallel compilation. In - // practice, all of the kernel functions are independent and don't call each - // other, so we can compile each individual part in parallel. 
We split - // module preserving locals, which should guarantee that all thread local - // computations end up in the same module with the corresponding kernel. - - // Collect all compiled symbols grouped by LLVM module part, so that we can - // issue compile tasks in parallel without any interference. - std::vector compiled_parts; - - VLOG(2) << "Compile LLVM module with " << ir_emitter2.kernels().size() - << " kernels and " << ir_emitter2.comparators().size() - << " comparators"; - - int dylib_index = 0; - auto add_jit_module = [&](std::unique_ptr llvm_module_part) { - // Collect symbols that are compiled in this LLVM module part. - RemoveUnusedSymbols(*llvm_module_part); - compiled_parts.push_back( - CollectCompiledSymbolsPart(ir_emitter2, *llvm_module_part)); - - std::string dump = llvm_ir::DumpToString(llvm_module_part.get()); - VLOG(5) << "Adding compilation module:\n" << dump; - - // Clone LLVM module part into its own thread safe context. - auto tsm = - CloneAsThreadSafeModule(dylib_index, std::move(llvm_module_part)); - TF_CHECK_OK(jit_compiler.AddModule(std::move(tsm), dylib_index++)); - }; - - // If there are extra parts, compile them first, since we must - // remove the affected kernels from the LLVM module. 
- if (num_extra_parts > 0) { - TraceMe trace([&] { - return TraceMeEncode("CompileExtraKernels", - {{"num_extra_parts", num_extra_parts}}); - }); - for (const auto& [backend_extra_options, kernels] : - backend_extra_options_to_kernels) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr new_module, - ExtractKernelsFromModule(llvm_module.get(), kernels)); - AddXlaBackendExtraOptionsAsModuleFlag(new_module.get(), - backend_extra_options); - add_jit_module(std::move(new_module)); - } - } - - if (HasLargeConstants(*llvm_module)) { - VLOG(3) << "Skip parallel compilation due to large constants"; - num_default_parts = 1; - } - - if (num_default_parts > 1) { - VLOG(3) << "Split LLVM module into " << num_default_parts - << " parts before codegen to enable parallel compilation" - << " (max split count: " << parallel_codegen_split_count << ")"; - - TraceMe trace([&] { - return TraceMeEncode("SplitModule", - {{"num_default_parts", num_default_parts}}); - }); - - llvm::SplitModule(*llvm_module, num_default_parts, add_jit_module, - /*PreserveLocals=*/true, /*RoundRobin=*/true); - // Free resources used by the original LLVM module. - llvm_module.reset(); - llvm_context.reset(); - - } else { - VLOG(3) << "Compile LLVM module without splitting (max split count: " - << parallel_codegen_split_count << ")"; - compiled_parts.push_back( - CollectCompiledSymbolsPart(ir_emitter2, *llvm_module)); - TF_CHECK_OK(jit_compiler.AddModule(llvm::orc::ThreadSafeModule( - std::move(llvm_module), std::move(llvm_context)))); - } - - // Collect compiled symbols from all LLVM module parts. - std::vector compiled_symbols; - - absl::flat_hash_map - symbol_type_id_to_function_type_id; - - VLOG(3) << "Adding " << thunk_emitter.kernels().size() - << " kernels to the JIT compiler"; - // Make sure we use all the "default" modules for maximum parallelism. - int num_default_so_far = dylib_index - num_extra_parts; - int kernel_dylib_index = - num_default_so_far < num_default_parts ? 
num_default_so_far : 0; - for (auto& [name, module] : thunk_emitter.kernels()) { - compiled_symbols.push_back( - FunctionLibrary::Sym(name)); - symbol_type_id_to_function_type_id.emplace( - compiled_symbols.back().type_id, SymbolProto::KERNEL); - TF_CHECK_OK(jit_compiler.AddModule(std::move(module), - num_extra_parts + kernel_dylib_index)); - // Simply roundrobin the default kernel dylibs - kernel_dylib_index = (kernel_dylib_index + 1) % num_default_parts; - } - - for (const CompiledSymbolsPart& part : compiled_parts) { - for (const IrEmitter2::KernelInfo& kernel : part.kernels) { - compiled_symbols.push_back( - FunctionLibrary::Sym(kernel.name)); - symbol_type_id_to_function_type_id.emplace( - compiled_symbols.back().type_id, SymbolProto::KERNEL); - } - for (const IrEmitter2::ComparatorInfo& comparator : part.comparators) { - compiled_symbols.push_back( - FunctionLibrary::Sym(comparator.name)); - symbol_type_id_to_function_type_id.emplace( - compiled_symbols.back().type_id, SymbolProto::COMPARATOR); - } - } - - VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols"; - - TraceMe trace_codegen([&] { - return TraceMeEncode( - "Codegen", {{"num_default_parts", num_default_parts}, - {"num_extra_parts", num_extra_parts}, - {"num_compiled_functions", num_compiled_functions}}); - }); - - TF_ASSIGN_OR_RETURN(std::unique_ptr function_library, - std::move(jit_compiler).Compile(compiled_symbols)); - - // Create constant allocations from the buffer assignment. - TF_ASSIGN_OR_RETURN(std::vector constants, - CreateConstantAllocations(*assignment)); - - TF_ASSIGN_OR_RETURN( - auto cpu_executable, - CpuExecutable::Create(std::move(function_library), - std::move(assignment), std::move(module), - std::move(thunks), std::move(constants), - std::move(hlo_profile_printer_data), - std::move(hlo_profile_index_map))); - - // Save object files to be able to export them to AOT compilation - // result. 
- cpu_executable->set_obj_files(std::move(obj_files)); - - // Save compiled symbols to be able to export them to AOT compilation - // result. - cpu_executable->set_compiled_symbols(std::move(compiled_symbols)); - - // Save mapping between symbol type id and function type id to be able to - // export them to AOT compilation result. - cpu_executable->set_symbol_type_id_to_function_type_id( - symbol_type_id_to_function_type_id); - - if (embed_ir_in_executable) { - cpu_executable->set_ir_module_string(ir_module_string); - } - - return with_hlo_proto(std::move(cpu_executable)); - } - - TF_RETURN_IF_ERROR(nested_ir_emitter.EmitAllConstantGlobals()); - - // Each computation is a single function. Emit all embedded computations - // before the entry computation. The order of computations returned from - // SubcomputationEmissionOrder guarantees that a called computation occurs - // before a caller computation. - for (ComputationToEmit subcomputation : - SubcomputationEmissionOrder(entry_computation)) { - if (subcomputation.computation->IsFusionComputation()) { - continue; - } - TF_RETURN_IF_ERROR( - nested_ir_emitter - .EmitComputation( - subcomputation.computation, subcomputation.computation->name(), - /*is_top_level_computation=*/false, - schedule.sequence(subcomputation.computation).instructions(), - subcomputation.allow_reassociation) - .status()); - } - absl::string_view function_name_prefix = entry_computation->name().empty() - ? 
"__compute" - : entry_computation->name(); - TF_ASSIGN_OR_RETURN(llvm::Function * entry_function, - nested_ir_emitter.EmitComputation( - entry_computation, function_name_prefix, - /*is_top_level_computation=*/true, - schedule.sequence(entry_computation).instructions(), - /*allow_reassociation=*/false)); - - std::string ir_module_string; - if (embed_ir_in_executable) { - ir_module_string = llvm_ir::DumpToString(llvm_module.get()); - } - - TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); - - // Save entry function name before destroying LLVM module. - std::string entry_function_name = entry_function->getName().str(); - - // JIT compile the LLVM IR module to in-memory machine code. - llvm::orc::ThreadSafeModule thread_safe_module(std::move(llvm_module), - std::move(llvm_context)); - TF_RETURN_IF_ERROR(jit_compiler.AddModule(std::move(thread_safe_module))); - - using ComputeFn = std::remove_pointer_t; - TF_ASSIGN_OR_RETURN( - std::unique_ptr function_library, - std::move(jit_compiler) - .Compile({FunctionLibrary::Sym(entry_function_name)})); - - TF_ASSIGN_OR_RETURN( - auto cpu_executable, - CpuExecutable::Create(std::move(function_library), std::move(assignment), - std::move(module), entry_function_name, - std::move(hlo_profile_printer_data), - std::move(hlo_profile_index_map))); - - cpu_executable->set_obj_files(std::move(obj_files)); - - if (embed_ir_in_executable) { - cpu_executable->set_ir_module_string(ir_module_string); - } - - return with_hlo_proto(std::move(cpu_executable)); -} - -absl::StatusOr> CpuCompiler::RunBackend( - std::unique_ptr module, - [[maybe_unused]] se::StreamExecutor* stream_exec, - const CompileOptions& options) { - TraceMe trace([&] { - return TraceMeEncode("CpuCompiler::RunBackend", {{"name", module->name()}}); - }); - - VLOG(1) << "Compiling: " << module->name(); - RecordCpuCompilerStacktrace(); - XLA_SCOPED_LOGGING_TIMER( - absl::StrFormat("Compiling [%s] for CPU using JIT", module->name())); - std::string slow_compilation_msg = - 
absl::StrCat("Compiling module ", module->name()); - auto slow_compile_alarm = SlowCompilationAlarm(slow_compilation_msg); - auto llvm_options = llvm_ir::ExtractXlaBackendExtraOptions( - module->config().debug_options().xla_backend_extra_options()); - llvm_ir::LLVMCommandLineOptionsLock llvm_lock(llvm_options); - - std::unique_ptr cpu_executable; - TF_ASSIGN_OR_RETURN(cpu_executable, CompileCpuExecutable(std::move(module))); - - cpu_executable->set_debug_info( - cpu_executable->buffer_assignment().StatsString( - /*report_total_fragmentation=*/true)); - VLOG(1) << "Compilation finished"; - return std::unique_ptr(std::move(cpu_executable)); -} - -absl::StatusOr>> -CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, - const AotCompilationOptions& aot_options) { - TF_RET_CHECK(!module_group->empty()); - std::vector> modules = - module_group->ConsumeModules(); - - auto llvm_options = llvm_ir::ExtractXlaBackendExtraOptions( - modules[0]->config().debug_options().xla_backend_extra_options()); - VlogMaxIsa(modules[0]->config().debug_options().xla_cpu_max_isa()); - llvm_ir::LLVMCommandLineOptionsLock llvm_lock(llvm_options); - - // We can pass just one llvm::TargetOptions when we compile the LLVM module, - // so we bail if the configs have conflicting flags. At the moment, the only - // flags that need to be consistent are for fast-math. - for (const auto& fn_and_name : - {std::make_pair(&DebugOptions::xla_cpu_enable_fast_math, - "xla_cpu_enable_fast_math"), - std::make_pair(&DebugOptions::xla_cpu_fast_math_honor_infs, - "xla_cpu_fast_math_honor_infs"), - std::make_pair(&DebugOptions::xla_cpu_fast_math_honor_nans, - "xla_cpu_fast_math_honor_nans")}) { - // This only works because each of the method pointers above returns a - // bool. Otherwise we'd have to do some template magic. 
- const auto& field_method_ptr = fn_and_name.first; - const auto& field_name = fn_and_name.second; - bool first_module_val = - (modules[0]->config().debug_options().*field_method_ptr)(); - for (int64_t i = 0; i < modules.size(); ++i) { - bool cur_module_val = - (modules[i]->config().debug_options().*field_method_ptr)(); - if (first_module_val != cur_module_val) { - return InvalidArgument( - "All HLO module configs must have the same value for %s, but " - "module 0 and %d have different values (%d vs %d).", - field_name, i, first_module_val, cur_module_val); - } - } - } - - if (aot_options.PlatformId() != se::host::kHostPlatformId) { - return InvalidArgument("Incompatible AOT compilation platform"); - } - const CpuAotCompilationOptions& options = - static_cast(aot_options); - llvm::Triple triple(llvm::Triple::normalize(options.triple())); - std::string error; - const llvm::Target* target = - llvm::TargetRegistry::lookupTarget(triple.getTriple(), error); - if (target == nullptr) { - return Internal("TargetRegistry::lookupTarget failed: %s", error); - } - - llvm::Reloc::Model reloc_model = llvm::Reloc::Static; - llvm::PICLevel::Level pic_level = llvm::PICLevel::NotPIC; - llvm::PIELevel::Level pie_level = llvm::PIELevel::Default; - switch (options.relocation_model()) { - case CpuAotCompilationOptions::RelocationModel::Static: - reloc_model = llvm::Reloc::Static; - pic_level = llvm::PICLevel::NotPIC; - pie_level = llvm::PIELevel::Default; - break; - case CpuAotCompilationOptions::RelocationModel::SmallPic: - reloc_model = llvm::Reloc::PIC_; - pic_level = llvm::PICLevel::SmallPIC; - pie_level = llvm::PIELevel::Default; - break; - case CpuAotCompilationOptions::RelocationModel::BigPic: - reloc_model = llvm::Reloc::PIC_; - pic_level = llvm::PICLevel::BigPIC; - pie_level = llvm::PIELevel::Default; - break; - case CpuAotCompilationOptions::RelocationModel::SmallPie: - reloc_model = llvm::Reloc::PIC_; - pic_level = llvm::PICLevel::SmallPIC; - pie_level = 
llvm::PIELevel::Small; - break; - case CpuAotCompilationOptions::RelocationModel::BigPie: - reloc_model = llvm::Reloc::PIC_; - pic_level = llvm::PICLevel::BigPIC; - pie_level = llvm::PIELevel::Large; - break; - } - llvm::CodeGenOptLevel opt_level = - IrCompiler::GetCodeGenOptLevel(modules[0]->config()); - llvm::TargetOptions target_options = - CompilerTargetOptions(modules[0]->config()); - auto target_machine_builder = [&]() { - return absl::WrapUnique(target->createTargetMachine( - triple.getTriple(), options.cpu_name(), options.features(), - target_options, reloc_model, std::nullopt, opt_level)); - }; - - std::unique_ptr target_machine = - target_machine_builder(); - - // Compile must be thread-safe so create a new LLVM context for the module. - mlir::MLIRContext mlir_context; - llvm::LLVMContext llvm_context; - - std::vector> results; - for (auto& hlo_module : modules) { - VLOG(1) << "Compiling ahead-of-time: " << hlo_module->name(); - if (hlo_module->has_schedule()) { - continue; - } - - TF_RETURN_IF_ERROR(RunHloPasses(hlo_module.get(), /*is_aot_compile=*/true, - target_machine.get(), - /*dummy*/ CompileOptions{})); - - if (hlo_module->config().debug_options().xla_cpu_use_thunk_runtime()) { - TF_ASSIGN_OR_RETURN(results.emplace_back(), - CompileAheadOfTimeThunks( - std::move(hlo_module), target_machine_builder, - options, triple, pic_level, pie_level)); - } else { - TF_ASSIGN_OR_RETURN(results.emplace_back(), - CompileAheadOfTimeLegacy( - std::move(hlo_module), target_machine_builder, - options, triple, pic_level, pie_level)); - } - } - - VLOG(1) << "Compilation finished"; - return std::move(results); -} - -absl::StatusOr> -CpuCompiler::CompileAheadOfTimeLegacy( - std::unique_ptr module, - IrCompiler::TargetMachineBuilder target_machine_builder, - const CpuAotCompilationOptions& aot_options, const llvm::Triple& triple, - const llvm::PICLevel::Level& pic_level, - const llvm::PIELevel::Level& pie_level) { - TF_ASSIGN_OR_RETURN(HloSchedule schedule, - 
ScheduleModule(module.get(), BufferSizeBytesFunction())); - - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. - TF_ASSIGN_OR_RETURN( - std::unique_ptr assignment, - BufferAssigner::Run(module.get(), - std::make_unique(schedule), - BufferSizeBytesFunction(), memory_alignment, - /*allocate_buffers_for_constants=*/true)); - // BufferAssignment::ToString() includes a header, so no need for us to - // print one ourselves. - if (DumpingEnabledForHloModule(*module)) { - DumpToFileInDirOrStdout(*module, "", "buffer_assignment", - assignment->ToString()); - } - DumpHloModuleIfEnabled(*module, *assignment, - absl::StrCat("cpu_", kAfterOptimizationsDumpName)); - - absl::flat_hash_map - instruction_to_profile_idx; - absl::flat_hash_map - computation_to_profile_idx; - std::unique_ptr hlo_profile_index_map; - std::unique_ptr hlo_profile_printer_data; - - if (module->config().hlo_profiling_enabled()) { - TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( - *module, &instruction_to_profile_idx, &computation_to_profile_idx, - &hlo_profile_index_map, &hlo_profile_printer_data)); - } - - TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, - target_machine_builder()); - TargetMachineFeatures target_machine_features(target_machine.get()); - std::vector buffer_infos = - CreateBufferInfosFromBufferAssignment(*module, *assignment); - HloComputation* computation = module->entry_computation(); - - // Compile must be thread-safe so create a new LLVM context for the module. 
- mlir::MLIRContext mlir_context; - auto llvm_context = std::make_unique(); - - // Set required information before emitting IR - auto llvm_module = - std::make_unique(kXlaModuleIdentifier, *llvm_context); - llvm_module->setDataLayout(target_machine->createDataLayout()); - llvm_module->setTargetTriple(triple); - if (pic_level != llvm::PICLevel::NotPIC) { - llvm_module->setPICLevel(pic_level); - } - if (pie_level != llvm::PIELevel::Default) { - llvm_module->setPIELevel(pie_level); - } - IrEmitter ir_emitter(&mlir_context, *module, *assignment, llvm_module.get(), - std::move(instruction_to_profile_idx), - std::move(computation_to_profile_idx), - ModuleComputationsTransitivelyContainCustomCall(*module), - &target_machine_features, - // TODO(b/66051036): Run full msan for AOT. - /*emit_code_for_msan=*/false); - - TF_RETURN_IF_ERROR(ir_emitter.EmitAllConstantGlobals()); - - for (ComputationToEmit subcomputation : - SubcomputationEmissionOrder(computation)) { - if (subcomputation.computation->IsFusionComputation()) { - continue; - } - TF_RETURN_IF_ERROR( - ir_emitter - .EmitComputation( - subcomputation.computation, subcomputation.computation->name(), - /*is_top_level_computation=*/false, - schedule.sequence(subcomputation.computation).instructions(), - subcomputation.allow_reassociation) - .status()); - } - const std::string& entry_point_name = aot_options.entry_point_name(); - TF_ASSIGN_OR_RETURN( - llvm::Function * entry_function, - ir_emitter.EmitComputation(computation, entry_point_name, - /*is_top_level_computation=*/true, - schedule.sequence(computation).instructions(), - /*allow_reassociation=*/false)); - - CHECK(entry_function->getName() == entry_point_name); - - ModuleHook pre_optimization_ir_hook; - ModuleHook post_optimization_ir_hook; - std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = - GetIRModuleHooks(*module, user_pre_optimization_hook_, - user_post_optimization_hook_); - - // Run the LLVM verifier over the unoptimized LLVM IR. 
If it fails, run - // the pre-optimization IR dump hook before returning. - { - absl::Status verify_status = VerifyLlvmModule(*llvm_module); - if (!verify_status.ok() && pre_optimization_ir_hook) { - pre_optimization_ir_hook(*llvm_module); - } - TF_RETURN_IF_ERROR(verify_status); - } - - auto post_codegen_hook = [&](const llvm::Module& llvm_module, - const llvm::object::ObjectFile& obj_file) { - if (!DumpingEnabledForHloModule(*module)) { - return; - } - DumpModuleToFile(llvm_module, obj_file, *module); - }; - - DebugOptions debug_options = module->config().debug_options(); - IrCompiler::Options ir_compiler_options = { - /*optimization_level=*/target_machine->getOptLevel(), - /*optimize_for_size=*/ - options::OptimizeForSizeRequested(module->config()), - /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), - /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(module->config()), - /*disable_expensive_passes=*/ - debug_options.xla_llvm_disable_expensive_passes(), - /*disable_slp_vectorizer=*/ - options::SlpVectorizerDisabled(module->config()), - /*disable_loop_unrolling=*/ - options::DisableLoopUnrolling(module->config()), - /*dfsan_enabled=*/aot_options.sanitize_dataflow(), - /*dfsan_abilists_enabled=*/aot_options.sanitize_abilists_dataflow()}; - - IrCompiler::CompilationHooks ir_compiler_hooks = { - pre_optimization_ir_hook, - post_optimization_ir_hook, - post_codegen_hook, - }; - - IrCompiler ir_compiler(std::move(target_machine_builder), - std::move(ir_compiler_options), - std::move(ir_compiler_hooks)); - - std::unique_ptr object_file = - cantFail(ir_compiler(*llvm_module)); - ObjectFileData object_file_data(object_file->getBufferStart(), - object_file->getBufferEnd()); - - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, - assignment->GetUniqueTopLevelOutputSlice()); - - return std::make_unique( - std::move(object_file_data), std::move(buffer_infos), - result_slice.index(), std::move(module), - std::move(hlo_profile_printer_data)); 
-} - -absl::StatusOr> -CpuCompiler::CompileAheadOfTimeThunks( - std::unique_ptr module, - IrCompiler::TargetMachineBuilder target_machine_builder, - const CpuAotCompilationOptions& aot_options, const llvm::Triple& triple, - const llvm::PICLevel::Level& pic_level, - const llvm::PIELevel::Level& pie_level) { - TraceMe trace([&] { - return TraceMeEncode("CpuCompiler::CompileAheadOfTimeThunks", - {{"name", module->name()}}); - }); - // Compile must be thread-safe so create a new LLVM context for the module. - mlir::MLIRContext mlir_context; - auto llvm_context = std::make_unique(); - - const DebugOptions& debug_options = module->config().debug_options(); - - TF_ASSIGN_OR_RETURN(HloSchedule schedule, CreateHloSchedule(*module)); - TF_RETURN_IF_ERROR(module->set_schedule(schedule)); - - TF_ASSIGN_OR_RETURN(std::unique_ptr assignment, - CreateBufferAssignment(*module)); - DumpHloModuleIfEnabled(*module, *assignment, - absl::StrCat("cpu_aot_", kAfterOptimizationsDumpName)); - - // TODO profiling related, probably delete this - absl::flat_hash_map - instruction_to_profile_idx; - absl::flat_hash_map - computation_to_profile_idx; - std::unique_ptr hlo_profile_index_map; - std::unique_ptr hlo_profile_printer_data; - if (module->config().hlo_profiling_enabled()) { - TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( - *module, &instruction_to_profile_idx, &computation_to_profile_idx, - &hlo_profile_index_map, &hlo_profile_printer_data)); - } - // probably delete this end - - TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, - target_machine_builder()); - TargetMachineFeatures target_machine_features(target_machine.get()); - - auto llvm_module = - std::make_unique(kXlaModuleIdentifier, *llvm_context); - - llvm_module->setDataLayout(target_machine->createDataLayout()); - llvm_module->setTargetTriple(triple); - if (pic_level != llvm::PICLevel::NotPIC) { - llvm_module->setPICLevel(pic_level); - } - if (pie_level != llvm::PIELevel::Default) { - llvm_module->setPIELevel(pie_level); - 
} - - // Emitting part - // TODO(ezhulenev): Once we fully migrate to Thunks current IrEmitter should - // be renamed to NestedIrEmitter and be used only for emitting nested (aka - // thread local or embedded) computations (reductions, maps, etc.). - - // (Nested) IrEmitter is responsible for building LLVM module with functions - // for all HLO computations. In thunk execution mode we only build LLVM - // functions for embedded computations (e.g. reduction computations) and all - // high-level operations (fusions, elementwise, etc.) are lowered to kernel - // functions (which are also LLVM functions, but use a HostKernel ABI). - IrEmitter nested_ir_emitter( - &mlir_context, *module, *assignment, llvm_module.get(), - std::move(instruction_to_profile_idx), - std::move(computation_to_profile_idx), - ModuleComputationsTransitivelyContainCustomCall(*module), - &target_machine_features, - // TODO(b/66051036): Run full msan for AOT. - /*emit_code_for_msan=*/false); - - // The thunk runtime manages large constants, therefore we only emit - // small ones. - TF_RETURN_IF_ERROR(nested_ir_emitter.EmitSmallConstantGlobals()); - - // IR emitter is responsible for building LLVM module with host kernels for - // corresponding HLO instructions (fusions, elemental instructions, etc.). - IrEmitter2 ir_emitter2(*module, llvm_module.get(), &nested_ir_emitter); - - // Thunk emitter is responsible for building a Thunk sequence that will - // resolved kernels in the compiled LLVM module and execute them together - // with Thunks implemented as library calls (e.g. oneDNN or Eigen). - ThunkEmitter thunk_emitter(ir_emitter2, *assignment, target_machine_features, - module->config()); - TF_ASSIGN_OR_RETURN(ThunkSequence thunks, - thunk_emitter.EmitEntryComputation(*module)); - - // Cache these flags here since we'll want to access them after the module's - // ownership is std::moved. 
- const bool embed_ir_in_executable = - debug_options.xla_embed_ir_in_executable(); - - std::string ir_module_string; - if (embed_ir_in_executable) { - std::string emitter2_ir = llvm_ir::DumpToString(llvm_module.get()); - - auto thunk_kernel_fmt = [](std::string* out, - const ThunkEmitter::EmittedKernel& kernel) { - absl::StrAppend(out, - llvm_ir::DumpToString(kernel.module.getModuleUnlocked())); - }; - std::string thunks_ir = - absl::StrJoin(thunk_emitter.kernels(), "\n", thunk_kernel_fmt); - - ir_module_string = absl::StrCat(emitter2_ir, "\n", thunks_ir); - } - - TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); - for (const auto& [name, module] : thunk_emitter.kernels()) { - TF_RETURN_IF_ERROR(VerifyLlvmModule(*module.getModuleUnlocked())); - } - - // Compilation part - ModuleHook pre_optimization_ir_hook; - ModuleHook post_optimization_ir_hook; - std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = - GetIRModuleHooks(*module, user_pre_optimization_hook_, - user_post_optimization_hook_); - - std::vector obj_files; - auto post_codegen_hook = [&](const llvm::Module& llvm_module, - const llvm::object::ObjectFile& obj_file) { - obj_files.push_back(obj_file.getData().str()); - if (!DumpingEnabledForHloModule(*module)) { - return; - } - absl::string_view id = llvm_module.getModuleIdentifier(); - size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); - DumpToFileInDir( - *module, /*file_prefix=*/"", - /*file_suffix=*/absl::StrCat("obj-file.", id.substr(pos), ".o"), - absl::string_view(obj_file.getData().data(), - obj_file.getData().size())); - }; - - IrCompiler::Options ir_compiler_options = { - /*optimization_level=*/target_machine->getOptLevel(), - /*optimize_for_size=*/ - options::OptimizeForSizeRequested(module->config()), - /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), - /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(module->config()), - /*disable_expensive_passes=*/ - 
module->config().debug_options().xla_llvm_disable_expensive_passes(), - /*disable_slp_vectorizer=*/ - options::SlpVectorizerDisabled(module->config()), - /*disable_loop_unrolling=*/ - options::DisableLoopUnrolling(module->config()), - /*dfsan_enabled=*/aot_options.sanitize_dataflow(), - /*dfsan_abilists_enabled=*/aot_options.sanitize_abilists_dataflow()}; - - IrCompiler::CompilationHooks ir_compiler_hooks = { - pre_optimization_ir_hook, - post_optimization_ir_hook, - post_codegen_hook, - }; - - IrCompiler ir_compiler(std::move(target_machine_builder), - std::move(ir_compiler_options), - std::move(ir_compiler_hooks)); - - // For simplicity no parallel compilation is used. - std::vector compiled_parts; - compiled_parts.push_back( - CollectCompiledSymbolsPart(ir_emitter2, *llvm_module)); - - // Collect compiled symbols from all LLVM module parts. - std::vector compiled_symbols; - - absl::flat_hash_map - symbol_type_id_to_function_type_id; - - VLOG(3) << "Compiling " << thunk_emitter.kernels().size() - << " thunk kernels."; - - // We have to clone the LLVM module into a local context to be able to link - // it with the other modules. This enables us to have one object file for all - // the kernels. - auto copy_llvm_module_to_local_context = - [&llvm_context](llvm::Module& module) { - // There is no way to clone a module from one context to another, so we - // need to serialize the module to bitcode and parse it back into the - // new context. - llvm::SmallString<0> bc; - llvm::raw_svector_ostream bcos(bc); - llvm::WriteBitcodeToFile(module, bcos); - - // Parse module back into its own LLVM context. 
- auto clone_module = llvm::parseBitcodeFile( - llvm::MemoryBufferRef(llvm::StringRef(bc.data(), bc.size()), - absl::StrFormat("%s_cloned_to_local_context", - kXlaModuleIdentifier)), - *llvm_context); - - return clone_module; - }; - - llvm::Linker linker(*llvm_module); - - for (auto& [name, module] : thunk_emitter.kernels()) { - compiled_symbols.push_back( - FunctionLibrary::Sym(name)); - symbol_type_id_to_function_type_id.emplace(compiled_symbols.back().type_id, - SymbolProto::KERNEL); - auto cloned_module = - copy_llvm_module_to_local_context(*module.getModuleUnlocked()); - if (!cloned_module) { - return Internal("Failed to clone LLVM module."); - } - // Match data layouts to avoid warning messages. - cloned_module->get()->setDataLayout(llvm_module->getDataLayout()); - linker.linkInModule(std::move(cloned_module.get())); - } - - cantFail(ir_compiler(*llvm_module)); - - for (const CompiledSymbolsPart& part : compiled_parts) { - for (const IrEmitter2::KernelInfo& kernel : part.kernels) { - compiled_symbols.push_back( - FunctionLibrary::Sym(kernel.name)); - symbol_type_id_to_function_type_id.emplace( - compiled_symbols.back().type_id, SymbolProto::KERNEL); - } - for (const IrEmitter2::ComparatorInfo& comparator : part.comparators) { - compiled_symbols.push_back( - FunctionLibrary::Sym(comparator.name)); - symbol_type_id_to_function_type_id.emplace( - compiled_symbols.back().type_id, SymbolProto::COMPARATOR); - } - } - - VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols"; - - // Create constant allocations from the buffer assignment. - TF_ASSIGN_OR_RETURN(std::vector constants, - CreateConstantAllocations(*assignment)); - - TF_ASSIGN_OR_RETURN( - auto cpu_executable, - CpuExecutable::Create( - /*function_library=*/nullptr, // NOTE: We don't need to generate a - // function library as the only purpose - // of this executable is to get - // exported. 
- std::move(assignment), std::move(module), std::move(thunks), - std::move(constants), std::move(hlo_profile_printer_data), - std::move(hlo_profile_index_map))); - - // Save compiled symbols to be able to export them to AOT compilation - // result. - cpu_executable->set_compiled_symbols(std::move(compiled_symbols)); - - // Save mapping between symbol type id and function type id to be able to - // export them to AOT compilation result. - cpu_executable->set_symbol_type_id_to_function_type_id( - symbol_type_id_to_function_type_id); - - if (embed_ir_in_executable) { - cpu_executable->set_ir_module_string(ir_module_string); - } - - // Dump computation proto state and buffer assignment for - // GetCompiledMemoryStats results. - auto with_hlo_proto = [&](std::unique_ptr cpu_executable) { - auto hlo_proto = std::make_unique(); - *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); - *hlo_proto->mutable_buffer_assignment() = - cpu_executable->buffer_assignment().ToProto(); - StripPayloadFromLiteralProto(*hlo_proto); - cpu_executable->set_hlo_proto(std::move(hlo_proto)); - return cpu_executable; - }; - - cpu_executable = with_hlo_proto(std::move(cpu_executable)); - - const ThunkSequence& thunk_sequence = - cpu_executable->thunks().thunk_sequence(); - - std::unique_ptr executable_hlo_profile_printer_data = - cpu_executable->module().config().hlo_profiling_enabled() - ? 
std::make_unique( - cpu_executable->hlo_profile_printer_data()) - : nullptr; - - return CpuAotCompilationResultThunks::Create( - &cpu_executable->module(), &cpu_executable->buffer_assignment(), - cpu_executable->module_name(), std::move(obj_files), - cpu_executable->get_compiled_symbols_proto(), thunk_sequence, - std::move(*cpu_executable).consume_function_library().release(), - std::move(executable_hlo_profile_printer_data)); -} - -se::Platform::Id CpuCompiler::PlatformId() const { - return se::host::kHostPlatformId; -} - -HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const { - return CpuExecutable::ShapeSizeBytes; -} - -namespace { - -// TODO(basioli): This should be removed once new runtime is implemented, and -// CpuAotCompilationResult will be the only implementation of -// AotCompilationResult. This is still used as it allows us to `Export` and -// subsequently load both runtimes. - -// This is a result of exporting JIT compiled -// CpuExecutable to AOT compilation result that can be saved on disk and shipped -// over the wire. 
-class CpuExecutableAotCompilationResult : public AotCompilationResult { - public: - static absl::StatusOr> - Create(const HloModule* hlo_module, const BufferAssignment* buffer_assignment, - absl::string_view function_name, std::vector obj_files, - std::vector symbols, const ThunkSequence* thunks, - CompilationResultProto::ObjFileKind obj_file_kind) { - std::optional thunk_proto; - - if (thunks != nullptr) { - ThunkSequenceSerDesProtobuf thunk_sequence_serdes( - &buffer_assignment->Allocations()); - TF_ASSIGN_OR_RETURN(thunk_proto, thunk_sequence_serdes.ToProto(*thunks)); - } - - return absl::WrapUnique(new CpuExecutableAotCompilationResult( - hlo_module, buffer_assignment, function_name, std::move(obj_files), - std::move(symbols), thunk_proto, obj_file_kind)); - } - - absl::StatusOr SerializeAsString() const override { - return proto_.SerializeAsString(); - } - - static absl::StatusOr> - FromString(const std::string& serialized) { - CompilationResultProto proto; - if (!proto.ParseFromString(serialized)) { - return Internal( - "Failed to parse serialized CpuExecutableAotCompilationResult."); - } - - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - HloModule::CreateFromProtoWithConfig(proto.hlo_module())); - - return std::unique_ptr( - new CpuExecutableAotCompilationResult(proto, std::move(module))); - } - - absl::StatusOr> LoadExecutable( - Compiler* compiler, - const se::StreamExecutor* stream_exec) const&& override; - - const HloModule* optimized_module() const override { return module_.get(); } - - std::unique_ptr consume_optimized_module() override { - return std::move(module_); - } - - private: - CpuExecutableAotCompilationResult( - const HloModule* hlo_module, const BufferAssignment* buffer_assignment, - absl::string_view function_name, std::vector obj_files, - std::vector symbols, - const std::optional& thunks, - CompilationResultProto::ObjFileKind obj_file_kind) { - *proto_.mutable_hlo_module()->mutable_hlo_module() = hlo_module->ToProto(); - 
*proto_.mutable_hlo_module()->mutable_config() = - hlo_module->config().ToProto(); - *proto_.mutable_buffer_assignment() = buffer_assignment->ToProto(); - proto_.set_entry_function_name(std::string(function_name)); - for (std::string& obj_file : obj_files) { - proto_.add_obj_files(std::move(obj_file)); - } - - for (const auto& symbol : symbols) { - auto* symbol_proto = proto_.add_compiled_symbols(); - *symbol_proto = symbol; - } - proto_.set_obj_files_kind(obj_file_kind); - module_ = hlo_module->Clone(); - - if (thunks.has_value()) { - ThunkSequenceSerDesProtobuf thunk_sequence_serdes( - &buffer_assignment->Allocations()); - *proto_.mutable_thunk_sequence() = *thunks; - } - } - - explicit CpuExecutableAotCompilationResult(CompilationResultProto proto, - std::unique_ptr module) - : proto_(std::move(proto)), module_(std::move(module)) {} - - CompilationResultProto proto_; - std::unique_ptr module_; -}; - -} // namespace - -absl::StatusOr> -CpuExecutableAotCompilationResult::LoadExecutable( - Compiler* compiler, const se::StreamExecutor* stream_exec) const&& { - // Recreate HloModule from proto. - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - HloModule::CreateFromProtoWithConfig(proto_.hlo_module())); - - VLOG(2) << "Load XLA:CPU executable for module: " << module->name(); - - // Recreate BufferAssignment from proto. - TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer_assignment, - BufferAssignment::FromProto(proto_.buffer_assignment(), module.get(), - compiler->BufferSizeBytesFunction(), - /*can_share_buffer=*/nullptr)); - - const DebugOptions& debug_options = module->config().debug_options(); - VlogMaxIsa(debug_options.xla_cpu_max_isa()); - const HloModuleConfig& config = module->config(); - - // Infer target machine from the current host CPU. 
- TF_ASSIGN_OR_RETURN( - std::unique_ptr target_machine, - IrCompiler::InferTargetMachine( - std::move(CompilerTargetOptions(module->config())), - IrCompiler::GetCodeGenOptLevel(config), - CpuFeatureFromString(debug_options.xla_cpu_max_isa()))); - - // Definition generator to link with XLA:CPU host runtime symbols. - ExecutionEngine::DefinitionGenerator definition_generator = - [](const llvm::DataLayout& data_layout) { - return std::make_unique(data_layout); - }; - - ObjectLoader object_loader(/*num_dylibs=*/1, - target_machine->createDataLayout(), - definition_generator); - - for (size_t i = 0; i < object_loader.num_dylibs(); ++i) { - object_loader.dylib(i).value()->addGenerator( - std::make_unique( - target_machine->createDataLayout())); - } - - // We might have an XLA:CPU executable that has only runtime thunks and - // doesn't have any corresponding object files, and it's absolutely fine. - VLOG(2) << "Load XLA:CPU executable from " << proto_.obj_files_size() - << " object files; entry_function_name=" - << proto_.entry_function_name(); - - size_t obj_file_index = 0; - for (auto& obj_file : proto_.obj_files()) { - llvm::StringRef data(obj_file.data(), obj_file.size()); - TF_RETURN_IF_ERROR( - object_loader.AddObjFile(llvm::MemoryBuffer::getMemBuffer( - data, absl::StrCat(proto_.entry_function_name(), "_", - obj_file_index++)))); - } - - std::unique_ptr cpu_executable; - - if (proto_.obj_files_kind() == CompilationResultProto::KERNELS) { - ThunkSequenceSerDesProtobuf thunk_sequence_serdes( - &buffer_assignment->Allocations()); - TF_ASSIGN_OR_RETURN( - std::unique_ptr thunks, - thunk_sequence_serdes.FromProto(proto_.thunk_sequence())); - - VLOG(3) << "Loaded " << thunks->size() << " thunks."; - - std::vector compiled_symbols; - - for (const auto& symbol_proto : proto_.compiled_symbols()) { - switch (symbol_proto.function_type_id()) { - case SymbolProto::KERNEL: - compiled_symbols.push_back( - FunctionLibrary::Sym( - symbol_proto.name())); - break; - case 
SymbolProto::COMPARATOR: - compiled_symbols.push_back( - FunctionLibrary::Sym( - symbol_proto.name())); - break; - default: - return Internal( - "Unknown function type id %s", - SymbolProto_FunctionTypeId_Name(symbol_proto.function_type_id())); - } - } - - VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols"; - TF_ASSIGN_OR_RETURN(std::unique_ptr function_library, - std::move(object_loader).Load(compiled_symbols)); - - // Create constant allocations from the buffer assignment. - TF_ASSIGN_OR_RETURN(std::vector constants, - CreateConstantAllocations(*buffer_assignment)); - - TF_ASSIGN_OR_RETURN( - cpu_executable, - CpuExecutable::Create(std::move(function_library), - std::move(buffer_assignment), std::move(module), - std::move(*thunks), std::move(constants), nullptr, - nullptr)); - - } else if (proto_.obj_files_kind() == CompilationResultProto::CLASSIC) { - // Create a "classic" CPU executable. - using ComputeFn = std::remove_pointer_t; - TF_ASSIGN_OR_RETURN(std::unique_ptr function_library, - std::move(object_loader) - .Load({FunctionLibrary::Sym( - proto_.entry_function_name())})); - - TF_ASSIGN_OR_RETURN( - cpu_executable, - CpuExecutable::Create(std::move(function_library), - std::move(buffer_assignment), std::move(module), - proto_.entry_function_name(), nullptr, nullptr)); - - } else { - return Internal("Unknown obj file kind"); - } - - // Dump computation proto state and buffer assignment for - // GetCompiledMemoryStats results. 
- auto hlo_proto = std::make_unique(); - *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); - *hlo_proto->mutable_buffer_assignment() = - cpu_executable->buffer_assignment().ToProto(); - cpu_executable->set_hlo_proto(std::move(hlo_proto)); - - return cpu_executable; -} - -absl::StatusOr> CpuCompiler::Export( - Executable* executable) const { - auto* cpu_executable = tensorflow::down_cast(executable); - if (!cpu_executable) - return Internal("Could not downcast Executable to CpuExecutable"); - - // Export object files for all dylibs. - std::vector obj_files; - for (const auto& obj_file : cpu_executable->obj_files()) { - obj_files.push_back(std::string(obj_file)); - } - - auto kind = cpu_executable->has_thunks() ? CompilationResultProto::KERNELS - : CompilationResultProto::CLASSIC; - const ThunkSequence* thunk_sequence = - cpu_executable->has_thunks() ? &cpu_executable->thunks().thunk_sequence() - : nullptr; - - std::vector compiled_symbols = - cpu_executable->get_compiled_symbols_proto(); - - return CpuExecutableAotCompilationResult::Create( - &cpu_executable->module(), &cpu_executable->buffer_assignment(), - cpu_executable->module_name(), std::move(obj_files), - std::move(compiled_symbols), thunk_sequence, kind); -} - -absl::StatusOr> -CpuCompiler::LoadAotCompilationResult( - const std::string& serialized_aot_result) { - return CpuExecutableAotCompilationResult::FromString(serialized_aot_result); -} - -absl::StatusOr CpuCompiler::CreateHloSchedule( - const HloModule& hlo_module) const { - // Select a memory scheduler optimized for concurrency vs minimal memory. - auto scheduler = - hlo_module.config() - .debug_options() - .xla_cpu_enable_concurrency_optimized_scheduler() - ? std::unique_ptr( - std::make_unique(BufferSizeBytesFunction())) - : std::make_unique(BufferSizeBytesFunction()); - - // Select an order for emitting the HLO instructions for each - // computation. 
Using this sequence enables tighter buffer liveness analysis - // and reduced memory usage (as compared to using `DependencyHloOrdering`). - return ScheduleModule(&hlo_module, *scheduler); -} - -absl::StatusOr> -CpuCompiler::CreateBufferAssignment(const HloModule& module) const { - // Run buffer allocation on the HLO graph. - return BufferAssigner::Run( - &module, std::make_unique(module.schedule()), - BufferSizeBytesFunction(), memory_alignment, - /*allocate_buffers_for_constants=*/true); -} - -} // namespace cpu -} // namespace xla diff --git a/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h b/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h index 68da3fd55523df..a04432292b43f1 100644 --- a/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h +++ b/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h @@ -31,8 +31,8 @@ namespace cpu { class CpuInstructionFusion : public InstructionFusion { public: - CpuInstructionFusion() - : InstructionFusion(CpuInstructionFusion::IsExpensive) {} + CpuInstructionFusion(bool may_duplicate) + : InstructionFusion(CpuInstructionFusion::IsExpensive, may_duplicate) {} ~CpuInstructionFusion() override = default; using HloPassInterface::Run;