Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 54 additions & 15 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
Original file line number Diff line number Diff line change
Expand Up @@ -136,17 +136,7 @@ source %{
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
opcode == Op_MulVL ||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
// Note: we could implement sequential reductions for these reduction operators, but
// this will still almost never lead to speedups, because the sequential
// reductions are latency limited along the reduction chain, and not
// throughput limited. This is unlike unordered reductions (associative op)
// and element-wise ops which are usually throughput limited.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
opcode == Op_MulVL) {
return false;
}
}
Expand Down Expand Up @@ -3377,6 +3367,40 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{

// reduction addF

// Strictly-ordered add reduction for a 2-element float vector on NEON-only
// machines (UseSVE == 0). The lanes are accumulated sequentially —
// dst = (dst_src1 + vsrc[0]) + vsrc[1] — so the result conforms to the
// evaluation order required when requires_strict_order() is true
// (e.g. AddReductionVF produced by auto-vectorization).
instruct reduce_strict_order_add2F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
predicate(UseSVE == 0 &&
Matcher::vector_length(n->in(2)) == 2 &&
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add2F_neon $dst_src1, $dst_src1, $vsrc\t# 2F, strict order" %}
ins_encode %{
// Scalar FADD reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// Move lane 1 of $vsrc into lane 0 of $tmp so it can be added as a scalar.
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
// dst += vsrc[1], completing the sequential accumulation.
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

// Strictly-ordered add reduction for a 4-element float vector on NEON-only
// machines (UseSVE == 0). Each lane is added one at a time —
// dst = (((dst_src1 + vsrc[0]) + vsrc[1]) + vsrc[2]) + vsrc[3] — preserving
// the sequential evaluation order required when requires_strict_order() is
// true (e.g. AddReductionVF produced by auto-vectorization).
instruct reduce_strict_order_add4F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
predicate(UseSVE == 0 &&
Matcher::vector_length(n->in(2)) == 4 &&
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add4F_neon $dst_src1, $dst_src1, $vsrc\t# 4F, strict order" %}
ins_encode %{
// Scalar FADD reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// For each remaining lane i (1..3): copy vsrc[i] into lane 0 of $tmp,
// then add it to the running scalar accumulator.
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
Expand Down Expand Up @@ -3415,8 +3439,9 @@ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vR
// 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by
// auto-vectorization on SVE machine.
instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
predicate(UseSVE > 0 &&
(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order()));
match(Set dst_src1 (AddReductionVF dst_src1 src2));
format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
Expand All @@ -3430,6 +3455,19 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{

// reduction addD

// Strictly-ordered add reduction for a 2-element double vector on NEON-only
// machines (UseSVE == 0). No vector-length check is needed: 2 doubles is the
// only NEON double-vector shape matched here. Lanes are accumulated
// sequentially — dst = (dst_src1 + vsrc[0]) + vsrc[1] — to satisfy
// requires_strict_order() (e.g. AddReductionVD from auto-vectorization).
instruct reduce_strict_order_add2D_neon(vRegD dst_src1, vReg vsrc, vRegD tmp) %{
predicate(UseSVE == 0 && n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVD dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add2D_neon $dst_src1, $dst_src1, $vsrc\t# 2D, strict order" %}
ins_encode %{
// Scalar FADD (double) reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// Move lane 1 of $vsrc into lane 0 of $tmp so it can be added as a scalar.
__ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);
// dst += vsrc[1], completing the sequential accumulation.
__ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
Expand All @@ -3453,8 +3491,9 @@ instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
// 2. Strictly-ordered AddReductionVD. For example - AddReductionVD generated by
// auto-vectorization on SVE machine.
instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
predicate(UseSVE > 0 &&
(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order()));
match(Set dst_src1 (AddReductionVD dst_src1 src2));
format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
Expand Down
64 changes: 51 additions & 13 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,7 @@ source %{
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
opcode == Op_MulVL ||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
// Note: we could implement sequential reductions for these reduction operators, but
// this will still almost never lead to speedups, because the sequential
// reductions are latency limited along the reduction chain, and not
// throughput limited. This is unlike unordered reductions (associative op)
// and element-wise ops which are usually throughput limited.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
opcode == Op_MulVL) {
return false;
}
}
Expand Down Expand Up @@ -2034,6 +2024,40 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL)

// reduction addF

// m4 template source of the strictly-ordered 2-element float add reduction
// for NEON-only machines (UseSVE == 0); keep in sync with the generated
// aarch64_vector.ad. Lanes are accumulated sequentially —
// dst = (dst_src1 + vsrc[0]) + vsrc[1] — to satisfy requires_strict_order().
instruct reduce_strict_order_add2F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
predicate(UseSVE == 0 &&
Matcher::vector_length(n->in(2)) == 2 &&
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add2F_neon $dst_src1, $dst_src1, $vsrc\t# 2F, strict order" %}
ins_encode %{
// Scalar FADD reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// Move lane 1 of $vsrc into lane 0 of $tmp, then add it as a scalar.
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

// m4 template source of the strictly-ordered 4-element float add reduction
// for NEON-only machines (UseSVE == 0); keep in sync with the generated
// aarch64_vector.ad. Lanes are added one at a time —
// dst = (((dst_src1 + vsrc[0]) + vsrc[1]) + vsrc[2]) + vsrc[3] — to satisfy
// requires_strict_order().
instruct reduce_strict_order_add4F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
predicate(UseSVE == 0 &&
Matcher::vector_length(n->in(2)) == 4 &&
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add4F_neon $dst_src1, $dst_src1, $vsrc\t# 4F, strict order" %}
ins_encode %{
// Scalar FADD reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// For each remaining lane i (1..3): copy vsrc[i] into lane 0 of $tmp,
// then add it to the running scalar accumulator.
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
Expand Down Expand Up @@ -2075,8 +2099,9 @@ define(`REDUCE_ADD_FP_SVE', `
// 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by
// auto-vectorization on SVE machine.
instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
predicate(UseSVE > 0 &&
(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order()));
match(Set dst_src1 (AddReductionV$1 dst_src1 src2));
format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
Expand All @@ -2092,6 +2117,19 @@ REDUCE_ADD_FP_SVE(F, S)

// reduction addD

// m4 template source of the strictly-ordered 2-element double add reduction
// for NEON-only machines (UseSVE == 0); keep in sync with the generated
// aarch64_vector.ad. Lanes are accumulated sequentially —
// dst = (dst_src1 + vsrc[0]) + vsrc[1] — to satisfy requires_strict_order().
instruct reduce_strict_order_add2D_neon(vRegD dst_src1, vReg vsrc, vRegD tmp) %{
predicate(UseSVE == 0 && n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVD dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add2D_neon $dst_src1, $dst_src1, $vsrc\t# 2D, strict order" %}
ins_encode %{
// Scalar FADD (double) reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// Move lane 1 of $vsrc into lane 0 of $tmp, then add it as a scalar.
__ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);
__ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,46 +76,6 @@ private static void testMulVL() {
}
}

// IR-framework test: compiles a sequential float add-reduction loop and,
// per the @IR rule, fails if an ADD_REDUCTION_VF node appears in the IR.
// Float addition is non-associative, so the loop's result depends on
// evaluation order.
@Test
@IR(failOn = {IRNode.ADD_REDUCTION_VF})
private static void testAddReductionVF() {
float result = 1;
for(int i = 0; i < SIZE; i++) {
result += (floata[i] + floatb[i]);
}
// Accumulate into a field so the loop result is observably used.
fresult += result;
}

// IR-framework test: compiles a sequential double add-reduction loop and,
// per the @IR rule, fails if an ADD_REDUCTION_VD node appears in the IR.
// Double addition is non-associative, so the loop's result depends on
// evaluation order.
@Test
@IR(failOn = {IRNode.ADD_REDUCTION_VD})
private static void testAddReductionVD() {
double result = 1;
for(int i = 0; i < SIZE; i++) {
result += (doublea[i] + doubleb[i]);
}
// Accumulate into a field so the loop result is observably used.
dresult += result;
}

// IR-framework test: compiles a sequential float multiply-reduction loop
// and, per the @IR rule, fails if a MUL_REDUCTION_VF node appears in the
// IR. Note the loop multiplies the accumulator by an element-wise sum.
@Test
@IR(failOn = {IRNode.MUL_REDUCTION_VF})
private static void testMulReductionVF() {
float result = 1;
for(int i = 0; i < SIZE; i++) {
result *= (floata[i] + floatb[i]);
}
// Accumulate into a field so the loop result is observably used.
fresult += result;
}

// IR-framework test: compiles a sequential double multiply-reduction loop
// and, per the @IR rule, fails if a MUL_REDUCTION_VD node appears in the
// IR. Note the loop multiplies the accumulator by an element-wise sum.
@Test
@IR(failOn = {IRNode.MUL_REDUCTION_VD})
private static void testMulReductionVD() {
double result = 1;
for(int i = 0; i < SIZE; i++) {
result *= (doublea[i] + doubleb[i]);
}
// Accumulate into a field so the loop result is observably used.
dresult += result;
}

@Test
@IR(failOn = {IRNode.COUNT_TRAILING_ZEROS_VL})
public void testNumberOfTrailingZeros() {
Expand Down
Loading