diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad
index 19f03d97a72e2..8d6aa9b6b5069 100644
--- a/src/hotspot/cpu/aarch64/aarch64_vector.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -136,17 +136,7 @@ source %{
           (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
           (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
           (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
-          opcode == Op_MulVL ||
-          // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
-          // They are not suitable for auto-vectorization because the result would not conform
-          // to the JLS, Section Evaluation Order.
-          // Note: we could implement sequential reductions for these reduction operators, but
-          //       this will still almost never lead to speedups, because the sequential
-          //       reductions are latency limited along the reduction chain, and not
-          //       throughput limited. This is unlike unordered reductions (associative op)
-          //       and element-wise ops which are usually throughput limited.
-          opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
-          opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
+          opcode == Op_MulVL) {
         return false;
       }
     }
@@ -3377,6 +3367,40 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
 
 // reduction addF
 
+instruct reduce_strict_order_add2F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
+  predicate(UseSVE == 0 &&
+            Matcher::vector_length(n->in(2)) == 2 &&
+            n->as_Reduction()->requires_strict_order());
+  match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
+  effect(TEMP tmp);
+  format %{ "reduce_strict_order_add2F_neon $dst_src1, $dst_src1, $vsrc\t# 2F, strict order" %}
+  ins_encode %{  // strict order: the lanes are added one at a time, lowest lane first
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);  // add lane 0
+    __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);  // tmp lane 0 = vsrc lane 1
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);  // add lane 1
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_strict_order_add4F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
+  predicate(UseSVE == 0 &&
+            Matcher::vector_length(n->in(2)) == 4 &&
+            n->as_Reduction()->requires_strict_order());
+  match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
+  effect(TEMP tmp);
+  format %{ "reduce_strict_order_add4F_neon $dst_src1, $dst_src1, $vsrc\t# 4F, strict order" %}
+  ins_encode %{  // strict order: the lanes are added one at a time, lowest lane first
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);  // add lane 0
+    __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);  // tmp lane 0 = vsrc lane 1
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);  // add lane 1
+    __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);  // tmp lane 0 = vsrc lane 2
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);  // add lane 2
+    __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);  // tmp lane 0 = vsrc lane 3
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);  // add lane 3
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
   // Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
   // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
@@ -3415,8 +3439,9 @@ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vR
 // 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by
 //    auto-vectorization on SVE machine.
 instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
-  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
-            n->as_Reduction()->requires_strict_order());
+  predicate(UseSVE > 0 &&
+            (!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
+             n->as_Reduction()->requires_strict_order()));
   match(Set dst_src1 (AddReductionVF dst_src1 src2));
   format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %}
   ins_encode %{
@@ -3430,6 +3455,19 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
 
 // reduction addD
 
+instruct reduce_strict_order_add2D_neon(vRegD dst_src1, vReg vsrc, vRegD tmp) %{
+  predicate(UseSVE == 0 && n->as_Reduction()->requires_strict_order());
+  match(Set dst_src1 (AddReductionVD dst_src1 vsrc));
+  effect(TEMP tmp);  // NOTE(review): predicate has no lane-count check, presumably always 2 here - confirm
+  format %{ "reduce_strict_order_add2D_neon $dst_src1, $dst_src1, $vsrc\t# 2D, strict order" %}
+  ins_encode %{  // strict order: the lanes are added one at a time, lowest lane first
+    __ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);  // add lane 0
+    __ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);  // tmp lane 0 = vsrc lane 1
+    __ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);  // add lane 1
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
   // Non-strictly ordered floating-point add reduction for doubles. This rule is
   // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
@@ -3453,8 +3491,9 @@ instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
 // 2. Strictly-ordered AddReductionVD. For example - AddReductionVD generated by
 //    auto-vectorization on SVE machine.
 instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{
-  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
-            n->as_Reduction()->requires_strict_order());
+  predicate(UseSVE > 0 &&
+            (!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
+             n->as_Reduction()->requires_strict_order()));
   match(Set dst_src1 (AddReductionVD dst_src1 src2));
   format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %}
   ins_encode %{
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
index 48bffb3cf3588..fb5194f689588 100644
--- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -126,17 +126,7 @@ source %{
           (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
           (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
           (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
-          opcode == Op_MulVL ||
-          // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
-          // They are not suitable for auto-vectorization because the result would not conform
-          // to the JLS, Section Evaluation Order.
-          // Note: we could implement sequential reductions for these reduction operators, but
-          //       this will still almost never lead to speedups, because the sequential
-          //       reductions are latency limited along the reduction chain, and not
-          //       throughput limited. This is unlike unordered reductions (associative op)
-          //       and element-wise ops which are usually throughput limited.
-          opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
-          opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
+          opcode == Op_MulVL) {
         return false;
       }
     }
@@ -2034,6 +2024,40 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL)
 
 // reduction addF
 
+instruct reduce_strict_order_add2F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
+  predicate(UseSVE == 0 &&
+            Matcher::vector_length(n->in(2)) == 2 &&
+            n->as_Reduction()->requires_strict_order());
+  match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
+  effect(TEMP tmp);
+  format %{ "reduce_strict_order_add2F_neon $dst_src1, $dst_src1, $vsrc\t# 2F, strict order" %}
+  ins_encode %{  // strict order: the lanes are added one at a time, lowest lane first
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);  // add lane 0
+    __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);  // tmp lane 0 = vsrc lane 1
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);  // add lane 1
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_strict_order_add4F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
+  predicate(UseSVE == 0 &&
+            Matcher::vector_length(n->in(2)) == 4 &&
+            n->as_Reduction()->requires_strict_order());
+  match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
+  effect(TEMP tmp);
+  format %{ "reduce_strict_order_add4F_neon $dst_src1, $dst_src1, $vsrc\t# 4F, strict order" %}
+  ins_encode %{  // strict order: the lanes are added one at a time, lowest lane first
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);  // add lane 0
+    __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);  // tmp lane 0 = vsrc lane 1
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);  // add lane 1
+    __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);  // tmp lane 0 = vsrc lane 2
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);  // add lane 2
+    __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);  // tmp lane 0 = vsrc lane 3
+    __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);  // add lane 3
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
   // Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
   // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
@@ -2075,8 +2099,9 @@ define(`REDUCE_ADD_FP_SVE', `
 // 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by
 //    auto-vectorization on SVE machine.
 instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{
-  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
-            n->as_Reduction()->requires_strict_order());
+  predicate(UseSVE > 0 &&
+            (!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
+             n->as_Reduction()->requires_strict_order()));
   match(Set dst_src1 (AddReductionV$1 dst_src1 src2));
   format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %}
   ins_encode %{
@@ -2092,6 +2117,19 @@ REDUCE_ADD_FP_SVE(F, S)
 
 // reduction addD
 
+instruct reduce_strict_order_add2D_neon(vRegD dst_src1, vReg vsrc, vRegD tmp) %{
+  predicate(UseSVE == 0 && n->as_Reduction()->requires_strict_order());
+  match(Set dst_src1 (AddReductionVD dst_src1 vsrc));
+  effect(TEMP tmp);  // NOTE(review): predicate has no lane-count check, presumably always 2 here - confirm
+  format %{ "reduce_strict_order_add2D_neon $dst_src1, $dst_src1, $vsrc\t# 2D, strict order" %}
+  ins_encode %{  // strict order: the lanes are added one at a time, lowest lane first
+    __ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);  // add lane 0
+    __ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);  // tmp lane 0 = vsrc lane 1
+    __ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);  // add lane 1
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
   // Non-strictly ordered floating-point add reduction for doubles. This rule is
   // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
diff --git a/test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java b/test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java
index 8ee3d2ecdd9e6..7baa20b8c349f 100644
--- a/test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java
+++ b/test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java
@@ -76,46 +76,6 @@ private static void testMulVL() {
         }
     }
 
-    @Test
-    @IR(failOn = {IRNode.ADD_REDUCTION_VF})
-    private static void testAddReductionVF() {
-        float result = 1;
-        for(int i = 0; i < SIZE; i++) {
-            result += (floata[i] + floatb[i]);
-        }
-        fresult += result;
-    }
-
-    @Test
-    @IR(failOn = {IRNode.ADD_REDUCTION_VD})
-    private static void testAddReductionVD() {
-        double result = 1;
-        for(int i = 0; i < SIZE; i++) {
-            result += (doublea[i] + doubleb[i]);
-        }
-        dresult += result;
-    }
-
-    @Test
-    @IR(failOn = {IRNode.MUL_REDUCTION_VF})
-    private static void testMulReductionVF() {
-        float result = 1;
-        for(int i = 0; i < SIZE; i++) {
-            result *= (floata[i] + floatb[i]);
-        }
-        fresult += result;
-    }
-
-    @Test
-    @IR(failOn = {IRNode.MUL_REDUCTION_VD})
-    private static void testMulReductionVD() {
-        double result = 1;
-        for(int i = 0; i < SIZE; i++) {
-            result *= (doublea[i] + doubleb[i]);
-        }
-        dresult += result;
-    }
-
     @Test
     @IR(failOn = {IRNode.COUNT_TRAILING_ZEROS_VL})
     public void testNumberOfTrailingZeros() {
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOfReductionLoops.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOfReductionLoops.java
new file mode 100644
index 0000000000000..3a16a3f4dbb69
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOfReductionLoops.java
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package compiler.loopopts.superword;
+
+import compiler.lib.ir_framework.*;
+import jdk.test.lib.Asserts;
+import jdk.test.lib.Utils;
+import java.util.Random;
+
+/*
+ * @test
+ * @summary Test auto-vectorization of floating-point add-reduction loops (BLAS-style dot products)
+ * @library /test/lib /
+ * @run driver compiler.loopopts.superword.TestAutoVectorizationOfReductionLoops
+ */
+public class TestAutoVectorizationOfReductionLoops {
+    static final int DIM = 16;
+    static final int SIZE = 256;
+    static final Random RANDOM = Utils.getRandomInstance();
+
+    float[] fx;
+    float[] fy;
+    float[] fm;
+    float[] fm2;
+
+    double[] dx;
+    double[] dy;
+    double[] dm;
+    double[] dm2;
+
+    public static void main(String[] args) { // jtreg driver entry point; args are not used
+        TestFramework.runWithFlags("-XX:-TieredCompilation"); // run the tests with tiered compilation off
+    }
+
+    public TestAutoVectorizationOfReductionLoops() {
+        final int matrixLen = SIZE * DIM;
+        // Vector operands hold SIZE elements, matrix operands hold SIZE * DIM elements.
+        fx = new float[SIZE];
+        fy = new float[SIZE];
+        dx = new double[SIZE];
+        dy = new double[SIZE];
+        fm = new float[matrixLen];
+        fm2 = new float[matrixLen];
+        dm = new double[matrixLen];
+        dm2 = new double[matrixLen];
+        // The order of the RANDOM calls determines the generated data for a given seed.
+        for (int idx = 0; idx < SIZE; idx++) {
+            fx[idx] = RANDOM.nextFloat();
+            fy[idx] = RANDOM.nextFloat();
+            dx[idx] = RANDOM.nextDouble();
+            dy[idx] = RANDOM.nextDouble();
+        }
+        for (int idx = 0; idx < matrixLen; idx++) {
+            fm[idx] = RANDOM.nextFloat();
+            fm2[idx] = RANDOM.nextFloat();
+            dm[idx] = RANDOM.nextDouble();
+            dm2[idx] = RANDOM.nextDouble();
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0",
+                  IRNode.MUL_VF,  "> 0",
+                  IRNode.ADD_REDUCTION_VF,  "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static float testSDot(int n, float[] x, float[] y) { // float dot product over the first n elements
+        float sum = 0.0f;
+        for (int i = 0; i < n; i++) {
+            sum += x[i] * y[i]; // one multiply feeding a running add: the reduction under test
+        }
+        return sum;
+    }
+
+    @Run(test = "testSDot")
+    void runSDot() {
+        // Reference: plain left-to-right accumulation over the same inputs.
+        float reference = 0.0f;
+        for (int idx = 0; idx < SIZE; idx++) {
+            reference += fx[idx] * fy[idx];
+        }
+        final float actual = testSDot(SIZE, fx, fy);
+        Asserts.assertEquals(Float.floatToIntBits(actual), Float.floatToIntBits(reference));
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_VD,  "> 0",
+                  IRNode.ADD_REDUCTION_VD,  "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static double testDDot(int n, double[] x, double[] y) { // double dot product over the first n elements
+        double sum = 0.0;
+        for (int i = 0; i < n; i++) {
+            sum += x[i] * y[i]; // one multiply feeding a running add: the reduction under test
+        }
+        return sum;
+    }
+
+    @Run(test = "testDDot")
+    void runDDot() {
+        // Reference: plain left-to-right accumulation over the same inputs.
+        double reference = 0.0;
+        for (int idx = 0; idx < SIZE; idx++) {
+            reference += dx[idx] * dy[idx];
+        }
+        final double actual = testDDot(SIZE, dx, dy);
+        Asserts.assertEquals(Double.doubleToLongBits(actual), Double.doubleToLongBits(reference));
+    }
+
+    @Test
+    @IR(counts = {IRNode.MUL_VD, "> 0",
+                  IRNode.ADD_REDUCTION_VD, "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static float testSdsdot(int n, float sb, float[] x, float[] y) { // float inputs, double accumulator
+        double sum = sb; // the accumulator starts at the bias sb, widened to double
+        for (int i = 0; i < n; i++) {
+            sum += (double) x[i] * (double) y[i]; // operands are widened before the multiply
+        }
+        return (float) sum; // narrowed to float once, after the loop
+    }
+
+    @Run(test = "testSdsdot")
+    void runSdsdot() {
+        final float bias = 1.0f;
+        // Reference: accumulate in double starting from the bias, narrow to float once at the end.
+        double acc = bias;
+        for (int idx = 0; idx < SIZE; idx++) {
+            acc += (double) fx[idx] * (double) fy[idx];
+        }
+        final float actual = testSdsdot(SIZE, bias, fx, fy);
+        Asserts.assertEquals(Float.floatToIntBits(actual), Float.floatToIntBits((float) acc));
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0",
+                  IRNode.MUL_VF, "> 0",
+                  IRNode.ADD_REDUCTION_VF, "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static void testSgemvT(int m, int n, float alpha, float[] a, int lda, float[] x, float beta, float[] y) {
+        for (int col = 0; col < n; col++) { // one output element y[col] per column of a
+            float sum = 0.0f;
+            for (int row = 0; row < m; row++) {
+                sum += x[row] * a[row + col * lda]; // column col of a starts at offset col * lda
+            }
+            if (beta != 0.0f) {
+                y[col] = alpha * sum + beta * y[col];
+            } else {
+                y[col] = alpha * sum; // the old y[col] is not read when beta is zero
+            }
+        }
+    }
+
+    @Run(test = "testSgemvT")
+    void runSgemvT() {
+        final float alpha = 2.0f;
+        final float beta = 1.0f;
+        // Both result vectors start from the first DIM entries of fy.
+        float[] actual = new float[DIM];
+        System.arraycopy(fy, 0, actual, 0, DIM);
+        float[] reference = actual.clone();
+        testSgemvT(SIZE, DIM, alpha, fm, SIZE, fx, beta, actual);
+        for (int col = 0; col < DIM; col++) {
+            float dot = 0.0f;
+            for (int row = 0; row < SIZE; row++) {
+                dot += fx[row] * fm[row + col * SIZE];
+            }
+            reference[col] = alpha * dot + beta * reference[col];
+        }
+        for (int idx = 0; idx < DIM; idx++) {
+            Asserts.assertEquals(Float.floatToIntBits(actual[idx]), Float.floatToIntBits(reference[idx]));
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_VD, "> 0",
+                  IRNode.ADD_REDUCTION_VD, "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static void testDgemvT(int m, int n, double alpha, double[] a, int lda, double[] x, double beta, double[] y) {
+        for (int col = 0; col < n; col++) { // one output element y[col] per column of a
+            double sum = 0.0;
+            for (int row = 0; row < m; row++) {
+                sum += x[row] * a[row + col * lda]; // column col of a starts at offset col * lda
+            }
+            if (beta != 0.0) {
+                y[col] = alpha * sum + beta * y[col];
+            } else {
+                y[col] = alpha * sum; // the old y[col] is not read when beta is zero
+            }
+        }
+    }
+
+    @Run(test = "testDgemvT")
+    void runDgemvT() {
+        final double alpha = 2.0;
+        final double beta = 1.0;
+        // Both result vectors start from the first DIM entries of dy.
+        double[] actual = new double[DIM];
+        System.arraycopy(dy, 0, actual, 0, DIM);
+        double[] reference = actual.clone();
+        testDgemvT(SIZE, DIM, alpha, dm, SIZE, dx, beta, actual);
+        for (int col = 0; col < DIM; col++) {
+            double dot = 0.0;
+            for (int row = 0; row < SIZE; row++) {
+                dot += dx[row] * dm[row + col * SIZE];
+            }
+            reference[col] = alpha * dot + beta * reference[col];
+        }
+        for (int idx = 0; idx < DIM; idx++) {
+            Asserts.assertEquals(Double.doubleToLongBits(actual[idx]), Double.doubleToLongBits(reference[idx]));
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0",
+                  IRNode.MUL_VF, "> 0",
+                  IRNode.ADD_REDUCTION_VF, "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static void testSgemmTN(int m, int n, int k, float alpha, float[] a, int lda, float[] b, int ldb, float beta, float[] c, int ldc) {
+        for (int col = 0; col < n; col++) {
+            for (int row = 0; row < m; row++) {
+                float sum = 0.0f; // dot product of k elements from a (offset row * lda) and b (offset col * ldb)
+                for (int i = 0; i < k; i++) {
+                    sum += a[i + row * lda] * b[i + col * ldb]; // both operands are read with unit stride in i
+                }
+                if (beta != 0.0f) {
+                    c[row + col * ldc] = alpha * sum + beta * c[row + col * ldc];
+                } else {
+                    c[row + col * ldc] = alpha * sum; // the old c entry is not read when beta is zero
+                }
+            }
+        }
+    }
+
+    @Run(test = "testSgemmTN")
+    void runSgemmTN() {
+        // Fresh random C for every invocation; the reference starts from an identical copy.
+        float[] actual = new float[DIM * DIM];
+        for (int idx = 0; idx < actual.length; idx++) {
+            actual[idx] = RANDOM.nextFloat();
+        }
+        float[] reference = actual.clone();
+        testSgemmTN(DIM, DIM, SIZE, 1.0f, fm, SIZE, fm2, SIZE, 1.0f, actual, DIM);
+        for (int col = 0; col < DIM; col++) {
+            for (int row = 0; row < DIM; row++) {
+                float dot = 0.0f;
+                for (int k = 0; k < SIZE; k++) {
+                    dot += fm[k + row * SIZE] * fm2[k + col * SIZE];
+                }
+                reference[row + col * DIM] = 1.0f * dot + 1.0f * reference[row + col * DIM];
+            }
+        }
+        for (int idx = 0; idx < actual.length; idx++) {
+            Asserts.assertEquals(Float.floatToIntBits(actual[idx]), Float.floatToIntBits(reference[idx]));
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_VD, "> 0",
+                  IRNode.ADD_REDUCTION_VD, "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static void testDgemmTN(int m, int n, int k, double alpha, double[] a, int lda, double[] b, int ldb, double beta, double[] c, int ldc) {
+        for (int col = 0; col < n; col++) {
+            for (int row = 0; row < m; row++) {
+                double sum = 0.0; // dot product of k elements from a (offset row * lda) and b (offset col * ldb)
+                for (int i = 0; i < k; i++) {
+                    sum += a[i + row * lda] * b[i + col * ldb]; // both operands are read with unit stride in i
+                }
+                if (beta != 0.0) {
+                    c[row + col * ldc] = alpha * sum + beta * c[row + col * ldc];
+                } else {
+                    c[row + col * ldc] = alpha * sum; // the old c entry is not read when beta is zero
+                }
+            }
+        }
+    }
+
+    @Run(test = "testDgemmTN")
+    void runDgemmTN() {
+        // Fresh random C for every invocation; the reference starts from an identical copy.
+        double[] actual = new double[DIM * DIM];
+        for (int idx = 0; idx < actual.length; idx++) {
+            actual[idx] = RANDOM.nextDouble();
+        }
+        double[] reference = actual.clone();
+        testDgemmTN(DIM, DIM, SIZE, 1.0, dm, SIZE, dm2, SIZE, 1.0, actual, DIM);
+        for (int col = 0; col < DIM; col++) {
+            for (int row = 0; row < DIM; row++) {
+                double dot = 0.0;
+                for (int k = 0; k < SIZE; k++) {
+                    dot += dm[k + row * SIZE] * dm2[k + col * SIZE];
+                }
+                reference[row + col * DIM] = 1.0 * dot + 1.0 * reference[row + col * DIM];
+            }
+        }
+        for (int idx = 0; idx < actual.length; idx++) {
+            Asserts.assertEquals(Double.doubleToLongBits(actual[idx]), Double.doubleToLongBits(reference[idx]));
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0",
+                  IRNode.MUL_VF, "> 0",
+                  IRNode.ADD_REDUCTION_VF, "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static void testSgebpTN(int rows, int rowe, int cols, int cole, int is, int ie, float alpha, float[] a, int lda, float[] b, int ldb, float[] c, int ldc) {
+        for (int col = cols; col < cole; col++) { // half-open column range [cols, cole)
+            for (int row = rows; row < rowe; row++) { // half-open row range [rows, rowe)
+                float sum = 0.0f;
+                for (int i = is; i < ie; i++) { // reduction range [is, ie)
+                    sum += a[i + row * lda] * b[i + col * ldb];
+                }
+                c[row + col * ldc] += alpha * sum; // accumulates into c; no beta scaling in this kernel
+            }
+        }
+    }
+
+    @Run(test = "testSgebpTN")
+    void runSgebpTN() {
+        // Fresh random C for every invocation; the reference starts from an identical copy.
+        float[] actual = new float[DIM * DIM];
+        for (int idx = 0; idx < actual.length; idx++) {
+            actual[idx] = RANDOM.nextFloat();
+        }
+        float[] reference = actual.clone();
+        testSgebpTN(0, DIM, 0, DIM, 0, SIZE, 1.0f, fm, SIZE, fm2, SIZE, actual, DIM);
+        for (int col = 0; col < DIM; col++) {
+            for (int row = 0; row < DIM; row++) {
+                float dot = 0.0f;
+                for (int k = 0; k < SIZE; k++) {
+                    dot += fm[k + row * SIZE] * fm2[k + col * SIZE];
+                }
+                reference[row + col * DIM] += 1.0f * dot;
+            }
+        }
+        for (int idx = 0; idx < actual.length; idx++) {
+            Asserts.assertEquals(Float.floatToIntBits(actual[idx]), Float.floatToIntBits(reference[idx]));
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_VD, "> 0",
+                  IRNode.ADD_REDUCTION_VD, "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static void testDgebpTN(int rows, int rowe, int cols, int cole, int is, int ie, double alpha, double[] a, int lda, double[] b, int ldb, double[] c, int ldc) {
+        for (int col = cols; col < cole; col++) { // half-open column range [cols, cole)
+            for (int row = rows; row < rowe; row++) { // half-open row range [rows, rowe)
+                double sum = 0.0;
+                for (int i = is; i < ie; i++) { // reduction range [is, ie)
+                    sum += a[i + row * lda] * b[i + col * ldb];
+                }
+                c[row + col * ldc] += alpha * sum; // accumulates into c; no beta scaling in this kernel
+            }
+        }
+    }
+
+    @Run(test = "testDgebpTN")
+    void runDgebpTN() {
+        // Fresh random C for every invocation; the reference starts from an identical copy.
+        double[] actual = new double[DIM * DIM];
+        for (int idx = 0; idx < actual.length; idx++) {
+            actual[idx] = RANDOM.nextDouble();
+        }
+        double[] reference = actual.clone();
+        testDgebpTN(0, DIM, 0, DIM, 0, SIZE, 1.0, dm, SIZE, dm2, SIZE, actual, DIM);
+        for (int col = 0; col < DIM; col++) {
+            for (int row = 0; row < DIM; row++) {
+                double dot = 0.0;
+                for (int k = 0; k < SIZE; k++) {
+                    dot += dm[k + row * SIZE] * dm2[k + col * SIZE];
+                }
+                reference[row + col * DIM] += 1.0 * dot;
+            }
+        }
+        for (int idx = 0; idx < actual.length; idx++) {
+            Asserts.assertEquals(Double.doubleToLongBits(actual[idx]), Double.doubleToLongBits(reference[idx]));
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0",
+                  IRNode.MUL_VF, "> 0",
+                  IRNode.ADD_REDUCTION_VF, "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static void testSgepdotTN(int rows, int rowe, int cols, int cole, int is, int ie, float alpha, float[] a, int lda, float[] b, int ldb, float[] c, int ldc) {
+        for (int col = cols; col < cole; col++) { // NOTE(review): loop nest is the same as in testSgebpTN
+            for (int row = rows; row < rowe; row++) { // the runner passes a 3 x 3 sub-block here
+                float sum = 0.0f;
+                for (int i = is; i < ie; i++) { // reduction range [is, ie)
+                    sum += a[i + row * lda] * b[i + col * ldb];
+                }
+                c[row + col * ldc] += alpha * sum; // accumulates into c; no beta scaling in this kernel
+            }
+        }
+    }
+
+    @Run(test = "testSgepdotTN")
+    void runSgepdotTN() {
+        // Only the 3 x 3 corner is updated; the full comparison also covers the untouched entries.
+        float[] actual = new float[DIM * DIM];
+        for (int idx = 0; idx < actual.length; idx++) {
+            actual[idx] = RANDOM.nextFloat();
+        }
+        float[] reference = actual.clone();
+        testSgepdotTN(0, 3, 0, 3, 0, SIZE, 1.0f, fm, SIZE, fm2, SIZE, actual, DIM);
+        for (int col = 0; col < 3; col++) {
+            for (int row = 0; row < 3; row++) {
+                float dot = 0.0f;
+                for (int k = 0; k < SIZE; k++) {
+                    dot += fm[k + row * SIZE] * fm2[k + col * SIZE];
+                }
+                reference[row + col * DIM] += 1.0f * dot;
+            }
+        }
+        for (int idx = 0; idx < actual.length; idx++) {
+            Asserts.assertEquals(Float.floatToIntBits(actual[idx]), Float.floatToIntBits(reference[idx]));
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_VD, "> 0",
+                  IRNode.ADD_REDUCTION_VD, "> 0"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // rule applies with sse4.1 or asimd
+    static void testDgepdotTN(int rows, int rowe, int cols, int cole, int is, int ie, double alpha, double[] a, int lda, double[] b, int ldb, double[] c, int ldc) {
+        for (int col = cols; col < cole; col++) { // NOTE(review): loop nest is the same as in testDgebpTN
+            for (int row = rows; row < rowe; row++) { // the runner passes a 3 x 3 sub-block here
+                double sum = 0.0;
+                for (int i = is; i < ie; i++) { // reduction range [is, ie)
+                    sum += a[i + row * lda] * b[i + col * ldb];
+                }
+                c[row + col * ldc] += alpha * sum; // accumulates into c; no beta scaling in this kernel
+            }
+        }
+    }
+
+    @Run(test = "testDgepdotTN")
+    void runDgepdotTN() {
+        // Only the 3 x 3 corner is updated; the full comparison also covers the untouched entries.
+        double[] actual = new double[DIM * DIM];
+        for (int idx = 0; idx < actual.length; idx++) {
+            actual[idx] = RANDOM.nextDouble();
+        }
+        double[] reference = actual.clone();
+        testDgepdotTN(0, 3, 0, 3, 0, SIZE, 1.0, dm, SIZE, dm2, SIZE, actual, DIM);
+        for (int col = 0; col < 3; col++) {
+            for (int row = 0; row < 3; row++) {
+                double dot = 0.0;
+                for (int k = 0; k < SIZE; k++) {
+                    dot += dm[k + row * SIZE] * dm2[k + col * SIZE];
+                }
+                reference[row + col * DIM] += 1.0 * dot;
+            }
+        }
+        for (int idx = 0; idx < actual.length; idx++) {
+            Asserts.assertEquals(Double.doubleToLongBits(actual[idx]), Double.doubleToLongBits(reference[idx]));
+        }
+    }
+}
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java
index 5c085e6a3a343..0d8b41e331594 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java
@@ -23,7 +23,7 @@
 
 /*
  * @test id=no-vectorization
- * @bug 8340093 8342095
+ * @bug 8340093 8342095 8370677
  * @summary Test vectorization of reduction loops.
  * @library /test/lib /
  * @run driver compiler.loopopts.superword.TestReductions P0
@@ -2159,12 +2159,8 @@ private static long longMaxBig() {
     @IR(counts = {IRNode.LOAD_VECTOR_F,   "> 0",
                   IRNode.ADD_REDUCTION_V, "> 0",
                   IRNode.ADD_VF,          "= 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "= 2"})
-    @IR(failOn = IRNode.LOAD_VECTOR_F,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_F,
         applyIf = {"AutoVectorizationOverrideProfitability", "< 2"})
     // Not considered profitable by cost model, but if forced we can vectorize.
@@ -2183,12 +2179,8 @@ private static float floatAddSimple() {
     @IR(counts = {IRNode.LOAD_VECTOR_F,    "> 0",
                   IRNode.MUL_REDUCTION_VF, "> 0",
                   IRNode.MUL_VF,           "= 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "= 2"})
-    @IR(failOn = IRNode.LOAD_VECTOR_F,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_F,
         applyIf = {"AutoVectorizationOverrideProfitability", "< 2"})
     // Not considered profitable by cost model, but if forced we can vectorize.
@@ -2242,12 +2234,8 @@ private static float floatMaxSimple() {
     @IR(counts = {IRNode.LOAD_VECTOR_F,   "> 0",
                   IRNode.ADD_REDUCTION_V, "> 0",
                   IRNode.ADD_VF,          "= 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
-    @IR(failOn = IRNode.LOAD_VECTOR_F,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_F,
         applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
     private static float floatAddDotProduct() {
@@ -2263,12 +2251,8 @@ private static float floatAddDotProduct() {
     @IR(counts = {IRNode.LOAD_VECTOR_F,    "> 0",
                   IRNode.MUL_REDUCTION_VF, "> 0",
                   IRNode.MUL_VF,           "> 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
-    @IR(failOn = IRNode.LOAD_VECTOR_F,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_F,
         applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
     private static float floatMulDotProduct() {
@@ -2319,12 +2303,8 @@ private static float floatMaxDotProduct() {
     @IR(counts = {IRNode.LOAD_VECTOR_F,   "> 0",
                   IRNode.ADD_REDUCTION_V, "> 0",
                   IRNode.ADD_VF,          "> 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
-    @IR(failOn = IRNode.LOAD_VECTOR_F,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_F,
         applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
     private static float floatAddBig() {
@@ -2340,12 +2320,8 @@ private static float floatAddBig() {
     @IR(counts = {IRNode.LOAD_VECTOR_F,    "> 0",
                   IRNode.MUL_REDUCTION_VF, "> 0",
                   IRNode.MUL_VF,           "> 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
-    @IR(failOn = IRNode.LOAD_VECTOR_F,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_F,
         applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
     private static float floatMulBig() {
@@ -2396,12 +2372,8 @@ private static float floatMaxBig() {
     @IR(counts = {IRNode.LOAD_VECTOR_D,    "> 0",
                   IRNode.ADD_REDUCTION_VD, "> 0",
                   IRNode.ADD_VD,           "= 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "= 2"})
-    @IR(failOn = IRNode.LOAD_VECTOR_D,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_D,
         applyIf = {"AutoVectorizationOverrideProfitability", "< 2"})
     // Not considered profitable by cost model, but if forced we can vectorize.
@@ -2420,12 +2392,8 @@ private static double doubleAddSimple() {
     @IR(counts = {IRNode.LOAD_VECTOR_D,    "> 0",
                   IRNode.MUL_REDUCTION_VD, "> 0",
                   IRNode.MUL_VD,           "= 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "= 2"})
-    @IR(failOn = IRNode.LOAD_VECTOR_D,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_D,
         applyIf = {"AutoVectorizationOverrideProfitability", "< 2"})
     // Not considered profitable by cost model, but if forced we can vectorize.
@@ -2479,12 +2447,8 @@ private static double doubleMaxSimple() {
     @IR(counts = {IRNode.LOAD_VECTOR_D,   "> 0",
                   IRNode.ADD_REDUCTION_V, "> 0",
                   IRNode.ADD_VD,          "= 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
-    @IR(failOn = IRNode.LOAD_VECTOR_D,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_D,
         applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
     private static double doubleAddDotProduct() {
@@ -2500,12 +2464,8 @@ private static double doubleAddDotProduct() {
     @IR(counts = {IRNode.LOAD_VECTOR_D,    "> 0",
                   IRNode.MUL_REDUCTION_VD, "> 0",
                   IRNode.MUL_VD,           "> 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
-    @IR(failOn = IRNode.LOAD_VECTOR_D,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_D,
         applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
     private static double doubleMulDotProduct() {
@@ -2556,12 +2516,8 @@ private static double doubleMaxDotProduct() {
     @IR(counts = {IRNode.LOAD_VECTOR_D,   "> 0",
                   IRNode.ADD_REDUCTION_V, "> 0",
                   IRNode.ADD_VD,          "> 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
-    @IR(failOn = IRNode.LOAD_VECTOR_D,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_D,
         applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
     private static double doubleAddBig() {
@@ -2577,12 +2533,8 @@ private static double doubleAddBig() {
     @IR(counts = {IRNode.LOAD_VECTOR_D,    "> 0",
                   IRNode.MUL_REDUCTION_VD, "> 0",
                   IRNode.MUL_VD,           "> 0"},
-        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
         applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
-    @IR(failOn = IRNode.LOAD_VECTOR_D,
-        applyIfCPUFeatureAnd = {"asimd", "true"})
-    // I think this could vectorize, but currently does not. Filed: JDK-8370677
-    // But: it is not clear that it would be profitable, given the sequential reduction.
     @IR(failOn = IRNode.LOAD_VECTOR_D,
         applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
     private static double doubleMulBig() {
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java
index b328d4135ecfe..fd18fcc5cb4e6 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java
@@ -50,8 +50,10 @@ public static void main(String[] args) {
     }
 
     @Test
-    @IR(failOn = {IRNode.ADD_REDUCTION_VF},
-        applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
+    @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VF, ">=1"},
+        failOn = {"no_strict_order"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"},
+        phase = CompilePhase.PRINT_IDEAL)
     @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VF, ">=1"},
         failOn = {"no_strict_order"},
         applyIfCPUFeatureOr = {"sve", "true", "sse2", "true", "rvv", "true"},
@@ -65,8 +67,10 @@ private static void testAddReductionVF() {
     }
 
     @Test
-    @IR(failOn = {IRNode.ADD_REDUCTION_VD},
-        applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
+    @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VD, ">=1"},
+        failOn = {"no_strict_order"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"},
+        phase = CompilePhase.PRINT_IDEAL)
     @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VD, ">=1"},
         failOn = {"no_strict_order"},
         applyIfCPUFeatureOr = {"sve", "true", "sse2", "true", "rvv", "true"},
@@ -80,8 +84,10 @@ private static void testAddReductionVD() {
     }
 
     @Test
-    @IR(failOn = {IRNode.MUL_REDUCTION_VF},
-        applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
+    @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VF, ">=1"},
+        failOn = {"no_strict_order"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"},
+        phase = CompilePhase.PRINT_IDEAL)
     @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VF, ">=1"},
         failOn = {"no_strict_order"},
         applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},
@@ -95,8 +101,10 @@ private static void testMulReductionVF() {
     }
 
     @Test
-    @IR(failOn = {IRNode.MUL_REDUCTION_VD},
-        applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"})
+    @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VD, ">=1"},
+        failOn = {"no_strict_order"},
+        applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"},
+        phase = CompilePhase.PRINT_IDEAL)
     @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VD, ">=1"},
         failOn = {"no_strict_order"},
         applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},