diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 19f03d97a72e2..8d6aa9b6b5069 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -136,17 +136,7 @@ source %{ (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || - opcode == Op_MulVL || - // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. - // They are not suitable for auto-vectorization because the result would not conform - // to the JLS, Section Evaluation Order. - // Note: we could implement sequential reductions for these reduction operators, but - // this will still almost never lead to speedups, because the sequential - // reductions are latency limited along the reduction chain, and not - // throughput limited. This is unlike unordered reductions (associative op) - // and element-wise ops which are usually throughput limited. 
- opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || - opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { + opcode == Op_MulVL) { return false; } } @@ -3377,6 +3367,40 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{ // reduction addF +instruct reduce_strict_order_add2F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{ + predicate(UseSVE == 0 && + Matcher::vector_length(n->in(2)) == 2 && + n->as_Reduction()->requires_strict_order()); + match(Set dst_src1 (AddReductionVF dst_src1 vsrc)); + effect(TEMP tmp); + format %{ "reduce_strict_order_add2F_neon $dst_src1, $dst_src1, $vsrc\t# 2F, strict order" %} + ins_encode %{ + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister); + __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1); + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_strict_order_add4F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{ + predicate(UseSVE == 0 && + Matcher::vector_length(n->in(2)) == 4 && + n->as_Reduction()->requires_strict_order()); + match(Set dst_src1 (AddReductionVF dst_src1 vsrc)); + effect(TEMP tmp); + format %{ "reduce_strict_order_add4F_neon $dst_src1, $dst_src1, $vsrc\t# 4F, strict order" %} + ins_encode %{ + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister); + __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1); + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister); + __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2); + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister); + __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3); + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ // Non-strictly 
ordered floating-point add reduction for a 64-bits-long vector. This rule is // intended for the VectorAPI (which allows for non-strictly ordered add reduction). @@ -3415,8 +3439,9 @@ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vR // 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by // auto-vectorization on SVE machine. instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{ - predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || - n->as_Reduction()->requires_strict_order()); + predicate(UseSVE > 0 && + (!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || + n->as_Reduction()->requires_strict_order())); match(Set dst_src1 (AddReductionVF dst_src1 src2)); format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %} ins_encode %{ @@ -3430,6 +3455,19 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{ // reduction addD +instruct reduce_strict_order_add2D_neon(vRegD dst_src1, vReg vsrc, vRegD tmp) %{ + predicate(UseSVE == 0 && n->as_Reduction()->requires_strict_order()); + match(Set dst_src1 (AddReductionVD dst_src1 vsrc)); + effect(TEMP tmp); + format %{ "reduce_strict_order_add2D_neon $dst_src1, $dst_src1, $vsrc\t# 2D, strict order" %} + ins_encode %{ + __ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister); + __ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1); + __ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ // Non-strictly ordered floating-point add reduction for doubles. This rule is // intended for the VectorAPI (which allows for non-strictly ordered add reduction). @@ -3453,8 +3491,9 @@ instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ // 2. Strictly-ordered AddReductionVD. 
For example - AddReductionVD generated by // auto-vectorization on SVE machine. instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{ - predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || - n->as_Reduction()->requires_strict_order()); + predicate(UseSVE > 0 && + (!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || + n->as_Reduction()->requires_strict_order())); match(Set dst_src1 (AddReductionVD dst_src1 src2)); format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %} ins_encode %{ diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 48bffb3cf3588..fb5194f689588 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -126,17 +126,7 @@ source %{ (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || - opcode == Op_MulVL || - // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. - // They are not suitable for auto-vectorization because the result would not conform - // to the JLS, Section Evaluation Order. - // Note: we could implement sequential reductions for these reduction operators, but - // this will still almost never lead to speedups, because the sequential - // reductions are latency limited along the reduction chain, and not - // throughput limited. This is unlike unordered reductions (associative op) - // and element-wise ops which are usually throughput limited. 
- opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || - opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { + opcode == Op_MulVL) { return false; } } @@ -2034,6 +2024,40 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL) // reduction addF +instruct reduce_strict_order_add2F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{ + predicate(UseSVE == 0 && + Matcher::vector_length(n->in(2)) == 2 && + n->as_Reduction()->requires_strict_order()); + match(Set dst_src1 (AddReductionVF dst_src1 vsrc)); + effect(TEMP tmp); + format %{ "reduce_strict_order_add2F_neon $dst_src1, $dst_src1, $vsrc\t# 2F, strict order" %} + ins_encode %{ + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister); + __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1); + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_strict_order_add4F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{ + predicate(UseSVE == 0 && + Matcher::vector_length(n->in(2)) == 4 && + n->as_Reduction()->requires_strict_order()); + match(Set dst_src1 (AddReductionVF dst_src1 vsrc)); + effect(TEMP tmp); + format %{ "reduce_strict_order_add4F_neon $dst_src1, $dst_src1, $vsrc\t# 4F, strict order" %} + ins_encode %{ + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister); + __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1); + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister); + __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2); + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister); + __ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3); + __ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{ // Non-strictly ordered floating-point add reduction 
for a 64-bits-long vector. This rule is // intended for the VectorAPI (which allows for non-strictly ordered add reduction). @@ -2075,8 +2099,9 @@ define(`REDUCE_ADD_FP_SVE', ` // 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by // auto-vectorization on SVE machine. instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{ - predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || - n->as_Reduction()->requires_strict_order()); + predicate(UseSVE > 0 && + (!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) || + n->as_Reduction()->requires_strict_order())); match(Set dst_src1 (AddReductionV$1 dst_src1 src2)); format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %} ins_encode %{ @@ -2092,6 +2117,19 @@ REDUCE_ADD_FP_SVE(F, S) // reduction addD +instruct reduce_strict_order_add2D_neon(vRegD dst_src1, vReg vsrc, vRegD tmp) %{ + predicate(UseSVE == 0 && n->as_Reduction()->requires_strict_order()); + match(Set dst_src1 (AddReductionVD dst_src1 vsrc)); + effect(TEMP tmp); + format %{ "reduce_strict_order_add2D_neon $dst_src1, $dst_src1, $vsrc\t# 2D, strict order" %} + ins_encode %{ + __ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister); + __ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1); + __ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{ // Non-strictly ordered floating-point add reduction for doubles. This rule is // intended for the VectorAPI (which allows for non-strictly ordered add reduction). 
diff --git a/test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java b/test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java index 8ee3d2ecdd9e6..7baa20b8c349f 100644 --- a/test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java +++ b/test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java @@ -76,46 +76,6 @@ private static void testMulVL() { } } - @Test - @IR(failOn = {IRNode.ADD_REDUCTION_VF}) - private static void testAddReductionVF() { - float result = 1; - for(int i = 0; i < SIZE; i++) { - result += (floata[i] + floatb[i]); - } - fresult += result; - } - - @Test - @IR(failOn = {IRNode.ADD_REDUCTION_VD}) - private static void testAddReductionVD() { - double result = 1; - for(int i = 0; i < SIZE; i++) { - result += (doublea[i] + doubleb[i]); - } - dresult += result; - } - - @Test - @IR(failOn = {IRNode.MUL_REDUCTION_VF}) - private static void testMulReductionVF() { - float result = 1; - for(int i = 0; i < SIZE; i++) { - result *= (floata[i] + floatb[i]); - } - fresult += result; - } - - @Test - @IR(failOn = {IRNode.MUL_REDUCTION_VD}) - private static void testMulReductionVD() { - double result = 1; - for(int i = 0; i < SIZE; i++) { - result *= (doublea[i] + doubleb[i]); - } - dresult += result; - } - @Test @IR(failOn = {IRNode.COUNT_TRAILING_ZEROS_VL}) public void testNumberOfTrailingZeros() { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOfReductionLoops.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOfReductionLoops.java new file mode 100644 index 0000000000000..3a16a3f4dbb69 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOfReductionLoops.java @@ -0,0 +1,481 @@ +/* + * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.loopopts.superword; + +import compiler.lib.ir_framework.*; +import jdk.test.lib.Asserts; +import jdk.test.lib.Utils; +import java.util.Random; + +/* + * @test + * @summary Test auto-vectorization of loops containing floating-point add reductions of products + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestAutoVectorizationOfReductionLoops + */ +public class TestAutoVectorizationOfReductionLoops { + static final int DIM = 16; + static final int SIZE = 256; + static final Random RANDOM = Utils.getRandomInstance(); + + float[] fx; + float[] fy; + float[] fm; + float[] fm2; + + double[] dx; + double[] dy; + double[] dm; + double[] dm2; + + public static void main(String[] args) { + TestFramework.runWithFlags("-XX:-TieredCompilation"); + } + + public TestAutoVectorizationOfReductionLoops() { + fx = new float[SIZE]; + fy = new float[SIZE]; + fm = new float[SIZE * DIM]; + fm2 = new float[SIZE * DIM]; + + dx = new double[SIZE]; + dy = new double[SIZE]; + dm = new double[SIZE * DIM]; + dm2 = new double[SIZE * DIM]; + + for (int i = 0; i <
SIZE; i++) { + fx[i] = RANDOM.nextFloat(); + fy[i] = RANDOM.nextFloat(); + dx[i] = RANDOM.nextDouble(); + dy[i] = RANDOM.nextDouble(); + } + + for (int i = 0; i < SIZE * DIM; i++) { + fm[i] = RANDOM.nextFloat(); + fm2[i] = RANDOM.nextFloat(); + dm[i] = RANDOM.nextDouble(); + dm2[i] = RANDOM.nextDouble(); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_VF, "> 0", + IRNode.ADD_REDUCTION_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static float testSDot(int n, float[] x, float[] y) { + float sum = 0.0f; + for (int i = 0; i < n; i++) { + sum += x[i] * y[i]; + } + return sum; + } + + @Run(test = "testSDot") + void runSDot() { + float expected = 0.0f; + for (int i = 0; i < SIZE; i++) { + expected += fx[i] * fy[i]; + } + int expectedBits = Float.floatToIntBits(expected); + int computedBits = Float.floatToIntBits(testSDot(SIZE, fx, fy)); + Asserts.assertEquals(computedBits, expectedBits); + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MUL_VD, "> 0", + IRNode.ADD_REDUCTION_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static double testDDot(int n, double[] x, double[] y) { + double sum = 0.0; + for (int i = 0; i < n; i++) { + sum += x[i] * y[i]; + } + return sum; + } + + @Run(test = "testDDot") + void runDDot() { + double expected = 0.0; + for (int i = 0; i < SIZE; i++) { + expected += dx[i] * dy[i]; + } + long expectedBits = Double.doubleToLongBits(expected); + long computedBits = Double.doubleToLongBits(testDDot(SIZE, dx, dy)); + Asserts.assertEquals(computedBits, expectedBits); + } + + @Test + @IR(counts = {IRNode.MUL_VD, "> 0", + IRNode.ADD_REDUCTION_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static float testSdsdot(int n, float sb, float[] x, float[] y) { + double sum = sb; + for (int i = 0; i < n; i++) { + sum += (double) x[i] * (double) y[i]; + } + return (float) sum; + } + + @Run(test = "testSdsdot") + void runSdsdot() { + 
double sum = 1.0f; + for (int i = 0; i < SIZE; i++) { + sum += (double) fx[i] * (double) fy[i]; + } + float expected = (float) sum; + int expectedBits = Float.floatToIntBits(expected); + int computedBits = Float.floatToIntBits(testSdsdot(SIZE, 1.0f, fx, fy)); + Asserts.assertEquals(computedBits, expectedBits); + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_VF, "> 0", + IRNode.ADD_REDUCTION_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void testSgemvT(int m, int n, float alpha, float[] a, int lda, float[] x, float beta, float[] y) { + for (int col = 0; col < n; col++) { + float sum = 0.0f; + for (int row = 0; row < m; row++) { + sum += x[row] * a[row + col * lda]; + } + if (beta != 0.0f) { + y[col] = alpha * sum + beta * y[col]; + } else { + y[col] = alpha * sum; + } + } + } + + @Run(test = "testSgemvT") + void runSgemvT() { + float[] yCopy = new float[DIM]; + float[] yExpected = new float[DIM]; + for (int i = 0; i < DIM; i++) { + yCopy[i] = fy[i]; + yExpected[i] = fy[i]; + } + testSgemvT(SIZE, DIM, 2.0f, fm, SIZE, fx, 1.0f, yCopy); + for (int col = 0; col < DIM; col++) { + float sum = 0.0f; + for (int row = 0; row < SIZE; row++) { + sum += fx[row] * fm[row + col * SIZE]; + } + yExpected[col] = 2.0f * sum + 1.0f * yExpected[col]; + } + for (int i = 0; i < DIM; i++) { + Asserts.assertEquals(Float.floatToIntBits(yCopy[i]), Float.floatToIntBits(yExpected[i])); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MUL_VD, "> 0", + IRNode.ADD_REDUCTION_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void testDgemvT(int m, int n, double alpha, double[] a, int lda, double[] x, double beta, double[] y) { + for (int col = 0; col < n; col++) { + double sum = 0.0; + for (int row = 0; row < m; row++) { + sum += x[row] * a[row + col * lda]; + } + if (beta != 0.0) { + y[col] = alpha * sum + beta * y[col]; + } else { + y[col] = alpha * sum; + } + } + } + + @Run(test = 
"testDgemvT") + void runDgemvT() { + double[] yCopy = new double[DIM]; + double[] yExpected = new double[DIM]; + for (int i = 0; i < DIM; i++) { + yCopy[i] = dy[i]; + yExpected[i] = dy[i]; + } + testDgemvT(SIZE, DIM, 2.0, dm, SIZE, dx, 1.0, yCopy); + for (int col = 0; col < DIM; col++) { + double sum = 0.0; + for (int row = 0; row < SIZE; row++) { + sum += dx[row] * dm[row + col * SIZE]; + } + yExpected[col] = 2.0 * sum + 1.0 * yExpected[col]; + } + for (int i = 0; i < DIM; i++) { + Asserts.assertEquals(Double.doubleToLongBits(yCopy[i]), Double.doubleToLongBits(yExpected[i])); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_VF, "> 0", + IRNode.ADD_REDUCTION_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void testSgemmTN(int m, int n, int k, float alpha, float[] a, int lda, float[] b, int ldb, float beta, float[] c, int ldc) { + for (int col = 0; col < n; col++) { + for (int row = 0; row < m; row++) { + float sum = 0.0f; + for (int i = 0; i < k; i++) { + sum += a[i + row * lda] * b[i + col * ldb]; + } + if (beta != 0.0f) { + c[row + col * ldc] = alpha * sum + beta * c[row + col * ldc]; + } else { + c[row + col * ldc] = alpha * sum; + } + } + } + } + + @Run(test = "testSgemmTN") + void runSgemmTN() { + float[] c = new float[DIM * DIM]; + float[] cExpected = new float[DIM * DIM]; + for (int i = 0; i < DIM * DIM; i++) { + c[i] = RANDOM.nextFloat(); + cExpected[i] = c[i]; + } + testSgemmTN(DIM, DIM, SIZE, 1.0f, fm, SIZE, fm2, SIZE, 1.0f, c, DIM); + for (int col = 0; col < DIM; col++) { + for (int row = 0; row < DIM; row++) { + float sum = 0.0f; + for (int i = 0; i < SIZE; i++) { + sum += fm[i + row * SIZE] * fm2[i + col * SIZE]; + } + cExpected[row + col * DIM] = 1.0f * sum + 1.0f * cExpected[row + col * DIM]; + } + } + for (int i = 0; i < DIM * DIM; i++) { + Asserts.assertEquals(Float.floatToIntBits(c[i]), Float.floatToIntBits(cExpected[i])); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + 
IRNode.MUL_VD, "> 0", + IRNode.ADD_REDUCTION_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void testDgemmTN(int m, int n, int k, double alpha, double[] a, int lda, double[] b, int ldb, double beta, double[] c, int ldc) { + for (int col = 0; col < n; col++) { + for (int row = 0; row < m; row++) { + double sum = 0.0; + for (int i = 0; i < k; i++) { + sum += a[i + row * lda] * b[i + col * ldb]; + } + if (beta != 0.0) { + c[row + col * ldc] = alpha * sum + beta * c[row + col * ldc]; + } else { + c[row + col * ldc] = alpha * sum; + } + } + } + } + + @Run(test = "testDgemmTN") + void runDgemmTN() { + double[] c = new double[DIM * DIM]; + double[] cExpected = new double[DIM * DIM]; + for (int i = 0; i < DIM * DIM; i++) { + c[i] = RANDOM.nextDouble(); + cExpected[i] = c[i]; + } + testDgemmTN(DIM, DIM, SIZE, 1.0, dm, SIZE, dm2, SIZE, 1.0, c, DIM); + for (int col = 0; col < DIM; col++) { + for (int row = 0; row < DIM; row++) { + double sum = 0.0; + for (int i = 0; i < SIZE; i++) { + sum += dm[i + row * SIZE] * dm2[i + col * SIZE]; + } + cExpected[row + col * DIM] = 1.0 * sum + 1.0 * cExpected[row + col * DIM]; + } + } + for (int i = 0; i < DIM * DIM; i++) { + Asserts.assertEquals(Double.doubleToLongBits(c[i]), Double.doubleToLongBits(cExpected[i])); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_VF, "> 0", + IRNode.ADD_REDUCTION_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void testSgebpTN(int rows, int rowe, int cols, int cole, int is, int ie, float alpha, float[] a, int lda, float[] b, int ldb, float[] c, int ldc) { + for (int col = cols; col < cole; col++) { + for (int row = rows; row < rowe; row++) { + float sum = 0.0f; + for (int i = is; i < ie; i++) { + sum += a[i + row * lda] * b[i + col * ldb]; + } + c[row + col * ldc] += alpha * sum; + } + } + } + + @Run(test = "testSgebpTN") + void runSgebpTN() { + float[] c = new float[DIM * DIM]; + float[] cExpected = new 
float[DIM * DIM]; + for (int i = 0; i < DIM * DIM; i++) { + c[i] = RANDOM.nextFloat(); + cExpected[i] = c[i]; + } + testSgebpTN(0, DIM, 0, DIM, 0, SIZE, 1.0f, fm, SIZE, fm2, SIZE, c, DIM); + for (int col = 0; col < DIM; col++) { + for (int row = 0; row < DIM; row++) { + float sum = 0.0f; + for (int i = 0; i < SIZE; i++) { + sum += fm[i + row * SIZE] * fm2[i + col * SIZE]; + } + cExpected[row + col * DIM] += 1.0f * sum; + } + } + for (int i = 0; i < DIM * DIM; i++) { + Asserts.assertEquals(Float.floatToIntBits(c[i]), Float.floatToIntBits(cExpected[i])); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MUL_VD, "> 0", + IRNode.ADD_REDUCTION_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void testDgebpTN(int rows, int rowe, int cols, int cole, int is, int ie, double alpha, double[] a, int lda, double[] b, int ldb, double[] c, int ldc) { + for (int col = cols; col < cole; col++) { + for (int row = rows; row < rowe; row++) { + double sum = 0.0; + for (int i = is; i < ie; i++) { + sum += a[i + row * lda] * b[i + col * ldb]; + } + c[row + col * ldc] += alpha * sum; + } + } + } + + @Run(test = "testDgebpTN") + void runDgebpTN() { + double[] c = new double[DIM * DIM]; + double[] cExpected = new double[DIM * DIM]; + for (int i = 0; i < DIM * DIM; i++) { + c[i] = RANDOM.nextDouble(); + cExpected[i] = c[i]; + } + testDgebpTN(0, DIM, 0, DIM, 0, SIZE, 1.0, dm, SIZE, dm2, SIZE, c, DIM); + for (int col = 0; col < DIM; col++) { + for (int row = 0; row < DIM; row++) { + double sum = 0.0; + for (int i = 0; i < SIZE; i++) { + sum += dm[i + row * SIZE] * dm2[i + col * SIZE]; + } + cExpected[row + col * DIM] += 1.0 * sum; + } + } + for (int i = 0; i < DIM * DIM; i++) { + Asserts.assertEquals(Double.doubleToLongBits(c[i]), Double.doubleToLongBits(cExpected[i])); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_VF, "> 0", + IRNode.ADD_REDUCTION_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", 
"asimd", "true"}) + static void testSgepdotTN(int rows, int rowe, int cols, int cole, int is, int ie, float alpha, float[] a, int lda, float[] b, int ldb, float[] c, int ldc) { + for (int col = cols; col < cole; col++) { + for (int row = rows; row < rowe; row++) { + float sum = 0.0f; + for (int i = is; i < ie; i++) { + sum += a[i + row * lda] * b[i + col * ldb]; + } + c[row + col * ldc] += alpha * sum; + } + } + } + + @Run(test = "testSgepdotTN") + void runSgepdotTN() { + float[] c = new float[DIM * DIM]; + float[] cExpected = new float[DIM * DIM]; + for (int i = 0; i < DIM * DIM; i++) { + c[i] = RANDOM.nextFloat(); + cExpected[i] = c[i]; + } + testSgepdotTN(0, 3, 0, 3, 0, SIZE, 1.0f, fm, SIZE, fm2, SIZE, c, DIM); + for (int col = 0; col < 3; col++) { + for (int row = 0; row < 3; row++) { + float sum = 0.0f; + for (int i = 0; i < SIZE; i++) { + sum += fm[i + row * SIZE] * fm2[i + col * SIZE]; + } + cExpected[row + col * DIM] += 1.0f * sum; + } + } + for (int i = 0; i < DIM * DIM; i++) { + Asserts.assertEquals(Float.floatToIntBits(c[i]), Float.floatToIntBits(cExpected[i])); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MUL_VD, "> 0", + IRNode.ADD_REDUCTION_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void testDgepdotTN(int rows, int rowe, int cols, int cole, int is, int ie, double alpha, double[] a, int lda, double[] b, int ldb, double[] c, int ldc) { + for (int col = cols; col < cole; col++) { + for (int row = rows; row < rowe; row++) { + double sum = 0.0; + for (int i = is; i < ie; i++) { + sum += a[i + row * lda] * b[i + col * ldb]; + } + c[row + col * ldc] += alpha * sum; + } + } + } + + @Run(test = "testDgepdotTN") + void runDgepdotTN() { + double[] c = new double[DIM * DIM]; + double[] cExpected = new double[DIM * DIM]; + for (int i = 0; i < DIM * DIM; i++) { + c[i] = RANDOM.nextDouble(); + cExpected[i] = c[i]; + } + testDgepdotTN(0, 3, 0, 3, 0, SIZE, 1.0, dm, SIZE, dm2, SIZE, c, DIM); + for (int 
col = 0; col < 3; col++) { + for (int row = 0; row < 3; row++) { + double sum = 0.0; + for (int i = 0; i < SIZE; i++) { + sum += dm[i + row * SIZE] * dm2[i + col * SIZE]; + } + cExpected[row + col * DIM] += 1.0 * sum; + } + } + for (int i = 0; i < DIM * DIM; i++) { + Asserts.assertEquals(Double.doubleToLongBits(c[i]), Double.doubleToLongBits(cExpected[i])); + } + } +} diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 5c085e6a3a343..0d8b41e331594 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -23,7 +23,7 @@ /* * @test id=no-vectorization - * @bug 8340093 8342095 + * @bug 8340093 8342095 8370677 * @summary Test vectorization of reduction loops. * @library /test/lib / * @run driver compiler.loopopts.superword.TestReductions P0 @@ -2159,12 +2159,8 @@ private static long longMaxBig() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VF, "= 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) - @IR(failOn = IRNode.LOAD_VECTOR_F, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. 
@@ -2183,12 +2179,8 @@ private static float floatAddSimple() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MUL_REDUCTION_VF, "> 0", IRNode.MUL_VF, "= 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) - @IR(failOn = IRNode.LOAD_VECTOR_F, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. @@ -2242,12 +2234,8 @@ private static float floatMaxSimple() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VF, "= 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) - @IR(failOn = IRNode.LOAD_VECTOR_F, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatAddDotProduct() { @@ -2263,12 +2251,8 @@ private static float floatAddDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MUL_REDUCTION_VF, "> 0", IRNode.MUL_VF, "> 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) - @IR(failOn = IRNode.LOAD_VECTOR_F, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. 
Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMulDotProduct() { @@ -2319,12 +2303,8 @@ private static float floatMaxDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VF, "> 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) - @IR(failOn = IRNode.LOAD_VECTOR_F, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatAddBig() { @@ -2340,12 +2320,8 @@ private static float floatAddBig() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MUL_REDUCTION_VF, "> 0", IRNode.MUL_VF, "> 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) - @IR(failOn = IRNode.LOAD_VECTOR_F, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. 
@IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMulBig() { @@ -2396,12 +2372,8 @@ private static float floatMaxBig() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.ADD_REDUCTION_VD, "> 0", IRNode.ADD_VD, "= 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) - @IR(failOn = IRNode.LOAD_VECTOR_D, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. @@ -2420,12 +2392,8 @@ private static double doubleAddSimple() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MUL_REDUCTION_VD, "> 0", IRNode.MUL_VD, "= 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) - @IR(failOn = IRNode.LOAD_VECTOR_D, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. 
@@ -2479,12 +2447,8 @@ private static double doubleMaxSimple() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VD, "= 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) - @IR(failOn = IRNode.LOAD_VECTOR_D, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleAddDotProduct() { @@ -2500,12 +2464,8 @@ private static double doubleAddDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MUL_REDUCTION_VD, "> 0", IRNode.MUL_VD, "> 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) - @IR(failOn = IRNode.LOAD_VECTOR_D, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMulDotProduct() { @@ -2556,12 +2516,8 @@ private static double doubleMaxDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VD, "> 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) - @IR(failOn = IRNode.LOAD_VECTOR_D, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. 
Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleAddBig() { @@ -2577,12 +2533,8 @@ private static double doubleAddBig() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MUL_REDUCTION_VD, "> 0", IRNode.MUL_VD, "> 0"}, - applyIfCPUFeature = {"sse4.1", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) - @IR(failOn = IRNode.LOAD_VECTOR_D, - applyIfCPUFeatureAnd = {"asimd", "true"}) - // I think this could vectorize, but currently does not. Filed: JDK-8370677 - // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMulBig() { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java index b328d4135ecfe..fd18fcc5cb4e6 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestVectorFPReduction.java @@ -50,8 +50,10 @@ public static void main(String[] args) { } @Test - @IR(failOn = {IRNode.ADD_REDUCTION_VF}, - applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VF, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}, + phase = CompilePhase.PRINT_IDEAL) @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VF, ">=1"}, failOn = {"no_strict_order"}, applyIfCPUFeatureOr = {"sve", "true", "sse2", "true", "rvv", "true"}, @@ -65,8 +67,10 @@ private static void testAddReductionVF() { } @Test - @IR(failOn = {IRNode.ADD_REDUCTION_VD}, - 
applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VD, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}, + phase = CompilePhase.PRINT_IDEAL) @IR(counts = {"requires_strict_order", ">=1", IRNode.ADD_REDUCTION_VD, ">=1"}, failOn = {"no_strict_order"}, applyIfCPUFeatureOr = {"sve", "true", "sse2", "true", "rvv", "true"}, @@ -80,8 +84,10 @@ private static void testAddReductionVD() { } @Test - @IR(failOn = {IRNode.MUL_REDUCTION_VF}, - applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VF, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}, + phase = CompilePhase.PRINT_IDEAL) @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VF, ">=1"}, failOn = {"no_strict_order"}, applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"}, @@ -95,8 +101,10 @@ private static void testMulReductionVF() { } @Test - @IR(failOn = {IRNode.MUL_REDUCTION_VD}, - applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}) + @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VD, ">=1"}, + failOn = {"no_strict_order"}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve", "false"}, + phase = CompilePhase.PRINT_IDEAL) @IR(counts = {"requires_strict_order", ">=1", IRNode.MUL_REDUCTION_VD, ">=1"}, failOn = {"no_strict_order"}, applyIfCPUFeatureOr = {"sve", "true", "sse2", "true"},