Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 54 additions & 15 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
Original file line number Diff line number Diff line change
Expand Up @@ -136,17 +136,7 @@ source %{
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
opcode == Op_MulVL ||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
// Note: we could implement sequential reductions for these reduction operators, but
// this will still almost never lead to speedups, because the sequential
// reductions are latency limited along the reduction chain, and not
// throughput limited. This is unlike unordered reductions (associative op)
// and element-wise ops which are usually throughput limited.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
opcode == Op_MulVL) {
return false;
}
}
Expand Down Expand Up @@ -3377,6 +3367,40 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{

// reduction addF

// Strictly-ordered add reduction for a 2-element float vector on NEON-only
// machines (UseSVE == 0). The lanes are accumulated sequentially —
// dst = (dst_src1 + vsrc[0]) + vsrc[1] — so the result conforms to the
// evaluation order required when requires_strict_order() is true
// (e.g. AddReductionVF produced by auto-vectorization).
instruct reduce_strict_order_add2F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
predicate(UseSVE == 0 &&
Matcher::vector_length(n->in(2)) == 2 &&
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add2F_neon $dst_src1, $dst_src1, $vsrc\t# 2F, strict order" %}
ins_encode %{
// Scalar FADD reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// Move lane 1 of $vsrc into lane 0 of $tmp so it can be added as a scalar.
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
// dst += vsrc[1], completing the sequential accumulation.
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

// Strictly-ordered add reduction for a 4-element float vector on NEON-only
// machines (UseSVE == 0). Each lane is added one at a time —
// dst = (((dst_src1 + vsrc[0]) + vsrc[1]) + vsrc[2]) + vsrc[3] — preserving
// the sequential evaluation order required when requires_strict_order() is
// true (e.g. AddReductionVF produced by auto-vectorization).
instruct reduce_strict_order_add4F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
predicate(UseSVE == 0 &&
Matcher::vector_length(n->in(2)) == 4 &&
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add4F_neon $dst_src1, $dst_src1, $vsrc\t# 4F, strict order" %}
ins_encode %{
// Scalar FADD reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// For each remaining lane i (1..3): copy vsrc[i] into lane 0 of $tmp,
// then add it to the running scalar accumulator.
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
Expand Down Expand Up @@ -3415,8 +3439,9 @@ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vR
// 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by
// auto-vectorization on SVE machine.
instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
predicate(UseSVE > 0 &&
(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order()));
match(Set dst_src1 (AddReductionVF dst_src1 src2));
format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
Expand All @@ -3430,6 +3455,19 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{

// reduction addD

// Strictly-ordered add reduction for a 2-element double vector on NEON-only
// machines (UseSVE == 0). No vector-length check is needed: 2 doubles is the
// only NEON double-vector shape matched here. Lanes are accumulated
// sequentially — dst = (dst_src1 + vsrc[0]) + vsrc[1] — to satisfy
// requires_strict_order() (e.g. AddReductionVD from auto-vectorization).
instruct reduce_strict_order_add2D_neon(vRegD dst_src1, vReg vsrc, vRegD tmp) %{
predicate(UseSVE == 0 && n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVD dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add2D_neon $dst_src1, $dst_src1, $vsrc\t# 2D, strict order" %}
ins_encode %{
// Scalar FADD (double) reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// Move lane 1 of $vsrc into lane 0 of $tmp so it can be added as a scalar.
__ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);
// dst += vsrc[1], completing the sequential accumulation.
__ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
Expand All @@ -3453,8 +3491,9 @@ instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
// 2. Strictly-ordered AddReductionVD. For example - AddReductionVD generated by
// auto-vectorization on SVE machine.
instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
predicate(UseSVE > 0 &&
(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order()));
match(Set dst_src1 (AddReductionVD dst_src1 src2));
format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
Expand Down
64 changes: 51 additions & 13 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,7 @@ source %{
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
opcode == Op_MulVL ||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
// Note: we could implement sequential reductions for these reduction operators, but
// this will still almost never lead to speedups, because the sequential
// reductions are latency limited along the reduction chain, and not
// throughput limited. This is unlike unordered reductions (associative op)
// and element-wise ops which are usually throughput limited.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
opcode == Op_MulVL) {
return false;
}
}
Expand Down Expand Up @@ -2034,6 +2024,40 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL)

// reduction addF

// m4 template source of the strictly-ordered 2-element float add reduction
// for NEON-only machines (UseSVE == 0); keep in sync with the generated
// aarch64_vector.ad. Lanes are accumulated sequentially —
// dst = (dst_src1 + vsrc[0]) + vsrc[1] — to satisfy requires_strict_order().
instruct reduce_strict_order_add2F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
predicate(UseSVE == 0 &&
Matcher::vector_length(n->in(2)) == 2 &&
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add2F_neon $dst_src1, $dst_src1, $vsrc\t# 2F, strict order" %}
ins_encode %{
// Scalar FADD reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// Move lane 1 of $vsrc into lane 0 of $tmp, then add it as a scalar.
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

// m4 template source of the strictly-ordered 4-element float add reduction
// for NEON-only machines (UseSVE == 0); keep in sync with the generated
// aarch64_vector.ad. Lanes are added one at a time —
// dst = (((dst_src1 + vsrc[0]) + vsrc[1]) + vsrc[2]) + vsrc[3] — to satisfy
// requires_strict_order().
instruct reduce_strict_order_add4F_neon(vRegF dst_src1, vReg vsrc, vRegF tmp) %{
predicate(UseSVE == 0 &&
Matcher::vector_length(n->in(2)) == 4 &&
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVF dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add4F_neon $dst_src1, $dst_src1, $vsrc\t# 4F, strict order" %}
ins_encode %{
// Scalar FADD reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// For each remaining lane i (1..3): copy vsrc[i] into lane 0 of $tmp,
// then add it to the running scalar accumulator.
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);
__ fadds($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
Expand Down Expand Up @@ -2075,8 +2099,9 @@ define(`REDUCE_ADD_FP_SVE', `
// 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by
// auto-vectorization on SVE machine.
instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
predicate(UseSVE > 0 &&
(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order()));
match(Set dst_src1 (AddReductionV$1 dst_src1 src2));
format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
Expand All @@ -2092,6 +2117,19 @@ REDUCE_ADD_FP_SVE(F, S)

// reduction addD

// m4 template source of the strictly-ordered 2-element double add reduction
// for NEON-only machines (UseSVE == 0); keep in sync with the generated
// aarch64_vector.ad. Lanes are accumulated sequentially —
// dst = (dst_src1 + vsrc[0]) + vsrc[1] — to satisfy requires_strict_order().
instruct reduce_strict_order_add2D_neon(vRegD dst_src1, vReg vsrc, vRegD tmp) %{
predicate(UseSVE == 0 && n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVD dst_src1 vsrc));
effect(TEMP tmp);
format %{ "reduce_strict_order_add2D_neon $dst_src1, $dst_src1, $vsrc\t# 2D, strict order" %}
ins_encode %{
// Scalar FADD (double) reads lane 0 of $vsrc: dst = dst_src1 + vsrc[0].
__ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $vsrc$$FloatRegister);
// Move lane 1 of $vsrc into lane 0 of $tmp, then add it as a scalar.
__ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);
__ faddd($dst_src1$$FloatRegister, $dst_src1$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,46 +76,6 @@ private static void testMulVL() {
}
}

// IR-framework test: compiles a sequential float add-reduction loop and,
// per the @IR rule, fails if an ADD_REDUCTION_VF node appears in the IR.
// Float addition is non-associative, so the loop's result depends on
// evaluation order.
@Test
@IR(failOn = {IRNode.ADD_REDUCTION_VF})
private static void testAddReductionVF() {
float result = 1;
for(int i = 0; i < SIZE; i++) {
result += (floata[i] + floatb[i]);
}
// Accumulate into a field so the loop result is observably used.
fresult += result;
}

// IR-framework test: compiles a sequential double add-reduction loop and,
// per the @IR rule, fails if an ADD_REDUCTION_VD node appears in the IR.
// Double addition is non-associative, so the loop's result depends on
// evaluation order.
@Test
@IR(failOn = {IRNode.ADD_REDUCTION_VD})
private static void testAddReductionVD() {
double result = 1;
for(int i = 0; i < SIZE; i++) {
result += (doublea[i] + doubleb[i]);
}
// Accumulate into a field so the loop result is observably used.
dresult += result;
}

// IR-framework test: compiles a sequential float multiply-reduction loop
// and, per the @IR rule, fails if a MUL_REDUCTION_VF node appears in the
// IR. Note the loop multiplies the accumulator by an element-wise sum.
@Test
@IR(failOn = {IRNode.MUL_REDUCTION_VF})
private static void testMulReductionVF() {
float result = 1;
for(int i = 0; i < SIZE; i++) {
result *= (floata[i] + floatb[i]);
}
// Accumulate into a field so the loop result is observably used.
fresult += result;
}

// IR-framework test: compiles a sequential double multiply-reduction loop
// and, per the @IR rule, fails if a MUL_REDUCTION_VD node appears in the
// IR. Note the loop multiplies the accumulator by an element-wise sum.
@Test
@IR(failOn = {IRNode.MUL_REDUCTION_VD})
private static void testMulReductionVD() {
double result = 1;
for(int i = 0; i < SIZE; i++) {
result *= (doublea[i] + doubleb[i]);
}
// Accumulate into a field so the loop result is observably used.
dresult += result;
}

@Test
@IR(failOn = {IRNode.COUNT_TRAILING_ZEROS_VL})
public void testNumberOfTrailingZeros() {
Expand Down
Loading