From 52dbe72837db858b5ad00d0b64a9fc539539ae7a Mon Sep 17 00:00:00 2001 From: Joao-Pedro-Cabral Date: Tue, 20 Jan 2026 18:32:37 +0000 Subject: [PATCH 1/2] Fixing reduction operations --- rtl/vproc_pipeline.sv | 4 ++++ rtl/vproc_pipeline_wrapper.sv | 36 ++++++++++++++++++++++++++--------- rtl/vproc_pkg.sv | 1 + rtl/vproc_unit_wrapper.sv | 3 ++- rtl/vproc_vregpack.sv | 25 ++++++++++++++++++++++++ 5 files changed, 59 insertions(+), 10 deletions(-) diff --git a/rtl/vproc_pipeline.sv b/rtl/vproc_pipeline.sv index e8ea027b..11ed565f 100644 --- a/rtl/vproc_pipeline.sv +++ b/rtl/vproc_pipeline.sv @@ -130,6 +130,7 @@ module vproc_pipeline import vproc_pkg::*; #( logic alt_last_cycle; logic init_addr; // initialize address (used by LSU) logic requires_flush; + logic red_op; logic [XIF_ID_W -1:0] id; op_unit unit; op_mode mode; @@ -241,6 +242,7 @@ module vproc_pipeline import vproc_pkg::*; #( state_next.first_cycle = 1'b1; state_next.init_addr = 1'b1; state_next.requires_flush = pipe_in_state_i.requires_flush; + state_next.red_op = pipe_in_state_i.red_op; state_next.id = pipe_in_state_i.id; state_next.unit = pipe_in_state_i.unit; state_next.mode = pipe_in_state_i.mode; @@ -679,6 +681,7 @@ module vproc_pipeline import vproc_pkg::*; #( logic last_cycle; logic init_addr; // initialize address (used by LSU) logic requires_flush; + logic red_op; logic alt_count_valid; // alternative counter value is valid logic [AUX_COUNTER_W-1:0] aux_count; logic [XIF_ID_W-1:0] id; @@ -711,6 +714,7 @@ module vproc_pipeline import vproc_pkg::*; #( (~FIELD_COUNT_USED | (state_q.field_count == '0)); unpack_ctrl.init_addr = state_q.init_addr; unpack_ctrl.requires_flush = state_q.requires_flush; + unpack_ctrl.red_op = state_q.red_op; unpack_ctrl.alt_count_valid = DONT_CARE_ZERO ? 
'0 : 'x; unique case (state_q.emul) EMUL_1: unpack_ctrl.alt_count_valid = state_q.alt_count.val[COUNTER_W-1 -: 4] == '0; diff --git a/rtl/vproc_pipeline_wrapper.sv b/rtl/vproc_pipeline_wrapper.sv index 94ae9adf..e04a2664 100644 --- a/rtl/vproc_pipeline_wrapper.sv +++ b/rtl/vproc_pipeline_wrapper.sv @@ -198,6 +198,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( count_inc_e count_inc; // counter increment policy logic [2:0] field_count_init; // field counter initial value logic requires_flush; // whether the instr requires flushing + logic red_op; // whether the instr is a reduction logic [XIF_ID_W -1:0] id; op_unit unit; op_mode mode; @@ -224,99 +225,115 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( assign unit_elem = UNITS[UNIT_ELEM] & (pipe_in_data_i.unit == UNIT_ELEM); // identify the type of data that vs2 supplies for ELEM instructions - logic elem_flush, elem_vs2_data, elem_vs2_mask, elem_vs2_dyn_addr; + logic elem_flush, red_op, elem_vs2_data, elem_vs2_mask, elem_vs2_dyn_addr; always_comb begin elem_flush = DONT_CARE_ZERO ? 1'b0 : 1'bx; + red_op = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_data = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_mask = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_dyn_addr = DONT_CARE_ZERO ? 
1'b0 : 1'bx; unique case (pipe_in_data_i.mode.elem.op) ELEM_XMV: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VPOPC: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VFIRST: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VID: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VIOTA: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VRGATHER: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b1; end ELEM_VCOMPRESS: begin elem_flush = 1'b1; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDSUM: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDAND: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDOR: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDXOR: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMINU: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMIN: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMAXU: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; 
elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMAX: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; @@ -403,6 +420,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( state_init.field_count_init = unit_lsu ? pipe_in_data_i.mode.lsu.nfields : '0; state_init.requires_flush = unit_elem & elem_flush; + state_init.red_op = red_op; state_init.id = pipe_in_data_i.id; state_init.unit = pipe_in_data_i.unit; state_init.mode = pipe_in_data_i.mode; diff --git a/rtl/vproc_pkg.sv b/rtl/vproc_pkg.sv index f448e77d..bc9749d3 100644 --- a/rtl/vproc_pkg.sv +++ b/rtl/vproc_pkg.sv @@ -332,6 +332,7 @@ typedef struct packed { logic narrow; logic saturate; logic sig; + logic red_op; logic [2:0] mul_idx; } pack_flags; diff --git a/rtl/vproc_unit_wrapper.sv b/rtl/vproc_unit_wrapper.sv index b631ec3b..3c53aab6 100644 --- a/rtl/vproc_unit_wrapper.sv +++ b/rtl/vproc_unit_wrapper.sv @@ -449,7 +449,8 @@ module vproc_unit_wrapper import vproc_pkg::*; #( default: ; endcase pipe_out_res_flags_o[0].elemwise = 1'b1; - pipe_out_res_store_o[0] = ((~unit_out_ctrl.mode.elem.xreg & unit_out_res_valid) | flushing_q) & (vd_count_d.part.low == '1); + pipe_out_res_flags_o[0].red_op = unit_out_ctrl.red_op; + pipe_out_res_store_o[0] = ((~unit_out_ctrl.mode.elem.xreg & unit_out_res_valid) | flushing_q) & (vd_count_d.part.low == '1 | unit_out_ctrl.red_op); pipe_out_res_valid_o[0] = flushing_q | unit_out_res_valid; pipe_out_res_data_o [0] = unit_out_res; pipe_out_res_mask_o [0][3:0] = flushing_q ? 
'0 : unit_out_mask; diff --git a/rtl/vproc_vregpack.sv b/rtl/vproc_vregpack.sv index 9c276cb0..6507e727 100644 --- a/rtl/vproc_vregpack.sv +++ b/rtl/vproc_vregpack.sv @@ -329,6 +329,31 @@ module vproc_vregpack #( res_buffer_next[i][VPORT_W -RES_W[i] -1:0] = res_buffer[i][VPORT_W -1:RES_W[i] ]; msk_buffer_next[i][VPORT_W/8-RES_W[i]/8-1:0] = msk_buffer[i][VPORT_W/8-1:RES_W[i]/8]; end + // For reduction operations, we write the reduction value directly into the lowest bits of the result + // buffer and set only the lowest bits of the mask buffer. + // We do this because the current Vicuna code has a bug: when the following instruction is executed + // in the same pipeline, the unit deque is not available for the new instruction because + // it is still processing the flush logic of the reduction/compress operation. + // This extra if removes the need for the flush logic for reduction operations, but it doesn't fix + // the problem for compress instructions. + if((RES_ALLOW_ELEMWISE[i] | RES_ALWAYS_ELEMWISE[i]) & pipe_in_res_flags_i[i].red_op) begin + msk_buffer_next[i] = '0; + unique case (pipe_in_eew_i) + VSEW_8: begin + res_buffer_next[i][7:0] = pipe_in_res_data_i[i][7 :0]; + msk_buffer_next[i][0] = pipe_in_res_mask_i[i][0]; + end + VSEW_16: begin + res_buffer_next[i][15:0] = pipe_in_res_data_i[i][15:0]; + msk_buffer_next[i][1:0] = {2{pipe_in_res_mask_i[i][0]}}; + end + VSEW_32: begin + res_buffer_next[i][31:0] = pipe_in_res_data_i[i][31:0]; + msk_buffer_next[i][3:0] = {4{pipe_in_res_mask_i[i][0]}}; + end + default: ; + endcase + end end end From d10948eccadde04858527177e8d196675e15910e Mon Sep 17 00:00:00 2001 From: Joao-Pedro-Cabral Date: Tue, 20 Jan 2026 22:07:19 +0000 Subject: [PATCH 2/2] Fixing vcompress for VREG_W >= 256 --- rtl/vproc_pipeline.sv | 25 +++++++++++-- rtl/vproc_pipeline_wrapper.sv | 44 ++++++++++++++++------- rtl/vproc_unit_mux.sv | 2 +- rtl/vproc_unit_wrapper.sv | 61 +++++++++++++++++++++++++++----- test/elem/test_configs.conf | 2 +- test/elem/vcompress_16.S
| 14 ++++---- test/elem/vcompress_32.S | 66 ++++++++++++++++++++++------------- test/elem/vcompress_8.S | 12 ++++--- 8 files changed, 167 insertions(+), 59 deletions(-) diff --git a/rtl/vproc_pipeline.sv b/rtl/vproc_pipeline.sv index 11ed565f..52578a35 100644 --- a/rtl/vproc_pipeline.sv +++ b/rtl/vproc_pipeline.sv @@ -180,6 +180,7 @@ module vproc_pipeline import vproc_pkg::*; #( counter_t count_next_inc, alt_count_next_inc; logic last_cycle_next, alt_last_cycle_next, wait_alt_count_next; logic [OP_CNT-1:0] op_load_next, op_shift_next; + logic vcompress_flushing; always_comb begin state_valid_d = state_valid_q; state_wait_alt_count_d = state_wait_alt_count_q; @@ -358,6 +359,9 @@ module vproc_pipeline import vproc_pkg::*; #( end default: ; endcase + if (state_q.requires_flush) begin + last_cycle_next &= vcompress_flushing; + end else begin // clear last cycle based on EMUL (note: the alt_last_cycle signal is not cleared here // as that is only required to indicate completion of one vreg cycle) unique case (state_q.emul) @@ -366,6 +370,7 @@ module vproc_pipeline import vproc_pkg::*; #( EMUL_8: last_cycle_next &= count_next_inc.part.mul[2:0] == '1; default: ; endcase + end if ((OP_ALT_COUNTER != '0) & state_q.count.part.sign) begin last_cycle_next = '0; end @@ -375,6 +380,20 @@ module vproc_pipeline import vproc_pkg::*; #( end end + // Extra cycles for flushing output in vcompress + always_comb begin + vcompress_flushing = 1'b0; + unique case (state_q.emul) + EMUL_1: vcompress_flushing = count_next_inc.part.mul[0] == 1'b1; + EMUL_2: vcompress_flushing = count_next_inc.part.mul[1] == 1'b1; + EMUL_4: vcompress_flushing = count_next_inc.part.mul[2] == 1'b1; + EMUL_8: vcompress_flushing = count_next_inc.part.sign == 1'b1; + default: ; + endcase + // Doesn't make sense to consider the vcompress_flushing when starting the pipe operation + vcompress_flushing &= ~pipe_in_ready_o; + end + // Operand load and shift signals counter_t [OP_CNT-1:0] op_count; always_comb begin @@ 
-401,7 +420,8 @@ module vproc_pipeline import vproc_pkg::*; #( else if (~aux_count_used | (state_next.aux_count == '0) | pipe_in_ready_o) begin if (~OP_MASK[i]) begin if ((op_count[i].part.low == '0) & - (~OP_NARROW[i] | ~state_next.op_flags[i].narrow | ~op_count[i].part.mul[0]) + (~OP_NARROW[i] | ~state_next.op_flags[i].narrow | ~op_count[i].part.mul[0]) & + (~state_next.requires_flush | ~vcompress_flushing) // We don't load ops for vcompress when flushing ) begin op_load_next[i] = OP_ALWAYS_VREG[i] | state_next.op_flags[i].vreg; @@ -596,7 +616,8 @@ module vproc_pipeline import vproc_pkg::*; #( end //else if (OP_ALT_COUNTER != '0) begin //end - else begin + // In the second part of vcompress, we don't generate extra pending reads + else if(~state_q.requires_flush | (~vcompress_flushing & ~state_done)) begin if (OP_ALWAYS_VREG[i] | state_q.op_flags[i].vreg) begin op_pend_reads[i] = DONT_CARE_ZERO ? '0 : 'x; unique case ({state_q.emul, OP_NARROW[i] & state_q.op_flags[i].narrow}) diff --git a/rtl/vproc_pipeline_wrapper.sv b/rtl/vproc_pipeline_wrapper.sv index e04a2664..34e8a6b4 100644 --- a/rtl/vproc_pipeline_wrapper.sv +++ b/rtl/vproc_pipeline_wrapper.sv @@ -97,19 +97,20 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( // by indices loaded by a previous operand. 
These operands have following indices (negative // indices must be added to the operand count): // - // | Idx | Type | Address | Units using it | Comment | - // +-----+------+----------+----------------+-----------------------------------------------+ - // | 0 | data | vs2 (vd) | all | Only MUL may change address to vd | - // | 1 | data | vs1 (vd) | all except SLD | Only LSU uses vd as address instead of vs1 | - // | 2 | data | vd/vs2 | MUL | MUL may use either vd or vs2 as address | - // | -3 | data | dynamic | ELEM | Index-based dynamic address within vreg group | - // | -2 | mask | vs2 | ELEM | Mask operand for some ELEM operations | - // | -1 | mask | v0 | all | Mask operand for masked operations | + // | Idx | Type | Address | Units using it | Comment | + // +-----+------+----------+----------------+--------------------------------------------------------------------+ + // | 0 | data | vs2 (vd) | all | Only MUL may change address to vd | + // | 1 | data | anything | all except SLD | LSU uses it as vd, vcompress as vs2, everyone else uses it as vs1 | + // | 2 | data | vd/vs2 | MUL | MUL may use either vd or vs2 as address | + // | -3 | data | dynamic | ELEM | Index-based dynamic address within vreg group | + // | -2 | mask | vs2(vs1) | ELEM | Mask operand for some ELEM operations | + // | -1 | mask | v0 | all | Mask operand for masked operations | // Operand count: // - default is 3 (indices 0, 1, and -1 from above table, required by almost all units) // - MUL unit additionally requires index 2, raising the operand count to a minimum of 4 // - ELEM unit additionally requires indices -3 and -2, hence a minimum of 5 operands + // - For index -2, vs2 is required by vpopc, vfirst and viota, while vs1 is required by vcompress // - if MUL and ELEM units are both present in same pipeline, then all 6 operands are required // - in case a pipeline contains only the SLD unit the operand count is 2 (indices 0 and -1) localparam int unsigned OP_CNT = UNITS[UNIT_MUL] ?
( @@ -225,11 +226,12 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( assign unit_elem = UNITS[UNIT_ELEM] & (pipe_in_data_i.unit == UNIT_ELEM); // identify the type of data that vs2 supplies for ELEM instructions - logic elem_flush, red_op, elem_vs2_data, elem_vs2_mask, elem_vs2_dyn_addr; + logic elem_flush, red_op, elem_vs2_data, elem_vs1_mask, elem_vs2_mask, elem_vs2_dyn_addr; always_comb begin elem_flush = DONT_CARE_ZERO ? 1'b0 : 1'bx; red_op = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_data = DONT_CARE_ZERO ? 1'b0 : 1'bx; + elem_vs1_mask = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_mask = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_dyn_addr = DONT_CARE_ZERO ? 1'b0 : 1'bx; unique case (pipe_in_data_i.mode.elem.op) @@ -237,6 +239,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b0; elem_vs2_data = 1'b1; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end @@ -244,6 +247,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b0; elem_vs2_data = 1'b0; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end @@ -251,6 +255,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b0; elem_vs2_data = 1'b0; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end @@ -258,6 +263,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b0; elem_vs2_data = 1'b0; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end @@ -265,6 +271,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b0; elem_vs2_data = 1'b0; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end @@ -272,6 +279,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b0; elem_vs2_data = 1'b0; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b1; end @@ -279,13 +287,15 @@ module 
vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b1; red_op = 1'b0; elem_vs2_data = 1'b0; - elem_vs2_mask = 1'b1; + elem_vs1_mask = 1'b1; + elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDSUM: begin elem_flush = 1'b0; red_op = 1'b1; elem_vs2_data = 1'b1; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end @@ -293,6 +303,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b1; elem_vs2_data = 1'b1; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end @@ -300,6 +311,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b1; elem_vs2_data = 1'b1; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end @@ -307,6 +319,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b1; elem_vs2_data = 1'b1; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end @@ -314,6 +327,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b1; elem_vs2_data = 1'b1; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end @@ -321,6 +335,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b1; elem_vs2_data = 1'b1; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end @@ -328,6 +343,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b1; elem_vs2_data = 1'b1; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end @@ -335,6 +351,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( elem_flush = 1'b0; red_op = 1'b1; elem_vs2_data = 1'b1; + elem_vs1_mask = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end @@ -501,16 +518,19 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( state_init.op_vaddr[(OP_CNT >= 3) ? 2 : 0] = pipe_in_data_i.mode.mul.op2_is_vd ? 
pipe_in_data_i.rs2.r.vaddr : pipe_in_data_i.rd.addr; end if (unit_elem) begin + // elem_vs1_mask is used for vcompress, where OP1_SRC is used to fetch vs2 and OP0_SRC fetches vs1 (mask) state_init.op_flags[0 ].vreg = pipe_in_data_i.rs2.vreg & elem_vs2_data; state_init.op_flags[0 ].elemwise = 1'b1; state_init.op_flags[0 ].sigext = pipe_in_data_i.mode.elem.sigext; + state_init.op_flags[1 ].vreg = pipe_in_data_i.rs1.vreg | (elem_vs1_mask & pipe_in_data_i.rs2.vreg); state_init.op_flags[1 ].elemwise = 1'b1; state_init.op_flags[1 ].narrow = 1'b0; // only op 0 can be narrow + state_init.op_vaddr[1 ] = elem_vs1_mask ? pipe_in_data_i.rs2.r.vaddr : pipe_in_data_i.rs1.r.vaddr; state_init.op_flags[(OP_CNT >= 3) ? OP_CNT-3 : 0].vreg = elem_vs2_dyn_addr; state_init.op_vaddr[(OP_CNT >= 3) ? OP_CNT-3 : 0] = pipe_in_data_i.rs2.r.vaddr; - state_init.op_flags[ OP_CNT-2 ].vreg = pipe_in_data_i.rs2.vreg & elem_vs2_mask; + state_init.op_flags[ OP_CNT-2 ].vreg = (pipe_in_data_i.rs2.vreg & elem_vs2_mask) | (pipe_in_data_i.rs1.vreg & elem_vs1_mask); state_init.op_flags[ OP_CNT-2 ].elemwise = 1'b1; - state_init.op_vaddr[ OP_CNT-2 ] = pipe_in_data_i.rs2.r.vaddr; + state_init.op_vaddr[ OP_CNT-2 ] = elem_vs1_mask ? 
pipe_in_data_i.rs1.r.vaddr : pipe_in_data_i.rs2.r.vaddr; state_init.op_flags[ OP_CNT-1 ].elemwise = 1'b1; end end diff --git a/rtl/vproc_unit_mux.sv b/rtl/vproc_unit_mux.sv index 95a32b0f..1e18938d 100644 --- a/rtl/vproc_unit_mux.sv +++ b/rtl/vproc_unit_mux.sv @@ -269,7 +269,7 @@ module vproc_unit_mux import vproc_pkg::*; #( pipe_out_res_mask_o = unit_out_res_mask [i]; pipe_out_pend_clear_o = unit_out_pend_clear [i]; pipe_out_pend_clear_cnt_o = unit_out_pend_clear_cnt[i]; - pipe_out_instr_done_o = unit_out_instr_done [i]; + pipe_out_instr_done_o = unit_out_instr_done [i] & unit_out_valid[i]; end end end diff --git a/rtl/vproc_unit_wrapper.sv b/rtl/vproc_unit_wrapper.sv index 3c53aab6..5a3fcb9c 100644 --- a/rtl/vproc_unit_wrapper.sv +++ b/rtl/vproc_unit_wrapper.sv @@ -311,19 +311,26 @@ module vproc_unit_wrapper import vproc_pkg::*; #( // ELEM unit's output buffer signals logic has_valid_result_q, has_valid_result_d; COUNTER_T vd_count_q, vd_count_d; + COUNTER_T vs_count_q, vs_count_d; logic flushing_q, flushing_d; logic [XIF_ID_W-1:0] flushing_id_q, flushing_id_d; vproc_pkg::cfg_vsew flushing_eew_q, flushing_eew_d; vproc_pkg::cfg_emul flushing_emul_q, flushing_emul_d; logic [4:0] flushing_vaddr_q, flushing_vaddr_d; + // flush the downstream part of the pipeline after the last cycle if needed + logic flush_finished_d, flush_finished_q; + // Symbolizes when we finish the compress operation and starts the flush process in vcompress + logic compress_last_cycle; always_ff @(posedge clk_i) begin if (pipe_out_ready_i) begin vd_count_q <= vd_count_d; + vs_count_q <= vs_count_d; has_valid_result_q <= has_valid_result_d; flushing_id_q <= flushing_id_d; flushing_eew_q <= flushing_eew_d; flushing_emul_q <= flushing_emul_d; flushing_vaddr_q <= flushing_vaddr_d; + flush_finished_q <= flush_finished_d; end end always_ff @(posedge clk_i or negedge async_rst_ni) begin @@ -352,6 +359,7 @@ module vproc_unit_wrapper import vproc_pkg::*; #( assign first_valid_result = ~flushing_q & 
res_valid & (unit_out_ctrl.first_cycle | ~has_valid_result_q); always_comb begin vd_count_d.val = DONT_CARE_ZERO ? '0 : 'x; + vs_count_d.val = DONT_CARE_ZERO ? '0 : 'x; unique case (flushing_q ? flushing_eew_q : unit_out_ctrl.eew) VSEW_8: vd_count_d.val = vd_count_q.val + {{(COUNTER_W-1){1'b0}}, flushing_q | res_valid }; VSEW_16: vd_count_d.val = vd_count_q.val + {{(COUNTER_W-2){1'b0}}, flushing_q | res_valid, 1'b0}; @@ -368,6 +376,22 @@ module vproc_unit_wrapper import vproc_pkg::*; #( default: ; endcase end + unique case (unit_out_ctrl.eew) + VSEW_8: vs_count_d.val = vs_count_q.val + {{(COUNTER_W-1){1'b0}}, ~flushing_q & unit_out_valid }; + VSEW_16: vs_count_d.val = vs_count_q.val + {{(COUNTER_W-2){1'b0}}, ~flushing_q & unit_out_valid, 1'b0}; + VSEW_32: vs_count_d.val = vs_count_q.val + {{(COUNTER_W-3){1'b0}}, ~flushing_q & unit_out_valid, 2'b0}; + default: ; + endcase + if (unit_out_ctrl.first_cycle) begin + vs_count_d.val = '0; + vs_count_d.val[1:0] = DONT_CARE_ZERO ? '0 : 'x; + unique case (unit_out_ctrl.eew) + VSEW_8: vs_count_d.val[1:0] = 2'b00; + VSEW_16: vs_count_d.val[1:0] = 2'b01; + VSEW_32: vs_count_d.val[1:0] = 2'b11; + default: ; + endcase + end end logic instr_speculative, instr_committed; @@ -388,27 +412,48 @@ module vproc_unit_wrapper import vproc_pkg::*; #( endcase end + always_comb begin + compress_last_cycle = 1'b0; + unique case (unit_out_ctrl.eew) + VSEW_8: compress_last_cycle = vs_count_d.val[COUNTER_W-5:0] == '1; + VSEW_16: compress_last_cycle = vs_count_d.val[COUNTER_W-5:1] == '1; + VSEW_32: compress_last_cycle = vs_count_d.val[COUNTER_W-5:2] == '1; + default: ; + endcase + unique case (unit_out_ctrl.emul) + EMUL_2: compress_last_cycle &= vs_count_d.part.mul[ 0] == '1; + EMUL_4: compress_last_cycle &= vs_count_d.part.mul[1:0] == '1; + EMUL_8: compress_last_cycle &= vs_count_d.part.mul[2:0] == '1; + default: ; + endcase + // We're flushing, so no compress more + if (flushing_q) begin + compress_last_cycle = 1'b0; + end + end + assign 
unit_out_stall = unit_out_xreg_valid & (instr_speculative | ~xreg_ready_i); - // flush the downstream part of the pipeline after the last cycle if needed - logic flushing_last_cycle; always_comb begin flushing_d = flushing_q; flushing_id_d = flushing_id_q; flushing_eew_d = flushing_eew_q; flushing_emul_d = flushing_emul_q; flushing_vaddr_d = flushing_vaddr_q; - flushing_last_cycle = 1'b0; - if (~flushing_q & unit_out_valid & unit_out_ctrl.last_cycle & unit_out_ctrl.requires_flush) begin + flush_finished_d = flush_finished_q; + if (~flushing_q & ~flush_finished_d & unit_out_valid & compress_last_cycle & unit_out_ctrl.requires_flush) begin flushing_d = 1'b1; flushing_id_d = unit_out_ctrl.id; flushing_eew_d = unit_out_ctrl.eew; flushing_emul_d = unit_out_ctrl.emul; flushing_vaddr_d = unit_out_ctrl.res_vaddr; end + if (unit_out_valid & unit_out_ctrl.last_cycle) begin + flush_finished_d = 1'b0; + end if (flushing_q & (vd_count_d.part.low == '1)) begin flushing_d = 1'b0; - flushing_last_cycle = 1'b1; + flush_finished_d = 1'b1; end end @@ -451,12 +496,12 @@ module vproc_unit_wrapper import vproc_pkg::*; #( pipe_out_res_flags_o[0].elemwise = 1'b1; pipe_out_res_flags_o[0].red_op = unit_out_ctrl.red_op; pipe_out_res_store_o[0] = ((~unit_out_ctrl.mode.elem.xreg & unit_out_res_valid) | flushing_q) & (vd_count_d.part.low == '1 | unit_out_ctrl.red_op); - pipe_out_res_valid_o[0] = flushing_q | unit_out_res_valid; + pipe_out_res_valid_o[0] = flushing_q | (unit_out_res_valid & ~flush_finished_q); pipe_out_res_data_o [0] = unit_out_res; pipe_out_res_mask_o [0][3:0] = flushing_q ? 
'0 : unit_out_mask; end - assign pipe_out_instr_done_o = (~flushing_q & unit_out_ctrl.last_cycle & ~unit_out_ctrl.requires_flush ) | flushing_last_cycle; - assign pipe_out_pend_clear_o = (~flushing_q & unit_out_ctrl.last_cycle & ~unit_out_ctrl.requires_flush & ~unit_out_ctrl.mode.elem.xreg) | flushing_last_cycle; + assign pipe_out_instr_done_o = unit_out_ctrl.last_cycle; // We only finish after the state being done + assign pipe_out_pend_clear_o = (~flushing_q & unit_out_ctrl.last_cycle & ~unit_out_ctrl.requires_flush & ~unit_out_ctrl.mode.elem.xreg) | flush_finished_d; assign pipe_out_pend_clear_cnt_o = unit_out_ctrl.emul; // TODO reductions always have destination EMUL == 1 end endgenerate diff --git a/test/elem/test_configs.conf b/test/elem/test_configs.conf index c22da48d..f698c009 100644 --- a/test/elem/test_configs.conf +++ b/test/elem/test_configs.conf @@ -1,2 +1,2 @@ -VPROC_CONFIG=dual VREG_W=128 VMEM_W=32 +VPROC_CONFIG=dual VREG_W=256 VMEM_W=32 VPROC_CONFIG=dual VREG_W=512 VMEM_W=256 ICACHE_SZ=8192 DCACHE_SZ=65536 MEM_LATENCY=5 diff --git a/test/elem/vcompress_16.S b/test/elem/vcompress_16.S index ccf89f51..80901fa6 100644 --- a/test/elem/vcompress_16.S +++ b/test/elem/vcompress_16.S @@ -11,10 +11,12 @@ main: li t0, 9 vsetvli t0, t0, e16,m2,tu,mu - vle16.v v0, (a0) - vmv.v.x v4, x0 - vcompress.vm v4, v0, v0 - vse16.v v4, (a0) + vle16.v v4, (a0) + vmv.v.x v2, x0 + li t0, 0x0123 + vmv.v.x v0, t0 + vcompress.vm v2, v4, v0 + vse16.v v2, (a0) la a0, vdata_start la a1, vdata_end @@ -65,8 +67,8 @@ vdata_end: .global vref_end vref_start: .word 0x323b3f47 - .word 0x4a514b3a - .word 0x0000383b + .word 0x383b302f + .word 0x00000000 .word 0x00000000 .word 0x3f440000 .word 0x37424d54 diff --git a/test/elem/vcompress_32.S b/test/elem/vcompress_32.S index d7599146..ddc52f4d 100644 --- a/test/elem/vcompress_32.S +++ b/test/elem/vcompress_32.S @@ -8,13 +8,15 @@ main: la a0, vdata_start - li t0, 13 - vsetvli t0, t0, e32,m4,tu,mu + li t0, 40 + vsetvli t0, t0, e32,m8,tu,mu - 
vle32.v v0, (a0) - vmv.v.x v4, x0 - vcompress.vm v4, v0, v0 - vse32.v v4, (a0) + vle32.v v16, (a0) + vmv.v.x v8, x0 + li t0, 0x2F704A1F + vmv.v.x v0, t0 + vcompress.vm v8, v16, v0 + vse32.v v8, (a0) la a0, vdata_start la a1, vdata_end @@ -58,6 +60,14 @@ vdata_start: .word 0x454c4342 .word 0x40504a3f .word 0x4448535a + .word 0x323b3f47 + .word 0x47434b3a + .word 0x302f2e32 + .word 0xe8404a51 + .word 0x3f44383b + .word 0x37424d54 + .word 0x5e4b5049 + .word 0x4c4c4c4a vdata_end: .align 10 @@ -67,33 +77,41 @@ vref_start: .word 0x323b3f47 .word 0x47434b3a .word 0x302f2e32 - .word 0x5e4b5049 - .word 0x4a505f3e + .word 0xe8404a51 + .word 0x3f44383b .word 0x485e5455 - .word 0x4d4c4a41 .word 0x373b5451 - .word 0x41454c45 - .word 0x00000000 - .word 0x00000000 - .word 0x00000000 - .word 0x00000000 - .word 0x3a3e3738 .word 0x312f2e2f - .word 0x3d433f45 - .word 0x46424949 - .word 0x494a4d51 - .word 0x49413c38 - .word 0x3e514143 .word 0x47525353 .word 0x514e5052 .word 0x525a5b58 - .word 0x5e575254 .word 0x56545058 .word 0x5a534947 .word 0x4744544f .word 0x4e515051 - .word 0x5a4b4545 .word 0x454c4342 - .word 0x40504a3f - .word 0x4448535a + .word 0x323b3f47 + .word 0x47434b3a + .word 0x302f2e32 + .word 0xe8404a51 + .word 0x3f44383b + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 vref_end: diff --git a/test/elem/vcompress_8.S b/test/elem/vcompress_8.S index 65f489eb..3bb95984 100644 --- a/test/elem/vcompress_8.S +++ b/test/elem/vcompress_8.S @@ -11,9 +11,11 @@ main: li t0, 16 vsetvli t0, t0, e8,m1,tu,mu - vle8.v v0, (a0) + vle8.v v4, (a0) vmv.v.x v2, x0 - vcompress.vm v2, v0, v0 + li t0, 0xAAAAAAAA + vmv.v.x v0, t0 + vcompress.vm v2, v4, v0 vse8.v v2, (a0) la a0, vdata_start 
@@ -64,9 +66,9 @@ vdata_end: .global vref_start .global vref_end vref_start: - .word 0x433b3f47 - .word 0x302f2e32 - .word 0x00004a51 + .word 0x474b323f + .word 0xe84a302e + .word 0x00000000 .word 0x00000000 .word 0x3f44383b .word 0x37424d54