From 152d7dfcaa3ee526f3a80281956f12d285492b35 Mon Sep 17 00:00:00 2001 From: Joao-Pedro-Cabral Date: Tue, 20 Jan 2026 18:32:37 +0000 Subject: [PATCH 1/2] Fixing reduction operations --- rtl/vproc_pipeline.sv | 4 ++++ rtl/vproc_pipeline_wrapper.sv | 36 ++++++++++++++++++++++++++--------- rtl/vproc_pkg.sv | 1 + rtl/vproc_unit_wrapper.sv | 3 ++- rtl/vproc_vregpack.sv | 25 ++++++++++++++++++++++++ 5 files changed, 59 insertions(+), 10 deletions(-) diff --git a/rtl/vproc_pipeline.sv b/rtl/vproc_pipeline.sv index e8ea027b..11ed565f 100644 --- a/rtl/vproc_pipeline.sv +++ b/rtl/vproc_pipeline.sv @@ -130,6 +130,7 @@ module vproc_pipeline import vproc_pkg::*; #( logic alt_last_cycle; logic init_addr; // initialize address (used by LSU) logic requires_flush; + logic red_op; logic [XIF_ID_W -1:0] id; op_unit unit; op_mode mode; @@ -241,6 +242,7 @@ module vproc_pipeline import vproc_pkg::*; #( state_next.first_cycle = 1'b1; state_next.init_addr = 1'b1; state_next.requires_flush = pipe_in_state_i.requires_flush; + state_next.red_op = pipe_in_state_i.red_op; state_next.id = pipe_in_state_i.id; state_next.unit = pipe_in_state_i.unit; state_next.mode = pipe_in_state_i.mode; @@ -679,6 +681,7 @@ module vproc_pipeline import vproc_pkg::*; #( logic last_cycle; logic init_addr; // initialize address (used by LSU) logic requires_flush; + logic red_op; logic alt_count_valid; // alternative counter value is valid logic [AUX_COUNTER_W-1:0] aux_count; logic [XIF_ID_W-1:0] id; @@ -711,6 +714,7 @@ module vproc_pipeline import vproc_pkg::*; #( (~FIELD_COUNT_USED | (state_q.field_count == '0)); unpack_ctrl.init_addr = state_q.init_addr; unpack_ctrl.requires_flush = state_q.requires_flush; + unpack_ctrl.red_op = state_q.red_op; unpack_ctrl.alt_count_valid = DONT_CARE_ZERO ? 
'0 : 'x; unique case (state_q.emul) EMUL_1: unpack_ctrl.alt_count_valid = state_q.alt_count.val[COUNTER_W-1 -: 4] == '0; diff --git a/rtl/vproc_pipeline_wrapper.sv b/rtl/vproc_pipeline_wrapper.sv index 94ae9adf..e04a2664 100644 --- a/rtl/vproc_pipeline_wrapper.sv +++ b/rtl/vproc_pipeline_wrapper.sv @@ -198,6 +198,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( count_inc_e count_inc; // counter increment policy logic [2:0] field_count_init; // field counter initial value logic requires_flush; // whether the instr requires flushing + logic red_op; // whether the instr is a reduction logic [XIF_ID_W -1:0] id; op_unit unit; op_mode mode; @@ -224,99 +225,115 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( assign unit_elem = UNITS[UNIT_ELEM] & (pipe_in_data_i.unit == UNIT_ELEM); // identify the type of data that vs2 supplies for ELEM instructions - logic elem_flush, elem_vs2_data, elem_vs2_mask, elem_vs2_dyn_addr; + logic elem_flush, red_op, elem_vs2_data, elem_vs2_mask, elem_vs2_dyn_addr; always_comb begin elem_flush = DONT_CARE_ZERO ? 1'b0 : 1'bx; + red_op = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_data = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_mask = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_dyn_addr = DONT_CARE_ZERO ? 
1'b0 : 1'bx; unique case (pipe_in_data_i.mode.elem.op) ELEM_XMV: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VPOPC: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VFIRST: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VID: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VIOTA: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VRGATHER: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b1; end ELEM_VCOMPRESS: begin elem_flush = 1'b1; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDSUM: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDAND: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDOR: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDXOR: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMINU: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMIN: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMAXU: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; 
elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMAX: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; @@ -403,6 +420,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( state_init.field_count_init = unit_lsu ? pipe_in_data_i.mode.lsu.nfields : '0; state_init.requires_flush = unit_elem & elem_flush; + state_init.red_op = red_op; state_init.id = pipe_in_data_i.id; state_init.unit = pipe_in_data_i.unit; state_init.mode = pipe_in_data_i.mode; diff --git a/rtl/vproc_pkg.sv b/rtl/vproc_pkg.sv index f448e77d..bc9749d3 100644 --- a/rtl/vproc_pkg.sv +++ b/rtl/vproc_pkg.sv @@ -332,6 +332,7 @@ typedef struct packed { logic narrow; logic saturate; logic sig; + logic red_op; logic [2:0] mul_idx; } pack_flags; diff --git a/rtl/vproc_unit_wrapper.sv b/rtl/vproc_unit_wrapper.sv index b631ec3b..3c53aab6 100644 --- a/rtl/vproc_unit_wrapper.sv +++ b/rtl/vproc_unit_wrapper.sv @@ -449,7 +449,8 @@ module vproc_unit_wrapper import vproc_pkg::*; #( default: ; endcase pipe_out_res_flags_o[0].elemwise = 1'b1; - pipe_out_res_store_o[0] = ((~unit_out_ctrl.mode.elem.xreg & unit_out_res_valid) | flushing_q) & (vd_count_d.part.low == '1); + pipe_out_res_flags_o[0].red_op = unit_out_ctrl.red_op; + pipe_out_res_store_o[0] = ((~unit_out_ctrl.mode.elem.xreg & unit_out_res_valid) | flushing_q) & (vd_count_d.part.low == '1 | unit_out_ctrl.red_op); pipe_out_res_valid_o[0] = flushing_q | unit_out_res_valid; pipe_out_res_data_o [0] = unit_out_res; pipe_out_res_mask_o [0][3:0] = flushing_q ? 
'0 : unit_out_mask; diff --git a/rtl/vproc_vregpack.sv b/rtl/vproc_vregpack.sv index 9c276cb0..6507e727 100644 --- a/rtl/vproc_vregpack.sv +++ b/rtl/vproc_vregpack.sv @@ -329,6 +329,31 @@ module vproc_vregpack #( res_buffer_next[i][VPORT_W -RES_W[i] -1:0] = res_buffer[i][VPORT_W -1:RES_W[i] ]; msk_buffer_next[i][VPORT_W/8-RES_W[i]/8-1:0] = msk_buffer[i][VPORT_W/8-1:RES_W[i]/8]; end + // For reduction operations, the reduction value is written directly into the lowest bits of the + // result buffer, and only the lowest bits of the mask buffer are set. + // This works around a bug in the current Vicuna code: when the following instruction is executed + // in the same pipeline, the unit deque is not available for the new instruction because + // it is still processing the flush logic for the reduction/compress operations. + // This extra if removes the need for the flush logic for reduction operations, but does not fix + // the problem for compress instructions. + if((RES_ALLOW_ELEMWISE[i] | RES_ALWAYS_ELEMWISE[i]) & pipe_in_res_flags_i[i].red_op) begin + msk_buffer_next[i] = '0; + unique case (pipe_in_eew_i) + VSEW_8: begin + res_buffer_next[i][7:0] = pipe_in_res_data_i[i][7 :0]; + msk_buffer_next[i][0] = pipe_in_res_mask_i[i][0]; + end + VSEW_16: begin + res_buffer_next[i][15:0] = pipe_in_res_data_i[i][15:0]; + msk_buffer_next[i][1:0] = {2{pipe_in_res_mask_i[i][0]}}; + end + VSEW_32: begin + res_buffer_next[i][31:0] = pipe_in_res_data_i[i][31:0]; + msk_buffer_next[i][3:0] = {4{pipe_in_res_mask_i[i][0]}}; + end + default: ; + endcase + end end end From 449b1bb5928141ef0792bb8551e7c6e41bada216 Mon Sep 17 00:00:00 2001 From: Joao-Pedro-Cabral Date: Tue, 20 Jan 2026 22:39:17 +0000 Subject: [PATCH 2/2] Fixing vnsrl[a] for LMUL < 1 --- rtl/vproc_pending_wr.sv | 1 + rtl/vproc_pipeline.sv | 6 +++++- rtl/vproc_unit_mux.sv | 5 +++++ rtl/vproc_unit_wrapper.sv | 6 ++++++ rtl/vproc_vregpack.sv | 9 ++++++++- 5 files changed, 25 insertions(+), 2 deletions(-) diff --git a/rtl/vproc_pending_wr.sv
b/rtl/vproc_pending_wr.sv index 3902c779..7a164416 100644 --- a/rtl/vproc_pending_wr.sv +++ b/rtl/vproc_pending_wr.sv @@ -45,6 +45,7 @@ module vproc_pending_wr #( end else begin unique case ({emul_i, widenarrow_i == OP_NARROWING}) {EMUL_1, 1'b0}, + {EMUL_1, 1'b1}, {EMUL_2, 1'b1}: begin pend_vd = rd_i.vreg ? (32'h00000001 << rd_i.addr ) : 32'b0; end diff --git a/rtl/vproc_pipeline.sv b/rtl/vproc_pipeline.sv index 11ed565f..677cf99b 100644 --- a/rtl/vproc_pipeline.sv +++ b/rtl/vproc_pipeline.sv @@ -468,7 +468,8 @@ module vproc_pipeline import vproc_pkg::*; #( if ((count_next_inc.part.low == '0) & ((OP_ALT_COUNTER == '0) | ~state_q.count.part.sign) & ((RES_ALWAYS_VREG | state_q.res_vreg) != '0) // at least one valid vreg ) begin - res_store = ((RES_NARROW & state_q.res_narrow) == '0) | ~count_next_inc.part.mul[0]; + // state_q.last_cycle is necessary for narrowing instructions with EMUL < 1 + res_store = ((RES_NARROW & state_q.res_narrow) == '0) | ~count_next_inc.part.mul[0] | state_q.last_cycle; end // Shifting is delayed by one cycle compared to the store and hence uses the current counter if ((state_q.count.val & ~({COUNTER_W{1'b1}} << $clog2(RES_W[0] / COUNTER_OP_W))) == '0) begin @@ -862,6 +863,7 @@ module vproc_pipeline import vproc_pkg::*; #( logic unit_out_ready; logic [XIF_ID_W -1:0] unit_out_instr_id; vproc_pkg::cfg_vsew unit_out_eew; + vproc_pkg::cfg_emul unit_out_emul; logic [4:0] unit_out_vaddr; logic unit_out_res_vaddr; logic [RES_CNT-1:0] unit_out_res_store; @@ -900,6 +902,7 @@ module vproc_pipeline import vproc_pkg::*; #( .pipe_out_ready_i ( unit_out_ready ), .pipe_out_instr_id_o ( unit_out_instr_id ), .pipe_out_eew_o ( unit_out_eew ), + .pipe_out_emul_o ( unit_out_emul ), .pipe_out_vaddr_o ( unit_out_vaddr ), .pipe_out_res_store_o ( unit_out_res_store ), .pipe_out_res_valid_o ( unit_out_res_valid ), @@ -952,6 +955,7 @@ module vproc_pipeline import vproc_pkg::*; #( .pipe_in_ready_o ( unit_out_ready ), .pipe_in_instr_id_i ( unit_out_instr_id ), 
.pipe_in_eew_i ( unit_out_eew ), + .pipe_in_emul_i ( unit_out_emul ), .pipe_in_vaddr_i ( unit_out_vaddr ), .pipe_in_res_store_i ( unit_out_res_store ), .pipe_in_res_valid_i ( unit_out_res_valid ), diff --git a/rtl/vproc_unit_mux.sv b/rtl/vproc_unit_mux.sv index 95a32b0f..f093e38d 100644 --- a/rtl/vproc_unit_mux.sv +++ b/rtl/vproc_unit_mux.sv @@ -34,6 +34,7 @@ module vproc_unit_mux import vproc_pkg::*; #( input logic pipe_out_ready_i, output logic [XIF_ID_W -1:0] pipe_out_instr_id_o, output cfg_vsew pipe_out_eew_o, + output cfg_emul pipe_out_emul_o, output logic [4:0] pipe_out_vaddr_o, output logic [RES_CNT-1:0] pipe_out_res_store_o, output logic [RES_CNT-1:0] pipe_out_res_valid_o, @@ -87,6 +88,7 @@ module vproc_unit_mux import vproc_pkg::*; #( logic [UNIT_CNT-1:0] unit_out_ready; logic [UNIT_CNT-1:0][XIF_ID_W -1:0] unit_out_instr_id; cfg_vsew [UNIT_CNT-1:0] unit_out_eew; + cfg_emul [UNIT_CNT-1:0] unit_out_emul; logic [UNIT_CNT-1:0][4:0] unit_out_vaddr; logic [UNIT_CNT-1:0][RES_CNT-1:0] unit_out_res_store; logic [UNIT_CNT-1:0][RES_CNT-1:0] unit_out_res_valid; @@ -148,6 +150,7 @@ module vproc_unit_mux import vproc_pkg::*; #( .pipe_out_ready_i ( unit_out_ready [i] ), .pipe_out_instr_id_o ( unit_out_instr_id [i] ), .pipe_out_eew_o ( unit_out_eew [i] ), + .pipe_out_emul_o ( unit_out_emul [i] ), .pipe_out_vaddr_o ( unit_out_vaddr [i] ), .pipe_out_res_store_o ( unit_out_res_store [i] ), .pipe_out_res_valid_o ( unit_out_res_valid [i] ), @@ -247,6 +250,7 @@ module vproc_unit_mux import vproc_pkg::*; #( pipe_out_valid_o = '0; pipe_out_instr_id_o = DONT_CARE_ZERO ? '0 : 'x ; pipe_out_eew_o = DONT_CARE_ZERO ? cfg_vsew' ('0) : cfg_vsew' ('x) ; + pipe_out_emul_o = DONT_CARE_ZERO ? cfg_emul' ('0) : cfg_emul' ('x) ; pipe_out_vaddr_o = DONT_CARE_ZERO ? '0 : 'x ; pipe_out_res_store_o = DONT_CARE_ZERO ? '0 : 'x ; pipe_out_res_valid_o = DONT_CARE_ZERO ? 
'0 : 'x ; @@ -261,6 +265,7 @@ module vproc_unit_mux import vproc_pkg::*; #( pipe_out_valid_o = unit_out_valid [i]; pipe_out_instr_id_o = unit_out_instr_id [i]; pipe_out_eew_o = unit_out_eew [i]; + pipe_out_emul_o = unit_out_emul [i]; pipe_out_vaddr_o = unit_out_vaddr [i]; pipe_out_res_store_o = unit_out_res_store [i]; pipe_out_res_valid_o = unit_out_res_valid [i]; diff --git a/rtl/vproc_unit_wrapper.sv b/rtl/vproc_unit_wrapper.sv index 3c53aab6..415ddd4f 100644 --- a/rtl/vproc_unit_wrapper.sv +++ b/rtl/vproc_unit_wrapper.sv @@ -34,6 +34,7 @@ module vproc_unit_wrapper import vproc_pkg::*; #( input logic pipe_out_ready_i, output logic [XIF_ID_W -1:0] pipe_out_instr_id_o, output cfg_vsew pipe_out_eew_o, + output cfg_emul pipe_out_emul_o, output logic [4:0] pipe_out_vaddr_o, output logic [RES_CNT-1:0] pipe_out_res_store_o, output logic [RES_CNT-1:0] pipe_out_res_valid_o, @@ -111,6 +112,7 @@ module vproc_unit_wrapper import vproc_pkg::*; #( always_comb begin pipe_out_instr_id_o = unit_out_ctrl.id; pipe_out_eew_o = unit_out_ctrl.eew; + pipe_out_emul_o = unit_out_ctrl.emul; pipe_out_vaddr_o = unit_out_ctrl.res_vaddr; pipe_out_res_store_o = '0; pipe_out_res_valid_o = '0; @@ -156,6 +158,7 @@ module vproc_unit_wrapper import vproc_pkg::*; #( always_comb begin pipe_out_instr_id_o = unit_out_ctrl.id; pipe_out_eew_o = unit_out_ctrl.eew; + pipe_out_emul_o = unit_out_ctrl.emul; pipe_out_vaddr_o = unit_out_ctrl.res_vaddr; pipe_out_res_store_o = '0; pipe_out_res_valid_o = '0; @@ -211,6 +214,7 @@ module vproc_unit_wrapper import vproc_pkg::*; #( always_comb begin pipe_out_instr_id_o = unit_out_ctrl.id; pipe_out_eew_o = unit_out_ctrl.eew; + pipe_out_emul_o = unit_out_ctrl.emul; pipe_out_vaddr_o = unit_out_ctrl.res_vaddr; pipe_out_res_store_o = '0; pipe_out_res_valid_o = '0; @@ -253,6 +257,7 @@ module vproc_unit_wrapper import vproc_pkg::*; #( always_comb begin pipe_out_instr_id_o = unit_out_ctrl.id; pipe_out_eew_o = unit_out_ctrl.eew; + pipe_out_emul_o = unit_out_ctrl.emul; 
pipe_out_vaddr_o = unit_out_ctrl.res_vaddr; pipe_out_res_store_o = '0; pipe_out_res_valid_o = '0; @@ -428,6 +433,7 @@ module vproc_unit_wrapper import vproc_pkg::*; #( always_comb begin pipe_out_instr_id_o = flushing_q ? flushing_id_q : unit_out_ctrl.id; pipe_out_eew_o = flushing_q ? flushing_eew_q : unit_out_ctrl.eew; + pipe_out_emul_o = flushing_q ? flushing_emul_q : unit_out_ctrl.emul; pipe_out_vaddr_o = DONT_CARE_ZERO ? '0 : 'x; unique case (flushing_q ? flushing_emul_q : unit_out_ctrl.emul) EMUL_1: pipe_out_vaddr_o = base_vaddr; diff --git a/rtl/vproc_vregpack.sv b/rtl/vproc_vregpack.sv index 6507e727..231b1114 100644 --- a/rtl/vproc_vregpack.sv +++ b/rtl/vproc_vregpack.sv @@ -32,6 +32,7 @@ module vproc_vregpack #( output logic pipe_in_ready_o, input logic [INSTR_ID_W -1:0] pipe_in_instr_id_i, // ID of instruction input vproc_pkg::cfg_vsew pipe_in_eew_i, // current elem width + input vproc_pkg::cfg_emul pipe_in_emul_i, // current mul width input logic [VADDR_W -1:0] pipe_in_vaddr_i, // vreg address input logic [RES_CNT-1:0] pipe_in_res_store_i, // result store signal input logic [RES_CNT-1:0] pipe_in_res_valid_i, // result is valid @@ -321,9 +322,15 @@ module vproc_vregpack #( // by default, retain current value for lower part and assign default value for upper part res_buffer_next[i] = {res_default, res_buffer[i][VPORT_W -RES_W[i] -1:0]}; msk_buffer_next[i] = {msk_default, msk_buffer[i][VPORT_W/8-RES_W[i]/8-1:0]}; + // For narrow with EMUL == 1, we need to put the new res(msk) at the middle of the buffer + if(RES_NARROW[i] & pipe_in_res_flags_i[i].narrow & pipe_in_emul_i == EMUL_1) begin + msk_buffer_next[i][VPORT_W/8-1:VPORT_W/8-RES_W[i]/8] = {RES_W[i]/8{1'b0}}; + res_buffer_next[i][VPORT_W/2-1-:RES_W[i]] = {res_default[RES_W[i]-1:RES_W[i]/2], res_buffer[i][VPORT_W/2-1-:RES_W[i]/2]}; + msk_buffer_next[i][VPORT_W/16-1-:RES_W[i]/8] = {msk_default[RES_W[i]/8-1:RES_W[i]/16], msk_buffer[i][VPORT_W/16-1-:RES_W[i]/16]}; + end // shift signal shifts entire content 
right by the width of the result; full-size results // shift every cycle - if ((~RES_MASK[i] & ~RES_NARROW[i] & ~RES_ALLOW_ELEMWISE[i] & ~RES_ALWAYS_ELEMWISE[i]) | + else if ((~RES_MASK[i] & ~RES_NARROW[i] & ~RES_ALLOW_ELEMWISE[i] & ~RES_ALWAYS_ELEMWISE[i]) | pipe_in_res_flags_i[i].shift ) begin res_buffer_next[i][VPORT_W -RES_W[i] -1:0] = res_buffer[i][VPORT_W -1:RES_W[i] ];