diff --git a/rtl/vproc_pipeline.sv b/rtl/vproc_pipeline.sv index e8ea027b..11ed565f 100644 --- a/rtl/vproc_pipeline.sv +++ b/rtl/vproc_pipeline.sv @@ -130,6 +130,7 @@ module vproc_pipeline import vproc_pkg::*; #( logic alt_last_cycle; logic init_addr; // initialize address (used by LSU) logic requires_flush; + logic red_op; logic [XIF_ID_W -1:0] id; op_unit unit; op_mode mode; @@ -241,6 +242,7 @@ module vproc_pipeline import vproc_pkg::*; #( state_next.first_cycle = 1'b1; state_next.init_addr = 1'b1; state_next.requires_flush = pipe_in_state_i.requires_flush; + state_next.red_op = pipe_in_state_i.red_op; state_next.id = pipe_in_state_i.id; state_next.unit = pipe_in_state_i.unit; state_next.mode = pipe_in_state_i.mode; @@ -679,6 +681,7 @@ module vproc_pipeline import vproc_pkg::*; #( logic last_cycle; logic init_addr; // initialize address (used by LSU) logic requires_flush; + logic red_op; logic alt_count_valid; // alternative counter value is valid logic [AUX_COUNTER_W-1:0] aux_count; logic [XIF_ID_W-1:0] id; @@ -711,6 +714,7 @@ module vproc_pipeline import vproc_pkg::*; #( (~FIELD_COUNT_USED | (state_q.field_count == '0)); unpack_ctrl.init_addr = state_q.init_addr; unpack_ctrl.requires_flush = state_q.requires_flush; + unpack_ctrl.red_op = state_q.red_op; unpack_ctrl.alt_count_valid = DONT_CARE_ZERO ? 
'0 : 'x; unique case (state_q.emul) EMUL_1: unpack_ctrl.alt_count_valid = state_q.alt_count.val[COUNTER_W-1 -: 4] == '0; diff --git a/rtl/vproc_pipeline_wrapper.sv b/rtl/vproc_pipeline_wrapper.sv index 94ae9adf..e04a2664 100644 --- a/rtl/vproc_pipeline_wrapper.sv +++ b/rtl/vproc_pipeline_wrapper.sv @@ -198,6 +198,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( count_inc_e count_inc; // counter increment policy logic [2:0] field_count_init; // field counter initial value logic requires_flush; // whether the instr requires flushing + logic red_op; // whether the instr is a reduction logic [XIF_ID_W -1:0] id; op_unit unit; op_mode mode; @@ -224,99 +225,115 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( assign unit_elem = UNITS[UNIT_ELEM] & (pipe_in_data_i.unit == UNIT_ELEM); // identify the type of data that vs2 supplies for ELEM instructions - logic elem_flush, elem_vs2_data, elem_vs2_mask, elem_vs2_dyn_addr; + logic elem_flush, red_op, elem_vs2_data, elem_vs2_mask, elem_vs2_dyn_addr; always_comb begin elem_flush = DONT_CARE_ZERO ? 1'b0 : 1'bx; + red_op = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_data = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_mask = DONT_CARE_ZERO ? 1'b0 : 1'bx; elem_vs2_dyn_addr = DONT_CARE_ZERO ? 
1'b0 : 1'bx; unique case (pipe_in_data_i.mode.elem.op) ELEM_XMV: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VPOPC: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VFIRST: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VID: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VIOTA: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VRGATHER: begin elem_flush = 1'b0; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b1; end ELEM_VCOMPRESS: begin elem_flush = 1'b1; + red_op = 1'b0; elem_vs2_data = 1'b0; elem_vs2_mask = 1'b1; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDSUM: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDAND: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDOR: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDXOR: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMINU: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMIN: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMAXU: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; 
elem_vs2_dyn_addr = 1'b0; end ELEM_VREDMAX: begin - elem_flush = 1'b1; + elem_flush = 1'b0; + red_op = 1'b1; elem_vs2_data = 1'b1; elem_vs2_mask = 1'b0; elem_vs2_dyn_addr = 1'b0; @@ -403,6 +420,7 @@ module vproc_pipeline_wrapper import vproc_pkg::*; #( state_init.field_count_init = unit_lsu ? pipe_in_data_i.mode.lsu.nfields : '0; state_init.requires_flush = unit_elem & elem_flush; + state_init.red_op = unit_elem & red_op; /* gated like requires_flush: red_op is decoded from mode.elem.op, which is only meaningful for ELEM instructions */ state_init.id = pipe_in_data_i.id; state_init.unit = pipe_in_data_i.unit; state_init.mode = pipe_in_data_i.mode; diff --git a/rtl/vproc_pkg.sv b/rtl/vproc_pkg.sv index f448e77d..bc9749d3 100644 --- a/rtl/vproc_pkg.sv +++ b/rtl/vproc_pkg.sv @@ -332,6 +332,7 @@ typedef struct packed { logic narrow; logic saturate; logic sig; + logic red_op; logic [2:0] mul_idx; } pack_flags; diff --git a/rtl/vproc_unit_wrapper.sv b/rtl/vproc_unit_wrapper.sv index b631ec3b..3c53aab6 100644 --- a/rtl/vproc_unit_wrapper.sv +++ b/rtl/vproc_unit_wrapper.sv @@ -449,7 +449,8 @@ module vproc_unit_wrapper import vproc_pkg::*; #( default: ; endcase pipe_out_res_flags_o[0].elemwise = 1'b1; - pipe_out_res_store_o[0] = ((~unit_out_ctrl.mode.elem.xreg & unit_out_res_valid) | flushing_q) & (vd_count_d.part.low == '1); + pipe_out_res_flags_o[0].red_op = unit_out_ctrl.red_op; + pipe_out_res_store_o[0] = ((~unit_out_ctrl.mode.elem.xreg & unit_out_res_valid) | flushing_q) & ((vd_count_d.part.low == '1) | unit_out_ctrl.red_op); pipe_out_res_valid_o[0] = flushing_q | unit_out_res_valid; pipe_out_res_data_o [0] = unit_out_res; pipe_out_res_mask_o [0][3:0] = flushing_q ? 
'0 : unit_out_mask; diff --git a/rtl/vproc_vregpack.sv b/rtl/vproc_vregpack.sv index 9c276cb0..6507e727 100644 --- a/rtl/vproc_vregpack.sv +++ b/rtl/vproc_vregpack.sv @@ -329,6 +329,31 @@ module vproc_vregpack #( res_buffer_next[i][VPORT_W -RES_W[i] -1:0] = res_buffer[i][VPORT_W -1:RES_W[i] ]; msk_buffer_next[i][VPORT_W/8-RES_W[i]/8-1:0] = msk_buffer[i][VPORT_W/8-1:RES_W[i]/8]; end + // For reduction operations, write the reduction value directly into the lowest bits of the result buffer + // and enable only the lowest bits of the mask buffer (all other mask bits are cleared). + // Rationale: the current Vicuna code has a bug when the following instruction is executed + // in the same pipeline: the unit deque is not available for the new instruction because + // it is still processing the flush logic of the preceding reduction/compress operation. + // This extra branch removes the need for the flush logic for reduction operations, but it does not fix + // the problem for compress instructions. + if((RES_ALLOW_ELEMWISE[i] | RES_ALWAYS_ELEMWISE[i]) & pipe_in_res_flags_i[i].red_op) begin + msk_buffer_next[i] = '0; + unique case (pipe_in_eew_i) + VSEW_8: begin + res_buffer_next[i][7:0] = pipe_in_res_data_i[i][7 :0]; + msk_buffer_next[i][0] = pipe_in_res_mask_i[i][0]; + end + VSEW_16: begin + res_buffer_next[i][15:0] = pipe_in_res_data_i[i][15:0]; + msk_buffer_next[i][1:0] = {2{pipe_in_res_mask_i[i][0]}}; + end + VSEW_32: begin + res_buffer_next[i][31:0] = pipe_in_res_data_i[i][31:0]; + msk_buffer_next[i][3:0] = {4{pipe_in_res_mask_i[i][0]}}; + end + default: ; + endcase + end end end