diff --git a/do_sim.sh b/do_sim.sh new file mode 100644 index 000000000..b82ec5392 --- /dev/null +++ b/do_sim.sh @@ -0,0 +1,4 @@ +make questasim-sim FUSESOC_PARAM="--X_EXT=1" +cd ./build/openhwgroup.org_systems_core-v-mini-mcu_0/sim-modelsim/ +make run-gui PLUSARGS="c firmware=../../../sw/build/main.hex" +cd ../../.. \ No newline at end of file diff --git a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv index 74d7a277a..7414e84e9 100644 --- a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv +++ b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv @@ -8,7 +8,8 @@ package quadrilatero_pkg; parameter int unsigned N_REGS = 8; parameter int unsigned DATA_WIDTH = 32; parameter int unsigned BUS_WIDTH = 128; - parameter int unsigned MESH_WIDTH = 4; + parameter int unsigned MESH_WIDTH = 8; // register dimension in elements (RLEN = DATA_WIDTH * MESH_WIDTH) + parameter int unsigned SA_MESH_WIDTH = 4; // systolic array dimension, set independently of MESH_WIDTH parameter int unsigned NUM_EXEC_UNITS = 3; // change me to add units parameter int unsigned MAX_NUM_READ_OPERANDS = 3; parameter int unsigned MAX_NUM_WRITE_OPERANDS = 1; @@ -17,8 +18,14 @@ package quadrilatero_pkg; parameter int unsigned RF_READ_PORTS = 4; parameter int unsigned RF_WRITE_PORTS = 3; - localparam int unsigned N_ROWS = MESH_WIDTH ; - localparam int unsigned RLEN = DATA_WIDTH * MESH_WIDTH; + localparam int unsigned RLEN = DATA_WIDTH * MESH_WIDTH; + localparam int unsigned ALEN = 128; + localparam int unsigned LLEN = 128; + localparam int unsigned LEN = ALEN; + localparam int unsigned N_ROWS = LEN / DATA_WIDTH ; // replaces the former N_ROWS = MESH_WIDTH; TODO(review): confirm LEN / DATA_WIDTH is the intended row count + localparam int unsigned N_TILES = (RLEN/LEN)**2; // NOTE(review): presumably the number of LEN-bit tiles per register, RLEN/LEN along each side -- confirm + localparam int unsigned TILE_ADDR = (RLEN/LEN) == 1?
0: RLEN/LEN; + localparam int unsigned N_IREGS = N_REGS * N_TILES; typedef enum logic [2:0] { @@ -54,9 +61,8 @@ package quadrilatero_pkg; } lsu_conf_t; typedef struct packed { - logic [xif_pkg::X_ID_WIDTH-1:0] id; - logic rvalid; - logic wready; + logic [xif_pkg::X_ID_WIDTH-1:0] id; + logic valid; } rw_queue_t; localparam int unsigned WR_PORT = (WRITE_PORTS > 1) ? $clog2(WRITE_PORTS) : 1; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv index 5a4d062a1..77452023a 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv @@ -126,29 +126,29 @@ module quadrilatero // RF Sequencer - logic [quadrilatero_pkg::READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_REGS)-1:0] rf_seq_raddr_from_fu ; + logic [quadrilatero_pkg::READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] rf_seq_raddr_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_ROWS)-1:0] rf_seq_rrowaddr_from_fu; - logic [quadrilatero_pkg::READ_PORTS-1 :0][quadrilatero_pkg::RLEN-1:0] rf_seq_rdata_from_fu ; + logic [quadrilatero_pkg::READ_PORTS-1 :0][quadrilatero_pkg::LEN-1:0] rf_seq_rdata_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rvalid_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rlast_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rready_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0][xif_pkg::X_ID_WIDTH-1:0] rf_seq_rd_id_from_fu ; - logic [quadrilatero_pkg::WRITE_PORTS-1 :0][$clog2(quadrilatero_pkg::N_REGS)-1:0] rf_seq_waddr_from_fu ; + logic [quadrilatero_pkg::WRITE_PORTS-1 :0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] rf_seq_waddr_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0][$clog2(quadrilatero_pkg::N_ROWS)-1:0] rf_seq_wrowaddr_from_fu; - logic [quadrilatero_pkg::WRITE_PORTS-1 :0][quadrilatero_pkg::RLEN-1:0] rf_seq_wdata_from_fu ; + logic [quadrilatero_pkg::WRITE_PORTS-1 :0][quadrilatero_pkg::LEN-1:0] 
rf_seq_wdata_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_we_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_wlast_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_wready_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0][xif_pkg::X_ID_WIDTH-1:0] rf_seq_wr_id_from_fu ; - logic [quadrilatero_pkg::RF_READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_REGS)-1:0] rf_seq_raddr_to_rf ; + logic [quadrilatero_pkg::RF_READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] rf_seq_raddr_to_rf ; logic [quadrilatero_pkg::RF_READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_ROWS)-1:0] rf_seq_rrowaddr_to_rf ; - logic [quadrilatero_pkg::RF_READ_PORTS-1 :0][quadrilatero_pkg::RLEN-1:0] rf_seq_rdata_to_rf ; + logic [quadrilatero_pkg::RF_READ_PORTS-1 :0][quadrilatero_pkg::LEN-1:0] rf_seq_rdata_to_rf ; - logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_REGS)-1:0] rf_seq_waddr_to_rf ; + logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] rf_seq_waddr_to_rf ; logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_ROWS)-1:0] rf_seq_wrowaddr_to_rf ; - logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0][quadrilatero_pkg::RLEN-1:0] rf_seq_wdata_to_rf ; + logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0][quadrilatero_pkg::LEN-1:0] rf_seq_wdata_to_rf ; logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0] rf_seq_we_to_rf ; quadrilatero_pkg::rw_queue_t [quadrilatero_pkg::N_REGS-1:0] rf_seq_rw_queue_entry ; @@ -170,30 +170,30 @@ module quadrilatero logic sa_weight_rdata_ready; logic sa_weight_rlast ; logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id ; - logic [quadrilatero_pkg::RLEN-1:0] sa_weight_rdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_weight_raddr ; + logic [quadrilatero_pkg::LEN-1:0] sa_weight_rdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] sa_weight_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_weight_rrowaddr ; logic sa_data_rdata_valid ; logic sa_data_rdata_ready ; logic 
sa_data_rlast ; logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id ; - logic [quadrilatero_pkg::RLEN-1:0] sa_data_rdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_data_raddr ; + logic [quadrilatero_pkg::LEN-1:0] sa_data_rdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] sa_data_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_data_rrowaddr ; logic sa_acc_rdata_valid ; logic sa_acc_rdata_ready ; logic sa_acc_rlast ; - logic [quadrilatero_pkg::RLEN-1:0] sa_acc_rdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_acc_raddr ; + logic [quadrilatero_pkg::LEN-1:0] sa_acc_rdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] sa_acc_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_acc_rrowaddr ; logic sa_res_we ; logic sa_res_wready ; logic sa_res_wlast ; - logic [quadrilatero_pkg::RLEN-1:0] sa_res_wdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_res_waddr ; + logic [quadrilatero_pkg::LEN-1:0] sa_res_wdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] sa_res_waddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_res_wrowaddr ; logic sa_finished ; @@ -226,15 +226,15 @@ module quadrilatero logic lsu_wlast ; logic lsu_wready ; logic [xif_pkg::X_ID_WIDTH-1:0] lsu_id ; - logic [quadrilatero_pkg::RLEN-1:0] lsu_wdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] lsu_waddr ; + logic [quadrilatero_pkg::LEN-1:0] lsu_wdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] lsu_waddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] lsu_wrowaddr ; logic lsu_rlast ; logic lsu_rready ; logic lsu_rvalid ; - logic [quadrilatero_pkg::RLEN-1:0] lsu_rdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] lsu_raddr ; + logic [quadrilatero_pkg::LEN-1:0] lsu_rdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] lsu_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] lsu_rrowaddr ; logic lsu_busy ; @@ -254,8 +254,8 @@ module quadrilatero logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_id ; logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_instr_id ; logic 
[xif_pkg::X_ID_WIDTH-1:0] perm_unit_finished_instr_id; - logic [quadrilatero_pkg::RLEN-1:0] perm_unit_wdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] perm_unit_waddr ; + logic [quadrilatero_pkg::LEN-1:0] perm_unit_wdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] perm_unit_waddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] perm_unit_wrowaddr ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] perm_unit_reg ; @@ -671,7 +671,7 @@ module quadrilatero ); quadrilatero_systolic_array #( - .MESH_WIDTH(MESH_WIDTH), + .MESH_WIDTH(quadrilatero_pkg::SA_MESH_WIDTH), .FPU (FPU ) ) sa_inst ( .clk_i , @@ -774,6 +774,7 @@ module quadrilatero // To Register Loader .busy_i (lsu_busy | x_res_almost_full), // Load Unit busy + .finished_i (lsu_finished), .start_o (lsu_ctrl_start ), // .issued_instr_o (lsu_ctrl_issued_instr ), // issued instruction .issued_instr_conf_o (lsu_ctrl_issued_instr_conf ) // issued instruction configuration diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv index 2e858cc3c..611502f70 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv @@ -168,8 +168,8 @@ module quadrilatero_dispatcher #( delta = 3'b0; for(int ii = 0; ii < N_REGS; ii++) begin - delta += {2'b0, rw_queue_entry_o[ii].rvalid}; - delta += {2'b0, rw_queue_entry_o[ii].wready}; + delta += {2'b0, rvalid[ii]}; + delta += {2'b0, wready[ii]}; end done = (delta == outstanding_op_q); @@ -210,10 +210,9 @@ module quadrilatero_dispatcher #( rvalid[rreg_q[2]] |= reg3_valid &~ ld_reg3; wready[wreg_q ] = regw_valid &~ ld_regw; for(int ii = 0; ii < N_REGS; ii++) begin - rw_queue_entry_o[ii].rvalid = rvalid[ii]; - rw_queue_entry_o[ii].wready = wready[ii]; rw_queue_entry_o[ii].id = instr_id_q; - rw_queue_push_o [ii] = rw_queue_entry_o[ii].rvalid | rw_queue_entry_o[ii].wready; + rw_queue_push_o [ii] = rvalid[ii] | wready[ii]; + 
rw_queue_entry_o[ii].valid = rvalid[ii] | wready[ii]; end end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_ff_fs_dr_stage.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_ff_fs_dr_stage.sv index 2b2600dac..0c327f8c5 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_ff_fs_dr_stage.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_ff_fs_dr_stage.sv @@ -12,7 +12,7 @@ module quadrilatero_ff_fs_dr_stage #( parameter DATA_WIDTH = 32, parameter N_REGS = 8, localparam N_ROWS = MESH_WIDTH, - localparam RLEN = DATA_WIDTH * MESH_WIDTH + localparam ALEN = DATA_WIDTH * MESH_WIDTH ) ( input logic clk_i, input logic rst_ni, @@ -28,7 +28,7 @@ module quadrilatero_ff_fs_dr_stage #( // Data Read Register Port output logic [$clog2(N_REGS)-1:0] data_raddr_o, output logic [$clog2(N_ROWS)-1:0] data_rrowaddr_o, - input logic [RLEN-1:0] data_rdata_i, + input logic [ALEN-1:0] data_rdata_i, input logic data_rdata_valid_i, output logic data_rdata_ready_o, output logic data_rlast_o, @@ -36,7 +36,7 @@ module quadrilatero_ff_fs_dr_stage #( // Accumulator Read Register Port output logic [$clog2(N_REGS)-1:0] acc_raddr_o, output logic [$clog2(N_ROWS)-1:0] acc_rrowaddr_o, - input logic [RLEN-1:0] acc_rdata_i, + input logic [ALEN-1:0] acc_rdata_i, input logic acc_rdata_valid_i, output logic acc_rdata_ready_o, output logic acc_rlast_o, @@ -44,7 +44,7 @@ module quadrilatero_ff_fs_dr_stage #( // Accumulator Out Write Register Port output logic [$clog2(N_REGS)-1:0] res_waddr_o, output logic [$clog2(N_ROWS)-1:0] res_wrowaddr_o, - output logic [ RLEN-1:0] res_wdata_o, + output logic [ ALEN-1:0] res_wdata_o, output logic res_we_o, output logic res_wlast_o, input logic res_wready_i, @@ -82,7 +82,7 @@ module quadrilatero_ff_fs_dr_stage #( logic [ $clog2(N_REGS)-1:0] n_res_waddr; logic [ $clog2(N_ROWS)-1:0] n_res_wrowaddr; - logic [ RLEN-1:0] n_res_wdata; + logic [ ALEN-1:0] n_res_wdata; logic n_res_we; logic [ $clog2(N_REGS)-1:0] data_reg_ff; // data register diff --git 
a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv index cb5f9b2ff..3b0163568 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv @@ -52,6 +52,8 @@ module quadrilatero_lsu #( localparam int unsigned DEPTH = (FIFO_DEPTH > 0) ? FIFO_DEPTH - 1 : 0; localparam int unsigned Addr_Fifo_Depth = (FIFO_DEPTH > 1) ? $clog2(FIFO_DEPTH) : 1; localparam int unsigned LastFifoUsage = DEPTH - 1; + localparam int unsigned LastRow = quadrilatero_pkg::MESH_WIDTH-1; + localparam int unsigned LastCol = quadrilatero_pkg::TILE_ADDR-1; logic terminate ; @@ -100,7 +102,10 @@ module quadrilatero_lsu #( logic store_fifo_empty ; logic [ DATA_WIDTH-1:0] store_fifo_output ; logic store_fifo_pop ; - + logic [$clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] row_counter_d; + logic [$clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] row_counter_q; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1:0] col_counter_q; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1:0] col_counter_d; enum { LSU_READY, @@ -128,24 +133,44 @@ module quadrilatero_lsu #( always_comb begin : ctrl_block terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING)); - load_fifo_valid_o = rd_valid_d; + load_fifo_valid_o = rd_valid_d | rd_valid_q; busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; terminate_o = terminate; end always_comb begin : addr_block src_ptr_inc = DATA_WIDTH / 8; - addr_op2 = (cols_q == '0) ? stride_i : src_ptr_inc; - addr = (start_i || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1))) ? src_ptr_i : ptr_q + addr_op2; + addr_op2 = (stride_i * row_counter_q) + (src_ptr_inc * col_counter_q); + addr = (start_i || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1))) ? src_ptr_i : src_ptr_i + addr_op2; ptr_d = (data_gnt_i && data_req_o) ? 
addr : ptr_q; end always_comb begin : counters_block rows_d = rows_q; cols_d = cols_q; + row_counter_d = row_counter_q; // default: hold; advanced only on a granted request below + col_counter_d = col_counter_q; // default: hold; advanced only on a granted request below if(start_i) begin if(data_gnt_i && data_req_o) begin + if(quadrilatero_pkg::TILE_ADDR != 0) begin + if(col_counter_q == LastCol) begin + col_counter_d = '0; + if(row_counter_q == LastRow) begin + row_counter_d = '0; + end else begin + row_counter_d = row_counter_q + 1; + end + end else begin + col_counter_d = col_counter_q + 1; // next state derived from the registered value, like row_counter_d + end + end else begin + if(row_counter_q == LastRow) begin + row_counter_d = '0; + end else begin + row_counter_d = row_counter_q + 1; + end + end if(cols_i > 1) begin rows_d = rows_i - 1; cols_d = cols_i - 2; @@ -158,6 +183,24 @@ module quadrilatero_lsu #( cols_d = cols_i - 1; end end else if (data_gnt_i && data_req_o) begin + if(quadrilatero_pkg::TILE_ADDR != 0) begin + if(col_counter_q == LastCol) begin + col_counter_d = '0; + if(row_counter_q == LastRow) begin + row_counter_d = '0; + end else begin + row_counter_d = row_counter_q + 1; + end + end else begin + col_counter_d = col_counter_q + 1; // next state derived from the registered value, like row_counter_d + end + end else begin + if(row_counter_q == LastRow) begin + row_counter_d = '0; + end else begin + row_counter_d = row_counter_q + 1; + end + end if (cols_q > 0) cols_d = cols_q - 1; else if (rows_q > 0) begin cols_d = cols_i - 1; @@ -296,6 +339,8 @@ module quadrilatero_lsu #( rd_head_q <= '0 ; rd_valid_q <= '0 ; data_we_q <= '0 ; + row_counter_q <= '0 ; + col_counter_q <= '0 ; end else begin lsu_state_q <= lsu_state_d; ptr_q <= ptr_d ; @@ -304,7 +349,9 @@ module quadrilatero_lsu #( rd_head_q <= rd_head_d ; rd_valid_q <= rd_valid_d ; data_we_q <= data_we_d ; + row_counter_q <= row_counter_d; + col_counter_q <= col_counter_d; end end endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv index 615dbe5bf..5444ab137 100644 ---
a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv @@ -14,9 +14,9 @@ module quadrilatero_perm_unit #( input logic rst_ni , // Register Write Port - output logic [ $clog2(N_REGS)-1:0] waddr_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_o , output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [ RLEN-1:0] wdata_o , + output logic [ quadrilatero_pkg::LEN-1:0] wdata_o , output logic we_o , output logic wlast_o , input logic wready_i , // to stall the request in case the port is busy @@ -58,8 +58,13 @@ module quadrilatero_perm_unit #( logic mask_req ; logic fifo_full ; logic fifo_empty ; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1 : 0] row_counter_d; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1 : 0] row_counter_q; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1 : 0] cols_counter_d; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1 : 0] cols_counter_q; localparam int unsigned USAGE = DEPTH > 1 : $clog2(DEPTH) : 0; + localparam int unsigned TILES = RLEN / quadrilatero_pkg::LEN; logic [USAGE:0] fifo_usage; logic fifo_almost_full; //---------------------------------------------------------------------------------------------------------- @@ -86,7 +91,8 @@ module quadrilatero_perm_unit #( always_comb begin : ctrl_block mask_req = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & finished_q & ~finished_ack_i; - finished = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & write_started_q & wready_i ; + finished = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) && (row_counter_q == TILES-1) && (cols_counter_q == TILES-1) + & write_started_q & wready_i ; busy = write_started_q &~ finished ; start = ~busy & ~fifo_empty ; finished_id = id_q ; @@ -99,6 +105,8 @@ module quadrilatero_perm_unit #( write_started_d = write_started_q ; finished_d = finished_q ; finished_instr_id_d = finished_instr_id_q; + cols_counter_d = cols_counter_q; + row_counter_d = row_counter_q; if (start) begin operand_reg_d 
= operand_reg_new; @@ -106,9 +114,24 @@ module quadrilatero_perm_unit #( end if ((write_started_q && wready_i)) begin - counter_d = counter_q + 1; + if(cols_counter_q == TILES-1) begin + cols_counter_d = '0; + if(counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) begin + counter_d = '0; + if(row_counter_q == TILES-1) begin + row_counter_d = '0; + end else begin + row_counter_d = row_counter_q + 1; + end + end else begin + counter_d = counter_q + 1; + end + end else begin + cols_counter_d = cols_counter_q + 1; + end end else if (finished) begin counter_d = '0; + end if (start) begin @@ -134,6 +157,8 @@ module quadrilatero_perm_unit #( id_q <= '0; write_started_q <= '0; counter_q <= '0; + row_counter_q <= '0; + cols_counter_q <= '0; end else begin finished_q <= finished_d ; finished_instr_id_q <= finished_instr_id_d; @@ -141,15 +166,20 @@ module quadrilatero_perm_unit #( id_q <= id_d ; write_started_q <= write_started_d ; counter_q <= counter_d ; + row_counter_q <= row_counter_d ; + cols_counter_q <= cols_counter_d ; end end - assign waddr_o = operand_reg_q ; + assign waddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_q; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + assign waddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_q, cols_counter_q}; + end assign wrowaddr_o = counter_q ; assign wdata_o = '0 ; assign we_o = write_started_q &~ mask_req; - assign wlast_o = finished ; + assign wlast_o = write_started_q &~ mask_req ; assign busy_o = fifo_full | fifo_almost_full; assign id_o = id_q ; assign finished_o = finished_q ; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv index b26b714b0..4dafb1a5d 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv @@ -9,34 +9,34 @@ module quadrilatero_regfile #( parameter WRITE_PORTS = 2, // number of write ports parameter N_REGS = 8, // how many 
registers parameter RLEN = 128, // length in bits for each register row - localparam N_ROWS = RLEN / 32 // this is done in the thead spec + localparam N_ROWS = quadrilatero_pkg::LEN / 32 // this is done in the thead spec ) ( // clock and reset input logic clk_i, input logic rst_ni, // read port - input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i, // register and port address + input logic [READ_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] raddr_i, // register and port address input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i, // we can ask for a single row of a register - output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o, // row out + output logic [READ_PORTS-1:0][quadrilatero_pkg::LEN-1:0] rdata_o, // row out // write port - input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i, + input logic [WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_i, input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i, - input logic [WRITE_PORTS-1:0][ RLEN-1:0] wdata_i, + input logic [WRITE_PORTS-1:0][quadrilatero_pkg::LEN-1:0] wdata_i, input logic [WRITE_PORTS-1:0] we_i ); `ifdef SIMULATION // Multiple of 2 and less than 2**16 - if (!(RLEN < (1 << 16) && $countones(RLEN) == 1)) begin + if (!(quadrilatero_pkg::LEN < (1 << 16) && $countones(quadrilatero_pkg::LEN) == 1)) begin $fatal("invalid register configuration"); end `endif - logic [N_REGS-1:0][N_ROWS-1:0][RLEN-1:0] mem_q; - logic [N_REGS-1:0][N_ROWS-1:0][RLEN-1:0] mem_d; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0][quadrilatero_pkg::LEN-1:0] mem_q; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0][quadrilatero_pkg::LEN-1:0] mem_d; always_comb begin : write_mem mem_d = mem_q; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index 6d69a64c6..d6b3064c1 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ 
-13,7 +13,7 @@ module quadrilatero_register_lsu #( parameter int unsigned BUS_WIDTH = 128, parameter int unsigned N_REGS = 8, parameter int unsigned N_ROWS = 4, - localparam int unsigned RLEN = BUS_WIDTH + localparam int unsigned LLEN = BUS_WIDTH ) ( input logic clk_i , input logic rst_ni , @@ -31,17 +31,17 @@ module quadrilatero_register_lsu #( output logic[xif_pkg::X_ID_WIDTH-1:0] lsu_id_o , // Register Write Port for load unit - output logic [ $clog2(N_REGS)-1:0] waddr_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_o , output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [ RLEN-1:0] wdata_o , + output logic [quadrilatero_pkg::LEN-1:0] wdata_o , output logic we_o , output logic wlast_o , input logic wready_i , // to stall the request in case the port is busy // Register Read Port for store unit - output logic [ $clog2(N_REGS)-1:0] raddr_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] raddr_o , output logic [ $clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [ RLEN-1:0] rdata_i , + input logic [quadrilatero_pkg::LEN-1:0] rdata_i , input logic rdata_valid_i , output logic rdata_ready_o , output logic rlast_o , @@ -64,7 +64,18 @@ module quadrilatero_register_lsu #( ); - localparam MAX_EL_PER_ROW = RLEN / BUS_WIDTH; + localparam MAX_EL_PER_ROW = quadrilatero_pkg::RLEN / LLEN; + localparam LastRow = $clog2(N_ROWS)'(N_ROWS - 1); + localparam NumCols = quadrilatero_pkg::RLEN / LLEN; + + typedef enum logic [1:0] { + LSU_IDLE, + LSU_LOAD, + LSU_STORE, + LSU_DONE + } register_lsu_state_e; + + register_lsu_state_e lsu_state_d, lsu_state_q; logic finished; logic [xif_pkg::X_ID_WIDTH-1:0] back_id_q; @@ -72,10 +83,11 @@ module quadrilatero_register_lsu #( logic [$clog2(N_ROWS)-1:0] counter_q; logic [$clog2(N_ROWS)-1:0] counter_d; - logic [$clog2(N_REGS)-1:0] waddr_q; - logic [$clog2(N_REGS)-1:0] waddr_d; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_q; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_d; + - logic [RLEN-1:0] 
load_fifo_data; + logic [LLEN-1:0] load_fifo_data; logic load_fifo_data_available; logic load_fifo_pop; @@ -83,19 +95,15 @@ module quadrilatero_register_lsu #( logic store_fifo_space_available; logic store_fifo_push; logic store_fifo_empty; - logic [RLEN-1:0] store_fifo_data; + logic [LLEN-1:0] store_fifo_data; - logic [RLEN-1:0] data_mask; + logic [LLEN-1:0] data_mask; logic load_fifo_valid; logic busy; logic start; logic start_q; logic start_d; - - logic valid_d; - logic valid_q; - logic write_q; logic write_d; logic terminate; @@ -104,10 +112,6 @@ module quadrilatero_register_lsu #( logic lsu_busy_q; logic lsu_ready; - logic mask_req; - - - logic [ 31:0] src_ptr_d ; logic [ 31:0] stride_d ; @@ -116,29 +120,36 @@ module quadrilatero_register_lsu #( logic [ 31:0] src_ptr ; logic [ 31:0] stride ; - assign mask_req = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & finished_o & ~finished_ack_i; + logic [$clog2(NumCols)-1:0] cols_counter_d; + logic [$clog2(NumCols)-1:0] cols_counter_q; + logic [$clog2(NumCols)-1:0] row_counter_d; + logic [$clog2(NumCols)-1:0] row_counter_q; + + logic mask_req; // declared explicitly: the earlier declaration is removed by this patch and the assign below must not rely on an implicit net + assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; always_comb begin - lsu_id_o = (write_i &~ load_fifo_data_available) ? instr_id_i : back_id_q; - finished = (write_q & terminate) | (~write_q & wlast_o & wready_i); + lsu_id_o = (write_i &~ load_fifo_data_available & rlast_o) ?
instr_id_i : back_id_q; + finished = (write_q & terminate & rlast_o) | + (~write_q && (counter_q == LastRow) && (row_counter_q == NumCols-1) && (cols_counter_q == NumCols-1) && wready_i && wlast_o); end always_comb begin: write_to_RF data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols - - we_o = load_fifo_data_available &~ mask_req; + we_o = load_fifo_data_available &~ mask_req; waddr_o = waddr_q; wrowaddr_o = counter_q ; - wdata_o = load_fifo_data & ~data_mask; - wlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) && we_o && wready_i; - // wlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & wready_i; + wdata_o = load_fifo_data & ~data_mask; + end always_comb begin: read_from_RF rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; rrowaddr_o = counter_q ; - raddr_o = operand_reg_i ; - rlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) && rdata_valid_i && rdata_ready_o; + raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_q, cols_counter_q}; + end end always_comb begin: lsu_ctrl_block @@ -147,68 +158,197 @@ module quadrilatero_register_lsu #( store_fifo_push = rdata_ready_o && rdata_valid_i; lsu_ready = store_fifo_empty | (write_i &~ load_fifo_data_available &~ lsu_busy_q); start = (start_i | start_q) & lsu_ready; - //busy_o = (write_i ? busy_d : busy) | start_q; - busy_o = (write_i ? busy_d : busy | (load_fifo_data_available & counter_d == '0)) | start_q; + busy_o = (write_i ? busy_d : busy) | start_q; stride = (start) ? stride_i : stride_q; src_ptr = (start) ? 
address_i : src_ptr_q; end always_comb begin: next_value - if (rlast_o || wlast_o) begin - counter_d = '0; - end else if ((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin - counter_d = counter_q + 1; - end else begin - counter_d = counter_q; - end - - write_d = (write_i && rlast_o && rdata_valid_i) ? 1'b1 : + write_d = (write_i && (counter_q == LastRow) && rdata_valid_i) ? 1'b1 : (!write_i && !busy) ? 1'b0 : write_q; - - valid_d = (load_fifo_valid && counter_d==0 && ~valid_q) ? 1'b1 : - (load_fifo_valid && counter_d==3 && valid_q) ? 1'b0 : valid_q; - + start_d = start ? 1'b0 : (start_q | start_i) ? 1'b1 : start_q; stride_d = (start) ? stride_i : stride_q ; src_ptr_d = (start) ? address_i : src_ptr_q; - back_id_d = (load_fifo_valid && counter_d==0 && ~valid_q) ? instr_id_i : - rlast_o ? lsu_id_o : back_id_q; + busy_d = (write_i && (counter_q == LastRow) && (row_counter_q == NumCols-1) && (cols_counter_q == NumCols-1) && rdata_valid_i && rlast_o) ? 1'b0 : + (write_i && start_i) ? 1'b1 : busy_q; + end - waddr_d = (load_fifo_valid && counter_d==0) ? operand_reg_i : waddr_q ; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + assign waddr_d[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_d, cols_counter_d}; + end - busy_d = (write_i && rlast_o && rdata_valid_i) ? 1'b0 : - (write_i && start_i) ? 1'b1 : busy_q; + always_comb begin: fsm_block + lsu_state_d = lsu_state_q; + counter_d = counter_q; + cols_counter_d = cols_counter_q; + row_counter_d = row_counter_q; + rlast_o = rdata_ready_o? 1'b1 : 1'b0; + wlast_o = we_o? 
1'b1 : 1'b0; + + back_id_d = back_id_q; + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = waddr_q[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR]; + + + case (lsu_state_q) + LSU_IDLE: begin + back_id_d = instr_id_i; + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; + row_counter_d = '0; + cols_counter_d = '0; + if(load_fifo_valid && !write_i && wready_i) begin + if(cols_counter_q == NumCols - 1) begin + counter_d = counter_q + 1; + lsu_state_d = LSU_LOAD; + + end else begin + cols_counter_d = cols_counter_q + 1; + lsu_state_d = LSU_LOAD; + end + end else if (write_i & store_fifo_space_available && rdata_valid_i) begin + if(cols_counter_q == NumCols - 1) begin + counter_d = counter_q + 1; + lsu_state_d = LSU_STORE; + end else begin + cols_counter_d = cols_counter_q + 1; + lsu_state_d = LSU_STORE; + end + end + + end + LSU_LOAD: begin + if(load_fifo_valid) begin + if(wready_i) begin + if(counter_q == LastRow) begin + if(cols_counter_q == NumCols - 1) begin + cols_counter_d = '0; + counter_d = '0; + if(row_counter_q == NumCols - 1) begin + row_counter_d = '0; + lsu_state_d = LSU_DONE; + back_id_d = instr_id_i; + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; + end else begin + row_counter_d = row_counter_q + 1; + end + end else begin + cols_counter_d = cols_counter_q + 1; + end + end else begin + if(cols_counter_q == NumCols - 1) begin + cols_counter_d = '0; + counter_d = counter_q + 1; + end else begin + cols_counter_d = cols_counter_q + 1; + end + end + end + + end else begin + if(write_i && wready_i) begin + if(cols_counter_q == NumCols - 1) begin + counter_d = '0; + lsu_state_d = LSU_DONE; + cols_counter_d = '0; + back_id_d = instr_id_i; + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; + end else begin + cols_counter_d = cols_counter_q + 1; + end + end + + end + end + LSU_STORE: begin + 
if(store_fifo_space_available && write_i && rdata_valid_i) begin + if(counter_q == LastRow) begin + if(cols_counter_q == NumCols - 1) begin + cols_counter_d = '0; + counter_d = '0; + if(row_counter_q == NumCols-1) begin + row_counter_d = '0; // clear the tile-row index on completion, as the LSU_LOAD completion path does, so LSU_DONE does not address with a stale value + lsu_state_d = LSU_DONE; + back_id_d = instr_id_i; + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; + end else begin + row_counter_d = row_counter_q + 1; + end + end else begin + cols_counter_d = cols_counter_q + 1; + end + end else begin + if(cols_counter_q == NumCols - 1) begin + cols_counter_d = '0; + counter_d = counter_q + 1; + end else begin + cols_counter_d = cols_counter_q + 1; + end + end + end + end + LSU_DONE: begin + if(load_fifo_valid && !write_i && wready_i) begin + if(cols_counter_q == NumCols - 1) begin + cols_counter_d = '0; + row_counter_d = '0; + counter_d = counter_q + 1; + lsu_state_d = LSU_LOAD; + end else begin + cols_counter_d = cols_counter_q + 1; + end + end else if (write_i && store_fifo_space_available && rdata_valid_i) begin + if(cols_counter_q == NumCols - 1) begin + counter_d = counter_q + 1; + lsu_state_d = LSU_STORE; + cols_counter_d = '0; + row_counter_d = '0; + end else begin + cols_counter_d = cols_counter_q + 1; + end + end else begin + lsu_state_d = LSU_IDLE; + end + end + default: begin + lsu_state_d = LSU_IDLE; + end + endcase + end always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block if (!rst_ni) begin counter_q <= '0; waddr_q <= '0; back_id_q <= '0; start_q <= '0; - valid_q <= '0; write_q <= '0; busy_q <= '0; + lsu_state_q <= LSU_IDLE; lsu_busy_q <= '0; src_ptr_q <= '0; stride_q <= '0; + cols_counter_q <= '0; + row_counter_q <= '0; end else begin counter_q <= counter_d; back_id_q <= back_id_d; waddr_q <= waddr_d ; start_q <= start_d ; - valid_q <= valid_d ; write_q <= write_d ; busy_q <= busy_d ; + lsu_state_q <= lsu_state_d; lsu_busy_q <= busy; src_ptr_q <= src_ptr_d; stride_q <= stride_d ; + cols_counter_q <= cols_counter_d; + row_counter_q <=
row_counter_d; end end @@ -232,7 +372,7 @@ module quadrilatero_register_lsu #( //Configuration .start_i (start ), - .write_i , + .write_i (write_i), .busy_o (busy ), .terminate_o (terminate ), @@ -280,4 +420,7 @@ module quadrilatero_register_lsu #( "[quadrilatero_register_lsu] N_ROWS must be at least 2.\n" ); end + if ((NumCols & (NumCols - 1)) != 0) begin + $error("RLEN / LLEN must be a power of 2."); + end endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_controller.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_controller.sv index 8709858a6..c4cc392cc 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_controller.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_controller.sv @@ -5,7 +5,7 @@ // Author: Saverio Nasturzio module quadrilatero_register_lsu_controller #( - parameter N_SLOTS = 3 + parameter N_SLOTS = 3 //TODO maybe change that to quadrilatero_pkg::MESH_WIDTH ) ( input logic clk_i, input logic rst_ni, @@ -18,10 +18,12 @@ module quadrilatero_register_lsu_controller #( // To Register Loader input logic busy_i, // Load Unit busy output logic start_o, // WL will start executing new instruction + input logic finished_i, //LSU has finished executing instruction output quadrilatero_pkg::lsu_instr_t issued_instr_o, // issued instruction output quadrilatero_pkg::lsu_conf_t issued_instr_conf_o // issued instruction configuration ); - + logic finished_d; + logic finished_q; localparam int unsigned USAGE = N_SLOTS > 1 : $clog2(N_SLOTS) : 0; logic issue_queue_empty; logic start_load; @@ -38,8 +40,10 @@ module quadrilatero_register_lsu_controller #( issued_instr_ff <= '0; issued_instr_conf_ff <= '0; start_o <= '0; + finished_q <= 1'b0; end else begin start_o <= '0; + finished_q <= finished_d; if (start_load) begin issued_instr_ff <= fifo_data_out; issued_instr_conf_ff <= csr_config_i; @@ -48,7 +52,7 @@ module quadrilatero_register_lsu_controller #( end end - + assign finished_d = 
finished_i; assign issued_instr_conf_o = issued_instr_conf_ff; assign issued_instr_o = issued_instr_ff; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv index aedffc0db..ce935b0fe 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv @@ -21,32 +21,32 @@ module quadrilatero_rf_sequencer #( input logic rst_ni, // Input from FUs - input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i , + input logic [READ_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] raddr_i , input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i , - output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o , + output logic [READ_PORTS-1:0][quadrilatero_pkg::LEN-1:0] rdata_o , output logic [READ_PORTS-1:0] rvalid_o , input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , - input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i , + input logic [WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_i , input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i , - input logic [WRITE_PORTS-1:0][RLEN-1:0] wdata_i , + input logic [WRITE_PORTS-1:0][quadrilatero_pkg::LEN-1:0] wdata_i , input logic [WRITE_PORTS-1:0] we_i , input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) output logic [WRITE_PORTS-1:0] wready_o , input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , // Outputs to RF - output logic [RF_READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_o , + output logic [RF_READ_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] raddr_o , output logic [RF_READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [RF_READ_PORTS-1:0][RLEN-1:0] rdata_i , + input logic [RF_READ_PORTS-1:0][quadrilatero_pkg::LEN-1:0] rdata_i , - output 
logic [RF_WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_o , + output logic [RF_WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_o , output logic [RF_WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [RF_WRITE_PORTS-1:0][RLEN-1:0] wdata_o , + output logic [RF_WRITE_PORTS-1:0][quadrilatero_pkg::LEN-1:0] wdata_o , output logic [RF_WRITE_PORTS-1:0] we_o , @@ -60,35 +60,50 @@ module quadrilatero_rf_sequencer #( output logic [N_REGS-1:0] rw_queue_full_o ); - logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; - logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_clr ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_full ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] head_valid ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] rw_queue_empty; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] w_pop ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] r_pop ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] r_clr ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] rw_queue_pop ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] rw_queue_full ; logic [WRITE_PORTS-1:0] wr_gnt ; logic [WRITE_PORTS-1:0] wr_req ; logic [READ_PORTS -1:0] rd_req ; logic [READ_PORTS -1:0] rd_gnt ; - logic [N_REGS-1:0] rw_queue_push ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; - + logic [quadrilatero_pkg::N_IREGS-1:0] rw_queue_push ; + quadrilatero_pkg::rw_queue_t [quadrilatero_pkg::N_IREGS-1:0] rw_queue_entry; + quadrilatero_pkg::rw_queue_t [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] rw_queue ; + quadrilatero_pkg::rw_queue_t [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] scoreboard_d ; + 
quadrilatero_pkg::rw_queue_t [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] scoreboard_q ; genvar ii,hh; assign rw_queue_pop = w_pop | r_pop | ~head_valid; - assign rw_queue_entry = rw_queue_entry_i ; - assign rw_queue_push = rw_queue_push_i ; + always_comb begin: rw_queue_block + rw_queue_entry = '0; + rw_queue_push = '0; + if(quadrilatero_pkg::N_TILES == 1) begin //technically this if is not needed + rw_queue_entry = rw_queue_entry_i ; + rw_queue_push = rw_queue_push_i ; + end else begin + for (int jj = 0; jj < quadrilatero_pkg::N_IREGS; jj++) begin + for (int ii = 0; ii < N_REGS ; ii++) begin + if(jj >> quadrilatero_pkg::TILE_ADDR == ii) begin + rw_queue_entry[jj] = rw_queue_entry_i[ii]; + rw_queue_push[jj] = rw_queue_push_i[ii]; + end + end + end + end + end + - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; assign rw_queue_pop_fifo = rw_queue_pop & ~rw_queue_empty; - for (ii = 0; ii < N_REGS; ii++) begin: gen_fifo__regs + for (ii = 0; ii < quadrilatero_pkg::N_IREGS; ii++) begin: gen_fifo__regs for (hh = 0; hh < N_ROWS; hh++) begin: gen_fifo__rows fifo_v3 #( .FALL_THROUGH (1'b1) , @@ -112,22 +127,16 @@ module quadrilatero_rf_sequencer #( always_comb begin: scoreboard_block rw_queue_full_o = '0; - for (int i = 0; i < N_REGS; i++) begin + for (int i = 0; i < quadrilatero_pkg::N_IREGS; i++) begin for (int h = 0; h < N_ROWS; h++) begin - rw_queue_full_o[i] |= (rw_queue_full[i][h]); - - head_valid[i][h] = scoreboard_q[i][h].wready | scoreboard_q[i][h].rvalid; - - - scoreboard_d[i][h].id = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].id : scoreboard_q[i][h].id; + rw_queue_full_o[i>>quadrilatero_pkg::TILE_ADDR] |= (rw_queue_full[i][h]); - scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : - (rw_queue_pop[i][h] ) ? 
rw_queue[i][h].wready : scoreboard_q[i][h].wready; + + head_valid[i][h] = scoreboard_q[i][h].valid; - scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : - (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; + scoreboard_d[i][h] = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : + (rw_queue_pop[i][h] ) ? rw_queue[i][h] : scoreboard_q[i][h]; + end end end @@ -137,28 +146,23 @@ module quadrilatero_rf_sequencer #( rd_req = '0; w_pop = '0; r_pop = '0; - r_clr = '0; + r_clr = '0; for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request automatic int m = 32'(waddr_i[jj]); automatic int n = 32'(wrowaddr_i[jj]); - if( scoreboard_q[m][n].id == wr_id_i[jj] && - scoreboard_q[m][n].wready && we_i[jj] ) - begin - wr_req [jj] = ~scoreboard_q[m][n].rvalid; - w_pop [m][n] = wr_gnt[jj]; + if( scoreboard_q[m][n].id == wr_id_i[jj] && we_i[jj] && scoreboard_q[m][n].valid) begin + wr_req [jj] = 1'b1; + w_pop [m][n] = wlast_i[jj]; end end for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request automatic int m = 32'(raddr_i[jj]); automatic int n = 32'(rrowaddr_i[jj]); - if( scoreboard_q[m][n].id == rd_id_i[jj] && - scoreboard_q[m][n].rvalid && rready_i[jj] ) - begin - rd_req [jj] = 1'b1; - r_clr [m][n] = rd_gnt[jj]; - r_pop [m][n] = rd_gnt[jj] &~ scoreboard_q[m][n].wready; + if( scoreboard_q[m][n].id == rd_id_i[jj] && rready_i[jj] && scoreboard_q[m][n].valid) begin + rd_req [jj] = 1'b1; + r_pop [m][n] = rlast_i[jj] && (jj != quadrilatero_pkg::SYSTOLIC_ARRAY_A); // for SA_A port we can't pop on read end end @@ -185,17 +189,17 @@ module quadrilatero_rf_sequencer #( block = 1'b0; end - // if(we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && same_id_acc && block) begin - // wr_req[quadrilatero_pkg::SYSTOLIC_ARRAY ] = 1'b0; - // end if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; + 
r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]] = 1'b0; end if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && same_id_D && block) begin rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = 1'b0; + r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]] = 1'b0; end if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && same_id_W && block) begin rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = 1'b0; + r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]] = 1'b0; end end end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index e80bda504..055dd5e1c 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -17,7 +17,7 @@ module quadrilatero_systolic_array #( parameter int N_REGS = 8 , parameter int ENABLE_SIMD = 1 , localparam int N_ROWS = MESH_WIDTH , - localparam int RLEN = DATA_WIDTH * MESH_WIDTH, + localparam int ALEN = DATA_WIDTH * MESH_WIDTH, parameter FPU = 1 ) ( input logic clk_i , @@ -35,33 +35,33 @@ module quadrilatero_systolic_array #( input logic [xif_pkg::X_ID_WIDTH-1:0] id_i , // id of the instruction // Weight Read Register Port - output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] weight_rrowaddr_o , - input logic [ RLEN-1:0] weight_rdata_i , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] weight_raddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] weight_rrowaddr_o , + input logic [ALEN-1:0] weight_rdata_i , input logic weight_rdata_valid_i, output logic weight_rdata_ready_o, output logic weight_rlast_o , // Data Read Register Port - output logic [ $clog2(N_REGS)-1:0] data_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] data_rrowaddr_o , - input logic [ RLEN-1:0] data_rdata_i , + output logic 
[ $clog2(quadrilatero_pkg::N_IREGS)-1:0] data_raddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] data_rrowaddr_o , + input logic [ALEN-1:0] data_rdata_i , input logic data_rdata_valid_i , output logic data_rdata_ready_o , output logic data_rlast_o , // Accumulator Read Register Port - output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] acc_rrowaddr_o , - input logic [ RLEN-1:0] acc_rdata_i , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] acc_raddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] acc_rrowaddr_o , + input logic [ALEN-1:0] acc_rdata_i , input logic acc_rdata_valid_i , output logic acc_rdata_ready_o , output logic acc_rlast_o , // Accumulator Out Write Register Port - output logic [ $clog2(N_REGS)-1:0] res_waddr_o , - output logic [ $clog2(N_ROWS)-1:0] res_wrowaddr_o , - output logic [ RLEN-1:0] res_wdata_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] res_waddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] res_wrowaddr_o , + output logic [ALEN-1:0] res_wdata_o , output logic res_we_o , output logic res_wlast_o , input logic res_wready_i , @@ -75,24 +75,48 @@ module quadrilatero_systolic_array #( input logic finished_ack_i , output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o ); + typedef enum logic [1:0]{ + FS_IDLE, + FS_ACTIVE, + FS_LAST + } fs_state_e; + typedef enum logic [1:0]{ + FF_IDLE, + FF_ACTIVE, + FF_DONE + } ff_state_e; + typedef enum logic [1:0]{ + DR_IDLE, + DR_ACTIVE, + DR_DONE + } dr_state_e; + + ff_state_e ff_state_d, ff_state_q; + fs_state_e fs_state_d, fs_state_q; + dr_state_e dr_state_d, dr_state_q; + localparam LastRow = $clog2(MESH_WIDTH)'(MESH_WIDTH-1); + localparam RegLastRow = quadrilatero_pkg::RLEN/ ALEN; + localparam K = quadrilatero_pkg::RLEN / ALEN; + + logic [$clog2(K)-1:0] ff_k_counter_d; + logic [$clog2(K)-1:0] ff_k_counter_q; + logic [$clog2(K)-1:0] ff_k_counter_rev; + logic [$clog2(K)-1:0] dr_k_counter_d; + 
logic [$clog2(K)-1:0] dr_k_counter_q; + logic [$clog2(K)-1:0] dr_k_counter_rev; + logic [$clog2(K)-1:0] ff_it_counter_d; + logic [$clog2(K)-1:0] ff_it_counter_q; + logic [$clog2(K)-1:0] dr_it_counter_d; + logic [$clog2(K)-1:0] dr_it_counter_q; + logic [$clog2(K)-1:0] ff_row_counter_d; + logic [$clog2(K)-1:0] ff_row_counter_q; + logic [$clog2(K)-1:0] dr_row_counter_d; + logic [$clog2(K)-1:0] dr_row_counter_q; + logic last_dr_write; - logic ff_active_d ; - logic ff_active_q ; - logic fs_active_d ; - logic fs_active_q ; - logic dr_active_d ; - logic dr_active_q ; - logic set_ff_active ; - logic rst_ff_active ; - logic set_fs_active ; - logic rst_fs_active ; - logic set_dr_active ; - logic rst_dr_active ; logic valid ; + logic ff_valid; logic clear ; - logic ff_enable ; - logic fs_enable ; - logic dr_enable ; logic pump ; logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; @@ -127,6 +151,7 @@ module quadrilatero_systolic_array #( logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_d; logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_q; logic mask_req ; + logic ready; quadrilatero_pkg::sa_ctrl_t [MESH_WIDTH-1:0] sa_ctrl_mesh_skewed; @@ -136,100 +161,302 @@ module quadrilatero_systolic_array #( logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] res_mesh_skewed ; //--------------------------------------------------------------------- - + assign ff_k_counter_rev = (K-1-ff_k_counter_q); + assign dr_k_counter_rev = (K-1-dr_k_counter_q); always_comb begin: rf_block // Weight Read Register Port - weight_raddr_o = weight_reg_q ; - weight_rrowaddr_o = ff_counter_q ; - weight_rdata_ready_o = ff_active_q &~ mask_req ; - weight_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + weight_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = weight_reg_q ; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + weight_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_k_counter_rev, ff_it_counter_q}; + end + weight_rrowaddr_o = 
ff_counter_q; + weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_row_counter_q == (RegLastRow-1)) ; // Data Read Register Port - data_raddr_o = data_reg_q ; - data_rrowaddr_o = ff_counter_q ; - data_rdata_ready_o = ff_active_q &~ mask_req ; - data_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + data_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = data_reg_q ; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + data_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_row_counter_q, ff_it_counter_q}; + end + data_rrowaddr_o = ff_counter_q; + data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + data_rlast_o = ff_state_q != FF_IDLE && (ff_k_counter_q == (K-1)) ; // Accumulator Read Register Port - acc_raddr_o = acc_reg_q ; - acc_rrowaddr_o = ff_counter_q ; - acc_rdata_ready_o = ff_active_q &~ mask_req ; - acc_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + acc_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = acc_reg_q ; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + acc_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_row_counter_q, ff_k_counter_rev}; + end + acc_rrowaddr_o = ff_counter_q; + acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + acc_rlast_o = '0 ; // Accumulator Out Write Register Port - res_waddr_o = dest_reg_q ; - res_wrowaddr_o = dr_counter_q ; - res_we_o = dr_active_q &~ mask_req ; - res_wlast_o = dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + res_waddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = dest_reg_q ; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + res_waddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {dr_row_counter_q, dr_k_counter_rev}; + end + res_wrowaddr_o = dr_counter_q; + res_we_o = (dr_state_q == DR_ACTIVE || last_dr_write == 1'b1) &~ mask_req; + res_wlast_o = (dr_state_q != DR_IDLE) && (dr_it_counter_q == (K-1)); end - always_comb begin: next_value - - // Configuration - 
data_reg_d = (set_ff_active) ? data_reg_i : data_reg_q ; - acc_reg_d = (set_ff_active) ? acc_reg_i : acc_reg_q ; - weight_reg_d = (set_ff_active) ? weight_reg_i : weight_reg_q ; - sa_ctrl_d = (set_ff_active) ? sa_ctrl_i : sa_ctrl_q ; - - acc_fs_d = (set_fs_active) ? acc_reg_q : acc_fs_q ; - dest_reg_d = (set_dr_active) ? acc_fs_q : dest_reg_q ; + always_comb begin: finished_signal - id_ff_d = (set_ff_active) ? id_i : id_ff_q ; - id_fs_d = (set_fs_active) ? id_ff_q : id_fs_q ; - id_dr_d = (set_dr_active) ? id_fs_q : id_dr_q ; - - // Finished - finished_d = (res_wready_i && res_wlast_o) ? 1'b1 : + finished_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1) && (dr_k_counter_q == K-1)) ? 1'b1 : (finished_ack_i ) ? 1'b0 : finished_q; - finished_instr_id_d = (res_wready_i && res_wlast_o) ? id_dr_q : + finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1) && (dr_k_counter_q == K-1)) ? id_dr_q : (finished_ack_i ) ? '0 : finished_instr_id_q; - - // Counters - ff_counter_d = (ff_enable && ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : - (ff_enable ) ? ff_counter_q + 1 : ff_counter_q; - - fs_counter_d = (clear ) || - (fs_enable && fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : - (fs_enable ) ? fs_counter_q + 1 : fs_counter_q; - - dr_counter_d = (clear ) || - (dr_enable && dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : - (dr_enable ) ? dr_counter_q + 1 : dr_counter_q; - - // Active signals - ff_active_d = set_ff_active ? 1'b1 : - rst_ff_active ? 1'b0 : ff_active_q; - - fs_active_d = set_fs_active ? 1'b1 : - rst_fs_active ? 1'b0 : fs_active_q; - - dr_active_d = set_dr_active ? 1'b1 : - rst_dr_active ? 
1'b0 : dr_active_q; end always_comb begin: ctrl_block valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - clear = ~ff_active_q & ~fs_active_q & ~dr_active_q; - - ff_enable = ff_active_q & valid ; - // fs_enable = fs_active_q & (valid | ~ff_active_q); - // dr_enable = dr_active_q & (valid | ~ff_active_q); - fs_enable = fs_active_q; - dr_enable = dr_active_q; - - set_ff_active = ff_counter_d=='0 & start_i ; - set_fs_active = fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); - set_dr_active = dr_counter_d=='0 & fs_counter_d==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-2); + if((ff_state_q == FF_IDLE || (ff_state_q == FF_ACTIVE && ff_counter_q == '0 && ff_counter_d == '0)) && (fs_state_q == FS_IDLE) && (dr_state_q == DR_IDLE)) begin + clear = 1'b1; + end else begin + clear = 1'b0; + end + if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q != DR_IDLE)) begin + pump = 1'b1; + end else begin + pump = 1'b0; + end + mask_req = (dr_counter_q==LastRow) & finished_q & ~finished_ack_i; + end - rst_ff_active = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & ff_counter_d=='0 ; - rst_fs_active = fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q=='0; - rst_dr_active = dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & dr_counter_d=='0 & fs_counter_d=='0 & fs_counter_q=='0; + always_comb begin : ff_fsm_block + ff_counter_d = ff_counter_q; + ff_state_d = ff_state_q; + data_reg_d = data_reg_q; + acc_reg_d = acc_reg_q; + weight_reg_d = weight_reg_q; + sa_ctrl_d = sa_ctrl_q; + id_ff_d = id_ff_q; + ff_k_counter_d = ff_k_counter_q; + ff_it_counter_d = ff_it_counter_q; + ff_row_counter_d = ff_row_counter_q; + ff_valid = 1'b0; + + unique case (ff_state_q) + FF_IDLE: begin + ff_counter_d = '0; + ff_it_counter_d = '0; + ff_row_counter_d = '0; + ff_k_counter_d = '0; + if(start_i == 1'b1) begin + ff_state_d = 
FF_ACTIVE; + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end + end + FF_ACTIVE: begin + if(valid == 1'b1) begin + if(ff_counter_q==(LastRow-1)) begin + ff_counter_d = ff_counter_q + 1; + ff_state_d = FF_DONE; + end else begin + ff_counter_d = ff_counter_q + 1; + end + end + end + + FF_DONE: begin + if(start_i == 1'b1 | ~(data_rlast_o == 1'b1 && weight_rlast_o == 1'b1 && ff_it_counter_q == (K-1))) begin + if(valid == 1'b1) begin + ff_valid = 1'b1; + ff_counter_d = '0; + ff_state_d = FF_ACTIVE; + if(ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1) && start_i == 1'b1) begin + ff_it_counter_d = '0; + ff_row_counter_d = '0; + ff_k_counter_d = '0; + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end else begin + if(ff_row_counter_q == RegLastRow-1) begin + ff_row_counter_d = '0; + if(ff_k_counter_q == (K-1)) begin + ff_k_counter_d = '0; + ff_it_counter_d = ff_it_counter_q + 1; + end else begin + ff_k_counter_d = ff_k_counter_q + 1; + end + end else begin + ff_row_counter_d = ff_row_counter_q + 1; + end + end + end + + end else begin + ff_counter_d = '0; + ff_state_d = FF_IDLE; + end + end + + default: begin + ff_state_d = FF_IDLE; + end + endcase + end + always_comb begin : fs_fsm_block + fs_counter_d = fs_counter_q; + fs_state_d = fs_state_q; + + acc_fs_d = acc_fs_q; + id_fs_d = id_fs_q; + + unique case(fs_state_q) + FS_IDLE: begin + fs_counter_d = '0; + if(ff_state_q == FF_DONE && valid == 1'b1) begin + fs_state_d = FS_ACTIVE; + + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + end + + end + FS_ACTIVE: begin + if(clear == 1'b1) begin + fs_counter_d = '0; + fs_state_d = FS_IDLE; + end else begin + if(fs_counter_q == LastRow-2) begin + fs_counter_d = fs_counter_q + 1; + fs_state_d = FS_LAST; + end else begin + fs_counter_d = fs_counter_q + 1; + end + end + end + FS_LAST: 
begin + fs_counter_d = '0; + if(ff_state_q == FF_DONE) begin //stay in active mode, load new inputs + fs_state_d = FS_ACTIVE; + + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + end + if(ff_state_q == FF_IDLE) begin + fs_state_d = FS_IDLE; + end else begin + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + fs_state_d = FS_IDLE; + end + + end + default: begin + fs_state_d = FS_IDLE; + end + + endcase + end - pump = ff_enable | fs_enable | dr_enable ; - mask_req = (dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) & finished_q & ~finished_ack_i; + always_comb begin : dr_fsm_block + dr_state_d = dr_state_q; + dr_counter_d = dr_counter_q; + dr_k_counter_d = dr_k_counter_q; + dr_it_counter_d = dr_it_counter_q; + dr_row_counter_d = dr_row_counter_q; + last_dr_write = 1'b0; + + dest_reg_d = dest_reg_q; + id_dr_d = id_dr_q; + unique case(dr_state_q) + DR_IDLE: begin + dr_counter_d = '0; + dr_k_counter_d = '0; + dr_it_counter_d = '0; + dr_row_counter_d = '0; + if(fs_state_q == FS_LAST) begin + dr_state_d = DR_ACTIVE; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + + end + DR_ACTIVE: begin + if(clear == 1'b1) begin + dr_counter_d = '0; + dr_state_d = DR_IDLE; + end else begin + if(res_wready_i == 1'b0) begin + dr_state_d = dr_state_q; + end else begin + if(dr_counter_q == LastRow) begin + + dr_counter_d = '0; + //update DR counters + if(dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow - 1) && dr_k_counter_q == (K-1)) begin + dr_it_counter_d = '0; + dr_row_counter_d = '0; + dr_k_counter_d = '0; + end else begin + if(dr_row_counter_q == RegLastRow-1) begin + dr_row_counter_d = '0; + if(dr_k_counter_q == (K-1)) begin + dr_k_counter_d = '0; + dr_it_counter_d = dr_it_counter_q + 1; + end else begin + dr_k_counter_d = dr_k_counter_q + 1; + end + end else begin + dr_row_counter_d = dr_row_counter_q + 1; + end + end + if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs + dr_state_d = DR_ACTIVE; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + 
if(fs_state_q == FS_IDLE) begin + dr_state_d = DR_DONE; + end + end else begin + dr_counter_d = dr_counter_q + 1; + end + end + end + + end + DR_DONE: begin + if((fs_state_q == FS_IDLE && dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow-1) && dr_k_counter_q == (K-1))) begin + last_dr_write = 1'b1; + if(res_wready_i == 1'b0) begin + dr_state_d = DR_IDLE; + end else begin + dr_state_d = DR_DONE; + if(dr_counter_q == LastRow) begin + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + dr_it_counter_d = '0; + dr_row_counter_d = '0; + dr_k_counter_d = '0; + dr_state_d = DR_IDLE; + end else begin + dr_counter_d = dr_counter_q + 1; + end + end + end else begin + dr_state_d = DR_IDLE; + end + end + default: begin + dr_state_d = DR_IDLE; + end + + endcase end + quadrilatero_skewer #( .MESH_WIDTH(MESH_WIDTH), .DATA_WIDTH(DATA_WIDTH) @@ -276,7 +503,7 @@ module quadrilatero_systolic_array #( .weight_rdata_valid_i , // Weight Data - .weight_rdata_i , + .weight_rdata_i (weight_rdata_i ), .weight_rdata_o (weight_mesh_skewed ) ); @@ -313,9 +540,9 @@ module quadrilatero_systolic_array #( ff_counter_q <= '0; fs_counter_q <= '0; dr_counter_q <= '0; - ff_active_q <= '0; - fs_active_q <= '0; - dr_active_q <= '0; + ff_state_q <= FF_IDLE; + fs_state_q <= FS_IDLE; + dr_state_q <= DR_IDLE; data_reg_q <= '0; acc_reg_q <= '0; weight_reg_q <= '0; @@ -327,13 +554,19 @@ module quadrilatero_systolic_array #( id_dr_q <= '0; finished_q <= '0; finished_instr_id_q <= '0; + ff_k_counter_q <= '0; + dr_k_counter_q <= '0; + ff_it_counter_q <= '0; + dr_it_counter_q <= '0; + ff_row_counter_q <= '0; + dr_row_counter_q <= '0; end else begin ff_counter_q <= ff_counter_d ; fs_counter_q <= fs_counter_d ; dr_counter_q <= dr_counter_d ; - ff_active_q <= ff_active_d ; - fs_active_q <= fs_active_d ; - dr_active_q <= dr_active_d ; + ff_state_q <= ff_state_d; + fs_state_q <= fs_state_d; + dr_state_q <= dr_state_d; data_reg_q <= data_reg_d ; acc_reg_q <= acc_reg_d ; weight_reg_q <= weight_reg_d ; @@ -345,10 
+578,16 @@ module quadrilatero_systolic_array #( id_dr_q <= id_dr_d ; finished_q <= finished_d ; finished_instr_id_q <= finished_instr_id_d ; + ff_k_counter_q <= ff_k_counter_d; + dr_k_counter_q <= dr_k_counter_d; + ff_it_counter_q <= ff_it_counter_d; + dr_it_counter_q <= dr_it_counter_d; + ff_row_counter_q <= ff_row_counter_d ; + dr_row_counter_q <= dr_row_counter_d ; end end - - assign sa_ready_o = (ff_counter_d=='0) & ((ff_active_q &~ ff_counter_q=='0) | (~ff_active_q & ~fs_active_q & ~dr_active_q)); + assign ready = (ff_state_q == FF_DONE) && (ff_k_counter_q == K-1) && (ff_it_counter_q == K-1) && (ff_row_counter_q == RegLastRow-1); + assign sa_ready_o = ready || (ff_state_q == FF_IDLE && fs_state_q == FS_IDLE); assign sa_input_id_o = id_ff_q ; assign sa_output_id_o = id_dr_q ; assign finished_o = finished_q ; diff --git a/scripts/sim/modelsim/patch_modelsim_Makefile.py b/scripts/sim/modelsim/patch_modelsim_Makefile.py index cfe3b0ab7..79a020084 100644 --- a/scripts/sim/modelsim/patch_modelsim_Makefile.py +++ b/scripts/sim/modelsim/patch_modelsim_Makefile.py @@ -26,8 +26,7 @@ 'ifdef RUN_UPF'+ "\n" + \ ' RUN_UPF_OPTIONS := -pa' + "\n" + \ 'endif'+ "\n\n"); -string_replaced.append('EXTRA_OPTIONS ?= -t 1ps -voptargs=+acc $(VSIM_OPTIONS) $(addprefix -g,$(PARAMETERS)) $(addprefix +,$(PLUSARGS)) $(RUN_UPF_OPTIONS)') - +string_replaced.append('EXTRA_OPTIONS ?= -do "do /scratch2/bsc25f1/x-heep/waves.do; run 5000 us" -t 1ps -voptargs=+acc $(VSIM_OPTIONS) $(addprefix -g,$(PARAMETERS)) $(addprefix +,$(PLUSARGS)) $(RUN_UPF_OPTIONS)') string_toappend = [] diff --git a/sw/applications/quadrilatero_easy_8x8/main.c b/sw/applications/quadrilatero_easy_8x8/main.c new file mode 100644 index 000000000..55e6c58c1 --- /dev/null +++ b/sw/applications/quadrilatero_easy_8x8/main.c @@ -0,0 +1,456 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +/*Variable Data Type*/ +// Supported types: int32_t (0), float (1), int8_t (2), int16_t (3) +#define TYPE 0 + +/* Output tile size */ +// Supported values: 16 (4x4), 64 (8x8). +#define OUTPUT_TILE_SIZE 64 + +/*Register Length*/ +// Supported values: 128 (4x4), 256 (8x8) +#define RLEN 128 + +/* By default, printfs are deactivated. */ +#define PRINTF_IN_FPGA 0 +#define PRINTF_IN_SIM 1 + +/* VCD Files generation */ +// Supported Values: 0 (No), 1 (Yes) +// #define VCD_ENABLE 0 + +// ************************************************************************************************************ +// ***************************** ***************************** +// ***************************** DO NOT TOUCH LINES BELOW ! ***************************** +// ***************************** ***************************** +// ************************************************************************************************************ + +/* Includes */ +#include +#include +#include "csr.h" +#include "x-heep.h" +#include "gpio.h" +// #include "vcd_util.h" + +/* Define Datatype and set of data */ +#if TYPE == 0 + #include "matrixMul32i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 1 + #define HEAD_LINE "mmasa.w" + #define SIMD_SHIFT 2 + typedef int32_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#elif TYPE == 1 + #include "matrixMul32f.h" + #define FS_INITIAL 0x1 + #define SIMD_FACTOR 1 + #define HEAD_LINE "fmmacc.s" + #define SIMD_SHIFT 2 + typedef float DATA_IN_t ; + typedef float DATA_OUT_t; +#elif TYPE == 2 + #include "matrixMul8i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 4 + #define HEAD_LINE "mmaqa.b" + #define SIMD_SHIFT 0 + typedef int8_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#elif TYPE == 3 + #include "matrixMul16i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 2 + #define HEAD_LINE "mmada.h" + #define SIMD_SHIFT 1 + typedef int16_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; 
+#else +#endif + +/* Declare functions and global variables */ +DATA_OUT_t __attribute__((section(".xheep_data_interleaved"))) matrix_C[SIZE*SIZE]; +void __attribute__ ((noinline)) matrixMul_easy(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +int float_condition(int index); +int int_condition(int index); +uint32_t check_results(int K, int N, int M); +void print_matrix(DATA_OUT_t* matrix, int K, int N); + + +/* Select print mode */ + +#if TARGET_SIM && PRINTF_IN_SIM + #define PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#elif TARGET_PYNQ_Z2 && PRINTF_IN_FPGA + #define PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#else + #define PRINTF(...) +#endif + +/* Select kernel */ +#if OUTPUT_TILE_SIZE == 16 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_4x4((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#elif OUTPUT_TILE_SIZE == 64 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_8x8((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#elif OUTPUT_TILE_SIZE == 0 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_CPU((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#else + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) +#endif + + +/* Select check condition */ +#if FS_INITIAL == 0x1 + #define CHECK_CONDITION(index) float_condition((int) index) +#elif FS_INITIAL == 0x0 + #define CHECK_CONDITION(index) int_condition ((int) index) +#else +#endif + +/* VCD Functions */ +#if VCD_ENABLE == 1 + #define VCD_START() vcd_init(); vcd_enable() + #define VCD_STOP() vcd_disable() +#else + #define VCD_START() + #define VCD_STOP() +#endif + +/* Matrices */ +#define MAT_A matrix_A +#define MAT_B matrix_BT +#define MAT_C matrix_C +#define MAT_EXP matrix_EXP + +#define MACC(HEAD,__mat1__, __mat2__, __mat3__) HEAD " m" #__mat1__", m"#__mat2__", 
m"#__mat3__ +// ------------------------------------------------------------------------------------------------------------------------------------- + + +int main() +{ + uint32_t errors = 0; + unsigned int cycles; + + // Save the address of the matrices + DATA_IN_t* addrA = MAT_A; + DATA_IN_t* addrB = MAT_B; + DATA_OUT_t* addrC = MAT_C; + + int K_size = SIZE/SIMD_FACTOR; + int N_size = SIZE ; + int M_size = SIZE ; + + //enable FP operations + CSR_SET_BITS(CSR_REG_MSTATUS, (FS_INITIAL << 13)); + + //start mcycle csr + CSR_CLEAR_BITS(CSR_REG_MCOUNTINHIBIT, 0x1); + CSR_WRITE(CSR_REG_MCYCLE, 0); + + //execute the kernel + // vcd_init(); + // vcd_enable(); + VCD_START(); + matrixMul_easy(addrA,addrB,addrC,K_size,N_size,M_size,SIMD_SHIFT); + VCD_STOP(); + // vcd_disable(); + + //read mcycle csr + CSR_READ(CSR_REG_MCYCLE, &cycles); + + //check results + errors = check_results(8,8,8); + + PRINTF("program finished with %d errors and %d cycles\n\r", errors, cycles); + // PRINTF("MATRIX C:\n\r"); + // print_matrix(addrC, M_size, N_size); + // PRINTF("MATRIX EXP:\n\r"); + // print_matrix((DATA_OUT_t *) MAT_EXP, M_size, N_size); + return errors; +} + + +// ------------------------------------------------------------------------------------------------------------------------------------- + +void __attribute__ ((noinline)) matrixMul_easy(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + // asm volatile("addi sp, sp, -0x30 " ); // + // asm volatile("sw s0 , 0x2c(sp) " ); // + // asm volatile("sw s1 , 0x28(sp) " ); // + // asm volatile("sw s2 , 0x24(sp) " ); // + // asm volatile("sw s3 , 0x20(sp) " ); // + // asm volatile("sw s4 , 0x1c(sp) " ); // + // asm volatile("sw s5 , 0x18(sp) " ); // + // asm volatile("sw s6 , 0x14(sp) " ); // + // asm volatile("sw s7 , 0x10(sp) " ); // + // asm volatile("sw s8 , 0x0c(sp) " ); // + // asm volatile("sw s9 , 0x08(sp) " ); // + // asm volatile("sw s10, 0x04(sp) " ); // + // asm volatile("sw s11, 
0x00(sp) " ); // + + //-------------------------------------------------------------------------------- + // asm volatile("addi a7,x0, 4 " ); // a7 = WIDTH; + + asm volatile("addi a6,x0,32 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("addi t0,x0, 0 " ); // t0 = m0 =0; + asm volatile("addi s3,x0, 32 " ); // s3 = K*4; + asm volatile("slli s4,%0, 2 " :: "r" (N) ); // s4 = N*4; + + asm volatile("addi t1,x0, 0 " ); // t1 = n0 =0 + asm volatile("addi t3,t0,8 " ); // t3 = m0+WIDTH + asm volatile("mul s1,s3,t0 " ); // s1 = K*4*m0 + asm volatile("mul s2,s3,t3 " ); // s2 = K*4*(m0+WIDTH) + asm volatile("mul s0,s4,t0 " ); // s0 = N*4*m0; + asm volatile("mul s10,s4,t3 " ); // s10 = N*4*(m0+WIDTH) + asm volatile("add s1,%0,x0 " :: "r" (addrA) ); // s1 = startAddrA0 = addrA + K*4*m0 + asm volatile("add s2,%0,x0 " :: "r" (addrA) ); // s2 = startAddrA1 = addrA + K*4*(m0+WIDTH) + asm volatile("add s0,%0,x0 " :: "r" (addrC) ); // s0 = startAddrC0x = addrC + N*4*m0 + asm volatile("add s10,%0,x0 " :: "r" (addrC) ); // s10 = startAddrC1x = addrC + N*4*(m0+WIDTH) + + asm volatile("addi t4,t1,8 " ); // t4 = n0+WIDTH; + asm volatile("addi t2,x0,16 " ); // t2 = k0 = 16; + asm volatile("slli t5,t1, 2 " ); // t5 = n0*4; + asm volatile("mld.w m0, (s1) , s3 " ); // m0 = A[s1] + asm volatile("mld.w m4, (s0), s3 " ); // m4 = 0; + asm volatile("mul s9,s3,t1 " ); // s9 = K*4*n0; + asm volatile("add s9 ,%0,s9 " :: "r" (addrB) ); // s9 = startAddrB0 = addrB + K*4*n0 + asm volatile("mld.w m1, (s9) , a6 " ); // m1 = B[s9] + + asm volatile("mld.w m2, (s1) , s3 " ); // m0 = A[s1] + asm volatile("mzero m3 " ); // m4 = 0; + asm volatile("mld.w m5, (s9) , a6 " ); // m1 = B[s9] + asm volatile("mul s11,s3,t4 " ); // s11 = K*4*(n0+WIDTH); + asm volatile(MACC(HEAD_LINE,4,1,0) ); // m4 += m1 * m0 + asm volatile(MACC(HEAD_LINE,3,5,2) ); // m4 += m1 * m0 + asm volatile("add s11,%0,s11 " :: "r" (addrB) ); // s11 = startAddrB1 = addrB + K*4*(n0+WIDTH) + asm volatile("add s6,t5,0 " ); // s6 = startAddrC00 += n0*4 + 
asm volatile("mst.w m4, (s0) , s4 " ); // m4 -> (s6) + asm volatile("mst.w m3, (s0) , s4 " ); // m4 -> (s6) + asm volatile("mld.w m0, (s1) , s3 " ); // m0 = A[s1] + asm volatile("mld.w m4, (s0), s3 " ); // m4 = 0; + asm volatile("mul s9,s3,t1 " ); // s9 = K*4*n0; + asm volatile("add s9 ,%0,s9 " :: "r" (addrB) ); // s9 = startAddrB0 = addrB + K*4*n0 + asm volatile("mld.w m1, (s9) , a6 " ); // m1 = B[s9] + + asm volatile("mld.w m2, (s1) , s3 " ); // m0 = A[s1] + asm volatile("mld.w m3, (s0), s3 " ); // m4 = 0; + asm volatile("mld.w m5, (s9) , a6 " ); // m1 = B[s9] + asm volatile("mul s11,s3,t4 " ); // s11 = K*4*(n0+WIDTH); + asm volatile(MACC(HEAD_LINE,4,1,0) ); // m4 += m1 * m0 + asm volatile(MACC(HEAD_LINE,3,5,2) ); // m4 += m1 * m0 + asm volatile("add s11,%0,s11 " :: "r" (addrB) ); // s11 = startAddrB1 = addrB + K*4*(n0+WIDTH) + asm volatile("add s6,t5,0 " ); // s6 = startAddrC00 += n0*4 + asm volatile("mst.w m4, (s0) , s4 " ); // m4 -> (s6) + asm volatile("mst.w m3, (s0) , s4 " ); // m4 -> (s6) + asm volatile("slli t6,t4, 2 " ); // t6 = (n0+WIDTH)*4; + asm volatile("add s5,t6,s0 " ); // s5 = startAddrC01 += (n0+WIDTH)*4 + + asm volatile("add t0,t0, 16 " ); // t0 = m0 +=2*WIDTH; + //-------------------------------------------------------------------------------- + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT 
+ asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm 
volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 
" ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 
2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + // asm volatile("lw s0 , 0x2c(sp) " ); // + // asm volatile("lw s1 , 0x28(sp) " ); // + // asm volatile("lw s2 , 0x24(sp) " ); // + // asm volatile("lw s3 , 0x20(sp) " ); // + // asm volatile("lw s4 , 0x1c(sp) " ); // + // asm volatile("lw s5 , 0x18(sp) " ); // + // asm volatile("lw s6 , 0x14(sp) " ); // + // asm volatile("lw s7 , 0x10(sp) " ); // + // asm volatile("lw s8 , 0x0c(sp) " ); // + // asm volatile("lw s9 , 0x08(sp) " ); // + // asm volatile("lw s10, 0x04(sp) " ); // + // asm volatile("lw s11, 0x00(sp) " ); // + // asm volatile("addi sp, sp, 0x30 " ); // + +} + + + + + +void __attribute__ ((noinline)) matrixMul_CPU(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + for(int i=0;i= 0); + return (diff > 0.001f); +} + +int int_condition(int index){ + return (MAT_C[index] != MAT_EXP[index]); +} + +uint32_t check_results(int K, int N, int M) +{ + // check + int i, j; + uint32_t err = 0; + + // Check errors + for(i = 0; i < M; i++) { + for(j = 0; j < N; j++) { + if(CHECK_CONDITION(i*N+j)) { + err ++; + //PRINTF("Error at index %d, %d, expected %x, got %x\n\r", i, j, MAT_EXP[i*N+j], MAT_C[i*N+j]); + } + } + } + + return err; +} + +void print_matrix(DATA_OUT_t* matrix, int K, int N) +{ + for(int i=0;i way to go +// for(int m = 0; m < M; m+= 16){ +// for(int n = 0; n < N; n+=16){ +// asm volatile("mzero m0"); //m0 = C00 +// asm volatile("mzero m1"); //m1 = C01 +// asm volatile("mzero m2"); //m2 = C10 +// asm volatile("mzero m3"); //m3 = C11 +// for(int k = 0; k < K; k+=8){ +// //compute C00 +// asm volatile("mld.w m4, (addrA + m*4*K + 4*k), 4*K"); +// asm volatile("mld.w m5, (addrB + 
n*4*K + 4*k), 4*N"); +// asm volatile("MACC(m0, m4, m5)"); +// //compute C01 +// asm volatile("mld.w m7, (addrB + (n+8)*4*K + 4*k), 4*N"); +// asm volatile("MACC(m1, m4, m7)"); +// //compute C10 +// asm volatile("mld.w m6, (addrA + (m+8)*4*K + 4*k), 4*K"); +// asm volatile("MACC(m2, m6, m5)"); +// //compute C11 +// asm volatile("MACC(m3, m6, m7)"); +// } +// //store C00 +// asm volatile("mst.w m0, (addrC + m*4*N + n*4), 4*N"); +// //store C01 +// asm volatile("mst.w m1, (addrC + m*4*N + (n+8)*4), 4*N"); +// //store C10 +// asm volatile("mst.w m2, (addrC + (m+8)*4*N + n*4), 4*N"); +// //store C11 +// asm volatile("mst.w m3, (addrC + (m+8)*4*N + (n+8)*4), 4*N"); + +// } +// } +void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + asm volatile("addi sp, sp, -0x30 " ); // + asm volatile("sw s0 , 0x2c(sp) " ); // + asm volatile("sw s1 , 0x28(sp) " ); // + asm volatile("sw s2 , 0x24(sp) " ); // + asm volatile("sw s3 , 0x20(sp) " ); // + asm volatile("sw s4 , 0x1c(sp) " ); // + asm volatile("sw s5 , 0x18(sp) " ); // + asm volatile("sw s6 , 0x14(sp) " ); // + asm volatile("sw s7 , 0x10(sp) " ); // + asm volatile("sw s8 , 0x0c(sp) " ); // + asm volatile("sw s9 , 0x08(sp) " ); // + asm volatile("sw s10, 0x04(sp) " ); // + asm volatile("sw s11, 0x00(sp) " ); // + + //-------------------------------------------------------------------------------- + // asm volatile("addi a7,x0, 4 " ); // a7 = WIDTH; + asm volatile("sll a6,%0,%1 " :: "r" (N),"r" (shift) ); // a6 = N* 2**SIMD_SHIFT + asm volatile("addi t0,x0, 0 " ); // t0 = m0 =0; + asm volatile("slli s3,%0, 2 " :: "r" (K) ); // s3 = K*4; + asm volatile("slli s4,%0, 2 " :: "r" (N) ); // s4 = N*4; + + asm volatile("loopM_start16x16: " ); // while(m0 (s6) + asm volatile(MACC(HEAD_LINE,5,3,0) ); // m5 += m3 * m0 + asm volatile("add s7,t5,s10 " ); // s7 = startAddrC10 += n0*4 + asm volatile("mst.w m6, (s7) , s4 " ); // m6 -> (s7) + asm 
volatile(MACC(HEAD_LINE,7,3,2) ); // m7 += m3 * m2 + asm volatile("slli t6,t4, 2 " ); // t6 = (n0+WIDTH)*4; + asm volatile("add s5,t6,s0 " ); // s5 = startAddrC01 += (n0+WIDTH)*4 + asm volatile("mst.w m5, (s5) , s4 " ); // m5 -> (s5) + asm volatile("addi t1,t1, 16 " ); // t1 = n0+=2*WIDTH; + asm volatile("add s8,t6,s10 " ); // s8 = startAddrC11 += (n0+WIDTH)*4 + asm volatile("mst.w m7, (s8) , s4 " ); // m7 -> (s8) + asm volatile("blt t1, %0, loopN_start16x16" :: "r" (N) ); // endwhile(n0 (s11) + asm volatile("blt t4, %0, loopN_start8x8big" :: "r" (N) ); // endwhile(n0 +#include +#include "csr.h" +#include "x-heep.h" +#include "gpio.h" +// #include "vcd_util.h" + +/* Define Datatype and set of data */ +#if TYPE == 0 + #include "matrixMul32i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 1 + #define HEAD_LINE "mmasa.w" + #define SIMD_SHIFT 2 + typedef int32_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#elif TYPE == 1 + #include "matrixMul32f.h" + #define FS_INITIAL 0x1 + #define SIMD_FACTOR 1 + #define HEAD_LINE "fmmacc.s" + #define SIMD_SHIFT 2 + typedef float DATA_IN_t ; + typedef float DATA_OUT_t; +#elif TYPE == 2 + #include "matrixMul8i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 4 + #define HEAD_LINE "mmaqa.b" + #define SIMD_SHIFT 0 + typedef int8_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#elif TYPE == 3 + #include "matrixMul16i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 2 + #define HEAD_LINE "mmada.h" + #define SIMD_SHIFT 1 + typedef int16_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#else +#endif + +/* Declare functions and global variables */ +DATA_OUT_t __attribute__((section(".xheep_data_interleaved"))) matrix_C[SIZE*SIZE]; +void __attribute__ ((noinline)) matrixMul_4x4(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void __attribute__ ((noinline)) matrixMul_8x8(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void __attribute__ ((noinline)) 
matrixMulBigRF_8x8(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void __attribute__ ((noinline)) matrixMul_CPU(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void print_matrix(DATA_OUT_t* matrix, int K, int N); +int float_condition(int index); +int int_condition(int index); +uint32_t check_results(int K, int N, int M); + + +/* Select print mode */ + +#if TARGET_SIM && PRINTF_IN_SIM + #define PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#elif TARGET_PYNQ_Z2 && PRINTF_IN_FPGA + #define PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#else + #define PRINTF(...) +#endif + +/* Select kernel */ +#if OUTPUT_TILE_SIZE == 16 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_4x4((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#elif OUTPUT_TILE_SIZE == 64 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_8x8((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#elif OUTPUT_TILE_SIZE == 0 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_CPU((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#else + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) +#endif + + +/* Select check condition */ +#if FS_INITIAL == 0x1 + #define CHECK_CONDITION(index) float_condition((int) index) +#elif FS_INITIAL == 0x0 + #define CHECK_CONDITION(index) int_condition ((int) index) +#else +#endif + +/* VCD Functions */ +#if VCD_ENABLE == 1 + #define VCD_START() vcd_init(); vcd_enable() + #define VCD_STOP() vcd_disable() +#else + #define VCD_START() + #define VCD_STOP() +#endif + +/* Matrices */ +#define MAT_A matrix_A +#define MAT_B matrix_BT +#define MAT_C matrix_C +#define MAT_EXP 
matrix_EXP + +#define MACC(HEAD,__mat1__, __mat2__, __mat3__) HEAD " m" #__mat1__", m"#__mat2__", m"#__mat3__ +// ------------------------------------------------------------------------------------------------------------------------------------- + + +int main() +{ + uint32_t errors = 0; + unsigned int cycles; + + // Save the address of the matrices + DATA_IN_t* addrA = MAT_A; + DATA_IN_t* addrB = MAT_B; + DATA_OUT_t* addrC = MAT_C; + + int K_size = SIZE/SIMD_FACTOR; + int N_size = SIZE ; + int M_size = SIZE ; + + //enable FP operations + CSR_SET_BITS(CSR_REG_MSTATUS, (FS_INITIAL << 13)); + + //start mcycle csr + CSR_CLEAR_BITS(CSR_REG_MCOUNTINHIBIT, 0x1); + CSR_WRITE(CSR_REG_MCYCLE, 0); + + //execute the kernel + // vcd_init(); + // vcd_enable(); + VCD_START(); + matrixMul_16x16(addrA,addrB,addrC,K_size,N_size,M_size,SIMD_SHIFT); + VCD_STOP(); + // vcd_disable(); + + //read mcycle csr + CSR_READ(CSR_REG_MCYCLE, &cycles); + + //check results + errors = check_results(K_size,N_size,M_size); + + PRINTF("program finished with %d errors and %d cycles\n\r", errors, cycles); + // PRINTF("MATRIX C:\n\r"); + // print_matrix(addrC, M_size, N_size); + // PRINTF("MATRIX EXP:\n\r"); + // print_matrix((DATA_OUT_t *) MAT_EXP, M_size, N_size); + return errors; +} + + +// ------------------------------------------------------------------------------------------------------------------------------------- + + +// Output tile size: 4x4 +void __attribute__ ((noinline)) matrixMul_4x4(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + asm volatile("addi sp, sp, -0x18 " ); // + asm volatile("sw s3 , 0x18(sp) " ); // + asm volatile("sw s4 , 0x14(sp) " ); // + asm volatile("sw s5 , 0x10(sp) " ); // + asm volatile("sw s7 , 0x0c(sp) " ); // + asm volatile("sw s9 , 0x08(sp) " ); // + asm volatile("sw s10, 0x04(sp) " ); // + asm volatile("sw s11, 0x00(sp) " ); // + + //-------------------------------------------------------------------------------- + asm 
volatile("addi t5,x0, 0 " ); // t5 = m0 =0; + asm volatile("slli s4,%0, 2 " :: "r" (N) ); // s4 = N*4; + asm volatile("sll s7,%0,%1 " :: "r" (N),"r" (shift) ); // s7 = N* 2**SIMD_SHIFT; + asm volatile("slli s3,%0, 2 " :: "r" (K) ); // s3 = K*4; + asm volatile("loopM_start4x4: " ); // while(m0 (s11) + asm volatile("blt t4, %0, loopN_start4x4" :: "r" (N) ); // endwhile(n0 (s6) + asm volatile(MACC(HEAD_LINE,5,3,0) ); // m5 += m3 * m0 + asm volatile("add s7,t5,s10 " ); // s7 = startAddrC10 += n0*4 + asm volatile("mst.w m6, (s7) , s4 " ); // m6 -> (s7) + asm volatile(MACC(HEAD_LINE,7,3,2) ); // m7 += m3 * m2 + asm volatile("slli t6,t4, 2 " ); // t6 = (n0+WIDTH)*4; + asm volatile("add s5,t6,s0 " ); // s5 = startAddrC01 += (n0+WIDTH)*4 + asm volatile("mst.w m5, (s5) , s4 " ); // m5 -> (s5) + asm volatile("addi t1,t1, 8 " ); // t1 = n0+=2*WIDTH; + asm volatile("add s8,t6,s10 " ); // s8 = startAddrC11 += (n0+WIDTH)*4 + asm volatile("mst.w m7, (s8) , s4 " ); // m7 -> (s8) + asm volatile("blt t1, %0, loopN_start8x8" :: "r" (N) ); // endwhile(n0 way to go +/*void __attribute__ ((noinline)) matrixMul_16x16_C(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift){ + uint32_t K_4 = K*4; + uint32_t N_4 = N*4; + for(int m = 0; m < M; m+= 16){ + for(int n = 0; n < N; n+=16){ + asm volatile("mzero m0"); //m0 = C00 + asm volatile("mzero m1"); //m1 = C01 + asm volatile("mzero m2"); //m2 = C10 + asm volatile("mzero m3"); //m3 = C11 + for(int k = 0; k < K; k+=8){ + //compute C00 + asm volatile("mld.w m4, (addrA + m*4*K + 4*k), 4*K"); + asm volatile("mld.w m5, (addrB + n*4*K + 4*k), 4*N"); + asm volatile("MACC(m0, m4, m5)"); + //compute C01 + asm volatile("mld.w m7, (addrB + (n+8)*4*K + 4*k), 4*N"); + asm volatile("MACC(m1, m4, m7)"); + //compute C10 + asm volatile("mld.w m6, (addrA + (m+8)*4*K + 4*k), 4*K"); + asm volatile("MACC(m2, m6, m5)"); + //compute C11 + asm volatile("MACC(m3, m6, m7)"); + } + //store C00 + asm volatile("mst.w m0, (addrC + m*4*N + 
n*4), 4*N"); + //store C01 + asm volatile("mst.w m1, (addrC + m*4*N + (n+8)*4), 4*N"); + //store C10 + asm volatile("mst.w m2, (addrC + (m+8)*4*N + n*4), 4*N"); + //store C11 + asm volatile("mst.w m3, (addrC + (m+8)*4*N + (n+8)*4), 4*N"); + + } + } +} */ +void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + asm volatile("addi sp, sp, -0x30 " ); // + asm volatile("sw s0 , 0x2c(sp) " ); // + asm volatile("sw s1 , 0x28(sp) " ); // + asm volatile("sw s2 , 0x24(sp) " ); // + asm volatile("sw s3 , 0x20(sp) " ); // + asm volatile("sw s4 , 0x1c(sp) " ); // + asm volatile("sw s5 , 0x18(sp) " ); // + asm volatile("sw s6 , 0x14(sp) " ); // + asm volatile("sw s7 , 0x10(sp) " ); // + asm volatile("sw s8 , 0x0c(sp) " ); // + asm volatile("sw s9 , 0x08(sp) " ); // + asm volatile("sw s10, 0x04(sp) " ); // + asm volatile("sw s11, 0x00(sp) " ); // + + //-------------------------------------------------------------------------------- + // asm volatile("addi a7,x0, 4 " ); // a7 = WIDTH; + asm volatile("sll a6,%0,%1 " :: "r" (N),"r" (shift) ); // a6 = N* 2**SIMD_SHIFT + asm volatile("addi t0,x0, 0 " ); // t0 = m0 =0; + asm volatile("slli s3,%0, 2 " :: "r" (K) ); // s3 = K*4; + asm volatile("slli s4,%0, 2 " :: "r" (N) ); // s4 = N*4; + + asm volatile("loopM_start16x16: " ); // while(m0 (s6) + asm volatile(MACC(HEAD_LINE,5,3,0) ); // m5 += m3 * m0 + asm volatile("add s7,t5,s10 " ); // s7 = startAddrC10 += n0*4 + asm volatile("mst.w m6, (s7) , s4 " ); // m6 -> (s7) + asm volatile(MACC(HEAD_LINE,7,3,2) ); // m7 += m3 * m2 + asm volatile("slli t6,t4, 2 " ); // t6 = (n0+WIDTH)*4; + asm volatile("add s5,t6,s0 " ); // s5 = startAddrC01 += (n0+WIDTH)*4 + asm volatile("mst.w m5, (s5) , s4 " ); // m5 -> (s5) + asm volatile("addi t1,t1, 16 " ); // t1 = n0+=2*WIDTH; + asm volatile("add s8,t6,s10 " ); // s8 = startAddrC11 += (n0+WIDTH)*4 + asm volatile("mst.w m7, (s8) , s4 " ); // m7 -> (s8) + asm volatile("blt t1, 
%0, loopN_start16x16" :: "r" (N) ); // endwhile(n0 (s11) + asm volatile("blt t4, %0, loopN_start8x8big" :: "r" (N) ); // endwhile(n0= 0); + return (diff > 0.001f); +} + +int int_condition(int index){ + return (MAT_C[index] != MAT_EXP[index]); +} + +uint32_t check_results(int K, int N, int M) +{ + // check + int i, j; + uint32_t err = 0; + + // Check errors + for(i = 0; i < M; i++) { + for(j = 0; j < N; j++) { + if(CHECK_CONDITION(i*N+j)) { + err ++; + PRINTF("Error at index %d, %d, expected %x, got %x\n\r", i, j, MAT_EXP[i*N+j], MAT_C[i*N+j]); + } + } + } + + return err; +} + +void print_matrix(DATA_OUT_t* matrix, int K, int N) +{ + for(int i=0;i