From 3b7c26f767a251d7a8c10844c7641c4e68e70483 Mon Sep 17 00:00:00 2001 From: "bsc25f1 Angelo Nujic (anujic)" Date: Mon, 10 Mar 2025 14:17:17 +0100 Subject: [PATCH 01/18] added assembly code for 8x8 Matmul with 8x8 register file --- sw/applications/quadrilatero_matmul/main.c | 154 +++++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/sw/applications/quadrilatero_matmul/main.c b/sw/applications/quadrilatero_matmul/main.c index aa7f26a41..51284c585 100644 --- a/sw/applications/quadrilatero_matmul/main.c +++ b/sw/applications/quadrilatero_matmul/main.c @@ -12,6 +12,10 @@ // Supported values: 16 (4x4), 64 (8x8). #define OUTPUT_TILE_SIZE 64 +/*Register Length*/ +// Supported values: 128 (4x4), 256 (8x8) +#define RLEN 128 + /* By default, printfs are deactivated. */ #define PRINTF_IN_FPGA 0 #define PRINTF_IN_SIM 1 @@ -74,6 +78,7 @@ DATA_OUT_t __attribute__((section(".xheep_data_interleaved"))) matrix_C[SIZE*SIZE]; void __attribute__ ((noinline)) matrixMul_4x4(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); void __attribute__ ((noinline)) matrixMul_8x8(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void __attribute__ ((noinline)) matrixMulBigRF_8x8(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); void __attribute__ ((noinline)) matrixMul_CPU(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); int float_condition(int index); int int_condition(int index); @@ -334,6 +339,155 @@ void __attribute__ ((noinline)) matrixMul_8x8(DATA_IN_t* addrA,DATA_IN_t* addr asm volatile("addi sp, sp, 0x30 " ); // } +// Output tile size: 8x8 only for RLEN = 256 +// Naive C code +// for(int m = 0; m < M; m+= 8){ +// for(int n = 0; n < N; n += 8){ +// mzero m0; +// for (int k = 0; k < K; k+= 8){ +// mld.w m1, (addrA + m*4*K + 4*k), 4*K; //load matrix A +// mld.w m2, (addrB + n*4*K + k*4), 4*N //load matrix B +// MACC(m0, m1, m2); +// } 
+// mst.w m0, (addrC + m*4*N + n*4), 4*N; //store matrix C +// } +// } +// more optimized C code using loop unrolling in the K loop +// for(int m = 0; m < M; m+= 8){ +// for(int n = 0; n < N; n += 8){ +// mzero m0; +// for (int k = 0; k < K; k+= 16){ +// mld.w m1, (addrA + m*4*K + 4*k), 4*K; //load matrix A +// mld.w m2, (addrB + n*4*K + k*4), 4*N; //load matrix B +// MACC(m0, m1, m2); +// mld.w m3, (addrA + m*4*K + 4*(k+8)), 4*K +// mld.w m4, (addrB + n*4*K + (k+8)*4), 4*N; +// MACC(m0, m3,m4); +// } +// mst.w m0, (addrC + m*4*N + n*4), 4*N; //store matrix C + +// } +// } +// even more optimized +// for(int m = 0; m < M; m+= 8){ +// for(int n = 0; n < N; n += 8){ +// asm volatile("mzero m0"); +// asm volatile("mld.w m1, (addrA + m*4*K), 4*K"); //load matrix A +// asm volatile("mld.w m2, (addrB + n*4*K), 4*N"); //load matrix B +// asm volatile("MACC(m0, m1, m2)"); +// for(int k = 8; k < K; k+= 24){ +// asm volatile("mld.w m3, (addrA + m*4*K + 4*k), 4*K"); +// asm volatile("mld.w m4, (addrB + n*4*K + k*4), 4*N"); +// asm volatile("MACC(m0, m3,m4)"); +// asm volatile("mld.w m5, (addrA + m*4*K + 4*(k+8)), 4*K"); +// asm volatile("mld.w m6, (addrB + n*4*K + (k+8)*4), 4*N"); +// asm volatile("MACC(m0, m5,m6)"); +// asm volatile("mld.w m7, (addrA + m*4*K + 4*(k+16)), 4*K"); +// asm volatile("mld.w m1, (addrB + n*4*K + (k+16)*4), 4*N"); +// asm volatile("MACC(m0, m7,m1)"); +// } +// asm volatile("mld.w m2, (addrA + m*4*K + 4*(K-8)), 4*K"); +// asm volatile("mld.w m3, (addrB + n*4*K + (K-8)*4), 4*N"); +// asm volatile("MACC(m0, m2,m3)"); +// asm volatile("mst.w m0, (addrC + m*4*N + n*4), 4*N"); //store matrix C +// } +// } + +void __attribute__ ((noinline)) matrixMulbigRF_8x8(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + asm volatile("addi sp, sp, -0x18 " ); // + asm volatile("sw s3 , 0x18(sp) " ); // + asm volatile("sw s4 , 0x14(sp) " ); // + asm volatile("sw s5 , 0x10(sp) " ); // + asm volatile("sw s7 , 0x0c(sp) " ); // + asm 
volatile("sw s9 , 0x08(sp) " ); // + asm volatile("sw s10, 0x04(sp) " ); // + asm volatile("sw s11, 0x00(sp) " ); // + + //-------------------------------------------------------------------------------- + asm volatile("addi t5,x0, 0 " ); // t5 = m0 =0; + asm volatile("slli s4,%0, 2 " :: "r" (N) ); // s4 = N*4; + asm volatile("sll s7,%0,%1 " :: "r" (N),"r" (shift) ); // s7 = N* 2**SIMD_SHIFT; + asm volatile("slli s3,%0, 2 " :: "r" (K) ); // s3 = K*4; + asm volatile("loopM_start8x8big: " ); // while(m0 (s11) + asm volatile("blt t4, %0, loopN_start8x8big" :: "r" (N) ); // endwhile(n0 Date: Mon, 17 Mar 2025 17:26:58 +0100 Subject: [PATCH 02/18] Added matMul_16x16 assembly --- sw/applications/quadrilatero_matmul/main.c | 141 +++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/sw/applications/quadrilatero_matmul/main.c b/sw/applications/quadrilatero_matmul/main.c index 51284c585..0b343265c 100644 --- a/sw/applications/quadrilatero_matmul/main.c +++ b/sw/applications/quadrilatero_matmul/main.c @@ -393,6 +393,147 @@ void __attribute__ ((noinline)) matrixMul_8x8(DATA_IN_t* addrA,DATA_IN_t* addr // } // } +// //unrolling the m/n loops -> way to go +// for(int m = 0; m < M; m+= 16){ +// for(int n = 0; n < N; n+=16){ +// asm volatile("mzero m0"); //m0 = C00 +// asm volatile("mzero m1"); //m1 = C01 +// asm volatile("mzero m2"); //m2 = C10 +// asm volatile("mzero m3"); //m3 = C11 +// for(int k = 0; k < K; k+=8){ +// //compute C00 +// asm volatile("mld.w m4, (addrA + m*4*K + 4*k), 4*K"); +// asm volatile("mld.w m5, (addrB + n*4*K + 4*k), 4*N"); +// asm volatile("MACC(m0, m4, m5)"); +// //compute C01 +// asm volatile("mld.w m7, (addrB + (n+8)*4*K + 4*k), 4*N"); +// asm volatile("MACC(m1, m4, m7)"); +// //compute C10 +// asm volatile("mld.w m6, (addrA + (m+8)*4*K + 4*k), 4*K"); +// asm volatile("MACC(m2, m6, m5)"); +// //compute C11 +// asm volatile("MACC(m3, m6, m7)"); +// } +// //store C00 +// asm volatile("mst.w m0, (addrC + m*4*N + n*4), 4*N"); +// //store C01 
+// asm volatile("mst.w m1, (addrC + m*4*N + (n+8)*4), 4*N"); +// //store C10 +// asm volatile("mst.w m2, (addrC + (m+8)*4*N + n*4), 4*N"); +// //store C11 +// asm volatile("mst.w m3, (addrC + (m+8)*4*N + (n+8)*4), 4*N"); + +// } +// } +void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + asm volatile("addi sp, sp, -0x30 " ); // + asm volatile("sw s0 , 0x2c(sp) " ); // + asm volatile("sw s1 , 0x28(sp) " ); // + asm volatile("sw s2 , 0x24(sp) " ); // + asm volatile("sw s3 , 0x20(sp) " ); // + asm volatile("sw s4 , 0x1c(sp) " ); // + asm volatile("sw s5 , 0x18(sp) " ); // + asm volatile("sw s6 , 0x14(sp) " ); // + asm volatile("sw s7 , 0x10(sp) " ); // + asm volatile("sw s8 , 0x0c(sp) " ); // + asm volatile("sw s9 , 0x08(sp) " ); // + asm volatile("sw s10, 0x04(sp) " ); // + asm volatile("sw s11, 0x00(sp) " ); // + + //-------------------------------------------------------------------------------- + // asm volatile("addi a7,x0, 4 " ); // a7 = WIDTH; + asm volatile("sll a6,%0,%1 " :: "r" (N),"r" (shift) ); // a6 = N* 2**SIMD_SHIFT + asm volatile("addi t0,x0, 0 " ); // t0 = m0 =0; + asm volatile("slli s3,%0, 2 " :: "r" (K) ); // s3 = K*4; + asm volatile("slli s4,%0, 2 " :: "r" (N) ); // s4 = N*4; + + asm volatile("loopM_start16x16: " ); // while(m0 (s6) + asm volatile(MACC(HEAD_LINE,5,3,0) ); // m5 += m3 * m0 + asm volatile("add s7,t5,s10 " ); // s7 = startAddrC10 += n0*4 + asm volatile("mst.w m6, (s7) , s4 " ); // m6 -> (s7) + asm volatile(MACC(HEAD_LINE,7,3,2) ); // m7 += m3 * m2 + asm volatile("slli t6,t4, 2 " ); // t6 = (n0+WIDTH)*4; + asm volatile("add s5,t6,s0 " ); // s5 = startAddrC01 += (n0+WIDTH)*4 + asm volatile("mst.w m5, (s5) , s4 " ); // m5 -> (s5) + asm volatile("addi t1,t1, 16 " ); // t1 = n0+=2*WIDTH; + asm volatile("add s8,t6,s10 " ); // s8 = startAddrC11 += (n0+WIDTH)*4 + asm volatile("mst.w m7, (s8) , s4 " ); // m7 -> (s8) + asm volatile("blt t1, %0, 
loopN_start16x16" :: "r" (N) ); // endwhile(n0 Date: Fri, 21 Mar 2025 16:23:31 +0100 Subject: [PATCH 03/18] Changed RLEN/LLEN/ALEN values --- .../rtl/quadrilatero_ff_fs_dr_stage.sv | 10 +- .../rtl/quadrilatero_register_lsu.sv | 14 +- .../rtl/quadrilatero_systolic_array.sv | 10 +- .../quadrilatero_matmul_16x16/main.c | 677 ++++++++++++++++++ .../quadrilatero_matmul_16x16/matrixMul32i.h | 209 ++++++ 5 files changed, 903 insertions(+), 17 deletions(-) create mode 100644 sw/applications/quadrilatero_matmul_16x16/main.c create mode 100644 sw/applications/quadrilatero_matmul_16x16/matrixMul32i.h diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_ff_fs_dr_stage.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_ff_fs_dr_stage.sv index 2b2600dac..0c327f8c5 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_ff_fs_dr_stage.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_ff_fs_dr_stage.sv @@ -12,7 +12,7 @@ module quadrilatero_ff_fs_dr_stage #( parameter DATA_WIDTH = 32, parameter N_REGS = 8, localparam N_ROWS = MESH_WIDTH, - localparam RLEN = DATA_WIDTH * MESH_WIDTH + localparam ALEN = DATA_WIDTH * MESH_WIDTH ) ( input logic clk_i, input logic rst_ni, @@ -28,7 +28,7 @@ module quadrilatero_ff_fs_dr_stage #( // Data Read Register Port output logic [$clog2(N_REGS)-1:0] data_raddr_o, output logic [$clog2(N_ROWS)-1:0] data_rrowaddr_o, - input logic [RLEN-1:0] data_rdata_i, + input logic [ALEN-1:0] data_rdata_i, input logic data_rdata_valid_i, output logic data_rdata_ready_o, output logic data_rlast_o, @@ -36,7 +36,7 @@ module quadrilatero_ff_fs_dr_stage #( // Accumulator Read Register Port output logic [$clog2(N_REGS)-1:0] acc_raddr_o, output logic [$clog2(N_ROWS)-1:0] acc_rrowaddr_o, - input logic [RLEN-1:0] acc_rdata_i, + input logic [ALEN-1:0] acc_rdata_i, input logic acc_rdata_valid_i, output logic acc_rdata_ready_o, output logic acc_rlast_o, @@ -44,7 +44,7 @@ module quadrilatero_ff_fs_dr_stage #( // Accumulator Out Write Register Port output logic 
[$clog2(N_REGS)-1:0] res_waddr_o, output logic [$clog2(N_ROWS)-1:0] res_wrowaddr_o, - output logic [ RLEN-1:0] res_wdata_o, + output logic [ ALEN-1:0] res_wdata_o, output logic res_we_o, output logic res_wlast_o, input logic res_wready_i, @@ -82,7 +82,7 @@ module quadrilatero_ff_fs_dr_stage #( logic [ $clog2(N_REGS)-1:0] n_res_waddr; logic [ $clog2(N_ROWS)-1:0] n_res_wrowaddr; - logic [ RLEN-1:0] n_res_wdata; + logic [ ALEN-1:0] n_res_wdata; logic n_res_we; logic [ $clog2(N_REGS)-1:0] data_reg_ff; // data register diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index 6f357e0bc..ed7ed8b6f 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -13,7 +13,7 @@ module quadrilatero_register_lsu #( parameter int unsigned BUS_WIDTH = 128, parameter int unsigned N_REGS = 8, parameter int unsigned N_ROWS = 4, - localparam int unsigned RLEN = BUS_WIDTH + localparam int unsigned LLEN = BUS_WIDTH ) ( input logic clk_i, input logic rst_ni, @@ -33,7 +33,7 @@ module quadrilatero_register_lsu #( // Register Write Port for load unit output logic [$clog2(N_REGS)-1:0] waddr_o, output logic [$clog2(N_ROWS)-1:0] wrowaddr_o, - output logic [ RLEN-1:0] wdata_o, + output logic [ LLEN-1:0] wdata_o, output logic we_o, output logic wlast_o, input logic wready_i, // to stall the request in case the port is busy @@ -41,7 +41,7 @@ module quadrilatero_register_lsu #( // Register Read Port for store unit output logic [$clog2(N_REGS)-1:0] raddr_o, output logic [$clog2(N_ROWS)-1:0] rrowaddr_o, - input logic [ RLEN-1:0] rdata_i, + input logic [ LLEN-1:0] rdata_i, input logic rdata_valid_i, output logic rdata_ready_o, output logic rlast_o, @@ -64,7 +64,7 @@ module quadrilatero_register_lsu #( ); - localparam MAX_EL_PER_ROW = RLEN / BUS_WIDTH; + localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; logic finished; logic 
[xif_pkg::X_ID_WIDTH-1:0] back_id_q; @@ -75,7 +75,7 @@ module quadrilatero_register_lsu #( logic [ $clog2(N_REGS)-1:0] waddr_q; logic [ $clog2(N_REGS)-1:0] waddr_d; - logic [ RLEN-1:0] load_fifo_data; + logic [ LLEN-1:0] load_fifo_data; logic load_fifo_data_available; logic load_fifo_pop; @@ -83,9 +83,9 @@ module quadrilatero_register_lsu #( logic store_fifo_space_available; logic store_fifo_push; logic store_fifo_empty; - logic [ RLEN-1:0] store_fifo_data; + logic [ LLEN-1:0] store_fifo_data; - logic [ RLEN-1:0] data_mask; + logic [ LLEN-1:0] data_mask; logic load_fifo_valid; logic busy; logic start; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index f82dce146..c87ff476d 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -18,7 +18,7 @@ module quadrilatero_systolic_array #( parameter int N_REGS = 8, parameter int ENABLE_SIMD = 1, localparam int N_ROWS = MESH_WIDTH, - localparam int RLEN = DATA_WIDTH * MESH_WIDTH, + localparam int ALEN = DATA_WIDTH * MESH_WIDTH, parameter FPU = 1 ) ( input logic clk_i, @@ -38,7 +38,7 @@ module quadrilatero_systolic_array #( // Weight Read Register Port output logic [$clog2(N_REGS)-1:0] weight_raddr_o, output logic [$clog2(N_ROWS)-1:0] weight_rrowaddr_o, - input logic [ RLEN-1:0] weight_rdata_i, + input logic [ ALEN-1:0] weight_rdata_i, input logic weight_rdata_valid_i, output logic weight_rdata_ready_o, output logic weight_rlast_o, @@ -46,7 +46,7 @@ module quadrilatero_systolic_array #( // Data Read Register Port output logic [$clog2(N_REGS)-1:0] data_raddr_o, output logic [$clog2(N_ROWS)-1:0] data_rrowaddr_o, - input logic [ RLEN-1:0] data_rdata_i, + input logic [ ALEN-1:0] data_rdata_i, input logic data_rdata_valid_i, output logic data_rdata_ready_o, output logic data_rlast_o, @@ -54,7 +54,7 @@ module quadrilatero_systolic_array #( // 
Accumulator Read Register Port output logic [$clog2(N_REGS)-1:0] acc_raddr_o, output logic [$clog2(N_ROWS)-1:0] acc_rrowaddr_o, - input logic [ RLEN-1:0] acc_rdata_i, + input logic [ ALEN-1:0] acc_rdata_i, input logic acc_rdata_valid_i, output logic acc_rdata_ready_o, output logic acc_rlast_o, @@ -62,7 +62,7 @@ module quadrilatero_systolic_array #( // Accumulator Out Write Register Port output logic [$clog2(N_REGS)-1:0] res_waddr_o, output logic [$clog2(N_ROWS)-1:0] res_wrowaddr_o, - output logic [ RLEN-1:0] res_wdata_o, + output logic [ ALEN-1:0] res_wdata_o, output logic res_we_o, output logic res_wlast_o, input logic res_wready_i, diff --git a/sw/applications/quadrilatero_matmul_16x16/main.c b/sw/applications/quadrilatero_matmul_16x16/main.c new file mode 100644 index 000000000..67bd5a2d3 --- /dev/null +++ b/sw/applications/quadrilatero_matmul_16x16/main.c @@ -0,0 +1,677 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +/*Variable Data Type*/ +// Supported types: int32_t (0), float (1), int8_t (2), int16_t (3) +#define TYPE 0 + +/* Output tile size */ +// Supported values: 16 (4x4), 64 (8x8). +#define OUTPUT_TILE_SIZE 64 + +/*Register Length*/ +// Supported values: 128 (4x4), 256 (8x8) +#define RLEN 128 + +/* By default, printfs are deactivated. */ +#define PRINTF_IN_FPGA 0 +#define PRINTF_IN_SIM 1 + +/* VCD Files generation */ +// Supported Values: 0 (No), 1 (Yes) +// #define VCD_ENABLE 0 + +// ************************************************************************************************************ +// ***************************** ***************************** +// ***************************** DO NOT TOUCH LINES BELOW ! 
***************************** +// ***************************** ***************************** +// ************************************************************************************************************ + +/* Includes */ +#include +#include +#include "csr.h" +#include "x-heep.h" +#include "gpio.h" +// #include "vcd_util.h" + +/* Define Datatype and set of data */ +#if TYPE == 0 + #include "matrixMul32i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 1 + #define HEAD_LINE "mmasa.w" + #define SIMD_SHIFT 2 + typedef int32_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#elif TYPE == 1 + #include "matrixMul32f.h" + #define FS_INITIAL 0x1 + #define SIMD_FACTOR 1 + #define HEAD_LINE "fmmacc.s" + #define SIMD_SHIFT 2 + typedef float DATA_IN_t ; + typedef float DATA_OUT_t; +#elif TYPE == 2 + #include "matrixMul8i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 4 + #define HEAD_LINE "mmaqa.b" + #define SIMD_SHIFT 0 + typedef int8_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#elif TYPE == 3 + #include "matrixMul16i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 2 + #define HEAD_LINE "mmada.h" + #define SIMD_SHIFT 1 + typedef int16_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#else +#endif + +/* Declare functions and global variables */ +DATA_OUT_t __attribute__((section(".xheep_data_interleaved"))) matrix_C[SIZE*SIZE]; +void __attribute__ ((noinline)) matrixMul_4x4(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void __attribute__ ((noinline)) matrixMul_8x8(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void __attribute__ ((noinline)) matrixMulBigRF_8x8(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void __attribute__ ((noinline)) matrixMul_CPU(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, 
int shift); +int float_condition(int index); +int int_condition(int index); +uint32_t check_results(int K, int N, int M); + + +/* Select print mode */ + +#if TARGET_SIM && PRINTF_IN_SIM + #define PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#elif TARGET_PYNQ_Z2 && PRINTF_IN_FPGA + #define PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#else + #define PRINTF(...) +#endif + +/* Select kernel */ +#if OUTPUT_TILE_SIZE == 16 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_4x4((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#elif OUTPUT_TILE_SIZE == 64 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_8x8((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#elif OUTPUT_TILE_SIZE == 0 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_CPU((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#else + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) +#endif + + +/* Select check condition */ +#if FS_INITIAL == 0x1 + #define CHECK_CONDITION(index) float_condition((int) index) +#elif FS_INITIAL == 0x0 + #define CHECK_CONDITION(index) int_condition ((int) index) +#else +#endif + +/* VCD Functions */ +#if VCD_ENABLE == 1 + #define VCD_START() vcd_init(); vcd_enable() + #define VCD_STOP() vcd_disable() +#else + #define VCD_START() + #define VCD_STOP() +#endif + +/* Matrices */ +#define MAT_A matrix_A +#define MAT_B matrix_BT +#define MAT_C matrix_C +#define MAT_EXP matrix_EXP + +#define MACC(HEAD,__mat1__, __mat2__, __mat3__) HEAD " m" #__mat1__", m"#__mat2__", m"#__mat3__ +// ------------------------------------------------------------------------------------------------------------------------------------- + + +int main() +{ + uint32_t errors = 0; + unsigned int cycles; + + // Save the address of the matrices + DATA_IN_t* addrA = MAT_A; + DATA_IN_t* addrB = MAT_B; + 
DATA_OUT_t* addrC = MAT_C; + + int K_size = SIZE/SIMD_FACTOR; + int N_size = SIZE ; + int M_size = SIZE ; + + //enable FP operations + CSR_SET_BITS(CSR_REG_MSTATUS, (FS_INITIAL << 13)); + + //start mcycle csr + CSR_CLEAR_BITS(CSR_REG_MCOUNTINHIBIT, 0x1); + CSR_WRITE(CSR_REG_MCYCLE, 0); + + //execute the kernel + // vcd_init(); + // vcd_enable(); + VCD_START(); + matrixMul_16x16(addrA,addrB,addrC,K_size,N_size,M_size,SIMD_SHIFT); + VCD_STOP(); + // vcd_disable(); + + //read mcycle csr + CSR_READ(CSR_REG_MCYCLE, &cycles); + + //check results + errors = check_results(K_size,N_size,M_size); + + PRINTF("program finished with %d errors and %d cycles\n\r", errors, cycles); + return errors; +} + + +// ------------------------------------------------------------------------------------------------------------------------------------- + + +// Output tile size: 4x4 +void __attribute__ ((noinline)) matrixMul_4x4(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + asm volatile("addi sp, sp, -0x18 " ); // + asm volatile("sw s3 , 0x18(sp) " ); // + asm volatile("sw s4 , 0x14(sp) " ); // + asm volatile("sw s5 , 0x10(sp) " ); // + asm volatile("sw s7 , 0x0c(sp) " ); // + asm volatile("sw s9 , 0x08(sp) " ); // + asm volatile("sw s10, 0x04(sp) " ); // + asm volatile("sw s11, 0x00(sp) " ); // + + //-------------------------------------------------------------------------------- + asm volatile("addi t5,x0, 0 " ); // t5 = m0 =0; + asm volatile("slli s4,%0, 2 " :: "r" (N) ); // s4 = N*4; + asm volatile("sll s7,%0,%1 " :: "r" (N),"r" (shift) ); // s7 = N* 2**SIMD_SHIFT; + asm volatile("slli s3,%0, 2 " :: "r" (K) ); // s3 = K*4; + asm volatile("loopM_start4x4: " ); // while(m0 (s11) + asm volatile("blt t4, %0, loopN_start4x4" :: "r" (N) ); // endwhile(n0 (s6) + asm volatile(MACC(HEAD_LINE,5,3,0) ); // m5 += m3 * m0 + asm volatile("add s7,t5,s10 " ); // s7 = startAddrC10 += n0*4 + asm volatile("mst.w m6, (s7) , s4 " ); // m6 -> (s7) + asm 
volatile(MACC(HEAD_LINE,7,3,2) ); // m7 += m3 * m2 + asm volatile("slli t6,t4, 2 " ); // t6 = (n0+WIDTH)*4; + asm volatile("add s5,t6,s0 " ); // s5 = startAddrC01 += (n0+WIDTH)*4 + asm volatile("mst.w m5, (s5) , s4 " ); // m5 -> (s5) + asm volatile("addi t1,t1, 8 " ); // t1 = n0+=2*WIDTH; + asm volatile("add s8,t6,s10 " ); // s8 = startAddrC11 += (n0+WIDTH)*4 + asm volatile("mst.w m7, (s8) , s4 " ); // m7 -> (s8) + asm volatile("blt t1, %0, loopN_start8x8" :: "r" (N) ); // endwhile(n0 way to go +// for(int m = 0; m < M; m+= 16){ +// for(int n = 0; n < N; n+=16){ +// asm volatile("mzero m0"); //m0 = C00 +// asm volatile("mzero m1"); //m1 = C01 +// asm volatile("mzero m2"); //m2 = C10 +// asm volatile("mzero m3"); //m3 = C11 +// for(int k = 0; k < K; k+=8){ +// //compute C00 +// asm volatile("mld.w m4, (addrA + m*4*K + 4*k), 4*K"); +// asm volatile("mld.w m5, (addrB + n*4*K + 4*k), 4*N"); +// asm volatile("MACC(m0, m4, m5)"); +// //compute C01 +// asm volatile("mld.w m7, (addrB + (n+8)*4*K + 4*k), 4*N"); +// asm volatile("MACC(m1, m4, m7)"); +// //compute C10 +// asm volatile("mld.w m6, (addrA + (m+8)*4*K + 4*k), 4*K"); +// asm volatile("MACC(m2, m6, m5)"); +// //compute C11 +// asm volatile("MACC(m3, m6, m7)"); +// } +// //store C00 +// asm volatile("mst.w m0, (addrC + m*4*N + n*4), 4*N"); +// //store C01 +// asm volatile("mst.w m1, (addrC + m*4*N + (n+8)*4), 4*N"); +// //store C10 +// asm volatile("mst.w m2, (addrC + (m+8)*4*N + n*4), 4*N"); +// //store C11 +// asm volatile("mst.w m3, (addrC + (m+8)*4*N + (n+8)*4), 4*N"); + +// } +// } +void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + asm volatile("addi sp, sp, -0x30 " ); // + asm volatile("sw s0 , 0x2c(sp) " ); // + asm volatile("sw s1 , 0x28(sp) " ); // + asm volatile("sw s2 , 0x24(sp) " ); // + asm volatile("sw s3 , 0x20(sp) " ); // + asm volatile("sw s4 , 0x1c(sp) " ); // + asm volatile("sw s5 , 0x18(sp) " ); // + asm 
volatile("sw s6 , 0x14(sp) " ); // + asm volatile("sw s7 , 0x10(sp) " ); // + asm volatile("sw s8 , 0x0c(sp) " ); // + asm volatile("sw s9 , 0x08(sp) " ); // + asm volatile("sw s10, 0x04(sp) " ); // + asm volatile("sw s11, 0x00(sp) " ); // + + //-------------------------------------------------------------------------------- + // asm volatile("addi a7,x0, 4 " ); // a7 = WIDTH; + asm volatile("sll a6,%0,%1 " :: "r" (N),"r" (shift) ); // a6 = N* 2**SIMD_SHIFT + asm volatile("addi t0,x0, 0 " ); // t0 = m0 =0; + asm volatile("slli s3,%0, 2 " :: "r" (K) ); // s3 = K*4; + asm volatile("slli s4,%0, 2 " :: "r" (N) ); // s4 = N*4; + + asm volatile("loopM_start16x16: " ); // while(m0 (s6) + asm volatile(MACC(HEAD_LINE,5,3,0) ); // m5 += m3 * m0 + asm volatile("add s7,t5,s10 " ); // s7 = startAddrC10 += n0*4 + asm volatile("mst.w m6, (s7) , s4 " ); // m6 -> (s7) + asm volatile(MACC(HEAD_LINE,7,3,2) ); // m7 += m3 * m2 + asm volatile("slli t6,t4, 2 " ); // t6 = (n0+WIDTH)*4; + asm volatile("add s5,t6,s0 " ); // s5 = startAddrC01 += (n0+WIDTH)*4 + asm volatile("mst.w m5, (s5) , s4 " ); // m5 -> (s5) + asm volatile("addi t1,t1, 16 " ); // t1 = n0+=2*WIDTH; + asm volatile("add s8,t6,s10 " ); // s8 = startAddrC11 += (n0+WIDTH)*4 + asm volatile("mst.w m7, (s8) , s4 " ); // m7 -> (s8) + asm volatile("blt t1, %0, loopN_start16x16" :: "r" (N) ); // endwhile(n0 (s11) + asm volatile("blt t4, %0, loopN_start8x8big" :: "r" (N) ); // endwhile(n0= 0); + return (diff > 0.001f); +} + +int int_condition(int index){ + return (MAT_C[index] != MAT_EXP[index]); +} + +uint32_t check_results(int K, int N, int M) +{ + // check + int i, j; + uint32_t err = 0; + + // Check errors + for(i = 0; i < M; i++) { + for(j = 0; j < N; j++) { + if(CHECK_CONDITION(i*N+j)) { + err ++; + PRINTF("Error at index %d, %d, expected %x, got %x\n\r", i, j, MAT_EXP[i*N+j], MAT_C[i*N+j]); + } + } + } + + return err; +} \ No newline at end of file diff --git a/sw/applications/quadrilatero_matmul_16x16/matrixMul32i.h 
b/sw/applications/quadrilatero_matmul_16x16/matrixMul32i.h new file mode 100644 index 000000000..afb6f9109 --- /dev/null +++ b/sw/applications/quadrilatero_matmul_16x16/matrixMul32i.h @@ -0,0 +1,209 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +#ifndef _MATMUL64INT32_ +#define _MATMUL64INT32_ +// This file is automatically generated +int32_t __attribute__((section(".xheep_data_interleaved"))) matrix_A[] = { + 1537,-1933,324,-583,-389,-654,3473,-1259,3093,3476,-4013,3471,3479,-477,149,2578,920,2192,968,2691,-636,-2958,-2652,-1213,-2089,-949,-1026,-1332,-3241,-1577,-670,-1969,1423,879,-3935,458,-967,698,3840,-715,-2721,2940,-4017,2668,3351,-924,2060,-3772,-79,-1206,-2660,-2273,-1533,1363,-4089,1241,3334,-2863,-1224,2836,-3245,2525,-3209,-2805, + 1571,-2232,-1928,-3987,3570,3420,-3741,-2384,3247,-1428,-2272,-1347,-3409,-559,2852,-2918,3491,1762,445,255,-3533,-2923,-1877,-2821,1246,-556,-2474,696,3089,3085,-3169,4064,-3267,356,1975,-2179,2018,2557,823,-1494,-1103,2449,2238,2292,-3103,3817,3477,-3932,2273,3583,2209,3056,-644,-3891,-166,-1091,3571,1102,-2612,-2597,4051,2959,1663,2785, + 1807,-738,-2081,-1308,-1558,-1585,-2495,1492,3304,-2264,-3372,3739,-469,-2423,3104,560,-2014,2014,2725,-1864,3900,4030,-512,3193,2933,2441,544,-615,3883,4046,-1491,-78,1160,-2938,-1326,-3089,-519,2765,-3643,-2584,-3365,-1746,-3119,-3663,-3203,2509,-2568,-706,-2961,-1215,2576,2680,556,372,-2085,-2818,-544,-2273,3473,-1458,-2219,-1480,-1099,3680, + -3144,2578,-4007,-893,2759,-1799,352,-3698,2680,2921,-2252,2078,-1204,19,2736,-2455,3088,3488,805,967,-3541,3508,310,-1809,3927,762,876,-2,-8,-3157,-892,509,-2433,2592,3656,-2479,2256,3786,-642,-2245,2906,-2102,-1107,-1928,-1149,1174,3954,2474,-974,3507,-133,295,2486,-3816,-2446,-1207,864,-3618,1854,-1508,-1607,2701,-205,2806, + 
1855,444,3970,-1838,3071,-2491,-2514,-3276,-3869,2678,3777,795,774,1372,63,-3947,506,9,2187,-2952,-3955,-1668,3406,2301,3361,-1640,3469,-3713,3715,723,784,799,2611,2333,-1087,-2903,-463,-3410,-4021,1274,2553,3125,2041,-3310,-3298,578,2963,-1901,-3103,571,365,-2621,2871,-1230,-33,-732,3729,4008,2627,3985,-3529,-3711,3369,296, + 2875,1297,2831,1697,1962,-1878,-3971,-876,-741,806,-1300,2519,276,-589,3477,1111,-1220,3588,1437,-3358,-2688,-567,884,-868,1090,3262,-248,-1782,3971,466,-2939,398,62,-337,-82,77,-3680,2307,3905,-2163,-2703,2818,213,432,-1688,2693,-2084,2108,-388,-1474,-476,2287,-3420,-3519,-393,3512,-2756,-4000,-1939,4062,-3748,1320,-1178,2952, + -1241,1665,734,-3055,-1039,-1755,-4005,3829,1975,2631,-1899,-1105,-3397,2343,-2760,127,-2924,-3497,2876,-2539,-826,-2881,-2053,-4089,2091,-2448,-2640,4047,1450,1973,4062,-1153,-442,-596,-1593,-755,-460,-267,2676,-954,955,-3440,2966,2525,31,506,-1738,-1714,-2835,-2415,-222,-2066,-2170,3204,-2826,1920,3607,-759,-2922,-1545,2788,-3664,-2890,-2171, + 3775,-2527,-1095,-390,681,-913,1436,1171,2502,-2980,3721,-3931,-2873,-3504,-1050,3104,206,1488,-38,-692,-767,2681,-2113,-56,-2964,3702,3341,-891,971,3243,-517,1725,-2570,622,-263,-1652,-4077,-3832,756,3943,1211,2035,-1730,-3476,-1789,-21,2808,-1745,2588,172,-3552,1783,-2498,2180,-3360,1960,1380,-2792,1977,-2487,238,-2998,-1373,-1250, + 1785,-3913,3311,-2867,423,210,-1311,1035,-973,-1575,-1735,-916,-118,1110,653,2819,-1227,2766,3980,-1733,59,-584,3843,-2323,-2625,-597,833,2330,292,-1059,3176,316,1052,-2834,1446,-3417,-3744,-1317,-4063,-3514,-3626,-3490,-3196,3611,695,3079,2488,4014,-3759,2138,450,3414,-1723,1092,1128,3581,-2011,1970,-1186,-1024,871,-1658,24,-60, + 
-3023,1438,1322,-2835,-3889,-2784,4087,3891,897,-3857,-337,-2915,614,2298,-785,3227,2137,-1326,-3035,-1969,1907,1258,-3723,3320,-868,2332,743,-111,1163,-2125,434,-3888,2276,3287,-56,1484,-1077,-2124,2444,-1219,-1194,-2352,2725,2519,-757,1524,1769,-3241,-2938,864,2165,-2048,-1847,938,-656,-2397,-807,1642,3814,983,3071,-2906,-17,-183, + 1748,1280,391,3358,1906,2446,-231,2693,2667,-3219,2766,-1458,1691,650,252,-3071,-2841,-255,1018,382,-3043,642,-2898,-2177,-1234,-753,-3382,-3316,-3475,-2307,324,-3257,-3962,-1856,-1108,1370,3997,-357,-562,2993,1593,1241,2787,506,3446,1182,3139,-850,-3014,-2274,-1594,-2571,-1231,-2349,1664,-712,1311,394,3475,3921,-3075,-3532,1751,3323, + -3415,2452,-2858,2072,2091,-2280,2152,1781,-1401,-708,2032,3016,913,2111,-2415,-3776,-1419,3233,1417,-966,2815,-2274,-1030,2508,2227,501,-3974,891,430,-787,3336,-1125,-3872,-1702,-428,-758,-1034,1034,3230,-1429,671,886,3237,-3874,-777,927,-3032,-3640,-2316,2275,3531,397,-3530,-3344,-2231,2856,-2968,803,2325,-1591,-3239,-1045,2018,3999, + 2839,-2243,-4083,3884,2612,1432,-3943,-3502,-3505,780,806,-1450,3268,2238,-1581,3198,1447,-3470,4026,1593,-2991,2728,1601,1358,-1138,2808,-801,-764,445,4060,1595,-2118,3107,2449,-2507,-3240,-2957,-2386,-1388,-2277,-2836,-3897,-2088,-2640,-754,148,-1389,2289,1335,-3155,776,209,-3358,-585,-66,-290,-3462,582,-837,-3699,4004,1346,2598,-379, + 3522,-707,-1940,3565,2851,-81,1328,4065,-1228,-2530,674,2674,-3751,-3291,-419,-2147,-2141,3438,2599,-1104,-1997,-3544,2647,-1311,-2580,2702,2505,3073,-344,152,-3196,3541,-2456,-766,-1606,-1715,156,-1566,-3536,2469,-1638,3044,2270,1458,-1353,732,-2986,-3584,-1216,3590,1569,-2845,-636,1332,-380,586,-2031,-3747,-1733,-1227,1342,259,1746,517, + 
2210,3297,-835,2923,-824,-707,3254,-18,3984,-1874,-2266,-1289,2240,3205,2073,2217,-1960,3363,-3445,2661,1212,2806,-4010,-1856,-936,-18,-3673,-819,-469,2877,-347,-3076,-1510,-1024,-26,412,541,-1596,723,-1436,-297,2786,-2846,1860,389,-1117,2707,1879,1540,4027,-1611,2397,-1867,-1687,-2861,-3489,3418,3157,178,1761,2078,1192,892,3145, + -181,-2984,-340,-1615,-2414,-2801,-3325,3170,-3581,2536,-2332,4065,1916,864,-403,1747,2450,-2067,2721,2428,-201,3504,-2712,1665,3724,-2726,-2766,2258,497,-547,1523,531,2401,1317,379,1375,3210,-1212,3508,-4043,-1915,-3604,-407,-997,-1968,-2079,-1139,4022,647,1878,-2172,3470,3035,-1127,1959,-11,-2644,-404,2932,-527,2949,-3821,1698,-488, + -1470,683,-2879,3331,3212,-2196,-709,95,-2087,591,-806,2717,2091,-1581,2298,3620,3796,-1190,953,-2574,984,1512,-3750,3921,-4068,-3176,37,2898,537,-2408,-3155,-1416,235,380,2566,-3309,1529,2999,2614,2057,744,3270,-1748,-2534,1864,-1961,1334,-239,-1371,-2309,-2201,-1062,3609,3467,-418,-1654,2083,1247,3550,-3354,3042,-696,771,1725, + 452,-3788,2130,-477,-3941,1167,-850,3310,-3831,70,-1980,899,-259,1733,1419,-3903,-3050,-680,1844,1559,-48,2186,-2851,-1086,-1777,136,-444,-1553,1553,-1188,2851,-2925,3315,-2512,-141,2053,-1630,3712,-3804,-3163,3628,-1462,-2650,1871,-3478,-352,-1122,-586,60,296,-2686,3602,-2938,271,2967,-3383,3108,1455,1540,2376,93,2427,2970,-734, + -3517,1985,-2530,2685,-1844,503,2737,-20,286,-2146,2944,-2112,-828,-765,-269,-3013,2128,-4041,3058,-3635,4066,3315,3074,-1640,-2975,633,2469,2686,3415,-2477,-1787,-1135,1114,2788,-2505,715,-2547,488,-2316,2876,-86,-1089,-2619,1000,455,2753,-3837,-429,-3509,1071,706,-2297,-3466,-1320,-535,3400,751,-3984,2728,-3615,3297,2866,1033,2120, + 
-3389,-3771,-2367,2413,-1824,-1186,3927,615,-1982,294,2139,527,1547,-2032,-2101,2940,-696,3282,-412,-3402,-2612,-1528,353,1656,-2081,3885,1593,-335,3853,-1601,-2117,-1007,740,-1315,2263,-2222,411,2455,-3836,2111,-1120,-3340,2717,-2956,-1987,3601,3313,3091,-168,2593,-494,488,2733,2883,-2036,-2529,2957,1455,-1987,1674,844,-2935,730,-2471, + -1220,-1470,3428,-3256,-1412,-573,2366,-315,3234,3647,3967,1937,702,-2657,3920,-2814,-2958,-132,-1170,348,-2293,1093,1978,328,-839,2191,-3166,-362,2043,-560,-3032,-1933,2387,-1804,3741,-2251,2057,3859,-3790,-1439,176,-3424,3291,2204,-1199,-3965,90,3257,-1809,-718,-670,3133,-1145,-313,2551,-3921,-3785,31,-557,3739,-234,2109,1413,-1724, + 3675,651,2398,-3429,-1997,-3716,-3302,3289,2687,-924,2628,3902,-2908,-589,-2106,38,-608,-1471,1866,-1351,28,-2661,2057,-1175,-1338,-3770,1806,3355,3067,2311,374,-1541,3117,1959,-3548,-452,-2938,1130,-2336,3640,-3674,395,-2909,44,3038,-2955,3624,1092,-2602,-2045,2160,3897,-590,1754,-3482,1588,274,2354,3769,76,2304,-560,1413,-3695, + -3264,705,-3383,-1017,3934,-550,-1726,-2541,-3597,-1460,1410,2425,3453,513,-608,3590,2996,-787,-1725,2105,-2742,620,3106,-2235,1836,-3588,-693,-150,2657,637,3013,2394,1080,2490,-592,-4094,-2033,606,1648,-663,670,863,874,-2170,-4072,1090,1190,2062,-1651,1045,-3123,1185,3045,-1100,-2523,-3550,-75,-416,-3140,1813,644,-2725,3951,2573, + 4027,-1925,2642,-3974,2549,1653,-1932,-1926,3241,644,1363,716,1392,-2596,-2220,-2704,3714,3125,-3290,672,-3042,3532,-733,1743,-1003,-852,1833,2882,2905,2938,-1085,-482,3339,-2336,-2101,1152,-588,-2291,-367,-10,1011,-1563,-2786,2341,1869,-2901,-2182,737,3357,-2500,1901,-4064,3538,-2160,3382,4085,3174,-1535,3689,642,-2043,2208,900,-1435, + 
-1270,2160,2358,135,2799,4075,-2138,-3477,3148,-444,2808,-3067,-1627,-177,3627,-1016,-3900,-2723,1348,-2674,2140,3604,-3479,3938,4061,-2497,-1672,-3898,-2246,-367,-354,225,2123,-3802,1258,-2066,1095,-2419,-312,1769,318,748,-15,-590,3964,2450,824,2649,-776,-2418,3226,-857,2583,2481,-3416,-3472,-3431,2354,-1596,-1444,2397,-283,2357,3420, + -3787,1107,3117,-472,-849,4068,-2438,293,-1735,1931,-2078,-2571,-4072,-486,-3016,-3845,3820,1264,-2776,-3752,-3393,-2077,-2860,-3674,-1556,1675,-1673,3659,-2446,-1565,545,-3091,2897,-120,3662,263,-303,-1532,3543,1729,1789,3768,-839,-3027,1494,1912,2229,1938,3584,-2158,3013,-1937,-1774,1381,964,-3237,-340,-15,-1293,710,3475,-3383,-721,583, + -1294,2135,201,-714,3448,400,2885,2406,-1265,3446,-2900,1598,1571,2252,-1139,-2356,-3914,-499,839,125,-371,2022,-1518,3284,1083,-5,-2690,4048,3795,-2119,-1189,1616,-2537,2127,-2261,3820,3980,457,-1689,-2811,615,1504,2443,-2225,1883,-668,2618,3488,-2490,2036,3841,2500,-14,-3180,2729,3770,2967,-13,2526,4005,1582,-3813,2498,-262, + 820,696,3163,-2594,-2282,-3094,-3295,-1676,3760,-734,627,-385,-1278,-1539,1480,463,3725,-867,379,3620,2900,3841,2325,-379,-1549,-2038,-2414,133,1172,3551,2989,1561,-754,-2683,3599,4067,-239,-630,2056,-3809,-3171,-434,-2992,1393,-518,-126,3066,-4084,-1289,3217,2002,1632,785,911,-1506,203,-3240,-3664,-3216,-2835,359,-1622,3925,-2294, + 3916,-866,4047,1828,-3285,-3090,3993,-2269,3052,-3524,-4043,541,3348,3986,2815,-1118,4049,2885,3701,836,2801,1556,167,3951,1482,3363,-1079,3983,389,-3534,-511,2075,2974,-1191,2937,-3107,2309,3541,-1474,-1860,-3398,-108,2345,3093,2019,-3451,-23,-317,-430,3118,1994,1040,-3243,-844,-752,-1856,-1217,-2726,2925,-3727,790,-853,-853,-3099, + 
-4064,-829,1227,-2995,-2681,-3006,-3215,-594,815,252,-755,-1554,-804,-1594,-857,-3747,-2971,2653,-1608,3404,1969,-2711,88,-789,-4060,3016,926,-2888,-3745,6,-3322,-22,864,-3747,3973,-652,1207,364,2221,1928,1978,-2400,-253,1694,1662,144,2989,2136,-3444,2343,3303,3560,3759,-3808,2325,-2489,-2252,954,-1593,881,-2299,2214,1544,-1454, + 4008,-2276,583,2479,2282,1370,1945,1890,3732,-1830,-351,-3906,3566,3219,-1390,-76,-3124,3759,1793,-593,-90,-118,3718,-3610,-1589,2314,-2737,3238,-410,1890,-490,2649,-2706,-3483,3881,-3089,3400,3069,2088,-2540,-3159,3440,482,-863,3986,2992,1002,2137,-3464,-3419,3996,3753,3138,292,-3486,1864,-1084,-1123,-2876,-965,-932,-392,3649,-487, + -1648,65,-883,-2488,2241,277,3658,-3163,1458,-2442,-3376,2603,3963,2501,-2444,-3574,-1282,1268,3290,2000,3643,-2435,396,-3564,-866,2772,-1837,3440,-1456,3292,-190,-650,3708,-1231,190,3699,3053,451,2067,2293,2878,2448,-3443,-608,-1196,-550,-3637,819,2913,-2445,2241,331,-2701,3432,-3348,609,2342,3745,-1883,-2402,1538,-3538,759,3140, + 3349,3576,2990,-3886,-722,2742,37,3137,-710,-3822,-1355,2101,21,-494,2189,3285,-376,751,2567,-960,3637,-2701,-3601,1247,-2446,2762,3071,692,196,3093,3997,-2952,-818,1723,-2368,1340,-1226,1262,2769,3725,4020,-3143,-3877,-692,1708,-467,-514,-1500,-2645,-2943,3539,-3802,-2561,-1100,3892,-3300,2461,2600,3482,156,-3608,2937,-2663,1806, + -3176,2794,-3113,-357,2421,-323,-40,1090,-841,-985,1792,2594,-3688,3238,-1335,-998,1461,-2803,1666,3320,-1610,1962,-2571,-2208,-2776,-1639,-844,370,3931,-680,-1753,-3761,1293,-196,1077,-605,3969,-3366,3843,1485,-749,2026,1422,-2334,-3894,1999,-746,-3844,-1084,326,-2884,2452,-2988,-1497,945,-1127,-1285,2511,1856,9,-2719,-113,3403,2304, + 
3412,1839,602,-3455,-1864,-3770,2695,2273,2901,-732,-335,2302,2018,-2969,-2521,-1103,2697,-465,1651,-4055,2024,-2348,1622,-1599,2455,156,2370,1181,-1148,-1107,2529,2642,124,-941,-1850,1473,-3772,2116,-1157,-1803,1146,1164,-2174,751,3214,2380,-1935,-56,1195,2797,3486,502,-934,-2513,423,-2587,-2300,-3743,3546,-2214,3944,1652,1518,312, + -2682,-3220,1449,3748,-4080,2469,-3645,2475,320,432,-1937,1914,-1289,2852,-3285,-3248,3498,3558,-336,-1937,-2290,-3956,2653,-770,-3481,2944,2642,2125,1512,1332,2909,-566,-2284,541,-3533,3446,204,-2409,3532,-615,-2121,-3174,-428,-986,160,2129,2916,219,1800,-2018,-64,802,1169,-4080,411,2453,222,2546,-2792,1012,1714,-3827,-163,3070, + 1988,-4,1743,-562,3240,-4012,49,-176,-3257,-2262,270,-3701,2364,-484,-2550,1579,699,1098,4045,904,-2164,3106,820,-2953,-3958,2480,-2419,-3259,2573,-2491,531,956,-3931,3362,-230,-2107,2834,162,-281,-3874,-1229,-1282,783,1290,-930,2071,534,-995,-1560,2346,2174,1213,1055,3718,-3071,-3631,3448,2296,2017,582,458,563,-1058,1864, + -703,2552,-2734,1232,2473,2243,-3477,949,-3592,-744,1744,-930,-3539,-3958,-890,-1498,-3913,700,-2135,-4076,2563,1300,-93,3503,-3136,2895,2588,1732,-3140,2959,-4026,-1043,1814,2521,2968,-3097,-1516,-2957,965,-122,2791,938,-3803,-3003,-926,3367,-778,1387,2631,-3875,-1906,3016,2197,-1236,1586,-495,3718,1923,-3640,238,-953,-3403,-3419,-668, + 147,681,-360,-1743,2424,-2984,1819,3995,-174,-3425,-2708,-4095,-1077,2988,506,-1108,-2072,1612,-2227,1829,-3112,-2732,1849,-1434,2325,3663,-333,2392,3922,-2301,-3015,509,3119,-3649,2481,2650,3195,462,2926,-2767,-717,1994,-1076,1765,-1233,449,-3756,-2592,3359,-3238,1400,1904,-2897,-1061,-1865,-660,1807,3472,-617,-2621,2675,2762,-295,190, + 
-181,1287,784,-880,-3630,-3093,43,-1997,4012,2954,3810,3340,-753,392,-880,-3967,-339,2701,-799,-1274,-652,-3170,-2144,-2218,2592,2935,-3621,-696,3510,-1158,3773,3990,2456,2212,-2000,2321,1132,440,-3756,-2067,-4018,2274,-2173,-1112,1824,3858,2709,636,3304,2144,3668,10,3624,-2272,1142,-2638,-3565,1165,-3867,2194,2364,3390,-1113,-3816, + -1527,615,2695,2407,1263,3515,1426,598,-758,888,2575,2290,2638,-3888,2675,-1591,2536,-3094,-1521,456,-3912,-2337,-2028,-2075,2966,626,1952,-2979,2814,-2183,-56,3049,1682,-1384,722,-2156,634,-1059,2181,-880,995,2115,1304,2219,-2613,3133,-3892,-2676,-2962,3334,2960,3403,3603,-1524,-419,594,-1577,1103,-755,-570,1288,2833,2989,1774, + -218,-340,-1217,-2773,1198,3589,2715,2303,-312,82,-450,3533,187,3263,-3222,2255,2243,-3474,2392,1670,-482,-341,1693,-3038,1209,-4,2597,152,3576,-734,582,647,-1291,-475,-2451,3480,-88,4083,-4,-2185,-3826,3265,1270,-989,-962,-3910,-609,781,-3152,-906,-1310,1583,3191,-626,-3893,-2284,2333,-2429,694,3093,1956,1115,814,-1724, + -3836,-4015,3451,-3847,389,-1501,-2945,-2301,-3469,2029,3553,-2101,-1835,-1127,-1735,-1575,2609,2811,-353,1089,1046,279,-1727,2273,-572,342,-948,-2591,-3087,1140,447,3331,-1361,-2742,431,-1317,1448,-3667,3644,-364,2570,-2656,663,130,-574,-3509,3232,-3534,1319,3506,-3150,3500,-178,-722,295,-3735,2399,3958,2925,3599,1997,-29,-3525,1653, + -315,-3516,-1580,995,-2366,356,3988,992,3467,1775,442,1381,-3361,1883,-49,1522,-2713,644,-665,-2240,-460,6,-2074,46,-3996,2538,-1605,-3262,1191,-3548,-157,-2776,2604,3446,-3800,3046,1869,687,1979,3680,-1843,-2859,-2958,1855,-1038,996,-2490,1575,-1515,-3540,-2428,1222,-2582,2256,-866,-3162,-56,-2443,1989,-108,-1301,4021,1808,-871, + -56,1530,-3864,1547,3794,-1207,931,-3805,286,2831,758,-1661,599,-814,-1651,4071,301,3986,2563,-1995,2087,-3621,2383,-3491,-741,-718,1633,3358,708,-118,-1656,-3045,902,2261,645,-76,807,-1152,2522,938,1718,1666,-1249,-2785,3271,-3545,-1513,3314,-3090,1577,629,-2707,-3201,-2300,1730,-2661,1259,4024,1805,-411,-1052,3202,2022,3751, + 
1633,2869,2732,-683,-4066,65,3874,766,-1846,1375,142,2245,-2369,3593,2944,1282,2003,-1823,-2728,581,-1771,2474,1778,3036,-3291,436,406,-3646,-3731,-2249,-4034,-2642,-4069,507,146,2120,2718,-1459,-3978,-1131,-614,3356,-3653,1483,4013,3678,979,-2433,308,803,3142,-2651,-4048,-3905,1147,-2451,-460,289,3995,2519,-2729,-1556,-21,1465, + -3084,-2659,-2069,-851,1999,2335,3641,3888,27,3359,-2912,924,-261,1060,2508,707,2324,-310,2878,-4039,31,-1081,1012,-3222,680,-3593,2258,1978,-1413,1950,2976,3164,-3583,-484,3898,-3679,848,-2775,-2116,3413,3535,2135,-3312,-3366,167,3921,-4007,3837,21,-2171,-2220,-2011,486,-911,4075,2358,3788,-725,1547,309,-2344,1127,3862,-10, + -3155,127,-2446,3755,-524,3977,3344,-2734,-1035,82,-3539,-1523,3072,1262,-225,-470,-3072,-693,-2024,1912,3641,-477,-1134,2715,-981,1250,-1886,3357,941,-3593,-2142,1048,2235,-394,-1573,86,-527,-3816,-3431,3143,1488,-2558,-2588,-2003,2578,-2804,2640,4081,-636,3571,1943,-3194,-3197,-3816,-1266,1694,-2788,-3763,-1875,3802,3284,1653,-2337,-1148, + 1923,1973,-1386,-2941,-3167,958,1836,1157,2807,-1033,-3106,-1790,-421,2986,-2858,-3530,-3179,1648,2536,979,714,214,-2193,2977,2167,2778,-884,-3509,-1685,-4081,-3611,2575,-307,-1311,348,3599,-801,2894,-998,1635,-1614,-2975,3558,2843,230,457,1856,2653,599,260,-2580,1827,-2636,2380,-2573,1852,-2218,-3718,251,182,1070,887,3974,1086, + -2284,3042,2247,-649,2396,-285,3293,-2605,-823,1896,1138,-339,-2717,3861,-1074,759,2940,-3390,3663,2799,-2276,-1366,-2466,2826,-1102,-1926,2650,-3101,1800,1475,717,3128,2578,1829,3223,-2214,-4,-1590,3,-161,3163,-1156,-693,-905,3715,2135,3293,-3853,1539,-1302,3898,837,-261,-959,3790,-3963,-1942,-1926,-155,-2299,-3104,2206,3381,-2536, + 
-3489,-3027,2220,-295,169,801,-3846,250,-4008,604,3242,-2894,2371,-3853,2649,-2732,3080,-3785,2397,2669,2966,74,2175,-3353,3242,-4077,1510,3011,91,-301,-2415,1476,1367,-2771,1134,-2681,974,759,-825,-1879,2254,4041,-2301,-1713,-3288,3658,1268,-1852,696,-3435,-2110,-2634,1401,-3849,-3220,2719,-2686,1012,1546,2771,-2376,-1471,3962,763, + -4071,2580,-795,-3108,-2045,-3924,-3865,1765,-916,-1257,616,3442,1139,3463,-2842,3133,-1349,3834,705,1128,-2524,-686,3749,2903,-3844,-3851,3362,1559,-3643,-2533,-3084,1437,2853,-836,941,-2126,-422,2048,3580,-4057,1754,-1725,3558,2118,-2369,1755,107,-2798,-2131,306,-3661,751,-742,1834,1437,-3575,-551,4042,4003,-39,-2284,-1674,-194,-36, + -675,1439,-2712,3458,808,3896,-3274,-1811,-2750,2759,2543,-3650,211,2808,-997,134,3722,-3491,-2327,-1019,3274,-1690,3780,-1010,-1380,1777,-3233,3544,-2255,-1089,1654,-1758,1393,-3719,-3283,3401,-262,-1682,46,-326,-1447,-2173,965,-2623,-324,1678,-1030,-3607,-953,147,-2195,-2087,153,209,1413,-2981,-3536,1078,2356,-325,2679,4030,-2610,2287, + 2261,-3758,1496,-1170,1377,-4091,-1385,1205,2910,-2845,-2902,-3077,-3938,-2492,2093,2260,-2189,2483,2228,928,2613,1257,50,3708,1716,1648,2668,-3926,1718,-1596,-2027,2859,182,2605,3285,558,-1470,2726,-822,-344,-3947,-3259,-2517,365,-1756,-2469,-3263,-3315,94,-3808,3354,1710,-156,-2714,3964,3324,-1109,-2529,1886,-1331,-2056,-840,-3679,1579, + -1919,-3355,-510,1025,3345,1266,1423,-3117,2209,-1117,-2840,-1236,-483,-2866,1539,1419,-3656,3163,-471,2723,3532,-3416,-2158,738,3331,-3084,-1927,3951,245,3629,2758,-3735,-2151,1736,-3781,1484,-2451,-2021,-2254,-2812,-3808,1479,1176,1572,-1079,-1454,-2597,2335,1254,3166,-3742,-973,-2676,2902,-988,-3187,316,-1895,-3462,2859,1426,1322,-1692,3843, + 
-199,3803,2342,3760,-1497,1117,-2376,1150,448,2528,-3928,-1024,945,2671,-1293,857,-51,-453,2494,-456,1346,1887,-3549,3681,-2748,-1940,-3074,3027,1121,1601,344,-2216,-2889,3250,-3068,-1884,3218,-2941,3317,-594,3653,-922,-2095,-3887,3556,-2695,-2122,-1979,604,700,-3089,-3554,-1873,3514,642,-176,-3500,-2401,2480,2881,900,277,1948,3584, + 3813,-3085,1926,-3548,-1915,-1357,-573,-3771,1703,-916,-8,-1064,-4062,1994,2671,-2368,-1787,-572,3775,885,3277,-4092,1131,-210,-3127,-328,-776,796,2429,-426,-689,-1497,922,2381,884,-140,1057,570,2974,4067,2156,2297,-661,4018,2984,-1681,2501,-485,-2278,3941,1759,2114,-150,2140,-3321,-2193,-1307,2405,-3692,3706,3583,-3437,-2751,3526, + -2666,3207,1123,2371,-2073,808,3271,-556,-1249,1009,-2403,-1356,111,3374,-3671,3029,2069,-2112,-1858,581,-4033,2527,298,-1489,-2611,-1131,3321,498,156,1920,354,666,227,2226,-707,27,-2012,-1972,635,2074,-568,735,2163,-3722,1527,1587,-3908,-3669,-861,-2116,2907,3495,-1107,1459,2513,-3409,1361,1480,-2527,-1022,-1840,-283,-3377,-1411, + 2798,3461,1360,-2719,3137,191,3290,2171,-2322,50,-249,-900,839,3425,-989,-797,3889,1518,638,1763,-2583,-1523,2302,-716,-3658,2171,-653,-4070,-1606,137,-876,-3121,-1248,-1754,-2669,1449,3290,910,-2821,3365,-1826,-2831,1999,-1908,-1022,-2803,-301,1533,80,697,409,3147,1477,1856,2795,784,-1641,-2785,1244,-2617,-3923,3287,551,1943, + -3516,-3393,-3048,3924,774,-3235,1560,608,-997,3107,-853,3106,-528,2320,-2326,-62,3013,130,1824,1814,-2238,-2558,-2984,2000,1211,28,-3732,-2546,2979,1336,3741,-2644,1817,-701,-3959,-2374,-3804,-1571,-3591,-3264,-2497,2857,-743,3400,-1974,281,1524,-188,3175,1987,1090,1216,-832,3597,2596,2526,-2419,-1281,1996,-1503,-3505,3927,2438,2334, + 
1617,3972,2354,1991,3718,2869,513,-2749,2956,-1934,-2911,-3245,-3259,1202,-2258,1160,975,3204,1377,3291,3744,-1017,3371,-425,1760,-515,-2925,-2699,-343,-1607,1663,92,-2892,3419,-111,-1941,-3812,-416,-2625,2174,-724,-1567,-1301,2236,1276,4018,-973,-3778,2252,-2437,-3733,-2394,3080,-2518,-151,-1938,202,3352,3138,2500,-1516,-1174,-2899,-3998, + 2302,325,1278,-3003,2331,-4008,-815,-929,2827,1947,-343,1984,2867,3006,-372,2415,414,3155,108,-4038,-3169,-648,3692,-3565,3043,-3497,-1381,-1658,2899,-680,1520,3907,461,-1544,-3765,-1847,339,-1398,3423,-3845,-2869,-2112,2851,-2448,2765,497,596,2552,2983,-3574,433,-3055,-1931,736,3884,1337,3678,2487,-2010,-2201,-4062,257,-487,-2319, + 3109,787,-1318,-4081,-2193,2011,2634,580,-1043,-250,-1868,-1699,3246,-1273,-629,-1315,-2666,-1949,-3222,-897,-1153,1669,-2648,2790,2209,413,2770,1157,3632,-217,-607,3292,2089,-310,518,-3844,3561,3054,-436,-1386,-1336,2741,-464,1066,-2023,3992,383,-1423,-2013,3346,-2597,1466,2902,-3251,2750,2451,1209,-3832,1169,-1027,2323,-4068,-3563,1346, + 2453,-2272,969,-2365,140,-3461,-2368,-353,1129,-12,2641,-2113,-2070,3299,19,-103,951,-1910,753,-1876,-1227,-3295,-3627,-2519,-1064,-1445,-3279,1374,3015,3808,976,-3650,-436,-1921,1093,3694,-3248,-838,-2891,-875,2668,-1873,-3129,-1981,655,-184,2248,1841,-2183,1053,976,1017,-1948,-2933,1584,-3855,-1502,495,-1508,-1648,1937,3582,3547,-1972}; + +int32_t __attribute__((section(".xheep_data_interleaved"))) matrix_BT[] = { + -1056,189,758,-3254,-1143,1513,-474,2502,1014,-3489,1686,-3444,544,-3831,319,2559,-4051,2138,-4001,401,-2912,-1648,-3162,3340,2336,-586,3131,1517,3336,3487,1088,-2011,3837,-1745,30,2895,4026,911,-3212,2301,616,-3664,3988,716,-1074,-1909,-2572,-2329,341,-1592,993,1094,3437,3763,2664,3589,-3096,2997,-580,2624,-3905,-2350,3729,3223, + 
2039,844,2325,-3426,2046,-1241,-2142,-3537,3339,-207,563,-3940,1640,-2122,2940,-2660,-1472,-783,680,1723,-3254,-3078,2061,3985,1935,3678,1603,-1064,-1097,-776,-3671,2487,-1291,-3365,459,-242,938,1597,55,3459,2066,24,-1160,335,2082,-3081,-2030,-2230,-1168,105,-2579,-1164,-532,1723,3410,-2099,3776,-1891,-171,3242,-681,3552,-505,2100, + -3042,825,-3790,-3697,-773,-481,3781,654,-1659,1331,-274,-2618,3923,1508,3023,1260,1813,-1598,855,1798,-3345,2369,1652,2577,-3709,-3755,1033,-940,1155,-2450,-1543,1160,2909,930,-2482,-1323,1657,-992,3909,1373,1396,3710,117,-2313,2611,254,-3746,1233,-1861,421,856,2809,2686,2061,1294,-443,207,2189,1157,244,1432,378,2592,230, + 544,-2203,-1832,2825,247,1199,-2542,866,3163,3069,863,1495,-2507,1207,2909,3045,3894,-3975,-2699,337,-1614,2730,-3916,-114,2334,-3585,3054,-2289,733,-1984,3831,3337,1766,-2051,-1066,3380,-1191,3485,3539,-2877,3539,-657,1366,3439,-2201,3433,1503,3386,-2998,3493,-2581,-107,-1182,3970,2234,2693,-928,-1500,800,-970,159,3884,689,-2294, + 1446,-2110,847,1881,1213,2988,-3229,-82,-2747,-1379,768,2985,-3434,-2329,1660,-543,-3018,-1532,-1571,1770,1877,-3324,-2874,2495,226,-895,-4008,860,3323,-641,2211,-1141,3551,-385,-3669,953,3435,-3878,535,-927,-1818,-3392,2077,1155,366,1375,-490,1075,140,737,-2329,-1264,3168,3312,-1608,1088,-1584,-1013,-487,-3936,952,-2930,2471,-3563, + 1027,-182,3736,3677,267,-775,806,-3112,-1471,-240,903,-1105,3109,-4079,1582,-392,-1097,1989,3653,1305,-3481,-1911,2342,-2822,787,1158,3912,2616,1851,256,2811,-3582,2870,311,3876,-301,174,1131,75,2015,-467,4008,2529,-2090,-3147,-2098,-63,-827,3085,-131,-37,461,1676,-2733,3525,-2085,-2358,714,423,2937,-1194,3359,-373,-3507, + 1993,-2351,-2997,445,1523,-2713,3901,2745,-1982,4013,446,-2521,2899,-1907,2155,1850,3489,2323,2131,-3859,-728,870,1190,-2108,436,-2609,-895,1748,-3609,-1306,-917,-319,-930,1420,-606,-676,-982,922,1078,-1927,-3304,1236,2302,-1907,-667,3657,2057,276,4011,3902,-1554,-3237,3027,-2402,-897,-2448,2825,-738,-2860,3100,-2111,255,-3960,969, + 
-538,-2733,-3670,-809,1514,1536,3562,-3605,-713,-2661,-2130,-2058,-771,2341,1231,2315,-1053,-58,1126,1529,-2034,-178,2285,-3776,-3585,3913,1677,-1088,-2365,1347,3203,-1175,241,-2134,-3518,-2765,-598,3062,1687,-2958,-195,355,3420,-2626,-455,-1460,-3493,439,-1642,-2121,-294,-1513,-3716,-570,1372,-3664,-3337,-1157,-4065,-1460,-1963,-3094,-3763,-3241, + 775,1080,-3147,1943,3206,1,1759,2052,-3028,3230,-1272,-1124,3951,1090,445,-3054,487,-3768,1059,-1226,-1391,-754,3663,2052,-3745,-2548,-3780,-2202,-2602,3974,3009,3944,905,-2996,-1993,1170,-1604,-707,-3831,3219,-3570,-3057,-327,92,-283,2308,1674,3368,-800,1562,-37,4028,-2268,-443,1063,2267,2720,1782,-1319,1113,-814,-3477,-2552,3434, + -523,-2485,3982,1576,1772,-2753,-677,-2357,339,-3031,-479,-831,-1535,1809,-3320,-1988,2846,-3208,1644,3102,-3336,-1113,-2299,-1112,1610,-1292,1705,-240,3307,1150,1990,3486,3945,-2999,4064,2412,1966,-2329,3408,-3629,1172,1154,-665,3242,2439,-1804,-1199,-2245,-2314,4045,-3869,2759,-2214,1178,-1664,2678,-1199,398,1170,1226,2844,2039,-3722,2058, + 904,-3090,967,-2509,-342,3982,-501,-1402,1816,3116,-2500,-2290,-2284,2220,-1963,-2748,408,1161,1077,1366,-425,1684,-1788,-2375,-3019,-3422,3909,2111,-3107,2851,-3558,862,3565,-2501,-2601,-4091,-3599,-538,2388,2164,3011,3064,1967,3063,1100,3972,1400,-812,2406,421,-819,112,-2575,-3246,825,-3441,1952,2053,1439,1215,2324,4080,3053,-3163, + -3352,3362,3165,3431,2616,-1818,-392,1804,3939,-1185,1181,1957,-646,2566,-2714,-418,-3622,1526,1559,2724,-3111,-1201,2581,-2736,1810,-1697,3987,-1844,-3793,-884,-2430,1254,-1230,-2430,3257,294,-843,2035,-2909,-1520,-2192,324,-540,1300,-993,-1493,3666,-905,828,-1881,3424,-3779,461,-2315,40,-2277,194,1774,-3922,1478,-1939,831,2673,-88, + 
961,1830,-3247,-3408,417,245,3495,3915,-2005,3023,1270,-1750,3392,3332,2044,-52,3519,-2762,-2277,-96,-2509,274,2732,870,1966,998,-4095,1760,-3776,3605,1732,718,2321,3001,-1730,-865,1977,757,-990,397,4011,-3017,3755,163,-1124,3742,-3035,-1799,-2605,2718,-865,1764,-2722,2786,1024,943,2089,-3057,378,790,3189,-3784,1585,1355, + 3301,-1503,-1223,3442,455,-185,4094,2744,1009,2246,913,-2590,1113,-3155,2243,2965,-2604,-1118,-15,-3301,-96,-2773,3598,-1757,870,2394,-2480,-862,-2934,-2512,-3631,1060,-3630,-1101,2700,-3665,1678,2617,902,2393,-3029,523,-1377,705,2261,-2154,3432,-1742,-3300,-3697,-7,-3250,1133,-1669,-275,507,531,569,483,-849,2605,-435,1214,2133, + -3604,2279,1943,3184,-1715,2299,2562,1462,109,-1436,-1516,-1557,-3146,-2727,-3501,-560,-1142,949,-949,-3852,-2469,3072,-3187,-1903,-3894,-3022,3666,2243,-1362,1660,659,-2703,1322,-2358,2415,4080,3675,-3582,-1810,2366,-364,563,1755,2686,-3218,-859,-3528,3388,-1322,-500,-1936,-855,1945,-2388,1065,-1546,691,550,3663,1645,3127,3966,-635,1369, + -2727,-1225,844,2622,3680,-871,1586,-2008,3592,-3037,3356,2409,-1381,-3395,-3384,-4093,-3535,-1838,3172,1212,-2388,-60,-1155,1543,3994,3222,192,992,3360,478,-2498,1037,-382,-2519,-2906,-2973,-1163,-3380,-486,-810,-3173,-472,2800,2938,2272,-3475,1704,-3727,2681,-3086,-1442,1545,747,-1520,2541,-231,2257,-1043,-3849,-2352,-3104,408,2798,657, + 708,2823,-1700,1598,-1964,-3403,2862,-2095,3690,1890,-237,4048,3557,-3640,-1530,-1511,886,-3868,374,3960,-2729,2430,1721,799,1547,3415,1800,3274,-1804,-591,2517,968,3743,1478,-502,4019,1157,-394,-1932,-956,2676,-2496,3638,2729,1410,-3099,-1057,-4087,-3851,-2725,-2740,-3204,2754,-1786,-1441,-2081,-1835,3310,1909,-1310,-3767,3477,310,3903, + 
998,3141,41,3715,496,-2728,-2712,1233,-1049,-3240,3656,-2304,-2845,-2843,3936,1305,-4000,206,1007,-3962,-2301,-1326,-2730,3631,-242,2932,-2701,3682,-2245,449,-3045,3564,-2145,-2985,-1977,3059,1636,-1521,3745,-1184,3138,-3302,-4081,2288,3945,-2691,2022,-3516,2750,570,3690,-1671,-4081,3443,1436,1048,1101,-4022,3909,1611,70,3289,-3274,-144, + 1617,3232,-1210,761,894,-405,556,2757,2769,-2495,-218,-3075,436,-1771,3603,1392,1771,-265,1820,-2112,-2532,-1090,1867,3418,-3323,-94,-602,2805,2823,-262,-1669,608,4045,-2926,-1174,-2569,-1258,-1583,512,-1395,2459,-2831,-1651,3341,2612,-3948,456,1847,4077,89,255,1080,-4065,-406,36,-3897,769,-467,433,493,-257,1812,-4035,3376, + 820,-2490,-2,981,-1664,-2175,3068,-2734,-1421,-860,-2278,84,3997,-3751,544,2221,3707,-3232,3581,2603,-2981,1554,3363,-3998,-365,-2449,-2967,4077,2843,3334,-3491,1609,-1398,-1393,729,-1662,1318,2564,2441,-415,3752,-1226,671,2191,-2362,-143,3284,447,-3543,3397,104,302,2979,2783,-3584,-604,-1333,-528,470,3672,2745,-3984,829,344, + 2143,3457,-1746,3537,-3906,-551,-1156,2224,1677,-3057,4093,-1838,805,-2165,3325,-1460,3946,461,1981,-876,-1574,3637,3703,1390,-3307,-1654,-773,-3960,1509,-1382,-3352,-862,-578,-934,3464,3657,-4049,168,3056,-2934,309,-817,2726,-2370,120,3987,1901,-3299,-456,82,-1881,3348,-2288,2239,-2277,3100,1525,-3953,990,3119,-1663,-398,2399,1422, + 1034,-2229,-1923,1751,-1436,1555,-2846,-1468,-731,3052,-3562,2644,939,-2032,3211,1802,-4067,25,-3162,-2441,-1667,-1347,1354,-1388,1072,-3708,2798,-1645,2243,2795,-2035,-512,-973,-2480,-1446,-2672,702,-2380,2687,2349,2425,-138,1935,-3555,-2432,-3562,-3175,-3235,-187,3047,1363,3325,1394,-1803,1618,-3893,-3388,-2273,-648,1357,-1655,3515,-1071,3860, + 
1187,-595,-1036,-759,-2405,-2285,-608,-372,419,-3341,1392,-3318,1525,3427,444,1518,130,-1862,-1754,-2331,-3737,-3605,-778,-114,152,2665,-1375,-2054,1383,-2957,-2137,-2222,565,-3399,-4050,707,-2356,-1113,461,-637,2686,-1712,-3483,-1470,-925,-279,-2156,-2573,2041,3821,206,97,-2086,4047,4017,-291,2403,-3605,3055,-2150,3205,470,-1617,-630, + 171,-1991,-2772,-52,-2553,4052,-903,-2453,806,-2164,445,-3500,-1356,-2073,2555,-3036,3901,-2317,-4059,-3124,-1308,-1750,422,-2281,-1345,-1005,-391,-2206,-1793,4033,1421,-2473,3043,2626,-2106,-2628,-3875,-2194,-1352,-1049,509,-129,-656,3895,689,3071,1261,-412,1431,-3545,-3059,3099,-1662,1486,1879,2373,-585,965,-2615,4082,2340,-3195,-2418,1568, + 335,-882,-2911,1380,2820,2694,-2621,-3815,-315,-508,-1899,-1754,234,808,-3493,-2177,2096,-3417,-3939,4084,-2871,-251,-589,-2960,-2234,1699,2882,-530,-4035,-1792,1646,2989,-1880,-267,2593,-1690,-1067,3671,-2,1958,2441,2631,-1422,1106,1267,1365,-343,502,2937,1474,1047,620,3644,-2555,747,4047,-2707,-1894,1220,-2857,496,-2765,1524,-4033, + 1861,1147,-2658,2737,-3081,-1560,3651,-2663,1943,2607,-2004,-1022,240,-2901,-784,2823,-2155,923,-3026,3374,1597,-1520,3490,3743,1985,1983,417,-472,-866,-1607,2712,671,3515,-3303,1869,2894,2072,1622,952,3695,1372,2472,514,-589,1434,-1061,3227,3595,3463,-2839,2177,1789,-3276,-2497,-1000,-3461,-1016,1309,-69,-959,3606,1625,1442,2762, + -2782,2467,-1633,-3283,2710,3616,922,-2406,-2062,-1910,-381,-2784,591,141,-2696,-1673,-3294,40,3525,2642,-305,546,-3242,865,-2381,887,3317,-2787,697,1562,183,2307,2654,-575,-2770,-3710,-3866,-1608,-4009,1251,-2448,2212,-3393,-1255,2751,1587,1411,2390,-1821,1685,428,2755,3932,1953,2655,-4053,-2110,-3285,2267,2347,-1785,-714,-1573,-4049, + 
2763,2772,1596,-1154,1339,2515,-1627,3164,385,3589,-1275,-486,226,2898,-1840,2211,977,-2399,973,-199,-3295,1712,-2373,1396,-3239,827,-2000,-3182,-3514,-1007,3862,-1419,56,-796,2410,2560,3236,2385,3779,2251,-764,-39,1843,-3625,2468,884,3063,2651,-3590,-2074,-2342,3755,-1089,-875,1198,1943,779,23,2414,1723,-2981,2931,1,696, + 3868,-1344,-2568,3859,-3832,-2304,3714,-3691,279,-8,2080,1458,-3324,-3772,-220,-1322,2399,879,-160,3401,-1529,3537,-2094,-598,2143,-1239,-691,2800,-1314,1334,-374,430,-1689,2150,1964,-39,-1999,760,-93,754,532,2538,-2968,-3627,-3558,2325,3497,-34,3169,-3447,-17,-844,1326,1868,118,-1744,-1681,-2100,1083,-1838,-3140,-1328,-3698,-805, + 1509,963,-349,-4092,-2605,2307,-554,2065,1232,-678,-4023,2888,-2844,3773,-1445,2866,-3950,547,1547,-809,1581,-406,1595,684,2812,-211,3986,-2261,864,-2501,-2085,3093,-227,-1300,2574,3385,4063,-3455,-2402,1742,3686,2771,-3127,-437,-1475,-3302,1488,2338,-3008,4000,885,2478,-790,-956,-85,-851,325,-442,330,648,-1539,-2509,505,-1969, + 3081,2487,3869,45,3976,3963,-2318,98,-810,3367,2276,3943,2107,-3130,-302,4088,-3740,-3254,-1248,3612,-3173,578,-2539,514,3593,3132,-520,-329,2660,2277,3532,-1030,2045,-16,-2704,-2898,-3255,-1446,-275,-2357,452,3188,988,-2365,-1920,-119,2195,-1226,2074,1845,2936,2502,3466,-3600,2644,-116,-3928,3667,-1510,-365,-2641,2093,-2144,3761, + -1331,768,1805,2239,-538,646,-1544,3356,-579,2953,1044,3837,-264,-3160,2620,2803,-2037,1519,-2418,-2798,1261,-2216,2687,1785,2841,-1187,-854,1191,-1993,-1281,918,1270,3665,-777,880,-3517,3942,2457,1762,-4095,-2003,-1849,2996,2705,1207,147,1653,-3119,59,2,2906,567,1797,2442,1831,1627,-1949,3072,192,-3495,-1411,-2811,-1927,66, + -872,-1777,-252,-1161,3250,450,2860,2516,2970,3301,-3473,-2047,-2599,3261,2405,-216,3089,3743,2504,2503,-2584,-1093,2497,2204,-470,-1560,2289,206,2320,1559,-1964,2281,1084,-1859,-3187,1183,-3531,-576,1776,1203,3683,-2310,-4023,-3939,-1633,-864,276,-1582,-2666,2067,1912,1028,-4066,674,-342,59,2174,3970,2586,2826,-669,3121,3890,-2380, + 
2438,-646,-1437,-1637,-774,3062,2520,-3569,3667,-1668,3046,-2711,3019,3354,-2659,-3240,-1240,1110,2602,-1841,291,3768,-2082,3758,-2776,-714,1922,570,-2195,3240,-2550,1825,554,2642,4014,2123,2145,1417,1506,-3916,-375,1671,2374,1324,-1890,2745,128,2950,3594,89,-92,3347,3702,1214,2399,-1749,1765,-3903,-3394,1261,2672,-337,-3684,3876, + 3788,3104,2084,-2271,-3441,-3572,265,-4003,223,519,-2506,3755,-2708,698,1599,-3296,-3850,3013,3343,-2095,-873,-695,1927,-1382,1247,-3956,2378,-1452,-1139,-297,250,3515,2854,-3139,1093,-1454,783,2099,3539,-337,-3942,-3548,1067,2493,3337,1864,-728,-2165,-3010,3731,1914,-549,-2663,-1228,-449,4069,-3214,-359,1114,936,45,2020,-3617,2162, + -1789,3808,2136,1641,3816,3695,-2094,-134,1817,3529,-3812,-141,-2467,2198,1674,-24,-500,-1303,3496,-988,2374,2554,-1746,-1262,-1765,-3658,-2144,378,-3918,786,-3263,478,-691,3469,-3282,3469,-3988,601,-3213,830,-4034,-4027,1150,736,-464,-803,557,477,2043,1770,-2183,-2121,2950,-2379,-3216,806,342,-3177,1907,-2362,-80,3349,-713,-3735, + -1324,-163,-2388,-3301,-92,2277,1523,-3941,877,2443,2855,3443,1134,3022,2874,-1781,-3843,990,1617,-2900,-2533,2313,-2965,-884,2114,2510,392,-2820,-3143,-445,-2137,1788,1112,-1016,-430,-206,-3564,4091,2339,3898,-654,-1098,2819,-559,1121,-1028,-3730,-2943,1073,-1757,-1388,4092,-2797,1145,-846,-3318,2640,-2173,-620,-3850,-3216,-313,1767,-2252, + 2509,2156,3853,-976,-3604,1717,587,1971,-2165,-2996,452,-1504,-635,-1077,2861,497,3981,-779,-3104,-1098,2423,-2692,-3530,1329,398,3998,-3976,1326,3125,-156,-2526,-927,-1326,-3255,-3632,-3532,938,-3416,874,2514,3409,1320,-688,-3242,2634,-2662,-4014,-3111,3751,-3193,-724,-3983,-1257,-1082,1445,-440,-1333,1081,1830,2028,3591,-2218,263,-1356, + 
-946,2016,-3120,2039,-1735,4038,1829,3599,1040,-2647,-2103,-561,2162,652,3406,3320,549,-2446,-3708,-1512,3014,1271,-2318,1670,-1942,1745,1940,1623,-4068,-2195,485,-1324,1345,-1271,-1843,-82,-3753,-651,538,-32,-514,-1433,3409,3154,3035,1209,980,-1856,-3115,-3198,3247,678,1363,20,-1199,-392,3512,-1107,3282,1785,1827,-2902,2162,-2435, + 2021,-3189,-1959,162,-646,3777,1068,3953,-2213,-3594,-2119,804,-1494,3780,-3239,723,4001,-4073,3666,1081,-2002,709,3096,-3181,-3794,1478,-1153,-1412,3067,3444,388,-3027,-3652,2792,3396,3897,-1138,-2855,-2460,1142,2577,-1625,-3625,1791,-182,1816,-3516,-329,2822,3929,2864,-722,-2640,3219,-2547,1248,-8,-3004,1939,91,-2754,1825,-2520,-3035, + -1834,3418,-1756,904,-2720,2980,-292,293,2219,3323,2797,1291,-2589,1419,276,162,-260,-3393,-1075,-3088,1443,-2994,-3507,-3239,570,-496,-1878,-2146,-2323,-3062,3810,602,-1870,3250,3284,998,-3380,-2589,3264,2150,2865,-2705,781,2584,-3593,-1046,3897,-1719,-456,-2572,3433,1231,-1316,2836,1303,298,-645,1464,-652,-2469,2860,-1567,-2727,-3544, + -1222,-1658,3312,3486,-3654,3085,-3785,3628,1787,1479,1223,-3070,131,-3143,1299,2201,-247,2445,-1690,-3326,-592,-1124,383,2351,-1139,1197,-1263,-3531,-4094,2252,-1692,2741,-739,3968,725,-3509,939,-1530,1887,2507,1525,887,-3866,1434,-1790,3121,2853,2303,-3492,-3598,-2003,2443,356,-654,37,-2630,-2675,2133,423,1325,1302,86,2003,2082, + 428,2160,446,2471,-2960,-1823,-103,1930,-3929,-2708,-3216,-1099,4071,3882,403,3747,332,-153,-3490,4021,1742,-4067,-973,-2126,-2128,-1821,3,1455,-2616,-601,3900,-631,-1415,-30,-2924,-1074,-382,2466,-1535,286,-3981,1869,-2095,-277,-1185,3610,-1872,1630,-1526,1513,-582,2488,2743,-2046,-3858,-1139,-306,-3087,-2849,2095,2028,2756,-3476,936, + 
-3095,1694,2379,-2136,698,1305,2331,-3180,2680,3524,2097,2917,-586,-2533,320,-142,182,-3672,-384,-2722,1734,-952,2337,-364,-2208,3821,1824,1697,-2396,-1205,3171,3753,-2779,3634,-1828,1358,-3205,-689,3706,3377,1308,3599,-1664,2792,-3062,955,3099,-2255,265,2689,3111,-2114,1941,-2590,-478,3813,-3438,-1626,1496,-3360,1580,111,3700,-45, + 2697,1676,886,656,-929,-3991,-2821,3633,1331,3131,-2240,-349,1231,-3823,1991,-1037,2393,-3519,-3723,2236,-1193,815,-1145,-3286,-3426,2744,-3779,-3778,1236,-861,-2229,608,4036,-3208,-1685,-815,128,427,5,830,-2271,2216,366,-441,2151,-2921,-2871,3553,-517,-3482,-361,1996,-1843,2984,-2580,-3841,1236,1697,3260,3237,-1826,2855,-2146,-583, + 3426,-1627,-3146,1917,2625,2968,2393,-1661,-708,1215,705,2256,2601,-2971,-1146,331,-3766,3227,264,-1611,-3167,-2120,-3431,-2213,2107,-250,1382,856,-2663,-3026,-751,2070,346,1172,-317,2503,3762,-3008,-1970,-1138,750,-2724,-2084,-2455,-289,-969,-582,-1133,1165,-674,2276,2845,201,-2344,84,-939,772,2921,-2689,-2707,3611,212,-2387,30, + 3742,-2298,1955,282,-106,2995,2813,2011,1913,1697,1766,2889,-810,-903,-1391,-1445,-1533,2114,-3714,3290,1839,-2921,4037,646,1242,876,2681,-1348,-3965,-2448,-1018,326,-90,838,282,2879,168,4029,-204,5,-3781,540,-517,1572,-2520,-187,-1407,-1186,1547,3061,1971,210,-1787,-535,-1637,2755,-499,393,2421,3784,-2482,-96,-764,-2919, + 2139,119,-1874,-539,40,3215,999,-141,3311,-1785,129,4069,-704,-1671,3988,-3221,2158,3587,-766,-3723,-1402,2760,1998,1484,-248,76,924,2806,-1724,522,731,-1613,3635,472,113,-1338,926,-3825,-120,-3282,-1957,-2354,-99,1966,-2600,-3225,2837,2511,-3422,-2220,-3841,-3194,1341,2582,3071,906,-515,1910,-1681,-1061,393,-2228,3544,-3803, + 
2682,3829,-2964,-394,-2408,1017,3816,2091,2357,600,2912,2005,3360,-527,-3278,1963,-2445,2888,-1097,-3589,-1043,1740,3779,-4057,2613,1950,2077,-755,846,3635,-2871,3043,1503,-2077,-2655,-2105,659,-1817,2492,-2622,3293,3109,-2532,1466,-1022,2066,-3605,3203,4075,-3540,-1560,665,-1132,-1142,-4055,-92,468,-2461,3729,-2389,3610,-1038,-221,-870, + -660,-1834,-1106,-2036,3743,-2907,-1942,-1019,484,761,2651,-2411,-3516,952,-2179,2438,3045,1542,2397,-1427,-632,-2454,2528,-2240,544,2837,3091,-2118,677,29,-2776,756,1226,-3124,-2291,797,210,369,205,-907,3788,-3771,1733,1004,-2460,3663,-3800,1968,266,2117,-1924,-4082,-3531,-3684,-3620,2297,-3596,3029,3616,2877,1381,-886,3080,2475, + -1312,-3305,1301,-2470,864,-726,1500,3054,1340,1904,-3940,3542,3250,-3186,-473,2041,2408,2354,-3191,493,1633,-574,1557,-1198,34,2556,3897,-1040,1235,2024,2901,-1083,-2512,-3154,490,-1035,806,2791,978,-1556,432,183,3228,-865,3717,1502,1318,1177,3570,2808,-413,145,-3994,-1013,41,-4077,416,-3272,-1885,3138,-666,216,-2538,-1945, + -3906,-3110,-3321,1725,816,1666,-3855,-476,-2742,-3494,-2767,-2115,-327,-3528,-2626,1728,1660,-1675,903,2629,3637,-3851,-1860,3250,-3296,-2172,-724,-1255,-1438,1054,-1991,12,-4094,-3391,35,3673,1793,914,2776,3633,2921,671,-1899,280,-1333,-3873,-3408,391,1204,1834,-994,-4038,2205,3874,-1787,1159,-887,-2021,2248,-917,1602,3056,4044,278, + -1048,-736,1397,611,1038,3527,-2747,3332,-3520,-1931,379,2410,3954,-2257,3576,-1986,3708,-3981,1914,2192,-2416,3666,-2047,-753,4016,-3345,-1996,976,1879,2670,5,1304,-1785,2192,3012,674,503,2409,-933,-823,2004,-3738,-1855,-1782,-3999,-836,1189,-618,-2763,3918,3757,-3802,-3545,-1348,2281,3193,4041,-3192,-601,-1724,-600,3803,-2918,-1729, + 
916,1084,3718,-2642,-1175,4073,1050,1623,1367,-3944,3731,-3954,-1804,1794,1227,566,-2502,1499,-3691,-591,-2025,-648,3418,1457,-2492,3761,-660,2724,-2275,114,-3915,-364,-657,1507,56,268,1014,4032,-2436,-3185,3208,2751,28,2354,1086,3769,3463,-4023,-1261,-2844,-1916,-114,-3429,-3350,1879,-3437,-1264,140,3217,2502,3444,-648,-1815,1878, + -3507,1599,1027,-2638,1068,531,251,531,-14,3866,844,155,3880,-3220,-3314,3784,-735,2191,1248,1973,1797,-3057,-1806,2243,3300,-378,1730,-3406,-1508,-3272,-3178,-415,-2774,-3425,-3785,690,3684,579,3855,-3844,1151,3635,3399,2894,642,-3535,1601,-3210,-2321,1473,2693,-3400,3132,1991,-1736,-436,3236,-3827,-236,-4094,-3948,3451,3550,1726, + -2552,-2838,4021,-1999,2632,1990,2830,3966,2431,-2026,-2462,657,2880,-2449,-2026,-1774,-1720,-1526,3666,201,2476,3694,3855,3031,-1060,-2625,1907,-38,1642,-2284,2748,1949,3510,-2184,1271,-3907,-2086,3990,585,-1680,3868,1025,3432,516,3894,1741,-3770,-2692,-475,-3901,-530,-1890,-2820,1433,-3422,-1716,2003,-3593,-1744,716,2419,-2411,-453,-1714, + -1342,-3743,2271,2017,-968,1120,2146,-2437,-2669,2550,-572,-159,708,-2640,4077,4021,-1716,-853,124,-487,-2196,1594,-1087,-1164,-3497,769,1446,-948,-1049,1734,-114,923,-2466,-2009,-954,3028,905,-3658,-1044,732,3245,110,3234,-2097,3599,2264,3693,15,-4092,-3950,567,123,904,2550,2113,-2442,-536,-210,1051,3782,3657,1188,-2863,2152, + -1665,-1130,-3349,1934,-2822,839,-3755,-3508,-1382,2560,-3777,-3456,38,344,2762,102,2080,1890,-3615,1745,-1917,3465,1252,-1286,-114,1797,-1455,1118,-3750,-190,1196,-2342,1488,-3627,1888,1106,-902,-3627,1504,-2939,2855,-2243,-1284,-2864,2077,-1124,-2567,1624,1071,-3819,2718,-1174,-2992,-3351,-3842,1688,-2775,-585,3276,-1228,-3377,-982,1170,3985, + 
-2850,1413,-2885,1179,3295,-2971,3785,-1262,-732,1335,-1748,-479,-2806,441,-1777,-3351,2556,2025,4094,863,-593,-3515,2919,1867,669,2767,-471,-3578,-3783,-3672,-3041,893,-3754,365,3875,909,3060,-4091,-1843,-1022,-79,3023,354,351,-2434,-2183,872,2453,-2123,3452,2553,-774,554,488,-3451,-3079,-12,910,-2349,980,-913,-1978,1169,3682, + 2445,-3402,-2128,896,-3197,1152,-3293,3111,-697,-1909,1330,3482,3495,-3437,386,2415,3180,-2322,1096,-2104,2291,2649,-6,3184,110,1187,-3844,3975,933,2492,161,1511,219,3759,1205,3945,3020,2985,1890,3812,-1951,-392,-1445,6,-783,-1618,3191,1573,2899,-782,-723,1634,2537,-2824,4027,3483,-2070,-1741,4047,1022,-1828,3738,-548,856, + 488,660,-838,-2414,1712,201,-222,1115,-2793,-3713,1683,-207,3327,-671,1747,2577,399,1478,3244,-3424,-167,2458,-4060,3615,-2407,-212,-3312,-828,-2459,-456,-1213,1436,1553,-813,-56,2508,3203,-888,4028,304,3549,2659,-1462,-1371,1647,-668,-1953,-528,-1538,-164,-3907,833,1628,-576,2970,-465,862,-2355,1739,190,-3084,-2662,-565,971, + -2082,-1899,-102,3422,514,-13,21,-3563,1253,-1925,-2483,-3852,-1853,-1224,-3691,3865,-1096,-25,-2240,-2724,1304,3129,3198,-1270,195,2785,-3729,-2406,-2080,2504,-2908,-3722,1654,1701,738,-3977,-1049,-368,295,1459,1624,-215,641,550,-864,-1694,434,537,-249,-1587,3085,-3913,-1031,-602,2830,-1307,1904,-1560,3610,-1260,-1093,557,2925,-876, + 3184,-327,-1053,3386,-1616,-2071,3029,-1381,3125,-2218,-1985,1232,1473,2256,2881,2181,-86,-3259,3078,-3016,-3828,-1269,-822,3517,-2918,1028,-3922,3208,-2956,-440,-2912,-1235,2957,3486,-407,3799,2341,-1250,-2918,1171,767,-1251,-1321,-719,214,2510,1068,149,-2785,338,-2493,-1266,-1930,1896,3068,3976,147,294,-2747,3454,1973,-2642,-4093,-1151, + 
2137,-3716,117,34,-1957,-3095,1008,-154,-1289,1123,-947,-209,-1977,-3493,-3597,2371,2931,2901,-2285,-3161,-1921,424,-1073,1192,1952,-1831,3210,121,2766,-2382,3655,-3488,3783,1196,103,-2243,1915,1842,2086,884,305,2251,-701,-1468,-1072,-1518,755,2260,1645,562,-1170,-1702,308,2886,1623,-2393,1824,966,-1077,-3457,632,3930,-3997,-3969}; + +int32_t __attribute__((section(".xheep_data_interleaved"))) matrix_EXP[] = { + -107786752,28626972,4872083,13049708,-26102708,-11633238,88913373,33591163,-29379259,18974427,54653031,5584828,-48096804,39940489,-72146417,-1893719,38980040,21536634,5717449,13284200,-9300117,-22379498,5276900,4938785,-24775700,-2420473,2029539,42547504,28900136,-29483351,-29783210,-14900814,43624260,-38108244,49529941,58267309,45136847,-21425663,34288872,7189622,-16848125,-28885403,79450085,8520462,78166239,10211171,81402095,-7248431,8911908,-74252751,81706055,2137817,-33868802,-37845759,121753462,34135073,12150083,-16749444,-20100931,-16689015,22670022,-29013318,69464525,47474944, + -44412240,52899646,-40424870,22905699,-23666718,-13399294,35896867,-34537194,-16689512,55339217,128333425,11556635,9312686,6522045,-50947272,55907446,-100910921,-16336477,9818284,54175718,3952000,45652339,-9257324,51530721,58138547,-33425448,-51423460,-43162366,-11290870,-19613690,33361818,-16949923,49341441,83474229,27794245,10337533,20385312,-31217581,-78511332,-6967456,-4734329,9862079,-33132011,28609126,-59641089,47718897,-28600583,-27453915,-31037827,30994223,15106110,-10610716,86026600,72339238,7923030,-57948181,-48206166,-57642905,37599855,-5895243,-56696972,-32074124,-62886610,-34395656, + 
41892583,-5624758,-45805507,-39515239,3032657,-54296714,-34348185,3075947,-4425582,-69272309,-74426897,-40860349,-30373233,-16910205,-49925757,13113282,-826447,-3512098,30591043,2007889,32035597,80268561,-3228788,-46888313,-117258767,-16781224,51355242,-56279179,47506422,42771953,38642952,36817612,18554219,6967818,60091424,-11595876,39128179,-12103365,11779748,-20443351,-114666346,13077467,-1739400,-81848016,44308489,-23933872,-19613662,17987985,59394567,-3444343,23936852,-45910819,25437568,-20786967,-26387106,44985793,-34703121,28389947,-37366247,46841075,12620431,10861243,-30587040,-23544906, + -105593932,22538478,-2334798,55348973,-100413300,3192412,74733672,-28774867,-51474033,-13155517,-23932822,49698152,4609645,865883,-26249528,-18872601,63662744,-40389878,-10269996,55325288,19474920,22720904,-53611962,-83822181,58953939,9175721,-8668031,16469701,74681734,23288080,-9063584,-21395365,35288446,31092016,44489459,52838138,36826468,-141099541,-73382805,-41977144,-25708547,-30648619,-28413153,32163519,-49404959,24755959,-30640888,8997465,-624852,32626978,33033171,-59710408,87715003,-24949334,45860328,-53362279,-39166827,70248867,122022127,-9445294,-24805075,501248,-53502726,8952150, + 42692678,64314929,28560035,-56420671,-9628762,92122801,19801968,-77959813,40910762,-4683491,12897672,49985871,22004132,-34404887,-51094325,39505785,28367093,-99575042,-66755230,14886493,3689440,27345054,-29398674,-3983549,-42240826,-76209795,56018772,-17968202,-19238701,47112812,94477389,-7748221,74122826,-52621092,4425131,-64640485,-10118037,-11218225,-87004805,-70996845,-49950916,11781185,-108539763,18558043,-33721441,-23956132,-14529241,36147561,-38301577,77654258,-69665946,-109462702,10182947,-12463442,12387544,-12235675,9169865,-64422986,74115053,-80584241,-400816,-4990061,-29371110,8807447, + 
-30126805,33708436,-58964436,18985349,-47989347,64263114,33419543,18522857,-5209694,18741087,-35861200,10832886,-63663791,1727445,-58806431,17366208,-64554251,42324368,31222390,-16808785,105122054,79793734,-15416837,9072582,-48413905,-29355755,-35902379,35423169,-12068109,-22279449,88522790,1386017,-1967375,-132332,114846469,-8085809,38992674,-20862444,-71399060,-28448797,-79985510,35588063,23133637,-6197229,55380755,-40466771,27754622,-33514171,49876660,69150238,42491561,-89208694,2773800,10739478,-22440821,-23210053,-14950211,36726628,-23006061,33223301,23201743,-23529324,8319852,-37340592, + 5682169,-47523321,-63409372,36815244,75045515,-94974755,-30883243,27100922,6444265,51031725,5938721,-15576705,83103348,-21005760,-4956945,5230405,-28520476,31756255,-7150337,22168458,-54548968,-46039156,25887525,26654717,-57509972,-97975795,-74529213,10145399,-76574867,-11799447,-53868932,45136988,-20825263,-56702246,52123732,41244202,8080825,4053336,21702329,25690561,133725627,-53676582,2874152,-25377994,-7487057,-31176006,-80893898,-11489725,8688185,14377001,-16951514,-29713724,26771296,-44872201,292071,85586506,-31764412,-35584592,-74042702,-95807897,-50069024,-31823603,24251069,6673912, + -7561825,7472739,-35080953,-16924962,-23262631,-15087682,6912766,21140362,-23178949,349513,25816396,-77474085,-22501896,20185485,-16027398,33608485,-86435469,69488643,31405093,-779920,91087988,-24457944,64616820,51093904,30516924,-13710975,48831169,-17045074,120041469,4679883,-14550508,-102948761,33740332,21409103,-88041116,-30814051,26802190,53828808,985782,71768565,14015992,51621766,-58062166,45720085,-8914588,-4336961,-15082915,-10959502,95608546,26822947,7497353,16954963,-45955470,27672403,-63605957,-5382093,23360748,-19886455,-38532695,6554351,37145873,33166485,-2019060,39218564, + 
-14380160,-65441169,-39166034,9159629,14789983,-17665435,-5748468,31612487,89171461,35979934,11292833,7203666,-22406044,6936158,-33355888,-41732313,-106550899,-51979341,33742961,38960047,-17802016,-69240170,-6635938,52026575,-20327967,-62899558,55424101,-10789075,-79801363,36566587,-4753869,47753425,30650117,-27914182,99510239,-13783996,-78068376,-100471934,-7308791,41341183,-29869646,28373972,42518635,-21633286,-56060154,-8677595,-1557390,71073631,-32741471,42666795,9577567,-97160958,-17076863,12909231,-106243893,30542874,5406930,-10673991,-28902657,-39341429,-52623393,-31489591,28893756,-13456370, + 61872949,-74656712,40787139,1253280,-3272222,-70214970,-17101965,-15286217,-73043500,10334704,-26561467,-52102653,73879220,-17364360,47574320,-65273513,26244100,17702105,24289986,-3766604,38372342,-94055293,72538693,18286427,-95749812,-8332482,-23652185,-4336342,-53355791,-7083669,-67538215,17225348,-1646412,-8279502,-29239803,-44405035,-50179470,75322085,159152947,9027381,76114735,14992776,11922579,18427233,-10868740,-73798039,-21028400,11067999,-29692906,19010757,4344944,-34835476,-39669376,92625474,7655279,27783193,14286272,-23654676,-17446513,-20319698,18371381,42765228,50418410,20741387, + 38585564,50793594,-23446976,-34352203,-11375649,-13777635,-2512855,-10094805,42812646,-33005676,-7227299,69776661,10438783,94006511,54136445,42446371,32681027,55485393,-16141093,-21598285,71801296,-37848921,-11084874,12264423,3637791,-22640658,-8542820,127868014,-22188689,-10190843,-16190645,-36504054,-48977863,-7267452,-29801859,-17424558,4240562,35265146,63712012,-25361713,811573,55020992,-35450938,-52891204,27803870,3488305,-1762977,-4330251,-41637139,-481058,-35220995,-7031219,-11916186,105217759,26196726,-7009424,71818355,12949265,29891854,-7456248,91157405,25897258,54750439,-115473252, + 
-6615288,-72988471,-16191348,-62563434,-13203750,-35817892,-12826759,9184038,21149717,-10529350,-92253172,6045228,37130305,-12319874,-57589371,36993490,20126367,16021028,-56342250,-55734635,35419671,17873032,-21976870,-111926537,-33935571,-11055373,-67592418,4293056,-26556312,-39115849,29349836,39657120,21172106,-65957562,56854284,-53658726,36199213,6801916,-14956827,-27401049,4915055,-70509581,-1412741,46446790,-86701874,-8061736,18279032,-77015171,-9518294,54914313,-10090486,13969475,7614373,-40699522,74030079,13139667,-91793176,53038451,84440776,-26141613,24739487,-24829270,-70892223,-85448468, + -68459658,-68666303,16010670,-8477951,44576689,16438286,-34703941,101341875,77345695,-15431322,8434030,-50981366,38518504,-21730425,-68300812,16459961,-385250,-38985224,33254945,1865653,-16899181,-4455224,33986107,59486507,35039707,11328317,44916308,2205440,7800190,-111378360,86370082,-55002314,8187851,-3038063,-73935700,29651187,1171670,-20083919,-22021387,72413050,-28840615,14544624,6320387,-42373354,-2092419,42175875,-96800202,26649229,32653983,51934683,-68376265,-3898286,32712654,-62123058,-116614380,-33551260,34907,100265760,-57377234,28429387,-40986406,74708531,31798146,-2467550, + 14988,26660540,-53997118,-68846847,33357784,31618309,25662594,27642026,-5144062,-24991011,4569309,53876752,593627,76876955,26808198,82535859,-13518793,74555265,6674241,10794030,5505330,46545588,4501809,-51318502,15717551,-49294468,8726582,-115100602,4218882,17460447,-28420278,12719992,6354149,-34505052,47434398,-2286224,-14244468,13803339,-21203324,60740230,-64311569,-19332193,-3467277,54991969,-48724449,48961748,64356220,22298428,39279986,48162736,-3612552,44755311,-15592420,25666879,18097434,17670816,-13464805,-94662773,93326011,-37890009,-31591739,-19533152,17309680,-34859469, + 
-49012304,-25064174,-2859939,-21240967,-56591249,-34942984,25480850,-31501269,10275487,14551760,39262513,17952714,-13598395,8511174,47511267,-39475921,-22940009,10333345,62073559,19412022,10805499,1417932,13717437,2854250,-57030329,84065201,-25559868,18554381,3686315,21568913,17537143,-73408734,22451396,49987795,-37607794,3717044,-29002658,12070872,37052724,-16285658,-29561303,25794214,93312606,-72086350,58609812,17224738,-17533719,-16714992,57690617,-55337068,23657119,-31172886,-51642132,43351477,-18720280,-92591943,-1236891,26891566,23228352,-37983578,6299986,20108090,17518139,-33737640, + 2250610,-110521585,56672448,78605445,99504164,-28066022,9748644,-64398449,7897179,63934952,-64143170,-97187683,60277930,-82133345,-8083405,-56223270,11435980,-57717377,-38374698,87009973,-37133310,9017849,-19563271,-65605116,-21912289,-25886398,-39328956,42671802,629764,36725559,28717628,55846833,-30865108,4836074,-3649046,7883147,-47878553,-20920603,-31192181,-19206917,-19423645,-14386777,11943811,-57768447,7178135,11008987,-45650541,42060984,-14304747,-12145135,-9111924,13591988,69119546,-127006178,-23482645,-36386864,-35501712,31521192,-38194864,137160907,67051535,-70287551,6700627,28785152, + -51552880,-14516481,113746485,18157477,7841126,-17963968,33935431,-64803834,-30926597,-26159331,-26310722,-63132632,-27499483,81255562,18766040,-46219893,825274,28237719,37336652,66013337,11062214,1177128,-9039753,-63735638,1857702,3312792,-19502340,34009460,39637291,-56397852,-49242119,41484457,-17761303,-8710564,-82862710,1048335,-8780252,38035673,60590253,-78800995,-14211302,28405935,-8920780,-12686706,38933257,-7089537,-116839567,24990765,-29597465,-69920616,-45127702,112477783,2766939,-47151873,46433247,23759275,43086087,-20482246,57444,73382154,112610627,16414716,35376949,75180044, + 
20221567,-1098700,-12740606,61941184,8170747,33813847,-63567758,-4042324,11158608,32730704,76112900,-20326354,10192074,-94917172,52615861,-33896859,-6529629,-28835504,7842722,-23689630,11659232,3339711,56785473,20613680,-50689361,-19593999,21368742,27125811,-36796017,36082436,-5081069,-50545022,62784235,-14745675,-26384549,-23532156,39400124,-12508491,130324,55933788,-20210280,-9005756,11426765,-108220675,56886144,-15617548,9706264,721019,-49053366,-23065850,10673886,-11743838,61148853,41367233,-69604364,50358235,6397585,-5864192,-79672037,-5439887,6188154,-33560921,-16724318,14786945, + -86311853,-33930828,37364522,7551055,-38923691,-47792835,-37326634,-28515917,10062333,-12567677,1485328,-67042040,36980755,13651532,76569390,-5578282,28416063,24474894,33057251,-6676852,85415402,-66122719,57563860,21357289,-9958287,-41631038,3954313,-77424333,-19844444,-49311016,-136421578,-86750077,8849373,-14700964,11987662,66219172,-9066131,2706197,53766920,97929458,1595357,-36975158,-9373887,106098607,-46467313,-56313600,-38630874,-17191306,50663871,60959341,-90288024,33096435,63190497,19254118,-58851663,61773659,-33694064,-5659496,-26239139,24821771,-29669597,49210760,33997210,-35166705, + 56772604,-41377810,11284151,-35007148,15859880,37782197,87263452,16025397,31325832,-85158310,-48467623,8174829,8611889,46947771,17186346,4999038,-55868019,-68402972,-17032998,78063782,22764889,-21351216,16874606,-21629020,-37692402,26458256,40135098,-41191059,16107782,-3459656,-52878654,22989731,-1388503,20819492,-79680922,-79096634,13009815,-44661456,20188207,-8296097,-27775497,27841483,1967976,-60346974,-24705753,35960436,-2847983,18884845,1568384,30867732,71186867,-22941054,-49634262,-14165994,-21205345,8420576,55941109,-83080268,56805142,-24179192,-32736822,22044180,41955238,84501885, + 
79680224,85647497,-990243,-12326294,-13052052,85255721,-42466393,15561914,-21843911,-25983739,-8188665,39499001,16952776,37822676,18568074,52149003,64829751,-30087050,54246803,48011899,-424867,41196414,-37977028,-7080570,-82320431,39129074,-4746856,8553148,-35639446,-5435074,28840558,52076172,-8784864,23562814,25515375,-42091532,68699738,-16802114,-49876468,-85052295,-18792823,30873211,-93413537,-47479884,102325825,-19977924,42748669,78906788,-26291056,-5818803,50732044,-106413186,-4384453,83124114,-30553866,33015113,34302693,-32319086,-1157882,8295867,-40107245,4607804,3426148,-14363414, + 16633908,-46238173,-11455718,-34659696,35197398,-8555572,-92739189,-98422605,6263749,2505127,28227479,6491733,-53201245,-4880932,-24908796,3612325,-18682820,-818822,22977521,14509641,-10056796,-51933036,-12668612,9835576,-58328244,-14691987,32637739,6081096,-13773893,42783648,-8148970,12456533,56186964,-107323712,62417534,22626586,-32529971,-9315248,11172768,11067219,18573627,2750201,-3949456,19323091,71915791,-41873762,31120925,33950442,22116192,-19948624,-45164767,-60601463,-47496598,-10198043,-84272605,614665,-50732441,-108383085,-106325580,50866963,-79000464,-48107533,6517565,39544790, + -22617876,-59326649,81047073,31573970,4398052,17416924,56552439,40030693,40377193,-868298,-27645468,8153441,73862402,-45686417,-67187382,-13273554,1142451,-172681962,-39679425,132861253,1210562,52260562,-55103361,24431962,36323859,-20475880,33687776,-32037761,1262378,-33998991,77184870,-22541492,-8091547,-14116556,-80802226,-49527411,-9197322,-78066161,-37658347,-55747937,-36590885,10333763,77676010,8439488,-38304872,-22290149,-82506125,9536979,20441164,36977567,10230794,-8775829,24626504,-63005824,-5609842,12225332,-24085314,-12101480,26871774,-25599746,6214220,-14037310,-75410065,-3220686, + 
92296354,79961874,-4152652,6639088,13174437,29584847,-34927754,-82083067,-50055484,19333063,86202964,-25937523,-50769577,-74727411,56028214,68833279,56554706,42445921,55183482,-35058612,-35040589,-13936562,17809233,12661405,44586466,-43479304,13908141,-20740124,13045854,-44265705,42481566,-40394612,54471093,28072488,-8905283,16472459,-30562310,64892993,-18542974,12686798,-105597006,-65567012,-118761782,5144522,30861247,-588639,25975128,94828465,52796663,19950814,1503094,9973483,49047032,-17553335,19648522,4533469,-37000965,51925156,-126667017,81721492,24090329,34440690,-42281643,52930114, + 46749344,37963460,26800705,11820484,68585893,-64375936,-70512547,-53931312,-6218100,-17304802,23533660,35951237,-52938749,31954845,13762327,40976327,-32151753,28480649,13204168,-56814818,-18752902,5903711,-53976131,16653943,-30010293,55274999,70523392,16283315,-31477308,930426,59484502,36619293,-54659066,57636597,16825387,30936531,52184219,21052487,35239384,-134274048,37075415,93233992,-76074149,-1786130,-249780,12086829,-96370878,11418868,-17267124,-13874906,-98728073,6737624,-60702524,-115755,26794319,51054261,61428702,39335363,26655597,-82598951,26347161,25664712,-39392856,-63232055, + -40153414,-43530519,-18178063,-49235809,-11394535,33975823,9449536,3356297,-48458289,123293,95213655,-1862772,-8355245,-24771637,87842160,-64900160,-111369835,9977407,6274925,-38659261,-39151525,-41039105,12114613,92230667,63910689,29958009,-3429616,38304775,33229905,-34042298,-24769724,-8688247,-27860056,14264583,-66180257,-36408055,-28197705,97204878,4273770,1007391,102735462,63181305,-34271837,25457368,23469624,7726679,-57071612,2844239,-14386666,-18925897,22085089,-4731691,-38821367,62930826,-70712826,-5241331,33271067,89793187,28533455,-55191471,-10668947,46489641,-14307665,65912419, + 
16348797,-34011386,57716883,-25035914,9476438,-44790410,-7068525,-81941592,74281311,-25273022,-49707073,-9570543,67703208,-12028587,4509055,11052009,-18425620,10513301,-44488768,25649436,-35379398,-50724287,-51689379,-113683888,-23593315,877515,16560914,67388650,-77004583,91671719,29918429,1005917,49952616,30787312,-21100418,21667786,-51162489,-25140855,30876144,-45090474,-45169947,-75092431,-37854082,10054016,-42611715,41010172,10728798,-14883701,-29263348,-41047091,-3396011,-63830352,47234000,14560317,30879615,3384737,26202157,-45349545,50190793,37351425,34005226,-49924074,61617609,-88263726, + -20687488,-32835463,-40447897,58182022,34933338,-28754929,-49812648,-37053531,-30824372,82357585,-41323929,20105146,-24736261,-54447101,-73027710,-7982230,-762878,2631717,-34609881,87349479,109060314,-40044607,-30064025,-25476390,16205629,-11131434,-35604118,-20268930,35423451,10006278,-18474540,14696179,-4194391,9291815,54225555,17876381,-53499584,-56310489,-36952975,32023210,9960324,-25090954,-4030795,76123488,-24392155,-94160024,-6345110,32684911,-61021232,-48198153,12393502,19385890,47279701,-37862717,17635654,1894077,-46251963,20530420,-18374073,29911660,-20898902,-34610574,-46979223,-47614509, + -81191385,23193033,-20698367,-29506995,-22871200,41752769,-12612148,35308615,-56381825,68134485,-13835013,-15761826,682063,21369471,-65050310,12238188,47320977,17558252,82855997,52610347,-5239075,-83895871,19721654,-109726810,-19223847,61355429,-30322469,-41417384,9530478,27715921,-99056824,50012667,-7990526,42452072,112446991,-4422643,28698760,31329805,39649161,-19033759,-75030800,-82162839,17354937,-34765895,7146272,-34715868,39757307,47231840,-17803633,-59003560,47020374,-78173990,10292367,24729418,-7146466,87667573,-83699127,-10314407,25135919,31679941,-422089,5418663,61480546,30748888, + 
27855656,72988856,4361285,-47773271,-35693895,25012243,-64198107,-9120760,-25319326,-3269755,71747137,84041248,-88220593,3329229,22117690,2570656,-2264576,-4537331,-25442588,-8015629,-25908646,45953678,-37207299,-37821624,62937184,49663754,76046218,32968393,-61515582,61345387,-24167618,10682194,8676732,5234743,63079615,-42469586,5619424,-59588187,-11854531,-75224870,2799941,84998476,4607860,16982199,39133876,-24908112,18668175,-33660428,-87335899,9484971,35833492,42569587,-66689116,21770674,54797453,-55633450,20163575,35430685,74010932,-24549774,-7629626,28886151,-79891874,-43950017, + -19099596,-18876176,8021065,-79272688,-40697451,11292815,21211678,60102548,19867136,-15251769,-716075,75837691,-14541823,146987236,-57574598,22277485,-69036097,-33671197,-33243253,73520209,33148411,-48215228,-86128410,-57522551,54689706,24496935,-834830,57958148,-22305543,-9468960,-32602458,32743123,-18051297,93381541,20085708,-63086481,-8301374,-69652534,20819195,-13307160,-84716368,48920547,58095317,-32367427,-7164938,17331037,15330932,-3081383,81466234,-42892061,12757619,-56279534,-82276554,52059223,16757187,69285561,-421034,13496776,29055813,-13416394,-7099166,14523781,-16768788,-76817316, + -46208,-7183438,23091585,-131663837,-15270599,-15126322,-71143250,51439474,31532521,33331651,12748400,-1845400,32453063,-47919644,-10621875,-1173803,27468945,-47280137,-8883103,40587238,-99043515,-67560200,-9668628,-18481245,2460113,70636081,-33121162,-54985530,-13447186,27928346,-64811568,-52389202,28182329,58222790,-17699316,-62086533,49427807,11720216,-40214784,45741188,-17970420,-84100193,-11047780,14459054,-11404929,62031137,-4448088,-14423026,89705074,15137262,-15674373,55930942,-54290886,-39516103,-16414948,56882808,-54340402,32686562,63835481,-46457510,22239557,-8714854,34724133,-51990569, + 
51724680,50968459,-31543197,-39280086,-38540064,6984916,-66593558,22904987,-48789194,-73625802,8683334,-38660273,-29538031,-16046362,19461650,-73757835,35821984,112713268,76293898,-94048965,-65545978,9451263,29327494,-35150387,-99229528,-11461394,22878868,75793919,-31455063,32166464,56804975,-20376881,69920420,-69872078,15717275,-19663337,-1469612,115319451,74297472,63617105,41407049,26680736,5625147,18337195,2553635,-14074670,-15770561,-17695397,-35293514,-21712940,32198197,19785200,65898746,63242235,7105881,2763408,43823053,-22234339,-109094066,64722775,79130973,12387158,6443702,19058915, + 13084383,-62022521,75344457,3623341,21227160,7293244,-51962904,1197008,-25223822,63669357,35582921,-5525910,13930186,-78236291,57708769,44983599,20066687,-65853812,-26342316,-4437349,56410875,31803262,-15972612,-32184302,-24356110,-53980241,-8956386,66960791,2335657,10216346,26427296,-49702765,44619587,-37238856,-30815852,-35641734,18841829,4206940,-46649068,39114624,4775363,-43866020,-45388160,-22998618,10640026,-39075891,-59422071,-13857797,-35372401,50703992,-68009910,7258319,-3734413,-15533370,-19161534,-56270320,-43311096,2790049,35864349,-16663996,44391123,-19701864,-20965159,-40075074, + -80296420,-111734,-13149849,13382683,-74922802,-4425584,28799348,-36481387,-22901724,6291706,27037898,-8398540,30708522,6267055,-16418106,-31264877,41673547,27527430,41107803,-267331,287260,-18468035,63973285,-37425055,15186474,53949929,9452907,-54321837,-3516339,19248881,-14953291,-8122827,3850903,-5468382,88544126,-13240746,-3480205,17115642,20651392,16567671,-10074855,-42621993,28693624,108179345,-4241940,13711439,35646618,-45116103,126471837,44906795,83566213,-35259346,-18202015,21228949,21492192,72772357,-51595455,-43172719,-17351484,20534306,-21065475,-37437246,-43891397,23926158, + 
-5017438,-91812263,-62098873,-40562884,22361213,-15065737,764799,-2430500,51014876,8482687,6212298,17947733,-41350339,-32744261,30049103,12342220,-31541548,-41402586,-22663256,-30285478,13300405,-10238337,-23185384,83683440,11291388,-12731598,-84954986,14029871,-35703064,-26366326,-21045191,9965619,24316029,1436863,-9784899,-27816404,-90569377,9028464,-644135,27176089,23522902,71266693,32038088,37147514,-76763783,21159024,6720043,34152957,-14855204,44570225,35991567,-21496394,-58165366,13531652,-80755079,-61491471,18293521,53385847,-2257061,37667709,-64154055,-56315782,14862992,-21640566, + -63669646,-2058305,34718916,-57923882,-37369367,-9996032,65213435,14746574,16358546,40472426,-58982038,-7158815,15395533,18736910,-28172545,-23336125,-12959940,10156336,11316096,103335251,45975497,-67006321,25915103,-61802866,-37468579,-78474114,28627375,16818385,-25538396,-45262483,-11173862,-30865665,-12427393,33810089,-17046395,-32078173,-53999294,-57357126,-44442595,78311094,-50733951,-6165296,34095396,-57916820,54345365,7118754,-46922174,-78173719,-17692141,30785174,-5292120,-22731012,10879287,14817410,41128838,11514243,32548486,-57146537,89518458,-57063445,18838156,20071030,-24838917,6579279, + 3298951,-6429078,-20610134,-111488548,7733234,-9987391,17282656,-9014809,27477307,-101901346,26310739,-27656134,-30364031,-9265100,37052798,20225318,-96559511,46948476,30767870,-106530795,10284222,5927413,-43842598,106168108,22062407,-24532325,71096833,-21441442,53249810,-3511544,30453513,19353996,-61859298,95536132,-97735884,-19102655,9874727,25095658,23825460,-39205426,20448142,114508995,-43922363,-28947971,-27843607,56702443,-45835838,41850464,58570019,-42530745,-45812013,-22599415,-54793828,58510348,-108845262,-8410045,585186,14664701,-5367402,73684,72553834,42087243,19137948,7967963, + 
43206754,50463201,29435483,-59156117,-23067292,24521182,-38435995,22114976,-60297370,99013160,10401511,45493849,-3675491,21520040,15503346,38320710,-5846963,58740434,83807515,-43726257,-18316256,-31637821,75832382,-54225536,-23853056,77521329,-91644447,-39233586,-63823173,17713702,-36240180,8908764,64286806,-13086846,6496489,-97742552,-2254284,52840517,-25674389,10025922,-54826368,-66766065,11096621,-87296330,61328893,47913964,15543857,-51556120,88156047,54391884,-5380730,-21724166,-48632187,85853658,-10069,29749352,-78637004,30067062,26368504,-73396965,-25637174,-26332483,-30503494,19582590, + -39118000,-38477488,-57068155,-2428417,28720201,55827889,37025317,-54969110,-32417766,-4836405,-24767617,53310608,-65843200,-47462927,-51146243,37145058,12404513,-20874633,-52611837,-70517886,-71951218,-46526821,-11052930,-4618219,-11181959,47296393,51551389,-53989006,49050465,-11327486,88614135,23633040,-49818700,30365151,59176264,-6607046,-16505287,-10472117,-100562803,-65964759,44886786,-10151999,39778732,63504121,16716322,67355431,35392551,-8929162,11401587,-41050018,26942153,-147408838,-51809860,-6796234,-48827532,-94999348,-32219210,-96007294,3996843,19567196,-93166412,-93976101,9018113,45448332, + 42418972,15583637,67541639,95587271,29144904,63974668,-25161645,-59203505,-23772725,90044044,4396804,5700333,27852379,4396330,25779989,49439786,49365722,-52538526,-34786934,51490910,23918863,95239599,-1702941,5005682,49641770,-31215396,-4280412,-24765390,-52244175,-27146327,142710298,62163623,2073267,-29566625,19406280,-67555210,18109186,5734798,5997863,-38399113,24692388,42103823,12146282,89721186,19090683,53035132,19238757,-27904602,21420958,20151446,-18229633,-13929272,73522291,-35669282,81974546,6255713,11658509,-43520472,18454305,-33208023,-17468307,-69853830,-78394361,-24824194, + 
-49658747,-31117672,41654091,32269446,-26723508,3882025,10575629,45570179,-24176020,33964176,-11639047,28097476,-2066786,-4492707,14665763,-136963,25825191,-127190388,-57481438,64101595,-17746287,-13168407,-33123335,-74126023,-3276474,-26340644,44786088,43934047,-47783458,70518763,3587996,-69147359,31417444,8205728,-50351304,43172666,19681615,-51928964,31724781,59920107,-39707868,-99153913,73431166,-377863,27820668,-18613653,51179908,-23471821,44331013,-7107302,47367333,12845452,8367704,-18460164,71058921,57447588,-11166698,-99286212,14086853,8983290,-26924445,-72492761,-56286591,-3769160, + -18168757,79713069,-2350346,-22691770,-23818154,-17653202,16496974,-47637006,11518247,120555314,73065515,-45068105,-6030231,-51493678,9152657,11233609,-32035710,68080072,19180573,18799827,-21930651,35483114,17272632,20466670,-36339907,-18451545,12948997,15799527,-32402396,-2495792,3576663,4870574,49959004,19633252,5072612,-51326879,3495469,40951015,-68919395,-73517655,21068123,66457592,-64413317,-18357127,8224838,6499957,-19109112,-48413324,-72615608,57715974,37339344,44431948,-19192622,33723109,37891462,-35629003,69543257,7580020,52104183,-18086932,58923475,-20834717,-92661846,14503809, + -9645312,-15460400,27659115,54260879,44671217,-51394482,-46350327,30003016,-40068850,-53760589,7990575,-61253229,-12481832,20661704,64562411,-15629375,15597408,18622336,9840889,-88779126,-1930140,-3719295,57206226,374296,-82528150,39561203,-5219344,68170669,-5956,8903986,-101707080,-52927767,44155897,-14737204,-22771466,33561384,99941365,-21831416,30720566,50582036,44608400,66142174,-5038312,-52737,80270726,8572466,36781506,18947715,29316863,8083775,-25713973,11224128,-99993934,8484461,-20785502,-21773929,21654213,-17168570,-32843795,27057695,14565780,45703766,62542994,56593354, + 
-46151441,25209991,49413408,-62280018,-85845757,67473493,46146015,30775039,-39039221,-22641695,29633274,11273896,-45279275,64334792,39378210,-35456064,40220317,883262,55049740,-3458002,-67572590,55586294,-20725941,-71692192,-57731910,74478392,-29324811,3045767,-41130125,11930352,-12326321,-32579951,82116531,-52335239,-1039527,-30033089,-27207617,-22315291,-54197147,-39463724,-43203671,-8870587,-41211212,-1971203,-7585160,54452646,-42064562,-8385169,24840528,95216613,-6888861,35002142,-59519982,-27741335,54594609,-76033405,9626631,56334121,80695660,-23917445,14732747,85139952,-17067917,49440671, + -78585134,5939669,27039549,-52577015,-96099984,-63282920,18749192,-27113865,8193469,-66009584,9992154,48020734,19099353,42621115,12051150,-91496628,7490858,50704378,6194663,-58383250,59277666,-36848708,20748897,-12790115,-14734373,22748369,-15796600,51102992,24662432,72630268,-30771049,-18116194,10963430,-27414909,29546835,-3699715,-26873432,67961072,55471585,81388807,-35273486,29351165,14950536,22686485,25117001,-42923504,78465680,-47234117,-10166658,-3265582,18926124,-71931859,-21110098,164334938,-3655585,-57774543,63457821,56241364,72658758,-23223021,22522960,14310955,68063511,-76477629, + -42881921,-13094986,68004237,31980186,-75714666,3820780,68133119,-14889110,63287936,-66603773,64857249,-12092902,62366607,41306114,69089534,-62624504,-70658702,-83202140,-41982379,-17104621,-56913710,23642600,-51494374,33885,34652335,-53019984,8846648,34151585,36347,52538102,-76363923,-48584715,96398540,-35905442,-60001591,-834764,313116,-1808523,-8765565,71672009,-8295187,45604934,-22114493,31374076,-60612414,-1466777,-52373795,26577393,52017359,21594828,31458997,29815937,60749580,-75270322,-50714353,46103379,33868141,15927735,18040322,12633649,37291599,7310981,-7186426,712479, + 
-57890858,-6078864,-1478333,-50051433,21235201,-21601053,-22481696,6423578,48246985,-19587656,874089,-25584945,-20602827,11062044,25107606,-24133186,-20247278,53672691,58839864,4458701,-79806478,15946649,16691353,-16451882,19069964,94964341,79223337,-49324254,-35529087,56474277,-27357393,-79068126,23974718,-20496962,-16411008,65594825,-71474520,55967531,54775216,10336785,-7988875,-29535122,66516862,14506488,-32814283,55268849,493527,-20736990,-34934986,-28390512,7226576,52080195,36862172,-9787020,-48238046,-2403040,44509784,66061252,32452264,9813117,-32445246,32807433,96181967,-49892902, + 26204147,53130109,-27173028,27227994,-1272926,-98049603,-63132805,-40128380,10020359,-1118696,15167721,23099934,21692816,7062652,-14301040,25700985,3524624,9616994,-11974509,-39141353,74319659,-81524241,20370647,-42650838,515873,64512350,24972023,59633370,-43915879,81896137,-117513789,-61384341,3004791,82271166,79293638,62708235,96839602,-49328427,26808637,-9439332,4865590,-9908651,-22806884,-18283122,-2234286,-22910941,57685790,-10686864,19486724,35166769,-19880259,26619519,-72911975,23376984,46794224,26627273,-58734110,-24289688,61253480,-11596343,-9792512,-32894545,20241519,-78857823, + -24096836,15644524,60422850,28212986,-26708870,32163264,-57971475,4155721,-23616624,52868484,81358934,72278639,-1790037,-62890910,-49167776,67900384,23377941,9027033,21341249,-36469566,-7106845,-16695474,-9303993,-43178662,50886802,7106310,121829785,45222745,26459008,13126510,54873677,-14557355,82723152,14849024,7203594,9644380,47052487,-18272629,-25979445,35837505,53461080,-19674970,-73725293,47440543,-61036655,-24267314,-73733434,-58667109,-68820211,-24899526,-14292588,-39723836,21335841,-4037456,35061347,30089350,21873356,-34568536,25557006,-59078245,4637313,10110402,-35304536,-6774590, + 
4383974,-10360081,13533150,-4943920,-5153322,74373559,41239199,-60789929,-31848402,19624030,59013270,32781690,-26722813,19262111,-33666460,-3697576,878187,-72118051,-64560351,107614851,44909789,-8947151,-93974382,33526519,53091477,-51830842,-19738742,-69377646,42975722,-50518995,22093886,7819898,-2597283,-71558518,-49296796,-27986311,-83987282,39723649,-45135168,-55899230,-80212136,48756289,-5958351,34558971,-14348916,-65436648,-5839628,-25896234,4578328,95458369,-39934718,51610729,77966310,24345028,19461242,62107100,-11199584,71818300,-19161330,32534293,2052397,-29126296,-93713435,-69436186, + 29488015,-1666101,81347249,25778074,-43885132,-18023969,-2886729,23968654,-35527771,22714143,63826636,76348609,30091609,-62147875,4978010,-46331258,65014571,-7881067,33015335,23293107,55798680,9072294,10884912,-58370805,-38674129,-40402943,-5087318,19155074,-56942720,34479274,-53457690,110047499,42961797,8154628,82995447,-42017955,36777591,-89736509,23044323,-78996938,-15732344,27896807,-3811382,-50249281,-19213712,-56602078,8295218,33905704,21075921,76862246,81331,-13007528,-78527572,87207919,66471277,54317949,-29675823,-11373007,4749641,-52689547,59494456,-20986238,-13674971,4879475, + -52862946,-4984737,-9800176,22708101,36247821,-7716519,24506559,39155991,56216299,5518377,6652432,-68737308,62057125,8699696,33282825,-69218982,57181150,38886804,18191688,-36928106,4637613,29717780,69283393,43970965,1567675,21429088,-28702163,29534876,-33137566,-102826484,16699223,5360642,-13955308,-38762877,-44731549,56691664,-49574866,62794891,33988013,11222118,34167779,-8542919,60910964,11472163,21418638,-5973060,-23115918,-27035806,-37837869,58985112,-47774938,60076100,-26993713,56865600,-32385304,-74989608,49027329,90833116,-30631156,26575582,-18572301,17141437,8686898,-32731206, + 
70423052,71163601,-62301874,-36786507,-45993392,5690051,-43999385,4329490,-49651005,10277508,-95452373,19049452,-102746649,47692503,-51863702,43408205,-27134894,66644059,26343156,-91284575,17615955,12598549,9317978,-39682018,-18427249,-3991881,-2350768,-31532902,28033010,36548001,-16069573,35269926,19561256,13980809,84105947,5983686,3538210,-24661341,-27021908,-9889001,-36868123,11122293,-53393097,-31697419,-10776512,2136366,61414546,-40898762,-63634530,7871070,-25039936,-11252292,18643683,7410943,-26560948,30235130,-47160537,2763572,-17507771,64530420,12096920,42375348,-2343728,5480924, + 34148819,8601951,-74205849,-19313293,68801678,14148794,55160963,51497330,-3576240,17528619,-11067259,-28822283,-23696678,-57940002,27931680,49895522,-33614008,-17445774,21567407,-4508629,-97027572,51995103,-22130052,54795935,-110696873,11847026,15941529,-116973932,-498219,-57143368,25823298,-42594295,-17532484,34288199,-39605782,52571031,-46189808,-32749615,-40308556,-29833874,-73173284,-44787926,84782364,-65266443,-43081900,17758187,7567649,5047566,-56263687,-28633642,66393227,59604923,-28883054,-33352899,16464968,-1093418,14420222,-26732427,20291590,-57115903,-50626019,-8938048,7538865,8886367, + -5876736,2802361,40083229,-21898178,31938126,-51464983,-42509302,-47059382,-46572061,16620630,-11312002,-83624597,32938683,-31804489,28141307,-52654356,7692304,109606529,47691593,14188106,9922464,54154696,27202652,-39956369,-67552103,-46390581,-20013769,63532707,-24654208,-20364952,-11758688,-34315790,31828877,-14308123,-23841038,53417660,-77401770,134760222,-11938443,29463113,-25421945,44308588,174179,-25865645,3186431,-57362957,-103660221,-21478621,-25699707,-26391285,-30950861,102634689,1766089,-11820342,17808455,9210711,36951296,60267838,-34500074,35645906,81327199,22937523,63787426,-33110097, + 
-24802599,56998863,11703594,-62061513,14146194,-11142038,-33297886,-1827879,-16131905,88171089,39900457,-50581563,-25970480,13986773,-68148178,-43653664,-53753153,2662668,44070911,100666356,-24645744,10982596,-12153221,59493692,-45908768,33625034,-6640770,-40726591,-35220830,69079790,-50044390,-28678671,7153376,55782248,102227253,-71924792,-32549865,10946902,-50836742,12846988,31428471,14667435,1321862,24814160,-10219503,-32098638,2149657,-40045361,-95416000,5928304,-22284934,-4633012,-102562980,49904419,-65650404,9707518,28320397,-80288382,67950779,-46767613,-20361573,-59736715,65410046,-18945552, + -2463001,-42047609,92173065,-28121835,-89178389,27546855,-2859641,98784421,33099354,5739064,44909887,12425030,44968017,-67022502,54874024,-12183901,26937129,-61445180,-7532436,-23724904,1227027,50887429,16231922,21023304,16773767,-10014156,57544831,40935620,29457128,-16345620,27553926,-27500834,37194512,18784743,-31243960,-9025333,54577189,-1534562,44988943,65318784,41697990,-20032889,55189404,-30653296,21684411,10925107,-47690710,-76601569,-704489,-83152399,14021536,-74458085,-19850528,-8816594,-48613379,23662813,66367194,10044572,-32114483,-81555705,-21000343,23149126,19162666,33155701, + 44614469,47308447,41908018,-49201542,-27899394,10679894,-4681017,42857091,86969109,-44848161,-56789778,838716,63890407,-7013700,13787249,-18483058,7068833,-32242111,15813077,-42354312,11371567,8945895,50672253,-59527772,6338632,-18821133,43920429,122545014,-44961294,7252344,5536928,-43673196,36513035,-1947431,-8251139,39263987,24253844,-8763940,-46878515,80178224,-59079530,-55858849,33360786,-37265816,52369218,-15766182,29593995,-17337071,-38957587,5553557,2998702,15098550,-9166629,-19293978,51390414,-89495965,-15544675,-17418923,63019227,11628858,27984278,-8168067,-10703206,-16579836, + 
-10809524,-87234653,-7468614,83844555,42667788,-1906781,3289335,-22162507,55142437,43947295,-16160322,-9254110,-6107840,-80512867,-47342367,91584810,7574094,-27129885,2060084,-34287772,-27118927,-2258165,54748972,24771617,-13716038,-19016989,42884479,-28283482,35099597,-81569681,92174328,12451200,37747015,-63885905,-38646876,-3031375,-27285306,-61234819,-56253955,64270328,-11468709,-52002929,14151154,2830412,14789806,-56531034,-19304034,-21161419,-47606049,-4396615,-3539712,20838779,-1744944,-126227672,53364756,-74666082,-50256385,269326,32937917,16215672,-58150673,16935234,-21608984,43931641, + -67145741,12287158,-82470800,-81182554,-8044583,19230033,12985216,6821227,-20586703,-41375884,39768360,49222257,-61169093,14881272,-28940574,4597698,-4209981,-60452410,-23499438,-53317606,-21991494,-91802768,-48497724,38309049,50103205,-25877614,70998639,-23363425,31908284,-30367487,-4003033,-35067646,21728049,-50122582,-13309041,124254419,-44214633,40789700,13961583,37537029,21928824,23051706,26715716,-4958624,-35949094,-37301309,51066745,-29215848,-21547193,27246503,-29518768,-28585008,-84438600,66888172,-73274577,28197247,-51535634,-38533491,4297602,-65092916,-48311130,64905982,-16776403,-40120423, + -28437823,2914691,15639398,30815836,-9942505,-9192683,84521706,56619637,32102791,-25973536,-2661531,53412194,-11721483,3925213,-110854924,60931342,-23061839,-11999265,30410169,-45444164,-22914628,-1413826,24859698,-34138416,-70507063,-29440564,-62899830,22785177,-37898548,-27411182,21528149,78045705,41151181,-35865288,88104806,-16904469,48658498,-20783910,-70827403,-76450414,-16940163,-72233973,-64396254,-59075436,30841728,23042055,-34037403,60155891,79675041,26095772,62201232,-142855060,-11770594,-86395852,47413818,18154490,-31102996,-7749413,-63386523,-42532638,-6824734,2920816,-17801359,74669398, + 
54896297,25377892,45628382,22565078,4392618,-17829399,28304421,-17901256,23508639,34244271,-1373184,-96891513,84572323,15845680,3611502,-13937924,-20123343,-53577298,-18111106,41877003,-27378090,22241818,28276381,23938894,58921282,-13211401,74106918,-7845074,9162675,52125288,52257178,36388474,-88754083,121919991,56527057,-74544433,-8767213,5913563,18774060,-75380546,-65183429,-12439931,47723330,6678250,-53275558,24483214,8427917,-5556033,75130225,-58279737,18536489,-84292867,40659049,74894375,2276875,45124924,-30558068,-85291461,-72426983,37437174,55361040,-61721981,25506409,28309013, + -22664840,-8183733,-52255062,31399134,6146049,5857734,-42605733,-14856950,-10813541,35867980,47537038,324442,-17697416,-43501038,3839988,-22339802,-50607524,19632980,21562968,-7079692,-22816762,-1469424,79121568,5641872,-36305642,-15897298,-3602237,17780564,-13409028,5136342,8713701,-112930463,69362930,-35319635,-43101162,7507088,-23964595,17164948,-66896945,61144637,31610194,-10818327,-4550414,-34013000,1854828,-1436467,-93410727,-19752605,-62910960,41955921,-24118529,-4450856,33670897,15933993,-83875626,-97823000,10660135,167980,-42940188,-7611227,-25580440,-1247421,16487700,10949572}; + +#define SIZE 64 +#endif \ No newline at end of file From 02ed6b02dd133089932543541833aac9f9536698 Mon Sep 17 00:00:00 2001 From: Angelo Nujic Date: Mon, 24 Mar 2025 18:19:09 +0100 Subject: [PATCH 04/18] Rewrote SA Control with 3 FSMs --- .../rtl/quadrilatero_dispatcher.sv | 19 +- .../rtl/quadrilatero_register_lsu.sv | 59 ++- .../rtl/quadrilatero_systolic_array.sv | 355 +++++++++++++---- .../rtl/quadrilatero_systolic_array_old.sv | 365 ++++++++++++++++++ 4 files changed, 718 insertions(+), 80 deletions(-) create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv index 2e858cc3c..e3a908655 100644 --- 
a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv @@ -201,7 +201,24 @@ module quadrilatero_dispatcher #( outstanding_op_d = {1'b0,n_matrix_operands_read_i} + {2'b0, rf_writeback_i}; end - + // always_comb begin: updated_next_value + // if((instr_ready || state_q==IDLE)) begin //we're ready to continue with the next instruction + // rreg_d = rf_read_regs_i; + // wreg_d = rf_writeback_i; + + // rs_d = rs_i; + // rs_valid_d = rs_valid_i; + // instr_id_d = instr_id_i; + // datatype_d = datatype_i; + // is_store_d = is_store_i; + // is_float_d = is_float_i; + // end + // dispatch_d = '0 ; + // dispatch_d[exec_unit_i] = instr_ready; + + // push_operandw_d = rf_writeback_i & instr_ready; + + // end always_comb begin: rw_queue_block rvalid = '0; wready = '0; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index 678d83793..5c68b7c1b 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -13,7 +13,7 @@ module quadrilatero_register_lsu #( parameter int unsigned BUS_WIDTH = 128, parameter int unsigned N_REGS = 8, parameter int unsigned N_ROWS = 4, - localparam int unsigned RLEN = BUS_WIDTH + localparam int unsigned LLEN = BUS_WIDTH ) ( input logic clk_i , input logic rst_ni , @@ -33,7 +33,7 @@ module quadrilatero_register_lsu #( // Register Write Port for load unit output logic [ $clog2(N_REGS)-1:0] waddr_o , output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [ RLEN-1:0] wdata_o , + output logic [ LLEN-1:0] wdata_o , output logic we_o , output logic wlast_o , input logic wready_i , // to stall the request in case the port is busy @@ -41,7 +41,7 @@ module quadrilatero_register_lsu #( // Register Read Port for store unit output logic [ $clog2(N_REGS)-1:0] raddr_o , output logic [ $clog2(N_ROWS)-1:0] rrowaddr_o , - 
input logic [ RLEN-1:0] rdata_i , + input logic [ LLEN-1:0] rdata_i , input logic rdata_valid_i , output logic rdata_ready_o , output logic rlast_o , @@ -66,6 +66,14 @@ module quadrilatero_register_lsu #( localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; + // typedef enum logic { + // IDLE, + // COUNTING_ROWS, + // LAST_ROW + // } register_lsu_state_e; + + // register_lsu_state_e state_d, state_q; + logic finished; logic [xif_pkg::X_ID_WIDTH-1:0] back_id_q; logic [xif_pkg::X_ID_WIDTH-1:0] back_id_d; @@ -75,7 +83,7 @@ module quadrilatero_register_lsu #( logic [$clog2(N_REGS)-1:0] waddr_q; logic [$clog2(N_REGS)-1:0] waddr_d; - logic [RLEN-1:0] load_fifo_data; + logic [LLEN-1:0] load_fifo_data; logic load_fifo_data_available; logic load_fifo_pop; @@ -83,9 +91,9 @@ module quadrilatero_register_lsu #( logic store_fifo_space_available; logic store_fifo_push; logic store_fifo_empty; - logic [RLEN-1:0] store_fifo_data; + logic [LLEN-1:0] store_fifo_data; - logic [RLEN-1:0] data_mask; + logic [LLEN-1:0] data_mask; logic load_fifo_valid; logic busy; logic start; @@ -167,7 +175,7 @@ module quadrilatero_register_lsu #( (!write_i && !busy) ? 1'b0 : write_q; valid_d = (load_fifo_valid && counter_d==0 && ~valid_q) ? 1'b1 : - (load_fifo_valid && counter_d==3 && valid_q) ? 1'b0 : valid_q; + (load_fifo_valid && (counter_d==$clog2(N_ROWS)'(N_ROWS - 1)) && valid_q) ? 1'b0 : valid_q; // $clog2(N_ROWS)'(N_ROWS - 1) was 3, if there's a problem check here... start_d = start ? 1'b0 : (start_q | start_i) ? 1'b1 : start_q; @@ -176,13 +184,48 @@ module quadrilatero_register_lsu #( src_ptr_d = (start) ? address_i : src_ptr_q; back_id_d = (load_fifo_valid && counter_d==0 && ~valid_q) ? instr_id_i : - rlast_o ? lsu_id_o : back_id_q; + rlast_o ? lsu_id_o : back_id_q; waddr_d = (load_fifo_valid && counter_d==0) ? operand_reg_i : waddr_q ; busy_d = (write_i && rlast_o && rdata_valid_i) ? 1'b0 : (write_i && start_i) ? 
1'b1 : busy_q; end + // always_comb begin: fsm_block + // counter_d = '0; + // rlast_o = 1'b0; + // rrowaddr_o = counter_q; + // wlast_o = 1'b0; + // wrowaddr_o = counter_q; + // case (state_q) + // IDLE: begin + // counter_d = '0; + // if((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin + // state_d = COUNTING_ROWS + // end + // state_d = IDLE + // end + // COUNTING_ROWS: begin + // if((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin + // counter_d = counter_q + 1; + // if(counter_d = $clog2(N_ROWS)'(N_ROWS - 1)) begin + // state_d = LAST_ROW; + // end else begin + // state_d = COUNTING_ROWS; + // end + // end + + // end + // LAST_ROW: begin + // if(rlast_o || wlast_o) begin + // state_d = IDLE; + // end + + + // end + // default: + // endcase + //end always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block if (!rst_ni) begin diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index e80bda504..fcb1fab6f 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -17,7 +17,7 @@ module quadrilatero_systolic_array #( parameter int N_REGS = 8 , parameter int ENABLE_SIMD = 1 , localparam int N_ROWS = MESH_WIDTH , - localparam int RLEN = DATA_WIDTH * MESH_WIDTH, + localparam int ALEN = DATA_WIDTH * MESH_WIDTH, parameter FPU = 1 ) ( input logic clk_i , @@ -37,7 +37,7 @@ module quadrilatero_systolic_array #( // Weight Read Register Port output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , output logic [ $clog2(N_ROWS)-1:0] weight_rrowaddr_o , - input logic [ RLEN-1:0] weight_rdata_i , + input logic [ ALEN-1:0] weight_rdata_i , input logic weight_rdata_valid_i, output logic weight_rdata_ready_o, output logic weight_rlast_o , @@ -45,7 +45,7 @@ module quadrilatero_systolic_array #( // Data Read Register Port output logic [ $clog2(N_REGS)-1:0] 
data_raddr_o , output logic [ $clog2(N_ROWS)-1:0] data_rrowaddr_o , - input logic [ RLEN-1:0] data_rdata_i , + input logic [ ALEN-1:0] data_rdata_i , input logic data_rdata_valid_i , output logic data_rdata_ready_o , output logic data_rlast_o , @@ -53,7 +53,7 @@ module quadrilatero_systolic_array #( // Accumulator Read Register Port output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , output logic [ $clog2(N_ROWS)-1:0] acc_rrowaddr_o , - input logic [ RLEN-1:0] acc_rdata_i , + input logic [ ALEN-1:0] acc_rdata_i , input logic acc_rdata_valid_i , output logic acc_rdata_ready_o , output logic acc_rlast_o , @@ -61,7 +61,7 @@ module quadrilatero_systolic_array #( // Accumulator Out Write Register Port output logic [ $clog2(N_REGS)-1:0] res_waddr_o , output logic [ $clog2(N_ROWS)-1:0] res_wrowaddr_o , - output logic [ RLEN-1:0] res_wdata_o , + output logic [ ALEN-1:0] res_wdata_o , output logic res_we_o , output logic res_wlast_o , input logic res_wready_i , @@ -75,24 +75,43 @@ module quadrilatero_systolic_array #( input logic finished_ack_i , output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o ); - - logic ff_active_d ; - logic ff_active_q ; - logic fs_active_d ; - logic fs_active_q ; - logic dr_active_d ; - logic dr_active_q ; - logic set_ff_active ; - logic rst_ff_active ; - logic set_fs_active ; - logic rst_fs_active ; - logic set_dr_active ; - logic rst_dr_active ; + typedef enum logic [1:0]{ + FS_IDLE, + FS_ACTIVE, + FS_LAST + } fs_state_e; + typedef enum logic [1:0]{ + FF_IDLE, + FF_ACTIVE, + FF_DONE + } ff_state_e; + typedef enum logic [1:0]{ + DR_IDLE, + DR_ACTIVE, + DR_DONE + } dr_state_e; + + ff_state_e ff_state_d, ff_state_q; + fs_state_e fs_state_d, fs_state_q; + dr_state_e dr_state_d, dr_state_q; + localparam LastRow = $clog2(MESH_WIDTH)'(MESH_WIDTH-1); + // logic ff_active_d ; + // logic ff_active_q ; + // logic fs_active_d ; + // logic fs_active_q ; + // logic dr_active_d ; + // logic dr_active_q ; + // logic set_ff_active ; + // logic rst_ff_active ; 
+ // logic set_fs_active ; + // logic rst_fs_active ; + // logic set_dr_active ; + // logic rst_dr_active ; logic valid ; logic clear ; - logic ff_enable ; - logic fs_enable ; - logic dr_enable ; + // logic ff_enable ; + // logic fs_enable ; + // logic dr_enable ; logic pump ; logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; @@ -141,42 +160,42 @@ module quadrilatero_systolic_array #( // Weight Read Register Port weight_raddr_o = weight_reg_q ; weight_rrowaddr_o = ff_counter_q ; - weight_rdata_ready_o = ff_active_q &~ mask_req ; - weight_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + weight_rlast_o = ff_counter_q==LastRow; // Data Read Register Port data_raddr_o = data_reg_q ; data_rrowaddr_o = ff_counter_q ; - data_rdata_ready_o = ff_active_q &~ mask_req ; - data_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + data_rlast_o = ff_counter_q==LastRow; // Accumulator Read Register Port acc_raddr_o = acc_reg_q ; acc_rrowaddr_o = ff_counter_q ; - acc_rdata_ready_o = ff_active_q &~ mask_req ; - acc_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + acc_rlast_o = ff_counter_q==LastRow; // Accumulator Out Write Register Port res_waddr_o = dest_reg_q ; res_wrowaddr_o = dr_counter_q ; - res_we_o = dr_active_q &~ mask_req ; - res_wlast_o = dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + res_we_o = (dr_state_q == DR_ACTIVE) &~ mask_req ; + res_wlast_o = dr_counter_q==LastRow; end always_comb begin: next_value // Configuration - data_reg_d = (set_ff_active) ? data_reg_i : data_reg_q ; - acc_reg_d = (set_ff_active) ? acc_reg_i : acc_reg_q ; - weight_reg_d = (set_ff_active) ? weight_reg_i : weight_reg_q ; - sa_ctrl_d = (set_ff_active) ? sa_ctrl_i : sa_ctrl_q ; + //data_reg_d = (set_ff_active) ? 
data_reg_i : data_reg_q ; + //acc_reg_d = (set_ff_active) ? acc_reg_i : acc_reg_q ; + //weight_reg_d = (set_ff_active) ? weight_reg_i : weight_reg_q ; + //sa_ctrl_d = (set_ff_active) ? sa_ctrl_i : sa_ctrl_q ; - acc_fs_d = (set_fs_active) ? acc_reg_q : acc_fs_q ; - dest_reg_d = (set_dr_active) ? acc_fs_q : dest_reg_q ; + //acc_fs_d = (set_fs_active) ? acc_reg_q : acc_fs_q ; + //dest_reg_d = (set_dr_active) ? acc_fs_q : dest_reg_q ; - id_ff_d = (set_ff_active) ? id_i : id_ff_q ; - id_fs_d = (set_fs_active) ? id_ff_q : id_fs_q ; - id_dr_d = (set_dr_active) ? id_fs_q : id_dr_q ; + //id_ff_d = (set_ff_active) ? id_i : id_ff_q ; + //id_fs_d = (set_fs_active) ? id_ff_q : id_fs_q ; + //id_dr_d = (set_dr_active) ? id_fs_q : id_dr_q ; // Finished finished_d = (res_wready_i && res_wlast_o) ? 1'b1 : @@ -186,48 +205,236 @@ module quadrilatero_systolic_array #( (finished_ack_i ) ? '0 : finished_instr_id_q; // Counters - ff_counter_d = (ff_enable && ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : - (ff_enable ) ? ff_counter_q + 1 : ff_counter_q; + //ff_counter_d = (ff_enable && ff_counter_q==LastRow) ? '0 : + // (ff_enable ) ? ff_counter_q + 1 : ff_counter_q; - fs_counter_d = (clear ) || - (fs_enable && fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : - (fs_enable ) ? fs_counter_q + 1 : fs_counter_q; + //fs_counter_d = (clear ) || + // (fs_enable && fs_counter_q==LastRow) ? '0 : + // (fs_enable ) ? fs_counter_q + 1 : fs_counter_q; - dr_counter_d = (clear ) || - (dr_enable && dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : - (dr_enable ) ? dr_counter_q + 1 : dr_counter_q; + //dr_counter_d = (clear ) || + // (dr_enable && dr_counter_q==LastRow) ? '0 : + // (dr_enable ) ? dr_counter_q + 1 : dr_counter_q; // Active signals - ff_active_d = set_ff_active ? 1'b1 : - rst_ff_active ? 1'b0 : ff_active_q; + //ff_active_d = set_ff_active ? 1'b1 : + // rst_ff_active ? 1'b0 : ff_active_q; - fs_active_d = set_fs_active ? 1'b1 : - rst_fs_active ? 
1'b0 : fs_active_q; + //fs_active_d = set_fs_active ? 1'b1 : + // rst_fs_active ? 1'b0 : fs_active_q; - dr_active_d = set_dr_active ? 1'b1 : - rst_dr_active ? 1'b0 : dr_active_q; + //dr_active_d = set_dr_active ? 1'b1 : + // rst_dr_active ? 1'b0 : dr_active_q; end always_comb begin: ctrl_block valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - clear = ~ff_active_q & ~fs_active_q & ~dr_active_q; + //clear = ~ff_active_q & ~fs_active_q & ~dr_active_q; + if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q != DR_ACTIVE)) begin + clear = 1'b1; + end else begin + clear = 1'b0; + end - ff_enable = ff_active_q & valid ; + //ff_enable = ff_active_q & valid ; // fs_enable = fs_active_q & (valid | ~ff_active_q); // dr_enable = dr_active_q & (valid | ~ff_active_q); - fs_enable = fs_active_q; - dr_enable = dr_active_q; + //fs_enable = fs_active_q; + //dr_enable = dr_active_q; - set_ff_active = ff_counter_d=='0 & start_i ; - set_fs_active = fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); - set_dr_active = dr_counter_d=='0 & fs_counter_d==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-2); + //set_ff_active = ff_counter_d=='0 & start_i ; + //set_fs_active = fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q==LastRow; + //set_dr_active = dr_counter_d=='0 & fs_counter_d==LastRow & fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-2); - rst_ff_active = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & ff_counter_d=='0 ; - rst_fs_active = fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q=='0; - rst_dr_active = dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & dr_counter_d=='0 & fs_counter_d=='0 & fs_counter_q=='0; + //rst_ff_active = ff_counter_q==LastRow & ff_counter_d=='0 ; + //rst_fs_active = fs_counter_q==LastRow & fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q=='0; + //rst_dr_active = dr_counter_q==LastRow & 
dr_counter_d=='0 & fs_counter_d=='0 & fs_counter_q=='0; + + if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q == DR_ACTIVE)) begin + pump = 1'b1; + end else begin + pump = 1'b0; + end + mask_req = (dr_counter_q==LastRow) & finished_q & ~finished_ack_i; + end + + always_comb begin : ff_fsm_block + ff_counter_d = ff_counter_q; + ff_state_d = ff_state_q; + //Configuration + data_reg_d = data_reg_q; + acc_reg_d = acc_reg_q; + weight_reg_d = weight_reg_q; + sa_ctrl_d = sa_ctrl_q; + id_ff_d = id_ff_q; + + unique case (ff_state_q) + FF_IDLE: begin + ff_counter_d = '0; + if(start_i == 1'b1) begin + ff_state_d = FF_ACTIVE; + //ff_counter_d = '1; // mayday + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end + end + FF_ACTIVE: begin + if(valid == 1'b1) begin + if(ff_counter_q==(LastRow-1)) begin + ff_counter_d = ff_counter_q + 1; + ff_state_d = FF_DONE; + end else begin + ff_counter_d = ff_counter_q + 1; + end + end + + + end + FF_DONE: begin + if(start_i == 1'b1) begin + ff_counter_d = '0; + ff_state_d = FF_ACTIVE; + + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end else begin + ff_counter_d = '0; + ff_state_d = FF_IDLE; + end + + end + default: begin + ff_state_d = FF_IDLE; + end + endcase + end + always_comb begin : fs_fsm_block + fs_counter_d = fs_counter_q; + fs_state_d = fs_state_q; + + acc_fs_d = acc_fs_q; + id_fs_d = id_fs_q; + + unique case(fs_state_q) + FS_IDLE: begin + fs_counter_d = '0; + if(ff_state_q == FF_DONE ) begin //&& fs_counter_d == '0 was in if clause. 
+ fs_state_d = FS_ACTIVE; + //fs_counter_d = fs_counter_q + 1; + + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + end + + end + FS_ACTIVE: begin + if(clear == 1'b1) begin + fs_counter_d = '0; + fs_state_d = FS_IDLE; + end else begin + if(fs_counter_q == LastRow-2) begin + fs_counter_d = fs_counter_q + 1; + fs_state_d = FS_LAST; + end else begin + fs_counter_d = fs_counter_q + 1; + end + // fs_counter_d = '0; + // if(ff_state_q == FF_DONE) begin //stay in active mode, load new inputs + // fs_state_d = FS_ACTIVE; + + // acc_fs_d = acc_reg_q; + // id_fs_d = id_ff_q; + // end + // if(ff_state_q == FF_IDLE) begin + // fs_state_d = FS_DONE; + // end + // end else begin + // fs_counter_d = fs_counter_q + 1; + // end + end + end + FS_LAST: begin + fs_counter_d = '0; + if(ff_state_q == FF_DONE) begin //stay in active mode, load new inputs + fs_state_d = FS_ACTIVE; + + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + end + if(ff_state_q == FF_IDLE) begin + fs_state_d = FS_IDLE; + end + + end + default: begin + fs_state_d = FS_IDLE; + end + + endcase + end + + always_comb begin : dr_fsm_block + dr_state_d = dr_state_q; + dr_counter_d = dr_counter_q; + + dest_reg_d = dest_reg_q; + id_dr_d = id_dr_q; + unique case(dr_state_q) + DR_IDLE: begin + dr_counter_d = '0; + if(fs_state_q == FS_LAST) begin //fs_counter_d == LastRow ) && (fs_counter_q == LastRow - 1 + dr_state_d = DR_ACTIVE; + //dr_counter_d = dr_counter_q + 1; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + + end + DR_ACTIVE: begin + if(clear == 1'b1) begin + dr_counter_d = '0; + dr_state_d = DR_IDLE; + end else begin + if(dr_counter_q == LastRow) begin + if(res_wready_i == 1'b0) begin + dr_state_d = dr_state_q; //stall the pipeline + end else begin + dr_counter_d = '0; + if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs (fs_counter_d == LastRow - 1 ) && (fs_counter_q == LastRow - 2) + dr_state_d = DR_ACTIVE; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + if(fs_state_q == FS_IDLE) begin + 
dr_state_d = DR_DONE; + end + end //else begin + // dr_state_d = DR_ACTIVE; + // dest_reg_d = acc_fs_q; + // id_dr_d = id_fs_q; + // end + end else begin + dr_counter_d = dr_counter_q + 1; + end + end + + end + DR_DONE: begin //theoretically we don't need this state. + dr_state_d = DR_IDLE; + end + default: begin + dr_state_d = DR_IDLE; + end + + endcase + - pump = ff_enable | fs_enable | dr_enable ; - mask_req = (dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) & finished_q & ~finished_ack_i; end quadrilatero_skewer #( @@ -313,9 +520,12 @@ module quadrilatero_systolic_array #( ff_counter_q <= '0; fs_counter_q <= '0; dr_counter_q <= '0; - ff_active_q <= '0; - fs_active_q <= '0; - dr_active_q <= '0; + // ff_active_q <= '0; + // fs_active_q <= '0; + // dr_active_q <= '0; + ff_state_q <= FF_IDLE; + fs_state_q <= FS_IDLE; + dr_state_q <= DR_IDLE; data_reg_q <= '0; acc_reg_q <= '0; weight_reg_q <= '0; @@ -331,9 +541,12 @@ module quadrilatero_systolic_array #( ff_counter_q <= ff_counter_d ; fs_counter_q <= fs_counter_d ; dr_counter_q <= dr_counter_d ; - ff_active_q <= ff_active_d ; - fs_active_q <= fs_active_d ; - dr_active_q <= dr_active_d ; + // ff_active_q <= ff_active_d ; + // fs_active_q <= fs_active_d ; + // dr_active_q <= dr_active_d ; + ff_state_q <= ff_state_d; + fs_state_q <= fs_state_d; + dr_state_q <= dr_state_d; data_reg_q <= data_reg_d ; acc_reg_q <= acc_reg_d ; weight_reg_q <= weight_reg_d ; @@ -348,7 +561,7 @@ module quadrilatero_systolic_array #( end end - assign sa_ready_o = (ff_counter_d=='0) & ((ff_active_q &~ ff_counter_q=='0) | (~ff_active_q & ~fs_active_q & ~dr_active_q)); + assign sa_ready_o = (ff_counter_d=='0) & (((ff_state_q != FF_IDLE) &~ ff_counter_q=='0) | clear); assign sa_input_id_o = id_ff_q ; assign sa_output_id_o = id_dr_q ; assign finished_o = finished_q ; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv new file mode 100644 index 
000000000..9b63dd974 --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv @@ -0,0 +1,365 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +/* + +TODO: +- handle matrices operations with matrices < MESH_WIDTH based on the configuration CSRs + - basically you need to inject zeros instead of actual elements +*/ + +module quadrilatero_systolic_array #( + parameter int MESH_WIDTH = 4 , + parameter int DATA_WIDTH = 32 , + parameter int N_REGS = 8 , + parameter int ENABLE_SIMD = 1 , + localparam int N_ROWS = MESH_WIDTH , + localparam int ALEN = DATA_WIDTH * MESH_WIDTH, + parameter FPU = 1 +) ( + input logic clk_i , + input logic rst_ni , + + output logic sa_ready_o , + input logic start_i , + + // Only has effect if ENABLE_SIMD == 1 + input quadrilatero_pkg::sa_ctrl_t sa_ctrl_i , + + input logic [ $clog2(N_REGS)-1:0] data_reg_i , // data register + input logic [ $clog2(N_REGS)-1:0] acc_reg_i , // accumulator register + input logic [ $clog2(N_REGS)-1:0] weight_reg_i , // weight register + input logic [xif_pkg::X_ID_WIDTH-1:0] id_i , // id of the instruction + + // Weight Read Register Port + output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , + output logic [ $clog2(N_ROWS)-1:0] weight_rrowaddr_o , + input logic [ ALEN-1:0] weight_rdata_i , + input logic weight_rdata_valid_i, + output logic weight_rdata_ready_o, + output logic weight_rlast_o , + + // Data Read Register Port + output logic [ $clog2(N_REGS)-1:0] data_raddr_o , + output logic [ $clog2(N_ROWS)-1:0] data_rrowaddr_o , + input logic [ ALEN-1:0] data_rdata_i , + input logic data_rdata_valid_i , + output logic data_rdata_ready_o , + output logic data_rlast_o , + + // Accumulator Read Register Port + output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , + output logic [ $clog2(N_ROWS)-1:0] acc_rrowaddr_o , + input logic [ ALEN-1:0] acc_rdata_i , + 
input logic acc_rdata_valid_i , + output logic acc_rdata_ready_o , + output logic acc_rlast_o , + + // Accumulator Out Write Register Port + output logic [ $clog2(N_REGS)-1:0] res_waddr_o , + output logic [ $clog2(N_ROWS)-1:0] res_wrowaddr_o , + output logic [ ALEN-1:0] res_wdata_o , + output logic res_we_o , + output logic res_wlast_o , + input logic res_wready_i , + + // RF Instruction ID + output logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id_o , + output logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id_o , + + // Finish + output logic finished_o , + input logic finished_ack_i , + output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o +); + + logic ff_active_d ; + logic ff_active_q ; + logic fs_active_d ; + logic fs_active_q ; + logic dr_active_d ; + logic dr_active_q ; + logic set_ff_active ; + logic rst_ff_active ; + logic set_fs_active ; + logic rst_fs_active ; + logic set_dr_active ; + logic rst_dr_active ; + logic valid ; + logic clear ; + logic ff_enable ; + logic fs_enable ; + logic dr_enable ; + logic pump ; + logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; + logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_q ; + logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_q ; + + logic [ $clog2(N_REGS)-1:0] data_reg_d ; // Data register + logic [ $clog2(N_REGS)-1:0] data_reg_q ; // Data register + logic [ $clog2(N_REGS)-1:0] acc_reg_d ; // Accumulator register -- FF Stage + logic [ $clog2(N_REGS)-1:0] acc_reg_q ; // Accumulator register -- FF Stage + logic [ $clog2(N_REGS)-1:0] weight_reg_q ; // Weight register + logic [ $clog2(N_REGS)-1:0] weight_reg_d ; // Weight register + quadrilatero_pkg::sa_ctrl_t sa_ctrl_d ; + quadrilatero_pkg::sa_ctrl_t sa_ctrl_q ; + + logic [ $clog2(N_REGS)-1:0] acc_fs_q ; // Accumulator register -- FS Stage + logic [ $clog2(N_REGS)-1:0] acc_fs_d ; // Accumulator register -- FS Stage + logic [ 
$clog2(N_REGS)-1:0] dest_reg_q ; // Accumulator register -- DR Stage + logic [ $clog2(N_REGS)-1:0] dest_reg_d ; // Accumulator register -- DR Stage + + logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_q ; + + logic finished_d ; + logic finished_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_d; + logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_q; + logic mask_req ; + + quadrilatero_pkg::sa_ctrl_t [MESH_WIDTH-1:0] sa_ctrl_mesh_skewed; + + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] data_mesh_skewed ; + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] acc_mesh_skewed ; + logic [MESH_WIDTH-1:0][MESH_WIDTH-1:0][DATA_WIDTH-1:0] weight_mesh_skewed ; + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] res_mesh_skewed ; + + //--------------------------------------------------------------------- + + always_comb begin: rf_block + // Weight Read Register Port + weight_raddr_o = weight_reg_q ; + weight_rrowaddr_o = ff_counter_q ; + weight_rdata_ready_o = ff_active_q &~ mask_req ; + weight_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + + // Data Read Register Port + data_raddr_o = data_reg_q ; + data_rrowaddr_o = ff_counter_q ; + data_rdata_ready_o = ff_active_q &~ mask_req ; + data_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + + // Accumulator Read Register Port + acc_raddr_o = acc_reg_q ; + acc_rrowaddr_o = ff_counter_q ; + acc_rdata_ready_o = ff_active_q &~ mask_req ; + acc_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + + // Accumulator Out Write Register Port + res_waddr_o = dest_reg_q ; + res_wrowaddr_o = dr_counter_q ; + res_we_o = dr_active_q &~ mask_req ; + res_wlast_o = dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + end + + always_comb begin: next_value + + // Configuration + data_reg_d = (set_ff_active) ? 
data_reg_i : data_reg_q ; + acc_reg_d = (set_ff_active) ? acc_reg_i : acc_reg_q ; + weight_reg_d = (set_ff_active) ? weight_reg_i : weight_reg_q ; + sa_ctrl_d = (set_ff_active) ? sa_ctrl_i : sa_ctrl_q ; + + acc_fs_d = (set_fs_active) ? acc_reg_q : acc_fs_q ; + dest_reg_d = (set_dr_active) ? acc_fs_q : dest_reg_q ; + + id_ff_d = (set_ff_active) ? id_i : id_ff_q ; + id_fs_d = (set_fs_active) ? id_ff_q : id_fs_q ; + id_dr_d = (set_dr_active) ? id_fs_q : id_dr_q ; + + // Finished + finished_d = (res_wready_i && res_wlast_o) ? 1'b1 : + (finished_ack_i ) ? 1'b0 : finished_q; + + finished_instr_id_d = (res_wready_i && res_wlast_o) ? id_dr_q : + (finished_ack_i ) ? '0 : finished_instr_id_q; + + // Counters + ff_counter_d = (ff_enable && ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : + (ff_enable ) ? ff_counter_q + 1 : ff_counter_q; + + fs_counter_d = (clear ) || + (fs_enable && fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : + (fs_enable ) ? fs_counter_q + 1 : fs_counter_q; + + dr_counter_d = (clear ) || + (dr_enable && dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : + (dr_enable ) ? dr_counter_q + 1 : dr_counter_q; + + // Active signals + ff_active_d = set_ff_active ? 1'b1 : + rst_ff_active ? 1'b0 : ff_active_q; + + fs_active_d = set_fs_active ? 1'b1 : + rst_fs_active ? 1'b0 : fs_active_q; + + dr_active_d = set_dr_active ? 1'b1 : + rst_dr_active ? 
1'b0 : dr_active_q; + end + + always_comb begin: ctrl_block + valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; + clear = ~ff_active_q & ~fs_active_q & ~dr_active_q; + + ff_enable = ff_active_q & valid ; + // fs_enable = fs_active_q & (valid | ~ff_active_q); + // dr_enable = dr_active_q & (valid | ~ff_active_q); + fs_enable = fs_active_q; + dr_enable = dr_active_q; + + set_ff_active = ff_counter_d=='0 & start_i ; + set_fs_active = fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); + set_dr_active = dr_counter_d=='0 & fs_counter_d==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-2); + + rst_ff_active = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & ff_counter_d=='0 ; + rst_fs_active = fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q=='0; + rst_dr_active = dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & dr_counter_d=='0 & fs_counter_d=='0 & fs_counter_q=='0; + + pump = ff_enable | fs_enable | dr_enable ; + mask_req = (dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) & finished_q & ~finished_ack_i; + end + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) skewer_inst_data ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (data_rdata_i ), + .data_o (data_mesh_skewed) + ); + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) skewer_inst_acc ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (acc_rdata_i ), + .data_o (acc_mesh_skewed) + ); + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(4) + ) skewer_inst_ctrl ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i ({MESH_WIDTH{sa_ctrl_q}}), + .data_o (sa_ctrl_mesh_skewed ) + ); + + quadrilatero_wl_stage #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) weight_inst ( + .clk_i , + .rst_ni , + + .ff_counter (ff_counter_q ), + .clear_i (clear ), + .pump_i (pump ), + 
.weight_rdata_valid_i , + + // Weight Data + .weight_rdata_i , + .weight_rdata_o (weight_mesh_skewed ) + ); + + quadrilatero_mesh #( + .MESH_WIDTH (MESH_WIDTH ), + .ENABLE_SIMD(ENABLE_SIMD), + .FPU (FPU ) + ) mesh_inst ( + .clk_i, + .rst_ni, + + .pump_i (pump ), + .sa_ctrl_i (sa_ctrl_mesh_skewed ), + + .data_i (data_mesh_skewed ), + .acc_i (acc_mesh_skewed ), + .weight_i (weight_mesh_skewed ), + .acc_o (res_mesh_skewed ) + ); + + quadrilatero_deskewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) deskewer_inst_acc ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (res_mesh_skewed), + .data_o (res_wdata_o ) + ); + + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + ff_counter_q <= '0; + fs_counter_q <= '0; + dr_counter_q <= '0; + ff_active_q <= '0; + fs_active_q <= '0; + dr_active_q <= '0; + data_reg_q <= '0; + acc_reg_q <= '0; + weight_reg_q <= '0; + sa_ctrl_q <= '0; + acc_fs_q <= '0; + dest_reg_q <= '0; + id_ff_q <= '0; + id_fs_q <= '0; + id_dr_q <= '0; + finished_q <= '0; + finished_instr_id_q <= '0; + end else begin + ff_counter_q <= ff_counter_d ; + fs_counter_q <= fs_counter_d ; + dr_counter_q <= dr_counter_d ; + ff_active_q <= ff_active_d ; + fs_active_q <= fs_active_d ; + dr_active_q <= dr_active_d ; + data_reg_q <= data_reg_d ; + acc_reg_q <= acc_reg_d ; + weight_reg_q <= weight_reg_d ; + sa_ctrl_q <= sa_ctrl_d ; + acc_fs_q <= acc_fs_d ; + dest_reg_q <= dest_reg_d ; + id_ff_q <= id_ff_d ; + id_fs_q <= id_fs_d ; + id_dr_q <= id_dr_d ; + finished_q <= finished_d ; + finished_instr_id_q <= finished_instr_id_d ; + end + end + + assign sa_ready_o = (ff_counter_d=='0) & ((ff_active_q &~ ff_counter_q=='0) | (~ff_active_q & ~fs_active_q & ~dr_active_q)); + assign sa_input_id_o = id_ff_q ; + assign sa_output_id_o = id_dr_q ; + assign finished_o = finished_q ; + assign finished_instr_id_o = finished_instr_id_q; + + // -------------------------------------------------------------------- + + // Assertions + if 
(MESH_WIDTH < 2) begin + $error( + "[systolic_array] MESH_WIDTH must be at least 2.\n" + ); + end +endmodule From a355ecde2e70e7830e5df87898588d14fbc1d688 Mon Sep 17 00:00:00 2001 From: Angelo Nujic Date: Wed, 26 Mar 2025 15:55:56 +0100 Subject: [PATCH 05/18] SA control using 3 FSMs for FF, FS, DR --- .../rtl/quadrilatero_systolic_array.sv | 102 +---- .../rtl/quadrilatero_systolic_array_old.sv | 365 ------------------ 2 files changed, 4 insertions(+), 463 deletions(-) delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index fcb1fab6f..f1757dc91 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -95,23 +95,8 @@ module quadrilatero_systolic_array #( fs_state_e fs_state_d, fs_state_q; dr_state_e dr_state_d, dr_state_q; localparam LastRow = $clog2(MESH_WIDTH)'(MESH_WIDTH-1); - // logic ff_active_d ; - // logic ff_active_q ; - // logic fs_active_d ; - // logic fs_active_q ; - // logic dr_active_d ; - // logic dr_active_q ; - // logic set_ff_active ; - // logic rst_ff_active ; - // logic set_fs_active ; - // logic rst_fs_active ; - // logic set_dr_active ; - // logic rst_dr_active ; logic valid ; logic clear ; - // logic ff_enable ; - // logic fs_enable ; - // logic dr_enable ; logic pump ; logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; @@ -182,74 +167,22 @@ module quadrilatero_systolic_array #( res_wlast_o = dr_counter_q==LastRow; end - always_comb begin: next_value + always_comb begin: finished_signal - // Configuration - //data_reg_d = (set_ff_active) ? data_reg_i : data_reg_q ; - //acc_reg_d = (set_ff_active) ? acc_reg_i : acc_reg_q ; - //weight_reg_d = (set_ff_active) ? weight_reg_i : weight_reg_q ; - //sa_ctrl_d = (set_ff_active) ? 
sa_ctrl_i : sa_ctrl_q ; - - //acc_fs_d = (set_fs_active) ? acc_reg_q : acc_fs_q ; - //dest_reg_d = (set_dr_active) ? acc_fs_q : dest_reg_q ; - - //id_ff_d = (set_ff_active) ? id_i : id_ff_q ; - //id_fs_d = (set_fs_active) ? id_ff_q : id_fs_q ; - //id_dr_d = (set_dr_active) ? id_fs_q : id_dr_q ; - - // Finished finished_d = (res_wready_i && res_wlast_o) ? 1'b1 : (finished_ack_i ) ? 1'b0 : finished_q; finished_instr_id_d = (res_wready_i && res_wlast_o) ? id_dr_q : (finished_ack_i ) ? '0 : finished_instr_id_q; - - // Counters - //ff_counter_d = (ff_enable && ff_counter_q==LastRow) ? '0 : - // (ff_enable ) ? ff_counter_q + 1 : ff_counter_q; - - //fs_counter_d = (clear ) || - // (fs_enable && fs_counter_q==LastRow) ? '0 : - // (fs_enable ) ? fs_counter_q + 1 : fs_counter_q; - - //dr_counter_d = (clear ) || - // (dr_enable && dr_counter_q==LastRow) ? '0 : - // (dr_enable ) ? dr_counter_q + 1 : dr_counter_q; - - // Active signals - //ff_active_d = set_ff_active ? 1'b1 : - // rst_ff_active ? 1'b0 : ff_active_q; - - //fs_active_d = set_fs_active ? 1'b1 : - // rst_fs_active ? 1'b0 : fs_active_q; - - //dr_active_d = set_dr_active ? 1'b1 : - // rst_dr_active ? 
1'b0 : dr_active_q; end always_comb begin: ctrl_block valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - //clear = ~ff_active_q & ~fs_active_q & ~dr_active_q; if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q != DR_ACTIVE)) begin clear = 1'b1; end else begin clear = 1'b0; end - - //ff_enable = ff_active_q & valid ; - // fs_enable = fs_active_q & (valid | ~ff_active_q); - // dr_enable = dr_active_q & (valid | ~ff_active_q); - //fs_enable = fs_active_q; - //dr_enable = dr_active_q; - - //set_ff_active = ff_counter_d=='0 & start_i ; - //set_fs_active = fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q==LastRow; - //set_dr_active = dr_counter_d=='0 & fs_counter_d==LastRow & fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-2); - - //rst_ff_active = ff_counter_q==LastRow & ff_counter_d=='0 ; - //rst_fs_active = fs_counter_q==LastRow & fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q=='0; - //rst_dr_active = dr_counter_q==LastRow & dr_counter_d=='0 & fs_counter_d=='0 & fs_counter_q=='0; - if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q == DR_ACTIVE)) begin pump = 1'b1; end else begin @@ -261,7 +194,6 @@ module quadrilatero_systolic_array #( always_comb begin : ff_fsm_block ff_counter_d = ff_counter_q; ff_state_d = ff_state_q; - //Configuration data_reg_d = data_reg_q; acc_reg_d = acc_reg_q; weight_reg_d = weight_reg_q; @@ -273,7 +205,6 @@ module quadrilatero_systolic_array #( ff_counter_d = '0; if(start_i == 1'b1) begin ff_state_d = FF_ACTIVE; - //ff_counter_d = '1; // mayday data_reg_d = data_reg_i; acc_reg_d = acc_reg_i; weight_reg_d = weight_reg_i; @@ -324,9 +255,8 @@ module quadrilatero_systolic_array #( unique case(fs_state_q) FS_IDLE: begin fs_counter_d = '0; - if(ff_state_q == FF_DONE ) begin //&& fs_counter_d == '0 was in if clause. 
+ if(ff_state_q == FF_DONE ) begin fs_state_d = FS_ACTIVE; - //fs_counter_d = fs_counter_q + 1; acc_fs_d = acc_reg_q; id_fs_d = id_ff_q; @@ -344,19 +274,6 @@ module quadrilatero_systolic_array #( end else begin fs_counter_d = fs_counter_q + 1; end - // fs_counter_d = '0; - // if(ff_state_q == FF_DONE) begin //stay in active mode, load new inputs - // fs_state_d = FS_ACTIVE; - - // acc_fs_d = acc_reg_q; - // id_fs_d = id_ff_q; - // end - // if(ff_state_q == FF_IDLE) begin - // fs_state_d = FS_DONE; - // end - // end else begin - // fs_counter_d = fs_counter_q + 1; - // end end end FS_LAST: begin @@ -390,7 +307,6 @@ module quadrilatero_systolic_array #( dr_counter_d = '0; if(fs_state_q == FS_LAST) begin //fs_counter_d == LastRow ) && (fs_counter_q == LastRow - 1 dr_state_d = DR_ACTIVE; - //dr_counter_d = dr_counter_q + 1; dest_reg_d = acc_fs_q; id_dr_d = id_fs_q; end @@ -414,18 +330,14 @@ module quadrilatero_systolic_array #( if(fs_state_q == FS_IDLE) begin dr_state_d = DR_DONE; end - end //else begin - // dr_state_d = DR_ACTIVE; - // dest_reg_d = acc_fs_q; - // id_dr_d = id_fs_q; - // end + end end else begin dr_counter_d = dr_counter_q + 1; end end end - DR_DONE: begin //theoretically we don't need this state. 
+ DR_DONE: begin dr_state_d = DR_IDLE; end default: begin @@ -520,9 +432,6 @@ module quadrilatero_systolic_array #( ff_counter_q <= '0; fs_counter_q <= '0; dr_counter_q <= '0; - // ff_active_q <= '0; - // fs_active_q <= '0; - // dr_active_q <= '0; ff_state_q <= FF_IDLE; fs_state_q <= FS_IDLE; dr_state_q <= DR_IDLE; @@ -541,9 +450,6 @@ module quadrilatero_systolic_array #( ff_counter_q <= ff_counter_d ; fs_counter_q <= fs_counter_d ; dr_counter_q <= dr_counter_d ; - // ff_active_q <= ff_active_d ; - // fs_active_q <= fs_active_d ; - // dr_active_q <= dr_active_d ; ff_state_q <= ff_state_d; fs_state_q <= fs_state_d; dr_state_q <= dr_state_d; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv deleted file mode 100644 index 9b63dd974..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv +++ /dev/null @@ -1,365 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -/* - -TODO: -- handle matrices operations with matrices < MESH_WIDTH based on the configuration CSRs - - basically you need to inject zeros instead of actual elements -*/ - -module quadrilatero_systolic_array #( - parameter int MESH_WIDTH = 4 , - parameter int DATA_WIDTH = 32 , - parameter int N_REGS = 8 , - parameter int ENABLE_SIMD = 1 , - localparam int N_ROWS = MESH_WIDTH , - localparam int ALEN = DATA_WIDTH * MESH_WIDTH, - parameter FPU = 1 -) ( - input logic clk_i , - input logic rst_ni , - - output logic sa_ready_o , - input logic start_i , - - // Only has effect if ENABLE_SIMD == 1 - input quadrilatero_pkg::sa_ctrl_t sa_ctrl_i , - - input logic [ $clog2(N_REGS)-1:0] data_reg_i , // data register - input logic [ $clog2(N_REGS)-1:0] acc_reg_i , // accumulator register - input logic [ $clog2(N_REGS)-1:0] weight_reg_i , // weight register - input logic [xif_pkg::X_ID_WIDTH-1:0] id_i , // id of the instruction - - // Weight Read Register Port - output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] weight_rrowaddr_o , - input logic [ ALEN-1:0] weight_rdata_i , - input logic weight_rdata_valid_i, - output logic weight_rdata_ready_o, - output logic weight_rlast_o , - - // Data Read Register Port - output logic [ $clog2(N_REGS)-1:0] data_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] data_rrowaddr_o , - input logic [ ALEN-1:0] data_rdata_i , - input logic data_rdata_valid_i , - output logic data_rdata_ready_o , - output logic data_rlast_o , - - // Accumulator Read Register Port - output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] acc_rrowaddr_o , - input logic [ ALEN-1:0] acc_rdata_i , - input logic acc_rdata_valid_i , - output logic acc_rdata_ready_o , - output logic acc_rlast_o , - - // Accumulator Out Write Register Port - output logic [ $clog2(N_REGS)-1:0] res_waddr_o , - output logic [ $clog2(N_ROWS)-1:0] 
res_wrowaddr_o , - output logic [ ALEN-1:0] res_wdata_o , - output logic res_we_o , - output logic res_wlast_o , - input logic res_wready_i , - - // RF Instruction ID - output logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id_o , - output logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id_o , - - // Finish - output logic finished_o , - input logic finished_ack_i , - output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o -); - - logic ff_active_d ; - logic ff_active_q ; - logic fs_active_d ; - logic fs_active_q ; - logic dr_active_d ; - logic dr_active_q ; - logic set_ff_active ; - logic rst_ff_active ; - logic set_fs_active ; - logic rst_fs_active ; - logic set_dr_active ; - logic rst_dr_active ; - logic valid ; - logic clear ; - logic ff_enable ; - logic fs_enable ; - logic dr_enable ; - logic pump ; - logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; - logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_q ; - logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_q ; - - logic [ $clog2(N_REGS)-1:0] data_reg_d ; // Data register - logic [ $clog2(N_REGS)-1:0] data_reg_q ; // Data register - logic [ $clog2(N_REGS)-1:0] acc_reg_d ; // Accumulator register -- FF Stage - logic [ $clog2(N_REGS)-1:0] acc_reg_q ; // Accumulator register -- FF Stage - logic [ $clog2(N_REGS)-1:0] weight_reg_q ; // Weight register - logic [ $clog2(N_REGS)-1:0] weight_reg_d ; // Weight register - quadrilatero_pkg::sa_ctrl_t sa_ctrl_d ; - quadrilatero_pkg::sa_ctrl_t sa_ctrl_q ; - - logic [ $clog2(N_REGS)-1:0] acc_fs_q ; // Accumulator register -- FS Stage - logic [ $clog2(N_REGS)-1:0] acc_fs_d ; // Accumulator register -- FS Stage - logic [ $clog2(N_REGS)-1:0] dest_reg_q ; // Accumulator register -- DR Stage - logic [ $clog2(N_REGS)-1:0] dest_reg_d ; // Accumulator register -- DR Stage - - logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_q ; - 
logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_q ; - - logic finished_d ; - logic finished_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_d; - logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_q; - logic mask_req ; - - quadrilatero_pkg::sa_ctrl_t [MESH_WIDTH-1:0] sa_ctrl_mesh_skewed; - - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] data_mesh_skewed ; - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] acc_mesh_skewed ; - logic [MESH_WIDTH-1:0][MESH_WIDTH-1:0][DATA_WIDTH-1:0] weight_mesh_skewed ; - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] res_mesh_skewed ; - - //--------------------------------------------------------------------- - - always_comb begin: rf_block - // Weight Read Register Port - weight_raddr_o = weight_reg_q ; - weight_rrowaddr_o = ff_counter_q ; - weight_rdata_ready_o = ff_active_q &~ mask_req ; - weight_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); - - // Data Read Register Port - data_raddr_o = data_reg_q ; - data_rrowaddr_o = ff_counter_q ; - data_rdata_ready_o = ff_active_q &~ mask_req ; - data_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); - - // Accumulator Read Register Port - acc_raddr_o = acc_reg_q ; - acc_rrowaddr_o = ff_counter_q ; - acc_rdata_ready_o = ff_active_q &~ mask_req ; - acc_rlast_o = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); - - // Accumulator Out Write Register Port - res_waddr_o = dest_reg_q ; - res_wrowaddr_o = dr_counter_q ; - res_we_o = dr_active_q &~ mask_req ; - res_wlast_o = dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); - end - - always_comb begin: next_value - - // Configuration - data_reg_d = (set_ff_active) ? data_reg_i : data_reg_q ; - acc_reg_d = (set_ff_active) ? acc_reg_i : acc_reg_q ; - weight_reg_d = (set_ff_active) ? weight_reg_i : weight_reg_q ; - sa_ctrl_d = (set_ff_active) ? sa_ctrl_i : sa_ctrl_q ; - - acc_fs_d = (set_fs_active) ? 
acc_reg_q : acc_fs_q ; - dest_reg_d = (set_dr_active) ? acc_fs_q : dest_reg_q ; - - id_ff_d = (set_ff_active) ? id_i : id_ff_q ; - id_fs_d = (set_fs_active) ? id_ff_q : id_fs_q ; - id_dr_d = (set_dr_active) ? id_fs_q : id_dr_q ; - - // Finished - finished_d = (res_wready_i && res_wlast_o) ? 1'b1 : - (finished_ack_i ) ? 1'b0 : finished_q; - - finished_instr_id_d = (res_wready_i && res_wlast_o) ? id_dr_q : - (finished_ack_i ) ? '0 : finished_instr_id_q; - - // Counters - ff_counter_d = (ff_enable && ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : - (ff_enable ) ? ff_counter_q + 1 : ff_counter_q; - - fs_counter_d = (clear ) || - (fs_enable && fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : - (fs_enable ) ? fs_counter_q + 1 : fs_counter_q; - - dr_counter_d = (clear ) || - (dr_enable && dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) ? '0 : - (dr_enable ) ? dr_counter_q + 1 : dr_counter_q; - - // Active signals - ff_active_d = set_ff_active ? 1'b1 : - rst_ff_active ? 1'b0 : ff_active_q; - - fs_active_d = set_fs_active ? 1'b1 : - rst_fs_active ? 1'b0 : fs_active_q; - - dr_active_d = set_dr_active ? 1'b1 : - rst_dr_active ? 
1'b0 : dr_active_q; - end - - always_comb begin: ctrl_block - valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - clear = ~ff_active_q & ~fs_active_q & ~dr_active_q; - - ff_enable = ff_active_q & valid ; - // fs_enable = fs_active_q & (valid | ~ff_active_q); - // dr_enable = dr_active_q & (valid | ~ff_active_q); - fs_enable = fs_active_q; - dr_enable = dr_active_q; - - set_ff_active = ff_counter_d=='0 & start_i ; - set_fs_active = fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1); - set_dr_active = dr_counter_d=='0 & fs_counter_d==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-2); - - rst_ff_active = ff_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & ff_counter_d=='0 ; - rst_fs_active = fs_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & fs_counter_d=='0 & ff_counter_d=='0 & ff_counter_q=='0; - rst_dr_active = dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1) & dr_counter_d=='0 & fs_counter_d=='0 & fs_counter_q=='0; - - pump = ff_enable | fs_enable | dr_enable ; - mask_req = (dr_counter_q==$clog2(MESH_WIDTH)'(MESH_WIDTH-1)) & finished_q & ~finished_ack_i; - end - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) skewer_inst_data ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (data_rdata_i ), - .data_o (data_mesh_skewed) - ); - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) skewer_inst_acc ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (acc_rdata_i ), - .data_o (acc_mesh_skewed) - ); - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(4) - ) skewer_inst_ctrl ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i ({MESH_WIDTH{sa_ctrl_q}}), - .data_o (sa_ctrl_mesh_skewed ) - ); - - quadrilatero_wl_stage #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) weight_inst ( - .clk_i , - .rst_ni , - - .ff_counter (ff_counter_q ), - .clear_i (clear ), - .pump_i (pump ), - 
.weight_rdata_valid_i , - - // Weight Data - .weight_rdata_i , - .weight_rdata_o (weight_mesh_skewed ) - ); - - quadrilatero_mesh #( - .MESH_WIDTH (MESH_WIDTH ), - .ENABLE_SIMD(ENABLE_SIMD), - .FPU (FPU ) - ) mesh_inst ( - .clk_i, - .rst_ni, - - .pump_i (pump ), - .sa_ctrl_i (sa_ctrl_mesh_skewed ), - - .data_i (data_mesh_skewed ), - .acc_i (acc_mesh_skewed ), - .weight_i (weight_mesh_skewed ), - .acc_o (res_mesh_skewed ) - ); - - quadrilatero_deskewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) deskewer_inst_acc ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (res_mesh_skewed), - .data_o (res_wdata_o ) - ); - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - ff_counter_q <= '0; - fs_counter_q <= '0; - dr_counter_q <= '0; - ff_active_q <= '0; - fs_active_q <= '0; - dr_active_q <= '0; - data_reg_q <= '0; - acc_reg_q <= '0; - weight_reg_q <= '0; - sa_ctrl_q <= '0; - acc_fs_q <= '0; - dest_reg_q <= '0; - id_ff_q <= '0; - id_fs_q <= '0; - id_dr_q <= '0; - finished_q <= '0; - finished_instr_id_q <= '0; - end else begin - ff_counter_q <= ff_counter_d ; - fs_counter_q <= fs_counter_d ; - dr_counter_q <= dr_counter_d ; - ff_active_q <= ff_active_d ; - fs_active_q <= fs_active_d ; - dr_active_q <= dr_active_d ; - data_reg_q <= data_reg_d ; - acc_reg_q <= acc_reg_d ; - weight_reg_q <= weight_reg_d ; - sa_ctrl_q <= sa_ctrl_d ; - acc_fs_q <= acc_fs_d ; - dest_reg_q <= dest_reg_d ; - id_ff_q <= id_ff_d ; - id_fs_q <= id_fs_d ; - id_dr_q <= id_dr_d ; - finished_q <= finished_d ; - finished_instr_id_q <= finished_instr_id_d ; - end - end - - assign sa_ready_o = (ff_counter_d=='0) & ((ff_active_q &~ ff_counter_q=='0) | (~ff_active_q & ~fs_active_q & ~dr_active_q)); - assign sa_input_id_o = id_ff_q ; - assign sa_output_id_o = id_dr_q ; - assign finished_o = finished_q ; - assign finished_instr_id_o = finished_instr_id_q; - - // -------------------------------------------------------------------- - - // Assertions - if 
(MESH_WIDTH < 2) begin - $error( - "[systolic_array] MESH_WIDTH must be at least 2.\n" - ); - end -endmodule From 85008c18a5e95617f249840eb27500d3776245ac Mon Sep 17 00:00:00 2001 From: Angelo Nujic Date: Sat, 29 Mar 2025 19:27:42 +0100 Subject: [PATCH 06/18] LSU control using FSM --- .../rtl/include/quadrilatero_pkg_new.sv | 118 +++++++ .../rtl/quadrilatero_dispatcher_new.sv | 321 +++++++++++++++++ .../quadrilatero/rtl/quadrilatero_lsu_old.sv | 310 +++++++++++++++++ .../rtl/quadrilatero_register_lsu.sv | 164 +++++---- .../rtl/quadrilatero_register_lsu_old.sv | 327 ++++++++++++++++++ .../rtl/quadrilatero_rf_sequencer_new.sv | 290 ++++++++++++++++ 6 files changed, 1456 insertions(+), 74 deletions(-) create mode 100644 hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_new.sv create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_new.sv create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_old.sv create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv diff --git a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_new.sv b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_new.sv new file mode 100644 index 000000000..6d7a05379 --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_new.sv @@ -0,0 +1,118 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Saverio Nasturzio +package quadrilatero_pkg; + + parameter int unsigned N_REGS = 8; + parameter int unsigned DATA_WIDTH = 32; + parameter int unsigned BUS_WIDTH = 128; + parameter int unsigned MESH_WIDTH = 4; + parameter int unsigned NUM_EXEC_UNITS = 3; // change me to add units + parameter int unsigned MAX_NUM_READ_OPERANDS = 3; + parameter int unsigned MAX_NUM_WRITE_OPERANDS = 1; + parameter int unsigned READ_PORTS = 4; // we'll have fewer write ports so we take the maximum one which is the number of READ PORTS for the rw_queue_t + parameter int unsigned WRITE_PORTS = 3; // + parameter int unsigned RF_READ_PORTS = 4; + parameter int unsigned RF_WRITE_PORTS = 3; + + localparam int unsigned N_ROWS = MESH_WIDTH ; + localparam int unsigned RLEN = DATA_WIDTH * MESH_WIDTH; + + + typedef enum logic [2:0] { + SIZE_32 = 1, // 32-bit operation + SIZE_16 = 2, // 16-bit operation + SIZE_8 = 4 // 8-bit operation + } datatype_t; + + typedef struct packed { + logic is_float; + datatype_t datatype; + } sa_ctrl_t; + + typedef struct packed { + logic [$clog2(N_REGS)-1:0] data_reg ; + logic [$clog2(N_REGS)-1:0] acc_reg ; + logic [$clog2(N_REGS)-1:0] weight_reg; + logic [xif_pkg::X_ID_WIDTH-1:0] id ; + sa_ctrl_t sa_ctrl ; + } sa_instr_t; + + typedef struct packed { + logic [32-1:0] stride; + logic [32-1:0] addr; + logic [$clog2(N_REGS)-1:0] operand_reg; + logic [xif_pkg::X_ID_WIDTH-1:0] id; + logic is_store; + } lsu_instr_t; + + typedef struct packed { + logic [31:0] n_col_bytes; + logic [31:0] n_rows; + } lsu_conf_t; + + typedef struct packed { + logic [xif_pkg::X_ID_WIDTH-1:0] id; + } rw_queue_t; + + localparam int unsigned WR_PORT = (WRITE_PORTS > 1) ? $clog2(WRITE_PORTS) : 1; + localparam int unsigned RD_PORT = (READ_PORTS > 1) ? 
$clog2(READ_PORTS ) : 1; + typedef enum logic [RD_PORT-1:0] { + SYSTOLIC_ARRAY_W, + SYSTOLIC_ARRAY_D, + SYSTOLIC_ARRAY_A, + LSU_R + } read_ports_t; + + typedef enum logic [WR_PORT-1:0] { + SYSTOLIC_ARRAY, + LSU_W, + RF_W + } write_ports_t; + + // Int formats + typedef enum logic [$clog2(NUM_EXEC_UNITS)-1:0] { + FU_SYSTOLIC_ARRAY = 0, + FU_LSU, + FU_RF + // add new units here + } execution_units_t; + + + localparam int unsigned WR_OPS = (MAX_NUM_WRITE_OPERANDS > 1) ? $clog2(MAX_NUM_WRITE_OPERANDS) : 1; + localparam int unsigned RD_OPS = (MAX_NUM_READ_OPERANDS > 1) ? $clog2(MAX_NUM_READ_OPERANDS ) : 1; + typedef struct packed { + logic [RD_OPS-1:0] n_read_ports; + logic [WR_OPS-1:0] n_write_ports; + // Where within read_ports the functional unit starts + logic [$clog2(READ_PORTS):0] base_offset_read; + logic [$clog2(READ_PORTS):0] base_offset_write; + } fu_ports_info; + + // Follow execution_units_t order + parameter fu_ports_info FU_INFO[NUM_EXEC_UNITS] = '{ + '{ // SYSTOLIC_ARRAY + n_read_ports: 3, + n_write_ports: 1, + base_offset_read: 0, + base_offset_write: 0 + }, + '{ // LSU_W + n_read_ports: 1, + n_write_ports: 1, + base_offset_read: 3, // forth element in read_ports_t + base_offset_write: 1 + }, + '{ // RF_W + n_read_ports: 0, + n_write_ports: 1, + base_offset_read: 4, + base_offset_write: 2 + } + }; + + + +endpackage diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_new.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_new.sv new file mode 100644 index 000000000..c8194822a --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_new.sv @@ -0,0 +1,321 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata +// Dispatcher: accepts one decoded instruction at a time, pushes its id into the rw queue of every matrix register it reads or writes, and pulses dispatch_o for the selected execution unit one cycle after acceptance. +module quadrilatero_dispatcher #( + parameter N_REGS = 8, + parameter NUM_EXEC_UNITS = 3 +) ( + input logic clk_i, + input logic rst_ni, + + // Outputs to RF sequencer + // We can share the entry as we fetch 1 instruction at a time + // NOTE: Actually maybe it's better to have more ports so that we can push all operands and not waste cycles + // NOTE: probably the 'lost' cycles are not lost because we can directly push the instruction to the queue even while pushing the operands and they can start execution and if needed stall since no entry in the rw_queue will be found + output quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry_o, + output logic [N_REGS-1:0] rw_queue_push_o, // one push strobe per matrix register + + // Inputs from RF Sequencer + input logic [N_REGS-1:0] rw_queue_full_i, // per-register queue-full flags; a full queue defers the push + + + // Instruction from Decoder + input logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_i, // id of the instruction + input logic [xif_pkg::X_NUM_RS -1:0][xif_pkg::X_RFR_WIDTH-1:0] rs_i, // Register file source operands for the offloaded instruction + input logic [xif_pkg::X_NUM_RS -1:0] rs_valid_i, // Validity of the register file source operand(s) + input quadrilatero_pkg::datatype_t datatype_i, + + + input logic [$clog2(quadrilatero_pkg::MAX_NUM_READ_OPERANDS)-1:0] n_matrix_operands_read_i, // how many reads to RF + + // IMPORTANT: Make sure the order of pushing does not impact or deadlock + input logic [quadrilatero_pkg::MAX_NUM_READ_OPERANDS-1:0][$clog2(N_REGS)-1:0] rf_read_regs_i, // which registers to read from + input logic rf_writeback_i, // whether we need to write to the register file + input logic [$clog2(N_REGS)-1:0] rf_writeback_reg_i, // which register to writeback to + input quadrilatero_pkg::execution_units_t exec_unit_i, // which exec unit + input logic is_store_i, // store to memory operation + input logic is_float_i, // float to arithmetic operation + + input logic instr_valid_i, // the decoder is offering an instruction + output logic
[xif_pkg::X_ID_WIDTH-1:0] instr_id_o, // id of the instruction out + output logic [xif_pkg::X_NUM_RS -1:0][xif_pkg::X_RFR_WIDTH-1:0] rs_o, // Register file source operands for the offloaded instruction + output logic [xif_pkg::X_NUM_RS -1:0] rs_valid_o, // Validity of the register file source operand(s) + output quadrilatero_pkg::datatype_t datatype_o, + output logic is_store_o, + output logic is_float_o, + + + output logic [$clog2(N_REGS)-1:0] reg_ms1_o, // registered rf_read_regs_i[0] + output logic [$clog2(N_REGS)-1:0] reg_ms2_o, // registered rf_read_regs_i[1] + output logic [$clog2(N_REGS)-1:0] reg_ms3_o, // registered rf_read_regs_i[2] + output logic [$clog2(N_REGS)-1:0] reg_md_o , // registered rf_writeback_reg_i + + // Backpressure towards Decoder + output logic instr_ready_o, + + // Handshake with the Execution Units' issue queues + input logic [NUM_EXEC_UNITS-1:0] issue_queue_full_i, + output logic [NUM_EXEC_UNITS-1:0] dispatch_o + + +); + +//------------------------------------------------------------------------------ + + typedef enum logic { + IDLE, // waiting for an issuable instruction + PUSH_OPERANDS // operand pushes in flight; the outstanding-push counter is enabled + } dispatcher_state_e; + + dispatcher_state_e state_q, state_d; + logic instr_ready; + logic can_issue_instr; + + logic [NUM_EXEC_UNITS-1:0] dispatch_d; + logic [NUM_EXEC_UNITS-1:0] dispatch_q; + + logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_d; // id of the instruction out + logic [xif_pkg::X_NUM_RS -1:0][xif_pkg::X_RFR_WIDTH-1:0] rs_d ; // Register file source operands for the offloaded instruction + logic [xif_pkg::X_NUM_RS -1:0] rs_valid_d; // Validity of the register file source operand(s) + quadrilatero_pkg::datatype_t datatype_d; + logic is_store_d; + logic is_float_d; + logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_q; // id of the instruction out + logic [xif_pkg::X_NUM_RS -1:0][xif_pkg::X_RFR_WIDTH-1:0] rs_q ; // Register file source operands for the offloaded instruction + logic [xif_pkg::X_NUM_RS -1:0] rs_valid_q; // Validity of the register file source operand(s) + quadrilatero_pkg::datatype_t datatype_q; + logic is_store_q; + logic is_float_q; + + logic [2:0][$clog2(N_REGS)-1:0] rreg_d ; + logic [$clog2(N_REGS)-1:0] wreg_d ; +
logic [2:0][$clog2(N_REGS)-1:0] rreg_q ; + logic [$clog2(N_REGS)-1:0] wreg_q ; + // ld_eq2/ld_eq3: read operand 2/3 targets the same register as an earlier valid read operand; ld_eqw: any such clash. ld_full*: the target rw queue is full. ld_reg*: hold that operand's push this cycle. + logic ld_eq2 ; + logic ld_eq3 ; + logic ld_eqw ; + logic ld_full1; + logic ld_full2; + logic ld_full3; + logic ld_fullw; + logic ld_reg1 ; + logic ld_reg2 ; + logic ld_reg3 ; + logic ld_regw ; + // push_operand*_q: push requested by the instruction accepted in the previous cycle + logic push_operand1_d; + logic push_operand2_d; + logic push_operand3_d; + logic push_operandw_d; + logic push_operand1_q; + logic push_operand2_q; + logic push_operand3_q; + logic push_operandw_q; + // back_push_op*_q: push that was held in the previous cycle and is retried + logic back_push_op1_d; + logic back_push_op2_d; + logic back_push_op3_d; + logic back_push_opw_d; + logic back_push_op1_q; + logic back_push_op2_q; + logic back_push_op3_q; + logic back_push_opw_q; + // reg*_valid: the operand has a push pending this cycle (fresh or retried) + logic reg1_valid; + logic reg2_valid; + logic reg3_valid; + logic regw_valid; + // Per-register push strobes for the read operands (rvalid) and for the write operand (wready) + logic [N_REGS-1:0] rvalid; + logic [N_REGS-1:0] wready; + // en_cnt: enable of the outstanding-push counter; delta: pushes issued this cycle; done: delta equals the outstanding count + logic en_cnt; + logic done ; + logic[2:0] delta ; + logic[2:0] outstanding_op_d; + logic[2:0] outstanding_op_q; + + +//------------------------------------------------------------------------------ + + assign can_issue_instr = instr_valid_i && ~issue_queue_full_i[exec_unit_i]; // instruction offered and the issue queue of its execution unit is not full + assign instr_ready = can_issue_instr && (state_q == IDLE | done); // accept when idle, or back-to-back when the current pushes finish this cycle + + always_comb begin: internal_signals_block + reg1_valid = (push_operand1_q) ? push_operand1_q : back_push_op1_q ; + reg2_valid = (push_operand2_q) ? push_operand2_q : back_push_op2_q ; + reg3_valid = (push_operand3_q) ? push_operand3_q : back_push_op3_q ; + regw_valid = (push_operandw_q) ?
push_operandw_q : back_push_opw_q ; + + ld_eq2 = (rreg_q[0] == rreg_q[1]) & (reg1_valid & reg2_valid); // operand 2 targets the same register as operand 1 + ld_eq3 = ((rreg_q[0] == rreg_q[2]) & (reg1_valid & reg3_valid) | + (rreg_q[1] == rreg_q[2]) & (reg2_valid & reg3_valid) ); // operand 3 targets the same register as operand 1 or 2 + ld_eqw = ld_eq2 | ld_eq3; // NOTE(review): wreg_q is not compared here; the write push is held whenever read operands clash - confirm this is intended + + ld_full1 = rw_queue_full_i[rreg_q[0]]; + ld_full2 = rw_queue_full_i[rreg_q[1]]; + ld_full3 = rw_queue_full_i[rreg_q[2]]; + ld_fullw = rw_queue_full_i[wreg_q ]; + + ld_reg1 = ld_full1 ; + ld_reg2 = ld_full2 | ld_eq2; + ld_reg3 = ld_full3 | ld_eq3; + ld_regw = ld_fullw | ld_eqw; + + delta = 3'b0; // count the pushes issued this cycle + for(int ii = 0; ii < N_REGS; ii++) begin + delta += {2'b0, rvalid[ii]}; + delta += {2'b0, wready[ii]}; + end + + done = (delta == outstanding_op_q); // every push still outstanding is issued this cycle + end + + always_comb begin: next_value + rreg_d = (instr_ready || state_q==IDLE) ? rf_read_regs_i : rreg_q; // sample the decoder inputs on acceptance or whenever idle, otherwise hold + wreg_d = (instr_ready || state_q==IDLE) ? rf_writeback_reg_i : wreg_q; + + rs_d = (instr_ready || state_q==IDLE) ? rs_i : rs_q ; + rs_valid_d = (instr_ready || state_q==IDLE) ? rs_valid_i : rs_valid_q; + instr_id_d = (instr_ready || state_q==IDLE) ? instr_id_i : instr_id_q; + datatype_d = (instr_ready || state_q==IDLE) ? datatype_i : datatype_q; + is_store_d = (instr_ready || state_q==IDLE) ? is_store_i : is_store_q; + is_float_d = (instr_ready || state_q==IDLE) ? is_float_i : is_float_q; + + dispatch_d = '0 ; + dispatch_d[exec_unit_i] = instr_ready; // registered: dispatch_o pulses in the cycle after acceptance + + push_operandw_d = rf_writeback_i & instr_ready; + push_operand1_d = (n_matrix_operands_read_i > 0) & instr_ready; + push_operand2_d = (n_matrix_operands_read_i > 1) & instr_ready; + push_operand3_d = (n_matrix_operands_read_i > 2) & instr_ready; + + back_push_op1_d = ld_reg1 ? reg1_valid : 1'b0; // a push held this cycle is retried in the next one + back_push_op2_d = ld_reg2 ? reg2_valid : 1'b0; + back_push_op3_d = ld_reg3 ? reg3_valid : 1'b0; + back_push_opw_d = ld_regw ?
regw_valid : 1'b0; + + outstanding_op_d = {1'b0,n_matrix_operands_read_i} + {2'b0, rf_writeback_i}; // pushes needed by the incoming instruction: its matrix reads plus one if it writes back + end + + always_comb begin: rw_queue_block + rvalid = '0; + wready = '0; + rvalid[rreg_q[0]] |= reg1_valid &~ ld_reg1; + rvalid[rreg_q[1]] |= reg2_valid &~ ld_reg2; + rvalid[rreg_q[2]] |= reg3_valid &~ ld_reg3; + wready[wreg_q ] = regw_valid &~ ld_regw; + for(int ii = 0; ii < N_REGS; ii++) begin + rw_queue_entry_o[ii].id = instr_id_q; // only the .id field of the entry is driven here + rw_queue_push_o [ii] = rvalid[ii] | wready[ii]; + end + end + + always_comb begin: fsm_block + en_cnt = 1'b0; + case (state_q) + IDLE: begin + if (can_issue_instr) begin + state_d = PUSH_OPERANDS; + end else begin + state_d = IDLE ; //@ loopback + end + end + PUSH_OPERANDS: begin + en_cnt = 1'b1; + if (done && !instr_ready) begin // all pushes issued and no back-to-back instruction accepted + state_d = IDLE; + end else begin + state_d = PUSH_OPERANDS ; //@ loopback + end + end + default: state_d = IDLE; + endcase + end + + delta_counter #( + .WIDTH(3), + .STICKY_OVERFLOW(1'b0) + ) delta_counter_i( + .clk_i , + .rst_ni , + .clear_i (1'b0) , // synchronous clear + .en_i (en_cnt) , // enable the counter + .load_i (instr_ready) , // load a new value + .down_i (1'b1) , // downcount, default is up + .delta_i (delta) , // pushes issued this cycle + .d_i (outstanding_op_d), // push count of the instruction being accepted + .q_o (outstanding_op_q), // compared against delta to form done + .overflow_o () + ); + + always_ff @(posedge clk_i or negedge rst_ni) begin : seq_block + if (!rst_ni) begin + rreg_q <= '0; + wreg_q <= '0; + dispatch_q <= '0; + rs_q <= '0; + rs_valid_q <= '0; + instr_id_q <= '0; + datatype_q <= quadrilatero_pkg::SIZE_32; + is_store_q <= '0; + is_float_q <= '0; + + back_push_op1_q <= 1'b0; + back_push_op2_q <= 1'b0; + back_push_op3_q <= 1'b0; + back_push_opw_q <= 1'b0; + push_operand1_q <= 1'b0; + push_operand2_q <= 1'b0; + push_operand3_q <= 1'b0; + push_operandw_q <= 1'b0; + state_q <= IDLE; + end else begin + rreg_q <= rreg_d ; + wreg_q <= wreg_d ; + dispatch_q <= dispatch_d; + rs_q <= rs_d ; + rs_valid_q <= rs_valid_d; + instr_id_q <= instr_id_d; + datatype_q <= datatype_d; + is_store_q <=
is_store_d; + is_float_q <= is_float_d; + + back_push_op1_q <= back_push_op1_d; + back_push_op2_q <= back_push_op2_d; + back_push_op3_q <= back_push_op3_d; + back_push_opw_q <= back_push_opw_d; + push_operand1_q <= push_operand1_d; + push_operand2_q <= push_operand2_d; + push_operand3_q <= push_operand3_d; + push_operandw_q <= push_operandw_d; + state_q <= state_d ; + end + end + + // Output assignments (instr_ready_o is combinational; every other output comes from a register) + assign instr_ready_o = instr_ready; + assign dispatch_o = dispatch_q; + assign rs_o = rs_q ; + assign rs_valid_o = rs_valid_q ; + assign instr_id_o = instr_id_q ; + assign datatype_o = datatype_q ; + assign is_store_o = is_store_q ; + assign is_float_o = is_float_q ; + + assign reg_ms1_o = rreg_q[0] ; + assign reg_ms2_o = rreg_q[1] ; + assign reg_ms3_o = rreg_q[2] ; + assign reg_md_o = wreg_q ; + + // Elaboration-time check: rreg_* and push_operand1..3 are written for exactly three read operands + if (quadrilatero_pkg::MAX_NUM_READ_OPERANDS != 3) begin + $error( + "[dispatcher] The quadrilatero_pkg::MAX_NUM_READ_OPERANDS needs to be 3 for the current implementation.\n" + ); + end +endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv new file mode 100644 index 000000000..cb5f9b2ff --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv @@ -0,0 +1,310 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details.
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata +// Strided load/store engine: issues one bus request per word for rows_i rows of cols_i words starting at src_ptr_i; read data is returned through a load FIFO, write data is taken from a store FIFO. +module quadrilatero_lsu #( + parameter int unsigned FIFO_DEPTH = 4, + parameter int unsigned DATA_WIDTH = 32 + +) ( + input logic clk_i , + input logic rst_ni , + + // Bus interface + output logic data_req_o , + output logic [ 31:0] data_addr_o , + output logic data_we_o , + output logic [DATA_WIDTH/8 - 1:0] data_be_o , + output logic [ DATA_WIDTH-1:0] data_wdata_o , + input logic data_gnt_i , + input logic data_rvalid_i , + input logic [ DATA_WIDTH-1:0] data_rdata_i , + + // Configuration + input logic start_i , // start transfer (MUST BE A PULSE!!!!!) + input logic write_i , // write transaction + output logic busy_o , // lsu busy: a transfer is running and is not on its terminating request + output logic terminate_o , // lsu done + + // Address + input logic [ 31:0] src_ptr_i , // base address + input logic [ 31:0] stride_i , // stride to move in memory from one row to the next one + input logic [ 31:0] rows_i , // how many rows we need to fetch + input logic [ 31:0] cols_i , // how many DATA_WIDTH-wide words per row + + // Output data + output logic [ DATA_WIDTH-1:0] load_fifo_output_o , + output logic load_fifo_valid_o , + output logic load_fifo_data_available_o , + input logic load_fifo_output_pop_i , + + // Input data + input logic [ DATA_WIDTH-1:0] store_fifo_input_i , + input logic store_fifo_push_i , + output logic store_fifo_space_available_o, + output logic store_fifo_empty_o + + +); + + localparam int unsigned DEPTH = (FIFO_DEPTH > 0) ? FIFO_DEPTH - 1 : 0; // depth of the load FIFO instance + localparam int unsigned Addr_Fifo_Depth = (FIFO_DEPTH > 1) ?
$clog2(FIFO_DEPTH) : 1; + localparam int unsigned LastFifoUsage = DEPTH - 1; // load_fifo_usage value that is flagged as almost full + + + logic terminate ; + // Remaining-row/column counters and request address generation + logic [ 31:0] rows_q ; + logic [ 31:0] rows_d ; + logic [ 31:0] cols_q ; + logic [ 31:0] cols_d ; + logic [ 31:0] src_ptr_inc ; + logic [ 31:0] addr ; + logic [ 31:0] addr_op2 ; + logic [ 31:0] ptr_q ; + logic [ 31:0] ptr_d ; + // Read-request signals (muxed onto the bus while the store FIFO is empty) + logic data_in_req ; + logic data_in_we ; + logic [ DATA_WIDTH/8-1:0] data_in_be ; + logic [ 31:0] data_in_addr ; + logic data_in_rvalid ; + logic [ DATA_WIDTH-1:0] data_in_rdata ; + // Load FIFO and the rd_head_q/rd_valid_q output register in front of it + logic [ DATA_WIDTH-1:0] load_fifo_input ; + logic [ DATA_WIDTH-1:0] load_fifo_data_out; + logic rd_valid_q ; + logic rd_valid_d ; + logic [ DATA_WIDTH-1:0] rd_head_q ; + logic [ DATA_WIDTH-1:0] rd_head_d ; + logic data_we_q ; + logic data_we_d ; + logic rvalid ; + logic load_fifo_pop ; + logic load_fifo_push ; + logic [Addr_Fifo_Depth-1:0] load_fifo_usage ; + logic load_fifo_alm_full; + logic load_fifo_full ; + logic load_fifo_empty ; + // Write-request signals (muxed onto the bus while the store FIFO holds data) + logic data_out_req ; + logic data_out_we ; + logic [ DATA_WIDTH/8-1:0] data_out_be ; + logic [ 31:0] data_out_addr ; + logic data_out_gnt ; + logic [ DATA_WIDTH-1:0] data_out_wdata ; + // Store FIFO status and output + logic store_fifo_full ; + logic store_fifo_empty ; + logic [ DATA_WIDTH-1:0] store_fifo_output ; + logic store_fifo_pop ; + + + enum { // NOTE(review): no explicit base type, so the state variables default to int + LSU_READY, + LSU_RUNNING + } + lsu_state_q, lsu_state_d; + + + always_comb begin : FSM_block + lsu_state_d = lsu_state_q; + + case (lsu_state_q) + LSU_READY: begin + if (start_i & |cols_i & |rows_i) begin // starts that request zero rows or zero columns are ignored + lsu_state_d = LSU_RUNNING; + end + end + LSU_RUNNING: begin + if (terminate && !start_i) begin // stay running when a new start arrives together with the last request + lsu_state_d = LSU_READY; + end + end + endcase + end + + always_comb begin : ctrl_block + terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING)); // the granted request is the last one: both counters are zero + load_fifo_valid_o = rd_valid_d; // next-cycle value of load_fifo_data_available_o + busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; + terminate_o = terminate; + end + + always_comb begin : addr_block + src_ptr_inc = DATA_WIDTH / 8; + addr_op2 = (cols_q ==
'0) ? stride_i : src_ptr_inc; // step by stride_i once the column counter is zero, otherwise by one word + addr = (start_i || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1))) ? src_ptr_i : ptr_q + addr_op2; // base address on start or for the first word, otherwise previous pointer plus the step + ptr_d = (data_gnt_i && data_req_o) ? addr : ptr_q; // the pointer only moves when a request is granted + end + + always_comb begin : counters_block + rows_d = rows_q; + cols_d = cols_q; + + if(start_i) begin // load the counters from rows_i/cols_i + if(data_gnt_i && data_req_o) begin // first request granted in the start cycle: it is already counted. NOTE(review): with rows_i == 1 and cols_i == 1 neither branch below runs and the counters keep their old values - confirm + if(cols_i > 1) begin + rows_d = rows_i - 1; + cols_d = cols_i - 2; + end else if (rows_i > 1) begin + rows_d = rows_i - 2; + cols_d = cols_i - 1; + end + end else begin + rows_d = rows_i - 1; + cols_d = cols_i - 1; + end + end else if (data_gnt_i && data_req_o) begin // one word granted: count down columns, then rows + if (cols_q > 0) cols_d = cols_q - 1; + else if (rows_q > 0) begin + cols_d = cols_i - 1; + rows_d = rows_q - 1; + end + end + end + + always_comb begin : read_obi + data_in_req = '0; + data_in_we = '0; + data_in_be = '0; + data_in_addr = '0; + + if (load_fifo_full == 1'b0 && load_fifo_alm_full == 1'b0) begin // reads are requested only while the load FIFO is neither full nor almost full + data_in_req = ~write_i & (start_i | lsu_state_q == LSU_RUNNING); + data_in_we = 1'b0 ; + data_in_be = '1 ; + data_in_addr = addr ; + end + end + + always_comb begin : write_obi + data_out_req = '0 ; + data_out_we = '0 ; + data_out_be = '0 ; + data_out_addr = '0 ; + data_out_wdata = store_fifo_output; + + if (!store_fifo_empty) begin // writes are requested only while the store FIFO holds data + data_out_req = start_i | lsu_state_q == LSU_RUNNING; + // data_out_we = 1'b1 ; + data_out_we = start_i | lsu_state_q == LSU_RUNNING; + data_out_be = '1 ; + data_out_addr = addr ; + end + end + + always_comb begin : obi_channel_signals + data_in_rvalid = 1'b0 ; + data_wdata_o = data_out_wdata; + data_out_gnt = data_gnt_i ; + data_in_rdata = data_rdata_i ; + + if(store_fifo_empty) begin // read transaction active + data_req_o = data_in_req ; + data_we_o = data_in_we ; + data_be_o = data_in_be ; + data_addr_o = data_in_addr ; + data_in_rvalid = data_rvalid_i ; + end else begin // write transaction active + data_req_o = data_out_req ; + data_we_o = data_out_we ; + data_be_o = data_out_be ; + data_addr_o = data_out_addr ; + end + end + +
always_comb begin : load_fifo_block + data_we_d = data_gnt_i && data_req_o && data_we_o; + rvalid = data_in_rvalid &~ data_we_q ; // rvalid seen in the cycle after a granted write is ignored + + load_fifo_alm_full = (load_fifo_usage == LastFifoUsage[Addr_Fifo_Depth-1:0]); + load_fifo_input = data_in_rdata; + load_fifo_push = (rvalid & rd_valid_q & ~load_fifo_output_pop_i) | (rvalid & ~load_fifo_empty); // queue incoming data when the head register is occupied and not popped, or when older data is already queued + load_fifo_pop = load_fifo_output_pop_i & ~load_fifo_empty; + + rd_valid_d = (rvalid & ~rd_valid_q) ? 1'b1 : + (load_fifo_output_pop_i & + load_fifo_empty & ~rvalid) ? 1'b0 : rd_valid_q; // set when data arrives into an empty head register; cleared when the head is popped with nothing queued and nothing arriving + + rd_head_d = (load_fifo_output_pop_i & load_fifo_empty & rvalid) || + (rvalid & ~rd_valid_q) ? load_fifo_input : + (load_fifo_output_pop_i & ~load_fifo_empty) ? load_fifo_data_out : rd_head_q; // take bus data directly when the head is empty or popped with nothing queued; otherwise refill from the FIFO on a pop + + load_fifo_output_o = rd_head_q ; + load_fifo_data_available_o = rd_valid_q; + end + + always_comb begin : store_fifo_block + store_fifo_pop = data_out_gnt & data_out_req; // one store word is consumed per granted write request + store_fifo_empty_o = store_fifo_empty; + store_fifo_space_available_o = ~store_fifo_full; + end + + fifo_v3 #( + .FALL_THROUGH (1'b0 ), + .DEPTH (DEPTH ), + .DATA_WIDTH (DATA_WIDTH ) + ) load_lsu_fifo_i ( + .clk_i , + .rst_ni , + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + + // status flags + .full_o (load_fifo_full ), + .empty_o (load_fifo_empty ), + .usage_o (load_fifo_usage ), + + // as long as the queue is not full we can push new data + .data_i (load_fifo_input ), + .push_i (load_fifo_push ), + + // as long as the queue is not empty we can pop new elements + .data_o (load_fifo_data_out ), + .pop_i (load_fifo_pop ) + ); + + fifo_v3 #( + .DEPTH(FIFO_DEPTH), + .DATA_WIDTH(DATA_WIDTH) + ) store_lsu_fifo_i ( + .clk_i , + .rst_ni , + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + // status flags + .full_o (store_fifo_full ), + .empty_o (store_fifo_empty ), + .usage_o ( ), + // as long as the queue is not full we can push new data + .data_i (store_fifo_input_i ), + .push_i (store_fifo_push_i ), + // as long as the queue is not empty we can pop new elements + .data_o
(store_fifo_output ), + .pop_i (store_fifo_pop ) + ); + + always_ff @(posedge clk_i, negedge rst_ni) begin : seq_block + if (~rst_ni) begin + lsu_state_q <= LSU_READY; + ptr_q <= '0 ; + rows_q <= '0 ; + cols_q <= '0 ; + rd_head_q <= '0 ; + rd_valid_q <= '0 ; + data_we_q <= '0 ; + end else begin + lsu_state_q <= lsu_state_d; + ptr_q <= ptr_d ; + rows_q <= rows_d ; + cols_q <= cols_d ; + rd_head_q <= rd_head_d ; + rd_valid_q <= rd_valid_d ; + data_we_q <= data_we_d ; + end + end + +endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index 5c68b7c1b..fdcc4e746 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -65,14 +65,16 @@ module quadrilatero_register_lsu #( ); localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; + localparam LastRow = $clog2(N_ROWS)'(N_ROWS - 1); // index of the last row (N_ROWS - 1), cast to $clog2(N_ROWS) bits - // typedef enum logic { - // IDLE, - // COUNTING_ROWS, - // LAST_ROW - // } register_lsu_state_e; + typedef enum logic [1:0] { + LSU_IDLE, // no row transfer in progress + LSU_LOAD, // counting rows written to the register file + LSU_STORE, // counting rows read from the register file + LSU_DONE // one state after a transfer ends; may chain into the next one + } register_lsu_state_e; - // register_lsu_state_e state_d, state_q; + register_lsu_state_e lsu_state_d, lsu_state_q; logic finished; logic [xif_pkg::X_ID_WIDTH-1:0] back_id_q; @@ -100,10 +102,6 @@ module quadrilatero_register_lsu #( logic start_q; logic start_d; - - logic valid_d; - logic valid_q; - logic write_q; logic write_d; logic terminate; @@ -112,10 +110,6 @@ module quadrilatero_register_lsu #( logic lsu_busy_q; logic lsu_ready; - logic mask_req; - - - logic [ 31:0] src_ptr_d ; logic [ 31:0] stride_d ; @@ -124,7 +118,7 @@ module quadrilatero_register_lsu #( logic [ 31:0] src_ptr ; logic [ 31:0] stride ; - assign mask_req = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & finished_o & ~finished_ack_i; + assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; // NOTE(review): this patch also removes the `logic mask_req;` declaration, so mask_req becomes an implicit net - confirm this is intended always_comb begin lsu_id_o = (write_i &~ load_fifo_data_available) ?
instr_id_i : back_id_q; finished = (write_q & terminate) | (~write_q & wlast_o & wready_i); @@ -138,15 +132,13 @@ module quadrilatero_register_lsu #( waddr_o = waddr_q; wrowaddr_o = counter_q ; wdata_o = load_fifo_data & ~data_mask; - wlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) && we_o && wready_i; - // wlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & wready_i; + end always_comb begin: read_from_RF rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; rrowaddr_o = counter_q ; raddr_o = operand_reg_i ; - rlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) && rdata_valid_i && rdata_ready_o; end always_comb begin: lsu_ctrl_block @@ -155,87 +147,111 @@ module quadrilatero_register_lsu #( store_fifo_push = rdata_ready_o && rdata_valid_i; lsu_ready = store_fifo_empty | (write_i &~ load_fifo_data_available &~ lsu_busy_q); start = (start_i | start_q) & lsu_ready; - //busy_o = (write_i ? busy_d : busy) | start_q; - busy_o = (write_i ? busy_d : busy | (load_fifo_data_available & counter_d == '0)) | start_q; + busy_o = (write_i ? busy_d : busy ) | start_q; stride = (start) ? stride_i : stride_q; src_ptr = (start) ? address_i : src_ptr_q; end always_comb begin: next_value - if (rlast_o || wlast_o) begin - counter_d = '0; - end else if ((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin - counter_d = counter_q + 1; - end else begin - counter_d = counter_q; - end - write_d = (write_i && rlast_o && rdata_valid_i) ? 1'b1 : (!write_i && !busy) ? 1'b0 : write_q; - - valid_d = (load_fifo_valid && counter_d==0 && ~valid_q) ? 1'b1 : - (load_fifo_valid && (counter_d==$clog2(N_ROWS)'(N_ROWS - 1)) && valid_q) ? 1'b0 : valid_q; // $clog2(N_ROWS)'(N_ROWS - 1) was 3, if there's a problem check here... - + start_d = start ? 1'b0 : (start_q | start_i) ? 1'b1 : start_q; stride_d = (start) ? stride_i : stride_q ; src_ptr_d = (start) ? address_i : src_ptr_q; - back_id_d = (load_fifo_valid && counter_d==0 && ~valid_q) ? 
instr_id_i : - rlast_o ? lsu_id_o : back_id_q; - - waddr_d = (load_fifo_valid && counter_d==0) ? operand_reg_i : waddr_q ; - busy_d = (write_i && rlast_o && rdata_valid_i) ? 1'b0 : (write_i && start_i) ? 1'b1 : busy_q; end - // always_comb begin: fsm_block - // counter_d = '0; - // rlast_o = 1'b0; - // rrowaddr_o = counter_q; - // wlast_o = 1'b0; - // wrowaddr_o = counter_q; - // case (state_q) - // IDLE: begin - // counter_d = '0; - // if((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin - // state_d = COUNTING_ROWS - // end - // state_d = IDLE - // end - // COUNTING_ROWS: begin - // if((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin - // counter_d = counter_q + 1; - // if(counter_d = $clog2(N_ROWS)'(N_ROWS - 1)) begin - // state_d = LAST_ROW; - // end else begin - // state_d = COUNTING_ROWS; - // end - // end - - // end - // LAST_ROW: begin - // if(rlast_o || wlast_o) begin - // state_d = IDLE; - // end - - - // end - // default: - // endcase - //end + always_comb begin: fsm_block // row-counter FSM; also drives rlast_o/wlast_o, back_id_d and waddr_d + lsu_state_d = lsu_state_q; + counter_d = counter_q; + rlast_o = 1'b0; + wlast_o = 1'b0; + + back_id_d = back_id_q; + waddr_d = waddr_q; + + case (lsu_state_q) + LSU_IDLE: begin // start a load when load data is valid, or a store when register data and store-FIFO space are available + if(load_fifo_valid && !write_i) begin + counter_d = '0; + back_id_d = instr_id_i; + waddr_d = operand_reg_i; + lsu_state_d = LSU_LOAD; + end else if (write_i & store_fifo_space_available && rdata_valid_i) begin + counter_d = '0; + lsu_state_d = LSU_STORE; + end + + end + LSU_LOAD: begin // advance one row per cycle in which load data is valid and the write port is ready + if(load_fifo_valid) begin + if(wready_i) begin + if(counter_q == LastRow) begin // last row: pulse wlast_o and capture instr_id_i/operand_reg_i + counter_d = '0; + wlast_o = 1'b1; + lsu_state_d = LSU_DONE; + back_id_d = instr_id_i; + waddr_d = operand_reg_i; + end else begin + counter_d = counter_q + 1; + end + end + end else begin // load_fifo_valid deasserted: reset the row counter and leave the load state + counter_d = '0; + lsu_state_d = LSU_DONE; + end + end + LSU_STORE: begin // advance one row per valid register read while write_i and store-FIFO space hold + if(store_fifo_space_available && write_i) begin + if(rdata_valid_i) begin + if(counter_q == LastRow) begin // last row: pulse rlast_o and capture lsu_id_o + counter_d = '0; +
rlast_o = 1'b1; + lsu_state_d = LSU_DONE; + back_id_d = lsu_id_o; + end else begin + counter_d = counter_q + 1; + end + end + end else begin // write_i or store-FIFO space dropped: reset the row counter and leave the store state + counter_d = '0; + back_id_d = instr_id_i; + lsu_state_d = LSU_DONE; + end + end + LSU_DONE: begin // chain into the next load/store (the counter advances in this cycle) or fall back to idle + if(load_fifo_valid && !write_i && wready_i) begin + counter_d = counter_q + 1; + lsu_state_d = LSU_LOAD; + end else if (write_i & store_fifo_space_available && rdata_valid_i) begin + counter_d = counter_q + 1; + lsu_state_d = LSU_STORE; + end else begin + lsu_state_d = LSU_IDLE; + end + end + default: begin + lsu_state_d = LSU_IDLE; + end + endcase + + end + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block if (!rst_ni) begin counter_q <= '0; waddr_q <= '0; back_id_q <= '0; start_q <= '0; - valid_q <= '0; write_q <= '0; busy_q <= '0; + lsu_state_q <= LSU_IDLE; lsu_busy_q <= '0; src_ptr_q <= '0; @@ -245,9 +261,9 @@ module quadrilatero_register_lsu #( back_id_q <= back_id_d; waddr_q <= waddr_d ; start_q <= start_d ; - valid_q <= valid_d ; write_q <= write_d ; busy_q <= busy_d ; + lsu_state_q <= lsu_state_d; lsu_busy_q <= busy; src_ptr_q <= src_ptr_d; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_old.sv new file mode 100644 index 000000000..caf3f74d9 --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_old.sv @@ -0,0 +1,327 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +/* +NOTE: for now we assume we fetch the entire row in 1 cycle.
TODO: Change the number of columns and adapt this to arbitrary BUS_WIDTH parameters +NOTE: we are not handling difference in endianness when loading reduced datawidths +*/ +// Register-file side of the LSU: wraps quadrilatero_lsu and moves rows between its load/store FIFOs and one matrix register, one row per handshake, indexed by a row counter. +module quadrilatero_register_lsu #( + parameter int unsigned BUS_WIDTH = 128, + parameter int unsigned N_REGS = 8, + parameter int unsigned N_ROWS = 4, + localparam int unsigned LLEN = BUS_WIDTH +) ( + input logic clk_i , + input logic rst_ni , + + // Bus interface + output logic data_req_o , + output logic [ 31:0] data_addr_o , + output logic data_we_o , + output logic [ BUS_WIDTH/8 - 1:0] data_be_o , + output logic [ BUS_WIDTH-1:0] data_wdata_o , + input logic data_gnt_i , + input logic data_rvalid_i , + input logic [ BUS_WIDTH-1:0] data_rdata_i , + + output logic[xif_pkg::X_ID_WIDTH-1:0] lsu_id_o , + + // Register Write Port for load unit + output logic [ $clog2(N_REGS)-1:0] waddr_o , + output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , + output logic [ LLEN-1:0] wdata_o , + output logic we_o , + output logic wlast_o , // row N_ROWS-1 is being written this cycle + input logic wready_i , // to stall the request in case the port is busy + + // Register Read Port for store unit + output logic [ $clog2(N_REGS)-1:0] raddr_o , + output logic [ $clog2(N_ROWS)-1:0] rrowaddr_o , + input logic [ LLEN-1:0] rdata_i , + input logic rdata_valid_i , + output logic rdata_ready_o , + output logic rlast_o , // row N_ROWS-1 is being read this cycle + + // Configuration Signals + input logic start_i , // start loading: MUST BE A PULSE + input logic write_i , // 1: store (rows are read from the register file), 0: load + output logic busy_o , + input logic [ 31:0] stride_i , // stride value + input logic [ 31:0] address_i , // address value + input logic [ $clog2(N_REGS)-1:0] operand_reg_i , // destination register (loads) / source register (stores) + input logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_i , // instruction id + input logic [ 31:0] n_bytes_cols_i , // we always fetch the entire row and then only take the elements we need + input logic [ 31:0] n_rows_i , // forwarded to the inner LSU as rows_i + + + output logic finished_o , // set on completion, held until finished_ack_i + input logic finished_ack_i , + output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o
//instruction id out + +); + + localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; // bus words per row; always 1 here because LLEN is defined as BUS_WIDTH + + // typedef enum logic { + // IDLE, + // COUNTING_ROWS, + // LAST_ROW + // } register_lsu_state_e; + + // register_lsu_state_e state_d, state_q; + + logic finished; + logic [xif_pkg::X_ID_WIDTH-1:0] back_id_q; + logic [xif_pkg::X_ID_WIDTH-1:0] back_id_d; + + logic [$clog2(N_ROWS)-1:0] counter_q; // row index driven on wrowaddr_o and rrowaddr_o + logic [$clog2(N_ROWS)-1:0] counter_d; + logic [$clog2(N_REGS)-1:0] waddr_q; + logic [$clog2(N_REGS)-1:0] waddr_d; + + logic [LLEN-1:0] load_fifo_data; + + logic load_fifo_data_available; + logic load_fifo_pop; + + logic store_fifo_space_available; + logic store_fifo_push; + logic store_fifo_empty; + logic [LLEN-1:0] store_fifo_data; + + logic [LLEN-1:0] data_mask; + logic load_fifo_valid; + logic busy; + logic start; + logic start_q; // remembers a start pulse until the inner LSU can take it + logic start_d; + + + logic valid_d; + logic valid_q; + + logic write_q; + logic write_d; + logic terminate; + logic busy_q; + logic busy_d; + + logic lsu_busy_q; + logic lsu_ready; + logic mask_req; + + + + + logic [ 31:0] src_ptr_d ; + logic [ 31:0] stride_d ; + logic [ 31:0] src_ptr_q ; + logic [ 31:0] stride_q ; + logic [ 31:0] src_ptr ; + logic [ 31:0] stride ; + + assign mask_req = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & finished_o & ~finished_ack_i; // blocks we_o and rdata_ready_o at the last row while an earlier completion is still unacknowledged + always_comb begin + lsu_id_o = (write_i &~ load_fifo_data_available) ?
instr_id_i : back_id_q; + finished = (write_q & terminate) | (~write_q & wlast_o & wready_i); // store: the inner LSU terminates after the rows were read; load: the last row is written + end + + + always_comb begin: write_to_RF + data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols + + we_o = load_fifo_data_available &~ mask_req; + waddr_o = waddr_q; + wrowaddr_o = counter_q ; + wdata_o = load_fifo_data & ~data_mask; + wlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) && we_o && wready_i; + // wlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & wready_i; + end + + always_comb begin: read_from_RF + rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; + rrowaddr_o = counter_q ; + raddr_o = operand_reg_i ; + rlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) && rdata_valid_i && rdata_ready_o; + end + + always_comb begin: lsu_ctrl_block + load_fifo_pop = wready_i; // load data is popped whenever the register write port is ready + store_fifo_data = rdata_i; + store_fifo_push = rdata_ready_o && rdata_valid_i; + lsu_ready = store_fifo_empty | (write_i &~ load_fifo_data_available &~ lsu_busy_q); + start = (start_i | start_q) & lsu_ready; // a pending or new start is issued once the inner LSU is ready + //busy_o = (write_i ? busy_d : busy) | start_q; + busy_o = (write_i ? busy_d : busy | (load_fifo_data_available & counter_d == '0)) | start_q; + + stride = (start) ? stride_i : stride_q; + src_ptr = (start) ? address_i : src_ptr_q; + end + + always_comb begin: next_value + if (rlast_o || wlast_o) begin // wrap the row counter after the last row + counter_d = '0; + end else if ((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin + counter_d = counter_q + 1; + end else begin + counter_d = counter_q; + end + + write_d = (write_i && rlast_o && rdata_valid_i) ? 1'b1 : + (!write_i && !busy) ? 1'b0 : write_q; + + valid_d = (load_fifo_valid && counter_d==0 && ~valid_q) ? 1'b1 : + (load_fifo_valid && (counter_d==$clog2(N_ROWS)'(N_ROWS - 1)) && valid_q) ? 1'b0 : valid_q; // $clog2(N_ROWS)'(N_ROWS - 1) was 3, if there's a problem check here... + + start_d = start ? 1'b0 : + (start_q | start_i) ? 1'b1 : start_q; + + stride_d = (start) ?
stride_i : stride_q ; + src_ptr_d = (start) ? address_i : src_ptr_q; + + back_id_d = (load_fifo_valid && counter_d==0 && ~valid_q) ? instr_id_i : + rlast_o ? lsu_id_o : back_id_q; + + waddr_d = (load_fifo_valid && counter_d==0) ? operand_reg_i : waddr_q ; // capture operand_reg_i while load data is valid and the next row index is zero + + busy_d = (write_i && rlast_o && rdata_valid_i) ? 1'b0 : + (write_i && start_i) ? 1'b1 : busy_q; + end + // always_comb begin: fsm_block (disabled draft of an FSM-based row counter, kept for reference) + // counter_d = '0; + // rlast_o = 1'b0; + // rrowaddr_o = counter_q; + // wlast_o = 1'b0; + + // wrowaddr_o = counter_q; + // case (state_q) + // IDLE: begin + // counter_d = '0; + // if((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin + // state_d = COUNTING_ROWS + // end + // state_d = IDLE + // end + // COUNTING_ROWS: begin + // if((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin + // counter_d = counter_q + 1; + // if(counter_d = $clog2(N_ROWS)'(N_ROWS - 1)) begin + // state_d = LAST_ROW; + // end else begin + // state_d = COUNTING_ROWS; + // end + // end + + // end + // LAST_ROW: begin + // if(rlast_o || wlast_o) begin + // state_d = IDLE; + // end + + + // end + // default: + // endcase + //end + + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + counter_q <= '0; + waddr_q <= '0; + back_id_q <= '0; + start_q <= '0; + valid_q <= '0; + write_q <= '0; + busy_q <= '0; + + lsu_busy_q <= '0; + src_ptr_q <= '0; + stride_q <= '0; + end else begin + counter_q <= counter_d; + back_id_q <= back_id_d; + waddr_q <= waddr_d ; + start_q <= start_d ; + valid_q <= valid_d ; + write_q <= write_d ; + busy_q <= busy_d ; + + lsu_busy_q <= busy; + src_ptr_q <= src_ptr_d; + stride_q <= stride_d ; + end + end + + quadrilatero_lsu #( + .FIFO_DEPTH (4 ), + .DATA_WIDTH (BUS_WIDTH) + ) lsunit_inst ( + + .clk_i , + .rst_ni , + + // Bus interface + .data_req_o , + .data_addr_o , +
.data_we_o , + .data_be_o , + .data_wdata_o , + .data_gnt_i , + .data_rvalid_i , + .data_rdata_i , + + //Configuration + .start_i (start ), + .write_i , + .busy_o (busy ), + .terminate_o (terminate ), + + // Address + .src_ptr_i (src_ptr ), + .stride_i (stride ), + .cols_i (MAX_EL_PER_ROW ), // one bus word per row + .rows_i (n_rows_i ), + + // Output data + .load_fifo_output_o (load_fifo_data ), + .load_fifo_valid_o (load_fifo_valid ), + .load_fifo_data_available_o (load_fifo_data_available ), + .load_fifo_output_pop_i (load_fifo_pop ), + + // Input data + .store_fifo_input_i (store_fifo_data ), + .store_fifo_push_i (store_fifo_push ), + .store_fifo_space_available_o (store_fifo_space_available ), + .store_fifo_empty_o (store_fifo_empty ) + ); + + //------------------------- + + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + finished_o <= '0; + finished_instr_id_o <= '0; + end else begin + if (finished) begin // latch the completion flag and the id held in back_id_q + finished_o <= '1; + finished_instr_id_o <= back_id_q; + end + if (finished_ack_i) begin // assigned last, so the acknowledge wins when both are high in the same cycle + finished_o <= '0; + finished_instr_id_o <= '0; + end + end + end + //--------------------- + + // Elaboration-time parameter check + if (N_ROWS < 2) begin + $error( + "[quadrilatero_register_lsu] N_ROWS must be at least 2.\n" + ); + end +endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv new file mode 100644 index 000000000..91b57d19d --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv @@ -0,0 +1,290 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details.
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata +// Orders functional-unit accesses to the register file: the dispatcher pushes one rw_queue_t entry per register into a FIFO for every row of that register, a read or write request is raised only while its id equals the scoreboard id of the addressed (register,row), and a granted access loads the next id from that row's FIFO into the scoreboard. +module quadrilatero_rf_sequencer #( + parameter READ_PORTS = 4 , + parameter WRITE_PORTS = 2 , + parameter N_REGS = 8 , + parameter N_ROWS = 4 , + parameter RLEN = 128 , + parameter RF_READ_PORTS = 3 , + parameter RF_WRITE_PORTS = 1, + parameter SYNC_REQ = 1, + + parameter N_ENTRIES = 3 // depth of each issue FIFO (one FIFO per register row) +) ( + + input logic clk_i, + input logic rst_ni, + // NOTE(review): rlast_i and wlast_i are not read anywhere in this module; rready_i qualifies rd_req in ctrl_block, so its '(must be PULSE)' comment looks copied from rlast_i (confirm). + // Input from FUs + input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i , + input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i , + output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o , + output logic [READ_PORTS-1:0] rvalid_o , + input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) + input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) + input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , + + + input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i , + input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i , + input logic [WRITE_PORTS-1:0][RLEN-1:0] wdata_i , + input logic [WRITE_PORTS-1:0] we_i , + input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) + output logic [WRITE_PORTS-1:0] wready_o , + input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , + + // Outputs to RF + output logic [RF_READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_o , + output logic [RF_READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_o , + input logic [RF_READ_PORTS-1:0][RLEN-1:0] rdata_i , + + + output logic [RF_WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_o , + output logic [RF_WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_o , + output logic [RF_WRITE_PORTS-1:0][RLEN-1:0] wdata_o , + output logic [RF_WRITE_PORTS-1:0] we_o , + + + // Inputs from Dispatcher + // We can share the entry as we fetch 1 instruction at a time + // NOTE: Actually maybe it's better to have more ports so that we can push all operands and not waste cycles +
input quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry_i , + input logic [N_REGS-1:0] rw_queue_push_i , + + // Outputs to Dispatcher + output logic [N_REGS-1:0] rw_queue_full_o +); + + //logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; + logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] r_clr ; // set in ctrl_block; only referenced by commented-out code below + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_full ; + + logic [WRITE_PORTS-1:0] wr_gnt ; + logic [WRITE_PORTS-1:0] wr_req ; + logic [READ_PORTS -1:0] rd_req ; + logic [READ_PORTS -1:0] rd_gnt ; + + logic [N_REGS-1:0] rw_queue_push ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; + + genvar ii,hh; + + assign rw_queue_pop = w_pop | r_pop; + assign rw_queue_entry = rw_queue_entry_i ; + assign rw_queue_push = rw_queue_push_i ; + + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; + assign rw_queue_pop_fifo = rw_queue_pop & ~rw_queue_empty; + // One fall-through FIFO per (register,row): all rows of register ii receive the same entry and push, but each row pops on its own; rw_queue_pop_fifo above holds the same expression as pop_i and is not referenced elsewhere in this module. + for (ii = 0; ii < N_REGS; ii++) begin: gen_fifo__regs + for (hh = 0; hh < N_ROWS; hh++) begin: gen_fifo__rows + fifo_v3 #( + .FALL_THROUGH (1'b1) , + .DEPTH (N_ENTRIES) , + .dtype (quadrilatero_pkg::rw_queue_t) + ) issue_queue_inst ( + .clk_i, + .rst_ni, + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + .usage_o ( ), + .full_o (rw_queue_full [ii][hh] ), + .empty_o (rw_queue_empty[ii][hh] ), + .data_i (rw_queue_entry[ii] ), // data to push into the queue + .push_i (rw_queue_push [ii] ), // data is valid and can be pushed to the queue + .data_o (rw_queue [ii][hh] ), // output data + .pop_i (rw_queue_pop [ii][hh] & ~rw_queue_empty[ii][hh]) // pop head from queue + ); + end + end + // Scoreboard update: a register is reported full when any of its row FIFOs is full; on a pop the scoreboard takes the id at the FIFO head, or 0 when that FIFO is empty. + always_comb begin: scoreboard_block + rw_queue_full_o = '0; + for (int i = 0; i < N_REGS;
i++) begin + for (int h = 0; h < N_ROWS; h++) begin + rw_queue_full_o[i] |= (rw_queue_full[i][h]); + + //head_valid[i][h] = scoreboard_q[i][h].wready | scoreboard_q[i][h].rvalid; + // NOTE(review): only the .id field of scoreboard_d is assigned here; if rw_queue_t still has other fields (the commented-out lines use wready/rvalid) they get no value in this always_comb (confirm against quadrilatero_pkg). + + scoreboard_d[i][h].id = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : + (rw_queue_pop[i][h] ) ? rw_queue[i][h].id : scoreboard_q[i][h].id; + + //scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : + // (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; + + //scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : + // (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : + // (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; + end + end + end + + always_comb begin: ctrl_block + wr_req = '0; + rd_req = '0; + w_pop = '0; + r_pop = '0; + r_clr = '0; + + for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request + automatic int m = 32'(waddr_i[jj]); + automatic int n = 32'(wrowaddr_i[jj]); + wr_req [jj] = we_i[jj] && (scoreboard_q[m][n].id == wr_id_i[jj]); + w_pop [m][n] = wr_gnt[jj]; + end + + for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request + automatic int m = 32'(raddr_i[jj]); + automatic int n = 32'(rrowaddr_i[jj]); + if( scoreboard_q[m][n].id == rd_id_i[jj] && rready_i[jj] ) + begin + rd_req [jj] = 1'b1; + r_clr [m][n] = rd_gnt[jj]; + r_pop [m][n] = rd_gnt[jj]; + end + end + + if(SYNC_REQ) begin: sa_sync_req + // Keep the three systolic-array read ports (A, D, W) in step: if any of them is ready but its id does not match the scoreboard, the requests of the other ready ports are withdrawn as well. + logic block ; + logic same_id_acc; + logic same_id_A ; + logic same_id_D ; + logic same_id_W ; + + // same_id_acc = wr_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY ] == scoreboard_q[waddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]][wrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]].id; + same_id_A = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]].id; + same_id_D = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] ==
scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]].id; + same_id_W = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]].id; + + if( // (we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && !same_id_acc) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && !same_id_A ) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && !same_id_D ) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && !same_id_W ) + ) begin + block = 1'b1; + end else begin + block = 1'b0; + end + + // if(we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && same_id_acc && block) begin + // wr_req[quadrilatero_pkg::SYSTOLIC_ARRAY ] = 1'b0; + // end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; + end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && same_id_D && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = 1'b0; + end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && same_id_W && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = 1'b0; + end + end + end + + if(RF_WRITE_PORTS != WRITE_PORTS) begin: write_block_wArb + // wr_gnt comes from the arbiter; granted FU ports are packed onto RF write ports 0..ll-1 in port-index order. + quadrilatero_rr_arbiter #( + .NumActOut (RF_WRITE_PORTS) , + .N_ROWS (N_ROWS) , + .WIDTH (WRITE_PORTS) + ) wr_arb_i( + .clk_i , + .rst_ni , + .req_i (wr_req), + .grant_o (wr_gnt) + ); + always_comb begin: wdata_block + automatic int ll=0; + // NOTE(review): waddr_o/wrowaddr_o/wdata_o/we_o have no default assignment, so RF ports without a granted request are not assigned in this always_comb (latch-like hold, including we_o); raddr_o/rrowaddr_o in rdata_block follow the same pattern (confirm). + wready_o = wr_gnt ; + for (int mm = 0; mm < WRITE_PORTS; mm++) begin + if(wr_gnt[mm]) begin + waddr_o [ll] = waddr_i [mm]; + wrowaddr_o[ll] = wrowaddr_i[mm]; + wdata_o [ll] = wdata_i [mm]; + we_o [ll] = we_i [mm]; + ll++; + end + end + end + end else always_comb begin : write_block_noArb + wr_gnt = wr_req ; + waddr_o = waddr_i ; + wrowaddr_o = wrowaddr_i; + wdata_o = wdata_i ; + we_o = wr_gnt ; + wready_o = wr_gnt ; + end + + if(RF_READ_PORTS != READ_PORTS) begin: read_block_wArb + +
quadrilatero_rr_arbiter #( + .NumActOut (RF_READ_PORTS) , + .N_ROWS (N_ROWS) , + .WIDTH (READ_PORTS) + ) rd_arb_i( + .clk_i , + .rst_ni , + .req_i (rd_req), + .grant_o (rd_gnt) + ); + + always_comb begin: rdata_block + automatic int ll=0; + + rvalid_o = rd_gnt; + for (int mm = 0; mm < READ_PORTS; mm++) begin + if(rd_gnt[mm]) begin + raddr_o [ll] = raddr_i [mm]; + rrowaddr_o[ll] = rrowaddr_i[mm]; + rdata_o [mm] = rdata_i [ll]; + ll++; + end else begin + rdata_o [mm] = rdata_i [RF_READ_PORTS-1]; + end + end + end + end else always_comb begin : read_block_noArb + rd_gnt = rd_req ; + raddr_o = raddr_i ; + rrowaddr_o = rrowaddr_i; + rdata_o = rdata_i ; + rvalid_o = rd_gnt ; + end + + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + scoreboard_q <= '0; + end else begin + scoreboard_q <= scoreboard_d; + end + end + + //------------------------------------------------------------------------------------------------------- + + // Assertions: parameter checks placed at module scope (outside any procedural block) + if (WRITE_PORTS < 2) begin + $error( + "[rf_sequencer] WRITE_PORTS must be at least 2.\n" + ); + end + if (READ_PORTS < 2) begin + $error( + "[rf_sequencer] READ_PORTS must be at least 2.\n" + ); + end +endmodule From 9d4464b1fb1c2a28e883d847efa5b14ce260eccd Mon Sep 17 00:00:00 2001 From: Angelo Nujic Date: Mon, 31 Mar 2025 10:52:17 +0200 Subject: [PATCH 07/18] Added last_row signals to enable multiple read/writes from/to the scoreboard --- .../quadrilatero/rtl/quadrilatero.sv | 25 ++ .../quadrilatero/rtl/quadrilatero_lsu_old.sv | 310 ------------------ .../rtl/quadrilatero_perm_unit.sv | 2 + .../rtl/quadrilatero_register_lsu.sv | 4 + .../rtl/quadrilatero_rf_sequencer.sv | 6 +- .../rtl/quadrilatero_rf_sequencer_new.sv | 21 +- .../rtl/quadrilatero_rf_sequencer_old.sv | 295 +++++++++++++++++ .../rtl/quadrilatero_systolic_array.sv | 8 + 8 files changed, 349 insertions(+), 322 deletions(-) delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv create mode 100644
hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_old.sv diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv index 5a4d062a1..0237c3296 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv @@ -131,6 +131,7 @@ module quadrilatero logic [quadrilatero_pkg::READ_PORTS-1 :0][quadrilatero_pkg::RLEN-1:0] rf_seq_rdata_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rvalid_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rlast_from_fu ; + logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rlast_row_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rready_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0][xif_pkg::X_ID_WIDTH-1:0] rf_seq_rd_id_from_fu ; @@ -139,6 +140,7 @@ module quadrilatero logic [quadrilatero_pkg::WRITE_PORTS-1 :0][quadrilatero_pkg::RLEN-1:0] rf_seq_wdata_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_we_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_wlast_from_fu ; + logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_wlast_row_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_wready_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0][xif_pkg::X_ID_WIDTH-1:0] rf_seq_wr_id_from_fu ; @@ -169,6 +171,7 @@ module quadrilatero logic sa_weight_rdata_valid; logic sa_weight_rdata_ready; logic sa_weight_rlast ; + logic sa_weight_rlast_row ; logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id ; logic [quadrilatero_pkg::RLEN-1:0] sa_weight_rdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_weight_raddr ; @@ -177,6 +180,7 @@ module quadrilatero logic sa_data_rdata_valid ; logic sa_data_rdata_ready ; logic sa_data_rlast ; + logic sa_data_rlast_row ; logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id ; logic [quadrilatero_pkg::RLEN-1:0] sa_data_rdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_data_raddr ; @@ -185,6 +189,7 @@ module quadrilatero logic sa_acc_rdata_valid ; logic 
sa_acc_rdata_ready ; logic sa_acc_rlast ; + logic sa_acc_rlast_row ; logic [quadrilatero_pkg::RLEN-1:0] sa_acc_rdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_acc_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_acc_rrowaddr ; @@ -192,6 +197,7 @@ module quadrilatero logic sa_res_we ; logic sa_res_wready ; logic sa_res_wlast ; + logic sa_res_wlast_row; logic [quadrilatero_pkg::RLEN-1:0] sa_res_wdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_res_waddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_res_wrowaddr ; @@ -224,6 +230,7 @@ module quadrilatero logic lsu_we ; logic lsu_wlast ; + logic lsu_wlast_row ; logic lsu_wready ; logic [xif_pkg::X_ID_WIDTH-1:0] lsu_id ; logic [quadrilatero_pkg::RLEN-1:0] lsu_wdata ; @@ -231,6 +238,7 @@ module quadrilatero logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] lsu_wrowaddr ; logic lsu_rlast ; + logic lsu_rlast_row ; logic lsu_rready ; logic lsu_rvalid ; logic [quadrilatero_pkg::RLEN-1:0] lsu_rdata ; @@ -250,6 +258,7 @@ module quadrilatero logic perm_busy ; logic perm_unit_we ; logic perm_unit_wlast ; + logic perm_unit_wlast_row ; logic perm_unit_wready ; logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_id ; logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_instr_id ; @@ -511,6 +520,7 @@ module quadrilatero rf_seq_raddr_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_weight_raddr ; rf_seq_rrowaddr_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_weight_rrowaddr ; rf_seq_rlast_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_weight_rlast ; + rf_seq_rlast_row_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_weight_rlast_row ; rf_seq_rready_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_weight_rdata_ready; rf_seq_rd_id_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_input_id ; @@ -518,6 +528,7 @@ module quadrilatero rf_seq_raddr_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_data_raddr ; rf_seq_rrowaddr_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_data_rrowaddr ; rf_seq_rlast_from_fu 
[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_data_rlast ; + rf_seq_rlast_row_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_data_rlast_row ; rf_seq_rready_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_data_rdata_ready ; rf_seq_rd_id_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_input_id ; @@ -525,6 +536,7 @@ module quadrilatero rf_seq_raddr_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_acc_raddr ; rf_seq_rrowaddr_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_acc_rrowaddr ; rf_seq_rlast_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_acc_rlast ; + rf_seq_rlast_row_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_acc_rlast_row ; rf_seq_rready_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_data_rdata_ready ; rf_seq_rd_id_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_input_id ; @@ -532,6 +544,7 @@ module quadrilatero rf_seq_raddr_from_fu [quadrilatero_pkg::LSU_R ] = lsu_raddr ; rf_seq_rrowaddr_from_fu[quadrilatero_pkg::LSU_R ] = lsu_rrowaddr ; rf_seq_rlast_from_fu [quadrilatero_pkg::LSU_R ] = lsu_rlast ; + rf_seq_rlast_row_from_fu[quadrilatero_pkg::LSU_R ] = lsu_rlast_row ; rf_seq_rready_from_fu [quadrilatero_pkg::LSU_R ] = lsu_rready ; rf_seq_rd_id_from_fu [quadrilatero_pkg::LSU_R ] = lsu_id ; @@ -542,6 +555,7 @@ module quadrilatero rf_seq_wdata_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY] = sa_res_wdata ; rf_seq_we_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY] = sa_res_we ; rf_seq_wlast_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY] = sa_res_wlast ; + rf_seq_wlast_row_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY] = sa_res_wlast_row ; rf_seq_wr_id_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY] = sa_output_id ; // LSU Write Port @@ -550,6 +564,7 @@ module quadrilatero rf_seq_wdata_from_fu [quadrilatero_pkg::LSU_W ] = lsu_wdata ; rf_seq_we_from_fu [quadrilatero_pkg::LSU_W ] = lsu_we ; rf_seq_wlast_from_fu [quadrilatero_pkg::LSU_W ] = lsu_wlast ; + rf_seq_wlast_row_from_fu[quadrilatero_pkg::LSU_W ] = lsu_wlast_row ; rf_seq_wr_id_from_fu [quadrilatero_pkg::LSU_W ] = 
lsu_id ; // RF Exec Unit Write Port @@ -558,6 +573,7 @@ module quadrilatero rf_seq_wdata_from_fu [quadrilatero_pkg::RF_W ] = perm_unit_wdata ; rf_seq_we_from_fu [quadrilatero_pkg::RF_W ] = perm_unit_we ; rf_seq_wlast_from_fu [quadrilatero_pkg::RF_W ] = perm_unit_wlast ; + rf_seq_wlast_row_from_fu[quadrilatero_pkg::RF_W ] = perm_unit_wlast_row ; rf_seq_wr_id_from_fu [quadrilatero_pkg::RF_W ] = perm_unit_id ; end @@ -582,6 +598,7 @@ module quadrilatero .rvalid_o (rf_seq_rvalid_from_fu ), .rlast_i (rf_seq_rlast_from_fu ), .rready_i (rf_seq_rready_from_fu ), + .rlast_row_i (rf_seq_rlast_row_from_fu), .rd_id_i (rf_seq_rd_id_from_fu ), .waddr_i (rf_seq_waddr_from_fu ), @@ -590,6 +607,7 @@ module quadrilatero .we_i (rf_seq_we_from_fu ), .wlast_i (rf_seq_wlast_from_fu ), .wready_o (rf_seq_wready_from_fu ), + .wlast_row_i (rf_seq_wlast_row_from_fu), .wr_id_i (rf_seq_wr_id_from_fu ), // Outputs to RF @@ -693,6 +711,7 @@ module quadrilatero .data_rdata_valid_i (sa_data_rdata_valid ), .data_rdata_ready_o (sa_data_rdata_ready ), // unused .data_rlast_o (sa_data_rlast ), + .data_rlast_row_o (sa_data_rlast_row ), // Weight Read Register Port .weight_raddr_o (sa_weight_raddr ), @@ -701,6 +720,7 @@ module quadrilatero .weight_rdata_valid_i (sa_weight_rdata_valid ), .weight_rdata_ready_o (sa_weight_rdata_ready ), .weight_rlast_o (sa_weight_rlast ), + .weight_rlast_row_o (sa_weight_rlast_row ), // Accumulator Read Register Port .acc_raddr_o (sa_acc_raddr ), @@ -709,6 +729,7 @@ module quadrilatero .acc_rdata_valid_i (sa_acc_rdata_valid ), .acc_rdata_ready_o (sa_acc_rdata_ready ), .acc_rlast_o (sa_acc_rlast ), + .acc_rlast_row_o (sa_acc_rlast_row ), // Accumulator Out Write Register Port .res_waddr_o (sa_res_waddr ), @@ -716,6 +737,7 @@ module quadrilatero .res_wdata_o (sa_res_wdata ), .res_we_o (sa_res_we ), .res_wlast_o (sa_res_wlast ), + .res_wlast_row_o (sa_res_wlast_row ), .res_wready_i (sa_res_wready ), .sa_input_id_o (sa_input_id ), @@ -804,6 +826,7 @@ module quadrilatero 
.wdata_o (lsu_wdata ), .we_o (lsu_we ), .wlast_o (lsu_wlast ), + .wlast_row_o (lsu_wlast_row ), .wready_i (lsu_wready ), // Register Read Port for Store Unit @@ -813,6 +836,7 @@ module quadrilatero .rdata_valid_i (lsu_rvalid ), .rdata_ready_o (lsu_rready ), .rlast_o (lsu_rlast ), + .rlast_row_o (lsu_rlast_row ), // Configuration Signals .stride_i (lsu_ctrl_issued_instr.stride ), // stride value @@ -864,6 +888,7 @@ module quadrilatero .wdata_o (perm_unit_wdata ), .we_o (perm_unit_we ), .wlast_o (perm_unit_wlast ), + .wlast_row_o (perm_unit_wlast_row ), .wready_i (perm_unit_wready ), // Configuration Signals diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv deleted file mode 100644 index cb5f9b2ff..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. -// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -module quadrilatero_lsu #( - parameter int unsigned FIFO_DEPTH = 4, - parameter int unsigned DATA_WIDTH = 32 - -) ( - input logic clk_i , - input logic rst_ni , - - // Bus interface - output logic data_req_o , - output logic [ 31:0] data_addr_o , - output logic data_we_o , - output logic [DATA_WIDTH/8 - 1:0] data_be_o , - output logic [ DATA_WIDTH-1:0] data_wdata_o , - input logic data_gnt_i , - input logic data_rvalid_i , - input logic [ DATA_WIDTH-1:0] data_rdata_i , - - // Configuration - input logic start_i , // start transfer (MUST BE A PULSE!!!!!) 
- input logic write_i , // write transaction - output logic busy_o , // lsu available - output logic terminate_o , // lsu done - - // Address - input logic [ 31:0] src_ptr_i , // base address - input logic [ 31:0] stride_i , // stride to move in memory from one row to the next one - input logic [ 31:0] rows_i , // how many rows we need to fetch - input logic [ 31:0] cols_i , - - // Output data - output logic [ DATA_WIDTH-1:0] load_fifo_output_o , - output logic load_fifo_valid_o , - output logic load_fifo_data_available_o , - input logic load_fifo_output_pop_i , - - // Input data - input logic [ DATA_WIDTH-1:0] store_fifo_input_i , - input logic store_fifo_push_i , - output logic store_fifo_space_available_o, - output logic store_fifo_empty_o - - -); - - localparam int unsigned DEPTH = (FIFO_DEPTH > 0) ? FIFO_DEPTH - 1 : 0; - localparam int unsigned Addr_Fifo_Depth = (FIFO_DEPTH > 1) ? $clog2(FIFO_DEPTH) : 1; - localparam int unsigned LastFifoUsage = DEPTH - 1; - - - logic terminate ; - - logic [ 31:0] rows_q ; - logic [ 31:0] rows_d ; - logic [ 31:0] cols_q ; - logic [ 31:0] cols_d ; - logic [ 31:0] src_ptr_inc ; - logic [ 31:0] addr ; - logic [ 31:0] addr_op2 ; - logic [ 31:0] ptr_q ; - logic [ 31:0] ptr_d ; - - logic data_in_req ; - logic data_in_we ; - logic [ DATA_WIDTH/8-1:0] data_in_be ; - logic [ 31:0] data_in_addr ; - logic data_in_rvalid ; - logic [ DATA_WIDTH-1:0] data_in_rdata ; - - logic [ DATA_WIDTH-1:0] load_fifo_input ; - logic [ DATA_WIDTH-1:0] load_fifo_data_out; - logic rd_valid_q ; - logic rd_valid_d ; - logic [ DATA_WIDTH-1:0] rd_head_q ; - logic [ DATA_WIDTH-1:0] rd_head_d ; - logic data_we_q ; - logic data_we_d ; - logic rvalid ; - logic load_fifo_pop ; - logic load_fifo_push ; - logic [Addr_Fifo_Depth-1:0] load_fifo_usage ; - logic load_fifo_alm_full; - logic load_fifo_full ; - logic load_fifo_empty ; - - logic data_out_req ; - logic data_out_we ; - logic [ DATA_WIDTH/8-1:0] data_out_be ; - logic [ 31:0] data_out_addr ; - logic data_out_gnt 
; - logic [ DATA_WIDTH-1:0] data_out_wdata ; - - logic store_fifo_full ; - logic store_fifo_empty ; - logic [ DATA_WIDTH-1:0] store_fifo_output ; - logic store_fifo_pop ; - - - enum { - LSU_READY, - LSU_RUNNING - } - lsu_state_q, lsu_state_d; - - - always_comb begin : FSM_block - lsu_state_d = lsu_state_q; - - case (lsu_state_q) - LSU_READY: begin - if (start_i & |cols_i & |rows_i) begin - lsu_state_d = LSU_RUNNING; - end - end - LSU_RUNNING: begin - if (terminate && !start_i) begin - lsu_state_d = LSU_READY; - end - end - endcase - end - - always_comb begin : ctrl_block - terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING)); - load_fifo_valid_o = rd_valid_d; - busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; - terminate_o = terminate; - end - - always_comb begin : addr_block - src_ptr_inc = DATA_WIDTH / 8; - addr_op2 = (cols_q == '0) ? stride_i : src_ptr_inc; - addr = (start_i || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1))) ? src_ptr_i : ptr_q + addr_op2; - ptr_d = (data_gnt_i && data_req_o) ? 
addr : ptr_q; - end - - always_comb begin : counters_block - rows_d = rows_q; - cols_d = cols_q; - - if(start_i) begin - if(data_gnt_i && data_req_o) begin - if(cols_i > 1) begin - rows_d = rows_i - 1; - cols_d = cols_i - 2; - end else if (rows_i > 1) begin - rows_d = rows_i - 2; - cols_d = cols_i - 1; - end - end else begin - rows_d = rows_i - 1; - cols_d = cols_i - 1; - end - end else if (data_gnt_i && data_req_o) begin - if (cols_q > 0) cols_d = cols_q - 1; - else if (rows_q > 0) begin - cols_d = cols_i - 1; - rows_d = rows_q - 1; - end - end - end - - always_comb begin : read_obi - data_in_req = '0; - data_in_we = '0; - data_in_be = '0; - data_in_addr = '0; - - if (load_fifo_full == 1'b0 && load_fifo_alm_full == 1'b0) begin - data_in_req = ~write_i & (start_i | lsu_state_q == LSU_RUNNING); - data_in_we = 1'b0 ; - data_in_be = '1 ; - data_in_addr = addr ; - end - end - - always_comb begin : write_obi - data_out_req = '0 ; - data_out_we = '0 ; - data_out_be = '0 ; - data_out_addr = '0 ; - data_out_wdata = store_fifo_output; - - if (!store_fifo_empty) begin - data_out_req = start_i | lsu_state_q == LSU_RUNNING; - // data_out_we = 1'b1 ; - data_out_we = start_i | lsu_state_q == LSU_RUNNING; - data_out_be = '1 ; - data_out_addr = addr ; - end - end - - always_comb begin : obi_channel_signals - data_in_rvalid = 1'b0 ; - data_wdata_o = data_out_wdata; - data_out_gnt = data_gnt_i ; - data_in_rdata = data_rdata_i ; - - if(store_fifo_empty) begin // read transaction active - data_req_o = data_in_req ; - data_we_o = data_in_we ; - data_be_o = data_in_be ; - data_addr_o = data_in_addr ; - data_in_rvalid = data_rvalid_i ; - end else begin // write transaction active - data_req_o = data_out_req ; - data_we_o = data_out_we ; - data_be_o = data_out_be ; - data_addr_o = data_out_addr ; - end - end - - always_comb begin : load_fifo_block - data_we_d = data_gnt_i && data_req_o && data_we_o; - rvalid = data_in_rvalid &~ data_we_q ; - - load_fifo_alm_full = (load_fifo_usage == 
LastFifoUsage[Addr_Fifo_Depth-1:0]); - load_fifo_input = data_in_rdata; - load_fifo_push = (rvalid & rd_valid_q & ~load_fifo_output_pop_i) | (rvalid & ~load_fifo_empty); - load_fifo_pop = load_fifo_output_pop_i & ~load_fifo_empty; - - rd_valid_d = (rvalid & ~rd_valid_q) ? 1'b1 : - (load_fifo_output_pop_i & - load_fifo_empty & ~rvalid) ? 1'b0 : rd_valid_q; - - rd_head_d = (load_fifo_output_pop_i & load_fifo_empty & rvalid) || - (rvalid & ~rd_valid_q) ? load_fifo_input : - (load_fifo_output_pop_i & ~load_fifo_empty) ? load_fifo_data_out : rd_head_q; - - load_fifo_output_o = rd_head_q ; - load_fifo_data_available_o = rd_valid_q; - end - - always_comb begin : store_fifo_block - store_fifo_pop = data_out_gnt & data_out_req; - store_fifo_empty_o = store_fifo_empty; - store_fifo_space_available_o = ~store_fifo_full; - end - - fifo_v3 #( - .FALL_THROUGH (1'b0 ), - .DEPTH (DEPTH ), - .DATA_WIDTH (DATA_WIDTH ) - ) load_lsu_fifo_i ( - .clk_i , - .rst_ni , - .flush_i (1'b0 ), - .testmode_i (1'b0 ), - - // status flags - .full_o (load_fifo_full ), - .empty_o (load_fifo_empty ), - .usage_o (load_fifo_usage ), - - // as long as the queue is not full we can push new data - .data_i (load_fifo_input ), - .push_i (load_fifo_push ), - - // as long as the queue is not empty we can pop new elements - .data_o (load_fifo_data_out ), - .pop_i (load_fifo_pop ) - ); - - fifo_v3 #( - .DEPTH(FIFO_DEPTH), - .DATA_WIDTH(DATA_WIDTH) - ) store_lsu_fifo_i ( - .clk_i , - .rst_ni , - .flush_i (1'b0 ), - .testmode_i (1'b0 ), - // status flags - .full_o (store_fifo_full ), - .empty_o (store_fifo_empty ), - .usage_o ( ), - // as long as the queue is not full we can push new data - .data_i (store_fifo_input_i ), - .push_i (store_fifo_push_i ), - // as long as the queue is not empty we can pop new elements - .data_o (store_fifo_output ), - .pop_i (store_fifo_pop ) - ); - - always_ff @(posedge clk_i, negedge rst_ni) begin : seq_block - if (~rst_ni) begin - lsu_state_q <= LSU_READY; - ptr_q <= '0 ; - rows_q 
<= '0 ; - cols_q <= '0 ; - rd_head_q <= '0 ; - rd_valid_q <= '0 ; - data_we_q <= '0 ; - end else begin - lsu_state_q <= lsu_state_d; - ptr_q <= ptr_d ; - rows_q <= rows_d ; - cols_q <= cols_d ; - rd_head_q <= rd_head_d ; - rd_valid_q <= rd_valid_d ; - data_we_q <= data_we_d ; - end - end - -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv index 615dbe5bf..3206db206 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv @@ -19,6 +19,7 @@ module quadrilatero_perm_unit #( output logic [ RLEN-1:0] wdata_o , output logic we_o , output logic wlast_o , + output logic wlast_row_o, input logic wready_i , // to stall the request in case the port is busy // Configuration Signals @@ -150,6 +151,7 @@ module quadrilatero_perm_unit #( assign wdata_o = '0 ; assign we_o = write_started_q &~ mask_req; assign wlast_o = finished ; + assign wlast_row_o = 1'b1; assign busy_o = fifo_full | fifo_almost_full; assign id_o = id_q ; assign finished_o = finished_q ; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index fdcc4e746..8f5c49e0e 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -36,6 +36,7 @@ module quadrilatero_register_lsu #( output logic [ LLEN-1:0] wdata_o , output logic we_o , output logic wlast_o , + output logic wlast_row_o, input logic wready_i , // to stall the request in case the port is busy // Register Read Port for store unit @@ -45,6 +46,7 @@ module quadrilatero_register_lsu #( input logic rdata_valid_i , output logic rdata_ready_o , output logic rlast_o , + output logic rlast_row_o, // Configuration Signals input logic start_i , // start loading: MUST BE A PULSE @@ -132,6 +134,7 @@ module quadrilatero_register_lsu #( 
waddr_o = waddr_q; wrowaddr_o = counter_q ; wdata_o = load_fifo_data & ~data_mask; + wlast_row_o = 1'b1; end @@ -139,6 +142,7 @@ module quadrilatero_register_lsu #( rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; rrowaddr_o = counter_q ; raddr_o = operand_reg_i ; + rlast_row_o = 1'b1; end always_comb begin: lsu_ctrl_block diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv index aedffc0db..e928b9892 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv @@ -27,6 +27,7 @@ module quadrilatero_rf_sequencer #( output logic [READ_PORTS-1:0] rvalid_o , input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) + input logic [READ_PORTS-1:0] rlast_row_i, // tells us if we're reading rrowaddr_i row for the last time (for now always 1) input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , @@ -36,6 +37,7 @@ module quadrilatero_rf_sequencer #( input logic [WRITE_PORTS-1:0] we_i , input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) output logic [WRITE_PORTS-1:0] wready_o , + input logic [WRITE_PORTS-1:0] wlast_row_i, // tells us if we're writing wrowaddr_i row for the last time (for now always 1) input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , // Outputs to RF @@ -146,7 +148,7 @@ module quadrilatero_rf_sequencer #( scoreboard_q[m][n].wready && we_i[jj] ) begin wr_req [jj] = ~scoreboard_q[m][n].rvalid; - w_pop [m][n] = wr_gnt[jj]; + w_pop [m][n] = wr_gnt[jj] && wlast_row_i[jj]; end end @@ -158,7 +160,7 @@ module quadrilatero_rf_sequencer #( begin rd_req [jj] = 1'b1; r_clr [m][n] = rd_gnt[jj]; - r_pop [m][n] = rd_gnt[jj] &~ scoreboard_q[m][n].wready; + r_pop [m][n] = rd_gnt[jj] &~ scoreboard_q[m][n].wready && rlast_row_i[jj]; 
end end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv index 91b57d19d..3f2fc907b 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv @@ -27,6 +27,7 @@ module quadrilatero_rf_sequencer #( output logic [READ_PORTS-1:0] rvalid_o , input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) + input logic [READ_PORTS-1:0] rlast_row_i, // tells us if we're reading rrowaddr_i row for the last time (for now always 1) input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , @@ -36,6 +37,7 @@ module quadrilatero_rf_sequencer #( input logic [WRITE_PORTS-1:0] we_i , input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) output logic [WRITE_PORTS-1:0] wready_o , + input logic [WRITE_PORTS-1:0] wlast_row_i, // tells us if we're writing wrowaddr_i row for the last time (for now always 1) input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , // Outputs to RF @@ -81,7 +83,7 @@ module quadrilatero_rf_sequencer #( genvar ii,hh; - assign rw_queue_pop = w_pop | r_pop; + assign rw_queue_pop = w_pop | r_pop | ~scoreboard_q; assign rw_queue_entry = rw_queue_entry_i ; assign rw_queue_push = rw_queue_push_i ; @@ -133,8 +135,8 @@ module quadrilatero_rf_sequencer #( end always_comb begin: ctrl_block - wr_req = '0; - rd_req = '0; + wr_req = '0; + rd_req = '0; w_pop = '0; r_pop = '0; r_clr = '0; @@ -142,18 +144,17 @@ module quadrilatero_rf_sequencer #( for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request automatic int m = 32'(waddr_i[jj]); automatic int n = 32'(wrowaddr_i[jj]); - wr_req [jj] = we_i[jj] && (scoreboard_q[m][n].id == wr_id_i[jj]); - w_pop [m][n] = wr_gnt[jj]; + wr_req [jj] = we_i[jj] && (scoreboard_q[m][n].id == wr_id_i[jj]); //PROBLEM: we set write 
request even if we're not yet ready to write + w_pop [m][n] = wlast_row_i[jj] && (scoreboard_q[m][n].id == wr_id_i[jj]) && we_i[jj]; end for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request automatic int m = 32'(raddr_i[jj]); automatic int n = 32'(rrowaddr_i[jj]); - if( scoreboard_q[m][n].id == rd_id_i[jj] && rready_i[jj] ) - begin - rd_req [jj] = 1'b1; - r_clr [m][n] = rd_gnt[jj]; - r_pop [m][n] = rd_gnt[jj]; + r_pop [m][n] = rlast_row_i[jj] && ( scoreboard_q[m][n].id == rd_id_i[jj] && rready_i[jj]) ; + if(scoreboard_q[m][n].id == rd_id_i[jj] && rready_i[jj]) begin + rd_req [jj] = 1'b1; + r_clr [m][n] = rd_gnt[jj]; end end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_old.sv new file mode 100644 index 000000000..aedffc0db --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_old.sv @@ -0,0 +1,295 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +module quadrilatero_rf_sequencer #( + parameter READ_PORTS = 4 , + parameter WRITE_PORTS = 2 , + parameter N_REGS = 8 , + parameter N_ROWS = 4 , + parameter RLEN = 128 , + parameter RF_READ_PORTS = 3 , + parameter RF_WRITE_PORTS = 1, + parameter SYNC_REQ = 1, + + parameter N_ENTRIES = 3 // entries in the FIFOs for each register +) ( + + input logic clk_i, + input logic rst_ni, + + // Input from FUs + input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i , + input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i , + output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o , + output logic [READ_PORTS-1:0] rvalid_o , + input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) + input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) + input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , + + + input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i , + input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i , + input logic [WRITE_PORTS-1:0][RLEN-1:0] wdata_i , + input logic [WRITE_PORTS-1:0] we_i , + input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) + output logic [WRITE_PORTS-1:0] wready_o , + input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , + + // Outputs to RF + output logic [RF_READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_o , + output logic [RF_READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_o , + input logic [RF_READ_PORTS-1:0][RLEN-1:0] rdata_i , + + + output logic [RF_WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_o , + output logic [RF_WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_o , + output logic [RF_WRITE_PORTS-1:0][RLEN-1:0] wdata_o , + output logic [RF_WRITE_PORTS-1:0] we_o , + + + // Inputs from Dispatcher + // We can share the entry as we fetch 1 instruction at a time + // NOTE: Actually maybe it's better to have more ports so that we can push all operands and not waste cycles + 
input quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry_i , + input logic [N_REGS-1:0] rw_queue_push_i , + + // Outputs to Dispatcher + output logic [N_REGS-1:0] rw_queue_full_o +); + + logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; + logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] r_clr ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_full ; + + logic [WRITE_PORTS-1:0] wr_gnt ; + logic [WRITE_PORTS-1:0] wr_req ; + logic [READ_PORTS -1:0] rd_req ; + logic [READ_PORTS -1:0] rd_gnt ; + + logic [N_REGS-1:0] rw_queue_push ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; + + genvar ii,hh; + + assign rw_queue_pop = w_pop | r_pop | ~head_valid; + assign rw_queue_entry = rw_queue_entry_i ; + assign rw_queue_push = rw_queue_push_i ; + + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; + assign rw_queue_pop_fifo = rw_queue_pop & ~rw_queue_empty; + + for (ii = 0; ii < N_REGS; ii++) begin: gen_fifo__regs + for (hh = 0; hh < N_ROWS; hh++) begin: gen_fifo__rows + fifo_v3 #( + .FALL_THROUGH (1'b1) , + .DEPTH (N_ENTRIES) , + .dtype (quadrilatero_pkg::rw_queue_t) + ) issue_queue_inst ( + .clk_i, + .rst_ni, + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + .usage_o ( ), + .full_o (rw_queue_full [ii][hh] ), + .empty_o (rw_queue_empty[ii][hh] ), + .data_i (rw_queue_entry[ii] ), // data to push into the queue + .push_i (rw_queue_push [ii] ), // data is valid and can be pushed to the queue + .data_o (rw_queue [ii][hh] ), // output data + .pop_i (rw_queue_pop [ii][hh] & ~rw_queue_empty[ii][hh]) // pop head from queue + ); + end + end + + always_comb begin: scoreboard_block + rw_queue_full_o = '0; + for (int i = 0; 
i < N_REGS; i++) begin + for (int h = 0; h < N_ROWS; h++) begin + rw_queue_full_o[i] |= (rw_queue_full[i][h]); + + head_valid[i][h] = scoreboard_q[i][h].wready | scoreboard_q[i][h].rvalid; + + + scoreboard_d[i][h].id = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : + (rw_queue_pop[i][h] ) ? rw_queue[i][h].id : scoreboard_q[i][h].id; + + scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : + (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; + + scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : + (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : + (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; + end + end + end + + always_comb begin: ctrl_block + wr_req = '0; + rd_req = '0; + w_pop = '0; + r_pop = '0; + r_clr = '0; + + for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request + automatic int m = 32'(waddr_i[jj]); + automatic int n = 32'(wrowaddr_i[jj]); + if( scoreboard_q[m][n].id == wr_id_i[jj] && + scoreboard_q[m][n].wready && we_i[jj] ) + begin + wr_req [jj] = ~scoreboard_q[m][n].rvalid; + w_pop [m][n] = wr_gnt[jj]; + end + end + + for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request + automatic int m = 32'(raddr_i[jj]); + automatic int n = 32'(rrowaddr_i[jj]); + if( scoreboard_q[m][n].id == rd_id_i[jj] && + scoreboard_q[m][n].rvalid && rready_i[jj] ) + begin + rd_req [jj] = 1'b1; + r_clr [m][n] = rd_gnt[jj]; + r_pop [m][n] = rd_gnt[jj] &~ scoreboard_q[m][n].wready; + end + end + + if(SYNC_REQ) begin: sa_sync_req + + logic block ; + logic same_id_acc; + logic same_id_A ; + logic same_id_D ; + logic same_id_W ; + + // same_id_acc = wr_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY ] == scoreboard_q[waddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]][wrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]].id; + same_id_A = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]].id; + 
same_id_D = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]].id; + same_id_W = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]].id; + + if( // (we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && !same_id_acc) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && !same_id_A ) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && !same_id_D ) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && !same_id_W ) + ) begin + block = 1'b1; + end else begin + block = 1'b0; + end + + // if(we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && same_id_acc && block) begin + // wr_req[quadrilatero_pkg::SYSTOLIC_ARRAY ] = 1'b0; + // end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; + end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && same_id_D && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = 1'b0; + end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && same_id_W && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = 1'b0; + end + end + end + + if(RF_WRITE_PORTS != WRITE_PORTS) begin: write_block_wArb + + quadrilatero_rr_arbiter #( + .NumActOut (RF_WRITE_PORTS) , + .N_ROWS (N_ROWS) , + .WIDTH (WRITE_PORTS) + ) wr_arb_i( + .clk_i , + .rst_ni , + .req_i (wr_req), + .grant_o (wr_gnt) + ); + always_comb begin: wdata_block + automatic int ll=0; + + wready_o = wr_gnt ; + for (int mm = 0; mm < WRITE_PORTS; mm++) begin + if(wr_gnt[mm]) begin + waddr_o [ll] = waddr_i [mm]; + wrowaddr_o[ll] = wrowaddr_i[mm]; + wdata_o [ll] = wdata_i [mm]; + we_o [ll] = we_i [mm]; + ll++; + end + end + end + end else always_comb begin : write_block_noArb + wr_gnt = wr_req ; + waddr_o = waddr_i ; + wrowaddr_o = wrowaddr_i; + wdata_o = wdata_i ; + we_o = wr_gnt ; + wready_o = wr_gnt ; + end + + 
if(RF_READ_PORTS != READ_PORTS) begin: read_block_wArb + + quadrilatero_rr_arbiter #( + .NumActOut (RF_READ_PORTS) , + .N_ROWS (N_ROWS) , + .WIDTH (READ_PORTS) + ) rd_arb_i( + .clk_i , + .rst_ni , + .req_i (rd_req), + .grant_o (rd_gnt) + ); + + always_comb begin: rdata_block + automatic int ll=0; + + rvalid_o = rd_gnt; + for (int mm = 0; mm < READ_PORTS; mm++) begin + if(rd_gnt[mm]) begin + raddr_o [ll] = raddr_i [mm]; + rrowaddr_o[ll] = rrowaddr_i[mm]; + rdata_o [mm] = rdata_i [ll]; + ll++; + end else begin + rdata_o [mm] = rdata_i [RF_READ_PORTS-1]; + end + end + end + end else always_comb begin : read_block_noArb + rd_gnt = rd_req ; + raddr_o = raddr_i ; + rrowaddr_o = rrowaddr_i; + rdata_o = rdata_i ; + rvalid_o = rd_gnt ; + end + + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + scoreboard_q <= '0; + end else begin + scoreboard_q <= scoreboard_d; + end + end + + //------------------------------------------------------------------------------------------------------- + + // Assertions + if (WRITE_PORTS < 2) begin + $error( + "[rf_sequencer] WRITE_PORTS must be at least 2.\n" + ); + end + if (READ_PORTS < 2) begin + $error( + "[rf_sequencer] READ_PORTS must be at least 2.\n" + ); + end +endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index f1757dc91..cdf019b5d 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -41,6 +41,7 @@ module quadrilatero_systolic_array #( input logic weight_rdata_valid_i, output logic weight_rdata_ready_o, output logic weight_rlast_o , + output logic weight_rlast_row_o , // Data Read Register Port output logic [ $clog2(N_REGS)-1:0] data_raddr_o , @@ -49,6 +50,7 @@ module quadrilatero_systolic_array #( input logic data_rdata_valid_i , output logic data_rdata_ready_o , output logic data_rlast_o , + output 
logic data_rlast_row_o , // Accumulator Read Register Port output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , @@ -57,6 +59,7 @@ module quadrilatero_systolic_array #( input logic acc_rdata_valid_i , output logic acc_rdata_ready_o , output logic acc_rlast_o , + output logic acc_rlast_row_o , // Accumulator Out Write Register Port output logic [ $clog2(N_REGS)-1:0] res_waddr_o , @@ -64,6 +67,7 @@ module quadrilatero_systolic_array #( output logic [ ALEN-1:0] res_wdata_o , output logic res_we_o , output logic res_wlast_o , + output logic res_wlast_row_o , input logic res_wready_i , // RF Instruction ID @@ -147,24 +151,28 @@ module quadrilatero_systolic_array #( weight_rrowaddr_o = ff_counter_q ; weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; weight_rlast_o = ff_counter_q==LastRow; + weight_rlast_row_o = 1'b1; // Data Read Register Port data_raddr_o = data_reg_q ; data_rrowaddr_o = ff_counter_q ; data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; data_rlast_o = ff_counter_q==LastRow; + data_rlast_row_o = 1'b1; // Accumulator Read Register Port acc_raddr_o = acc_reg_q ; acc_rrowaddr_o = ff_counter_q ; acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; acc_rlast_o = ff_counter_q==LastRow; + acc_rlast_row_o = 1'b1; // Accumulator Out Write Register Port res_waddr_o = dest_reg_q ; res_wrowaddr_o = dr_counter_q ; res_we_o = (dr_state_q == DR_ACTIVE) &~ mask_req ; res_wlast_o = dr_counter_q==LastRow; + res_wlast_row_o = 1'b1; end always_comb begin: finished_signal From aa5df19838b95c0574172086e80a091c34b907fc Mon Sep 17 00:00:00 2001 From: Angelo Nujic Date: Wed, 9 Apr 2025 17:50:39 +0200 Subject: [PATCH 08/18] LSU and SA working control for 4x4 --- .../rtl/include/quadrilatero_pkg.sv | 5 +- ...ero_pkg_new.sv => quadrilatero_pkg_old.sv} | 4 +- .../quadrilatero/rtl/quadrilatero.sv | 25 - .../rtl/quadrilatero_dispatcher.sv | 28 +- ..._new.sv => quadrilatero_dispatcher_old.sv} | 27 +- .../rtl/quadrilatero_perm_unit.sv | 4 +- 
.../rtl/quadrilatero_register_lsu.sv | 160 ++++-- .../rtl/quadrilatero_register_lsu_broken.sv | 420 +++++++++++++++ .../rtl/quadrilatero_rf_sequencer.sv | 48 +- .../rtl/quadrilatero_rf_sequencer_0bit.sv | 292 +++++++++++ ...w.sv => quadrilatero_rf_sequencer_2bit.sv} | 40 +- ...ilatero_rf_sequencer_kinda_working_1bit.sv | 293 +++++++++++ .../rtl/quadrilatero_systolic_array.sv | 159 ++++-- .../rtl/quadrilatero_systolic_array_old.sv | 484 ++++++++++++++++++ .../sim/modelsim/patch_modelsim_Makefile.py | 3 +- 15 files changed, 1812 insertions(+), 180 deletions(-) rename hw/ip_examples/quadrilatero/rtl/include/{quadrilatero_pkg_new.sv => quadrilatero_pkg_old.sv} (96%) rename hw/ip_examples/quadrilatero/rtl/{quadrilatero_dispatcher_new.sv => quadrilatero_dispatcher_old.sv} (92%) create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_broken.sv create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_0bit.sv rename hw/ip_examples/quadrilatero/rtl/{quadrilatero_rf_sequencer_new.sv => quadrilatero_rf_sequencer_2bit.sv} (86%) create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_kinda_working_1bit.sv create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv diff --git a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv index 74d7a277a..b38e262ee 100644 --- a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv +++ b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv @@ -54,9 +54,8 @@ package quadrilatero_pkg; } lsu_conf_t; typedef struct packed { - logic [xif_pkg::X_ID_WIDTH-1:0] id; - logic rvalid; - logic wready; + logic [xif_pkg::X_ID_WIDTH-1:0] id; + logic valid; } rw_queue_t; localparam int unsigned WR_PORT = (WRITE_PORTS > 1) ? 
$clog2(WRITE_PORTS) : 1; diff --git a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_new.sv b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_old.sv similarity index 96% rename from hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_new.sv rename to hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_old.sv index 6d7a05379..74d7a277a 100644 --- a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_new.sv +++ b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_old.sv @@ -54,7 +54,9 @@ package quadrilatero_pkg; } lsu_conf_t; typedef struct packed { - logic [xif_pkg::X_ID_WIDTH-1:0] id; + logic [xif_pkg::X_ID_WIDTH-1:0] id; + logic rvalid; + logic wready; } rw_queue_t; localparam int unsigned WR_PORT = (WRITE_PORTS > 1) ? $clog2(WRITE_PORTS) : 1; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv index 0237c3296..5a4d062a1 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv @@ -131,7 +131,6 @@ module quadrilatero logic [quadrilatero_pkg::READ_PORTS-1 :0][quadrilatero_pkg::RLEN-1:0] rf_seq_rdata_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rvalid_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rlast_from_fu ; - logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rlast_row_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rready_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0][xif_pkg::X_ID_WIDTH-1:0] rf_seq_rd_id_from_fu ; @@ -140,7 +139,6 @@ module quadrilatero logic [quadrilatero_pkg::WRITE_PORTS-1 :0][quadrilatero_pkg::RLEN-1:0] rf_seq_wdata_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_we_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_wlast_from_fu ; - logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_wlast_row_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_wready_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 
:0][xif_pkg::X_ID_WIDTH-1:0] rf_seq_wr_id_from_fu ; @@ -171,7 +169,6 @@ module quadrilatero logic sa_weight_rdata_valid; logic sa_weight_rdata_ready; logic sa_weight_rlast ; - logic sa_weight_rlast_row ; logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id ; logic [quadrilatero_pkg::RLEN-1:0] sa_weight_rdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_weight_raddr ; @@ -180,7 +177,6 @@ module quadrilatero logic sa_data_rdata_valid ; logic sa_data_rdata_ready ; logic sa_data_rlast ; - logic sa_data_rlast_row ; logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id ; logic [quadrilatero_pkg::RLEN-1:0] sa_data_rdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_data_raddr ; @@ -189,7 +185,6 @@ module quadrilatero logic sa_acc_rdata_valid ; logic sa_acc_rdata_ready ; logic sa_acc_rlast ; - logic sa_acc_rlast_row ; logic [quadrilatero_pkg::RLEN-1:0] sa_acc_rdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_acc_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_acc_rrowaddr ; @@ -197,7 +192,6 @@ module quadrilatero logic sa_res_we ; logic sa_res_wready ; logic sa_res_wlast ; - logic sa_res_wlast_row; logic [quadrilatero_pkg::RLEN-1:0] sa_res_wdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_res_waddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_res_wrowaddr ; @@ -230,7 +224,6 @@ module quadrilatero logic lsu_we ; logic lsu_wlast ; - logic lsu_wlast_row ; logic lsu_wready ; logic [xif_pkg::X_ID_WIDTH-1:0] lsu_id ; logic [quadrilatero_pkg::RLEN-1:0] lsu_wdata ; @@ -238,7 +231,6 @@ module quadrilatero logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] lsu_wrowaddr ; logic lsu_rlast ; - logic lsu_rlast_row ; logic lsu_rready ; logic lsu_rvalid ; logic [quadrilatero_pkg::RLEN-1:0] lsu_rdata ; @@ -258,7 +250,6 @@ module quadrilatero logic perm_busy ; logic perm_unit_we ; logic perm_unit_wlast ; - logic perm_unit_wlast_row ; logic perm_unit_wready ; logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_id ; logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_instr_id ; @@ -520,7 +511,6 @@ 
module quadrilatero rf_seq_raddr_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_weight_raddr ; rf_seq_rrowaddr_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_weight_rrowaddr ; rf_seq_rlast_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_weight_rlast ; - rf_seq_rlast_row_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_weight_rlast_row ; rf_seq_rready_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_weight_rdata_ready; rf_seq_rd_id_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_W] = sa_input_id ; @@ -528,7 +518,6 @@ module quadrilatero rf_seq_raddr_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_data_raddr ; rf_seq_rrowaddr_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_data_rrowaddr ; rf_seq_rlast_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_data_rlast ; - rf_seq_rlast_row_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_data_rlast_row ; rf_seq_rready_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_data_rdata_ready ; rf_seq_rd_id_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_D] = sa_input_id ; @@ -536,7 +525,6 @@ module quadrilatero rf_seq_raddr_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_acc_raddr ; rf_seq_rrowaddr_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_acc_rrowaddr ; rf_seq_rlast_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_acc_rlast ; - rf_seq_rlast_row_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_acc_rlast_row ; rf_seq_rready_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_data_rdata_ready ; rf_seq_rd_id_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY_A] = sa_input_id ; @@ -544,7 +532,6 @@ module quadrilatero rf_seq_raddr_from_fu [quadrilatero_pkg::LSU_R ] = lsu_raddr ; rf_seq_rrowaddr_from_fu[quadrilatero_pkg::LSU_R ] = lsu_rrowaddr ; rf_seq_rlast_from_fu [quadrilatero_pkg::LSU_R ] = lsu_rlast ; - rf_seq_rlast_row_from_fu[quadrilatero_pkg::LSU_R ] = lsu_rlast_row ; rf_seq_rready_from_fu [quadrilatero_pkg::LSU_R ] = lsu_rready ; rf_seq_rd_id_from_fu [quadrilatero_pkg::LSU_R ] = lsu_id ; @@ -555,7 +542,6 @@ module quadrilatero 
rf_seq_wdata_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY] = sa_res_wdata ; rf_seq_we_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY] = sa_res_we ; rf_seq_wlast_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY] = sa_res_wlast ; - rf_seq_wlast_row_from_fu[quadrilatero_pkg::SYSTOLIC_ARRAY] = sa_res_wlast_row ; rf_seq_wr_id_from_fu [quadrilatero_pkg::SYSTOLIC_ARRAY] = sa_output_id ; // LSU Write Port @@ -564,7 +550,6 @@ module quadrilatero rf_seq_wdata_from_fu [quadrilatero_pkg::LSU_W ] = lsu_wdata ; rf_seq_we_from_fu [quadrilatero_pkg::LSU_W ] = lsu_we ; rf_seq_wlast_from_fu [quadrilatero_pkg::LSU_W ] = lsu_wlast ; - rf_seq_wlast_row_from_fu[quadrilatero_pkg::LSU_W ] = lsu_wlast_row ; rf_seq_wr_id_from_fu [quadrilatero_pkg::LSU_W ] = lsu_id ; // RF Exec Unit Write Port @@ -573,7 +558,6 @@ module quadrilatero rf_seq_wdata_from_fu [quadrilatero_pkg::RF_W ] = perm_unit_wdata ; rf_seq_we_from_fu [quadrilatero_pkg::RF_W ] = perm_unit_we ; rf_seq_wlast_from_fu [quadrilatero_pkg::RF_W ] = perm_unit_wlast ; - rf_seq_wlast_row_from_fu[quadrilatero_pkg::RF_W ] = perm_unit_wlast_row ; rf_seq_wr_id_from_fu [quadrilatero_pkg::RF_W ] = perm_unit_id ; end @@ -598,7 +582,6 @@ module quadrilatero .rvalid_o (rf_seq_rvalid_from_fu ), .rlast_i (rf_seq_rlast_from_fu ), .rready_i (rf_seq_rready_from_fu ), - .rlast_row_i (rf_seq_rlast_row_from_fu), .rd_id_i (rf_seq_rd_id_from_fu ), .waddr_i (rf_seq_waddr_from_fu ), @@ -607,7 +590,6 @@ module quadrilatero .we_i (rf_seq_we_from_fu ), .wlast_i (rf_seq_wlast_from_fu ), .wready_o (rf_seq_wready_from_fu ), - .wlast_row_i (rf_seq_wlast_row_from_fu), .wr_id_i (rf_seq_wr_id_from_fu ), // Outputs to RF @@ -711,7 +693,6 @@ module quadrilatero .data_rdata_valid_i (sa_data_rdata_valid ), .data_rdata_ready_o (sa_data_rdata_ready ), // unused .data_rlast_o (sa_data_rlast ), - .data_rlast_row_o (sa_data_rlast_row ), // Weight Read Register Port .weight_raddr_o (sa_weight_raddr ), @@ -720,7 +701,6 @@ module quadrilatero .weight_rdata_valid_i (sa_weight_rdata_valid ), 
.weight_rdata_ready_o (sa_weight_rdata_ready ), .weight_rlast_o (sa_weight_rlast ), - .weight_rlast_row_o (sa_weight_rlast_row ), // Accumulator Read Register Port .acc_raddr_o (sa_acc_raddr ), @@ -729,7 +709,6 @@ module quadrilatero .acc_rdata_valid_i (sa_acc_rdata_valid ), .acc_rdata_ready_o (sa_acc_rdata_ready ), .acc_rlast_o (sa_acc_rlast ), - .acc_rlast_row_o (sa_acc_rlast_row ), // Accumulator Out Write Register Port .res_waddr_o (sa_res_waddr ), @@ -737,7 +716,6 @@ module quadrilatero .res_wdata_o (sa_res_wdata ), .res_we_o (sa_res_we ), .res_wlast_o (sa_res_wlast ), - .res_wlast_row_o (sa_res_wlast_row ), .res_wready_i (sa_res_wready ), .sa_input_id_o (sa_input_id ), @@ -826,7 +804,6 @@ module quadrilatero .wdata_o (lsu_wdata ), .we_o (lsu_we ), .wlast_o (lsu_wlast ), - .wlast_row_o (lsu_wlast_row ), .wready_i (lsu_wready ), // Register Read Port for Store Unit @@ -836,7 +813,6 @@ module quadrilatero .rdata_valid_i (lsu_rvalid ), .rdata_ready_o (lsu_rready ), .rlast_o (lsu_rlast ), - .rlast_row_o (lsu_rlast_row ), // Configuration Signals .stride_i (lsu_ctrl_issued_instr.stride ), // stride value @@ -888,7 +864,6 @@ module quadrilatero .wdata_o (perm_unit_wdata ), .we_o (perm_unit_we ), .wlast_o (perm_unit_wlast ), - .wlast_row_o (perm_unit_wlast_row ), .wready_i (perm_unit_wready ), // Configuration Signals diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv index e3a908655..611502f70 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher.sv @@ -168,8 +168,8 @@ module quadrilatero_dispatcher #( delta = 3'b0; for(int ii = 0; ii < N_REGS; ii++) begin - delta += {2'b0, rw_queue_entry_o[ii].rvalid}; - delta += {2'b0, rw_queue_entry_o[ii].wready}; + delta += {2'b0, rvalid[ii]}; + delta += {2'b0, wready[ii]}; end done = (delta == outstanding_op_q); @@ -201,24 +201,7 @@ module quadrilatero_dispatcher #( 
outstanding_op_d = {1'b0,n_matrix_operands_read_i} + {2'b0, rf_writeback_i}; end - // always_comb begin: updated_next_value - // if((instr_ready || state_q==IDLE)) begin //we're ready to continue with the next instruction - // rreg_d = rf_read_regs_i; - // wreg_d = rf_writeback_i; - - // rs_d = rs_i; - // rs_valid_d = rs_valid_i; - // instr_id_d = instr_id_i; - // datatype_d = datatype_i; - // is_store_d = is_store_i; - // is_float_d = is_float_i; - // end - // dispatch_d = '0 ; - // dispatch_d[exec_unit_i] = instr_ready; - - // push_operandw_d = rf_writeback_i & instr_ready; - - // end + always_comb begin: rw_queue_block rvalid = '0; wready = '0; @@ -227,10 +210,9 @@ module quadrilatero_dispatcher #( rvalid[rreg_q[2]] |= reg3_valid &~ ld_reg3; wready[wreg_q ] = regw_valid &~ ld_regw; for(int ii = 0; ii < N_REGS; ii++) begin - rw_queue_entry_o[ii].rvalid = rvalid[ii]; - rw_queue_entry_o[ii].wready = wready[ii]; rw_queue_entry_o[ii].id = instr_id_q; - rw_queue_push_o [ii] = rw_queue_entry_o[ii].rvalid | rw_queue_entry_o[ii].wready; + rw_queue_push_o [ii] = rvalid[ii] | wready[ii]; + rw_queue_entry_o[ii].valid = rvalid[ii] | wready[ii]; end end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_new.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_old.sv similarity index 92% rename from hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_new.sv rename to hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_old.sv index c8194822a..e3a908655 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_new.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_old.sv @@ -168,8 +168,8 @@ module quadrilatero_dispatcher #( delta = 3'b0; for(int ii = 0; ii < N_REGS; ii++) begin - delta += {2'b0, rvalid[ii]}; - delta += {2'b0, wready[ii]}; + delta += {2'b0, rw_queue_entry_o[ii].rvalid}; + delta += {2'b0, rw_queue_entry_o[ii].wready}; end done = (delta == outstanding_op_q); @@ -201,7 +201,24 @@ module 
quadrilatero_dispatcher #( outstanding_op_d = {1'b0,n_matrix_operands_read_i} + {2'b0, rf_writeback_i}; end - + // always_comb begin: updated_next_value + // if((instr_ready || state_q==IDLE)) begin //we're ready to continue with the next instruction + // rreg_d = rf_read_regs_i; + // wreg_d = rf_writeback_i; + + // rs_d = rs_i; + // rs_valid_d = rs_valid_i; + // instr_id_d = instr_id_i; + // datatype_d = datatype_i; + // is_store_d = is_store_i; + // is_float_d = is_float_i; + // end + // dispatch_d = '0 ; + // dispatch_d[exec_unit_i] = instr_ready; + + // push_operandw_d = rf_writeback_i & instr_ready; + + // end always_comb begin: rw_queue_block rvalid = '0; wready = '0; @@ -210,8 +227,10 @@ module quadrilatero_dispatcher #( rvalid[rreg_q[2]] |= reg3_valid &~ ld_reg3; wready[wreg_q ] = regw_valid &~ ld_regw; for(int ii = 0; ii < N_REGS; ii++) begin + rw_queue_entry_o[ii].rvalid = rvalid[ii]; + rw_queue_entry_o[ii].wready = wready[ii]; rw_queue_entry_o[ii].id = instr_id_q; - rw_queue_push_o [ii] = rvalid[ii] | wready[ii]; + rw_queue_push_o [ii] = rw_queue_entry_o[ii].rvalid | rw_queue_entry_o[ii].wready; end end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv index 3206db206..16cfc298c 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv @@ -19,7 +19,6 @@ module quadrilatero_perm_unit #( output logic [ RLEN-1:0] wdata_o , output logic we_o , output logic wlast_o , - output logic wlast_row_o, input logic wready_i , // to stall the request in case the port is busy // Configuration Signals @@ -150,8 +149,7 @@ module quadrilatero_perm_unit #( assign wrowaddr_o = counter_q ; assign wdata_o = '0 ; assign we_o = write_started_q &~ mask_req; - assign wlast_o = finished ; - assign wlast_row_o = 1'b1; + assign wlast_o = write_started_q &~ mask_req ; assign busy_o = fifo_full | fifo_almost_full; assign id_o = 
id_q ; assign finished_o = finished_q ; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index 8f5c49e0e..d749279db 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -33,20 +33,18 @@ module quadrilatero_register_lsu #( // Register Write Port for load unit output logic [ $clog2(N_REGS)-1:0] waddr_o , output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [ LLEN-1:0] wdata_o , + output logic [quadrilatero_pkg::RLEN-1:0] wdata_o , output logic we_o , output logic wlast_o , - output logic wlast_row_o, input logic wready_i , // to stall the request in case the port is busy // Register Read Port for store unit output logic [ $clog2(N_REGS)-1:0] raddr_o , output logic [ $clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [ LLEN-1:0] rdata_i , + input logic [quadrilatero_pkg::RLEN-1:0] rdata_i , input logic rdata_valid_i , output logic rdata_ready_o , output logic rlast_o , - output logic rlast_row_o, // Configuration Signals input logic start_i , // start loading: MUST BE A PULSE @@ -68,7 +66,8 @@ module quadrilatero_register_lsu #( localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; localparam LastRow = $clog2(N_ROWS)'(N_ROWS - 1); - + localparam NumAccesses = quadrilatero_pkg::RLEN / LLEN; + typedef enum logic [1:0] { LSU_IDLE, LSU_LOAD, @@ -120,45 +119,54 @@ module quadrilatero_register_lsu #( logic [ 31:0] src_ptr ; logic [ 31:0] stride ; + logic [$clog2(NumAccesses)-1:0] access_counter_d; + logic [$clog2(NumAccesses)-1:0] access_counter_q; + + logic [quadrilatero_pkg::RLEN-1:0] load_row_buffer_d; + logic [quadrilatero_pkg::RLEN-1:0] load_row_buffer_q; + + logic [quadrilatero_pkg::RLEN-1:0] store_mask; + logic [quadrilatero_pkg::RLEN-1:0] load_mask; + assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; always_comb begin lsu_id_o = (write_i &~ load_fifo_data_available) ? 
instr_id_i : back_id_q; - finished = (write_q & terminate) | (~write_q & wlast_o & wready_i); + finished = (write_q & terminate) | (~write_q & (counter_q == LastRow) & wready_i); end always_comb begin: write_to_RF data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols - - we_o = load_fifo_data_available &~ mask_req; - waddr_o = waddr_q; + load_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); + we_o = load_fifo_data_available &~ mask_req && (access_counter_q == NumAccesses -1); // && ((access_counter_q == NumAccesses -1) || (lsu_state_q == LSU_LOAD && !load_fifo_valid)); //last part is sketchy + waddr_o = lsu_state_q == LSU_IDLE? waddr_d : waddr_q; wrowaddr_o = counter_q ; - wdata_o = load_fifo_data & ~data_mask; - wlast_row_o = 1'b1; + load_row_buffer_d = (load_row_buffer_q & ~load_mask) | (load_fifo_data << (LLEN * access_counter_q)); + wdata_o = load_row_buffer_d & ~data_mask; //watch out with load_row_buffer_d instead of load_row_buffer_q end always_comb begin: read_from_RF + store_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; rrowaddr_o = counter_q ; raddr_o = operand_reg_i ; - rlast_row_o = 1'b1; end always_comb begin: lsu_ctrl_block load_fifo_pop = wready_i; - store_fifo_data = rdata_i; + store_fifo_data = (rdata_i & store_mask) >> (LLEN * access_counter_q); store_fifo_push = rdata_ready_o && rdata_valid_i; lsu_ready = store_fifo_empty | (write_i &~ load_fifo_data_available &~ lsu_busy_q); start = (start_i | start_q) & lsu_ready; - busy_o = (write_i ? busy_d : busy ) | start_q; + busy_o = (write_i ? busy_d : busy) | start_q; stride = (start) ? stride_i : stride_q; src_ptr = (start) ? address_i : src_ptr_q; end always_comb begin: next_value - write_d = (write_i && rlast_o && rdata_valid_i) ? 
1'b1 : + write_d = (write_i && (counter_q == LastRow) && rdata_valid_i) ? 1'b1 : (!write_i && !busy) ? 1'b0 : write_q; start_d = start ? 1'b0 : @@ -167,12 +175,15 @@ module quadrilatero_register_lsu #( stride_d = (start) ? stride_i : stride_q ; src_ptr_d = (start) ? address_i : src_ptr_q; - busy_d = (write_i && rlast_o && rdata_valid_i) ? 1'b0 : + busy_d = (write_i && (counter_q == LastRow) && rdata_valid_i) ? 1'b0 : (write_i && start_i) ? 1'b1 : busy_q; end always_comb begin: fsm_block lsu_state_d = lsu_state_q; counter_d = counter_q; + access_counter_d = access_counter_q; + //load_row_buffer_d = load_row_buffer_q; + //we_o = 1'b0; rlast_o = 1'b0; wlast_o = 1'b0; @@ -181,48 +192,97 @@ module quadrilatero_register_lsu #( case (lsu_state_q) LSU_IDLE: begin - if(load_fifo_valid && !write_i) begin - counter_d = '0; - back_id_d = instr_id_i; - waddr_d = operand_reg_i; + back_id_d = instr_id_i; // was inside if + waddr_d = operand_reg_i; + if(load_fifo_valid && !write_i && wready_i) begin //checking for wready makes sense but somehow is wrong? + counter_d = counter_q + 1; + wlast_o = 1'b1; + //back_id_d = instr_id_i; was here + access_counter_d = '0; lsu_state_d = LSU_LOAD; end else if (write_i & store_fifo_space_available && rdata_valid_i) begin - counter_d = '0; + counter_d = counter_q + 1; + rlast_o = 1'b1; + access_counter_d = '0; lsu_state_d = LSU_STORE; end end LSU_LOAD: begin if(load_fifo_valid) begin + //maybe here wlast_o = 1'b1; ? 
if(wready_i) begin if(counter_q == LastRow) begin + if(access_counter_q == NumAccesses - 1) begin + wlast_o = 1'b1; + //we_o = 1'b1; + access_counter_d = '0; + counter_d = '0; + lsu_state_d = LSU_DONE; + back_id_d = instr_id_i; + waddr_d = operand_reg_i; + end else begin + access_counter_d = access_counter_q + 1; + end + end else begin + if(access_counter_q == NumAccesses - 1) begin + //we_o = 1'b1; + wlast_o = 1'b1; + access_counter_d = '0; + counter_d = counter_q + 1; + end else begin + access_counter_d = access_counter_q + 1; + end + end + end + // end else begin + // //wlast_o = 1'b1; // maybe wrong + // //we_o = 1'b1; + // counter_d = '0; + // lsu_state_d = LSU_DONE; + end else begin + if(write_i && wready_i) begin // transition from load to store + if(access_counter_q == NumAccesses - 1) begin counter_d = '0; wlast_o = 1'b1; lsu_state_d = LSU_DONE; + access_counter_d = '0; back_id_d = instr_id_i; waddr_d = operand_reg_i; + end else begin - counter_d = counter_q + 1; + access_counter_d = access_counter_q + 1; end end - end else begin - counter_d = '0; - lsu_state_d = LSU_DONE; + end end LSU_STORE: begin - if(store_fifo_space_available && write_i) begin - if(rdata_valid_i) begin + if(store_fifo_space_available && write_i && rdata_valid_i) begin + //if(rdata_valid_i) begin if(counter_q == LastRow) begin - counter_d = '0; - rlast_o = 1'b1; - lsu_state_d = LSU_DONE; - back_id_d = lsu_id_o; + if(access_counter_q == NumAccesses - 1) begin + rlast_o = 1'b1; + access_counter_d = '0; + counter_d = '0; + lsu_state_d = LSU_DONE; + back_id_d = instr_id_i; + waddr_d = operand_reg_i; + end else begin + access_counter_d = access_counter_q + 1; + end + end else begin - counter_d = counter_q + 1; + if(access_counter_q == NumAccesses - 1) begin + rlast_o = 1'b1; + access_counter_d = '0; + counter_d = counter_q + 1; + end else begin + access_counter_d = access_counter_q + 1; + end end - end - end else begin + end else begin // this case is very suspicious + //rlast_o = 1'b1; // 
maybe wrong counter_d = '0; back_id_d = instr_id_i; lsu_state_d = LSU_DONE; @@ -230,11 +290,24 @@ module quadrilatero_register_lsu #( end LSU_DONE: begin if(load_fifo_valid && !write_i && wready_i) begin - counter_d = counter_q + 1; - lsu_state_d = LSU_LOAD; - end else if (write_i & store_fifo_space_available && rdata_valid_i) begin - counter_d = counter_q + 1; - lsu_state_d = LSU_STORE; + if(access_counter_q == NumAccesses - 1) begin + access_counter_d = '0; + counter_d = counter_q + 1; + wlast_o = 1'b1; + //we_o = 1'b1; + lsu_state_d = LSU_LOAD; + end else begin + access_counter_d = access_counter_q + 1; + end + end else if (write_i && store_fifo_space_available && rdata_valid_i) begin + if(access_counter_q == NumAccesses - 1) begin + counter_d = counter_q + 1; + rlast_o = 1'b1; + lsu_state_d = LSU_STORE; + access_counter_d = '0; + end else begin + access_counter_d = access_counter_q + 1; + end end else begin lsu_state_d = LSU_IDLE; end @@ -260,6 +333,8 @@ module quadrilatero_register_lsu #( lsu_busy_q <= '0; src_ptr_q <= '0; stride_q <= '0; + access_counter_q <= '0; + load_row_buffer_q <= '0; end else begin counter_q <= counter_d; back_id_q <= back_id_d; @@ -272,6 +347,8 @@ module quadrilatero_register_lsu #( lsu_busy_q <= busy; src_ptr_q <= src_ptr_d; stride_q <= stride_d ; + access_counter_q <= access_counter_d; + load_row_buffer_q <= load_row_buffer_d; end end @@ -295,7 +372,7 @@ module quadrilatero_register_lsu #( //Configuration .start_i (start ), - .write_i , + .write_i (write_i), // & ~(lsu_state_q == LSU_LOAD && write_i) .busy_o (busy ), .terminate_o (terminate ), @@ -343,4 +420,7 @@ module quadrilatero_register_lsu #( "[quadrilatero_register_lsu] N_ROWS must be at least 2.\n" ); end + if ((NumAccesses & (NumAccesses - 1)) != 0) begin + $error("RLEN / LLEN must be a power of 2."); + end endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_broken.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_broken.sv new file 
mode 100644 index 000000000..b18f1a44a --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_broken.sv @@ -0,0 +1,420 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +/* +NOTE: for now we assume we fetch the entire row in 1 cycle. TODO: Change the number of columns and adapt this to arbitrary BUS_WIDTH parameters +NOTE: we are not handling difference in endianness when loading reduced datawidths +*/ + +module quadrilatero_register_lsu #( + parameter int unsigned BUS_WIDTH = 128, + parameter int unsigned N_REGS = 8, + parameter int unsigned N_ROWS = 4, + localparam int unsigned LLEN = BUS_WIDTH +) ( + input logic clk_i , + input logic rst_ni , + + // Bus interface + output logic data_req_o , + output logic [ 31:0] data_addr_o , + output logic data_we_o , + output logic [ BUS_WIDTH/8 - 1:0] data_be_o , + output logic [ BUS_WIDTH-1:0] data_wdata_o , + input logic data_gnt_i , + input logic data_rvalid_i , + input logic [ BUS_WIDTH-1:0] data_rdata_i , + + output logic[xif_pkg::X_ID_WIDTH-1:0] lsu_id_o , + + // Register Write Port for load unit + output logic [ $clog2(N_REGS)-1:0] waddr_o , + output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , + output logic [ LLEN-1:0] wdata_o , + output logic we_o , + output logic wlast_o , + input logic wready_i , // to stall the request in case the port is busy + + // Register Read Port for store unit + output logic [ $clog2(N_REGS)-1:0] raddr_o , + output logic [ $clog2(N_ROWS)-1:0] rrowaddr_o , + input logic [ LLEN-1:0] rdata_i , + input logic rdata_valid_i , + output logic rdata_ready_o , + output logic rlast_o , + + // Configuration Signals + input logic start_i , // start loading: MUST BE A PULSE + input logic write_i , + output logic busy_o , + input logic [ 31:0] stride_i , // stride value + input logic [ 31:0] address_i , // address value + input logic [ 
$clog2(N_REGS)-1:0] operand_reg_i , // destination register + input logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_i , // instruction id + input logic [ 31:0] n_bytes_cols_i , // we always fetch the entire row and then only take the elements we need + input logic [ 31:0] n_rows_i , + + + output logic finished_o , + input logic finished_ack_i , + output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o //instruction id out + +); + + localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; + localparam LastRow = $clog2(N_ROWS)'(N_ROWS - 1); + localparam NumAccesses = quadrilatero_pkg::RLEN / LLEN; + + typedef enum logic [1:0] { + LSU_IDLE, + LSU_LOAD, + LSU_STORE, + LSU_DONE + } register_lsu_state_e; + + register_lsu_state_e lsu_state_d, lsu_state_q; + + logic finished; + logic [xif_pkg::X_ID_WIDTH-1:0] back_id_q; + logic [xif_pkg::X_ID_WIDTH-1:0] back_id_d; + + logic [$clog2(N_ROWS)-1:0] counter_q; + logic [$clog2(N_ROWS)-1:0] counter_d; + logic [$clog2(N_REGS)-1:0] waddr_q; + logic [$clog2(N_REGS)-1:0] waddr_d; + + logic [LLEN-1:0] load_fifo_data; + + logic load_fifo_data_available; + logic load_fifo_pop; + + logic store_fifo_space_available; + logic store_fifo_push; + logic store_fifo_empty; + logic [LLEN-1:0] store_fifo_data; + + logic [LLEN-1:0] data_mask; + logic load_fifo_valid; + logic busy; + logic start; + logic start_q; + logic start_d; + + logic write_q; + logic write_d; + logic terminate; + logic busy_q; + logic busy_d; + + logic lsu_busy_q; + logic lsu_ready; + + logic [ 31:0] src_ptr_d ; + logic [ 31:0] stride_d ; + logic [ 31:0] src_ptr_q ; + logic [ 31:0] stride_q ; + logic [ 31:0] src_ptr ; + logic [ 31:0] stride ; + + logic [$clog2(NumAccesses)-1:0] access_counter_d; + logic [$clog2(NumAccesses)-1:0] access_counter_q; + + logic [quadrilatero_pkg::RLEN-1:0] load_row_buffer_d; + logic [quadrilatero_pkg::RLEN-1:0] load_row_buffer_q; + + logic [quadrilatero_pkg::RLEN-1:0] store_mask; + + assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; + 
always_comb begin + lsu_id_o = (write_i &~ load_fifo_data_available) ? instr_id_i : back_id_q; + finished = (write_q & terminate) | (~write_q & (counter_q == LastRow) & wready_i); + end + + + always_comb begin: write_to_RF + data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols + + we_o = load_fifo_data_available &~ mask_req && (access_counter_q == NumAccesses -1); // && ((access_counter_q == NumAccesses -1) || (lsu_state_q == LSU_LOAD && !load_fifo_valid)); //last part is sketchy + waddr_o = waddr_q; + wrowaddr_o = counter_q ; + load_row_buffer_d = load_row_buffer_q | (load_fifo_data << (LLEN * access_counter_q)); + wdata_o = load_row_buffer_d & ~data_mask; //watch out with load_row_buffer_d instead of load_row_buffer_q + + end + + always_comb begin: read_from_RF + store_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); + rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; + rrowaddr_o = counter_q ; + raddr_o = operand_reg_i ; + end + + always_comb begin: lsu_ctrl_block + load_fifo_pop = wready_i; + store_fifo_data = rdata_i & store_mask; + store_fifo_push = rdata_ready_o && rdata_valid_i; + lsu_ready = store_fifo_empty | (write_i &~ load_fifo_data_available &~ lsu_busy_q); + start = (start_i | start_q) & lsu_ready; + busy_o = (write_i ? busy_d : busy ) | start_q; + + stride = (start) ? stride_i : stride_q; + src_ptr = (start) ? address_i : src_ptr_q; + end + + always_comb begin: next_value + write_d = (write_i && (counter_q == LastRow) && rdata_valid_i) ? 1'b1 : + (!write_i && !busy) ? 1'b0 : write_q; + + start_d = start ? 1'b0 : + (start_q | start_i) ? 1'b1 : start_q; + + stride_d = (start) ? stride_i : stride_q ; + src_ptr_d = (start) ? address_i : src_ptr_q; + + busy_d = (write_i && (counter_q == LastRow) && rdata_valid_i) ? 1'b0 : + (write_i && start_i) ? 
1'b1 : busy_q; + end + always_comb begin: fsm_block + lsu_state_d = lsu_state_q; + counter_d = counter_q; + access_counter_d = access_counter_q; + //load_row_buffer_d = load_row_buffer_q; + //we_o = 1'b0; + rlast_o = 1'b0; + wlast_o = 1'b0; + + back_id_d = back_id_q; + waddr_d = waddr_q; + + case (lsu_state_q) + LSU_IDLE: begin + if(load_fifo_valid && !write_i && wready_i) begin //checking for wready makes sense but somehow is wrong? + wlast_o = 1'b1; + counter_d = counter_q + 1; + back_id_d = instr_id_i; + waddr_d = operand_reg_i; + access_counter_d = '0; + lsu_state_d = LSU_LOAD; + end else if (write_i & store_fifo_space_available && rdata_valid_i) begin + rlast_o = 1'b1; + counter_d = counter_q + 1; + access_counter_d = '0; + lsu_state_d = LSU_STORE; + end + + end + LSU_LOAD: begin + if(load_fifo_valid) begin + if(wready_i) begin + wlast_o = 1'b1; + if(counter_q == LastRow) begin + //if(access_counter_q == NumAccesses - 1) begin + + //we_o = 1'b1; + access_counter_d = '0; + counter_d = '0; + lsu_state_d = LSU_DONE; + back_id_d = instr_id_i; + waddr_d = operand_reg_i; + // end else begin + // access_counter_d = access_counter_q + 1; + end + else begin + //if(access_counter_q == NumAccesses - 1) begin + //we_o = 1'b1; + + access_counter_d = '0; + counter_d = counter_q + 1; + // end else begin + // access_counter_d = access_counter_q + 1; + end + end + + // end else begin + // //wlast_o = 1'b1; // maybe wrong + // //we_o = 1'b1; + // counter_d = '0; + // lsu_state_d = LSU_DONE; + end else begin + // if(access_counter_q == NumAccesses - 1) begin + //wlast_o = 1'b1; // very random but apparantly needed + //we_o = 1'b1; + access_counter_d = '0; + counter_d = '0; + lsu_state_d = LSU_DONE; + // end else begin + // access_counter_d = access_counter_q + 1; + // end + + end + end + LSU_STORE: begin + if(store_fifo_space_available && write_i && rdata_valid_i) begin + //if(rdata_valid_i) begin + rlast_o = 1'b1; + if(counter_q == LastRow) begin + //if(access_counter_q == 
NumAccesses - 1) begin + + access_counter_d = '0; + counter_d = '0; + lsu_state_d = LSU_DONE; + back_id_d = lsu_id_o; + // end else begin + // access_counter_d = access_counter_q + 1; + // end + + end else begin + //if(access_counter_q == NumAccesses - 1) begin + + access_counter_d = '0; + counter_d = counter_q + 1; + // end else begin + // access_counter_d = access_counter_q + 1; + // end + end + end else begin // this case is very suspicious + //rlast_o = 1'b1; // maybe wrong + counter_d = '0; + back_id_d = lsu_id_o; + lsu_state_d = LSU_DONE; + end + end + LSU_DONE: begin + if(load_fifo_valid && !write_i && wready_i) begin + //if(access_counter_q == NumAccesses - 1) begin + access_counter_d = '0; + counter_d = counter_q + 1; + wlast_o = 1'b1; + //we_o = 1'b1; + lsu_state_d = LSU_LOAD; + // end else begin + // access_counter_d = access_counter_q + 1; + // end + end else if (write_i && store_fifo_space_available && rdata_valid_i) begin + //if(access_counter_q == NumAccesses - 1) begin + counter_d = counter_q + 1; + rlast_o = 1'b1; + lsu_state_d = LSU_STORE; + access_counter_d = '0; + // end else begin + // access_counter_d = access_counter_q + 1; + // end + end else begin + lsu_state_d = LSU_IDLE; + end + end + default: begin + lsu_state_d = LSU_IDLE; + end + endcase + + end + + + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + counter_q <= '0; + waddr_q <= '0; + back_id_q <= '0; + start_q <= '0; + write_q <= '0; + busy_q <= '0; + lsu_state_q <= LSU_IDLE; + + lsu_busy_q <= '0; + src_ptr_q <= '0; + stride_q <= '0; + access_counter_q <= '0; + load_row_buffer_q <= '0; + end else begin + counter_q <= counter_d; + back_id_q <= back_id_d; + waddr_q <= waddr_d ; + start_q <= start_d ; + write_q <= write_d ; + busy_q <= busy_d ; + lsu_state_q <= lsu_state_d; + + lsu_busy_q <= busy; + src_ptr_q <= src_ptr_d; + stride_q <= stride_d ; + access_counter_q <= access_counter_d; + load_row_buffer_q <= load_row_buffer_d; + end + end + + 
quadrilatero_lsu #( + .FIFO_DEPTH (4 ), + .DATA_WIDTH (BUS_WIDTH) + ) lsunit_inst ( + + .clk_i , + .rst_ni , + + // Bus interface + .data_req_o , + .data_addr_o , + .data_we_o , + .data_be_o , + .data_wdata_o , + .data_gnt_i , + .data_rvalid_i , + .data_rdata_i , + + //Configuration + .start_i (start ), + .write_i , + .busy_o (busy ), + .terminate_o (terminate ), + + // Address + .src_ptr_i (src_ptr ), + .stride_i (stride ), + .cols_i (MAX_EL_PER_ROW ), + .rows_i (n_rows_i ), + + // Output data + .load_fifo_output_o (load_fifo_data ), + .load_fifo_valid_o (load_fifo_valid ), + .load_fifo_data_available_o (load_fifo_data_available ), + .load_fifo_output_pop_i (load_fifo_pop ), + + // Input data + .store_fifo_input_i (store_fifo_data ), + .store_fifo_push_i (store_fifo_push ), + .store_fifo_space_available_o (store_fifo_space_available ), + .store_fifo_empty_o (store_fifo_empty ) + ); + + //------------------------- + + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + finished_o <= '0; + finished_instr_id_o <= '0; + end else begin + if (finished) begin + finished_o <= '1; + finished_instr_id_o <= back_id_q; + end + if (finished_ack_i) begin + finished_o <= '0; + finished_instr_id_o <= '0; + end + end + end + //--------------------- + + // Assertions + if (N_ROWS < 2) begin + $error( + "[quadrilatero_register_lsu] N_ROWS must be at least 2.\n" + ); + end + if ((NumAccesses & (NumAccesses - 1)) != 0) begin + $error("RLEN / LLEN must be a power of 2."); + end +endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv index e928b9892..ab52e307e 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv @@ -27,7 +27,6 @@ module quadrilatero_rf_sequencer #( output logic [READ_PORTS-1:0] rvalid_o , input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) input 
logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) - input logic [READ_PORTS-1:0] rlast_row_i, // tells us if we're reading rrowaddr_i row for the last time (for now always 1) input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , @@ -37,7 +36,6 @@ module quadrilatero_rf_sequencer #( input logic [WRITE_PORTS-1:0] we_i , input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) output logic [WRITE_PORTS-1:0] wready_o , - input logic [WRITE_PORTS-1:0] wlast_row_i, // tells us if we're writing wrowaddr_i row for the last time (for now always 1) input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , // Outputs to RF @@ -80,10 +78,9 @@ module quadrilatero_rf_sequencer #( quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; - genvar ii,hh; - assign rw_queue_pop = w_pop | r_pop | ~head_valid; + assign rw_queue_pop = w_pop | r_pop | ~head_valid; //problem assign rw_queue_entry = rw_queue_entry_i ; assign rw_queue_push = rw_queue_push_i ; @@ -118,18 +115,19 @@ module quadrilatero_rf_sequencer #( for (int h = 0; h < N_ROWS; h++) begin rw_queue_full_o[i] |= (rw_queue_full[i][h]); - head_valid[i][h] = scoreboard_q[i][h].wready | scoreboard_q[i][h].rvalid; - + + head_valid[i][h] = scoreboard_q[i][h].valid; - scoreboard_d[i][h].id = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].id : scoreboard_q[i][h].id; + scoreboard_d[i][h] = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : + (rw_queue_pop[i][h] ) ? rw_queue[i][h] : scoreboard_q[i][h]; + - scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; + // scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : + // (rw_queue_pop[i][h] ) ? 
rw_queue[i][h].wready : scoreboard_q[i][h].wready; - scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : - (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; + // scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : + // (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : + // (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; end end end @@ -139,28 +137,23 @@ module quadrilatero_rf_sequencer #( rd_req = '0; w_pop = '0; r_pop = '0; - r_clr = '0; + r_clr = '0; for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request automatic int m = 32'(waddr_i[jj]); automatic int n = 32'(wrowaddr_i[jj]); - if( scoreboard_q[m][n].id == wr_id_i[jj] && - scoreboard_q[m][n].wready && we_i[jj] ) - begin - wr_req [jj] = ~scoreboard_q[m][n].rvalid; - w_pop [m][n] = wr_gnt[jj] && wlast_row_i[jj]; + if( scoreboard_q[m][n].id == wr_id_i[jj] && we_i[jj] && scoreboard_q[m][n].valid) begin + wr_req [jj] = 1'b1; + w_pop [m][n] = wlast_i[jj]; end end for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request automatic int m = 32'(raddr_i[jj]); automatic int n = 32'(rrowaddr_i[jj]); - if( scoreboard_q[m][n].id == rd_id_i[jj] && - scoreboard_q[m][n].rvalid && rready_i[jj] ) - begin - rd_req [jj] = 1'b1; - r_clr [m][n] = rd_gnt[jj]; - r_pop [m][n] = rd_gnt[jj] &~ scoreboard_q[m][n].wready && rlast_row_i[jj]; + if( scoreboard_q[m][n].id == rd_id_i[jj] && rready_i[jj] && scoreboard_q[m][n].valid) begin + rd_req [jj] = 1'b1; + r_pop [m][n] = rlast_i[jj] && (jj != quadrilatero_pkg::SYSTOLIC_ARRAY_A); // for SA_A port we can't pop on read end end @@ -192,12 +185,15 @@ module quadrilatero_rf_sequencer #( // end if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; + r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]] = 1'b0; end 
if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && same_id_D && block) begin rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = 1'b0; + r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]] = 1'b0; end if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && same_id_W && block) begin rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = 1'b0; + r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]] = 1'b0; end end end @@ -234,7 +230,7 @@ module quadrilatero_rf_sequencer #( wrowaddr_o = wrowaddr_i; wdata_o = wdata_i ; we_o = wr_gnt ; - wready_o = wr_gnt ; + wready_o = we_i ; end if(RF_READ_PORTS != READ_PORTS) begin: read_block_wArb diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_0bit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_0bit.sv new file mode 100644 index 000000000..c3e3d83bd --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_0bit.sv @@ -0,0 +1,292 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +module quadrilatero_rf_sequencer #( + parameter READ_PORTS = 4 , + parameter WRITE_PORTS = 2 , + parameter N_REGS = 8 , + parameter N_ROWS = 4 , + parameter RLEN = 128 , + parameter RF_READ_PORTS = 3 , + parameter RF_WRITE_PORTS = 1, + parameter SYNC_REQ = 1, + + parameter N_ENTRIES = 3 // entries in the FIFOs for each register +) ( + + input logic clk_i, + input logic rst_ni, + + // Input from FUs + input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i , + input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i , + output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o , + output logic [READ_PORTS-1:0] rvalid_o , + input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) + input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) + input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , + + + input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i , + input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i , + input logic [WRITE_PORTS-1:0][RLEN-1:0] wdata_i , + input logic [WRITE_PORTS-1:0] we_i , + input logic [WRITE_PORTS-1:0] wlast_i , // we can use this instead of wlast_row_i + output logic [WRITE_PORTS-1:0] wready_o , + input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , + + // Outputs to RF + output logic [RF_READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_o , + output logic [RF_READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_o , + input logic [RF_READ_PORTS-1:0][RLEN-1:0] rdata_i , + + + output logic [RF_WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_o , + output logic [RF_WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_o , + output logic [RF_WRITE_PORTS-1:0][RLEN-1:0] wdata_o , + output logic [RF_WRITE_PORTS-1:0] we_o , + + + // Inputs from Dispatcher + // We can share the entry as we fetch 1 instruction at a time + // NOTE: Actually maybe it's better to have more ports so that we can push all operands and not waste 
cycles + input quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry_i , + input logic [N_REGS-1:0] rw_queue_push_i , + + // Outputs to Dispatcher + output logic [N_REGS-1:0] rw_queue_full_o +); + + //logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; + logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_full ; + + logic [WRITE_PORTS-1:0] wr_gnt ; + logic [WRITE_PORTS-1:0] wr_req ; + logic [READ_PORTS -1:0] rd_req ; + logic [READ_PORTS -1:0] rd_gnt ; + + logic [N_REGS-1:0] rw_queue_push ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; + + logic starting; + + genvar ii,hh; + + assign rw_queue_pop = w_pop | r_pop | ~starting; + assign rw_queue_entry = rw_queue_entry_i ; + assign rw_queue_push = rw_queue_push_i ; + + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; + assign rw_queue_pop_fifo = rw_queue_pop & ~rw_queue_empty; + + for (ii = 0; ii < N_REGS; ii++) begin: gen_fifo__regs + for (hh = 0; hh < N_ROWS; hh++) begin: gen_fifo__rows + fifo_v3 #( + .FALL_THROUGH (1'b1) , + .DEPTH (N_ENTRIES) , + .dtype (quadrilatero_pkg::rw_queue_t) + ) issue_queue_inst ( + .clk_i, + .rst_ni, + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + .usage_o ( ), + .full_o (rw_queue_full [ii][hh] ), + .empty_o (rw_queue_empty[ii][hh] ), + .data_i (rw_queue_entry[ii] ), // data to push into the queue + .push_i (rw_queue_push [ii] ), // data is valid and can be pushed to the queue + .data_o (rw_queue [ii][hh] ), // output data + .pop_i (rw_queue_pop [ii][hh] & ~rw_queue_empty[ii][hh]) // pop head from queue + ); + end + end + + always_comb begin: scoreboard_block + rw_queue_full_o = '0; + for (int i = 0; i < N_REGS; 
i++) begin + for (int h = 0; h < N_ROWS; h++) begin + rw_queue_full_o[i] |= (rw_queue_full[i][h]); + + //head_valid[i][h] = scoreboard_q[i][h].wready | scoreboard_q[i][h].rvalid; + + + scoreboard_d[i][h].id = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : + (rw_queue_pop[i][h] ) ? rw_queue[i][h].id : scoreboard_q[i][h].id; + + // scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : + // (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; + + // scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : + // (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : + // (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; + end + end + end + + always_comb begin: ctrl_block + wr_req = '0; + rd_req = '0; + w_pop = '0; + r_pop = '0; + starting = '0; + + for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request + automatic int m = 32'(waddr_i[jj]); + automatic int n = 32'(wrowaddr_i[jj]); + if( scoreboard_q[m][n].id == wr_id_i[jj] && we_i) begin + starting |= 1'b1; + wr_req [jj] = 1'b1; + w_pop [m][n] = wlast_i[jj]; + end + end + + for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request + automatic int m = 32'(raddr_i[jj]); + automatic int n = 32'(rrowaddr_i[jj]); + if( scoreboard_q[m][n].id == rd_id_i[jj] && rready_i) begin + rd_req [jj] = 1'b1; + r_pop [m][n] = rlast_i[jj]; + end + end + + if(SYNC_REQ) begin: sa_sync_req + + logic block ; + logic same_id_acc; + logic same_id_A ; + logic same_id_D ; + logic same_id_W ; + + // same_id_acc = wr_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY ] == scoreboard_q[waddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]][wrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]].id; + same_id_A = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]].id; + same_id_D = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] == 
scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]].id; + same_id_W = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]].id; + + if( // (we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && !same_id_acc) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && !same_id_A ) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && !same_id_D ) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && !same_id_W ) + ) begin + block = 1'b1; + end else begin + block = 1'b0; + end + + // if(we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && same_id_acc && block) begin + // wr_req[quadrilatero_pkg::SYSTOLIC_ARRAY ] = 1'b0; + // end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; + end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && same_id_D && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = 1'b0; + end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && same_id_W && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = 1'b0; + end + end + end + + if(RF_WRITE_PORTS != WRITE_PORTS) begin: write_block_wArb + + quadrilatero_rr_arbiter #( + .NumActOut (RF_WRITE_PORTS) , + .N_ROWS (N_ROWS) , + .WIDTH (WRITE_PORTS) + ) wr_arb_i( + .clk_i , + .rst_ni , + .req_i (wr_req), + .grant_o (wr_gnt) + ); + always_comb begin: wdata_block + automatic int ll=0; + + wready_o = wr_gnt ; + for (int mm = 0; mm < WRITE_PORTS; mm++) begin + if(wr_gnt[mm]) begin + waddr_o [ll] = waddr_i [mm]; + wrowaddr_o[ll] = wrowaddr_i[mm]; + wdata_o [ll] = wdata_i [mm]; + we_o [ll] = we_i [mm]; + ll++; + end + end + end + end else always_comb begin : write_block_noArb + wr_gnt = wr_req ; + waddr_o = waddr_i ; + wrowaddr_o = wrowaddr_i; + wdata_o = wdata_i ; + we_o = wr_gnt ; + wready_o = wr_gnt ; //might need to be changed + end + + if(RF_READ_PORTS != READ_PORTS) begin: 
read_block_wArb + + quadrilatero_rr_arbiter #( + .NumActOut (RF_READ_PORTS) , + .N_ROWS (N_ROWS) , + .WIDTH (READ_PORTS) + ) rd_arb_i( + .clk_i , + .rst_ni , + .req_i (rd_req), + .grant_o (rd_gnt) + ); + + always_comb begin: rdata_block + automatic int ll=0; + + rvalid_o = rd_gnt; + for (int mm = 0; mm < READ_PORTS; mm++) begin + if(rd_gnt[mm]) begin + raddr_o [ll] = raddr_i [mm]; + rrowaddr_o[ll] = rrowaddr_i[mm]; + rdata_o [mm] = rdata_i [ll]; + ll++; + end else begin + rdata_o [mm] = rdata_i [RF_READ_PORTS-1]; + end + end + end + end else always_comb begin : read_block_noArb + rd_gnt = rd_req ; + raddr_o = raddr_i ; + rrowaddr_o = rrowaddr_i; + rdata_o = rdata_i ; + rvalid_o = rd_gnt ; //might need to be changed + end + + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + scoreboard_q <= '0; + end else begin + scoreboard_q <= scoreboard_d; + end + end + + //------------------------------------------------------------------------------------------------------- + + // Assertions + if (WRITE_PORTS < 2) begin + $error( + "[rf_sequencer] WRITE_PORTS must be at least 2.\n" + ); + end + if (READ_PORTS < 2) begin + $error( + "[rf_sequencer] READ_PORTS must be at least 2.\n" + ); + end +endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_2bit.sv similarity index 86% rename from hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv rename to hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_2bit.sv index 3f2fc907b..d25b6bf64 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_new.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_2bit.sv @@ -27,7 +27,6 @@ module quadrilatero_rf_sequencer #( output logic [READ_PORTS-1:0] rvalid_o , input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) - input 
logic [READ_PORTS-1:0] rlast_row_i, // tells us if we're reading rrowaddr_i row for the last time (for now always 1) input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , @@ -37,7 +36,6 @@ module quadrilatero_rf_sequencer #( input logic [WRITE_PORTS-1:0] we_i , input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) output logic [WRITE_PORTS-1:0] wready_o , - input logic [WRITE_PORTS-1:0] wlast_row_i, // tells us if we're writing wrowaddr_i row for the last time (for now always 1) input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , // Outputs to RF @@ -62,7 +60,7 @@ module quadrilatero_rf_sequencer #( output logic [N_REGS-1:0] rw_queue_full_o ); - //logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; + logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; @@ -83,7 +81,7 @@ module quadrilatero_rf_sequencer #( genvar ii,hh; - assign rw_queue_pop = w_pop | r_pop | ~scoreboard_q; + assign rw_queue_pop = w_pop | r_pop | ~head_valid; assign rw_queue_entry = rw_queue_entry_i ; assign rw_queue_push = rw_queue_push_i ; @@ -118,25 +116,25 @@ module quadrilatero_rf_sequencer #( for (int h = 0; h < N_ROWS; h++) begin rw_queue_full_o[i] |= (rw_queue_full[i][h]); - //head_valid[i][h] = scoreboard_q[i][h].wready | scoreboard_q[i][h].rvalid; + head_valid[i][h] = scoreboard_q[i][h].wready | scoreboard_q[i][h].rvalid; scoreboard_d[i][h].id = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : (rw_queue_pop[i][h] ) ? rw_queue[i][h].id : scoreboard_q[i][h].id; - //scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : - // (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; + scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : + (rw_queue_pop[i][h] ) ? 
rw_queue[i][h].wready : scoreboard_q[i][h].wready; - //scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : - // (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : - // (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; + scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : + (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : + (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; end end end always_comb begin: ctrl_block - wr_req = '0; - rd_req = '0; + wr_req = '0; + rd_req = '0; w_pop = '0; r_pop = '0; r_clr = '0; @@ -144,17 +142,23 @@ module quadrilatero_rf_sequencer #( for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request automatic int m = 32'(waddr_i[jj]); automatic int n = 32'(wrowaddr_i[jj]); - wr_req [jj] = we_i[jj] && (scoreboard_q[m][n].id == wr_id_i[jj]); //PROBLEM: we set write request even if we're not yet ready to write - w_pop [m][n] = wlast_row_i[jj] && (scoreboard_q[m][n].id == wr_id_i[jj]) && we_i[jj]; + if( scoreboard_q[m][n].id == wr_id_i[jj] && + scoreboard_q[m][n].wready && we_i[jj] ) + begin + wr_req [jj] = 1'b1; + w_pop [m][n] = wlast_i[jj]; + end end for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request automatic int m = 32'(raddr_i[jj]); automatic int n = 32'(rrowaddr_i[jj]); - r_pop [m][n] = rlast_row_i[jj] && ( scoreboard_q[m][n].id == rd_id_i[jj] && rready_i[jj]) ; - if(scoreboard_q[m][n].id == rd_id_i[jj] && rready_i[jj]) begin - rd_req [jj] = 1'b1; - r_clr [m][n] = rd_gnt[jj]; + if( scoreboard_q[m][n].id == rd_id_i[jj] && + scoreboard_q[m][n].rvalid && rready_i[jj] ) + begin + rd_req [jj] = 1'b1; + r_clr [m][n] = rd_gnt[jj]; + r_pop [m][n] = ~scoreboard_q[m][n].wready & rlast_i[jj]; end end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_kinda_working_1bit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_kinda_working_1bit.sv new file mode 100644 index 000000000..ab52e307e --- /dev/null +++ 
b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_kinda_working_1bit.sv @@ -0,0 +1,293 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +module quadrilatero_rf_sequencer #( + parameter READ_PORTS = 4 , + parameter WRITE_PORTS = 2 , + parameter N_REGS = 8 , + parameter N_ROWS = 4 , + parameter RLEN = 128 , + parameter RF_READ_PORTS = 3 , + parameter RF_WRITE_PORTS = 1, + parameter SYNC_REQ = 1, + + parameter N_ENTRIES = 3 // entries in the FIFOs for each register +) ( + + input logic clk_i, + input logic rst_ni, + + // Input from FUs + input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i , + input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i , + output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o , + output logic [READ_PORTS-1:0] rvalid_o , + input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) + input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) + input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , + + + input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i , + input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i , + input logic [WRITE_PORTS-1:0][RLEN-1:0] wdata_i , + input logic [WRITE_PORTS-1:0] we_i , + input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) + output logic [WRITE_PORTS-1:0] wready_o , + input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , + + // Outputs to RF + output logic [RF_READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_o , + output logic [RF_READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_o , + input logic [RF_READ_PORTS-1:0][RLEN-1:0] rdata_i , + + + output logic [RF_WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_o , + output logic [RF_WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_o , + output logic [RF_WRITE_PORTS-1:0][RLEN-1:0] wdata_o , + output logic [RF_WRITE_PORTS-1:0] we_o , + + + // 
Inputs from Dispatcher + // We can share the entry as we fetch 1 instruction at a time + // NOTE: Actually maybe it's better to have more ports so that we can push all operands and not waste cycles + input quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry_i , + input logic [N_REGS-1:0] rw_queue_push_i , + + // Outputs to Dispatcher + output logic [N_REGS-1:0] rw_queue_full_o +); + + logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; + logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] r_clr ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop ; + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_full ; + + logic [WRITE_PORTS-1:0] wr_gnt ; + logic [WRITE_PORTS-1:0] wr_req ; + logic [READ_PORTS -1:0] rd_req ; + logic [READ_PORTS -1:0] rd_gnt ; + + logic [N_REGS-1:0] rw_queue_push ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; + quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; + genvar ii,hh; + + assign rw_queue_pop = w_pop | r_pop | ~head_valid; //problem + assign rw_queue_entry = rw_queue_entry_i ; + assign rw_queue_push = rw_queue_push_i ; + + logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; + assign rw_queue_pop_fifo = rw_queue_pop & ~rw_queue_empty; + + for (ii = 0; ii < N_REGS; ii++) begin: gen_fifo__regs + for (hh = 0; hh < N_ROWS; hh++) begin: gen_fifo__rows + fifo_v3 #( + .FALL_THROUGH (1'b1) , + .DEPTH (N_ENTRIES) , + .dtype (quadrilatero_pkg::rw_queue_t) + ) issue_queue_inst ( + .clk_i, + .rst_ni, + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + .usage_o ( ), + .full_o (rw_queue_full [ii][hh] ), + .empty_o (rw_queue_empty[ii][hh] ), + .data_i (rw_queue_entry[ii] ), // data to push into the queue + .push_i (rw_queue_push [ii] ), // data is valid and can be pushed to the queue + .data_o (rw_queue 
[ii][hh] ), // output data + .pop_i (rw_queue_pop [ii][hh] & ~rw_queue_empty[ii][hh]) // pop head from queue + ); + end + end + + always_comb begin: scoreboard_block + rw_queue_full_o = '0; + for (int i = 0; i < N_REGS; i++) begin + for (int h = 0; h < N_ROWS; h++) begin + rw_queue_full_o[i] |= (rw_queue_full[i][h]); + + + head_valid[i][h] = scoreboard_q[i][h].valid; + + scoreboard_d[i][h] = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : + (rw_queue_pop[i][h] ) ? rw_queue[i][h] : scoreboard_q[i][h]; + + + // scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : + // (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; + + // scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : + // (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : + // (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; + end + end + end + + always_comb begin: ctrl_block + wr_req = '0; + rd_req = '0; + w_pop = '0; + r_pop = '0; + r_clr = '0; + + for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request + automatic int m = 32'(waddr_i[jj]); + automatic int n = 32'(wrowaddr_i[jj]); + if( scoreboard_q[m][n].id == wr_id_i[jj] && we_i[jj] && scoreboard_q[m][n].valid) begin + wr_req [jj] = 1'b1; + w_pop [m][n] = wlast_i[jj]; + end + end + + for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request + automatic int m = 32'(raddr_i[jj]); + automatic int n = 32'(rrowaddr_i[jj]); + if( scoreboard_q[m][n].id == rd_id_i[jj] && rready_i[jj] && scoreboard_q[m][n].valid) begin + rd_req [jj] = 1'b1; + r_pop [m][n] = rlast_i[jj] && (jj != quadrilatero_pkg::SYSTOLIC_ARRAY_A); // for SA_A port we can't pop on read + end + end + + if(SYNC_REQ) begin: sa_sync_req + + logic block ; + logic same_id_acc; + logic same_id_A ; + logic same_id_D ; + logic same_id_W ; + + // same_id_acc = wr_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY ] == scoreboard_q[waddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]][wrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]].id; 
+ same_id_A = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]].id; + same_id_D = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]].id; + same_id_W = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]].id; + + if( // (we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && !same_id_acc) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && !same_id_A ) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && !same_id_D ) || + (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && !same_id_W ) + ) begin + block = 1'b1; + end else begin + block = 1'b0; + end + + // if(we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && same_id_acc && block) begin + // wr_req[quadrilatero_pkg::SYSTOLIC_ARRAY ] = 1'b0; + // end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; + r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]] = 1'b0; + end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && same_id_D && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = 1'b0; + r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]] = 1'b0; + end + if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && same_id_W && block) begin + rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = 1'b0; + r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]] = 1'b0; + end + end + end + + if(RF_WRITE_PORTS != WRITE_PORTS) begin: write_block_wArb + + quadrilatero_rr_arbiter #( + .NumActOut (RF_WRITE_PORTS) , + .N_ROWS (N_ROWS) , + .WIDTH (WRITE_PORTS) + ) wr_arb_i( + .clk_i , + .rst_ni , + .req_i (wr_req), + .grant_o (wr_gnt) + ); + 
always_comb begin: wdata_block + automatic int ll=0; + + wready_o = wr_gnt ; + for (int mm = 0; mm < WRITE_PORTS; mm++) begin + if(wr_gnt[mm]) begin + waddr_o [ll] = waddr_i [mm]; + wrowaddr_o[ll] = wrowaddr_i[mm]; + wdata_o [ll] = wdata_i [mm]; + we_o [ll] = we_i [mm]; + ll++; + end + end + end + end else always_comb begin : write_block_noArb + wr_gnt = wr_req ; + waddr_o = waddr_i ; + wrowaddr_o = wrowaddr_i; + wdata_o = wdata_i ; + we_o = wr_gnt ; + wready_o = we_i ; + end + + if(RF_READ_PORTS != READ_PORTS) begin: read_block_wArb + + quadrilatero_rr_arbiter #( + .NumActOut (RF_READ_PORTS) , + .N_ROWS (N_ROWS) , + .WIDTH (READ_PORTS) + ) rd_arb_i( + .clk_i , + .rst_ni , + .req_i (rd_req), + .grant_o (rd_gnt) + ); + + always_comb begin: rdata_block + automatic int ll=0; + + rvalid_o = rd_gnt; + for (int mm = 0; mm < READ_PORTS; mm++) begin + if(rd_gnt[mm]) begin + raddr_o [ll] = raddr_i [mm]; + rrowaddr_o[ll] = rrowaddr_i[mm]; + rdata_o [mm] = rdata_i [ll]; + ll++; + end else begin + rdata_o [mm] = rdata_i [RF_READ_PORTS-1]; + end + end + end + end else always_comb begin : read_block_noArb + rd_gnt = rd_req ; + raddr_o = raddr_i ; + rrowaddr_o = rrowaddr_i; + rdata_o = rdata_i ; + rvalid_o = rd_gnt ; + end + + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + scoreboard_q <= '0; + end else begin + scoreboard_q <= scoreboard_d; + end + end + + //------------------------------------------------------------------------------------------------------- + + // Assertions + if (WRITE_PORTS < 2) begin + $error( + "[rf_sequencer] WRITE_PORTS must be at least 2.\n" + ); + end + if (READ_PORTS < 2) begin + $error( + "[rf_sequencer] READ_PORTS must be at least 2.\n" + ); + end +endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index cdf019b5d..4e67b7ed9 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ 
b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -37,37 +37,33 @@ module quadrilatero_systolic_array #( // Weight Read Register Port output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , output logic [ $clog2(N_ROWS)-1:0] weight_rrowaddr_o , - input logic [ ALEN-1:0] weight_rdata_i , + input logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_i , input logic weight_rdata_valid_i, output logic weight_rdata_ready_o, output logic weight_rlast_o , - output logic weight_rlast_row_o , // Data Read Register Port output logic [ $clog2(N_REGS)-1:0] data_raddr_o , output logic [ $clog2(N_ROWS)-1:0] data_rrowaddr_o , - input logic [ ALEN-1:0] data_rdata_i , + input logic [quadrilatero_pkg::RLEN-1:0] data_rdata_i , input logic data_rdata_valid_i , output logic data_rdata_ready_o , output logic data_rlast_o , - output logic data_rlast_row_o , // Accumulator Read Register Port output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , output logic [ $clog2(N_ROWS)-1:0] acc_rrowaddr_o , - input logic [ ALEN-1:0] acc_rdata_i , + input logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_i , input logic acc_rdata_valid_i , output logic acc_rdata_ready_o , output logic acc_rlast_o , - output logic acc_rlast_row_o , // Accumulator Out Write Register Port output logic [ $clog2(N_REGS)-1:0] res_waddr_o , output logic [ $clog2(N_ROWS)-1:0] res_wrowaddr_o , - output logic [ ALEN-1:0] res_wdata_o , + output logic [quadrilatero_pkg::RLEN-1:0] res_wdata_o , output logic res_we_o , output logic res_wlast_o , - output logic res_wlast_row_o , input logic res_wready_i , // RF Instruction ID @@ -99,6 +95,37 @@ module quadrilatero_systolic_array #( fs_state_e fs_state_d, fs_state_q; dr_state_e dr_state_d, dr_state_q; localparam LastRow = $clog2(MESH_WIDTH)'(MESH_WIDTH-1); + localparam RegLastRow = quadrilatero_pkg::RLEN/ ALEN; + localparam K = quadrilatero_pkg::RLEN / ALEN; + + logic [$clog2(K)-1:0] ff_k_counter_d; + logic [$clog2(K)-1:0] ff_k_counter_q; + logic [$clog2(K)-1:0] dr_k_counter_d; + logic 
[$clog2(K)-1:0] dr_k_counter_q; + logic [$clog2(K)-1:0] ff_it_counter_d; + logic [$clog2(K)-1:0] ff_it_counter_q; + logic [$clog2(K)-1:0] dr_it_counter_d; + logic [$clog2(K)-1:0] dr_it_counter_q; + logic [$clog2(K)-1:0] ff_row_counter_d; + logic [$clog2(K)-1:0] ff_row_counter_q; + logic [$clog2(K)-1:0] dr_row_counter_d; + logic [$clog2(K)-1:0] dr_row_counter_q; + logic [$clog2(quadrilatero_pkg::RLEN/DATA_WIDTH)-1:0] weight_base_row; + + // Data Masks + logic [quadrilatero_pkg::RLEN-1:0] data_mask; + logic [quadrilatero_pkg::RLEN-1:0] weight_mask; + logic [quadrilatero_pkg::RLEN-1:0] acc_mask; + //logic [quadrilatero_pkg::RLEN-1:0] res_mask; + + logic [ALEN-1:0] data_rdata_masked; + logic [ALEN-1:0] weight_rdata_masked; + logic [ALEN-1:0] acc_rdata_masked; + logic [ALEN-1:0] res_wdata_partial; + logic [quadrilatero_pkg::RLEN-1:0] res_wdata_buffer_d; + logic [quadrilatero_pkg::RLEN-1:0] res_wdata_buffer_q; + + logic valid ; logic clear ; logic pump ; @@ -147,44 +174,49 @@ module quadrilatero_systolic_array #( always_comb begin: rf_block // Weight Read Register Port + weight_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); + weight_base_row = N_ROWS * ff_it_counter_q; weight_raddr_o = weight_reg_q ; - weight_rrowaddr_o = ff_counter_q ; + weight_rrowaddr_o = ff_counter_q + weight_base_row; + weight_rdata_masked = (weight_rdata_i & weight_mask) >> ALEN * ff_k_counter_q; weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - weight_rlast_o = ff_counter_q==LastRow; - weight_rlast_row_o = 1'b1; + weight_rlast_o = (ff_state_q != FF_IDLE) && ff_k_counter_q == (K-1) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? 
// Data Read Register Port + data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} >> (ALEN * ff_it_counter_q); data_raddr_o = data_reg_q ; - data_rrowaddr_o = ff_counter_q ; + data_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; + data_rdata_masked = (data_rdata_i & data_mask) >> ALEN * ff_it_counter_q; data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - data_rlast_o = ff_counter_q==LastRow; - data_rlast_row_o = 1'b1; + data_rlast_o = ff_state_q != FF_IDLE && ff_it_counter_q == (K-1) ; // Accumulator Read Register Port + acc_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); acc_raddr_o = acc_reg_q ; - acc_rrowaddr_o = ff_counter_q ; + acc_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; + acc_rdata_masked = (acc_rdata_i & acc_mask) >> ALEN * ff_k_counter_q; acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - acc_rlast_o = ff_counter_q==LastRow; - acc_rlast_row_o = 1'b1; + acc_rlast_o = '0 ; // Accumulator Out Write Register Port res_waddr_o = dest_reg_q ; - res_wrowaddr_o = dr_counter_q ; + res_wrowaddr_o = dr_counter_q + (dr_row_counter_q * N_ROWS) ; + res_wdata_o = res_wdata_buffer_q | res_wdata_partial << ALEN * dr_k_counter_q; res_we_o = (dr_state_q == DR_ACTIVE) &~ mask_req ; - res_wlast_o = dr_counter_q==LastRow; - res_wlast_row_o = 1'b1; + res_wlast_o = (dr_state_q != DR_IDLE) && dr_it_counter_q == (K-1) ; end always_comb begin: finished_signal - finished_d = (res_wready_i && res_wlast_o) ? 1'b1 : + finished_d = (res_wready_i && (dr_counter_q == LastRow)) ? 1'b1 : (finished_ack_i ) ? 1'b0 : finished_q; - finished_instr_id_d = (res_wready_i && res_wlast_o) ? id_dr_q : + finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow)) ? id_dr_q : (finished_ack_i ) ? 
'0 : finished_instr_id_q; end always_comb begin: ctrl_block + res_wdata_buffer_d = res_wdata_buffer_q; valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q != DR_ACTIVE)) begin clear = 1'b1; @@ -207,10 +239,16 @@ module quadrilatero_systolic_array #( weight_reg_d = weight_reg_q; sa_ctrl_d = sa_ctrl_q; id_ff_d = id_ff_q; + ff_k_counter_d = ff_k_counter_q; + ff_it_counter_d = ff_it_counter_q; + ff_row_counter_d = ff_row_counter_q; unique case (ff_state_q) FF_IDLE: begin ff_counter_d = '0; + ff_it_counter_d = '0; + ff_row_counter_d = '0; + ff_k_counter_d = '0; if(start_i == 1'b1) begin ff_state_d = FF_ACTIVE; data_reg_d = data_reg_i; @@ -229,25 +267,41 @@ module quadrilatero_systolic_array #( ff_counter_d = ff_counter_q + 1; end end - - end + FF_DONE: begin if(start_i == 1'b1) begin ff_counter_d = '0; ff_state_d = FF_ACTIVE; - - data_reg_d = data_reg_i; - acc_reg_d = acc_reg_i; - weight_reg_d = weight_reg_i; - sa_ctrl_d = sa_ctrl_i; - id_ff_d = id_i; + if(ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1)) begin // get inputs from new instruction + ff_it_counter_d = '0; + ff_row_counter_d = '0; + ff_k_counter_d = '0; + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end else begin + if(ff_row_counter_q == RegLastRow-1) begin + ff_row_counter_d = '0; + if(ff_k_counter_q == (K-1)) begin + ff_k_counter_d = '0; + ff_it_counter_d = ff_it_counter_q + 1; + end else begin + ff_k_counter_d = ff_k_counter_q + 1; + end + end else begin + ff_row_counter_d = ff_row_counter_q + 1; + end + end + end else begin ff_counter_d = '0; ff_state_d = FF_IDLE; end - end + default: begin ff_state_d = FF_IDLE; end @@ -307,6 +361,9 @@ module quadrilatero_systolic_array #( always_comb begin : dr_fsm_block dr_state_d = dr_state_q; dr_counter_d = dr_counter_q; + dr_k_counter_d = dr_k_counter_q; 
+ dr_it_counter_d = dr_it_counter_q; + dr_row_counter_d = dr_row_counter_q; dest_reg_d = dest_reg_q; id_dr_d = id_dr_q; @@ -334,6 +391,24 @@ module quadrilatero_systolic_array #( dr_state_d = DR_ACTIVE; dest_reg_d = acc_fs_q; id_dr_d = id_fs_q; + //update DR counters + if(dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow - 1) && dr_k_counter_q == (K-1)) begin + dr_it_counter_d = '0; + dr_row_counter_d = '0; + dr_k_counter_d = '0; + end else begin + if(dr_row_counter_q == RegLastRow-1) begin + dr_row_counter_d = '0; + if(dr_k_counter_q == (K-1)) begin + dr_k_counter_d = '0; + dr_it_counter_d = dr_it_counter_q + 1; + end else begin + dr_k_counter_d = dr_k_counter_q + 1; + end + end else begin + dr_row_counter_d = dr_row_counter_q + 1; + end + end end if(fs_state_q == FS_IDLE) begin dr_state_d = DR_DONE; @@ -364,7 +439,7 @@ module quadrilatero_systolic_array #( .clk_i , .rst_ni , .pump_i (pump ), - .data_i (data_rdata_i ), + .data_i (data_rdata_masked ), .data_o (data_mesh_skewed) ); @@ -375,7 +450,7 @@ module quadrilatero_systolic_array #( .clk_i , .rst_ni , .pump_i (pump ), - .data_i (acc_rdata_i ), + .data_i (acc_rdata_masked ), .data_o (acc_mesh_skewed) ); @@ -403,7 +478,7 @@ module quadrilatero_systolic_array #( .weight_rdata_valid_i , // Weight Data - .weight_rdata_i , + .weight_rdata_i (weight_rdata_masked ), .weight_rdata_o (weight_mesh_skewed ) ); @@ -432,7 +507,7 @@ module quadrilatero_systolic_array #( .rst_ni , .pump_i (pump ), .data_i (res_mesh_skewed), - .data_o (res_wdata_o ) + .data_o (res_wdata_partial ) ); always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block @@ -454,6 +529,13 @@ module quadrilatero_systolic_array #( id_dr_q <= '0; finished_q <= '0; finished_instr_id_q <= '0; + ff_k_counter_q <= '0; + dr_k_counter_q <= '0; + ff_it_counter_q <= '0; + dr_it_counter_q <= '0; + ff_row_counter_q <= '0; + dr_row_counter_q <= '0; + res_wdata_buffer_q <= '0; end else begin ff_counter_q <= ff_counter_d ; fs_counter_q <= fs_counter_d ; @@ 
-472,6 +554,13 @@ module quadrilatero_systolic_array #( id_dr_q <= id_dr_d ; finished_q <= finished_d ; finished_instr_id_q <= finished_instr_id_d ; + ff_k_counter_q <= ff_k_counter_d; + dr_k_counter_q <= dr_k_counter_d; + ff_it_counter_q <= ff_it_counter_d; + dr_it_counter_q <= dr_it_counter_d; + ff_row_counter_q <= ff_row_counter_d ; + dr_row_counter_q <= dr_row_counter_d ; + res_wdata_buffer_q <= res_wdata_buffer_d ; end end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv new file mode 100644 index 000000000..cde315829 --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv @@ -0,0 +1,484 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +/* + +TODO: +- handle matrices operations with matrices < MESH_WIDTH based on the configuration CSRs + - basically you need to inject zeros instead of actual elements +*/ + +module quadrilatero_systolic_array #( + parameter int MESH_WIDTH = 4 , + parameter int DATA_WIDTH = 32 , + parameter int N_REGS = 8 , + parameter int ENABLE_SIMD = 1 , + localparam int N_ROWS = MESH_WIDTH , + localparam int ALEN = DATA_WIDTH * MESH_WIDTH, + parameter FPU = 1 +) ( + input logic clk_i , + input logic rst_ni , + + output logic sa_ready_o , + input logic start_i , + + // Only has effect if ENABLE_SIMD == 1 + input quadrilatero_pkg::sa_ctrl_t sa_ctrl_i , + + input logic [ $clog2(N_REGS)-1:0] data_reg_i , // data register + input logic [ $clog2(N_REGS)-1:0] acc_reg_i , // accumulator register + input logic [ $clog2(N_REGS)-1:0] weight_reg_i , // weight register + input logic [xif_pkg::X_ID_WIDTH-1:0] id_i , // id of the instruction + + // Weight Read Register Port + output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , + output logic [ $clog2(N_ROWS)-1:0] weight_rrowaddr_o , + input 
logic [ ALEN-1:0] weight_rdata_i , + input logic weight_rdata_valid_i, + output logic weight_rdata_ready_o, + output logic weight_rlast_o , + + // Data Read Register Port + output logic [ $clog2(N_REGS)-1:0] data_raddr_o , + output logic [ $clog2(N_ROWS)-1:0] data_rrowaddr_o , + input logic [ ALEN-1:0] data_rdata_i , + input logic data_rdata_valid_i , + output logic data_rdata_ready_o , + output logic data_rlast_o , + + // Accumulator Read Register Port + output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , + output logic [ $clog2(N_ROWS)-1:0] acc_rrowaddr_o , + input logic [ ALEN-1:0] acc_rdata_i , + input logic acc_rdata_valid_i , + output logic acc_rdata_ready_o , + output logic acc_rlast_o , + + // Accumulator Out Write Register Port + output logic [ $clog2(N_REGS)-1:0] res_waddr_o , + output logic [ $clog2(N_ROWS)-1:0] res_wrowaddr_o , + output logic [ ALEN-1:0] res_wdata_o , + output logic res_we_o , + output logic res_wlast_o , + input logic res_wready_i , + + // RF Instruction ID + output logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id_o , + output logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id_o , + + // Finish + output logic finished_o , + input logic finished_ack_i , + output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o +); + typedef enum logic [1:0]{ + FS_IDLE, + FS_ACTIVE, + FS_LAST + } fs_state_e; + typedef enum logic [1:0]{ + FF_IDLE, + FF_ACTIVE, + FF_DONE + } ff_state_e; + typedef enum logic [1:0]{ + DR_IDLE, + DR_ACTIVE, + DR_DONE + } dr_state_e; + + ff_state_e ff_state_d, ff_state_q; + fs_state_e fs_state_d, fs_state_q; + dr_state_e dr_state_d, dr_state_q; + localparam LastRow = $clog2(MESH_WIDTH)'(MESH_WIDTH-1); + logic valid ; + logic clear ; + logic pump ; + logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; + logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_q ; + logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_q ; + 
+ logic [ $clog2(N_REGS)-1:0] data_reg_d ; // Data register + logic [ $clog2(N_REGS)-1:0] data_reg_q ; // Data register + logic [ $clog2(N_REGS)-1:0] acc_reg_d ; // Accumulator register -- FF Stage + logic [ $clog2(N_REGS)-1:0] acc_reg_q ; // Accumulator register -- FF Stage + logic [ $clog2(N_REGS)-1:0] weight_reg_q ; // Weight register + logic [ $clog2(N_REGS)-1:0] weight_reg_d ; // Weight register + quadrilatero_pkg::sa_ctrl_t sa_ctrl_d ; + quadrilatero_pkg::sa_ctrl_t sa_ctrl_q ; + + logic [ $clog2(N_REGS)-1:0] acc_fs_q ; // Accumulator register -- FS Stage + logic [ $clog2(N_REGS)-1:0] acc_fs_d ; // Accumulator register -- FS Stage + logic [ $clog2(N_REGS)-1:0] dest_reg_q ; // Accumulator register -- DR Stage + logic [ $clog2(N_REGS)-1:0] dest_reg_d ; // Accumulator register -- DR Stage + + logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_q ; + + logic finished_d ; + logic finished_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_d; + logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_q; + logic mask_req ; + + quadrilatero_pkg::sa_ctrl_t [MESH_WIDTH-1:0] sa_ctrl_mesh_skewed; + + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] data_mesh_skewed ; + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] acc_mesh_skewed ; + logic [MESH_WIDTH-1:0][MESH_WIDTH-1:0][DATA_WIDTH-1:0] weight_mesh_skewed ; + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] res_mesh_skewed ; + + //--------------------------------------------------------------------- + + always_comb begin: rf_block + // Weight Read Register Port + weight_raddr_o = weight_reg_q ; + weight_rrowaddr_o = ff_counter_q ; + weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + weight_rlast_o = (ff_state_q != FF_IDLE) ; + + // Data Read Register Port + data_raddr_o = data_reg_q ; + data_rrowaddr_o = ff_counter_q ; + 
data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + data_rlast_o = ff_state_q != FF_IDLE ; + + // Accumulator Read Register Port + acc_raddr_o = acc_reg_q ; + acc_rrowaddr_o = ff_counter_q ; + acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + acc_rlast_o = '0 ; + + // Accumulator Out Write Register Port + res_waddr_o = dest_reg_q ; + res_wrowaddr_o = dr_counter_q ; + res_we_o = (dr_state_q == DR_ACTIVE) &~ mask_req ; + res_wlast_o = dr_state_q != DR_IDLE ; + end + + always_comb begin: finished_signal + + finished_d = (res_wready_i && (dr_counter_q == LastRow)) ? 1'b1 : + (finished_ack_i ) ? 1'b0 : finished_q; + + finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow)) ? id_dr_q : + (finished_ack_i ) ? '0 : finished_instr_id_q; + end + + always_comb begin: ctrl_block + valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; + if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q != DR_ACTIVE)) begin + clear = 1'b1; + end else begin + clear = 1'b0; + end + if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q == DR_ACTIVE)) begin + pump = 1'b1; + end else begin + pump = 1'b0; + end + mask_req = (dr_counter_q==LastRow) & finished_q & ~finished_ack_i; + end + + always_comb begin : ff_fsm_block + ff_counter_d = ff_counter_q; + ff_state_d = ff_state_q; + data_reg_d = data_reg_q; + acc_reg_d = acc_reg_q; + weight_reg_d = weight_reg_q; + sa_ctrl_d = sa_ctrl_q; + id_ff_d = id_ff_q; + + unique case (ff_state_q) + FF_IDLE: begin + ff_counter_d = '0; + if(start_i == 1'b1) begin + ff_state_d = FF_ACTIVE; + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end + end + FF_ACTIVE: begin + if(valid == 1'b1) begin + if(ff_counter_q==(LastRow-1)) begin + ff_counter_d = ff_counter_q + 1; + ff_state_d = FF_DONE; + end else begin + ff_counter_d = ff_counter_q + 1; + end + end + + + end + FF_DONE: begin + if(start_i == 
1'b1) begin + ff_counter_d = '0; + ff_state_d = FF_ACTIVE; + + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end else begin + ff_counter_d = '0; + ff_state_d = FF_IDLE; + end + + end + default: begin + ff_state_d = FF_IDLE; + end + endcase + end + always_comb begin : fs_fsm_block + fs_counter_d = fs_counter_q; + fs_state_d = fs_state_q; + + acc_fs_d = acc_fs_q; + id_fs_d = id_fs_q; + + unique case(fs_state_q) + FS_IDLE: begin + fs_counter_d = '0; + if(ff_state_q == FF_DONE ) begin + fs_state_d = FS_ACTIVE; + + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + end + + end + FS_ACTIVE: begin + if(clear == 1'b1) begin + fs_counter_d = '0; + fs_state_d = FS_IDLE; + end else begin + if(fs_counter_q == LastRow-2) begin + fs_counter_d = fs_counter_q + 1; + fs_state_d = FS_LAST; + end else begin + fs_counter_d = fs_counter_q + 1; + end + end + end + FS_LAST: begin + fs_counter_d = '0; + if(ff_state_q == FF_DONE) begin //stay in active mode, load new inputs + fs_state_d = FS_ACTIVE; + + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + end + if(ff_state_q == FF_IDLE) begin + fs_state_d = FS_IDLE; + end + + end + default: begin + fs_state_d = FS_IDLE; + end + + endcase + end + + always_comb begin : dr_fsm_block + dr_state_d = dr_state_q; + dr_counter_d = dr_counter_q; + + dest_reg_d = dest_reg_q; + id_dr_d = id_dr_q; + unique case(dr_state_q) + DR_IDLE: begin + dr_counter_d = '0; + if(fs_state_q == FS_LAST) begin //fs_counter_d == LastRow ) && (fs_counter_q == LastRow - 1 + dr_state_d = DR_ACTIVE; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + + end + DR_ACTIVE: begin + if(clear == 1'b1) begin + dr_counter_d = '0; + dr_state_d = DR_IDLE; + end else begin + if(dr_counter_q == LastRow) begin + if(res_wready_i == 1'b0) begin + dr_state_d = dr_state_q; //stall the pipeline + end else begin + dr_counter_d = '0; + if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs (fs_counter_d == 
LastRow - 1 ) && (fs_counter_q == LastRow - 2) + dr_state_d = DR_ACTIVE; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + if(fs_state_q == FS_IDLE) begin + dr_state_d = DR_DONE; + end + end + end else begin + dr_counter_d = dr_counter_q + 1; + end + end + + end + DR_DONE: begin + dr_state_d = DR_IDLE; + end + default: begin + dr_state_d = DR_IDLE; + end + + endcase + + + end + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) skewer_inst_data ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (data_rdata_i ), + .data_o (data_mesh_skewed) + ); + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) skewer_inst_acc ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (acc_rdata_i ), + .data_o (acc_mesh_skewed) + ); + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(4) + ) skewer_inst_ctrl ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i ({MESH_WIDTH{sa_ctrl_q}}), + .data_o (sa_ctrl_mesh_skewed ) + ); + + quadrilatero_wl_stage #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) weight_inst ( + .clk_i , + .rst_ni , + + .ff_counter (ff_counter_q ), + .clear_i (clear ), + .pump_i (pump ), + .weight_rdata_valid_i , + + // Weight Data + .weight_rdata_i , + .weight_rdata_o (weight_mesh_skewed ) + ); + + quadrilatero_mesh #( + .MESH_WIDTH (MESH_WIDTH ), + .ENABLE_SIMD(ENABLE_SIMD), + .FPU (FPU ) + ) mesh_inst ( + .clk_i, + .rst_ni, + + .pump_i (pump ), + .sa_ctrl_i (sa_ctrl_mesh_skewed ), + + .data_i (data_mesh_skewed ), + .acc_i (acc_mesh_skewed ), + .weight_i (weight_mesh_skewed ), + .acc_o (res_mesh_skewed ) + ); + + quadrilatero_deskewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) deskewer_inst_acc ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (res_mesh_skewed), + .data_o (res_wdata_o ) + ); + + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + ff_counter_q <= '0; + fs_counter_q <= '0; + dr_counter_q <= '0; + 
ff_state_q <= FF_IDLE; + fs_state_q <= FS_IDLE; + dr_state_q <= DR_IDLE; + data_reg_q <= '0; + acc_reg_q <= '0; + weight_reg_q <= '0; + sa_ctrl_q <= '0; + acc_fs_q <= '0; + dest_reg_q <= '0; + id_ff_q <= '0; + id_fs_q <= '0; + id_dr_q <= '0; + finished_q <= '0; + finished_instr_id_q <= '0; + end else begin + ff_counter_q <= ff_counter_d ; + fs_counter_q <= fs_counter_d ; + dr_counter_q <= dr_counter_d ; + ff_state_q <= ff_state_d; + fs_state_q <= fs_state_d; + dr_state_q <= dr_state_d; + data_reg_q <= data_reg_d ; + acc_reg_q <= acc_reg_d ; + weight_reg_q <= weight_reg_d ; + sa_ctrl_q <= sa_ctrl_d ; + acc_fs_q <= acc_fs_d ; + dest_reg_q <= dest_reg_d ; + id_ff_q <= id_ff_d ; + id_fs_q <= id_fs_d ; + id_dr_q <= id_dr_d ; + finished_q <= finished_d ; + finished_instr_id_q <= finished_instr_id_d ; + end + end + + assign sa_ready_o = (ff_counter_d=='0) & (((ff_state_q != FF_IDLE) &~ ff_counter_q=='0) | clear); + assign sa_input_id_o = id_ff_q ; + assign sa_output_id_o = id_dr_q ; + assign finished_o = finished_q ; + assign finished_instr_id_o = finished_instr_id_q; + + // -------------------------------------------------------------------- + + // Assertions + if (MESH_WIDTH < 2) begin + $error( + "[systolic_array] MESH_WIDTH must be at least 2.\n" + ); + end +endmodule diff --git a/scripts/sim/modelsim/patch_modelsim_Makefile.py b/scripts/sim/modelsim/patch_modelsim_Makefile.py index cfe3b0ab7..79a020084 100644 --- a/scripts/sim/modelsim/patch_modelsim_Makefile.py +++ b/scripts/sim/modelsim/patch_modelsim_Makefile.py @@ -26,8 +26,7 @@ 'ifdef RUN_UPF'+ "\n" + \ ' RUN_UPF_OPTIONS := -pa' + "\n" + \ 'endif'+ "\n\n"); -string_replaced.append('EXTRA_OPTIONS ?= -t 1ps -voptargs=+acc $(VSIM_OPTIONS) $(addprefix -g,$(PARAMETERS)) $(addprefix +,$(PLUSARGS)) $(RUN_UPF_OPTIONS)') - +string_replaced.append('EXTRA_OPTIONS ?= -do "do /scratch2/bsc25f1/x-heep/waves.do; run 5000 us" -t 1ps -voptargs=+acc $(VSIM_OPTIONS) $(addprefix -g,$(PARAMETERS)) $(addprefix +,$(PLUSARGS)) 
$(RUN_UPF_OPTIONS)') string_toappend = [] From 59633a359a9dad22ea2351fab06365c5217771ba Mon Sep 17 00:00:00 2001 From: Angelo Nujic Date: Wed, 9 Apr 2025 18:07:21 +0200 Subject: [PATCH 09/18] 4x4 working clean --- .../rtl/include/quadrilatero_pkg_old.sv | 120 ----- .../rtl/quadrilatero_dispatcher_old.sv | 340 ------------ .../rtl/quadrilatero_register_lsu.sv | 25 +- .../rtl/quadrilatero_register_lsu_broken.sv | 420 --------------- .../rtl/quadrilatero_register_lsu_old.sv | 327 ------------ .../rtl/quadrilatero_rf_sequencer.sv | 12 +- .../rtl/quadrilatero_rf_sequencer_0bit.sv | 292 ----------- .../rtl/quadrilatero_rf_sequencer_2bit.sv | 295 ----------- ...ilatero_rf_sequencer_kinda_working_1bit.sv | 293 ----------- .../rtl/quadrilatero_rf_sequencer_old.sv | 295 ----------- .../rtl/quadrilatero_systolic_array_old.sv | 484 ------------------ 11 files changed, 7 insertions(+), 2896 deletions(-) delete mode 100644 hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_old.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_old.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_broken.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_old.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_0bit.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_2bit.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_kinda_working_1bit.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_old.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv diff --git a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_old.sv b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_old.sv deleted file mode 100644 index 74d7a277a..000000000 --- a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg_old.sv +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2024 EPFL 
-// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. -// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Saverio Nasturzio -package quadrilatero_pkg; - - parameter int unsigned N_REGS = 8; - parameter int unsigned DATA_WIDTH = 32; - parameter int unsigned BUS_WIDTH = 128; - parameter int unsigned MESH_WIDTH = 4; - parameter int unsigned NUM_EXEC_UNITS = 3; // change me to add units - parameter int unsigned MAX_NUM_READ_OPERANDS = 3; - parameter int unsigned MAX_NUM_WRITE_OPERANDS = 1; - parameter int unsigned READ_PORTS = 4; // we'll have fewer write ports so we take the maximum one which is the number of READ PORTS for the rw_queue_t - parameter int unsigned WRITE_PORTS = 3; // - parameter int unsigned RF_READ_PORTS = 4; - parameter int unsigned RF_WRITE_PORTS = 3; - - localparam int unsigned N_ROWS = MESH_WIDTH ; - localparam int unsigned RLEN = DATA_WIDTH * MESH_WIDTH; - - - typedef enum logic [2:0] { - SIZE_32 = 1, // 32-bit operation - SIZE_16 = 2, // 16-bit operation - SIZE_8 = 4 // 8-bit operation - } datatype_t; - - typedef struct packed { - logic is_float; - datatype_t datatype; - } sa_ctrl_t; - - typedef struct packed { - logic [$clog2(N_REGS)-1:0] data_reg ; - logic [$clog2(N_REGS)-1:0] acc_reg ; - logic [$clog2(N_REGS)-1:0] weight_reg; - logic [xif_pkg::X_ID_WIDTH-1:0] id ; - sa_ctrl_t sa_ctrl ; - } sa_instr_t; - - typedef struct packed { - logic [32-1:0] stride; - logic [32-1:0] addr; - logic [$clog2(N_REGS)-1:0] operand_reg; - logic [xif_pkg::X_ID_WIDTH-1:0] id; - logic is_store; - } lsu_instr_t; - - typedef struct packed { - logic [31:0] n_col_bytes; - logic [31:0] n_rows; - } lsu_conf_t; - - typedef struct packed { - logic [xif_pkg::X_ID_WIDTH-1:0] id; - logic rvalid; - logic wready; - } rw_queue_t; - - localparam int unsigned WR_PORT = (WRITE_PORTS > 1) ? $clog2(WRITE_PORTS) : 1; - localparam int unsigned RD_PORT = (READ_PORTS > 1) ? 
$clog2(READ_PORTS ) : 1; - typedef enum logic [RD_PORT-1:0] { - SYSTOLIC_ARRAY_W, - SYSTOLIC_ARRAY_D, - SYSTOLIC_ARRAY_A, - LSU_R - } read_ports_t; - - typedef enum logic [WR_PORT-1:0] { - SYSTOLIC_ARRAY, - LSU_W, - RF_W - } write_ports_t; - - // Int formats - typedef enum logic [$clog2(NUM_EXEC_UNITS)-1:0] { - FU_SYSTOLIC_ARRAY = 0, - FU_LSU, - FU_RF - // add new units here - } execution_units_t; - - - localparam int unsigned WR_OPS = (MAX_NUM_WRITE_OPERANDS > 1) ? $clog2(MAX_NUM_WRITE_OPERANDS) : 1; - localparam int unsigned RD_OPS = (MAX_NUM_READ_OPERANDS > 1) ? $clog2(MAX_NUM_READ_OPERANDS ) : 1; - typedef struct packed { - logic [RD_OPS-1:0] n_read_ports; - logic [WR_OPS-1:0] n_write_ports; - // Where within read_ports the functional unit starts - logic [$clog2(READ_PORTS):0] base_offset_read; - logic [$clog2(READ_PORTS):0] base_offset_write; - } fu_ports_info; - - // Follow execution_units_t order - parameter fu_ports_info FU_INFO[NUM_EXEC_UNITS] = '{ - '{ // SYSTOLIC_ARRAY - n_read_ports: 3, - n_write_ports: 1, - base_offset_read: 0, - base_offset_write: 0 - }, - '{ // LSU_W - n_read_ports: 1, - n_write_ports: 1, - base_offset_read: 3, // forth element in read_ports_t - base_offset_write: 1 - }, - '{ // RF_W - n_read_ports: 0, - n_write_ports: 1, - base_offset_read: 4, - base_offset_write: 2 - } - }; - - - -endpackage diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_old.sv deleted file mode 100644 index e3a908655..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_dispatcher_old.sv +++ /dev/null @@ -1,340 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -module quadrilatero_dispatcher #( - parameter N_REGS = 8, - parameter NUM_EXEC_UNITS = 3 -) ( - input logic clk_i, - input logic rst_ni, - - // Outputs to RF sequencer - // We can share the entry as we fetch 1 instruction at a time - // NOTE: Actually maybe it's better to have more ports so that we can push all operands and not waste cycles - // NOTE: probably the 'lost'cycles are not lost because we can directly push the instruction to the queue even while pushing the operands and they can start execution and if needed stall since no entry in the rw_queue will be found - output quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry_o, - output logic [N_REGS-1:0] rw_queue_push_o, - - // Inputs from RF Sequencer - input logic [N_REGS-1:0] rw_queue_full_i, - - - // Instruction from Decoder - input logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_i, // id of the instruction - input logic [xif_pkg::X_NUM_RS -1:0][xif_pkg::X_RFR_WIDTH-1:0] rs_i, // Register file source operands for the offloaded instruction - input logic [xif_pkg::X_NUM_RS -1:0] rs_valid_i, // Validity of the register file source operand(s) - input quadrilatero_pkg::datatype_t datatype_i, - - - input logic [$clog2(quadrilatero_pkg::MAX_NUM_READ_OPERANDS)-1:0] n_matrix_operands_read_i, // how many reads to RF - - // IMPORTANT: Make sure the order of pushing does not impact or deadlock - input logic [quadrilatero_pkg::MAX_NUM_READ_OPERANDS-1:0][$clog2(N_REGS)-1:0] rf_read_regs_i, // which registers to read from - input logic rf_writeback_i, // whether we need to write to the register file - input logic [$clog2(N_REGS)-1:0] rf_writeback_reg_i, // which register to writeback to - input quadrilatero_pkg::execution_units_t exec_unit_i, // which exec unit - input logic is_store_i, // store to memory operation - input logic is_float_i, // float to arithmetic operation - - input logic instr_valid_i, - output logic 
[xif_pkg::X_ID_WIDTH-1:0] instr_id_o, // id of the instruction out - output logic [xif_pkg::X_NUM_RS -1:0][xif_pkg::X_RFR_WIDTH-1:0] rs_o, // Register file source operands for the offloaded instruction - output logic [xif_pkg::X_NUM_RS -1:0] rs_valid_o, // Validity of the register file source operand(s) - output quadrilatero_pkg::datatype_t datatype_o, - output logic is_store_o, - output logic is_float_o, - - - output logic [$clog2(N_REGS)-1:0] reg_ms1_o, - output logic [$clog2(N_REGS)-1:0] reg_ms2_o, - output logic [$clog2(N_REGS)-1:0] reg_ms3_o, - output logic [$clog2(N_REGS)-1:0] reg_md_o , - - // Backpressure towards Decoder - output logic instr_ready_o, - - // Outputs towards Execution Units - input logic [NUM_EXEC_UNITS-1:0] issue_queue_full_i, - output logic [NUM_EXEC_UNITS-1:0] dispatch_o - - -); - -//------------------------------------------------------------------------------ - - typedef enum logic { - IDLE, - PUSH_OPERANDS - } dispatcher_state_e; - - dispatcher_state_e state_q, state_d; - logic instr_ready; - logic can_issue_instr; - - logic [NUM_EXEC_UNITS-1:0] dispatch_d; - logic [NUM_EXEC_UNITS-1:0] dispatch_q; - - logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_d; // id of the instruction out - logic [xif_pkg::X_NUM_RS -1:0][xif_pkg::X_RFR_WIDTH-1:0] rs_d ; // Register file source operands for the offloaded instruction - logic [xif_pkg::X_NUM_RS -1:0] rs_valid_d; // Validity of the register file source operand(s) - quadrilatero_pkg::datatype_t datatype_d; - logic is_store_d; - logic is_float_d; - logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_q; // id of the instruction out - logic [xif_pkg::X_NUM_RS -1:0][xif_pkg::X_RFR_WIDTH-1:0] rs_q ; // Register file source operands for the offloaded instruction - logic [xif_pkg::X_NUM_RS -1:0] rs_valid_q; // Validity of the register file source operand(s) - quadrilatero_pkg::datatype_t datatype_q; - logic is_store_q; - logic is_float_q; - - logic [2:0][$clog2(N_REGS)-1:0] rreg_d ; - logic [$clog2(N_REGS)-1:0] wreg_d ; - 
logic [2:0][$clog2(N_REGS)-1:0] rreg_q ; - logic [$clog2(N_REGS)-1:0] wreg_q ; - - logic ld_eq2 ; - logic ld_eq3 ; - logic ld_eqw ; - logic ld_full1; - logic ld_full2; - logic ld_full3; - logic ld_fullw; - logic ld_reg1 ; - logic ld_reg2 ; - logic ld_reg3 ; - logic ld_regw ; - - logic push_operand1_d; - logic push_operand2_d; - logic push_operand3_d; - logic push_operandw_d; - logic push_operand1_q; - logic push_operand2_q; - logic push_operand3_q; - logic push_operandw_q; - - logic back_push_op1_d; - logic back_push_op2_d; - logic back_push_op3_d; - logic back_push_opw_d; - logic back_push_op1_q; - logic back_push_op2_q; - logic back_push_op3_q; - logic back_push_opw_q; - - logic reg1_valid; - logic reg2_valid; - logic reg3_valid; - logic regw_valid; - - logic [N_REGS-1:0] rvalid; - logic [N_REGS-1:0] wready; - - logic en_cnt; - logic done ; - logic[2:0] delta ; - logic[2:0] outstanding_op_d; - logic[2:0] outstanding_op_q; - - -//------------------------------------------------------------------------------ - - assign can_issue_instr = instr_valid_i && ~issue_queue_full_i[exec_unit_i]; - assign instr_ready = can_issue_instr && (state_q == IDLE | done); - - always_comb begin: internal_signals_block - reg1_valid = (push_operand1_q) ? push_operand1_q : back_push_op1_q ; - reg2_valid = (push_operand2_q) ? push_operand2_q : back_push_op2_q ; - reg3_valid = (push_operand3_q) ? push_operand3_q : back_push_op3_q ; - regw_valid = (push_operandw_q) ? 
push_operandw_q : back_push_opw_q ; - - ld_eq2 = (rreg_q[0] == rreg_q[1]) & (reg1_valid & reg2_valid); - ld_eq3 = ((rreg_q[0] == rreg_q[2]) & (reg1_valid & reg3_valid) | - (rreg_q[1] == rreg_q[2]) & (reg2_valid & reg3_valid) ); - ld_eqw = ld_eq2 | ld_eq3; - - ld_full1 = rw_queue_full_i[rreg_q[0]]; - ld_full2 = rw_queue_full_i[rreg_q[1]]; - ld_full3 = rw_queue_full_i[rreg_q[2]]; - ld_fullw = rw_queue_full_i[wreg_q ]; - - ld_reg1 = ld_full1 ; - ld_reg2 = ld_full2 | ld_eq2; - ld_reg3 = ld_full3 | ld_eq3; - ld_regw = ld_fullw | ld_eqw; - - delta = 3'b0; - for(int ii = 0; ii < N_REGS; ii++) begin - delta += {2'b0, rw_queue_entry_o[ii].rvalid}; - delta += {2'b0, rw_queue_entry_o[ii].wready}; - end - - done = (delta == outstanding_op_q); - end - - always_comb begin: next_value - rreg_d = (instr_ready || state_q==IDLE) ? rf_read_regs_i : rreg_q; - wreg_d = (instr_ready || state_q==IDLE) ? rf_writeback_reg_i : wreg_q; - - rs_d = (instr_ready || state_q==IDLE) ? rs_i : rs_q ; - rs_valid_d = (instr_ready || state_q==IDLE) ? rs_valid_i : rs_valid_q; - instr_id_d = (instr_ready || state_q==IDLE) ? instr_id_i : instr_id_q; - datatype_d = (instr_ready || state_q==IDLE) ? datatype_i : datatype_q; - is_store_d = (instr_ready || state_q==IDLE) ? is_store_i : is_store_q; - is_float_d = (instr_ready || state_q==IDLE) ? is_float_i : is_float_q; - - dispatch_d = '0 ; - dispatch_d[exec_unit_i] = instr_ready; - - push_operandw_d = rf_writeback_i & instr_ready; - push_operand1_d = (n_matrix_operands_read_i > 0) & instr_ready; - push_operand2_d = (n_matrix_operands_read_i > 1) & instr_ready; - push_operand3_d = (n_matrix_operands_read_i > 2) & instr_ready; - - back_push_op1_d = ld_reg1 ? reg1_valid : 1'b0; - back_push_op2_d = ld_reg2 ? reg2_valid : 1'b0; - back_push_op3_d = ld_reg3 ? reg3_valid : 1'b0; - back_push_opw_d = ld_regw ? 
regw_valid : 1'b0; - - outstanding_op_d = {1'b0,n_matrix_operands_read_i} + {2'b0, rf_writeback_i}; - end - // always_comb begin: updated_next_value - // if((instr_ready || state_q==IDLE)) begin //we're ready to continue with the next instruction - // rreg_d = rf_read_regs_i; - // wreg_d = rf_writeback_i; - - // rs_d = rs_i; - // rs_valid_d = rs_valid_i; - // instr_id_d = instr_id_i; - // datatype_d = datatype_i; - // is_store_d = is_store_i; - // is_float_d = is_float_i; - // end - // dispatch_d = '0 ; - // dispatch_d[exec_unit_i] = instr_ready; - - // push_operandw_d = rf_writeback_i & instr_ready; - - // end - always_comb begin: rw_queue_block - rvalid = '0; - wready = '0; - rvalid[rreg_q[0]] |= reg1_valid &~ ld_reg1; - rvalid[rreg_q[1]] |= reg2_valid &~ ld_reg2; - rvalid[rreg_q[2]] |= reg3_valid &~ ld_reg3; - wready[wreg_q ] = regw_valid &~ ld_regw; - for(int ii = 0; ii < N_REGS; ii++) begin - rw_queue_entry_o[ii].rvalid = rvalid[ii]; - rw_queue_entry_o[ii].wready = wready[ii]; - rw_queue_entry_o[ii].id = instr_id_q; - rw_queue_push_o [ii] = rw_queue_entry_o[ii].rvalid | rw_queue_entry_o[ii].wready; - end - end - - always_comb begin: fsm_block - en_cnt = 1'b0; - case (state_q) - IDLE: begin - if (can_issue_instr) begin - state_d = PUSH_OPERANDS; - end else begin - state_d = IDLE ; //@ loopback - end - end - PUSH_OPERANDS: begin - en_cnt = 1'b1; - if (done && !instr_ready) begin - state_d = IDLE; - end else begin - state_d = PUSH_OPERANDS ; //@ loopback - end - end - default: state_d = IDLE; - endcase - end - - delta_counter #( - .WIDTH(3), - .STICKY_OVERFLOW(1'b0) - ) delta_counter_i( - .clk_i , - .rst_ni , - .clear_i (1'b0) , // synchronous clear - .en_i (en_cnt) , // enable the counter - .load_i (instr_ready) , // load a new value - .down_i (1'b1) , // downcount, default is up - .delta_i (delta) , - .d_i (outstanding_op_d), - .q_o (outstanding_op_q), - .overflow_o () - ); - - always_ff @(posedge clk_i or negedge rst_ni) begin : seq_block - if (!rst_ni) begin 
- rreg_q <= '0; - wreg_q <= '0; - dispatch_q <= '0; - rs_q <= '0; - rs_valid_q <= '0; - instr_id_q <= '0; - datatype_q <= quadrilatero_pkg::SIZE_32; - is_store_q <= '0; - is_float_q <= '0; - - back_push_op1_q <= 1'b0; - back_push_op2_q <= 1'b0; - back_push_op3_q <= 1'b0; - back_push_opw_q <= 1'b0; - push_operand1_q <= 1'b0; - push_operand2_q <= 1'b0; - push_operand3_q <= 1'b0; - push_operandw_q <= 1'b0; - state_q <= IDLE; - end else begin - rreg_q <= rreg_d ; - wreg_q <= wreg_d ; - dispatch_q <= dispatch_d; - rs_q <= rs_d ; - rs_valid_q <= rs_valid_d; - instr_id_q <= instr_id_d; - datatype_q <= datatype_d; - is_store_q <= is_store_d; - is_float_q <= is_float_d; - - back_push_op1_q <= back_push_op1_d; - back_push_op2_q <= back_push_op2_d; - back_push_op3_q <= back_push_op3_d; - back_push_opw_q <= back_push_opw_d; - push_operand1_q <= push_operand1_d; - push_operand2_q <= push_operand2_d; - push_operand3_q <= push_operand3_d; - push_operandw_q <= push_operandw_d; - state_q <= state_d ; - end - end - - // Output assignments - assign instr_ready_o = instr_ready; - assign dispatch_o = dispatch_q; - assign rs_o = rs_q ; - assign rs_valid_o = rs_valid_q ; - assign instr_id_o = instr_id_q ; - assign datatype_o = datatype_q ; - assign is_store_o = is_store_q ; - assign is_float_o = is_float_q ; - - assign reg_ms1_o = rreg_q[0] ; - assign reg_ms2_o = rreg_q[1] ; - assign reg_ms3_o = rreg_q[2] ; - assign reg_md_o = wreg_q ; - - // Assertions - if (quadrilatero_pkg::MAX_NUM_READ_OPERANDS != 3) begin - $error( - "[dispatcher] The quadrilatero_pkg::MAX_NUM_READ_OPERANDS needs to be 3 for the current implementation.\n" - ); - end -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index d749279db..16b0cbe1e 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -182,8 +182,6 @@ module 
quadrilatero_register_lsu #( lsu_state_d = lsu_state_q; counter_d = counter_q; access_counter_d = access_counter_q; - //load_row_buffer_d = load_row_buffer_q; - //we_o = 1'b0; rlast_o = 1'b0; wlast_o = 1'b0; @@ -192,12 +190,11 @@ module quadrilatero_register_lsu #( case (lsu_state_q) LSU_IDLE: begin - back_id_d = instr_id_i; // was inside if + back_id_d = instr_id_i; waddr_d = operand_reg_i; - if(load_fifo_valid && !write_i && wready_i) begin //checking for wready makes sense but somehow is wrong? + if(load_fifo_valid && !write_i && wready_i) begin counter_d = counter_q + 1; wlast_o = 1'b1; - //back_id_d = instr_id_i; was here access_counter_d = '0; lsu_state_d = LSU_LOAD; end else if (write_i & store_fifo_space_available && rdata_valid_i) begin @@ -210,12 +207,10 @@ module quadrilatero_register_lsu #( end LSU_LOAD: begin if(load_fifo_valid) begin - //maybe here wlast_o = 1'b1; ? if(wready_i) begin if(counter_q == LastRow) begin if(access_counter_q == NumAccesses - 1) begin wlast_o = 1'b1; - //we_o = 1'b1; access_counter_d = '0; counter_d = '0; lsu_state_d = LSU_DONE; @@ -226,7 +221,6 @@ module quadrilatero_register_lsu #( end end else begin if(access_counter_q == NumAccesses - 1) begin - //we_o = 1'b1; wlast_o = 1'b1; access_counter_d = '0; counter_d = counter_q + 1; @@ -235,13 +229,9 @@ module quadrilatero_register_lsu #( end end end - // end else begin - // //wlast_o = 1'b1; // maybe wrong - // //we_o = 1'b1; - // counter_d = '0; - // lsu_state_d = LSU_DONE; + end else begin - if(write_i && wready_i) begin // transition from load to store + if(write_i && wready_i) begin if(access_counter_q == NumAccesses - 1) begin counter_d = '0; wlast_o = 1'b1; @@ -259,7 +249,6 @@ module quadrilatero_register_lsu #( end LSU_STORE: begin if(store_fifo_space_available && write_i && rdata_valid_i) begin - //if(rdata_valid_i) begin if(counter_q == LastRow) begin if(access_counter_q == NumAccesses - 1) begin rlast_o = 1'b1; @@ -281,8 +270,7 @@ module quadrilatero_register_lsu #( 
access_counter_d = access_counter_q + 1; end end - end else begin // this case is very suspicious - //rlast_o = 1'b1; // maybe wrong + end else begin counter_d = '0; back_id_d = instr_id_i; lsu_state_d = LSU_DONE; @@ -294,7 +282,6 @@ module quadrilatero_register_lsu #( access_counter_d = '0; counter_d = counter_q + 1; wlast_o = 1'b1; - //we_o = 1'b1; lsu_state_d = LSU_LOAD; end else begin access_counter_d = access_counter_q + 1; @@ -372,7 +359,7 @@ module quadrilatero_register_lsu #( //Configuration .start_i (start ), - .write_i (write_i), // & ~(lsu_state_q == LSU_LOAD && write_i) + .write_i (write_i), .busy_o (busy ), .terminate_o (terminate ), diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_broken.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_broken.sv deleted file mode 100644 index b18f1a44a..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_broken.sv +++ /dev/null @@ -1,420 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. -// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -/* -NOTE: for now we assume we fetch the entire row in 1 cycle. 
TODO: Change the number of columns and adapt this to arbitrary BUS_WIDTH parameters -NOTE: we are not handling difference in endianness when loading reduced datawidths -*/ - -module quadrilatero_register_lsu #( - parameter int unsigned BUS_WIDTH = 128, - parameter int unsigned N_REGS = 8, - parameter int unsigned N_ROWS = 4, - localparam int unsigned LLEN = BUS_WIDTH -) ( - input logic clk_i , - input logic rst_ni , - - // Bus interface - output logic data_req_o , - output logic [ 31:0] data_addr_o , - output logic data_we_o , - output logic [ BUS_WIDTH/8 - 1:0] data_be_o , - output logic [ BUS_WIDTH-1:0] data_wdata_o , - input logic data_gnt_i , - input logic data_rvalid_i , - input logic [ BUS_WIDTH-1:0] data_rdata_i , - - output logic[xif_pkg::X_ID_WIDTH-1:0] lsu_id_o , - - // Register Write Port for load unit - output logic [ $clog2(N_REGS)-1:0] waddr_o , - output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [ LLEN-1:0] wdata_o , - output logic we_o , - output logic wlast_o , - input logic wready_i , // to stall the request in case the port is busy - - // Register Read Port for store unit - output logic [ $clog2(N_REGS)-1:0] raddr_o , - output logic [ $clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [ LLEN-1:0] rdata_i , - input logic rdata_valid_i , - output logic rdata_ready_o , - output logic rlast_o , - - // Configuration Signals - input logic start_i , // start loading: MUST BE A PULSE - input logic write_i , - output logic busy_o , - input logic [ 31:0] stride_i , // stride value - input logic [ 31:0] address_i , // address value - input logic [ $clog2(N_REGS)-1:0] operand_reg_i , // destination register - input logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_i , // instruction id - input logic [ 31:0] n_bytes_cols_i , // we always fetch the entire row and then only take the elements we need - input logic [ 31:0] n_rows_i , - - - output logic finished_o , - input logic finished_ack_i , - output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o 
//instruction id out - -); - - localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; - localparam LastRow = $clog2(N_ROWS)'(N_ROWS - 1); - localparam NumAccesses = quadrilatero_pkg::RLEN / LLEN; - - typedef enum logic [1:0] { - LSU_IDLE, - LSU_LOAD, - LSU_STORE, - LSU_DONE - } register_lsu_state_e; - - register_lsu_state_e lsu_state_d, lsu_state_q; - - logic finished; - logic [xif_pkg::X_ID_WIDTH-1:0] back_id_q; - logic [xif_pkg::X_ID_WIDTH-1:0] back_id_d; - - logic [$clog2(N_ROWS)-1:0] counter_q; - logic [$clog2(N_ROWS)-1:0] counter_d; - logic [$clog2(N_REGS)-1:0] waddr_q; - logic [$clog2(N_REGS)-1:0] waddr_d; - - logic [LLEN-1:0] load_fifo_data; - - logic load_fifo_data_available; - logic load_fifo_pop; - - logic store_fifo_space_available; - logic store_fifo_push; - logic store_fifo_empty; - logic [LLEN-1:0] store_fifo_data; - - logic [LLEN-1:0] data_mask; - logic load_fifo_valid; - logic busy; - logic start; - logic start_q; - logic start_d; - - logic write_q; - logic write_d; - logic terminate; - logic busy_q; - logic busy_d; - - logic lsu_busy_q; - logic lsu_ready; - - logic [ 31:0] src_ptr_d ; - logic [ 31:0] stride_d ; - logic [ 31:0] src_ptr_q ; - logic [ 31:0] stride_q ; - logic [ 31:0] src_ptr ; - logic [ 31:0] stride ; - - logic [$clog2(NumAccesses)-1:0] access_counter_d; - logic [$clog2(NumAccesses)-1:0] access_counter_q; - - logic [quadrilatero_pkg::RLEN-1:0] load_row_buffer_d; - logic [quadrilatero_pkg::RLEN-1:0] load_row_buffer_q; - - logic [quadrilatero_pkg::RLEN-1:0] store_mask; - - assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; - always_comb begin - lsu_id_o = (write_i &~ load_fifo_data_available) ? 
instr_id_i : back_id_q; - finished = (write_q & terminate) | (~write_q & (counter_q == LastRow) & wready_i); - end - - - always_comb begin: write_to_RF - data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols - - we_o = load_fifo_data_available &~ mask_req && (access_counter_q == NumAccesses -1); // && ((access_counter_q == NumAccesses -1) || (lsu_state_q == LSU_LOAD && !load_fifo_valid)); //last part is sketchy - waddr_o = waddr_q; - wrowaddr_o = counter_q ; - load_row_buffer_d = load_row_buffer_q | (load_fifo_data << (LLEN * access_counter_q)); - wdata_o = load_row_buffer_d & ~data_mask; //watch out with load_row_buffer_d instead of load_row_buffer_q - - end - - always_comb begin: read_from_RF - store_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); - rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; - rrowaddr_o = counter_q ; - raddr_o = operand_reg_i ; - end - - always_comb begin: lsu_ctrl_block - load_fifo_pop = wready_i; - store_fifo_data = rdata_i & store_mask; - store_fifo_push = rdata_ready_o && rdata_valid_i; - lsu_ready = store_fifo_empty | (write_i &~ load_fifo_data_available &~ lsu_busy_q); - start = (start_i | start_q) & lsu_ready; - busy_o = (write_i ? busy_d : busy ) | start_q; - - stride = (start) ? stride_i : stride_q; - src_ptr = (start) ? address_i : src_ptr_q; - end - - always_comb begin: next_value - write_d = (write_i && (counter_q == LastRow) && rdata_valid_i) ? 1'b1 : - (!write_i && !busy) ? 1'b0 : write_q; - - start_d = start ? 1'b0 : - (start_q | start_i) ? 1'b1 : start_q; - - stride_d = (start) ? stride_i : stride_q ; - src_ptr_d = (start) ? address_i : src_ptr_q; - - busy_d = (write_i && (counter_q == LastRow) && rdata_valid_i) ? 1'b0 : - (write_i && start_i) ? 
1'b1 : busy_q; - end - always_comb begin: fsm_block - lsu_state_d = lsu_state_q; - counter_d = counter_q; - access_counter_d = access_counter_q; - //load_row_buffer_d = load_row_buffer_q; - //we_o = 1'b0; - rlast_o = 1'b0; - wlast_o = 1'b0; - - back_id_d = back_id_q; - waddr_d = waddr_q; - - case (lsu_state_q) - LSU_IDLE: begin - if(load_fifo_valid && !write_i && wready_i) begin //checking for wready makes sense but somehow is wrong? - wlast_o = 1'b1; - counter_d = counter_q + 1; - back_id_d = instr_id_i; - waddr_d = operand_reg_i; - access_counter_d = '0; - lsu_state_d = LSU_LOAD; - end else if (write_i & store_fifo_space_available && rdata_valid_i) begin - rlast_o = 1'b1; - counter_d = counter_q + 1; - access_counter_d = '0; - lsu_state_d = LSU_STORE; - end - - end - LSU_LOAD: begin - if(load_fifo_valid) begin - if(wready_i) begin - wlast_o = 1'b1; - if(counter_q == LastRow) begin - //if(access_counter_q == NumAccesses - 1) begin - - //we_o = 1'b1; - access_counter_d = '0; - counter_d = '0; - lsu_state_d = LSU_DONE; - back_id_d = instr_id_i; - waddr_d = operand_reg_i; - // end else begin - // access_counter_d = access_counter_q + 1; - end - else begin - //if(access_counter_q == NumAccesses - 1) begin - //we_o = 1'b1; - - access_counter_d = '0; - counter_d = counter_q + 1; - // end else begin - // access_counter_d = access_counter_q + 1; - end - end - - // end else begin - // //wlast_o = 1'b1; // maybe wrong - // //we_o = 1'b1; - // counter_d = '0; - // lsu_state_d = LSU_DONE; - end else begin - // if(access_counter_q == NumAccesses - 1) begin - //wlast_o = 1'b1; // very random but apparantly needed - //we_o = 1'b1; - access_counter_d = '0; - counter_d = '0; - lsu_state_d = LSU_DONE; - // end else begin - // access_counter_d = access_counter_q + 1; - // end - - end - end - LSU_STORE: begin - if(store_fifo_space_available && write_i && rdata_valid_i) begin - //if(rdata_valid_i) begin - rlast_o = 1'b1; - if(counter_q == LastRow) begin - //if(access_counter_q == 
NumAccesses - 1) begin - - access_counter_d = '0; - counter_d = '0; - lsu_state_d = LSU_DONE; - back_id_d = lsu_id_o; - // end else begin - // access_counter_d = access_counter_q + 1; - // end - - end else begin - //if(access_counter_q == NumAccesses - 1) begin - - access_counter_d = '0; - counter_d = counter_q + 1; - // end else begin - // access_counter_d = access_counter_q + 1; - // end - end - end else begin // this case is very suspicious - //rlast_o = 1'b1; // maybe wrong - counter_d = '0; - back_id_d = lsu_id_o; - lsu_state_d = LSU_DONE; - end - end - LSU_DONE: begin - if(load_fifo_valid && !write_i && wready_i) begin - //if(access_counter_q == NumAccesses - 1) begin - access_counter_d = '0; - counter_d = counter_q + 1; - wlast_o = 1'b1; - //we_o = 1'b1; - lsu_state_d = LSU_LOAD; - // end else begin - // access_counter_d = access_counter_q + 1; - // end - end else if (write_i && store_fifo_space_available && rdata_valid_i) begin - //if(access_counter_q == NumAccesses - 1) begin - counter_d = counter_q + 1; - rlast_o = 1'b1; - lsu_state_d = LSU_STORE; - access_counter_d = '0; - // end else begin - // access_counter_d = access_counter_q + 1; - // end - end else begin - lsu_state_d = LSU_IDLE; - end - end - default: begin - lsu_state_d = LSU_IDLE; - end - endcase - - end - - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - counter_q <= '0; - waddr_q <= '0; - back_id_q <= '0; - start_q <= '0; - write_q <= '0; - busy_q <= '0; - lsu_state_q <= LSU_IDLE; - - lsu_busy_q <= '0; - src_ptr_q <= '0; - stride_q <= '0; - access_counter_q <= '0; - load_row_buffer_q <= '0; - end else begin - counter_q <= counter_d; - back_id_q <= back_id_d; - waddr_q <= waddr_d ; - start_q <= start_d ; - write_q <= write_d ; - busy_q <= busy_d ; - lsu_state_q <= lsu_state_d; - - lsu_busy_q <= busy; - src_ptr_q <= src_ptr_d; - stride_q <= stride_d ; - access_counter_q <= access_counter_d; - load_row_buffer_q <= load_row_buffer_d; - end - end - - 
quadrilatero_lsu #( - .FIFO_DEPTH (4 ), - .DATA_WIDTH (BUS_WIDTH) - ) lsunit_inst ( - - .clk_i , - .rst_ni , - - // Bus interface - .data_req_o , - .data_addr_o , - .data_we_o , - .data_be_o , - .data_wdata_o , - .data_gnt_i , - .data_rvalid_i , - .data_rdata_i , - - //Configuration - .start_i (start ), - .write_i , - .busy_o (busy ), - .terminate_o (terminate ), - - // Address - .src_ptr_i (src_ptr ), - .stride_i (stride ), - .cols_i (MAX_EL_PER_ROW ), - .rows_i (n_rows_i ), - - // Output data - .load_fifo_output_o (load_fifo_data ), - .load_fifo_valid_o (load_fifo_valid ), - .load_fifo_data_available_o (load_fifo_data_available ), - .load_fifo_output_pop_i (load_fifo_pop ), - - // Input data - .store_fifo_input_i (store_fifo_data ), - .store_fifo_push_i (store_fifo_push ), - .store_fifo_space_available_o (store_fifo_space_available ), - .store_fifo_empty_o (store_fifo_empty ) - ); - - //------------------------- - - always_ff @(posedge clk_i or negedge rst_ni) begin - if (!rst_ni) begin - finished_o <= '0; - finished_instr_id_o <= '0; - end else begin - if (finished) begin - finished_o <= '1; - finished_instr_id_o <= back_id_q; - end - if (finished_ack_i) begin - finished_o <= '0; - finished_instr_id_o <= '0; - end - end - end - //--------------------- - - // Assertions - if (N_ROWS < 2) begin - $error( - "[quadrilatero_register_lsu] N_ROWS must be at least 2.\n" - ); - end - if ((NumAccesses & (NumAccesses - 1)) != 0) begin - $error("RLEN / LLEN must be a power of 2."); - end -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_old.sv deleted file mode 100644 index caf3f74d9..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_old.sv +++ /dev/null @@ -1,327 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -/* -NOTE: for now we assume we fetch the entire row in 1 cycle. TODO: Change the number of columns and adapt this to arbitrary BUS_WIDTH parameters -NOTE: we are not handling difference in endianness when loading reduced datawidths -*/ - -module quadrilatero_register_lsu #( - parameter int unsigned BUS_WIDTH = 128, - parameter int unsigned N_REGS = 8, - parameter int unsigned N_ROWS = 4, - localparam int unsigned LLEN = BUS_WIDTH -) ( - input logic clk_i , - input logic rst_ni , - - // Bus interface - output logic data_req_o , - output logic [ 31:0] data_addr_o , - output logic data_we_o , - output logic [ BUS_WIDTH/8 - 1:0] data_be_o , - output logic [ BUS_WIDTH-1:0] data_wdata_o , - input logic data_gnt_i , - input logic data_rvalid_i , - input logic [ BUS_WIDTH-1:0] data_rdata_i , - - output logic[xif_pkg::X_ID_WIDTH-1:0] lsu_id_o , - - // Register Write Port for load unit - output logic [ $clog2(N_REGS)-1:0] waddr_o , - output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [ LLEN-1:0] wdata_o , - output logic we_o , - output logic wlast_o , - input logic wready_i , // to stall the request in case the port is busy - - // Register Read Port for store unit - output logic [ $clog2(N_REGS)-1:0] raddr_o , - output logic [ $clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [ LLEN-1:0] rdata_i , - input logic rdata_valid_i , - output logic rdata_ready_o , - output logic rlast_o , - - // Configuration Signals - input logic start_i , // start loading: MUST BE A PULSE - input logic write_i , - output logic busy_o , - input logic [ 31:0] stride_i , // stride value - input logic [ 31:0] address_i , // address value - input logic [ $clog2(N_REGS)-1:0] operand_reg_i , // destination register - input logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_i , // instruction id - input logic [ 31:0] n_bytes_cols_i , // we always fetch the entire row and then only take the elements we need - input 
logic [ 31:0] n_rows_i , - - - output logic finished_o , - input logic finished_ack_i , - output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o //instruction id out - -); - - localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; - - // typedef enum logic { - // IDLE, - // COUNTING_ROWS, - // LAST_ROW - // } register_lsu_state_e; - - // register_lsu_state_e state_d, state_q; - - logic finished; - logic [xif_pkg::X_ID_WIDTH-1:0] back_id_q; - logic [xif_pkg::X_ID_WIDTH-1:0] back_id_d; - - logic [$clog2(N_ROWS)-1:0] counter_q; - logic [$clog2(N_ROWS)-1:0] counter_d; - logic [$clog2(N_REGS)-1:0] waddr_q; - logic [$clog2(N_REGS)-1:0] waddr_d; - - logic [LLEN-1:0] load_fifo_data; - - logic load_fifo_data_available; - logic load_fifo_pop; - - logic store_fifo_space_available; - logic store_fifo_push; - logic store_fifo_empty; - logic [LLEN-1:0] store_fifo_data; - - logic [LLEN-1:0] data_mask; - logic load_fifo_valid; - logic busy; - logic start; - logic start_q; - logic start_d; - - - logic valid_d; - logic valid_q; - - logic write_q; - logic write_d; - logic terminate; - logic busy_q; - logic busy_d; - - logic lsu_busy_q; - logic lsu_ready; - logic mask_req; - - - - - logic [ 31:0] src_ptr_d ; - logic [ 31:0] stride_d ; - logic [ 31:0] src_ptr_q ; - logic [ 31:0] stride_q ; - logic [ 31:0] src_ptr ; - logic [ 31:0] stride ; - - assign mask_req = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & finished_o & ~finished_ack_i; - always_comb begin - lsu_id_o = (write_i &~ load_fifo_data_available) ? 
instr_id_i : back_id_q; - finished = (write_q & terminate) | (~write_q & wlast_o & wready_i); - end - - - always_comb begin: write_to_RF - data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols - - we_o = load_fifo_data_available &~ mask_req; - waddr_o = waddr_q; - wrowaddr_o = counter_q ; - wdata_o = load_fifo_data & ~data_mask; - wlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) && we_o && wready_i; - // wlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & wready_i; - end - - always_comb begin: read_from_RF - rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; - rrowaddr_o = counter_q ; - raddr_o = operand_reg_i ; - rlast_o = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) && rdata_valid_i && rdata_ready_o; - end - - always_comb begin: lsu_ctrl_block - load_fifo_pop = wready_i; - store_fifo_data = rdata_i; - store_fifo_push = rdata_ready_o && rdata_valid_i; - lsu_ready = store_fifo_empty | (write_i &~ load_fifo_data_available &~ lsu_busy_q); - start = (start_i | start_q) & lsu_ready; - //busy_o = (write_i ? busy_d : busy) | start_q; - busy_o = (write_i ? busy_d : busy | (load_fifo_data_available & counter_d == '0)) | start_q; - - stride = (start) ? stride_i : stride_q; - src_ptr = (start) ? address_i : src_ptr_q; - end - - always_comb begin: next_value - if (rlast_o || wlast_o) begin - counter_d = '0; - end else if ((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin - counter_d = counter_q + 1; - end else begin - counter_d = counter_q; - end - - write_d = (write_i && rlast_o && rdata_valid_i) ? 1'b1 : - (!write_i && !busy) ? 1'b0 : write_q; - - valid_d = (load_fifo_valid && counter_d==0 && ~valid_q) ? 1'b1 : - (load_fifo_valid && (counter_d==$clog2(N_ROWS)'(N_ROWS - 1)) && valid_q) ? 1'b0 : valid_q; // $clog2(N_ROWS)'(N_ROWS - 1) was 3, if there's a problem check here... - - start_d = start ? 1'b0 : - (start_q | start_i) ? 1'b1 : start_q; - - stride_d = (start) ? 
stride_i : stride_q ; - src_ptr_d = (start) ? address_i : src_ptr_q; - - back_id_d = (load_fifo_valid && counter_d==0 && ~valid_q) ? instr_id_i : - rlast_o ? lsu_id_o : back_id_q; - - waddr_d = (load_fifo_valid && counter_d==0) ? operand_reg_i : waddr_q ; - - busy_d = (write_i && rlast_o && rdata_valid_i) ? 1'b0 : - (write_i && start_i) ? 1'b1 : busy_q; - end - // always_comb begin: fsm_block - // counter_d = '0; - // rlast_o = 1'b0; - // rrowaddr_o = counter_q; - // wlast_o = 1'b0;sim:/tb_top/testharness_i/gen_USE_EXTERNAL_DEVICE_EXAMPLE/gen_quadrilatero_wrapper/quadrilatero_wrapper_i/mat_inst/regloader_i/busy_o - - // wrowaddr_o = counter_q; - // case (state_q) - // IDLE: begin - // counter_d = '0; - // if((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin - // state_d = COUNTING_ROWS - // end - // state_d = IDLE - // end - // COUNTING_ROWS: begin - // if((we_o && wready_i) || (rdata_valid_i && rdata_ready_o && !rlast_o)) begin - // counter_d = counter_q + 1; - // if(counter_d = $clog2(N_ROWS)'(N_ROWS - 1)) begin - // state_d = LAST_ROW; - // end else begin - // state_d = COUNTING_ROWS; - // end - // end - - // end - // LAST_ROW: begin - // if(rlast_o || wlast_o) begin - // state_d = IDLE; - // end - - - // end - // default: - // endcase - //end - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - counter_q <= '0; - waddr_q <= '0; - back_id_q <= '0; - start_q <= '0; - valid_q <= '0; - write_q <= '0; - busy_q <= '0; - - lsu_busy_q <= '0; - src_ptr_q <= '0; - stride_q <= '0; - end else begin - counter_q <= counter_d; - back_id_q <= back_id_d; - waddr_q <= waddr_d ; - start_q <= start_d ; - valid_q <= valid_d ; - write_q <= write_d ; - busy_q <= busy_d ; - - lsu_busy_q <= busy; - src_ptr_q <= src_ptr_d; - stride_q <= stride_d ; - end - end - - quadrilatero_lsu #( - .FIFO_DEPTH (4 ), - .DATA_WIDTH (BUS_WIDTH) - ) lsunit_inst ( - - .clk_i , - .rst_ni , - - // Bus interface - .data_req_o , - .data_addr_o , - 
.data_we_o , - .data_be_o , - .data_wdata_o , - .data_gnt_i , - .data_rvalid_i , - .data_rdata_i , - - //Configuration - .start_i (start ), - .write_i , - .busy_o (busy ), - .terminate_o (terminate ), - - // Address - .src_ptr_i (src_ptr ), - .stride_i (stride ), - .cols_i (MAX_EL_PER_ROW ), - .rows_i (n_rows_i ), - - // Output data - .load_fifo_output_o (load_fifo_data ), - .load_fifo_valid_o (load_fifo_valid ), - .load_fifo_data_available_o (load_fifo_data_available ), - .load_fifo_output_pop_i (load_fifo_pop ), - - // Input data - .store_fifo_input_i (store_fifo_data ), - .store_fifo_push_i (store_fifo_push ), - .store_fifo_space_available_o (store_fifo_space_available ), - .store_fifo_empty_o (store_fifo_empty ) - ); - - //------------------------- - - always_ff @(posedge clk_i or negedge rst_ni) begin - if (!rst_ni) begin - finished_o <= '0; - finished_instr_id_o <= '0; - end else begin - if (finished) begin - finished_o <= '1; - finished_instr_id_o <= back_id_q; - end - if (finished_ack_i) begin - finished_o <= '0; - finished_instr_id_o <= '0; - end - end - end - //--------------------- - - // Assertions - if (N_ROWS < 2) begin - $error( - "[quadrilatero_register_lsu] N_ROWS must be at least 2.\n" - ); - end -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv index ab52e307e..25af6b913 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv @@ -80,7 +80,7 @@ module quadrilatero_rf_sequencer #( quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; genvar ii,hh; - assign rw_queue_pop = w_pop | r_pop | ~head_valid; //problem + assign rw_queue_pop = w_pop | r_pop | ~head_valid; assign rw_queue_entry = rw_queue_entry_i ; assign rw_queue_push = rw_queue_push_i ; @@ -121,13 +121,6 @@ module quadrilatero_rf_sequencer #( scoreboard_d[i][h] = (rw_queue_pop[i][h] && 
rw_queue_empty[i][h] ) ? '0 : (rw_queue_pop[i][h] ) ? rw_queue[i][h] : scoreboard_q[i][h]; - - // scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : - // (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; - - // scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : - // (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : - // (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; end end end @@ -180,9 +173,6 @@ module quadrilatero_rf_sequencer #( block = 1'b0; end - // if(we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && same_id_acc && block) begin - // wr_req[quadrilatero_pkg::SYSTOLIC_ARRAY ] = 1'b0; - // end if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]] = 1'b0; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_0bit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_0bit.sv deleted file mode 100644 index c3e3d83bd..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_0bit.sv +++ /dev/null @@ -1,292 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -module quadrilatero_rf_sequencer #( - parameter READ_PORTS = 4 , - parameter WRITE_PORTS = 2 , - parameter N_REGS = 8 , - parameter N_ROWS = 4 , - parameter RLEN = 128 , - parameter RF_READ_PORTS = 3 , - parameter RF_WRITE_PORTS = 1, - parameter SYNC_REQ = 1, - - parameter N_ENTRIES = 3 // entries in the FIFOs for each register -) ( - - input logic clk_i, - input logic rst_ni, - - // Input from FUs - input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i , - input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i , - output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o , - output logic [READ_PORTS-1:0] rvalid_o , - input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) - input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) - input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , - - - input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i , - input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i , - input logic [WRITE_PORTS-1:0][RLEN-1:0] wdata_i , - input logic [WRITE_PORTS-1:0] we_i , - input logic [WRITE_PORTS-1:0] wlast_i , // we can use this instead of wlast_row_i - output logic [WRITE_PORTS-1:0] wready_o , - input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , - - // Outputs to RF - output logic [RF_READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_o , - output logic [RF_READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [RF_READ_PORTS-1:0][RLEN-1:0] rdata_i , - - - output logic [RF_WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_o , - output logic [RF_WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [RF_WRITE_PORTS-1:0][RLEN-1:0] wdata_o , - output logic [RF_WRITE_PORTS-1:0] we_o , - - - // Inputs from Dispatcher - // We can share the entry as we fetch 1 instruction at a time - // NOTE: Actually maybe it's better to have more ports so that we can push all operands and not waste 
cycles - input quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry_i , - input logic [N_REGS-1:0] rw_queue_push_i , - - // Outputs to Dispatcher - output logic [N_REGS-1:0] rw_queue_full_o -); - - //logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; - logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_full ; - - logic [WRITE_PORTS-1:0] wr_gnt ; - logic [WRITE_PORTS-1:0] wr_req ; - logic [READ_PORTS -1:0] rd_req ; - logic [READ_PORTS -1:0] rd_gnt ; - - logic [N_REGS-1:0] rw_queue_push ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; - - logic starting; - - genvar ii,hh; - - assign rw_queue_pop = w_pop | r_pop | ~starting; - assign rw_queue_entry = rw_queue_entry_i ; - assign rw_queue_push = rw_queue_push_i ; - - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; - assign rw_queue_pop_fifo = rw_queue_pop & ~rw_queue_empty; - - for (ii = 0; ii < N_REGS; ii++) begin: gen_fifo__regs - for (hh = 0; hh < N_ROWS; hh++) begin: gen_fifo__rows - fifo_v3 #( - .FALL_THROUGH (1'b1) , - .DEPTH (N_ENTRIES) , - .dtype (quadrilatero_pkg::rw_queue_t) - ) issue_queue_inst ( - .clk_i, - .rst_ni, - .flush_i (1'b0 ), - .testmode_i (1'b0 ), - .usage_o ( ), - .full_o (rw_queue_full [ii][hh] ), - .empty_o (rw_queue_empty[ii][hh] ), - .data_i (rw_queue_entry[ii] ), // data to push into the queue - .push_i (rw_queue_push [ii] ), // data is valid and can be pushed to the queue - .data_o (rw_queue [ii][hh] ), // output data - .pop_i (rw_queue_pop [ii][hh] & ~rw_queue_empty[ii][hh]) // pop head from queue - ); - end - end - - always_comb begin: scoreboard_block - rw_queue_full_o = '0; - for (int i = 0; i < N_REGS; 
i++) begin - for (int h = 0; h < N_ROWS; h++) begin - rw_queue_full_o[i] |= (rw_queue_full[i][h]); - - //head_valid[i][h] = scoreboard_q[i][h].wready | scoreboard_q[i][h].rvalid; - - - scoreboard_d[i][h].id = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].id : scoreboard_q[i][h].id; - - // scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : - // (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; - - // scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : - // (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : - // (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; - end - end - end - - always_comb begin: ctrl_block - wr_req = '0; - rd_req = '0; - w_pop = '0; - r_pop = '0; - starting = '0; - - for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request - automatic int m = 32'(waddr_i[jj]); - automatic int n = 32'(wrowaddr_i[jj]); - if( scoreboard_q[m][n].id == wr_id_i[jj] && we_i) begin - starting |= 1'b1; - wr_req [jj] = 1'b1; - w_pop [m][n] = wlast_i[jj]; - end - end - - for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request - automatic int m = 32'(raddr_i[jj]); - automatic int n = 32'(rrowaddr_i[jj]); - if( scoreboard_q[m][n].id == rd_id_i[jj] && rready_i) begin - rd_req [jj] = 1'b1; - r_pop [m][n] = rlast_i[jj]; - end - end - - if(SYNC_REQ) begin: sa_sync_req - - logic block ; - logic same_id_acc; - logic same_id_A ; - logic same_id_D ; - logic same_id_W ; - - // same_id_acc = wr_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY ] == scoreboard_q[waddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]][wrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]].id; - same_id_A = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]].id; - same_id_D = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] == 
scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]].id; - same_id_W = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]].id; - - if( // (we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && !same_id_acc) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && !same_id_A ) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && !same_id_D ) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && !same_id_W ) - ) begin - block = 1'b1; - end else begin - block = 1'b0; - end - - // if(we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && same_id_acc && block) begin - // wr_req[quadrilatero_pkg::SYSTOLIC_ARRAY ] = 1'b0; - // end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; - end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && same_id_D && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = 1'b0; - end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && same_id_W && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = 1'b0; - end - end - end - - if(RF_WRITE_PORTS != WRITE_PORTS) begin: write_block_wArb - - quadrilatero_rr_arbiter #( - .NumActOut (RF_WRITE_PORTS) , - .N_ROWS (N_ROWS) , - .WIDTH (WRITE_PORTS) - ) wr_arb_i( - .clk_i , - .rst_ni , - .req_i (wr_req), - .grant_o (wr_gnt) - ); - always_comb begin: wdata_block - automatic int ll=0; - - wready_o = wr_gnt ; - for (int mm = 0; mm < WRITE_PORTS; mm++) begin - if(wr_gnt[mm]) begin - waddr_o [ll] = waddr_i [mm]; - wrowaddr_o[ll] = wrowaddr_i[mm]; - wdata_o [ll] = wdata_i [mm]; - we_o [ll] = we_i [mm]; - ll++; - end - end - end - end else always_comb begin : write_block_noArb - wr_gnt = wr_req ; - waddr_o = waddr_i ; - wrowaddr_o = wrowaddr_i; - wdata_o = wdata_i ; - we_o = wr_gnt ; - wready_o = wr_gnt ; //might need to be changed - end - - if(RF_READ_PORTS != READ_PORTS) begin: 
read_block_wArb - - quadrilatero_rr_arbiter #( - .NumActOut (RF_READ_PORTS) , - .N_ROWS (N_ROWS) , - .WIDTH (READ_PORTS) - ) rd_arb_i( - .clk_i , - .rst_ni , - .req_i (rd_req), - .grant_o (rd_gnt) - ); - - always_comb begin: rdata_block - automatic int ll=0; - - rvalid_o = rd_gnt; - for (int mm = 0; mm < READ_PORTS; mm++) begin - if(rd_gnt[mm]) begin - raddr_o [ll] = raddr_i [mm]; - rrowaddr_o[ll] = rrowaddr_i[mm]; - rdata_o [mm] = rdata_i [ll]; - ll++; - end else begin - rdata_o [mm] = rdata_i [RF_READ_PORTS-1]; - end - end - end - end else always_comb begin : read_block_noArb - rd_gnt = rd_req ; - raddr_o = raddr_i ; - rrowaddr_o = rrowaddr_i; - rdata_o = rdata_i ; - rvalid_o = rd_gnt ; //might need to be changed - end - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - scoreboard_q <= '0; - end else begin - scoreboard_q <= scoreboard_d; - end - end - - //------------------------------------------------------------------------------------------------------- - - // Assertions - if (WRITE_PORTS < 2) begin - $error( - "[rf_sequencer] WRITE_PORTS must be at least 2.\n" - ); - end - if (READ_PORTS < 2) begin - $error( - "[rf_sequencer] READ_PORTS must be at least 2.\n" - ); - end -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_2bit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_2bit.sv deleted file mode 100644 index d25b6bf64..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_2bit.sv +++ /dev/null @@ -1,295 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -module quadrilatero_rf_sequencer #( - parameter READ_PORTS = 4 , - parameter WRITE_PORTS = 2 , - parameter N_REGS = 8 , - parameter N_ROWS = 4 , - parameter RLEN = 128 , - parameter RF_READ_PORTS = 3 , - parameter RF_WRITE_PORTS = 1, - parameter SYNC_REQ = 1, - - parameter N_ENTRIES = 3 // entries in the FIFOs for each register -) ( - - input logic clk_i, - input logic rst_ni, - - // Input from FUs - input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i , - input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i , - output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o , - output logic [READ_PORTS-1:0] rvalid_o , - input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) - input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) - input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , - - - input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i , - input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i , - input logic [WRITE_PORTS-1:0][RLEN-1:0] wdata_i , - input logic [WRITE_PORTS-1:0] we_i , - input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) - output logic [WRITE_PORTS-1:0] wready_o , - input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , - - // Outputs to RF - output logic [RF_READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_o , - output logic [RF_READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [RF_READ_PORTS-1:0][RLEN-1:0] rdata_i , - - - output logic [RF_WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_o , - output logic [RF_WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [RF_WRITE_PORTS-1:0][RLEN-1:0] wdata_o , - output logic [RF_WRITE_PORTS-1:0] we_o , - - - // Inputs from Dispatcher - // We can share the entry as we fetch 1 instruction at a time - // NOTE: Actually maybe it's better to have more ports so that we can push all operands and not waste cycles - 
input quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry_i , - input logic [N_REGS-1:0] rw_queue_push_i , - - // Outputs to Dispatcher - output logic [N_REGS-1:0] rw_queue_full_o -); - - logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; - logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_clr ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_full ; - - logic [WRITE_PORTS-1:0] wr_gnt ; - logic [WRITE_PORTS-1:0] wr_req ; - logic [READ_PORTS -1:0] rd_req ; - logic [READ_PORTS -1:0] rd_gnt ; - - logic [N_REGS-1:0] rw_queue_push ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; - - genvar ii,hh; - - assign rw_queue_pop = w_pop | r_pop | ~head_valid; - assign rw_queue_entry = rw_queue_entry_i ; - assign rw_queue_push = rw_queue_push_i ; - - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; - assign rw_queue_pop_fifo = rw_queue_pop & ~rw_queue_empty; - - for (ii = 0; ii < N_REGS; ii++) begin: gen_fifo__regs - for (hh = 0; hh < N_ROWS; hh++) begin: gen_fifo__rows - fifo_v3 #( - .FALL_THROUGH (1'b1) , - .DEPTH (N_ENTRIES) , - .dtype (quadrilatero_pkg::rw_queue_t) - ) issue_queue_inst ( - .clk_i, - .rst_ni, - .flush_i (1'b0 ), - .testmode_i (1'b0 ), - .usage_o ( ), - .full_o (rw_queue_full [ii][hh] ), - .empty_o (rw_queue_empty[ii][hh] ), - .data_i (rw_queue_entry[ii] ), // data to push into the queue - .push_i (rw_queue_push [ii] ), // data is valid and can be pushed to the queue - .data_o (rw_queue [ii][hh] ), // output data - .pop_i (rw_queue_pop [ii][hh] & ~rw_queue_empty[ii][hh]) // pop head from queue - ); - end - end - - always_comb begin: scoreboard_block - rw_queue_full_o = '0; - for (int i = 0; 
i < N_REGS; i++) begin - for (int h = 0; h < N_ROWS; h++) begin - rw_queue_full_o[i] |= (rw_queue_full[i][h]); - - head_valid[i][h] = scoreboard_q[i][h].wready | scoreboard_q[i][h].rvalid; - - - scoreboard_d[i][h].id = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].id : scoreboard_q[i][h].id; - - scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; - - scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : - (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; - end - end - end - - always_comb begin: ctrl_block - wr_req = '0; - rd_req = '0; - w_pop = '0; - r_pop = '0; - r_clr = '0; - - for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request - automatic int m = 32'(waddr_i[jj]); - automatic int n = 32'(wrowaddr_i[jj]); - if( scoreboard_q[m][n].id == wr_id_i[jj] && - scoreboard_q[m][n].wready && we_i[jj] ) - begin - wr_req [jj] = 1'b1; - w_pop [m][n] = wlast_i[jj]; - end - end - - for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request - automatic int m = 32'(raddr_i[jj]); - automatic int n = 32'(rrowaddr_i[jj]); - if( scoreboard_q[m][n].id == rd_id_i[jj] && - scoreboard_q[m][n].rvalid && rready_i[jj] ) - begin - rd_req [jj] = 1'b1; - r_clr [m][n] = rd_gnt[jj]; - r_pop [m][n] = ~scoreboard_q[m][n].wready & rlast_i[jj]; - end - end - - if(SYNC_REQ) begin: sa_sync_req - - logic block ; - logic same_id_acc; - logic same_id_A ; - logic same_id_D ; - logic same_id_W ; - - // same_id_acc = wr_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY ] == scoreboard_q[waddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]][wrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]].id; - same_id_A = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]].id; - same_id_D = 
rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]].id; - same_id_W = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]].id; - - if( // (we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && !same_id_acc) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && !same_id_A ) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && !same_id_D ) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && !same_id_W ) - ) begin - block = 1'b1; - end else begin - block = 1'b0; - end - - // if(we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && same_id_acc && block) begin - // wr_req[quadrilatero_pkg::SYSTOLIC_ARRAY ] = 1'b0; - // end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; - end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && same_id_D && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = 1'b0; - end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && same_id_W && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = 1'b0; - end - end - end - - if(RF_WRITE_PORTS != WRITE_PORTS) begin: write_block_wArb - - quadrilatero_rr_arbiter #( - .NumActOut (RF_WRITE_PORTS) , - .N_ROWS (N_ROWS) , - .WIDTH (WRITE_PORTS) - ) wr_arb_i( - .clk_i , - .rst_ni , - .req_i (wr_req), - .grant_o (wr_gnt) - ); - always_comb begin: wdata_block - automatic int ll=0; - - wready_o = wr_gnt ; - for (int mm = 0; mm < WRITE_PORTS; mm++) begin - if(wr_gnt[mm]) begin - waddr_o [ll] = waddr_i [mm]; - wrowaddr_o[ll] = wrowaddr_i[mm]; - wdata_o [ll] = wdata_i [mm]; - we_o [ll] = we_i [mm]; - ll++; - end - end - end - end else always_comb begin : write_block_noArb - wr_gnt = wr_req ; - waddr_o = waddr_i ; - wrowaddr_o = wrowaddr_i; - wdata_o = wdata_i ; - we_o = wr_gnt ; - wready_o = wr_gnt ; - end - - if(RF_READ_PORTS != 
READ_PORTS) begin: read_block_wArb - - quadrilatero_rr_arbiter #( - .NumActOut (RF_READ_PORTS) , - .N_ROWS (N_ROWS) , - .WIDTH (READ_PORTS) - ) rd_arb_i( - .clk_i , - .rst_ni , - .req_i (rd_req), - .grant_o (rd_gnt) - ); - - always_comb begin: rdata_block - automatic int ll=0; - - rvalid_o = rd_gnt; - for (int mm = 0; mm < READ_PORTS; mm++) begin - if(rd_gnt[mm]) begin - raddr_o [ll] = raddr_i [mm]; - rrowaddr_o[ll] = rrowaddr_i[mm]; - rdata_o [mm] = rdata_i [ll]; - ll++; - end else begin - rdata_o [mm] = rdata_i [RF_READ_PORTS-1]; - end - end - end - end else always_comb begin : read_block_noArb - rd_gnt = rd_req ; - raddr_o = raddr_i ; - rrowaddr_o = rrowaddr_i; - rdata_o = rdata_i ; - rvalid_o = rd_gnt ; - end - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - scoreboard_q <= '0; - end else begin - scoreboard_q <= scoreboard_d; - end - end - - //------------------------------------------------------------------------------------------------------- - - // Assertions - if (WRITE_PORTS < 2) begin - $error( - "[rf_sequencer] WRITE_PORTS must be at least 2.\n" - ); - end - if (READ_PORTS < 2) begin - $error( - "[rf_sequencer] READ_PORTS must be at least 2.\n" - ); - end -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_kinda_working_1bit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_kinda_working_1bit.sv deleted file mode 100644 index ab52e307e..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_kinda_working_1bit.sv +++ /dev/null @@ -1,293 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -module quadrilatero_rf_sequencer #( - parameter READ_PORTS = 4 , - parameter WRITE_PORTS = 2 , - parameter N_REGS = 8 , - parameter N_ROWS = 4 , - parameter RLEN = 128 , - parameter RF_READ_PORTS = 3 , - parameter RF_WRITE_PORTS = 1, - parameter SYNC_REQ = 1, - - parameter N_ENTRIES = 3 // entries in the FIFOs for each register -) ( - - input logic clk_i, - input logic rst_ni, - - // Input from FUs - input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i , - input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i , - output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o , - output logic [READ_PORTS-1:0] rvalid_o , - input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) - input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) - input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , - - - input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i , - input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i , - input logic [WRITE_PORTS-1:0][RLEN-1:0] wdata_i , - input logic [WRITE_PORTS-1:0] we_i , - input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) - output logic [WRITE_PORTS-1:0] wready_o , - input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , - - // Outputs to RF - output logic [RF_READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_o , - output logic [RF_READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [RF_READ_PORTS-1:0][RLEN-1:0] rdata_i , - - - output logic [RF_WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_o , - output logic [RF_WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [RF_WRITE_PORTS-1:0][RLEN-1:0] wdata_o , - output logic [RF_WRITE_PORTS-1:0] we_o , - - - // Inputs from Dispatcher - // We can share the entry as we fetch 1 instruction at a time - // NOTE: Actually maybe it's better to have more ports so that we can push all operands and not waste cycles - 
input quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry_i , - input logic [N_REGS-1:0] rw_queue_push_i , - - // Outputs to Dispatcher - output logic [N_REGS-1:0] rw_queue_full_o -); - - logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; - logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_clr ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_full ; - - logic [WRITE_PORTS-1:0] wr_gnt ; - logic [WRITE_PORTS-1:0] wr_req ; - logic [READ_PORTS -1:0] rd_req ; - logic [READ_PORTS -1:0] rd_gnt ; - - logic [N_REGS-1:0] rw_queue_push ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; - genvar ii,hh; - - assign rw_queue_pop = w_pop | r_pop | ~head_valid; //problem - assign rw_queue_entry = rw_queue_entry_i ; - assign rw_queue_push = rw_queue_push_i ; - - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; - assign rw_queue_pop_fifo = rw_queue_pop & ~rw_queue_empty; - - for (ii = 0; ii < N_REGS; ii++) begin: gen_fifo__regs - for (hh = 0; hh < N_ROWS; hh++) begin: gen_fifo__rows - fifo_v3 #( - .FALL_THROUGH (1'b1) , - .DEPTH (N_ENTRIES) , - .dtype (quadrilatero_pkg::rw_queue_t) - ) issue_queue_inst ( - .clk_i, - .rst_ni, - .flush_i (1'b0 ), - .testmode_i (1'b0 ), - .usage_o ( ), - .full_o (rw_queue_full [ii][hh] ), - .empty_o (rw_queue_empty[ii][hh] ), - .data_i (rw_queue_entry[ii] ), // data to push into the queue - .push_i (rw_queue_push [ii] ), // data is valid and can be pushed to the queue - .data_o (rw_queue [ii][hh] ), // output data - .pop_i (rw_queue_pop [ii][hh] & ~rw_queue_empty[ii][hh]) // pop head from queue - ); - end - end - - always_comb begin: scoreboard_block - rw_queue_full_o = '0; - for (int 
i = 0; i < N_REGS; i++) begin - for (int h = 0; h < N_ROWS; h++) begin - rw_queue_full_o[i] |= (rw_queue_full[i][h]); - - - head_valid[i][h] = scoreboard_q[i][h].valid; - - scoreboard_d[i][h] = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h] : scoreboard_q[i][h]; - - - // scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : - // (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; - - // scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : - // (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : - // (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; - end - end - end - - always_comb begin: ctrl_block - wr_req = '0; - rd_req = '0; - w_pop = '0; - r_pop = '0; - r_clr = '0; - - for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request - automatic int m = 32'(waddr_i[jj]); - automatic int n = 32'(wrowaddr_i[jj]); - if( scoreboard_q[m][n].id == wr_id_i[jj] && we_i[jj] && scoreboard_q[m][n].valid) begin - wr_req [jj] = 1'b1; - w_pop [m][n] = wlast_i[jj]; - end - end - - for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request - automatic int m = 32'(raddr_i[jj]); - automatic int n = 32'(rrowaddr_i[jj]); - if( scoreboard_q[m][n].id == rd_id_i[jj] && rready_i[jj] && scoreboard_q[m][n].valid) begin - rd_req [jj] = 1'b1; - r_pop [m][n] = rlast_i[jj] && (jj != quadrilatero_pkg::SYSTOLIC_ARRAY_A); // for SA_A port we can't pop on read - end - end - - if(SYNC_REQ) begin: sa_sync_req - - logic block ; - logic same_id_acc; - logic same_id_A ; - logic same_id_D ; - logic same_id_W ; - - // same_id_acc = wr_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY ] == scoreboard_q[waddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]][wrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]].id; - same_id_A = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]].id; - same_id_D = 
rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]].id; - same_id_W = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]].id; - - if( // (we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && !same_id_acc) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && !same_id_A ) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && !same_id_D ) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && !same_id_W ) - ) begin - block = 1'b1; - end else begin - block = 1'b0; - end - - // if(we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && same_id_acc && block) begin - // wr_req[quadrilatero_pkg::SYSTOLIC_ARRAY ] = 1'b0; - // end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; - r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]] = 1'b0; - end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && same_id_D && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = 1'b0; - r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]] = 1'b0; - end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && same_id_W && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = 1'b0; - r_pop[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]] = 1'b0; - end - end - end - - if(RF_WRITE_PORTS != WRITE_PORTS) begin: write_block_wArb - - quadrilatero_rr_arbiter #( - .NumActOut (RF_WRITE_PORTS) , - .N_ROWS (N_ROWS) , - .WIDTH (WRITE_PORTS) - ) wr_arb_i( - .clk_i , - .rst_ni , - .req_i (wr_req), - .grant_o (wr_gnt) - ); - always_comb begin: wdata_block - automatic int ll=0; - - wready_o = wr_gnt ; - for (int mm = 0; mm < WRITE_PORTS; mm++) begin - if(wr_gnt[mm]) begin - waddr_o [ll] = waddr_i [mm]; - 
wrowaddr_o[ll] = wrowaddr_i[mm]; - wdata_o [ll] = wdata_i [mm]; - we_o [ll] = we_i [mm]; - ll++; - end - end - end - end else always_comb begin : write_block_noArb - wr_gnt = wr_req ; - waddr_o = waddr_i ; - wrowaddr_o = wrowaddr_i; - wdata_o = wdata_i ; - we_o = wr_gnt ; - wready_o = we_i ; - end - - if(RF_READ_PORTS != READ_PORTS) begin: read_block_wArb - - quadrilatero_rr_arbiter #( - .NumActOut (RF_READ_PORTS) , - .N_ROWS (N_ROWS) , - .WIDTH (READ_PORTS) - ) rd_arb_i( - .clk_i , - .rst_ni , - .req_i (rd_req), - .grant_o (rd_gnt) - ); - - always_comb begin: rdata_block - automatic int ll=0; - - rvalid_o = rd_gnt; - for (int mm = 0; mm < READ_PORTS; mm++) begin - if(rd_gnt[mm]) begin - raddr_o [ll] = raddr_i [mm]; - rrowaddr_o[ll] = rrowaddr_i[mm]; - rdata_o [mm] = rdata_i [ll]; - ll++; - end else begin - rdata_o [mm] = rdata_i [RF_READ_PORTS-1]; - end - end - end - end else always_comb begin : read_block_noArb - rd_gnt = rd_req ; - raddr_o = raddr_i ; - rrowaddr_o = rrowaddr_i; - rdata_o = rdata_i ; - rvalid_o = rd_gnt ; - end - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - scoreboard_q <= '0; - end else begin - scoreboard_q <= scoreboard_d; - end - end - - //------------------------------------------------------------------------------------------------------- - - // Assertions - if (WRITE_PORTS < 2) begin - $error( - "[rf_sequencer] WRITE_PORTS must be at least 2.\n" - ); - end - if (READ_PORTS < 2) begin - $error( - "[rf_sequencer] READ_PORTS must be at least 2.\n" - ); - end -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_old.sv deleted file mode 100644 index aedffc0db..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer_old.sv +++ /dev/null @@ -1,295 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -module quadrilatero_rf_sequencer #( - parameter READ_PORTS = 4 , - parameter WRITE_PORTS = 2 , - parameter N_REGS = 8 , - parameter N_ROWS = 4 , - parameter RLEN = 128 , - parameter RF_READ_PORTS = 3 , - parameter RF_WRITE_PORTS = 1, - parameter SYNC_REQ = 1, - - parameter N_ENTRIES = 3 // entries in the FIFOs for each register -) ( - - input logic clk_i, - input logic rst_ni, - - // Input from FUs - input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i , - input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i , - output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o , - output logic [READ_PORTS-1:0] rvalid_o , - input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) - input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) - input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , - - - input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i , - input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i , - input logic [WRITE_PORTS-1:0][RLEN-1:0] wdata_i , - input logic [WRITE_PORTS-1:0] we_i , - input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) - output logic [WRITE_PORTS-1:0] wready_o , - input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , - - // Outputs to RF - output logic [RF_READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_o , - output logic [RF_READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [RF_READ_PORTS-1:0][RLEN-1:0] rdata_i , - - - output logic [RF_WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_o , - output logic [RF_WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [RF_WRITE_PORTS-1:0][RLEN-1:0] wdata_o , - output logic [RF_WRITE_PORTS-1:0] we_o , - - - // Inputs from Dispatcher - // We can share the entry as we fetch 1 instruction at a time - // NOTE: Actually maybe it's better to have more ports so that we can push all operands and not waste cycles - 
input quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry_i , - input logic [N_REGS-1:0] rw_queue_push_i , - - // Outputs to Dispatcher - output logic [N_REGS-1:0] rw_queue_full_o -); - - logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; - logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_clr ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_full ; - - logic [WRITE_PORTS-1:0] wr_gnt ; - logic [WRITE_PORTS-1:0] wr_req ; - logic [READ_PORTS -1:0] rd_req ; - logic [READ_PORTS -1:0] rd_gnt ; - - logic [N_REGS-1:0] rw_queue_push ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; - - genvar ii,hh; - - assign rw_queue_pop = w_pop | r_pop | ~head_valid; - assign rw_queue_entry = rw_queue_entry_i ; - assign rw_queue_push = rw_queue_push_i ; - - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; - assign rw_queue_pop_fifo = rw_queue_pop & ~rw_queue_empty; - - for (ii = 0; ii < N_REGS; ii++) begin: gen_fifo__regs - for (hh = 0; hh < N_ROWS; hh++) begin: gen_fifo__rows - fifo_v3 #( - .FALL_THROUGH (1'b1) , - .DEPTH (N_ENTRIES) , - .dtype (quadrilatero_pkg::rw_queue_t) - ) issue_queue_inst ( - .clk_i, - .rst_ni, - .flush_i (1'b0 ), - .testmode_i (1'b0 ), - .usage_o ( ), - .full_o (rw_queue_full [ii][hh] ), - .empty_o (rw_queue_empty[ii][hh] ), - .data_i (rw_queue_entry[ii] ), // data to push into the queue - .push_i (rw_queue_push [ii] ), // data is valid and can be pushed to the queue - .data_o (rw_queue [ii][hh] ), // output data - .pop_i (rw_queue_pop [ii][hh] & ~rw_queue_empty[ii][hh]) // pop head from queue - ); - end - end - - always_comb begin: scoreboard_block - rw_queue_full_o = '0; - for (int i = 0; 
i < N_REGS; i++) begin - for (int h = 0; h < N_ROWS; h++) begin - rw_queue_full_o[i] |= (rw_queue_full[i][h]); - - head_valid[i][h] = scoreboard_q[i][h].wready | scoreboard_q[i][h].rvalid; - - - scoreboard_d[i][h].id = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? '0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].id : scoreboard_q[i][h].id; - - scoreboard_d[i][h].wready = (rw_queue_pop[i][h] && rw_queue_empty[i][h]) ? 1'b0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].wready : scoreboard_q[i][h].wready; - - scoreboard_d[i][h].rvalid = (rw_queue_pop[i][h] && rw_queue_empty[i][h] ) ? 1'b0 : - (rw_queue_pop[i][h] ) ? rw_queue[i][h].rvalid : - (r_clr[i][h] ) ? 1'b0 : scoreboard_q[i][h].rvalid; - end - end - end - - always_comb begin: ctrl_block - wr_req = '0; - rd_req = '0; - w_pop = '0; - r_pop = '0; - r_clr = '0; - - for (int jj = 0; jj < WRITE_PORTS; jj++) begin: write_request - automatic int m = 32'(waddr_i[jj]); - automatic int n = 32'(wrowaddr_i[jj]); - if( scoreboard_q[m][n].id == wr_id_i[jj] && - scoreboard_q[m][n].wready && we_i[jj] ) - begin - wr_req [jj] = ~scoreboard_q[m][n].rvalid; - w_pop [m][n] = wr_gnt[jj]; - end - end - - for (int jj = 0; jj < READ_PORTS; jj++) begin: read_request - automatic int m = 32'(raddr_i[jj]); - automatic int n = 32'(rrowaddr_i[jj]); - if( scoreboard_q[m][n].id == rd_id_i[jj] && - scoreboard_q[m][n].rvalid && rready_i[jj] ) - begin - rd_req [jj] = 1'b1; - r_clr [m][n] = rd_gnt[jj]; - r_pop [m][n] = rd_gnt[jj] &~ scoreboard_q[m][n].wready; - end - end - - if(SYNC_REQ) begin: sa_sync_req - - logic block ; - logic same_id_acc; - logic same_id_A ; - logic same_id_D ; - logic same_id_W ; - - // same_id_acc = wr_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY ] == scoreboard_q[waddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]][wrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY ]].id; - same_id_A = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A]].id; - 
same_id_D = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D]].id; - same_id_W = rd_id_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] == scoreboard_q[raddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]][rrowaddr_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W]].id; - - if( // (we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && !same_id_acc) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && !same_id_A ) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && !same_id_D ) || - (rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && !same_id_W ) - ) begin - block = 1'b1; - end else begin - block = 1'b0; - end - - // if(we_i [quadrilatero_pkg::SYSTOLIC_ARRAY ] && same_id_acc && block) begin - // wr_req[quadrilatero_pkg::SYSTOLIC_ARRAY ] = 1'b0; - // end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_A] && same_id_A && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_A] = 1'b0; - end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_D] && same_id_D && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_D] = 1'b0; - end - if(rready_i[quadrilatero_pkg::SYSTOLIC_ARRAY_W] && same_id_W && block) begin - rd_req[quadrilatero_pkg::SYSTOLIC_ARRAY_W] = 1'b0; - end - end - end - - if(RF_WRITE_PORTS != WRITE_PORTS) begin: write_block_wArb - - quadrilatero_rr_arbiter #( - .NumActOut (RF_WRITE_PORTS) , - .N_ROWS (N_ROWS) , - .WIDTH (WRITE_PORTS) - ) wr_arb_i( - .clk_i , - .rst_ni , - .req_i (wr_req), - .grant_o (wr_gnt) - ); - always_comb begin: wdata_block - automatic int ll=0; - - wready_o = wr_gnt ; - for (int mm = 0; mm < WRITE_PORTS; mm++) begin - if(wr_gnt[mm]) begin - waddr_o [ll] = waddr_i [mm]; - wrowaddr_o[ll] = wrowaddr_i[mm]; - wdata_o [ll] = wdata_i [mm]; - we_o [ll] = we_i [mm]; - ll++; - end - end - end - end else always_comb begin : write_block_noArb - wr_gnt = wr_req ; - waddr_o = waddr_i ; - wrowaddr_o = wrowaddr_i; - wdata_o = wdata_i ; - we_o = wr_gnt ; - wready_o = wr_gnt ; - end - - 
if(RF_READ_PORTS != READ_PORTS) begin: read_block_wArb - - quadrilatero_rr_arbiter #( - .NumActOut (RF_READ_PORTS) , - .N_ROWS (N_ROWS) , - .WIDTH (READ_PORTS) - ) rd_arb_i( - .clk_i , - .rst_ni , - .req_i (rd_req), - .grant_o (rd_gnt) - ); - - always_comb begin: rdata_block - automatic int ll=0; - - rvalid_o = rd_gnt; - for (int mm = 0; mm < READ_PORTS; mm++) begin - if(rd_gnt[mm]) begin - raddr_o [ll] = raddr_i [mm]; - rrowaddr_o[ll] = rrowaddr_i[mm]; - rdata_o [mm] = rdata_i [ll]; - ll++; - end else begin - rdata_o [mm] = rdata_i [RF_READ_PORTS-1]; - end - end - end - end else always_comb begin : read_block_noArb - rd_gnt = rd_req ; - raddr_o = raddr_i ; - rrowaddr_o = rrowaddr_i; - rdata_o = rdata_i ; - rvalid_o = rd_gnt ; - end - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - scoreboard_q <= '0; - end else begin - scoreboard_q <= scoreboard_d; - end - end - - //------------------------------------------------------------------------------------------------------- - - // Assertions - if (WRITE_PORTS < 2) begin - $error( - "[rf_sequencer] WRITE_PORTS must be at least 2.\n" - ); - end - if (READ_PORTS < 2) begin - $error( - "[rf_sequencer] READ_PORTS must be at least 2.\n" - ); - end -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv deleted file mode 100644 index cde315829..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv +++ /dev/null @@ -1,484 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -/* - -TODO: -- handle matrices operations with matrices < MESH_WIDTH based on the configuration CSRs - - basically you need to inject zeros instead of actual elements -*/ - -module quadrilatero_systolic_array #( - parameter int MESH_WIDTH = 4 , - parameter int DATA_WIDTH = 32 , - parameter int N_REGS = 8 , - parameter int ENABLE_SIMD = 1 , - localparam int N_ROWS = MESH_WIDTH , - localparam int ALEN = DATA_WIDTH * MESH_WIDTH, - parameter FPU = 1 -) ( - input logic clk_i , - input logic rst_ni , - - output logic sa_ready_o , - input logic start_i , - - // Only has effect if ENABLE_SIMD == 1 - input quadrilatero_pkg::sa_ctrl_t sa_ctrl_i , - - input logic [ $clog2(N_REGS)-1:0] data_reg_i , // data register - input logic [ $clog2(N_REGS)-1:0] acc_reg_i , // accumulator register - input logic [ $clog2(N_REGS)-1:0] weight_reg_i , // weight register - input logic [xif_pkg::X_ID_WIDTH-1:0] id_i , // id of the instruction - - // Weight Read Register Port - output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] weight_rrowaddr_o , - input logic [ ALEN-1:0] weight_rdata_i , - input logic weight_rdata_valid_i, - output logic weight_rdata_ready_o, - output logic weight_rlast_o , - - // Data Read Register Port - output logic [ $clog2(N_REGS)-1:0] data_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] data_rrowaddr_o , - input logic [ ALEN-1:0] data_rdata_i , - input logic data_rdata_valid_i , - output logic data_rdata_ready_o , - output logic data_rlast_o , - - // Accumulator Read Register Port - output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] acc_rrowaddr_o , - input logic [ ALEN-1:0] acc_rdata_i , - input logic acc_rdata_valid_i , - output logic acc_rdata_ready_o , - output logic acc_rlast_o , - - // Accumulator Out Write Register Port - output logic [ $clog2(N_REGS)-1:0] res_waddr_o , - output logic [ $clog2(N_ROWS)-1:0] 
res_wrowaddr_o , - output logic [ ALEN-1:0] res_wdata_o , - output logic res_we_o , - output logic res_wlast_o , - input logic res_wready_i , - - // RF Instruction ID - output logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id_o , - output logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id_o , - - // Finish - output logic finished_o , - input logic finished_ack_i , - output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o -); - typedef enum logic [1:0]{ - FS_IDLE, - FS_ACTIVE, - FS_LAST - } fs_state_e; - typedef enum logic [1:0]{ - FF_IDLE, - FF_ACTIVE, - FF_DONE - } ff_state_e; - typedef enum logic [1:0]{ - DR_IDLE, - DR_ACTIVE, - DR_DONE - } dr_state_e; - - ff_state_e ff_state_d, ff_state_q; - fs_state_e fs_state_d, fs_state_q; - dr_state_e dr_state_d, dr_state_q; - localparam LastRow = $clog2(MESH_WIDTH)'(MESH_WIDTH-1); - logic valid ; - logic clear ; - logic pump ; - logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; - logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_q ; - logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_q ; - - logic [ $clog2(N_REGS)-1:0] data_reg_d ; // Data register - logic [ $clog2(N_REGS)-1:0] data_reg_q ; // Data register - logic [ $clog2(N_REGS)-1:0] acc_reg_d ; // Accumulator register -- FF Stage - logic [ $clog2(N_REGS)-1:0] acc_reg_q ; // Accumulator register -- FF Stage - logic [ $clog2(N_REGS)-1:0] weight_reg_q ; // Weight register - logic [ $clog2(N_REGS)-1:0] weight_reg_d ; // Weight register - quadrilatero_pkg::sa_ctrl_t sa_ctrl_d ; - quadrilatero_pkg::sa_ctrl_t sa_ctrl_q ; - - logic [ $clog2(N_REGS)-1:0] acc_fs_q ; // Accumulator register -- FS Stage - logic [ $clog2(N_REGS)-1:0] acc_fs_d ; // Accumulator register -- FS Stage - logic [ $clog2(N_REGS)-1:0] dest_reg_q ; // Accumulator register -- DR Stage - logic [ $clog2(N_REGS)-1:0] dest_reg_d ; // Accumulator register -- DR Stage - - logic 
[xif_pkg::X_ID_WIDTH-1:0] id_ff_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_q ; - - logic finished_d ; - logic finished_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_d; - logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_q; - logic mask_req ; - - quadrilatero_pkg::sa_ctrl_t [MESH_WIDTH-1:0] sa_ctrl_mesh_skewed; - - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] data_mesh_skewed ; - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] acc_mesh_skewed ; - logic [MESH_WIDTH-1:0][MESH_WIDTH-1:0][DATA_WIDTH-1:0] weight_mesh_skewed ; - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] res_mesh_skewed ; - - //--------------------------------------------------------------------- - - always_comb begin: rf_block - // Weight Read Register Port - weight_raddr_o = weight_reg_q ; - weight_rrowaddr_o = ff_counter_q ; - weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - weight_rlast_o = (ff_state_q != FF_IDLE) ; - - // Data Read Register Port - data_raddr_o = data_reg_q ; - data_rrowaddr_o = ff_counter_q ; - data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - data_rlast_o = ff_state_q != FF_IDLE ; - - // Accumulator Read Register Port - acc_raddr_o = acc_reg_q ; - acc_rrowaddr_o = ff_counter_q ; - acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - acc_rlast_o = '0 ; - - // Accumulator Out Write Register Port - res_waddr_o = dest_reg_q ; - res_wrowaddr_o = dr_counter_q ; - res_we_o = (dr_state_q == DR_ACTIVE) &~ mask_req ; - res_wlast_o = dr_state_q != DR_IDLE ; - end - - always_comb begin: finished_signal - - finished_d = (res_wready_i && (dr_counter_q == LastRow)) ? 1'b1 : - (finished_ack_i ) ? 1'b0 : finished_q; - - finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow)) ? id_dr_q : - (finished_ack_i ) ? 
'0 : finished_instr_id_q; - end - - always_comb begin: ctrl_block - valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q != DR_ACTIVE)) begin - clear = 1'b1; - end else begin - clear = 1'b0; - end - if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q == DR_ACTIVE)) begin - pump = 1'b1; - end else begin - pump = 1'b0; - end - mask_req = (dr_counter_q==LastRow) & finished_q & ~finished_ack_i; - end - - always_comb begin : ff_fsm_block - ff_counter_d = ff_counter_q; - ff_state_d = ff_state_q; - data_reg_d = data_reg_q; - acc_reg_d = acc_reg_q; - weight_reg_d = weight_reg_q; - sa_ctrl_d = sa_ctrl_q; - id_ff_d = id_ff_q; - - unique case (ff_state_q) - FF_IDLE: begin - ff_counter_d = '0; - if(start_i == 1'b1) begin - ff_state_d = FF_ACTIVE; - data_reg_d = data_reg_i; - acc_reg_d = acc_reg_i; - weight_reg_d = weight_reg_i; - sa_ctrl_d = sa_ctrl_i; - id_ff_d = id_i; - end - end - FF_ACTIVE: begin - if(valid == 1'b1) begin - if(ff_counter_q==(LastRow-1)) begin - ff_counter_d = ff_counter_q + 1; - ff_state_d = FF_DONE; - end else begin - ff_counter_d = ff_counter_q + 1; - end - end - - - end - FF_DONE: begin - if(start_i == 1'b1) begin - ff_counter_d = '0; - ff_state_d = FF_ACTIVE; - - data_reg_d = data_reg_i; - acc_reg_d = acc_reg_i; - weight_reg_d = weight_reg_i; - sa_ctrl_d = sa_ctrl_i; - id_ff_d = id_i; - end else begin - ff_counter_d = '0; - ff_state_d = FF_IDLE; - end - - end - default: begin - ff_state_d = FF_IDLE; - end - endcase - end - always_comb begin : fs_fsm_block - fs_counter_d = fs_counter_q; - fs_state_d = fs_state_q; - - acc_fs_d = acc_fs_q; - id_fs_d = id_fs_q; - - unique case(fs_state_q) - FS_IDLE: begin - fs_counter_d = '0; - if(ff_state_q == FF_DONE ) begin - fs_state_d = FS_ACTIVE; - - acc_fs_d = acc_reg_q; - id_fs_d = id_ff_q; - end - - end - FS_ACTIVE: begin - if(clear == 1'b1) begin - fs_counter_d = '0; - fs_state_d = 
FS_IDLE; - end else begin - if(fs_counter_q == LastRow-2) begin - fs_counter_d = fs_counter_q + 1; - fs_state_d = FS_LAST; - end else begin - fs_counter_d = fs_counter_q + 1; - end - end - end - FS_LAST: begin - fs_counter_d = '0; - if(ff_state_q == FF_DONE) begin //stay in active mode, load new inputs - fs_state_d = FS_ACTIVE; - - acc_fs_d = acc_reg_q; - id_fs_d = id_ff_q; - end - if(ff_state_q == FF_IDLE) begin - fs_state_d = FS_IDLE; - end - - end - default: begin - fs_state_d = FS_IDLE; - end - - endcase - end - - always_comb begin : dr_fsm_block - dr_state_d = dr_state_q; - dr_counter_d = dr_counter_q; - - dest_reg_d = dest_reg_q; - id_dr_d = id_dr_q; - unique case(dr_state_q) - DR_IDLE: begin - dr_counter_d = '0; - if(fs_state_q == FS_LAST) begin //fs_counter_d == LastRow ) && (fs_counter_q == LastRow - 1 - dr_state_d = DR_ACTIVE; - dest_reg_d = acc_fs_q; - id_dr_d = id_fs_q; - end - - end - DR_ACTIVE: begin - if(clear == 1'b1) begin - dr_counter_d = '0; - dr_state_d = DR_IDLE; - end else begin - if(dr_counter_q == LastRow) begin - if(res_wready_i == 1'b0) begin - dr_state_d = dr_state_q; //stall the pipeline - end else begin - dr_counter_d = '0; - if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs (fs_counter_d == LastRow - 1 ) && (fs_counter_q == LastRow - 2) - dr_state_d = DR_ACTIVE; - dest_reg_d = acc_fs_q; - id_dr_d = id_fs_q; - end - if(fs_state_q == FS_IDLE) begin - dr_state_d = DR_DONE; - end - end - end else begin - dr_counter_d = dr_counter_q + 1; - end - end - - end - DR_DONE: begin - dr_state_d = DR_IDLE; - end - default: begin - dr_state_d = DR_IDLE; - end - - endcase - - - end - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) skewer_inst_data ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (data_rdata_i ), - .data_o (data_mesh_skewed) - ); - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) skewer_inst_acc ( - .clk_i , - .rst_ni , - .pump_i (pump ), - 
.data_i (acc_rdata_i ), - .data_o (acc_mesh_skewed) - ); - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(4) - ) skewer_inst_ctrl ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i ({MESH_WIDTH{sa_ctrl_q}}), - .data_o (sa_ctrl_mesh_skewed ) - ); - - quadrilatero_wl_stage #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) weight_inst ( - .clk_i , - .rst_ni , - - .ff_counter (ff_counter_q ), - .clear_i (clear ), - .pump_i (pump ), - .weight_rdata_valid_i , - - // Weight Data - .weight_rdata_i , - .weight_rdata_o (weight_mesh_skewed ) - ); - - quadrilatero_mesh #( - .MESH_WIDTH (MESH_WIDTH ), - .ENABLE_SIMD(ENABLE_SIMD), - .FPU (FPU ) - ) mesh_inst ( - .clk_i, - .rst_ni, - - .pump_i (pump ), - .sa_ctrl_i (sa_ctrl_mesh_skewed ), - - .data_i (data_mesh_skewed ), - .acc_i (acc_mesh_skewed ), - .weight_i (weight_mesh_skewed ), - .acc_o (res_mesh_skewed ) - ); - - quadrilatero_deskewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) deskewer_inst_acc ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (res_mesh_skewed), - .data_o (res_wdata_o ) - ); - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - ff_counter_q <= '0; - fs_counter_q <= '0; - dr_counter_q <= '0; - ff_state_q <= FF_IDLE; - fs_state_q <= FS_IDLE; - dr_state_q <= DR_IDLE; - data_reg_q <= '0; - acc_reg_q <= '0; - weight_reg_q <= '0; - sa_ctrl_q <= '0; - acc_fs_q <= '0; - dest_reg_q <= '0; - id_ff_q <= '0; - id_fs_q <= '0; - id_dr_q <= '0; - finished_q <= '0; - finished_instr_id_q <= '0; - end else begin - ff_counter_q <= ff_counter_d ; - fs_counter_q <= fs_counter_d ; - dr_counter_q <= dr_counter_d ; - ff_state_q <= ff_state_d; - fs_state_q <= fs_state_d; - dr_state_q <= dr_state_d; - data_reg_q <= data_reg_d ; - acc_reg_q <= acc_reg_d ; - weight_reg_q <= weight_reg_d ; - sa_ctrl_q <= sa_ctrl_d ; - acc_fs_q <= acc_fs_d ; - dest_reg_q <= dest_reg_d ; - id_ff_q <= id_ff_d ; - id_fs_q <= id_fs_d ; - id_dr_q <= id_dr_d ; - 
finished_q <= finished_d ; - finished_instr_id_q <= finished_instr_id_d ; - end - end - - assign sa_ready_o = (ff_counter_d=='0) & (((ff_state_q != FF_IDLE) &~ ff_counter_q=='0) | clear); - assign sa_input_id_o = id_ff_q ; - assign sa_output_id_o = id_dr_q ; - assign finished_o = finished_q ; - assign finished_instr_id_o = finished_instr_id_q; - - // -------------------------------------------------------------------- - - // Assertions - if (MESH_WIDTH < 2) begin - $error( - "[systolic_array] MESH_WIDTH must be at least 2.\n" - ); - end -endmodule From 7a414a6b7cdb08db87aa95ee92f3e48af762319a Mon Sep 17 00:00:00 2001 From: Angelo Nujic Date: Sat, 12 Apr 2025 10:54:13 +0200 Subject: [PATCH 10/18] 8x8 control working --- .../rtl/include/quadrilatero_pkg.sv | 3 +- .../quadrilatero/rtl/quadrilatero.sv | 3 +- .../quadrilatero/rtl/quadrilatero_lsu.sv | 16 ++- .../rtl/quadrilatero_register_lsu.sv | 52 +++++++--- .../quadrilatero_register_lsu_controller.sv | 10 +- .../rtl/quadrilatero_rf_sequencer.sv | 2 +- .../rtl/quadrilatero_systolic_array.sv | 97 +++++++++++-------- 7 files changed, 115 insertions(+), 68 deletions(-) diff --git a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv index b38e262ee..37b43b43e 100644 --- a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv +++ b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv @@ -8,7 +8,8 @@ package quadrilatero_pkg; parameter int unsigned N_REGS = 8; parameter int unsigned DATA_WIDTH = 32; parameter int unsigned BUS_WIDTH = 128; - parameter int unsigned MESH_WIDTH = 4; + parameter int unsigned MESH_WIDTH = 8; + parameter int unsigned SA_MESH_WIDTH = 4; parameter int unsigned NUM_EXEC_UNITS = 3; // change me to add units parameter int unsigned MAX_NUM_READ_OPERANDS = 3; parameter int unsigned MAX_NUM_WRITE_OPERANDS = 1; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv 
index 5a4d062a1..108ebac41 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv @@ -671,7 +671,7 @@ module quadrilatero ); quadrilatero_systolic_array #( - .MESH_WIDTH(MESH_WIDTH), + .MESH_WIDTH(quadrilatero_pkg::SA_MESH_WIDTH), .FPU (FPU ) ) sa_inst ( .clk_i , @@ -774,6 +774,7 @@ module quadrilatero // To Register Loader .busy_i (lsu_busy | x_res_almost_full), // Load Unit busy + .finished_i (lsu_finished), .start_o (lsu_ctrl_start ), // .issued_instr_o (lsu_ctrl_issued_instr ), // issued instruction .issued_instr_conf_o (lsu_ctrl_issued_instr_conf ) // issued instruction configuration diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv index cb5f9b2ff..a7780d12a 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv @@ -5,7 +5,7 @@ // Author: Danilo Cammarata module quadrilatero_lsu #( - parameter int unsigned FIFO_DEPTH = 4, + parameter int unsigned FIFO_DEPTH = quadrilatero_pkg::MESH_WIDTH, parameter int unsigned DATA_WIDTH = 32 ) ( @@ -27,7 +27,8 @@ module quadrilatero_lsu #( input logic write_i , // write transaction output logic busy_o , // lsu available output logic terminate_o , // lsu done - + input logic last_i, + // Address input logic [ 31:0] src_ptr_i , // base address input logic [ 31:0] stride_i , // stride to move in memory from one row to the next one @@ -100,6 +101,8 @@ module quadrilatero_lsu #( logic store_fifo_empty ; logic [ DATA_WIDTH-1:0] store_fifo_output ; logic store_fifo_pop ; + logic last_q ; + logic last_d ; enum { @@ -109,6 +112,7 @@ module quadrilatero_lsu #( lsu_state_q, lsu_state_d; + assign last_d = last_i; always_comb begin : FSM_block lsu_state_d = lsu_state_q; @@ -119,7 +123,7 @@ module quadrilatero_lsu #( end end LSU_RUNNING: begin - if (terminate && !start_i) begin + if (terminate && !start_i && (store_fifo_empty)) begin 
lsu_state_d = LSU_READY; end end @@ -150,14 +154,14 @@ module quadrilatero_lsu #( rows_d = rows_i - 1; cols_d = cols_i - 2; end else if (rows_i > 1) begin - rows_d = rows_i - 2; + rows_d = rows_i - 1; cols_d = cols_i - 1; end end else begin rows_d = rows_i - 1; cols_d = cols_i - 1; end - end else if (data_gnt_i && data_req_o) begin + end else if (data_gnt_i && data_req_o && last_i) begin if (cols_q > 0) cols_d = cols_q - 1; else if (rows_q > 0) begin cols_d = cols_i - 1; @@ -291,6 +295,7 @@ module quadrilatero_lsu #( if (~rst_ni) begin lsu_state_q <= LSU_READY; ptr_q <= '0 ; + last_q <= '0 ; rows_q <= '0 ; cols_q <= '0 ; rd_head_q <= '0 ; @@ -299,6 +304,7 @@ module quadrilatero_lsu #( end else begin lsu_state_q <= lsu_state_d; ptr_q <= ptr_d ; + last_q <= last_d ; rows_q <= rows_d ; cols_q <= cols_d ; rd_head_q <= rd_head_d ; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index 16b0cbe1e..aad3f7463 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -85,6 +85,9 @@ module quadrilatero_register_lsu #( logic [$clog2(N_ROWS)-1:0] counter_d; logic [$clog2(N_REGS)-1:0] waddr_q; logic [$clog2(N_REGS)-1:0] waddr_d; + logic [$clog2(N_REGS)-1:0] raddr_q; + logic [$clog2(N_REGS)-1:0] raddr_d; + logic [LLEN-1:0] load_fifo_data; @@ -130,15 +133,15 @@ module quadrilatero_register_lsu #( assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; always_comb begin - lsu_id_o = (write_i &~ load_fifo_data_available) ? instr_id_i : back_id_q; - finished = (write_q & terminate) | (~write_q & (counter_q == LastRow) & wready_i); + lsu_id_o = (write_i &~ load_fifo_data_available & rlast_o) ? 
instr_id_i : back_id_q; + finished = (write_q & terminate & rlast_o) | (~write_q & (counter_q == LastRow) & wready_i && wlast_o); end always_comb begin: write_to_RF data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols load_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); - we_o = load_fifo_data_available &~ mask_req && (access_counter_q == NumAccesses -1); // && ((access_counter_q == NumAccesses -1) || (lsu_state_q == LSU_LOAD && !load_fifo_valid)); //last part is sketchy + we_o = load_fifo_data_available &~ mask_req; // && ((access_counter_q == NumAccesses -1) || (lsu_state_q == LSU_LOAD && !load_fifo_valid)); //last part is sketchy waddr_o = lsu_state_q == LSU_IDLE? waddr_d : waddr_q; wrowaddr_o = counter_q ; load_row_buffer_d = (load_row_buffer_q & ~load_mask) | (load_fifo_data << (LLEN * access_counter_q)); @@ -150,7 +153,7 @@ module quadrilatero_register_lsu #( store_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; rrowaddr_o = counter_q ; - raddr_o = operand_reg_i ; + raddr_o = operand_reg_i;//lsu_state_q == LSU_IDLE? raddr_d : raddr_q; end always_comb begin: lsu_ctrl_block @@ -175,7 +178,7 @@ module quadrilatero_register_lsu #( stride_d = (start) ? stride_i : stride_q ; src_ptr_d = (start) ? address_i : src_ptr_q; - busy_d = (write_i && (counter_q == LastRow) && rdata_valid_i) ? 1'b0 : + busy_d = (write_i && (counter_q == LastRow) && rdata_valid_i && rlast_o) ? 1'b0 : (write_i && start_i) ? 
1'b1 : busy_q; end always_comb begin: fsm_block @@ -187,21 +190,32 @@ module quadrilatero_register_lsu #( back_id_d = back_id_q; waddr_d = waddr_q; + raddr_d = raddr_q; case (lsu_state_q) LSU_IDLE: begin back_id_d = instr_id_i; waddr_d = operand_reg_i; + raddr_d = operand_reg_i; + //access_counter_d = '0; if(load_fifo_valid && !write_i && wready_i) begin - counter_d = counter_q + 1; - wlast_o = 1'b1; - access_counter_d = '0; - lsu_state_d = LSU_LOAD; + if(access_counter_q == NumAccesses - 1) begin + counter_d = counter_q + 1; + lsu_state_d = LSU_LOAD; + wlast_o = 1'b1; + end else begin + access_counter_d = access_counter_q + 1; + lsu_state_d = LSU_LOAD; + end end else if (write_i & store_fifo_space_available && rdata_valid_i) begin - counter_d = counter_q + 1; - rlast_o = 1'b1; - access_counter_d = '0; - lsu_state_d = LSU_STORE; + if(access_counter_q == NumAccesses - 1) begin + counter_d = counter_q + 1; + lsu_state_d = LSU_STORE; + rlast_o = 1'b1; + end else begin + access_counter_d = access_counter_q + 1; + lsu_state_d = LSU_STORE; + end end end @@ -216,6 +230,7 @@ module quadrilatero_register_lsu #( lsu_state_d = LSU_DONE; back_id_d = instr_id_i; waddr_d = operand_reg_i; + raddr_d = operand_reg_i; end else begin access_counter_d = access_counter_q + 1; end @@ -239,6 +254,7 @@ module quadrilatero_register_lsu #( access_counter_d = '0; back_id_d = instr_id_i; waddr_d = operand_reg_i; + raddr_d = operand_reg_i; end else begin access_counter_d = access_counter_q + 1; @@ -257,6 +273,7 @@ module quadrilatero_register_lsu #( lsu_state_d = LSU_DONE; back_id_d = instr_id_i; waddr_d = operand_reg_i; + raddr_d = operand_reg_i; end else begin access_counter_d = access_counter_q + 1; end @@ -271,9 +288,9 @@ module quadrilatero_register_lsu #( end end end else begin - counter_d = '0; - back_id_d = instr_id_i; - lsu_state_d = LSU_DONE; + // counter_d = '0; + // back_id_d = instr_id_i; + // lsu_state_d = LSU_DONE; end end LSU_DONE: begin @@ -311,6 +328,7 @@ module 
quadrilatero_register_lsu #( if (!rst_ni) begin counter_q <= '0; waddr_q <= '0; + raddr_q <= '0; back_id_q <= '0; start_q <= '0; write_q <= '0; @@ -326,6 +344,7 @@ module quadrilatero_register_lsu #( counter_q <= counter_d; back_id_q <= back_id_d; waddr_q <= waddr_d ; + raddr_q <= raddr_d ; start_q <= start_d ; write_q <= write_d ; busy_q <= busy_d ; @@ -362,6 +381,7 @@ module quadrilatero_register_lsu #( .write_i (write_i), .busy_o (busy ), .terminate_o (terminate ), + .last_i (wlast_o | rlast_o), // Address .src_ptr_i (src_ptr ), diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_controller.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_controller.sv index 8709858a6..c4cc392cc 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_controller.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_controller.sv @@ -5,7 +5,7 @@ // Author: Saverio Nasturzio module quadrilatero_register_lsu_controller #( - parameter N_SLOTS = 3 + parameter N_SLOTS = 3 //TODO maybe change that to quadrilatero_pkg::MESH_WIDTH ) ( input logic clk_i, input logic rst_ni, @@ -18,10 +18,12 @@ module quadrilatero_register_lsu_controller #( // To Register Loader input logic busy_i, // Load Unit busy output logic start_o, // WL will start executing new instruction + input logic finished_i, //LSU has finished executing instruction output quadrilatero_pkg::lsu_instr_t issued_instr_o, // issued instruction output quadrilatero_pkg::lsu_conf_t issued_instr_conf_o // issued instruction configuration ); - + logic finished_d; + logic finished_q; localparam int unsigned USAGE = N_SLOTS > 1 : $clog2(N_SLOTS) : 0; logic issue_queue_empty; logic start_load; @@ -38,8 +40,10 @@ module quadrilatero_register_lsu_controller #( issued_instr_ff <= '0; issued_instr_conf_ff <= '0; start_o <= '0; + finished_q <= 1'b0; end else begin start_o <= '0; + finished_q <= finished_d; if (start_load) begin issued_instr_ff <= fifo_data_out; issued_instr_conf_ff <= 
csr_config_i; @@ -48,7 +52,7 @@ module quadrilatero_register_lsu_controller #( end end - + assign finished_d = finished_i; assign issued_instr_conf_o = issued_instr_conf_ff; assign issued_instr_o = issued_instr_ff; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv index 25af6b913..9f1c02c0f 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv @@ -220,7 +220,7 @@ module quadrilatero_rf_sequencer #( wrowaddr_o = wrowaddr_i; wdata_o = wdata_i ; we_o = wr_gnt ; - wready_o = we_i ; + wready_o = wr_gnt ; end if(RF_READ_PORTS != READ_PORTS) begin: read_block_wArb diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index 4e67b7ed9..7912a6456 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -36,7 +36,7 @@ module quadrilatero_systolic_array #( // Weight Read Register Port output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] weight_rrowaddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] weight_rrowaddr_o , input logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_i , input logic weight_rdata_valid_i, output logic weight_rdata_ready_o, @@ -44,7 +44,7 @@ module quadrilatero_systolic_array #( // Data Read Register Port output logic [ $clog2(N_REGS)-1:0] data_raddr_o , - output logic [ $clog2(N_ROWS)-1:0] data_rrowaddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] data_rrowaddr_o , input logic [quadrilatero_pkg::RLEN-1:0] data_rdata_i , input logic data_rdata_valid_i , output logic data_rdata_ready_o , @@ -52,7 +52,7 @@ module quadrilatero_systolic_array #( // Accumulator Read Register Port output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , - output logic [ 
$clog2(N_ROWS)-1:0] acc_rrowaddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] acc_rrowaddr_o , input logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_i , input logic acc_rdata_valid_i , output logic acc_rdata_ready_o , @@ -60,7 +60,7 @@ module quadrilatero_systolic_array #( // Accumulator Out Write Register Port output logic [ $clog2(N_REGS)-1:0] res_waddr_o , - output logic [ $clog2(N_ROWS)-1:0] res_wrowaddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] res_wrowaddr_o , output logic [quadrilatero_pkg::RLEN-1:0] res_wdata_o , output logic res_we_o , output logic res_wlast_o , @@ -127,6 +127,7 @@ module quadrilatero_systolic_array #( logic valid ; + logic ff_valid; logic clear ; logic pump ; logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; @@ -178,23 +179,24 @@ module quadrilatero_systolic_array #( weight_base_row = N_ROWS * ff_it_counter_q; weight_raddr_o = weight_reg_q ; weight_rrowaddr_o = ff_counter_q + weight_base_row; - weight_rdata_masked = (weight_rdata_i & weight_mask) >> ALEN * ff_k_counter_q; + weight_rdata_masked = (weight_rdata_i & weight_mask) >> ALEN * ff_k_counter_q; //TODO fix weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - weight_rlast_o = (ff_state_q != FF_IDLE) && ff_k_counter_q == (K-1) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? + weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_k_counter_q == (K-1) && ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? 
// Data Read Register Port - data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} >> (ALEN * ff_it_counter_q); + data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} << (ALEN * ff_it_counter_q); // TODO fix data_raddr_o = data_reg_q ; data_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - data_rdata_masked = (data_rdata_i & data_mask) >> ALEN * ff_it_counter_q; + data_rdata_masked = (data_rdata_i & data_mask) >> ALEN * ff_it_counter_q ; //TODO fix data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - data_rlast_o = ff_state_q != FF_IDLE && ff_it_counter_q == (K-1) ; + data_rlast_o = ff_state_q != FF_IDLE && (ff_it_counter_q == (K-1) && ff_k_counter_q == (K-1)) ; // Accumulator Read Register Port acc_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); acc_raddr_o = acc_reg_q ; acc_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - acc_rdata_masked = (acc_rdata_i & acc_mask) >> ALEN * ff_k_counter_q; + acc_rdata_masked = sa_input_id_o == sa_output_id_o && acc_rrowaddr_o == res_wrowaddr_o? + (res_wdata_o >> ALEN * ff_k_counter_q)[ALEN-1:0] : (acc_rdata_i >> ALEN * ff_k_counter_q)[ALEN-1:0]; //TODO fix acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; acc_rlast_o = '0 ; @@ -203,15 +205,15 @@ module quadrilatero_systolic_array #( res_wrowaddr_o = dr_counter_q + (dr_row_counter_q * N_ROWS) ; res_wdata_o = res_wdata_buffer_q | res_wdata_partial << ALEN * dr_k_counter_q; res_we_o = (dr_state_q == DR_ACTIVE) &~ mask_req ; - res_wlast_o = (dr_state_q != DR_IDLE) && dr_it_counter_q == (K-1) ; + res_wlast_o = (dr_state_q != DR_IDLE) && (dr_it_counter_q == (K-1) && dr_k_counter_q == (K-1)); end always_comb begin: finished_signal - finished_d = (res_wready_i && (dr_counter_q == LastRow)) ? 1'b1 : + finished_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? 1'b1 : (finished_ack_i ) ? 
1'b0 : finished_q; - finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow)) ? id_dr_q : + finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? id_dr_q : (finished_ack_i ) ? '0 : finished_instr_id_q; end @@ -241,7 +243,8 @@ module quadrilatero_systolic_array #( id_ff_d = id_ff_q; ff_k_counter_d = ff_k_counter_q; ff_it_counter_d = ff_it_counter_q; - ff_row_counter_d = ff_row_counter_q; + ff_row_counter_d = ff_row_counter_q; + ff_valid = 1'b0; unique case (ff_state_q) FF_IDLE: begin @@ -260,6 +263,7 @@ module quadrilatero_systolic_array #( end FF_ACTIVE: begin if(valid == 1'b1) begin + ff_valid = 1'b1; if(ff_counter_q==(LastRow-1)) begin ff_counter_d = ff_counter_q + 1; ff_state_d = FF_DONE; @@ -270,29 +274,32 @@ module quadrilatero_systolic_array #( end FF_DONE: begin - if(start_i == 1'b1) begin - ff_counter_d = '0; - ff_state_d = FF_ACTIVE; - if(ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1)) begin // get inputs from new instruction - ff_it_counter_d = '0; - ff_row_counter_d = '0; - ff_k_counter_d = '0; - data_reg_d = data_reg_i; - acc_reg_d = acc_reg_i; - weight_reg_d = weight_reg_i; - sa_ctrl_d = sa_ctrl_i; - id_ff_d = id_i; - end else begin - if(ff_row_counter_q == RegLastRow-1) begin + if(start_i == 1'b1 | ~(data_rlast_o == 1'b1 && weight_rlast_o == 1'b1 && ff_it_counter_q == (K-1))) begin + if(valid == 1'b1) begin + ff_valid = 1'b1; + ff_counter_d = '0; + ff_state_d = FF_ACTIVE; + if(ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1) && start_i == 1'b1) begin // get inputs from new instruction + ff_it_counter_d = '0; ff_row_counter_d = '0; - if(ff_k_counter_q == (K-1)) begin - ff_k_counter_d = '0; - ff_it_counter_d = ff_it_counter_q + 1; + ff_k_counter_d = '0; + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end else 
begin + if(ff_row_counter_q == RegLastRow-1) begin + ff_row_counter_d = '0; + if(ff_k_counter_q == (K-1)) begin + ff_k_counter_d = '0; + ff_it_counter_d = ff_it_counter_q + 1; + end else begin + ff_k_counter_d = ff_k_counter_q + 1; + end end else begin - ff_k_counter_d = ff_k_counter_q + 1; + ff_row_counter_d = ff_row_counter_q + 1; end - end else begin - ff_row_counter_d = ff_row_counter_q + 1; end end @@ -317,7 +324,7 @@ module quadrilatero_systolic_array #( unique case(fs_state_q) FS_IDLE: begin fs_counter_d = '0; - if(ff_state_q == FF_DONE ) begin + if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin fs_state_d = FS_ACTIVE; acc_fs_d = acc_reg_q; @@ -329,18 +336,18 @@ module quadrilatero_systolic_array #( if(clear == 1'b1) begin fs_counter_d = '0; fs_state_d = FS_IDLE; - end else begin + end else begin if(fs_counter_q == LastRow-2) begin fs_counter_d = fs_counter_q + 1; fs_state_d = FS_LAST; end else begin fs_counter_d = fs_counter_q + 1; - end + end end end FS_LAST: begin fs_counter_d = '0; - if(ff_state_q == FF_DONE) begin //stay in active mode, load new inputs + if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin //stay in active mode, load new inputs fs_state_d = FS_ACTIVE; acc_fs_d = acc_reg_q; @@ -348,6 +355,10 @@ module quadrilatero_systolic_array #( end if(ff_state_q == FF_IDLE) begin fs_state_d = FS_IDLE; + end else begin + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + fs_state_d = FS_IDLE; end end @@ -370,6 +381,9 @@ module quadrilatero_systolic_array #( unique case(dr_state_q) DR_IDLE: begin dr_counter_d = '0; + dr_k_counter_d = '0; + dr_it_counter_d = '0; + dr_row_counter_d = '0; if(fs_state_q == FS_LAST) begin //fs_counter_d == LastRow ) && (fs_counter_q == LastRow - 1 dr_state_d = DR_ACTIVE; dest_reg_d = acc_fs_q; @@ -413,8 +427,8 @@ module quadrilatero_systolic_array #( if(fs_state_q == FS_IDLE) begin dr_state_d = DR_DONE; end - end - end else begin + end + end else begin dr_counter_d = dr_counter_q + 1; end end @@ -564,7 +578,8 @@ module 
quadrilatero_systolic_array #( end end - assign sa_ready_o = (ff_counter_d=='0) & (((ff_state_q != FF_IDLE) &~ ff_counter_q=='0) | clear); + assign sa_ready_o = (ff_counter_d=='0) & (((ff_state_q != FF_IDLE) &~ ff_counter_q=='0 + && (ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1))) | clear); assign sa_input_id_o = id_ff_q ; assign sa_output_id_o = id_dr_q ; assign finished_o = finished_q ; From ff83ca19c3675041fe85d971423916dc2b614303 Mon Sep 17 00:00:00 2001 From: Angelo Nujic Date: Sun, 13 Apr 2025 16:15:52 +0200 Subject: [PATCH 11/18] 8x8 RF working for one MACC --- do_sim.sh | 4 + .../quadrilatero/rtl/quadrilatero_lsu.sv | 9 +- .../quadrilatero/rtl/quadrilatero_lsu_new.sv | 318 ++++++++++++++++++ .../rtl/quadrilatero_register_lsu.sv | 11 +- .../rtl/quadrilatero_systolic_array.sv | 128 ++++--- sw/applications/quadrilatero_easy_8x8/main.c | 306 +++++++++++++++++ .../quadrilatero_easy_8x8/matrixMul32i.h | 44 +++ 7 files changed, 767 insertions(+), 53 deletions(-) create mode 100644 do_sim.sh create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_new.sv create mode 100644 sw/applications/quadrilatero_easy_8x8/main.c create mode 100644 sw/applications/quadrilatero_easy_8x8/matrixMul32i.h diff --git a/do_sim.sh b/do_sim.sh new file mode 100644 index 000000000..b82ec5392 --- /dev/null +++ b/do_sim.sh @@ -0,0 +1,4 @@ +make questasim-sim FUSESOC_PARAM="--X_EXT=1" +cd ./build/openhwgroup.org_systems_core-v-mini-mcu_0/sim-modelsim/ +make run-gui PLUSARGS="c firmware=../../../sw/build/main.hex" +cd ../../.. 
\ No newline at end of file diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv index a7780d12a..f6931f653 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv @@ -28,6 +28,7 @@ module quadrilatero_lsu #( output logic busy_o , // lsu available output logic terminate_o , // lsu done input logic last_i, + input logic access_counter_match_i, // Address input logic [ 31:0] src_ptr_i , // base address @@ -131,7 +132,7 @@ module quadrilatero_lsu #( end always_comb begin : ctrl_block - terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING)); + terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING) && (!last_i || write_i)); // !last_i only for RLEN/LLEN = 2, in other cases work with access counter load_fifo_valid_o = rd_valid_d; busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; terminate_o = terminate; @@ -140,8 +141,8 @@ module quadrilatero_lsu #( always_comb begin : addr_block src_ptr_inc = DATA_WIDTH / 8; addr_op2 = (cols_q == '0) ? stride_i : src_ptr_inc; - addr = (start_i || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1))) ? src_ptr_i : ptr_q + addr_op2; - ptr_d = (data_gnt_i && data_req_o) ? addr : ptr_q; + addr = (start_i) ? ((src_ptr_i == ptr_q) ? ptr_q + addr_op2 : src_ptr_i) : ((write_i && !data_we_q)? ptr_q : ptr_q + addr_op2); //what happens when 2 loads don't load from subsequent addresses? + ptr_d = ((data_gnt_i && data_req_o) || start_i) ? addr : ptr_q; end always_comb begin : counters_block @@ -231,7 +232,7 @@ module quadrilatero_lsu #( rd_valid_d = (rvalid & ~rd_valid_q) ? 1'b1 : (load_fifo_output_pop_i & - load_fifo_empty & ~rvalid) ? 1'b0 : rd_valid_q; + load_fifo_empty & ~data_gnt_i) ? 1'b0 : rd_valid_q; rd_head_d = (load_fifo_output_pop_i & load_fifo_empty & rvalid) || (rvalid & ~rd_valid_q) ? 
load_fifo_input : diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_new.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_new.sv new file mode 100644 index 000000000..1614ea3d3 --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_new.sv @@ -0,0 +1,318 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +module quadrilatero_lsu #( + parameter int unsigned FIFO_DEPTH = quadrilatero_pkg::MESH_WIDTH, + parameter int unsigned DATA_WIDTH = 32 + +) ( + input logic clk_i , + input logic rst_ni , + + // Bus interface + output logic data_req_o , + output logic [ 31:0] data_addr_o , + output logic data_we_o , + output logic [DATA_WIDTH/8 - 1:0] data_be_o , + output logic [ DATA_WIDTH-1:0] data_wdata_o , + input logic data_gnt_i , + input logic data_rvalid_i , + input logic [ DATA_WIDTH-1:0] data_rdata_i , + + // Configuration + input logic start_i , // start transfer (MUST BE A PULSE!!!!!) + input logic write_i , // write transaction + output logic busy_o , // lsu available + output logic terminate_o , // lsu done + input logic last_i, + input logic access_counter_match_i, + + // Address + input logic [ 31:0] src_ptr_i , // base address + input logic [ 31:0] stride_i , // stride to move in memory from one row to the next one + input logic [ 31:0] rows_i , // how many rows we need to fetch + input logic [ 31:0] cols_i , + + // Output data + output logic [ DATA_WIDTH-1:0] load_fifo_output_o , + output logic load_fifo_valid_o , + output logic load_fifo_data_available_o , + input logic load_fifo_output_pop_i , + + // Input data + input logic [ DATA_WIDTH-1:0] store_fifo_input_i , + input logic store_fifo_push_i , + output logic store_fifo_space_available_o, + output logic store_fifo_empty_o + + +); + + localparam int unsigned DEPTH = (FIFO_DEPTH > 0) ? 
FIFO_DEPTH - 1 : 0; + localparam int unsigned Addr_Fifo_Depth = (FIFO_DEPTH > 1) ? $clog2(FIFO_DEPTH) : 1; + localparam int unsigned LastFifoUsage = DEPTH - 1; + + + logic terminate ; + + logic [ 31:0] rows_q ; + logic [ 31:0] rows_d ; + logic [ 31:0] cols_q ; + logic [ 31:0] cols_d ; + logic [ 31:0] src_ptr_inc ; + logic [ 31:0] addr ; + logic [ 31:0] addr_op2 ; + logic [ 31:0] ptr_q ; + logic [ 31:0] ptr_d ; + + logic data_in_req ; + logic data_in_we ; + logic [ DATA_WIDTH/8-1:0] data_in_be ; + logic [ 31:0] data_in_addr ; + logic data_in_rvalid ; + logic [ DATA_WIDTH-1:0] data_in_rdata ; + + logic [ DATA_WIDTH-1:0] load_fifo_input ; + logic [ DATA_WIDTH-1:0] load_fifo_data_out; + logic rd_valid_q ; + logic rd_valid_d ; + logic [ DATA_WIDTH-1:0] rd_head_q ; + logic [ DATA_WIDTH-1:0] rd_head_d ; + logic data_we_q ; + logic data_we_d ; + logic rvalid ; + logic load_fifo_pop ; + logic load_fifo_push ; + logic [Addr_Fifo_Depth-1:0] load_fifo_usage ; + logic load_fifo_alm_full; + logic load_fifo_full ; + logic load_fifo_empty ; + + logic data_out_req ; + logic data_out_we ; + logic [ DATA_WIDTH/8-1:0] data_out_be ; + logic [ 31:0] data_out_addr ; + logic data_out_gnt ; + logic [ DATA_WIDTH-1:0] data_out_wdata ; + + logic store_fifo_full ; + logic store_fifo_empty ; + logic [ DATA_WIDTH-1:0] store_fifo_output ; + logic store_fifo_pop ; + logic last_q ; + logic last_d ; + + + enum { + LSU_READY, + LSU_RUNNING + } + lsu_state_q, lsu_state_d; + + + assign last_d = last_i; + always_comb begin : FSM_block + lsu_state_d = lsu_state_q; + + case (lsu_state_q) + LSU_READY: begin + if (start_i & |cols_i & |rows_i) begin + lsu_state_d = LSU_RUNNING; + end + end + LSU_RUNNING: begin + if (terminate && !start_i && (store_fifo_empty)) begin + lsu_state_d = LSU_READY; + end + end + endcase + end + + always_comb begin : ctrl_block + terminate = ((rows_q == 'b1 && last_i && !write_i) + && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING)); //absolutely ugly + 
load_fifo_valid_o = rd_valid_d; + busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; + terminate_o = terminate; + end + + always_comb begin : addr_block + src_ptr_inc = DATA_WIDTH / 8; + addr_op2 = (cols_q == '0) ? stride_i : src_ptr_inc; + addr = (start_i) ? src_ptr_i : ptr_q + addr_op2; // || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1)) && (access_counter_match_i == 1'b0) + ptr_d = (data_gnt_i && data_req_o) ? addr : ptr_q; + end + + always_comb begin : counters_block + rows_d = rows_q; + cols_d = cols_q; + + if(start_i) begin + if(data_gnt_i && data_req_o) begin + if(cols_i > 1) begin + rows_d = rows_i - 1; + cols_d = cols_i - 2; + end else if (rows_i > 1) begin + rows_d = rows_i - 1; + cols_d = cols_i - 1; + end + end else begin + rows_d = rows_i - 1; + cols_d = cols_i - 1; + end + end else if (data_gnt_i && data_req_o && last_i) begin + if (cols_q > 0) cols_d = cols_q - 1; + else if (rows_q > 0) begin + cols_d = cols_i - 1; + rows_d = rows_q - 1; + end + end + end + + always_comb begin : read_obi + data_in_req = '0; + data_in_we = '0; + data_in_be = '0; + data_in_addr = '0; + + if (load_fifo_full == 1'b0 && load_fifo_alm_full == 1'b0) begin + data_in_req = ~write_i & (start_i | lsu_state_q == LSU_RUNNING); + data_in_we = 1'b0 ; + data_in_be = '1 ; + data_in_addr = addr ; + end + end + + always_comb begin : write_obi + data_out_req = '0 ; + data_out_we = '0 ; + data_out_be = '0 ; + data_out_addr = '0 ; + data_out_wdata = store_fifo_output; + + if (!store_fifo_empty) begin + data_out_req = start_i | lsu_state_q == LSU_RUNNING; + // data_out_we = 1'b1 ; + data_out_we = start_i | lsu_state_q == LSU_RUNNING; + data_out_be = '1 ; + data_out_addr = addr ; + end + end + + always_comb begin : obi_channel_signals + data_in_rvalid = 1'b0 ; + data_wdata_o = data_out_wdata; + data_out_gnt = data_gnt_i ; + data_in_rdata = data_rdata_i ; + + if(store_fifo_empty) begin // read transaction active + data_req_o = data_in_req ; + data_we_o = data_in_we ; + data_be_o = 
data_in_be ; + data_addr_o = data_in_addr ; + data_in_rvalid = data_rvalid_i ; + end else begin // write transaction active + data_req_o = data_out_req ; + data_we_o = data_out_we ; + data_be_o = data_out_be ; + data_addr_o = data_out_addr ; + end + end + + always_comb begin : load_fifo_block + data_we_d = data_gnt_i && data_req_o && data_we_o; + rvalid = data_in_rvalid &~ data_we_q ; + + load_fifo_alm_full = (load_fifo_usage == LastFifoUsage[Addr_Fifo_Depth-1:0]); + load_fifo_input = data_in_rdata; + load_fifo_push = (rvalid & rd_valid_q & ~load_fifo_output_pop_i) | (rvalid & ~load_fifo_empty); + load_fifo_pop = load_fifo_output_pop_i & ~load_fifo_empty; + + rd_valid_d = (rvalid & ~rd_valid_q) ? 1'b1 : + (load_fifo_output_pop_i & + load_fifo_empty & ~rvalid) ? 1'b0 : rd_valid_q; + + rd_head_d = (load_fifo_output_pop_i & load_fifo_empty & rvalid) || + (rvalid & ~rd_valid_q) ? load_fifo_input : + (load_fifo_output_pop_i & ~load_fifo_empty) ? load_fifo_data_out : rd_head_q; + + load_fifo_output_o = rd_head_q ; + load_fifo_data_available_o = rd_valid_q; + end + + always_comb begin : store_fifo_block + store_fifo_pop = data_out_gnt & data_out_req; + store_fifo_empty_o = store_fifo_empty; + store_fifo_space_available_o = ~store_fifo_full; + end + + fifo_v3 #( + .FALL_THROUGH (1'b0 ), + .DEPTH (DEPTH ), + .DATA_WIDTH (DATA_WIDTH ) + ) load_lsu_fifo_i ( + .clk_i , + .rst_ni , + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + + // status flags + .full_o (load_fifo_full ), + .empty_o (load_fifo_empty ), + .usage_o (load_fifo_usage ), + + // as long as the queue is not full we can push new data + .data_i (load_fifo_input ), + .push_i (load_fifo_push ), + + // as long as the queue is not empty we can pop new elements + .data_o (load_fifo_data_out ), + .pop_i (load_fifo_pop ) + ); + + fifo_v3 #( + .DEPTH(FIFO_DEPTH), + .DATA_WIDTH(DATA_WIDTH) + ) store_lsu_fifo_i ( + .clk_i , + .rst_ni , + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + // status flags + .full_o (store_fifo_full ), + 
.empty_o (store_fifo_empty ), + .usage_o ( ), + // as long as the queue is not full we can push new data + .data_i (store_fifo_input_i ), + .push_i (store_fifo_push_i ), + // as long as the queue is not empty we can pop new elements + .data_o (store_fifo_output ), + .pop_i (store_fifo_pop ) + ); + + always_ff @(posedge clk_i, negedge rst_ni) begin : seq_block + if (~rst_ni) begin + lsu_state_q <= LSU_READY; + ptr_q <= '0 ; + last_q <= '0 ; + rows_q <= '0 ; + cols_q <= '0 ; + rd_head_q <= '0 ; + rd_valid_q <= '0 ; + data_we_q <= '0 ; + end else begin + lsu_state_q <= lsu_state_d; + ptr_q <= ptr_d ; + last_q <= last_d ; + rows_q <= rows_d ; + cols_q <= cols_d ; + rd_head_q <= rd_head_d ; + rd_valid_q <= rd_valid_d ; + data_we_q <= data_we_d ; + end + end + +endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index aad3f7463..7fda92966 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -125,8 +125,8 @@ module quadrilatero_register_lsu #( logic [$clog2(NumAccesses)-1:0] access_counter_d; logic [$clog2(NumAccesses)-1:0] access_counter_q; - logic [quadrilatero_pkg::RLEN-1:0] load_row_buffer_d; - logic [quadrilatero_pkg::RLEN-1:0] load_row_buffer_q; + logic [(quadrilatero_pkg::RLEN-LLEN)-1:0] load_row_buffer_d; + logic [(quadrilatero_pkg::RLEN-LLEN)-1:0] load_row_buffer_q; logic [quadrilatero_pkg::RLEN-1:0] store_mask; logic [quadrilatero_pkg::RLEN-1:0] load_mask; @@ -145,7 +145,7 @@ module quadrilatero_register_lsu #( waddr_o = lsu_state_q == LSU_IDLE? 
waddr_d : waddr_q; wrowaddr_o = counter_q ; load_row_buffer_d = (load_row_buffer_q & ~load_mask) | (load_fifo_data << (LLEN * access_counter_q)); - wdata_o = load_row_buffer_d & ~data_mask; //watch out with load_row_buffer_d instead of load_row_buffer_q + wdata_o = {load_fifo_data, load_row_buffer_q} & ~data_mask; //watch out with load_row_buffer_d instead of load_row_buffer_q end @@ -164,7 +164,7 @@ module quadrilatero_register_lsu #( start = (start_i | start_q) & lsu_ready; busy_o = (write_i ? busy_d : busy) | start_q; - stride = (start) ? stride_i : stride_q; + stride = (start) ? (stride_i / NumAccesses) : stride_q; src_ptr = (start) ? address_i : src_ptr_q; end @@ -175,7 +175,7 @@ module quadrilatero_register_lsu #( start_d = start ? 1'b0 : (start_q | start_i) ? 1'b1 : start_q; - stride_d = (start) ? stride_i : stride_q ; + stride_d = (start) ? (stride_i / NumAccesses) : stride_q ; src_ptr_d = (start) ? address_i : src_ptr_q; busy_d = (write_i && (counter_q == LastRow) && rdata_valid_i && rlast_o) ? 
1'b0 : @@ -382,6 +382,7 @@ module quadrilatero_register_lsu #( .busy_o (busy ), .terminate_o (terminate ), .last_i (wlast_o | rlast_o), + .access_counter_match_i (access_counter_d == access_counter_q), // Address .src_ptr_i (src_ptr ), diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index 7912a6456..e0c43fff0 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -111,6 +111,7 @@ module quadrilatero_systolic_array #( logic [$clog2(K)-1:0] dr_row_counter_d; logic [$clog2(K)-1:0] dr_row_counter_q; logic [$clog2(quadrilatero_pkg::RLEN/DATA_WIDTH)-1:0] weight_base_row; + logic last_dr_write; // Data Masks logic [quadrilatero_pkg::RLEN-1:0] data_mask; @@ -119,11 +120,15 @@ module quadrilatero_systolic_array #( //logic [quadrilatero_pkg::RLEN-1:0] res_mask; logic [ALEN-1:0] data_rdata_masked; + logic [quadrilatero_pkg::RLEN-1:0] data_rdata_shifted; logic [ALEN-1:0] weight_rdata_masked; + logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_shifted; logic [ALEN-1:0] acc_rdata_masked; + logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_shifted; logic [ALEN-1:0] res_wdata_partial; - logic [quadrilatero_pkg::RLEN-1:0] res_wdata_buffer_d; - logic [quadrilatero_pkg::RLEN-1:0] res_wdata_buffer_q; + logic [quadrilatero_pkg::RLEN-1:0] res_rdata_shifted; + logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0] res_wdata_buffer_d; + logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0]res_wdata_buffer_q; logic valid ; @@ -176,10 +181,11 @@ module quadrilatero_systolic_array #( always_comb begin: rf_block // Weight Read Register Port weight_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); - weight_base_row = N_ROWS * ff_it_counter_q; + weight_base_row = (quadrilatero_pkg::RLEN / DATA_WIDTH) - (N_ROWS * (ff_it_counter_q + 1)); weight_raddr_o = weight_reg_q ; - weight_rrowaddr_o = ff_counter_q + 
weight_base_row; - weight_rdata_masked = (weight_rdata_i & weight_mask) >> ALEN * ff_k_counter_q; //TODO fix + weight_rrowaddr_o = ff_counter_q + weight_base_row; + weight_rdata_shifted = (weight_rdata_i >> ALEN * ff_k_counter_q); + weight_rdata_masked = weight_rdata_shifted[ALEN-1:0]; //TODO fix weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_k_counter_q == (K-1) && ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? @@ -187,7 +193,8 @@ module quadrilatero_systolic_array #( data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} << (ALEN * ff_it_counter_q); // TODO fix data_raddr_o = data_reg_q ; data_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - data_rdata_masked = (data_rdata_i & data_mask) >> ALEN * ff_it_counter_q ; //TODO fix + data_rdata_shifted = (data_rdata_i << ALEN * ff_it_counter_q); + data_rdata_masked = data_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN] ; //TODO fix data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; data_rlast_o = ff_state_q != FF_IDLE && (ff_it_counter_q == (K-1) && ff_k_counter_q == (K-1)) ; @@ -195,19 +202,32 @@ module quadrilatero_systolic_array #( acc_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); acc_raddr_o = acc_reg_q ; acc_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; + acc_rdata_shifted = (acc_rdata_i >> ALEN * ff_k_counter_q); + res_rdata_shifted = (res_wdata_o >> ALEN * ff_k_counter_q); acc_rdata_masked = sa_input_id_o == sa_output_id_o && acc_rrowaddr_o == res_wrowaddr_o? 
- (res_wdata_o >> ALEN * ff_k_counter_q)[ALEN-1:0] : (acc_rdata_i >> ALEN * ff_k_counter_q)[ALEN-1:0]; //TODO fix + res_rdata_shifted[ALEN-1:0] : acc_rdata_shifted[ALEN-1:0]; //TODO fix acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; acc_rlast_o = '0 ; // Accumulator Out Write Register Port res_waddr_o = dest_reg_q ; res_wrowaddr_o = dr_counter_q + (dr_row_counter_q * N_ROWS) ; - res_wdata_o = res_wdata_buffer_q | res_wdata_partial << ALEN * dr_k_counter_q; - res_we_o = (dr_state_q == DR_ACTIVE) &~ mask_req ; + res_wdata_o = {res_wdata_partial, res_wdata_buffer_q[dr_counter_q]}; + res_we_o = (dr_state_q == DR_ACTIVE || last_dr_write == 1'b1) &~ mask_req; res_wlast_o = (dr_state_q != DR_IDLE) && (dr_it_counter_q == (K-1) && dr_k_counter_q == (K-1)); end + always_comb begin : weight_buffer_block + res_wdata_buffer_d = res_wdata_buffer_q; + if(dr_state_q != DR_IDLE) begin + if(dr_k_counter_q == K-1) begin + res_wdata_buffer_d[dr_counter_q] = '0; //resetting the buffer + end else begin + res_wdata_buffer_d[dr_counter_q] = res_wdata_buffer_q[dr_counter_q] | (res_wdata_partial << ALEN*dr_k_counter_q); + end + end + end + always_comb begin: finished_signal finished_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? 
1'b1 : @@ -218,14 +238,13 @@ module quadrilatero_systolic_array #( end always_comb begin: ctrl_block - res_wdata_buffer_d = res_wdata_buffer_q; valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q != DR_ACTIVE)) begin + if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q == DR_IDLE)) begin clear = 1'b1; end else begin clear = 1'b0; end - if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q == DR_ACTIVE)) begin + if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q != DR_IDLE)) begin pump = 1'b1; end else begin pump = 1'b0; @@ -289,16 +308,16 @@ module quadrilatero_systolic_array #( sa_ctrl_d = sa_ctrl_i; id_ff_d = id_i; end else begin - if(ff_row_counter_q == RegLastRow-1) begin - ff_row_counter_d = '0; - if(ff_k_counter_q == (K-1)) begin - ff_k_counter_d = '0; + if(ff_k_counter_q == K-1) begin + ff_k_counter_d = '0; + if(ff_row_counter_q == (RegLastRow-1)) begin + ff_row_counter_d = '0; ff_it_counter_d = ff_it_counter_q + 1; end else begin - ff_k_counter_d = ff_k_counter_q + 1; + ff_row_counter_d = ff_row_counter_q + 1; end end else begin - ff_row_counter_d = ff_row_counter_q + 1; + ff_k_counter_d = ff_k_counter_q + 1; end end end @@ -375,6 +394,7 @@ module quadrilatero_systolic_array #( dr_k_counter_d = dr_k_counter_q; dr_it_counter_d = dr_it_counter_q; dr_row_counter_d = dr_row_counter_q; + last_dr_write = 1'b0; dest_reg_d = dest_reg_q; id_dr_d = id_dr_q; @@ -396,46 +416,66 @@ module quadrilatero_systolic_array #( dr_counter_d = '0; dr_state_d = DR_IDLE; end else begin - if(dr_counter_q == LastRow) begin - if(res_wready_i == 1'b0) begin - dr_state_d = dr_state_q; //stall the pipeline - end else begin + if(res_wready_i == 1'b0) begin + dr_state_d = dr_state_q; + end else begin + if(dr_counter_q == LastRow) begin + dr_counter_d = '0; - if(fs_state_q == FS_LAST) begin //stay in the active 
mode, load new inputs (fs_counter_d == LastRow - 1 ) && (fs_counter_q == LastRow - 2) - dr_state_d = DR_ACTIVE; - dest_reg_d = acc_fs_q; - id_dr_d = id_fs_q; - //update DR counters - if(dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow - 1) && dr_k_counter_q == (K-1)) begin - dr_it_counter_d = '0; - dr_row_counter_d = '0; + //update DR counters + if(dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow - 1) && dr_k_counter_q == (K-1)) begin + dr_it_counter_d = '0; + dr_row_counter_d = '0; + dr_k_counter_d = '0; + end else begin + if(dr_k_counter_q == K-1) begin dr_k_counter_d = '0; - end else begin - if(dr_row_counter_q == RegLastRow-1) begin + if(dr_row_counter_q == (RegLastRow-1)) begin dr_row_counter_d = '0; - if(dr_k_counter_q == (K-1)) begin - dr_k_counter_d = '0; - dr_it_counter_d = dr_it_counter_q + 1; - end else begin - dr_k_counter_d = dr_k_counter_q + 1; - end + dr_it_counter_d = dr_it_counter_q + 1; end else begin dr_row_counter_d = dr_row_counter_q + 1; end + end else begin + dr_k_counter_d = dr_k_counter_q + 1; end end - if(fs_state_q == FS_IDLE) begin - dr_state_d = DR_DONE; + if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs (fs_counter_d == LastRow - 1 ) && (fs_counter_q == LastRow - 2) + dr_state_d = DR_ACTIVE; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + if(fs_state_q == FS_IDLE) begin + dr_state_d = DR_DONE; end - end - end else begin - dr_counter_d = dr_counter_q + 1; + end else begin + dr_counter_d = dr_counter_q + 1; + end end end end DR_DONE: begin - dr_state_d = DR_IDLE; + if((fs_state_q == FS_IDLE && dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow-1) && dr_k_counter_q == (K-1))) begin + last_dr_write = 1'b1; + if(res_wready_i == 1'b0) begin + dr_state_d = dr_state_q; //stall + end else begin + dr_state_d = DR_DONE; + if(dr_counter_q == LastRow) begin + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + dr_it_counter_d = '0; + dr_row_counter_d = '0; + dr_k_counter_d = '0; + dr_state_d 
= DR_IDLE; + end else begin + dr_counter_d = dr_counter_q + 1; + end + end + end else begin + dr_state_d = DR_IDLE; + end end default: begin dr_state_d = DR_IDLE; diff --git a/sw/applications/quadrilatero_easy_8x8/main.c b/sw/applications/quadrilatero_easy_8x8/main.c new file mode 100644 index 000000000..ee82d32e9 --- /dev/null +++ b/sw/applications/quadrilatero_easy_8x8/main.c @@ -0,0 +1,306 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +/*Variable Data Type*/ +// Supported types: int32_t (0), float (1), int8_t (2), int16_t (3) +#define TYPE 0 + +/* Output tile size */ +// Supported values: 16 (4x4), 64 (8x8). +#define OUTPUT_TILE_SIZE 64 + +/*Register Length*/ +// Supported values: 128 (4x4), 256 (8x8) +#define RLEN 128 + +/* By default, printfs are deactivated. */ +#define PRINTF_IN_FPGA 0 +#define PRINTF_IN_SIM 1 + +/* VCD Files generation */ +// Supported Values: 0 (No), 1 (Yes) +// #define VCD_ENABLE 0 + +// ************************************************************************************************************ +// ***************************** ***************************** +// ***************************** DO NOT TOUCH LINES BELOW ! 
***************************** +// ***************************** ***************************** +// ************************************************************************************************************ + +/* Includes */ +#include +#include +#include "csr.h" +#include "x-heep.h" +#include "gpio.h" +// #include "vcd_util.h" + +/* Define Datatype and set of data */ +#if TYPE == 0 + #include "matrixMul32i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 1 + #define HEAD_LINE "mmasa.w" + #define SIMD_SHIFT 2 + typedef int32_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#elif TYPE == 1 + #include "matrixMul32f.h" + #define FS_INITIAL 0x1 + #define SIMD_FACTOR 1 + #define HEAD_LINE "fmmacc.s" + #define SIMD_SHIFT 2 + typedef float DATA_IN_t ; + typedef float DATA_OUT_t; +#elif TYPE == 2 + #include "matrixMul8i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 4 + #define HEAD_LINE "mmaqa.b" + #define SIMD_SHIFT 0 + typedef int8_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#elif TYPE == 3 + #include "matrixMul16i.h" + #define FS_INITIAL 0x0 + #define SIMD_FACTOR 2 + #define HEAD_LINE "mmada.h" + #define SIMD_SHIFT 1 + typedef int16_t DATA_IN_t ; + typedef int32_t DATA_OUT_t; +#else +#endif + +/* Declare functions and global variables */ +DATA_OUT_t __attribute__((section(".xheep_data_interleaved"))) matrix_C[SIZE*SIZE]; +void __attribute__ ((noinline)) matrixMul_easy(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +int float_condition(int index); +int int_condition(int index); +uint32_t check_results(int K, int N, int M); +void print_matrix(DATA_OUT_t* matrix, int K, int N); + + +/* Select print mode */ + +#if TARGET_SIM && PRINTF_IN_SIM + #define PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#elif TARGET_PYNQ_Z2 && PRINTF_IN_FPGA + #define PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#else + #define PRINTF(...) 
+#endif + +/* Select kernel */ +#if OUTPUT_TILE_SIZE == 16 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_4x4((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#elif OUTPUT_TILE_SIZE == 64 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_8x8((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#elif OUTPUT_TILE_SIZE == 0 + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) matrixMul_CPU((DATA_IN_t*)addrA, (DATA_IN_t*)addrB, (DATA_OUT_t*)addrC, (int) K, (int) N, (int) M, (int) shift) +#else + #define MATRIX_MUL(addrA, addrB, addrC, K, N, M, shift) +#endif + + +/* Select check condition */ +#if FS_INITIAL == 0x1 + #define CHECK_CONDITION(index) float_condition((int) index) +#elif FS_INITIAL == 0x0 + #define CHECK_CONDITION(index) int_condition ((int) index) +#else +#endif + +/* VCD Functions */ +#if VCD_ENABLE == 1 + #define VCD_START() vcd_init(); vcd_enable() + #define VCD_STOP() vcd_disable() +#else + #define VCD_START() + #define VCD_STOP() +#endif + +/* Matrices */ +#define MAT_A matrix_A +#define MAT_B matrix_BT +#define MAT_C matrix_C +#define MAT_EXP matrix_EXP + +#define MACC(HEAD,__mat1__, __mat2__, __mat3__) HEAD " m" #__mat1__", m"#__mat2__", m"#__mat3__ +// ------------------------------------------------------------------------------------------------------------------------------------- + + +int main() +{ + uint32_t errors = 0; + unsigned int cycles; + + // Save the address of the matrices + DATA_IN_t* addrA = MAT_A; + DATA_IN_t* addrB = MAT_B; + DATA_OUT_t* addrC = MAT_C; + + int K_size = SIZE/SIMD_FACTOR; + int N_size = SIZE ; + int M_size = SIZE ; + + //enable FP operations + CSR_SET_BITS(CSR_REG_MSTATUS, (FS_INITIAL << 13)); + + //start mcycle csr + CSR_CLEAR_BITS(CSR_REG_MCOUNTINHIBIT, 0x1); + CSR_WRITE(CSR_REG_MCYCLE, 0); + + //execute the kernel + // vcd_init(); + // vcd_enable(); + VCD_START(); + 
matrixMul_easy(addrA,addrB,addrC,K_size,N_size,M_size,SIMD_SHIFT); + VCD_STOP(); + // vcd_disable(); + + //read mcycle csr + CSR_READ(CSR_REG_MCYCLE, &cycles); + + //check results + errors = check_results(K_size,N_size,M_size); + + PRINTF("program finished with %d errors and %d cycles\n\r", errors, cycles); + PRINTF("MATRIX C:\n\r"); + print_matrix(addrC, M_size, N_size); + PRINTF("MATRIX EXP:\n\r"); + print_matrix((DATA_OUT_t *) MAT_EXP, M_size, N_size); + return errors; +} + + +// ------------------------------------------------------------------------------------------------------------------------------------- + +void __attribute__ ((noinline)) matrixMul_easy(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + asm volatile("addi sp, sp, -0x30 " ); // + asm volatile("sw s0 , 0x2c(sp) " ); // + asm volatile("sw s1 , 0x28(sp) " ); // + asm volatile("sw s2 , 0x24(sp) " ); // + asm volatile("sw s3 , 0x20(sp) " ); // + asm volatile("sw s4 , 0x1c(sp) " ); // + asm volatile("sw s5 , 0x18(sp) " ); // + asm volatile("sw s6 , 0x14(sp) " ); // + asm volatile("sw s7 , 0x10(sp) " ); // + asm volatile("sw s8 , 0x0c(sp) " ); // + asm volatile("sw s9 , 0x08(sp) " ); // + asm volatile("sw s10, 0x04(sp) " ); // + asm volatile("sw s11, 0x00(sp) " ); // + + //-------------------------------------------------------------------------------- + // asm volatile("addi a7,x0, 4 " ); // a7 = WIDTH; + asm volatile("addi a6,x0,32 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("addi t0,x0, 0 " ); // t0 = m0 =0; + asm volatile("addi s3,x0, 32 " ); // s3 = K*4; + asm volatile("slli s4,%0, 2 " :: "r" (N) ); // s4 = N*4; + + asm volatile("addi t1,x0, 0 " ); // t1 = n0 =0 + asm volatile("addi t3,t0,8 " ); // t3 = m0+WIDTH + asm volatile("mul s1,s3,t0 " ); // s1 = K*4*m0 + asm volatile("mul s2,s3,t3 " ); // s2 = K*4*(m0+WIDTH) + asm volatile("mul s0,s4,t0 " ); // s0 = N*4*m0; + asm volatile("mul s10,s4,t3 " ); // s10 = N*4*(m0+WIDTH) + asm volatile("add s1,%0,x0 
" :: "r" (addrA) ); // s1 = startAddrA0 = addrA + K*4*m0 + asm volatile("add s2,%0,x0 " :: "r" (addrA) ); // s2 = startAddrA1 = addrA + K*4*(m0+WIDTH) + asm volatile("add s0,%0,x0 " :: "r" (addrC) ); // s0 = startAddrC0x = addrC + N*4*m0 + asm volatile("add s10,%0,x0 " :: "r" (addrC) ); // s10 = startAddrC1x = addrC + N*4*(m0+WIDTH) + + asm volatile("addi t4,t1,8 " ); // t4 = n0+WIDTH; + asm volatile("addi t2,x0,16 " ); // t2 = k0 = 16; + asm volatile("slli t5,t1, 2 " ); // t5 = n0*4; + asm volatile("mld.w m0, (s1) , s3 " ); // m0 = A[s1] + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mul s9,s3,t1 " ); // s9 = K*4*n0; + asm volatile("add s9 ,%0,s9 " :: "r" (addrB) ); // s9 = startAddrB0 = addrB + K*4*n0 + asm volatile("mld.w m1, (s9) , a6 " ); // m1 = B[s9] + asm volatile("mul s11,s3,t4 " ); // s11 = K*4*(n0+WIDTH); + asm volatile(MACC(HEAD_LINE,4,1,0) ); // m4 += m1 * m0 + asm volatile("add s11,%0,s11 " :: "r" (addrB) ); // s11 = startAddrB1 = addrB + K*4*(n0+WIDTH) + asm volatile("add s6,t5,0 " ); // s6 = startAddrC00 += n0*4 + asm volatile("mst.w m4, (s0) , s4 " ); // m4 -> (s6) + asm volatile("slli t6,t4, 2 " ); // t6 = (n0+WIDTH)*4; + asm volatile("add s5,t6,s0 " ); // s5 = startAddrC01 += (n0+WIDTH)*4 + + asm volatile("add t0,t0, 16 " ); // t0 = m0 +=2*WIDTH; + //-------------------------------------------------------------------------------- + + asm volatile("lw s0 , 0x2c(sp) " ); // + asm volatile("lw s1 , 0x28(sp) " ); // + asm volatile("lw s2 , 0x24(sp) " ); // + asm volatile("lw s3 , 0x20(sp) " ); // + asm volatile("lw s4 , 0x1c(sp) " ); // + asm volatile("lw s5 , 0x18(sp) " ); // + asm volatile("lw s6 , 0x14(sp) " ); // + asm volatile("lw s7 , 0x10(sp) " ); // + asm volatile("lw s8 , 0x0c(sp) " ); // + asm volatile("lw s9 , 0x08(sp) " ); // + asm volatile("lw s10, 0x04(sp) " ); // + asm volatile("lw s11, 0x00(sp) " ); // + asm volatile("addi sp, sp, 0x30 " ); // + +} + + + + + +void __attribute__ ((noinline)) matrixMul_CPU(DATA_IN_t* 
addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) +{ + for(int i=0;i= 0); + return (diff > 0.001f); +} + +int int_condition(int index){ + return (MAT_C[index] != MAT_EXP[index]); +} + +uint32_t check_results(int K, int N, int M) +{ + // check + int i, j; + uint32_t err = 0; + + // Check errors + for(i = 0; i < M; i++) { + for(j = 0; j < N; j++) { + if(CHECK_CONDITION(i*N+j)) { + err ++; + PRINTF("Error at index %d, %d, expected %x, got %x\n\r", i, j, MAT_EXP[i*N+j], MAT_C[i*N+j]); + } + } + } + + return err; +} + +void print_matrix(DATA_OUT_t* matrix, int K, int N) +{ + for(int i=0;i Date: Thu, 17 Apr 2025 10:22:54 +0200 Subject: [PATCH 12/18] 8x8 LSU bug "fixed" --- .../quadrilatero/rtl/quadrilatero_lsu.sv | 28 +- .../rtl/quadrilatero_lsu_kindofworking.sv | 317 +++++++++ .../rtl/quadrilatero_register_lsu.sv | 4 +- ...register_lsu_adapted_to_8x8_not_working.sv | 434 ++++++++++++ .../rtl/quadrilatero_systolic_array.sv | 28 +- ...quadrilatero_systolic_array_wrong_order.sv | 636 ++++++++++++++++++ sw/applications/quadrilatero_easy_8x8/main.c | 64 +- .../quadrilatero_easy_8x8/matrixMul32i.h | 32 +- .../quadrilatero_matmul_16x16/main.c | 84 ++- 9 files changed, 1524 insertions(+), 103 deletions(-) create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_kindofworking.sv create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_adapted_to_8x8_not_working.sv create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_wrong_order.sv diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv index f6931f653..edbe3552b 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv @@ -5,7 +5,7 @@ // Author: Danilo Cammarata module quadrilatero_lsu #( - parameter int unsigned FIFO_DEPTH = quadrilatero_pkg::MESH_WIDTH, + parameter int unsigned FIFO_DEPTH = 4, parameter int unsigned 
DATA_WIDTH = 32 ) ( @@ -27,9 +27,8 @@ module quadrilatero_lsu #( input logic write_i , // write transaction output logic busy_o , // lsu available output logic terminate_o , // lsu done - input logic last_i, - input logic access_counter_match_i, - + input logic last_i , + // Address input logic [ 31:0] src_ptr_i , // base address input logic [ 31:0] stride_i , // stride to move in memory from one row to the next one @@ -102,8 +101,6 @@ module quadrilatero_lsu #( logic store_fifo_empty ; logic [ DATA_WIDTH-1:0] store_fifo_output ; logic store_fifo_pop ; - logic last_q ; - logic last_d ; enum { @@ -113,7 +110,6 @@ module quadrilatero_lsu #( lsu_state_q, lsu_state_d; - assign last_d = last_i; always_comb begin : FSM_block lsu_state_d = lsu_state_q; @@ -124,7 +120,7 @@ module quadrilatero_lsu #( end end LSU_RUNNING: begin - if (terminate && !start_i && (store_fifo_empty)) begin + if (terminate && !start_i) begin lsu_state_d = LSU_READY; end end @@ -132,7 +128,7 @@ module quadrilatero_lsu #( end always_comb begin : ctrl_block - terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING) && (!last_i || write_i)); // !last_i only for RLEN/LLEN = 2, in other cases work with access counter + terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING)); load_fifo_valid_o = rd_valid_d; busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; terminate_o = terminate; @@ -141,8 +137,8 @@ module quadrilatero_lsu #( always_comb begin : addr_block src_ptr_inc = DATA_WIDTH / 8; addr_op2 = (cols_q == '0) ? stride_i : src_ptr_inc; - addr = (start_i) ? ((src_ptr_i == ptr_q) ? ptr_q + addr_op2 : src_ptr_i) : ((write_i && !data_we_q)? ptr_q : ptr_q + addr_op2); //what happens when 2 loads don't load from subsequent addresses? - ptr_d = ((data_gnt_i && data_req_o) || start_i) ? addr : ptr_q; + addr = (start_i || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1))) ? 
src_ptr_i : ptr_q + addr_op2; + ptr_d = (data_gnt_i && data_req_o) ? addr : ptr_q; end always_comb begin : counters_block @@ -155,14 +151,14 @@ module quadrilatero_lsu #( rows_d = rows_i - 1; cols_d = cols_i - 2; end else if (rows_i > 1) begin - rows_d = rows_i - 1; + rows_d = rows_i - 2; cols_d = cols_i - 1; end end else begin rows_d = rows_i - 1; cols_d = cols_i - 1; end - end else if (data_gnt_i && data_req_o && last_i) begin + end else if (data_gnt_i && data_req_o) begin if (cols_q > 0) cols_d = cols_q - 1; else if (rows_q > 0) begin cols_d = cols_i - 1; @@ -232,7 +228,7 @@ module quadrilatero_lsu #( rd_valid_d = (rvalid & ~rd_valid_q) ? 1'b1 : (load_fifo_output_pop_i & - load_fifo_empty & ~data_gnt_i) ? 1'b0 : rd_valid_q; + load_fifo_empty & ~rvalid) ? 1'b0 : rd_valid_q; rd_head_d = (load_fifo_output_pop_i & load_fifo_empty & rvalid) || (rvalid & ~rd_valid_q) ? load_fifo_input : @@ -296,7 +292,6 @@ module quadrilatero_lsu #( if (~rst_ni) begin lsu_state_q <= LSU_READY; ptr_q <= '0 ; - last_q <= '0 ; rows_q <= '0 ; cols_q <= '0 ; rd_head_q <= '0 ; @@ -305,7 +300,6 @@ module quadrilatero_lsu #( end else begin lsu_state_q <= lsu_state_d; ptr_q <= ptr_d ; - last_q <= last_d ; rows_q <= rows_d ; cols_q <= cols_d ; rd_head_q <= rd_head_d ; @@ -314,4 +308,4 @@ module quadrilatero_lsu #( end end -endmodule +endmodule \ No newline at end of file diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_kindofworking.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_kindofworking.sv new file mode 100644 index 000000000..f6931f653 --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_kindofworking.sv @@ -0,0 +1,317 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +module quadrilatero_lsu #( + parameter int unsigned FIFO_DEPTH = quadrilatero_pkg::MESH_WIDTH, + parameter int unsigned DATA_WIDTH = 32 + +) ( + input logic clk_i , + input logic rst_ni , + + // Bus interface + output logic data_req_o , + output logic [ 31:0] data_addr_o , + output logic data_we_o , + output logic [DATA_WIDTH/8 - 1:0] data_be_o , + output logic [ DATA_WIDTH-1:0] data_wdata_o , + input logic data_gnt_i , + input logic data_rvalid_i , + input logic [ DATA_WIDTH-1:0] data_rdata_i , + + // Configuration + input logic start_i , // start transfer (MUST BE A PULSE!!!!!) + input logic write_i , // write transaction + output logic busy_o , // lsu available + output logic terminate_o , // lsu done + input logic last_i, + input logic access_counter_match_i, + + // Address + input logic [ 31:0] src_ptr_i , // base address + input logic [ 31:0] stride_i , // stride to move in memory from one row to the next one + input logic [ 31:0] rows_i , // how many rows we need to fetch + input logic [ 31:0] cols_i , + + // Output data + output logic [ DATA_WIDTH-1:0] load_fifo_output_o , + output logic load_fifo_valid_o , + output logic load_fifo_data_available_o , + input logic load_fifo_output_pop_i , + + // Input data + input logic [ DATA_WIDTH-1:0] store_fifo_input_i , + input logic store_fifo_push_i , + output logic store_fifo_space_available_o, + output logic store_fifo_empty_o + + +); + + localparam int unsigned DEPTH = (FIFO_DEPTH > 0) ? FIFO_DEPTH - 1 : 0; + localparam int unsigned Addr_Fifo_Depth = (FIFO_DEPTH > 1) ? 
$clog2(FIFO_DEPTH) : 1; + localparam int unsigned LastFifoUsage = DEPTH - 1; + + + logic terminate ; + + logic [ 31:0] rows_q ; + logic [ 31:0] rows_d ; + logic [ 31:0] cols_q ; + logic [ 31:0] cols_d ; + logic [ 31:0] src_ptr_inc ; + logic [ 31:0] addr ; + logic [ 31:0] addr_op2 ; + logic [ 31:0] ptr_q ; + logic [ 31:0] ptr_d ; + + logic data_in_req ; + logic data_in_we ; + logic [ DATA_WIDTH/8-1:0] data_in_be ; + logic [ 31:0] data_in_addr ; + logic data_in_rvalid ; + logic [ DATA_WIDTH-1:0] data_in_rdata ; + + logic [ DATA_WIDTH-1:0] load_fifo_input ; + logic [ DATA_WIDTH-1:0] load_fifo_data_out; + logic rd_valid_q ; + logic rd_valid_d ; + logic [ DATA_WIDTH-1:0] rd_head_q ; + logic [ DATA_WIDTH-1:0] rd_head_d ; + logic data_we_q ; + logic data_we_d ; + logic rvalid ; + logic load_fifo_pop ; + logic load_fifo_push ; + logic [Addr_Fifo_Depth-1:0] load_fifo_usage ; + logic load_fifo_alm_full; + logic load_fifo_full ; + logic load_fifo_empty ; + + logic data_out_req ; + logic data_out_we ; + logic [ DATA_WIDTH/8-1:0] data_out_be ; + logic [ 31:0] data_out_addr ; + logic data_out_gnt ; + logic [ DATA_WIDTH-1:0] data_out_wdata ; + + logic store_fifo_full ; + logic store_fifo_empty ; + logic [ DATA_WIDTH-1:0] store_fifo_output ; + logic store_fifo_pop ; + logic last_q ; + logic last_d ; + + + enum { + LSU_READY, + LSU_RUNNING + } + lsu_state_q, lsu_state_d; + + + assign last_d = last_i; + always_comb begin : FSM_block + lsu_state_d = lsu_state_q; + + case (lsu_state_q) + LSU_READY: begin + if (start_i & |cols_i & |rows_i) begin + lsu_state_d = LSU_RUNNING; + end + end + LSU_RUNNING: begin + if (terminate && !start_i && (store_fifo_empty)) begin + lsu_state_d = LSU_READY; + end + end + endcase + end + + always_comb begin : ctrl_block + terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING) && (!last_i || write_i)); // !last_i only for RLEN/LLEN = 2, in other cases work with access counter + load_fifo_valid_o = 
rd_valid_d; + busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; + terminate_o = terminate; + end + + always_comb begin : addr_block + src_ptr_inc = DATA_WIDTH / 8; + addr_op2 = (cols_q == '0) ? stride_i : src_ptr_inc; + addr = (start_i) ? ((src_ptr_i == ptr_q) ? ptr_q + addr_op2 : src_ptr_i) : ((write_i && !data_we_q)? ptr_q : ptr_q + addr_op2); //what happens when 2 loads don't load from subsequent addresses? + ptr_d = ((data_gnt_i && data_req_o) || start_i) ? addr : ptr_q; + end + + always_comb begin : counters_block + rows_d = rows_q; + cols_d = cols_q; + + if(start_i) begin + if(data_gnt_i && data_req_o) begin + if(cols_i > 1) begin + rows_d = rows_i - 1; + cols_d = cols_i - 2; + end else if (rows_i > 1) begin + rows_d = rows_i - 1; + cols_d = cols_i - 1; + end + end else begin + rows_d = rows_i - 1; + cols_d = cols_i - 1; + end + end else if (data_gnt_i && data_req_o && last_i) begin + if (cols_q > 0) cols_d = cols_q - 1; + else if (rows_q > 0) begin + cols_d = cols_i - 1; + rows_d = rows_q - 1; + end + end + end + + always_comb begin : read_obi + data_in_req = '0; + data_in_we = '0; + data_in_be = '0; + data_in_addr = '0; + + if (load_fifo_full == 1'b0 && load_fifo_alm_full == 1'b0) begin + data_in_req = ~write_i & (start_i | lsu_state_q == LSU_RUNNING); + data_in_we = 1'b0 ; + data_in_be = '1 ; + data_in_addr = addr ; + end + end + + always_comb begin : write_obi + data_out_req = '0 ; + data_out_we = '0 ; + data_out_be = '0 ; + data_out_addr = '0 ; + data_out_wdata = store_fifo_output; + + if (!store_fifo_empty) begin + data_out_req = start_i | lsu_state_q == LSU_RUNNING; + // data_out_we = 1'b1 ; + data_out_we = start_i | lsu_state_q == LSU_RUNNING; + data_out_be = '1 ; + data_out_addr = addr ; + end + end + + always_comb begin : obi_channel_signals + data_in_rvalid = 1'b0 ; + data_wdata_o = data_out_wdata; + data_out_gnt = data_gnt_i ; + data_in_rdata = data_rdata_i ; + + if(store_fifo_empty) begin // read transaction active + data_req_o = data_in_req ; 
+ data_we_o = data_in_we ; + data_be_o = data_in_be ; + data_addr_o = data_in_addr ; + data_in_rvalid = data_rvalid_i ; + end else begin // write transaction active + data_req_o = data_out_req ; + data_we_o = data_out_we ; + data_be_o = data_out_be ; + data_addr_o = data_out_addr ; + end + end + + always_comb begin : load_fifo_block + data_we_d = data_gnt_i && data_req_o && data_we_o; + rvalid = data_in_rvalid &~ data_we_q ; + + load_fifo_alm_full = (load_fifo_usage == LastFifoUsage[Addr_Fifo_Depth-1:0]); + load_fifo_input = data_in_rdata; + load_fifo_push = (rvalid & rd_valid_q & ~load_fifo_output_pop_i) | (rvalid & ~load_fifo_empty); + load_fifo_pop = load_fifo_output_pop_i & ~load_fifo_empty; + + rd_valid_d = (rvalid & ~rd_valid_q) ? 1'b1 : + (load_fifo_output_pop_i & + load_fifo_empty & ~data_gnt_i) ? 1'b0 : rd_valid_q; + + rd_head_d = (load_fifo_output_pop_i & load_fifo_empty & rvalid) || + (rvalid & ~rd_valid_q) ? load_fifo_input : + (load_fifo_output_pop_i & ~load_fifo_empty) ? load_fifo_data_out : rd_head_q; + + load_fifo_output_o = rd_head_q ; + load_fifo_data_available_o = rd_valid_q; + end + + always_comb begin : store_fifo_block + store_fifo_pop = data_out_gnt & data_out_req; + store_fifo_empty_o = store_fifo_empty; + store_fifo_space_available_o = ~store_fifo_full; + end + + fifo_v3 #( + .FALL_THROUGH (1'b0 ), + .DEPTH (DEPTH ), + .DATA_WIDTH (DATA_WIDTH ) + ) load_lsu_fifo_i ( + .clk_i , + .rst_ni , + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + + // status flags + .full_o (load_fifo_full ), + .empty_o (load_fifo_empty ), + .usage_o (load_fifo_usage ), + + // as long as the queue is not full we can push new data + .data_i (load_fifo_input ), + .push_i (load_fifo_push ), + + // as long as the queue is not empty we can pop new elements + .data_o (load_fifo_data_out ), + .pop_i (load_fifo_pop ) + ); + + fifo_v3 #( + .DEPTH(FIFO_DEPTH), + .DATA_WIDTH(DATA_WIDTH) + ) store_lsu_fifo_i ( + .clk_i , + .rst_ni , + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + // 
status flags + .full_o (store_fifo_full ), + .empty_o (store_fifo_empty ), + .usage_o ( ), + // as long as the queue is not full we can push new data + .data_i (store_fifo_input_i ), + .push_i (store_fifo_push_i ), + // as long as the queue is not empty we can pop new elements + .data_o (store_fifo_output ), + .pop_i (store_fifo_pop ) + ); + + always_ff @(posedge clk_i, negedge rst_ni) begin : seq_block + if (~rst_ni) begin + lsu_state_q <= LSU_READY; + ptr_q <= '0 ; + last_q <= '0 ; + rows_q <= '0 ; + cols_q <= '0 ; + rd_head_q <= '0 ; + rd_valid_q <= '0 ; + data_we_q <= '0 ; + end else begin + lsu_state_q <= lsu_state_d; + ptr_q <= ptr_d ; + last_q <= last_d ; + rows_q <= rows_d ; + cols_q <= cols_d ; + rd_head_q <= rd_head_d ; + rd_valid_q <= rd_valid_d ; + data_we_q <= data_we_d ; + end + end + +endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index 7fda92966..d5303f913 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -64,7 +64,7 @@ module quadrilatero_register_lsu #( ); - localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; + localparam MAX_EL_PER_ROW = quadrilatero_pkg::RLEN / LLEN; localparam LastRow = $clog2(N_ROWS)'(N_ROWS - 1); localparam NumAccesses = quadrilatero_pkg::RLEN / LLEN; @@ -382,7 +382,7 @@ module quadrilatero_register_lsu #( .busy_o (busy ), .terminate_o (terminate ), .last_i (wlast_o | rlast_o), - .access_counter_match_i (access_counter_d == access_counter_q), + //.access_counter_match_i (access_counter_d == access_counter_q), // Address .src_ptr_i (src_ptr ), diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_adapted_to_8x8_not_working.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_adapted_to_8x8_not_working.sv new file mode 100644 index 000000000..7fda92966 --- /dev/null +++ 
b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_adapted_to_8x8_not_working.sv @@ -0,0 +1,434 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +/* +NOTE: for now we assume we fetch the entire row in 1 cycle. TODO: Change the number of columns and adapt this to arbitrary BUS_WIDTH parameters +NOTE: we are not handling difference in endianness when loading reduced datawidths +*/ + +/* Register-file side load/store unit. Wraps a quadrilatero_lsu instance (lsunit_inst) and connects its load/store FIFO ports to the register-file write port (load path) and read port (store path, selected by write_i). A row of RLEN bits is moved as NumAccesses = RLEN / LLEN pieces of LLEN bits: access_counter_* tracks the piece, counter_* tracks the row. NOTE(review): the file name marks this variant as not working; the review notes below are observations to confirm, not verified fixes. */ module quadrilatero_register_lsu #( + parameter int unsigned BUS_WIDTH = 128, + parameter int unsigned N_REGS = 8, + parameter int unsigned N_ROWS = 4, + localparam int unsigned LLEN = BUS_WIDTH +) ( + input logic clk_i , + input logic rst_ni , + + // Bus interface + output logic data_req_o , + output logic [ 31:0] data_addr_o , + output logic data_we_o , + output logic [ BUS_WIDTH/8 - 1:0] data_be_o , + output logic [ BUS_WIDTH-1:0] data_wdata_o , + input logic data_gnt_i , + input logic data_rvalid_i , + input logic [ BUS_WIDTH-1:0] data_rdata_i , + + output logic[xif_pkg::X_ID_WIDTH-1:0] lsu_id_o , + + // Register Write Port for load unit + output logic [ $clog2(N_REGS)-1:0] waddr_o , + output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , + output logic [quadrilatero_pkg::RLEN-1:0] wdata_o , + output logic we_o , + output logic wlast_o , + input logic wready_i , // to stall the request in case the port is busy + + // Register Read Port for store unit + output logic [ $clog2(N_REGS)-1:0] raddr_o , + output logic [ $clog2(N_ROWS)-1:0] rrowaddr_o , + input logic [quadrilatero_pkg::RLEN-1:0] rdata_i , + input logic rdata_valid_i , + output logic rdata_ready_o , + output logic rlast_o , + + // Configuration Signals + input logic start_i , // start loading: MUST BE A PULSE + input logic write_i , + output logic busy_o , + input logic [ 31:0] stride_i , // stride value + input logic [ 31:0] address_i , // address value + input logic [
$clog2(N_REGS)-1:0] operand_reg_i , // destination register + input logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_i , // instruction id + input logic [ 31:0] n_bytes_cols_i , // we always fetch the entire row and then only take the elements we need + input logic [ 31:0] n_rows_i , + + + output logic finished_o , + input logic finished_ack_i , + output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o //instruction id out + +); + + /* NOTE(review): LLEN is a localparam equal to BUS_WIDTH, so this is always 1; the hunk for quadrilatero_register_lsu.sv in this same patch uses RLEN / LLEN instead - confirm which value cols_i should receive. */ localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; + localparam LastRow = $clog2(N_ROWS)'(N_ROWS - 1); + localparam NumAccesses = quadrilatero_pkg::RLEN / LLEN; + + typedef enum logic [1:0] { + LSU_IDLE, + LSU_LOAD, + LSU_STORE, + LSU_DONE + } register_lsu_state_e; + + register_lsu_state_e lsu_state_d, lsu_state_q; + + logic finished; + logic [xif_pkg::X_ID_WIDTH-1:0] back_id_q; + logic [xif_pkg::X_ID_WIDTH-1:0] back_id_d; + + logic [$clog2(N_ROWS)-1:0] counter_q; + logic [$clog2(N_ROWS)-1:0] counter_d; + logic [$clog2(N_REGS)-1:0] waddr_q; + logic [$clog2(N_REGS)-1:0] waddr_d; + logic [$clog2(N_REGS)-1:0] raddr_q; + logic [$clog2(N_REGS)-1:0] raddr_d; + + + logic [LLEN-1:0] load_fifo_data; + + logic load_fifo_data_available; + logic load_fifo_pop; + + logic store_fifo_space_available; + logic store_fifo_push; + logic store_fifo_empty; + logic [LLEN-1:0] store_fifo_data; + + logic [LLEN-1:0] data_mask; + logic load_fifo_valid; + logic busy; + logic start; + logic start_q; + logic start_d; + + logic write_q; + logic write_d; + logic terminate; + logic busy_q; + logic busy_d; + + logic lsu_busy_q; + logic lsu_ready; + + logic [ 31:0] src_ptr_d ; + logic [ 31:0] stride_d ; + logic [ 31:0] src_ptr_q ; + logic [ 31:0] stride_q ; + logic [ 31:0] src_ptr ; + logic [ 31:0] stride ; + + /* NOTE(review): if RLEN == LLEN then NumAccesses is 1, $clog2 returns 0 and this range becomes [-1:0] (the load_row_buffer width RLEN-LLEN likewise becomes 0) - confirm that configuration is never elaborated. */ logic [$clog2(NumAccesses)-1:0] access_counter_d; + logic [$clog2(NumAccesses)-1:0] access_counter_q; + + logic [(quadrilatero_pkg::RLEN-LLEN)-1:0] load_row_buffer_d; + logic [(quadrilatero_pkg::RLEN-LLEN)-1:0] load_row_buffer_q; + + logic [quadrilatero_pkg::RLEN-1:0]
store_mask; + logic [quadrilatero_pkg::RLEN-1:0] load_mask; + + /* NOTE(review): mask_req has no declaration in this module, so this assign relies on an implicit 1-bit net - confirm the default_nettype setting allows it. It gates we_o and rdata_ready_o while finished_o is pending on the last row. */ assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; + /* lsu_id_o selects instr_id_i or back_id_q; finished fires for a store when terminate and rlast_o coincide (write_q set), and for a load when wlast_o is raised on the last row with wready_i high. */ always_comb begin + lsu_id_o = (write_i &~ load_fifo_data_available & rlast_o) ? instr_id_i : back_id_q; + finished = (write_q & terminate & rlast_o) | (~write_q & (counter_q == LastRow) & wready_i && wlast_o); + end + + + /* Register-file write side: load_mask selects the LLEN-bit slice for the current access, load_row_buffer_d merges the FIFO word into that slice, and wdata_o is {load_fifo_data, load_row_buffer_q} with the data_mask bits cleared. */ always_comb begin: write_to_RF + data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols + load_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); + we_o = load_fifo_data_available &~ mask_req; // && ((access_counter_q == NumAccesses -1) || (lsu_state_q == LSU_LOAD && !load_fifo_valid)); //last part is sketchy + waddr_o = lsu_state_q == LSU_IDLE? waddr_d : waddr_q; + wrowaddr_o = counter_q ; + load_row_buffer_d = (load_row_buffer_q & ~load_mask) | (load_fifo_data << (LLEN * access_counter_q)); + wdata_o = {load_fifo_data, load_row_buffer_q} & ~data_mask; //watch out with load_row_buffer_d instead of load_row_buffer_q + + end + + /* Register-file read side: store_mask selects the LLEN-bit slice indexed by access_counter_q; the read is requested only for stores (write_i) when the store FIFO has space and no load data is pending. */ always_comb begin: read_from_RF + store_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); + rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; + rrowaddr_o = counter_q ; + raddr_o = operand_reg_i;//lsu_state_q == LSU_IDLE? raddr_d : raddr_q; + end + + /* Glue to lsunit_inst: FIFO pop/push, qualification of the start request by lsu_ready, and the stride/pointer handed to the inner LSU (stride_i is divided by NumAccesses when a transfer starts). */ always_comb begin: lsu_ctrl_block + load_fifo_pop = wready_i; + store_fifo_data = (rdata_i & store_mask) >> (LLEN * access_counter_q); + store_fifo_push = rdata_ready_o && rdata_valid_i; + lsu_ready = store_fifo_empty | (write_i &~ load_fifo_data_available &~ lsu_busy_q); + start = (start_i | start_q) & lsu_ready; + busy_o = (write_i ? busy_d : busy) | start_q; + + stride = (start) ? (stride_i / NumAccesses) : stride_q; + src_ptr = (start) ? address_i : src_ptr_q; + end + + /* Next-state values for write_q, start_q, busy_q and the latched stride/src_ptr; start_q keeps a start request (start_i or an earlier pending one) until start fires. */ always_comb begin: next_value + write_d = (write_i && (counter_q == LastRow) && rdata_valid_i) ?
1'b1 : + (!write_i && !busy) ? 1'b0 : write_q; + + start_d = start ? 1'b0 : + (start_q | start_i) ? 1'b1 : start_q; + + stride_d = (start) ? (stride_i / NumAccesses) : stride_q ; + src_ptr_d = (start) ? address_i : src_ptr_q; + + busy_d = (write_i && (counter_q == LastRow) && rdata_valid_i && rlast_o) ? 1'b0 : + (write_i && start_i) ? 1'b1 : busy_q; + end + /* Row/access sequencing FSM: access_counter_* counts the bus-wide accesses within a row and counter_* counts rows; wlast_o/rlast_o are raised only in branches where access_counter_q == NumAccesses - 1. */ always_comb begin: fsm_block + lsu_state_d = lsu_state_q; + counter_d = counter_q; + access_counter_d = access_counter_q; + rlast_o = 1'b0; + wlast_o = 1'b0; + + back_id_d = back_id_q; + waddr_d = waddr_q; + raddr_d = raddr_q; + + case (lsu_state_q) + /* NOTE(review): the last-access branches of LSU_IDLE do not reset access_counter_d, unlike LSU_LOAD, LSU_STORE and LSU_DONE - confirm this is intended. */ LSU_IDLE: begin + back_id_d = instr_id_i; + waddr_d = operand_reg_i; + raddr_d = operand_reg_i; + //access_counter_d = '0; + if(load_fifo_valid && !write_i && wready_i) begin + if(access_counter_q == NumAccesses - 1) begin + counter_d = counter_q + 1; + lsu_state_d = LSU_LOAD; + wlast_o = 1'b1; + end else begin + access_counter_d = access_counter_q + 1; + lsu_state_d = LSU_LOAD; + end + end else if (write_i & store_fifo_space_available && rdata_valid_i) begin + if(access_counter_q == NumAccesses - 1) begin + counter_d = counter_q + 1; + lsu_state_d = LSU_STORE; + rlast_o = 1'b1; + end else begin + access_counter_d = access_counter_q + 1; + lsu_state_d = LSU_STORE; + end + end + + end + LSU_LOAD: begin + if(load_fifo_valid) begin + if(wready_i) begin + if(counter_q == LastRow) begin + if(access_counter_q == NumAccesses - 1) begin + wlast_o = 1'b1; + access_counter_d = '0; + counter_d = '0; + lsu_state_d = LSU_DONE; + back_id_d = instr_id_i; + waddr_d = operand_reg_i; + raddr_d = operand_reg_i; + end else begin + access_counter_d = access_counter_q + 1; + end + end else begin + if(access_counter_q == NumAccesses - 1) begin + wlast_o = 1'b1; + access_counter_d = '0; + counter_d = counter_q + 1; + end else begin + access_counter_d = access_counter_q + 1; + end + end + end + + end else begin + if(write_i && wready_i) begin + if(access_counter_q == NumAccesses - 1) begin
+ counter_d = '0; + wlast_o = 1'b1; + lsu_state_d = LSU_DONE; + access_counter_d = '0; + back_id_d = instr_id_i; + waddr_d = operand_reg_i; + raddr_d = operand_reg_i; + + end else begin + access_counter_d = access_counter_q + 1; + end + end + + end + end + LSU_STORE: begin + if(store_fifo_space_available && write_i && rdata_valid_i) begin + if(counter_q == LastRow) begin + if(access_counter_q == NumAccesses - 1) begin + rlast_o = 1'b1; + access_counter_d = '0; + counter_d = '0; + lsu_state_d = LSU_DONE; + back_id_d = instr_id_i; + waddr_d = operand_reg_i; + raddr_d = operand_reg_i; + end else begin + access_counter_d = access_counter_q + 1; + end + + end else begin + if(access_counter_q == NumAccesses - 1) begin + rlast_o = 1'b1; + access_counter_d = '0; + counter_d = counter_q + 1; + end else begin + access_counter_d = access_counter_q + 1; + end + end + end else begin + // counter_d = '0; + // back_id_d = instr_id_i; + // lsu_state_d = LSU_DONE; + end + end + /* LSU_DONE: a following load or store access is accepted directly from this state; with neither pending the FSM returns to LSU_IDLE. */ LSU_DONE: begin + if(load_fifo_valid && !write_i && wready_i) begin + if(access_counter_q == NumAccesses - 1) begin + access_counter_d = '0; + counter_d = counter_q + 1; + wlast_o = 1'b1; + lsu_state_d = LSU_LOAD; + end else begin + access_counter_d = access_counter_q + 1; + end + end else if (write_i && store_fifo_space_available && rdata_valid_i) begin + if(access_counter_q == NumAccesses - 1) begin + counter_d = counter_q + 1; + rlast_o = 1'b1; + lsu_state_d = LSU_STORE; + access_counter_d = '0; + end else begin + access_counter_d = access_counter_q + 1; + end + end else begin + lsu_state_d = LSU_IDLE; + end + end + default: begin + lsu_state_d = LSU_IDLE; + end + endcase + + end + + + /* State registers: the asynchronous active-low reset clears every register and puts the FSM in LSU_IDLE; otherwise each *_q takes its *_d value (lsu_busy_q samples busy). */ always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + counter_q <= '0; + waddr_q <= '0; + raddr_q <= '0; + back_id_q <= '0; + start_q <= '0; + write_q <= '0; + busy_q <= '0; + lsu_state_q <= LSU_IDLE; + + lsu_busy_q <= '0; + src_ptr_q <= '0; + stride_q <= '0; + access_counter_q <= '0; +
load_row_buffer_q <= '0; + end else begin + counter_q <= counter_d; + back_id_q <= back_id_d; + waddr_q <= waddr_d ; + raddr_q <= raddr_d ; + start_q <= start_d ; + write_q <= write_d ; + busy_q <= busy_d ; + lsu_state_q <= lsu_state_d; + + lsu_busy_q <= busy; + src_ptr_q <= src_ptr_d; + stride_q <= stride_d ; + access_counter_q <= access_counter_d; + load_row_buffer_q <= load_row_buffer_d; + end + end + + /* Inner quadrilatero_lsu instance: the bus-interface ports are connected by name, and start, stride and src_ptr come from lsu_ctrl_block. */ quadrilatero_lsu #( + .FIFO_DEPTH (4 ), + .DATA_WIDTH (BUS_WIDTH) + ) lsunit_inst ( + + .clk_i , + .rst_ni , + + // Bus interface + .data_req_o , + .data_addr_o , + .data_we_o , + .data_be_o , + .data_wdata_o , + .data_gnt_i , + .data_rvalid_i , + .data_rdata_i , + + //Configuration + .start_i (start ), + .write_i (write_i), + .busy_o (busy ), + .terminate_o (terminate ), + .last_i (wlast_o | rlast_o), + /* NOTE(review): the hunk for quadrilatero_register_lsu.sv in this same patch comments this connection out - confirm quadrilatero_lsu still declares the port. */ .access_counter_match_i (access_counter_d == access_counter_q), + + // Address + .src_ptr_i (src_ptr ), + .stride_i (stride ), + .cols_i (MAX_EL_PER_ROW ), + .rows_i (n_rows_i ), + + // Output data + .load_fifo_output_o (load_fifo_data ), + .load_fifo_valid_o (load_fifo_valid ), + .load_fifo_data_available_o (load_fifo_data_available ), + .load_fifo_output_pop_i (load_fifo_pop ), + + // Input data + .store_fifo_input_i (store_fifo_data ), + .store_fifo_push_i (store_fifo_push ), + .store_fifo_space_available_o (store_fifo_space_available ), + .store_fifo_empty_o (store_fifo_empty ) + ); + + //------------------------- + + /* Completion flag: finished_o and finished_instr_id_o are set when finished fires and cleared by finished_ack_i; the ack assignments come last, so the clear wins if both are high in the same cycle. */ always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + finished_o <= '0; + finished_instr_id_o <= '0; + end else begin + if (finished) begin + finished_o <= '1; + finished_instr_id_o <= back_id_q; + end + if (finished_ack_i) begin + finished_o <= '0; + finished_instr_id_o <= '0; + end + end + end + //--------------------- + + // Assertions + if (N_ROWS < 2) begin + $error( + "[quadrilatero_register_lsu] N_ROWS must be at least 2.\n" + ); + end + if ((NumAccesses & (NumAccesses - 1)) != 0) begin + $error("RLEN / LLEN must be a
power of 2."); + end +endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index e0c43fff0..e605ba491 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -181,7 +181,7 @@ module quadrilatero_systolic_array #( always_comb begin: rf_block // Weight Read Register Port weight_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); - weight_base_row = (quadrilatero_pkg::RLEN / DATA_WIDTH) - (N_ROWS * (ff_it_counter_q + 1)); + weight_base_row = N_ROWS * ff_it_counter_q; weight_raddr_o = weight_reg_q ; weight_rrowaddr_o = ff_counter_q + weight_base_row; weight_rdata_shifted = (weight_rdata_i >> ALEN * ff_k_counter_q); @@ -223,7 +223,7 @@ module quadrilatero_systolic_array #( if(dr_k_counter_q == K-1) begin res_wdata_buffer_d[dr_counter_q] = '0; //resetting the buffer end else begin - res_wdata_buffer_d[dr_counter_q] = res_wdata_buffer_q[dr_counter_q] | (res_wdata_partial << ALEN*dr_k_counter_q); + res_wdata_buffer_d[dr_counter_q] = res_wdata_partial; end end end @@ -308,16 +308,16 @@ module quadrilatero_systolic_array #( sa_ctrl_d = sa_ctrl_i; id_ff_d = id_i; end else begin - if(ff_k_counter_q == K-1) begin - ff_k_counter_d = '0; - if(ff_row_counter_q == (RegLastRow-1)) begin - ff_row_counter_d = '0; + if(ff_row_counter_q == RegLastRow-1) begin + ff_row_counter_d = '0; + if(ff_k_counter_q == (K-1)) begin + ff_k_counter_d = '0; ff_it_counter_d = ff_it_counter_q + 1; end else begin - ff_row_counter_d = ff_row_counter_q + 1; + ff_k_counter_d = ff_k_counter_q + 1; end end else begin - ff_k_counter_d = ff_k_counter_q + 1; + ff_row_counter_d = ff_row_counter_q + 1; end end end @@ -428,16 +428,16 @@ module quadrilatero_systolic_array #( dr_row_counter_d = '0; dr_k_counter_d = '0; end else begin - if(dr_k_counter_q == K-1) begin - dr_k_counter_d = '0; - if(dr_row_counter_q == (RegLastRow-1))
begin - dr_row_counter_d = '0; + if(dr_row_counter_q == RegLastRow-1) begin + dr_row_counter_d = '0; + if(dr_k_counter_q == (K-1)) begin + dr_k_counter_d = '0; dr_it_counter_d = dr_it_counter_q + 1; end else begin - dr_row_counter_d = dr_row_counter_q + 1; + dr_k_counter_d = dr_k_counter_q + 1; end end else begin - dr_k_counter_d = dr_k_counter_q + 1; + dr_row_counter_d = dr_row_counter_q + 1; end end if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs (fs_counter_d == LastRow - 1 ) && (fs_counter_q == LastRow - 2) diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_wrong_order.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_wrong_order.sv new file mode 100644 index 000000000..e0c43fff0 --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_wrong_order.sv @@ -0,0 +1,636 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +/* + +TODO: +- handle matrices operations with matrices < MESH_WIDTH based on the configuration CSRs + - basically you need to inject zeros instead of actual elements +*/ + +module quadrilatero_systolic_array #( + parameter int MESH_WIDTH = 4 , + parameter int DATA_WIDTH = 32 , + parameter int N_REGS = 8 , + parameter int ENABLE_SIMD = 1 , + localparam int N_ROWS = MESH_WIDTH , + localparam int ALEN = DATA_WIDTH * MESH_WIDTH, + parameter FPU = 1 +) ( + input logic clk_i , + input logic rst_ni , + + output logic sa_ready_o , + input logic start_i , + + // Only has effect if ENABLE_SIMD == 1 + input quadrilatero_pkg::sa_ctrl_t sa_ctrl_i , + + input logic [ $clog2(N_REGS)-1:0] data_reg_i , // data register + input logic [ $clog2(N_REGS)-1:0] acc_reg_i , // accumulator register + input logic [ $clog2(N_REGS)-1:0] weight_reg_i , // weight register + input logic [xif_pkg::X_ID_WIDTH-1:0] id_i , // id of the instruction + + // Weight 
Read Register Port + output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] weight_rrowaddr_o , + input logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_i , + input logic weight_rdata_valid_i, + output logic weight_rdata_ready_o, + output logic weight_rlast_o , + + // Data Read Register Port + output logic [ $clog2(N_REGS)-1:0] data_raddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] data_rrowaddr_o , + input logic [quadrilatero_pkg::RLEN-1:0] data_rdata_i , + input logic data_rdata_valid_i , + output logic data_rdata_ready_o , + output logic data_rlast_o , + + // Accumulator Read Register Port + output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] acc_rrowaddr_o , + input logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_i , + input logic acc_rdata_valid_i , + output logic acc_rdata_ready_o , + output logic acc_rlast_o , + + // Accumulator Out Write Register Port + output logic [ $clog2(N_REGS)-1:0] res_waddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] res_wrowaddr_o , + output logic [quadrilatero_pkg::RLEN-1:0] res_wdata_o , + output logic res_we_o , + output logic res_wlast_o , + input logic res_wready_i , + + // RF Instruction ID + output logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id_o , + output logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id_o , + + // Finish + output logic finished_o , + input logic finished_ack_i , + output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o +); + typedef enum logic [1:0]{ + FS_IDLE, + FS_ACTIVE, + FS_LAST + } fs_state_e; + typedef enum logic [1:0]{ + FF_IDLE, + FF_ACTIVE, + FF_DONE + } ff_state_e; + typedef enum logic [1:0]{ + DR_IDLE, + DR_ACTIVE, + DR_DONE + } dr_state_e; + + ff_state_e ff_state_d, ff_state_q; + fs_state_e fs_state_d, fs_state_q; + dr_state_e dr_state_d, dr_state_q; + localparam LastRow = $clog2(MESH_WIDTH)'(MESH_WIDTH-1); + localparam RegLastRow = 
quadrilatero_pkg::RLEN/ ALEN; + localparam K = quadrilatero_pkg::RLEN / ALEN; + + logic [$clog2(K)-1:0] ff_k_counter_d; + logic [$clog2(K)-1:0] ff_k_counter_q; + logic [$clog2(K)-1:0] dr_k_counter_d; + logic [$clog2(K)-1:0] dr_k_counter_q; + logic [$clog2(K)-1:0] ff_it_counter_d; + logic [$clog2(K)-1:0] ff_it_counter_q; + logic [$clog2(K)-1:0] dr_it_counter_d; + logic [$clog2(K)-1:0] dr_it_counter_q; + logic [$clog2(K)-1:0] ff_row_counter_d; + logic [$clog2(K)-1:0] ff_row_counter_q; + logic [$clog2(K)-1:0] dr_row_counter_d; + logic [$clog2(K)-1:0] dr_row_counter_q; + logic [$clog2(quadrilatero_pkg::RLEN/DATA_WIDTH)-1:0] weight_base_row; + logic last_dr_write; + + // Data Masks + logic [quadrilatero_pkg::RLEN-1:0] data_mask; + logic [quadrilatero_pkg::RLEN-1:0] weight_mask; + logic [quadrilatero_pkg::RLEN-1:0] acc_mask; + //logic [quadrilatero_pkg::RLEN-1:0] res_mask; + + logic [ALEN-1:0] data_rdata_masked; + logic [quadrilatero_pkg::RLEN-1:0] data_rdata_shifted; + logic [ALEN-1:0] weight_rdata_masked; + logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_shifted; + logic [ALEN-1:0] acc_rdata_masked; + logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_shifted; + logic [ALEN-1:0] res_wdata_partial; + logic [quadrilatero_pkg::RLEN-1:0] res_rdata_shifted; + logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0] res_wdata_buffer_d; + logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0]res_wdata_buffer_q; + + + logic valid ; + logic ff_valid; + logic clear ; + logic pump ; + logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; + logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_q ; + logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_q ; + + logic [ $clog2(N_REGS)-1:0] data_reg_d ; // Data register + logic [ $clog2(N_REGS)-1:0] data_reg_q ; // Data register + logic [ $clog2(N_REGS)-1:0] acc_reg_d ; // Accumulator register -- FF Stage + logic [ 
$clog2(N_REGS)-1:0] acc_reg_q ; // Accumulator register -- FF Stage + logic [ $clog2(N_REGS)-1:0] weight_reg_q ; // Weight register + logic [ $clog2(N_REGS)-1:0] weight_reg_d ; // Weight register + quadrilatero_pkg::sa_ctrl_t sa_ctrl_d ; + quadrilatero_pkg::sa_ctrl_t sa_ctrl_q ; + + logic [ $clog2(N_REGS)-1:0] acc_fs_q ; // Accumulator register -- FS Stage + logic [ $clog2(N_REGS)-1:0] acc_fs_d ; // Accumulator register -- FS Stage + logic [ $clog2(N_REGS)-1:0] dest_reg_q ; // Accumulator register -- DR Stage + logic [ $clog2(N_REGS)-1:0] dest_reg_d ; // Accumulator register -- DR Stage + + logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_q ; + + logic finished_d ; + logic finished_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_d; + logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_q; + logic mask_req ; + + quadrilatero_pkg::sa_ctrl_t [MESH_WIDTH-1:0] sa_ctrl_mesh_skewed; + + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] data_mesh_skewed ; + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] acc_mesh_skewed ; + logic [MESH_WIDTH-1:0][MESH_WIDTH-1:0][DATA_WIDTH-1:0] weight_mesh_skewed ; + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] res_mesh_skewed ; + + //--------------------------------------------------------------------- + + always_comb begin: rf_block + // Weight Read Register Port + weight_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); + weight_base_row = (quadrilatero_pkg::RLEN / DATA_WIDTH) - (N_ROWS * (ff_it_counter_q + 1)); + weight_raddr_o = weight_reg_q ; + weight_rrowaddr_o = ff_counter_q + weight_base_row; + weight_rdata_shifted = (weight_rdata_i >> ALEN * ff_k_counter_q); + weight_rdata_masked = weight_rdata_shifted[ALEN-1:0]; //TODO fix + weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + weight_rlast_o = (ff_state_q != FF_IDLE) && 
(ff_k_counter_q == (K-1) && ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? + + // Data Read Register Port + data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} << (ALEN * ff_it_counter_q); // TODO fix + data_raddr_o = data_reg_q ; + data_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; + data_rdata_shifted = (data_rdata_i << ALEN * ff_it_counter_q); + data_rdata_masked = data_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN] ; //TODO fix + data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + data_rlast_o = ff_state_q != FF_IDLE && (ff_it_counter_q == (K-1) && ff_k_counter_q == (K-1)) ; + + // Accumulator Read Register Port + acc_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); + acc_raddr_o = acc_reg_q ; + acc_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; + acc_rdata_shifted = (acc_rdata_i >> ALEN * ff_k_counter_q); + res_rdata_shifted = (res_wdata_o >> ALEN * ff_k_counter_q); + acc_rdata_masked = sa_input_id_o == sa_output_id_o && acc_rrowaddr_o == res_wrowaddr_o? 
+ res_rdata_shifted[ALEN-1:0] : acc_rdata_shifted[ALEN-1:0]; //TODO fix + acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + acc_rlast_o = '0 ; + + // Accumulator Out Write Register Port + res_waddr_o = dest_reg_q ; + res_wrowaddr_o = dr_counter_q + (dr_row_counter_q * N_ROWS) ; + res_wdata_o = {res_wdata_partial, res_wdata_buffer_q[dr_counter_q]}; + res_we_o = (dr_state_q == DR_ACTIVE || last_dr_write == 1'b1) &~ mask_req; + res_wlast_o = (dr_state_q != DR_IDLE) && (dr_it_counter_q == (K-1) && dr_k_counter_q == (K-1)); + end + + always_comb begin : weight_buffer_block + res_wdata_buffer_d = res_wdata_buffer_q; + if(dr_state_q != DR_IDLE) begin + if(dr_k_counter_q == K-1) begin + res_wdata_buffer_d[dr_counter_q] = '0; //resetting the buffer + end else begin + res_wdata_buffer_d[dr_counter_q] = res_wdata_buffer_q[dr_counter_q] | (res_wdata_partial << ALEN*dr_k_counter_q); + end + end + end + + always_comb begin: finished_signal + + finished_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? 1'b1 : + (finished_ack_i ) ? 1'b0 : finished_q; + + finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? id_dr_q : + (finished_ack_i ) ? 
'0 : finished_instr_id_q; + end + + always_comb begin: ctrl_block + valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; + if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q == DR_IDLE)) begin + clear = 1'b1; + end else begin + clear = 1'b0; + end + if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q != DR_IDLE)) begin + pump = 1'b1; + end else begin + pump = 1'b0; + end + mask_req = (dr_counter_q==LastRow) & finished_q & ~finished_ack_i; + end + + always_comb begin : ff_fsm_block + ff_counter_d = ff_counter_q; + ff_state_d = ff_state_q; + data_reg_d = data_reg_q; + acc_reg_d = acc_reg_q; + weight_reg_d = weight_reg_q; + sa_ctrl_d = sa_ctrl_q; + id_ff_d = id_ff_q; + ff_k_counter_d = ff_k_counter_q; + ff_it_counter_d = ff_it_counter_q; + ff_row_counter_d = ff_row_counter_q; + ff_valid = 1'b0; + + unique case (ff_state_q) + FF_IDLE: begin + ff_counter_d = '0; + ff_it_counter_d = '0; + ff_row_counter_d = '0; + ff_k_counter_d = '0; + if(start_i == 1'b1) begin + ff_state_d = FF_ACTIVE; + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end + end + FF_ACTIVE: begin + if(valid == 1'b1) begin + ff_valid = 1'b1; + if(ff_counter_q==(LastRow-1)) begin + ff_counter_d = ff_counter_q + 1; + ff_state_d = FF_DONE; + end else begin + ff_counter_d = ff_counter_q + 1; + end + end + end + + FF_DONE: begin + if(start_i == 1'b1 | ~(data_rlast_o == 1'b1 && weight_rlast_o == 1'b1 && ff_it_counter_q == (K-1))) begin + if(valid == 1'b1) begin + ff_valid = 1'b1; + ff_counter_d = '0; + ff_state_d = FF_ACTIVE; + if(ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1) && start_i == 1'b1) begin // get inputs from new instruction + ff_it_counter_d = '0; + ff_row_counter_d = '0; + ff_k_counter_d = '0; + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + 
id_ff_d = id_i; + end else begin + if(ff_k_counter_q == K-1) begin + ff_k_counter_d = '0; + if(ff_row_counter_q == (RegLastRow-1)) begin + ff_row_counter_d = '0; + ff_it_counter_d = ff_it_counter_q + 1; + end else begin + ff_row_counter_d = ff_row_counter_q + 1; + end + end else begin + ff_k_counter_d = ff_k_counter_q + 1; + end + end + end + + end else begin + ff_counter_d = '0; + ff_state_d = FF_IDLE; + end + end + + default: begin + ff_state_d = FF_IDLE; + end + endcase + end + always_comb begin : fs_fsm_block + fs_counter_d = fs_counter_q; + fs_state_d = fs_state_q; + + acc_fs_d = acc_fs_q; + id_fs_d = id_fs_q; + + unique case(fs_state_q) + FS_IDLE: begin + fs_counter_d = '0; + if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin + fs_state_d = FS_ACTIVE; + + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + end + + end + FS_ACTIVE: begin + if(clear == 1'b1) begin + fs_counter_d = '0; + fs_state_d = FS_IDLE; + end else begin + if(fs_counter_q == LastRow-2) begin + fs_counter_d = fs_counter_q + 1; + fs_state_d = FS_LAST; + end else begin + fs_counter_d = fs_counter_q + 1; + end + end + end + FS_LAST: begin + fs_counter_d = '0; + if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin //stay in active mode, load new inputs + fs_state_d = FS_ACTIVE; + + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + end + if(ff_state_q == FF_IDLE) begin + fs_state_d = FS_IDLE; + end else begin + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + fs_state_d = FS_IDLE; + end + + end + default: begin + fs_state_d = FS_IDLE; + end + + endcase + end + + always_comb begin : dr_fsm_block + dr_state_d = dr_state_q; + dr_counter_d = dr_counter_q; + dr_k_counter_d = dr_k_counter_q; + dr_it_counter_d = dr_it_counter_q; + dr_row_counter_d = dr_row_counter_q; + last_dr_write = 1'b0; + + dest_reg_d = dest_reg_q; + id_dr_d = id_dr_q; + unique case(dr_state_q) + DR_IDLE: begin + dr_counter_d = '0; + dr_k_counter_d = '0; + dr_it_counter_d = '0; + dr_row_counter_d = '0; + if(fs_state_q == FS_LAST) begin 
//fs_counter_d == LastRow ) && (fs_counter_q == LastRow - 1 + dr_state_d = DR_ACTIVE; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + + end + DR_ACTIVE: begin + if(clear == 1'b1) begin + dr_counter_d = '0; + dr_state_d = DR_IDLE; + end else begin + if(res_wready_i == 1'b0) begin + dr_state_d = dr_state_q; + end else begin + if(dr_counter_q == LastRow) begin + + dr_counter_d = '0; + //update DR counters + if(dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow - 1) && dr_k_counter_q == (K-1)) begin + dr_it_counter_d = '0; + dr_row_counter_d = '0; + dr_k_counter_d = '0; + end else begin + if(dr_k_counter_q == K-1) begin + dr_k_counter_d = '0; + if(dr_row_counter_q == (RegLastRow-1)) begin + dr_row_counter_d = '0; + dr_it_counter_d = dr_it_counter_q + 1; + end else begin + dr_row_counter_d = dr_row_counter_q + 1; + end + end else begin + dr_k_counter_d = dr_k_counter_q + 1; + end + end + if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs (fs_counter_d == LastRow - 1 ) && (fs_counter_q == LastRow - 2) + dr_state_d = DR_ACTIVE; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + if(fs_state_q == FS_IDLE) begin + dr_state_d = DR_DONE; + end + end else begin + dr_counter_d = dr_counter_q + 1; + end + end + end + + end + DR_DONE: begin + if((fs_state_q == FS_IDLE && dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow-1) && dr_k_counter_q == (K-1))) begin + last_dr_write = 1'b1; + if(res_wready_i == 1'b0) begin + dr_state_d = dr_state_q; //stall + end else begin + dr_state_d = DR_DONE; + if(dr_counter_q == LastRow) begin + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + dr_it_counter_d = '0; + dr_row_counter_d = '0; + dr_k_counter_d = '0; + dr_state_d = DR_IDLE; + end else begin + dr_counter_d = dr_counter_q + 1; + end + end + end else begin + dr_state_d = DR_IDLE; + end + end + default: begin + dr_state_d = DR_IDLE; + end + + endcase + + + end + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) 
skewer_inst_data ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (data_rdata_masked ), + .data_o (data_mesh_skewed) + ); + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) skewer_inst_acc ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (acc_rdata_masked ), + .data_o (acc_mesh_skewed) + ); + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(4) + ) skewer_inst_ctrl ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i ({MESH_WIDTH{sa_ctrl_q}}), + .data_o (sa_ctrl_mesh_skewed ) + ); + + quadrilatero_wl_stage #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) weight_inst ( + .clk_i , + .rst_ni , + + .ff_counter (ff_counter_q ), + .clear_i (clear ), + .pump_i (pump ), + .weight_rdata_valid_i , + + // Weight Data + .weight_rdata_i (weight_rdata_masked ), + .weight_rdata_o (weight_mesh_skewed ) + ); + + quadrilatero_mesh #( + .MESH_WIDTH (MESH_WIDTH ), + .ENABLE_SIMD(ENABLE_SIMD), + .FPU (FPU ) + ) mesh_inst ( + .clk_i, + .rst_ni, + + .pump_i (pump ), + .sa_ctrl_i (sa_ctrl_mesh_skewed ), + + .data_i (data_mesh_skewed ), + .acc_i (acc_mesh_skewed ), + .weight_i (weight_mesh_skewed ), + .acc_o (res_mesh_skewed ) + ); + + quadrilatero_deskewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) deskewer_inst_acc ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (res_mesh_skewed), + .data_o (res_wdata_partial ) + ); + + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + ff_counter_q <= '0; + fs_counter_q <= '0; + dr_counter_q <= '0; + ff_state_q <= FF_IDLE; + fs_state_q <= FS_IDLE; + dr_state_q <= DR_IDLE; + data_reg_q <= '0; + acc_reg_q <= '0; + weight_reg_q <= '0; + sa_ctrl_q <= '0; + acc_fs_q <= '0; + dest_reg_q <= '0; + id_ff_q <= '0; + id_fs_q <= '0; + id_dr_q <= '0; + finished_q <= '0; + finished_instr_id_q <= '0; + ff_k_counter_q <= '0; + dr_k_counter_q <= '0; + ff_it_counter_q <= '0; + dr_it_counter_q <= '0; + ff_row_counter_q <= '0; + dr_row_counter_q 
<= '0; + res_wdata_buffer_q <= '0; + end else begin + ff_counter_q <= ff_counter_d ; + fs_counter_q <= fs_counter_d ; + dr_counter_q <= dr_counter_d ; + ff_state_q <= ff_state_d; + fs_state_q <= fs_state_d; + dr_state_q <= dr_state_d; + data_reg_q <= data_reg_d ; + acc_reg_q <= acc_reg_d ; + weight_reg_q <= weight_reg_d ; + sa_ctrl_q <= sa_ctrl_d ; + acc_fs_q <= acc_fs_d ; + dest_reg_q <= dest_reg_d ; + id_ff_q <= id_ff_d ; + id_fs_q <= id_fs_d ; + id_dr_q <= id_dr_d ; + finished_q <= finished_d ; + finished_instr_id_q <= finished_instr_id_d ; + ff_k_counter_q <= ff_k_counter_d; + dr_k_counter_q <= dr_k_counter_d; + ff_it_counter_q <= ff_it_counter_d; + dr_it_counter_q <= dr_it_counter_d; + ff_row_counter_q <= ff_row_counter_d ; + dr_row_counter_q <= dr_row_counter_d ; + res_wdata_buffer_q <= res_wdata_buffer_d ; + end + end + + assign sa_ready_o = (ff_counter_d=='0) & (((ff_state_q != FF_IDLE) &~ ff_counter_q=='0 + && (ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1))) | clear); + assign sa_input_id_o = id_ff_q ; + assign sa_output_id_o = id_dr_q ; + assign finished_o = finished_q ; + assign finished_instr_id_o = finished_instr_id_q; + + // -------------------------------------------------------------------- + + // Assertions + if (MESH_WIDTH < 2) begin + $error( + "[systolic_array] MESH_WIDTH must be at least 2.\n" + ); + end +endmodule diff --git a/sw/applications/quadrilatero_easy_8x8/main.c b/sw/applications/quadrilatero_easy_8x8/main.c index ee82d32e9..c3ff90e9a 100644 --- a/sw/applications/quadrilatero_easy_8x8/main.c +++ b/sw/applications/quadrilatero_easy_8x8/main.c @@ -180,22 +180,23 @@ int main() void __attribute__ ((noinline)) matrixMul_easy(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) { - asm volatile("addi sp, sp, -0x30 " ); // - asm volatile("sw s0 , 0x2c(sp) " ); // - asm volatile("sw s1 , 0x28(sp) " ); // - asm volatile("sw s2 , 0x24(sp) " ); // - asm volatile("sw s3 
, 0x20(sp) " ); // - asm volatile("sw s4 , 0x1c(sp) " ); // - asm volatile("sw s5 , 0x18(sp) " ); // - asm volatile("sw s6 , 0x14(sp) " ); // - asm volatile("sw s7 , 0x10(sp) " ); // - asm volatile("sw s8 , 0x0c(sp) " ); // - asm volatile("sw s9 , 0x08(sp) " ); // - asm volatile("sw s10, 0x04(sp) " ); // - asm volatile("sw s11, 0x00(sp) " ); // + // asm volatile("addi sp, sp, -0x30 " ); // + // asm volatile("sw s0 , 0x2c(sp) " ); // + // asm volatile("sw s1 , 0x28(sp) " ); // + // asm volatile("sw s2 , 0x24(sp) " ); // + // asm volatile("sw s3 , 0x20(sp) " ); // + // asm volatile("sw s4 , 0x1c(sp) " ); // + // asm volatile("sw s5 , 0x18(sp) " ); // + // asm volatile("sw s6 , 0x14(sp) " ); // + // asm volatile("sw s7 , 0x10(sp) " ); // + // asm volatile("sw s8 , 0x0c(sp) " ); // + // asm volatile("sw s9 , 0x08(sp) " ); // + // asm volatile("sw s10, 0x04(sp) " ); // + // asm volatile("sw s11, 0x00(sp) " ); // //-------------------------------------------------------------------------------- // asm volatile("addi a7,x0, 4 " ); // a7 = WIDTH; + asm volatile("addi a6,x0,32 " ); // a6 = N* 2**SIMD_SHIFT asm volatile("addi t0,x0, 0 " ); // t0 = m0 =0; asm volatile("addi s3,x0, 32 " ); // s3 = K*4; @@ -216,7 +217,7 @@ void __attribute__ ((noinline)) matrixMul_easy(DATA_IN_t* addrA,DATA_IN_t* add asm volatile("addi t2,x0,16 " ); // t2 = k0 = 16; asm volatile("slli t5,t1, 2 " ); // t5 = n0*4; asm volatile("mld.w m0, (s1) , s3 " ); // m0 = A[s1] - asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mld.w m4, (s0), s3 " ); // m4 = 0; asm volatile("mul s9,s3,t1 " ); // s9 = K*4*n0; asm volatile("add s9 ,%0,s9 " :: "r" (addrB) ); // s9 = startAddrB0 = addrB + K*4*n0 asm volatile("mld.w m1, (s9) , a6 " ); // m1 = B[s9] @@ -230,20 +231,27 @@ void __attribute__ ((noinline)) matrixMul_easy(DATA_IN_t* addrA,DATA_IN_t* add asm volatile("add t0,t0, 16 " ); // t0 = m0 +=2*WIDTH; //-------------------------------------------------------------------------------- - - asm volatile("lw s0 
, 0x2c(sp) " ); // - asm volatile("lw s1 , 0x28(sp) " ); // - asm volatile("lw s2 , 0x24(sp) " ); // - asm volatile("lw s3 , 0x20(sp) " ); // - asm volatile("lw s4 , 0x1c(sp) " ); // - asm volatile("lw s5 , 0x18(sp) " ); // - asm volatile("lw s6 , 0x14(sp) " ); // - asm volatile("lw s7 , 0x10(sp) " ); // - asm volatile("lw s8 , 0x0c(sp) " ); // - asm volatile("lw s9 , 0x08(sp) " ); // - asm volatile("lw s10, 0x04(sp) " ); // - asm volatile("lw s11, 0x00(sp) " ); // - asm volatile("addi sp, sp, 0x30 " ); // + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + // asm volatile("lw s0 , 0x2c(sp) " ); // + // asm volatile("lw s1 , 0x28(sp) " ); // + // asm volatile("lw s2 , 0x24(sp) " ); // + // asm volatile("lw s3 , 0x20(sp) " ); // + // asm volatile("lw s4 , 0x1c(sp) " ); // + // asm volatile("lw s5 , 0x18(sp) " ); // + // asm volatile("lw s6 , 0x14(sp) " ); // + // asm volatile("lw s7 , 0x10(sp) " ); // + // asm volatile("lw s8 , 0x0c(sp) " ); // + // asm volatile("lw s9 , 0x08(sp) " ); // + // asm volatile("lw s10, 0x04(sp) " ); // + // asm volatile("lw s11, 0x00(sp) " ); // + // asm volatile("addi sp, sp, 0x30 " ); // } diff --git a/sw/applications/quadrilatero_easy_8x8/matrixMul32i.h b/sw/applications/quadrilatero_easy_8x8/matrixMul32i.h index d09715172..28c46ecdd 100644 --- a/sw/applications/quadrilatero_easy_8x8/matrixMul32i.h +++ b/sw/applications/quadrilatero_easy_8x8/matrixMul32i.h @@ -8,7 +8,7 @@ #define _MATMUL64INT32_ // This file is not ;) automatically generated int32_t __attribute__((section(".xheep_data_interleaved"))) matrix_A[] = { - 1,0,0,0,0,0,0,0, + 
1,2,0x12345678,4,5,6,7,8, 0,1,0,0,0,0,0,0, 0,0,1,0,0,0,0,0, 0,0,0,1,0,0,0,0, @@ -29,16 +29,28 @@ int32_t __attribute__((section(".xheep_data_interleaved"))) matrix_BT[] = { 0,0,0,0,0,0,0,2 }; +int32_t __attribute__((section(".xheep_data_interleaved"))) matrix_C[] = { + 1,1,1,1,2,2,2,2, + 1,1,1,1,2,2,2,2, + 1,1,1,1,2,2,2,2, + 1,1,1,1,2,2,2,2, + 3,3,3,3,4,4,4,4, + 3,3,3,3,4,4,4,4, + 3,3,3,3,4,4,4,4, + 3,3,3,3,4,4,4,4 + }; + + int32_t __attribute__((section(".xheep_data_interleaved"))) matrix_EXP[] = { - 2,0,0,0,0,0,0,0, - 0,2,0,0,0,0,0,0, - 0,0,2,0,0,0,0,0, - 0,0,0,2,0,0,0,0, - 0,0,0,0,2,0,0,0, - 0,0,0,0,0,2,0,0, - 0,0,0,0,0,0,2,0, - 0,0,0,0,0,0,0,2 - }; + 3,5,610839793,9,12,14,16,18, + 1,3,1,1,2,2,2,2, + 1,1,3,1,2,2,2,2, + 1,1,1,3,2,2,2,2, + 3,3,3,3,6,4,4,4, + 3,3,3,3,4,6,4,4, + 3,3,3,3,4,4,6,4, + 3,3,3,3,4,4,4,6 + }; #define SIZE 8 #endif \ No newline at end of file diff --git a/sw/applications/quadrilatero_matmul_16x16/main.c b/sw/applications/quadrilatero_matmul_16x16/main.c index 67bd5a2d3..67dce7b1d 100644 --- a/sw/applications/quadrilatero_matmul_16x16/main.c +++ b/sw/applications/quadrilatero_matmul_16x16/main.c @@ -81,6 +81,7 @@ void __attribute__ ((noinline)) matrixMul_8x8(DATA_IN_t* addrA,DATA_IN_t* addrB, void __attribute__ ((noinline)) matrixMulBigRF_8x8(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); void __attribute__ ((noinline)) matrixMul_CPU(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift); +void print_matrix(DATA_OUT_t* matrix, int K, int N); int float_condition(int index); int int_condition(int index); uint32_t check_results(int K, int N, int M); @@ -168,9 +169,14 @@ int main() CSR_READ(CSR_REG_MCYCLE, &cycles); //check results - errors = check_results(K_size,N_size,M_size); + //errors = check_results(K_size,N_size,M_size); + errors = 17; 
PRINTF("program finished with %d errors and %d cycles\n\r", errors, cycles); + PRINTF("MATRIX C:\n\r"); + print_matrix(addrC, M_size, N_size); + PRINTF("MATRIX EXP:\n\r"); + print_matrix((DATA_OUT_t *) MAT_EXP, M_size, N_size); return errors; } @@ -395,37 +401,41 @@ void __attribute__ ((noinline)) matrixMul_8x8(DATA_IN_t* addrA,DATA_IN_t* addr // } // //unrolling the m/n loops -> way to go -// for(int m = 0; m < M; m+= 16){ -// for(int n = 0; n < N; n+=16){ -// asm volatile("mzero m0"); //m0 = C00 -// asm volatile("mzero m1"); //m1 = C01 -// asm volatile("mzero m2"); //m2 = C10 -// asm volatile("mzero m3"); //m3 = C11 -// for(int k = 0; k < K; k+=8){ -// //compute C00 -// asm volatile("mld.w m4, (addrA + m*4*K + 4*k), 4*K"); -// asm volatile("mld.w m5, (addrB + n*4*K + 4*k), 4*N"); -// asm volatile("MACC(m0, m4, m5)"); -// //compute C01 -// asm volatile("mld.w m7, (addrB + (n+8)*4*K + 4*k), 4*N"); -// asm volatile("MACC(m1, m4, m7)"); -// //compute C10 -// asm volatile("mld.w m6, (addrA + (m+8)*4*K + 4*k), 4*K"); -// asm volatile("MACC(m2, m6, m5)"); -// //compute C11 -// asm volatile("MACC(m3, m6, m7)"); -// } -// //store C00 -// asm volatile("mst.w m0, (addrC + m*4*N + n*4), 4*N"); -// //store C01 -// asm volatile("mst.w m1, (addrC + m*4*N + (n+8)*4), 4*N"); -// //store C10 -// asm volatile("mst.w m2, (addrC + (m+8)*4*N + n*4), 4*N"); -// //store C11 -// asm volatile("mst.w m3, (addrC + (m+8)*4*N + (n+8)*4), 4*N"); - -// } -// } +void __attribute__ ((noinline)) matrixMul_16x16_C(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift){ + uint32_t K_4 = K*4; + uint32_t N_4 = N*4; + for(int m = 0; m < M; m+= 16){ + for(int n = 0; n < N; n+=16){ + asm volatile("mzero m0"); //m0 = C00 + asm volatile("mzero m1"); //m1 = C01 + asm volatile("mzero m2"); //m2 = C10 + asm volatile("mzero m3"); //m3 = C11 + for(int k = 0; k < K; k+=8){ + //compute C00 + asm volatile("mld.w m4, (addrA + m*4*K + 4*k), 4*K"); + asm volatile("mld.w m5, (addrB + 
n*4*K + 4*k), 4*N"); + asm volatile("MACC(m0, m4, m5)"); + //compute C01 + asm volatile("mld.w m7, (addrB + (n+8)*4*K + 4*k), 4*N"); + asm volatile("MACC(m1, m4, m7)"); + //compute C10 + asm volatile("mld.w m6, (addrA + (m+8)*4*K + 4*k), 4*K"); + asm volatile("MACC(m2, m6, m5)"); + //compute C11 + asm volatile("MACC(m3, m6, m7)"); + } + //store C00 + asm volatile("mst.w m0, (addrC + m*4*N + n*4), 4*N"); + //store C01 + asm volatile("mst.w m1, (addrC + m*4*N + (n+8)*4), 4*N"); + //store C10 + asm volatile("mst.w m2, (addrC + (m+8)*4*N + n*4), 4*N"); + //store C11 + asm volatile("mst.w m3, (addrC + (m+8)*4*N + (n+8)*4), 4*N"); + + } + } +} void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) { asm volatile("addi sp, sp, -0x30 " ); // @@ -674,4 +684,14 @@ uint32_t check_results(int K, int N, int M) } return err; +} + +void print_matrix(DATA_OUT_t* matrix, int K, int N) +{ + for(int i=0;i Date: Mon, 28 Apr 2025 15:20:17 +0200 Subject: [PATCH 13/18] 8x8 SA control working, but data isn't stored properly --- .../rtl/quadrilatero_systolic_array.sv | 70 +- .../rtl/quadrilatero_systolic_array_old.sv | 636 ++++++++++++++++++ sw/applications/quadrilatero_easy_8x8/main.c | 128 +++- .../quadrilatero_matmul_16x16/main.c | 66 +- 4 files changed, 864 insertions(+), 36 deletions(-) create mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index e605ba491..ba8a5f4af 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -117,7 +117,7 @@ module quadrilatero_systolic_array #( logic [quadrilatero_pkg::RLEN-1:0] data_mask; logic [quadrilatero_pkg::RLEN-1:0] weight_mask; logic [quadrilatero_pkg::RLEN-1:0] acc_mask; - //logic 
[quadrilatero_pkg::RLEN-1:0] res_mask; + logic [quadrilatero_pkg::RLEN-1:0] res_mask; logic [ALEN-1:0] data_rdata_masked; logic [quadrilatero_pkg::RLEN-1:0] data_rdata_shifted; @@ -127,8 +127,8 @@ module quadrilatero_systolic_array #( logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_shifted; logic [ALEN-1:0] res_wdata_partial; logic [quadrilatero_pkg::RLEN-1:0] res_rdata_shifted; - logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0] res_wdata_buffer_d; - logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0]res_wdata_buffer_q; + logic [(N_ROWS*K)-1:0][quadrilatero_pkg::RLEN-1:0] res_wdata_buffer_d; + logic [(N_ROWS*K)-1:0][quadrilatero_pkg::RLEN-1:0]res_wdata_buffer_q; logic valid ; @@ -184,17 +184,17 @@ module quadrilatero_systolic_array #( weight_base_row = N_ROWS * ff_it_counter_q; weight_raddr_o = weight_reg_q ; weight_rrowaddr_o = ff_counter_q + weight_base_row; - weight_rdata_shifted = (weight_rdata_i >> ALEN * ff_k_counter_q); - weight_rdata_masked = weight_rdata_shifted[ALEN-1:0]; //TODO fix + weight_rdata_shifted = (weight_rdata_i << ALEN * ff_k_counter_q); + weight_rdata_masked = weight_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN]; weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_k_counter_q == (K-1) && ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? 
// Data Read Register Port - data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} << (ALEN * ff_it_counter_q); // TODO fix + data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} << (ALEN * ff_it_counter_q); data_raddr_o = data_reg_q ; data_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - data_rdata_shifted = (data_rdata_i << ALEN * ff_it_counter_q); - data_rdata_masked = data_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN] ; //TODO fix + data_rdata_shifted = (data_rdata_i >> ALEN * ff_it_counter_q); + data_rdata_masked = data_rdata_shifted[ALEN-1:0] ; data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; data_rlast_o = ff_state_q != FF_IDLE && (ff_it_counter_q == (K-1) && ff_k_counter_q == (K-1)) ; @@ -202,32 +202,21 @@ module quadrilatero_systolic_array #( acc_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); acc_raddr_o = acc_reg_q ; acc_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - acc_rdata_shifted = (acc_rdata_i >> ALEN * ff_k_counter_q); - res_rdata_shifted = (res_wdata_o >> ALEN * ff_k_counter_q); + acc_rdata_shifted = (acc_rdata_i << ALEN * ff_k_counter_q); + res_rdata_shifted = (res_wdata_o << ALEN * ff_k_counter_q); acc_rdata_masked = sa_input_id_o == sa_output_id_o && acc_rrowaddr_o == res_wrowaddr_o? 
- res_rdata_shifted[ALEN-1:0] : acc_rdata_shifted[ALEN-1:0]; //TODO fix + res_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN] : acc_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN]; //TODO fix acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; acc_rlast_o = '0 ; // Accumulator Out Write Register Port res_waddr_o = dest_reg_q ; res_wrowaddr_o = dr_counter_q + (dr_row_counter_q * N_ROWS) ; - res_wdata_o = {res_wdata_partial, res_wdata_buffer_q[dr_counter_q]}; + res_wdata_o = res_wdata_buffer_d[res_wrowaddr_o]; res_we_o = (dr_state_q == DR_ACTIVE || last_dr_write == 1'b1) &~ mask_req; res_wlast_o = (dr_state_q != DR_IDLE) && (dr_it_counter_q == (K-1) && dr_k_counter_q == (K-1)); end - always_comb begin : weight_buffer_block - res_wdata_buffer_d = res_wdata_buffer_q; - if(dr_state_q != DR_IDLE) begin - if(dr_k_counter_q == K-1) begin - res_wdata_buffer_d[dr_counter_q] = '0; //resetting the buffer - end else begin - res_wdata_buffer_d[dr_counter_q] = res_wdata_partial; - end - end - end - always_comb begin: finished_signal finished_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? 
1'b1 : @@ -482,10 +471,43 @@ module quadrilatero_systolic_array #( end endcase - + end + always_comb begin : weight_buffer_block + res_wdata_buffer_d = res_wdata_buffer_q; + res_mask = {ALEN{1'b1}} << (ALEN*((K-1)-dr_k_counter_q)); + if(ff_state_q != FF_IDLE) begin + res_wdata_buffer_d[acc_rrowaddr_o] = acc_rdata_i; + end + if(dr_state_q != DR_IDLE) begin + res_wdata_buffer_d[res_wrowaddr_o] = (res_wdata_buffer_q[res_wrowaddr_o] & ~res_mask) | (res_wdata_partial << (ALEN*((K-1)-dr_k_counter_q))); + end end + // fifo_v3 #( + // .FALL_THROUGH (1'b0 ), + // .DEPTH (5 ), //TODO change + // .DATA_WIDTH (quadrilatero_pkg::RLEN ) + // ) acc_buffer_i ( + // .clk_i , + // .rst_ni , + // .flush_i (1'b0 ), + // .testmode_i (1'b0 ), + + // // status flags + // .full_o (acc_fifo_full ), + // .empty_o (acc_fifo_empty ), + // .usage_o (acc_fifo_usage ), + // // as long as the queue is not full we can push new data + // .data_i (acc_rdata_i ), + // .push_i (pump ), + // // as long as the queue is not empty we can pop new elements + // .data_o (acc_fifo_wdata ), + // .pop_i (pump & ~acc_fifo_empty ) + // ); + //end + + quadrilatero_skewer #( .MESH_WIDTH(MESH_WIDTH), .DATA_WIDTH(DATA_WIDTH) diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv new file mode 100644 index 000000000..c51eb8b6d --- /dev/null +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv @@ -0,0 +1,636 @@ +// Copyright 2024 EPFL +// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Author: Danilo Cammarata + +/* + +TODO: +- handle matrices operations with matrices < MESH_WIDTH based on the configuration CSRs + - basically you need to inject zeros instead of actual elements +*/ + +module quadrilatero_systolic_array #( + parameter int MESH_WIDTH = 4 , + parameter int DATA_WIDTH = 32 , + parameter int N_REGS = 8 , + parameter int ENABLE_SIMD = 1 , + localparam int N_ROWS = MESH_WIDTH , + localparam int ALEN = DATA_WIDTH * MESH_WIDTH, + parameter FPU = 1 +) ( + input logic clk_i , + input logic rst_ni , + + output logic sa_ready_o , + input logic start_i , + + // Only has effect if ENABLE_SIMD == 1 + input quadrilatero_pkg::sa_ctrl_t sa_ctrl_i , + + input logic [ $clog2(N_REGS)-1:0] data_reg_i , // data register + input logic [ $clog2(N_REGS)-1:0] acc_reg_i , // accumulator register + input logic [ $clog2(N_REGS)-1:0] weight_reg_i , // weight register + input logic [xif_pkg::X_ID_WIDTH-1:0] id_i , // id of the instruction + + // Weight Read Register Port + output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] weight_rrowaddr_o , + input logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_i , + input logic weight_rdata_valid_i, + output logic weight_rdata_ready_o, + output logic weight_rlast_o , + + // Data Read Register Port + output logic [ $clog2(N_REGS)-1:0] data_raddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] data_rrowaddr_o , + input logic [quadrilatero_pkg::RLEN-1:0] data_rdata_i , + input logic data_rdata_valid_i , + output logic data_rdata_ready_o , + output logic data_rlast_o , + + // Accumulator Read Register Port + output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] acc_rrowaddr_o , + input logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_i , + input logic acc_rdata_valid_i , + output logic acc_rdata_ready_o , + output logic acc_rlast_o , + + // 
Accumulator Out Write Register Port + output logic [ $clog2(N_REGS)-1:0] res_waddr_o , + output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] res_wrowaddr_o , + output logic [quadrilatero_pkg::RLEN-1:0] res_wdata_o , + output logic res_we_o , + output logic res_wlast_o , + input logic res_wready_i , + + // RF Instruction ID + output logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id_o , + output logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id_o , + + // Finish + output logic finished_o , + input logic finished_ack_i , + output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o +); + typedef enum logic [1:0]{ + FS_IDLE, + FS_ACTIVE, + FS_LAST + } fs_state_e; + typedef enum logic [1:0]{ + FF_IDLE, + FF_ACTIVE, + FF_DONE + } ff_state_e; + typedef enum logic [1:0]{ + DR_IDLE, + DR_ACTIVE, + DR_DONE + } dr_state_e; + + ff_state_e ff_state_d, ff_state_q; + fs_state_e fs_state_d, fs_state_q; + dr_state_e dr_state_d, dr_state_q; + localparam LastRow = $clog2(MESH_WIDTH)'(MESH_WIDTH-1); + localparam RegLastRow = quadrilatero_pkg::RLEN/ ALEN; + localparam K = quadrilatero_pkg::RLEN / ALEN; + + logic [$clog2(K)-1:0] ff_k_counter_d; + logic [$clog2(K)-1:0] ff_k_counter_q; + logic [$clog2(K)-1:0] dr_k_counter_d; + logic [$clog2(K)-1:0] dr_k_counter_q; + logic [$clog2(K)-1:0] ff_it_counter_d; + logic [$clog2(K)-1:0] ff_it_counter_q; + logic [$clog2(K)-1:0] dr_it_counter_d; + logic [$clog2(K)-1:0] dr_it_counter_q; + logic [$clog2(K)-1:0] ff_row_counter_d; + logic [$clog2(K)-1:0] ff_row_counter_q; + logic [$clog2(K)-1:0] dr_row_counter_d; + logic [$clog2(K)-1:0] dr_row_counter_q; + logic [$clog2(quadrilatero_pkg::RLEN/DATA_WIDTH)-1:0] weight_base_row; + logic last_dr_write; + + // Data Masks + logic [quadrilatero_pkg::RLEN-1:0] data_mask; + logic [quadrilatero_pkg::RLEN-1:0] weight_mask; + logic [quadrilatero_pkg::RLEN-1:0] acc_mask; + //logic [quadrilatero_pkg::RLEN-1:0] res_mask; + + logic [ALEN-1:0] data_rdata_masked; + logic [quadrilatero_pkg::RLEN-1:0] data_rdata_shifted; + logic 
[ALEN-1:0] weight_rdata_masked; + logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_shifted; + logic [ALEN-1:0] acc_rdata_masked; + logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_shifted; + logic [ALEN-1:0] res_wdata_partial; + logic [quadrilatero_pkg::RLEN-1:0] res_rdata_shifted; + logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0] res_wdata_buffer_d; + logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0]res_wdata_buffer_q; + + + logic valid ; + logic ff_valid; + logic clear ; + logic pump ; + logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; + logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_q ; + logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_d ; + logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_q ; + + logic [ $clog2(N_REGS)-1:0] data_reg_d ; // Data register + logic [ $clog2(N_REGS)-1:0] data_reg_q ; // Data register + logic [ $clog2(N_REGS)-1:0] acc_reg_d ; // Accumulator register -- FF Stage + logic [ $clog2(N_REGS)-1:0] acc_reg_q ; // Accumulator register -- FF Stage + logic [ $clog2(N_REGS)-1:0] weight_reg_q ; // Weight register + logic [ $clog2(N_REGS)-1:0] weight_reg_d ; // Weight register + quadrilatero_pkg::sa_ctrl_t sa_ctrl_d ; + quadrilatero_pkg::sa_ctrl_t sa_ctrl_q ; + + logic [ $clog2(N_REGS)-1:0] acc_fs_q ; // Accumulator register -- FS Stage + logic [ $clog2(N_REGS)-1:0] acc_fs_d ; // Accumulator register -- FS Stage + logic [ $clog2(N_REGS)-1:0] dest_reg_q ; // Accumulator register -- DR Stage + logic [ $clog2(N_REGS)-1:0] dest_reg_d ; // Accumulator register -- DR Stage + + logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_d ; + logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_q ; + + logic finished_d ; + logic finished_q ; + logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_d; + logic [xif_pkg::X_ID_WIDTH-1:0] 
finished_instr_id_q; + logic mask_req ; + + quadrilatero_pkg::sa_ctrl_t [MESH_WIDTH-1:0] sa_ctrl_mesh_skewed; + + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] data_mesh_skewed ; + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] acc_mesh_skewed ; + logic [MESH_WIDTH-1:0][MESH_WIDTH-1:0][DATA_WIDTH-1:0] weight_mesh_skewed ; + logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] res_mesh_skewed ; + + //--------------------------------------------------------------------- + + always_comb begin: rf_block + // Weight Read Register Port + weight_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); + weight_base_row = N_ROWS * ff_it_counter_q; + weight_raddr_o = weight_reg_q ; + weight_rrowaddr_o = ff_counter_q + weight_base_row; + weight_rdata_shifted = (weight_rdata_i << ALEN * ff_k_counter_q); + weight_rdata_masked = weight_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN]; + weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_k_counter_q == (K-1) && ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? 
+ + // Data Read Register Port + data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} << (ALEN * ff_it_counter_q); + data_raddr_o = data_reg_q ; + data_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; + data_rdata_shifted = (data_rdata_i >> ALEN * ff_it_counter_q); + data_rdata_masked = data_rdata_shifted[ALEN-1:0] ; + data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + data_rlast_o = ff_state_q != FF_IDLE && (ff_it_counter_q == (K-1) && ff_k_counter_q == (K-1)) ; + + // Accumulator Read Register Port + acc_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); + acc_raddr_o = acc_reg_q ; + acc_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; + acc_rdata_shifted = (acc_rdata_i << ALEN * ff_k_counter_q); + res_rdata_shifted = (res_wdata_o << ALEN * ff_k_counter_q); + acc_rdata_masked = sa_input_id_o == sa_output_id_o && acc_rrowaddr_o == res_wrowaddr_o? + res_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN] : acc_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN]; //TODO fix + acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; + acc_rlast_o = '0 ; + + // Accumulator Out Write Register Port + res_waddr_o = dest_reg_q ; + res_wrowaddr_o = dr_counter_q + (dr_row_counter_q * N_ROWS) ; + res_wdata_o = {res_wdata_buffer_q[dr_counter_q], res_wdata_partial}; // TODO: fix this, probably need a bigger buffer to make life easier. 
+ res_we_o = (dr_state_q == DR_ACTIVE || last_dr_write == 1'b1) &~ mask_req; + res_wlast_o = (dr_state_q != DR_IDLE) && (dr_it_counter_q == (K-1) && dr_k_counter_q == (K-1)); + end + + always_comb begin : weight_buffer_block + res_wdata_buffer_d = res_wdata_buffer_q; + if(dr_state_q != DR_IDLE) begin + if(dr_k_counter_q == K-1) begin + res_wdata_buffer_d[dr_counter_q] = '0; //resetting the buffer + end else begin + res_wdata_buffer_d[dr_counter_q] = res_wdata_partial; + end + end + end + + always_comb begin: finished_signal + + finished_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? 1'b1 : + (finished_ack_i ) ? 1'b0 : finished_q; + + finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? id_dr_q : + (finished_ack_i ) ? '0 : finished_instr_id_q; + end + + always_comb begin: ctrl_block + valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; + if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q == DR_IDLE)) begin + clear = 1'b1; + end else begin + clear = 1'b0; + end + if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q != DR_IDLE)) begin + pump = 1'b1; + end else begin + pump = 1'b0; + end + mask_req = (dr_counter_q==LastRow) & finished_q & ~finished_ack_i; + end + + always_comb begin : ff_fsm_block + ff_counter_d = ff_counter_q; + ff_state_d = ff_state_q; + data_reg_d = data_reg_q; + acc_reg_d = acc_reg_q; + weight_reg_d = weight_reg_q; + sa_ctrl_d = sa_ctrl_q; + id_ff_d = id_ff_q; + ff_k_counter_d = ff_k_counter_q; + ff_it_counter_d = ff_it_counter_q; + ff_row_counter_d = ff_row_counter_q; + ff_valid = 1'b0; + + unique case (ff_state_q) + FF_IDLE: begin + ff_counter_d = '0; + ff_it_counter_d = '0; + ff_row_counter_d = '0; + ff_k_counter_d = '0; + if(start_i == 1'b1) begin + ff_state_d = FF_ACTIVE; + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = 
weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end + end + FF_ACTIVE: begin + if(valid == 1'b1) begin + ff_valid = 1'b1; + if(ff_counter_q==(LastRow-1)) begin + ff_counter_d = ff_counter_q + 1; + ff_state_d = FF_DONE; + end else begin + ff_counter_d = ff_counter_q + 1; + end + end + end + + FF_DONE: begin + if(start_i == 1'b1 | ~(data_rlast_o == 1'b1 && weight_rlast_o == 1'b1 && ff_it_counter_q == (K-1))) begin + if(valid == 1'b1) begin + ff_valid = 1'b1; + ff_counter_d = '0; + ff_state_d = FF_ACTIVE; + if(ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1) && start_i == 1'b1) begin // get inputs from new instruction + ff_it_counter_d = '0; + ff_row_counter_d = '0; + ff_k_counter_d = '0; + data_reg_d = data_reg_i; + acc_reg_d = acc_reg_i; + weight_reg_d = weight_reg_i; + sa_ctrl_d = sa_ctrl_i; + id_ff_d = id_i; + end else begin + if(ff_row_counter_q == RegLastRow-1) begin + ff_row_counter_d = '0; + if(ff_k_counter_q == (K-1)) begin + ff_k_counter_d = '0; + ff_it_counter_d = ff_it_counter_q + 1; + end else begin + ff_k_counter_d = ff_k_counter_q + 1; + end + end else begin + ff_row_counter_d = ff_row_counter_q + 1; + end + end + end + + end else begin + ff_counter_d = '0; + ff_state_d = FF_IDLE; + end + end + + default: begin + ff_state_d = FF_IDLE; + end + endcase + end + always_comb begin : fs_fsm_block + fs_counter_d = fs_counter_q; + fs_state_d = fs_state_q; + + acc_fs_d = acc_fs_q; + id_fs_d = id_fs_q; + + unique case(fs_state_q) + FS_IDLE: begin + fs_counter_d = '0; + if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin + fs_state_d = FS_ACTIVE; + + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + end + + end + FS_ACTIVE: begin + if(clear == 1'b1) begin + fs_counter_d = '0; + fs_state_d = FS_IDLE; + end else begin + if(fs_counter_q == LastRow-2) begin + fs_counter_d = fs_counter_q + 1; + fs_state_d = FS_LAST; + end else begin + fs_counter_d = fs_counter_q + 1; + end + end + end + FS_LAST: begin + fs_counter_d = 
'0; + if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin //stay in active mode, load new inputs + fs_state_d = FS_ACTIVE; + + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + end + if(ff_state_q == FF_IDLE) begin + fs_state_d = FS_IDLE; + end else begin + acc_fs_d = acc_reg_q; + id_fs_d = id_ff_q; + fs_state_d = FS_IDLE; + end + + end + default: begin + fs_state_d = FS_IDLE; + end + + endcase + end + + always_comb begin : dr_fsm_block + dr_state_d = dr_state_q; + dr_counter_d = dr_counter_q; + dr_k_counter_d = dr_k_counter_q; + dr_it_counter_d = dr_it_counter_q; + dr_row_counter_d = dr_row_counter_q; + last_dr_write = 1'b0; + + dest_reg_d = dest_reg_q; + id_dr_d = id_dr_q; + unique case(dr_state_q) + DR_IDLE: begin + dr_counter_d = '0; + dr_k_counter_d = '0; + dr_it_counter_d = '0; + dr_row_counter_d = '0; + if(fs_state_q == FS_LAST) begin //fs_counter_d == LastRow ) && (fs_counter_q == LastRow - 1 + dr_state_d = DR_ACTIVE; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + + end + DR_ACTIVE: begin + if(clear == 1'b1) begin + dr_counter_d = '0; + dr_state_d = DR_IDLE; + end else begin + if(res_wready_i == 1'b0) begin + dr_state_d = dr_state_q; + end else begin + if(dr_counter_q == LastRow) begin + + dr_counter_d = '0; + //update DR counters + if(dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow - 1) && dr_k_counter_q == (K-1)) begin + dr_it_counter_d = '0; + dr_row_counter_d = '0; + dr_k_counter_d = '0; + end else begin + if(dr_row_counter_q == RegLastRow-1) begin + dr_row_counter_d = '0; + if(dr_k_counter_q == (K-1)) begin + dr_k_counter_d = '0; + dr_it_counter_d = dr_it_counter_q + 1; + end else begin + dr_k_counter_d = dr_k_counter_q + 1; + end + end else begin + dr_row_counter_d = dr_row_counter_q + 1; + end + end + if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs (fs_counter_d == LastRow - 1 ) && (fs_counter_q == LastRow - 2) + dr_state_d = DR_ACTIVE; + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + end + if(fs_state_q == 
FS_IDLE) begin + dr_state_d = DR_DONE; + end + end else begin + dr_counter_d = dr_counter_q + 1; + end + end + end + + end + DR_DONE: begin + if((fs_state_q == FS_IDLE && dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow-1) && dr_k_counter_q == (K-1))) begin + last_dr_write = 1'b1; + if(res_wready_i == 1'b0) begin + dr_state_d = dr_state_q; //stall + end else begin + dr_state_d = DR_DONE; + if(dr_counter_q == LastRow) begin + dest_reg_d = acc_fs_q; + id_dr_d = id_fs_q; + dr_it_counter_d = '0; + dr_row_counter_d = '0; + dr_k_counter_d = '0; + dr_state_d = DR_IDLE; + end else begin + dr_counter_d = dr_counter_q + 1; + end + end + end else begin + dr_state_d = DR_IDLE; + end + end + default: begin + dr_state_d = DR_IDLE; + end + + endcase + + + end + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) skewer_inst_data ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (data_rdata_masked ), + .data_o (data_mesh_skewed) + ); + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) skewer_inst_acc ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (acc_rdata_masked ), + .data_o (acc_mesh_skewed) + ); + + quadrilatero_skewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(4) + ) skewer_inst_ctrl ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i ({MESH_WIDTH{sa_ctrl_q}}), + .data_o (sa_ctrl_mesh_skewed ) + ); + + quadrilatero_wl_stage #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) weight_inst ( + .clk_i , + .rst_ni , + + .ff_counter (ff_counter_q ), + .clear_i (clear ), + .pump_i (pump ), + .weight_rdata_valid_i , + + // Weight Data + .weight_rdata_i (weight_rdata_masked ), + .weight_rdata_o (weight_mesh_skewed ) + ); + + quadrilatero_mesh #( + .MESH_WIDTH (MESH_WIDTH ), + .ENABLE_SIMD(ENABLE_SIMD), + .FPU (FPU ) + ) mesh_inst ( + .clk_i, + .rst_ni, + + .pump_i (pump ), + .sa_ctrl_i (sa_ctrl_mesh_skewed ), + + .data_i (data_mesh_skewed ), + .acc_i (acc_mesh_skewed ), + .weight_i 
(weight_mesh_skewed ), + .acc_o (res_mesh_skewed ) + ); + + quadrilatero_deskewer #( + .MESH_WIDTH(MESH_WIDTH), + .DATA_WIDTH(DATA_WIDTH) + ) deskewer_inst_acc ( + .clk_i , + .rst_ni , + .pump_i (pump ), + .data_i (res_mesh_skewed), + .data_o (res_wdata_partial ) + ); + + always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block + if (!rst_ni) begin + ff_counter_q <= '0; + fs_counter_q <= '0; + dr_counter_q <= '0; + ff_state_q <= FF_IDLE; + fs_state_q <= FS_IDLE; + dr_state_q <= DR_IDLE; + data_reg_q <= '0; + acc_reg_q <= '0; + weight_reg_q <= '0; + sa_ctrl_q <= '0; + acc_fs_q <= '0; + dest_reg_q <= '0; + id_ff_q <= '0; + id_fs_q <= '0; + id_dr_q <= '0; + finished_q <= '0; + finished_instr_id_q <= '0; + ff_k_counter_q <= '0; + dr_k_counter_q <= '0; + ff_it_counter_q <= '0; + dr_it_counter_q <= '0; + ff_row_counter_q <= '0; + dr_row_counter_q <= '0; + res_wdata_buffer_q <= '0; + end else begin + ff_counter_q <= ff_counter_d ; + fs_counter_q <= fs_counter_d ; + dr_counter_q <= dr_counter_d ; + ff_state_q <= ff_state_d; + fs_state_q <= fs_state_d; + dr_state_q <= dr_state_d; + data_reg_q <= data_reg_d ; + acc_reg_q <= acc_reg_d ; + weight_reg_q <= weight_reg_d ; + sa_ctrl_q <= sa_ctrl_d ; + acc_fs_q <= acc_fs_d ; + dest_reg_q <= dest_reg_d ; + id_ff_q <= id_ff_d ; + id_fs_q <= id_fs_d ; + id_dr_q <= id_dr_d ; + finished_q <= finished_d ; + finished_instr_id_q <= finished_instr_id_d ; + ff_k_counter_q <= ff_k_counter_d; + dr_k_counter_q <= dr_k_counter_d; + ff_it_counter_q <= ff_it_counter_d; + dr_it_counter_q <= dr_it_counter_d; + ff_row_counter_q <= ff_row_counter_d ; + dr_row_counter_q <= dr_row_counter_d ; + res_wdata_buffer_q <= res_wdata_buffer_d ; + end + end + + assign sa_ready_o = (ff_counter_d=='0) & (((ff_state_q != FF_IDLE) &~ ff_counter_q=='0 + && (ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1))) | clear); + assign sa_input_id_o = id_ff_q ; + assign sa_output_id_o = id_dr_q ; + assign finished_o = finished_q 
; + assign finished_instr_id_o = finished_instr_id_q; + + // -------------------------------------------------------------------- + + // Assertions + if (MESH_WIDTH < 2) begin + $error( + "[systolic_array] MESH_WIDTH must be at least 2.\n" + ); + end +endmodule diff --git a/sw/applications/quadrilatero_easy_8x8/main.c b/sw/applications/quadrilatero_easy_8x8/main.c index c3ff90e9a..84329c926 100644 --- a/sw/applications/quadrilatero_easy_8x8/main.c +++ b/sw/applications/quadrilatero_easy_8x8/main.c @@ -168,10 +168,10 @@ int main() errors = check_results(K_size,N_size,M_size); PRINTF("program finished with %d errors and %d cycles\n\r", errors, cycles); - PRINTF("MATRIX C:\n\r"); - print_matrix(addrC, M_size, N_size); - PRINTF("MATRIX EXP:\n\r"); - print_matrix((DATA_OUT_t *) MAT_EXP, M_size, N_size); + // PRINTF("MATRIX C:\n\r"); + // print_matrix(addrC, M_size, N_size); + // PRINTF("MATRIX EXP:\n\r"); + // print_matrix((DATA_OUT_t *) MAT_EXP, M_size, N_size); return errors; } @@ -231,6 +231,126 @@ void __attribute__ ((noinline)) matrixMul_easy(DATA_IN_t* addrA,DATA_IN_t* add asm volatile("add t0,t0, 16 " ); // t0 = m0 +=2*WIDTH; //-------------------------------------------------------------------------------- + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT 
+ asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm 
volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 
" ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT + asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT asm volatile("mzero m7 " ); // a6 = N* 2**SIMD_SHIFT asm volatile("mzero m7 " ); // a6 = N* 
2**SIMD_SHIFT diff --git a/sw/applications/quadrilatero_matmul_16x16/main.c b/sw/applications/quadrilatero_matmul_16x16/main.c index 67dce7b1d..b56d21d04 100644 --- a/sw/applications/quadrilatero_matmul_16x16/main.c +++ b/sw/applications/quadrilatero_matmul_16x16/main.c @@ -169,14 +169,13 @@ int main() CSR_READ(CSR_REG_MCYCLE, &cycles); //check results - //errors = check_results(K_size,N_size,M_size); - errors = 17; + errors = check_results(K_size,N_size,M_size); PRINTF("program finished with %d errors and %d cycles\n\r", errors, cycles); - PRINTF("MATRIX C:\n\r"); - print_matrix(addrC, M_size, N_size); - PRINTF("MATRIX EXP:\n\r"); - print_matrix((DATA_OUT_t *) MAT_EXP, M_size, N_size); + // PRINTF("MATRIX C:\n\r"); + // print_matrix(addrC, M_size, N_size); + // PRINTF("MATRIX EXP:\n\r"); + // print_matrix((DATA_OUT_t *) MAT_EXP, M_size, N_size); return errors; } @@ -401,7 +400,7 @@ void __attribute__ ((noinline)) matrixMul_8x8(DATA_IN_t* addrA,DATA_IN_t* addr // } // //unrolling the m/n loops -> way to go -void __attribute__ ((noinline)) matrixMul_16x16_C(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift){ +/*void __attribute__ ((noinline)) matrixMul_16x16_C(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift){ uint32_t K_4 = K*4; uint32_t N_4 = N*4; for(int m = 0; m < M; m+= 16){ @@ -435,7 +434,7 @@ void __attribute__ ((noinline)) matrixMul_16x16_C(DATA_IN_t* addrA,DATA_IN_t* a } } -} +} */ void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* addrB,DATA_OUT_t* addrC, int K, int N, int M, int shift) { asm volatile("addi sp, sp, -0x30 " ); // @@ -524,6 +523,57 @@ void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* ad asm volatile("blt t0, %0, loopM_start16x16" :: "r" (M) ); // endwhile(m0 Date: Tue, 29 Apr 2025 20:34:38 +0200 Subject: [PATCH 14/18] 8x8 with SA res buffer, not working --- .../rtl/include/quadrilatero_pkg.sv | 2 +- 
.../rtl/quadrilatero_systolic_array.sv | 2 +- sw/applications/quadrilatero_easy_8x8/main.c | 6 +-- .../quadrilatero_matmul_16x16/main.c | 49 +------------------ 4 files changed, 6 insertions(+), 53 deletions(-) diff --git a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv index 37b43b43e..bfa4cce52 100644 --- a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv +++ b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv @@ -8,7 +8,7 @@ package quadrilatero_pkg; parameter int unsigned N_REGS = 8; parameter int unsigned DATA_WIDTH = 32; parameter int unsigned BUS_WIDTH = 128; - parameter int unsigned MESH_WIDTH = 8; + parameter int unsigned MESH_WIDTH = 8; //change register size parameter int unsigned SA_MESH_WIDTH = 4; parameter int unsigned NUM_EXEC_UNITS = 3; // change me to add units parameter int unsigned MAX_NUM_READ_OPERANDS = 3; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index ba8a5f4af..9909f2327 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -473,7 +473,7 @@ module quadrilatero_systolic_array #( endcase end - always_comb begin : weight_buffer_block + always_comb begin : res_buffer_block // if RLEN == ALEN we don't need the weight buffer res_wdata_buffer_d = res_wdata_buffer_q; res_mask = {ALEN{1'b1}} << (ALEN*((K-1)-dr_k_counter_q)); if(ff_state_q != FF_IDLE) begin diff --git a/sw/applications/quadrilatero_easy_8x8/main.c b/sw/applications/quadrilatero_easy_8x8/main.c index 84329c926..aa76d60e5 100644 --- a/sw/applications/quadrilatero_easy_8x8/main.c +++ b/sw/applications/quadrilatero_easy_8x8/main.c @@ -165,7 +165,7 @@ int main() CSR_READ(CSR_REG_MCYCLE, &cycles); //check results - errors = check_results(K_size,N_size,M_size); + errors = check_results(8,8,8); 
PRINTF("program finished with %d errors and %d cycles\n\r", errors, cycles); // PRINTF("MATRIX C:\n\r"); @@ -414,8 +414,8 @@ uint32_t check_results(int K, int N, int M) for(i = 0; i < M; i++) { for(j = 0; j < N; j++) { if(CHECK_CONDITION(i*N+j)) { - err ++; - PRINTF("Error at index %d, %d, expected %x, got %x\n\r", i, j, MAT_EXP[i*N+j], MAT_C[i*N+j]); + err ++; + //PRINTF("Error at index %d, %d, expected %x, got %x\n\r", i, j, MAT_EXP[i*N+j], MAT_C[i*N+j]); } } } diff --git a/sw/applications/quadrilatero_matmul_16x16/main.c b/sw/applications/quadrilatero_matmul_16x16/main.c index b56d21d04..112c176f4 100644 --- a/sw/applications/quadrilatero_matmul_16x16/main.c +++ b/sw/applications/quadrilatero_matmul_16x16/main.c @@ -524,54 +524,7 @@ void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* ad //-------------------------------------------------------------------------------- //-------------------------------------------------------------------------------- - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 
" ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("lw s0 , 0x2c(sp) " ); // From c660282e97ffb736049523b507a8fde3ab73e9d9 Mon Sep 17 00:00:00 2001 From: Angelo Nujic Date: Thu, 1 May 2025 11:56:50 +0200 Subject: [PATCH 15/18] 4x4 working with N_IREGS modification --- .../rtl/include/quadrilatero_pkg.sv | 12 +- .../quadrilatero/rtl/quadrilatero.sv | 34 ++-- .../quadrilatero/rtl/quadrilatero_lsu.sv | 1 - .../rtl/quadrilatero_perm_unit.sv | 6 +- .../quadrilatero/rtl/quadrilatero_regfile.sv | 14 +- .../rtl/quadrilatero_register_lsu.sv | 165 +++++++++--------- .../rtl/quadrilatero_rf_sequencer.sv | 70 +++++--- .../rtl/quadrilatero_systolic_array.sv | 132 ++++---------- sw/applications/quadrilatero_easy_8x8/main.c | 22 +++ .../quadrilatero_matmul_16x16/main.c | 98 ++++++++++- 10 files changed, 313 insertions(+), 241 deletions(-) diff --git a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv index bfa4cce52..7bc6bca96 100644 --- a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv +++ 
b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv @@ -8,7 +8,7 @@ package quadrilatero_pkg; parameter int unsigned N_REGS = 8; parameter int unsigned DATA_WIDTH = 32; parameter int unsigned BUS_WIDTH = 128; - parameter int unsigned MESH_WIDTH = 8; //change register size + parameter int unsigned MESH_WIDTH = 4; //change register size parameter int unsigned SA_MESH_WIDTH = 4; parameter int unsigned NUM_EXEC_UNITS = 3; // change me to add units parameter int unsigned MAX_NUM_READ_OPERANDS = 3; @@ -18,8 +18,14 @@ package quadrilatero_pkg; parameter int unsigned RF_READ_PORTS = 4; parameter int unsigned RF_WRITE_PORTS = 3; - localparam int unsigned N_ROWS = MESH_WIDTH ; - localparam int unsigned RLEN = DATA_WIDTH * MESH_WIDTH; + localparam int unsigned RLEN = DATA_WIDTH * MESH_WIDTH; + localparam int unsigned ALEN = 128; + localparam int unsigned LLEN = 128; + localparam int unsigned LEN = ALEN; + localparam int unsigned N_ROWS = LEN / DATA_WIDTH ; //TODO: not sure if this is correct? + localparam int unsigned N_TILES = (RLEN/LEN)**2; + localparam int unsigned TILE_ADDR = (RLEN/LEN) == 1? 
0: RLEN/LEN; + localparam int unsigned N_IREGS = N_REGS * N_TILES; typedef enum logic [2:0] { diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv index 108ebac41..badd478ab 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv @@ -128,7 +128,7 @@ module quadrilatero // RF Sequencer logic [quadrilatero_pkg::READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_REGS)-1:0] rf_seq_raddr_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_ROWS)-1:0] rf_seq_rrowaddr_from_fu; - logic [quadrilatero_pkg::READ_PORTS-1 :0][quadrilatero_pkg::RLEN-1:0] rf_seq_rdata_from_fu ; + logic [quadrilatero_pkg::READ_PORTS-1 :0][quadrilatero_pkg::LEN-1:0] rf_seq_rdata_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rvalid_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rlast_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rready_from_fu ; @@ -136,19 +136,19 @@ module quadrilatero logic [quadrilatero_pkg::WRITE_PORTS-1 :0][$clog2(quadrilatero_pkg::N_REGS)-1:0] rf_seq_waddr_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0][$clog2(quadrilatero_pkg::N_ROWS)-1:0] rf_seq_wrowaddr_from_fu; - logic [quadrilatero_pkg::WRITE_PORTS-1 :0][quadrilatero_pkg::RLEN-1:0] rf_seq_wdata_from_fu ; + logic [quadrilatero_pkg::WRITE_PORTS-1 :0][quadrilatero_pkg::LEN-1:0] rf_seq_wdata_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_we_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_wlast_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_wready_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0][xif_pkg::X_ID_WIDTH-1:0] rf_seq_wr_id_from_fu ; - logic [quadrilatero_pkg::RF_READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_REGS)-1:0] rf_seq_raddr_to_rf ; + logic [quadrilatero_pkg::RF_READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] rf_seq_raddr_to_rf ; logic [quadrilatero_pkg::RF_READ_PORTS-1 
:0][$clog2(quadrilatero_pkg::N_ROWS)-1:0] rf_seq_rrowaddr_to_rf ; - logic [quadrilatero_pkg::RF_READ_PORTS-1 :0][quadrilatero_pkg::RLEN-1:0] rf_seq_rdata_to_rf ; + logic [quadrilatero_pkg::RF_READ_PORTS-1 :0][quadrilatero_pkg::LEN-1:0] rf_seq_rdata_to_rf ; - logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_REGS)-1:0] rf_seq_waddr_to_rf ; + logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] rf_seq_waddr_to_rf ; logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_ROWS)-1:0] rf_seq_wrowaddr_to_rf ; - logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0][quadrilatero_pkg::RLEN-1:0] rf_seq_wdata_to_rf ; + logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0][quadrilatero_pkg::LEN-1:0] rf_seq_wdata_to_rf ; logic [quadrilatero_pkg::RF_WRITE_PORTS-1:0] rf_seq_we_to_rf ; quadrilatero_pkg::rw_queue_t [quadrilatero_pkg::N_REGS-1:0] rf_seq_rw_queue_entry ; @@ -170,30 +170,30 @@ module quadrilatero logic sa_weight_rdata_ready; logic sa_weight_rlast ; logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id ; - logic [quadrilatero_pkg::RLEN-1:0] sa_weight_rdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_weight_raddr ; + logic [quadrilatero_pkg::LEN-1:0] sa_weight_rdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] sa_weight_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_weight_rrowaddr ; logic sa_data_rdata_valid ; logic sa_data_rdata_ready ; logic sa_data_rlast ; logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id ; - logic [quadrilatero_pkg::RLEN-1:0] sa_data_rdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_data_raddr ; + logic [quadrilatero_pkg::LEN-1:0] sa_data_rdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] sa_data_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_data_rrowaddr ; logic sa_acc_rdata_valid ; logic sa_acc_rdata_ready ; logic sa_acc_rlast ; - logic [quadrilatero_pkg::RLEN-1:0] sa_acc_rdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_acc_raddr ; + logic 
[quadrilatero_pkg::LEN-1:0] sa_acc_rdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] sa_acc_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_acc_rrowaddr ; logic sa_res_we ; logic sa_res_wready ; logic sa_res_wlast ; - logic [quadrilatero_pkg::RLEN-1:0] sa_res_wdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] sa_res_waddr ; + logic [quadrilatero_pkg::LEN-1:0] sa_res_wdata ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] sa_res_waddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] sa_res_wrowaddr ; logic sa_finished ; @@ -226,14 +226,14 @@ module quadrilatero logic lsu_wlast ; logic lsu_wready ; logic [xif_pkg::X_ID_WIDTH-1:0] lsu_id ; - logic [quadrilatero_pkg::RLEN-1:0] lsu_wdata ; + logic [quadrilatero_pkg::LEN-1:0] lsu_wdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] lsu_waddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] lsu_wrowaddr ; logic lsu_rlast ; logic lsu_rready ; logic lsu_rvalid ; - logic [quadrilatero_pkg::RLEN-1:0] lsu_rdata ; + logic [quadrilatero_pkg::LEN-1:0] lsu_rdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] lsu_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] lsu_rrowaddr ; @@ -254,7 +254,7 @@ module quadrilatero logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_id ; logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_instr_id ; logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_finished_instr_id; - logic [quadrilatero_pkg::RLEN-1:0] perm_unit_wdata ; + logic [quadrilatero_pkg::LEN-1:0] perm_unit_wdata ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] perm_unit_waddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] perm_unit_wrowaddr ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] perm_unit_reg ; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv index edbe3552b..579a10a56 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv @@ -27,7 +27,6 @@ module quadrilatero_lsu #( input logic write_i , // write 
transaction output logic busy_o , // lsu available output logic terminate_o , // lsu done - input logic last_i , // Address input logic [ 31:0] src_ptr_i , // base address diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv index 16cfc298c..9de2a46c9 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv @@ -14,9 +14,9 @@ module quadrilatero_perm_unit #( input logic rst_ni , // Register Write Port - output logic [ $clog2(N_REGS)-1:0] waddr_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_o , output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [ RLEN-1:0] wdata_o , + output logic [ quadrilatero_pkg::LEN-1:0] wdata_o , output logic we_o , output logic wlast_o , input logic wready_i , // to stall the request in case the port is busy @@ -145,7 +145,7 @@ module quadrilatero_perm_unit #( end - assign waddr_o = operand_reg_q ; + assign waddr_o = operand_reg_q ; //TODO: fix assign wrowaddr_o = counter_q ; assign wdata_o = '0 ; assign we_o = write_started_q &~ mask_req; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv index b26b714b0..83c06e904 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv @@ -16,27 +16,27 @@ module quadrilatero_regfile #( input logic rst_ni, // read port - input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i, // register and port address + input logic [READ_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] raddr_i, // register and port address input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i, // we can ask for a single row of a register - output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o, // row out + output logic [READ_PORTS-1:0][quadrilatero_pkg::LEN-1:0] rdata_o, // row out // write port - input 
logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i, + input logic [WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_i, input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i, - input logic [WRITE_PORTS-1:0][ RLEN-1:0] wdata_i, + input logic [WRITE_PORTS-1:0][quadrilatero_pkg::LEN-1:0] wdata_i, input logic [WRITE_PORTS-1:0] we_i ); `ifdef SIMULATION // Multiple of 2 and less than 2**16 - if (!(RLEN < (1 << 16) && $countones(RLEN) == 1)) begin + if (!(quadrilatero_pkg::LEN < (1 << 16) && $countones(quadrilatero_pkg::LEN) == 1)) begin $fatal("invalid register configuration"); end `endif - logic [N_REGS-1:0][N_ROWS-1:0][RLEN-1:0] mem_q; - logic [N_REGS-1:0][N_ROWS-1:0][RLEN-1:0] mem_d; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0][quadrilatero_pkg::LEN-1:0] mem_q; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0][quadrilatero_pkg::LEN-1:0] mem_d; always_comb begin : write_mem mem_d = mem_q; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index d5303f913..bb3e32027 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -31,17 +31,17 @@ module quadrilatero_register_lsu #( output logic[xif_pkg::X_ID_WIDTH-1:0] lsu_id_o , // Register Write Port for load unit - output logic [ $clog2(N_REGS)-1:0] waddr_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_o , output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [quadrilatero_pkg::RLEN-1:0] wdata_o , + output logic [quadrilatero_pkg::LEN-1:0] wdata_o , output logic we_o , output logic wlast_o , input logic wready_i , // to stall the request in case the port is busy // Register Read Port for store unit - output logic [ $clog2(N_REGS)-1:0] raddr_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] raddr_o , output logic [ $clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] 
rdata_i , + input logic [quadrilatero_pkg::LEN-1:0] rdata_i , input logic rdata_valid_i , output logic rdata_ready_o , output logic rlast_o , @@ -66,7 +66,7 @@ module quadrilatero_register_lsu #( localparam MAX_EL_PER_ROW = quadrilatero_pkg::RLEN / LLEN; localparam LastRow = $clog2(N_ROWS)'(N_ROWS - 1); - localparam NumAccesses = quadrilatero_pkg::RLEN / LLEN; + localparam NumCols = quadrilatero_pkg::RLEN / LLEN; typedef enum logic [1:0] { LSU_IDLE, @@ -83,10 +83,8 @@ module quadrilatero_register_lsu #( logic [$clog2(N_ROWS)-1:0] counter_q; logic [$clog2(N_ROWS)-1:0] counter_d; - logic [$clog2(N_REGS)-1:0] waddr_q; - logic [$clog2(N_REGS)-1:0] waddr_d; - logic [$clog2(N_REGS)-1:0] raddr_q; - logic [$clog2(N_REGS)-1:0] raddr_d; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_q; //TODO: change these + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_d; logic [LLEN-1:0] load_fifo_data; @@ -122,14 +120,11 @@ module quadrilatero_register_lsu #( logic [ 31:0] src_ptr ; logic [ 31:0] stride ; - logic [$clog2(NumAccesses)-1:0] access_counter_d; - logic [$clog2(NumAccesses)-1:0] access_counter_q; + logic [$clog2(NumCols)-1:0] cols_counter_d; + logic [$clog2(NumCols)-1:0] cols_counter_q; + logic [$clog2(NumCols)-1:0] row_counter_d; + logic [$clog2(NumCols)-1:0] row_counter_q; - logic [(quadrilatero_pkg::RLEN-LLEN)-1:0] load_row_buffer_d; - logic [(quadrilatero_pkg::RLEN-LLEN)-1:0] load_row_buffer_q; - - logic [quadrilatero_pkg::RLEN-1:0] store_mask; - logic [quadrilatero_pkg::RLEN-1:0] load_mask; assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; always_comb begin @@ -140,31 +135,31 @@ module quadrilatero_register_lsu #( always_comb begin: write_to_RF data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols - load_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); - we_o = load_fifo_data_available &~ mask_req; // && ((access_counter_q == NumAccesses -1) || 
(lsu_state_q == LSU_LOAD && !load_fifo_valid)); //last part is sketchy + we_o = load_fifo_data_available &~ mask_req; waddr_o = lsu_state_q == LSU_IDLE? waddr_d : waddr_q; wrowaddr_o = counter_q ; - load_row_buffer_d = (load_row_buffer_q & ~load_mask) | (load_fifo_data << (LLEN * access_counter_q)); - wdata_o = {load_fifo_data, load_row_buffer_q} & ~data_mask; //watch out with load_row_buffer_d instead of load_row_buffer_q + wdata_o = load_fifo_data & ~data_mask; end always_comb begin: read_from_RF - store_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; rrowaddr_o = counter_q ; - raddr_o = operand_reg_i;//lsu_state_q == LSU_IDLE? raddr_d : raddr_q; + raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_q, cols_counter_q}; + end end always_comb begin: lsu_ctrl_block load_fifo_pop = wready_i; - store_fifo_data = (rdata_i & store_mask) >> (LLEN * access_counter_q); + store_fifo_data = rdata_i; store_fifo_push = rdata_ready_o && rdata_valid_i; lsu_ready = store_fifo_empty | (write_i &~ load_fifo_data_available &~ lsu_busy_q); start = (start_i | start_q) & lsu_ready; busy_o = (write_i ? busy_d : busy) | start_q; - stride = (start) ? (stride_i / NumAccesses) : stride_q; + stride = (start) ? (stride_i / NumCols) : stride_q; src_ptr = (start) ? address_i : src_ptr_q; end @@ -175,7 +170,7 @@ module quadrilatero_register_lsu #( start_d = start ? 1'b0 : (start_q | start_i) ? 1'b1 : start_q; - stride_d = (start) ? (stride_i / NumAccesses) : stride_q ; + stride_d = (start) ? (stride_i / NumCols) : stride_q ; src_ptr_d = (start) ? address_i : src_ptr_q; busy_d = (write_i && (counter_q == LastRow) && rdata_valid_i && rlast_o) ? 
1'b0 : @@ -184,36 +179,38 @@ module quadrilatero_register_lsu #( always_comb begin: fsm_block lsu_state_d = lsu_state_q; counter_d = counter_q; - access_counter_d = access_counter_q; - rlast_o = 1'b0; - wlast_o = 1'b0; + cols_counter_d = cols_counter_q; + row_counter_d = row_counter_q; + rlast_o = cols_counter_q == NumCols - 1 && rdata_ready_o? 1'b1 : 1'b0; + wlast_o = cols_counter_q == NumCols - 1 && we_o? 1'b1 : 1'b0; back_id_d = back_id_q; - waddr_d = waddr_q; - raddr_d = raddr_q; + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = waddr_q[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR]; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + waddr_d[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_d, cols_counter_d}; //TODO: not sure about _d + end case (lsu_state_q) LSU_IDLE: begin back_id_d = instr_id_i; - waddr_d = operand_reg_i; - raddr_d = operand_reg_i; - //access_counter_d = '0; + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; + row_counter_d = '0; + //cols_counter_d = '0; if(load_fifo_valid && !write_i && wready_i) begin - if(access_counter_q == NumAccesses - 1) begin + if(cols_counter_q == NumCols - 1) begin counter_d = counter_q + 1; lsu_state_d = LSU_LOAD; - wlast_o = 1'b1; + end else begin - access_counter_d = access_counter_q + 1; + cols_counter_d = cols_counter_q + 1; lsu_state_d = LSU_LOAD; end end else if (write_i & store_fifo_space_available && rdata_valid_i) begin - if(access_counter_q == NumAccesses - 1) begin + if(cols_counter_q == NumCols - 1) begin counter_d = counter_q + 1; lsu_state_d = LSU_STORE; - rlast_o = 1'b1; end else begin - access_counter_d = access_counter_q + 1; + cols_counter_d = cols_counter_q + 1; lsu_state_d = LSU_STORE; end end @@ -223,41 +220,39 @@ module quadrilatero_register_lsu #( if(load_fifo_valid) begin if(wready_i) begin if(counter_q == LastRow) begin - if(access_counter_q == NumAccesses - 1) begin - wlast_o = 1'b1; - access_counter_d = '0; 
+ if(cols_counter_q == NumCols - 1) begin + cols_counter_d = '0; counter_d = '0; - lsu_state_d = LSU_DONE; - back_id_d = instr_id_i; - waddr_d = operand_reg_i; - raddr_d = operand_reg_i; + if(row_counter_q == NumCols - 1) begin + lsu_state_d = LSU_DONE; + back_id_d = instr_id_i; + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; //TODO: change to only update the MSBs + end else begin + row_counter_d = row_counter_q + 1; + end end else begin - access_counter_d = access_counter_q + 1; + cols_counter_d = cols_counter_q + 1; end end else begin - if(access_counter_q == NumAccesses - 1) begin - wlast_o = 1'b1; - access_counter_d = '0; + if(cols_counter_q == NumCols - 1) begin + cols_counter_d = '0; counter_d = counter_q + 1; end else begin - access_counter_d = access_counter_q + 1; + cols_counter_d = cols_counter_q + 1; end end end end else begin if(write_i && wready_i) begin - if(access_counter_q == NumAccesses - 1) begin + if(cols_counter_q == NumCols - 1) begin counter_d = '0; - wlast_o = 1'b1; lsu_state_d = LSU_DONE; - access_counter_d = '0; + cols_counter_d = '0; back_id_d = instr_id_i; - waddr_d = operand_reg_i; - raddr_d = operand_reg_i; - + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; //TODO: change to only update the MSBs end else begin - access_counter_d = access_counter_q + 1; + cols_counter_d = cols_counter_q + 1; end end @@ -266,25 +261,26 @@ module quadrilatero_register_lsu #( LSU_STORE: begin if(store_fifo_space_available && write_i && rdata_valid_i) begin if(counter_q == LastRow) begin - if(access_counter_q == NumAccesses - 1) begin - rlast_o = 1'b1; - access_counter_d = '0; + if(cols_counter_q == NumCols - 1) begin + cols_counter_d = '0; counter_d = '0; - lsu_state_d = LSU_DONE; - back_id_d = instr_id_i; - waddr_d = operand_reg_i; - raddr_d = operand_reg_i; + if(row_counter_q == NumCols-1) begin + lsu_state_d = LSU_DONE; + back_id_d = instr_id_i; + 
waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; //TODO: change to only update the MSBs + end else begin + row_counter_d = row_counter_q + 1; + end end else begin - access_counter_d = access_counter_q + 1; + cols_counter_d = cols_counter_q + 1; end end else begin - if(access_counter_q == NumAccesses - 1) begin - rlast_o = 1'b1; - access_counter_d = '0; + if(cols_counter_q == NumCols - 1) begin + cols_counter_d = '0; counter_d = counter_q + 1; end else begin - access_counter_d = access_counter_q + 1; + cols_counter_d = cols_counter_q + 1; end end end else begin @@ -295,22 +291,20 @@ module quadrilatero_register_lsu #( end LSU_DONE: begin if(load_fifo_valid && !write_i && wready_i) begin - if(access_counter_q == NumAccesses - 1) begin - access_counter_d = '0; + if(cols_counter_q == NumCols - 1) begin + cols_counter_d = '0; counter_d = counter_q + 1; - wlast_o = 1'b1; lsu_state_d = LSU_LOAD; end else begin - access_counter_d = access_counter_q + 1; + cols_counter_d = cols_counter_q + 1; end end else if (write_i && store_fifo_space_available && rdata_valid_i) begin - if(access_counter_q == NumAccesses - 1) begin + if(cols_counter_q == NumCols - 1) begin counter_d = counter_q + 1; - rlast_o = 1'b1; lsu_state_d = LSU_STORE; - access_counter_d = '0; + cols_counter_d = '0; end else begin - access_counter_d = access_counter_q + 1; + cols_counter_d = cols_counter_q + 1; end end else begin lsu_state_d = LSU_IDLE; @@ -328,7 +322,6 @@ module quadrilatero_register_lsu #( if (!rst_ni) begin counter_q <= '0; waddr_q <= '0; - raddr_q <= '0; back_id_q <= '0; start_q <= '0; write_q <= '0; @@ -338,13 +331,12 @@ module quadrilatero_register_lsu #( lsu_busy_q <= '0; src_ptr_q <= '0; stride_q <= '0; - access_counter_q <= '0; - load_row_buffer_q <= '0; + cols_counter_q <= '0; + row_counter_q <= '0; end else begin counter_q <= counter_d; back_id_q <= back_id_d; waddr_q <= waddr_d ; - raddr_q <= raddr_d ; start_q <= start_d ; write_q <= write_d ; 
busy_q <= busy_d ; @@ -353,8 +345,8 @@ module quadrilatero_register_lsu #( lsu_busy_q <= busy; src_ptr_q <= src_ptr_d; stride_q <= stride_d ; - access_counter_q <= access_counter_d; - load_row_buffer_q <= load_row_buffer_d; + cols_counter_q <= cols_counter_d; + row_counter_q <= row_counter_d; end end @@ -381,8 +373,7 @@ module quadrilatero_register_lsu #( .write_i (write_i), .busy_o (busy ), .terminate_o (terminate ), - .last_i (wlast_o | rlast_o), - //.access_counter_match_i (access_counter_d == access_counter_q), + //.cols_counter_match_i (cols_counter_d == cols_counter_q), // Address .src_ptr_i (src_ptr ), @@ -428,7 +419,7 @@ module quadrilatero_register_lsu #( "[quadrilatero_register_lsu] N_ROWS must be at least 2.\n" ); end - if ((NumAccesses & (NumAccesses - 1)) != 0) begin + if ((NumCols & (NumCols - 1)) != 0) begin $error("RLEN / LLEN must be a power of 2."); end endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv index 9f1c02c0f..50004c070 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv @@ -21,32 +21,32 @@ module quadrilatero_rf_sequencer #( input logic rst_ni, // Input from FUs - input logic [READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_i , + input logic [READ_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] raddr_i , input logic [READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_i , - output logic [READ_PORTS-1:0][RLEN-1:0] rdata_o , + output logic [READ_PORTS-1:0][quadrilatero_pkg::LEN-1:0] rdata_o , output logic [READ_PORTS-1:0] rvalid_o , input logic [READ_PORTS-1:0] rlast_i , // request finished (must be PULSE) input logic [READ_PORTS-1:0] rready_i , // request finished (must be PULSE) input logic [READ_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] rd_id_i , - input logic [WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_i , + input logic 
[WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_i , input logic [WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_i , - input logic [WRITE_PORTS-1:0][RLEN-1:0] wdata_i , + input logic [WRITE_PORTS-1:0][quadrilatero_pkg::LEN-1:0] wdata_i , input logic [WRITE_PORTS-1:0] we_i , input logic [WRITE_PORTS-1:0] wlast_i , // request finished (must be PULSE) output logic [WRITE_PORTS-1:0] wready_o , input logic [WRITE_PORTS-1:0][xif_pkg::X_ID_WIDTH-1:0] wr_id_i , // Outputs to RF - output logic [RF_READ_PORTS-1:0][$clog2(N_REGS)-1:0] raddr_o , + output logic [RF_READ_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] raddr_o , output logic [RF_READ_PORTS-1:0][$clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [RF_READ_PORTS-1:0][RLEN-1:0] rdata_i , + input logic [RF_READ_PORTS-1:0][quadrilatero_pkg::LEN-1:0] rdata_i , - output logic [RF_WRITE_PORTS-1:0][$clog2(N_REGS)-1:0] waddr_o , + output logic [RF_WRITE_PORTS-1:0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_o , output logic [RF_WRITE_PORTS-1:0][$clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [RF_WRITE_PORTS-1:0][RLEN-1:0] wdata_o , + output logic [RF_WRITE_PORTS-1:0][quadrilatero_pkg::LEN-1:0] wdata_o , output logic [RF_WRITE_PORTS-1:0] we_o , @@ -60,34 +60,50 @@ module quadrilatero_rf_sequencer #( output logic [N_REGS-1:0] rw_queue_full_o ); - logic [N_REGS-1:0][N_ROWS-1:0] head_valid ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_empty; - logic [N_REGS-1:0][N_ROWS-1:0] w_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] r_clr ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop ; - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_full ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] head_valid ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] rw_queue_empty; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] w_pop ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] r_pop ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] r_clr ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] 
rw_queue_pop ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] rw_queue_full ; logic [WRITE_PORTS-1:0] wr_gnt ; logic [WRITE_PORTS-1:0] wr_req ; logic [READ_PORTS -1:0] rd_req ; logic [READ_PORTS -1:0] rd_gnt ; - logic [N_REGS-1:0] rw_queue_push ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0] rw_queue_entry; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] rw_queue ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_d ; - quadrilatero_pkg::rw_queue_t [N_REGS-1:0][N_ROWS-1:0] scoreboard_q ; + logic [quadrilatero_pkg::N_IREGS-1:0] rw_queue_push ; + quadrilatero_pkg::rw_queue_t [quadrilatero_pkg::N_IREGS-1:0] rw_queue_entry; + quadrilatero_pkg::rw_queue_t [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] rw_queue ; + quadrilatero_pkg::rw_queue_t [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] scoreboard_d ; + quadrilatero_pkg::rw_queue_t [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] scoreboard_q ; genvar ii,hh; - assign rw_queue_pop = w_pop | r_pop | ~head_valid; - assign rw_queue_entry = rw_queue_entry_i ; - assign rw_queue_push = rw_queue_push_i ; + assign rw_queue_pop = w_pop | r_pop | ~head_valid; + always_comb begin: rw_queue_block + rw_queue_entry = '0; + rw_queue_push = '0; + if(quadrilatero_pkg::N_TILES == 1) begin //technically this if is not needed + rw_queue_entry = rw_queue_entry_i ; + rw_queue_push = rw_queue_push_i ; + end else begin + for (int jj = 0; jj < quadrilatero_pkg::N_IREGS; jj++) begin + for (int ii = 0; ii < N_REGS ; ii++) begin + if(jj >> quadrilatero_pkg::TILE_ADDR == ii) begin + rw_queue_entry[jj] = rw_queue_entry_i[ii]; + rw_queue_push[jj] = rw_queue_push_i[ii]; + end + end + end + end + end + - logic [N_REGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; + logic [quadrilatero_pkg::N_IREGS-1:0][N_ROWS-1:0] rw_queue_pop_fifo ; assign rw_queue_pop_fifo = rw_queue_pop & ~rw_queue_empty; - for (ii = 0; ii < N_REGS; ii++) begin: gen_fifo__regs + for (ii = 0; ii < quadrilatero_pkg::N_IREGS; ii++) begin: gen_fifo__regs for (hh = 0; hh < 
N_ROWS; hh++) begin: gen_fifo__rows fifo_v3 #( .FALL_THROUGH (1'b1) , @@ -111,9 +127,9 @@ module quadrilatero_rf_sequencer #( always_comb begin: scoreboard_block rw_queue_full_o = '0; - for (int i = 0; i < N_REGS; i++) begin + for (int i = 0; i < quadrilatero_pkg::N_IREGS; i++) begin for (int h = 0; h < N_ROWS; h++) begin - rw_queue_full_o[i] |= (rw_queue_full[i][h]); + rw_queue_full_o[i>>quadrilatero_pkg::TILE_ADDR] |= (rw_queue_full[i][h]); //TODO: change this head_valid[i][h] = scoreboard_q[i][h].valid; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index 9909f2327..696fafa33 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -35,33 +35,33 @@ module quadrilatero_systolic_array #( input logic [xif_pkg::X_ID_WIDTH-1:0] id_i , // id of the instruction // Weight Read Register Port - output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] weight_raddr_o , output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] weight_rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_i , + input logic [ALEN-1:0] weight_rdata_i , input logic weight_rdata_valid_i, output logic weight_rdata_ready_o, output logic weight_rlast_o , // Data Read Register Port - output logic [ $clog2(N_REGS)-1:0] data_raddr_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] data_raddr_o , output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] data_rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] data_rdata_i , + input logic [ALEN-1:0] data_rdata_i , input logic data_rdata_valid_i , output logic data_rdata_ready_o , output logic data_rlast_o , // Accumulator Read Register Port - output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] acc_raddr_o , output logic [ 
$clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] acc_rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_i , + input logic [ALEN-1:0] acc_rdata_i , input logic acc_rdata_valid_i , output logic acc_rdata_ready_o , output logic acc_rlast_o , // Accumulator Out Write Register Port - output logic [ $clog2(N_REGS)-1:0] res_waddr_o , + output logic [ $clog2(quadrilatero_pkg::N_IREGS)-1:0] res_waddr_o , output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] res_wrowaddr_o , - output logic [quadrilatero_pkg::RLEN-1:0] res_wdata_o , + output logic [ALEN-1:0] res_wdata_o , output logic res_we_o , output logic res_wlast_o , input logic res_wready_i , @@ -110,27 +110,8 @@ module quadrilatero_systolic_array #( logic [$clog2(K)-1:0] ff_row_counter_q; logic [$clog2(K)-1:0] dr_row_counter_d; logic [$clog2(K)-1:0] dr_row_counter_q; - logic [$clog2(quadrilatero_pkg::RLEN/DATA_WIDTH)-1:0] weight_base_row; logic last_dr_write; - // Data Masks - logic [quadrilatero_pkg::RLEN-1:0] data_mask; - logic [quadrilatero_pkg::RLEN-1:0] weight_mask; - logic [quadrilatero_pkg::RLEN-1:0] acc_mask; - logic [quadrilatero_pkg::RLEN-1:0] res_mask; - - logic [ALEN-1:0] data_rdata_masked; - logic [quadrilatero_pkg::RLEN-1:0] data_rdata_shifted; - logic [ALEN-1:0] weight_rdata_masked; - logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_shifted; - logic [ALEN-1:0] acc_rdata_masked; - logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_shifted; - logic [ALEN-1:0] res_wdata_partial; - logic [quadrilatero_pkg::RLEN-1:0] res_rdata_shifted; - logic [(N_ROWS*K)-1:0][quadrilatero_pkg::RLEN-1:0] res_wdata_buffer_d; - logic [(N_ROWS*K)-1:0][quadrilatero_pkg::RLEN-1:0]res_wdata_buffer_q; - - logic valid ; logic ff_valid; logic clear ; @@ -180,39 +161,38 @@ module quadrilatero_systolic_array #( always_comb begin: rf_block // Weight Read Register Port - weight_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); - weight_base_row = N_ROWS * ff_it_counter_q; - weight_raddr_o = weight_reg_q ; - weight_rrowaddr_o = 
ff_counter_q + weight_base_row; - weight_rdata_shifted = (weight_rdata_i << ALEN * ff_k_counter_q); - weight_rdata_masked = weight_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN]; + weight_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = weight_reg_q ; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + weight_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_it_counter_q, (K-1-ff_k_counter_q)}; + end + weight_rrowaddr_o = ff_counter_q; weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_k_counter_q == (K-1) && ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? // Data Read Register Port - data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} << (ALEN * ff_it_counter_q); - data_raddr_o = data_reg_q ; - data_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - data_rdata_shifted = (data_rdata_i >> ALEN * ff_it_counter_q); - data_rdata_masked = data_rdata_shifted[ALEN-1:0] ; + data_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = data_reg_q ; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + data_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_row_counter_q, ff_it_counter_q}; + end + data_rrowaddr_o = ff_counter_q; data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; data_rlast_o = ff_state_q != FF_IDLE && (ff_it_counter_q == (K-1) && ff_k_counter_q == (K-1)) ; // Accumulator Read Register Port - acc_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); - acc_raddr_o = acc_reg_q ; - acc_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - acc_rdata_shifted = (acc_rdata_i << ALEN * ff_k_counter_q); - res_rdata_shifted = (res_wdata_o << ALEN * ff_k_counter_q); - acc_rdata_masked = sa_input_id_o == sa_output_id_o && acc_rrowaddr_o == res_wrowaddr_o? 
- res_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN] : acc_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN]; //TODO fix + acc_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = acc_reg_q ; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + acc_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_row_counter_q, (K-1-ff_k_counter_q)}; + end + acc_rrowaddr_o = ff_counter_q; acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; acc_rlast_o = '0 ; // Accumulator Out Write Register Port - res_waddr_o = dest_reg_q ; - res_wrowaddr_o = dr_counter_q + (dr_row_counter_q * N_ROWS) ; - res_wdata_o = res_wdata_buffer_d[res_wrowaddr_o]; + res_waddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = dest_reg_q ; + if(quadrilatero_pkg::TILE_ADDR != 0) begin + res_waddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {dr_row_counter_q, (K-1-dr_k_counter_q)}; + end + res_wrowaddr_o = dr_counter_q; res_we_o = (dr_state_q == DR_ACTIVE || last_dr_write == 1'b1) &~ mask_req; res_wlast_o = (dr_state_q != DR_IDLE) && (dr_it_counter_q == (K-1) && dr_k_counter_q == (K-1)); end @@ -228,7 +208,7 @@ module quadrilatero_systolic_array #( always_comb begin: ctrl_block valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q == DR_IDLE)) begin + if((ff_state_q == FF_IDLE) && (fs_state_q == FS_IDLE) && (dr_state_q == DR_IDLE)) begin clear = 1'b1; end else begin clear = 1'b0; @@ -271,7 +251,6 @@ module quadrilatero_systolic_array #( end FF_ACTIVE: begin if(valid == 1'b1) begin - ff_valid = 1'b1; if(ff_counter_q==(LastRow-1)) begin ff_counter_d = ff_counter_q + 1; ff_state_d = FF_DONE; @@ -332,7 +311,7 @@ module quadrilatero_systolic_array #( unique case(fs_state_q) FS_IDLE: begin fs_counter_d = '0; - if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin + if(ff_state_q == FF_DONE) begin fs_state_d = FS_ACTIVE; acc_fs_d = acc_reg_q; @@ -355,7 +334,7 @@ 
module quadrilatero_systolic_array #( end FS_LAST: begin fs_counter_d = '0; - if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin //stay in active mode, load new inputs + if(ff_state_q == FF_DONE) begin //stay in active mode, load new inputs fs_state_d = FS_ACTIVE; acc_fs_d = acc_reg_q; @@ -448,7 +427,7 @@ module quadrilatero_systolic_array #( if((fs_state_q == FS_IDLE && dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow-1) && dr_k_counter_q == (K-1))) begin last_dr_write = 1'b1; if(res_wready_i == 1'b0) begin - dr_state_d = dr_state_q; //stall + dr_state_d = DR_IDLE; //TODO: check if this is correct end else begin dr_state_d = DR_DONE; if(dr_counter_q == LastRow) begin @@ -473,40 +452,6 @@ module quadrilatero_systolic_array #( endcase end - always_comb begin : res_buffer_block // if RLEN == ALEN we don't need the weight buffer - res_wdata_buffer_d = res_wdata_buffer_q; - res_mask = {ALEN{1'b1}} << (ALEN*((K-1)-dr_k_counter_q)); - if(ff_state_q != FF_IDLE) begin - res_wdata_buffer_d[acc_rrowaddr_o] = acc_rdata_i; - end - if(dr_state_q != DR_IDLE) begin - res_wdata_buffer_d[res_wrowaddr_o] = (res_wdata_buffer_q[res_wrowaddr_o] & ~res_mask) | (res_wdata_partial << (ALEN*((K-1)-dr_k_counter_q))); - end - end - - // fifo_v3 #( - // .FALL_THROUGH (1'b0 ), - // .DEPTH (5 ), //TODO change - // .DATA_WIDTH (quadrilatero_pkg::RLEN ) - // ) acc_buffer_i ( - // .clk_i , - // .rst_ni , - // .flush_i (1'b0 ), - // .testmode_i (1'b0 ), - - // // status flags - // .full_o (acc_fifo_full ), - // .empty_o (acc_fifo_empty ), - // .usage_o (acc_fifo_usage ), - // // as long as the queue is not full we can push new data - // .data_i (acc_rdata_i ), - // .push_i (pump ), - // // as long as the queue is not empty we can pop new elements - // .data_o (acc_fifo_wdata ), - // .pop_i (pump & ~acc_fifo_empty ) - // ); - //end - quadrilatero_skewer #( .MESH_WIDTH(MESH_WIDTH), @@ -515,7 +460,7 @@ module quadrilatero_systolic_array #( .clk_i , .rst_ni , .pump_i (pump ), - .data_i 
(data_rdata_masked ), + .data_i (data_rdata_i ), .data_o (data_mesh_skewed) ); @@ -526,7 +471,7 @@ module quadrilatero_systolic_array #( .clk_i , .rst_ni , .pump_i (pump ), - .data_i (acc_rdata_masked ), + .data_i (acc_rdata_i ), .data_o (acc_mesh_skewed) ); @@ -554,7 +499,7 @@ module quadrilatero_systolic_array #( .weight_rdata_valid_i , // Weight Data - .weight_rdata_i (weight_rdata_masked ), + .weight_rdata_i (weight_rdata_i ), .weight_rdata_o (weight_mesh_skewed ) ); @@ -583,7 +528,7 @@ module quadrilatero_systolic_array #( .rst_ni , .pump_i (pump ), .data_i (res_mesh_skewed), - .data_o (res_wdata_partial ) + .data_o (res_wdata_o ) ); always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block @@ -611,7 +556,6 @@ module quadrilatero_systolic_array #( dr_it_counter_q <= '0; ff_row_counter_q <= '0; dr_row_counter_q <= '0; - res_wdata_buffer_q <= '0; end else begin ff_counter_q <= ff_counter_d ; fs_counter_q <= fs_counter_d ; @@ -636,12 +580,10 @@ module quadrilatero_systolic_array #( dr_it_counter_q <= dr_it_counter_d; ff_row_counter_q <= ff_row_counter_d ; dr_row_counter_q <= dr_row_counter_d ; - res_wdata_buffer_q <= res_wdata_buffer_d ; end end - assign sa_ready_o = (ff_counter_d=='0) & (((ff_state_q != FF_IDLE) &~ ff_counter_q=='0 - && (ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1))) | clear); + assign sa_ready_o = ff_state_q != FF_ACTIVE ; // && fs_state_q == FS_IDLE? 
assign sa_input_id_o = id_ff_q ; assign sa_output_id_o = id_dr_q ; assign finished_o = finished_q ; diff --git a/sw/applications/quadrilatero_easy_8x8/main.c b/sw/applications/quadrilatero_easy_8x8/main.c index aa76d60e5..55e6c58c1 100644 --- a/sw/applications/quadrilatero_easy_8x8/main.c +++ b/sw/applications/quadrilatero_easy_8x8/main.c @@ -221,11 +221,33 @@ void __attribute__ ((noinline)) matrixMul_easy(DATA_IN_t* addrA,DATA_IN_t* add asm volatile("mul s9,s3,t1 " ); // s9 = K*4*n0; asm volatile("add s9 ,%0,s9 " :: "r" (addrB) ); // s9 = startAddrB0 = addrB + K*4*n0 asm volatile("mld.w m1, (s9) , a6 " ); // m1 = B[s9] + + asm volatile("mld.w m2, (s1) , s3 " ); // m0 = A[s1] + asm volatile("mzero m3 " ); // m4 = 0; + asm volatile("mld.w m5, (s9) , a6 " ); // m1 = B[s9] + asm volatile("mul s11,s3,t4 " ); // s11 = K*4*(n0+WIDTH); + asm volatile(MACC(HEAD_LINE,4,1,0) ); // m4 += m1 * m0 + asm volatile(MACC(HEAD_LINE,3,5,2) ); // m4 += m1 * m0 + asm volatile("add s11,%0,s11 " :: "r" (addrB) ); // s11 = startAddrB1 = addrB + K*4*(n0+WIDTH) + asm volatile("add s6,t5,0 " ); // s6 = startAddrC00 += n0*4 + asm volatile("mst.w m4, (s0) , s4 " ); // m4 -> (s6) + asm volatile("mst.w m3, (s0) , s4 " ); // m4 -> (s6) + asm volatile("mld.w m0, (s1) , s3 " ); // m0 = A[s1] + asm volatile("mld.w m4, (s0), s3 " ); // m4 = 0; + asm volatile("mul s9,s3,t1 " ); // s9 = K*4*n0; + asm volatile("add s9 ,%0,s9 " :: "r" (addrB) ); // s9 = startAddrB0 = addrB + K*4*n0 + asm volatile("mld.w m1, (s9) , a6 " ); // m1 = B[s9] + + asm volatile("mld.w m2, (s1) , s3 " ); // m0 = A[s1] + asm volatile("mld.w m3, (s0), s3 " ); // m4 = 0; + asm volatile("mld.w m5, (s9) , a6 " ); // m1 = B[s9] asm volatile("mul s11,s3,t4 " ); // s11 = K*4*(n0+WIDTH); asm volatile(MACC(HEAD_LINE,4,1,0) ); // m4 += m1 * m0 + asm volatile(MACC(HEAD_LINE,3,5,2) ); // m4 += m1 * m0 asm volatile("add s11,%0,s11 " :: "r" (addrB) ); // s11 = startAddrB1 = addrB + K*4*(n0+WIDTH) asm volatile("add s6,t5,0 " ); // s6 = 
startAddrC00 += n0*4 asm volatile("mst.w m4, (s0) , s4 " ); // m4 -> (s6) + asm volatile("mst.w m3, (s0) , s4 " ); // m4 -> (s6) asm volatile("slli t6,t4, 2 " ); // t6 = (n0+WIDTH)*4; asm volatile("add s5,t6,s0 " ); // s5 = startAddrC01 += (n0+WIDTH)*4 diff --git a/sw/applications/quadrilatero_matmul_16x16/main.c b/sw/applications/quadrilatero_matmul_16x16/main.c index 112c176f4..990d7c63f 100644 --- a/sw/applications/quadrilatero_matmul_16x16/main.c +++ b/sw/applications/quadrilatero_matmul_16x16/main.c @@ -524,7 +524,103 @@ void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* ad //-------------------------------------------------------------------------------- //-------------------------------------------------------------------------------- - + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); 
// m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm 
volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("mzero m4 " ); // m4 = 0; + asm volatile("lw s0 , 0x2c(sp) " ); // From afaf00aff485d3ad962976e63e68668144801e8a Mon Sep 17 00:00:00 2001 From: Angelo Nujic Date: Fri, 2 May 2025 18:20:28 +0200 Subject: [PATCH 16/18] fixed bugs: clear signal in SA, row_counter in lsu, address calculation for 16x16 C code --- .../quadrilatero/rtl/quadrilatero.sv | 10 +- .../quadrilatero/rtl/quadrilatero_lsu.sv | 53 ++- .../rtl/quadrilatero_lsu_kindofworking.sv | 317 ------------------ ...ero_lsu_new.sv => quadrilatero_lsu_old.sv} | 24 +- .../rtl/quadrilatero_perm_unit.sv | 36 +- .../quadrilatero/rtl/quadrilatero_regfile.sv | 2 +- .../rtl/quadrilatero_register_lsu.sv | 36 +- .../rtl/quadrilatero_systolic_array.sv | 30 +- .../quadrilatero_matmul_16x16/main.c | 8 +- 9 files changed, 139 insertions(+), 377 deletions(-) delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_kindofworking.sv rename hw/ip_examples/quadrilatero/rtl/{quadrilatero_lsu_new.sv => quadrilatero_lsu_old.sv} (92%) diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv index badd478ab..77452023a 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero.sv @@ -126,7 +126,7 @@ module quadrilatero // RF 
Sequencer - logic [quadrilatero_pkg::READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_REGS)-1:0] rf_seq_raddr_from_fu ; + logic [quadrilatero_pkg::READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] rf_seq_raddr_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0][$clog2(quadrilatero_pkg::N_ROWS)-1:0] rf_seq_rrowaddr_from_fu; logic [quadrilatero_pkg::READ_PORTS-1 :0][quadrilatero_pkg::LEN-1:0] rf_seq_rdata_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rvalid_from_fu ; @@ -134,7 +134,7 @@ module quadrilatero logic [quadrilatero_pkg::READ_PORTS-1 :0] rf_seq_rready_from_fu ; logic [quadrilatero_pkg::READ_PORTS-1 :0][xif_pkg::X_ID_WIDTH-1:0] rf_seq_rd_id_from_fu ; - logic [quadrilatero_pkg::WRITE_PORTS-1 :0][$clog2(quadrilatero_pkg::N_REGS)-1:0] rf_seq_waddr_from_fu ; + logic [quadrilatero_pkg::WRITE_PORTS-1 :0][$clog2(quadrilatero_pkg::N_IREGS)-1:0] rf_seq_waddr_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0][$clog2(quadrilatero_pkg::N_ROWS)-1:0] rf_seq_wrowaddr_from_fu; logic [quadrilatero_pkg::WRITE_PORTS-1 :0][quadrilatero_pkg::LEN-1:0] rf_seq_wdata_from_fu ; logic [quadrilatero_pkg::WRITE_PORTS-1 :0] rf_seq_we_from_fu ; @@ -227,14 +227,14 @@ module quadrilatero logic lsu_wready ; logic [xif_pkg::X_ID_WIDTH-1:0] lsu_id ; logic [quadrilatero_pkg::LEN-1:0] lsu_wdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] lsu_waddr ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] lsu_waddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] lsu_wrowaddr ; logic lsu_rlast ; logic lsu_rready ; logic lsu_rvalid ; logic [quadrilatero_pkg::LEN-1:0] lsu_rdata ; - logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] lsu_raddr ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] lsu_raddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] lsu_rrowaddr ; logic lsu_busy ; @@ -255,7 +255,7 @@ module quadrilatero logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_instr_id ; logic [xif_pkg::X_ID_WIDTH-1:0] perm_unit_finished_instr_id; logic [quadrilatero_pkg::LEN-1:0] perm_unit_wdata ; - logic 
[$clog2(quadrilatero_pkg::N_REGS)-1:0] perm_unit_waddr ; + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] perm_unit_waddr ; logic [$clog2(quadrilatero_pkg::N_ROWS)-1:0] perm_unit_wrowaddr ; logic [$clog2(quadrilatero_pkg::N_REGS)-1:0] perm_unit_reg ; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv index 579a10a56..d179eea36 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv @@ -52,6 +52,8 @@ module quadrilatero_lsu #( localparam int unsigned DEPTH = (FIFO_DEPTH > 0) ? FIFO_DEPTH - 1 : 0; localparam int unsigned Addr_Fifo_Depth = (FIFO_DEPTH > 1) ? $clog2(FIFO_DEPTH) : 1; localparam int unsigned LastFifoUsage = DEPTH - 1; + localparam int unsigned LastRow = quadrilatero_pkg::MESH_WIDTH-1; + localparam int unsigned LastCol = quadrilatero_pkg::TILE_ADDR-1; logic terminate ; @@ -100,7 +102,10 @@ module quadrilatero_lsu #( logic store_fifo_empty ; logic [ DATA_WIDTH-1:0] store_fifo_output ; logic store_fifo_pop ; - + logic [$clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] row_counter_d; + logic [$clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] row_counter_q; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1:0] col_counter_q; //TODO: check cols counter when TILE_ADDR == 0 + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1:0] col_counter_d; enum { LSU_READY, @@ -135,17 +140,37 @@ module quadrilatero_lsu #( always_comb begin : addr_block src_ptr_inc = DATA_WIDTH / 8; - addr_op2 = (cols_q == '0) ? stride_i : src_ptr_inc; - addr = (start_i || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1))) ? src_ptr_i : ptr_q + addr_op2; + addr_op2 = (stride_i * row_counter_q) + (src_ptr_inc * col_counter_q); + addr = (start_i || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1))) ? src_ptr_i : src_ptr_i + addr_op2; ptr_d = (data_gnt_i && data_req_o) ? 
addr : ptr_q; end always_comb begin : counters_block rows_d = rows_q; cols_d = cols_q; + row_counter_d = row_counter_q; + col_counter_d = col_counter_q; if(start_i) begin if(data_gnt_i && data_req_o) begin + if(quadrilatero_pkg::TILE_ADDR != 0) begin + if(col_counter_q == LastCol) begin + col_counter_d = '0; + if(row_counter_q == LastRow) begin + row_counter_d = '0; + end else begin + row_counter_d = row_counter_q + 1; + end + end else begin + col_counter_d = col_counter_d + 1; + end + end else begin + if(row_counter_q == LastRow) begin + row_counter_d = '0; + end else begin + row_counter_d = row_counter_q + 1; + end + end if(cols_i > 1) begin rows_d = rows_i - 1; cols_d = cols_i - 2; @@ -158,6 +183,24 @@ module quadrilatero_lsu #( cols_d = cols_i - 1; end end else if (data_gnt_i && data_req_o) begin + if(quadrilatero_pkg::TILE_ADDR != 0) begin + if(col_counter_q == LastCol) begin + col_counter_d = '0; + if(row_counter_q == LastRow) begin + row_counter_d = '0; + end else begin + row_counter_d = row_counter_q + 1; + end + end else begin + col_counter_d = col_counter_d + 1; + end + end else begin + if(row_counter_q == LastRow) begin + row_counter_d = '0; + end else begin + row_counter_d = row_counter_q + 1; + end + end if (cols_q > 0) cols_d = cols_q - 1; else if (rows_q > 0) begin cols_d = cols_i - 1; @@ -296,6 +339,8 @@ module quadrilatero_lsu #( rd_head_q <= '0 ; rd_valid_q <= '0 ; data_we_q <= '0 ; + row_counter_q <= '0 ; + col_counter_q <= '0 ; end else begin lsu_state_q <= lsu_state_d; ptr_q <= ptr_d ; @@ -304,6 +349,8 @@ module quadrilatero_lsu #( rd_head_q <= rd_head_d ; rd_valid_q <= rd_valid_d ; data_we_q <= data_we_d ; + row_counter_q <= row_counter_d; + col_counter_q <= col_counter_d; end end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_kindofworking.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_kindofworking.sv deleted file mode 100644 index f6931f653..000000000 --- 
a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_kindofworking.sv +++ /dev/null @@ -1,317 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. -// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -module quadrilatero_lsu #( - parameter int unsigned FIFO_DEPTH = quadrilatero_pkg::MESH_WIDTH, - parameter int unsigned DATA_WIDTH = 32 - -) ( - input logic clk_i , - input logic rst_ni , - - // Bus interface - output logic data_req_o , - output logic [ 31:0] data_addr_o , - output logic data_we_o , - output logic [DATA_WIDTH/8 - 1:0] data_be_o , - output logic [ DATA_WIDTH-1:0] data_wdata_o , - input logic data_gnt_i , - input logic data_rvalid_i , - input logic [ DATA_WIDTH-1:0] data_rdata_i , - - // Configuration - input logic start_i , // start transfer (MUST BE A PULSE!!!!!) - input logic write_i , // write transaction - output logic busy_o , // lsu available - output logic terminate_o , // lsu done - input logic last_i, - input logic access_counter_match_i, - - // Address - input logic [ 31:0] src_ptr_i , // base address - input logic [ 31:0] stride_i , // stride to move in memory from one row to the next one - input logic [ 31:0] rows_i , // how many rows we need to fetch - input logic [ 31:0] cols_i , - - // Output data - output logic [ DATA_WIDTH-1:0] load_fifo_output_o , - output logic load_fifo_valid_o , - output logic load_fifo_data_available_o , - input logic load_fifo_output_pop_i , - - // Input data - input logic [ DATA_WIDTH-1:0] store_fifo_input_i , - input logic store_fifo_push_i , - output logic store_fifo_space_available_o, - output logic store_fifo_empty_o - - -); - - localparam int unsigned DEPTH = (FIFO_DEPTH > 0) ? FIFO_DEPTH - 1 : 0; - localparam int unsigned Addr_Fifo_Depth = (FIFO_DEPTH > 1) ? 
$clog2(FIFO_DEPTH) : 1; - localparam int unsigned LastFifoUsage = DEPTH - 1; - - - logic terminate ; - - logic [ 31:0] rows_q ; - logic [ 31:0] rows_d ; - logic [ 31:0] cols_q ; - logic [ 31:0] cols_d ; - logic [ 31:0] src_ptr_inc ; - logic [ 31:0] addr ; - logic [ 31:0] addr_op2 ; - logic [ 31:0] ptr_q ; - logic [ 31:0] ptr_d ; - - logic data_in_req ; - logic data_in_we ; - logic [ DATA_WIDTH/8-1:0] data_in_be ; - logic [ 31:0] data_in_addr ; - logic data_in_rvalid ; - logic [ DATA_WIDTH-1:0] data_in_rdata ; - - logic [ DATA_WIDTH-1:0] load_fifo_input ; - logic [ DATA_WIDTH-1:0] load_fifo_data_out; - logic rd_valid_q ; - logic rd_valid_d ; - logic [ DATA_WIDTH-1:0] rd_head_q ; - logic [ DATA_WIDTH-1:0] rd_head_d ; - logic data_we_q ; - logic data_we_d ; - logic rvalid ; - logic load_fifo_pop ; - logic load_fifo_push ; - logic [Addr_Fifo_Depth-1:0] load_fifo_usage ; - logic load_fifo_alm_full; - logic load_fifo_full ; - logic load_fifo_empty ; - - logic data_out_req ; - logic data_out_we ; - logic [ DATA_WIDTH/8-1:0] data_out_be ; - logic [ 31:0] data_out_addr ; - logic data_out_gnt ; - logic [ DATA_WIDTH-1:0] data_out_wdata ; - - logic store_fifo_full ; - logic store_fifo_empty ; - logic [ DATA_WIDTH-1:0] store_fifo_output ; - logic store_fifo_pop ; - logic last_q ; - logic last_d ; - - - enum { - LSU_READY, - LSU_RUNNING - } - lsu_state_q, lsu_state_d; - - - assign last_d = last_i; - always_comb begin : FSM_block - lsu_state_d = lsu_state_q; - - case (lsu_state_q) - LSU_READY: begin - if (start_i & |cols_i & |rows_i) begin - lsu_state_d = LSU_RUNNING; - end - end - LSU_RUNNING: begin - if (terminate && !start_i && (store_fifo_empty)) begin - lsu_state_d = LSU_READY; - end - end - endcase - end - - always_comb begin : ctrl_block - terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING) && (!last_i || write_i)); // !last_i only for RLEN/LLEN = 2, in other cases work with access counter - load_fifo_valid_o = 
rd_valid_d; - busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; - terminate_o = terminate; - end - - always_comb begin : addr_block - src_ptr_inc = DATA_WIDTH / 8; - addr_op2 = (cols_q == '0) ? stride_i : src_ptr_inc; - addr = (start_i) ? ((src_ptr_i == ptr_q) ? ptr_q + addr_op2 : src_ptr_i) : ((write_i && !data_we_q)? ptr_q : ptr_q + addr_op2); //what happens when 2 loads don't load from subsequent addresses? - ptr_d = ((data_gnt_i && data_req_o) || start_i) ? addr : ptr_q; - end - - always_comb begin : counters_block - rows_d = rows_q; - cols_d = cols_q; - - if(start_i) begin - if(data_gnt_i && data_req_o) begin - if(cols_i > 1) begin - rows_d = rows_i - 1; - cols_d = cols_i - 2; - end else if (rows_i > 1) begin - rows_d = rows_i - 1; - cols_d = cols_i - 1; - end - end else begin - rows_d = rows_i - 1; - cols_d = cols_i - 1; - end - end else if (data_gnt_i && data_req_o && last_i) begin - if (cols_q > 0) cols_d = cols_q - 1; - else if (rows_q > 0) begin - cols_d = cols_i - 1; - rows_d = rows_q - 1; - end - end - end - - always_comb begin : read_obi - data_in_req = '0; - data_in_we = '0; - data_in_be = '0; - data_in_addr = '0; - - if (load_fifo_full == 1'b0 && load_fifo_alm_full == 1'b0) begin - data_in_req = ~write_i & (start_i | lsu_state_q == LSU_RUNNING); - data_in_we = 1'b0 ; - data_in_be = '1 ; - data_in_addr = addr ; - end - end - - always_comb begin : write_obi - data_out_req = '0 ; - data_out_we = '0 ; - data_out_be = '0 ; - data_out_addr = '0 ; - data_out_wdata = store_fifo_output; - - if (!store_fifo_empty) begin - data_out_req = start_i | lsu_state_q == LSU_RUNNING; - // data_out_we = 1'b1 ; - data_out_we = start_i | lsu_state_q == LSU_RUNNING; - data_out_be = '1 ; - data_out_addr = addr ; - end - end - - always_comb begin : obi_channel_signals - data_in_rvalid = 1'b0 ; - data_wdata_o = data_out_wdata; - data_out_gnt = data_gnt_i ; - data_in_rdata = data_rdata_i ; - - if(store_fifo_empty) begin // read transaction active - data_req_o = data_in_req ; 
- data_we_o = data_in_we ; - data_be_o = data_in_be ; - data_addr_o = data_in_addr ; - data_in_rvalid = data_rvalid_i ; - end else begin // write transaction active - data_req_o = data_out_req ; - data_we_o = data_out_we ; - data_be_o = data_out_be ; - data_addr_o = data_out_addr ; - end - end - - always_comb begin : load_fifo_block - data_we_d = data_gnt_i && data_req_o && data_we_o; - rvalid = data_in_rvalid &~ data_we_q ; - - load_fifo_alm_full = (load_fifo_usage == LastFifoUsage[Addr_Fifo_Depth-1:0]); - load_fifo_input = data_in_rdata; - load_fifo_push = (rvalid & rd_valid_q & ~load_fifo_output_pop_i) | (rvalid & ~load_fifo_empty); - load_fifo_pop = load_fifo_output_pop_i & ~load_fifo_empty; - - rd_valid_d = (rvalid & ~rd_valid_q) ? 1'b1 : - (load_fifo_output_pop_i & - load_fifo_empty & ~data_gnt_i) ? 1'b0 : rd_valid_q; - - rd_head_d = (load_fifo_output_pop_i & load_fifo_empty & rvalid) || - (rvalid & ~rd_valid_q) ? load_fifo_input : - (load_fifo_output_pop_i & ~load_fifo_empty) ? load_fifo_data_out : rd_head_q; - - load_fifo_output_o = rd_head_q ; - load_fifo_data_available_o = rd_valid_q; - end - - always_comb begin : store_fifo_block - store_fifo_pop = data_out_gnt & data_out_req; - store_fifo_empty_o = store_fifo_empty; - store_fifo_space_available_o = ~store_fifo_full; - end - - fifo_v3 #( - .FALL_THROUGH (1'b0 ), - .DEPTH (DEPTH ), - .DATA_WIDTH (DATA_WIDTH ) - ) load_lsu_fifo_i ( - .clk_i , - .rst_ni , - .flush_i (1'b0 ), - .testmode_i (1'b0 ), - - // status flags - .full_o (load_fifo_full ), - .empty_o (load_fifo_empty ), - .usage_o (load_fifo_usage ), - - // as long as the queue is not full we can push new data - .data_i (load_fifo_input ), - .push_i (load_fifo_push ), - - // as long as the queue is not empty we can pop new elements - .data_o (load_fifo_data_out ), - .pop_i (load_fifo_pop ) - ); - - fifo_v3 #( - .DEPTH(FIFO_DEPTH), - .DATA_WIDTH(DATA_WIDTH) - ) store_lsu_fifo_i ( - .clk_i , - .rst_ni , - .flush_i (1'b0 ), - .testmode_i (1'b0 ), - // 
status flags - .full_o (store_fifo_full ), - .empty_o (store_fifo_empty ), - .usage_o ( ), - // as long as the queue is not full we can push new data - .data_i (store_fifo_input_i ), - .push_i (store_fifo_push_i ), - // as long as the queue is not empty we can pop new elements - .data_o (store_fifo_output ), - .pop_i (store_fifo_pop ) - ); - - always_ff @(posedge clk_i, negedge rst_ni) begin : seq_block - if (~rst_ni) begin - lsu_state_q <= LSU_READY; - ptr_q <= '0 ; - last_q <= '0 ; - rows_q <= '0 ; - cols_q <= '0 ; - rd_head_q <= '0 ; - rd_valid_q <= '0 ; - data_we_q <= '0 ; - end else begin - lsu_state_q <= lsu_state_d; - ptr_q <= ptr_d ; - last_q <= last_d ; - rows_q <= rows_d ; - cols_q <= cols_d ; - rd_head_q <= rd_head_d ; - rd_valid_q <= rd_valid_d ; - data_we_q <= data_we_d ; - end - end - -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_new.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv similarity index 92% rename from hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_new.sv rename to hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv index 1614ea3d3..579a10a56 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_new.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv @@ -5,7 +5,7 @@ // Author: Danilo Cammarata module quadrilatero_lsu #( - parameter int unsigned FIFO_DEPTH = quadrilatero_pkg::MESH_WIDTH, + parameter int unsigned FIFO_DEPTH = 4, parameter int unsigned DATA_WIDTH = 32 ) ( @@ -27,9 +27,7 @@ module quadrilatero_lsu #( input logic write_i , // write transaction output logic busy_o , // lsu available output logic terminate_o , // lsu done - input logic last_i, - input logic access_counter_match_i, - + // Address input logic [ 31:0] src_ptr_i , // base address input logic [ 31:0] stride_i , // stride to move in memory from one row to the next one @@ -102,8 +100,6 @@ module quadrilatero_lsu #( logic store_fifo_empty ; logic [ DATA_WIDTH-1:0] store_fifo_output ; logic store_fifo_pop ; 
- logic last_q ; - logic last_d ; enum { @@ -113,7 +109,6 @@ module quadrilatero_lsu #( lsu_state_q, lsu_state_d; - assign last_d = last_i; always_comb begin : FSM_block lsu_state_d = lsu_state_q; @@ -124,7 +119,7 @@ module quadrilatero_lsu #( end end LSU_RUNNING: begin - if (terminate && !start_i && (store_fifo_empty)) begin + if (terminate && !start_i) begin lsu_state_d = LSU_READY; end end @@ -132,8 +127,7 @@ module quadrilatero_lsu #( end always_comb begin : ctrl_block - terminate = ((rows_q == 'b1 && last_i && !write_i) - && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING)); //absolutely ugly + terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING)); load_fifo_valid_o = rd_valid_d; busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; terminate_o = terminate; @@ -142,7 +136,7 @@ module quadrilatero_lsu #( always_comb begin : addr_block src_ptr_inc = DATA_WIDTH / 8; addr_op2 = (cols_q == '0) ? stride_i : src_ptr_inc; - addr = (start_i) ? src_ptr_i : ptr_q + addr_op2; // || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1)) && (access_counter_match_i == 1'b0) + addr = (start_i || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1))) ? src_ptr_i : ptr_q + addr_op2; ptr_d = (data_gnt_i && data_req_o) ? 
addr : ptr_q; end @@ -156,14 +150,14 @@ module quadrilatero_lsu #( rows_d = rows_i - 1; cols_d = cols_i - 2; end else if (rows_i > 1) begin - rows_d = rows_i - 1; + rows_d = rows_i - 2; cols_d = cols_i - 1; end end else begin rows_d = rows_i - 1; cols_d = cols_i - 1; end - end else if (data_gnt_i && data_req_o && last_i) begin + end else if (data_gnt_i && data_req_o) begin if (cols_q > 0) cols_d = cols_q - 1; else if (rows_q > 0) begin cols_d = cols_i - 1; @@ -297,7 +291,6 @@ module quadrilatero_lsu #( if (~rst_ni) begin lsu_state_q <= LSU_READY; ptr_q <= '0 ; - last_q <= '0 ; rows_q <= '0 ; cols_q <= '0 ; rd_head_q <= '0 ; @@ -306,7 +299,6 @@ module quadrilatero_lsu #( end else begin lsu_state_q <= lsu_state_d; ptr_q <= ptr_d ; - last_q <= last_d ; rows_q <= rows_d ; cols_q <= cols_d ; rd_head_q <= rd_head_d ; @@ -315,4 +307,4 @@ module quadrilatero_lsu #( end end -endmodule +endmodule \ No newline at end of file diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv index 9de2a46c9..97017a957 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv @@ -58,8 +58,13 @@ module quadrilatero_perm_unit #( logic mask_req ; logic fifo_full ; logic fifo_empty ; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1 : 0] row_counter_d; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1 : 0] row_counter_q; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1 : 0] cols_counter_d; + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1 : 0] cols_counter_q; localparam int unsigned USAGE = DEPTH > 1 : $clog2(DEPTH) : 0; + localparam int unsigned TILES = RLEN / quadrilatero_pkg::LEN; logic [USAGE:0] fifo_usage; logic fifo_almost_full; //---------------------------------------------------------------------------------------------------------- @@ -86,7 +91,8 @@ module quadrilatero_perm_unit #( always_comb begin : ctrl_block mask_req = (counter_q == 
$clog2(N_ROWS)'(N_ROWS - 1)) & finished_q & ~finished_ack_i; - finished = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) & write_started_q & wready_i ; + finished = (counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) && (row_counter_q == TILES-1) && (cols_counter_q == TILES-1) + & write_started_q & wready_i ; busy = write_started_q &~ finished ; start = ~busy & ~fifo_empty ; finished_id = id_q ; @@ -99,6 +105,8 @@ module quadrilatero_perm_unit #( write_started_d = write_started_q ; finished_d = finished_q ; finished_instr_id_d = finished_instr_id_q; + cols_counter_d = cols_counter_q; + row_counter_d = row_counter_q; if (start) begin operand_reg_d = operand_reg_new; @@ -106,9 +114,24 @@ module quadrilatero_perm_unit #( end if ((write_started_q && wready_i)) begin - counter_d = counter_q + 1; + if(cols_counter_q == TILES-1) begin + cols_counter_d = '0; + if(counter_q == $clog2(N_ROWS)'(N_ROWS - 1)) begin + counter_d = '0; + if(row_counter_q == TILES-1) begin + row_counter_d = '0; + end else begin + row_counter_d = row_counter_q + 1; + end + end else begin + counter_d = counter_q + 1; + end + end else begin + cols_counter_d = cols_counter_q + 1; + end end else if (finished) begin counter_d = '0; + end if (start) begin @@ -134,6 +157,8 @@ module quadrilatero_perm_unit #( id_q <= '0; write_started_q <= '0; counter_q <= '0; + row_counter_q <= '0; + cols_counter_q <= '0; end else begin finished_q <= finished_d ; finished_instr_id_q <= finished_instr_id_d; @@ -141,11 +166,16 @@ module quadrilatero_perm_unit #( id_q <= id_d ; write_started_q <= write_started_d ; counter_q <= counter_d ; + row_counter_q <= row_counter_d ; + cols_counter_q <= cols_counter_d ; end end - assign waddr_o = operand_reg_q ; //TODO: fix + assign waddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_q ; //TODO: fix + if(quadrilatero_pkg::TILE_ADDR != 0) begin + assign waddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_q, cols_counter_q}; + end assign wrowaddr_o = counter_q ; 
assign wdata_o = '0 ; assign we_o = write_started_q &~ mask_req; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv index 83c06e904..4dafb1a5d 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_regfile.sv @@ -9,7 +9,7 @@ module quadrilatero_regfile #( parameter WRITE_PORTS = 2, // number of write ports parameter N_REGS = 8, // how many registers parameter RLEN = 128, // length in bits for each register row - localparam N_ROWS = RLEN / 32 // this is done in the thead spec + localparam N_ROWS = quadrilatero_pkg::LEN / 32 // this is done in the thead spec ) ( // clock and reset input logic clk_i, diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index bb3e32027..a466737b4 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -129,14 +129,14 @@ module quadrilatero_register_lsu #( assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; always_comb begin lsu_id_o = (write_i &~ load_fifo_data_available & rlast_o) ? instr_id_i : back_id_q; - finished = (write_q & terminate & rlast_o) | (~write_q & (counter_q == LastRow) & wready_i && wlast_o); + finished = (write_q & terminate & rlast_o) | (~write_q && (counter_q == LastRow) && (row_counter_q == NumCols-1) && (cols_counter_q == NumCols-1) && wready_i && wlast_o); end always_comb begin: write_to_RF data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols we_o = load_fifo_data_available &~ mask_req; - waddr_o = lsu_state_q == LSU_IDLE? 
waddr_d : waddr_q; + waddr_o = waddr_q; wrowaddr_o = counter_q ; wdata_o = load_fifo_data & ~data_mask; @@ -147,7 +147,7 @@ module quadrilatero_register_lsu #( rrowaddr_o = counter_q ; raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; if(quadrilatero_pkg::TILE_ADDR != 0) begin - raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_q, cols_counter_q}; + raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_q, cols_counter_q}; //TODO: maybe change to _d end end @@ -159,7 +159,7 @@ module quadrilatero_register_lsu #( start = (start_i | start_q) & lsu_ready; busy_o = (write_i ? busy_d : busy) | start_q; - stride = (start) ? (stride_i / NumCols) : stride_q; + stride = (start) ? stride_i : stride_q; src_ptr = (start) ? address_i : src_ptr_q; end @@ -170,32 +170,35 @@ module quadrilatero_register_lsu #( start_d = start ? 1'b0 : (start_q | start_i) ? 1'b1 : start_q; - stride_d = (start) ? (stride_i / NumCols) : stride_q ; + stride_d = (start) ? stride_i : stride_q ; src_ptr_d = (start) ? address_i : src_ptr_q; - busy_d = (write_i && (counter_q == LastRow) && rdata_valid_i && rlast_o) ? 1'b0 : + busy_d = (write_i && (counter_q == LastRow) && (row_counter_q == NumCols-1) && (cols_counter_q == NumCols-1) && rdata_valid_i && rlast_o) ? 1'b0 : (write_i && start_i) ? 1'b1 : busy_q; end + + if(quadrilatero_pkg::TILE_ADDR != 0) begin + assign waddr_d[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_d, cols_counter_d}; //TODO: not sure about _d + end + always_comb begin: fsm_block lsu_state_d = lsu_state_q; counter_d = counter_q; cols_counter_d = cols_counter_q; row_counter_d = row_counter_q; - rlast_o = cols_counter_q == NumCols - 1 && rdata_ready_o? 1'b1 : 1'b0; - wlast_o = cols_counter_q == NumCols - 1 && we_o? 1'b1 : 1'b0; + rlast_o = rdata_ready_o? 1'b1 : 1'b0; + wlast_o = we_o? 
1'b1 : 1'b0; back_id_d = back_id_q; waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = waddr_q[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR]; - if(quadrilatero_pkg::TILE_ADDR != 0) begin - waddr_d[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_d, cols_counter_d}; //TODO: not sure about _d - end + case (lsu_state_q) LSU_IDLE: begin back_id_d = instr_id_i; waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; row_counter_d = '0; - //cols_counter_d = '0; + cols_counter_d = '0; if(load_fifo_valid && !write_i && wready_i) begin if(cols_counter_q == NumCols - 1) begin counter_d = counter_q + 1; @@ -224,9 +227,10 @@ module quadrilatero_register_lsu #( cols_counter_d = '0; counter_d = '0; if(row_counter_q == NumCols - 1) begin + row_counter_d = '0; lsu_state_d = LSU_DONE; back_id_d = instr_id_i; - waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; //TODO: change to only update the MSBs + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; end else begin row_counter_d = row_counter_q + 1; end @@ -250,7 +254,7 @@ module quadrilatero_register_lsu #( lsu_state_d = LSU_DONE; cols_counter_d = '0; back_id_d = instr_id_i; - waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; //TODO: change to only update the MSBs + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; end else begin cols_counter_d = cols_counter_q + 1; end @@ -267,7 +271,7 @@ module quadrilatero_register_lsu #( if(row_counter_q == NumCols-1) begin lsu_state_d = LSU_DONE; back_id_d = instr_id_i; - waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; //TODO: change to only update the MSBs + waddr_d[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; end else begin row_counter_d = row_counter_q + 1; end @@ -293,6 +297,7 @@ 
module quadrilatero_register_lsu #( if(load_fifo_valid && !write_i && wready_i) begin if(cols_counter_q == NumCols - 1) begin cols_counter_d = '0; + row_counter_d = '0; counter_d = counter_q + 1; lsu_state_d = LSU_LOAD; end else begin @@ -303,6 +308,7 @@ module quadrilatero_register_lsu #( counter_d = counter_q + 1; lsu_state_d = LSU_STORE; cols_counter_d = '0; + row_counter_d = '0; end else begin cols_counter_d = cols_counter_q + 1; end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index 696fafa33..f2a9cee93 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -100,8 +100,10 @@ module quadrilatero_systolic_array #( logic [$clog2(K)-1:0] ff_k_counter_d; logic [$clog2(K)-1:0] ff_k_counter_q; + logic [$clog2(K)-1:0] ff_k_counter_rev; logic [$clog2(K)-1:0] dr_k_counter_d; logic [$clog2(K)-1:0] dr_k_counter_q; + logic [$clog2(K)-1:0] dr_k_counter_rev; logic [$clog2(K)-1:0] ff_it_counter_d; logic [$clog2(K)-1:0] ff_it_counter_q; logic [$clog2(K)-1:0] dr_it_counter_d; @@ -149,6 +151,7 @@ module quadrilatero_systolic_array #( logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_d; logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_q; logic mask_req ; + logic ready; quadrilatero_pkg::sa_ctrl_t [MESH_WIDTH-1:0] sa_ctrl_mesh_skewed; @@ -158,16 +161,17 @@ module quadrilatero_systolic_array #( logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] res_mesh_skewed ; //--------------------------------------------------------------------- - + assign ff_k_counter_rev = (K-1-ff_k_counter_q); + assign dr_k_counter_rev = (K-1-dr_k_counter_q); always_comb begin: rf_block // Weight Read Register Port weight_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = weight_reg_q ; if(quadrilatero_pkg::TILE_ADDR != 0) begin - weight_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_it_counter_q, 
(K-1-ff_k_counter_q)}; + weight_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_it_counter_q, ff_k_counter_rev}; end weight_rrowaddr_o = ff_counter_q; weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_k_counter_q == (K-1) && ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? + weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? // Data Read Register Port data_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = data_reg_q ; @@ -176,12 +180,12 @@ module quadrilatero_systolic_array #( end data_rrowaddr_o = ff_counter_q; data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - data_rlast_o = ff_state_q != FF_IDLE && (ff_it_counter_q == (K-1) && ff_k_counter_q == (K-1)) ; + data_rlast_o = ff_state_q != FF_IDLE && (ff_k_counter_q == (K-1)) ; // Accumulator Read Register Port acc_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = acc_reg_q ; if(quadrilatero_pkg::TILE_ADDR != 0) begin - acc_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_row_counter_q, (K-1-ff_k_counter_q)}; + acc_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_row_counter_q, ff_k_counter_rev}; end acc_rrowaddr_o = ff_counter_q; acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; @@ -190,25 +194,25 @@ module quadrilatero_systolic_array #( // Accumulator Out Write Register Port res_waddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = dest_reg_q ; if(quadrilatero_pkg::TILE_ADDR != 0) begin - res_waddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {dr_row_counter_q, (K-1-dr_k_counter_q)}; + res_waddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {dr_row_counter_q, dr_k_counter_rev}; end res_wrowaddr_o = dr_counter_q; res_we_o = (dr_state_q == DR_ACTIVE || last_dr_write == 1'b1) &~ mask_req; - res_wlast_o = (dr_state_q != 
DR_IDLE) && (dr_it_counter_q == (K-1) && dr_k_counter_q == (K-1)); + res_wlast_o = (dr_state_q != DR_IDLE) && (dr_it_counter_q == (K-1)); end always_comb begin: finished_signal - finished_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? 1'b1 : + finished_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1) && (dr_k_counter_q == K-1)) ? 1'b1 : (finished_ack_i ) ? 1'b0 : finished_q; - finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? id_dr_q : + finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1) && (dr_k_counter_q == K-1)) ? id_dr_q : (finished_ack_i ) ? '0 : finished_instr_id_q; end always_comb begin: ctrl_block valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - if((ff_state_q == FF_IDLE) && (fs_state_q == FS_IDLE) && (dr_state_q == DR_IDLE)) begin + if((ff_state_q == FF_IDLE || (ff_state_q == FF_ACTIVE && ff_counter_q == '0 && ff_counter_d == '0)) && (fs_state_q == FS_IDLE) && (dr_state_q == DR_IDLE)) begin //TODO: check this condition (ff_state_q == FF_IDLE || (ff_state_q == FF_ACTIVE && valid == 1'b0)) clear = 1'b1; end else begin clear = 1'b0; @@ -311,7 +315,7 @@ module quadrilatero_systolic_array #( unique case(fs_state_q) FS_IDLE: begin fs_counter_d = '0; - if(ff_state_q == FF_DONE) begin + if(ff_state_q == FF_DONE && valid == 1'b1) begin fs_state_d = FS_ACTIVE; acc_fs_d = acc_reg_q; @@ -582,8 +586,8 @@ module quadrilatero_systolic_array #( dr_row_counter_q <= dr_row_counter_d ; end end - - assign sa_ready_o = ff_state_q != FF_ACTIVE ; // && fs_state_q == FS_IDLE? + assign ready = (ff_state_q == FF_DONE) && (ff_k_counter_q == K-1) && (ff_it_counter_q == K-1) && (ff_row_counter_q == RegLastRow-1); + assign sa_ready_o = ready || (ff_state_q == FF_IDLE && fs_state_q == FS_IDLE); // && fs_state_q == FS_IDLE? 
assign sa_input_id_o = id_ff_q ; assign sa_output_id_o = id_dr_q ; assign finished_o = finished_q ; diff --git a/sw/applications/quadrilatero_matmul_16x16/main.c b/sw/applications/quadrilatero_matmul_16x16/main.c index 990d7c63f..93a1876e5 100644 --- a/sw/applications/quadrilatero_matmul_16x16/main.c +++ b/sw/applications/quadrilatero_matmul_16x16/main.c @@ -472,7 +472,7 @@ void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* ad asm volatile("loopN_start16x16: " ); // while(n0 Date: Fri, 2 May 2025 23:09:21 +0200 Subject: [PATCH 17/18] 8x8 working --- hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv | 2 +- hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv | 2 +- hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv | 2 +- sw/applications/quadrilatero_matmul_16x16/main.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv index 7bc6bca96..6a1c2d062 100644 --- a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv +++ b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv @@ -8,7 +8,7 @@ package quadrilatero_pkg; parameter int unsigned N_REGS = 8; parameter int unsigned DATA_WIDTH = 32; parameter int unsigned BUS_WIDTH = 128; - parameter int unsigned MESH_WIDTH = 4; //change register size + parameter int unsigned MESH_WIDTH = 8; //change register size parameter int unsigned SA_MESH_WIDTH = 4; parameter int unsigned NUM_EXEC_UNITS = 3; // change me to add units parameter int unsigned MAX_NUM_READ_OPERANDS = 3; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv index d179eea36..3a8d39b36 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv @@ -133,7 +133,7 @@ module quadrilatero_lsu #( always_comb begin : ctrl_block terminate = (|rows_q == 
'0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING)); - load_fifo_valid_o = rd_valid_d; + load_fifo_valid_o = rd_valid_d | rd_valid_q; busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; terminate_o = terminate; end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index f2a9cee93..b3c0fa325 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -167,7 +167,7 @@ module quadrilatero_systolic_array #( // Weight Read Register Port weight_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = weight_reg_q ; if(quadrilatero_pkg::TILE_ADDR != 0) begin - weight_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_it_counter_q, ff_k_counter_rev}; + weight_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_k_counter_rev, ff_it_counter_q}; //TODO: check if this is correct, we transpose B matrix (it looks correct compared to 4x4 RF) end weight_rrowaddr_o = ff_counter_q; weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; diff --git a/sw/applications/quadrilatero_matmul_16x16/main.c b/sw/applications/quadrilatero_matmul_16x16/main.c index 93a1876e5..9d6cd39f2 100644 --- a/sw/applications/quadrilatero_matmul_16x16/main.c +++ b/sw/applications/quadrilatero_matmul_16x16/main.c @@ -472,7 +472,7 @@ void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* addrA,DATA_IN_t* ad asm volatile("loopN_start16x16: " ); // while(n0 Date: Sat, 3 May 2025 12:04:31 +0200 Subject: [PATCH 18/18] Extending register file dimension --- .../rtl/include/quadrilatero_pkg.sv | 4 +- .../quadrilatero/rtl/quadrilatero_lsu.sv | 2 +- .../quadrilatero/rtl/quadrilatero_lsu_old.sv | 310 --------- .../rtl/quadrilatero_perm_unit.sv | 2 +- .../rtl/quadrilatero_register_lsu.sv | 19 +- ...register_lsu_adapted_to_8x8_not_working.sv | 434 ------------ .../rtl/quadrilatero_rf_sequencer.sv | 
2 +- .../rtl/quadrilatero_systolic_array.sv | 16 +- .../rtl/quadrilatero_systolic_array_old.sv | 636 ------------------ ...quadrilatero_systolic_array_wrong_order.sv | 636 ------------------ .../quadrilatero_matmul_16x16/main.c | 97 +-- 11 files changed, 21 insertions(+), 2137 deletions(-) delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_adapted_to_8x8_not_working.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv delete mode 100644 hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_wrong_order.sv diff --git a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv index 6a1c2d062..7414e84e9 100644 --- a/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv +++ b/hw/ip_examples/quadrilatero/rtl/include/quadrilatero_pkg.sv @@ -8,8 +8,8 @@ package quadrilatero_pkg; parameter int unsigned N_REGS = 8; parameter int unsigned DATA_WIDTH = 32; parameter int unsigned BUS_WIDTH = 128; - parameter int unsigned MESH_WIDTH = 8; //change register size - parameter int unsigned SA_MESH_WIDTH = 4; + parameter int unsigned MESH_WIDTH = 8; // change register dimension + parameter int unsigned SA_MESH_WIDTH = 4; // change systolic array dimension parameter int unsigned NUM_EXEC_UNITS = 3; // change me to add units parameter int unsigned MAX_NUM_READ_OPERANDS = 3; parameter int unsigned MAX_NUM_WRITE_OPERANDS = 1; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv index 3a8d39b36..3b0163568 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu.sv @@ -104,7 +104,7 @@ module quadrilatero_lsu #( logic store_fifo_pop ; logic [$clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] row_counter_d; logic [$clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] 
row_counter_q; - logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1:0] col_counter_q; //TODO: check cols counter when TILE_ADDR == 0 + logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1:0] col_counter_q; logic [$clog2(quadrilatero_pkg::TILE_ADDR)-1:0] col_counter_d; enum { diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv deleted file mode 100644 index 579a10a56..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_lsu_old.sv +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. -// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -module quadrilatero_lsu #( - parameter int unsigned FIFO_DEPTH = 4, - parameter int unsigned DATA_WIDTH = 32 - -) ( - input logic clk_i , - input logic rst_ni , - - // Bus interface - output logic data_req_o , - output logic [ 31:0] data_addr_o , - output logic data_we_o , - output logic [DATA_WIDTH/8 - 1:0] data_be_o , - output logic [ DATA_WIDTH-1:0] data_wdata_o , - input logic data_gnt_i , - input logic data_rvalid_i , - input logic [ DATA_WIDTH-1:0] data_rdata_i , - - // Configuration - input logic start_i , // start transfer (MUST BE A PULSE!!!!!) 
- input logic write_i , // write transaction - output logic busy_o , // lsu available - output logic terminate_o , // lsu done - - // Address - input logic [ 31:0] src_ptr_i , // base address - input logic [ 31:0] stride_i , // stride to move in memory from one row to the next one - input logic [ 31:0] rows_i , // how many rows we need to fetch - input logic [ 31:0] cols_i , - - // Output data - output logic [ DATA_WIDTH-1:0] load_fifo_output_o , - output logic load_fifo_valid_o , - output logic load_fifo_data_available_o , - input logic load_fifo_output_pop_i , - - // Input data - input logic [ DATA_WIDTH-1:0] store_fifo_input_i , - input logic store_fifo_push_i , - output logic store_fifo_space_available_o, - output logic store_fifo_empty_o - - -); - - localparam int unsigned DEPTH = (FIFO_DEPTH > 0) ? FIFO_DEPTH - 1 : 0; - localparam int unsigned Addr_Fifo_Depth = (FIFO_DEPTH > 1) ? $clog2(FIFO_DEPTH) : 1; - localparam int unsigned LastFifoUsage = DEPTH - 1; - - - logic terminate ; - - logic [ 31:0] rows_q ; - logic [ 31:0] rows_d ; - logic [ 31:0] cols_q ; - logic [ 31:0] cols_d ; - logic [ 31:0] src_ptr_inc ; - logic [ 31:0] addr ; - logic [ 31:0] addr_op2 ; - logic [ 31:0] ptr_q ; - logic [ 31:0] ptr_d ; - - logic data_in_req ; - logic data_in_we ; - logic [ DATA_WIDTH/8-1:0] data_in_be ; - logic [ 31:0] data_in_addr ; - logic data_in_rvalid ; - logic [ DATA_WIDTH-1:0] data_in_rdata ; - - logic [ DATA_WIDTH-1:0] load_fifo_input ; - logic [ DATA_WIDTH-1:0] load_fifo_data_out; - logic rd_valid_q ; - logic rd_valid_d ; - logic [ DATA_WIDTH-1:0] rd_head_q ; - logic [ DATA_WIDTH-1:0] rd_head_d ; - logic data_we_q ; - logic data_we_d ; - logic rvalid ; - logic load_fifo_pop ; - logic load_fifo_push ; - logic [Addr_Fifo_Depth-1:0] load_fifo_usage ; - logic load_fifo_alm_full; - logic load_fifo_full ; - logic load_fifo_empty ; - - logic data_out_req ; - logic data_out_we ; - logic [ DATA_WIDTH/8-1:0] data_out_be ; - logic [ 31:0] data_out_addr ; - logic data_out_gnt 
; - logic [ DATA_WIDTH-1:0] data_out_wdata ; - - logic store_fifo_full ; - logic store_fifo_empty ; - logic [ DATA_WIDTH-1:0] store_fifo_output ; - logic store_fifo_pop ; - - - enum { - LSU_READY, - LSU_RUNNING - } - lsu_state_q, lsu_state_d; - - - always_comb begin : FSM_block - lsu_state_d = lsu_state_q; - - case (lsu_state_q) - LSU_READY: begin - if (start_i & |cols_i & |rows_i) begin - lsu_state_d = LSU_RUNNING; - end - end - LSU_RUNNING: begin - if (terminate && !start_i) begin - lsu_state_d = LSU_READY; - end - end - endcase - end - - always_comb begin : ctrl_block - terminate = (|rows_q == '0 && |cols_q == '0 && data_gnt_i && data_req_o && (lsu_state_q == LSU_RUNNING)); - load_fifo_valid_o = rd_valid_d; - busy_o = (lsu_state_q == LSU_RUNNING) & ~terminate; - terminate_o = terminate; - end - - always_comb begin : addr_block - src_ptr_inc = DATA_WIDTH / 8; - addr_op2 = (cols_q == '0) ? stride_i : src_ptr_inc; - addr = (start_i || ((rows_q == rows_i - 1) && (cols_q == cols_i - 1))) ? src_ptr_i : ptr_q + addr_op2; - ptr_d = (data_gnt_i && data_req_o) ? 
addr : ptr_q; - end - - always_comb begin : counters_block - rows_d = rows_q; - cols_d = cols_q; - - if(start_i) begin - if(data_gnt_i && data_req_o) begin - if(cols_i > 1) begin - rows_d = rows_i - 1; - cols_d = cols_i - 2; - end else if (rows_i > 1) begin - rows_d = rows_i - 2; - cols_d = cols_i - 1; - end - end else begin - rows_d = rows_i - 1; - cols_d = cols_i - 1; - end - end else if (data_gnt_i && data_req_o) begin - if (cols_q > 0) cols_d = cols_q - 1; - else if (rows_q > 0) begin - cols_d = cols_i - 1; - rows_d = rows_q - 1; - end - end - end - - always_comb begin : read_obi - data_in_req = '0; - data_in_we = '0; - data_in_be = '0; - data_in_addr = '0; - - if (load_fifo_full == 1'b0 && load_fifo_alm_full == 1'b0) begin - data_in_req = ~write_i & (start_i | lsu_state_q == LSU_RUNNING); - data_in_we = 1'b0 ; - data_in_be = '1 ; - data_in_addr = addr ; - end - end - - always_comb begin : write_obi - data_out_req = '0 ; - data_out_we = '0 ; - data_out_be = '0 ; - data_out_addr = '0 ; - data_out_wdata = store_fifo_output; - - if (!store_fifo_empty) begin - data_out_req = start_i | lsu_state_q == LSU_RUNNING; - // data_out_we = 1'b1 ; - data_out_we = start_i | lsu_state_q == LSU_RUNNING; - data_out_be = '1 ; - data_out_addr = addr ; - end - end - - always_comb begin : obi_channel_signals - data_in_rvalid = 1'b0 ; - data_wdata_o = data_out_wdata; - data_out_gnt = data_gnt_i ; - data_in_rdata = data_rdata_i ; - - if(store_fifo_empty) begin // read transaction active - data_req_o = data_in_req ; - data_we_o = data_in_we ; - data_be_o = data_in_be ; - data_addr_o = data_in_addr ; - data_in_rvalid = data_rvalid_i ; - end else begin // write transaction active - data_req_o = data_out_req ; - data_we_o = data_out_we ; - data_be_o = data_out_be ; - data_addr_o = data_out_addr ; - end - end - - always_comb begin : load_fifo_block - data_we_d = data_gnt_i && data_req_o && data_we_o; - rvalid = data_in_rvalid &~ data_we_q ; - - load_fifo_alm_full = (load_fifo_usage == 
LastFifoUsage[Addr_Fifo_Depth-1:0]); - load_fifo_input = data_in_rdata; - load_fifo_push = (rvalid & rd_valid_q & ~load_fifo_output_pop_i) | (rvalid & ~load_fifo_empty); - load_fifo_pop = load_fifo_output_pop_i & ~load_fifo_empty; - - rd_valid_d = (rvalid & ~rd_valid_q) ? 1'b1 : - (load_fifo_output_pop_i & - load_fifo_empty & ~rvalid) ? 1'b0 : rd_valid_q; - - rd_head_d = (load_fifo_output_pop_i & load_fifo_empty & rvalid) || - (rvalid & ~rd_valid_q) ? load_fifo_input : - (load_fifo_output_pop_i & ~load_fifo_empty) ? load_fifo_data_out : rd_head_q; - - load_fifo_output_o = rd_head_q ; - load_fifo_data_available_o = rd_valid_q; - end - - always_comb begin : store_fifo_block - store_fifo_pop = data_out_gnt & data_out_req; - store_fifo_empty_o = store_fifo_empty; - store_fifo_space_available_o = ~store_fifo_full; - end - - fifo_v3 #( - .FALL_THROUGH (1'b0 ), - .DEPTH (DEPTH ), - .DATA_WIDTH (DATA_WIDTH ) - ) load_lsu_fifo_i ( - .clk_i , - .rst_ni , - .flush_i (1'b0 ), - .testmode_i (1'b0 ), - - // status flags - .full_o (load_fifo_full ), - .empty_o (load_fifo_empty ), - .usage_o (load_fifo_usage ), - - // as long as the queue is not full we can push new data - .data_i (load_fifo_input ), - .push_i (load_fifo_push ), - - // as long as the queue is not empty we can pop new elements - .data_o (load_fifo_data_out ), - .pop_i (load_fifo_pop ) - ); - - fifo_v3 #( - .DEPTH(FIFO_DEPTH), - .DATA_WIDTH(DATA_WIDTH) - ) store_lsu_fifo_i ( - .clk_i , - .rst_ni , - .flush_i (1'b0 ), - .testmode_i (1'b0 ), - // status flags - .full_o (store_fifo_full ), - .empty_o (store_fifo_empty ), - .usage_o ( ), - // as long as the queue is not full we can push new data - .data_i (store_fifo_input_i ), - .push_i (store_fifo_push_i ), - // as long as the queue is not empty we can pop new elements - .data_o (store_fifo_output ), - .pop_i (store_fifo_pop ) - ); - - always_ff @(posedge clk_i, negedge rst_ni) begin : seq_block - if (~rst_ni) begin - lsu_state_q <= LSU_READY; - ptr_q <= '0 ; - rows_q 
<= '0 ; - cols_q <= '0 ; - rd_head_q <= '0 ; - rd_valid_q <= '0 ; - data_we_q <= '0 ; - end else begin - lsu_state_q <= lsu_state_d; - ptr_q <= ptr_d ; - rows_q <= rows_d ; - cols_q <= cols_d ; - rd_head_q <= rd_head_d ; - rd_valid_q <= rd_valid_d ; - data_we_q <= data_we_d ; - end - end - -endmodule \ No newline at end of file diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv index 97017a957..5444ab137 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_perm_unit.sv @@ -172,7 +172,7 @@ module quadrilatero_perm_unit #( end - assign waddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_q ; //TODO: fix + assign waddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_q; if(quadrilatero_pkg::TILE_ADDR != 0) begin assign waddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_q, cols_counter_q}; end diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv index a466737b4..d6b3064c1 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu.sv @@ -83,7 +83,7 @@ module quadrilatero_register_lsu #( logic [$clog2(N_ROWS)-1:0] counter_q; logic [$clog2(N_ROWS)-1:0] counter_d; - logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_q; //TODO: change these + logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_q; logic [$clog2(quadrilatero_pkg::N_IREGS)-1:0] waddr_d; @@ -129,7 +129,8 @@ module quadrilatero_register_lsu #( assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; always_comb begin lsu_id_o = (write_i &~ load_fifo_data_available & rlast_o) ? 
instr_id_i : back_id_q; - finished = (write_q & terminate & rlast_o) | (~write_q && (counter_q == LastRow) && (row_counter_q == NumCols-1) && (cols_counter_q == NumCols-1) && wready_i && wlast_o); + finished = (write_q & terminate & rlast_o) | + (~write_q && (counter_q == LastRow) && (row_counter_q == NumCols-1) && (cols_counter_q == NumCols-1) && wready_i && wlast_o); end @@ -147,7 +148,7 @@ module quadrilatero_register_lsu #( rrowaddr_o = counter_q ; raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = operand_reg_i; if(quadrilatero_pkg::TILE_ADDR != 0) begin - raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_q, cols_counter_q}; //TODO: maybe change to _d + raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_q, cols_counter_q}; end end @@ -178,7 +179,7 @@ module quadrilatero_register_lsu #( end if(quadrilatero_pkg::TILE_ADDR != 0) begin - assign waddr_d[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_d, cols_counter_d}; //TODO: not sure about _d + assign waddr_d[quadrilatero_pkg::TILE_ADDR-1:0] = {row_counter_d, cols_counter_d}; end always_comb begin: fsm_block @@ -277,8 +278,7 @@ module quadrilatero_register_lsu #( end end else begin cols_counter_d = cols_counter_q + 1; - end - + end end else begin if(cols_counter_q == NumCols - 1) begin cols_counter_d = '0; @@ -287,11 +287,7 @@ module quadrilatero_register_lsu #( cols_counter_d = cols_counter_q + 1; end end - end else begin - // counter_d = '0; - // back_id_d = instr_id_i; - // lsu_state_d = LSU_DONE; - end + end end LSU_DONE: begin if(load_fifo_valid && !write_i && wready_i) begin @@ -379,7 +375,6 @@ module quadrilatero_register_lsu #( .write_i (write_i), .busy_o (busy ), .terminate_o (terminate ), - //.cols_counter_match_i (cols_counter_d == cols_counter_q), // Address .src_ptr_i (src_ptr ), diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_adapted_to_8x8_not_working.sv 
b/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_adapted_to_8x8_not_working.sv deleted file mode 100644 index 7fda92966..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_register_lsu_adapted_to_8x8_not_working.sv +++ /dev/null @@ -1,434 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. -// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -/* -NOTE: for now we assume we fetch the entire row in 1 cycle. TODO: Change the number of columns and adapt this to arbitrary BUS_WIDTH parameters -NOTE: we are not handling difference in endianness when loading reduced datawidths -*/ - -module quadrilatero_register_lsu #( - parameter int unsigned BUS_WIDTH = 128, - parameter int unsigned N_REGS = 8, - parameter int unsigned N_ROWS = 4, - localparam int unsigned LLEN = BUS_WIDTH -) ( - input logic clk_i , - input logic rst_ni , - - // Bus interface - output logic data_req_o , - output logic [ 31:0] data_addr_o , - output logic data_we_o , - output logic [ BUS_WIDTH/8 - 1:0] data_be_o , - output logic [ BUS_WIDTH-1:0] data_wdata_o , - input logic data_gnt_i , - input logic data_rvalid_i , - input logic [ BUS_WIDTH-1:0] data_rdata_i , - - output logic[xif_pkg::X_ID_WIDTH-1:0] lsu_id_o , - - // Register Write Port for load unit - output logic [ $clog2(N_REGS)-1:0] waddr_o , - output logic [ $clog2(N_ROWS)-1:0] wrowaddr_o , - output logic [quadrilatero_pkg::RLEN-1:0] wdata_o , - output logic we_o , - output logic wlast_o , - input logic wready_i , // to stall the request in case the port is busy - - // Register Read Port for store unit - output logic [ $clog2(N_REGS)-1:0] raddr_o , - output logic [ $clog2(N_ROWS)-1:0] rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] rdata_i , - input logic rdata_valid_i , - output logic rdata_ready_o , - output logic rlast_o , - - // Configuration Signals - input logic start_i , // start loading: MUST BE A PULSE - input logic 
write_i , - output logic busy_o , - input logic [ 31:0] stride_i , // stride value - input logic [ 31:0] address_i , // address value - input logic [ $clog2(N_REGS)-1:0] operand_reg_i , // destination register - input logic [xif_pkg::X_ID_WIDTH-1:0] instr_id_i , // instruction id - input logic [ 31:0] n_bytes_cols_i , // we always fetch the entire row and then only take the elements we need - input logic [ 31:0] n_rows_i , - - - output logic finished_o , - input logic finished_ack_i , - output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o //instruction id out - -); - - localparam MAX_EL_PER_ROW = LLEN / BUS_WIDTH; - localparam LastRow = $clog2(N_ROWS)'(N_ROWS - 1); - localparam NumAccesses = quadrilatero_pkg::RLEN / LLEN; - - typedef enum logic [1:0] { - LSU_IDLE, - LSU_LOAD, - LSU_STORE, - LSU_DONE - } register_lsu_state_e; - - register_lsu_state_e lsu_state_d, lsu_state_q; - - logic finished; - logic [xif_pkg::X_ID_WIDTH-1:0] back_id_q; - logic [xif_pkg::X_ID_WIDTH-1:0] back_id_d; - - logic [$clog2(N_ROWS)-1:0] counter_q; - logic [$clog2(N_ROWS)-1:0] counter_d; - logic [$clog2(N_REGS)-1:0] waddr_q; - logic [$clog2(N_REGS)-1:0] waddr_d; - logic [$clog2(N_REGS)-1:0] raddr_q; - logic [$clog2(N_REGS)-1:0] raddr_d; - - - logic [LLEN-1:0] load_fifo_data; - - logic load_fifo_data_available; - logic load_fifo_pop; - - logic store_fifo_space_available; - logic store_fifo_push; - logic store_fifo_empty; - logic [LLEN-1:0] store_fifo_data; - - logic [LLEN-1:0] data_mask; - logic load_fifo_valid; - logic busy; - logic start; - logic start_q; - logic start_d; - - logic write_q; - logic write_d; - logic terminate; - logic busy_q; - logic busy_d; - - logic lsu_busy_q; - logic lsu_ready; - - logic [ 31:0] src_ptr_d ; - logic [ 31:0] stride_d ; - logic [ 31:0] src_ptr_q ; - logic [ 31:0] stride_q ; - logic [ 31:0] src_ptr ; - logic [ 31:0] stride ; - - logic [$clog2(NumAccesses)-1:0] access_counter_d; - logic [$clog2(NumAccesses)-1:0] access_counter_q; - - logic 
[(quadrilatero_pkg::RLEN-LLEN)-1:0] load_row_buffer_d; - logic [(quadrilatero_pkg::RLEN-LLEN)-1:0] load_row_buffer_q; - - logic [quadrilatero_pkg::RLEN-1:0] store_mask; - logic [quadrilatero_pkg::RLEN-1:0] load_mask; - - assign mask_req = (counter_q == LastRow) & finished_o & ~finished_ack_i; - always_comb begin - lsu_id_o = (write_i &~ load_fifo_data_available & rlast_o) ? instr_id_i : back_id_q; - finished = (write_q & terminate & rlast_o) | (~write_q & (counter_q == LastRow) & wready_i && wlast_o); - end - - - always_comb begin: write_to_RF - data_mask = '1 << (8 * n_bytes_cols_i); // SPEC says to load zeros outside of rows and cols - load_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); - we_o = load_fifo_data_available &~ mask_req; // && ((access_counter_q == NumAccesses -1) || (lsu_state_q == LSU_LOAD && !load_fifo_valid)); //last part is sketchy - waddr_o = lsu_state_q == LSU_IDLE? waddr_d : waddr_q; - wrowaddr_o = counter_q ; - load_row_buffer_d = (load_row_buffer_q & ~load_mask) | (load_fifo_data << (LLEN * access_counter_q)); - wdata_o = {load_fifo_data, load_row_buffer_q} & ~data_mask; //watch out with load_row_buffer_d instead of load_row_buffer_q - - end - - always_comb begin: read_from_RF - store_mask = ({{(quadrilatero_pkg::RLEN - LLEN){1'b0}}, {LLEN{1'b1}}}) << (LLEN * access_counter_q); - rdata_ready_o = write_i & store_fifo_space_available &~ load_fifo_data_available &~ mask_req; - rrowaddr_o = counter_q ; - raddr_o = operand_reg_i;//lsu_state_q == LSU_IDLE? raddr_d : raddr_q; - end - - always_comb begin: lsu_ctrl_block - load_fifo_pop = wready_i; - store_fifo_data = (rdata_i & store_mask) >> (LLEN * access_counter_q); - store_fifo_push = rdata_ready_o && rdata_valid_i; - lsu_ready = store_fifo_empty | (write_i &~ load_fifo_data_available &~ lsu_busy_q); - start = (start_i | start_q) & lsu_ready; - busy_o = (write_i ? busy_d : busy) | start_q; - - stride = (start) ? 
(stride_i / NumAccesses) : stride_q; - src_ptr = (start) ? address_i : src_ptr_q; - end - - always_comb begin: next_value - write_d = (write_i && (counter_q == LastRow) && rdata_valid_i) ? 1'b1 : - (!write_i && !busy) ? 1'b0 : write_q; - - start_d = start ? 1'b0 : - (start_q | start_i) ? 1'b1 : start_q; - - stride_d = (start) ? (stride_i / NumAccesses) : stride_q ; - src_ptr_d = (start) ? address_i : src_ptr_q; - - busy_d = (write_i && (counter_q == LastRow) && rdata_valid_i && rlast_o) ? 1'b0 : - (write_i && start_i) ? 1'b1 : busy_q; - end - always_comb begin: fsm_block - lsu_state_d = lsu_state_q; - counter_d = counter_q; - access_counter_d = access_counter_q; - rlast_o = 1'b0; - wlast_o = 1'b0; - - back_id_d = back_id_q; - waddr_d = waddr_q; - raddr_d = raddr_q; - - case (lsu_state_q) - LSU_IDLE: begin - back_id_d = instr_id_i; - waddr_d = operand_reg_i; - raddr_d = operand_reg_i; - //access_counter_d = '0; - if(load_fifo_valid && !write_i && wready_i) begin - if(access_counter_q == NumAccesses - 1) begin - counter_d = counter_q + 1; - lsu_state_d = LSU_LOAD; - wlast_o = 1'b1; - end else begin - access_counter_d = access_counter_q + 1; - lsu_state_d = LSU_LOAD; - end - end else if (write_i & store_fifo_space_available && rdata_valid_i) begin - if(access_counter_q == NumAccesses - 1) begin - counter_d = counter_q + 1; - lsu_state_d = LSU_STORE; - rlast_o = 1'b1; - end else begin - access_counter_d = access_counter_q + 1; - lsu_state_d = LSU_STORE; - end - end - - end - LSU_LOAD: begin - if(load_fifo_valid) begin - if(wready_i) begin - if(counter_q == LastRow) begin - if(access_counter_q == NumAccesses - 1) begin - wlast_o = 1'b1; - access_counter_d = '0; - counter_d = '0; - lsu_state_d = LSU_DONE; - back_id_d = instr_id_i; - waddr_d = operand_reg_i; - raddr_d = operand_reg_i; - end else begin - access_counter_d = access_counter_q + 1; - end - end else begin - if(access_counter_q == NumAccesses - 1) begin - wlast_o = 1'b1; - access_counter_d = '0; - counter_d = 
counter_q + 1; - end else begin - access_counter_d = access_counter_q + 1; - end - end - end - - end else begin - if(write_i && wready_i) begin - if(access_counter_q == NumAccesses - 1) begin - counter_d = '0; - wlast_o = 1'b1; - lsu_state_d = LSU_DONE; - access_counter_d = '0; - back_id_d = instr_id_i; - waddr_d = operand_reg_i; - raddr_d = operand_reg_i; - - end else begin - access_counter_d = access_counter_q + 1; - end - end - - end - end - LSU_STORE: begin - if(store_fifo_space_available && write_i && rdata_valid_i) begin - if(counter_q == LastRow) begin - if(access_counter_q == NumAccesses - 1) begin - rlast_o = 1'b1; - access_counter_d = '0; - counter_d = '0; - lsu_state_d = LSU_DONE; - back_id_d = instr_id_i; - waddr_d = operand_reg_i; - raddr_d = operand_reg_i; - end else begin - access_counter_d = access_counter_q + 1; - end - - end else begin - if(access_counter_q == NumAccesses - 1) begin - rlast_o = 1'b1; - access_counter_d = '0; - counter_d = counter_q + 1; - end else begin - access_counter_d = access_counter_q + 1; - end - end - end else begin - // counter_d = '0; - // back_id_d = instr_id_i; - // lsu_state_d = LSU_DONE; - end - end - LSU_DONE: begin - if(load_fifo_valid && !write_i && wready_i) begin - if(access_counter_q == NumAccesses - 1) begin - access_counter_d = '0; - counter_d = counter_q + 1; - wlast_o = 1'b1; - lsu_state_d = LSU_LOAD; - end else begin - access_counter_d = access_counter_q + 1; - end - end else if (write_i && store_fifo_space_available && rdata_valid_i) begin - if(access_counter_q == NumAccesses - 1) begin - counter_d = counter_q + 1; - rlast_o = 1'b1; - lsu_state_d = LSU_STORE; - access_counter_d = '0; - end else begin - access_counter_d = access_counter_q + 1; - end - end else begin - lsu_state_d = LSU_IDLE; - end - end - default: begin - lsu_state_d = LSU_IDLE; - end - endcase - - end - - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - counter_q <= '0; - waddr_q <= '0; - raddr_q <= 
'0; - back_id_q <= '0; - start_q <= '0; - write_q <= '0; - busy_q <= '0; - lsu_state_q <= LSU_IDLE; - - lsu_busy_q <= '0; - src_ptr_q <= '0; - stride_q <= '0; - access_counter_q <= '0; - load_row_buffer_q <= '0; - end else begin - counter_q <= counter_d; - back_id_q <= back_id_d; - waddr_q <= waddr_d ; - raddr_q <= raddr_d ; - start_q <= start_d ; - write_q <= write_d ; - busy_q <= busy_d ; - lsu_state_q <= lsu_state_d; - - lsu_busy_q <= busy; - src_ptr_q <= src_ptr_d; - stride_q <= stride_d ; - access_counter_q <= access_counter_d; - load_row_buffer_q <= load_row_buffer_d; - end - end - - quadrilatero_lsu #( - .FIFO_DEPTH (4 ), - .DATA_WIDTH (BUS_WIDTH) - ) lsunit_inst ( - - .clk_i , - .rst_ni , - - // Bus interface - .data_req_o , - .data_addr_o , - .data_we_o , - .data_be_o , - .data_wdata_o , - .data_gnt_i , - .data_rvalid_i , - .data_rdata_i , - - //Configuration - .start_i (start ), - .write_i (write_i), - .busy_o (busy ), - .terminate_o (terminate ), - .last_i (wlast_o | rlast_o), - .access_counter_match_i (access_counter_d == access_counter_q), - - // Address - .src_ptr_i (src_ptr ), - .stride_i (stride ), - .cols_i (MAX_EL_PER_ROW ), - .rows_i (n_rows_i ), - - // Output data - .load_fifo_output_o (load_fifo_data ), - .load_fifo_valid_o (load_fifo_valid ), - .load_fifo_data_available_o (load_fifo_data_available ), - .load_fifo_output_pop_i (load_fifo_pop ), - - // Input data - .store_fifo_input_i (store_fifo_data ), - .store_fifo_push_i (store_fifo_push ), - .store_fifo_space_available_o (store_fifo_space_available ), - .store_fifo_empty_o (store_fifo_empty ) - ); - - //------------------------- - - always_ff @(posedge clk_i or negedge rst_ni) begin - if (!rst_ni) begin - finished_o <= '0; - finished_instr_id_o <= '0; - end else begin - if (finished) begin - finished_o <= '1; - finished_instr_id_o <= back_id_q; - end - if (finished_ack_i) begin - finished_o <= '0; - finished_instr_id_o <= '0; - end - end - end - //--------------------- - - // Assertions - 
if (N_ROWS < 2) begin - $error( - "[quadrilatero_register_lsu] N_ROWS must be at least 2.\n" - ); - end - if ((NumAccesses & (NumAccesses - 1)) != 0) begin - $error("RLEN / LLEN must be a power of 2."); - end -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv index 50004c070..ce935b0fe 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_rf_sequencer.sv @@ -129,7 +129,7 @@ module quadrilatero_rf_sequencer #( rw_queue_full_o = '0; for (int i = 0; i < quadrilatero_pkg::N_IREGS; i++) begin for (int h = 0; h < N_ROWS; h++) begin - rw_queue_full_o[i>>quadrilatero_pkg::TILE_ADDR] |= (rw_queue_full[i][h]); //TODO: change this + rw_queue_full_o[i>>quadrilatero_pkg::TILE_ADDR] |= (rw_queue_full[i][h]); head_valid[i][h] = scoreboard_q[i][h].valid; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv index b3c0fa325..055dd5e1c 100644 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv +++ b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array.sv @@ -167,11 +167,11 @@ module quadrilatero_systolic_array #( // Weight Read Register Port weight_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = weight_reg_q ; if(quadrilatero_pkg::TILE_ADDR != 0) begin - weight_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_k_counter_rev, ff_it_counter_q}; //TODO: check if this is correct, we transpose B matrix (it looks correct compared to 4x4 RF) + weight_raddr_o[quadrilatero_pkg::TILE_ADDR-1:0] = {ff_k_counter_rev, ff_it_counter_q}; end weight_rrowaddr_o = ff_counter_q; weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? 
+ weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_row_counter_q == (RegLastRow-1)) ; // Data Read Register Port data_raddr_o[$clog2(quadrilatero_pkg::N_IREGS)-1:quadrilatero_pkg::TILE_ADDR] = data_reg_q ; @@ -212,7 +212,7 @@ module quadrilatero_systolic_array #( always_comb begin: ctrl_block valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - if((ff_state_q == FF_IDLE || (ff_state_q == FF_ACTIVE && ff_counter_q == '0 && ff_counter_d == '0)) && (fs_state_q == FS_IDLE) && (dr_state_q == DR_IDLE)) begin //TODO: check this condition (ff_state_q == FF_IDLE || (ff_state_q == FF_ACTIVE && valid == 1'b0)) + if((ff_state_q == FF_IDLE || (ff_state_q == FF_ACTIVE && ff_counter_q == '0 && ff_counter_d == '0)) && (fs_state_q == FS_IDLE) && (dr_state_q == DR_IDLE)) begin clear = 1'b1; end else begin clear = 1'b0; @@ -270,7 +270,7 @@ module quadrilatero_systolic_array #( ff_valid = 1'b1; ff_counter_d = '0; ff_state_d = FF_ACTIVE; - if(ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1) && start_i == 1'b1) begin // get inputs from new instruction + if(ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1) && start_i == 1'b1) begin ff_it_counter_d = '0; ff_row_counter_d = '0; ff_k_counter_d = '0; @@ -376,7 +376,7 @@ module quadrilatero_systolic_array #( dr_k_counter_d = '0; dr_it_counter_d = '0; dr_row_counter_d = '0; - if(fs_state_q == FS_LAST) begin //fs_counter_d == LastRow ) && (fs_counter_q == LastRow - 1 + if(fs_state_q == FS_LAST) begin dr_state_d = DR_ACTIVE; dest_reg_d = acc_fs_q; id_dr_d = id_fs_q; @@ -412,7 +412,7 @@ module quadrilatero_systolic_array #( dr_row_counter_d = dr_row_counter_q + 1; end end - if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs (fs_counter_d == LastRow - 1 ) && (fs_counter_q == LastRow - 2) + if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs dr_state_d = DR_ACTIVE; dest_reg_d = acc_fs_q; id_dr_d = 
id_fs_q; @@ -431,7 +431,7 @@ module quadrilatero_systolic_array #( if((fs_state_q == FS_IDLE && dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow-1) && dr_k_counter_q == (K-1))) begin last_dr_write = 1'b1; if(res_wready_i == 1'b0) begin - dr_state_d = DR_IDLE; //TODO: check if this is correct + dr_state_d = DR_IDLE; end else begin dr_state_d = DR_DONE; if(dr_counter_q == LastRow) begin @@ -587,7 +587,7 @@ module quadrilatero_systolic_array #( end end assign ready = (ff_state_q == FF_DONE) && (ff_k_counter_q == K-1) && (ff_it_counter_q == K-1) && (ff_row_counter_q == RegLastRow-1); - assign sa_ready_o = ready || (ff_state_q == FF_IDLE && fs_state_q == FS_IDLE); // && fs_state_q == FS_IDLE? + assign sa_ready_o = ready || (ff_state_q == FF_IDLE && fs_state_q == FS_IDLE); assign sa_input_id_o = id_ff_q ; assign sa_output_id_o = id_dr_q ; assign finished_o = finished_q ; diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv deleted file mode 100644 index c51eb8b6d..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_old.sv +++ /dev/null @@ -1,636 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -/* - -TODO: -- handle matrices operations with matrices < MESH_WIDTH based on the configuration CSRs - - basically you need to inject zeros instead of actual elements -*/ - -module quadrilatero_systolic_array #( - parameter int MESH_WIDTH = 4 , - parameter int DATA_WIDTH = 32 , - parameter int N_REGS = 8 , - parameter int ENABLE_SIMD = 1 , - localparam int N_ROWS = MESH_WIDTH , - localparam int ALEN = DATA_WIDTH * MESH_WIDTH, - parameter FPU = 1 -) ( - input logic clk_i , - input logic rst_ni , - - output logic sa_ready_o , - input logic start_i , - - // Only has effect if ENABLE_SIMD == 1 - input quadrilatero_pkg::sa_ctrl_t sa_ctrl_i , - - input logic [ $clog2(N_REGS)-1:0] data_reg_i , // data register - input logic [ $clog2(N_REGS)-1:0] acc_reg_i , // accumulator register - input logic [ $clog2(N_REGS)-1:0] weight_reg_i , // weight register - input logic [xif_pkg::X_ID_WIDTH-1:0] id_i , // id of the instruction - - // Weight Read Register Port - output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , - output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] weight_rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_i , - input logic weight_rdata_valid_i, - output logic weight_rdata_ready_o, - output logic weight_rlast_o , - - // Data Read Register Port - output logic [ $clog2(N_REGS)-1:0] data_raddr_o , - output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] data_rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] data_rdata_i , - input logic data_rdata_valid_i , - output logic data_rdata_ready_o , - output logic data_rlast_o , - - // Accumulator Read Register Port - output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , - output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] acc_rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_i , - input logic acc_rdata_valid_i , - output logic acc_rdata_ready_o , - output logic acc_rlast_o , - - // 
Accumulator Out Write Register Port - output logic [ $clog2(N_REGS)-1:0] res_waddr_o , - output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] res_wrowaddr_o , - output logic [quadrilatero_pkg::RLEN-1:0] res_wdata_o , - output logic res_we_o , - output logic res_wlast_o , - input logic res_wready_i , - - // RF Instruction ID - output logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id_o , - output logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id_o , - - // Finish - output logic finished_o , - input logic finished_ack_i , - output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o -); - typedef enum logic [1:0]{ - FS_IDLE, - FS_ACTIVE, - FS_LAST - } fs_state_e; - typedef enum logic [1:0]{ - FF_IDLE, - FF_ACTIVE, - FF_DONE - } ff_state_e; - typedef enum logic [1:0]{ - DR_IDLE, - DR_ACTIVE, - DR_DONE - } dr_state_e; - - ff_state_e ff_state_d, ff_state_q; - fs_state_e fs_state_d, fs_state_q; - dr_state_e dr_state_d, dr_state_q; - localparam LastRow = $clog2(MESH_WIDTH)'(MESH_WIDTH-1); - localparam RegLastRow = quadrilatero_pkg::RLEN/ ALEN; - localparam K = quadrilatero_pkg::RLEN / ALEN; - - logic [$clog2(K)-1:0] ff_k_counter_d; - logic [$clog2(K)-1:0] ff_k_counter_q; - logic [$clog2(K)-1:0] dr_k_counter_d; - logic [$clog2(K)-1:0] dr_k_counter_q; - logic [$clog2(K)-1:0] ff_it_counter_d; - logic [$clog2(K)-1:0] ff_it_counter_q; - logic [$clog2(K)-1:0] dr_it_counter_d; - logic [$clog2(K)-1:0] dr_it_counter_q; - logic [$clog2(K)-1:0] ff_row_counter_d; - logic [$clog2(K)-1:0] ff_row_counter_q; - logic [$clog2(K)-1:0] dr_row_counter_d; - logic [$clog2(K)-1:0] dr_row_counter_q; - logic [$clog2(quadrilatero_pkg::RLEN/DATA_WIDTH)-1:0] weight_base_row; - logic last_dr_write; - - // Data Masks - logic [quadrilatero_pkg::RLEN-1:0] data_mask; - logic [quadrilatero_pkg::RLEN-1:0] weight_mask; - logic [quadrilatero_pkg::RLEN-1:0] acc_mask; - //logic [quadrilatero_pkg::RLEN-1:0] res_mask; - - logic [ALEN-1:0] data_rdata_masked; - logic [quadrilatero_pkg::RLEN-1:0] data_rdata_shifted; - logic 
[ALEN-1:0] weight_rdata_masked; - logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_shifted; - logic [ALEN-1:0] acc_rdata_masked; - logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_shifted; - logic [ALEN-1:0] res_wdata_partial; - logic [quadrilatero_pkg::RLEN-1:0] res_rdata_shifted; - logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0] res_wdata_buffer_d; - logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0]res_wdata_buffer_q; - - - logic valid ; - logic ff_valid; - logic clear ; - logic pump ; - logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; - logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_q ; - logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_q ; - - logic [ $clog2(N_REGS)-1:0] data_reg_d ; // Data register - logic [ $clog2(N_REGS)-1:0] data_reg_q ; // Data register - logic [ $clog2(N_REGS)-1:0] acc_reg_d ; // Accumulator register -- FF Stage - logic [ $clog2(N_REGS)-1:0] acc_reg_q ; // Accumulator register -- FF Stage - logic [ $clog2(N_REGS)-1:0] weight_reg_q ; // Weight register - logic [ $clog2(N_REGS)-1:0] weight_reg_d ; // Weight register - quadrilatero_pkg::sa_ctrl_t sa_ctrl_d ; - quadrilatero_pkg::sa_ctrl_t sa_ctrl_q ; - - logic [ $clog2(N_REGS)-1:0] acc_fs_q ; // Accumulator register -- FS Stage - logic [ $clog2(N_REGS)-1:0] acc_fs_d ; // Accumulator register -- FS Stage - logic [ $clog2(N_REGS)-1:0] dest_reg_q ; // Accumulator register -- DR Stage - logic [ $clog2(N_REGS)-1:0] dest_reg_d ; // Accumulator register -- DR Stage - - logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_q ; - - logic finished_d ; - logic finished_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_d; - logic [xif_pkg::X_ID_WIDTH-1:0] 
finished_instr_id_q; - logic mask_req ; - - quadrilatero_pkg::sa_ctrl_t [MESH_WIDTH-1:0] sa_ctrl_mesh_skewed; - - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] data_mesh_skewed ; - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] acc_mesh_skewed ; - logic [MESH_WIDTH-1:0][MESH_WIDTH-1:0][DATA_WIDTH-1:0] weight_mesh_skewed ; - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] res_mesh_skewed ; - - //--------------------------------------------------------------------- - - always_comb begin: rf_block - // Weight Read Register Port - weight_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); - weight_base_row = N_ROWS * ff_it_counter_q; - weight_raddr_o = weight_reg_q ; - weight_rrowaddr_o = ff_counter_q + weight_base_row; - weight_rdata_shifted = (weight_rdata_i << ALEN * ff_k_counter_q); - weight_rdata_masked = weight_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN]; - weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_k_counter_q == (K-1) && ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? 
- - // Data Read Register Port - data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} << (ALEN * ff_it_counter_q); - data_raddr_o = data_reg_q ; - data_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - data_rdata_shifted = (data_rdata_i >> ALEN * ff_it_counter_q); - data_rdata_masked = data_rdata_shifted[ALEN-1:0] ; - data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - data_rlast_o = ff_state_q != FF_IDLE && (ff_it_counter_q == (K-1) && ff_k_counter_q == (K-1)) ; - - // Accumulator Read Register Port - acc_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); - acc_raddr_o = acc_reg_q ; - acc_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - acc_rdata_shifted = (acc_rdata_i << ALEN * ff_k_counter_q); - res_rdata_shifted = (res_wdata_o << ALEN * ff_k_counter_q); - acc_rdata_masked = sa_input_id_o == sa_output_id_o && acc_rrowaddr_o == res_wrowaddr_o? - res_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN] : acc_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN]; //TODO fix - acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - acc_rlast_o = '0 ; - - // Accumulator Out Write Register Port - res_waddr_o = dest_reg_q ; - res_wrowaddr_o = dr_counter_q + (dr_row_counter_q * N_ROWS) ; - res_wdata_o = {res_wdata_buffer_q[dr_counter_q], res_wdata_partial}; // TODO: fix this, probably need a bigger buffer to make life easier. 
- res_we_o = (dr_state_q == DR_ACTIVE || last_dr_write == 1'b1) &~ mask_req; - res_wlast_o = (dr_state_q != DR_IDLE) && (dr_it_counter_q == (K-1) && dr_k_counter_q == (K-1)); - end - - always_comb begin : weight_buffer_block - res_wdata_buffer_d = res_wdata_buffer_q; - if(dr_state_q != DR_IDLE) begin - if(dr_k_counter_q == K-1) begin - res_wdata_buffer_d[dr_counter_q] = '0; //resetting the buffer - end else begin - res_wdata_buffer_d[dr_counter_q] = res_wdata_partial; - end - end - end - - always_comb begin: finished_signal - - finished_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? 1'b1 : - (finished_ack_i ) ? 1'b0 : finished_q; - - finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? id_dr_q : - (finished_ack_i ) ? '0 : finished_instr_id_q; - end - - always_comb begin: ctrl_block - valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q == DR_IDLE)) begin - clear = 1'b1; - end else begin - clear = 1'b0; - end - if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q != DR_IDLE)) begin - pump = 1'b1; - end else begin - pump = 1'b0; - end - mask_req = (dr_counter_q==LastRow) & finished_q & ~finished_ack_i; - end - - always_comb begin : ff_fsm_block - ff_counter_d = ff_counter_q; - ff_state_d = ff_state_q; - data_reg_d = data_reg_q; - acc_reg_d = acc_reg_q; - weight_reg_d = weight_reg_q; - sa_ctrl_d = sa_ctrl_q; - id_ff_d = id_ff_q; - ff_k_counter_d = ff_k_counter_q; - ff_it_counter_d = ff_it_counter_q; - ff_row_counter_d = ff_row_counter_q; - ff_valid = 1'b0; - - unique case (ff_state_q) - FF_IDLE: begin - ff_counter_d = '0; - ff_it_counter_d = '0; - ff_row_counter_d = '0; - ff_k_counter_d = '0; - if(start_i == 1'b1) begin - ff_state_d = FF_ACTIVE; - data_reg_d = data_reg_i; - acc_reg_d = acc_reg_i; - weight_reg_d = 
weight_reg_i; - sa_ctrl_d = sa_ctrl_i; - id_ff_d = id_i; - end - end - FF_ACTIVE: begin - if(valid == 1'b1) begin - ff_valid = 1'b1; - if(ff_counter_q==(LastRow-1)) begin - ff_counter_d = ff_counter_q + 1; - ff_state_d = FF_DONE; - end else begin - ff_counter_d = ff_counter_q + 1; - end - end - end - - FF_DONE: begin - if(start_i == 1'b1 | ~(data_rlast_o == 1'b1 && weight_rlast_o == 1'b1 && ff_it_counter_q == (K-1))) begin - if(valid == 1'b1) begin - ff_valid = 1'b1; - ff_counter_d = '0; - ff_state_d = FF_ACTIVE; - if(ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1) && start_i == 1'b1) begin // get inputs from new instruction - ff_it_counter_d = '0; - ff_row_counter_d = '0; - ff_k_counter_d = '0; - data_reg_d = data_reg_i; - acc_reg_d = acc_reg_i; - weight_reg_d = weight_reg_i; - sa_ctrl_d = sa_ctrl_i; - id_ff_d = id_i; - end else begin - if(ff_row_counter_q == RegLastRow-1) begin - ff_row_counter_d = '0; - if(ff_k_counter_q == (K-1)) begin - ff_k_counter_d = '0; - ff_it_counter_d = ff_it_counter_q + 1; - end else begin - ff_k_counter_d = ff_k_counter_q + 1; - end - end else begin - ff_row_counter_d = ff_row_counter_q + 1; - end - end - end - - end else begin - ff_counter_d = '0; - ff_state_d = FF_IDLE; - end - end - - default: begin - ff_state_d = FF_IDLE; - end - endcase - end - always_comb begin : fs_fsm_block - fs_counter_d = fs_counter_q; - fs_state_d = fs_state_q; - - acc_fs_d = acc_fs_q; - id_fs_d = id_fs_q; - - unique case(fs_state_q) - FS_IDLE: begin - fs_counter_d = '0; - if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin - fs_state_d = FS_ACTIVE; - - acc_fs_d = acc_reg_q; - id_fs_d = id_ff_q; - end - - end - FS_ACTIVE: begin - if(clear == 1'b1) begin - fs_counter_d = '0; - fs_state_d = FS_IDLE; - end else begin - if(fs_counter_q == LastRow-2) begin - fs_counter_d = fs_counter_q + 1; - fs_state_d = FS_LAST; - end else begin - fs_counter_d = fs_counter_q + 1; - end - end - end - FS_LAST: begin - fs_counter_d = 
'0; - if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin //stay in active mode, load new inputs - fs_state_d = FS_ACTIVE; - - acc_fs_d = acc_reg_q; - id_fs_d = id_ff_q; - end - if(ff_state_q == FF_IDLE) begin - fs_state_d = FS_IDLE; - end else begin - acc_fs_d = acc_reg_q; - id_fs_d = id_ff_q; - fs_state_d = FS_IDLE; - end - - end - default: begin - fs_state_d = FS_IDLE; - end - - endcase - end - - always_comb begin : dr_fsm_block - dr_state_d = dr_state_q; - dr_counter_d = dr_counter_q; - dr_k_counter_d = dr_k_counter_q; - dr_it_counter_d = dr_it_counter_q; - dr_row_counter_d = dr_row_counter_q; - last_dr_write = 1'b0; - - dest_reg_d = dest_reg_q; - id_dr_d = id_dr_q; - unique case(dr_state_q) - DR_IDLE: begin - dr_counter_d = '0; - dr_k_counter_d = '0; - dr_it_counter_d = '0; - dr_row_counter_d = '0; - if(fs_state_q == FS_LAST) begin //fs_counter_d == LastRow ) && (fs_counter_q == LastRow - 1 - dr_state_d = DR_ACTIVE; - dest_reg_d = acc_fs_q; - id_dr_d = id_fs_q; - end - - end - DR_ACTIVE: begin - if(clear == 1'b1) begin - dr_counter_d = '0; - dr_state_d = DR_IDLE; - end else begin - if(res_wready_i == 1'b0) begin - dr_state_d = dr_state_q; - end else begin - if(dr_counter_q == LastRow) begin - - dr_counter_d = '0; - //update DR counters - if(dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow - 1) && dr_k_counter_q == (K-1)) begin - dr_it_counter_d = '0; - dr_row_counter_d = '0; - dr_k_counter_d = '0; - end else begin - if(dr_row_counter_q == RegLastRow-1) begin - dr_row_counter_d = '0; - if(dr_k_counter_q == (K-1)) begin - dr_k_counter_d = '0; - dr_it_counter_d = dr_it_counter_q + 1; - end else begin - dr_k_counter_d = dr_k_counter_q + 1; - end - end else begin - dr_row_counter_d = dr_row_counter_q + 1; - end - end - if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs (fs_counter_d == LastRow - 1 ) && (fs_counter_q == LastRow - 2) - dr_state_d = DR_ACTIVE; - dest_reg_d = acc_fs_q; - id_dr_d = id_fs_q; - end - if(fs_state_q == 
FS_IDLE) begin - dr_state_d = DR_DONE; - end - end else begin - dr_counter_d = dr_counter_q + 1; - end - end - end - - end - DR_DONE: begin - if((fs_state_q == FS_IDLE && dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow-1) && dr_k_counter_q == (K-1))) begin - last_dr_write = 1'b1; - if(res_wready_i == 1'b0) begin - dr_state_d = dr_state_q; //stall - end else begin - dr_state_d = DR_DONE; - if(dr_counter_q == LastRow) begin - dest_reg_d = acc_fs_q; - id_dr_d = id_fs_q; - dr_it_counter_d = '0; - dr_row_counter_d = '0; - dr_k_counter_d = '0; - dr_state_d = DR_IDLE; - end else begin - dr_counter_d = dr_counter_q + 1; - end - end - end else begin - dr_state_d = DR_IDLE; - end - end - default: begin - dr_state_d = DR_IDLE; - end - - endcase - - - end - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) skewer_inst_data ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (data_rdata_masked ), - .data_o (data_mesh_skewed) - ); - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) skewer_inst_acc ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (acc_rdata_masked ), - .data_o (acc_mesh_skewed) - ); - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(4) - ) skewer_inst_ctrl ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i ({MESH_WIDTH{sa_ctrl_q}}), - .data_o (sa_ctrl_mesh_skewed ) - ); - - quadrilatero_wl_stage #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) weight_inst ( - .clk_i , - .rst_ni , - - .ff_counter (ff_counter_q ), - .clear_i (clear ), - .pump_i (pump ), - .weight_rdata_valid_i , - - // Weight Data - .weight_rdata_i (weight_rdata_masked ), - .weight_rdata_o (weight_mesh_skewed ) - ); - - quadrilatero_mesh #( - .MESH_WIDTH (MESH_WIDTH ), - .ENABLE_SIMD(ENABLE_SIMD), - .FPU (FPU ) - ) mesh_inst ( - .clk_i, - .rst_ni, - - .pump_i (pump ), - .sa_ctrl_i (sa_ctrl_mesh_skewed ), - - .data_i (data_mesh_skewed ), - .acc_i (acc_mesh_skewed ), - .weight_i 
(weight_mesh_skewed ), - .acc_o (res_mesh_skewed ) - ); - - quadrilatero_deskewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) deskewer_inst_acc ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (res_mesh_skewed), - .data_o (res_wdata_partial ) - ); - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - ff_counter_q <= '0; - fs_counter_q <= '0; - dr_counter_q <= '0; - ff_state_q <= FF_IDLE; - fs_state_q <= FS_IDLE; - dr_state_q <= DR_IDLE; - data_reg_q <= '0; - acc_reg_q <= '0; - weight_reg_q <= '0; - sa_ctrl_q <= '0; - acc_fs_q <= '0; - dest_reg_q <= '0; - id_ff_q <= '0; - id_fs_q <= '0; - id_dr_q <= '0; - finished_q <= '0; - finished_instr_id_q <= '0; - ff_k_counter_q <= '0; - dr_k_counter_q <= '0; - ff_it_counter_q <= '0; - dr_it_counter_q <= '0; - ff_row_counter_q <= '0; - dr_row_counter_q <= '0; - res_wdata_buffer_q <= '0; - end else begin - ff_counter_q <= ff_counter_d ; - fs_counter_q <= fs_counter_d ; - dr_counter_q <= dr_counter_d ; - ff_state_q <= ff_state_d; - fs_state_q <= fs_state_d; - dr_state_q <= dr_state_d; - data_reg_q <= data_reg_d ; - acc_reg_q <= acc_reg_d ; - weight_reg_q <= weight_reg_d ; - sa_ctrl_q <= sa_ctrl_d ; - acc_fs_q <= acc_fs_d ; - dest_reg_q <= dest_reg_d ; - id_ff_q <= id_ff_d ; - id_fs_q <= id_fs_d ; - id_dr_q <= id_dr_d ; - finished_q <= finished_d ; - finished_instr_id_q <= finished_instr_id_d ; - ff_k_counter_q <= ff_k_counter_d; - dr_k_counter_q <= dr_k_counter_d; - ff_it_counter_q <= ff_it_counter_d; - dr_it_counter_q <= dr_it_counter_d; - ff_row_counter_q <= ff_row_counter_d ; - dr_row_counter_q <= dr_row_counter_d ; - res_wdata_buffer_q <= res_wdata_buffer_d ; - end - end - - assign sa_ready_o = (ff_counter_d=='0) & (((ff_state_q != FF_IDLE) &~ ff_counter_q=='0 - && (ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1))) | clear); - assign sa_input_id_o = id_ff_q ; - assign sa_output_id_o = id_dr_q ; - assign finished_o = finished_q 
; - assign finished_instr_id_o = finished_instr_id_q; - - // -------------------------------------------------------------------- - - // Assertions - if (MESH_WIDTH < 2) begin - $error( - "[systolic_array] MESH_WIDTH must be at least 2.\n" - ); - end -endmodule diff --git a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_wrong_order.sv b/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_wrong_order.sv deleted file mode 100644 index e0c43fff0..000000000 --- a/hw/ip_examples/quadrilatero/rtl/quadrilatero_systolic_array_wrong_order.sv +++ /dev/null @@ -1,636 +0,0 @@ -// Copyright 2024 EPFL -// Solderpad Hardware License, Version 2.1, see LICENSE.md for details. -// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Author: Danilo Cammarata - -/* - -TODO: -- handle matrices operations with matrices < MESH_WIDTH based on the configuration CSRs - - basically you need to inject zeros instead of actual elements -*/ - -module quadrilatero_systolic_array #( - parameter int MESH_WIDTH = 4 , - parameter int DATA_WIDTH = 32 , - parameter int N_REGS = 8 , - parameter int ENABLE_SIMD = 1 , - localparam int N_ROWS = MESH_WIDTH , - localparam int ALEN = DATA_WIDTH * MESH_WIDTH, - parameter FPU = 1 -) ( - input logic clk_i , - input logic rst_ni , - - output logic sa_ready_o , - input logic start_i , - - // Only has effect if ENABLE_SIMD == 1 - input quadrilatero_pkg::sa_ctrl_t sa_ctrl_i , - - input logic [ $clog2(N_REGS)-1:0] data_reg_i , // data register - input logic [ $clog2(N_REGS)-1:0] acc_reg_i , // accumulator register - input logic [ $clog2(N_REGS)-1:0] weight_reg_i , // weight register - input logic [xif_pkg::X_ID_WIDTH-1:0] id_i , // id of the instruction - - // Weight Read Register Port - output logic [ $clog2(N_REGS)-1:0] weight_raddr_o , - output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] weight_rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_i , - input logic weight_rdata_valid_i, - output logic 
weight_rdata_ready_o, - output logic weight_rlast_o , - - // Data Read Register Port - output logic [ $clog2(N_REGS)-1:0] data_raddr_o , - output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] data_rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] data_rdata_i , - input logic data_rdata_valid_i , - output logic data_rdata_ready_o , - output logic data_rlast_o , - - // Accumulator Read Register Port - output logic [ $clog2(N_REGS)-1:0] acc_raddr_o , - output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] acc_rrowaddr_o , - input logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_i , - input logic acc_rdata_valid_i , - output logic acc_rdata_ready_o , - output logic acc_rlast_o , - - // Accumulator Out Write Register Port - output logic [ $clog2(N_REGS)-1:0] res_waddr_o , - output logic [ $clog2(quadrilatero_pkg::MESH_WIDTH)-1:0] res_wrowaddr_o , - output logic [quadrilatero_pkg::RLEN-1:0] res_wdata_o , - output logic res_we_o , - output logic res_wlast_o , - input logic res_wready_i , - - // RF Instruction ID - output logic [xif_pkg::X_ID_WIDTH-1:0] sa_input_id_o , - output logic [xif_pkg::X_ID_WIDTH-1:0] sa_output_id_o , - - // Finish - output logic finished_o , - input logic finished_ack_i , - output logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_o -); - typedef enum logic [1:0]{ - FS_IDLE, - FS_ACTIVE, - FS_LAST - } fs_state_e; - typedef enum logic [1:0]{ - FF_IDLE, - FF_ACTIVE, - FF_DONE - } ff_state_e; - typedef enum logic [1:0]{ - DR_IDLE, - DR_ACTIVE, - DR_DONE - } dr_state_e; - - ff_state_e ff_state_d, ff_state_q; - fs_state_e fs_state_d, fs_state_q; - dr_state_e dr_state_d, dr_state_q; - localparam LastRow = $clog2(MESH_WIDTH)'(MESH_WIDTH-1); - localparam RegLastRow = quadrilatero_pkg::RLEN/ ALEN; - localparam K = quadrilatero_pkg::RLEN / ALEN; - - logic [$clog2(K)-1:0] ff_k_counter_d; - logic [$clog2(K)-1:0] ff_k_counter_q; - logic [$clog2(K)-1:0] dr_k_counter_d; - logic [$clog2(K)-1:0] dr_k_counter_q; - logic [$clog2(K)-1:0] ff_it_counter_d; - 
logic [$clog2(K)-1:0] ff_it_counter_q; - logic [$clog2(K)-1:0] dr_it_counter_d; - logic [$clog2(K)-1:0] dr_it_counter_q; - logic [$clog2(K)-1:0] ff_row_counter_d; - logic [$clog2(K)-1:0] ff_row_counter_q; - logic [$clog2(K)-1:0] dr_row_counter_d; - logic [$clog2(K)-1:0] dr_row_counter_q; - logic [$clog2(quadrilatero_pkg::RLEN/DATA_WIDTH)-1:0] weight_base_row; - logic last_dr_write; - - // Data Masks - logic [quadrilatero_pkg::RLEN-1:0] data_mask; - logic [quadrilatero_pkg::RLEN-1:0] weight_mask; - logic [quadrilatero_pkg::RLEN-1:0] acc_mask; - //logic [quadrilatero_pkg::RLEN-1:0] res_mask; - - logic [ALEN-1:0] data_rdata_masked; - logic [quadrilatero_pkg::RLEN-1:0] data_rdata_shifted; - logic [ALEN-1:0] weight_rdata_masked; - logic [quadrilatero_pkg::RLEN-1:0] weight_rdata_shifted; - logic [ALEN-1:0] acc_rdata_masked; - logic [quadrilatero_pkg::RLEN-1:0] acc_rdata_shifted; - logic [ALEN-1:0] res_wdata_partial; - logic [quadrilatero_pkg::RLEN-1:0] res_rdata_shifted; - logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0] res_wdata_buffer_d; - logic [N_ROWS-1:0][(quadrilatero_pkg::RLEN - ALEN)-1:0]res_wdata_buffer_q; - - - logic valid ; - logic ff_valid; - logic clear ; - logic pump ; - logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] ff_counter_q ; - logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] fs_counter_q ; - logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_d ; - logic [$clog2(MESH_WIDTH)-1 :0] dr_counter_q ; - - logic [ $clog2(N_REGS)-1:0] data_reg_d ; // Data register - logic [ $clog2(N_REGS)-1:0] data_reg_q ; // Data register - logic [ $clog2(N_REGS)-1:0] acc_reg_d ; // Accumulator register -- FF Stage - logic [ $clog2(N_REGS)-1:0] acc_reg_q ; // Accumulator register -- FF Stage - logic [ $clog2(N_REGS)-1:0] weight_reg_q ; // Weight register - logic [ $clog2(N_REGS)-1:0] weight_reg_d ; // Weight register - quadrilatero_pkg::sa_ctrl_t sa_ctrl_d ; - quadrilatero_pkg::sa_ctrl_t sa_ctrl_q ; - - logic [ 
$clog2(N_REGS)-1:0] acc_fs_q ; // Accumulator register -- FS Stage - logic [ $clog2(N_REGS)-1:0] acc_fs_d ; // Accumulator register -- FS Stage - logic [ $clog2(N_REGS)-1:0] dest_reg_q ; // Accumulator register -- DR Stage - logic [ $clog2(N_REGS)-1:0] dest_reg_d ; // Accumulator register -- DR Stage - - logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_ff_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_fs_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_d ; - logic [xif_pkg::X_ID_WIDTH-1:0] id_dr_q ; - - logic finished_d ; - logic finished_q ; - logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_d; - logic [xif_pkg::X_ID_WIDTH-1:0] finished_instr_id_q; - logic mask_req ; - - quadrilatero_pkg::sa_ctrl_t [MESH_WIDTH-1:0] sa_ctrl_mesh_skewed; - - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] data_mesh_skewed ; - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] acc_mesh_skewed ; - logic [MESH_WIDTH-1:0][MESH_WIDTH-1:0][DATA_WIDTH-1:0] weight_mesh_skewed ; - logic [MESH_WIDTH-1:0][DATA_WIDTH-1:0] res_mesh_skewed ; - - //--------------------------------------------------------------------- - - always_comb begin: rf_block - // Weight Read Register Port - weight_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); - weight_base_row = (quadrilatero_pkg::RLEN / DATA_WIDTH) - (N_ROWS * (ff_it_counter_q + 1)); - weight_raddr_o = weight_reg_q ; - weight_rrowaddr_o = ff_counter_q + weight_base_row; - weight_rdata_shifted = (weight_rdata_i >> ALEN * ff_k_counter_q); - weight_rdata_masked = weight_rdata_shifted[ALEN-1:0]; //TODO fix - weight_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - weight_rlast_o = (ff_state_q != FF_IDLE) && (ff_k_counter_q == (K-1) && ff_row_counter_q == (RegLastRow-1)) ; // might leave at ff_it_counter_q == (K-1) to free all regs at the same time? 
- - // Data Read Register Port - data_mask = {{(ALEN){1'b1}}, {(quadrilatero_pkg::RLEN - ALEN){1'b0}}} << (ALEN * ff_it_counter_q); // TODO fix - data_raddr_o = data_reg_q ; - data_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - data_rdata_shifted = (data_rdata_i << ALEN * ff_it_counter_q); - data_rdata_masked = data_rdata_shifted[quadrilatero_pkg::RLEN-1:quadrilatero_pkg::RLEN-ALEN] ; //TODO fix - data_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - data_rlast_o = ff_state_q != FF_IDLE && (ff_it_counter_q == (K-1) && ff_k_counter_q == (K-1)) ; - - // Accumulator Read Register Port - acc_mask = {(ALEN){1'b1}} << (ALEN * ff_k_counter_q); - acc_raddr_o = acc_reg_q ; - acc_rrowaddr_o = ff_counter_q + (ff_row_counter_q * N_ROWS) ; - acc_rdata_shifted = (acc_rdata_i >> ALEN * ff_k_counter_q); - res_rdata_shifted = (res_wdata_o >> ALEN * ff_k_counter_q); - acc_rdata_masked = sa_input_id_o == sa_output_id_o && acc_rrowaddr_o == res_wrowaddr_o? - res_rdata_shifted[ALEN-1:0] : acc_rdata_shifted[ALEN-1:0]; //TODO fix - acc_rdata_ready_o = (ff_state_q != FF_IDLE) &~ mask_req ; - acc_rlast_o = '0 ; - - // Accumulator Out Write Register Port - res_waddr_o = dest_reg_q ; - res_wrowaddr_o = dr_counter_q + (dr_row_counter_q * N_ROWS) ; - res_wdata_o = {res_wdata_partial, res_wdata_buffer_q[dr_counter_q]}; - res_we_o = (dr_state_q == DR_ACTIVE || last_dr_write == 1'b1) &~ mask_req; - res_wlast_o = (dr_state_q != DR_IDLE) && (dr_it_counter_q == (K-1) && dr_k_counter_q == (K-1)); - end - - always_comb begin : weight_buffer_block - res_wdata_buffer_d = res_wdata_buffer_q; - if(dr_state_q != DR_IDLE) begin - if(dr_k_counter_q == K-1) begin - res_wdata_buffer_d[dr_counter_q] = '0; //resetting the buffer - end else begin - res_wdata_buffer_d[dr_counter_q] = res_wdata_buffer_q[dr_counter_q] | (res_wdata_partial << ALEN*dr_k_counter_q); - end - end - end - - always_comb begin: finished_signal - - finished_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && 
(dr_row_counter_q == RegLastRow-1)) ? 1'b1 : - (finished_ack_i ) ? 1'b0 : finished_q; - - finished_instr_id_d = (res_wready_i && (dr_counter_q == LastRow) && res_wlast_o && (dr_row_counter_q == RegLastRow-1)) ? id_dr_q : - (finished_ack_i ) ? '0 : finished_instr_id_q; - end - - always_comb begin: ctrl_block - valid = weight_rdata_valid_i & data_rdata_valid_i & acc_rdata_valid_i; - if((ff_state_q == FF_IDLE) && (fs_state_q != FS_ACTIVE) && (dr_state_q == DR_IDLE)) begin - clear = 1'b1; - end else begin - clear = 1'b0; - end - if((ff_state_q != FF_IDLE && valid == 1'b1) || (fs_state_q == FS_ACTIVE) || (dr_state_q != DR_IDLE)) begin - pump = 1'b1; - end else begin - pump = 1'b0; - end - mask_req = (dr_counter_q==LastRow) & finished_q & ~finished_ack_i; - end - - always_comb begin : ff_fsm_block - ff_counter_d = ff_counter_q; - ff_state_d = ff_state_q; - data_reg_d = data_reg_q; - acc_reg_d = acc_reg_q; - weight_reg_d = weight_reg_q; - sa_ctrl_d = sa_ctrl_q; - id_ff_d = id_ff_q; - ff_k_counter_d = ff_k_counter_q; - ff_it_counter_d = ff_it_counter_q; - ff_row_counter_d = ff_row_counter_q; - ff_valid = 1'b0; - - unique case (ff_state_q) - FF_IDLE: begin - ff_counter_d = '0; - ff_it_counter_d = '0; - ff_row_counter_d = '0; - ff_k_counter_d = '0; - if(start_i == 1'b1) begin - ff_state_d = FF_ACTIVE; - data_reg_d = data_reg_i; - acc_reg_d = acc_reg_i; - weight_reg_d = weight_reg_i; - sa_ctrl_d = sa_ctrl_i; - id_ff_d = id_i; - end - end - FF_ACTIVE: begin - if(valid == 1'b1) begin - ff_valid = 1'b1; - if(ff_counter_q==(LastRow-1)) begin - ff_counter_d = ff_counter_q + 1; - ff_state_d = FF_DONE; - end else begin - ff_counter_d = ff_counter_q + 1; - end - end - end - - FF_DONE: begin - if(start_i == 1'b1 | ~(data_rlast_o == 1'b1 && weight_rlast_o == 1'b1 && ff_it_counter_q == (K-1))) begin - if(valid == 1'b1) begin - ff_valid = 1'b1; - ff_counter_d = '0; - ff_state_d = FF_ACTIVE; - if(ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1) 
&& start_i == 1'b1) begin // get inputs from new instruction - ff_it_counter_d = '0; - ff_row_counter_d = '0; - ff_k_counter_d = '0; - data_reg_d = data_reg_i; - acc_reg_d = acc_reg_i; - weight_reg_d = weight_reg_i; - sa_ctrl_d = sa_ctrl_i; - id_ff_d = id_i; - end else begin - if(ff_k_counter_q == K-1) begin - ff_k_counter_d = '0; - if(ff_row_counter_q == (RegLastRow-1)) begin - ff_row_counter_d = '0; - ff_it_counter_d = ff_it_counter_q + 1; - end else begin - ff_row_counter_d = ff_row_counter_q + 1; - end - end else begin - ff_k_counter_d = ff_k_counter_q + 1; - end - end - end - - end else begin - ff_counter_d = '0; - ff_state_d = FF_IDLE; - end - end - - default: begin - ff_state_d = FF_IDLE; - end - endcase - end - always_comb begin : fs_fsm_block - fs_counter_d = fs_counter_q; - fs_state_d = fs_state_q; - - acc_fs_d = acc_fs_q; - id_fs_d = id_fs_q; - - unique case(fs_state_q) - FS_IDLE: begin - fs_counter_d = '0; - if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin - fs_state_d = FS_ACTIVE; - - acc_fs_d = acc_reg_q; - id_fs_d = id_ff_q; - end - - end - FS_ACTIVE: begin - if(clear == 1'b1) begin - fs_counter_d = '0; - fs_state_d = FS_IDLE; - end else begin - if(fs_counter_q == LastRow-2) begin - fs_counter_d = fs_counter_q + 1; - fs_state_d = FS_LAST; - end else begin - fs_counter_d = fs_counter_q + 1; - end - end - end - FS_LAST: begin - fs_counter_d = '0; - if(ff_state_q == FF_DONE && ff_valid == 1'b1) begin //stay in active mode, load new inputs - fs_state_d = FS_ACTIVE; - - acc_fs_d = acc_reg_q; - id_fs_d = id_ff_q; - end - if(ff_state_q == FF_IDLE) begin - fs_state_d = FS_IDLE; - end else begin - acc_fs_d = acc_reg_q; - id_fs_d = id_ff_q; - fs_state_d = FS_IDLE; - end - - end - default: begin - fs_state_d = FS_IDLE; - end - - endcase - end - - always_comb begin : dr_fsm_block - dr_state_d = dr_state_q; - dr_counter_d = dr_counter_q; - dr_k_counter_d = dr_k_counter_q; - dr_it_counter_d = dr_it_counter_q; - dr_row_counter_d = dr_row_counter_q; - 
last_dr_write = 1'b0; - - dest_reg_d = dest_reg_q; - id_dr_d = id_dr_q; - unique case(dr_state_q) - DR_IDLE: begin - dr_counter_d = '0; - dr_k_counter_d = '0; - dr_it_counter_d = '0; - dr_row_counter_d = '0; - if(fs_state_q == FS_LAST) begin //fs_counter_d == LastRow ) && (fs_counter_q == LastRow - 1 - dr_state_d = DR_ACTIVE; - dest_reg_d = acc_fs_q; - id_dr_d = id_fs_q; - end - - end - DR_ACTIVE: begin - if(clear == 1'b1) begin - dr_counter_d = '0; - dr_state_d = DR_IDLE; - end else begin - if(res_wready_i == 1'b0) begin - dr_state_d = dr_state_q; - end else begin - if(dr_counter_q == LastRow) begin - - dr_counter_d = '0; - //update DR counters - if(dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow - 1) && dr_k_counter_q == (K-1)) begin - dr_it_counter_d = '0; - dr_row_counter_d = '0; - dr_k_counter_d = '0; - end else begin - if(dr_k_counter_q == K-1) begin - dr_k_counter_d = '0; - if(dr_row_counter_q == (RegLastRow-1)) begin - dr_row_counter_d = '0; - dr_it_counter_d = dr_it_counter_q + 1; - end else begin - dr_row_counter_d = dr_row_counter_q + 1; - end - end else begin - dr_k_counter_d = dr_k_counter_q + 1; - end - end - if(fs_state_q == FS_LAST) begin //stay in the active mode, load new inputs (fs_counter_d == LastRow - 1 ) && (fs_counter_q == LastRow - 2) - dr_state_d = DR_ACTIVE; - dest_reg_d = acc_fs_q; - id_dr_d = id_fs_q; - end - if(fs_state_q == FS_IDLE) begin - dr_state_d = DR_DONE; - end - end else begin - dr_counter_d = dr_counter_q + 1; - end - end - end - - end - DR_DONE: begin - if((fs_state_q == FS_IDLE && dr_it_counter_q == (K-1) && dr_row_counter_q == (RegLastRow-1) && dr_k_counter_q == (K-1))) begin - last_dr_write = 1'b1; - if(res_wready_i == 1'b0) begin - dr_state_d = dr_state_q; //stall - end else begin - dr_state_d = DR_DONE; - if(dr_counter_q == LastRow) begin - dest_reg_d = acc_fs_q; - id_dr_d = id_fs_q; - dr_it_counter_d = '0; - dr_row_counter_d = '0; - dr_k_counter_d = '0; - dr_state_d = DR_IDLE; - end else begin - 
dr_counter_d = dr_counter_q + 1; - end - end - end else begin - dr_state_d = DR_IDLE; - end - end - default: begin - dr_state_d = DR_IDLE; - end - - endcase - - - end - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) skewer_inst_data ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (data_rdata_masked ), - .data_o (data_mesh_skewed) - ); - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) skewer_inst_acc ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (acc_rdata_masked ), - .data_o (acc_mesh_skewed) - ); - - quadrilatero_skewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(4) - ) skewer_inst_ctrl ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i ({MESH_WIDTH{sa_ctrl_q}}), - .data_o (sa_ctrl_mesh_skewed ) - ); - - quadrilatero_wl_stage #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) weight_inst ( - .clk_i , - .rst_ni , - - .ff_counter (ff_counter_q ), - .clear_i (clear ), - .pump_i (pump ), - .weight_rdata_valid_i , - - // Weight Data - .weight_rdata_i (weight_rdata_masked ), - .weight_rdata_o (weight_mesh_skewed ) - ); - - quadrilatero_mesh #( - .MESH_WIDTH (MESH_WIDTH ), - .ENABLE_SIMD(ENABLE_SIMD), - .FPU (FPU ) - ) mesh_inst ( - .clk_i, - .rst_ni, - - .pump_i (pump ), - .sa_ctrl_i (sa_ctrl_mesh_skewed ), - - .data_i (data_mesh_skewed ), - .acc_i (acc_mesh_skewed ), - .weight_i (weight_mesh_skewed ), - .acc_o (res_mesh_skewed ) - ); - - quadrilatero_deskewer #( - .MESH_WIDTH(MESH_WIDTH), - .DATA_WIDTH(DATA_WIDTH) - ) deskewer_inst_acc ( - .clk_i , - .rst_ni , - .pump_i (pump ), - .data_i (res_mesh_skewed), - .data_o (res_wdata_partial ) - ); - - always_ff @(posedge clk_i or negedge rst_ni) begin: seq_block - if (!rst_ni) begin - ff_counter_q <= '0; - fs_counter_q <= '0; - dr_counter_q <= '0; - ff_state_q <= FF_IDLE; - fs_state_q <= FS_IDLE; - dr_state_q <= DR_IDLE; - data_reg_q <= '0; - acc_reg_q <= '0; - weight_reg_q <= '0; - sa_ctrl_q <= '0; - acc_fs_q <= '0; - dest_reg_q 
<= '0; - id_ff_q <= '0; - id_fs_q <= '0; - id_dr_q <= '0; - finished_q <= '0; - finished_instr_id_q <= '0; - ff_k_counter_q <= '0; - dr_k_counter_q <= '0; - ff_it_counter_q <= '0; - dr_it_counter_q <= '0; - ff_row_counter_q <= '0; - dr_row_counter_q <= '0; - res_wdata_buffer_q <= '0; - end else begin - ff_counter_q <= ff_counter_d ; - fs_counter_q <= fs_counter_d ; - dr_counter_q <= dr_counter_d ; - ff_state_q <= ff_state_d; - fs_state_q <= fs_state_d; - dr_state_q <= dr_state_d; - data_reg_q <= data_reg_d ; - acc_reg_q <= acc_reg_d ; - weight_reg_q <= weight_reg_d ; - sa_ctrl_q <= sa_ctrl_d ; - acc_fs_q <= acc_fs_d ; - dest_reg_q <= dest_reg_d ; - id_ff_q <= id_ff_d ; - id_fs_q <= id_fs_d ; - id_dr_q <= id_dr_d ; - finished_q <= finished_d ; - finished_instr_id_q <= finished_instr_id_d ; - ff_k_counter_q <= ff_k_counter_d; - dr_k_counter_q <= dr_k_counter_d; - ff_it_counter_q <= ff_it_counter_d; - dr_it_counter_q <= dr_it_counter_d; - ff_row_counter_q <= ff_row_counter_d ; - dr_row_counter_q <= dr_row_counter_d ; - res_wdata_buffer_q <= res_wdata_buffer_d ; - end - end - - assign sa_ready_o = (ff_counter_d=='0) & (((ff_state_q != FF_IDLE) &~ ff_counter_q=='0 - && (ff_it_counter_q == (K-1) && ff_row_counter_q == (RegLastRow - 1) && ff_k_counter_q == (K-1))) | clear); - assign sa_input_id_o = id_ff_q ; - assign sa_output_id_o = id_dr_q ; - assign finished_o = finished_q ; - assign finished_instr_id_o = finished_instr_id_q; - - // -------------------------------------------------------------------- - - // Assertions - if (MESH_WIDTH < 2) begin - $error( - "[systolic_array] MESH_WIDTH must be at least 2.\n" - ); - end -endmodule diff --git a/sw/applications/quadrilatero_matmul_16x16/main.c b/sw/applications/quadrilatero_matmul_16x16/main.c index 9d6cd39f2..eb6d29576 100644 --- a/sw/applications/quadrilatero_matmul_16x16/main.c +++ b/sw/applications/quadrilatero_matmul_16x16/main.c @@ -524,102 +524,7 @@ void __attribute__ ((noinline)) matrixMul_16x16(DATA_IN_t* 
addrA,DATA_IN_t* ad //-------------------------------------------------------------------------------- //-------------------------------------------------------------------------------- - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm 
volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 
0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; - asm volatile("mzero m4 " ); // m4 = 0; +