From 87f2ce6f7b4fc8c79d91838d47c488c77584c1e9 Mon Sep 17 00:00:00 2001
From: Jonathon Wang <jwang2672@wisc.edu>
Date: Mon, 14 Apr 2025 19:09:47 -0500
Subject: [PATCH] add shifting unit

---
 rtl/warp_defines.v |   7 ++-
 rtl/warp_integer.v | 104 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 105 insertions(+), 6 deletions(-)

diff --git a/rtl/warp_defines.v b/rtl/warp_defines.v
index 87a936d..0f0f72c 100644
--- a/rtl/warp_defines.v
+++ b/rtl/warp_defines.v
@@ -98,4 +98,9 @@
 `define CANONICAL_NOP 32'h00000013
 `define BUNDLE_SIZE 108
 
-`endif /* WARP_DEFINES */
+`define XSHIFT_OP_SHL 2'b00
+`define XSHIFT_OP_SHR 2'b01
+`define XSHIFT_OP_ROL 2'b10
+`define XSHIFT_OP_ROR 2'b11
+
+`endif
diff --git a/rtl/warp_integer.v b/rtl/warp_integer.v
index f30dad9..1699765 100644
--- a/rtl/warp_integer.v
+++ b/rtl/warp_integer.v
@@ -2,11 +2,6 @@
 
 // `include "warp_defines.v"
 
-`define XSHIFT_OP_SHL 2'b00
-`define XSHIFT_OP_SHR 2'b01
-`define XSHIFT_OP_ROL 2'b10
-`define XSHIFT_OP_ROR 2'b11
-
 // scalar integer arithmetic unit - add/sub, set less than, min/max, branch
 // latency: 1 cycle
 // initiation interval: 1 cycle
@@ -383,6 +378,105 @@ module warp_xshift (
     input  wire        i_word,
     output wire [63:0] o_result
 );
+    // Internal wires for each shift stage
+    wire [63:0] stage0_rol, stage1_rol, stage2_rol, stage3_rol, stage4_rol, stage5_rol;
+    wire [63:0] stage0_ror, stage1_ror, stage2_ror, stage3_ror, stage4_ror, stage5_ror;
+    wire [63:0] stage0_shl, stage1_shl, stage2_shl, stage3_shl, stage4_shl, stage5_shl;
+    wire [63:0] stage0_shr, stage1_shr, stage2_shr, stage3_shr, stage4_shr, stage5_shr;
+    
+    // 32-bit rotation wires using consistent style
+    wire [31:0] stage0_rol_32, stage1_rol_32, stage2_rol_32, stage3_rol_32, stage4_rol_32;
+    wire [31:0] stage0_ror_32, stage1_ror_32, stage2_ror_32, stage3_ror_32, stage4_ror_32;
+    wire [31:0] rotated_32_result;
+    wire [63:0] rotated_32;
+    
+    wire [63:0] operand_in;
+    wire [63:0] o_result_tmp;
+
+    // if i_word true between rotate or shift outputs for 32 bits, else select 64 bit 
+    assign o_result = (i_word) ? (
+        ((i_opsel == `XSHIFT_OP_ROL) | (i_opsel == `XSHIFT_OP_ROR)) ? rotated_32 :
+        {{32{o_result_tmp[31]}}, o_result_tmp[31:0]}
+    ) : o_result_tmp;
+
+    // sign extend for i_word before operation 
+    assign operand_in = i_word ? {{32{i_operand[31]}}, i_operand[31:0]} : i_operand;
+
+    // Barrel shifter stages for 64-bit operations
+    // Stage 0: shift or rotate by 1
+    assign stage0_shl = (i_amount[0]) ? ({operand_in[62:0], 1'b0}) : operand_in;
+    assign stage0_shr = (i_amount[0]) ? ({(i_arithmetic & operand_in[63]), operand_in[63:1]}) : operand_in;
+    assign stage0_rol = (i_amount[0]) ? ({operand_in[62:0], operand_in[63]}) : operand_in;
+    assign stage0_ror = (i_amount[0]) ? ({operand_in[0], operand_in[63:1]}) : operand_in;
+
+    // Stage 1: shift or rotate by 2
+    assign stage1_shl = (i_amount[1]) ? ({stage0_shl[61:0], 2'b0}) : stage0_shl;
+    assign stage1_shr = (i_amount[1]) ? ({{2{i_arithmetic & stage0_shr[63]}}, stage0_shr[63:2]}) : stage0_shr;
+    assign stage1_rol = (i_amount[1]) ? ({stage0_rol[61:0], stage0_rol[63:62]}) : stage0_rol;
+    assign stage1_ror = (i_amount[1]) ? ({stage0_ror[1:0], stage0_ror[63:2]}) : stage0_ror;
+
+    // Stage 2: shift or rotate by 4
+    assign stage2_shl = (i_amount[2]) ? ({stage1_shl[59:0], 4'b0}) : stage1_shl;
+    assign stage2_shr = (i_amount[2]) ? ({{4{i_arithmetic & stage1_shr[63]}}, stage1_shr[63:4]}) : stage1_shr;
+    assign stage2_rol = (i_amount[2]) ? ({stage1_rol[59:0], stage1_rol[63:60]}) : stage1_rol;
+    assign stage2_ror = (i_amount[2]) ? ({stage1_ror[3:0], stage1_ror[63:4]}) : stage1_ror;
+
+    // Stage 3: shift or rotate by 8
+    assign stage3_shl = (i_amount[3]) ? ({stage2_shl[55:0], 8'b0}) : stage2_shl;
+    assign stage3_shr = (i_amount[3]) ? ({{8{i_arithmetic & stage2_shr[63]}}, stage2_shr[63:8]}) : stage2_shr;
+    assign stage3_rol = (i_amount[3]) ? ({stage2_rol[55:0], stage2_rol[63:56]}) : stage2_rol;
+    assign stage3_ror = (i_amount[3]) ? ({stage2_ror[7:0], stage2_ror[63:8]}) : stage2_ror;
+
+    // Stage 4: shift or rotate by 16
+    assign stage4_shl = (i_amount[4]) ? ({stage3_shl[47:0], 16'b0}) : stage3_shl;
+    assign stage4_shr = (i_amount[4]) ? ({{16{i_arithmetic & stage3_shr[63]}}, stage3_shr[63:16]}) : stage3_shr;
+    assign stage4_rol = (i_amount[4]) ? ({stage3_rol[47:0], stage3_rol[63:48]}) : stage3_rol;
+    assign stage4_ror = (i_amount[4]) ? ({stage3_ror[15:0], stage3_ror[63:16]}) : stage3_ror;
+
+    // Stage 5: shift or rotate by 32
+    assign stage5_shl = (i_amount[5] & ~i_word) ? ({stage4_shl[31:0], 32'b0}) : stage4_shl;
+    assign stage5_shr = (i_amount[5] & ~i_word) ? ({{32{i_arithmetic & stage4_shr[63]}}, stage4_shr[63:32]}) : stage4_shr;
+    assign stage5_rol = (i_amount[5]) ? ({stage4_rol[31:0], stage4_rol[63:32]}) : stage4_rol;
+    assign stage5_ror = (i_amount[5]) ? ({stage4_ror[31:0], stage4_ror[63:32]}) : stage4_ror;
+
+    // 32-bit rotation stages - using separate wires for each operation
+    // Stage 0: rotate by 1 (32 bit)
+    assign stage0_rol_32 = (i_amount[0]) ? {i_operand[30:0], i_operand[31]} : i_operand[31:0];
+    assign stage0_ror_32 = (i_amount[0]) ? {i_operand[0], i_operand[31:1]} : i_operand[31:0];
+
+    // Stage 1: rotate by 2 (32 bit)
+    assign stage1_rol_32 = (i_amount[1]) ? {stage0_rol_32[29:0], stage0_rol_32[31:30]} : stage0_rol_32;
+    assign stage1_ror_32 = (i_amount[1]) ? {stage0_ror_32[1:0], stage0_ror_32[31:2]} : stage0_ror_32;
+
+    // Stage 2: rotate by 4 (32 bit)
+    assign stage2_rol_32 = (i_amount[2]) ? {stage1_rol_32[27:0], stage1_rol_32[31:28]} : stage1_rol_32;
+    assign stage2_ror_32 = (i_amount[2]) ? {stage1_ror_32[3:0], stage1_ror_32[31:4]} : stage1_ror_32;
+
+    // Stage 3: rotate by 8 (32 bit)
+    assign stage3_rol_32 = (i_amount[3]) ? {stage2_rol_32[23:0], stage2_rol_32[31:24]} : stage2_rol_32;
+    assign stage3_ror_32 = (i_amount[3]) ? {stage2_ror_32[7:0], stage2_ror_32[31:8]} : stage2_ror_32;
+
+    // Stage 4: rotate by 16 (32 bit)
+    assign stage4_rol_32 = (i_amount[4]) ? {stage3_rol_32[15:0], stage3_rol_32[31:16]} : stage3_rol_32;
+    assign stage4_ror_32 = (i_amount[4]) ? {stage3_ror_32[15:0], stage3_ror_32[31:16]} : stage3_ror_32;
+
+    // Select the appropriate 32-bit rotation result based on operation
+    assign rotated_32_result = (i_opsel == `XSHIFT_OP_ROL) ? stage4_rol_32 : stage4_ror_32;
+    
+    // Sign extend the 32-bit result to 64 bits
+    assign rotated_32 = {{32{rotated_32_result[31]}}, rotated_32_result};
+
+    // Select operation for final output using case statement
+    reg [63:0] o_result_tmp;
+    always @(*) begin
+        case (i_opsel)
+            `XSHIFT_OP_SHL: o_result_tmp = stage5_shl;
+            `XSHIFT_OP_SHR: o_result_tmp = stage5_shr;
+            `XSHIFT_OP_ROL: o_result_tmp = stage5_rol;
+            `XSHIFT_OP_ROR: o_result_tmp = stage5_ror;
+            default:        o_result_tmp = 64'hx;  // For safety
+        endcase
+    end
 endmodule
 
 // multiplies two 64 bit operands and outputs the lower 64 bits of