diff --git a/Makefile b/Makefile index 6f591b541..03d093afb 100644 --- a/Makefile +++ b/Makefile @@ -65,6 +65,19 @@ build-docker: docker pull $(img); \ fi +build-docker-python13: + docker build --build-arg VHLS_PATH=$(vhls) --build-arg VHLS_VERSION=$(vhls_version) -f Docker/Dockerfile-$(PLATFORM)-python13 --tag mase-ubuntu2204-docker-python13 Docker; \ + +shell-python13: + docker run -it --shm-size 256m \ + --hostname mase-ubuntu2204-docker-python13 \ + -w /workspace \ + -v /$(USER_PREFIX)/$(shell whoami)/.gitconfig:/root/.gitconfig \ + -v /$(USER_PREFIX)/$(shell whoami)/.ssh:/root/.ssh \ + -v /$(USER_PREFIX)/$(shell whoami)/.mase:/root/.mase:z \ + -v $(shell pwd):/workspace:z \ + $(DOCKER_RUN_EXTRA_ARGS) \ + $(img) /bin/bash shell: docker run -it --shm-size 256m \ --hostname mase-ubuntu2204 \ diff --git a/a_cx_mxint_quant/__init__.py b/a_cx_mxint_quant/__init__.py new file mode 100644 index 000000000..ac5bdc3d1 --- /dev/null +++ b/a_cx_mxint_quant/__init__.py @@ -0,0 +1,86 @@ +from .module_level_tranform import vit_module_level_quantize +from .quantizers import mxint_hardware, mxint_quant_block + +from .linear import MXIntLinear +from .attention import MXIntAttention +from .module_level_tranform import MXIntLayerNorm, MXIntGELU +from .modules import MXIntPatchEmbed, MXIntAddition +from mase_components import get_module_dependencies +VIT_CUSTOM_OPS = { + "modules": { + MXIntPatchEmbed: { + "args": { + "data_in": "data_in", + "q_config": "config", + }, + "toolchain": "INTERNAL_RTL", + "module": "mxint_patch_embed", + "dependence_files": get_module_dependencies( + "linear_layers/mxint_operators/mxint_patch_embed" + ), + }, + MXIntAttention: { + "args": { + "data_in": "data_in", + "dim": "config", + "num_heads": "config", + "qkv_bias": "config", + "qk_norm": None, + "attn_drop": None, + "proj_drop": None, + "norm_layer": None, + "q_config": "config", + }, + "toolchain": "INTERNAL_RTL", + "module": "mxint_vit_attention_wrap", + "dependence_files": 
get_module_dependencies( + "linear_layers/mxint_operators/mxint_vit_attention_wrap" + ), + }, + MXIntLayerNorm: { + "args": { + "data_in": "data_in", + "q_config": "config", + }, + "toolchain": "INTERNAL_RTL", + "module": "mxint_layernorm", + "dependence_files": get_module_dependencies( + "linear_layers/mxint_operators/mxint_layernorm" + ), + }, + MXIntGELU: { + "args": { + "data_in": "data_in", + "q_config": "config", + }, + "toolchain": "INTERNAL_RTL", + "module": "mxint_gelu", + "dependence_files": get_module_dependencies( + "linear_layers/mxint_operators/mxint_gelu" + ), + }, + MXIntLinear: { + "args": { + "data_in": "data_in", + "q_config": "config", + }, + "toolchain": "INTERNAL_RTL", + "module": "mxint_linear", + "dependence_files": get_module_dependencies( + "linear_layers/mxint_operators/mxint_linear" + ), + }, + MXIntAddition: { + "args": { + "input_0": "data_in", + "input_1": "data_in", + "q_config": "config", + }, + "toolchain": "INTERNAL_RTL", + "module": "mxint_addition", + "dependence_files": get_module_dependencies( + "linear_layers/mxint_operators/mxint_addition" + ), + }, + }, +} \ No newline at end of file diff --git a/a_cx_mxint_quant/attention.py b/a_cx_mxint_quant/attention.py new file mode 100644 index 000000000..34892c16e --- /dev/null +++ b/a_cx_mxint_quant/attention.py @@ -0,0 +1,192 @@ +from functools import partial + +import torch +import torch.nn as nn +from torch import Tensor +from torch.nn import functional as F + +from .attention_head import _ViTSelfAttentionHeadBase, ViTSelfAttentionHeadInteger + +from chop.nn.quantized.modules.linear import ( + LinearInteger, +) +from chop.nn.quantized.functional import fixed_softermax +from chop.nn.quantizers import integer_quantizer +from chop.nn.quantized.functional import matmul_integer + +from typing import Optional, Tuple, Union + +from .linear import MXIntLinear +from .attention_head import MXIntViTAttentionHead + +class _ViTAttentionBase(nn.Module): + def __init__( + self, + dim: int, + 
num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.query = nn.Linear(dim, dim, bias=qkv_bias) + self.key = nn.Linear(dim, dim, bias=qkv_bias) + self.value = nn.Linear(dim, dim, bias=qkv_bias) + self.self_attention = _ViTSelfAttentionHeadBase( + dim=self.head_dim, num_heads=num_heads, attn_drop=attn_drop + ) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + + def _tensor_reshape(x): + return x.reshape(B, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3) + + q, k, v = ( + _tensor_reshape(self.query(x)), + _tensor_reshape(self.key(x)), + _tensor_reshape(self.value(x)), + ) + x = self.self_attention(q, k, v) + x = x.transpose(1, 2).reshape(B, N, C) + + x = self.proj(x) + x = self.proj_drop(x) + return x + +class ViTAttentionInteger(_ViTAttentionBase): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + q_config: dict = None, + floor=True, + ) -> None: + super().__init__(dim, num_heads, qkv_bias, qk_norm, attn_drop, proj_drop) + self.q_config = q_config + self.query = LinearInteger( + dim, + dim, + bias=qkv_bias, + config={ + "data_in_width": q_config["data_in_width"], + "data_in_frac_width": q_config["data_in_frac_width"], + "weight_width": q_config["qkv_weight_width"], + "weight_frac_width": q_config["qkv_weight_frac_width"], + "bias_width": q_config["qkv_bias_width"], + "bias_frac_width": q_config["qkv_bias_frac_width"], + }, + out_config={ + "data_out_width": q_config["qkv_width"], + "data_out_frac_width": q_config["qkv_frac_width"], + }, + floor=floor, 
+ ) + self.key = LinearInteger( + dim, + dim, + bias=qkv_bias, + config={ + "data_in_width": q_config["data_in_width"], + "data_in_frac_width": q_config["data_in_frac_width"], + "weight_width": q_config["qkv_weight_width"], + "weight_frac_width": q_config["qkv_weight_frac_width"], + "bias_width": q_config["qkv_bias_width"], + "bias_frac_width": q_config["qkv_bias_frac_width"], + }, + out_config={ + "data_out_width": q_config["qkv_width"], + "data_out_frac_width": q_config["qkv_frac_width"], + }, + floor=floor, + ) + self.value = LinearInteger( + dim, + dim, + bias=qkv_bias, + config={ + "data_in_width": q_config["data_in_width"], + "data_in_frac_width": q_config["data_in_frac_width"], + "weight_width": q_config["qkv_weight_width"], + "weight_frac_width": q_config["qkv_weight_frac_width"], + "bias_width": q_config["qkv_bias_width"], + "bias_frac_width": q_config["qkv_bias_frac_width"], + }, + out_config={ + "data_out_width": q_config["qkv_width"], + "data_out_frac_width": q_config["qkv_frac_width"], + }, + floor=floor, + ) + self.self_attention = ViTSelfAttentionHeadInteger( + dim=self.head_dim, + num_heads=num_heads, + attn_drop=attn_drop, + q_config={ + "query_width": q_config["qkv_width"], + "query_frac_width": q_config["qkv_frac_width"], + "key_width": q_config["qkv_width"], + "key_frac_width": q_config["qkv_frac_width"], + "value_width": q_config["qkv_width"], + "value_frac_width": q_config["qkv_frac_width"], + "qkmm_out_width": q_config["qkmm_out_width"], + "qkmm_out_frac_width": q_config["qkmm_out_frac_width"], + "softmax_exp_width": q_config["softmax_exp_width"], + "softmax_exp_frac_width": q_config["softmax_exp_frac_width"], + "softmax_out_frac_width": q_config["softmax_out_frac_width"], + "svmm_out_width": q_config["svmm_out_width"], + "svmm_out_frac_width": q_config["svmm_out_frac_width"], + }, + floor=floor, + ) + self.proj = LinearInteger( + dim, + dim, + config={ + "data_in_width": q_config["svmm_out_width"], + "data_in_frac_width": 
q_config["svmm_out_frac_width"], + "weight_width": q_config["proj_weight_width"], + "weight_frac_width": q_config["proj_weight_frac_width"], + "bias_width": q_config["proj_bias_width"], + "bias_frac_width": q_config["proj_bias_frac_width"], + }, + out_config={ + "data_out_width": q_config["data_out_width"], + "data_out_frac_width": q_config["data_out_frac_width"], + }, + floor=floor, + ) + +class MXIntAttention(_ViTAttentionBase): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + q_config: dict = None, + ) -> None: + super().__init__(dim, num_heads, qkv_bias, qk_norm, attn_drop, proj_drop) + self.q_config = q_config + + # Replace attention with MXIntViTAttentionHead + # self.self_attention = MXIntViTAttentionHead( + # dim=self.head_dim, + # num_heads=num_heads, + # attn_drop=attn_drop, + # q_config=q_config + # ) diff --git a/a_cx_mxint_quant/attention_head.py b/a_cx_mxint_quant/attention_head.py new file mode 100644 index 000000000..9935948e6 --- /dev/null +++ b/a_cx_mxint_quant/attention_head.py @@ -0,0 +1,167 @@ +import torch +from torch import Tensor +import torch.nn as nn +import math + +from typing import Optional, Tuple +from functools import partial + +from chop.nn.quantized.functional.matmul import ( + generic_matmul_integer, +) +from chop.nn.quantized.functional.softmax import ( + softmax_integer, +) +from chop.nn.quantizers.integer import integer_quantizer, integer_floor_quantizer +from .quantizers import mxint_quant_block + +class _ViTSelfAttentionHeadBase(torch.nn.Module): + def __init__(self, dim, num_heads, attn_drop) -> None: + super().__init__() + self.dropout = nn.Dropout(attn_drop) + + self.matmul1 = torch.matmul + self.matmul2 = torch.matmul + self.mult_data = torch.tensor(1 / math.sqrt(dim)) + self.act = nn.functional.softmax + + def self_attention_head( + self, + query_layer: torch.Tensor, + key_layer: torch.Tensor, + value_layer: 
torch.Tensor, + ) -> Tensor: + attention_scores = self.matmul1(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores * self.mult_data + + # Normalize the attention scores to probabilities. + attention_probs = self.act(attention_scores, dim=-1) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + context_layer = self.matmul2(attention_probs, value_layer) + return context_layer + + def forward( + self, + query_layer: torch.Tensor, + key_layer: torch.Tensor, + value_layer: torch.Tensor, + ) -> Tensor: + return self.self_attention_head( + query_layer=query_layer, key_layer=key_layer, value_layer=value_layer + ) + +from .linear import MXIntLinear, fast_linear +from .quantizers import mxint_hardware + +class MXIntMatMul(nn.Module): + def __init__(self, q_config=None): + super().__init__() + assert q_config is not None, "q_config cannot be None" + self.q_config = q_config + + def forward(self, x: torch.Tensor, y: torch.Tensor): + qx, _, _ = mxint_hardware( + x, + q_config = { + "width": self.q_config["data_in_width"], + "exponent_width": self.q_config["data_in_exponent_width"], + }, + parallelism = self.q_config["data_in_parallelism"] + ) + qy, _, _ = mxint_hardware( + y, + q_config = { + "width": self.q_config["weight_width"], + "exponent_width": self.q_config["weight_exponent_width"], + }, + parallelism = self.q_config["weight_parallelism"] + ) + + out = qx @ qy + out, _, _ = mxint_hardware( + out, + q_config = { + "width": self.q_config["data_out_width"], + "exponent_width": self.q_config["data_out_exponent_width"], + }, + parallelism = self.q_config["data_out_parallelism"] + ) + return out + +from .softmax import MXIntSoftmax +class MXIntViTAttentionHead(_ViTSelfAttentionHeadBase): + def __init__( + self, dim, num_heads, attn_drop=0.0, q_config: dict = None, floor=False + ) -> None: + 
super().__init__(dim, num_heads, attn_drop) + self.dropout = nn.Dropout(attn_drop) + + self.matmul1 = torch.matmul + self.matmul2 = torch.matmul + self.act = MXIntSoftmax(q_config=q_config) + self.mult_data = torch.tensor(1 / math.sqrt(dim)) + +class ViTSelfAttentionHeadInteger(_ViTSelfAttentionHeadBase): + def __init__( + self, dim, num_heads, attn_drop=0.0, q_config: dict = None, floor=False + ) -> None: + super().__init__(dim, num_heads, attn_drop) + base_quantizer = integer_floor_quantizer if floor else integer_quantizer + self.query_quantizer = partial( + base_quantizer, + width=q_config["query_width"], + frac_width=q_config["query_frac_width"], + ) + self.key_quantizer = partial( + base_quantizer, + width=q_config["key_width"], + frac_width=q_config["key_frac_width"], + ) + self.value_quantizer = partial( + base_quantizer, + width=q_config["value_width"], + frac_width=q_config["value_frac_width"], + ) + self.matmul1 = partial( + generic_matmul_integer, + config={ + "data_in_width": q_config["query_width"], + "data_in_frac_width": q_config["query_frac_width"], + "weight_width": q_config["key_width"], + "weight_frac_width": q_config["key_frac_width"], + }, + out_config={ + "data_out_width": q_config["qkmm_out_width"], + "data_out_frac_width": q_config["qkmm_out_frac_width"], + }, + floor=floor, + ) + self.act = partial( + softmax_integer, + config={ + "data_in_width": q_config["qkmm_out_width"], + "data_in_frac_width": q_config["qkmm_out_frac_width"], + "data_in_exp_width": q_config["softmax_exp_width"], + "data_in_exp_frac_width": q_config["softmax_exp_frac_width"], + "data_out_frac_width": q_config["softmax_out_frac_width"], + "mult_data": self.mult_data, + }, + floor=floor, + ) + self.mult_data = torch.tensor(1) + self.matmul2 = partial( + generic_matmul_integer, + config={ + "data_in_width": q_config["softmax_out_frac_width"] + 2, + "data_in_frac_width": q_config["softmax_out_frac_width"], + "weight_width": q_config["value_width"], + "weight_frac_width": 
q_config["value_frac_width"], + }, + out_config={ + "data_out_width": q_config["svmm_out_width"], + "data_out_frac_width": q_config["svmm_out_frac_width"], + }, + floor=floor, + ) diff --git a/a_cx_mxint_quant/gelu.drawio b/a_cx_mxint_quant/gelu.drawio new file mode 100644 index 000000000..bb0299be8 --- /dev/null +++ b/a_cx_mxint_quant/gelu.drawio @@ -0,0 +1,220 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/a_cx_mxint_quant/gelu.py b/a_cx_mxint_quant/gelu.py new file mode 100644 index 000000000..cbe191f1d --- /dev/null +++ b/a_cx_mxint_quant/gelu.py @@ -0,0 +1,77 @@ +# models.py +import torch +import torch.nn as nn +import math +from typing import List, Union, Optional +from pathlib import Path +import torch +import torch.nn as nn +from torch import Tensor +import math +from typing import Literal, Optional, Tuple, Union, Dict +from enum import Enum +from functools import partial +from tqdm import tqdm +from chop.nn.quantizers.integer import _integer_quantize +from .quantizers import mxint_hardware +from .utils import reshape_to_block, reshape_back + +def mxint_gelu(x, q_config): + """Vectorized range reduction""" + qx, mx, ex = mxint_hardware( + x, + { + "width": q_config["data_in_width"], + "exponent_width": q_config["data_in_exponent_width"], + "round_bits": 4, + }, + parallelism=q_config["data_in_parallelism"] + ) + # first + + original_shape = qx.shape + t1, t0 = mx.shape[-2:] + p1, p0 = q_config["data_in_parallelism"] + qx = reshape_to_block(qx, t1,t0, p1, p0) + mx = reshape_to_block(mx, t1, t0, p1, p0) + ex = 
ex.unsqueeze(-1).unsqueeze(-1) + + qout = torch.relu(qx) + eout = ex + remaining = (qx > -3) & (qx < 3) + + # data_width_loss + # avoid quant_loss here + # we will need to shift it to + # in hardware qx is lossless + VALID_WIDTH = q_config["data_in_width"] + 2 + HASH_OUT_WIDTH = q_config["hash_out_width"] + HASH_OUT_FRAC_WIDTH = HASH_OUT_WIDTH - 3 + # hash loss + qgelu = _integer_quantize(torch.nn.GELU()(qx), HASH_OUT_WIDTH, HASH_OUT_FRAC_WIDTH) + mgelu = qgelu * 2**(HASH_OUT_WIDTH - 1) // 2**ex + qgelu = mgelu * 2**ex / 2**(HASH_OUT_WIDTH - 1) + + qout[remaining] = qgelu[remaining] + qout = reshape_back(qout, t1, t0, p1, p0) + qout = qout.reshape(original_shape) + qx, mx, ex = mxint_hardware( + qout, + { + "width": q_config["data_out_width"], + "exponent_width": q_config["data_out_exponent_width"], + "round_bits": 4, + }, + parallelism=q_config["data_out_parallelism"] + ) + return qx, mx, ex + +class MXIntGELU(nn.Module): + def __init__(self, q_config: Dict = {}): + super().__init__() + self.q_config = q_config + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out, _, _ = mxint_gelu(x, self.q_config) + return out + diff --git a/a_cx_mxint_quant/layer_norm.drawio b/a_cx_mxint_quant/layer_norm.drawio new file mode 100644 index 000000000..8589bf0a0 --- /dev/null +++ b/a_cx_mxint_quant/layer_norm.drawio @@ -0,0 +1,469 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/a_cx_mxint_quant/layer_norm.py b/a_cx_mxint_quant/layer_norm.py new file mode 100644 index 000000000..302307c05 --- /dev/null +++ b/a_cx_mxint_quant/layer_norm.py @@ -0,0 +1,166 @@ +from torch import nn +import torch + +from .quantizers import mxint_quant_block, mxint_hardware +from chop.nn.quantizers import integer_floor_quantizer, integer_quantizer +from torch import Tensor +from math import ceil, log2 + +def mxint_layer_norm( + x: torch.Tensor, + normalized_shape: tuple or int, + weight=None, + bias=None, + eps=1e-5, + q_config={}, +): + def quantize(x, width, frac_width, by_pass=False, floor=False): + if not by_pass: + if floor: + x = integer_floor_quantizer(x, width, frac_width) + else: + x = integer_quantizer(x, width, frac_width) + return x + + def get_dim_and_prodofdim(x, normalized_shape): + dim = tuple(range(0 - len(normalized_shape), 0)) + num_vals = 1 + for items in dim: + num_vals *= x.shape[items] + return dim, num_vals + ''' + actually, we cannot assume that the input is quantized + ''' + def isqrt(x: torch.Tensor): + x = (x + eps).sqrt() + x = x.reciprocal() + return x + + if isinstance(normalized_shape, int): + normalized_shape = (normalized_shape,) + dim, num_vals = get_dim_and_prodofdim(x, normalized_shape) + inv_num_vals = torch.tensor(1 / num_vals) + + acc_out_width = ceil(log2(num_vals)) + q_config.get("data_in_width") + inv_num_vals_quant_0 = 2**acc_out_width // num_vals / 2**acc_out_width + # Mean calculation + mu_acc = x.sum(dim, keepdim=True) + mu = mu_acc * inv_num_vals_quant_0 + mu = quantize( + mu, + q_config.get("data_in_width"), + q_config.get("data_in_frac_width"), + q_config.get("by_pass"), + True, + ) + # I 
hope the output precision here should be $clog2 + # Variance calculation + diff = x - mu + + squares = diff**2 + sum_squares = torch.sum(squares, dim, keepdim=True) + squares_adder_tree_width = 2 * q_config.get("data_in_width") + ceil(log2(num_vals)) + inv_num_vals_quant_1 = 2**squares_adder_tree_width // num_vals / 2**squares_adder_tree_width + var = sum_squares * inv_num_vals_quant_1 + var = quantize( + var, + squares_adder_tree_width + 2, + 2*q_config.get("data_in_width") - 2, + floor=True, + ) + var, mvar, evar = mxint_hardware( + var, + { + "width": q_config.get("isqrt_in_width"), + "exponent_width": 6, + }, + parallelism=[1, 1], + ) + + var, mvar, evar = mxint_hardware( + var, + { + "width": q_config.get("isqrt_in_width"), + "exponent_width": 6, + }, + parallelism=[1, 1], + ) + mvar[evar %2 !=0] *= 2 + evar[evar %2 !=0] -= 1 + minv_sqrt = isqrt(mvar/2**(q_config.get("isqrt_in_width") - 1)) + minv_sqrt = integer_quantizer(minv_sqrt, q_config.get("isqrt_out_width"), q_config.get("isqrt_out_frac_width")) + einv_sqrt = -evar/2 + + inv_sqrt = minv_sqrt * 2**einv_sqrt + + # Norm calculation + mnorm_out = diff * minv_sqrt + enorm_out = einv_sqrt + mnorm_out = quantize( + mnorm_out, + q_config.get("data_out_width"), + q_config.get("data_out_frac_width"), + q_config.get("by_pass"), + floor=True, + ) + qnorm_out = mnorm_out*2**einv_sqrt + if weight is not None: + qweight, mweight, eweight = mxint_hardware(weight, + { + "width": q_config.get("weight_width"), + "exponent_width": q_config.get("weight_exponent_width"), + "round_bits": 4 + }, + q_config.get("weight_parallelism")) + qnorm_out = qnorm_out * qweight + if bias is not None: + qbias, mbias, ebias = mxint_hardware(bias, + { + "width": q_config.get("bias_width"), + "exponent_width": q_config.get("bias_exponent_width"), + "round_bits": 4 + }, + q_config.get("bias_parallelism")) + qnorm_out = qnorm_out + qbias + qnorm_out, mnorm_out, enorm_out = mxint_hardware(qnorm_out, + { + "width": q_config.get("data_out_width"), 
+ "exponent_width": q_config.get("data_out_exponent_width"), + "round_bits": 4 + }, + q_config.get("data_out_parallelism")) + return qnorm_out, mnorm_out, enorm_out + +def layer_norm_hardware( + x: torch.Tensor, + normalized_shape: tuple or int, + weight=None, + bias=None, + eps=1e-5, + q_config=None, +): + qx, mx, ex = mxint_quant_block(x, q_config["data_in_width"], q_config["data_in_exponent_width"]) + qnorm_out, _, _ = mxint_layer_norm(qx, normalized_shape, weight, bias, eps, q_config) + return qnorm_out + +class MXIntLayerNorm(nn.LayerNorm): + def __init__( + self, + normalized_shape, + eps: float = 0.00001, + elementwise_affine: bool = False, + bias: bool = False, + q_config=None, + ) -> None: + self.q_config = q_config + super().__init__(normalized_shape, eps, elementwise_affine, bias) + + def forward(self, x: Tensor) -> Tensor: + return layer_norm_hardware( + x, + self.normalized_shape, + self.weight, + self.bias, + self.eps, + q_config=self.q_config, + ) \ No newline at end of file diff --git a/a_cx_mxint_quant/linear.py b/a_cx_mxint_quant/linear.py new file mode 100644 index 000000000..3e6883ffe --- /dev/null +++ b/a_cx_mxint_quant/linear.py @@ -0,0 +1,173 @@ +from chop.nn.quantized.modules.linear import _LinearBase +from torch import Tensor +import torch +from .quantizers import mxint_hardware, reshape_to_block, reshape_back + +def fast_linear(x, w, b, config): + batch_size, n = x.shape[:2] + out_features = w.shape[0] + qx, mx, ex = mxint_hardware(x, **{ + "parallelism":[config["x_config"]["parallism_dim_1"], config["x_config"]["parallism_dim_0"]], + "q_config":{ + "width": config["x_config"]["width"], + "exponent_width": config["x_config"]["exponent_width"], + "round_bits": config["round_bits"], + + }, + }) + qw, mw, ew = mxint_hardware(w, **{ + "parallelism":[config["w_config"]["parallism_dim_1"], config["w_config"]["parallism_dim_0"]], + "q_config":{ + "width": config["w_config"]["width"], + "exponent_width": config["w_config"]["exponent_width"], + 
"round_bits": 8, + } + }) + qb, mb, eb = mxint_hardware(b, **{ + "parallelism":[config["bias_config"]["parallism_dim_1"], config["bias_config"]["parallism_dim_0"]], + "q_config":{ + "width": config["bias_config"]["width"], + "exponent_width": config["bias_config"]["exponent_width"], + "round_bits": 8, + } + }) + x_config = config["x_config"] + w_config = config["w_config"] + reshaped_mx = reshape_to_block(mx, x_config["dim_1"], x_config["dim_0"], x_config["parallism_dim_1"], x_config["parallism_dim_0"]) + reshaped_mw = reshape_to_block(mw, w_config["dim_1"], w_config["dim_0"], w_config["parallism_dim_1"], w_config["parallism_dim_0"]) + + # move the infeatures depth to the front + mx_for_accumulation = reshaped_mx.permute(2, 0, 1, 3, 4) + # The dimension will be [depth_in_features, batch_size, depth_n, parallism_n, parallism_in_features] + # For every parallelised block, we will have a unique exponent + # Original shape of ex is [batch_size, depth_n, depth_in_features] + # We will permute it to [depth_in_features, batch_size, depth_n] + ex_for_accumulation = ex.permute(2, 0, 1) + + # Same for mw, the shape of mw is [depth_out_features, depth_in_features, parallism_out_features, parallism_in_features] + mw_for_accumulation = reshaped_mw.squeeze(0) + mw_for_accumulation = mw_for_accumulation.permute(1, 0, 2, 3) + ew_for_accumulation = ew.transpose(0, 1) + + # We are trying to do the matmul based on the block partition + # mx is [depth_in_features, batch_size, depth_n, parallism_n, parallism_in_features] + # mw is [depth_in_features, depth_out_features, parallism_out_features, parallism_in_features] + # merge depth_out_features and parallelism_out_features + # mw = [depth_in_features, out_features, parallism_in_features] + mw_for_accumulation = mw_for_accumulation.reshape(mw_for_accumulation.shape[0], -1, mw_for_accumulation.shape[-1]) + + mout = mx_for_accumulation[0] @ mw_for_accumulation[0].transpose(-2, -1) + mout = reshape_to_block(mout, x_config["dim_1"], 
w_config["dim_1"], x_config["parallism_dim_1"], w_config["parallism_dim_1"]) + # shape of mout is [batch_size, depth_n, parallism_n, out_features] + ex_expanded = ex_for_accumulation.unsqueeze(-1) # [depth_in_features, batch_size, depth_n, 1] + ew_expanded = ew_for_accumulation.unsqueeze(1).unsqueeze(2) # [depth_in_features, 1, 1, depth_out_features] + eout = (ex_expanded[0] + ew_expanded[0]).unsqueeze(-1).unsqueeze(-1) + for i in range(1, mx_for_accumulation.shape[0]): + new_exponent = (ex_expanded[i] + ew_expanded[i]).unsqueeze(-1).unsqueeze(-1) + max_exponent = torch.max(eout, new_exponent) + mout = mout // 2 ** (max_exponent - eout) + current_result = mx_for_accumulation[i] @ mw_for_accumulation[i].transpose(-2, -1) + current_result = reshape_to_block(current_result, x_config["dim_1"], w_config["dim_1"], x_config["parallism_dim_1"], w_config["parallism_dim_1"]) + current_result = current_result // 2 ** (max_exponent - new_exponent) + mout += current_result + eout = max_exponent + + # the shape of qout will be [batch_size, depth_in_n, depth_out_features, paral_n, paral_out_features] + # the shape of mb will be [1, 1, out_features] + # reshape mb to [1, 1, depth_out_features, 1, paral_out_features] + # broad cast to [batch_size, depth_in_n, depth_out_features, paral_n, paral_out_features] + + # the shape of eout willbe [batch_size, depth_n, depth_out_features] + # the shape of eb will be [1, 1, depth_out_featuers] + + # so i wish eb can map back to + out_config = config["out_config"] + b_config = config["bias_config"] + width_difference = x_config["width"] + w_config["width"] - 2 - (b_config["width"] -1) + reshaped_mb = mb.reshape(1, 1, out_config["depth_dim_0"], 1, out_config["parallism_dim_0"]) + reshaped_eb = eb.reshape(1, 1, out_config["depth_dim_0"], 1, 1) + mb_for_out = reshaped_mb // 2**(eout - reshaped_eb - width_difference) + mout = mout + mb_for_out + + qout = reshape_back((mout / 2 **(x_config["width"]+w_config["width"] - 2 - eout)), x_config["dim_1"], 
w_config["dim_1"], x_config["parallism_dim_1"], w_config["parallism_dim_1"]) + qout = qout.reshape(batch_size, n, out_features) + + return qout + +class MXIntLinear(_LinearBase): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + q_config=None, + ) -> None: + super().__init__(in_features, out_features, bias, device, dtype) + assert q_config is not None, "config is None!" + self.in_features = in_features + self.out_features = out_features + self.q_config = q_config + self.bypass = q_config.get("bypass", False) + if self.bypass: + return + # establish quantizer + + def forward(self, x: Tensor) -> Tensor: + # an example of config + unroll_in_features = self.q_config["data_in_parallelism"][1] + unroll_out_features = self.q_config["data_out_parallelism"][1] + unroll_n = self.q_config["data_in_parallelism"][0] + in_features = self.in_features + out_features = self.out_features + n = x.shape[1] + batch_size = x.shape[0] + assert x.shape[2] == in_features, f"Input shape mismatch: {x.shape[2]} != {in_features}" + + self.config = { + "x_config": { + "width": self.q_config["data_in_width"], + "exponent_width": self.q_config["data_in_exponent_width"], + "parallism_dim_0": unroll_in_features, + "parallism_dim_1": unroll_n, + "depth_dim_0": in_features // unroll_in_features, + "depth_dim_1": n // unroll_n, + "dim_0": in_features, + "dim_1": n, + }, + "w_config": { + "width": self.q_config["weight_width"], + "exponent_width": self.q_config["weight_exponent_width"], + "parallism_dim_0": unroll_in_features, + "parallism_dim_1": unroll_out_features, + "depth_dim_0": in_features // unroll_in_features, + "depth_dim_1": out_features // unroll_out_features, + "dim_0": in_features, + "dim_1": out_features, + }, + "bias_config": { + "width": self.q_config["bias_width"], + "exponent_width": self.q_config["bias_exponent_width"], + "parallism_dim_0": unroll_out_features, + "parallism_dim_1": 1, + "depth_dim_0": out_features // 
unroll_out_features, + "depth_dim_1": 1, + "dim_0": out_features, + "dim_1": 1, + }, + "out_config": { + "width": self.q_config["data_out_width"], + "exponent_width": self.q_config["data_out_exponent_width"], + "parallism_dim_0": unroll_out_features, + "parallism_dim_1": unroll_n, + "depth_dim_0": out_features // unroll_out_features, + "depth_dim_1": n // unroll_n, + "dim_0": out_features, + "dim_1": n, + }, + "round_bits": self.q_config.get("round_bits", 4), + } + # out = fast_linear(x, self.weight, self.bias, self.config) + out = torch.nn.Linear(in_features, out_features, bias=True)(x) + return out diff --git a/a_cx_mxint_quant/mase_mxint_top_tb.py b/a_cx_mxint_quant/mase_mxint_top_tb.py new file mode 100644 index 000000000..beb02fe6c --- /dev/null +++ b/a_cx_mxint_quant/mase_mxint_top_tb.py @@ -0,0 +1,334 @@ +from pathlib import Path + +import cocotb +import logging, torch +from pathlib import Path + +logger = logging.getLogger(__name__) + +from pathlib import Path + +import cocotb +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import MultiSignalStreamDriver, MultiSignalErrorThresholdStreamMonitor, MultiSignalStreamMonitor +import sys +from os import getenv, PathLike + +import torch +from pathlib import Path +import time +import warnings +from cocotb.runner import get_runner, get_results + +from chop.tools import get_logger +import mase_components +from mase_components import get_modules + +import glob, os +from cocotb.utils import get_sim_time +def simulate( + model: torch.nn.Module = None, + model_info=None, + task: str = "", + dataset_info=None, + data_module=None, + load_name: PathLike = None, + load_type: str = None, + run_emit: bool = False, + skip_build: bool = False, + skip_test: bool = False, + trace_depth: int = 3, + gui: bool = False, + waves: bool = False, + simulator: str = "verilator", + pass_args = {}, +): + SIM = getenv("SIM", simulator) + runner = get_runner(SIM) + + project_dir = ( + pass_args["project_dir"] 
+ if "project_dir" in pass_args.keys() + else Path.home() / ".mase" / "top" + ) + + if not skip_build: + # To do: extract from mz checkpoint + if simulator == "questa": + sources = glob.glob(os.path.join(project_dir / "hardware" / "rtl", "*.sv")) + build_args = [] + + elif simulator == "verilator": + # sources = ["../../../top.sv"] + sources = glob.glob(os.path.join(project_dir / "hardware" / "rtl", "*.sv")) + build_args = [ + "-Wno-fatal", + "-Wno-lint", + "-Wno-style", + "--trace-fst", + "--trace-structs", + "--trace-depth", + str(trace_depth), + "--unroll-count", + "16384" + ] + else: + raise ValueError(f"Unrecognized simulator: {simulator}") + + includes = [ + project_dir / "hardware" / "rtl", + ] + [ + Path(mase_components.__file__).parent / module / "rtl" + for module in get_modules() + ] + + build_start = time.time() + + runner.build( + verilog_sources=sources, + includes=includes, + hdl_toplevel="top", + build_args=build_args, + parameters=[], # use default parameters, + ) + + build_end = time.time() + logger.info(f"Build finished. Time taken: {build_end - build_start:.2f}s") + + if not skip_test: + # Add tb file to python path + + # sys.path.append(str(pass_args["test_dir"])) + + test_start = time.time() + runner.test( + hdl_toplevel="top", + test_module="mase_mxint_top_tb", + hdl_toplevel_lang="verilog", + gui=gui, + waves=waves, + ) + test_end = time.time() + logger.info(f"Test finished. 
Time taken: {test_end - test_start:.2f}s") + +class MaseGraphTB(Testbench): + def __init__(self, dut, fail_on_checks=True): + super().__init__(dut, dut.clk, dut.rst, fail_on_checks=fail_on_checks) + + # Instantiate as many drivers as required inputs to the model + self.input_drivers = {} + self.output_monitors = {} + + arg = "data_in_0" + result = "data_out_0" + self.input_drivers[arg] = MultiSignalStreamDriver( + dut.clk, (dut.mdata_in_0, dut.edata_in_0), + dut.data_in_0_valid, dut.data_in_0_ready + ) + # self.input_drivers[arg].log.setLevel(logging.DEBUG) + + # Instantiate as many monitors as required outputs + self.output_monitors[result] = MultiSignalStreamMonitor( + dut.clk, + (dut.mdata_out_0, dut.edata_out_0), + dut.data_out_0_valid, + dut.data_out_0_ready, + check=False, + ) + # self.output_monitors[result].log.setLevel(logging.DEBUG) + + def generate_inputs(self, batches, model=None): + """ + Generate inputs for the model by sampling a random tensor + for each input argument, according to its shape + + :param batches: number of batches to generate for each argument + :type batches: int + :return: a dictionary of input arguments and their corresponding tensors + :rtype: Dict + """ + # ! 
TO DO: iterate through graph.args instead to generalize + inputs = torch.randn(batches, self.get_parameter(f"DATA_IN_0_TENSOR_SIZE_DIM_1"), self.get_parameter(f"DATA_IN_0_TENSOR_SIZE_DIM_0")) + if model is not None: + outputs = model(inputs) + else: + outputs = torch.randn(batches, self.get_parameter(f"DATA_OUT_0_TENSOR_SIZE_DIM_1"), self.get_parameter(f"DATA_OUT_0_TENSOR_SIZE_DIM_0")) + return inputs, outputs + + def preprocess_tensor_for_mxint(self, tensor, config, parallelism): + from mase_components.linear_layers.mxint_operators.test.utils import mxint_hardware + from mase_components.linear_layers.mxint_operators.test.utils import pack_tensor_to_mx_listed_chunk + + (qtensor, mtensor, etensor) = mxint_hardware(tensor, config, parallelism) + tensor_inputs = pack_tensor_to_mx_listed_chunk(mtensor, etensor, parallelism) + return tensor_inputs + + def load_drivers(self, in_tensors): + for i in range(in_tensors.shape[0]): + data_0_inputs = self.preprocess_tensor_for_mxint( + tensor=in_tensors[i], + config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + "round_bits": 4 + }, + parallelism=[self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0")] + ) + self.input_drivers["data_in_0"].load_driver(data_0_inputs) + + def load_monitors(self, expectation): + for i in range(expectation.shape[0]): + exp_out = self.preprocess_tensor_for_mxint( + tensor=expectation[i], + config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + "round_bits": 4 + }, + parallelism=[self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0")] + ) + self.output_monitors["data_out_0"].load_monitor(exp_out) + +import torch.nn as nn +@cocotb.test() +async def test(dut): + # cocotb.start_soon(check_signal(dut)) + tb = MaseGraphTB(dut, fail_on_checks=True) + await 
tb.initialize() + in_tensors, out_tensors = tb.generate_inputs(batches=1) + + tb.load_drivers(in_tensors) + tb.load_monitors(out_tensors) + + await tb.wait_end(timeout=100, timeout_unit="ms") + +from cocotb.triggers import * +async def check_signal(dut): + await Timer(40, units="ns") + # Initialize counters for each data handshake interface + data_in_0_count = 0 + data_out_0_count = 0 + linear1_data_count = 0 + act_data_count = 0 + linear2_data_count = 0 + norm1_data_count = 0 + attention_data_count = 0 + norm2_data_count = 0 + add_data_count = 0 + add1_data_count = 0 + out_depth = 192/4 + # Initialize timestamps for measuring handshake intervals + data_in_time = get_sim_time(units='ns') + data_out_time = get_sim_time(units='ns') + linear1_time = get_sim_time(units='ns') + act_time = get_sim_time(units='ns') + linear2_time = get_sim_time(units='ns') + norm1_time = get_sim_time(units='ns') + attention_time = get_sim_time(units='ns') + norm2_time = get_sim_time(units='ns') + add_time = get_sim_time(units='ns') + add1_time = get_sim_time(units='ns') + + while True: + await RisingEdge(dut.clk) + await ReadOnly() + + # Count handshakes for main input/output + if dut.data_in_0_valid.value and dut.data_in_0_ready.value: + data_in_0_count += 1 + if data_in_0_count == out_depth: + data_in_0_count = 0 + new_data_in_time = get_sim_time(units='ns') + diff_data_in = new_data_in_time - data_in_time + data_in_time = get_sim_time(units='ns') + print(f"data_in_0 handshake time: {diff_data_in}") + + if dut.data_out_0_valid.value and dut.data_out_0_ready.value: + data_out_0_count += 1 + if data_out_0_count == out_depth: + data_out_0_count = 0 + new_data_out_time = get_sim_time(units='ns') + diff_data_out = new_data_out_time - data_out_time + data_out_time = get_sim_time(units='ns') + print(f"data_out_0 handshake time: {diff_data_out}") + + if dut.stream_blocks_0_linear1_data_out_0_valid.value and dut.stream_blocks_0_linear1_data_out_0_ready.value: + linear1_data_count += 1 + if 
linear1_data_count == out_depth: + linear1_data_count = 0 + new_linear1_time = get_sim_time(units='ns') + diff_linear1 = new_linear1_time - linear1_time + linear1_time = get_sim_time(units='ns') + print(f"linear1 handshake time: {diff_linear1}") + + if dut.stream_blocks_0_act_data_out_0_valid.value and dut.stream_blocks_0_act_data_out_0_ready.value: + act_data_count += 1 + if act_data_count == out_depth: + act_data_count = 0 + new_act_time = get_sim_time(units='ns') + diff_act = new_act_time - act_time + act_time = get_sim_time(units='ns') + print(f"act handshake time: {diff_act}") + + if dut.stream_blocks_0_linear2_data_out_0_valid.value and dut.stream_blocks_0_linear2_data_out_0_ready.value: + linear2_data_count += 1 + if linear2_data_count == out_depth: + linear2_data_count = 0 + new_linear2_time = get_sim_time(units='ns') + diff_linear2 = new_linear2_time - linear2_time + linear2_time = get_sim_time(units='ns') + print(f"linear2 handshake time: {diff_linear2}") + + if dut.stream_blocks_0_norm1_data_out_0_valid.value and dut.stream_blocks_0_norm1_data_out_0_ready.value: + norm1_data_count += 1 + if norm1_data_count == out_depth: + norm1_data_count = 0 + new_norm1_time = get_sim_time(units='ns') + diff_norm1 = new_norm1_time - norm1_time + norm1_time = get_sim_time(units='ns') + print(f"norm1 handshake time: {diff_norm1}") + + if dut.stream_blocks_0_attention_data_out_0_valid.value and dut.stream_blocks_0_attention_data_out_0_ready.value: + attention_data_count += 1 + if attention_data_count == out_depth: + attention_data_count = 0 + new_attention_time = get_sim_time(units='ns') + diff_attention = new_attention_time - attention_time + attention_time = get_sim_time(units='ns') + print(f"attention handshake time: {diff_attention}") + + if dut.stream_blocks_0_norm2_data_out_0_valid.value and dut.stream_blocks_0_norm2_data_out_0_ready.value: + norm2_data_count += 1 + if norm2_data_count == out_depth: + norm2_data_count = 0 + new_norm2_time = get_sim_time(units='ns') 
+ diff_norm2 = new_norm2_time - norm2_time + norm2_time = get_sim_time(units='ns') + print(f"norm2 handshake time: {diff_norm2}") + + if dut.stream_blocks_0_add_data_out_0_valid.value and dut.stream_blocks_0_add_data_out_0_ready.value: + add_data_count += 1 + if add_data_count == out_depth: + add_data_count = 0 + new_add_time = get_sim_time(units='ns') + diff_add = new_add_time - add_time + add_time = get_sim_time(units='ns') + print(f"add handshake time: {diff_add}") + + if dut.stream_blocks_0_add_1_data_out_0_valid.value and dut.stream_blocks_0_add_1_data_out_0_ready.value: + add1_data_count += 1 + if add1_data_count == out_depth: + add1_data_count = 0 + new_add1_time = get_sim_time(units='ns') + diff_add1 = new_add1_time - add1_time + add1_time = get_sim_time(units='ns') + print(f"add1 handshake time: {diff_add1}") + + + + +if __name__ == "__main__": + pass_args = { + "project_dir": Path("./mxint_vit_block"), + } + simulate(skip_build=False, skip_test=False, simulator="verilator", waves=True, gui=False, trace_depth=5, pass_args=pass_args) \ No newline at end of file diff --git a/a_cx_mxint_quant/mase_top_tb.py b/a_cx_mxint_quant/mase_top_tb.py new file mode 100644 index 000000000..f48a40723 --- /dev/null +++ b/a_cx_mxint_quant/mase_top_tb.py @@ -0,0 +1,232 @@ +from pathlib import Path + +import cocotb +import logging, torch +from pathlib import Path + +logger = logging.getLogger(__name__) + +from pathlib import Path + +import cocotb +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import StreamDriver, StreamMonitor +import sys +from os import getenv, PathLike + +import torch +from pathlib import Path +import time +import warnings +from cocotb.runner import get_runner, get_results + +from chop.tools import get_logger +import mase_components +from mase_components import get_modules + +import glob, os + +def simulate( + model: torch.nn.Module = None, + model_info=None, + task: str = "", + dataset_info=None, + data_module=None, + 
load_name: PathLike = None, + load_type: str = None, + run_emit: bool = False, + skip_build: bool = False, + skip_test: bool = False, + trace_depth: int = 3, + gui: bool = False, + waves: bool = False, + simulator: str = "verilator", + pass_args = {}, +): + SIM = getenv("SIM", simulator) + runner = get_runner(SIM) + + project_dir = ( + pass_args["project_dir"] + if "project_dir" in pass_args.keys() + else Path.home() / ".mase" / "top" + ) + + if not skip_build: + # To do: extract from mz checkpoint + if simulator == "questa": + sources = glob.glob(os.path.join(project_dir / "hardware" / "rtl", "*.sv")) + build_args = [] + + elif simulator == "verilator": + # sources = ["../../../top.sv"] + sources = glob.glob(os.path.join(project_dir / "hardware" / "rtl", "*.sv")) + build_args = [ + "-Wno-fatal", + "-Wno-lint", + "-Wno-style", + "--trace-fst", + "--trace-structs", + "--trace-depth", + str(trace_depth), + "--unroll-count", + "16384" + ] + else: + raise ValueError(f"Unrecognized simulator: {simulator}") + + includes = [ + project_dir / "hardware" / "rtl", + ] + [ + Path(mase_components.__file__).parent / module / "rtl" + for module in get_modules() + ] + + build_start = time.time() + + runner.build( + verilog_sources=sources, + includes=includes, + hdl_toplevel="top", + build_args=build_args, + parameters=[], # use default parameters, + ) + + build_end = time.time() + logger.info(f"Build finished. Time taken: {build_end - build_start:.2f}s") + + if not skip_test: + # Add tb file to python path + + # sys.path.append(str(pass_args["test_dir"])) + + test_start = time.time() + runner.test( + hdl_toplevel="top", + test_module="mase_top_tb", + hdl_toplevel_lang="verilog", + gui=gui, + waves=waves, + ) + test_end = time.time() + logger.info(f"Test finished. 
Time taken: {test_end - test_start:.2f}s") + +class MaseGraphTB(Testbench): + def __init__(self, dut, fail_on_checks=True): + super().__init__(dut, dut.clk, dut.rst, fail_on_checks=fail_on_checks) + + # Instantiate as many drivers as required inputs to the model + self.input_drivers = {} + self.output_monitors = {} + + arg = "data_in_0" + result = "data_out_0" + self.input_drivers[arg] = StreamDriver( + dut.clk, + getattr(dut, arg), + getattr(dut, f"{arg}_valid"), + getattr(dut, f"{arg}_ready"), + ) + self.input_drivers[arg].log.setLevel(logging.DEBUG) + + # Instantiate as many monitors as required outputs + self.output_monitors[result] = StreamMonitor( + dut.clk, + getattr(dut, result), + getattr(dut, f"{result}_valid"), + getattr(dut, f"{result}_ready"), + check=False, + ) + self.output_monitors[result].log.setLevel(logging.DEBUG) + + def generate_inputs(self, batches): + """ + Generate inputs for the model by sampling a random tensor + for each input argument, according to its shape + + :param batches: number of batches to generate for each argument + :type batches: int + :return: a dictionary of input arguments and their corresponding tensors + :rtype: Dict + """ + # ! 
TO DO: iterate through graph.args instead to generalize + inputs = torch.randn(batches, self.get_parameter(f"DATA_IN_0_TENSOR_SIZE_DIM_1"), self.get_parameter(f"DATA_IN_0_TENSOR_SIZE_DIM_0")) + outputs = torch.randn(batches, self.get_parameter(f"DATA_OUT_0_TENSOR_SIZE_DIM_1"), self.get_parameter(f"DATA_OUT_0_TENSOR_SIZE_DIM_0")) + return inputs, outputs + + def load_drivers(self, in_tensors): + from mase_cocotb.utils import fixed_preprocess_tensor + + in_data_blocks = fixed_preprocess_tensor( + tensor=in_tensors, + q_config={ + "width": self.get_parameter(f"DATA_IN_0_PRECISION_0"), + "frac_width": self.get_parameter( + f"DATA_IN_0_PRECISION_1" + ), + }, + parallelism=[ + self.get_parameter(f"DATA_IN_0_PARALLELISM_DIM_1"), + self.get_parameter(f"DATA_IN_0_PARALLELISM_DIM_0"), + ], + floor=True, + ) + + # Append all input blocks to input driver + # ! TO DO: generalize + block_size = self.get_parameter( + "DATA_IN_0_PARALLELISM_DIM_0" + ) * self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1") + for block in in_data_blocks: + if len(block) < block_size: + block = block + [0] * (block_size - len(block)) + self.input_drivers["data_in_0"].append(block) + + def load_monitors(self, expectation): + from mase_cocotb.utils import fixed_preprocess_tensor + + # Process the expectation tensor + output_blocks = fixed_preprocess_tensor( + tensor=expectation, + q_config={ + "width": self.get_parameter(f"DATA_OUT_0_PRECISION_0"), + "frac_width": self.get_parameter(f"DATA_OUT_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter(f"DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter(f"DATA_OUT_0_PARALLELISM_DIM_0"), + ], + floor=True, + ) + + # Set expectation for each monitor + for block in output_blocks: + # ! 
TO DO: generalize to multi-output models + if len(block) < self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"): + block = block + [0] * ( + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0") - len(block) + ) + self.output_monitors["data_out_0"].expect(block) + + # Drive the in-flight flag for each monitor + self.output_monitors["data_out_0"].in_flight = True + +@cocotb.test() +async def test(dut): + + tb = MaseGraphTB(dut, fail_on_checks=True) + await tb.initialize() + + in_tensors, out_tensors = tb.generate_inputs(batches=10) + + tb.load_drivers(in_tensors) + tb.load_monitors(out_tensors) + + await tb.wait_end(timeout=0.1, timeout_unit="s") + + +if __name__ == "__main__": + pass_args = { + "project_dir": Path("./int_linear"), + } + simulate(skip_build=False, skip_test=False, simulator="verilator", waves=True, gui=False, pass_args=pass_args) \ No newline at end of file diff --git a/a_cx_mxint_quant/mase_utils.py b/a_cx_mxint_quant/mase_utils.py new file mode 100644 index 000000000..41460a9c0 --- /dev/null +++ b/a_cx_mxint_quant/mase_utils.py @@ -0,0 +1,475 @@ +import torch +from chop.nn.quantizers import integer_floor_quantizer +from functools import partial +import torch.nn.functional as F +from torch import Tensor + +import torch +from functools import partial +import torch.nn.functional as F +from torch import Tensor + +def mxint_quant_block( + x, width: int = 12, exponent_width: int = 6, exponent: int = None, floor: bool = False +): + """ + - Idea from https://arxiv.org/pdf/2310.10537 + - Convert IEEE FP32/64 to Integer with sharing scale + - The main difference between is the sharing scale do not support NAN representation + --- + - `width`: The number of mantissa bits + 1 (the sign bit) + - `exponent_width`: the number of exponent bits, which is shared over a block + - `exponent_bias`: the exponent bias, if None, `2**(exponent_bits-1)-1` will be used + + """ + exponent_bias = 2 ** (exponent_width - 1) + exponent_max = 2**exponent_width - 1 - exponent_bias + 
def mxint_quant_block(
    x, width: int = 12, exponent_width: int = 6, exponent: int = None, floor: bool = False
):
    """
    Block-floating-point (MXInt) quantization of ``x`` with a shared exponent.

    - Idea from https://arxiv.org/pdf/2310.10537
    - Converts IEEE FP32/64 values to an integer mantissa plus one exponent
      shared over the block (the last dimension). Unlike IEEE formats the
      shared scale has no NaN representation.

    :param x: input tensor; each row along the last dim is one block
    :param width: mantissa bits + 1 sign bit
    :param exponent_width: bits of the shared exponent
    :param exponent: optional externally-supplied shared exponent; if None it
        is derived from the per-block max magnitude.
        BUG FIX: this argument used to be accepted but silently overwritten.
    :param floor: truncate mantissas instead of rounding
    :return: the dequantized (quantize-then-rescale) tensor, same shape as x
    """
    exponent_bias = 2 ** (exponent_width - 1)
    exponent_max = 2**exponent_width - 1 - exponent_bias
    exponent_min = -exponent_bias

    if exponent is None:
        # Shared exponent = ceil(log2(max|x|)) per block; `tiny` keeps log2
        # finite for an all-zero block.
        abs_max = x.abs().max(dim=-1, keepdim=True).values
        exponent = torch.ceil(
            torch.log2(abs_max + torch.finfo(torch.float32).tiny)
        )
    else:
        exponent = torch.as_tensor(exponent, dtype=x.dtype)
    exponent = torch.clamp(exponent, exponent_min, exponent_max)

    # Integer mantissa range for a signed `width`-bit value.
    int_min = -(2 ** (width - 1))
    int_max = 2 ** (width - 1) - 1
    mantissa = x * (2 ** (width - 1)) / 2**exponent
    mantissa = torch.floor(mantissa) if floor else torch.round(mantissa)
    mantissa = torch.clamp(mantissa, int_min, int_max)
    return (2**exponent) * mantissa / (2 ** (width - 1))


def mxint_hardware(tensor, q_config, parallelism):
    """
    Apply MXInt quantization with a hardware-matched blocking order.

    The trailing two dims are tiled into (p1, p0) blocks; each block shares
    one exponent, mirroring how the RTL streams the data.

    :param tensor: input of shape (..., t1, t0); a 1-D input is treated as (1, t0)
    :param q_config: kwargs forwarded to ``mxint_quant_block``
    :param parallelism: [p1, p0] block shape; a 1-element list means [1, p0]
    :return: quantized tensor with the caller's original shape
    """
    original_shape = tensor.shape
    if len(tensor.shape) == 1:
        tensor = tensor.unsqueeze(0)
    if len(parallelism) == 1:
        parallelism = [1, parallelism[0]]

    p1, p0 = parallelism
    t1, t0 = tensor.shape[-2:]

    assert t1 % p1 == 0 and t0 % p0 == 0, \
        f"Block size mismatch: t1={t1}, p1={p1}, t0={t0}, p0={p0}"

    # Tile (t1, t0) into (t1/p1, t0/p0) blocks of shape (p1, p0); each row of
    # `block_tensor` is one block, so the quantizer shares an exponent per block.
    block_tensor = (
        tensor.reshape(-1, t1 // p1, p1, t0 // p0, p0)
        .permute(0, 1, 3, 2, 4)
        .reshape(-1, p1 * p0)
    )

    qtensor = mxint_quant_block(block_tensor, **q_config)

    # Undo the blocking and restore the original shape.
    return (
        qtensor.reshape(-1, t1 // p1, t0 // p0, p1, p0)
        .permute(0, 1, 3, 2, 4)
        .reshape(original_shape)
    )
"dim_0": in_features, + "dim_1": n, + }, + "w_config": { + "width": config["weight_width"], + "exponent_width": config["weight_exponent_width"], + "parallism_dim_0": config["weight_parallelism"][1], + "parallism_dim_1": config["weight_parallelism"][0], + "depth_dim_0": in_features // config["weight_parallelism"][1], + "depth_dim_1": out_features // config["weight_parallelism"][0], + "dim_0": in_features, + "dim_1": out_features, + }, + "bias_config": { + "width": config["bias_width"], + "exponent_width": config["bias_exponent_width"], + "parallism_dim_0": config["bias_parallelism"][1], + "parallism_dim_1": 1, + "depth_dim_0": out_features // config["bias_parallelism"][1], + "depth_dim_1": 1, + "dim_0": out_features, + "dim_1": 1, + }, + "out_config": { + "width": config["data_out_width"], + "exponent_width": config["data_out_exponent_width"], + "parallism_dim_0": config["data_out_parallelism"][1], + "parallism_dim_1": config["data_out_parallelism"][0], + "depth_dim_0": out_features // config["data_out_parallelism"][1], + "depth_dim_1": n // config["data_out_parallelism"][0], + "dim_0": out_features, + "dim_1": n, + }, + } + mout, eout = mxint_linear_hardware(x, w, bias, in_config) + out_config = in_config["out_config"] + reshaped_mout = mout.reshape( + out_config["depth_dim_1"], + out_config["parallism_dim_1"], + out_config["depth_dim_0"], + out_config["parallism_dim_0"], + ).permute(0, 2, 1, 3) + reshaped_out = reshaped_mout * 2 ** ( + eout[:, :, None, None] - config["data_out_width"] + 1 + ) + out = reshaped_out.reshape( + out_config["depth_dim_1"], + out_config["depth_dim_0"], + out_config["parallism_dim_1"], + out_config["parallism_dim_0"], + ).permute(0, 2, 1, 3) + out = out.reshape(out_config["dim_1"], out_config["dim_0"]) + + return out + + +def mxint_linear_hardware(x, w, bias, config): + """ + assume 2 dimensional input + config = { + "x_config":{ + "width": , + "exponent_width" , + "parallism_dim_0", + "parallism_dim_1", + "depth_dim_0", + "depth_dim_1", 
+ "dim_0", + "dim_1", + }, + "w_config": { + ... + }, + "bias_config": { + ... + }, + "out_config": { + ... + }, + } + """ + mx, ex = x + mw, ew = w + x_config = config["x_config"] + w_config = config["w_config"] + out_config = config["out_config"] + from math import ceil, log2 + + def DotProductCore(man_x, exp_x, man_y, exp_y): + return man_x @ man_y.transpose(0, 1), exp_x + exp_y + + def block_wise_reshape_tensor(x, x_config): + reshaped_x = x.reshape( + x_config["depth_dim_1"], + x_config["parallism_dim_1"], + x_config["depth_dim_0"], + x_config["parallism_dim_0"], + ).permute(0, 2, 1, 3) + reshaped_x = reshaped_x.reshape( + x_config["depth_dim_1"] * x_config["depth_dim_0"], + x_config["parallism_dim_1"], + x_config["parallism_dim_0"], + ) + return reshaped_x + + # assume 2 dimensional input + assert ( + x_config["depth_dim_0"] == w_config["depth_dim_0"] + ), "need to check the setting of dim" + assert ( + x_config["parallism_dim_0"] == w_config["parallism_dim_0"] + ), "need to check the setting of dim" + reshaped_ex = ex.reshape(-1) + reshaped_mx = block_wise_reshape_tensor(mx, x_config) + reshaped_ew = ew.reshape(-1) + reshaped_mw = block_wise_reshape_tensor(mw, w_config) + man_out = torch.zeros( + x_config["depth_dim_1"], + w_config["depth_dim_1"], + x_config["parallism_dim_1"] * w_config["parallism_dim_1"], + ) + exp_out = torch.zeros(x_config["depth_dim_1"], w_config["depth_dim_1"]) + for i in range(x_config["depth_dim_1"]): + for j in range(w_config["depth_dim_1"]): + partial_man_out = torch.zeros( + w_config["depth_dim_0"], + x_config["parallism_dim_1"], + w_config["parallism_dim_1"], + ) + partial_exp_out = torch.zeros(w_config["depth_dim_0"]) + for k in range(x_config["depth_dim_0"]): + mx_block = reshaped_mx[i * x_config["depth_dim_0"] + k] + ex_block = reshaped_ex[i * x_config["depth_dim_0"] + k] + mw_block = reshaped_mw[j * w_config["depth_dim_0"] + k] + ew_block = reshaped_ew[j * w_config["depth_dim_0"] + k] + partial_man_out[k], partial_exp_out[k] 
= DotProductCore( + mx_block, ex_block, mw_block, ew_block + ) + acc_man_out, acc_exp_out = MxIntAccumulator( + partial_man_out.reshape(w_config["depth_dim_0"], -1), partial_exp_out + ) + if bias != None: + bias_config = config["bias_config"] + mbias, ebias = bias + reshaped_mbias = mbias.reshape( + w_config["depth_dim_1"], w_config["parallism_dim_1"] + ) + reshaped_ebias = ebias.reshape(w_config["depth_dim_1"]) + shifted_value = ( + reshaped_ebias[j] + - acc_exp_out + + x_config["width"] + + w_config["width"] + - 2 + - (bias_config["width"] - 1) + ) + shifted_bias = reshaped_mbias[j].repeat( + x_config["parallism_dim_1"] + ) * 2 ** (shifted_value) + acc_man_out = shifted_bias + acc_man_out + man_out[i][j], exp_out[i][j] = MxIntCast( + acc_man_out, + acc_exp_out, + { + "in_width": x_config["width"] + + w_config["width"] + + ceil(log2(x_config["dim_0"])), + "in_frac_width": x_config["width"] + w_config["width"] - 2, + "in_exponent_width": max( + x_config["exponent_width"], w_config["exponent_width"] + ) + + 1, + "out_width": out_config["width"], + "out_exponent_width": out_config["exponent_width"], + }, + ) + man_out = ( + man_out.reshape( + x_config["depth_dim_1"], + w_config["depth_dim_1"], + x_config["parallism_dim_1"], + w_config["parallism_dim_1"], + ) + .permute(0, 2, 1, 3) + .reshape(x_config["dim_1"], w_config["dim_1"]) + ) + return man_out, exp_out + + +def MXIntMatmulHardware(man_x, exp_x, man_y, exp_y, x_config, y_config, out_config): + """ + assume 2 dimensional input + config = { + "width": , + "exponent_width" , + "parallism_dim_0", + "parallism_dim_1", + "depth_dim_0", + "depth_dim_1", + "dim_0", + "dim_1", + } + man.shape = [dim_1 * dim_0] + exp.shape = [depth_dim_1, depth_dim_0] + """ + from math import ceil, log2 + + def MatmulCore(man_x, exp_x, man_y, exp_y): + return man_x @ man_y, exp_x + exp_y + + # assume 2 dimensional input + assert ( + x_config["depth_dim_0"] == y_config["depth_dim_1"] + ), "need to check the setting of dim" + + def 
block_wise_reshape_tensor(x, x_config): + reshaped_x = x.reshape( + x_config["depth_dim_1"], + x_config["parallism_dim_1"], + x_config["depth_dim_0"], + x_config["parallism_dim_0"], + ).permute(0, 2, 1, 3) + reshaped_x = reshaped_x.reshape( + x_config["depth_dim_1"] * x_config["depth_dim_0"], + x_config["parallism_dim_1"], + x_config["parallism_dim_0"], + ) + return reshaped_x + + reshaped_exp_x = exp_x.reshape(-1) + reshaped_man_x = block_wise_reshape_tensor(man_x, x_config) + reshaped_exp_y = exp_y.reshape(-1) + reshaped_man_y = block_wise_reshape_tensor(man_y, y_config) + man_out = torch.zeros( + x_config["depth_dim_1"], + y_config["depth_dim_0"], + x_config["parallism_dim_1"] * y_config["parallism_dim_0"], + ) + exp_out = torch.zeros(x_config["depth_dim_1"], y_config["depth_dim_0"]) + for i in range(x_config["depth_dim_1"]): + for j in range(y_config["depth_dim_0"]): + partial_man_out = torch.zeros( + y_config["depth_dim_1"], + x_config["parallism_dim_1"], + y_config["parallism_dim_0"], + ) + partial_exp_out = torch.zeros(y_config["depth_dim_1"]) + for k in range(y_config["depth_dim_1"]): + man_x_block = reshaped_man_x[i * x_config["depth_dim_0"] + k] + exp_x_block = reshaped_exp_x[i * x_config["depth_dim_0"] + k] + man_y_block = reshaped_man_y[k * y_config["depth_dim_0"] + j] + exp_y_block = reshaped_exp_y[k * y_config["depth_dim_0"] + j] + partial_man_out[k], partial_exp_out[k] = MatmulCore( + man_x_block, exp_x_block, man_y_block, exp_y_block + ) + acc_man_out, acc_exp_out = MxIntAccumulator( + partial_man_out.reshape(y_config["depth_dim_1"], -1), partial_exp_out + ) + man_out[i][j], exp_out[i][j] = MxIntCast( + acc_man_out, + acc_exp_out, + { + "in_width": x_config["width"] + + y_config["width"] + + ceil(log2(x_config["dim_0"])), + "in_frac_width": x_config["width"] + y_config["width"] - 2, + "in_exponent_width": max( + x_config["exponent_width"], y_config["exponent_width"] + ) + + 1, + "out_width": out_config["width"], + "out_exponent_width": 
def MxIntCast(man_in, exp_in, param):
    """
    Re-normalize an MXInt block (integer mantissas + shared exponent) into a
    new mantissa width / exponent width.

    :param man_in: integer mantissas of the block (tensor)
    :param exp_in: shared input exponent (scalar tensor)
    :param param: dict with ``in_frac_width``, ``out_width``,
        ``out_exponent_width`` (``in_width`` / ``in_exponent_width`` are
        accepted for interface compatibility but are not needed here)
    :return: (out_man, out_exp) — clamped output mantissas and exponent

    BUG FIX: removed the dead ``max_in = torch.ceil(torch.log2(...))``
    statement (could evaluate to -inf for an all-zero block and was never
    used) and the unused ``in_width`` / ``in_exponent_width`` locals.
    """
    out_width = param["out_width"]
    out_exponent_width = param["out_exponent_width"]
    in_frac_width = param["in_frac_width"]

    out_exponent_max = 2 ** (out_exponent_width - 1) - 1
    out_exponent_min = -(2 ** (out_exponent_width - 1))
    out_min = -(2 ** (out_width - 1))
    out_max = 2 ** (out_width - 1) - 1

    # Position of the leading bit of the largest mantissa; the epsilon keeps
    # log2 finite when the whole block is zero.
    lma_in = torch.ceil(torch.log2(man_in.abs().max() + 1e-3))
    out_exp_full = lma_in + exp_in - in_frac_width
    out_exp = torch.clamp(out_exp_full, out_exponent_min, out_exponent_max)
    # Shift mantissas to the clamped exponent, then saturate to the output range.
    out_man = man_in // 2 ** (in_frac_width - exp_in + out_exp - (out_width - 1))
    out_man = torch.clamp(out_man, out_min, out_max)

    return out_man, out_exp

def MxIntAccumulator(man, exp):
    """
    Accumulate a sequence of MXInt blocks sharing the block layout.

    ``man`` is (IN_DEPTH, BLOCK_SIZE); ``exp`` holds one shared exponent per
    depth step. The running sum is kept at the largest exponent seen so far:
    when a larger exponent arrives, the accumulator is right-shifted (floor
    division) before the newly aligned mantissas are added.

    :return: (mout, out_exp) — accumulated mantissas and their shared exponent
    """
    IN_DEPTH, BLOCK_SIZE = man.shape[0], man.shape[1]
    max_exp = torch.Tensor([float("-inf")])
    mout = torch.zeros(BLOCK_SIZE)
    out_exp = torch.Tensor([float("-inf")])
    for i in range(IN_DEPTH):
        max_exp = exp[i] if exp[i] > max_exp else max_exp
        # Re-align the running sum to the (possibly larger) exponent.
        mout = mout // 2 ** (max_exp - out_exp)
        out_exp = max_exp
        # Align the incoming mantissas to the common exponent, then add.
        shifted_man = man[i] // 2 ** (max_exp - exp[i])
        mout = mout + shifted_man

    return mout, out_exp
partial( + mxint_quantize, + width=8, + exponent_width=4) + _, mlog2_e, elog2_e = coefficient_quant_block(torch.log2(torch.tensor(math.e))) + _, mln_2, eln_2 = coefficient_quant_block(torch.log(torch.tensor(2.0))) + n = hardware_round(mx * mlog2_e, ex + elog2_e, (in_man_width - 1 + 7), data_out_n_width) + print(n) + _mx = n * mln_2 + _ex = eln_2 + shifted_mx = mx // 2**(_ex - ex + (in_man_width - 1) - 7) + print(shifted_mx) + print(_ex - ex + (in_man_width - 1) - 7) + mr = shifted_mx - _mx + # return mr as an fixedpoint ?.7 we can make it 2.7 + # return n as an integer number with width = data_out_width + return mr, n + +def fixed_exp(fr): + frac_width = 7 + exp = 1*2**(frac_width) + fr + fr**2//2**(frac_width + 1) + fr**3*5//2**(frac_width + 4) + return exp + +def mxint_softmax(x, q_config): + # fixed_r, integer_n + in_man_width = q_config["in_man_width"] + in_exp_width = q_config["in_exp_width"] + data_out_n_width = q_config["data_out_n_width"] + data_out_man_width = q_config["data_out_man_width"] + data_out_frac_width = data_out_man_width - 1 + data_out_exp_width = q_config["data_out_exp_width"] + + shape = x.shape[0] + mout = torch.zeros_like(x) + eout = torch.zeros_like(x) + + list_of_mexps = [] + list_of_eexps = [] + for i in range(shape): + _, mx, ex = mxint_quantize(x[i], in_man_width, in_exp_width) + fixed_r, integer_n = quantized_range_reduction(mx, ex, in_man_width, data_out_n_width) + # fixed_r will be 2.7 bits, integer_n will be data_out_n_width bits + mexp = fixed_exp(fixed_r) + eexp = integer_n + # currently we got mexp ?.7 bits, integer_n data_out_n_width bits + list_of_mexps.append(mexp) + list_of_eexps.append(eexp) + eexps = torch.stack(list_of_eexps) + mexps = torch.stack(list_of_mexps) + m_sum, e_sum = MxIntAccumulator(torch.stack(list_of_mexps), torch.stack(list_of_eexps)) + extended_mexps = mexps * 2**(data_out_frac_width) + pre_cast_mout = extended_mexps // mexps + pre_cast_eout = eexps - e_sum + pre_cast_out = pre_cast_mout * 
2**(pre_cast_eout - 7) + for i in range(shape): + _, mout[i], eout[i] = mxint_quantize(pre_cast_out[i], data_out_man_width, data_out_exp_width) + return mout, eout diff --git a/a_cx_mxint_quant/module_level_tranform.py b/a_cx_mxint_quant/module_level_tranform.py new file mode 100644 index 000000000..9d82a5087 --- /dev/null +++ b/a_cx_mxint_quant/module_level_tranform.py @@ -0,0 +1,147 @@ +import torch.nn as nn +import chop as chop +from chop.tools import get_logger +from chop.tools.logger import set_logging_verbosity + +from .attention_head import _ViTSelfAttentionHeadBase +from .attention import MXIntAttention +from chop.models.vision.vit.vit import Attention + +from .linear import MXIntLinear +# from .layer_norm import MXIntLayerNorm +# from .gelu import MXIntGELU +import torch + + +logger = get_logger(__name__) +set_logging_verbosity("debug") +class MXIntGELU(nn.Module): + def __init__(self, q_config = {}): + super().__init__() + self.q_config = q_config + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = x + return out + +class MXIntLayerNorm(nn.LayerNorm): + def __init__( + self, + normalized_shape, + eps: float = 0.00001, + elementwise_affine: bool = False, + bias: bool = False, + q_config=None, + ) -> None: + self.q_config = q_config + super().__init__(normalized_shape, eps, elementwise_affine, bias) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.layer_norm( + x, + self.normalized_shape, + self.weight, + self.bias, + self.eps, + ) + +def vit_module_level_quantize(model, model_config = {}, q_config = {}): + def parse_q_config(module, q_config): + if q_config.get("by") == "name": + if module[0] in q_config: + return False, q_config[module[0]]["config"] + else: + return True, None + elif q_config.get("by") == "type": + module_name = module[1].__class__.__name__ + if "Linear" in module_name: + if any("linear" in key for key in q_config.keys()): + if "linear1" in module[0] and "linear1" in q_config: + return 
def vit_module_level_quantize(model, model_config={}, q_config={}):
    """Swap supported submodules of a ViT model for MXInt-quantized versions.

    Walks ``model.named_modules()`` and, according to ``q_config`` (keyed
    either ``by: "name"`` or ``by: "type"``), replaces Attention, LayerNorm,
    Linear and GELU modules in place (via ``deepsetattr``), copying the
    original weights/biases over. Returns the mutated model.

    NOTE(review): the mutable default arguments are kept for interface
    stability; they are only read, never mutated.
    """

    def parse_q_config(module, q_config):
        """Return (skip, config) for one (name, module) pair from named_modules()."""
        if q_config.get("by") == "name":
            if module[0] in q_config:
                return False, q_config[module[0]]["config"]
            return True, None
        elif q_config.get("by") == "type":
            module_name = module[1].__class__.__name__
            if "Linear" in module_name:
                if any("linear" in key for key in q_config.keys()):
                    # Prefer position-specific configs, fall back to "linear".
                    if "linear1" in module[0] and "linear1" in q_config:
                        return False, q_config["linear1"]["config"]
                    elif "linear2" in module[0] and "linear2" in q_config:
                        return False, q_config["linear2"]["config"]
                    else:
                        return False, q_config["linear"]["config"]
                return True, None
            elif "layer_norm" in q_config and "LayerNorm" in module_name:
                return False, q_config["layer_norm"]["config"]
            elif "attention" in q_config and "Attention" in module_name:
                return False, q_config["attention"]["config"]
            elif "gelu" in q_config and "GELU" in module_name:
                return False, q_config["gelu"]["config"]
            return True, None
        else:
            raise ValueError(f"Invalid q_config: {q_config}")

    from chop.passes.graph.utils import deepsetattr

    for module in model.named_modules():
        skip, config = parse_q_config(module, q_config)
        if skip:
            continue
        if isinstance(module[1], Attention):
            ori_module = module[1]
            new_module = MXIntAttention(
                model_config["dim"],
                model_config["num_heads"],
                qkv_bias=True,
                q_config=config,
            )
            logger.info(f"Replacing module: {module[0]}")
            dim = ori_module.head_dim * ori_module.num_heads

            # Split the fused qkv projection into separate q/k/v linears.
            qkv_weight = ori_module.qkv.weight.reshape(3, dim, dim)
            new_module.query.weight = nn.Parameter(qkv_weight[0])
            new_module.key.weight = nn.Parameter(qkv_weight[1])
            new_module.value.weight = nn.Parameter(qkv_weight[2])

            # BUG FIX: was `ori_module.qkv.bias == None`; identity check is
            # the correct (and idiomatic) None test for tensors/Parameters.
            if ori_module.qkv.bias is not None:
                qkv_bias = ori_module.qkv.bias.reshape(3, 1, dim)
                new_module.query.bias = nn.Parameter(qkv_bias[0])
                new_module.key.bias = nn.Parameter(qkv_bias[1])
                new_module.value.bias = nn.Parameter(qkv_bias[2])

            new_module.proj.weight = ori_module.proj.weight
            new_module.proj.bias = ori_module.proj.bias
            deepsetattr(model, module[0], new_module)
        elif isinstance(module[1], nn.LayerNorm):
            ori_module = module[1]
            # BUG FIX: `bias` was only bound when ori_module.bias was not
            # None, raising NameError for bias-free LayerNorms.
            bias = ori_module.bias is not None
            new_module = MXIntLayerNorm(
                ori_module.normalized_shape,
                eps=ori_module.eps,
                elementwise_affine=ori_module.elementwise_affine,
                bias=bias,
                q_config=config,
            )
            new_module.weight = ori_module.weight
            new_module.bias = ori_module.bias
            logger.info(f"Replacing module: {module[0]}")
            deepsetattr(model, module[0], new_module)
        elif isinstance(module[1], (nn.Linear, MXIntLinear)):
            # Attention-internal projections and the classifier head keep
            # their original implementation.
            if "attention" in module[0] or module[0] == "head":
                continue
            ori_module = module[1]
            new_module = MXIntLinear(
                ori_module.in_features,
                ori_module.out_features,
                q_config=config,
            )
            new_module.weight = ori_module.weight
            new_module.bias = ori_module.bias
            logger.info(f"Replacing linear module: {module[0]}")
            deepsetattr(model, module[0], new_module)
        elif isinstance(module[1], nn.GELU):
            new_module = MXIntGELU(q_config=config)
            logger.info(f"Replacing module: {module[0]}")
            deepsetattr(model, module[0], new_module)
    return model
q_config=config, + ) + new_module.weight = ori_module.weight + new_module.bias = ori_module.bias + logger.info(f"Replacing module: {module[0]}") + + deepsetattr(model, module[0], new_module) + elif isinstance(module[1], nn.Linear) or isinstance(module[1], MXIntLinear): + if "attention" in module[0]: + continue + if module[0] == "head": + continue + ori_module = module[1] + new_module = MXIntLinear( + ori_module.in_features, + ori_module.out_features, + q_config=config, + ) + new_module.weight = ori_module.weight + new_module.bias = ori_module.bias + logger.info(f"Replacing linear module: {module[0]}") + deepsetattr(model, module[0], new_module) + elif isinstance(module[1], nn.GELU): + ori_module = module[1] + new_module = MXIntGELU( + q_config=config, + ) + logger.info(f"Replacing module: {module[0]}") + deepsetattr(model, module[0], new_module) + return model \ No newline at end of file diff --git a/a_cx_mxint_quant/modules.py b/a_cx_mxint_quant/modules.py new file mode 100644 index 000000000..be63a5299 --- /dev/null +++ b/a_cx_mxint_quant/modules.py @@ -0,0 +1,67 @@ + +import torch.nn as nn + +from chop.nn.quantized.modules.attention import _ViTAttentionBase + +import chop as chop +from chop.tools import get_logger +from chop.tools.logger import set_logging_verbosity + +logger = get_logger(__name__) +set_logging_verbosity("debug") +from chop.models.vision.vit.vit import Attention +import torch +from mase_components.linear_layers.mxint_operators.test.utils import MXIntLinearHardware +class MXIntPatchEmbed(nn.Module): + def __init__( + self, + img_size: int, + patch_size: int, + in_chans: int, + embed_dim: int, + q_config: dict = None, + norm_layer: nn.Module = nn.LayerNorm + ) -> None: + super().__init__() + self.q_config = q_config + self.conv = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + # self.norm = norm_layer(embed_dim) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = (img_size // patch_size) ** 2 
+ self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim)) + self.distill_token = nn.Parameter(torch.randn(1, 1, embed_dim)) + def forward(self, x): + x = self.conv(x) + x = x.flatten(2).transpose(1, 2) + # x = self.norm(x) + x = torch.cat((self.cls_token.expand(x.size(0), -1, -1), self.distill_token.expand(x.size(0), -1, -1), x), dim=1) + return x + +class ViTAttentionMxInt(_ViTAttentionBase): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + q_config: dict = None, + ) -> None: + super().__init__(dim, num_heads, qkv_bias, qk_norm, attn_drop, proj_drop) + self.q_config = q_config + + +class MXIntAddition(nn.Module): + def __init__( + self, + q_config, + ) -> None: + super().__init__() + self.q_config = q_config + + def forward(self, x, y): + return x + y + diff --git a/src/mase_components/vision_models/vit/rtl/__init__.py b/a_cx_mxint_quant/mxint_cast.drawio similarity index 100% rename from src/mase_components/vision_models/vit/rtl/__init__.py rename to a_cx_mxint_quant/mxint_cast.drawio diff --git a/a_cx_mxint_quant/quantizers.py b/a_cx_mxint_quant/quantizers.py new file mode 100644 index 000000000..939b3c76f --- /dev/null +++ b/a_cx_mxint_quant/quantizers.py @@ -0,0 +1,73 @@ +import torch +from functools import partial +import torch.nn.functional as F +from torch import Tensor +from chop.nn.quantized.modules.linear import _LinearBase +from .utils import reshape_to_block, reshape_back + +def mxint_quant_block( + x, width: int = 12, exponent_width: int = 6, exponent: int = None, round_bits: int = 4, +): + """ + - Idea from https://arxiv.org/pdf/2310.10537 + - Convert IEEE FP32/64 to Integer with sharing scale + - The main difference between is the sharing scale do not support NAN representation + --- + - `width`: The number of mantissa bits + 1 (the sign bit) + - `exponent_width`: the number of exponent bits, 
which is shared over a block + - `exponent_bias`: the exponent bias, if None, `2**(exponent_bits-1)-1` will be used + + """ + exponent_bias = 2 ** (exponent_width - 1) + exponent_max = 2**exponent_width - 1 - exponent_bias + exponent_min = -exponent_bias + + # Vectorized max and log2 operations + abs_max = x.abs().max(dim=-1, keepdim=True).values + log2 = torch.log2(abs_max + torch.finfo(torch.float32).tiny) + + exponent = torch.ceil(log2) + exponent[exponent == log2] += 1 + exponent = torch.clamp(exponent, exponent_min, exponent_max) + + # Vectorized mantissa calculation + int_min = -(2 ** (width - 1)) + int_max = 2 ** (width - 1) - 1 + mantissa = x * (2 ** (width - 1)) / 2**exponent + mantissa = mantissa * 2 ** round_bits + mantissa = torch.floor(mantissa) + mantissa = mantissa / 2 ** round_bits + mantissa = torch.round(mantissa) + mantissa = torch.clamp(mantissa, int_min, int_max) + q_x = (2**exponent) * mantissa /(2 ** (width - 1)) + return q_x, mantissa, exponent + +def mxint_hardware(tensor, q_config, parallelism): + """ + Vectorized hardware-aware quantization implementation + """ + + if len(tensor.shape) == 1: + tensor = tensor.unsqueeze(0) + if len(parallelism) == 1: + parallelism = [1, parallelism[0]] + + p1, p0 = parallelism + t1, t0 = tensor.shape[-2:] + + original_mshape = tensor.shape + original_eshape = torch.Size([t1//p1, t0//p0]) if len(tensor.shape) <=2 else torch.Size([*tensor.shape[:-2],t1//p1, t0//p0]) + assert (t1 % p1 == 0 and t0 % p0 == 0), \ + f"Block size mismatch: t1={t1}, p1={p1}, t0={t0}, p0={p0}" + + # Single reshape and permute operation + block_tensor = reshape_to_block(tensor, t1, t0, p1, p0).reshape(-1, p1*p0) + qtensor, mantissa, exponent = mxint_quant_block(block_tensor, **q_config) + + qtensor = reshape_back(qtensor, t1, t0, p1, p0) + mantissa = reshape_back(mantissa, t1, t0, p1, p0) + qtensor = qtensor.reshape(original_mshape) + mantissa = mantissa.reshape(original_mshape) + exponent = exponent.reshape(original_eshape) + # 
Efficient shape restoration + return qtensor, mantissa, exponent \ No newline at end of file diff --git a/a_cx_mxint_quant/softmax.drawio b/a_cx_mxint_quant/softmax.drawio new file mode 100644 index 000000000..152dce204 --- /dev/null +++ b/a_cx_mxint_quant/softmax.drawio @@ -0,0 +1,143 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/a_cx_mxint_quant/softmax.py b/a_cx_mxint_quant/softmax.py new file mode 100644 index 000000000..c502fd221 --- /dev/null +++ b/a_cx_mxint_quant/softmax.py @@ -0,0 +1,117 @@ +# models.py +import torch +import torch.nn as nn +import math +from typing import List, Union, Optional +from pathlib import Path +import torch +import torch.nn as nn +from torch import Tensor +import math +from typing import Literal, Optional, Tuple, Union, Dict +from enum import Enum +from .quantizers import mxint_quant_block, mxint_hardware +from chop.nn.quantizers.integer import integer_quantizer, integer_floor_quantizer +from functools import partial +from tqdm import tqdm + +class MXIntHardwareExp(nn.Module): + def __init__(self, q_config: Dict = {}): + super().__init__() + self.q_config = q_config + + def hardware_range_reduction(self, qx, data_r_width, data_n_width) -> tuple[torch.Tensor, torch.Tensor]: + """ + Perform range reduction: x = r + n*ln(2) + Returns (r, n) where r is remainder and n is integer power + """ + coefficient_quant_block = partial( + mxint_quant_block, + width=8, + exponent_width=4 + ) + self.log2_e, _, _ = coefficient_quant_block(torch.log2(torch.tensor(math.e))) + new_mx = qx * self.log2_e + new_mx = integer_floor_quantizer(new_mx, data_n_width + data_r_width - 1, data_r_width - 1) + n = new_mx.floor() + r = new_mx - n + return r, n + + def forward(self, x: 
torch.Tensor) -> torch.Tensor: + + qx, mx, ex = mxint_hardware(x, + { + 'width': self.q_config.get('data_in_width'), + 'exponent_width': self.q_config.get('data_in_exponent_width') + }, + parallelism=[1,1]) + + mr, n = self.hardware_range_reduction(qx, self.q_config.get('data_r_width'), self.q_config.get('data_out_exponent_width')) + mexp = 2 ** mr + mexp = integer_quantizer(mexp, self.q_config.get('data_out_width'), self.q_config.get('data_out_width') - 2) + mexp = mexp * 2 ** (self.q_config.get('data_out_width') - 2) + eexp = n + qexp = mexp * 2 ** eexp / 2 ** (self.q_config.get('data_out_width') - 2) + + return qexp, mexp, eexp + +from tqdm import tqdm +# CX: Set a new search +# accumulator depth should be in the first dimension +def mxint_accumulator(mx,ex): + out = mx[0] + emax = ex[0] + for i in range(1, mx.shape[0]): + old_max = emax + emax = torch.max(emax, ex[i]) + in_out = out // 2**(emax - old_max) + in_mx = mx[i]// 2**(emax - ex[i]) + out = in_out + in_mx + # breakpoint() + return out, emax + + +class MXIntSoftmax(nn.Module): + def __init__(self,q_config: Dict = {}): + super().__init__() + self.q_config = q_config + self.exp_module = MXIntHardwareExp(q_config=q_config) + + def forward(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: + def exp(self, x): + qexp, mexp, eexp = self.exp_module(x) + return qexp, mexp, eexp + + def exp_sum(self, qexp, mexp, eexp): + exp_sum_underflow_bits = self.q_config["exp_sum_underflow_bits"] + mexp = (mexp) * 2**exp_sum_underflow_bits + + mexp= mexp.transpose(1,0) + eexp = eexp.transpose(1,0) + mexp_sum, eexp_sum = mxint_accumulator(mexp, eexp) + qexp_sum = mexp_sum * 2**eexp_sum / 2**exp_sum_underflow_bits + return qexp_sum, mexp_sum, eexp_sum + + def division(self, qexp, mexp, eexp, qexp_sum, mexp_sum, eexp_sum): + division_underflow_bits = self.q_config["division_underflow_bits"] + exp_sum_underflow_bits = self.q_config["exp_sum_underflow_bits"] + mout = mexp * 2**(division_underflow_bits+exp_sum_underflow_bits) 
// mexp_sum + eout = eexp - eexp_sum + qout = mout * 2**eout / 2**division_underflow_bits + + qout, _, _ = mxint_hardware( + qout, + q_config = { + "width": self.q_config["data_width"], + "exponent_width": self.q_config["data_exponent_width"], + }, + parallelism = [1,1] + ) + + return qout, mout, eout + + qexp, mexp, eexp = exp(self, x) + qexp_sum, mexp_sum, eexp_sum = exp_sum(self, qexp, mexp, eexp) + qout, mout, eout = division(self, qexp, mexp, eexp, qexp_sum, mexp_sum, eexp_sum) + + return qout \ No newline at end of file diff --git a/a_cx_mxint_quant/utils.py b/a_cx_mxint_quant/utils.py new file mode 100644 index 000000000..ea0e0b102 --- /dev/null +++ b/a_cx_mxint_quant/utils.py @@ -0,0 +1,33 @@ +import torch +import torch.nn.functional as F + +def _get_similarity(tensor_raw, tensor_sim, metric=None): + if metric == "cosine": + similarity = F.cosine_similarity(tensor_raw, tensor_sim, dim=-1) + elif metric == "pearson": + similarity = F.cosine_similarity( + tensor_raw - torch.mean(tensor_raw, dim=-1, keepdim=True), + tensor_sim - torch.mean(tensor_sim, dim=-1, keepdim=True), + dim=-1, + ) + else: + if metric == "L1_norm": + similarity = -torch.abs(tensor_raw - tensor_sim) + elif metric == "L2_norm": + similarity = -((tensor_raw - tensor_sim) ** 2) + elif metric == "linear_weighted_L2_norm": + similarity = -tensor_raw.abs() * (tensor_raw - tensor_sim) ** 2 + elif metric == "square_weighted_L2_norm": + similarity = -((tensor_raw * (tensor_raw - tensor_sim)) ** 2) + else: + raise NotImplementedError(f"metric {metric} not implemented!") + similarity = torch.mean(similarity, dim=-1) + return similarity + +def reshape_to_block(tensor, t1, t0, p1, p0): + return tensor.reshape(-1, t1 // p1, p1, t0 // p0, p0)\ + .permute(0, 1, 3, 2, 4) + +def reshape_back(tensor, t1, t0, p1, p0): + return tensor.reshape(-1, t1 // p1, t0 // p0, p1, p0)\ + .permute(0, 1, 3, 2, 4) \ No newline at end of file diff --git a/a_cx_test_files/1attention_test.py 
b/a_cx_test_files/1attention_test.py new file mode 100644 index 000000000..ec851237d --- /dev/null +++ b/a_cx_test_files/1attention_test.py @@ -0,0 +1,67 @@ + +from chop.nn.quantized.modules.attention_head import _ViTSelfAttentionHeadBase, ViTSelfAttentionHeadInteger +from chop.nn.quantized.modules.attention import _ViTAttentionBase + +import torch.nn as nn +import torch + +class ViTAttentionBase(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = torch.tensor(self.head_dim**-0.5) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, self.head_dim) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] + q, k = self.q_norm(q), self.k_norm(k) + + attn = q @ k.transpose(-2, -1) + attn = (attn * self.scale).softmax(dim=-1) + attn = self.attn_drop(attn) + x = attn @ v + + x = x.transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + +if __name__ == "__main__": + dim = 4 + head = 2 + + torch.manual_seed(0) + x = torch.rand(1, dim, dim) + module = ViTAttentionBase(dim, head) + result = module(x) + _module = _ViTAttentionBase(dim, head) + _module.qkv.weight = module.qkv.weight + _module.proj.weight = module.proj.weight + _module.qkv.bias = module.qkv.bias + _module.proj.bias = module.proj.bias + 
_result = _module(x) + print(result==_result) \ No newline at end of file diff --git a/a_cx_test_files/2linear_weigth_scatter.py b/a_cx_test_files/2linear_weigth_scatter.py new file mode 100644 index 000000000..0d265d7c7 --- /dev/null +++ b/a_cx_test_files/2linear_weigth_scatter.py @@ -0,0 +1,72 @@ + +from chop.nn.quantized.modules.attention_head import _ViTSelfAttentionHeadBase, ViTSelfAttentionHeadInteger +from chop.nn.quantized.modules.attention import _ViTAttentionBase + +import torch.nn as nn +import torch + +class ViTAttentionBase(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = torch.tensor(self.head_dim**-0.5) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, self.head_dim) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] + q, k = self.q_norm(q), self.k_norm(k) + + attn = q @ k.transpose(-2, -1) + attn = (attn * self.scale).softmax(dim=-1) + attn = self.attn_drop(attn) + x = attn @ v + + x = x.transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + +if __name__ == "__main__": + dim = 4 + head = 2 + n = 3 + x = torch.rand(1, n, dim) + qkv = nn.Linear(dim, 3 * dim) + q = nn.Linear(dim, dim) + k = nn.Linear(dim, dim) + v = nn.Linear(dim, dim) + + new_weight = 
qkv.weight.reshape(3, -1, dim) + new_bias = qkv.bias.reshape(3, -1, dim) + q.weight,k.weight,v.weight = nn.Parameter(new_weight[0]),nn.Parameter(new_weight[1]),nn.Parameter(new_weight[2]) + q.bias,k.bias,v.bias = nn.Parameter(new_bias[0]),nn.Parameter(new_bias[1]),nn.Parameter(new_bias[2]) + qkv_x = qkv(x) + qkv_x = qkv_x.reshape(-1, 3, dim).permute(1,0,2) + print(qkv_x[0] == q(x)) + print(qkv_x[1] == k(x)) + print(qkv_x[2] == v(x)) + \ No newline at end of file diff --git a/a_cx_test_files/3intattention.py b/a_cx_test_files/3intattention.py new file mode 100644 index 000000000..4c33ed518 --- /dev/null +++ b/a_cx_test_files/3intattention.py @@ -0,0 +1,34 @@ +from chop.nn.quantized import ViTAttentionInteger + +import torch.nn as nn +import torch + +from chop.nn.quantized.modules.linear import ( + LinearInteger, +) + +if __name__ == "__main__": + dim = 4 + head = 2 + + torch.manual_seed(0) + x = torch.rand(1, dim, dim) + q_config = { + "data_in_width":8, + "data_in_frac_width":4, + "qkv_weight_width":8, + "qkv_weight_frac_width":4, + "qkv_bias_width":8, + "qkv_bias_frac_width":4, + "qkv_width":8, + "qkv_frac_width":4, + "qkmm_out_width":4, + "qkmm_out_frac_width":8, + "softmax_exp_width":4, + "softmax_exp_frac_width":8, + "softmax_out_frac_width":4, + "svmm_out_width":8, + "svmm_out_frac_width":4, + } + module = ViTAttentionInteger(dim, head, q_config=q_config) + print(module(x)) \ No newline at end of file diff --git a/a_cx_test_files/4norm.py b/a_cx_test_files/4norm.py new file mode 100644 index 000000000..52b4222f9 --- /dev/null +++ b/a_cx_test_files/4norm.py @@ -0,0 +1,98 @@ +from chop.nn.quantized import ViTAttentionInteger +import logging + +import torch.nn as nn +import torch + +from chop.nn.quantizers.integer import ( + integer_floor_quantizer, +) + +logger = logging.getLogger("norm.models") +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setLevel(logging.DEBUG) +logger.addHandler(handler) + +def quantize(x, width, frac_width, 
by_pass=False): + if not by_pass: + x = integer_floor_quantizer(x, width, frac_width) + return x +def get_dim_and_prodofdim(x, normalized_shape): + dim = tuple(range(0 - len(normalized_shape), 0)) + num_vals = 1 + for items in dim: + num_vals *= x.shape[items] + return dim, num_vals +def isqrt(x:torch.Tensor): + x = x.sqrt() + x = x.reciprocal() + return x +def _fixed_group_norm_2d_model( + x: torch.Tensor, + normalized_shape: tuple, + q_config, +): + #TODO: add hardware debug info + logger.debug(f"Input: \n {x[0]}") + dim, num_vals = get_dim_and_prodofdim(x, normalized_shape) + + # Mean calculation + mu = x.mean(dim, keepdim=True) + logger.debug(f"Mu: \n {mu[0]}") + mu = quantize(mu, q_config["in_width"], q_config["in_frac_width"], q_config["by_pass"]) + logger.debug(f"Mu Quantized: \n {mu[0]}") + + # Variance calculation + diff = x - mu + logger.debug(f"Diff: \n {diff[0]}") + + squares = diff**2 + logger.debug(f"Squares: {squares[0]}") + + sum_squares = torch.sum(squares, dim, keepdim=True) + + sum_squares = quantize(sum_squares, q_config["variance_width"], q_config["variance_frac_width"], q_config["by_pass"]) + + logger.debug("Num Values: %d" % (num_vals)) + var = sum_squares / num_vals + var = quantize(var, q_config["variance_width"], q_config["variance_frac_width"], q_config["by_pass"]) + logger.debug(f"Variance: \n {var[0]}") + + inv_sqrt = isqrt(var + 1e-05) + inv_sqrt = quantize(inv_sqrt, q_config["isqrt_width"], q_config["isqrt_frac_width"], q_config["by_pass"]) + logger.debug(f"INV SQRT INT: \n {inv_sqrt[0]}") + + # Norm calculation + norm_out = diff * inv_sqrt + logger.debug("Norm:") + logger.debug(norm_out[0]) + + norm_out = quantize(norm_out, q_config["out_width"], q_config["out_frac_width"], q_config["by_pass"]) + logger.debug(f"Norm (Casted): \n {norm_out[0]}") + + return norm_out + +if __name__ == "__main__": + dim = 4 + head = 2 + + torch.manual_seed(0) + q_config = { + "by_pass": False, + "in_width":8, + "in_frac_width":7, + "variance_width":16, + 
"variance_frac_width":8, + "isqrt_width":16, + "isqrt_frac_width":8, + "out_width":8, + "out_frac_width":4, + } + logger.setLevel(logging.DEBUG) + x = torch.rand(1, dim) + _x = _fixed_group_norm_2d_model( + x, (4,), q_config) + module = torch.nn.LayerNorm(dim,elementwise_affine=False, bias=False) + print(_x) + print(module(x)) \ No newline at end of file diff --git a/a_cx_test_files/Dockerfile-cpu-python13 b/a_cx_test_files/Dockerfile-cpu-python13 new file mode 100644 index 000000000..1a5f846c7 --- /dev/null +++ b/a_cx_test_files/Dockerfile-cpu-python13 @@ -0,0 +1,32 @@ +# This Dockerfile configures a Docker environment that +# contains all the required packages for the tool +FROM ubuntu:22.04 + +USER root + +# Install apt packages +ADD install-pkgs-python13.sh install-pkgs-python13.sh +RUN bash install-pkgs-python13.sh + +CMD ["bash"] + +# Ensure pip is installed for python3.13 if it's missing dependencies +RUN python3 -m ensurepip --upgrade && \ + python3 -m pip install --upgrade pip + +# Install PyTorch and Torch-MLIR +RUN pip3 install --upgrade pip +RUN pip3 install --pre torch-mlir torchvision \ + -f https://github.com/llvm/torch-mlir-release/releases/expanded_assets/dev-wheels \ + --extra-index-url https://download.pytorch.org/whl/nightly/cpu + +# Install pip packages +ADD install-pips-python13.sh install-pips-python13.sh +RUN bash install-pips-python13.sh + +# Add environment variables +ARG VHLS_PATH +ARG VHLS_VERSION +ADD install-env.sh install-env.sh +RUN bash install-env.sh $VHLS_PATH $VHLS_VERSION + diff --git a/a_cx_test_files/bash_script/run_hw_test.sh b/a_cx_test_files/bash_script/run_hw_test.sh new file mode 100644 index 000000000..68b0aa8a0 --- /dev/null +++ b/a_cx_test_files/bash_script/run_hw_test.sh @@ -0,0 +1,131 @@ + +# # Activation_layers +# # python3 scripts/build-components.py +# python3 src/mase_components/activation_layers/test/fixed_gelu_tb.py +# python3 src/mase_components/activation_layers/test/fixed_leaky_relu_tb.py +# python3 
src/mase_components/activation_layers/test/fixed_relu_tb.py +# python3 src/mase_components/activation_layers/test/fixed_selu_tb.py +# # python3 src/mase_components/activation_layers/test/fixed_sigmoid_tb.py +# python3 src/mase_components/activation_layers/test/fixed_softermax_1d_tb.py +# # python3 src/mase_components/activation_layers/test/fixed_softermax_tb.py +# python3 src/mase_components/activation_layers/test/fixed_softmax_tb.py +# python3 src/mase_components/activation_layers/test/fixed_softplus_tb.py +# python3 src/mase_components/activation_layers/test/fixed_softsign_tb.py +# python3 src/mase_components/activation_layers/test/fixed_tanh_tb.py +# # python3 src/mase_components/activation_layers/test/softermax_global_norm_tb.py +# # python3 src/mase_components/activation_layers/test/softermax_local_window_tb.py +# # python3 src/mase_components/activation_layers/test/softermax_lpw_pow2_tb.py +# # python3 src/mase_components/activation_layers/test/softermax_lpw_reciprocal_tb.py +# # python3 src/mase_components/activation_layers/test/test_lint_activation_layers.py +# # python3 src/mase_components/activation_layers/test/test_synth_activation_layers.py +# # DEV mode (no intention to fix) +# # python3 src/mase_components/activation_layers/test/fixed_elu_tb.py +# # python3 src/mase_components/activation_layers/test/fixed_hardshrink_tb.py +# # python3 src/mase_components/activation_layers/test/fixed_hardswish_tb.py +# # python3 src/mase_components/activation_layers/test/fixed_logsigmoid_tb.py +# # python3 src/mase_components/activation_layers/test/fixed_silu_tb.py +# # python3 src/mase_components/activation_layers/test/fixed_softshrink_tb.py + +# # Cast +# python3 src/mase_components/cast/test/fixed_cast_tb.py +# python3 src/mase_components/cast/test/fixed_rounding_tb.py +# python3 src/mase_components/cast/test/fixed_signed_cast_tb.py +# # python3 src/mase_components/cast/test/fixed_unsigned_cast_tb.py + +# # Common +# python3 
src/mase_components/common/test/comparator_accumulator_tb.py +# python3 src/mase_components/common/test/cut_data_tb.py +# python3 src/mase_components/common/test/lut_tb.py +# python3 src/mase_components/common/test/wrap_data_tb.py +# # python3 src/mase_components/common/test/register_slice_tb.py +# # python3 src/mase_components/common/test/test_lint_common.py +# # DEV +# # python3 src/mase_components/common/test/comparator_tree_tb.py +# # python3 src/mase_components/common/test/single_element_repeat_tb.py + +# # Convolution_layers +# python3 src/mase_components/convolution_layers/test/convolution_tb.py + +# # Inteface +# python3 src/mase_components/interface/axi/test/test_lint_axi.py +# # python3 src/mase_components/interface/axi/test/test_synth_axi.py + +# # Language models llmint8 +# python3 src/mase_components/language_models/llmint8/test/find_max_tb.py +# python3 src/mase_components/language_models/llmint8/test/fixed_comparator_tree_layer_tb.py +# python3 src/mase_components/language_models/llmint8/test/fixed_comparator_tree_tb.py +# python3 src/mase_components/language_models/llmint8/test/quantized_matmul_tb.py +# python3 src/mase_components/language_models/llmint8/test/quantizer_top_tb.py +# python3 src/mase_components/language_models/llmint8/test/scatter_tb.py +# # DEV +# # python3 src/mase_components/language_models/llmint8/test/llm_int8_top_tb.py + +# # Linear layers +# # Linear Layer - fixed_linear_layer DEBUG: use bias causes crash +# python3 src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py +# # python3 src/mase_components/linear_layers/fixed_linear_layer/test/binary_activation_binary_linear_tb.py +# # python3 src/mase_components/linear_layers/fixed_linear_layer/test/fixed_activation_binary_linear_tb.py +# # Linear Layer - fixed_operators +# python3 src/mase_components/linear_layers/fixed_operators/test/fixed_accumulator_tb.py +# # python3 src/mase_components/linear_layers/fixed_operators/test/fixed_adder_tree_layer_tb.py +# 
python3 src/mase_components/linear_layers/fixed_operators/test/fixed_adder_tree_tb.py +# python3 src/mase_components/linear_layers/fixed_operators/test/fixed_dot_product_tb.py +# python3 src/mase_components/linear_layers/fixed_operators/test/fixed_lut_index_tb.py +# # python3 src/mase_components/linear_layers/fixed_operators/test/fixed_matmul_core_tb.py +# python3 src/mase_components/linear_layers/fixed_operators/test/fixed_mult_tb.py +# python3 src/mase_components/linear_layers/fixed_operators/test/fixed_range_augmentation_tb.py +# # python3 src/mase_components/linear_layers/fixed_operators/test/fixed_range_reduction_tb.py +# # Linear Layer - matmul +# # python3 src/mase_components/linear_layers/matmul/test/chain_matmul_tb.py +# # python3 src/mase_components/linear_layers/matmul/test/fixed_mamul_tb.py +# # python3 src/mase_components/linear_layers/matmul/test/matmul_tb.py +# # python3 src/mase_components/linear_layers/matmul/test/matrix_stream_transpose_tb.py +# # python3 src/mase_components/linear_layers/matmul/test/transpose_tb.py +# # DEV Linear Layer - binary_operators +# python3 src/mase_components/linear_layers/binarized_operators/test/binary_activation_binary_adder_tree_layer_tb.py +# # python3 src/mase_components/linear_layers/binarized_operators/test/binary_activation_binary_adder_tree_tb.py +# # python3 src/mase_components/linear_layers/binarized_operators/test/binary_activation_binary_dot_product_tb.py +# # python3 src/mase_components/linear_layers/binarized_operators/test/binary_activation_binary_matmul_core_tb.py +# # python3 src/mase_components/linear_layers/binarized_operators/test/binary_activation_binary_mult_tb.py +# # python3 src/mase_components/linear_layers/binarized_operators/test/binary_activation_binary_vector_mult_tb.py +# # python3 src/mase_components/linear_layers/binarized_operators/test/fixed_activation_binary_dot_product_tb.py +# # python3 src/mase_components/linear_layers/binarized_operators/test/fixed_activation_binary_mult_tb.py +# 
# python3 src/mase_components/linear_layers/binarized_operators/test/fixed_activation_binary_vector_mult_tb.py +# # python3 src/mase_components/linear_layers/binarized_operators/test/test_lint_binary_arith.py +# # MxInt +# python3 src/mase_components/linear_layers/mxint_operators/test/mxint_cast_tb.py +# python3 src/mase_components/linear_layers/mxint_operators/test/mxint_matmul_tb.py +python3 src/mase_components/linear_layers/mxint_operators/test/mxint_linear_tb.py +python3 src/mase_components/linear_layers/mxint_operators/test/mxint_accumulator_tb.py +python3 src/mase_components/linear_layers/mxint_operators/test/mxint_softmax.py +# Memory +python3 src/mase_components/memory/test/fifo_tb.py +# python3 src/mase_components/memory/test/input_buffer_tb.py +python3 src/mase_components/memory/test/skid_buffer_tb.py +# python3 src/mase_components/memory/test/unpacked_fifo_tb.py +# python3 src/mase_components/memory/test/repeat_circular_buffer_tb.py +# python3 src/mase_components/memory/test/test_lint_memory.py + +# Normalization_layers +python3 src/mase_components/normalization_layers/test/batch_norm_2d_tb.py +python3 src/mase_components/normalization_layers/test/group_norm_2d_tb.py +# DEV +# python3 src/mase_components/normalization_layers/test/channel_selection_tb.py +# python3 src/mase_components/normalization_layers/test/rms_norm_2d_tb.py +# python3 src/mase_components/normalization_layers/test/test_lint_norm.py + +# Scalar operators +python3 src/mase_components/scalar_operators/fixed/test/fixed_isqrt_tb.py +python3 src/mase_components/scalar_operators/fixed/test/isqrt_sw.py +# python3 src/mase_components/scalar_operators/float/test/test_lint_float_arithmetic.py +# python3 src/mase_components/scalar_operators/fixed/test/fixed_nr_stage_tb.py +# python3 src/mase_components/scalar_operators/fixed/test/test_lint_fixed_math.py + +# Systolic array +# python3 src/mase_components/systolic_arrays/test/test_lint_systolic_arrays.py + +# Transformer_layers +python3 
src/mase_components/transformer_layers/test/fixed_self_attention_head_tb.py +# python3 src/mase_components/transformer_layers/test/fixed_gqa_head_tb.py +# python3 src/mase_components/transformer_layers/test/fixed_self_attention_tb.py +# python3 src/mase_components/transformer_layers/test/test_lint_attention.py diff --git a/a_cx_test_files/bash_script/run_latency_test.sh b/a_cx_test_files/bash_script/run_latency_test.sh new file mode 100644 index 000000000..18f03abfd --- /dev/null +++ b/a_cx_test_files/bash_script/run_latency_test.sh @@ -0,0 +1,3 @@ +# !/bin/bash +python3 test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_block.py +python3 mase_mxint_top_tb.py diff --git a/a_cx_test_files/bash_script/run_real_top.sh b/a_cx_test_files/bash_script/run_real_top.sh new file mode 100644 index 000000000..54a96eeec --- /dev/null +++ b/a_cx_test_files/bash_script/run_real_top.sh @@ -0,0 +1,5 @@ +#/bin/bash +CONFIG_PATH=$1.yaml python3 test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_folded_top.py +CONFIG_PATH=$1.yaml python3 test/passes/graph/transforms/verilog/test_emit_verilog_mxint_real_top.py +#cd /scratch/cx922/mase/mxint_$1/hardware/top_build_project +#vivado -mode batch -log project_build.log -source build.tcl diff --git a/a_cx_test_files/bash_script/run_vivado.sh b/a_cx_test_files/bash_script/run_vivado.sh new file mode 100644 index 000000000..b1716cc07 --- /dev/null +++ b/a_cx_test_files/bash_script/run_vivado.sh @@ -0,0 +1,4 @@ +#/bin/bash +#python3 ./test/passes/graph/transforms/verilog/test_emit_verilog_$1.py +cd $1/hardware/top_build_project +vivado -mode batch -log project_build.log -source build.tcl diff --git a/a_cx_test_files/deit_base.yaml b/a_cx_test_files/deit_base.yaml new file mode 100644 index 000000000..fd5f9df7a --- /dev/null +++ b/a_cx_test_files/deit_base.yaml @@ -0,0 +1,27 @@ +# Parameters for real top test +img_size: 224 +in_chans: 3 +patch_size: 16 +n: 196 +embed_dim: 768 +num_heads: 12 + +# Parameters for vit 
folded top test +config: + data_width: 6 + data_exponent_width: 8 + weight_width: 6 + weight_exponent_width: 8 + bias_width: 6 + bias_exponent_width: 8 + +parallelism: 16 +mlp_parallelism: 64 + +folded_depth: 6 # number of times to fold/reuse the streaming blocks +stream_depth: 2 # number of transformer blocks in streaming pipeline + +# Project directory +# project_dir: "/home/cx922/v80_mxint_hardware/deit_base/" +project_dir: "/home/cx922/optimized1_final_result/deit_base" +# project_dir: "/home/cx922/fp8_result/deit_tiny" \ No newline at end of file diff --git a/a_cx_test_files/deit_small.yaml b/a_cx_test_files/deit_small.yaml new file mode 100644 index 000000000..f45dec323 --- /dev/null +++ b/a_cx_test_files/deit_small.yaml @@ -0,0 +1,26 @@ + +# Parameters for rea +img_size: 224 +in_chans: 3 +patch_size: 16 +n: 196 +embed_dim: 384 +num_heads: 6 +# General parameters +config: + data_width: 6 + data_exponent_width: 8 + weight_width: 6 + weight_exponent_width: 8 + bias_width: 6 + bias_exponent_width: 8 + +parallelism: 16 +mlp_parallelism: 48 +# Parameters for vit folded top test +folded_depth: 4 # number of times to fold/reuse the streaming blocks +stream_depth: 3 # number of transformer blocks in streaming pipeline +# Project directory +# project_dir: "/home/cx922/fp16_result/deit_small" +# project_dir: "/home/cx922/optimized_final_result/deit_small" +project_dir: "/home/cx922/optimized1_final_result/deit_small" \ No newline at end of file diff --git a/a_cx_test_files/deit_tiny.yaml b/a_cx_test_files/deit_tiny.yaml new file mode 100644 index 000000000..85386ae24 --- /dev/null +++ b/a_cx_test_files/deit_tiny.yaml @@ -0,0 +1,29 @@ + +# Parameters for rea +img_size: 224 +in_chans: 3 +patch_size: 16 +n: 196 +embed_dim: 192 +num_heads: 3 +# General parameters +config: + data_width: 4 + data_exponent_width: 4 + weight_width: 4 + weight_exponent_width: 4 + bias_width: 4 + bias_exponent_width: 4 + +parallelism: 1 +mlp_parallelism: 1 +# Parameters for vit folded top test 
+folded_depth: 1 # number of times to fold/reuse the streaming blocks +stream_depth: 12 # number of transformer blocks in streaming pipeline +# Project directory +# project_dir: "/scratch/cx922/mase/mxint_deit_tiny_f4_m3" + +# project_dir: "/home/cx922/optimized_final_result/deit_tiny" +project_dir: "/home/cx922/fp8_result/deit_tiny" +# project_dir: "/scratch/cx922/mase/deit_tiny_check" + diff --git a/a_cx_test_files/install-pips-python13.sh b/a_cx_test_files/install-pips-python13.sh new file mode 100644 index 000000000..7abd38a3d --- /dev/null +++ b/a_cx_test_files/install-pips-python13.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# -------------------------------------------------------------------- +# This script installs pip packages for both Docker containers +# -------------------------------------------------------------------- +set -o errexit +set -o pipefail +set -o nounset + +pip3 install onnx black toml GitPython colorlog cocotb[bus] \ + pytest pytorch-lightning transformers toml \ + timm pytorch-nlp datasets ipython ipdb \ + sentencepiece einops deepspeed pybind11 \ + tabulate tensorboardx hyperopt accelerate \ + optuna stable-baselines3[extra] h5py scikit-learn \ + scipy onnxruntime matplotlib sphinx-rtd-theme \ + imageio imageio-ffmpeg opencv-python kornia einops \ + ghp-import optimum pytest-profiling myst_parser \ + pytest-cov pytest-xdist pytest-sugar pytest-html \ + lightning wandb bitarray bitstring emoji evaluate pynvml cvxpy \ + "numpy<2.0" tensorboard \ + onnxconverter-common absl-py sphinx-glpi-theme prettytable \ + && pip install -U Pillow \ + && pip install mpmath==1.3.0 diff --git a/a_cx_test_files/install-pkgs-python13.sh b/a_cx_test_files/install-pkgs-python13.sh new file mode 100644 index 000000000..0734b0bee --- /dev/null +++ b/a_cx_test_files/install-pkgs-python13.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +# -------------------------------------------------------------------- +# This script installs initial packages for both Docker 
containers +# -------------------------------------------------------------------- +set -o errexit +set -o pipefail +set -o nounset + + +apt-get update -y && apt-get install apt-utils -y +DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata + +# Install basic packages +apt-get upgrade -y +apt-get update -y \ + && apt-get install -y clang graphviz-dev libclang-dev \ + pkg-config g++ libxtst6 xdg-utils \ + libboost-all-dev llvm gcc ninja-build \ + python3 python3-pip build-essential \ + libssl-dev git vim wget htop \ + lld parallel clang-format clang-tidy \ + libtinfo5 libidn11-dev unzip \ + locales python3-sphinx graphviz + +locale-gen en_US.UTF-8 + +# Install SystemVerilog formatter +mkdir -p /srcPkgs \ + && cd /srcPkgs \ + && wget https://github.com/chipsalliance/verible/releases/download/v0.0-2776-gbaf0efe9/verible-v0.0-2776-gbaf0efe9-Ubuntu-22.04-jammy-x86_64.tar.gz \ + && mkdir -p verible \ + && tar xzvf verible-*-x86_64.tar.gz -C verible --strip-components 1 +# Install verilator from source - version v5.020 +apt-get update -y \ + && apt-get install -y git perl make autoconf flex bison \ + ccache libgoogle-perftools-dev numactl \ + perl-doc libfl2 libfl-dev zlib1g zlib1g-dev \ + help2man +# Install Verilator from source +mkdir -p /srcPkgs \ + && cd /srcPkgs \ + && git clone https://github.com/verilator/verilator \ + && unset VERILATOR_ROOT \ + && cd verilator \ + && git checkout v5.020 \ + && autoconf \ + && ./configure \ + && make -j 4 \ + && make install + +# Install latest Cmake from source +mkdir -p /srcPkgs \ + && cd /srcPkgs \ + && wget https://github.com/Kitware/CMake/releases/download/v3.28.0-rc5/cmake-3.28.0-rc5.tar.gz \ + && mkdir -p cmake \ + && tar xzvf cmake-*.tar.gz -C cmake --strip-components 1 \ + && cd cmake \ + && ./bootstrap --prefix=/usr/local \ + && make -j 4 \ + && make install + +# Append any packages you need here +# apt-get ... 
#!/usr/bin/env python3
"""Cocotb testbench for the fixed-point linear (fully-connected) module.

Drives quantized activations/weights/bias into the DUT via valid/ready
stream drivers, computes the expected result with ``LinearInteger`` in
software, and checks the DUT output stream against it.

NOTE(review): despite the file name (new_skid_buffer.py), this testbench
targets a fixed linear layer, not the skid buffer — confirm intent.
"""

import os
import pytest

import torch
import logging
from functools import partial

import cocotb
from cocotb.log import SimLog
from cocotb.triggers import Timer, RisingEdge

from mase_cocotb.testbench import Testbench
from mase_cocotb.interfaces.streaming import (
    StreamDriver,
    StreamMonitor,
    ErrorThresholdStreamMonitor,
)
from mase_cocotb.runner import mase_runner

from chop.nn.quantized.modules.linear import LinearInteger
from chop.nn.quantizers import integer_floor_quantizer


class LinearTB(Testbench):
    """Testbench wiring stream drivers/monitors around a fixed linear DUT."""

    def __init__(self, dut) -> None:
        super().__init__(dut, dut.clk, dut.rst)

        if not hasattr(self, "log"):
            self.log = SimLog("%s" % (type(self).__qualname__))
            self.log.setLevel(logging.DEBUG)

        # Input activation and weight streams.
        self.data_in_0_driver = StreamDriver(
            dut.clk, dut.data_in_0, dut.data_in_0_valid, dut.data_in_0_ready
        )
        self.weight_driver = StreamDriver(
            dut.clk, dut.weight, dut.weight_valid, dut.weight_ready
        )

        # Bias stream only exists when the DUT was generated with a bias port.
        if self.get_parameter("HAS_BIAS") == 1:
            self.bias_driver = StreamDriver(
                dut.clk, dut.bias, dut.bias_valid, dut.bias_ready
            )
            self.bias_driver.log.setLevel(logging.DEBUG)

        # Exact-match output check. Swap in ErrorThresholdStreamMonitor
        # (width/signed/error_bits kwargs) to tolerate +-1 LSB rounding.
        self.data_out_0_monitor = StreamMonitor(
            dut.clk,
            dut.data_out_0,
            dut.data_out_0_valid,
            dut.data_out_0_ready,
            check=True,
        )

        # Software golden model: mirror of the DUT's quantization config.
        self.model = LinearInteger(
            in_features=self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"),
            out_features=self.get_parameter("DATA_OUT_0_TENSOR_SIZE_DIM_0"),
            bias=self.get_parameter("HAS_BIAS") == 1,
            config={
                "data_in_width": self.get_parameter("DATA_IN_0_PRECISION_0"),
                "data_in_frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"),
                "weight_width": self.get_parameter("WEIGHT_PRECISION_0"),
                "weight_frac_width": self.get_parameter("WEIGHT_PRECISION_1"),
                "bias_width": self.get_parameter("BIAS_PRECISION_0"),
                "bias_frac_width": self.get_parameter("BIAS_PRECISION_1"),
            },
            out_config={
                "data_out_width": self.get_parameter("DATA_OUT_0_PRECISION_0"),
                "data_out_frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"),
            },
            floor=True,  # DUT truncates, so the model must floor too
        )

        # Set verbosity of driver and monitor loggers to debug.
        self.data_in_0_driver.log.setLevel(logging.DEBUG)
        self.weight_driver.log.setLevel(logging.DEBUG)
        self.data_out_0_monitor.log.setLevel(logging.DEBUG)

    def generate_inputs(self):
        """Return a random (DIM_1, DIM_0) activation tensor."""
        return torch.randn(
            (
                self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_1"),
                self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"),
            )
        )

    def preprocess_tensor(self, tensor, config, parallelism):
        """Quantize ``tensor`` and flatten it into driver beats.

        config: {"width": int, "frac_width": int} for the fixed-point format.
        parallelism: [rows-per-beat, cols-per-beat] tiling of the tensor.
        Returns a list of flat integer lists, one per stream beat.
        """
        if len(tensor.shape) == 1:
            tensor = tensor.unsqueeze(0)

        # Quantize, then scale to the raw integer representation.
        quantizer = partial(integer_floor_quantizer, **config)
        q_tensor = quantizer(tensor)
        self.log.debug(f"Quantized tensor: {q_tensor}")
        q_tensor = (q_tensor * 2 ** config["frac_width"]).int()
        self.log.debug(f"Tensor in integer format: {q_tensor}")

        # Tile into (parallelism[0] x parallelism[1]) blocks, row-major.
        dim_0_split = q_tensor.split(parallelism[0], dim=0)
        dim_1_split = [x.split(parallelism[1], dim=1) for x in dim_0_split]
        return [
            block.flatten().tolist() for row in dim_1_split for block in row
        ]

    async def run_test(self, batches=1, us=100):
        """Drive ``batches`` random batches and wait ``us`` microseconds."""
        await self.reset()
        self.log.info("Reset finished")
        self.data_out_0_monitor.ready.value = 1
        for _ in range(batches):
            inputs = self.generate_inputs()
            exp_out = self.model(inputs)

            # * Load the inputs driver
            self.log.info(f"Processing inputs: {inputs}")
            inputs = self.preprocess_tensor(
                tensor=inputs,
                config={
                    "width": self.get_parameter("DATA_IN_0_PRECISION_0"),
                    "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"),
                },
                parallelism=[
                    self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"),
                    self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"),
                ],
            )
            self.data_in_0_driver.load_driver(inputs)

            # * Load the weights driver; transpose to match the DUT layout.
            if self.get_parameter("WEIGHTS_PRE_TRANSPOSED") == 1:
                weights = self.model.weight.transpose(0, 1)
            else:
                weights = self.model.weight

            self.log.info(f"Processing weights: {weights}")
            weights = self.preprocess_tensor(
                tensor=weights,
                config={
                    "width": self.get_parameter("WEIGHT_PRECISION_0"),
                    "frac_width": self.get_parameter("WEIGHT_PRECISION_1"),
                },
                parallelism=[
                    self.get_parameter("WEIGHT_PARALLELISM_DIM_1"),
                    self.get_parameter("WEIGHT_PARALLELISM_DIM_0"),
                ],
            )
            self.weight_driver.load_driver(weights)

            # * Load the bias driver
            if self.get_parameter("HAS_BIAS") == 1:
                bias = self.model.bias
                self.log.info(f"Processing bias: {bias}")
                bias = self.preprocess_tensor(
                    tensor=bias,
                    config={
                        "width": self.get_parameter("BIAS_PRECISION_0"),
                        "frac_width": self.get_parameter("BIAS_PRECISION_1"),
                    },
                    parallelism=[
                        self.get_parameter("BIAS_PARALLELISM_DIM_1"),
                        self.get_parameter("BIAS_PARALLELISM_DIM_0"),
                    ],
                )
                self.bias_driver.load_driver(bias)

            # * Load the expected outputs into the monitor.
            self.log.info(f"Processing outputs: {exp_out}")
            outs = self.preprocess_tensor(
                tensor=exp_out,
                config={
                    "width": self.get_parameter("DATA_OUT_0_PRECISION_0"),
                    "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"),
                },
                parallelism=[
                    self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"),
                    self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"),
                ],
            )
            self.data_out_0_monitor.load_monitor(outs)

        await Timer(us, units="us")
        # All expected beats must have been matched within the time budget.
        assert self.data_out_0_monitor.exp_queue.empty()


@cocotb.test()
async def cocotb_test(dut):
    tb = LinearTB(dut)
    await tb.run_test(batches=10, us=100)


async def check_signal(dut, log):
    """Debug stub: spin on the clock (hook for waveform-style probing)."""
    while True:
        await RisingEdge(dut.clk)


# verified cases:
#   weight pre-transposed = 0 / 1, has bias = 0 / 1
def get_fixed_linear_config(kwargs=None):
    """Return the default DUT parameter set, overridden by ``kwargs``.

    The default was previously a mutable ``kwargs={}`` argument — a classic
    shared-default hazard; ``None`` sentinel keeps call sites unchanged.
    If WEIGHTS_PRE_TRANSPOSED flips, weight DIM sizes must be swapped to
    keep them consistent with the activation shape.
    """
    config = {
        "HAS_BIAS": 1,
        "WEIGHTS_PRE_TRANSPOSED": 0,
        "DATA_IN_0_TENSOR_SIZE_DIM_0": 32,
        "DATA_IN_0_TENSOR_SIZE_DIM_1": 16,
        "DATA_IN_0_PARALLELISM_DIM_0": 8,
        "DATA_IN_0_PARALLELISM_DIM_1": 4,
        "WEIGHT_TENSOR_SIZE_DIM_0": 32,
        "WEIGHT_TENSOR_SIZE_DIM_1": 16,
        "WEIGHT_PARALLELISM_DIM_0": 8,
        "WEIGHT_PARALLELISM_DIM_1": 4,
        "DATA_IN_0_PRECISION_0": 8,
        "DATA_IN_0_PRECISION_1": 4,
        "WEIGHT_PRECISION_0": 10,
        "WEIGHT_PRECISION_1": 3,
        "BIAS_PRECISION_0": 5,
        "BIAS_PRECISION_1": 2,
        "DATA_OUT_0_PRECISION_0": 8,
        "DATA_OUT_0_PRECISION_1": 4,
    }
    config.update(kwargs or {})
    return config


@pytest.mark.dev
def test_fixed_linear_smoke():
    """Some quick tests to check if the module is working."""
    mase_runner(
        trace=True,
        module_param_list=[
            get_fixed_linear_config(),
            # If WEIGHTS_PRE_TRANSPOSED changes, DIM sizes must match the
            # activation — see get_fixed_linear_config docstring.
            get_fixed_linear_config(
                {
                    "WEIGHTS_PRE_TRANSPOSED": 0,
                    "WEIGHT_TENSOR_SIZE_DIM_0": 32,
                    "WEIGHT_TENSOR_SIZE_DIM_1": 16,
                    "WEIGHT_PARALLELISM_DIM_0": 4,
                    "WEIGHT_PARALLELISM_DIM_1": 2,
                },
            ),
        ],
    )


# A larger 768-wide regression configuration existed here as commented-out
# code; re-create it from get_fixed_linear_config overrides when needed.

torch.manual_seed(3)
if __name__ == "__main__":
    test_fixed_linear_smoke()
`timescale 1ns / 1ps
// Two-slot skid buffer with registered valid/ready decoupling.
//
// BUG FIX: the previous implementation registered data_in into the buffer
// unconditionally every cycle and produced data_in_ready one cycle late
// (registered from data_out_ready). Under downstream backpressure it
// overwrote un-consumed beats (data loss) and the delayed ready violated
// the valid/ready handshake. This version implements the standard
// output-register + skid-register structure: full throughput when flowing,
// and at most one in-flight beat is parked in the skid slot when the
// consumer stalls, so no beat is ever dropped.
module new_skid_buffer #(
    parameter DATA_WIDTH = 32
) (
    input  logic                    clk,
    input  logic                    rst,

    input  logic [DATA_WIDTH - 1:0] data_in,
    input  logic                    data_in_valid,
    output logic                    data_in_ready,

    output logic [DATA_WIDTH - 1:0] data_out,
    output logic                    data_out_valid,
    input  logic                    data_out_ready
);
  // Overflow ("skid") slot: holds the one beat accepted while the output
  // register was stalled.
  logic [DATA_WIDTH - 1:0] skid_data;
  logic                    skid_valid;

  // We can accept a new beat whenever the skid slot is free; this ready is
  // combinational off a register, so it is timing-friendly and on time.
  assign data_in_ready = ~skid_valid;

  always_ff @(posedge clk) begin
    if (rst) begin
      data_out_valid <= 1'b0;
      skid_valid     <= 1'b0;
    end else begin
      if (!data_out_valid || data_out_ready) begin
        // Output register is empty or being drained this cycle:
        // refill it, preferring the parked skid beat (preserves order).
        if (skid_valid) begin
          data_out       <= skid_data;
          data_out_valid <= 1'b1;
          skid_valid     <= 1'b0;
        end else begin
          data_out       <= data_in;
          data_out_valid <= data_in_valid;
        end
      end else if (data_in_valid && !skid_valid) begin
        // Output stalled while a beat is being offered: park it.
        skid_data  <= data_in;
        skid_valid <= 1'b1;
      end
    end
  end

endmodule
    def preprocess_tensor_for_mxint(self, tensor, config, parallelism):
        """Quantize ``tensor`` to MXInt and pack it into driver-ready beats."""
        (qtensor, mtensor, etensor) = mxint_hardware(tensor, config, parallelism)
        tensor_inputs = pack_tensor_to_mx_listed_chunk(mtensor, etensor, parallelism)
        return tensor_inputs

    async def run_test(self):
        """Drive one random vector through the layernorm DUT and check it.

        Flow: quantize the input to MXInt, normalize all blocks to the
        maximum exponent, drive the mantissa/exponent streams, compute the
        golden result with the software mxint_layer_norm model, and load
        the expected beats into the output monitor.
        """
        await self.reset()
        self.log.info("Reset finished")
        self.out_monitor.ready.value = 1

        input_data = torch.randn((1, self.tensor_size_dim_0))
        # Config keys follow the RTL parameter names.
        input_config = {
            "width": self.get_parameter("DATA_IN_0_MAN_WIDTH"),
            "exponent_width": self.get_parameter("DATA_IN_0_EXP_WIDTH"),
            "round_bits": 4,
        }

        input_parallelism = [
            1,
            self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"),
        ]
        (qtensor, mtensor, etensor) = mxint_hardware(input_data, input_config, input_parallelism)
        shape = mtensor.shape
        # Re-align every block to the global maximum exponent: shift each
        # mantissa right by (e_max - e_block) and broadcast e_max.
        mtensor = mtensor.reshape(-1, self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0")).unsqueeze(0)
        mtensor = mtensor // 2**(etensor.max() - etensor).unsqueeze(-1)
        etensor = etensor.max().repeat(etensor.shape)
        input_data_processed = pack_tensor_to_mx_listed_chunk(mtensor, etensor, input_parallelism)
        self.data_in_driver.load_driver(input_data_processed)

        # Local import keeps the project dependency off the module path
        # until the test actually runs.
        from a_cx_mxint_quant.layernorm import mxint_layer_norm
        # Reconstruct the real-valued input the DUT sees from the aligned
        # mantissas/exponents.
        qinput = mtensor * 2**(etensor.unsqueeze(-1) - input_config["width"] - 1)
        qinput = qinput.reshape(shape)
        layer_norm_config = {
            "name": "mxint_hardware",
            # data
            "data_in_width": self.get_parameter("DATA_IN_0_MAN_WIDTH"),
            "data_in_exponent_width": self.get_parameter("DATA_IN_0_EXP_WIDTH"),
            "data_in_parallelism": [1, self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0")],
            "data_out_width": self.get_parameter("DATA_OUT_0_MAN_WIDTH"),
            "data_out_exponent_width": self.get_parameter("DATA_OUT_0_EXP_WIDTH"),
            "data_out_parallelism": [1, self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0")],
        }
        # Internal fixed-point widths of the golden layernorm model; the
        # *_lossy flags enable the same truncation the hardware performs.
        # NOTE(review): layer_norm_config is only read for its widths below;
        # confirm the rest of it is intentionally unused.
        int_config = {
            "qx_lossy": True,
            "num_val_0_lossy": True,
            "num_val_1_lossy": True,
            "mean_lossy": True,
            "var_lossy": True,
            "isqrt_lossy": True,
            "data_in_width": layer_norm_config["data_in_width"],
            "data_in_frac_width": layer_norm_config["data_in_width"] - 1,
            "isqrt_in_width": self.get_parameter("ISQRT_IN_MAN_WIDTH"),
            "isqrt_in_exponent_width": 6,
            "isqrt_out_width": self.get_parameter("ISQRT_OUT_MAN_WIDTH"),
            "isqrt_out_frac_width": self.get_parameter("ISQRT_OUT_MAN_FRAC_WIDTH"),
            "isqrt_out_exponent_width": 6,
            "weight_width": 8,
            "weight_frac_width": 6,
            "bias_width": 8,
            "bias_frac_width": 6,
            "data_out_width": self.get_parameter("DATA_OUT_0_MAN_WIDTH"),
            "data_out_frac_width": self.get_parameter("DATA_OUT_0_MAN_FRAC_WIDTH"),
        }
        qout_data, mout_data, eout_data = mxint_layer_norm(qinput, (self.tensor_size_dim_0,), None, None, q_config=int_config)
        eout_data = eout_data.repeat(etensor.shape)

        # RTL exposes a single parallelism dimension for the output.
        out_parallelism = [
            1,
            self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"),
        ]
        out_processed = pack_tensor_to_mx_listed_chunk(mout_data, eout_data, out_parallelism)

        self.out_monitor.load_monitor(out_processed)

        await Timer(100, units="us")
        # Every expected beat must have been matched within the time budget.
        if not self.out_monitor.exp_queue.empty():
            raise RuntimeError("Output monitor is not empty at end of test")


@cocotb.test()
async def test_mxint_layer_norm(dut):
    # check_signal is a background probe; cocotb kills it when the test ends.
    cocotb.start_soon(check_signal(dut))
    tb = MxIntLayerNorm1DTB(dut)
    await tb.run_test()


async def check_signal(dut):
    """Debug probe: print the input handshake every clock edge."""
    await Timer(40, units="ns")
    while True:
        await RisingEdge(dut.clk)
        await ReadOnly()
        print(dut.data_in_0_valid.value, dut.data_in_0_ready.value)
    print("end")  # unreachable; kept from original


# Default DUT parameters (comments note values chosen to match the RTL).
default_config = {
    # Input/output dimensions
    "DATA_IN_0_TENSOR_SIZE_DIM_0": 10,  # Changed from 8 to match RTL
    "DATA_IN_0_PARALLELISM_DIM_0": 2,  # Changed from 2 to match RTL

    # Data width parameters
    "DATA_IN_0_MAN_WIDTH": 8,  # Added to match RTL
    "DATA_IN_0_MAN_FRAC_WIDTH": 7,  # Added to match RTL
    "DATA_IN_0_EXP_WIDTH": 4,  # Added to match RTL

    "DATA_OUT_0_MAN_WIDTH": 8,  # Added to match RTL
    "DATA_OUT_0_MAN_FRAC_WIDTH": 7,  # Added to match RTL
    "DATA_OUT_0_EXP_WIDTH": 4,  # Added to match RTL

    # ISQRT parameters
    "ISQRT_IN_MAN_WIDTH": 8,  # Added to match RTL
    "ISQRT_IN_MAN_FRAC_WIDTH": 7,  # Added to match RTL
    "ISQRT_OUT_MAN_WIDTH": 8,  # Added to match RTL
    "ISQRT_OUT_MAN_FRAC_WIDTH": 4,  # Added to match RTL
}


def test_layer_norm_smoke():
    """Regenerate the isqrt LUT for the configured widths, then run the sim."""
    valid_width = default_config["ISQRT_IN_MAN_WIDTH"] + 1
    valid_frac_width = default_config["ISQRT_IN_MAN_WIDTH"] - 1

    out_width = default_config["ISQRT_OUT_MAN_WIDTH"]
    out_frac_width = default_config["ISQRT_OUT_MAN_FRAC_WIDTH"]

    from mase_components.helper import generate_memory
    generate_memory.generate_sv_lut(
        "isqrt",
        valid_width,
        valid_frac_width,
        out_width,
        out_frac_width,
        path=Path(__file__).parents[1] / "rtl",
        constant_mult=1,
        floor=False,
    )
    mase_runner(
        trace=True,
        module_param_list=[default_config],
        skip_build=False,
        sim="verilator",
    )


if __name__ == "__main__":
    test_layer_norm_smoke()
`timescale 1ns / 1ps
/*
Module      : mxint_cast_log
Description : MxInt cast between layers. Finds the block's maximum
              magnitude (as a log2), derives the shared output exponent,
              then shifts + rounds every mantissa to the output format.
              The datapath is fully elastic (valid/ready) with two
              register slices on the shift stage for timing.
*/
module mxint_cast_log #(
    parameter IN_MAN_WIDTH = 1,
    parameter IN_MAN_FRAC_WIDTH = IN_MAN_WIDTH - 1,
    parameter IN_EXP_WIDTH = 1,
    parameter OUT_MAN_WIDTH = 1,
    parameter OUT_EXP_WIDTH = 1,
    parameter ROUND_BITS = 4,  // extra LSBs kept before final rounding
    parameter BLOCK_SIZE = 1   // mantissas sharing one exponent
) (
    /* verilator lint_off UNUSEDSIGNAL */
    input logic clk,
    input logic rst,
    /* verilator lint_on UNUSEDSIGNAL */
    input logic [ IN_MAN_WIDTH-1:0] mdata_in [BLOCK_SIZE-1:0],
    input logic [ IN_EXP_WIDTH-1:0] edata_in,
    input logic data_in_valid,
    output logic data_in_ready,
    output logic [OUT_MAN_WIDTH-1:0] mdata_out [BLOCK_SIZE-1:0],
    output logic [OUT_EXP_WIDTH-1:0] edata_out,
    output logic data_out_valid,
    input logic data_out_ready
);
  // Bit width of log2(max |mantissa|) over the block.
  localparam LOG2_WIDTH = $clog2(IN_MAN_WIDTH) + 1;

  // Wide enough to hold log2_max + edata_in - IN_MAN_FRAC_WIDTH losslessly
  // before clamping into OUT_EXP_WIDTH.
  localparam LOSSLESSS_EDATA_WIDTH =
      (LOG2_WIDTH > IN_EXP_WIDTH && LOG2_WIDTH > OUT_EXP_WIDTH) ? LOG2_WIDTH + 2 :
      (IN_EXP_WIDTH > OUT_EXP_WIDTH) ? IN_EXP_WIDTH + 2 :
      OUT_EXP_WIDTH + 2;

  localparam SHIFT_WIDTH = (OUT_EXP_WIDTH > IN_EXP_WIDTH) ? OUT_EXP_WIDTH + 1 : IN_EXP_WIDTH + 1;
  // NOTE(review): SHIFT_DATA_WIDTH appears unused in this module — confirm.
  localparam SHIFT_DATA_WIDTH = OUT_MAN_WIDTH + 1;

  // Intermediate mantissa width before the final rounding stage.
  localparam CAST_WIDTH = OUT_MAN_WIDTH + ROUND_BITS;

  // Branch feeding the max-magnitude search.
  logic [IN_MAN_WIDTH - 1:0] mdata_for_max [BLOCK_SIZE - 1:0];
  logic data_for_max_valid, data_for_max_ready;

  // FIFO branch carrying the raw data until the max result is ready.
  logic [IN_MAN_WIDTH-1:0] mdata_for_out [BLOCK_SIZE-1:0];
  logic [IN_EXP_WIDTH-1:0] edata_for_out;
  logic data_for_out_valid, data_for_out_ready;

  // log2_max_abs result, before and after its register slice.
  logic [LOG2_WIDTH-1:0] log2_max_value_unreg;
  logic log2_max_value_valid_unreg, log2_max_value_ready_unreg;

  logic [LOG2_WIDTH - 1:0] log2_max_value;
  logic log2_max_value_valid, log2_max_value_ready;

  logic [LOSSLESSS_EDATA_WIDTH - 1:0] edata_out_full;
  logic [OUT_EXP_WIDTH - 1:0] edata_out_unreg;
  logic [SHIFT_WIDTH - 1:0] shift_value;
  // Output exponent and shift amount travel together through the slices.
  logic [IN_EXP_WIDTH + SHIFT_WIDTH - 1:0] merge_shift_edata_unreg;

  logic data_out_join_valid, data_out_join_ready;
  // We don't need a full-range shifter here: values are clamped at the end,
  // so the shift datapath only keeps OUT_MAN_WIDTH + 1 significant bits.

  logic [IN_EXP_WIDTH + SHIFT_WIDTH - 1:0] merge_shift_edata_reg;
  logic [IN_MAN_WIDTH-1:0] mdata_for_out_reg [BLOCK_SIZE-1:0];
  logic [SHIFT_WIDTH-1:0] shift_value_reg;

  logic [IN_EXP_WIDTH + SHIFT_WIDTH - 1:0] merge_shift_edata_reg_1;
  logic [IN_MAN_WIDTH-1:0] mdata_for_out_reg_1 [BLOCK_SIZE-1:0];
  logic data_out_reg_valid_1;
  logic data_out_reg_ready_1;

  logic [CAST_WIDTH-1:0] mdata_for_cast_reg [BLOCK_SIZE-1:0];

  logic [OUT_MAN_WIDTH-1:0] mdata_out_reg [BLOCK_SIZE-1:0];
  logic [OUT_EXP_WIDTH-1:0] edata_out_reg;

  logic data_out_reg_valid;
  logic data_out_reg_ready;

  // Fork the input: one copy to the max search, one buffered copy (FIFO
  // depth covers the max-search latency) for the shift datapath.
  unpacked_mx_split2_with_data #(
      .DEPTH($clog2(BLOCK_SIZE) + 1),
      .MAN_WIDTH(IN_MAN_WIDTH),
      .EXP_WIDTH(IN_EXP_WIDTH),
      .IN_SIZE(BLOCK_SIZE)
  ) data_in_0_unpacked_mx_split2_with_data_i (
      .clk(clk),
      .rst(rst),
      .mdata_in(mdata_in),
      .edata_in(edata_in),
      .data_in_valid(data_in_valid),
      .data_in_ready(data_in_ready),
      .fifo_mdata_out(mdata_for_out),
      .fifo_edata_out(edata_for_out),
      .fifo_data_out_valid(data_for_out_valid),
      .fifo_data_out_ready(data_for_out_ready),
      .straight_mdata_out(mdata_for_max),
      .straight_edata_out(),
      .straight_data_out_valid(data_for_max_valid),
      .straight_data_out_ready(data_for_max_ready)
  );

  // log2 of the maximum absolute mantissa across the block.
  log2_max_abs #(
      .IN_SIZE (BLOCK_SIZE),
      .IN_WIDTH(IN_MAN_WIDTH)
  ) max_bas_i (
      .clk,
      .rst,
      .data_in_0(mdata_for_max),
      .data_in_0_valid(data_for_max_valid),
      .data_in_0_ready(data_for_max_ready),
      .data_out_0(log2_max_value_unreg),
      .data_out_0_valid(log2_max_value_valid_unreg),
      .data_out_0_ready(log2_max_value_ready_unreg)
  );

  // Register slice to break the timing path out of log2_max_abs.
  skid_buffer #(
      .DATA_WIDTH(LOG2_WIDTH)
  ) log2_reg_slice (
      .clk(clk),
      .rst(rst),
      .data_in(log2_max_value_unreg),
      .data_in_valid(log2_max_value_valid_unreg),
      .data_in_ready(log2_max_value_ready_unreg),
      .data_out(log2_max_value),
      .data_out_valid(log2_max_value_valid),
      .data_out_ready(log2_max_value_ready)
  );

  // Candidate output exponent: position of the block maximum.
  assign edata_out_full = $signed(
      log2_max_value
  ) + $signed(
      edata_for_out
  ) - IN_MAN_FRAC_WIDTH;

  // Clamp the exponent into the representable output range.
  signed_clamp #(
      .IN_WIDTH (LOSSLESSS_EDATA_WIDTH),
      .OUT_WIDTH(OUT_EXP_WIDTH)
  ) exp_clamp (
      .in_data (edata_out_full),
      .out_data(edata_out_unreg)
  );

  // Per-block right-shift that renormalizes mantissas to the new exponent.
  assign shift_value = $signed(
      edata_out_unreg
  ) - $signed(
      edata_for_out
  ) + IN_MAN_FRAC_WIDTH - (CAST_WIDTH - 1);

  // Rendezvous of the buffered data stream and the max/exponent stream.
  join2 #() join_inst (
      .data_in_ready ({data_for_out_ready, log2_max_value_ready}),
      .data_in_valid ({data_for_out_valid, log2_max_value_valid}),
      .data_out_valid(data_out_join_valid),
      .data_out_ready(data_out_join_ready)
  );

  // Two back-to-back register slices carrying {exponent, shift} alongside
  // the mantissas (pipelining for the wide shifter that follows).
  mxint_register_slice #(
      .DATA_PRECISION_0(IN_MAN_WIDTH),
      .DATA_PRECISION_1(IN_EXP_WIDTH + SHIFT_WIDTH),
      .IN_NUM(BLOCK_SIZE)
  ) shift_value_reg_slice (
      .clk(clk),
      .rst(rst),
      .mdata_in(mdata_for_out),
      .edata_in(merge_shift_edata_unreg),
      .data_in_valid(data_out_join_valid),
      .data_in_ready(data_out_join_ready),
      .mdata_out(mdata_for_out_reg_1),
      .edata_out(merge_shift_edata_reg_1),
      .data_out_valid(data_out_reg_valid_1),
      .data_out_ready(data_out_reg_ready_1)
  );

  mxint_register_slice #(
      .DATA_PRECISION_0(IN_MAN_WIDTH),
      .DATA_PRECISION_1(IN_EXP_WIDTH + SHIFT_WIDTH),
      .IN_NUM(BLOCK_SIZE)
  ) shift_value_reg_1_slice (
      .clk(clk),
      .rst(rst),
      .mdata_in(mdata_for_out_reg_1),
      .edata_in(merge_shift_edata_reg_1),
      .data_in_valid(data_out_reg_valid_1),
      .data_in_ready(data_out_reg_ready_1),
      .mdata_out(mdata_for_out_reg),
      .edata_out(merge_shift_edata_reg),
      .data_out_valid(data_out_reg_valid),
      .data_out_ready(data_out_reg_ready)
  );
  // Pack/unpack {exponent, shift} around the register slices.
  assign merge_shift_edata_unreg = {edata_out_unreg, shift_value};
  assign edata_out_reg = merge_shift_edata_reg[IN_EXP_WIDTH + SHIFT_WIDTH - 1:SHIFT_WIDTH];
  assign shift_value_reg = merge_shift_edata_reg[SHIFT_WIDTH - 1:0];

  // Renormalizing shift into the CAST_WIDTH intermediate format.
  optimized_right_shift #(
      .IN_WIDTH(IN_MAN_WIDTH),
      .SHIFT_WIDTH(SHIFT_WIDTH),
      .OUT_WIDTH(CAST_WIDTH),
      .BLOCK_SIZE(BLOCK_SIZE)
  ) ovshift_inst (
      .data_in(mdata_for_out_reg),
      .shift_value(shift_value_reg),
      .data_out(mdata_for_cast_reg)
  );

  // Round away the ROUND_BITS guard bits down to OUT_MAN_WIDTH.
  fixed_rounding #(
      .IN_SIZE(BLOCK_SIZE),
      .IN_WIDTH(CAST_WIDTH),
      .IN_FRAC_WIDTH(CAST_WIDTH - 1),
      .OUT_WIDTH(OUT_MAN_WIDTH),
      .OUT_FRAC_WIDTH(OUT_MAN_WIDTH - 1)
  ) fixed_cast_inst (
      .data_in(mdata_for_cast_reg),
      .data_out(mdata_out_reg)  // Changed to feed into skid buffer
  );


  // Output register slice: decouples the combinational round/shift logic
  // from the downstream consumer.
  mxint_skid_buffer #(
      .DATA_PRECISION_0(OUT_MAN_WIDTH),
      .DATA_PRECISION_1(OUT_EXP_WIDTH),
      .IN_NUM(BLOCK_SIZE)
  ) output_skid_buffer (
      .clk(clk),
      .rst(rst),
      .mdata_in(mdata_out_reg),
      .edata_in(edata_out_reg),
      .data_in_valid(data_out_reg_valid),
      .data_in_ready(data_out_reg_ready),
      .mdata_out(mdata_out),
      .edata_out(edata_out),
      .data_out_valid(data_out_valid),
      .data_out_ready(data_out_ready)
  );

endmodule
`timescale 1ns / 1ps
/*
Module      : mxint_cast_try1
Description : MxInt cast between layers — experimental variant that replaces
              the handshaked FIFO fork of mxint_cast_log with a fixed-latency
              mxint_delay on the data path, matched against the latency of
              log2_max_abs on the exponent path.
NOTE(review): the delay line has no valid/ready; alignment relies on
              log2_max_abs having a fixed $clog2(BLOCK_SIZE)+1 latency and
              never being stalled mid-stream — confirm before reuse.
*/
module mxint_cast_try1 #(
    parameter IN_MAN_WIDTH = 1,
    parameter IN_MAN_FRAC_WIDTH = IN_MAN_WIDTH - 1,
    parameter IN_EXP_WIDTH = 1,
    parameter OUT_MAN_WIDTH = 1,
    parameter OUT_EXP_WIDTH = 1,
    parameter ROUND_BITS = 4,  // guard bits kept before final rounding
    parameter BLOCK_SIZE = 1   // mantissas sharing one exponent
) (
    /* verilator lint_off UNUSEDSIGNAL */
    input logic clk,
    input logic rst,
    /* verilator lint_on UNUSEDSIGNAL */
    input logic [ IN_MAN_WIDTH-1:0] mdata_in [BLOCK_SIZE-1:0],
    input logic [ IN_EXP_WIDTH-1:0] edata_in,
    input logic data_in_valid,
    output logic data_in_ready,
    output logic [OUT_MAN_WIDTH-1:0] mdata_out [BLOCK_SIZE-1:0],
    output logic [OUT_EXP_WIDTH-1:0] edata_out,
    output logic data_out_valid,
    input logic data_out_ready
);
  // Width of log2(max |mantissa|) over the block.
  localparam LOG2_WIDTH = $clog2(IN_MAN_WIDTH) + 1;

  // Lossless width for log2_max + edata_in - IN_MAN_FRAC_WIDTH, pre-clamp.
  localparam LOSSLESSS_EDATA_WIDTH =
      (LOG2_WIDTH > IN_EXP_WIDTH && LOG2_WIDTH > OUT_EXP_WIDTH) ? LOG2_WIDTH + 2 :
      (IN_EXP_WIDTH > OUT_EXP_WIDTH) ? IN_EXP_WIDTH + 2 :
      OUT_EXP_WIDTH + 2;

  localparam SHIFT_WIDTH = (OUT_EXP_WIDTH > IN_EXP_WIDTH) ? OUT_EXP_WIDTH + 1 : IN_EXP_WIDTH + 1;
  // NOTE(review): SHIFT_DATA_WIDTH appears unused here — confirm.
  localparam SHIFT_DATA_WIDTH = OUT_MAN_WIDTH + 1;

  localparam CAST_WIDTH = OUT_MAN_WIDTH + ROUND_BITS;

  // NOTE(review): several of the declarations below (mdata_for_max,
  // *_unreg/_reg_1 handshakes, merge_shift_* buses) look like leftovers
  // from the mxint_cast_log variant and are not driven in this module.
  logic [IN_MAN_WIDTH - 1:0] mdata_for_max [BLOCK_SIZE - 1:0];
  logic data_for_max_valid, data_for_max_ready;

  logic [IN_MAN_WIDTH-1:0] mdata_for_out [BLOCK_SIZE-1:0];
  logic [IN_EXP_WIDTH-1:0] edata_for_out;
  logic data_for_out_valid, data_for_out_ready;

  logic [LOG2_WIDTH-1:0] log2_max_value_unreg;
  logic log2_max_value_valid_unreg, log2_max_value_ready_unreg;

  logic [LOG2_WIDTH - 1:0] log2_max_value;
  logic log2_max_value_valid, log2_max_value_ready;

  logic [LOSSLESSS_EDATA_WIDTH - 1:0] edata_out_full;
  logic [SHIFT_WIDTH - 1:0] shift_value;
  logic [IN_EXP_WIDTH + SHIFT_WIDTH - 1:0] merge_shift_edata_unreg;

  logic data_out_join_valid, data_out_join_ready;
  // Full-range shifting is unnecessary: the result is clamped at the end,
  // so only OUT_MAN_WIDTH + 1 significant bits are carried.

  logic [IN_EXP_WIDTH + SHIFT_WIDTH - 1:0] merge_shift_edata_reg;
  logic [IN_MAN_WIDTH-1:0] mdata_for_out_reg [BLOCK_SIZE-1:0];
  logic [SHIFT_WIDTH-1:0] shift_value_reg;

  logic [IN_EXP_WIDTH + SHIFT_WIDTH - 1:0] merge_shift_edata_reg_1;
  logic [IN_MAN_WIDTH-1:0] mdata_for_out_reg_1 [BLOCK_SIZE-1:0];
  logic data_out_reg_valid_1;
  logic data_out_reg_ready_1;

  logic [CAST_WIDTH-1:0] mdata_for_cast [BLOCK_SIZE-1:0];

  logic [OUT_MAN_WIDTH-1:0] mdata_out_unreg [BLOCK_SIZE-1:0];
  logic [OUT_EXP_WIDTH-1:0] edata_out_unreg;

  logic data_out_reg_valid;
  logic data_out_reg_ready;

  // Fixed-latency shadow of the input, matched to log2_max_abs latency.
  mxint_delay #(
      .DATA_PRECISION_0(IN_MAN_WIDTH),
      .DATA_PRECISION_1(IN_EXP_WIDTH),
      .BLOCK_SIZE(BLOCK_SIZE),
      .DELAY_REG_COUNT($clog2(BLOCK_SIZE) + 1)
  ) mxint_delay_inst (
      .clk(clk),
      .rst(rst),
      .mdata_in(mdata_in),
      .edata_in(edata_in),
      .mdata_out(mdata_for_out),
      .edata_out(edata_for_out)
  );
  // log2 of the maximum absolute mantissa; owns the input handshake.
  log2_max_abs #(
      .IN_SIZE (BLOCK_SIZE),
      .IN_WIDTH(IN_MAN_WIDTH)
  ) max_bas_i (
      .clk,
      .rst,
      .data_in_0(mdata_in),
      .data_in_0_valid(data_in_valid),
      .data_in_0_ready(data_in_ready),
      .data_out_0(log2_max_value),
      .data_out_0_valid(log2_max_value_valid),
      .data_out_0_ready(log2_max_value_ready)
  );

  // get edata_out: candidate exponent at the block maximum.
  assign edata_out_full = $signed(
      log2_max_value
  ) + $signed(
      edata_for_out
  ) - IN_MAN_FRAC_WIDTH;

  // Clamp into the representable output exponent range.
  signed_clamp #(
      .IN_WIDTH (LOSSLESSS_EDATA_WIDTH),
      .OUT_WIDTH(OUT_EXP_WIDTH)
  ) exp_clamp (
      .in_data (edata_out_full),
      .out_data(edata_out_unreg)
  );

  // get shift_value: renormalizing right shift for each mantissa.
  assign shift_value = $signed(
      edata_out_unreg
  ) - $signed(
      edata_for_out
  ) + IN_MAN_FRAC_WIDTH - (CAST_WIDTH - 1);

  optimized_right_shift #(
      .IN_WIDTH(IN_MAN_WIDTH),
      .SHIFT_WIDTH(SHIFT_WIDTH),
      .OUT_WIDTH(CAST_WIDTH),
      .BLOCK_SIZE(BLOCK_SIZE)
  ) ovshift_inst (
      .data_in(mdata_for_out),
      .shift_value(shift_value),
      .data_out(mdata_for_cast)
  );
  // Round the ROUND_BITS guard bits away down to OUT_MAN_WIDTH.
  fixed_rounding #(
      .IN_SIZE(BLOCK_SIZE),
      .IN_WIDTH(CAST_WIDTH),
      .IN_FRAC_WIDTH(CAST_WIDTH - 1),
      .OUT_WIDTH(OUT_MAN_WIDTH),
      .OUT_FRAC_WIDTH(OUT_MAN_WIDTH - 1)
  ) fixed_cast_inst (
      .data_in(mdata_for_cast),
      .data_out(mdata_out_unreg)  // Changed to feed into skid buffer
  );

  // Output register slice; its handshake is borrowed from log2_max_abs,
  // which paces the whole pipeline.
  mxint_register_slice #(
      .DATA_PRECISION_0(OUT_MAN_WIDTH),
      .DATA_PRECISION_1(OUT_EXP_WIDTH),
      .IN_NUM(BLOCK_SIZE)
  ) register_slice_inst (
      .clk(clk),
      .rst(rst),
      .mdata_in(mdata_out_unreg),
      .edata_in(edata_out_unreg),
      .data_in_valid(log2_max_value_valid),
      .data_in_ready(log2_max_value_ready),
      .mdata_out(mdata_out),
      .edata_out(edata_out),
      .data_out_valid(data_out_valid),
      .data_out_ready(data_out_ready)
  );



endmodule

// Simple DELAY_REG_COUNT-deep shift register (no handshake).
module delay_reg #(
    parameter DATA_PRECISION_0 = 1,
    parameter DATA_PRECISION_1 = 1,  // NOTE(review): unused — confirm
    parameter DELAY_REG_COUNT = 1
) (
    input logic clk,
    input logic rst,
    input logic [DATA_PRECISION_0-1:0] data_in,
    output logic [DATA_PRECISION_0-1:0] data_out
);
  logic [DATA_PRECISION_0-1:0] data_delay[DELAY_REG_COUNT-1:0];
  always_ff @(posedge clk) begin
    if (rst) begin
      for (int i = 0; i < DELAY_REG_COUNT; i++) begin
        data_delay[i] <= '0;
      end
    end else begin
      data_delay[0] <= data_in;
      for (int i = 0; i < DELAY_REG_COUNT-1; i++) begin
        data_delay[i+1] <= data_delay[i];
      end
    end
  end
  assign data_out = data_delay[DELAY_REG_COUNT-1];
endmodule

// Fixed delay for an MxInt beat: packs mantissas + exponent into one wide
// word, runs it through delay_reg, and unpacks on the far side.
module mxint_delay #(
    parameter DATA_PRECISION_0 = 1,  // mantissa width
    parameter DATA_PRECISION_1 = 1,  // exponent width
    parameter BLOCK_SIZE = 1,
    parameter DELAY_REG_COUNT = 1
) (
    input logic clk,
    input logic rst,
    input logic [DATA_PRECISION_0-1:0] mdata_in [BLOCK_SIZE-1:0],
    input logic [DATA_PRECISION_1-1:0] edata_in,
    output logic [DATA_PRECISION_0-1:0] mdata_out [BLOCK_SIZE-1:0],
    output logic [DATA_PRECISION_1-1:0] edata_out
);
  logic [DATA_PRECISION_0 * BLOCK_SIZE + DATA_PRECISION_1-1:0] data_in_pack;
  logic [DATA_PRECISION_0 * BLOCK_SIZE + DATA_PRECISION_1-1:0] data_out_pack;
  // Pack: mantissas in the low bits, exponent on top.
  for (genvar i = 0; i < BLOCK_SIZE; i++) begin
    assign data_in_pack[DATA_PRECISION_0 * (i+1) - 1:DATA_PRECISION_0 * i] = mdata_in[i];
  end
  assign data_in_pack[DATA_PRECISION_0 * BLOCK_SIZE + DATA_PRECISION_1-1:DATA_PRECISION_0 * BLOCK_SIZE] = edata_in;
  delay_reg #(
      .DATA_PRECISION_0(DATA_PRECISION_0 * BLOCK_SIZE + DATA_PRECISION_1),
      .DATA_PRECISION_1(DATA_PRECISION_1),
      .DELAY_REG_COUNT(DELAY_REG_COUNT)
  ) delay_reg_inst (
      .clk(clk),
      .rst(rst),
      .data_in(data_in_pack),
      .data_out(data_out_pack)
  );
  // Unpack back into per-element mantissas and the shared exponent.
  for (genvar i = 0; i < BLOCK_SIZE; i++) begin
    assign mdata_out[i] = data_out_pack[DATA_PRECISION_0 * (i+1) - 1:DATA_PRECISION_0 * i];
  end
  assign edata_out = data_out_pack[DATA_PRECISION_0 * BLOCK_SIZE + DATA_PRECISION_1-1:DATA_PRECISION_0 * BLOCK_SIZE];
endmodule
DATA_IN_0_PRECISION_1 + 1 : WEIGHT_PRECISION_1 + 1 +) ( + input clk, + input rst, + // m -> mantissa, e -> exponent + input logic [DATA_IN_0_PRECISION_0-1:0] mdata_in_0[BLOCK_SIZE - 1:0], + input logic [DATA_IN_0_PRECISION_1-1:0] edata_in_0, + input data_in_0_valid, + output data_in_0_ready, + + input logic [WEIGHT_PRECISION_0-1:0] mweight[BLOCK_SIZE - 1:0], + input logic [WEIGHT_PRECISION_1-1:0] eweight, + input weight_valid, + output weight_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_0, + output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_0, + output data_out_0_valid, + input data_out_0_ready +); + + logic [DATA_IN_0_PRECISION_0 - 1:0] mdata_in_0_reg_out[BLOCK_SIZE - 1:0]; + logic mdata_in_0_reg_out_valid, mdata_in_0_reg_out_ready; + logic [DATA_IN_0_PRECISION_1 - 1:0] buffer_edata_in_0; + logic buffer_edata_in_0_valid, buffer_edata_in_0_ready; + + logic [WEIGHT_PRECISION_0 - 1:0] mweight_reg_out[BLOCK_SIZE - 1:0]; + logic mweight_reg_out_valid, mweight_reg_out_ready; + + logic [WEIGHT_PRECISION_1-1:0] buffer_eweight; + logic buffer_eweight_valid, buffer_eweight_ready; + + logic mdata_out_0_valid, mdata_out_0_ready; + mxint_straightm_fifoe #( + .DEPTH($clog2(BLOCK_SIZE) + 1), + .MAN_WIDTH(DATA_IN_0_PRECISION_0), + .EXP_WIDTH(DATA_IN_0_PRECISION_1), + .IN_SIZE(BLOCK_SIZE) + ) data_in_0_split_m_e ( + .clk(clk), + .rst(rst), + .mdata_in(mdata_in_0), + .edata_in(edata_in_0), + .data_in_valid(data_in_0_valid), + .data_in_ready(data_in_0_ready), + .fifo_edata_out(buffer_edata_in_0), + .fifo_edata_out_valid(buffer_edata_in_0_valid), + .fifo_edata_out_ready(buffer_edata_in_0_ready), + .straight_mdata_out(mdata_in_0_reg_out), + .straight_mdata_out_valid(mdata_in_0_reg_out_valid), + .straight_mdata_out_ready(mdata_in_0_reg_out_ready) + ); + + mxint_straightm_fifoe #( + .DEPTH($clog2(BLOCK_SIZE) + 1), + .MAN_WIDTH(WEIGHT_PRECISION_0), + .EXP_WIDTH(WEIGHT_PRECISION_1), + .IN_SIZE(BLOCK_SIZE) + ) weight_split_m_e ( + .clk(clk), + .rst(rst), + 
.mdata_in(mweight), + .edata_in(eweight), + .data_in_valid(weight_valid), + .data_in_ready(weight_ready), + .fifo_edata_out(buffer_eweight), + .fifo_edata_out_valid(buffer_eweight_valid), + .fifo_edata_out_ready(buffer_eweight_ready), + .straight_mdata_out(mweight_reg_out), + .straight_mdata_out_valid(mweight_reg_out_valid), + .straight_mdata_out_ready(mweight_reg_out_ready) + ); + assign edata_out_0 = $signed(buffer_eweight) + $signed(buffer_edata_in_0); + fixed_dot_product #( + .IN_WIDTH(DATA_IN_0_PRECISION_0), + .WEIGHT_WIDTH(WEIGHT_PRECISION_0), + .IN_SIZE(BLOCK_SIZE) + ) fdp_inst ( + .clk(clk), + .rst(rst), + .data_in(mdata_in_0_reg_out), + .data_in_valid(mdata_in_0_reg_out_valid), + .data_in_ready(mdata_in_0_reg_out_ready), + .weight(mweight_reg_out), + .weight_valid(mweight_reg_out_valid), + .weight_ready(mweight_reg_out_ready), + .data_out(mdata_out_0), + .data_out_valid(mdata_out_0_valid), + .data_out_ready(mdata_out_0_ready) + ); + + join_n #( + .NUM_HANDSHAKES(3) + ) join_inst ( + .data_in_ready ({mdata_out_0_ready, buffer_eweight_ready, buffer_edata_in_0_ready}), + .data_in_valid ({mdata_out_0_valid, buffer_eweight_valid, buffer_edata_in_0_valid}), + .data_out_valid(data_out_0_valid), + .data_out_ready(data_out_0_ready) + ); + +endmodule diff --git a/a_cx_test_files/source_code_list/some_useless_code.py b/a_cx_test_files/source_code_list/some_useless_code.py new file mode 100644 index 000000000..2036fdb9f --- /dev/null +++ b/a_cx_test_files/source_code_list/some_useless_code.py @@ -0,0 +1,532 @@ +class MXIntLinearHardware(_LinearBase): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + config=None, + ) -> None: + super().__init__(in_features, out_features, bias, device, dtype) + assert config is not None, "config is None!" 
+ self.in_features = in_features + self.out_features = out_features + self.config = config + self.bypass = config.get("bypass", False) + if self.bypass: + return + # establish quantizer + w_width, w_exponent_width = ( + config["weight_width"], + config["weight_exponent_width"], + ) + w_p1, w_p0 = ( + config["weight_parallelism"][0], + config["weight_parallelism"][1], + ) + x_width, x_exponent_width = ( + config["data_in_width"], + config["data_in_exponent_width"], + ) + x_p1, x_p0 = ( + config["data_in_parallelism"][0], + config["data_in_parallelism"][1], + ) + # check bias quantizer, if not, use weight quantizer + b_width, b_exponent_width = config["bias_width"], config["bias_exponent_width"] + b_p1, b_p0 = ( + config["bias_parallelism"][0], + config["bias_parallelism"][1], + ) + base_quantizer = block_mxint_quant + out_width, out_exponent_width = ( + config["data_out_width"], + config["data_out_exponent_width"], + ) + out_p1, out_p0 = ( + config["data_out_parallelism"][0], + config["data_out_parallelism"][1], + ) + self.out_quantizer = partial( + base_quantizer, + q_config={"width": out_width, "exponent_width": out_exponent_width}, + parallelism=[out_p1, out_p0], + ) + self.w_quantizer = partial( + base_quantizer, + q_config={"width": w_width, "exponent_width": w_exponent_width}, + parallelism=[w_p1, w_p0], + ) + self.x_quantizer = partial( + base_quantizer, + q_config={"width": x_width, "exponent_width": x_exponent_width}, + parallelism=[x_p1, x_p0], + ) + self.b_quantizer = partial( + base_quantizer, + q_config={"width": b_width, "exponent_width": b_exponent_width}, + parallelism=[b_p1, b_p0], + ) + + def forward(self, x: Tensor) -> Tensor: + x, mx, ex = self.x_quantizer(x) + in_x = (mx, ex) + w, mw, ew = self.w_quantizer(self.weight) + in_w = (mw, ew) + if self.bias is not None: + bias, mbias, ebias = self.b_quantizer(self.bias) + in_bias = (mbias, ebias) + else: + bias = None + in_bias = None + + out = wrapped_mxint_linear_hardware( + in_x, in_w, in_bias, 
self.in_features, self.out_features, self.config + ) + + return out + + +def wrapped_mxint_linear_hardware(x, w, bias, in_features, out_features, config): + mx = x[0] + n = mx.reshape(-1, in_features).shape[0] + in_config = { + "x_config": { + "width": config["data_in_width"], + "exponent_width": config["data_in_exponent_width"], + "parallism_dim_0": config["data_in_parallelism"][1], + "parallism_dim_1": config["data_in_parallelism"][0], + "depth_dim_0": in_features // config["data_in_parallelism"][1], + "depth_dim_1": n // config["data_in_parallelism"][0], + "dim_0": in_features, + "dim_1": n, + }, + "w_config": { + "width": config["weight_width"], + "exponent_width": config["weight_exponent_width"], + "parallism_dim_0": config["weight_parallelism"][1], + "parallism_dim_1": config["weight_parallelism"][0], + "depth_dim_0": in_features // config["weight_parallelism"][1], + "depth_dim_1": out_features // config["weight_parallelism"][0], + "dim_0": in_features, + "dim_1": out_features, + }, + "bias_config": { + "width": config["bias_width"], + "exponent_width": config["bias_exponent_width"], + "parallism_dim_0": config["bias_parallelism"][1], + "parallism_dim_1": 1, + "depth_dim_0": out_features // config["bias_parallelism"][1], + "depth_dim_1": 1, + "dim_0": out_features, + "dim_1": 1, + }, + "out_config": { + "width": config["data_out_width"], + "exponent_width": config["data_out_exponent_width"], + "parallism_dim_0": config["data_out_parallelism"][1], + "parallism_dim_1": config["data_out_parallelism"][0], + "depth_dim_0": out_features // config["data_out_parallelism"][1], + "depth_dim_1": n // config["data_out_parallelism"][0], + "dim_0": out_features, + "dim_1": n, + }, + } + mout, eout = mxint_linear_hardware(x, w, bias, in_config) + out_config = in_config["out_config"] + reshaped_mout = mout.reshape( + out_config["depth_dim_1"], + out_config["parallism_dim_1"], + out_config["depth_dim_0"], + out_config["parallism_dim_0"], + ).permute(0, 2, 1, 3) + reshaped_out 
= reshaped_mout * 2 ** ( + eout[:, :, None, None] - config["data_out_width"] + 1 + ) + out = reshaped_out.reshape( + out_config["depth_dim_1"], + out_config["depth_dim_0"], + out_config["parallism_dim_1"], + out_config["parallism_dim_0"], + ).permute(0, 2, 1, 3) + out = out.reshape(out_config["dim_1"], out_config["dim_0"]) + + return out + + +def mxint_linear_hardware(x, w, bias, config): + """ + assume 2 dimensional input + config = { + "x_config":{ + "width": , + "exponent_width" , + "parallism_dim_0", + "parallism_dim_1", + "depth_dim_0", + "depth_dim_1", + "dim_0", + "dim_1", + }, + "w_config": { + ... + }, + "bias_config": { + ... + }, + "out_config": { + ... + }, + } + """ + mx, ex = x + mw, ew = w + x_config = config["x_config"] + w_config = config["w_config"] + out_config = config["out_config"] + from math import ceil, log2 + + def DotProductCore(man_x, exp_x, man_y, exp_y): + return man_x @ man_y.transpose(0, 1), exp_x + exp_y + + def block_wise_reshape_tensor(x, x_config): + reshaped_x = x.reshape( + x_config["depth_dim_1"], + x_config["parallism_dim_1"], + x_config["depth_dim_0"], + x_config["parallism_dim_0"], + ).permute(0, 2, 1, 3) + reshaped_x = reshaped_x.reshape( + x_config["depth_dim_1"] * x_config["depth_dim_0"], + x_config["parallism_dim_1"], + x_config["parallism_dim_0"], + ) + return reshaped_x + + # assume 2 dimensional input + assert ( + x_config["depth_dim_0"] == w_config["depth_dim_0"] + ), "need to check the setting of dim" + assert ( + x_config["parallism_dim_0"] == w_config["parallism_dim_0"] + ), "need to check the setting of dim" + reshaped_ex = ex.reshape(-1) + reshaped_mx = block_wise_reshape_tensor(mx, x_config) + reshaped_ew = ew.reshape(-1) + reshaped_mw = block_wise_reshape_tensor(mw, w_config) + man_out = torch.zeros( + x_config["depth_dim_1"], + w_config["depth_dim_1"], + x_config["parallism_dim_1"] * w_config["parallism_dim_1"], + ) + exp_out = torch.zeros(x_config["depth_dim_1"], w_config["depth_dim_1"]) + for i in 
range(x_config["depth_dim_1"]): + for j in range(w_config["depth_dim_1"]): + partial_man_out = torch.zeros( + w_config["depth_dim_0"], + x_config["parallism_dim_1"], + w_config["parallism_dim_1"], + ) + partial_exp_out = torch.zeros(w_config["depth_dim_0"]) + for k in range(x_config["depth_dim_0"]): + mx_block = reshaped_mx[i * x_config["depth_dim_0"] + k] + ex_block = reshaped_ex[i * x_config["depth_dim_0"] + k] + mw_block = reshaped_mw[j * w_config["depth_dim_0"] + k] + ew_block = reshaped_ew[j * w_config["depth_dim_0"] + k] + partial_man_out[k], partial_exp_out[k] = DotProductCore( + mx_block, ex_block, mw_block, ew_block + ) + acc_man_out, acc_exp_out = MxIntAccumulator( + partial_man_out.reshape(w_config["depth_dim_0"], -1), partial_exp_out + ) + if bias != None: + bias_config = config["bias_config"] + mbias, ebias = bias + reshaped_mbias = mbias.reshape( + w_config["depth_dim_1"], w_config["parallism_dim_1"] + ) + reshaped_ebias = ebias.reshape(w_config["depth_dim_1"]) + shifted_value = ( + reshaped_ebias[j] + - acc_exp_out + + x_config["width"] + + w_config["width"] + - 2 + - (bias_config["width"] - 1) + ) + shifted_bias = reshaped_mbias[j].repeat( + x_config["parallism_dim_1"] + ) * 2 ** (shifted_value) + print(reshaped_mbias[j]) + print(shifted_value) + acc_man_out = shifted_bias + acc_man_out + print("shfited_bias", shifted_bias) + man_out[i][j], exp_out[i][j] = MxIntCast( + acc_man_out, + acc_exp_out, + { + "in_width": x_config["width"] + + w_config["width"] + + ceil(log2(x_config["dim_0"])), + "in_frac_width": x_config["width"] + w_config["width"] - 2, + "in_exponent_width": max( + x_config["exponent_width"], w_config["exponent_width"] + ) + + 1, + "out_width": out_config["width"], + "out_exponent_width": out_config["exponent_width"], + }, + ) + man_out = ( + man_out.reshape( + x_config["depth_dim_1"], + w_config["depth_dim_1"], + x_config["parallism_dim_1"], + w_config["parallism_dim_1"], + ) + .permute(0, 2, 1, 3) + .reshape(x_config["dim_1"], 
w_config["dim_1"]) + ) + return man_out, exp_out + + +def MXIntMatmulHardware(man_x, exp_x, man_y, exp_y, x_config, y_config, out_config): + """ + assume 2 dimensional input + config = { + "width": , + "exponent_width" , + "parallism_dim_0", + "parallism_dim_1", + "depth_dim_0", + "depth_dim_1", + "dim_0", + "dim_1", + } + man.shape = [dim_1 * dim_0] + exp.shape = [depth_dim_1, depth_dim_0] + """ + from math import ceil, log2 + + def MatmulCore(man_x, exp_x, man_y, exp_y): + return man_x @ man_y, exp_x + exp_y + + # assume 2 dimensional input + assert ( + x_config["depth_dim_0"] == y_config["depth_dim_1"] + ), "need to check the setting of dim" + + def block_wise_reshape_tensor(x, x_config): + reshaped_x = x.reshape( + x_config["depth_dim_1"], + x_config["parallism_dim_1"], + x_config["depth_dim_0"], + x_config["parallism_dim_0"], + ).permute(0, 2, 1, 3) + reshaped_x = reshaped_x.reshape( + x_config["depth_dim_1"] * x_config["depth_dim_0"], + x_config["parallism_dim_1"], + x_config["parallism_dim_0"], + ) + return reshaped_x + + reshaped_exp_x = exp_x.reshape(-1) + reshaped_man_x = block_wise_reshape_tensor(man_x, x_config) + reshaped_exp_y = exp_y.reshape(-1) + reshaped_man_y = block_wise_reshape_tensor(man_y, y_config) + man_out = torch.zeros( + x_config["depth_dim_1"], + y_config["depth_dim_0"], + x_config["parallism_dim_1"] * y_config["parallism_dim_0"], + ) + exp_out = torch.zeros(x_config["depth_dim_1"], y_config["depth_dim_0"]) + for i in range(x_config["depth_dim_1"]): + for j in range(y_config["depth_dim_0"]): + partial_man_out = torch.zeros( + y_config["depth_dim_1"], + x_config["parallism_dim_1"], + y_config["parallism_dim_0"], + ) + partial_exp_out = torch.zeros(y_config["depth_dim_1"]) + for k in range(y_config["depth_dim_1"]): + man_x_block = reshaped_man_x[i * x_config["depth_dim_0"] + k] + exp_x_block = reshaped_exp_x[i * x_config["depth_dim_0"] + k] + man_y_block = reshaped_man_y[k * y_config["depth_dim_0"] + j] + exp_y_block = reshaped_exp_y[k * 
y_config["depth_dim_0"] + j] + partial_man_out[k], partial_exp_out[k] = MatmulCore( + man_x_block, exp_x_block, man_y_block, exp_y_block + ) + acc_man_out, acc_exp_out = MxIntAccumulator( + partial_man_out.reshape(y_config["depth_dim_1"], -1), partial_exp_out + ) + man_out[i][j], exp_out[i][j] = MxIntCast( + acc_man_out, + acc_exp_out, + { + "in_width": x_config["width"] + + y_config["width"] + + ceil(log2(x_config["dim_0"])), + "in_frac_width": x_config["width"] + y_config["width"] - 2, + "in_exponent_width": max( + x_config["exponent_width"], y_config["exponent_width"] + ) + + 1, + "out_width": out_config["width"], + "out_exponent_width": out_config["exponent_width"], + }, + ) + man_out = ( + man_out.reshape( + x_config["depth_dim_1"], + y_config["depth_dim_0"], + x_config["parallism_dim_1"], + x_config["parallism_dim_0"], + ) + .permute(0, 2, 1, 3) + .reshape(x_config["dim_1"], y_config["dim_0"]) + ) + return man_out, exp_out + + +def MxIntCast(man_in, exp_in, param): + # In Man Width + max_in = torch.ceil(torch.log2(man_in.abs().max())) + out_width = param["out_width"] + out_exponent_width = param["out_exponent_width"] + in_width = param["in_width"] + in_frac_width = param["in_frac_width"] + in_exponent_width = param["in_exponent_width"] + + out_exponent_max = 2 ** (out_exponent_width - 1) - 1 + out_exponent_min = -(2 ** (out_exponent_width - 1)) + + out_min = -(2 ** (out_width - 1)) + out_max = 2 ** (out_width - 1) - 1 + lma_in = torch.ceil(torch.log2(man_in.abs().max() + 1e-3)) + out_exp_full = lma_in + exp_in - in_frac_width + out_exp = torch.clamp(out_exp_full, out_exponent_min, out_exponent_max) + out_man = man_in // 2 ** (in_frac_width - exp_in + out_exp - (out_width - 1)) + out_man = torch.clamp(out_man, out_min, out_max) + + return out_man, out_exp + + +# def MxIntAccumulator(man, exp, clamp_width = 15): +# IN_DEPTH, BLOCK_SIZE = man.shape[0],man.shape[1] +# min_exp = torch.Tensor([64]) +# mout = torch.zeros(BLOCK_SIZE) +# out_exp = torch.Tensor([64]) 
+# for i in range(IN_DEPTH): +# min_exp = exp[i] if exp[i] max_exp else max_exp + mout = mout // 2 ** (max_exp - out_exp) + out_exp = max_exp + shifted_man = man[i] // 2 ** (max_exp - exp[i]) + mout = mout + shifted_man + + return mout, out_exp + +def quantized_range_reduction(mx, ex, in_man_width, data_out_n_width): + """Vectorized range reduction""" + def hardware_round(mx, ex, in_man_frac_width, data_out_width): + round_max = 2**(data_out_width-1) - 1 + round_min = -2**(data_out_width-1) + round_x = mx.reshape(-1) // 2**((in_man_frac_width-ex).reshape(-1)) + return torch.clamp(round_x, round_min, round_max) + coefficient_quant_block = partial( + mxint_quantize, + width=8, + exponent_width=4) + _, mlog2_e, elog2_e = coefficient_quant_block(torch.log2(torch.tensor(math.e))) + _, mln_2, eln_2 = coefficient_quant_block(torch.log(torch.tensor(2.0))) + n = hardware_round(mx * mlog2_e, ex + elog2_e, (in_man_width - 1 + 7), data_out_n_width) + print(n) + _mx = n * mln_2 + _ex = eln_2 + shifted_mx = mx // 2**(_ex - ex + (in_man_width - 1) - 7) + print(shifted_mx) + print(_ex - ex + (in_man_width - 1) - 7) + mr = shifted_mx - _mx + # return mr as an fixedpoint ?.7 we can make it 2.7 + # return n as an integer number with width = data_out_width + return mr, n + +def fixed_exp(fr): + frac_width = 7 + exp = 1*2**(frac_width) + fr + fr**2//2**(frac_width + 1) + fr**3*5//2**(frac_width + 4) + return exp + + + +def mxint_softmax(x, q_config): + # fixed_r, integer_n + in_man_width = q_config["in_man_width"] + in_exp_width = q_config["in_exp_width"] + data_out_n_width = q_config["data_out_n_width"] + data_out_man_width = q_config["data_out_man_width"] + data_out_frac_width = data_out_man_width - 1 + data_out_exp_width = q_config["data_out_exp_width"] + + shape = x.shape[0] + mout = torch.zeros_like(x) + eout = torch.zeros_like(x) + + list_of_mexps = [] + list_of_eexps = [] + for i in range(shape): + _, mx, ex = mxint_quantize(x[i], in_man_width, in_exp_width) + fixed_r, integer_n 
= quantized_range_reduction(mx, ex, in_man_width, data_out_n_width) + # fixed_r will be 2.7 bits, integer_n will be data_out_n_width bits + mexp = fixed_exp(fixed_r) + eexp = integer_n + # currently we got mexp ?.7 bits, integer_n data_out_n_width bits + list_of_mexps.append(mexp) + list_of_eexps.append(eexp) + eexps = torch.stack(list_of_eexps) + mexps = torch.stack(list_of_mexps) + m_sum, e_sum = MxIntAccumulator(torch.stack(list_of_mexps), torch.stack(list_of_eexps)) + extended_mexps = mexps * 2**(data_out_frac_width) + pre_cast_mout = extended_mexps // mexps + pre_cast_eout = eexps - e_sum + pre_cast_out = pre_cast_mout * 2**(pre_cast_eout - 7) + for i in range(shape): + _, mout[i], eout[i] = mxint_quantize(pre_cast_out[i], data_out_man_width, data_out_exp_width) + return mout, eout + + +def preprocess_weight_tensor_for_mxint(self, tensor, config, parallelism): + from utils import mxint_quantize + + t1, t0 = tensor.shape[0], tensor.shape[1] + p1, p0 = parallelism[0], parallelism[1] + reshaped_tensor = tensor.reshape(t1//p1, p1, t0//p0, p0).permute(0, 2, 1, 3) + reshaped_tensor = reshaped_tensor.reshape(-1, p1,p0) + + tensor_inputs = [] + for i in range(t1 * t0 //(p1*p0)): + etensors = [] + mtensors = [] + for j in range(p1): + (qtensor, mtensor, etensor) = mxint_quantize(reshaped_tensor[i][j], width=config["width"], exponent_width=config["exponent_width"]) + etensors.append(int(etensor)) + mtensors += mtensor.int().tolist() + tensor_inputs.append((mtensors, etensors)) + + return tensor_inputs \ No newline at end of file diff --git a/a_cx_test_files/test.drawio b/a_cx_test_files/test.drawio new file mode 100644 index 000000000..e69de29bb diff --git a/a_cx_test_files/test.tex b/a_cx_test_files/test.tex new file mode 100644 index 000000000..c77d9eba5 --- /dev/null +++ b/a_cx_test_files/test.tex @@ -0,0 +1,40 @@ +\begin{figure*} + \begin{subfigure}[b]{0.3\textwidth} + \begin{algorithmic}[1] \footnotesize + \Require $X$ \Comment{Input features} + \Require $H$ 
\Comment{Number of heads} + \Require $L$ \Comment{Number of hidden layers} + \State $\quant{X_n} \gets \apprx{LayerNorm(\quant{X})} $ + \For{$i \in [0, H)$} + \State $\quant{Q_i} \gets \quant{W_{Q_i}} \apprx{\times} \quant{X_n}$ + \State $\quant{K_i} \gets \quant{W_{K_i}} \apprx{\times} \quant{X_n}$ + \State $\quant{V_i} \gets \quant{W_{V_i}} \apprx{\times} \quant{X_n}$ + \State $\quant{A_i} \gets \frac{\quant{Q_i} \apprx{\times} \quant{K_i}^T}{\sqrt{d_k}} $ + \State $\quant{\hat{A}_i} \gets \apprx{softmax(\quant{A_i})} $ + \State $\quant{B_i} \gets \quant{\hat{A}_i} \apprx{\times} \quant{V_i}$ + \EndFor + \State $\quant{B_c} \gets \apprx{concat(\quant{B_0}.. \quant{B_{H-1}})} $ + \State $\quant{B_o} \gets \quant{W_0} \apprx{\times} \quant{B_c}$ + \State $\quant{B_n} \gets \apprx{LayerNorm(\quant{B_o} + \quant{X_n})} $ + \State $\quant{U} \gets \quant{W_U} \apprx{\times} \quant{B_n}$ + \State $\quant{D} \gets \quant{W_D} (\apprx{GELU(\quant{U})})$ + \State $\quant{O} \gets \quant{D} + \quant{B_n}$ + \State \Return $\quant{O}$ + \end{algorithmic} + \caption{An algorithm view of a block in the ViT model. + Values highlighted in \quant{\em blue} represent quantized values, and operations highlighted in \apprx{green} represent approximated operations.} + \label{fig:motivation} + \end{subfigure} + \hfill + % \begin{subfigure}[b]{0.01\textwidth} + % ~ + % \end{subfigure} + \begin{subfigure}[b]{0.6\textwidth} + \caption{An architecture view of the proposed hardware accelerator. 
+ The proposed architecture pipelines the model in a hierarchical dataflow, and tailors each operation for high area efficiency.} + \label{fig:motivation} + \end{subfigure} + \caption{An overview of the proposed accelerator architecture.} + \label{fig:motivation} + \end{figure*} + \ No newline at end of file diff --git a/justfile b/justfile index 74d5ceee6..fe290e063 100644 --- a/justfile +++ b/justfile @@ -22,7 +22,7 @@ test-hw: # python3 src/mase_components/activation_layers/test/fixed_sigmoid_tb.py python3 src/mase_components/activation_layers/test/fixed_softermax_1d_tb.py # python3 src/mase_components/activation_layers/test/fixed_softermax_tb.py - # python3 src/mase_components/activation_layers/test/fixed_softmax_tb.py + python3 src/mase_components/activation_layers/test/fixed_softmax_tb.py python3 src/mase_components/activation_layers/test/fixed_softplus_tb.py python3 src/mase_components/activation_layers/test/fixed_softsign_tb.py python3 src/mase_components/activation_layers/test/fixed_tanh_tb.py @@ -111,6 +111,7 @@ test-hw: python3 src/mase_components/linear_layers/mxint_operators/test/mxint_matmul_tb.py python3 src/mase_components/linear_layers/mxint_operators/test/mxint_linear_tb.py python3 src/mase_components/linear_layers/mxint_operators/test/mxint_accumulator_tb.py + python3 src/mase_components/linear_layers/mxint_operators/test/mxint_softmax.py # Memory python3 src/mase_components/memory/test/fifo_tb.py # python3 src/mase_components/memory/test/input_buffer_tb.py @@ -143,6 +144,10 @@ test-hw: # python3 src/mase_components/transformer_layers/test/fixed_self_attention_tb.py # python3 src/mase_components/transformer_layers/test/test_lint_attention.py + # ViT layers + python3 src/mase_components/vision_models/test/fixed_self_attention_head_tb.py + + reformat: # format python files black src/chop diff --git a/src/chop/actions/simulate.py b/src/chop/actions/simulate.py index e56a512d8..56d5bf56e 100644 --- a/src/chop/actions/simulate.py +++ 
b/src/chop/actions/simulate.py @@ -38,11 +38,16 @@ def simulate( gui: bool = False, waves: bool = False, simulator: str = "verilator", + pass_args = {}, ): SIM = getenv("SIM", simulator) runner = get_runner(SIM) - project_dir = Path.home() / ".mase" / "top" + project_dir = ( + pass_args["project_dir"] + if "project_dir" in pass_args.keys() + else Path.home() / ".mase" / "top" + ) if run_emit: emit(model, model_info, task, dataset_info, data_module, load_name, load_type) @@ -64,6 +69,8 @@ def simulate( "--trace-structs", "--trace-depth", str(trace_depth), + "--unroll-count", + "16384" ] else: raise ValueError(f"Unrecognized simulator: {simulator}") diff --git a/src/chop/ir/graph/mase_graph.py b/src/chop/ir/graph/mase_graph.py index fc2fb9fd9..c5f5d2c1f 100644 --- a/src/chop/ir/graph/mase_graph.py +++ b/src/chop/ir/graph/mase_graph.py @@ -187,6 +187,8 @@ def is_leaf_module( custom_leaf_layers = () # quantized functions/layers custom_leaf_functions += tuple(quantized_func_map.values()) + if custom_ops != None: + custom_leaf_layers += tuple(custom_ops.get("modules", {}).keys()) custom_leaf_layers += tuple(quantized_module_map.values()) # patched functions/layers patched_nodes = getattr(model, "patched_nodes", None) diff --git a/src/chop/models/vision/vit/__init__.py b/src/chop/models/vision/vit/__init__.py new file mode 100644 index 000000000..03a3168cc --- /dev/null +++ b/src/chop/models/vision/vit/__init__.py @@ -0,0 +1 @@ +from .vit import get_vit_tiny_patch16, get_vit_base_patch16 diff --git a/src/chop/models/vision/vit/utils.py b/src/chop/models/vision/vit/utils.py new file mode 100644 index 000000000..ed7c23fa6 --- /dev/null +++ b/src/chop/models/vision/vit/utils.py @@ -0,0 +1,199 @@ +# Copyright (c) MEGVII Inc. and its affiliates. All Rights Reserved. 
+import math +import os + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + + +@torch.no_grad() +def load_weights_from_npz(model, url, check_hash=False, progress=False, prefix=""): + """Load weights from .npz checkpoints for official Google Brain Flax implementation""" + + def _n2p(w, t=True): + if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1: + w = w.flatten() + if t: + if w.ndim == 4: + w = w.transpose([3, 2, 0, 1]) + elif w.ndim == 3: + w = w.transpose([2, 0, 1]) + elif w.ndim == 2: + w = w.transpose([1, 0]) + return torch.from_numpy(w) + + def _get_cache_dir(child_dir=""): + """ + Returns the location of the directory where models are cached (and creates it if necessary). + """ + hub_dir = torch.hub.get_dir() + child_dir = () if not child_dir else (child_dir,) + model_dir = os.path.join(hub_dir, "checkpoints", *child_dir) + os.makedirs(model_dir, exist_ok=True) + return model_dir + + def _download_cached_file(url, check_hash=True, progress=False): + parts = torch.hub.urlparse(url) + filename = os.path.basename(parts.path) + cached_file = os.path.join(_get_cache_dir(), filename) + if not os.path.exists(cached_file): + hash_prefix = None + if check_hash: + r = torch.hub.HASH_REGEX.search(filename) # r is Optional[Match[str]] + hash_prefix = r.group(1) if r else None + torch.hub.download_url_to_file( + url, cached_file, hash_prefix, progress=progress + ) + return cached_file + + def adapt_input_conv(in_chans, conv_weight): + conv_type = conv_weight.dtype + # Some weights are in torch.half, ensure it's float for sum on CPU + conv_weight = conv_weight.float() + O, I, J, K = conv_weight.shape + if in_chans == 1: + if I > 3: + assert conv_weight.shape[1] % 3 == 0 + # For models with space2depth stems + conv_weight = conv_weight.reshape(O, I // 3, 3, J, K) + conv_weight = conv_weight.sum(dim=2, keepdim=False) + else: + conv_weight = conv_weight.sum(dim=1, keepdim=True) + elif in_chans != 3: + if I != 3: + raise 
NotImplementedError("Weight format not supported by conversion.") + else: + # NOTE this strategy should be better than random init, but there could be other combinations of + # the original RGB input layer weights that'd work better for specific cases. + repeat = int(math.ceil(in_chans / 3)) + conv_weight = conv_weight.repeat(1, repeat, 1, 1)[:, :in_chans, :, :] + conv_weight *= 3 / float(in_chans) + conv_weight = conv_weight.to(conv_type) + return conv_weight + + def resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()): + # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + ntok_new = posemb_new.shape[1] + if num_tokens: + posemb_tok, posemb_grid = posemb[:, :num_tokens], posemb[0, num_tokens:] + ntok_new -= num_tokens + else: + posemb_tok, posemb_grid = posemb[:, :0], posemb[0] + gs_old = int(math.sqrt(len(posemb_grid))) + if not len(gs_new): # backwards compatibility + gs_new = [int(math.sqrt(ntok_new))] * 2 + assert len(gs_new) >= 2 + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate( + posemb_grid, size=gs_new, mode="bicubic", align_corners=False + ) + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape( + 1, gs_new[0] * gs_new[1], -1 + ) + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + return posemb + + cached_file = _download_cached_file(url, check_hash=check_hash, progress=progress) + + w = np.load(cached_file) + if not prefix and "opt/target/embedding/kernel" in w: + prefix = "opt/target/" + + if hasattr(model.patch_embed, "backbone"): + # hybrid + backbone = model.patch_embed.backbone + stem_only = not hasattr(backbone, "stem") + stem = backbone if stem_only else backbone.stem + stem.conv.weight.copy_( + adapt_input_conv( + stem.conv.weight.shape[1], _n2p(w[f"{prefix}conv_root/kernel"]) + ) + ) + 
stem.norm.weight.copy_(_n2p(w[f"{prefix}gn_root/scale"])) + stem.norm.bias.copy_(_n2p(w[f"{prefix}gn_root/bias"])) + if not stem_only: + for i, stage in enumerate(backbone.stages): + for j, block in enumerate(stage.blocks): + bp = f"{prefix}block{i + 1}/unit{j + 1}/" + for r in range(3): + getattr(block, f"conv{r + 1}").weight.copy_( + _n2p(w[f"{bp}conv{r + 1}/kernel"]) + ) + getattr(block, f"norm{r + 1}").weight.copy_( + _n2p(w[f"{bp}gn{r + 1}/scale"]) + ) + getattr(block, f"norm{r + 1}").bias.copy_( + _n2p(w[f"{bp}gn{r + 1}/bias"]) + ) + if block.downsample is not None: + block.downsample.conv.weight.copy_( + _n2p(w[f"{bp}conv_proj/kernel"]) + ) + block.downsample.norm.weight.copy_( + _n2p(w[f"{bp}gn_proj/scale"]) + ) + block.downsample.norm.bias.copy_(_n2p(w[f"{bp}gn_proj/bias"])) + embed_conv_w = _n2p(w[f"{prefix}embedding/kernel"]) + else: + embed_conv_w = adapt_input_conv( + model.patch_embed.proj.weight.shape[1], _n2p(w[f"{prefix}embedding/kernel"]) + ) + model.patch_embed.proj.weight.copy_(embed_conv_w) + model.patch_embed.proj.bias.copy_(_n2p(w[f"{prefix}embedding/bias"])) + model.cls_token.copy_(_n2p(w[f"{prefix}cls"], t=False)) + pos_embed_w = _n2p(w[f"{prefix}Transformer/posembed_input/pos_embedding"], t=False) + if pos_embed_w.shape != model.pos_embed.shape: + pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights + pos_embed_w, + model.pos_embed, + getattr(model, "num_tokens", 1), + model.patch_embed.grid_size, + ) + model.pos_embed.copy_(pos_embed_w) + model.norm.weight.copy_(_n2p(w[f"{prefix}Transformer/encoder_norm/scale"])) + model.norm.bias.copy_(_n2p(w[f"{prefix}Transformer/encoder_norm/bias"])) + if ( + isinstance(model.head, nn.Linear) + and model.head.bias.shape[0] == w[f"{prefix}head/bias"].shape[-1] + ): + model.head.weight.copy_(_n2p(w[f"{prefix}head/kernel"])) + model.head.bias.copy_(_n2p(w[f"{prefix}head/bias"])) + # if isinstance(getattr(model.pre_logits, 'fc', None), + # nn.Linear) and 
f'{prefix}pre_logits/bias' in w: + # model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel'])) + # model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias'])) + for i, block in enumerate(model.blocks.children()): + block_prefix = f"{prefix}Transformer/encoderblock_{i}/" + mha_prefix = block_prefix + "MultiHeadDotProductAttention_1/" + block.norm1.weight.copy_(_n2p(w[f"{block_prefix}LayerNorm_0/scale"])) + block.norm1.bias.copy_(_n2p(w[f"{block_prefix}LayerNorm_0/bias"])) + block.attn.qkv.weight.copy_( + torch.cat( + [ + _n2p(w[f"{mha_prefix}{n}/kernel"], t=False).flatten(1).T + for n in ("query", "key", "value") + ] + ) + ) + block.attn.qkv.bias.copy_( + torch.cat( + [ + _n2p(w[f"{mha_prefix}{n}/bias"], t=False).reshape(-1) + for n in ("query", "key", "value") + ] + ) + ) + block.attn.proj.weight.copy_(_n2p(w[f"{mha_prefix}out/kernel"]).flatten(1)) + block.attn.proj.bias.copy_(_n2p(w[f"{mha_prefix}out/bias"])) + for r in range(2): + getattr(block.mlp, f"fc{r + 1}").weight.copy_( + _n2p(w[f"{block_prefix}MlpBlock_3/Dense_{r}/kernel"]) + ) + getattr(block.mlp, f"fc{r + 1}").bias.copy_( + _n2p(w[f"{block_prefix}MlpBlock_3/Dense_{r}/bias"]) + ) + block.norm2.weight.copy_(_n2p(w[f"{block_prefix}LayerNorm_2/scale"])) + block.norm2.bias.copy_(_n2p(w[f"{block_prefix}LayerNorm_2/bias"])) diff --git a/src/chop/models/vision/vit/vit.py b/src/chop/models/vision/vit/vit.py new file mode 100644 index 000000000..23869a8b6 --- /dev/null +++ b/src/chop/models/vision/vit/vit.py @@ -0,0 +1,394 @@ +import torch +import torch.nn as nn +from functools import partial +from logging import getLogger +from timm.layers import ( + get_act_layer, + get_norm_layer, + LayerType, + DropPath, + to_2tuple, + trunc_normal_, +) +from timm.models._hub import load_state_dict_from_hf +import numpy as np +from .utils import load_weights_from_npz + +logger = getLogger(__name__) + +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal + + +from 
typing import ( + Any, + Callable, + Dict, + Optional, + Set, + Tuple, + Type, + Union, + List, +) + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding""" + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + # assert img_size[0] % patch_size[0] == 0 and img_size[1] % patch_size[1] == 0, \ + # f"img_size {img_size} should be divided by patch_size {patch_size}." + self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + B, C, H, W = x.shape + + x = self.proj(x).flatten(2).transpose(1, 2) + x = self.norm(x) + H, W = H // self.patch_size[0], W // self.patch_size[1] + + return x, (H, W) + + +class Mlp(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + # self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + x = self.fc2(x) + # x = self.drop(x) + return x + + +class Attention(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = 
torch.tensor(self.head_dim**-0.5) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, self.head_dim) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] + print("q", q) + q, k = self.q_norm(q), self.k_norm(k) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + x = attn @ v + + x = x.transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_drop: float = 0.0, + attn_drop: float = 0.0, + drop_path: float = 0.0, + act_layer: nn.Module = nn.GELU, + norm_layer: nn.Module = nn.LayerNorm, + mlp_layer: nn.Module = Mlp, + ) -> None: + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=proj_drop, + ) + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = mlp_layer( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + act_layer=act_layer, + drop=proj_drop, + ) + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.drop_path1(self.attn(self.norm1(x))) + x = x + self.drop_path2(self.mlp(self.norm2(x))) + return x + + +class VisionTransformer(nn.Module): + """Vision Transformer + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition 
at Scale` + - https://arxiv.org/abs/2010.11929 + """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + num_classes: int = 1000, + global_pool: Literal["", "avg", "token"] = "token", + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + class_token: bool = True, + fc_norm: Optional[bool] = None, + drop_rate: float = 0.0, + pos_drop_rate: float = 0.0, + proj_drop_rate: float = 0.0, + attn_drop_rate: float = 0.0, + drop_path_rate: float = 0.0, + embed_layer: Callable = PatchEmbed, + norm_layer: Optional[LayerType] = None, + act_layer: Optional[LayerType] = None, + ) -> None: + """ + Args: + img_size: Input image size. + patch_size: Patch size. + in_chans: Number of image input channels. + num_classes: Mumber of classes for classification head. + global_pool: Type of global pooling for final sequence (default: 'token'). + embed_dim: Transformer embedding dimension. + depth: Depth of transformer. + num_heads: Number of attention heads. + mlp_ratio: Ratio of mlp hidden dim to embedding dim. + qkv_bias: Enable bias for qkv projections if True. + init_values: Layer-scale init values (layer-scale enabled if not None). + class_token: Use class token. + no_embed_class: Don't include position embeddings for class (or reg) tokens. + reg_tokens: Number of register tokens. + fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'. + drop_rate: Head dropout rate. + pos_drop_rate: Position embedding dropout rate. + attn_drop_rate: Attention dropout rate. + drop_path_rate: Stochastic depth rate. + embed_layer: Patch embedding layer. + norm_layer: Normalization layer. + act_layer: MLP activation layer. + block_fn: Transformer block layer. 
+ """ + super().__init__() + assert global_pool in ("", "avg", "token") + assert class_token or global_pool != "token" + use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm + norm_layer = get_norm_layer(norm_layer) or partial(nn.LayerNorm, eps=1e-6) + act_layer = get_act_layer(act_layer) or nn.GELU + + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = self.embed_dim = ( + embed_dim # num_features for consistency with other models + ) + self.num_prefix_tokens = 1 if class_token else 0 + self.has_class_token = class_token + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + patch_norm=False, + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = ( + nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None + ) + num_patches = num_patches + 1 if class_token else num_patches + self.pos_embed = nn.Parameter(torch.randn(1, num_patches, embed_dim) * 0.02) + self.pos_drop = nn.Dropout(p=pos_drop_rate) + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.Sequential( + *[ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_drop=proj_drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ) + for i in range(depth) + ] + ) + self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity() + + # Classifier Head + self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity() + self.head_drop = nn.Dropout(drop_rate) + self.head = ( + nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) + + def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: + pos_embed = self.pos_embed + x = torch.cat([self.cls_token.expand(x.shape[0], -1, -1), x], dim=1) + x = x + pos_embed + + return self.pos_drop(x) + + def forward_features(self, 
x: torch.Tensor) -> torch.Tensor: + x, _ = self.patch_embed(x) + x = self._pos_embed(x) + x = self.blocks(x) + x = self.norm(x) + return x + + def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor: + if self.global_pool == "avg": + x = x[:, self.num_prefix_tokens :].mean(dim=1) + elif self.global_pool: + x = x[:, 0] # class token + x = self.fc_norm(x) + x = self.head_drop(x) + return x if pre_logits else self.head(x) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def load_pretrained(pretrained, num_classes, model, url, model_name): + if pretrained: + checkpoint = np.load( + "/root/work/mase-tools/machop/.machop_cache/.machop_cache.ff0e4f759408437a93630130d36308e8.partial" + ) + model.load_state_dict(checkpoint, strict=False) + logger.info("Pretrained weights loaded into {}".format(model_name)) + else: + logger.info("{} randomly initialized".format(model_name)) + + +def get_vit_tiny_patch16(info, pretrained=False, **kwargs): + """ViT-Tiny (Vit-Ti/16)""" + num_classes = info.num_classes + img_size = info.image_size[-1] + model = VisionTransformer( + img_size=img_size, + num_classes=num_classes, + patch_size=16, + embed_dim=192, + num_heads=3, + depth=12, + **kwargs, + ) + if img_size == 224: + url = "https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz" + elif img_size == 384: + url = "https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz" + else: + pretrained = False + logger.warning("this image_size is not supported rightnow") + + load_weights_from_npz(model, url, check_hash=True) + return model + + +def get_vit_base_patch16(info, pretrained=False, **kwargs): + """ViT-Base (Vit-B/16)""" + num_classes = info.num_classes + img_size = info.image_size[-1] + model = 
VisionTransformer( + img_size=img_size, + num_classes=num_classes, + patch_size=16, + embed_dim=768, + num_heads=12, + depth=12, + **kwargs, + ) + if img_size == 224: + pre_trained_loc = "timm/vit_base_patch16_224.augreg2_in21k_ft_in1k" + elif img_size == 384: + url = "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_384-83fb41ba.pth" + else: + logger.warning("this image_size is not supported rightnow") + pretrained = False + + if pretrained: + checkpoint = load_state_dict_from_hf(pre_trained_loc) + if num_classes != 1000: + _ = checkpoint.pop("head.weight") + _ = checkpoint.pop("head.bias") + logger.warning( + f"num_classes (={num_classes}) != 1000. The last classifier layer (head) is randomly initialized" + ) + model.load_state_dict(checkpoint, strict=False) + logger.info("Pretrained weights loaded into vit_base_patch16") + else: + logger.info("vit_base_patch16 randomly initialized") + + return model diff --git a/src/chop/nn/quantized/__init__.py b/src/chop/nn/quantized/__init__.py index f9f1389e7..b9dabd9ee 100644 --- a/src/chop/nn/quantized/__init__.py +++ b/src/chop/nn/quantized/__init__.py @@ -2,6 +2,8 @@ quantized_module_map, BertSelfAttentionInteger, BertSelfAttentionHeadInteger, + ViTSelfAttentionHeadInteger, + ViTAttentionInteger, LinearInteger, LayerNormInteger, GELUInteger, diff --git a/src/chop/nn/quantized/functional/__init__.py b/src/chop/nn/quantized/functional/__init__.py index 65ccb1915..8f215ab67 100644 --- a/src/chop/nn/quantized/functional/__init__.py +++ b/src/chop/nn/quantized/functional/__init__.py @@ -1,10 +1,12 @@ from .softermax import fixed_softermax - +from .softmax import softmax_integer +from .layer_norm import IntLayerNormFunc, _int_layer_norm from .add import ( add_block_fp, add_block_log, add_block_minifloat, add_integer, + add_integer_floor, add_log, add_minifloat_denorm, add_minifloat_ieee, @@ -138,6 +140,7 @@ quantized_func_map = { "add_block_minifloat": add_block_minifloat, 
"add_integer": add_integer, + "add_integer_floor": add_integer_floor, "add_fixed": add_integer, "add_log": add_log, "add_minifloat_ieee": add_minifloat_ieee, diff --git a/src/chop/nn/quantized/functional/add.py b/src/chop/nn/quantized/functional/add.py index 051c2b260..98ae5f935 100644 --- a/src/chop/nn/quantized/functional/add.py +++ b/src/chop/nn/quantized/functional/add.py @@ -7,6 +7,7 @@ block_log_quantizer, block_minifloat_quantizer, integer_quantizer, + integer_floor_quantizer, log_quantizer, minifloat_denorm_quantizer, minifloat_ieee_quantizer, @@ -28,6 +29,21 @@ def add_integer(x, y, config): return x + y +def add_integer_floor(x, y, config): + bypass = config.get("bypass", False) + if bypass: + return x + y + else: + # establish quantizers + x_width, x_frac_width = config["data_in_width"], config["data_in_frac_width"] + x_quantizer = partial( + integer_floor_quantizer, width=x_width, frac_width=x_frac_width + ) + x = x_quantizer(x) + y = x_quantizer(y) + return x + y + + def add_binary(x, y, config): bypass = config.get("bypass", False) if bypass: diff --git a/src/chop/nn/quantized/functional/layer_norm.py b/src/chop/nn/quantized/functional/layer_norm.py new file mode 100644 index 000000000..1ca105a33 --- /dev/null +++ b/src/chop/nn/quantized/functional/layer_norm.py @@ -0,0 +1,141 @@ +from torch import nn +import torch + +from chop.nn.quantizers import integer_floor_quantizer +from math import ceil, log2 + + +def _int_layer_norm( + x: torch.Tensor, + normalized_shape: tuple or int, + weight=None, + bias=None, + eps=1e-5, + q_config={}, +): + def quantize(x, width, frac_width, by_pass=False): + if not by_pass: + x = integer_floor_quantizer(x, width, frac_width) + return x + + def get_dim_and_prodofdim(x, normalized_shape): + dim = tuple(range(0 - len(normalized_shape), 0)) + num_vals = 1 + for items in dim: + num_vals *= x.shape[items] + return dim, num_vals + + def isqrt(x: torch.Tensor): + x = (x + eps).sqrt() + x = x.reciprocal() + return x + + if 
isinstance(normalized_shape, int): + normalized_shape = (normalized_shape,) + dim, num_vals = get_dim_and_prodofdim(x, normalized_shape) + x = quantize( + x, + q_config.get("data_in_width"), + q_config.get("data_in_frac_width"), + q_config.get("by_pass"), + ) + acc_out_width = ceil(log2(num_vals)) + q_config.get("data_in_width") + inv_num_vals_quant_0 = quantize( + torch.tensor(1 / num_vals), acc_out_width + 2, acc_out_width + ) + # Mean calculation + mu_acc = x.sum(dim, keepdim=True) + mu = mu_acc * inv_num_vals_quant_0 + mu = quantize( + mu, + q_config.get("data_in_width"), + q_config.get("data_in_frac_width"), + q_config.get("by_pass"), + ) + print("mu", mu * 2 ** q_config.get("data_in_frac_width")) + # I hope the output precision here should be $clog2 + # Variance calculation + diff = x - mu + + squares = diff**2 + sum_squares = torch.sum(squares, dim, keepdim=True) + squares_adder_tree_width = 2 * q_config.get("data_in_width") + ceil(log2(num_vals)) + inv_num_vals_quant_1 = quantize( + torch.tensor(1 / num_vals), + squares_adder_tree_width + 2, + squares_adder_tree_width, + ) + var = sum_squares * inv_num_vals_quant_1 + var = quantize( + var, + q_config.get("isqrt_in_width"), + q_config.get("isqrt_in_frac_width"), + q_config.get("by_pass"), + ) + + inv_sqrt = isqrt(var) + inv_sqrt = quantize( + inv_sqrt, + q_config.get("isqrt_out_width"), + q_config.get("isqrt_out_frac_width"), + q_config.get("by_pass"), + ) + + # Norm calculation + norm_out = diff * inv_sqrt + + norm_out = quantize( + norm_out, + q_config.get("data_out_width"), + q_config.get("data_out_frac_width"), + q_config.get("by_pass"), + ) + if weight is not None: + qweight = quantize( + weight, + q_config.get("weight_width"), + q_config.get("weight_frac_width"), + q_config.get("by_pass"), + ) + norm_out = norm_out * qweight + if bias is not None: + qbias = quantize( + bias, + q_config.get("bias_width"), + q_config.get("bias_frac_width"), + q_config.get("by_pass"), + ) + norm_out = norm_out + qbias + 
norm_out = quantize( + norm_out, + q_config.get("data_out_width"), + q_config.get("data_out_frac_width"), + q_config.get("by_pass"), + ) + return norm_out + + +class IntLayerNormFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, input: torch.Tensor, normalized_shape, weight, bias, eps, config, bypass + ): + with torch.enable_grad(): + layernormed = nn.functional.layer_norm( + input, normalized_shape, weight, bias, eps + ) + ctx.save_for_backward(input, layernormed) + output = ( + _int_layer_norm(input, normalized_shape, weight, bias, eps, config) + if not bypass + else layernormed + ) + return output + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + input, layernormed = ctx.saved_tensors + (grad_input,) = torch.autograd.grad( + layernormed, input, grad_outputs=grad_output + ) + return grad_input, None, None, None, None, None, None diff --git a/src/chop/nn/quantized/functional/matmul.py b/src/chop/nn/quantized/functional/matmul.py index d06eb1ece..8bf4c2300 100644 --- a/src/chop/nn/quantized/functional/matmul.py +++ b/src/chop/nn/quantized/functional/matmul.py @@ -28,7 +28,7 @@ def generic_matmul_integer(x, y, config, style="matmul", out_config=None, floor= if bypass: return matmul(x, y) else: - base_quantizer = integer_quantizer + base_quantizer = integer_floor_quantizer if floor else integer_quantizer x_width, x_frac_width = config["data_in_width"], config["data_in_frac_width"] y_width, y_frac_width = config["weight_width"], config["weight_frac_width"] diff --git a/src/chop/nn/quantized/functional/softmax.py b/src/chop/nn/quantized/functional/softmax.py new file mode 100644 index 000000000..f0a9d57c4 --- /dev/null +++ b/src/chop/nn/quantized/functional/softmax.py @@ -0,0 +1,34 @@ +from torch import nn +import torch + +from chop.nn.quantizers import integer_quantizer, integer_floor_quantizer +from math import ceil, log2 + + +def softmax_integer(x: torch.Tensor, dim: int, config: dict, floor=False): + """ + This function defines 
the calculation process of hashsoftmax + Exp result is get from a hash table + All the data in this function will be quantized to fixed-point + """ + base_quantizer = integer_floor_quantizer if floor else integer_quantizer + if config["mult_data"] != None: + mult = config["mult_data"] + else: + mult = 1 + quant_x = base_quantizer(x, config["data_in_width"], config["data_in_frac_width"]) + print("quant_x = ", quant_x * 2 ** config["data_in_frac_width"]) + exp_x = (quant_x * mult).exp() + quant_exp = base_quantizer( + exp_x, config["data_in_exp_width"], config["data_in_exp_frac_width"] + ) + print("quant_exp = ", quant_exp * 2 ** config["data_in_exp_frac_width"]) + exp_sum = quant_exp.sum(dim=dim, keepdim=True) + + shift_width = config["data_out_frac_width"] + if torch.all(quant_exp == exp_sum): + out = torch.tensor(1.0, device=x.device).expand(x.shape) + else: + out = quant_exp * (2 ** (shift_width)) // exp_sum + out = out / (2 ** (shift_width)) + return out diff --git a/src/chop/nn/quantized/modules/__init__.py b/src/chop/nn/quantized/modules/__init__.py index 4219d6da9..1b07f192e 100644 --- a/src/chop/nn/quantized/modules/__init__.py +++ b/src/chop/nn/quantized/modules/__init__.py @@ -1,5 +1,5 @@ -from .attention_head import BertSelfAttentionHeadInteger -from .attention import BertSelfAttentionInteger +from .attention_head import BertSelfAttentionHeadInteger, ViTSelfAttentionHeadInteger +from .attention import BertSelfAttentionInteger, ViTAttentionInteger # from .add import AddInteger from .conv1d import ( @@ -32,6 +32,7 @@ LinearBlockFP, LinearBlockMinifloat, LinearInteger, + LinearIntegerFloor, LinearLog, LinearBlockLog, LinearMinifloatDenorm, @@ -43,6 +44,7 @@ LinearLUT, LinearLogicNets, LinearMXIntHardware, + # LinearMxInt, ) from .pool2d import ( AdaptiveAvgPool2dInteger, @@ -67,6 +69,7 @@ ) from .layer_norm import ( LayerNormInteger, + LayerNormIntegerFloor, ) from .group_norm import GroupNormInteger from .instance_norm2d import InstanceNorm2dInteger @@ 
-113,6 +116,7 @@
     GELUBlockFP,
     GELUBlockMinifloat,
     GELUInteger,
+    GELUIntegerFloor,
     GELULog,
     GELUBlockLog,
     GELUMinifloatDenorm,
@@ -151,6 +155,8 @@
     GroupedQueryAttentionInteger,
 )

+# from mase_components.linear_layers.mxint_operators.test.utils import MXIntLinearHardware
+
 quantized_module_map = {
     "conv1d_block_minifloat": Conv1dBlockMinifloat,
     "conv1d_integer": Conv1dInteger,
@@ -176,6 +182,7 @@
     "linear_block_minifloat": LinearBlockMinifloat,
     "linear_integer": LinearInteger,
     "linear_fixed": LinearInteger,
+    "linear_integer_floor": LinearIntegerFloor,
     "linear_log": LinearLog,
     "linear_mxint_hardware": LinearMXIntHardware,
     "linear_block_log": LinearBlockLog,
@@ -204,6 +211,7 @@
     "batch_norm2d_integer": BatchNorm2dInteger,
     "batch_norm2d_binary": BatchNorm2dBinary,
     "layer_norm_integer": LayerNormInteger,
+    "layer_norm_integer_floor": LayerNormIntegerFloor,
     "group_norm_integer": GroupNormInteger,
     "instance_norm2d_integer": InstanceNorm2dInteger,
     "rms_norm_integer": RMSNormInteger,
@@ -240,6 +248,7 @@
     "gelu_block_minifloat": GELUBlockMinifloat,
     "gelu_integer": GELUInteger,
     "gelu_fixed": GELUInteger,
+    "gelu_integer_floor": GELUIntegerFloor,
     "gelu_log": GELULog,
     "gelu_block_log": GELUBlockLog,
     "gelu_minifloat_ieee": GELUMinifloatIEEE,
@@ -271,5 +280,7 @@
     "batch_norm1d_linear": BatchNorm1dInteger,
     "bert_self_attention_head_integer": BertSelfAttentionHeadInteger,
     "bert_self_attention_integer": BertSelfAttentionInteger,
+    "vit_self_attention_head_integer": ViTSelfAttentionHeadInteger,
+    "vit_self_attention_integer": ViTAttentionInteger,
     "grouped_query_attention_integer": GroupedQueryAttentionInteger,
 }
diff --git a/src/chop/nn/quantized/modules/attention.py b/src/chop/nn/quantized/modules/attention.py
index 45819db75..2fda6273b 100644
--- a/src/chop/nn/quantized/modules/attention.py
+++ b/src/chop/nn/quantized/modules/attention.py
@@ -1,18 +1,21 @@
 from functools import partial

 import torch
+import torch.nn as nn
 from torch import Tensor
 from torch.nn import functional as F
from transformers.models.bert.modeling_bert import BertSelfAttention +from .attention_head import _ViTSelfAttentionHeadBase, ViTSelfAttentionHeadInteger from chop.nn.quantized.modules.linear import ( LinearInteger, ) from chop.nn.quantized.functional import fixed_softermax +from chop.nn.quantizers import integer_quantizer from chop.nn.quantized.functional import matmul_integer -from typing import Optional, Tuple +from typing import Optional, Tuple, Union class _BertSelfAttentionBase(BertSelfAttention): @@ -56,6 +59,87 @@ def forward( return out +class _ViTAttentionBase(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.query = nn.Linear(dim, dim, bias=qkv_bias) + self.key = nn.Linear(dim, dim, bias=qkv_bias) + self.value = nn.Linear(dim, dim, bias=qkv_bias) + self.self_attention = _ViTSelfAttentionHeadBase( + dim=self.head_dim, num_heads=num_heads, attn_drop=attn_drop + ) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + + def _tensor_reshape(x): + return x.reshape(B, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3) + + q, k, v = ( + _tensor_reshape(self.query(x)), + _tensor_reshape(self.key(x)), + _tensor_reshape(self.value(x)), + ) + x = self.self_attention(q, k, v) + x = x.transpose(1, 2).reshape(B, N, C) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class _ViTAttentionBase_before(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by 
num_heads" + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.self_attention = _ViTSelfAttentionHeadBase( + dim=self.head_dim, num_heads=num_heads, attn_drop=attn_drop + ) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, self.head_dim) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] + + x = self.self_attention(q, k, v) + + x = x.transpose(1, 2).reshape(B, N, C) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + class BertSelfAttentionInteger(_BertSelfAttentionBase): def __init__( self, @@ -105,11 +189,120 @@ def __init__( self.matmul = partial( matmul_integer, config={ - "data_in_width": self.q_config["data_out_width"], - "data_in_frac_width": self.q_config["data_out_frac_width"], - "weight_width": self.q_config["data_out_width"], - "weight_frac_width": self.q_config["data_out_frac_width"], + "data_in_width": self.q_config["data_in_width"], + "data_in_frac_width": self.q_config["data_in_frac_width"], + "weight_width": self.q_config["weight_width"], + "weight_frac_width": self.q_config["weight_frac_width"], }, out_config=out_q_config, floor=floor, ) + + +class ViTAttentionInteger(_ViTAttentionBase): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + q_config: dict = None, + floor=True, + ) -> None: + super().__init__(dim, num_heads, qkv_bias, qk_norm, attn_drop, proj_drop) + self.q_config = q_config + self.query = LinearInteger( + dim, + dim, + bias=qkv_bias, + config={ + "data_in_width": q_config["data_in_width"], + "data_in_frac_width": q_config["data_in_frac_width"], + "weight_width": q_config["qkv_weight_width"], + "weight_frac_width": 
q_config["qkv_weight_frac_width"], + "bias_width": q_config["qkv_bias_width"], + "bias_frac_width": q_config["qkv_bias_frac_width"], + }, + out_config={ + "data_out_width": q_config["qkv_width"], + "data_out_frac_width": q_config["qkv_frac_width"], + }, + floor=floor, + ) + self.key = LinearInteger( + dim, + dim, + bias=qkv_bias, + config={ + "data_in_width": q_config["data_in_width"], + "data_in_frac_width": q_config["data_in_frac_width"], + "weight_width": q_config["qkv_weight_width"], + "weight_frac_width": q_config["qkv_weight_frac_width"], + "bias_width": q_config["qkv_bias_width"], + "bias_frac_width": q_config["qkv_bias_frac_width"], + }, + out_config={ + "data_out_width": q_config["qkv_width"], + "data_out_frac_width": q_config["qkv_frac_width"], + }, + floor=floor, + ) + self.value = LinearInteger( + dim, + dim, + bias=qkv_bias, + config={ + "data_in_width": q_config["data_in_width"], + "data_in_frac_width": q_config["data_in_frac_width"], + "weight_width": q_config["qkv_weight_width"], + "weight_frac_width": q_config["qkv_weight_frac_width"], + "bias_width": q_config["qkv_bias_width"], + "bias_frac_width": q_config["qkv_bias_frac_width"], + }, + out_config={ + "data_out_width": q_config["qkv_width"], + "data_out_frac_width": q_config["qkv_frac_width"], + }, + floor=floor, + ) + self.self_attention = ViTSelfAttentionHeadInteger( + dim=self.head_dim, + num_heads=num_heads, + attn_drop=attn_drop, + q_config={ + "query_width": q_config["qkv_width"], + "query_frac_width": q_config["qkv_frac_width"], + "key_width": q_config["qkv_width"], + "key_frac_width": q_config["qkv_frac_width"], + "value_width": q_config["qkv_width"], + "value_frac_width": q_config["qkv_frac_width"], + "qkmm_out_width": q_config["qkmm_out_width"], + "qkmm_out_frac_width": q_config["qkmm_out_frac_width"], + "softmax_exp_width": q_config["softmax_exp_width"], + "softmax_exp_frac_width": q_config["softmax_exp_frac_width"], + "softmax_out_frac_width": q_config["softmax_out_frac_width"], + 
"svmm_out_width": q_config["svmm_out_width"], + "svmm_out_frac_width": q_config["svmm_out_frac_width"], + }, + floor=floor, + ) + self.proj = LinearInteger( + dim, + dim, + config={ + "data_in_width": q_config["svmm_out_width"], + "data_in_frac_width": q_config["svmm_out_frac_width"], + "weight_width": q_config["proj_weight_width"], + "weight_frac_width": q_config["proj_weight_frac_width"], + "bias_width": q_config["proj_bias_width"], + "bias_frac_width": q_config["proj_bias_frac_width"], + }, + out_config={ + "data_out_width": q_config["data_out_width"], + "data_out_frac_width": q_config["data_out_frac_width"], + }, + floor=floor, + ) diff --git a/src/chop/nn/quantized/modules/attention_head.py b/src/chop/nn/quantized/modules/attention_head.py index 8f9ea5969..7bf1a53af 100644 --- a/src/chop/nn/quantized/modules/attention_head.py +++ b/src/chop/nn/quantized/modules/attention_head.py @@ -9,7 +9,10 @@ from chop.nn.quantized.functional.matmul import ( generic_matmul_integer, ) -from chop.nn.quantizers.integer import integer_quantizer +from chop.nn.quantized.functional.softmax import ( + softmax_integer, +) +from chop.nn.quantizers.integer import integer_quantizer, integer_floor_quantizer class _BertSelfAttentionHeadBase(torch.nn.Module): @@ -89,3 +92,110 @@ def forward( value_layer=value_layer, attention_mask=attention_mask, ) + + +class _ViTSelfAttentionHeadBase(torch.nn.Module): + def __init__(self, dim, num_heads, attn_drop) -> None: + super().__init__() + self.dropout = nn.Dropout(attn_drop) + + self.matmul1 = torch.matmul + self.matmul2 = torch.matmul + self.mult_data = torch.tensor(1 / math.sqrt(dim)) + self.act = nn.functional.softmax + + def self_attention_head( + self, + query_layer: torch.Tensor, + key_layer: torch.Tensor, + value_layer: torch.Tensor, + ) -> Tensor: + attention_scores = self.matmul1(query_layer, key_layer.transpose(-1, -2)) + print("attention_scores = ", attention_scores * 2**4) + attention_scores = attention_scores * self.mult_data + + # 
Normalize the attention scores to probabilities. + print("attention_scores = ", attention_scores * 2**4) + attention_probs = self.act(attention_scores, dim=-1) + print("attention_probs = ", attention_probs * 2**4) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + context_layer = self.matmul2(attention_probs, value_layer) + print("value_layer = ", value_layer * 2**4) + print("context_layer = ", context_layer * 2**4) + return context_layer + + def forward( + self, + query_layer: torch.Tensor, + key_layer: torch.Tensor, + value_layer: torch.Tensor, + ) -> Tensor: + return self.self_attention_head( + query_layer=query_layer, key_layer=key_layer, value_layer=value_layer + ) + + +class ViTSelfAttentionHeadInteger(_ViTSelfAttentionHeadBase): + def __init__( + self, dim, num_heads, attn_drop=0.0, q_config: dict = None, floor=False + ) -> None: + super().__init__(dim, num_heads, attn_drop) + base_quantizer = integer_floor_quantizer if floor else integer_quantizer + self.query_quantizer = partial( + base_quantizer, + width=q_config["query_width"], + frac_width=q_config["query_frac_width"], + ) + self.key_quantizer = partial( + base_quantizer, + width=q_config["key_width"], + frac_width=q_config["key_frac_width"], + ) + self.value_quantizer = partial( + base_quantizer, + width=q_config["value_width"], + frac_width=q_config["value_frac_width"], + ) + self.matmul1 = partial( + generic_matmul_integer, + config={ + "data_in_width": q_config["query_width"], + "data_in_frac_width": q_config["query_frac_width"], + "weight_width": q_config["key_width"], + "weight_frac_width": q_config["key_frac_width"], + }, + out_config={ + "data_out_width": q_config["qkmm_out_width"], + "data_out_frac_width": q_config["qkmm_out_frac_width"], + }, + floor=floor, + ) + self.act = partial( + softmax_integer, + config={ + "data_in_width": 
q_config["qkmm_out_width"], + "data_in_frac_width": q_config["qkmm_out_frac_width"], + "data_in_exp_width": q_config["softmax_exp_width"], + "data_in_exp_frac_width": q_config["softmax_exp_frac_width"], + "data_out_frac_width": q_config["softmax_out_frac_width"], + "mult_data": self.mult_data, + }, + floor=floor, + ) + self.mult_data = torch.tensor(1) + self.matmul2 = partial( + generic_matmul_integer, + config={ + "data_in_width": q_config["softmax_out_frac_width"] + 2, + "data_in_frac_width": q_config["softmax_out_frac_width"], + "weight_width": q_config["value_width"], + "weight_frac_width": q_config["value_frac_width"], + }, + out_config={ + "data_out_width": q_config["svmm_out_width"], + "data_out_frac_width": q_config["svmm_out_frac_width"], + }, + floor=floor, + ) diff --git a/src/chop/nn/quantized/modules/gelu.py b/src/chop/nn/quantized/modules/gelu.py index 074cf4df8..59096a099 100644 --- a/src/chop/nn/quantized/modules/gelu.py +++ b/src/chop/nn/quantized/modules/gelu.py @@ -11,6 +11,7 @@ block_log_quantizer, block_minifloat_quantizer, integer_quantizer, + integer_floor_quantizer, log_quantizer, minifloat_denorm_quantizer, minifloat_ieee_quantizer, @@ -25,13 +26,17 @@ def __init__(self, inplace: bool = False): self.inplace = inplace self.bypass = False self.x_quantizer = None + self.out_quantizer = None def forward(self, x: Tensor) -> Tensor: if self.bypass: return F.gelu(x) else: x = self.x_quantizer(x) - return F.gelu(x) + out = F.gelu(x) + if self.out_quantizer is None: + return out + return self.out_quantizer(out) def get_quantized_output(self, x: Tensor) -> Tensor: x = self.x_quantizer(x) @@ -58,11 +63,32 @@ def __init__(self, inplace: bool = False, config: dict = None): self.x_width = x_width self.x_frac_width = x_frac_width - # def get_output_bitwidth(self) -> dict: - # return { - # "data_out_width": self.config["data_in_width"], - # "data_out_frac_width": self.config["data_in_frac_width"], - # } + +class GELUIntegerFloor(_GELUBase): + bypass = None 
+ + def __init__(self, inplace: bool = False, config: dict = None): + super().__init__(inplace) + assert config is not None, "config is None!" + + self.config = config + self.bypass = config.get("bypass", False) + if self.bypass: + return + # establish quantizers + x_width, x_frac_width = config["data_in_width"], config["data_in_frac_width"] + out_width, out_frac_width = ( + config["data_out_width"], + config["data_out_frac_width"], + ) + self.x_quantizer = partial( + integer_floor_quantizer, width=x_width, frac_width=x_frac_width + ) + self.out_quantizer = partial( + integer_floor_quantizer, width=out_width, frac_width=out_frac_width + ) + self.x_width = x_width + self.x_frac_width = x_frac_width class GELUMinifloatDenorm(_GELUBase): diff --git a/src/chop/nn/quantized/modules/layer_norm.py b/src/chop/nn/quantized/modules/layer_norm.py index 2ca5c6068..42829d77c 100644 --- a/src/chop/nn/quantized/modules/layer_norm.py +++ b/src/chop/nn/quantized/modules/layer_norm.py @@ -1,12 +1,11 @@ from functools import partial - +import torch import torch.nn as nn from torch import Tensor import torch.nn.functional as F -from chop.nn.quantizers import ( - integer_quantizer, -) +from ...quantizers import integer_quantizer +from ..functional import IntLayerNormFunc class _LayerNormBase(nn.LayerNorm): @@ -47,7 +46,37 @@ def __init__( self.bypass = config.get("bypass", False) if self.bypass: return - x_width, x_frac_width = config["data_in_width"], config["data_in_frac_width"] + x_width, x_frac_width = config.get("data_in_width"), config.get( + "data_in_frac_width" + ) self.x_quantizer = partial( integer_quantizer, width=x_width, frac_width=x_frac_width ) + + +class LayerNormIntegerFloor(nn.LayerNorm): + def __init__( + self, + normalized_shape, + eps: float = 0.00001, + elementwise_affine: bool = False, + bias: bool = False, + device=None, + dtype=None, + config=None, + ) -> None: + assert config is not None, "config is None!" 
+ super().__init__(normalized_shape, eps, elementwise_affine, bias, device, dtype) + self.config = config + self.bypass = config.get("bypass", False) + + def forward(self, x: Tensor) -> Tensor: + return IntLayerNormFunc.apply( + x, + self.normalized_shape, + self.weight, + self.bias, + self.eps, + self.config, + self.bypass, + ) diff --git a/src/chop/nn/quantized/modules/linear.py b/src/chop/nn/quantized/modules/linear.py index 0aaaea611..148478488 100644 --- a/src/chop/nn/quantized/modules/linear.py +++ b/src/chop/nn/quantized/modules/linear.py @@ -66,6 +66,7 @@ def forward(self, x: Tensor) -> Tensor: x = self.x_quantizer(x) w = self.w_quantizer(self.weight) bias = self.b_quantizer(self.bias) if self.bias is not None else None + print(w) out = F.linear(x, w, bias) if self.out_quantizer is None: return out @@ -96,6 +97,11 @@ def __init__( x_width, x_frac_width = config["data_in_width"], config["data_in_frac_width"] # check bias quantizer, if not, use weight quantizer b_width, b_frac_width = config["bias_width"], config["bias_frac_width"] + if config.get("data_out_width") is not None: + out_width, out_frac_width = ( + config["data_out_width"], + config["data_out_frac_width"], + ) if out_config is not None: out_width, out_frac_width = ( out_config["data_out_width"], @@ -117,6 +123,46 @@ def __init__( ) +class LinearIntegerFloor(_LinearBase): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + config=None, + ) -> None: + super().__init__(in_features, out_features, bias, device, dtype) + assert config is not None, "config is None!" 
+ self.config = config + self.bypass = config.get("bypass", False) + if self.bypass: + return + # establish quantizer + w_width, w_frac_width = config["weight_width"], config["weight_frac_width"] + x_width, x_frac_width = config["data_in_width"], config["data_in_frac_width"] + # check bias quantizer, if not, use weight quantizer + b_width, b_frac_width = config["bias_width"], config["bias_frac_width"] + out_width, out_frac_width = ( + config["data_out_width"], + config["data_out_frac_width"], + ) + + self.w_quantizer = partial( + integer_floor_quantizer, width=w_width, frac_width=w_frac_width + ) + self.x_quantizer = partial( + integer_floor_quantizer, width=x_width, frac_width=x_frac_width + ) + self.b_quantizer = partial( + integer_floor_quantizer, width=b_width, frac_width=b_frac_width + ) + self.out_quantizer = partial( + integer_floor_quantizer, width=out_width, frac_width=out_frac_width + ) + + class LinearMinifloatDenorm(_LinearBase): def __init__( self, @@ -1028,6 +1074,91 @@ def forward(self, x: Tensor) -> Tensor: return self.math_forward(x) +class LinearMxInt(_LinearBase): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + config=None, + out_config=None, + ) -> None: + super().__init__(in_features, out_features, bias, device, dtype) + assert config is not None, "config is None!" 
+ self.config = config + self.out_config = out_config + self.bypass = config.get("bypass", False) + if self.bypass: + return + # establish quantizer + w_width, w_exponent_width = ( + config["weight_width"], + config["weight_exponent_width"], + ) + w_p1, w_p0 = ( + config["weight_parallelism"][0], + config["weight_parallelism"][1], + ) + x_width, x_exponent_width = ( + config["data_in_width"], + config["data_in_exponent_width"], + ) + x_p1, x_p0 = ( + config["data_in_parallelism"][0], + config["data_in_parallelism"][1], + ) + # check bias quantizer, if not, use weight quantizer + b_width, b_exponent_width = config["bias_width"], config["bias_exponent_width"] + b_p1, b_p0 = config["bias_parallelism"][0], config["bias_parallelism"][1] + base_quantizer = mxint_hardware + if out_config is not None: + out_width, out_exponent_width = ( + config["data_out_width"], + config["data_out_exponent_width"], + ) + out_p1, out_p0 = ( + config["data_out_parallelism_dim_1"], + config["data_out_parallelism_dim_0"], + ) + self.out_quantizer = partial( + base_quantizer, + q_config={"width": out_width, "exponent_width": out_exponent_width}, + parallelism=[out_p1, out_p0], + ) + self.w_quantizer = partial( + base_quantizer, + q_config={"width": w_width, "exponent_width": w_exponent_width}, + parallelism=[w_p1, w_p0], + ) + self.x_quantizer = partial( + base_quantizer, + q_config={"width": x_width, "exponent_width": x_exponent_width}, + parallelism=[x_p1, x_p0], + ) + self.b_quantizer = partial( + base_quantizer, + q_config={"width": b_width, "exponent_width": b_exponent_width}, + parallelism=[b_p1, b_p0], + ) + + def forward(self, x: Tensor) -> Tensor: + if self.bypass: + return F.linear(x, self.weight, self.bias) + else: + x = self.x_quantizer(x) + w = self.w_quantizer(self.weight) + if self.bias is not None: + bias = self.b_quantizer(self.bias) + else: + bias = None + out = F.linear(x, w, bias) + if self.out_quantizer is None: + return out + return self.out_quantizer(out) + + class 
LinearMXIntHardware(_LinearBase): def __init__( self, diff --git a/src/chop/nn/quantized/modules/mxint_modules.py b/src/chop/nn/quantized/modules/mxint_modules.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/chop/nn/quantizers/mxint_hardware.py b/src/chop/nn/quantizers/mxint_hardware.py index 0c3e06130..e772cd6e9 100644 --- a/src/chop/nn/quantizers/mxint_hardware.py +++ b/src/chop/nn/quantizers/mxint_hardware.py @@ -24,14 +24,14 @@ def mxint_quant_block( # exponent if exponent == None: - exponent = torch.ceil(torch.log2(x.abs().max())) - exponent_bias + exponent = torch.ceil(torch.log2(x.abs().max())) exponent = torch.clamp(exponent, exponent_min, exponent_max) # mantissa int_min = -(2 ** (width - 1)) int_max = 2 ** (width - 1) - 1 - mantissa = x / 2**exponent + mantissa = x * (2 ** (width - 1)) / 2**exponent mantissa = torch.clamp(mantissa.floor(), int_min, int_max) - q_x = (2**exponent) * mantissa + q_x = (2**exponent) * mantissa / ((2 ** (width - 1))) return q_x diff --git a/src/chop/nn/quantizers/quantizers_for_hw.py b/src/chop/nn/quantizers/quantizers_for_hw.py index d5ca3d8cf..4a90e2a38 100644 --- a/src/chop/nn/quantizers/quantizers_for_hw.py +++ b/src/chop/nn/quantizers/quantizers_for_hw.py @@ -3,15 +3,15 @@ import torch.nn.functional as F from torch import Tensor -# from .quantizers import integer_quantizer +from .integer import integer_quantizer, integer_floor_quantizer from .utils import block, my_clamp, my_round, unblock, my_floor -def integer_quantizer_for_hw(x: Tensor, width: int, frac_width: int): +def integer_quantizer_for_hw(x: Tensor, width: int, frac_width: int, floor=False): thresh = 2 ** (width - 1) scale = 2**frac_width - - fixed_point_value = my_clamp(my_round(x.mul(scale)), -thresh, thresh - 1) + base_quantizer = integer_floor_quantizer if floor else integer_quantizer + fixed_point_value = base_quantizer(x, width, frac_width) * scale fixed_point_value = fixed_point_value.to(torch.int) fixed_point_value = fixed_point_value 
% (2**width) return fixed_point_value diff --git a/src/chop/passes/graph/analysis/add_metadata/add_common_metadata.py b/src/chop/passes/graph/analysis/add_metadata/add_common_metadata.py index 346e486e9..3c1ff2efc 100644 --- a/src/chop/passes/graph/analysis/add_metadata/add_common_metadata.py +++ b/src/chop/passes/graph/analysis/add_metadata/add_common_metadata.py @@ -125,13 +125,18 @@ def graph_iterator_for_mase_ops(graph): elif isinstance(module, GroupedQueryAttention): mase_op = "grouped_query_attention" else: - mase_op = None - for module_cls in graph.model.custom_ops["modules"].keys(): - if isinstance(module, module_cls): - mase_op = "user_defined_module" - break - if mase_op is None: - raise ValueError(f"Unknown module: {module_name}") + from chop.nn.quantized import ViTAttentionInteger + + if isinstance(module, ViTAttentionInteger): + mase_op = "vit_self_attention_integer" + else: + mase_op = None + for module_cls in graph.model.custom_ops["modules"].keys(): + if isinstance(module, module_cls): + mase_op = "user_defined_module" + break + if mase_op is None: + raise ValueError(f"Unknown module: {module_name}") node.meta["mase"].parameters["common"]["mase_type"] = mase_type node.meta["mase"].parameters["common"]["mase_op"] = mase_op @@ -252,6 +257,8 @@ def graph_iterator_for_metadata( # node.shape = result.shape # node.dtype = result.dtype + # print(node.op, node.name, result) + # breakpoint() node.meta["mase"] = analyse_fn( node.meta["mase"], result, args, kwargs, add_value=add_value ) diff --git a/src/chop/passes/graph/analysis/add_metadata/add_hardware_metadata.py b/src/chop/passes/graph/analysis/add_metadata/add_hardware_metadata.py index 0e2d315ae..d4cb96ac1 100644 --- a/src/chop/passes/graph/analysis/add_metadata/add_hardware_metadata.py +++ b/src/chop/passes/graph/analysis/add_metadata/add_hardware_metadata.py @@ -47,13 +47,23 @@ def add_component_source(node): node.meta["mase"]["hardware"]["dependence_files"] = op_info[ "dependence_files" ] - elif 
mase_op in INTERNAL_COMP.keys(): - node.meta["mase"]["hardware"]["toolchain"] = "INTERNAL_RTL" - # take the first ip in the component list by default - node.meta["mase"]["hardware"]["module"] = INTERNAL_COMP[mase_op][0]["name"] - node.meta["mase"]["hardware"]["dependence_files"] = INTERNAL_COMP[mase_op][0][ - "dependence_files" - ] + elif any(mase_op in key for key in INTERNAL_COMP.keys()): + if node.meta["mase"].parameters["common"]["quant_type"] == "mxint_hardware": + node.meta["mase"]["hardware"]["toolchain"] = "INTERNAL_RTL" + # take the first ip in the component list by default + node.meta["mase"]["hardware"]["module"] = INTERNAL_COMP[ + mase_op + "_mxint_hardware" + ][0]["name"] + node.meta["mase"]["hardware"]["dependence_files"] = INTERNAL_COMP[ + mase_op + "_mxint_hardware" + ][0]["dependence_files"] + else: + node.meta["mase"]["hardware"]["toolchain"] = "INTERNAL_RTL" + # take the first ip in the component list by default + node.meta["mase"]["hardware"]["module"] = INTERNAL_COMP[mase_op][0]["name"] + node.meta["mase"]["hardware"]["dependence_files"] = INTERNAL_COMP[mase_op][ + 0 + ]["dependence_files"] else: node.meta["mase"]["hardware"]["toolchain"] = "INTERNAL_HLS" node.meta["mase"]["hardware"]["module"] = None @@ -96,7 +106,12 @@ def add_verilog_param(node): else 1 ) # Check if max parallelism is defined - if node.meta["mase"]["hardware"]["max_parallelism"] is not None: + if arg_info.get("parallelism") is not None: + # parallelism only support the last 2 dimension + vp[_cap(arg + f"_parallelism_dim_{dim}")] = ( + arg_info["parallelism"][::-1][dim] if dim <= 1 else 1 + ) + elif node.meta["mase"]["hardware"]["max_parallelism"] is not None: # Take the minimum between... 
vp[_cap(arg + f"_parallelism_dim_{dim}")] = min( # The defined max parallelism for this dimension @@ -125,7 +140,12 @@ def add_verilog_param(node): else 1 ) # Check if max parallelism is defined - if node.meta["mase"]["hardware"]["max_parallelism"] is not None: + if result_info.get("parallelism") is not None: + # parallelism only support the last 2 dimension + vp[_cap(result + f"_parallelism_dim_{dim}")] = ( + result_info["parallelism"][::-1][dim] if dim <= 1 else 1 + ) + elif node.meta["mase"]["hardware"]["max_parallelism"] is not None: # Take the minimum between... vp[_cap(result + f"_parallelism_dim_{dim}")] = min( # The defined max parallelism for this dimension diff --git a/src/chop/passes/graph/analysis/add_metadata/common_metadata_layers.py b/src/chop/passes/graph/analysis/add_metadata/common_metadata_layers.py index 961e514f8..601194b86 100644 --- a/src/chop/passes/graph/analysis/add_metadata/common_metadata_layers.py +++ b/src/chop/passes/graph/analysis/add_metadata/common_metadata_layers.py @@ -272,6 +272,7 @@ "elu": {"input": "data_in"}, "softmax": {"input": "data_in"}, "gelu": {"input": "data_in"}, + "vit_self_attention_integer": {"input": "data_in"}, "grouped_query_attention": {"input": "data_in"}, } @@ -387,7 +388,6 @@ def match_args_and_kwargs(meta, args, kwargs, data, add_value): ordered_func_data = [(k, v) for k, v in data.items()] meta.parameters["common"]["args"] = {} meta_kwargs = {} - arg_type, arg_precision = get_type_and_precision(meta) # * Assign metadata for each argument diff --git a/src/chop/passes/graph/analysis/add_metadata/hardware_metadata_layers.py b/src/chop/passes/graph/analysis/add_metadata/hardware_metadata_layers.py index 778a08001..f6b228ab1 100644 --- a/src/chop/passes/graph/analysis/add_metadata/hardware_metadata_layers.py +++ b/src/chop/passes/graph/analysis/add_metadata/hardware_metadata_layers.py @@ -30,28 +30,84 @@ "normalization_layers/rtl/rms_norm_2d.sv", "normalization_layers/rtl/batch_norm_2d.sv", 
"normalization_layers/rtl/norm.sv", + "normalization_layers/rtl/layer_norm_1d.sv", ], } - +linear = { + "name": "fixed_linear_with_input_circular", + "dependence_files": [ + "cast/rtl/fixed_round.sv", + "cast/rtl/fixed_rounding.sv", + "cast/rtl/floor_round.sv", + "cast/rtl/signed_clamp.sv", + "cast/rtl/fixed_signed_cast.sv", + "linear_layers/fixed_operators/rtl/fixed_dot_product.sv", + "linear_layers/fixed_operators/rtl/fixed_vector_mult.sv", + "linear_layers/fixed_operators/rtl/fixed_accumulator.sv", + "linear_layers/fixed_operators/rtl/fixed_adder_tree.sv", + "linear_layers/fixed_operators/rtl/fixed_adder_tree_layer.sv", + "linear_layers/fixed_operators/rtl/fixed_mult.sv", + "common/rtl/register_slice.sv", + "common/rtl/join2.sv", + "common/rtl/mux.sv", + "common/rtl/unpacked_register_slice.sv", + "common/rtl/single_element_repeat.sv", + "memory/rtl/unpacked_repeat_circular_buffer.sv", + "memory/rtl/input_buffer.sv", + "memory/rtl/blk_mem_gen_0.sv", + "memory/rtl/simple_dual_port_ram.sv", + "linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv", + "memory/rtl/fifo_for_autogen.sv", + "memory/rtl/unpacked_fifo.sv", + "memory/rtl/skid_buffer.sv", + "memory/rtl/unpacked_skid_buffer.sv", + "memory/rtl/simple_dual_port_ram.sv", + "memory/rtl/fifo.sv", + ], +} +unpacked_mx_split2_with_data = [ + "linear_layers/mxint_operators/rtl/unpacked_mx_split2_with_data.sv", + "common/rtl/split2_with_data.sv", + "common/rtl/split2.sv", + "memory/rtl/fifo.sv", +] +mxint_cast = [ + "linear_layers/mxint_operators/rtl/or_tree_layer.sv", + "linear_layers/mxint_operators/rtl/or_tree.sv", + "linear_layers/mxint_operators/rtl/log2_max_abs.sv", + "linear_layers/mxint_operators/rtl/mxint_cast.sv", + "linear_layers/mxint_operators/rtl/optimized_right_shift.sv" +] +mxint_linear = linear["dependence_files"] + unpacked_mx_split2_with_data + mxint_cast + [ + "linear_layers/mxint_operators/rtl/mxint_linear.sv", + "linear_layers/mxint_operators/rtl/mxint_register_slice.sv", + 
"linear_layers/mxint_operators/rtl/mxint_skid_buffer.sv", + "linear_layers/mxint_operators/rtl/mxint_straightm_fifoe.sv", + "linear_layers/mxint_operators/rtl/mxint_accumulator.sv", + "linear_layers/mxint_operators/rtl/mxint_circular.sv", + "linear_layers/mxint_operators/rtl/mxint_dot_product.sv", + "linear_layers/mxint_operators/rtl/unpacked_mx_fifo.sv", + "common/rtl/join_n.sv", + ] INTERNAL_COMP = { - "linear": [ + "linear": [linear], + "linear_mxint_hardware": [ + { + "name": "mxint_linear", + "dependence_files": mxint_linear + } + ], + "fifo": [ { - "name": "fixed_linear", + "name": "fifo_for_autogen", "dependence_files": [ - "cast/rtl/fixed_cast.sv", - "linear_layers/fixed_operators/rtl/fixed_dot_product.sv", - "linear_layers/fixed_operators/rtl/fixed_vector_mult.sv", - "linear_layers/fixed_operators/rtl/fixed_accumulator.sv", - "linear_layers/fixed_operators/rtl/fixed_adder_tree.sv", - "linear_layers/fixed_operators/rtl/fixed_adder_tree_layer.sv", - "linear_layers/fixed_operators/rtl/fixed_mult.sv", - "common/rtl/register_slice.sv", - "common/rtl/join2.sv", - "memory/rtl/unpacked_repeat_circular_buffer.sv", + "memory/rtl/fifo_for_autogen.sv", + "memory/rtl/unpacked_fifo.sv", "memory/rtl/skid_buffer.sv", - "linear_layers/fixed_linear_layer/rtl/fixed_linear.sv", + "memory/rtl/simple_dual_port_ram.sv", + "memory/rtl/fifo.sv", ], - }, + } ], "relu": [ { @@ -123,10 +179,19 @@ } ], "batch_norm2d": [norm], - "layer_norm": [norm], "group_norm": [norm], "instance_norm2d": [norm], "rms_norm": [norm], + "layer_norm": [ + { + "name": "layer_norm_2d", + "dependence_files": norm["dependence_files"] + + [ + "normalization_layers/rtl/layer_norm_2d.sv", + "generated_lut/rtl/isqrt_lut.sv", + ], + }, + ], "selu": [ { "name": "fixed_selu", @@ -148,7 +213,62 @@ "name": "fixed_gelu", "dependence_files": [ "activation_layers/rtl/fixed_gelu.sv", - "activation_layers/rtl/gelu_lut.sv", + "generated_lut/rtl/gelu_lut.sv", + "common/rtl/unpacked_register_slice_quick.sv", + ], + }, + ], 
+ "gelu_mxint_hardware": [ + { + "name": "mxint_gelu", + "dependence_files": [ + "linear_layers/mxint_operators/rtl/mxint_gelu.sv", + "generated_lut/rtl/gelu_lut.sv", + "linear_layers/mxint_operators/rtl/mxint_register_slice.sv", + "linear_layers/mxint_operators/rtl/or_tree_layer.sv", + "linear_layers/mxint_operators/rtl/or_tree.sv", + "linear_layers/mxint_operators/rtl/log2_max_abs.sv", + "linear_layers/mxint_operators/rtl/mxint_accumulator.sv", + "linear_layers/mxint_operators/rtl/mxint_cast.sv", + "linear_layers/mxint_operators/rtl/mxint_circular.sv", + "linear_layers/mxint_operators/rtl/mxint_dot_product.sv", + "linear_layers/mxint_operators/rtl/unpacked_mx_fifo.sv", + "common/rtl/unpacked_register_slice_quick.sv", + ], + }, + ], + "mx_int_patch_embed_mxint_hardware": [ + { + "name": "mxint_patch_embed", + "dependence_files": mxint_linear + + [ + "linear_layers/mxint_operators/rtl/mxint_patch_embed.sv", + "convolution_layers/rtl/sliding_window.sv", + "convolution_layers/rtl/padding.sv", + "convolution_layers/rtl/roller.sv", + ] + } + ], + "layer_norm_mxint_hardware": [ + { + "name": "mxint_layernorm", + "dependence_files": norm["dependence_files"] + + [ + "linear_layers/mxint_operators/rtl/mxint_layernorm.sv", + "linear_layers/mxint_operators/rtl/mxint_gelu.sv", + "generated_lut/rtl/isqrt_lut.sv", + "generated_lut/rtl/gelu_lut.sv", + "linear_layers/mxint_operators/rtl/mxint_register_slice.sv", + "linear_layers/mxint_operators/rtl/or_tree_layer.sv", + "linear_layers/mxint_operators/rtl/or_tree.sv", + "linear_layers/mxint_operators/rtl/log2_max_abs.sv", + "linear_layers/mxint_operators/rtl/mxint_accumulator.sv", + "linear_layers/mxint_operators/rtl/mxint_cast.sv", + "linear_layers/mxint_operators/rtl/mxint_circular.sv", + "linear_layers/mxint_operators/rtl/mxint_dot_product.sv", + "linear_layers/mxint_operators/rtl/unpacked_mx_fifo.sv", + "common/rtl/unpacked_register_slice_quick.sv", + ], }, ], @@ -177,6 +297,14 @@ ], } ], + "add_mxint_hardware": [ + { + "name": 
"mxint_addition", + "dependence_files": [ + "linear_layers/mxint_operators/rtl/mxint_addition.sv", + ], + }, + ], "mul": [ { "name": "fixed_elementwise_multiplier", @@ -191,6 +319,18 @@ "dependence_files": ["common/rtl/df_split.sv", "common/rtl/split2.sv"], } ], + "fork2": [ + { + "name": "fork2", + "dependence_files": ["common/rtl/fork2.sv"], + } + ], + "fork2_mxint_hardware": [ + { + "name": "mxint_fork2", + "dependence_files": ["linear_layers/mxint_operators/rtl/mxint_fork2.sv"], + } + ], "getitem": [ { "name": "buffer", @@ -199,6 +339,30 @@ ], } ], + "vit_self_attention_integer": [ + { + "name": "fixed_vit_attention_single_precision_wrapper", + "dependence_files": linear["dependence_files"] + + [ + "vision_models/vit/rtl/fixed_vit_attention_single_precision_wrapper.sv", + "vision_models/vit/rtl/fixed_vit_attention.sv", + "vision_models/vit/rtl/fixed_vit_attention_head.sv", + "transformer_layers/rtl/self_attention_head_single_scatter.sv", + "transformer_layers/rtl/gqa_head_scatter_control.sv", + "transformer_layers/rtl/self_attention_head_gather.sv", + "vision_models/vit/rtl/fixed_vit_attention_input_block_batched.sv", + "transformer_layers/rtl/self_attention_head_scatter.sv", + "activation_layers/rtl/fixed_softmax.sv", + "scalar_operators/fixed/rtl/fixed_div.sv", + "generated_lut/rtl/exp_lut.sv", + "common/rtl/find_first_arbiter.sv", + "common/rtl/split2.sv", + "common/rtl/split_n.sv", + "memory/rtl/unpacked_fifo.sv", + "memory/rtl/unpacked_skid_buffer.sv", + ], + } + ], "grouped_query_attention": [ { "name": "fixed_gqa_wrapper", diff --git a/src/chop/passes/graph/transforms/__init__.py b/src/chop/passes/graph/transforms/__init__.py index 612773262..e5a220a3f 100644 --- a/src/chop/passes/graph/transforms/__init__.py +++ b/src/chop/passes/graph/transforms/__init__.py @@ -8,6 +8,7 @@ emit_cocotb_transform_pass, emit_verilog_top_transform_pass, emit_vivado_project_transform_pass, + insert_fork_transform_pass, ) from .utils import ( conv_bn_fusion_transform_pass, 
diff --git a/src/chop/passes/graph/transforms/quantize/modify.py b/src/chop/passes/graph/transforms/quantize/modify.py index 4ea1145f4..088fd3ade 100644 --- a/src/chop/passes/graph/transforms/quantize/modify.py +++ b/src/chop/passes/graph/transforms/quantize/modify.py @@ -164,7 +164,7 @@ def create_new_module( new_module = new_module_cls(config=config) elif mase_op == "gelu": new_module_cls = quantized_module_map[f"gelu_{quant_name}"] - new_module = new_module_cls(inplace=original_module.inplace, config=config) + new_module = new_module_cls(config=config) elif mase_op == "softsign": new_module_cls = quantized_module_map[f"softsign_{quant_name}"] new_module = new_module_cls(inplace=original_module.inplace, config=config) @@ -203,13 +203,17 @@ def create_new_module( copy_weights(original_module.bias, new_module.bias) elif mase_op == "layer_norm": new_module_cls = quantized_module_map[f"layer_norm_{quant_name}"] + new_module = new_module_cls( normalized_shape=original_module.normalized_shape, eps=original_module.eps, elementwise_affine=original_module.elementwise_affine, - bias=original_module.bias, config=config, ) + if original_module.elementwise_affine: + new_module.weight = original_module.weight + if original_module.bias is not None: + new_module.bias = original_module.bias elif mase_op == "group_norm": new_module_cls = quantized_module_map[f"group_norm_{quant_name}"] new_module = new_module_cls( diff --git a/src/chop/passes/graph/transforms/quantize/quant_parsers/parse_quant_config.py b/src/chop/passes/graph/transforms/quantize/quant_parsers/parse_quant_config.py index e027b0819..70b997114 100644 --- a/src/chop/passes/graph/transforms/quantize/quant_parsers/parse_quant_config.py +++ b/src/chop/passes/graph/transforms/quantize/quant_parsers/parse_quant_config.py @@ -23,10 +23,17 @@ "data_in_entries": ("data_in_width", "data_in_frac_width"), "bias_entries": ("bias_width", "bias_frac_width"), }, + "integer_floor": { + "weight_entries": ("weight_width", 
"weight_frac_width"), + "data_in_entries": ("data_in_width", "data_in_frac_width"), + "bias_entries": ("bias_width", "bias_frac_width"), + "data_out_entries": ("data_out_width", "data_out_frac_width"), + }, "fixed": { "weight_entries": ("weight_width", "weight_frac_width"), "data_in_entries": ("data_in_width", "data_in_frac_width"), "bias_entries": ("bias_width", "bias_frac_width"), + "data_out_entries": ("data_out_width", "data_out_frac_width"), }, "lutnet": { "weight_entries": ( @@ -261,6 +268,11 @@ "bias_exponent_width", "bias_parallelism", ), + "data_out_entries": ( + "data_out_width", + "data_out_exponent_width", + "data_out_parallelism", + ), }, } @@ -278,6 +290,10 @@ def cp_bypass(config: dict, p_config: dict, entries=None, strict: bool = True): cp_multi_values(config, p_config, ("bypass",), strict=strict) +def cp_floor(config: dict, p_config: dict, entries=None, strict: bool = True): + cp_multi_values(config, p_config, ("floor",), strict=strict) + + def cp_weight_entries(config: dict, p_config: dict, entries: dict, strict: bool = True): cp_multi_values(config, p_config, entries["weight_entries"], strict=strict) @@ -339,6 +355,7 @@ def cp_data_out_entries( QUANT_ARITH_TO_CP_FN[quant_arith] = { "name": partial(cp_name, entries=entries), "bypass": partial(cp_bypass, entries=entries), + "floor": partial(cp_floor, entries=entries), "weight_entries": partial(cp_weight_entries, entries=entries), "data_in_entries": partial(cp_data_in_entries, entries=entries), "bias_entries": partial(cp_bias_entries, entries=entries), @@ -366,12 +383,18 @@ def cp_data_out_entries( "mul": (("name", "data_in_entries"), ("bypass",)), "linear": ( ("name", "data_in_entries", "weight_entries"), - ("bias_entries", "bypass", "data_out_entries", "additional_layers_entries"), + ( + "bias_entries", + "bypass", + "data_out_entries", + "additional_layers_entries", + "floor", + ), ), "relu": (("name", "data_in_entries"), ("bypass",)), "selu": (("name", "data_in_entries"), ("bypass",)), "tanh": 
(("name", "data_in_entries"), ("bypass",)), - "gelu": (("name", "data_in_entries"), ("bypass",)), + "gelu": (("name", "data_in_entries"), ("data_out_entries", "bypass")), "softplus": (("name", "data_in_entries"), ("bypass",)), "softsign": (("name", "data_in_entries"), ("bypass",)), "sub": (("name", "data_in_entries"), ("bypass",)), @@ -385,7 +408,7 @@ def cp_data_out_entries( ), "layer_norm": ( ("name", "data_in_entries"), - ("bypass",), + ("bypass", "isqrt_in_entries", "isqrt_out_entries", "data_out_entries"), ), "group_norm": ( ("name", "data_in_entries"), @@ -423,6 +446,8 @@ def parse_node_config(config: dict, mase_op: str, strict: bool = True) -> dict: a missing `bias_frac_width` in linear node config """ assert mase_op in MASE_OP_TO_ENTRIES, f"Unknown mase op: {mase_op}" + if config.get("noparse", False): + return config if config.get("bypass", False): return config op_entries, op_optional_entries = MASE_OP_TO_ENTRIES[mase_op] diff --git a/src/chop/passes/graph/transforms/quantize/quant_parsers/update_node_meta.py b/src/chop/passes/graph/transforms/quantize/quant_parsers/update_node_meta.py index 0c580c4f2..7161006b6 100644 --- a/src/chop/passes/graph/transforms/quantize/quant_parsers/update_node_meta.py +++ b/src/chop/passes/graph/transforms/quantize/quant_parsers/update_node_meta.py @@ -9,6 +9,7 @@ def entry_to_list(config: dict, entry: str, suffixes: tuple[str]): QUANT_ARITH_TO_SUFFIXES = { "integer": ("width", "frac_width"), "fixed": ("width", "frac_width"), + "integer_floor": ("width", "frac_width"), "binary": ( "width", "stochastic", @@ -69,7 +70,10 @@ def update_arg(node, arg_name, dtype=None, precision=None, size=None): "softplus": (("data_in",), ("data_in_0",)), "sub": (("data_in", "data_in"), ("data_in_0", "data_in_1")), "batch_norm2d": (("data_in", "weight", "bias"), ("data_in_0", "weight", "bias")), - "layer_norm": (("data_in",), ("data_in_0",)), + "layer_norm": (("data_in", "weight", "bias"), ("data_in_0", "weight", "bias")), + "group_norm": 
(("data_in",), ("data_in_0")), + "instance_norm2d": (("data_in",), ("data_in_0")), + "rms_norm": (("data_in",), ("data_in_0")), "group_norm": (("data_in",), ("data_in_0",)), "instance_norm2d": (("data_in",), ("data_in_0",)), "rms_norm": (("data_in",), ("data_in_0",)), diff --git a/src/chop/passes/graph/transforms/quantize/quantize.py b/src/chop/passes/graph/transforms/quantize/quantize.py index e3682fcdc..72f363b01 100644 --- a/src/chop/passes/graph/transforms/quantize/quantize.py +++ b/src/chop/passes/graph/transforms/quantize/quantize.py @@ -230,9 +230,15 @@ def quantize_transform_pass(graph, pass_args=None): # weight "weight_width": 8, "weight_frac_width": 4, + + # optional # bias "bias_width": 8, "bias_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, + # quantize method + "floor": True, } }, } @@ -246,7 +252,7 @@ def quantize_transform_pass(graph, pass_args=None): - by -> str : different quantization schemes choose from ["type", "name", "regx_name"] """ - by = pass_args.pop("by") + by = pass_args.get("by") match by: case "type": graph = graph_iterator_quantize_by_type(graph, pass_args) diff --git a/src/chop/passes/graph/transforms/verilog/__init__.py b/src/chop/passes/graph/transforms/verilog/__init__.py index 262e7905f..573fdadc3 100644 --- a/src/chop/passes/graph/transforms/verilog/__init__.py +++ b/src/chop/passes/graph/transforms/verilog/__init__.py @@ -5,3 +5,4 @@ from .emit_internal import emit_internal_rtl_transform_pass from .emit_logicnets import emit_logicnets_transform_pass from .emit_vivado_project import emit_vivado_project_transform_pass +from .insert_fork import insert_fork_transform_pass diff --git a/src/chop/passes/graph/transforms/verilog/emit_bram.py b/src/chop/passes/graph/transforms/verilog/emit_bram.py index 8aeeb663f..eebe182cd 100644 --- a/src/chop/passes/graph/transforms/verilog/emit_bram.py +++ b/src/chop/passes/graph/transforms/verilog/emit_bram.py @@ -28,6 +28,168 @@ def _cap(name): return str(name).upper() +def 
emit_mxint_parameters_in_mem_internal(node, param_name, file_name, data_name): + """ + Emit single-port ROM hardware components for each parameter + (Mostly because Vivado does not support string type parameters...) + """ + # ! TO DO: currently emitting too many parameters + + verilog_param_name = param_name.replace(".", "_") + total_size = math.prod( + node.meta["mase"].parameters["common"]["args"][verilog_param_name]["shape"] + ) + # Currently edata will be merged into mdata so out_size = paral1 * paral0 + 1 + out_size = int( + node.meta["mase"].parameters["hardware"]["verilog_param"][ + f"{_cap(verilog_param_name)}_PARALLELISM_DIM_0" + ] + * node.meta["mase"].parameters["hardware"]["verilog_param"][ + f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1" + ] + ) + out_depth = int(total_size / out_size) + out_width = int( + node.meta["mase"].parameters["common"]["args"][verilog_param_name]["precision"][ + 0 + ] + ) + out_exponent_width = int( + node.meta["mase"].parameters["common"]["args"][verilog_param_name]["precision"][ + 1 + ] + ) + + addr_width = clog2(out_depth) + 1 + + node_param_name = f"{vf(node.name)}_{verilog_param_name}" + + rom_verilog = f""" +// ===================================== +// Mase Hardware +// Parameter: {node_param_name} +// {time.strftime('%d/%m/%Y %H:%M:%S')} +// ===================================== + +`timescale 1 ns / 1 ps +module {node_param_name}_rom #( + parameter DWIDTH = {out_size*out_width + out_exponent_width}, + parameter MEM_SIZE = {out_depth}, + parameter AWIDTH = $clog2(MEM_SIZE) + 1 +) ( + input clk, + input logic [AWIDTH-1:0] addr0, + input ce0, + output logic [DWIDTH-1:0] q0 +); + + logic [DWIDTH-1:0] ram[0:MEM_SIZE-1]; + logic [DWIDTH-1:0] q0_t0; + logic [DWIDTH-1:0] q0_t1; + + initial begin + $readmemb("{data_name}", ram); + end + + assign q0 = q0_t1; + + always_ff @(posedge clk) if (ce0) q0_t1 <= q0_t0; + always_ff @(posedge clk) if (ce0) q0_t0 <= ram[addr0]; + +endmodule + +`timescale 1 ns / 1 ps +module 
{node_param_name} #( + parameter DATA_WIDTH = 32'd{out_width*out_size + out_exponent_width}, + parameter ADDR_RANGE = 32'd{out_depth}, + parameter ADDR_WIDTH = $clog2(ADDR_RANGE) + 1 +) ( + input reset, + input clk, + input logic [ADDR_WIDTH - 1:0] address0, + input ce0, + output logic [DATA_WIDTH - 1:0] q0 +); + + {node_param_name}_rom {node_param_name}_rom_U ( + .clk(clk), + .addr0(address0), + .ce0(ce0), + .q0(q0) + ); + +endmodule + + +`timescale 1ns / 1ps +module {node_param_name}_source #( + parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 = -1, + parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 = -1, + parameter {_cap(verilog_param_name)}_PRECISION_0 = -1, + parameter {_cap(verilog_param_name)}_PRECISION_1 = -1, + + parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_0 = -1, + parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_1 = -1, + parameter OUT_DEPTH = ({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 / {_cap(verilog_param_name)}_PARALLELISM_DIM_0) * ({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 / {_cap(verilog_param_name)}_PARALLELISM_DIM_1) +) ( + input clk, + input rst, + + output logic [{_cap(verilog_param_name)}_PRECISION_0-1:0] mdata_out [{_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1-1:0], + output logic [{_cap(verilog_param_name)}_PRECISION_1-1:0] edata_out, + output data_out_valid, + input data_out_ready +); + // 1-bit wider so IN_DEPTH also fits. 
+ localparam COUNTER_WIDTH = $clog2(OUT_DEPTH); + logic [COUNTER_WIDTH:0] counter; + always_ff @(posedge clk) + if (rst) counter <= 0; + else begin + if (data_out_ready) begin + if (counter == OUT_DEPTH - 1) counter <= 0; + else counter <= counter + 1; + end + end + logic [1:0] clear; + always_ff @(posedge clk) + if (rst) clear <= 0; + else if ((data_out_ready == 1) && (clear != 2)) clear <= clear + 1; + logic ce0; + assign ce0 = data_out_ready; + + localparam TOTAL_WIDTH = {_cap(verilog_param_name)}_PRECISION_0*({_cap(verilog_param_name)}_PARALLELISM_DIM_0*{_cap(verilog_param_name)}_PARALLELISM_DIM_1) + {_cap(verilog_param_name)}_PRECISION_1; + logic [TOTAL_WIDTH-1:0] data_vector; + {node_param_name} #( + .DATA_WIDTH(TOTAL_WIDTH), + .ADDR_RANGE(OUT_DEPTH) + ) {node_param_name}_mem ( + .clk(clk), + .reset(rst), + .address0(counter), + .ce0(ce0), + .q0(data_vector) + ); + + // Cocotb/verilator does not support array flattening, so + // we need to manually add some reshaping process. + for (genvar j = 0; j < {_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1; j++) + assign mdata_out[j] = data_vector[{_cap(verilog_param_name)}_PRECISION_0*j+{_cap(verilog_param_name)}_PRECISION_0-1 + {_cap(verilog_param_name)}_PRECISION_1:{_cap(verilog_param_name)}_PRECISION_0*j + {_cap(verilog_param_name)}_PRECISION_1]; + assign edata_out = data_vector[{_cap(verilog_param_name)}_PRECISION_1-1 : 0]; + assign data_out_valid = clear == 2; + +endmodule +""" + + with open(file_name, "w", encoding="utf-8") as outf: + outf.write(rom_verilog) + logger.debug( + f"ROM module {verilog_param_name} successfully written into {file_name}" + ) + assert os.path.isfile(file_name), "ROM Verilog generation failed." 
+ # os.system(f"verible-verilog-format --inplace {file_name}") + + def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): """ Emit single-port ROM hardware components for each parameter @@ -84,7 +246,7 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): logic [DWIDTH-1:0] q0_t1; initial begin - $readmemh("{data_name}", ram); + $readmemb("{data_name}", ram); end assign q0 = q0_t1; @@ -119,14 +281,14 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): `timescale 1ns / 1ps module {node_param_name}_source #( - parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 = 32, - parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 = 1, - parameter {_cap(verilog_param_name)}_PRECISION_0 = 16, - parameter {_cap(verilog_param_name)}_PRECISION_1 = 3, - - parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_0 = 1, - parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_1 = 1, - parameter OUT_DEPTH = {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 / {_cap(verilog_param_name)}_PARALLELISM_DIM_0 + parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 = -1, + parameter {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 = -1, + parameter {_cap(verilog_param_name)}_PRECISION_0 = -1, + parameter {_cap(verilog_param_name)}_PRECISION_1 = -1, + + parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_0 = -1, + parameter {_cap(verilog_param_name)}_PARALLELISM_DIM_1 = -1, + parameter OUT_DEPTH = ({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0 / {_cap(verilog_param_name)}_PARALLELISM_DIM_0) * ({_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1 / {_cap(verilog_param_name)}_PARALLELISM_DIM_1) ) ( input clk, input rst, @@ -138,7 +300,6 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): // 1-bit wider so IN_DEPTH also fits. 
localparam COUNTER_WIDTH = $clog2(OUT_DEPTH); logic [COUNTER_WIDTH:0] counter; - always_ff @(posedge clk) if (rst) counter <= 0; else begin @@ -147,13 +308,16 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): else counter <= counter + 1; end end - + logic [1:0] clear; + always_ff @(posedge clk) + if (rst) clear <= 0; + else if ((data_out_ready == 1) && (clear != 2)) clear <= clear + 1; logic ce0; - assign ce0 = 1; + assign ce0 = data_out_ready; - logic [{_cap(verilog_param_name)}_PRECISION_0*{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0-1:0] data_vector; + logic [{_cap(verilog_param_name)}_PRECISION_0*{_cap(verilog_param_name)}_PARALLELISM_DIM_0*{_cap(verilog_param_name)}_PARALLELISM_DIM_1-1:0] data_vector; {node_param_name} #( - .DATA_WIDTH({_cap(verilog_param_name)}_PRECISION_0 * {_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0), + .DATA_WIDTH({_cap(verilog_param_name)}_PRECISION_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1), .ADDR_RANGE(OUT_DEPTH) ) {node_param_name}_mem ( .clk(clk), @@ -168,7 +332,7 @@ def emit_parameters_in_mem_internal(node, param_name, file_name, data_name): for (genvar j = 0; j < {_cap(verilog_param_name)}_PARALLELISM_DIM_0 * {_cap(verilog_param_name)}_PARALLELISM_DIM_1; j++) assign data_out[j] = data_vector[{_cap(verilog_param_name)}_PRECISION_0*j+{_cap(verilog_param_name)}_PRECISION_0-1:{_cap(verilog_param_name)}_PRECISION_0*j]; - assign data_out_valid = 1; + assign data_out_valid = clear == 2; endmodule """ @@ -204,27 +368,40 @@ def emit_parameters_in_dat_internal(node, param_name, file_name): out_depth = int(total_size / out_size) data_buff = "" - param_data = node.meta["mase"].module.get_parameter(param_name).data + param_data = ( + node.meta["mase"].parameters["common"]["args"][verilog_param_name]["value"].data + ) + param_meta = node.meta["mase"].parameters["hardware"]["verilog_param"] + # TODO: Currently only support tranpose linear + if 
node.meta["mase"].parameters["hardware"]["interface"][verilog_param_name][ "transpose" ]: + raise NotImplementedError("only support linear with not tranposed weight") + else: + assert ( + param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1"] + % param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"] + == 0 + ) and ( + param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0"] + % param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_0"] + == 0 + ), "The parallesim parameter must be divisible by the tensor size parameter." param_data = torch.reshape( param_data, ( - node.meta["mase"].parameters["hardware"]["verilog_param"][ - "DATA_OUT_0_SIZE" - ], - node.meta["mase"].parameters["hardware"]["verilog_param"][ - "DATA_IN_0_DEPTH" - ], - node.meta["mase"].parameters["hardware"]["verilog_param"][ - "DATA_IN_0_SIZE" - ], + -1, + param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_1"] + // param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"], + param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_1"], + param_meta[f"{_cap(verilog_param_name)}_TENSOR_SIZE_DIM_0"] + // param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_0"], + param_meta[f"{_cap(verilog_param_name)}_PARALLELISM_DIM_0"], ), ) - param_data = torch.transpose(param_data, 0, 1) + param_data = param_data.permute(0, 1, 3, 2, 4) param_data = torch.flatten(param_data).tolist() - if ( node.meta["mase"].parameters["common"]["args"][verilog_param_name]["type"] == "fixed" @@ -241,18 +418,54 @@ def emit_parameters_in_dat_internal(node, param_name, file_name): for i in range(0, out_depth): line_buff = "" for j in range(0, out_size): - value = param_data[i * out_size + out_size - 1 - j] + value = param_data[i * out_size + j] value = integer_quantizer_for_hw( - torch.tensor(value), width, frac_width + torch.tensor(value), width, frac_width, floor=True ).item() - value = str(bin(int(value * scale) % thresh)) + value = str(bin(value)) value_bits = value[value.find("0b") + 2 :] value_bits = "0" 
* (width - len(value_bits)) + value_bits assert len(value_bits) == width line_buff += value_bits - hex_buff = hex(int(line_buff, 2)) - data_buff += hex_buff[hex_buff.find("0x") + 2 :] + "\n" + + data_buff += line_buff + "\n" + elif ( + node.meta["mase"].parameters["common"]["args"][verilog_param_name]["type"] + == "mxint_hardware" + ): + width = node.meta["mase"].parameters["common"]["args"][verilog_param_name][ + "precision" + ][0] + exponent_width = node.meta["mase"].parameters["common"]["args"][ + verilog_param_name + ]["precision"][1] + from mase_components.linear_layers.mxint_operators.test.utils import mxint_quant_block + + line_buff = "" + + # assert (width >= exponent_width),"current only support width >= exponent_width" + def convert_to_bit(value, width): + value_bits = str(bin(int(value) & (2**width - 1))) + value_bits = value_bits[value_bits.find("0b") + 2 :] + value_bits = "0" * (width - len(value_bits)) + value_bits + assert len(value_bits) == width + return value_bits + + for i in range(0, out_depth): + line_buff = "" + block_data = param_data[i * out_size : i * out_size + out_size] + value, mvalue, evalue = mxint_quant_block( + torch.tensor(block_data), width, exponent_width, round_bits=8, + ) + + for j in range(0, out_size): + value_bits = convert_to_bit(mvalue[j], width) + line_buff += value_bits + evalue_bits = convert_to_bit(evalue, exponent_width) + line_buff += evalue_bits + + data_buff += line_buff + "\n" else: assert False, "Emitting non-fixed parameters is not supported." 
@@ -349,7 +562,14 @@ def emit_bram_handshake(node, rtl_dir): data_name = os.path.join( rtl_dir, f"{node_name}_{param_verilog_name}_rom.dat" ) - emit_parameters_in_mem_internal(node, param_name, verilog_name, data_name) + if node.meta["mase"].parameters["common"]["quant_type"] == "mxint_hardware": + emit_mxint_parameters_in_mem_internal( + node, param_name, verilog_name, data_name + ) + else: + emit_parameters_in_mem_internal( + node, param_name, verilog_name, data_name + ) emit_parameters_in_dat_internal(node, param_name, data_name) else: assert False, "Emtting parameters in non-BRAM hardware is not supported." diff --git a/src/chop/passes/graph/transforms/verilog/emit_tb.py b/src/chop/passes/graph/transforms/verilog/emit_tb.py index 1afac2fea..4c06ef5df 100644 --- a/src/chop/passes/graph/transforms/verilog/emit_tb.py +++ b/src/chop/passes/graph/transforms/verilog/emit_tb.py @@ -20,6 +20,8 @@ import dill import inspect +torch.manual_seed(0) + def _cap(name): """ @@ -57,14 +59,17 @@ async def test(dut): await tb.wait_end(timeout={wait_time}, timeout_unit="{wait_unit}") """ - - tb_path = Path.home() / ".mase" / "top" / "hardware" / "test" / "mase_top_tb" + tb_path = ( + pass_args["project_dir"] / "hardware" / "test" / "mase_top_tb" + if "project_dir" in pass_args.keys() + else Path.home() / ".mase" / "top" / "hardware" / "test" / "mase_top_tb" + ) tb_path.mkdir(parents=True, exist_ok=True) with open(tb_path / "test.py", "w") as f: f.write(test_template) -def _emit_cocotb_tb(graph): +def _emit_cocotb_tb(graph, pass_args={}): class MaseGraphTB(Testbench): def __init__(self, dut, fail_on_checks=True): super().__init__(dut, dut.clk, dut.rst, fail_on_checks=fail_on_checks) @@ -145,6 +150,7 @@ def load_drivers(self, in_tensors): self.get_parameter(f"{_cap(arg)}_PARALLELISM_DIM_1"), self.get_parameter(f"{_cap(arg)}_PARALLELISM_DIM_0"), ], + floor=True, ) else: @@ -175,6 +181,7 @@ def load_monitors(self, expectation): self.get_parameter(f"DATA_OUT_0_PARALLELISM_DIM_1"), 
self.get_parameter(f"DATA_OUT_0_PARALLELISM_DIM_0"), ], + floor=True, ) # Set expectation for each monitor @@ -189,11 +196,183 @@ def load_monitors(self, expectation): # Drive the in-flight flag for each monitor self.output_monitors["data_out_0"].in_flight = True + # Serialize testbench object to be instantiated within test by cocotb runner + cls_obj = MaseGraphTB + tb_path = ( + pass_args["project_dir"] / "hardware" / "test" / "mase_top_tb" + if "project_dir" in pass_args.keys() + else Path.home() / ".mase" / "top" / "hardware" / "test" / "mase_top_tb" + ) + tb_path.mkdir(parents=True, exist_ok=True) + with open(tb_path / "tb_obj.dill", "wb") as file: + import sys + + sys.setrecursionlimit(10000) # Increase recursion limit + dill.dump(cls_obj, file) + with open(tb_path / "__init__.py", "w") as file: + file.write("from .test import test") + + +from mase_components.linear_layers.mxint_operators.test.utils import ( + mxint_hardware, + pack_tensor_to_mx_listed_chunk, +) +from mase_cocotb.interfaces.streaming import ( + MultiSignalStreamDriver, + MultiSignalStreamMonitor, +) +from cocotb.triggers import Timer, RisingEdge, ReadOnly + + +async def check_signal(dut): + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + weight_0 = dut.fc1_weight_source_0 + # if weight_0.data_out_ready.value == 1 and weight_0.data_out_valid.value == 1: + # print("mdata_out = ",[x for x in weight_0.mdata_out.value]) + # print("edata_out = ",weight_0.edata_out.value.signed_integer) + print(weight_0.data_vector) + print(weight_0.fc1_weight_mem.fc1_weight_rom_U.q0_t0) + print(weight_0.fc1_weight_mem.fc1_weight_rom_U.addr0) + print(weight_0.fc1_weight_mem.fc1_weight_rom_U.DWIDTH) + print(weight_0.fc1_weight_mem.fc1_weight_rom_U.MEM_SIZE) + print([x.value for x in weight_0.fc1_weight_mem.fc1_weight_rom_U.ram]) + print("end") + + +def _emit_cocotb_tb_for_mxint(graph): + class MaseGraphTB(Testbench): + def __init__(self, dut, fail_on_checks=True): + 
super().__init__(dut, dut.clk, dut.rst, fail_on_checks=fail_on_checks) + + # cocotb.start_soon(check_signal(dut)) + # Instantiate as many drivers as required inputs to the model + self.input_drivers = {} + self.output_monitors = {} + + for node in graph.nodes_in: + for arg in node.meta["mase"]["common"]["args"].keys(): + if "data_in" not in arg: + continue + self.input_drivers[arg] = MultiSignalStreamDriver( + dut.clk, + (getattr(dut, "m" + arg), getattr(dut, "e" + arg)), + getattr(dut, f"{arg}_valid"), + getattr(dut, f"{arg}_ready"), + ) + self.input_drivers[arg].log.setLevel(logging.DEBUG) + + # Instantiate as many monitors as required outputs + for node in graph.nodes_out: + for result in node.meta["mase"]["common"]["results"].keys(): + if "data_out" not in result: + continue + self.output_monitors[result] = MultiSignalStreamMonitor( + dut.clk, + (getattr(dut, "m" + result), getattr(dut, "e" + result)), + getattr(dut, f"{result}_valid"), + getattr(dut, f"{result}_ready"), + check=False, + ) + self.output_monitors[result].log.setLevel(logging.DEBUG) + + self.model = graph.model + + # To do: precision per input argument + self.input_precision = graph.meta["mase"]["common"]["args"]["data_in_0"][ + "precision" + ] + + def generate_inputs(self, batches): + """ + Generate inputs for the model by sampling a random tensor + for each input argument, according to its shape + + :param batches: number of batches to generate for each argument + :type batches: int + :return: a dictionary of input arguments and their corresponding tensors + :rtype: Dict + """ + # ! 
TO DO: iterate through graph.args instead to generalize + inputs = {} + for node in graph.nodes_in: + for arg, arg_info in node.meta["mase"]["common"]["args"].items(): + # Batch dimension always set to 1 in metadata + if "data_in" not in arg: + continue + # print(f"Generating data for node {node}, arg {arg}: {arg_info}") + inputs[f"{arg}"] = torch.rand(([batches] + arg_info["shape"][1:])) + return inputs + + def preprocess_tensor_for_mxint(self, tensor, q_config, parallelism): + (qtensor, mtensor, etensor) = block_mxint_quant( + tensor, q_config, parallelism + ) + tensor_inputs = pack_tensor_to_mx_listed_chunk( + mtensor, etensor, parallelism + ) + return tensor_inputs + + def load_drivers(self, in_tensors): + for arg, arg_batches in in_tensors.items(): + # Quantize input tensor according to precision + if len(self.input_precision) > 1: + in_data_blocks = self.preprocess_tensor_for_mxint( + tensor=arg_batches, + q_config={ + "width": self.get_parameter(f"{_cap(arg)}_PRECISION_0"), + "exponent_width": self.get_parameter( + f"{_cap(arg)}_PRECISION_1" + ), + }, + parallelism=[ + self.get_parameter(f"{_cap(arg)}_PARALLELISM_DIM_1"), + self.get_parameter(f"{_cap(arg)}_PARALLELISM_DIM_0"), + ], + ) + + else: + # TO DO: convert to integer equivalent of floating point representation + pass + + block_size = self.get_parameter( + "DATA_IN_0_PARALLELISM_DIM_0" + ) * self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1") + for block in in_data_blocks: + self.input_drivers[arg].append(block) + + def load_monitors(self, expectation): + # Process the expectation tensor + output_blocks = self.preprocess_tensor_for_mxint( + tensor=expectation, + q_config={ + "width": self.get_parameter(f"DATA_OUT_0_PRECISION_0"), + "exponent_width": self.get_parameter(f"DATA_OUT_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter(f"DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter(f"DATA_OUT_0_PARALLELISM_DIM_0"), + ], + ) + + # Set expectation for each monitor + for block in output_blocks: + 
# ! TO DO: generalize to multi-output models + self.output_monitors["data_out_0"].expect(block) + + # Drive the in-flight flag for each monitor + self.output_monitors["data_out_0"].in_flight = True + # Serialize testbench object to be instantiated within test by cocotb runner cls_obj = MaseGraphTB tb_path = Path.home() / ".mase" / "top" / "hardware" / "test" / "mase_top_tb" tb_path.mkdir(parents=True, exist_ok=True) with open(tb_path / "tb_obj.dill", "wb") as file: + import sys + + sys.setrecursionlimit(10000) # Increase recursion limit dill.dump(cls_obj, file) with open(tb_path / "__init__.py", "w") as file: file.write("from .test import test") @@ -224,6 +403,7 @@ def emit_cocotb_transform_pass(graph, pass_args={}): init_project(project_dir) _emit_cocotb_test(graph, pass_args=pass_args) - _emit_cocotb_tb(graph) + _emit_cocotb_tb(graph, pass_args=pass_args) + # _emit_cocotb_tb_for_mxint(graph) return graph, None diff --git a/src/chop/passes/graph/transforms/verilog/emit_top.py b/src/chop/passes/graph/transforms/verilog/emit_top.py index 6da3c0043..983a732c9 100644 --- a/src/chop/passes/graph/transforms/verilog/emit_top.py +++ b/src/chop/passes/graph/transforms/verilog/emit_top.py @@ -9,11 +9,12 @@ from chop.passes.graph.utils import vf, v2p, init_project import mase_components.helper.generate_memory as gen_lut import torch.nn as nn - +import sys +from pathlib import Path logger = logging.getLogger(__name__) - +from chop.nn.quantized.modules.layer_norm import LayerNormIntegerFloor +from chop.nn.quantized.modules.attention import ViTAttentionInteger from .util import get_verilog_parameters -from pathlib import Path # ============================================================================= # Utilities @@ -125,6 +126,7 @@ def emit(self, graph, parameter_map): i = 0 for node in nodes_in: node_name = vf(node.name) + quant_type = node.meta["mase"].parameters["common"]["quant_type"] for arg_idx, arg in enumerate( node.meta["mase"].parameters["common"]["args"].keys() 
): @@ -136,7 +138,14 @@ def emit(self, graph, parameter_map): for param in parameter_map if param.startswith(f"{arg_name}_PARALLELISM_DIM") ] - interface += f""" + if quant_type == "mxint_hardware": + interface += f""" + input [{arg_name}_PRECISION_0-1:0] mdata_in_{i} [{'*'.join(parallelism_params)}-1:0], + input [{arg_name}_PRECISION_1-1:0] edata_in_{i}, + input data_in_{i}_valid, + output data_in_{i}_ready,""" + else: + interface += f""" input [{arg_name}_PRECISION_0-1:0] data_in_{i} [{'*'.join(parallelism_params)}-1:0], input data_in_{i}_valid, output data_in_{i}_ready,""" @@ -145,6 +154,7 @@ def emit(self, graph, parameter_map): i = 0 for node in nodes_out: node_name = vf(node.name) + quant_type = node.meta["mase"].parameters["common"]["quant_type"] for result in node.meta["mase"].parameters["common"]["results"].keys(): if "data_out" in result: result_name = _cap(result) @@ -153,7 +163,14 @@ def emit(self, graph, parameter_map): for param in parameter_map if param.startswith(f"{result_name}_PARALLELISM_DIM") ] - interface += f""" + if quant_type == "mxint_hardware": + interface += f""" + output [{result_name}_PRECISION_0-1:0] mdata_out_{i} [{'*'.join(parallelism_params)}-1:0], + output [{result_name}_PRECISION_1-1:0] edata_out_{i}, + output data_out_{i}_valid, + input data_out_{i}_ready,""" + else: + interface += f""" output [{result_name}_PRECISION_0-1:0] data_out_{i} [{'*'.join(parallelism_params)}-1:0], output data_out_{i}_valid, input data_out_{i}_ready,""" @@ -177,6 +194,7 @@ def _emit_signals_top_internal(self, node, parameter_map): signals = "" node_name = vf(node.name) # Input signals + quant_type = node.meta["mase"].parameters["common"]["quant_type"] for arg, arg_info in node.meta["mase"].parameters["common"]["args"].items(): if not isinstance(arg_info, dict): continue @@ -199,7 +217,14 @@ def _emit_signals_top_internal(self, node, parameter_map): if node.meta["mase"]["common"]["mase_op"] == "getitem": arg = "data_in_0" - signals += f""" + if 
quant_type == "mxint_hardware": + signals += f""" +logic [{node_name}_{arg_name}_PRECISION_0-1:0] {node_name}_m{arg} [{'*'.join(parallelism_params)}-1:0]; +logic [{node_name}_{arg_name}_PRECISION_1-1:0] {node_name}_e{arg}; +logic {node_name}_{arg}_valid; +logic {node_name}_{arg}_ready;""" + else: + signals += f""" logic [{node_name}_{arg_name}_PRECISION_0-1:0] {node_name}_{arg} [{'*'.join(parallelism_params)}-1:0]; logic {node_name}_{arg}_valid; logic {node_name}_{arg}_ready;""" @@ -225,7 +250,14 @@ def _emit_signals_top_internal(self, node, parameter_map): for param in parameter_map if f"{node_name}_{result_name}_PARALLELISM_DIM" in param ] - signals += f""" + if quant_type == "mxint_hardware": + signals += f""" +logic [{node_name}_{result_name}_PRECISION_0-1:0] {node_name}_m{result} [{'*'.join(parallelism_params)}-1:0]; +logic [{node_name}_{result_name}_PRECISION_1-1:0] {node_name}_e{result}; +logic {node_name}_{result}_valid; +logic {node_name}_{result}_ready;""" + else: + signals += f""" logic [{node_name}_{result_name}_PRECISION_0-1:0] {node_name}_{result} [{'*'.join(parallelism_params)}-1:0]; logic {node_name}_{result}_valid; logic {node_name}_{result}_ready;""" @@ -321,12 +353,27 @@ def _emit_module_parameters_top_internal(self, key, value, node, parameter_map): component_name_inst = f"{component_name}_0" parameters = "" + quant_type = node.meta["mase"].parameters["common"]["quant_type"] for param in node.meta["mase"].parameters["hardware"]["verilog_param"].keys(): if f"{_cap(key)}_" in param: parameters += f" .{param}({node_name}_{param}),\n" parameters = _remove_last_comma(parameters) - return f""" + if quant_type == "mxint_hardware": + top_component = f""" +{component_name} #( +{parameters} +) {component_name_inst} ( + .clk(clk), + .rst(rst), + .mdata_out({node_name}_m{key}), + .edata_out({node_name}_e{key}), + .data_out_ready({node_name}_{key}_ready), + .data_out_valid({node_name}_{key}_valid) +); +""" + else: + top_component = f""" {component_name} #( 
{parameters} ) {component_name_inst} ( @@ -338,6 +385,8 @@ def _emit_module_parameters_top_internal(self, key, value, node, parameter_map): ); """ + return top_component + def _emit_getitem_signals(self, node): """ Getitem nodes have arg list like (None, None, None, Arg, None, None) @@ -346,8 +395,22 @@ def _emit_getitem_signals(self, node): """ node_name = vf(node.name) + quant_type = node.meta["mase"].parameters["common"]["quant_type"] - return f""" + if quant_type == "mxint_hardware": + component_interface = f""" + .mdata_in_0 ({node_name}_mdata_in_0), + .edata_in_0 ({node_name}_edata_in_0), + .data_in_0_valid ({node_name}_data_in_0_valid), + .data_in_0_ready ({node_name}_data_in_0_ready), + + .mdata_out_0 ({node_name}_mdata_out_0), + .edata_out_0 ({node_name}_edata_out_0), + .data_out_0_valid ({node_name}_data_out_0_valid), + .data_out_0_ready ({node_name}_data_out_0_ready), + """ + else: + component_interface = f""" .data_in_0 ({node_name}_data_in_0), .data_in_0_valid ({node_name}_data_in_0_valid), .data_in_0_ready ({node_name}_data_in_0_ready), @@ -357,10 +420,13 @@ def _emit_getitem_signals(self, node): .data_out_0_ready ({node_name}_data_out_0_ready), """ + return component_interface + def emit(self, node, parameter_map): node_name = vf(node.name) component_name = node.meta["mase"].parameters["hardware"]["module"] signals = "" + quant_type = node.meta["mase"].parameters["common"]["quant_type"] # Emit component instantiation parameters parameters = "" @@ -385,7 +451,15 @@ def emit(self, node, parameter_map): for key, value in node.meta["mase"].parameters["common"]["args"].items(): if "inplace" in key or not isinstance(value, dict): continue - signals += f""" + if quant_type == "mxint_hardware": + signals += f""" + .m{key}({node_name}_m{key}), + .e{key}({node_name}_e{key}), + .{key}_valid({node_name}_{key}_valid), + .{key}_ready({node_name}_{key}_ready), + """ + else: + signals += f""" .{key}({node_name}_{key}), .{key}_valid({node_name}_{key}_valid), 
.{key}_ready({node_name}_{key}_ready), @@ -393,7 +467,15 @@ def emit(self, node, parameter_map): # Emit component instantiation output signals for key, value in node.meta["mase"].parameters["common"]["results"].items(): - signals += f""" + if quant_type == "mxint_hardware": + signals += f""" + .m{key}({node_name}_m{key}), + .e{key}({node_name}_e{key}), + .{key}_valid({node_name}_{key}_valid), + .{key}_ready({node_name}_{key}_ready), + """ + else: + signals += f""" .{key}({node_name}_{key}), .{key}_valid({node_name}_{key}_valid), .{key}_ready({node_name}_{key}_ready), @@ -583,10 +665,20 @@ def _emit_top_wires(self): i = 0 for node in nodes_in: node_name = vf(node.name) + quant_type = node.meta["mase"].parameters["common"]["quant_type"] for arg_idx, arg in enumerate( node.meta["mase"].parameters["common"]["args"].keys() ): - if is_real_input_arg(node, arg_idx): + if not is_real_input_arg(node, arg_idx): + continue + if quant_type == "mxint_hardware": + wires += f""" +assign data_in_{i}_ready = {node_name}_{arg}_ready; +assign {node_name}_{arg}_valid = data_in_{i}_valid; +assign {node_name}_m{arg} = mdata_in_{i}; +assign {node_name}_e{arg} = edata_in_{i}; +""" + else: wires += f""" assign data_in_{i}_ready = {node_name}_{arg}_ready; assign {node_name}_{arg}_valid = data_in_{i}_valid; @@ -598,15 +690,21 @@ def _emit_top_wires(self): node_name = vf(node.name) for result in node.meta["mase"].parameters["common"]["results"].keys(): if "data_out" in result: - wires += f""" + if quant_type == "mxint_hardware": + wires += f""" +assign data_out_{i}_valid = {node_name}_{result}_valid; +assign {node_name}_{result}_ready = data_out_{i}_ready; +assign mdata_out_{i} = {node_name}_m{result}; +assign edata_out_{i} = {node_name}_e{result}; +""" + else: + wires += f""" assign data_out_{i}_valid = {node_name}_{result}_valid; assign {node_name}_{result}_ready = data_out_{i}_ready; assign data_out_{i} = {node_name}_{result}; """ i += 1 - # TODO: emit off-chip parameter interface - return 
wires def _emit_getitem_wires(self, node): @@ -618,19 +716,30 @@ def _emit_getitem_wires(self, node): from_name = vf(node.args[0].name) to_name = vf(node.name) select = node.args[1] + quant_type = node.meta["mase"].parameters["common"]["quant_type"] + if quant_type == "mxint_hardware": + getitem_wires = f""" +assign {from_name}_data_out_{select}_ready = {to_name}_data_in_0_ready; +assign {to_name}_data_in_0_valid = {from_name}_data_out_{select}_valid; +assign {to_name}_mdata_in_0 = {from_name}_mdata_out_{select}; +assign {to_name}_edata_in_0 = {from_name}_edata_out_{select}; +""" - return f""" + else: + getitem_wires = f""" assign {from_name}_data_out_{select}_ready = {to_name}_data_in_0_ready; assign {to_name}_data_in_0_valid = {from_name}_data_out_{select}_valid; assign {to_name}_data_in_0 = {from_name}_data_out_{select}; """ + return getitem_wires + def _emit_node2node_wires(self): nodes_in = self.graph.nodes_in wires = "" + fork_in = {} for node in self.graph.fx_graph.nodes: - if ( # Skip implicit nodes node.meta["mase"].parameters["hardware"]["is_implicit"] @@ -645,13 +754,28 @@ def _emit_node2node_wires(self): continue to_name = vf(node.name) - + quant_type = node.meta["mase"].parameters["common"]["quant_type"] for i, node_in in enumerate(node.all_input_nodes): from_name = vf(node_in.name) - wires += f""" -assign {from_name}_data_out_0_ready = {to_name}_data_in_{i}_ready; -assign {to_name}_data_in_{i}_valid = {from_name}_data_out_0_valid; -assign {to_name}_data_in_{i} = {from_name}_data_out_0; + if "fork2" in from_name: + fork_in[from_name] = ( + 0 if fork_in.get(from_name) == None else fork_in[from_name] + 1 + ) + j = fork_in[from_name] + else: + j = 0 + if quant_type == "mxint_hardware": + wires += f""" +assign {from_name}_data_out_{j}_ready = {to_name}_data_in_{i}_ready; +assign {to_name}_data_in_{i}_valid = {from_name}_data_out_{j}_valid; +assign {to_name}_mdata_in_{i} = {from_name}_mdata_out_{j}; +assign {to_name}_edata_in_{i} = 
{from_name}_edata_out_{j}; +""" + else: + wires += f""" +assign {from_name}_data_out_{j}_ready = {to_name}_data_in_{i}_ready; +assign {to_name}_data_in_{i}_valid = {from_name}_data_out_{j}_valid; +assign {to_name}_data_in_{i} = {from_name}_data_out_{j}; """ return wires @@ -729,6 +853,255 @@ def emit(self, graph, top_name): return module_inst +def emit_folded_bram(folded_gragh, reuse_name, reuse_times): + def _emit_module_parameters_top_internal(key, node, reuse_name, reuse_times): + node_name = vf(node.name).replace(reuse_name + "_0", reuse_name) + component_name = f"{node_name}_{key}_source" + component_name_inst = f"{component_name}_0" + + # verilog_param = node_name+"_"+_cap(key) + def get_image_depth(key, param_list, node_name): + if "weight" in key: + image_depth = ( + param_list[f"{_cap(key)}_TENSOR_SIZE_DIM_0"] + * param_list[f"{_cap(key)}_TENSOR_SIZE_DIM_1"] + / ( + param_list[f"{_cap(key)}_PARALLELISM_DIM_0"] + * param_list[f"{_cap(key)}_PARALLELISM_DIM_1"] + ) + ) + elif "bias" in key: + if "norm" in node_name: + image_depth = ( + param_list[f"{_cap(key)}_TENSOR_SIZE_DIM_0"] + * param_list[f"{_cap(key)}_TENSOR_SIZE_DIM_1"] + / ( + param_list[f"{_cap(key)}_PARALLELISM_DIM_0"] + * param_list[f"{_cap(key)}_PARALLELISM_DIM_1"] + ) + ) + else: + image_depth = ( + param_list[f"{_cap(key)}_TENSOR_SIZE_DIM_0"] + / param_list[f"{_cap(key)}_PARALLELISM_DIM_0"] + ) + else: + raise NotImplementedError + return image_depth + + image_depth = get_image_depth( + key, node.meta["mase"].parameters["hardware"]["verilog_param"], node.name + ) + parameters = "" + for param in node.meta["mase"].parameters["hardware"]["verilog_param"].keys(): + if f"{_cap(key)}_" in param: + parameters += f" .{param}({param}),\n" + parameters = _remove_last_comma(parameters) + modules = "" + signal = "" + for i in range(reuse_times): + new_node_name = node_name.replace(reuse_name, reuse_name + f"_{i}") + new_componet_name = component_name.replace(reuse_name, reuse_name + f"_{i}") + 
new_component_name_inst = component_name_inst.replace( + reuse_name, reuse_name + f"_{i}" + ) + signal += f""" +logic [{_cap(key)}_PRECISION_0 - 1:0] {new_node_name}_{key} [{_cap(key)}_PARALLELISM_DIM_0*{_cap(key)}_PARALLELISM_DIM_1 - 1:0]; +logic {new_node_name}_{key}_valid, {new_node_name}_{key}_ready; +""" + modules += f""" +{new_componet_name} #( +{parameters} +) {new_component_name_inst} ( + .clk(clk), + .rst(rst), + .data_out({new_node_name}_{key}), + .data_out_ready({new_node_name}_{key}_ready), + .data_out_valid({new_node_name}_{key}_valid) +); + + """ + + output_connections = f""" +always_comb begin""" + for item in ["", f"_valid"]: + output_connections += f""" + data_out{item} = (counter= (REPEAT_TIMES - 1)*IMAGE_DEPTH)? data_out_0_ready: (counter_in < IMAGE_DEPTH) ? 0 : top_block_data_in_0_ready; +end +endmodule + """ + return top + + +def emit_verilog_folded_top_file(graph, top_name, pass_args): + folded_graph = pass_args["folded_graph"] + folded_node_name = pass_args["folded_node_name"] + reuse_times = pass_args["reuse_times"] + top_block = ( + VerilogEmitter(folded_graph) + .emit(folded_graph, "top_block") + .replace(f"{folded_node_name}_0", folded_node_name) + ) + top_bram = emit_folded_bram(folded_graph, folded_node_name, reuse_times) + top = emit_verilog_folded_top(graph, reuse_times, top_name) + top_file = f""" + {top} + {top_block} + {top_bram} + """ + return top_file + + def emit_verilog_top_transform_pass(graph, pass_args={}): """Emit the top-level model design in Verilog @@ -756,8 +1129,10 @@ def emit_verilog_top_transform_pass(graph, pass_args={}): top_name = pass_args["top_name"] if "top_name" in pass_args.keys() else "top" init_project(project_dir) rtl_dir = os.path.join(project_dir, "hardware", "rtl") - - top = VerilogEmitter(graph).emit(graph, top_name) + if pass_args.get("folded_graph", False): + top = emit_verilog_folded_top_file(graph, top_name, pass_args) + else: + top = VerilogEmitter(graph).emit(graph, top_name) top_file = 
os.path.join(rtl_dir, f"{top_name}.sv") with open(top_file, "w") as top_design: @@ -768,8 +1143,6 @@ def emit_verilog_top_transform_pass(graph, pass_args={}): # Alternatively, add a class to the emitter that can be called to generate LUTs, for LUT based implementations of activation functions, # or other functions that require LUTs such as PolyLUT or LUTnet neurons. for node in graph.fx_graph.nodes: - # print(vars(node)) - # print(type(node)) if node.op == "call_module": module = dict(graph.model.named_modules())[node.target] if isinstance(module, nn.SiLU): @@ -782,22 +1155,81 @@ def emit_verilog_top_transform_pass(graph, pass_args={}): func = "logsigmoid" elif isinstance(module, nn.Softmax): func = "exp" + elif isinstance(module, nn.GELU) or node.meta["mase"]["common"]["mase_op"] == "gelu": + func = "gelu" + elif isinstance(module, LayerNormIntegerFloor): + func = "isqrt" + elif isinstance(module, ViTAttentionInteger): + func = "exp" else: func = "Unknown" - + mult = 1 + sys.path.append(Path(__file__).resolve().parents[6].as_posix()) + from a_cx_mxint_quant import MXIntGELU if func != "Unknown": - d_in_width = node.meta["mase"].parameters["hardware"]["verilog_param"][ - "DATA_IN_0_PRECISION_0" - ] - d_in_f_width = node.meta["mase"].parameters["hardware"][ - "verilog_param" - ]["DATA_IN_0_PRECISION_1"] - d_out_width = node.meta["mase"].parameters["hardware"]["verilog_param"][ - "DATA_OUT_0_PRECISION_0" - ] - d_out_f_width = node.meta["mase"].parameters["hardware"][ - "verilog_param" - ]["DATA_OUT_0_PRECISION_1"] + if isinstance(module, ViTAttentionInteger): + d_in_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["QKMM_OUT_PRECISION_0"] + d_in_f_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["QKMM_OUT_PRECISION_1"] + d_out_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["SOFTMAX_EXP_PRECISION_0"] + d_out_f_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + 
]["SOFTMAX_EXP_PRECISION_1"] + from math import sqrt + + mult = 1 / sqrt( + node.meta["mase"].parameters["hardware"]["verilog_param"][ + "DATA_IN_0_TENSOR_SIZE_DIM_0" + ] + // node.meta["mase"].parameters["hardware"]["verilog_param"][ + "NUM_HEADS" + ] + ) + elif isinstance(module, LayerNormIntegerFloor): + d_in_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["ISQRT_IN_PRECISION_0"] + d_in_f_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["ISQRT_IN_PRECISION_1"] + d_out_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["ISQRT_OUT_PRECISION_0"] + d_out_f_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["ISQRT_OUT_PRECISION_1"] + elif isinstance(module, MXIntGELU): + d_in_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["DATA_IN_0_PRECISION_0"] + 2 + d_in_f_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["DATA_IN_0_PRECISION_1"] - 1 + d_out_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["HASH_OUT_WIDTH"] + d_out_f_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["HASH_OUT_WIDTH"] - 3 + else: + d_in_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["DATA_IN_0_PRECISION_0"] + d_in_f_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["DATA_IN_0_PRECISION_1"] + d_out_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["DATA_OUT_0_PRECISION_0"] + d_out_f_width = node.meta["mase"].parameters["hardware"][ + "verilog_param" + ]["DATA_OUT_0_PRECISION_1"] + logger.info(f"Generating LUT for {func}") gen_lut.generate_sv_lut( func, d_in_width, @@ -806,5 +1238,7 @@ def emit_verilog_top_transform_pass(graph, pass_args={}): d_out_f_width, path=rtl_dir, path_with_dtype=False, + constant_mult=mult, + floor=False, ) return graph, {} diff --git a/src/chop/passes/graph/transforms/verilog/emit_vivado_project.py 
b/src/chop/passes/graph/transforms/verilog/emit_vivado_project.py index 3219a42ac..1633cfc60 100644 --- a/src/chop/passes/graph/transforms/verilog/emit_vivado_project.py +++ b/src/chop/passes/graph/transforms/verilog/emit_vivado_project.py @@ -18,8 +18,8 @@ def generate_tcl_script(top_name, vivado_project_path, include_groups, project_d ) tcl_script_template = f""" -set_param board.repoPaths {{{str(Path.home())}/shared/board-files}} -create_project {top_name}_build_project {vivado_project_path} -part xcu280-fsvh2892-2L-e +# set_param board.repoPaths {{{str(Path.home())}/shared/board-files}} +create_project -force {top_name}_build_project {vivado_project_path} -part xcu280-fsvh2892-2L-e set_property board_part xilinx.com:au280:part0:1.1 [current_project] """ for include_group in include_groups: @@ -27,10 +27,21 @@ def generate_tcl_script(top_name, vivado_project_path, include_groups, project_d tcl_script_template += f"\n\nset_property top top [current_fileset]" + tcl_script_template += f""" +add_files /scratch/cx922/mase/src/mase_components/vivado/constraints.xdc +read_xdc /scratch/cx922/mase/src/mase_components/vivado/constraints.xdc +""" tcl_script_template += f""" update_compile_order -fileset sources_1 """ + # syth and impl + tcl_script_template += f""" +launch_runs synth_1 +wait_on_run synth_1 +launch_runs impl_1 +wait_on_run impl_1 +""" # * Package IP tcl_script_template += f""" ipx::package_project -root_dir {project_dir}/hardware/ip_repo -vendor user.org -library user -taxonomy /UserIP -import_files @@ -87,11 +98,12 @@ def emit_vivado_project_transform_pass(graph, pass_args={}): os.makedirs(vivado_project_path, exist_ok=True) # * List include files - include_groups = [ - f"{COMPONENTS_PATH / group / 'rtl'}" - for group in mase_components.get_modules() - if group != "vivado" - ] + [project_dir / "hardware" / "rtl"] + include_groups = [project_dir / "hardware" / "rtl"] + # include_groups = [ + # f"{COMPONENTS_PATH / group / 'rtl'}" + # for group in 
mase_components.get_modules() + # if group != "vivado" + # ] + [project_dir / "hardware" / "rtl"] generate_tcl_script(top_name, vivado_project_path, include_groups, project_dir) @@ -105,6 +117,6 @@ def emit_vivado_project_transform_pass(graph, pass_args={}): "-source", f"{vivado_project_path}/build.tcl", ] - result = subprocess.run(cmd, capture_output=True, text=True) + # result = subprocess.run(cmd, capture_output=True, text=True) return graph, {} diff --git a/src/chop/passes/graph/transforms/verilog/insert_fork.py b/src/chop/passes/graph/transforms/verilog/insert_fork.py new file mode 100644 index 000000000..82aac3d80 --- /dev/null +++ b/src/chop/passes/graph/transforms/verilog/insert_fork.py @@ -0,0 +1,168 @@ +import torch +import torch.nn as nn +from copy import copy, deepcopy +from chop.ir.graph import MaseMetadata + + +@torch.fx.wrap +def fork2(x): + out = x + return out + + +def insert_fork_transform_pass(graph, pass_args={}): + """Insert hardware-explicit forks into the mase graph + :param graph: a MaseGraph + :type graph: MaseGraph + :param pass_args: this pass requires additional arguments which is explained below, defaults to {} + :type pass_args: _type_, optional + :return: return a tuple of a MaseGraph and an empty dict (no additional info to return) + :rtype: tuple(MaseGrap`h, Dict) + """ + + def generating_mase_metadata(new_node, node, quan_args): + new_node.meta["mase"] = MaseMetadata(new_node, node.meta["mase"].model) + new_node.meta["mase"].parameters["common"]["mase_type"] = "call_function" + new_node.meta["mase"].parameters["common"]["mase_op"] = "fork2" + inherited_metadata = deepcopy( + node.meta["mase"]["common"]["results"]["data_out_0"] + ) + if quan_args["config"]["name"] == "mxint_hardware": + inherited_metadata["precision"] = [quan_args["config"]["data_in_width"], quan_args["config"]["data_in_exponent_width"]], + inherited_metadata["type"] = "mxint_hardware" + else: + inherited_metadata["precision"] = quan_args + 
inherited_metadata["type"] = "fixed" + new_node.meta["mase"].parameters["common"]["args"] = { + "data_in_0": inherited_metadata + } + new_node.meta["mase"].parameters["common"]["results"] = { + "data_out_0": inherited_metadata, + "data_out_1": inherited_metadata, + } + + new_node.meta["mase"].parameters["hardware"]["is_implicit"] = False + + nodes_to_fork = [] + from chop.tools.utils import to_numpy_if_tensor, to_tensor_if_numpy + from chop.passes.graph.transforms.utils import ( + metadata_value_type_cast_transform_pass, + ) + + graph, _ = metadata_value_type_cast_transform_pass( + graph, pass_args={"fn": to_numpy_if_tensor} + ) + for node in graph.fx_graph.nodes: + user_count = 0 + for u in node.users.keys(): + user_count += 1 + if user_count > 1: + nodes_to_fork.append(node) + for node in nodes_to_fork: + with graph.fx_graph.inserting_after(node): + new_node = graph.fx_graph.call_function(fork2, args=(node,)) + node.replace_all_uses_with(new_node) + new_node.args = (node,) + by = pass_args.get("by", "type") + if by == "type": + generating_mase_metadata(new_node, node, quan_args=pass_args["fork2"]) + else: + generating_mase_metadata( + new_node, node, quan_args=pass_args[new_node.name] + ) + + # test whether the new graph works + insert_fifo_after_fork_pass(graph) + graph, _ = metadata_value_type_cast_transform_pass( + graph, pass_args={"fn": to_tensor_if_numpy} + ) + graph.fx_graph.lint() + return graph, None + + +@torch.fx.wrap +def fifo(x): + out = x + return out + + +def insert_fifo_after_fork_pass(graph, pass_args={}): + def generating_mase_metadata(new_node, node, i): + new_node.meta["mase"] = MaseMetadata(new_node, node.meta["mase"].model) + new_node.meta["mase"].parameters["common"]["mase_type"] = "call_function" + new_node.meta["mase"].parameters["common"]["mase_op"] = "fifo" + inherited_metadata = deepcopy( + node.meta["mase"]["common"]["args"][f"data_in_{i}"] + ) + new_node.meta["mase"].parameters["common"]["args"] = { + "data_in_0": inherited_metadata 
+ } + new_node.meta["mase"].parameters["common"]["results"] = { + "data_out_0": inherited_metadata + } + + new_node.meta["mase"].parameters["hardware"]["is_implicit"] = False + + record_list = [] + for node in graph.fx_graph.nodes: + if node.meta["mase"].parameters["common"]["mase_op"] == "fork2": + for record_node in list(node.users): + if record_node.meta["mase"].parameters["common"]["mase_op"] == "add": + record_list.append(record_node) + for node in record_list: + with graph.fx_graph.inserting_before(node): + for i, arg in enumerate(list(node.args)): + if arg.meta["mase"].parameters["common"]["mase_op"] == "fork2": + new_node = graph.fx_graph.call_function(fifo, args=(arg,)) + generating_mase_metadata(new_node, node, i) + node_args = list(node.args) + node_args[i] = new_node + node.args = tuple(node_args) + return graph, None + + +def insert_fifo_after_specified_modules(graph, pass_args={}): + def generating_mase_metadata(new_node, node, parallelism): + new_node.meta["mase"] = MaseMetadata(new_node, node.meta["mase"].model) + new_node.meta["mase"].parameters["common"]["mase_type"] = "call_function" + new_node.meta["mase"].parameters["common"]["mase_op"] = "fifo" + inherited_metadata = deepcopy( + node.meta["mase"]["common"]["results"][f"data_out_0"] + ) + new_node.meta["mase"].parameters["common"]["args"] = { + "data_in_0": inherited_metadata, + "depth": inherited_metadata["shape"][-1] // parallelism, + } + new_node.meta["mase"].parameters["common"]["results"] = { + "data_out_0": inherited_metadata + } + + new_node.meta["mase"].parameters["hardware"]["is_implicit"] = False + + from chop.tools.utils import to_numpy_if_tensor, to_tensor_if_numpy + from chop.passes.graph.transforms.utils import ( + metadata_value_type_cast_transform_pass, + ) + + graph, _ = metadata_value_type_cast_transform_pass( + graph, pass_args={"fn": to_numpy_if_tensor} + ) + record_list = [] + for node in graph.fx_graph.nodes: + if ( + node.meta["mase"].parameters["common"]["mase_op"] + in 
pass_args["insert_fifo"] + ): + record_list.append(node) + for node in record_list: + with graph.fx_graph.inserting_after(node): + new_node = graph.fx_graph.call_function(fifo, args=(node,)) + node.replace_all_uses_with(new_node) + new_node.args = (node,) + generating_mase_metadata(new_node, node, pass_args["max_parallelism"]) + + graph, _ = metadata_value_type_cast_transform_pass( + graph, pass_args={"fn": to_tensor_if_numpy} + ) + graph.fx_graph.lint() + return graph, None diff --git a/src/chop/passes/graph/transforms/verilog/util.py b/src/chop/passes/graph/transforms/verilog/util.py index 35abe9560..b0cbeea78 100644 --- a/src/chop/passes/graph/transforms/verilog/util.py +++ b/src/chop/passes/graph/transforms/verilog/util.py @@ -25,11 +25,18 @@ def get_verilog_parameters(graph): parameter_map[f"{node_name}_{key}"] = value # * Return graph level parameters - for node in graph.nodes_in + graph.nodes_out: + for node in graph.nodes_in: for key, value in ( node.meta["mase"].parameters["hardware"]["verilog_param"].items() ): - if "DATA_IN" in key or "DATA_OUT" in key: + if "DATA_IN" in key: + parameter_map[key] = value + + for node in graph.nodes_out: + for key, value in ( + node.meta["mase"].parameters["hardware"]["verilog_param"].items() + ): + if "DATA_OUT" in key: parameter_map[key] = value return parameter_map diff --git a/src/mase_cocotb/interfaces/random_draw.drawio b/src/mase_cocotb/interfaces/random_draw.drawio new file mode 100644 index 000000000..0508a71e1 --- /dev/null +++ b/src/mase_cocotb/interfaces/random_draw.drawio @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/mase_cocotb/interfaces/streaming.py b/src/mase_cocotb/interfaces/streaming.py index e67ebd1a3..01df3ac6c 100644 --- a/src/mase_cocotb/interfaces/streaming.py +++ 
b/src/mase_cocotb/interfaces/streaming.py @@ -238,50 +238,41 @@ def _check(self, got, exp): self.log.debug("Passed | Got: %20s Exp: %20s Err: %10s" % (g, e, err)) -class MultiSignalStreamDriver(Driver): - def __init__(self, clk, data, valid, ready) -> None: - super().__init__() - self.clk = clk - self.data = data - self.valid = valid - self.ready = ready - self.valid_prob = 1.0 - - def set_valid_prob(self, prob): - assert prob >= 0.0 and prob <= 1.0 - self.valid_prob = prob - - async def _driver_send(self, data) -> None: +class MultiSignalStreamDriver(StreamDriver): + async def _driver_send(self, transaction) -> None: while True: await RisingEdge(self.clk) - for hardware_target, item in zip(self.data, data): - hardware_target.value = item - + if type(self.data) == tuple: + # Drive multiple data bus + for wire, val in zip(self.data, transaction): + wire.value = val + else: + # Drive single data + self.data.value = transaction if random.random() > self.valid_prob: self.valid.value = 0 continue # Try roll random valid again at next clock self.valid.value = 1 await ReadOnly() if self.ready.value == 1: - self.log.debug(f"Sent {data}") + if type(self.data) == tuple: + # Drive multiple data bus + for t in transaction: + self.log.debug("Sent %s" % t) + else: + self.log.debug("Sent %s" % transaction) + if self.record_num_beats: + self.num_beats += 1 break + + # Load extra + # self.load_driver + if self.send_queue.empty(): await RisingEdge(self.clk) self.valid.value = 0 - -class MultiSignalStreamMonitor(Monitor): - def __init__(self, clk, data, valid, ready, check=True): - super().__init__(clk) - self.clk = clk - self.data = data - self.valid = valid - self.ready = ready - self.check = check - - def _trigger(self): - return self.valid.value == 1 and self.ready.value == 1 - +class MultiSignalStreamMonitor(StreamMonitor): def _recv(self): def cast_data(value): if type(value) == list: @@ -296,3 +287,38 @@ def _check(self, got, exp): for g, e in zip(got, exp): if not np.equal(g, 
e).all(): raise TestFailure("\nGot \n%s, \nExpected \n%s" % (got, exp)) + +class MultiSignalErrorThresholdStreamMonitor(ErrorThresholdStreamMonitor): + def _recv(self): + def cast_data(value): + if type(value) == list: + return [x.signed_integer for x in value] + elif type(value) == BinaryValue: + return value.signed_integer + + return tuple([cast_data(target.value) for target in self.data]) + + def _check(self, got, exp): + if self.check: + mg, eg = got + me, ee = exp + if type(mg) == list: + mg = np.array(mg) + me = np.array(me) + mg = mg // 2**(ee - eg) + mg = mg.astype(np.int64) + + if self.signed: + mg = _sign_extend(mg, self.width) + me = _sign_extend(me, self.width) + err = np.abs(mg - me) + if self.log_error: + self.error_log.append(err) + self.recv_log.append(got) + max_biterr = np.full_like(err, self.error_bits) + if not (err <= max_biterr).all(): + self.log.error("Failed | Got: %20s Exp: %20s Err: %14s" % (mg, me, err)) + assert False, "Test Failed!" + return + else: + assert False, "Not implemented" \ No newline at end of file diff --git a/src/mase_cocotb/monitor.py b/src/mase_cocotb/monitor.py index 682dd2f6e..7ae5a26d2 100644 --- a/src/mase_cocotb/monitor.py +++ b/src/mase_cocotb/monitor.py @@ -15,7 +15,7 @@ def __init__(self, clk, check=True, name=None): self.exp_queue = Queue() self.check = check self.name = name - self.in_flight = False + self.in_flight = True if not hasattr(self, "log"): self.log = SimLog( diff --git a/src/mase_cocotb/runner.py b/src/mase_cocotb/runner.py index 21490555a..cc68fb6a8 100644 --- a/src/mase_cocotb/runner.py +++ b/src/mase_cocotb/runner.py @@ -65,6 +65,7 @@ def _single_test( comp_path: Path, test_work_dir: Path, sim: str = "verilator", + gui: bool = False, extra_build_args: list[str] = [], seed: int = None, trace: bool = False, @@ -126,6 +127,7 @@ def _single_test( seed=seed, results_xml="results.xml", build_dir=test_work_dir, + gui=gui, ) num_tests, fail = get_results(test_work_dir.joinpath("results.xml")) except 
Exception as e: @@ -144,6 +146,7 @@ def mase_runner( group=None, module_param_list: list[dict[str, Any]] = [dict()], sim: str = "verilator", + gui: str = False, extra_build_args: list[str] = [], seed: int = None, jobs: int = 1, @@ -206,6 +209,7 @@ def mase_runner( comp_path=comp_path, test_work_dir=test_work_dir, sim=sim, + gui=gui, extra_build_args=extra_build_args, seed=seed, trace=trace, @@ -237,6 +241,7 @@ def mase_runner( comp_path=comp_path, test_work_dir=test_work_dir, sim=sim, + gui=gui, extra_build_args=extra_build_args, seed=seed, trace=trace, diff --git a/src/mase_cocotb/testbench.py b/src/mase_cocotb/testbench.py index be535dba5..e7f7293dd 100644 --- a/src/mase_cocotb/testbench.py +++ b/src/mase_cocotb/testbench.py @@ -38,10 +38,6 @@ def get_parameter(self, parameter_name): parameter = getattr(self.dut, parameter_name) return int(parameter) - def get_parameter(self, parameter_name): - parameter = getattr(self.dut, parameter_name) - return int(parameter) - async def reset(self, active_high=True): if self.rst is None: raise Exception( @@ -53,6 +49,10 @@ async def reset(self, active_high=True): self.rst.value = 1 if active_high else 0 await RisingEdge(self.clk) self.rst.value = 0 if active_high else 1 + for monitor in self.output_monitors.values(): + monitor.ready.value = 1 + for driver in self.input_drivers.values(): + driver.valid.value = 0 await RisingEdge(self.clk) async def initialize(self): diff --git a/src/mase_cocotb/utils.py b/src/mase_cocotb/utils.py index 469681cd1..d62d37694 100644 --- a/src/mase_cocotb/utils.py +++ b/src/mase_cocotb/utils.py @@ -12,7 +12,7 @@ from mase_cocotb.z_qlayers import quantize_to_int from functools import partial -from chop.nn.quantizers import integer_quantizer +from chop.nn.quantizers import integer_quantizer, integer_floor_quantizer # Apparently this function only exists in Python 3.12 ... 
@@ -101,7 +101,9 @@ def product_dict(**kwargs): yield dict(zip(keys, instance)) -def fixed_preprocess_tensor(tensor: Tensor, q_config: dict, parallelism: list) -> list: +def fixed_preprocess_tensor( + tensor: Tensor, q_config: dict, parallelism: list, floor=False +) -> list: """Preprocess a tensor before driving it into the DUT. 1. Quantize to requested fixed-point precision. 2. Convert to integer format to be compatible with Cocotb drivers. @@ -125,12 +127,13 @@ def fixed_preprocess_tensor(tensor: Tensor, q_config: dict, parallelism: list) - tensor = tensor.view((-1, tensor.shape[-1])) # Quantize - quantizer = partial(integer_quantizer, **q_config) + base_quantizer = integer_floor_quantizer if floor else integer_quantizer + quantizer = partial(base_quantizer, **q_config) q_tensor = quantizer(tensor) - + # breakpoint() # Convert to integer format q_tensor = (q_tensor * 2 ** q_config["frac_width"]).int() - q_tensor = signed_to_unsigned(q_tensor, bits=q_config["width"]) + # q_tensor = signed_to_unsigned(q_tensor, bits=q_config["width"]) # Split into chunks according to parallelism in each dimension # parallelism[0]: along rows, parallelism[1]: along columns @@ -175,3 +178,27 @@ def fixed_cast(val, in_width, in_frac_width, out_width, out_frac_width): val = val # val = int(val % (1 << out_width)) return val # << out_frac_width # treat data as data + + +async def check_signal(dut, log, signal_list): + # TODO: support count start + # TODO: support checking signal with different name in valid and ready signal + def handshake_signal_check( + dut, log, signal_base, valid=None, ready=None, count_start: dict = {} + ): + data_valid = getattr(dut, f"{signal_base}_valid") if valid is None else valid + data_ready = getattr(dut, f"{signal_base}_ready") if ready is None else ready + data = getattr(dut, signal_base) + svalue = [i.signed_integer for i in data.value] + if data_valid.value & data_ready.value: + count_start[signal_base] = ( + count_start[signal_base] + 1 + if 
count_start.get(signal_base) is not None + else " " + ) + log.debug(f"handshake {count_start[signal_base]} {signal_base} = {svalue}") + + while True: + await RisingEdge(dut.clk) + for signal in signal_list: + handshake_signal_check(dut, log, signal) diff --git a/src/mase_components/__init__.py b/src/mase_components/__init__.py index de4c54db6..5779476a7 100644 --- a/src/mase_components/__init__.py +++ b/src/mase_components/__init__.py @@ -10,9 +10,21 @@ def get_modules(): for d in os.listdir(current_dir) if os.path.isdir(os.path.join(current_dir, d)) ] - if "__pycache__" in mods: - mods.remove("__pycache__") - return mods + detailed_mods = [] + for mod in mods: + new_dir = os.path.join(current_dir, mod) + if "rtl" in os.listdir(new_dir): + detailed_mods.append(mod) + else: + update_mods = [ + mod + "/" + d + for d in os.listdir(new_dir) + if os.path.isdir(os.path.join(new_dir, d)) + ] + detailed_mods += update_mods + if "__pycache__" in detailed_mods: + detailed_mods.remove("__pycache__") + return detailed_mods def get_group_files(group): @@ -27,7 +39,7 @@ def get_group_files(group): def get_module_dependencies(module): - group, mod = module.split("/") + # group, mod = module.split("/") group_deps = MASE_HW_DEPS.get(module, []) file_deps = [] for group_dep in group_deps: diff --git a/src/mase_components/activation_layers/rtl/fixed_gelu.sv b/src/mase_components/activation_layers/rtl/fixed_gelu.sv index 9c0cc4235..a1adaf7d5 100644 --- a/src/mase_components/activation_layers/rtl/fixed_gelu.sv +++ b/src/mase_components/activation_layers/rtl/fixed_gelu.sv @@ -14,12 +14,12 @@ module fixed_gelu #( parameter DATA_OUT_0_PRECISION_0 = 8, parameter DATA_OUT_0_PRECISION_1 = 4, - parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = 10, - parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = 1, - parameter DATA_OUT_0_TENSOR_SIZE_DIM_2 = 1, - parameter DATA_OUT_0_PARALLELISM_DIM_0 = 1, - parameter DATA_OUT_0_PARALLELISM_DIM_1 = 1, - parameter DATA_OUT_0_PARALLELISM_DIM_2 = 1 + parameter 
DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_2 = DATA_IN_0_TENSOR_SIZE_DIM_2, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_2 = DATA_IN_0_PARALLELISM_DIM_2 ) ( /* verilator lint_off UNUSEDSIGNAL */ input clk, @@ -34,80 +34,19 @@ module fixed_gelu #( output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0[DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0] ); - logic [DATA_IN_0_PRECISION_0-1:0] ff_data[DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0]; - logic [DATA_IN_0_PRECISION_0-1:0] roll_data[DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; - - logic ff_data_valid; - logic ff_data_ready; - - logic roll_data_valid; - logic roll_data_ready; - - unpacked_fifo #( - .DEPTH(IN_0_DEPTH), - .DATA_WIDTH(DATA_IN_0_PRECISION_0), - .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1) - ) roller_buffer ( - .clk(clk), - .rst(rst), - .data_in(data_in_0), - .data_in_valid(data_in_0_valid), - .data_in_ready(data_in_0_ready), // write enable - .data_out(ff_data), - .data_out_valid(ff_data_valid), - .data_out_ready(ff_data_ready) // read enable - ); - - localparam STRAIGHT_THROUGH = (DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1 == DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1); - - generate - if (STRAIGHT_THROUGH) begin - unpacked_register_slice_quick #( - .DATA_WIDTH(DATA_IN_0_PRECISION_0), - .IN_SIZE(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1) - ) single_roll ( - .clk(clk), - .rst(rst), - .in_data(ff_data), - .in_valid(ff_data_valid), - .in_ready(ff_data_ready), - .out_data(roll_data), - .out_valid(roll_data_valid), - .out_ready(roll_data_ready) - ); - - end else begin - - roller #( - .DATA_WIDTH(DATA_IN_0_PRECISION_0), - 
.NUM(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1), - .ROLL_NUM(DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1) - ) roller_inst ( - .clk(clk), - .rst(rst), - .data_in(ff_data), - .data_in_valid(ff_data_valid), - .data_in_ready(ff_data_ready), - .data_out(roll_data), - .data_out_valid(roll_data_valid), - .data_out_ready(roll_data_ready) - ); - end - endgenerate - - for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1; i++) begin : elu + for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1; i++) begin : gelu gelu_lut #( .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) ) elu_map ( - .data_in_0 (roll_data[i]), + .data_in_0 (data_in_0[i]), .data_out_0(data_out_0[i]) ); end - assign data_out_0_valid = roll_data_valid; - assign roll_data_ready = data_out_0_ready; + assign data_out_0_valid = data_in_0_valid; + assign data_in_0_ready = data_out_0_ready; endmodule diff --git a/src/mase_components/activation_layers/rtl/fixed_softmax.sv b/src/mase_components/activation_layers/rtl/fixed_softmax.sv index 1158f9d12..be2ddef2c 100644 --- a/src/mase_components/activation_layers/rtl/fixed_softmax.sv +++ b/src/mase_components/activation_layers/rtl/fixed_softmax.sv @@ -4,27 +4,23 @@ module fixed_softmax #( parameter DATA_IN_0_PRECISION_0 = 8, parameter DATA_IN_0_PRECISION_1 = 4, parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 10, // input vector size - parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 1, // - parameter DATA_IN_0_PARALLELISM_DIM_0 = 1, // incoming elements - - parameter DATA_IN_0_PARALLELISM_DIM_1 = 1, // batch size + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 6, // + parameter DATA_IN_0_PARALLELISM_DIM_0 = 3, // incoming elements - + parameter DATA_IN_0_PARALLELISM_DIM_1 = 2, // batch size parameter IN_0_DEPTH = $rtoi($ceil(DATA_IN_0_TENSOR_SIZE_DIM_0 / 
DATA_IN_0_PARALLELISM_DIM_0)), - parameter DATA_OUT_0_PRECISION_0 = 8, parameter DATA_OUT_0_PRECISION_1 = 4, - parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = 10, - parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = 1, - parameter DATA_OUT_0_PARALLELISM_DIM_0 = 1, - parameter DATA_OUT_0_PARALLELISM_DIM_1 = 1, + parameter DATA_OUT_0_PRECISION_0 = DATA_OUT_0_PRECISION_1 + 2, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, - parameter OUT_0_DEPTH = $rtoi( - $ceil(DATA_OUT_0_TENSOR_SIZE_DIM_0 / DATA_OUT_0_PARALLELISM_DIM_0) - ), + parameter OUT_0_DEPTH = IN_0_DEPTH, - parameter DATA_INTERMEDIATE_0_PRECISION_0 = DATA_IN_0_PRECISION_0, - parameter DATA_INTERMEDIATE_0_PRECISION_1 = DATA_IN_0_PRECISION_1, - - parameter IN_PLACE = 0 + parameter DATA_EXP_0_PRECISION_0 = 12, + parameter DATA_EXP_0_PRECISION_1 = 8 ) ( /* verilator lint_off UNUSEDSIGNAL */ input rst, @@ -43,13 +39,12 @@ module fixed_softmax #( // Can handle multiple batches at once // each iteration recieves a batch of blocks - logic [DATA_IN_0_PRECISION_0-1:0] ff_data[DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0]; logic [DATA_IN_0_PRECISION_0-1:0] roll_data[DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; - logic [DATA_INTERMEDIATE_0_PRECISION_0-1:0] exp_data[DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; - logic [DATA_INTERMEDIATE_0_PRECISION_0-1:0] ff_exp_data[DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; + logic [DATA_EXP_0_PRECISION_0-1:0] exp_data[DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; + logic [DATA_EXP_0_PRECISION_0-1:0] ff_exp_data[DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; - logic ff_data_valid; - logic ff_data_ready; + // logic ff_data_valid; + // logic ff_data_ready; logic 
roll_data_valid; logic roll_data_ready; @@ -60,7 +55,7 @@ module fixed_softmax #( logic ff_exp_data_valid; logic ff_exp_data_ready; - localparam SUM_WIDTH = $clog2(DATA_OUT_0_PARALLELISM_DIM_0) + DATA_INTERMEDIATE_0_PRECISION_0; + localparam SUM_WIDTH = $clog2(DATA_OUT_0_PARALLELISM_DIM_0) + DATA_EXP_0_PRECISION_0; localparam ACC_WIDTH = $clog2(OUT_0_DEPTH) + SUM_WIDTH; logic [SUM_WIDTH-1:0] summed_exp_data[DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // sum of current block @@ -71,68 +66,15 @@ module fixed_softmax #( logic [ACC_WIDTH-1:0] accumulated_exp_data [DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // accumulation of total vector logic [ACC_WIDTH-1:0] ff_accumulated_exp_data [DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // accumulation of total vector - + logic [ACC_WIDTH-1:0] ff_accumulated_exp_data_dup [DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // duplication accumulation of total vector logic acc_out_valid[DATA_OUT_0_PARALLELISM_DIM_1-1:0]; logic acc_out_ready; logic ff_acc_valid; logic ff_acc_ready; - - unpacked_fifo #( - .DEPTH(IN_0_DEPTH), - .DATA_WIDTH(DATA_IN_0_PRECISION_0), - .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1) - ) roller_buffer ( - .clk(clk), - .rst(rst), - .data_in(data_in_0), - .data_in_valid(data_in_0_valid), - .data_in_ready(data_in_0_ready), // write enable - .data_out(ff_data), - .data_out_valid(ff_data_valid), - .data_out_ready(ff_data_ready) // read enable - ); - - localparam STRAIGHT_THROUGH = (DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1 == DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1); - - generate - if (STRAIGHT_THROUGH) begin - unpacked_register_slice_quick #( - .DATA_WIDTH(DATA_IN_0_PRECISION_0), - .IN_SIZE(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1) - ) single_roll ( - .clk(clk), - .rst(rst), - .in_data(ff_data), - .in_valid(ff_data_valid), - .in_ready(ff_data_ready), - .out_data(roll_data), - .out_valid(roll_data_valid), - .out_ready(roll_data_ready) - ); - - end else 
begin - - roller #( - .DATA_WIDTH(DATA_IN_0_PRECISION_0), - .NUM(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1), - .ROLL_NUM(DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1) - ) roller_inst ( - .clk(clk), - .rst(rst), - .data_in(ff_data), - .data_in_valid(ff_data_valid), - .data_in_ready(ff_data_ready), - .data_out(roll_data), - .data_out_valid(roll_data_valid), - .data_out_ready(roll_data_ready) - ); - end - endgenerate - split2 #() input_handshake_split ( - .data_in_valid (roll_data_valid), - .data_in_ready (roll_data_ready), + .data_in_valid (data_in_0_valid), + .data_in_ready (data_in_0_ready), .data_out_valid({buffer_valid, summed_in_valid}), .data_out_ready({buffer_ready, summed_in_ready[0]}) ); @@ -144,17 +86,17 @@ module fixed_softmax #( exp_lut #( .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), - .DATA_OUT_0_PRECISION_0(DATA_INTERMEDIATE_0_PRECISION_0), - .DATA_OUT_0_PRECISION_1(DATA_INTERMEDIATE_0_PRECISION_1) + .DATA_OUT_0_PRECISION_0(DATA_EXP_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_EXP_0_PRECISION_1) ) exp_map ( - .data_in_0 (roll_data[i]), + .data_in_0 (data_in_0[i]), .data_out_0(exp_data[i]) ); end unpacked_fifo #( - .DEPTH(OUT_0_DEPTH), - .DATA_WIDTH(DATA_INTERMEDIATE_0_PRECISION_0), + .DEPTH(OUT_0_DEPTH * 8), + .DATA_WIDTH(DATA_EXP_0_PRECISION_0), .IN_NUM(DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1) ) out_roller_buffer ( .clk(clk), @@ -173,7 +115,7 @@ module fixed_softmax #( if (DATA_OUT_0_PARALLELISM_DIM_0 > 1) begin fixed_adder_tree #( .IN_SIZE (DATA_OUT_0_PARALLELISM_DIM_0), - .IN_WIDTH(DATA_INTERMEDIATE_0_PRECISION_0) + .IN_WIDTH(DATA_EXP_0_PRECISION_0) ) block_sum ( .clk(clk), .rst(rst), @@ -209,10 +151,11 @@ module fixed_softmax #( end endgenerate - hold_buffer #( + input_buffer #( .DATA_WIDTH(ACC_WIDTH), - .DATA_SIZE(DATA_OUT_0_PARALLELISM_DIM_1), - .DEPTH(OUT_0_DEPTH) + .IN_NUM(DATA_OUT_0_PARALLELISM_DIM_1), + .BUFFER_SIZE(1), + .REPEAT(IN_0_DEPTH) 
) acc_buffer ( .clk(clk), .rst(rst), @@ -224,127 +167,45 @@ module fixed_softmax #( .data_out_ready(ff_acc_ready) // read enable ); + //TODO: change to register slice - logic [DATA_INTERMEDIATE_0_PRECISION_0 - 1 :0] inter_quotient1 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // extra bit for rounding division - logic [DATA_INTERMEDIATE_0_PRECISION_0 - 1 :0] inter_quotient2 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // extra bit for rounding division - logic [DATA_INTERMEDIATE_0_PRECISION_0 - 1 :0] inter_quotient3[DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // extra bit for rounding division - logic [DATA_INTERMEDIATE_0_PRECISION_0 - 1 :0] inter_quotient4 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // extra bit for rounding division - logic [DATA_INTERMEDIATE_0_PRECISION_0 + DATA_INTERMEDIATE_0_PRECISION_1 - 1 :0] extended_divisor [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // extra bit for rounding division - - logic [DATA_INTERMEDIATE_0_PRECISION_0 + DATA_INTERMEDIATE_0_PRECISION_1 - 1 :0] extended_quotient [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // extra bit for quantization - logic [DATA_INTERMEDIATE_0_PRECISION_0 - 1 :0] inter_quotient [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // extra bit for quantization - - + logic [DATA_EXP_0_PRECISION_0 + DATA_OUT_0_PRECISION_1 - 1 :0] extended_divisor [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // extra bit for rounding division + logic [DATA_OUT_0_PRECISION_0 + DATA_OUT_0_PRECISION_1 - 1 :0] extended_quotient [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; // extra bit for quantization for (genvar i = 0; i < DATA_OUT_0_PARALLELISM_DIM_1; i++) begin : scale_batches for (genvar j = 0; j < DATA_OUT_0_PARALLELISM_DIM_0; j++) begin : div_elements always_comb begin - extended_divisor[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] = 
ff_exp_data[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] << DATA_INTERMEDIATE_0_PRECISION_1; - extended_quotient[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] = extended_divisor[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] / ff_accumulated_exp_data[i]; - inter_quotient[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] = extended_quotient[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j][DATA_INTERMEDIATE_0_PRECISION_0-1:0]; - // data_out_0[DATA_OUT_0_PARALLELISM_DIM_1*(i) + j] = extended_quotient[DATA_OUT_0_PARALLELISM_DIM_1*(i) + j][DATA_OUT_0_PRECISION_0-1:0]; + extended_divisor[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] = ff_exp_data[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] << DATA_OUT_0_PRECISION_1; + ff_accumulated_exp_data_dup[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] = ff_accumulated_exp_data[i]; + // extended_quotient[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] = extended_divisor[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] / ff_accumulated_exp_data[i]; + data_out_0[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j] = extended_quotient[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j][DATA_OUT_0_PRECISION_0-1:0]; end - // quick_round #( - // .DATA_WIDTH(DATA_OUT_0_PRECISION_0) - // ) round ( - // .data_in(extended_quotient[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j][DATA_OUT_0_PRECISION_0-1:1]), - // .round_bit(extended_quotient[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j][0]), - // .data_out(data_out_0[DATA_OUT_0_PARALLELISM_DIM_0*(i) + j]) - // ); end end - // assign data_out_0 = inter_quotient; - // Divide pipeline (retiming) - logic data_out_0_valid_0; - logic data_out_0_valid_1; - logic data_out_0_valid_2; - logic data_out_0_valid_3; - logic data_out_0_valid_4; - always_ff @(posedge clk) begin - inter_quotient1 <= inter_quotient; - inter_quotient2 <= inter_quotient1; - inter_quotient3 <= inter_quotient2; - inter_quotient4 <= inter_quotient3; - - data_out_0_valid_1 <= data_out_0_valid_0; - data_out_0_valid_2 <= data_out_0_valid_1; - data_out_0_valid_3 <= data_out_0_valid_2; - data_out_0_valid_4 <= data_out_0_valid_3; - - end - - fixed_rounding #( - 
.IN_SIZE(DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1), - .IN_WIDTH(DATA_INTERMEDIATE_0_PRECISION_0), - .IN_FRAC_WIDTH(DATA_INTERMEDIATE_0_PRECISION_1), - .OUT_WIDTH(DATA_OUT_0_PRECISION_0), - .OUT_FRAC_WIDTH(DATA_OUT_0_PRECISION_1) - ) data_out_cast ( - .data_in (inter_quotient4), - .data_out(data_out_0) - ); - - join2 #() output_handshake_split ( - .data_in_valid ({ff_exp_data_valid, ff_acc_valid}), - .data_in_ready ({ff_exp_data_ready, ff_acc_ready}), - .data_out_valid(data_out_0_valid_0), - .data_out_ready(data_out_0_ready) + // join2 #() output_handshake_split ( + // .data_in_valid ({ff_exp_data_valid, ff_acc_valid}), + // .data_in_ready ({ff_exp_data_ready, ff_acc_ready}), + // .data_out_valid(data_out_0_valid), + // .data_out_ready(data_out_0_ready) + // ); + fixed_div #( + .IN_NUM(DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1), + .DIVIDEND_WIDTH(DATA_EXP_0_PRECISION_0 + DATA_OUT_0_PRECISION_1), + .DIVISOR_WIDTH(ACC_WIDTH), + .QUOTIENT_WIDTH(DATA_OUT_0_PRECISION_0 + DATA_OUT_0_PRECISION_1), + .FIFO_DEPTH(DATA_OUT_0_TENSOR_SIZE_DIM_0 * DATA_OUT_0_TENSOR_SIZE_DIM_1 / (DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1)) + ) div_inst ( + .clk(clk), + .rst(rst), + .dividend_data(extended_divisor), + .dividend_data_valid(ff_exp_data_valid), + .dividend_data_ready(ff_exp_data_ready), + .divisor_data(ff_accumulated_exp_data_dup), + .divisor_data_valid(ff_acc_valid), + .divisor_data_ready(ff_acc_ready), + .quotient_data(extended_quotient), + .quotient_data_valid(data_out_0_valid), + .quotient_data_ready(data_out_0_ready) ); - - assign data_out_0_valid = data_out_0_valid_4; -endmodule - -/* verilator lint_off DECLFILENAME */ - -module hold_buffer #( - parameter DATA_WIDTH = 16, - parameter DATA_SIZE = 4, - parameter DEPTH = 1 -) ( - input rst, - input clk, - - input logic [DATA_WIDTH - 1:0] data_in[DATA_SIZE - 1:0], - input logic data_in_valid, - output logic data_in_ready, - - output logic [DATA_WIDTH - 1:0] data_out[DATA_SIZE - 
1:0], - output logic data_out_valid, - input logic data_out_ready -); - - logic [$clog2(DEPTH) : 0] count; - logic [ DATA_WIDTH - 1:0] data_out_register[DATA_SIZE - 1:0]; - assign data_out = data_out_register; - always_ff @(posedge clk) begin - if (rst) begin - count <= 0; - // data_out_register <= 0; - data_out_valid <= 0; - data_in_ready <= 1; - end else begin - if (count == 0) begin - // The buffer is empty - if (data_in_valid) begin - data_out_register <= data_in; - count <= DEPTH; - data_out_valid <= 1; - data_in_ready <= 0; - end else begin - data_in_ready <= data_out_ready; - data_out_valid <= 0; - end - end else begin - // The buffer has data - if (data_out_ready) begin - count <= count - 1; - end else begin - count <= count; - end - end - end - end - - // take an input and output it for depth length preventing further input from entering. endmodule diff --git a/src/mase_components/activation_layers/rtl/fixed_tanh.sv b/src/mase_components/activation_layers/rtl/fixed_tanh.sv index 4cb3e044f..5492e24ec 100644 --- a/src/mase_components/activation_layers/rtl/fixed_tanh.sv +++ b/src/mase_components/activation_layers/rtl/fixed_tanh.sv @@ -2,21 +2,21 @@ module fixed_tanh #( /* verilator lint_off UNUSEDPARAM */ - parameter DATA_IN_0_PRECISION_0 = 16, //total number of bits used to represent each input data - parameter DATA_IN_0_PRECISION_1 = 8, //fractional bits - parameter DATA_IN_0_PRECISION_INT = DATA_IN_0_PRECISION_0 - DATA_IN_0_PRECISION_1, //number of integer bits - - parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 8, //total input data per tensor along dim 0 - parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 1, //total input data per tensor along dim 1 - parameter DATA_IN_0_PARALLELISM_DIM_0 = 1, //input data along dim 0 coming in parallel in the same clock cycle - parameter DATA_IN_0_PARALLELISM_DIM_1 = 1, //input data along dim 1 coming in parallel in the same clock cycle - - parameter DATA_OUT_0_PRECISION_0 = 16, //total number of bits used to represent each output data. 
Typically needs only (2 + fractional) bits since tanh varies between +/-1. - parameter DATA_OUT_0_PRECISION_1 = 8, //fractional bits. Output of the module is rounded to satisfy this value - parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = 8, //total output data per tensor along dim 0 - parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = 1, //total output data per tensor along dim 1 - parameter DATA_OUT_0_PARALLELISM_DIM_0 = 1, //output data along dim 0 going out in parallel in the same clock cycle - parameter DATA_OUT_0_PARALLELISM_DIM_1 = 1 //output data along dim 1 going out in parallel in the same clock cycle + parameter DATA_IN_0_PRECISION_0 = 16, //total number of bits used to represent each input data + parameter DATA_IN_0_PRECISION_1 = 8, //fractional bits + parameter DATA_IN_0_PRECISION_INT = DATA_IN_0_PRECISION_0 - DATA_IN_0_PRECISION_1, //number of integer bits + + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 8, //total input data per tensor along dim 0 + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 1, //total input data per tensor along dim 1 + parameter DATA_IN_0_PARALLELISM_DIM_0 = 1, //input data along dim 0 coming in parallel in the same clock cycle + parameter DATA_IN_0_PARALLELISM_DIM_1 = 1, //input data along dim 1 coming in parallel in the same clock cycle + + parameter DATA_OUT_0_PRECISION_0 = 16, //total number of bits used to represent each output data. Typically needs only (2 + fractional) bits since tanh varies between +/-1. + parameter DATA_OUT_0_PRECISION_1 = 8, //fractional bits. 
Output of the module is rounded to satisfy this value + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = 8, //total output data per tensor along dim 0 + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = 1, //total output data per tensor along dim 1 + parameter DATA_OUT_0_PARALLELISM_DIM_0 = 1, //output data along dim 0 going out in parallel in the same clock cycle + parameter DATA_OUT_0_PARALLELISM_DIM_1 = 1 //output data along dim 1 going out in parallel in the same clock cycle ) ( /* verilator lint_off UNUSEDSIGNAL */ @@ -35,15 +35,15 @@ module fixed_tanh #( data_out_valid1, data_out_valid2, data_out_valid3, - data_out_valid4; //used to store delayed version of input data valid which is given as output datavalid + data_out_valid4; //used to store delayed version of input data valid which is given as output datavalid - //constants a and b that divides the input range. Stored with 32 bit precision. However, they are rounded to the input precision once specified. + //constants a and b that divides the input range. Stored with 32 bit precision. However, they are rounded to the input precision once specified. const logic signed [33 : 0] a = 34'b0110000101000111101011100001010001; const logic signed [34 : 0] b = 35'b01010010001111010111000010100011110; logic signed [DATA_IN_0_PRECISION_0-1:0] a_fixed, b_fixed; - //rounding a to input precision + //rounding a to input precision fixed_round #( .IN_WIDTH(34), .IN_FRAC_WIDTH(32), @@ -54,7 +54,7 @@ module fixed_tanh #( .data_out(a_fixed) ); - //rounding b to input precision + //rounding b to input precision fixed_round #( .IN_WIDTH(35), .IN_FRAC_WIDTH(32), @@ -65,18 +65,18 @@ module fixed_tanh #( .data_out(b_fixed) ); - //constants for polynomial approximation. 16 bit fractional precision is used. c1 is 1. Hence not stored. + //constants for polynomial approximation. 16 bit fractional precision is used. c1 is 1. Hence not stored. 
const logic signed [16 : 0] m1 = 17'b11011101001110111; const logic signed [16 : 0] d1 = 17'b00000010000011000; const logic signed [16 : 0] m2 = 17'b11110101001001100; const logic signed [16 : 0] c2 = 17'b00110110100110001; const logic signed [16 : 0] d2 = 17'b00111001110101111; - //generating computation block for each parallel input data + //generating computation block for each parallel input data for ( genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1; i++ ) begin : tanh - // Local variables for computation + // Local variables for computation logic signed [DATA_IN_0_PRECISION_0-1:0] data_in1; logic signed [DATA_IN_0_PRECISION_0-1:0] data_in2; logic signed [DATA_IN_0_PRECISION_0-1:0] data_in3; @@ -95,20 +95,20 @@ module fixed_tanh #( assign x_abs = ($signed( data_in_0[i] - ) >= 0) ? data_in_0[i] : -data_in_0[i]; //calculation of absolute value + ) >= 0) ? data_in_0[i] : -data_in_0[i]; //calculation of absolute value assign x_abs_dum = x_abs; - assign x_squared = x_abs * x_abs; //squaring of absolute value + assign x_squared = x_abs * x_abs; //squaring of absolute value always_ff @(posedge clk) begin - if (rst) begin //reset conditions + if (rst) begin //reset conditions term0 <= 0; term1 <= 0; term2 <= 0; temp_result <= 0; - end - else if (data_out_0_ready && (data_in_0_valid ||data_out_valid1||data_out_valid2)) begin //Calculation of polynomial approximation.Computation is performed in two pipelined stages + end + else if (data_out_0_ready && (data_in_0_valid ||data_out_valid1||data_out_valid2)) begin //Calculation of polynomial approximation.Computation is performed in two pipelined stages if (x_abs_dum <= a_fixed) begin term0 <= 0; end else if (x_abs_dum <= b_fixed) begin @@ -155,7 +155,7 @@ module fixed_tanh #( data_in3 <= data_in2; end - //rounding of the output result + //rounding of the output result fixed_round #( .IN_WIDTH(2 * DATA_IN_0_PRECISION_0 + 17), .IN_FRAC_WIDTH(2 * DATA_IN_0_PRECISION_1 + 16), @@ -165,7 +165,7 @@ 
module fixed_tanh #( .data_in (temp_result), .data_out(temp_out) ); - //assigning the output with sign based on sign of the input. + //assigning the output with sign based on sign of the input. assign data_out_0[i] = (data_in3 >= 0) ? temp_out : -temp_out; end diff --git a/src/mase_components/activation_layers/rtl/softermax_lpw_reciprocal.sv b/src/mase_components/activation_layers/rtl/softermax_lpw_reciprocal.sv index 8d0fd6d60..64df3290b 100644 --- a/src/mase_components/activation_layers/rtl/softermax_lpw_reciprocal.sv +++ b/src/mase_components/activation_layers/rtl/softermax_lpw_reciprocal.sv @@ -41,6 +41,9 @@ module softermax_lpw_reciprocal #( // Parameters // ----- + // let max(a, b) = (a > b) ? a : b; + // This is not syntheable + localparam ENTRIES_WIDTH = $clog2(ENTRIES); // Range reduced num: x diff --git a/src/mase_components/activation_layers/test/fixed_softmax_tb.py b/src/mase_components/activation_layers/test/fixed_softmax_tb.py index 62f37012f..b7c725ac0 100644 --- a/src/mase_components/activation_layers/test/fixed_softmax_tb.py +++ b/src/mase_components/activation_layers/test/fixed_softmax_tb.py @@ -1,220 +1,191 @@ #!/usr/bin/env python3 +import os import pytest -import os, logging -from . 
import generate_memory -import pdb -from bitstring import BitArray -import cocotb -from functools import partial -from cocotb.triggers import * -from chop.nn.quantizers import integer_quantizer -from mase_cocotb.testbench import Testbench -from mase_cocotb.interfaces.streaming import ( - StreamDriver, - StreamMonitor, - StreamMonitorFloat, -) -from mase_cocotb.z_qlayers import quantize_to_int -from mase_cocotb.runner import mase_runner -from mase_cocotb.utils import bit_driver, sign_extend_t -from math import ceil - -# from chop.passes.graph.transforms.quantize.quantized_modules import LinearInteger import torch +import logging +from functools import partial +from mase_components.helper import generate_memory +from pathlib import Path +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer -logger = logging.getLogger("testbench") -logger.setLevel(logging.INFO) - - -def split_and_flatten_2d_tensor(input_tensor, row_block_size, col_block_size): - rows, cols = input_tensor.size() - - num_row_blocks = rows // row_block_size - num_col_blocks = cols // col_block_size +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import StreamDriver, StreamMonitor +from mase_cocotb.runner import mase_runner +from mase_cocotb.utils import fixed_preprocess_tensor - reshaped_tensor = input_tensor.view( - num_row_blocks, row_block_size, num_col_blocks, col_block_size - ) - reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous() - flattened_tensor = reshaped_tensor.view(-1, row_block_size * col_block_size) - return flattened_tensor +from mase_cocotb.utils import bit_driver +from chop.nn.quantized.functional import softmax_integer -class fixed_softmax_tb(Testbench): - def __init__(self, module, dut, dut_params, float_test=False) -> None: +class SoftmaxTB(Testbench): + def __init__(self, dut) -> None: super().__init__(dut, dut.clk, dut.rst) - self.data_width = dut_params["DATA_IN_0_PRECISION_0"] - self.frac_width = 
dut_params["DATA_IN_0_PRECISION_1"] - - self.outputwidth = dut_params["DATA_OUT_0_PRECISION_0"] - self.outputfracw = dut_params["DATA_OUT_0_PRECISION_1"] - - self.num_in_features = dut_params["DATA_IN_0_TENSOR_SIZE_DIM_0"] - self.num_in_batches = dut_params["DATA_IN_0_TENSOR_SIZE_DIM_1"] + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + self.log.setLevel(logging.DEBUG) - self.size_in_feature_blocks = dut_params["DATA_IN_0_PARALLELISM_DIM_0"] - self.size_in_batch_blocks = dut_params["DATA_IN_0_PARALLELISM_DIM_1"] - - self.num_in_feature_splits = int( - ceil(self.num_in_features / self.size_in_feature_blocks) - ) - self.num_in_batch_splits = int( - ceil(self.num_in_batches / self.size_in_batch_blocks) + self.in_data_driver = StreamDriver( + dut.clk, dut.data_in_0, dut.data_in_0_valid, dut.data_in_0_ready ) - self.num_out_features = dut_params["DATA_OUT_0_TENSOR_SIZE_DIM_0"] - self.num_out_batches = dut_params["DATA_OUT_0_TENSOR_SIZE_DIM_1"] - - self.size_out_feature_blocks = dut_params["DATA_OUT_0_PARALLELISM_DIM_0"] - self.size_out_batch_blocks = dut_params["DATA_OUT_0_PARALLELISM_DIM_1"] - - self.num_out_feature_splits = int( - ceil(self.num_out_features / self.size_out_feature_blocks) + self.out_data_monitor = StreamMonitor( + dut.clk, + dut.data_out_0, + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True, ) - self.num_out_batch_splits = int( - ceil(self.num_out_batches / self.size_out_batch_blocks) + # Model + self.model = partial( + softmax_integer, + config={ + "data_in_width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "data_in_frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + "data_in_exp_width": self.get_parameter("DATA_EXP_0_PRECISION_0"), + "data_in_exp_frac_width": self.get_parameter("DATA_EXP_0_PRECISION_1"), + "data_out_frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + "mult_data": CONSTANT_MULT, + }, + dim=-1, + floor=True, ) - self.data_in_0_driver = StreamDriver( - dut.clk, 
dut.data_in_0, dut.data_in_0_valid, dut.data_in_0_ready - ) + # Set verbosity of driver and monitor loggers to debug + self.in_data_driver.log.setLevel(logging.DEBUG) + self.out_data_monitor.log.setLevel(logging.DEBUG) - if float_test: - self.data_out_0_monitor = StreamMonitorFloat( - dut.clk, - dut.data_out_0, - dut.data_out_0_valid, - dut.data_out_0_ready, - self.outputwidth, - self.outputfracw, - ) - else: - self.data_out_0_monitor = StreamMonitor( - dut.clk, dut.data_out_0, dut.data_out_0_valid, dut.data_out_0_ready + def generate_inputs(self): + return torch.randn( + ( + self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_1"), + self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), ) - - self.in_dquantizer = partial( - integer_quantizer, - width=self.data_width, - frac_width=self.frac_width, - is_signed=True, ) - self.out_dquantizer = partial( - integer_quantizer, - width=self.outputwidth, - frac_width=self.outputfracw, - is_signed=True, - ) + async def run_test(self, batches, us): + await self.reset() + self.log.info(f"Reset finished") + + for _ in range(batches): + inputs = self.generate_inputs() + exp_out = self.model(inputs) + + # * Load the inputs driver + self.log.info(f"Processing inputs: {inputs}") + inputs = fixed_preprocess_tensor( + tensor=inputs, + q_config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], + floor=True, + ) + self.in_data_driver.load_driver(inputs) + + # * Load the output monitor + self.log.info(f"Processing outputs: {exp_out}") + outs = fixed_preprocess_tensor( + tensor=exp_out, + q_config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ], + 
) + self.out_data_monitor.load_monitor(outs) - self.model = module + await Timer(us, units="us") + assert self.out_data_monitor.exp_queue.empty() - self.real_in_tensor = torch.randn(self.num_in_batches, self.num_in_features) - self.quant_in_tensor = self.in_dquantizer(self.real_in_tensor) - self.real_out_tensor = self.model(self.quant_in_tensor) - logger.info(f"REAL IN TENSOR: \n{self.real_in_tensor}") - logger.info(f"REAL OUT TENSOR: \n{self.real_out_tensor}") +@cocotb.test() +async def single_test(dut): + tb = SoftmaxTB(dut) + tb.out_data_monitor.ready.value = 1 + await tb.run_test(batches=50, us=100) - def exp(self): - # Run the model with the provided inputs and return the expected integer outputs in the format expected by the monitor - m = split_and_flatten_2d_tensor( - self.real_out_tensor, - self.size_out_batch_blocks, - self.size_out_feature_blocks, - ) # match output - logger.info(f"EXP - FLOAT OUTPUT: \n{m}") - m = self.out_dquantizer(m) - m2 = (m * 2**self.outputfracw).to(torch.int64) - m2 = m2.clone().detach() % (2**self.outputwidth) - return m2 +# @cocotb.test() +# async def repeated_mult(dut): +# tb = SoftmaxTB(dut) +# tb.out_data_monitor.ready.value = 1 +# await tb.run_test(batches=100, us=2000) - def generate_inputs(self): - # Generate the integer inputs for the DUT in the format expected by the driver - inputs = split_and_flatten_2d_tensor( - self.real_in_tensor, self.size_in_batch_blocks, self.size_in_feature_blocks - ) - logger.info(f"FLOAT INPUT: \n{inputs}") - inputs = self.in_dquantizer(inputs) - intinp = (inputs * 2**self.frac_width).to(torch.int64) - return intinp, inputs - - def doubletofx(self, num, data_width, f_width, type="bin"): - assert type == "bin" or type == "hex", "type can only be: 'hex' or 'bin'" - intnum = int(num * 2 ** (f_width)) - intbits = BitArray(int=intnum, length=data_width) - return str(intbits.bin) if type == "bin" else str(intbits) - - async def run_test(self): - await self.reset() - logger.info(f"Reset finished") - 
self.data_out_0_monitor.ready.value = 1 - for i in range(1): - inputs, real_tensor = self.generate_inputs() - exp_out = self.exp() - inputs = inputs.tolist() - exp_out = exp_out.tolist() - logger.info("Inputs and expected generated") - logger.info(f"DUT IN: {inputs}") - logger.info(f"DUT EXP OUT: {exp_out}") - self.data_in_0_driver.load_driver(inputs) - self.data_out_0_monitor.load_monitor(exp_out) - - await Timer(1000, units="us") - assert self.data_out_0_monitor.exp_queue.empty() +# @cocotb.test() +# async def repeated_mult_backpressure(dut): +# tb = SoftmaxTB(dut) +# cocotb.start_soon(bit_driver(dut.data_out_0_ready, dut.clk, 0.6)) +# await tb.run_test(batches=10, us=500) -@cocotb.test() -async def cocotb_test(dut): - in_data_width = dut_params["DATA_IN_0_PRECISION_0"] - in_frac_width = dut_params["DATA_IN_0_PRECISION_1"] - out_data_width = dut_params["DATA_OUT_0_PRECISION_0"] - out_frac_width = dut_params["DATA_OUT_0_PRECISION_1"] - inter_data_width = dut_params["DATA_INTERMEDIATE_0_PRECISION_0"] - inter_frac_width = dut_params["DATA_INTERMEDIATE_0_PRECISION_1"] - # generate_memory.generate_sv_lut("exp", in_data_width, in_frac_width, inter_data_width, inter_frac_width) - # print("Generated memory") - tb = fixed_softmax_tb(torch.nn.Softmax(), dut, dut_params, float_test=True) - await tb.run_test() +# @cocotb.test() +# async def repeated_mult_valid_backpressure(dut): +# tb = SoftmaxTB(dut) +# tb.in_data_driver.set_valid_prob(0.7) +# cocotb.start_soon(bit_driver(dut.data_out_0_ready, dut.clk, 0.6)) +# await tb.run_test(batches=50, us=200) dut_params = { - "DATA_IN_0_TENSOR_SIZE_DIM_0": 12, - "DATA_IN_0_TENSOR_SIZE_DIM_1": 4, - "DATA_IN_0_PARALLELISM_DIM_0": 6, - "DATA_IN_0_PARALLELISM_DIM_1": 2, "DATA_IN_0_PRECISION_0": 8, "DATA_IN_0_PRECISION_1": 4, - "DATA_OUT_0_PRECISION_0": 8, - "DATA_OUT_0_PRECISION_1": 4, - "DATA_OUT_0_TENSOR_SIZE_DIM_0": 12, - "DATA_OUT_0_TENSOR_SIZE_DIM_1": 4, - "DATA_OUT_0_PARALLELISM_DIM_0": 6, - "DATA_OUT_0_PARALLELISM_DIM_1": 2, - 
"DATA_INTERMEDIATE_0_PRECISION_0": 12, - "DATA_INTERMEDIATE_0_PRECISION_1": 8, + "DATA_IN_0_TENSOR_SIZE_DIM_0": 32, + "DATA_IN_0_TENSOR_SIZE_DIM_1": 1, + "DATA_IN_0_PARALLELISM_DIM_0": 1, + "DATA_IN_0_PARALLELISM_DIM_1": 1, + "DATA_EXP_0_PRECISION_0": 8, + "DATA_EXP_0_PRECISION_1": 4, + "DATA_OUT_0_PRECISION_1": 6, } + +def get_fixed_softmax_config(kwargs={}): + config = dut_params + config.update(kwargs) + return config + + torch.manual_seed(1) +CONSTANT_MULT = 0.19 @pytest.mark.dev -def test_fixed_softmax(): - # generate_memory.generate_sv_lut("exp", dut_params["DATA_IN_0_PRECISION_0"], dut_params["DATA_IN_0_PRECISION_1"]) +def test_fixed_softmax_smoke(): + """ + Some quick tests to check if the module is working. + """ + path = Path(__file__).parents[1] / "rtl" generate_memory.generate_sv_lut( "exp", dut_params["DATA_IN_0_PRECISION_0"], dut_params["DATA_IN_0_PRECISION_1"], - dut_params["DATA_INTERMEDIATE_0_PRECISION_0"], - dut_params["DATA_INTERMEDIATE_0_PRECISION_1"], + dut_params["DATA_EXP_0_PRECISION_0"], + dut_params["DATA_EXP_0_PRECISION_1"], + path=path, + constant_mult=CONSTANT_MULT, + floor=True, + ) + mase_runner( + trace=True, + module_param_list=[ + get_fixed_softmax_config(), + ], + # sim="questa", + # skip_build=True, ) - print("Generated memory") - mase_runner(module_param_list=[dut_params]) if __name__ == "__main__": - test_fixed_softmax() + test_fixed_softmax_smoke() diff --git a/src/mase_components/cast/rtl/fixed_round.sv b/src/mase_components/cast/rtl/fixed_round.sv index a49b43001..2a5291053 100644 --- a/src/mase_components/cast/rtl/fixed_round.sv +++ b/src/mase_components/cast/rtl/fixed_round.sv @@ -24,12 +24,13 @@ module fixed_round #( logic carry_in, input_sign; assign input_sign = data_in[IN_WIDTH-1]; assign input_data = (input_sign) ? 
~(data_in[IN_WIDTH-2:0] - 1) : data_in[IN_WIDTH-2:0]; - /* verilator lint_off SELRANGE */ + logic [IN_WIDTH + OUT_FRAC_WIDTH - 1:0] lsb_check; + assign lsb_check = {input_data, {(OUT_FRAC_WIDTH) {1'b0}}}; always_comb begin - lsb_below[2] = (IN_FRAC_WIDTH >= OUT_FRAC_WIDTH) ? input_data[IN_FRAC_WIDTH-OUT_FRAC_WIDTH] : 0; - lsb_below[1] = (IN_FRAC_WIDTH-1 >= OUT_FRAC_WIDTH) ? input_data[IN_FRAC_WIDTH-OUT_FRAC_WIDTH-1] : 0; - // lsb_below[0] = (IN_FRAC_WIDTH-2 >= OUT_FRAC_WIDTH) ? |(input_data[IN_FRAC_WIDTH-OUT_FRAC_WIDTH-2:0]): 0; - lsb_below[0] = '0; // to do: fix + lsb_below[2] = (IN_FRAC_WIDTH >= OUT_FRAC_WIDTH) ? lsb_check[IN_FRAC_WIDTH] : 0; + lsb_below[1] = (IN_FRAC_WIDTH-1 >= OUT_FRAC_WIDTH) ? lsb_check[IN_FRAC_WIDTH-1] : 0; + lsb_below[0] = (IN_FRAC_WIDTH-2 >= OUT_FRAC_WIDTH) ? |(lsb_check[IN_FRAC_WIDTH-2:0]): 0; + // lsb_below[0] = '0; // to do: fix end always_comb begin if ((IN_FRAC_WIDTH - OUT_FRAC_WIDTH) >= 0) diff --git a/src/mase_components/cast/rtl/fixed_rounding.sv b/src/mase_components/cast/rtl/fixed_rounding.sv index 65a0709eb..dc2fd38cf 100644 --- a/src/mase_components/cast/rtl/fixed_rounding.sv +++ b/src/mase_components/cast/rtl/fixed_rounding.sv @@ -11,15 +11,14 @@ module fixed_rounding #( output [OUT_WIDTH - 1:0] data_out[IN_SIZE - 1:0] ); for (genvar i = 0; i < IN_SIZE; i++) begin : parallel_round - fixed_signed_cast #( + fixed_round #( .IN_WIDTH(IN_WIDTH), .IN_FRAC_WIDTH(IN_FRAC_WIDTH), .OUT_WIDTH(OUT_WIDTH), - .OUT_FRAC_WIDTH(OUT_FRAC_WIDTH), - .ROUND_FLOOR(1) + .OUT_FRAC_WIDTH(OUT_FRAC_WIDTH) ) fr_inst ( - .in_data (data_in[i]), - .out_data(data_out[i]) + .data_in (data_in[i]), + .data_out(data_out[i]) ); end diff --git a/src/mase_components/common/rtl/comparator_tree.sv b/src/mase_components/common/rtl/comparator_tree.sv index c6599dd7f..4afe5f7c3 100644 --- a/src/mase_components/common/rtl/comparator_tree.sv +++ b/src/mase_components/common/rtl/comparator_tree.sv @@ -35,6 +35,7 @@ module comparator_tree #( logic [DATA_WIDTH-1:0] 
data[(2**(LEVELS-level))-1:0]; logic valid; logic ready; + if (level == 0) assign data = in_data; end @@ -103,7 +104,7 @@ module comparator_tree #( end // Connect up first and last layer wires - assign vars[0].data = in_data; + // assign vars[0].data = in_data; assign vars[0].valid = in_valid; assign in_ready = vars[0].ready; diff --git a/src/mase_components/common/rtl/fork2.sv b/src/mase_components/common/rtl/fork2.sv new file mode 100644 index 000000000..c7cc13673 --- /dev/null +++ b/src/mase_components/common/rtl/fork2.sv @@ -0,0 +1,83 @@ +`timescale 1ns / 1ps + +module fork2 #( + parameter DATA_IN_0_PRECISION_0 = 8, + parameter DATA_IN_0_PRECISION_1 = 3, + parameter DATA_OUT_0_PRECISION_0 = 8, + parameter DATA_OUT_0_PRECISION_1 = 3, + parameter DATA_OUT_1_PRECISION_0 = 8, + parameter DATA_OUT_1_PRECISION_1 = 3, + + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = -1, + parameter DATA_IN_0_PARALLELISM_DIM_0 = -1, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = -1, + parameter DATA_IN_0_PARALLELISM_DIM_1 = -1, + + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = -1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = -1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = -1, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = -1, + + parameter DATA_OUT_1_TENSOR_SIZE_DIM_0 = -1, + parameter DATA_OUT_1_PARALLELISM_DIM_0 = -1, + parameter DATA_OUT_1_TENSOR_SIZE_DIM_1 = -1, + parameter DATA_OUT_1_PARALLELISM_DIM_1 = -1, + + parameter DATA_OUT_1_FIFO_DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 * DATA_IN_0_TENSOR_SIZE_DIM_1 / (DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1), + parameter DATA_OUT_0_FIFO_DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 * DATA_IN_0_TENSOR_SIZE_DIM_1 / (DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1) +) ( + input logic clk, + input logic rst, + + input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + + output logic [DATA_IN_0_PRECISION_0-1:0] data_out_0 
[DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + output logic data_out_0_valid, + input logic data_out_0_ready, + + output logic [DATA_IN_0_PRECISION_0-1:0] data_out_1 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + output logic data_out_1_valid, + input logic data_out_1_ready +); +// logic buffered_data_out_1_valid, buffered_data_out_0_valid; +// logic buffered_data_out_1_ready, buffered_data_out_0_ready; + + split2 #() split2_inst ( + .data_out_valid({data_out_1_valid, data_out_0_valid}), + .data_out_ready({data_out_1_ready, data_out_0_ready}), + .data_in_valid (data_in_0_valid), + .data_in_ready (data_in_0_ready) + ); + assign data_out_0 = data_in_0; + assign data_out_1 = data_in_0; +// unpacked_fifo #( +// .DEPTH(DATA_OUT_0_FIFO_DEPTH), +// .DATA_WIDTH(DATA_IN_0_PRECISION_0), +// .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1) +// ) data_out_0_buffer ( +// .clk(clk), +// .rst(rst), +// .data_in(data_in_0), +// .data_in_valid(buffered_data_out_0_valid), +// .data_in_ready(buffered_data_out_0_ready), // write enable +// .data_out(data_out_0), +// .data_out_valid(data_out_0_valid), +// .data_out_ready(data_out_0_ready) // read enable +// ); +// unpacked_fifo #( +// .DEPTH(DATA_OUT_1_FIFO_DEPTH), +// .DATA_WIDTH(DATA_IN_0_PRECISION_0), +// .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1) +// ) data_out_1_buffer ( +// .clk(clk), +// .rst(rst), +// .data_in(data_in_0), +// .data_in_valid(buffered_data_out_1_valid), +// .data_in_ready(buffered_data_out_1_ready), // write enable +// .data_out(data_out_1), +// .data_out_valid(data_out_1_valid), +// .data_out_ready(data_out_1_ready) // read enable +// ); +endmodule diff --git a/src/mase_components/common/rtl/single_element_repeat.sv b/src/mase_components/common/rtl/single_element_repeat.sv index 9d1b32822..9cf01ce4b 100644 --- a/src/mase_components/common/rtl/single_element_repeat.sv +++ b/src/mase_components/common/rtl/single_element_repeat.sv @@ 
-81,7 +81,7 @@ module single_element_repeat #( end - skid_buffer #( + register_slice #( .DATA_WIDTH(DATA_WIDTH) ) output_buffer ( .clk(clk), diff --git a/src/mase_components/common/rtl/split2_with_data.sv b/src/mase_components/common/rtl/split2_with_data.sv new file mode 100644 index 000000000..a0683564b --- /dev/null +++ b/src/mase_components/common/rtl/split2_with_data.sv @@ -0,0 +1,49 @@ +/* +Module : split2_width_data +Description : This module implements a 1-to-2 streaming interface handshake. +*/ + +`timescale 1ns / 1ps +module split2_with_data #( + parameter DATA_WIDTH = -1, + parameter FIFO_DEPTH = -1 +) ( + input logic clk, + input logic rst, + input logic [DATA_WIDTH - 1:0] data_in, + input logic data_in_valid, + output logic data_in_ready, + + output logic [DATA_WIDTH - 1:0] fifo_data_out, + output logic fifo_data_out_valid, + input logic fifo_data_out_ready, + + output logic [DATA_WIDTH - 1:0] straight_data_out, + output logic straight_data_out_valid, + input logic straight_data_out_ready +); + logic fifo_in_valid, fifo_in_ready; + split2 #() data_out_n_split_i ( + .data_in_valid (data_in_valid), + .data_in_ready (data_in_ready), + .data_out_valid({fifo_in_valid, straight_data_out_valid}), + .data_out_ready({fifo_in_ready, straight_data_out_ready}) + ); + fifo #( + .DEPTH(FIFO_DEPTH), + .DATA_WIDTH(DATA_WIDTH) + ) ff_inst ( + .clk(clk), + .rst(rst), + .in_data(data_in), + .in_valid(fifo_in_valid), + .in_ready(fifo_in_ready), + .out_data(fifo_data_out), + .out_valid(fifo_data_out_valid), + .out_ready(fifo_data_out_ready), + .empty(), + .full() + ); + assign straight_data_out = data_in; + +endmodule diff --git a/src/mase_components/common/rtl/unpacked_split2_with_data.sv b/src/mase_components/common/rtl/unpacked_split2_with_data.sv new file mode 100644 index 000000000..a4d3b0301 --- /dev/null +++ b/src/mase_components/common/rtl/unpacked_split2_with_data.sv @@ -0,0 +1,59 @@ +module unpacked_split2_with_data #( + parameter DEPTH = 8, + parameter DATA_WIDTH 
= 8, + parameter IN_SIZE = 8 +) ( + input clk, + input rst, + // Input interface + input [DATA_WIDTH-1:0] data_in[IN_SIZE - 1:0], + input logic data_in_valid, + output logic data_in_ready, + // FIFO output interface + output [DATA_WIDTH-1:0] fifo_data_out[IN_SIZE - 1:0], + output logic fifo_data_out_valid, + input logic fifo_data_out_ready, + // Straight output interface + output [DATA_WIDTH-1:0] straight_data_out[IN_SIZE - 1:0], + output logic straight_data_out_valid, + input logic straight_data_out_ready +); + // Flatten the input data + logic [DATA_WIDTH * IN_SIZE - 1:0] data_in_flatten; + logic [DATA_WIDTH * IN_SIZE - 1:0] fifo_data_out_flatten; + logic [DATA_WIDTH * IN_SIZE - 1:0] straight_data_out_flatten; + + // Input flattening + for (genvar i = 0; i < IN_SIZE; i++) begin : reshape + assign data_in_flatten[i*DATA_WIDTH+DATA_WIDTH-1:i*DATA_WIDTH] = data_in[i]; + end + + // Split2 instance + split2_with_data #( + .DATA_WIDTH(DATA_WIDTH * IN_SIZE), + .FIFO_DEPTH(DEPTH) + ) split2_with_data_i ( + .clk(clk), + .rst(rst), + .data_in(data_in_flatten), + .data_in_valid(data_in_valid), + .data_in_ready(data_in_ready), + .fifo_data_out(fifo_data_out_flatten), + .fifo_data_out_valid(fifo_data_out_valid), + .fifo_data_out_ready(fifo_data_out_ready), + .straight_data_out(straight_data_out_flatten), + .straight_data_out_valid(straight_data_out_valid), + .straight_data_out_ready(straight_data_out_ready) + ); + + // Unflatten FIFO output + for (genvar i = 0; i < IN_SIZE; i++) begin : unreshape_fifo + assign fifo_data_out[i] = fifo_data_out_flatten[i*DATA_WIDTH+DATA_WIDTH-1:i*DATA_WIDTH]; + end + + // Unflatten straight output + for (genvar i = 0; i < IN_SIZE; i++) begin : unreshape_straight + assign straight_data_out[i] = straight_data_out_flatten[i*DATA_WIDTH+DATA_WIDTH-1:i*DATA_WIDTH]; + end + +endmodule \ No newline at end of file diff --git a/src/mase_components/common/test/test_synth_common.py b/src/mase_components/common/test/test_synth_common.py index 
aa7d4dd79..50b54d052 100644 --- a/src/mase_components/common/test/test_synth_common.py +++ b/src/mase_components/common/test/test_synth_common.py @@ -4,7 +4,7 @@ @pytest.mark.vivado def test_synth_common(): - run_synth("common") + run_synth("common", "comparator_tree.sv") if __name__ == "__main__": diff --git a/src/mase_components/convolution_layers/rtl/convolution.sv b/src/mase_components/convolution_layers/rtl/convolution.sv index 65a9b06c2..2f2b06d1e 100644 --- a/src/mase_components/convolution_layers/rtl/convolution.sv +++ b/src/mase_components/convolution_layers/rtl/convolution.sv @@ -20,7 +20,6 @@ module convolution #( parameter UNROLL_KERNEL_OUT = 4, parameter UNROLL_OUT_C = 2, - parameter SLIDING_NUM = 8, parameter BIAS_SIZE = UNROLL_OUT_C, parameter STRIDE = 1, @@ -29,6 +28,10 @@ module convolution #( parameter PADDING_X = 2, parameter HAS_BIAS = 1, + parameter OUT_Y = (IN_Y - KERNEL_Y + 2 * PADDING_Y + 1) / (STRIDE), + parameter OUT_X = (IN_X - KERNEL_X + 2 * PADDING_X + 1) / (STRIDE), + parameter SLIDING_NUM = OUT_Y * OUT_X, + parameter DATA_OUT_0_PRECISION_0 = 8, parameter DATA_OUT_0_PRECISION_1 = 4 ) ( @@ -77,6 +80,11 @@ module convolution #( logic [DATA_IN_0_PRECISION_0 - 1:0] kernel[KERNEL_Y * KERNEL_X * UNROLL_IN_C - 1:0]; logic kernel_valid; logic kernel_ready; + localparam ROUND_PRECISION_0 = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( + KERNEL_X * KERNEL_Y * IN_C + ); + localparam ROUND_PRECISION_1 = DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1; + logic [ROUND_PRECISION_0 -1:0] round_in[UNROLL_OUT_C-1:0]; sliding_window #( .IMG_WIDTH (IN_X), .IMG_HEIGHT (IN_Y), @@ -89,14 +97,15 @@ module convolution #( .STRIDE (STRIDE) /* verilator lint_off PINMISSING */ ) sw_inst ( + .clk(clk), + .rst(rst), .data_in(packed_data_in), .data_in_valid(data_in_0_valid), .data_in_ready(data_in_0_ready), .data_out(packed_kernel), .data_out_valid(kernel_valid), - .data_out_ready(kernel_ready), - .* + .data_out_ready(kernel_ready) ); /* verilator lint_on 
PINMISSING */ for (genvar i = 0; i < KERNEL_Y * KERNEL_X; i++) @@ -109,21 +118,17 @@ module convolution #( .NUM(ROLL_IN_NUM), .ROLL_NUM(UNROLL_KERNEL_OUT) ) roller_inst ( + .clk(clk), + .rst(rst), .data_in(kernel), .data_in_valid(kernel_valid), .data_in_ready(kernel_ready), .data_out(rolled_k), .data_out_valid(rolled_k_valid), - .data_out_ready(rolled_k_ready), - .* + .data_out_ready(rolled_k_ready) ); - localparam ROUND_PRECISION_0 = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( - KERNEL_X * KERNEL_Y * IN_C - ); - localparam ROUND_PRECISION_1 = DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1; - logic [ROUND_PRECISION_0 -1:0] round_in[UNROLL_OUT_C-1:0]; - convolution_arith #( + convolution_compute_core #( // assume output will only unroll_out_channels .DATA_IN_0_PRECISION_0(DATA_IN_0_PRECISION_0), .DATA_IN_0_PRECISION_1(DATA_IN_0_PRECISION_1), @@ -138,12 +143,21 @@ module convolution #( .OUT_CHANNELS_DEPTH(OUT_C / UNROLL_OUT_C), .WEIGHT_REPEATS(SLIDING_NUM), .HAS_BIAS(HAS_BIAS) - ) convolution_arith_inst ( + ) ccc_inst ( + .clk(clk), + .rst(rst), .data_in_0(rolled_k), .data_in_0_valid(rolled_k_valid), .data_in_0_ready(rolled_k_ready), + .weight(weight), + .weight_valid(weight_valid), + .weight_ready(weight_ready), + .bias(bias), + .bias_valid(bias_valid), + .bias_ready(bias_ready), .data_out_0(round_in), - .* + .data_out_0_valid(data_out_0_valid), + .data_out_0_ready(data_out_0_ready) ); fixed_rounding #( diff --git a/src/mase_components/convolution_layers/rtl/convolution_arith.sv b/src/mase_components/convolution_layers/rtl/convolution_compute_core.sv similarity index 99% rename from src/mase_components/convolution_layers/rtl/convolution_arith.sv rename to src/mase_components/convolution_layers/rtl/convolution_compute_core.sv index 2c3568b1d..8e6d0db7f 100644 --- a/src/mase_components/convolution_layers/rtl/convolution_arith.sv +++ b/src/mase_components/convolution_layers/rtl/convolution_compute_core.sv @@ -1,5 +1,6 @@ +/* verilator lint_off DECLFILENAME */ 
`timescale 1ns / 1ps -module convolution_arith #( +module convolution_compute_core #( // assume output will only unroll_out_channels parameter DATA_IN_0_PRECISION_0 = 16, parameter DATA_IN_0_PRECISION_1 = 3, @@ -107,7 +108,6 @@ module convolution_arith #( // .data_out(data_out_0[i]) // );end endmodule - module simple_convolution_arith #( parameter DATA_IN_0_PRECISION_0 = 16, parameter DATA_IN_0_PRECISION_1 = 3, @@ -119,7 +119,7 @@ module simple_convolution_arith #( parameter ROLL_OUT_NUM = 2, parameter IN_CHANNELS_DEPTH = 4, parameter OUT_CHANNELS_PARALLELISM = 2, - parameter HAS_BIAS, + parameter HAS_BIAS = 1, parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( ROLL_IN_NUM * IN_CHANNELS_DEPTH ), @@ -174,7 +174,7 @@ module simple_convolution_arith #( .DATA_IN_0_PRECISION_0(DATA_IN_0_PRECISION_0), .WEIGHT_PRECISION_0(WEIGHT_PRECISION_0), .DP_SIZE(ROLL_OUT_NUM), - .ACC_DEPTH(ROLL_IN_NUM / ROLL_OUT_NUM * IN_CHANNELS_DEPTH), + .ACC_DEPTH(ROLL_IN_NUM / ROLL_OUT_NUM * IN_CHANNELS_DEPTH) ) dp_acc_inst ( .clk(clk), .rst(rst), diff --git a/src/mase_components/convolution_layers/rtl/padding.sv b/src/mase_components/convolution_layers/rtl/padding.sv index fb336acb3..715396ff3 100644 --- a/src/mase_components/convolution_layers/rtl/padding.sv +++ b/src/mase_components/convolution_layers/rtl/padding.sv @@ -29,7 +29,11 @@ module padding #( .data_out(reg_out), .data_out_valid(reg_out_valid), .data_out_ready(reg_out_ready), - .* + .data_in(data_in), + .data_in_valid(data_in_valid), + .data_in_ready(data_in_ready), + .clk(clk), + .rst(rst) ); logic [C_WIDTH -1:0] count_c; logic [X_WIDTH -1:0] count_x; diff --git a/src/mase_components/convolution_layers/rtl/sliding_window.sv b/src/mase_components/convolution_layers/rtl/sliding_window.sv index 883e54ca6..79734f745 100644 --- a/src/mase_components/convolution_layers/rtl/sliding_window.sv +++ b/src/mase_components/convolution_layers/rtl/sliding_window.sv @@ -148,8 +148,6 @@ module 
sliding_window_buffer #( end end end - - endmodule module sliding_window_stride #( @@ -199,13 +197,17 @@ module sliding_window_stride #( .DATA_WIDTH(DATA_WIDTH), .CHANNELS(CHANNELS) ) buffer ( + .clk (clk), + .rst (rst), + .data_in (data_in), + .data_in_valid (data_in_valid), + .data_in_ready (data_in_ready), + .data_out_valid(buffer_valid), + .data_out_ready(buffer_ready), .data_out (buffer_data), .out_x (buffer_x), .out_y (buffer_y), - .out_c (buffer_c), - .data_out_valid(buffer_valid), - .data_out_ready(buffer_ready), - .* + .out_c (buffer_c) ); // enable stride == 1 logic in_range; @@ -312,10 +314,14 @@ module sliding_window #( .DATA_WIDTH(DATA_WIDTH), .CHANNELS(CHANNELS) ) padding_inst ( + .clk(clk), + .rst(rst), + .data_in(data_in), + .data_in_valid(data_in_valid), + .data_in_ready(data_in_ready), .data_out(padding_in), .data_out_valid(padding_in_valid), - .data_out_ready(padding_in_ready), - .* + .data_out_ready(padding_in_ready) ); sliding_window_stride #( diff --git a/src/mase_components/convolution_layers/rtl/binary_activation_binary_convolution.sv b/src/mase_components/convolution_layers/test/binary_activation_binary_convolution.sv similarity index 100% rename from src/mase_components/convolution_layers/rtl/binary_activation_binary_convolution.sv rename to src/mase_components/convolution_layers/test/binary_activation_binary_convolution.sv diff --git a/src/mase_components/convolution_layers/test/convolution_tb.py b/src/mase_components/convolution_layers/test/convolution_tb.py index 1a3789ed9..51bea7fdd 100644 --- a/src/mase_components/convolution_layers/test/convolution_tb.py +++ b/src/mase_components/convolution_layers/test/convolution_tb.py @@ -269,7 +269,7 @@ async def run_test(self): self.data_out_0_monitor.load_monitor(o) # cocotb.start_soon(check_signal(self.dut, self.log)) - await Timer(100, units="us") + await Timer(1000, units="us") assert self.data_out_0_monitor.exp_queue.empty() @@ -299,16 +299,16 @@ def get_fixed_conv_config(kwargs={}): config 
= { "IN_C": 3, "UNROLL_IN_C": 3, - "IN_X": 3, - "IN_Y": 3, - "KERNEL_X": 3, - "KERNEL_Y": 2, - "UNROLL_KERNEL_OUT": 3, - "OUT_C": 4, - "UNROLL_OUT_C": 2, + "IN_X": 16, + "IN_Y": 16, + "KERNEL_X": 4, + "KERNEL_Y": 4, + "UNROLL_KERNEL_OUT": 4, + "OUT_C": 16, + "UNROLL_OUT_C": 4, "STRIDE": 2, - "PADDING_Y": 1, - "PADDING_X": 2, + "PADDING_Y": 0, + "PADDING_X": 0, "HAS_BIAS": 1, } in_y = config["IN_Y"] @@ -336,6 +336,7 @@ def test_fixed_linear_smoke(): module_param_list=[ get_fixed_conv_config(), ], + sim="questa", ) @@ -357,20 +358,6 @@ def test_fixed_linear_regression(): "OUT_CHANNELS_DEPTH": 96, } ), - # get_fixed_linear_config( - # { - # "HAS_BIAS": 1, - # "WEIGHTS_PRE_TRANSPOSED": 0, - # "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, - # "DATA_IN_0_PARALLELISM_DIM_0": 32, - # "WEIGHT_TENSOR_SIZE_DIM_0": 768, - # "WEIGHT_TENSOR_SIZE_DIM_1": 768, - # "WEIGHT_PARALLELISM_DIM_0": 32, - # "WEIGHT_PARALLELISM_DIM_1": 32, - # "BIAS_TENSOR_SIZE_DIM_0": 768, - # "BIAS_PARALLELISM_DIM_0": 32, - # } - # ), ], ) diff --git a/src/mase_components/convolution_layers/test/test_lint_conv.py b/src/mase_components/convolution_layers/test/test_lint_conv.py index d635b192f..dcb176a21 100644 --- a/src/mase_components/convolution_layers/test/test_lint_conv.py +++ b/src/mase_components/convolution_layers/test/test_lint_conv.py @@ -5,7 +5,7 @@ @pytest.mark.skip(reason="Needs to be fixed.") def test_lint_conv(): - run_lint("conv") + run_lint("convolution_layers") if __name__ == "__main__": diff --git a/src/mase_components/convolution_layers/test/test_synth_conv.py b/src/mase_components/convolution_layers/test/test_synth_conv.py index 448ae4e3f..708e31c97 100644 --- a/src/mase_components/convolution_layers/test/test_synth_conv.py +++ b/src/mase_components/convolution_layers/test/test_synth_conv.py @@ -4,7 +4,7 @@ @pytest.mark.vivado def test_synth_conv(): - run_synth("conv") + run_synth("convolution_layers") if __name__ == "__main__": diff --git a/src/mase_components/deps.py 
b/src/mase_components/deps.py index a06529344..a68c1dbaf 100644 --- a/src/mase_components/deps.py +++ b/src/mase_components/deps.py @@ -21,6 +21,7 @@ "common", "memory", "activation_layers", + "generated_lut", ], "activation_layers/fixed_softsign": [ "common", @@ -37,10 +38,17 @@ "activation_layers/fixed_logsigmoid": ["common", "cast", "activation_layers"], "activation_layers/fixed_softmax": [ "common", + "memory", + "scalar_operators/fixed", "cast", - "fixed_arithmetic", - "conv", + "linear_layers/fixed_operators", + "generated_lut", "activation_layers", + "convolution_layers", + "memory", + "linear_layers/fixed_operators", + "scalar_operators/fixed", + "generated_lut", ], "activation_layers/fixed_softermax_1d": [ "common", @@ -133,6 +141,13 @@ "common", ], # Linear + "linear_layers/fixed_linear_layer/fixed_linear_with_input_circular": [ + "cast", + "common", + "memory", + "linear_layers/fixed_operators", + "scalar_operators/fixed", + ], "linear_layers/fixed_linear_layer/fixed_linear": [ "cast", "common", @@ -258,6 +273,7 @@ "linear_layers/fixed_operators", "common", "memory", + "cast", ], "linear_layers/mxint_operators/mxint_dot_product": [ "linear_layers/mxint_operators", @@ -265,6 +281,34 @@ "common", "memory", ], + "linear_layers/mxint_operators/mxint_range_reduction": [ + "linear_layers/mxint_operators", + "common", + "memory", + "cast", + ], + "linear_layers/mxint_operators/mxint_exp": [ + "linear_layers/mxint_operators", + "common", + "memory", + "cast", + "generated_lut", + ], + "linear_layers/mxint_operators/mxint_softmax": [ + "linear_layers/mxint_operators", + "common", + "memory", + "cast", + "scalar_operators/fixed", + "generated_lut", + ], + "linear_layers/mxint_operators/mxint_addition": [ + "linear_layers/mxint_operators", + "linear_layers/fixed_operators", + "common", + "memory", + "cast", + ], "linear_layers/mxint_operators/mxint_linear": [ "linear_layers/mxint_operators", "linear_layers/fixed_operators", @@ -279,6 +323,33 @@ "memory", "cast", ], 
+ "linear_layers/mxint_operators/mxint_gelu": [ + "linear_layers/mxint_operators", + "linear_layers/fixed_operators", + "common", + "memory", + "cast", + "generated_lut" + ], + "linear_layers/mxint_operators/mxint_vit_attention_head": [ + "linear_layers/mxint_operators", + "linear_layers/fixed_operators", + "linear_layers/matmul", + "common", + "memory", + "cast", + "scalar_operators/fixed", + ], + "linear_layers/mxint_operators/mxint_vit_attention_wrap": [ + "linear_layers/mxint_operators", + "linear_layers/fixed_operators", + "transformer_layers", + "linear_layers/matmul", + "common", + "memory", + "cast", + "scalar_operators/fixed", + ], "linear_layers/mxint_operators/old_linear": [ "linear_layers/mxint_operators", "linear_layers/fixed_operators", @@ -303,11 +374,48 @@ "memory", "cast", ], + "linear_layers/mxint_operators/mxint_patch_embed": [ + "convolution_layers", + "linear_layers/matmul", + "linear_layers/mxint_operators", + "linear_layers/fixed_operators", + "common", + "memory", + "cast", + ], + "linear_layers/mxint_operators/mxint_hardware_round": [ + "linear_layers/mxint_operators", + "common", + "memory", + "cast", + ], "linear_layers/mxint_operators/log2_max_abs": [ "linear_layers/mxint_operators", "common", "memory", ], + "linear_layers/mxint_operators/mxint_layernorm_1d": [ + "common", + "linear_layers/matmul", + "linear_layers/fixed_operators", + "scalar_operators/fixed", + "normalization_layers", + "cast", + "memory", + "generated_lut", + "linear_layers/mxint_operators", + ], + "linear_layers/mxint_operators/mxint_layernorm": [ + "common", + "linear_layers/matmul", + "linear_layers/fixed_operators", + "scalar_operators/fixed", + "normalization_layers", + "cast", + "memory", + "generated_lut", + "linear_layers/mxint_operators", + ], # Memory "memory/skid_buffer": [], "memory/fifo": ["memory"], @@ -316,6 +424,7 @@ "memory/ram_block": [], "memory/unpacked_fifo": ["memory"], "memory/unpacked_skid_buffer": ["memory"], + "memory/weight_source": 
["memory"], # Normalization Layers "normalization_layers/batch_norm_2d": [ "normalization_layers", @@ -343,6 +452,16 @@ "cast", "memory", ], + "normalization_layers/layer_norm_2d": [ + "common", + "linear_layers/matmul", + "linear_layers/fixed_operators", + "scalar_operators/fixed", + "normalization_layers", + "cast", + "memory", + "generated_lut", + ], # Scalar Operators "scalar_operators/fixed/fixed_isqrt": [ "memory", @@ -356,6 +475,12 @@ "scalar_operators/fixed", "linear_layers/fixed_operators", ], + "scalar_operators/fixed/fixed_div": [ + "scalar_operators/fixed", + "memory", + "cast", + "common", + ], # Transformer Layers "transformer_layers/fixed_self_attention": [ "transformer_layers", @@ -399,6 +524,31 @@ ], "arithmetic/mac": ["fixed_arithmetic", "float_arithmetic"], # ViT + "vision_models/vit/fixed_vit_attention_head": [ + "vision_models/attention", + "cast", + "memory", + "common", + "linear_layers/fixed_operators", + "linear_layers/fixed_linear_layer", + "linear_layers/matmul", + "activation_layers", + "scalar_operators/fixed", + "generated_lut", + ], + "vision_models/vit/fixed_vit_attention": [ + "vision_models/vit", + "transformer_layers", + "cast", + "memory", + "common", + "linear_layers/fixed_operators", + "linear_layers/fixed_linear_layer", + "linear_layers/matmul", + "activation_layers", + "scalar_operators/fixed", + "generated_lut", + ], "ViT/fixed_patch_embed": [ "conv", "ViT", diff --git a/src/mase_components/generated_lut/rtl/exp_lut.sv b/src/mase_components/generated_lut/rtl/exp_lut.sv new file mode 100644 index 000000000..c8af7c8bf --- /dev/null +++ b/src/mase_components/generated_lut/rtl/exp_lut.sv @@ -0,0 +1,277 @@ + +`timescale 1ns / 1ps +/* verilator lint_off UNUSEDPARAM */ +module exp_lut #( + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 8, + parameter DATA_OUT_0_PRECISION_0 = 16, + parameter DATA_OUT_0_PRECISION_1 = 8 +) ( + /* verilator lint_off UNUSEDSIGNAL */ + input logic [7:0] data_in_0, + output 
logic [7:0] data_out_0 +); + + + always_comb begin + case (data_in_0) + 8'b00000000: data_out_0 = 8'b00010000; + 8'b00000001: data_out_0 = 8'b00010000; + 8'b00000010: data_out_0 = 8'b00010000; + 8'b00000011: data_out_0 = 8'b00010000; + 8'b00000100: data_out_0 = 8'b00010000; + 8'b00000101: data_out_0 = 8'b00010000; + 8'b00000110: data_out_0 = 8'b00010001; + 8'b00000111: data_out_0 = 8'b00010001; + 8'b00001000: data_out_0 = 8'b00010001; + 8'b00001001: data_out_0 = 8'b00010001; + 8'b00001010: data_out_0 = 8'b00010010; + 8'b00001011: data_out_0 = 8'b00010010; + 8'b00001100: data_out_0 = 8'b00010010; + 8'b00001101: data_out_0 = 8'b00010010; + 8'b00001110: data_out_0 = 8'b00010010; + 8'b00001111: data_out_0 = 8'b00010011; + 8'b00010000: data_out_0 = 8'b00010011; + 8'b00010001: data_out_0 = 8'b00010011; + 8'b00010010: data_out_0 = 8'b00010011; + 8'b00010011: data_out_0 = 8'b00010100; + 8'b00010100: data_out_0 = 8'b00010100; + 8'b00010101: data_out_0 = 8'b00010100; + 8'b00010110: data_out_0 = 8'b00010100; + 8'b00010111: data_out_0 = 8'b00010101; + 8'b00011000: data_out_0 = 8'b00010101; + 8'b00011001: data_out_0 = 8'b00010101; + 8'b00011010: data_out_0 = 8'b00010101; + 8'b00011011: data_out_0 = 8'b00010110; + 8'b00011100: data_out_0 = 8'b00010110; + 8'b00011101: data_out_0 = 8'b00010110; + 8'b00011110: data_out_0 = 8'b00010110; + 8'b00011111: data_out_0 = 8'b00010111; + 8'b00100000: data_out_0 = 8'b00010111; + 8'b00100001: data_out_0 = 8'b00010111; + 8'b00100010: data_out_0 = 8'b00010111; + 8'b00100011: data_out_0 = 8'b00011000; + 8'b00100100: data_out_0 = 8'b00011000; + 8'b00100101: data_out_0 = 8'b00011000; + 8'b00100110: data_out_0 = 8'b00011001; + 8'b00100111: data_out_0 = 8'b00011001; + 8'b00101000: data_out_0 = 8'b00011001; + 8'b00101001: data_out_0 = 8'b00011010; + 8'b00101010: data_out_0 = 8'b00011010; + 8'b00101011: data_out_0 = 8'b00011010; + 8'b00101100: data_out_0 = 8'b00011010; + 8'b00101101: data_out_0 = 8'b00011011; + 8'b00101110: data_out_0 = 8'b00011011; + 
8'b00101111: data_out_0 = 8'b00011011; + 8'b00110000: data_out_0 = 8'b00011100; + 8'b00110001: data_out_0 = 8'b00011100; + 8'b00110010: data_out_0 = 8'b00011100; + 8'b00110011: data_out_0 = 8'b00011101; + 8'b00110100: data_out_0 = 8'b00011101; + 8'b00110101: data_out_0 = 8'b00011110; + 8'b00110110: data_out_0 = 8'b00011110; + 8'b00110111: data_out_0 = 8'b00011110; + 8'b00111000: data_out_0 = 8'b00011111; + 8'b00111001: data_out_0 = 8'b00011111; + 8'b00111010: data_out_0 = 8'b00011111; + 8'b00111011: data_out_0 = 8'b00100000; + 8'b00111100: data_out_0 = 8'b00100000; + 8'b00111101: data_out_0 = 8'b00100001; + 8'b00111110: data_out_0 = 8'b00100001; + 8'b00111111: data_out_0 = 8'b00100001; + 8'b01000000: data_out_0 = 8'b00100010; + 8'b01000001: data_out_0 = 8'b00100010; + 8'b01000010: data_out_0 = 8'b00100011; + 8'b01000011: data_out_0 = 8'b00100011; + 8'b01000100: data_out_0 = 8'b00100011; + 8'b01000101: data_out_0 = 8'b00100100; + 8'b01000110: data_out_0 = 8'b00100100; + 8'b01000111: data_out_0 = 8'b00100101; + 8'b01001000: data_out_0 = 8'b00100101; + 8'b01001001: data_out_0 = 8'b00100110; + 8'b01001010: data_out_0 = 8'b00100110; + 8'b01001011: data_out_0 = 8'b00100110; + 8'b01001100: data_out_0 = 8'b00100111; + 8'b01001101: data_out_0 = 8'b00100111; + 8'b01001110: data_out_0 = 8'b00101000; + 8'b01001111: data_out_0 = 8'b00101000; + 8'b01010000: data_out_0 = 8'b00101001; + 8'b01010001: data_out_0 = 8'b00101001; + 8'b01010010: data_out_0 = 8'b00101010; + 8'b01010011: data_out_0 = 8'b00101010; + 8'b01010100: data_out_0 = 8'b00101011; + 8'b01010101: data_out_0 = 8'b00101011; + 8'b01010110: data_out_0 = 8'b00101100; + 8'b01010111: data_out_0 = 8'b00101100; + 8'b01011000: data_out_0 = 8'b00101101; + 8'b01011001: data_out_0 = 8'b00101110; + 8'b01011010: data_out_0 = 8'b00101110; + 8'b01011011: data_out_0 = 8'b00101111; + 8'b01011100: data_out_0 = 8'b00101111; + 8'b01011101: data_out_0 = 8'b00110000; + 8'b01011110: data_out_0 = 8'b00110000; + 8'b01011111: data_out_0 = 
8'b00110001; + 8'b01100000: data_out_0 = 8'b00110010; + 8'b01100001: data_out_0 = 8'b00110010; + 8'b01100010: data_out_0 = 8'b00110011; + 8'b01100011: data_out_0 = 8'b00110011; + 8'b01100100: data_out_0 = 8'b00110100; + 8'b01100101: data_out_0 = 8'b00110101; + 8'b01100110: data_out_0 = 8'b00110101; + 8'b01100111: data_out_0 = 8'b00110110; + 8'b01101000: data_out_0 = 8'b00110111; + 8'b01101001: data_out_0 = 8'b00110111; + 8'b01101010: data_out_0 = 8'b00111000; + 8'b01101011: data_out_0 = 8'b00111001; + 8'b01101100: data_out_0 = 8'b00111001; + 8'b01101101: data_out_0 = 8'b00111010; + 8'b01101110: data_out_0 = 8'b00111011; + 8'b01101111: data_out_0 = 8'b00111011; + 8'b01110000: data_out_0 = 8'b00111100; + 8'b01110001: data_out_0 = 8'b00111101; + 8'b01110010: data_out_0 = 8'b00111101; + 8'b01110011: data_out_0 = 8'b00111110; + 8'b01110100: data_out_0 = 8'b00111111; + 8'b01110101: data_out_0 = 8'b01000000; + 8'b01110110: data_out_0 = 8'b01000000; + 8'b01110111: data_out_0 = 8'b01000001; + 8'b01111000: data_out_0 = 8'b01000010; + 8'b01111001: data_out_0 = 8'b01000011; + 8'b01111010: data_out_0 = 8'b01000100; + 8'b01111011: data_out_0 = 8'b01000100; + 8'b01111100: data_out_0 = 8'b01000101; + 8'b01111101: data_out_0 = 8'b01000110; + 8'b01111110: data_out_0 = 8'b01000111; + 8'b01111111: data_out_0 = 8'b01001000; + 8'b10000000: data_out_0 = 8'b00000011; + 8'b10000001: data_out_0 = 8'b00000011; + 8'b10000010: data_out_0 = 8'b00000011; + 8'b10000011: data_out_0 = 8'b00000011; + 8'b10000100: data_out_0 = 8'b00000011; + 8'b10000101: data_out_0 = 8'b00000011; + 8'b10000110: data_out_0 = 8'b00000011; + 8'b10000111: data_out_0 = 8'b00000011; + 8'b10001000: data_out_0 = 8'b00000011; + 8'b10001001: data_out_0 = 8'b00000011; + 8'b10001010: data_out_0 = 8'b00000011; + 8'b10001011: data_out_0 = 8'b00000011; + 8'b10001100: data_out_0 = 8'b00000100; + 8'b10001101: data_out_0 = 8'b00000100; + 8'b10001110: data_out_0 = 8'b00000100; + 8'b10001111: data_out_0 = 8'b00000100; + 8'b10010000: 
data_out_0 = 8'b00000100; + 8'b10010001: data_out_0 = 8'b00000100; + 8'b10010010: data_out_0 = 8'b00000100; + 8'b10010011: data_out_0 = 8'b00000100; + 8'b10010100: data_out_0 = 8'b00000100; + 8'b10010101: data_out_0 = 8'b00000100; + 8'b10010110: data_out_0 = 8'b00000100; + 8'b10010111: data_out_0 = 8'b00000100; + 8'b10011000: data_out_0 = 8'b00000100; + 8'b10011001: data_out_0 = 8'b00000100; + 8'b10011010: data_out_0 = 8'b00000100; + 8'b10011011: data_out_0 = 8'b00000100; + 8'b10011100: data_out_0 = 8'b00000100; + 8'b10011101: data_out_0 = 8'b00000100; + 8'b10011110: data_out_0 = 8'b00000100; + 8'b10011111: data_out_0 = 8'b00000101; + 8'b10100000: data_out_0 = 8'b00000101; + 8'b10100001: data_out_0 = 8'b00000101; + 8'b10100010: data_out_0 = 8'b00000101; + 8'b10100011: data_out_0 = 8'b00000101; + 8'b10100100: data_out_0 = 8'b00000101; + 8'b10100101: data_out_0 = 8'b00000101; + 8'b10100110: data_out_0 = 8'b00000101; + 8'b10100111: data_out_0 = 8'b00000101; + 8'b10101000: data_out_0 = 8'b00000101; + 8'b10101001: data_out_0 = 8'b00000101; + 8'b10101010: data_out_0 = 8'b00000101; + 8'b10101011: data_out_0 = 8'b00000101; + 8'b10101100: data_out_0 = 8'b00000101; + 8'b10101101: data_out_0 = 8'b00000101; + 8'b10101110: data_out_0 = 8'b00000110; + 8'b10101111: data_out_0 = 8'b00000110; + 8'b10110000: data_out_0 = 8'b00000110; + 8'b10110001: data_out_0 = 8'b00000110; + 8'b10110010: data_out_0 = 8'b00000110; + 8'b10110011: data_out_0 = 8'b00000110; + 8'b10110100: data_out_0 = 8'b00000110; + 8'b10110101: data_out_0 = 8'b00000110; + 8'b10110110: data_out_0 = 8'b00000110; + 8'b10110111: data_out_0 = 8'b00000110; + 8'b10111000: data_out_0 = 8'b00000110; + 8'b10111001: data_out_0 = 8'b00000110; + 8'b10111010: data_out_0 = 8'b00000110; + 8'b10111011: data_out_0 = 8'b00000111; + 8'b10111100: data_out_0 = 8'b00000111; + 8'b10111101: data_out_0 = 8'b00000111; + 8'b10111110: data_out_0 = 8'b00000111; + 8'b10111111: data_out_0 = 8'b00000111; + 8'b11000000: data_out_0 = 8'b00000111; + 
8'b11000001: data_out_0 = 8'b00000111; + 8'b11000010: data_out_0 = 8'b00000111; + 8'b11000011: data_out_0 = 8'b00000111; + 8'b11000100: data_out_0 = 8'b00000111; + 8'b11000101: data_out_0 = 8'b00000111; + 8'b11000110: data_out_0 = 8'b00001000; + 8'b11000111: data_out_0 = 8'b00001000; + 8'b11001000: data_out_0 = 8'b00001000; + 8'b11001001: data_out_0 = 8'b00001000; + 8'b11001010: data_out_0 = 8'b00001000; + 8'b11001011: data_out_0 = 8'b00001000; + 8'b11001100: data_out_0 = 8'b00001000; + 8'b11001101: data_out_0 = 8'b00001000; + 8'b11001110: data_out_0 = 8'b00001000; + 8'b11001111: data_out_0 = 8'b00001000; + 8'b11010000: data_out_0 = 8'b00001001; + 8'b11010001: data_out_0 = 8'b00001001; + 8'b11010010: data_out_0 = 8'b00001001; + 8'b11010011: data_out_0 = 8'b00001001; + 8'b11010100: data_out_0 = 8'b00001001; + 8'b11010101: data_out_0 = 8'b00001001; + 8'b11010110: data_out_0 = 8'b00001001; + 8'b11010111: data_out_0 = 8'b00001001; + 8'b11011000: data_out_0 = 8'b00001001; + 8'b11011001: data_out_0 = 8'b00001010; + 8'b11011010: data_out_0 = 8'b00001010; + 8'b11011011: data_out_0 = 8'b00001010; + 8'b11011100: data_out_0 = 8'b00001010; + 8'b11011101: data_out_0 = 8'b00001010; + 8'b11011110: data_out_0 = 8'b00001010; + 8'b11011111: data_out_0 = 8'b00001010; + 8'b11100000: data_out_0 = 8'b00001010; + 8'b11100001: data_out_0 = 8'b00001011; + 8'b11100010: data_out_0 = 8'b00001011; + 8'b11100011: data_out_0 = 8'b00001011; + 8'b11100100: data_out_0 = 8'b00001011; + 8'b11100101: data_out_0 = 8'b00001011; + 8'b11100110: data_out_0 = 8'b00001011; + 8'b11100111: data_out_0 = 8'b00001011; + 8'b11101000: data_out_0 = 8'b00001100; + 8'b11101001: data_out_0 = 8'b00001100; + 8'b11101010: data_out_0 = 8'b00001100; + 8'b11101011: data_out_0 = 8'b00001100; + 8'b11101100: data_out_0 = 8'b00001100; + 8'b11101101: data_out_0 = 8'b00001100; + 8'b11101110: data_out_0 = 8'b00001100; + 8'b11101111: data_out_0 = 8'b00001101; + 8'b11110000: data_out_0 = 8'b00001101; + 8'b11110001: data_out_0 = 
8'b00001101; + 8'b11110010: data_out_0 = 8'b00001101; + 8'b11110011: data_out_0 = 8'b00001101; + 8'b11110100: data_out_0 = 8'b00001101; + 8'b11110101: data_out_0 = 8'b00001110; + 8'b11110110: data_out_0 = 8'b00001110; + 8'b11110111: data_out_0 = 8'b00001110; + 8'b11111000: data_out_0 = 8'b00001110; + 8'b11111001: data_out_0 = 8'b00001110; + 8'b11111010: data_out_0 = 8'b00001110; + 8'b11111011: data_out_0 = 8'b00001111; + 8'b11111100: data_out_0 = 8'b00001111; + 8'b11111101: data_out_0 = 8'b00001111; + 8'b11111110: data_out_0 = 8'b00001111; + 8'b11111111: data_out_0 = 8'b00001111; + default: data_out_0 = 8'b0; + endcase + end +endmodule diff --git a/src/mase_components/generated_lut/rtl/gelu_lut.sv b/src/mase_components/generated_lut/rtl/gelu_lut.sv new file mode 100644 index 000000000..5cbedb1bc --- /dev/null +++ b/src/mase_components/generated_lut/rtl/gelu_lut.sv @@ -0,0 +1,37 @@ + +`timescale 1ns / 1ps +/* verilator lint_off UNUSEDPARAM */ +module gelu_lut #( + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 8, + parameter DATA_OUT_0_PRECISION_0 = 16, + parameter DATA_OUT_0_PRECISION_1 = 8 +) ( + /* verilator lint_off UNUSEDSIGNAL */ + input logic [4:0] data_in_0, + output logic [8:0] data_out_0 +); + + + always_comb begin + case (data_in_0) + 4'b0000: data_out_0 = 8'b00000000; + 4'b0001: data_out_0 = 8'b00001010; + 4'b0010: data_out_0 = 8'b00010110; + 4'b0011: data_out_0 = 8'b00100101; + 4'b0100: data_out_0 = 8'b00110110; + 4'b0101: data_out_0 = 8'b01001000; + 4'b0110: data_out_0 = 8'b01011010; + 4'b0111: data_out_0 = 8'b01101100; + 4'b1000: data_out_0 = 8'b11111101; + 4'b1001: data_out_0 = 8'b11111100; + 4'b1010: data_out_0 = 8'b11111010; + 4'b1011: data_out_0 = 8'b11111000; + 4'b1100: data_out_0 = 8'b11110110; + 4'b1101: data_out_0 = 8'b11110101; + 4'b1110: data_out_0 = 8'b11110110; + 4'b1111: data_out_0 = 8'b11111010; + default: data_out_0 = 8'b0; + endcase + end +endmodule diff --git 
a/src/mase_components/generated_lut/rtl/isqrt_lut.sv b/src/mase_components/generated_lut/rtl/isqrt_lut.sv new file mode 100644 index 000000000..5b90bcca2 --- /dev/null +++ b/src/mase_components/generated_lut/rtl/isqrt_lut.sv @@ -0,0 +1,277 @@ + +`timescale 1ns / 1ps +/* verilator lint_off UNUSEDPARAM */ +module isqrt_lut #( + parameter DATA_IN_0_PRECISION_0 = 9, + parameter DATA_IN_0_PRECISION_1 = 7, + parameter DATA_OUT_0_PRECISION_0 = 8, + parameter DATA_OUT_0_PRECISION_1 = 4 +) ( + /* verilator lint_off UNUSEDSIGNAL */ + input logic [8:0] data_in_0, + output logic [7:0] data_out_0 +); + + + always_comb begin + case (data_in_0) + 9'b000000000: data_out_0 = 8'b01111111; + 9'b000000001: data_out_0 = 8'b01111111; + 9'b000000010: data_out_0 = 8'b01111111; + 9'b000000011: data_out_0 = 8'b01101000; + 9'b000000100: data_out_0 = 8'b01011010; + 9'b000000101: data_out_0 = 8'b01010001; + 9'b000000110: data_out_0 = 8'b01001010; + 9'b000000111: data_out_0 = 8'b01000100; + 9'b000001000: data_out_0 = 8'b01000000; + 9'b000001001: data_out_0 = 8'b00111100; + 9'b000001010: data_out_0 = 8'b00111001; + 9'b000001011: data_out_0 = 8'b00110111; + 9'b000001100: data_out_0 = 8'b00110100; + 9'b000001101: data_out_0 = 8'b00110010; + 9'b000001110: data_out_0 = 8'b00110000; + 9'b000001111: data_out_0 = 8'b00101111; + 9'b000010000: data_out_0 = 8'b00101101; + 9'b000010001: data_out_0 = 8'b00101100; + 9'b000010010: data_out_0 = 8'b00101011; + 9'b000010011: data_out_0 = 8'b00101010; + 9'b000010100: data_out_0 = 8'b00101000; + 9'b000010101: data_out_0 = 8'b00101000; + 9'b000010110: data_out_0 = 8'b00100111; + 9'b000010111: data_out_0 = 8'b00100110; + 9'b000011000: data_out_0 = 8'b00100101; + 9'b000011001: data_out_0 = 8'b00100100; + 9'b000011010: data_out_0 = 8'b00100011; + 9'b000011011: data_out_0 = 8'b00100011; + 9'b000011100: data_out_0 = 8'b00100010; + 9'b000011101: data_out_0 = 8'b00100010; + 9'b000011110: data_out_0 = 8'b00100001; + 9'b000011111: data_out_0 = 8'b00100001; + 9'b000100000: 
data_out_0 = 8'b00100000; + 9'b000100001: data_out_0 = 8'b00100000; + 9'b000100010: data_out_0 = 8'b00011111; + 9'b000100011: data_out_0 = 8'b00011111; + 9'b000100100: data_out_0 = 8'b00011110; + 9'b000100101: data_out_0 = 8'b00011110; + 9'b000100110: data_out_0 = 8'b00011101; + 9'b000100111: data_out_0 = 8'b00011101; + 9'b000101000: data_out_0 = 8'b00011101; + 9'b000101001: data_out_0 = 8'b00011100; + 9'b000101010: data_out_0 = 8'b00011100; + 9'b000101011: data_out_0 = 8'b00011100; + 9'b000101100: data_out_0 = 8'b00011011; + 9'b000101101: data_out_0 = 8'b00011011; + 9'b000101110: data_out_0 = 8'b00011011; + 9'b000101111: data_out_0 = 8'b00011010; + 9'b000110000: data_out_0 = 8'b00011010; + 9'b000110001: data_out_0 = 8'b00011010; + 9'b000110010: data_out_0 = 8'b00011010; + 9'b000110011: data_out_0 = 8'b00011001; + 9'b000110100: data_out_0 = 8'b00011001; + 9'b000110101: data_out_0 = 8'b00011001; + 9'b000110110: data_out_0 = 8'b00011001; + 9'b000110111: data_out_0 = 8'b00011000; + 9'b000111000: data_out_0 = 8'b00011000; + 9'b000111001: data_out_0 = 8'b00011000; + 9'b000111010: data_out_0 = 8'b00011000; + 9'b000111011: data_out_0 = 8'b00011000; + 9'b000111100: data_out_0 = 8'b00010111; + 9'b000111101: data_out_0 = 8'b00010111; + 9'b000111110: data_out_0 = 8'b00010111; + 9'b000111111: data_out_0 = 8'b00010111; + 9'b001000000: data_out_0 = 8'b00010111; + 9'b001000001: data_out_0 = 8'b00010110; + 9'b001000010: data_out_0 = 8'b00010110; + 9'b001000011: data_out_0 = 8'b00010110; + 9'b001000100: data_out_0 = 8'b00010110; + 9'b001000101: data_out_0 = 8'b00010110; + 9'b001000110: data_out_0 = 8'b00010110; + 9'b001000111: data_out_0 = 8'b00010101; + 9'b001001000: data_out_0 = 8'b00010101; + 9'b001001001: data_out_0 = 8'b00010101; + 9'b001001010: data_out_0 = 8'b00010101; + 9'b001001011: data_out_0 = 8'b00010101; + 9'b001001100: data_out_0 = 8'b00010101; + 9'b001001101: data_out_0 = 8'b00010101; + 9'b001001110: data_out_0 = 8'b00010100; + 9'b001001111: data_out_0 = 8'b00010100; 
+ 9'b001010000: data_out_0 = 8'b00010100; + 9'b001010001: data_out_0 = 8'b00010100; + 9'b001010010: data_out_0 = 8'b00010100; + 9'b001010011: data_out_0 = 8'b00010100; + 9'b001010100: data_out_0 = 8'b00010100; + 9'b001010101: data_out_0 = 8'b00010100; + 9'b001010110: data_out_0 = 8'b00010100; + 9'b001010111: data_out_0 = 8'b00010011; + 9'b001011000: data_out_0 = 8'b00010011; + 9'b001011001: data_out_0 = 8'b00010011; + 9'b001011010: data_out_0 = 8'b00010011; + 9'b001011011: data_out_0 = 8'b00010011; + 9'b001011100: data_out_0 = 8'b00010011; + 9'b001011101: data_out_0 = 8'b00010011; + 9'b001011110: data_out_0 = 8'b00010011; + 9'b001011111: data_out_0 = 8'b00010011; + 9'b001100000: data_out_0 = 8'b00010010; + 9'b001100001: data_out_0 = 8'b00010010; + 9'b001100010: data_out_0 = 8'b00010010; + 9'b001100011: data_out_0 = 8'b00010010; + 9'b001100100: data_out_0 = 8'b00010010; + 9'b001100101: data_out_0 = 8'b00010010; + 9'b001100110: data_out_0 = 8'b00010010; + 9'b001100111: data_out_0 = 8'b00010010; + 9'b001101000: data_out_0 = 8'b00010010; + 9'b001101001: data_out_0 = 8'b00010010; + 9'b001101010: data_out_0 = 8'b00010010; + 9'b001101011: data_out_0 = 8'b00010001; + 9'b001101100: data_out_0 = 8'b00010001; + 9'b001101101: data_out_0 = 8'b00010001; + 9'b001101110: data_out_0 = 8'b00010001; + 9'b001101111: data_out_0 = 8'b00010001; + 9'b001110000: data_out_0 = 8'b00010001; + 9'b001110001: data_out_0 = 8'b00010001; + 9'b001110010: data_out_0 = 8'b00010001; + 9'b001110011: data_out_0 = 8'b00010001; + 9'b001110100: data_out_0 = 8'b00010001; + 9'b001110101: data_out_0 = 8'b00010001; + 9'b001110110: data_out_0 = 8'b00010001; + 9'b001110111: data_out_0 = 8'b00010001; + 9'b001111000: data_out_0 = 8'b00010001; + 9'b001111001: data_out_0 = 8'b00010000; + 9'b001111010: data_out_0 = 8'b00010000; + 9'b001111011: data_out_0 = 8'b00010000; + 9'b001111100: data_out_0 = 8'b00010000; + 9'b001111101: data_out_0 = 8'b00010000; + 9'b001111110: data_out_0 = 8'b00010000; + 9'b001111111: 
data_out_0 = 8'b00010000; + 9'b010000000: data_out_0 = 8'b00010000; + 9'b010000001: data_out_0 = 8'b00010000; + 9'b010000010: data_out_0 = 8'b00010000; + 9'b010000011: data_out_0 = 8'b00010000; + 9'b010000100: data_out_0 = 8'b00010000; + 9'b010000101: data_out_0 = 8'b00010000; + 9'b010000110: data_out_0 = 8'b00010000; + 9'b010000111: data_out_0 = 8'b00010000; + 9'b010001000: data_out_0 = 8'b00010000; + 9'b010001001: data_out_0 = 8'b00001111; + 9'b010001010: data_out_0 = 8'b00001111; + 9'b010001011: data_out_0 = 8'b00001111; + 9'b010001100: data_out_0 = 8'b00001111; + 9'b010001101: data_out_0 = 8'b00001111; + 9'b010001110: data_out_0 = 8'b00001111; + 9'b010001111: data_out_0 = 8'b00001111; + 9'b010010000: data_out_0 = 8'b00001111; + 9'b010010001: data_out_0 = 8'b00001111; + 9'b010010010: data_out_0 = 8'b00001111; + 9'b010010011: data_out_0 = 8'b00001111; + 9'b010010100: data_out_0 = 8'b00001111; + 9'b010010101: data_out_0 = 8'b00001111; + 9'b010010110: data_out_0 = 8'b00001111; + 9'b010010111: data_out_0 = 8'b00001111; + 9'b010011000: data_out_0 = 8'b00001111; + 9'b010011001: data_out_0 = 8'b00001111; + 9'b010011010: data_out_0 = 8'b00001111; + 9'b010011011: data_out_0 = 8'b00001111; + 9'b010011100: data_out_0 = 8'b00001110; + 9'b010011101: data_out_0 = 8'b00001110; + 9'b010011110: data_out_0 = 8'b00001110; + 9'b010011111: data_out_0 = 8'b00001110; + 9'b010100000: data_out_0 = 8'b00001110; + 9'b010100001: data_out_0 = 8'b00001110; + 9'b010100010: data_out_0 = 8'b00001110; + 9'b010100011: data_out_0 = 8'b00001110; + 9'b010100100: data_out_0 = 8'b00001110; + 9'b010100101: data_out_0 = 8'b00001110; + 9'b010100110: data_out_0 = 8'b00001110; + 9'b010100111: data_out_0 = 8'b00001110; + 9'b010101000: data_out_0 = 8'b00001110; + 9'b010101001: data_out_0 = 8'b00001110; + 9'b010101010: data_out_0 = 8'b00001110; + 9'b010101011: data_out_0 = 8'b00001110; + 9'b010101100: data_out_0 = 8'b00001110; + 9'b010101101: data_out_0 = 8'b00001110; + 9'b010101110: data_out_0 = 8'b00001110; 
+ 9'b010101111: data_out_0 = 8'b00001110; + 9'b010110000: data_out_0 = 8'b00001110; + 9'b010110001: data_out_0 = 8'b00001110; + 9'b010110010: data_out_0 = 8'b00001110; + 9'b010110011: data_out_0 = 8'b00001110; + 9'b010110100: data_out_0 = 8'b00001101; + 9'b010110101: data_out_0 = 8'b00001101; + 9'b010110110: data_out_0 = 8'b00001101; + 9'b010110111: data_out_0 = 8'b00001101; + 9'b010111000: data_out_0 = 8'b00001101; + 9'b010111001: data_out_0 = 8'b00001101; + 9'b010111010: data_out_0 = 8'b00001101; + 9'b010111011: data_out_0 = 8'b00001101; + 9'b010111100: data_out_0 = 8'b00001101; + 9'b010111101: data_out_0 = 8'b00001101; + 9'b010111110: data_out_0 = 8'b00001101; + 9'b010111111: data_out_0 = 8'b00001101; + 9'b011000000: data_out_0 = 8'b00001101; + 9'b011000001: data_out_0 = 8'b00001101; + 9'b011000010: data_out_0 = 8'b00001101; + 9'b011000011: data_out_0 = 8'b00001101; + 9'b011000100: data_out_0 = 8'b00001101; + 9'b011000101: data_out_0 = 8'b00001101; + 9'b011000110: data_out_0 = 8'b00001101; + 9'b011000111: data_out_0 = 8'b00001101; + 9'b011001000: data_out_0 = 8'b00001101; + 9'b011001001: data_out_0 = 8'b00001101; + 9'b011001010: data_out_0 = 8'b00001101; + 9'b011001011: data_out_0 = 8'b00001101; + 9'b011001100: data_out_0 = 8'b00001101; + 9'b011001101: data_out_0 = 8'b00001101; + 9'b011001110: data_out_0 = 8'b00001101; + 9'b011001111: data_out_0 = 8'b00001101; + 9'b011010000: data_out_0 = 8'b00001101; + 9'b011010001: data_out_0 = 8'b00001101; + 9'b011010010: data_out_0 = 8'b00001100; + 9'b011010011: data_out_0 = 8'b00001100; + 9'b011010100: data_out_0 = 8'b00001100; + 9'b011010101: data_out_0 = 8'b00001100; + 9'b011010110: data_out_0 = 8'b00001100; + 9'b011010111: data_out_0 = 8'b00001100; + 9'b011011000: data_out_0 = 8'b00001100; + 9'b011011001: data_out_0 = 8'b00001100; + 9'b011011010: data_out_0 = 8'b00001100; + 9'b011011011: data_out_0 = 8'b00001100; + 9'b011011100: data_out_0 = 8'b00001100; + 9'b011011101: data_out_0 = 8'b00001100; + 9'b011011110: 
data_out_0 = 8'b00001100; + 9'b011011111: data_out_0 = 8'b00001100; + 9'b011100000: data_out_0 = 8'b00001100; + 9'b011100001: data_out_0 = 8'b00001100; + 9'b011100010: data_out_0 = 8'b00001100; + 9'b011100011: data_out_0 = 8'b00001100; + 9'b011100100: data_out_0 = 8'b00001100; + 9'b011100101: data_out_0 = 8'b00001100; + 9'b011100110: data_out_0 = 8'b00001100; + 9'b011100111: data_out_0 = 8'b00001100; + 9'b011101000: data_out_0 = 8'b00001100; + 9'b011101001: data_out_0 = 8'b00001100; + 9'b011101010: data_out_0 = 8'b00001100; + 9'b011101011: data_out_0 = 8'b00001100; + 9'b011101100: data_out_0 = 8'b00001100; + 9'b011101101: data_out_0 = 8'b00001100; + 9'b011101110: data_out_0 = 8'b00001100; + 9'b011101111: data_out_0 = 8'b00001100; + 9'b011110000: data_out_0 = 8'b00001100; + 9'b011110001: data_out_0 = 8'b00001100; + 9'b011110010: data_out_0 = 8'b00001100; + 9'b011110011: data_out_0 = 8'b00001100; + 9'b011110100: data_out_0 = 8'b00001100; + 9'b011110101: data_out_0 = 8'b00001100; + 9'b011110110: data_out_0 = 8'b00001100; + 9'b011110111: data_out_0 = 8'b00001100; + 9'b011111000: data_out_0 = 8'b00001011; + 9'b011111001: data_out_0 = 8'b00001011; + 9'b011111010: data_out_0 = 8'b00001011; + 9'b011111011: data_out_0 = 8'b00001011; + 9'b011111100: data_out_0 = 8'b00001011; + 9'b011111101: data_out_0 = 8'b00001011; + 9'b011111110: data_out_0 = 8'b00001011; + 9'b011111111: data_out_0 = 8'b00001011; + default: data_out_0 = 8'b0; + endcase + end +endmodule diff --git a/src/mase_components/helper/generate_memory.py b/src/mase_components/helper/generate_memory.py index e46c99604..fdb2c1a6b 100644 --- a/src/mase_components/helper/generate_memory.py +++ b/src/mase_components/helper/generate_memory.py @@ -11,8 +11,14 @@ from pathlib import Path -def make_quantizer(data_width: int, f_width: int): - return partial(integer_quantizer, width=data_width, frac_width=f_width) +def make_quantizer(data_width: int, f_width: int, floor): + base_quantizer = integer_floor_quantizer if floor else 
integer_quantizer + return partial(base_quantizer, width=data_width, frac_width=f_width) + + +def isqrt(x): + x = (x + 1e-5).sqrt().reciprocal() + return x FUNCTION_TABLE = { @@ -23,7 +29,9 @@ def make_quantizer(data_width: int, f_width: int): "softshrink": nn.Softshrink(), "gelu": nn.GELU(), "exp": torch.exp, + "power2": lambda x: torch.pow(2, x), "softmax": torch.exp, + "isqrt": isqrt, } @@ -41,6 +49,9 @@ def doubletofx(data_width: int, f_width: int, num: float, type="hex"): intbits = BitArray(int=intnum, length=data_width) return str(intbits.bin) if type == "bin" else str(intbits) +def inttobit(data_width:int, num: float, signed: bool = True): + intbits = BitArray(int=num, length=data_width) if signed else BitArray(uint=num, length=data_width) + return intbits def generate_lookup(data_width: int, f_width: int, function: str, type="hex"): f = FUNCTION_TABLE[function] @@ -70,7 +81,14 @@ def generate_lookup(data_width: int, f_width: int, function: str, type="hex"): def aligned_generate_lookup( - in_data_width, in_f_width, data_width: int, f_width: int, function: str, type="hex" + in_data_width, + in_f_width, + data_width: int, + f_width: int, + function: str, + type="hex", + constant_mult=1, + floor=False, ): f = FUNCTION_TABLE[function] lut = { @@ -83,15 +101,15 @@ def aligned_generate_lookup( # entries = 2 ** data_width minval = float(-(2 ** (in_data_width - in_f_width - 1))) maxval = (2 ** (in_data_width - 1) - 1) * 2 ** (-in_f_width) - inp_quanter = make_quantizer(in_data_width, in_f_width) - quanter = make_quantizer(data_width, f_width) + inp_quanter = make_quantizer(in_data_width, in_f_width, floor) + quanter = make_quantizer(data_width, f_width, floor) count = 0 iarr = [] pi = float(0) while pi <= maxval: count += 1 iarr.append(pi) - val = quanter(f(torch.tensor(pi))) # entry in the lookup table + val = quanter(f(torch.tensor(pi * constant_mult))) # entry in the lookup table lut[ doubletofx(data_width=in_data_width, f_width=in_f_width, num=pi, type=type) ] = 
doubletofx( @@ -99,17 +117,22 @@ def aligned_generate_lookup( ) pi += 2 ** -(in_f_width) - i = minval - while i <= -1 * 2 ** -(in_f_width): - count += 1 - iarr.append(i) - val = quanter(f(torch.tensor(i))) # entry in the lookup table - lut[ - doubletofx(data_width=in_data_width, f_width=in_f_width, num=i, type=type) - ] = doubletofx( - data_width=data_width, f_width=f_width, num=val.item(), type=type - ) - i += 2 ** -(in_f_width) + if function not in ["isqrt"]: + i = minval + while i <= -1 * 2 ** -(in_f_width): + count += 1 + iarr.append(i) + val = quanter( + f(torch.tensor(i * constant_mult)) + ) # entry in the lookup table + lut[ + doubletofx( + data_width=in_data_width, f_width=in_f_width, num=i, type=type + ) + ] = doubletofx( + data_width=data_width, f_width=f_width, num=val.item(), type=type + ) + i += 2 ** -(in_f_width) iarr = [(x * 2 ** (in_f_width)) for x in iarr] # print(iarr) @@ -211,6 +234,8 @@ def lookup_to_sv_file( function: str, file_path=None, path_with_dtype=False, + constant_mult=1, + floor=False, ): dicto = aligned_generate_lookup( in_data_width=in_data_width, @@ -219,6 +244,8 @@ def lookup_to_sv_file( f_width=f_width, function=function, type="bin", + constant_mult=constant_mult, + floor=floor, ) dicto = { k: v @@ -237,31 +264,32 @@ def lookup_to_sv_file( `timescale 1ns / 1ps /* verilator lint_off UNUSEDPARAM */ module {function}_lut{end} #( - parameter DATA_IN_0_PRECISION_0 = 16, - parameter DATA_IN_0_PRECISION_1 = 8, - parameter DATA_OUT_0_PRECISION_0 = 16, - parameter DATA_OUT_0_PRECISION_1 = 8 -) -( + parameter DATA_IN_0_PRECISION_0 = {in_data_width}, + parameter DATA_IN_0_PRECISION_1 = {in_f_width}, + parameter DATA_OUT_0_PRECISION_0 = {data_width}, + parameter DATA_OUT_0_PRECISION_1 = {f_width} +) ( /* verilator lint_off UNUSEDSIGNAL */ - input logic [{in_data_width-1}:0] data_in_0, - output logic [{data_width-1}:0] data_out_0 + input logic [{in_data_width - 1}:0] data_in_0, + output logic [{data_width - 1}:0] data_out_0 ); - + +""" + 
sv_code += """ + always_comb begin + case (data_in_0) """ - sv_code += " always_comb begin\n" - sv_code += " case(data_in_0)\n" # Adding each case for key, value in dicto.items(): formatted_key = key_format.format(key) formatted_value = value_format.format(value) - sv_code += f" {formatted_key}: data_out_0 = {formatted_value};\n" + sv_code += f" {formatted_key}: data_out_0 = {formatted_value};\n" # Ending the case statement and module - sv_code += f" default: data_out_0 = {data_width}'b0;\n" - sv_code += " endcase\n" - sv_code += " end\n" + sv_code += f" default: data_out_0 = {data_width}'b0;\n" + sv_code += " endcase\n" + sv_code += " end\n" sv_code += "endmodule\n" # Write the code to a SystemVerilog file @@ -271,14 +299,96 @@ def lookup_to_sv_file( print(f"SystemVerilog module generated and saved as {file_path}.") +def inttobit(data_width:int, num: float, signed: bool = True): + intbits = BitArray(int=num, length=data_width) if signed else BitArray(uint=num, length=data_width) + return intbits +class GenerateSVLut: + def __init__(self, function_name, parameter, path): + assert ( + function_name in FUNCTION_TABLE + ), f"Function {function_name} not found in FUNCTION_TABLE" + self.f = FUNCTION_TABLE[function_name] + self.parameter = parameter + self.path = path + def quant_profile(self, bin_in): + bin_out = bin_in + return bin_out + + def generate_lut_address(self): + return NotImplementedError + + def generate_lut(self, lut_address: list): + lut = {} + for i in lut_address: + bin_out = self.quant_profile(i) + lut[i] = bin_out + return lut + + def generate_sv(self,lut): + self.generate_lut() + return NotImplementedError + + def pipeline(self): + lut_address = self.generate_lut_address(self) + lut = self.generate_lut(lut_address) + sv = self.generate_sv(lut) + +from mase_components.linear_layers.mxint_operators.test.utils import mxint_quant_block +class GenerateMxIntSVLut(GenerateSVLut): + def quant_profile(self, bin_in): + in_man_width, in_exp_width, 
out_man_width, out_exp_width = self.parameter["in_man_width"], self.parameter["in_exp_width"], self.parameter["out_man_width"], self.parameter["out_exp_width"] + _bin = BitArray(bin=bin_in) + exp_int = _bin[0:in_exp_width].int + man_int = _bin[in_exp_width:in_man_width + in_exp_width].int + value = man_int / 2**(in_man_width - 1) * 2**(exp_int) + exp_value = self.f(torch.tensor(value)) + quant_value, mx, ex = mxint_quantize(exp_value,out_man_width,out_exp_width) + exp_bit = inttobit(out_exp_width, num=ex).bin + man_bit = inttobit(out_man_width, num=mx).bin + bin_out = exp_bit + man_bit + return bin_out + def generate_lut_address(self): + in_man_width, in_exp_width, out_man_width, out_exp_width = self.parameter["in_man_width"], self.parameter["in_exp_width"], self.parameter["out_man_width"], self.parameter["out_exp_width"] + # we can determine the upperbound of exp + from math import log + upperbound_of_mx_output = (2**(out_man_width - 1) - 1) / 2**(out_man_width - 1) * 2**(2**(out_exp_width - 1) - 1) + lowerbound_of_mx_output = (1) / 2**(out_man_width - 1) * 2**(-2**(out_exp_width - 1)) + positive_max_bound = log(upperbound_of_mx_output) + negetive_max_bound = log(lowerbound_of_mx_output) + # when input> max_bound or input < lower_boud, we actually dont need to represent them + max_exp = torch.tensor(max(abs(positive_max_bound), abs(negetive_max_bound))) + _, _, max_exp = mxint_quantize(max_exp) + + # actually, we also don't have that much precision to represent the data around 1(exp(0)) + # so the limitation at data around 0 can determine the minimum value of exp. 
+ # so we got two value in the left side or in the right side + _left = (2**(out_man_width - 1) - 1) / 2**(out_man_width - 1) + _right = (1*2**(out_man_width - 2) + 1) / 2**(out_man_width - 1) * 2**(1) + # we need to find a way to rounding them, divide the gap by two, so when it's smaller than this value, we can actually think, it's 0 + _left = 1 - (1 - _left)/2 + _right = 1 + (_right - 1)/2 + positive_min_bound = log(_left) + negetive_min_bound = log(_right) + min_exp = torch.tensor(min(abs(positive_min_bound), abs(negetive_min_bound))) + _, _, min_exp = mxint_quantize(min_exp) + address = [] + for i in range(int(min_exp), int(max_exp+in_man_width)): + for j in range(2**in_man_width): + exp_bin = inttobit(in_exp_width,i).bin + man_bin = inttobit(in_man_width,j, signed=False).bin + address += [str(exp_bin) + str(man_bin)] + return address + def generate_sv_lut( function_name, in_data_width, in_f_width, data_width, f_width, - path=None, + path=None, # maybe not accept path as a parameter due to redundantly-generated exp_lut path_with_dtype=False, + constant_mult=1, + floor=False, ): assert ( function_name in FUNCTION_TABLE @@ -289,27 +399,18 @@ def generate_sv_lut( else: end = "" - if path is None: - p = Path(__file__).parents[1] / "rtl" - lookup_to_sv_file( - in_data_width, - in_f_width, - data_width, - f_width, - function_name, - str(p / f"{function_name}_lut{end}.sv"), - path_with_dtype=path_with_dtype, - ) - else: - lookup_to_sv_file( - in_data_width, - in_f_width, - data_width, - f_width, - function_name, - f"{path}/{function_name}_lut{end}.sv", - path_with_dtype=path_with_dtype, - ) + p = Path(__file__).parents[1] / "generated_lut" / "rtl" + lookup_to_sv_file( + in_data_width, + in_f_width, + data_width, + f_width, + function_name, + str(p / f"{function_name}_lut{end}.sv"), + path_with_dtype=path_with_dtype, + constant_mult=constant_mult, + floor=floor, + ) if __name__ == "__main__": diff --git a/src/mase_components/hls/scalar_ops/int_div/README.md 
b/src/mase_components/hls/scalar_ops/int_div/README.md new file mode 100644 index 000000000..c829d9a3b --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/README.md @@ -0,0 +1,7 @@ +# Scalar integer/fixed-point divider with handshake interface + +To generate the verilog, run: + +```sh +vitis_hls vhls.tcl +``` diff --git a/src/mase_components/hls/scalar_ops/int_div/div.cpp b/src/mase_components/hls/scalar_ops/int_div/div.cpp new file mode 100644 index 000000000..6966d1ecf --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/div.cpp @@ -0,0 +1,23 @@ +#include "ap_int.h" +#include "hls_stream.h" + +#define total_width_0 32 + +#define total_width_1 32 + +#define total_width_2 16 + +void div(hls::stream> &data_in_0, + hls::stream> &data_in_1, + hls::stream> &data_out_0) { +#pragma HLS PIPELINE II = 1 + if (data_in_0.empty() || data_in_1.empty()) + return; + ap_int in0; + ap_int in1; + data_in_0.read_nb(in0); + data_in_1.read_nb(in1); + ap_int res = in0 / in1; + // TODO: #pragma HLS bind_op variable=res op= impl=fabric + data_out_0.write_nb(res); +} diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/hls.app b/src/mase_components/hls/scalar_ops/int_div/prj/hls.app new file mode 100644 index 000000000..f0f80b456 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/hls.app @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/.autopilot_exit b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/.autopilot_exit new file mode 100644 index 000000000..cc26a0a77 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/.autopilot_exit @@ -0,0 +1,2 @@ +22:51:45 +08/04/2024 diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/.message_syn.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/.message_syn.xml new file mode 100644 index 000000000..3325f2f29 --- /dev/null +++ 
b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/.message_syn.xml @@ -0,0 +1,61 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g new file mode 100644 index 000000000..fed4a96e7 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.0.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.0.bc new file mode 100644 index 000000000..fed4a96e7 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.0.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.1.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.1.bc new file mode 100644 index 000000000..a9959c961 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.1.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.2.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.2.bc new file mode 100644 index 000000000..4364432ab Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.2.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.2.prechk.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.2.prechk.bc new file mode 100644 index 000000000..6d478880a Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.2.prechk.bc differ diff --git 
a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.bc new file mode 100644 index 000000000..fed4a96e7 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.0.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.0.bc new file mode 100644 index 000000000..b9c400d9f Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.0.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.0.bc.clang.reflow.diag.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.0.bc.clang.reflow.diag.xml new file mode 100644 index 000000000..e69de29bb diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.0.bc.clang.reflow.diag.yml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.0.bc.clang.reflow.diag.yml new file mode 100644 index 000000000..e69de29bb diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.1.lower.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.1.lower.bc new file mode 100644 index 000000000..abecf12c8 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.1.lower.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.1.lower.bc.opt.diag.yml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.1.lower.bc.opt.diag.yml new file mode 100644 index 000000000..e69de29bb diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.2.m1.bc 
b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.2.m1.bc new file mode 100644 index 000000000..bfad45749 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.2.m1.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.3.fpc.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.3.fpc.bc new file mode 100644 index 000000000..db9054f0c Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.3.fpc.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.3.fpc.bc.opt.diag.yml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.3.fpc.bc.opt.diag.yml new file mode 100644 index 000000000..e69de29bb diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.4.m2.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.4.m2.bc new file mode 100644 index 000000000..28534701f Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.4.m2.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.5.gdce.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.5.gdce.bc new file mode 100644 index 000000000..68c051076 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.5.gdce.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.5.gdce.bc.opt.diag.yml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.5.gdce.bc.opt.diag.yml new file mode 100644 index 000000000..e69de29bb diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.lto.bc 
b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.lto.bc new file mode 100644 index 000000000..fed4a96e7 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.lto.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.1.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.1.bc new file mode 100644 index 000000000..a9959c961 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.1.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.1.tmp.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.1.tmp.bc new file mode 100644 index 000000000..6ef86274a Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.1.tmp.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.2.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.2.bc new file mode 100644 index 000000000..e0b9d8e14 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.2.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.3.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.3.bc new file mode 100644 index 000000000..1163aec65 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.o.3.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.pp.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.pp.bc new file mode 100644 index 000000000..fed4a96e7 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.pp.bc differ diff --git 
a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/all.directive.json b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/all.directive.json new file mode 100644 index 000000000..8d364558d --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/all.directive.json @@ -0,0 +1,23 @@ +[ + { + "functionLabel": "", + "functionName": "div", + "id": 0, + "ifcond": "", + "insert_position": "", + "label": "", + "pragma": { + "name": "TOP", + "option": [ + { + "name": "name", + "value": "div" + } + ] + }, + "slx": false, + "sourceFile": "/workspace/src/mase_components/hls/scalar_ops/int_div/vhls.tcl", + "sourceLine": 8, + "success": true + } +] \ No newline at end of file diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div.cpp b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div.cpp new file mode 100644 index 000000000..8abf6c7ce --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div.cpp @@ -0,0 +1,1218 @@ +#include "hls_signal_handler.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ap_fixed.h" +#include "ap_int.h" +#include "autopilot_cbe.h" +#include "hls_half.h" +#include "hls_stream.h" + +using namespace std; + +// wrapc file define: +#define AUTOTB_TVIN_data_in_0 "../tv/cdatafile/c.div.autotvin_data_in_0.dat" +#define WRAPC_STREAM_SIZE_IN_data_in_0 "../tv/stream_size/stream_size_in_data_in_0.dat" +#define WRAPC_STREAM_INGRESS_STATUS_data_in_0 "../tv/stream_size/stream_ingress_status_data_in_0.dat" +#define AUTOTB_TVIN_data_in_1 "../tv/cdatafile/c.div.autotvin_data_in_1.dat" +#define WRAPC_STREAM_SIZE_IN_data_in_1 "../tv/stream_size/stream_size_in_data_in_1.dat" +#define WRAPC_STREAM_INGRESS_STATUS_data_in_1 "../tv/stream_size/stream_ingress_status_data_in_1.dat" +#define AUTOTB_TVOUT_data_out_0 
"../tv/cdatafile/c.div.autotvout_data_out_0.dat" +#define WRAPC_STREAM_SIZE_OUT_data_out_0 "../tv/stream_size/stream_size_out_data_out_0.dat" +#define WRAPC_STREAM_EGRESS_STATUS_data_out_0 "../tv/stream_size/stream_egress_status_data_out_0.dat" + + +// tvout file define: +#define AUTOTB_TVOUT_PC_data_out_0 "../tv/rtldatafile/rtl.div.autotvout_data_out_0.dat" + + +namespace hls::sim +{ + template + struct Byte { + unsigned char a[n]; + + Byte() + { + for (size_t i = 0; i < n; ++i) { + a[i] = 0; + } + } + + template + Byte& operator= (const T &val) + { + std::memcpy(a, &val, n); + return *this; + } + }; + + struct SimException : public std::exception { + const std::string msg; + const size_t line; + SimException(const std::string &msg, const size_t line) + : msg(msg), line(line) + { + } + }; + + void errExit(const size_t line, const std::string &msg) + { + std::string s; + s += "ERROR"; +// s += '('; +// s += __FILE__; +// s += ":"; +// s += std::to_string(line); +// s += ')'; + s += ": "; + s += msg; + s += "\n"; + fputs(s.c_str(), stderr); + exit(1); + } +} + + +namespace hls::sim +{ + template + void move(void* to, void* from) + { + auto t = (hls::stream>*)to; + auto f = (hls::stream>*)from; + while (!f->empty()) { + t->write(f->read()); + } + } + + template + void task_move(void* to, void* from) + { + auto t = (hls::stream>*)to; + auto f = (hls::stream>*)from; + std::thread( + [=] () { while (true) { t->write(f->read()); } } + ).detach(); + } + + template + struct MoveAXIS + { + struct ST { A data; K keep; S strb; U user; L last; I id; E dest; }; + + static void toSC(void* data, void* keep, void* strb, void* user, void* last, void* id, void* dest, void* axis) + { + ST st; + ((hls::stream*)axis)->read(st); + ((hls::stream*)data)->write(st.data); + ((hls::stream*)keep)->write(st.keep); + ((hls::stream*)strb)->write(st.strb); + ((hls::stream*)user)->write(st.user); + ((hls::stream*)last)->write(st.last); + ((hls::stream*)id)->write(st.id); + 
((hls::stream*)dest)->write(st.dest); + } + + static void fromSC(void* data, void* keep, void* strb, void* user, void* last, void* id, void* dest, void* axis) + { + ST st; + ((hls::stream*)data)->read(st.data); + ((hls::stream*)keep)->read(st.keep); + ((hls::stream*)strb)->read(st.strb); + ((hls::stream*)user)->read(st.user); + ((hls::stream*)last)->read(st.last); + ((hls::stream*)id)->read(st.id); + ((hls::stream*)dest)->read(st.dest); + ((hls::stream*)axis)->write(st); + } + }; + + template + void move_to_SC(void* data, void* keep, void* strb, void* user, void* last, + void* id, void* dest, void* axis) + { + typedef MoveAXIS, ap_uint, ap_uint, + ap_uint, ap_uint, ap_uint, + ap_uint> M; + while (!((hls::stream*)axis)->empty()) { + M::toSC(data, keep, strb, user, last, id, dest, axis); + } + } + + template + void task_move_to_SC(void* data, void* keep, void* strb, void* user, void* last, + void* id, void* dest, void* axis) + { + typedef MoveAXIS, ap_uint, ap_uint, + ap_uint, ap_uint, ap_uint, + ap_uint> M; + std::thread( + [=] () { while (true) M::toSC(data, keep, strb, user, last, id, dest, axis); } + ).detach(); + } + + template + void move_from_SC(void* axis, void* data, void* keep, void* strb, void* user, void* last, + void* id, void* dest) + { + typedef MoveAXIS, ap_uint, ap_uint, + ap_uint, ap_uint, ap_uint, + ap_uint> M; + while (!((hls::stream>*)data)->empty()) { + M::fromSC(data, keep, strb, user, last, id, dest, axis); + } + } + + template + void task_move_from_SC(void* axis, void* data, void* keep, void* strb, void* user, void* last, + void* id, void* dest) + { + typedef MoveAXIS, ap_uint, ap_uint, + ap_uint, ap_uint, ap_uint, + ap_uint> M; + std::thread( + [=] () { while (true) M::fromSC(data, keep, strb, user, last, id, dest, axis); } + ).detach(); + } +} + +namespace hls::sim +{ + size_t divide_ceil(size_t a, size_t b) + { + return (a + b - 1) / b; + } + + const bool little_endian() + { + int a = 1; + return *(char*)&a == 1; + } + + inline void 
rev_endian(unsigned char *p, size_t nbytes) + { + std::reverse(p, p+nbytes); + } + + const bool LE = little_endian(); + + inline size_t least_nbyte(size_t width) + { + return (width+7)>>3; + } + + std::string formatData(unsigned char *pos, size_t wbits) + { + size_t wbytes = least_nbyte(wbits); + size_t i = LE ? wbytes-1 : 0; + auto next = [&] () { + auto c = pos[i]; + LE ? --i : ++i; + return c; + }; + std::ostringstream ss; + ss << "0x"; + if (int t = (wbits & 0x7)) { + if (t <= 4) { + unsigned char mask = (1<= 'a' && c <= 'f') { + return c-'a'+10; + } else if (c >= 'A' && c <= 'F') { + return c-'A'+10; + } else if (c >= '0' && c <= '9') { + return c-'0'; + } else { + throw SimException("Not Hexdecimal Digit", __LINE__); + } + } + + void unformatData(const char *data, unsigned char *put, size_t pbytes = 0) + { + size_t nchars = strlen(data+2); + size_t nbytes = (nchars+1)>>1; + if (pbytes == 0) { + pbytes = nbytes; + } else if (pbytes > nbytes) { + throw SimException("Wrong size specified", __LINE__); + } + put = LE ? put : put+pbytes-1; + auto nextp = [&] () { + return LE ? put++ : put--; + }; + const char *c = data + (nchars + 2) - 1; + auto next = [&] () { + char res { *c == 'x' ? 
(char)0 : ord(*c) }; + --c; + return res; + }; + for (size_t i = 0; i < pbytes; ++i) { + char l = next(); + char h = next(); + *nextp() = (h<<4)+l; + } + } + + char* strip(char *s) + { + while (isspace(*s)) { + ++s; + } + for (char *p = s+strlen(s)-1; p >= s; --p) { + if (isspace(*p)) { + *p = 0; + } else { + return s; + } + } + return s; + } + + size_t sum(const std::vector &v) + { + size_t res = 0; + for (const auto &e : v) { + res += e; + } + return res; + } + + const char* bad = "Bad TV file"; + const char* err = "Error on TV file"; + + const unsigned char bmark[] = { + 0x5a, 0x5a, 0xa5, 0xa5, 0x0f, 0x0f, 0xf0, 0xf0 + }; + +#ifdef USE_BINARY_TV_FILE + class Input { + FILE *fp; + long pos; + + void read(unsigned char *buf, size_t size) + { + if (fread(buf, size, 1, fp) != 1) { + throw SimException(bad, __LINE__); + } + if (LE) { + rev_endian(buf, size); + } + } + + public: + void advance(size_t nbytes) + { + if (fseek(fp, nbytes, SEEK_CUR) == -1) { + throw SimException(bad, __LINE__); + } + } + + Input(const char *path) : fp(nullptr) + { + fp = fopen(path, "rb"); + if (fp == nullptr) { + errExit(__LINE__, err); + } + } + + void begin() + { + advance(8); + pos = ftell(fp); + } + + void reset() + { + fseek(fp, pos, SEEK_SET); + } + + void into(unsigned char *param, size_t wbytes, size_t asize, size_t nbytes) + { + size_t n = nbytes / asize; + size_t r = nbytes % asize; + for (size_t i = 0; i < n; ++i) { + read(param, wbytes); + param += asize; + } + if (r > 0) { + advance(asize-r); + read(param, r); + } + } + + ~Input() + { + unsigned char buf[8]; + size_t res = fread(buf, 8, 1, fp); + fclose(fp); + if (res != 1) { + errExit(__LINE__, bad); + } + if (std::memcmp(buf, bmark, 8) != 0) { + errExit(__LINE__, bad); + } + } + }; + + class Output { + FILE *fp; + + void write(unsigned char *buf, size_t size) + { + if (LE) { + rev_endian(buf, size); + } + if (fwrite(buf, size, 1, fp) != 1) { + throw SimException(err, __LINE__); + } + if (LE) { + rev_endian(buf, size); + } 
+ } + + public: + Output(const char *path) : fp(nullptr) + { + fp = fopen(path, "wb"); + if (fp == nullptr) { + errExit(__LINE__, err); + } + } + + void begin(size_t total) + { + unsigned char buf[8] = {0}; + std::memcpy(buf, &total, sizeof(buf)); + write(buf, sizeof(buf)); + } + + void from(unsigned char *param, size_t wbytes, size_t asize, size_t nbytes, size_t skip) + { + param -= asize*skip; + size_t n = divide_ceil(nbytes, asize); + for (size_t i = 0; i < n; ++i) { + write(param, wbytes); + param += asize; + } + } + + ~Output() + { + size_t res = fwrite(bmark, 8, 1, fp); + fclose(fp); + if (res != 1) { + errExit(__LINE__, err); + } + } + }; +#endif + + class Reader { + FILE *fp; + long pos; + int size; + char *s; + + void readline() + { + s = fgets(s, size, fp); + if (s == nullptr) { + throw SimException(bad, __LINE__); + } + } + + public: + Reader(const char *path) : fp(nullptr), size(1<<12), s(new char[size]) + { + try { + fp = fopen(path, "r"); + if (fp == nullptr) { + throw SimException(err, __LINE__); + } else { + readline(); + static const char mark[] = "[[[runtime]]]\n"; + if (strcmp(s, mark) != 0) { + throw SimException(bad, __LINE__); + } + } + } catch (const hls::sim::SimException &e) { + errExit(e.line, e.msg); + } + } + + ~Reader() + { + fclose(fp); + delete[] s; + } + + void begin() + { + readline(); + static const char mark[] = "[[transaction]]"; + if (strncmp(s, mark, strlen(mark)) != 0) { + throw SimException(bad, __LINE__); + } + pos = ftell(fp); + } + + void reset() + { + fseek(fp, pos, SEEK_SET); + } + + void skip(size_t n) + { + for (size_t i = 0; i < n; ++i) { + readline(); + } + } + + char* next() + { + long pos = ftell(fp); + readline(); + if (*s == '[') { + fseek(fp, pos, SEEK_SET); + return nullptr; + } + return strip(s); + } + + void end() + { + do { + readline(); + } while (strcmp(s, "[[/transaction]]\n") != 0); + } + }; + + class Writer { + FILE *fp; + + void write(const char *s) + { + if (fputs(s, fp) == EOF) { + throw 
SimException(err, __LINE__); + } + } + + public: + Writer(const char *path) : fp(nullptr) + { + try { + fp = fopen(path, "w"); + if (fp == nullptr) { + throw SimException(err, __LINE__); + } else { + static const char mark[] = "[[[runtime]]]\n"; + write(mark); + } + } catch (const hls::sim::SimException &e) { + errExit(e.line, e.msg); + } + } + + virtual ~Writer() + { + try { + static const char mark[] = "[[[/runtime]]]\n"; + write(mark); + } catch (const hls::sim::SimException &e) { + errExit(e.line, e.msg); + } + fclose(fp); + } + + void begin(size_t AESL_transaction) + { + static const char mark[] = "[[transaction]] "; + write(mark); + auto buf = std::to_string(AESL_transaction); + buf.push_back('\n'); + buf.push_back('\0'); + write(buf.data()); + } + + void next(const char *s) + { + write(s); + write("\n"); + } + + void end() + { + static const char mark[] = "[[/transaction]]\n"; + write(mark); + } + }; + + bool RTLOutputCheckAndReplacement(char *data) + { + bool changed = false; + for (size_t i = 2; i < strlen(data); ++i) { + if (data[i] == 'X' || data[i] == 'x') { + data[i] = '0'; + changed = true; + } + } + return changed; + } + + void warnOnX() + { + static const char msg[] = + "WARNING: [SIM 212-201] RTL produces unknown value " + "'x' or 'X' on some port, possible cause: " + "There are uninitialized variables in the design.\n"; + fprintf(stderr, msg); + } + +#ifndef POST_CHECK + class RefTCL { + FILE *fp; + std::ostringstream ss; + + void formatDepth() + { + ss << "set depth_list {\n"; + for (auto &p : depth) { + ss << " {" << p.first << " " << p.second << "}\n"; + } + if (nameHBM != "") { + ss << " {" << nameHBM << " " << depthHBM << "}\n"; + } + ss << "}\n"; + } + + void formatTransNum() + { + ss << "set trans_num " << AESL_transaction << "\n"; + } + + void formatHBM() + { + ss << "set HBM_ArgDict {\n" + << " Name " << nameHBM << "\n" + << " Port " << portHBM << "\n" + << " BitWidth " << widthHBM << "\n" + << "}\n"; + } + + void close() + { + 
formatDepth(); + formatTransNum(); + if (nameHBM != "") { + formatHBM(); + } + std::string &&s { ss.str() }; + size_t res = fwrite(s.data(), s.size(), 1, fp); + fclose(fp); + if (res != 1) { + errExit(__LINE__, err); + } + } + + public: + std::map depth; + std::string nameHBM; + size_t depthHBM; + std::string portHBM; + unsigned widthHBM; + size_t AESL_transaction; + std::mutex mut; + + RefTCL(const char *path) + { + fp = fopen(path, "w"); + if (fp == nullptr) { + errExit(__LINE__, err); + } + } + + void set(const char* name, size_t dep) + { + std::lock_guard guard(mut); + if (depth[name] < dep) { + depth[name] = dep; + } + } + + ~RefTCL() + { + close(); + } + }; + +#endif + + struct Register { + const char* name; + unsigned width; +#ifdef POST_CHECK + Reader* reader; +#else + Writer* owriter; + Writer* iwriter; +#endif + void* param; + +#ifndef POST_CHECK + void doTCL(RefTCL &tcl) + { + if (strcmp(name, "return") == 0) { + tcl.set("ap_return", 1); + } else { + tcl.set(name, 1); + } + } +#endif + ~Register() + { +#ifdef POST_CHECK + delete reader; +#else + delete owriter; + delete iwriter; +#endif + } + }; + + template + struct Memory { + unsigned width; + unsigned asize; + bool hbm; + std::vector name; +#ifdef POST_CHECK + Reader* reader; +#else + Writer* owriter; + Writer* iwriter; +#endif + std::vector param; + std::vector nbytes; + std::vector offset; + std::vector hasWrite; + + size_t depth() + { + size_t depth = 0; + for (size_t n : nbytes) { + depth += divide_ceil(n, asize); + } + return depth; + } + +#ifndef POST_CHECK + void doTCL(RefTCL &tcl) + { + if (hbm) { + tcl.nameHBM.append(name[0]); + tcl.portHBM.append("{").append(name[0]); + for (size_t i = 1; i < name.size(); ++i) { + tcl.nameHBM.append("_").append(name[i]); + tcl.portHBM.append(" ").append(name[i]); + } + tcl.nameHBM.append("_HBM"); + tcl.portHBM.append("}"); + tcl.widthHBM = width; + tcl.depthHBM = divide_ceil(nbytes[0], asize); + } else { + tcl.set(name[0], depth()); + } + } +#endif + + 
~Memory() + { +#ifdef POST_CHECK + delete reader; +#else + delete owriter; + delete iwriter; +#endif + } + }; + + struct A2Stream { + unsigned width; + unsigned asize; + const char* name; +#ifdef POST_CHECK + Reader* reader; +#else + Writer* owriter; + Writer* iwriter; +#endif + void* param; + size_t nbytes; + bool hasWrite; + +#ifndef POST_CHECK + void doTCL(RefTCL &tcl) + { + tcl.set(name, divide_ceil(nbytes, asize)); + } +#endif + + ~A2Stream() + { +#ifdef POST_CHECK + delete reader; +#else + delete owriter; + delete iwriter; +#endif + } + }; + + template + struct Stream { + unsigned width; + const char* name; +#ifdef POST_CHECK + Reader* reader; +#else + Writer* writer; + Writer* swriter; + Writer* gwriter; +#endif + hls::stream* param; + std::vector buf; + size_t initSize; + size_t depth; + bool hasWrite; + + void markSize() + { + initSize = param->size(); + } + + void buffer() + { + buf.clear(); + while (!param->empty()) { + buf.push_back(param->read()); + } + for (auto &e : buf) { + param->write(e); + } + } + +#ifndef POST_CHECK + void doTCL(RefTCL &tcl) + { + tcl.set(name, depth); + } +#endif + + ~Stream() + { +#ifdef POST_CHECK + delete reader; +#else + delete writer; + delete swriter; + delete gwriter; +#endif + } + }; + +#ifdef POST_CHECK + void check(Register &port) + { + port.reader->begin(); + bool foundX = false; + if (char *s = port.reader->next()) { + foundX |= RTLOutputCheckAndReplacement(s); + unformatData(s, (unsigned char*)port.param); + } + port.reader->end(); + if (foundX) { + warnOnX(); + } + } + +#ifdef USE_BINARY_TV_FILE + void checkHBM(Memory &port) + { + port.reader->begin(); + size_t wbytes = least_nbyte(port.width); + for (size_t i = 0; i < port.param.size(); ++i) { + if (port.hasWrite[i]) { + port.reader->reset(); + size_t skip = wbytes * port.offset[i]; + port.reader->advance(skip); + port.reader->into((unsigned char*)port.param[i], wbytes, + port.asize, port.nbytes[i] - skip); + } + } + } + + void check(Memory &port) + { + if 
(port.hbm) { + return checkHBM(port); + } else { + port.reader->begin(); + size_t wbytes = least_nbyte(port.width); + for (size_t i = 0; i < port.param.size(); ++i) { + if (port.hasWrite[i]) { + port.reader->into((unsigned char*)port.param[i], wbytes, + port.asize, port.nbytes[i]); + } else { + size_t n = divide_ceil(port.nbytes[i], port.asize); + port.reader->advance(port.asize*n); + } + } + } + } +#endif + void transfer(Reader *reader, size_t nbytes, unsigned char *put, bool &foundX) + { + if (char *s = reader->next()) { + foundX |= RTLOutputCheckAndReplacement(s); + unformatData(s, put, nbytes); + } else { + throw SimException("No more data", __LINE__); + } + } + + void checkHBM(Memory &port) + { + port.reader->begin(); + bool foundX = false; + size_t wbytes = least_nbyte(port.width); + for (size_t i = 0, last = port.param.size()-1; i <= last; ++i) { + if (port.hasWrite[i]) { + port.reader->skip(port.offset[i]); + size_t n = port.nbytes[i] / port.asize - port.offset[i]; + unsigned char *put = (unsigned char*)port.param[i]; + for (size_t j = 0; j < n; ++j) { + transfer(port.reader, wbytes, put, foundX); + put += port.asize; + } + if (i < last) { + port.reader->reset(); + } + } + } + port.reader->end(); + if (foundX) { + warnOnX(); + } + } + + void check(Memory &port) + { + if (port.hbm) { + return checkHBM(port); + } else { + port.reader->begin(); + bool foundX = false; + size_t wbytes = least_nbyte(port.width); + for (size_t i = 0; i < port.param.size(); ++i) { + if (port.hasWrite[i]) { + size_t n = port.nbytes[i] / port.asize; + size_t r = port.nbytes[i] % port.asize; + unsigned char *put = (unsigned char*)port.param[i]; + for (size_t j = 0; j < n; ++j) { + transfer(port.reader, wbytes, put, foundX); + put += port.asize; + } + if (r > 0) { + transfer(port.reader, r, put, foundX); + } + } else { + size_t n = divide_ceil(port.nbytes[i], port.asize); + port.reader->skip(n); + } + } + port.reader->end(); + if (foundX) { + warnOnX(); + } + } + } + + void 
check(A2Stream &port) + { + port.reader->begin(); + bool foundX = false; + if (port.hasWrite) { + size_t wbytes = least_nbyte(port.width); + size_t n = port.nbytes / port.asize; + size_t r = port.nbytes % port.asize; + unsigned char *put = (unsigned char*)port.param; + for (size_t j = 0; j < n; ++j) { + if (char *s = port.reader->next()) { + foundX |= RTLOutputCheckAndReplacement(s); + unformatData(s, put, wbytes); + } + put += port.asize; + } + if (r > 0) { + if (char *s = port.reader->next()) { + foundX |= RTLOutputCheckAndReplacement(s); + unformatData(s, put, r); + } + } + } + port.reader->end(); + if (foundX) { + warnOnX(); + } + } + + template + void check(Stream &port) + { + if (port.hasWrite) { + port.reader->begin(); + bool foundX = false; + E *p = new E; + while (char *s = port.reader->next()) { + foundX |= RTLOutputCheckAndReplacement(s); + unformatData(s, (unsigned char*)p); + port.param->write(*p); + } + delete p; + port.reader->end(); + if (foundX) { + warnOnX(); + } + } else { + port.reader->begin(); + size_t n = 0; + if (char *s = port.reader->next()) { + std::istringstream ss(s); + ss >> n; + } else { + throw SimException(bad, __LINE__); + } + port.reader->end(); + for (size_t j = 0; j < n; ++j) { + port.param->read(); + } + } + } +#else + void dump(Register &port, Writer *writer, size_t AESL_transaction) + { + writer->begin(AESL_transaction); + std::string &&s { formatData((unsigned char*)port.param, port.width) }; + writer->next(s.data()); + writer->end(); + } + + void error_on_depth_unspecified(const char *portName) + { + std::string msg {"A depth specification is required for MAXI interface port "}; + msg.append("'"); + msg.append(portName); + msg.append("'"); + msg.append(" for cosimulation."); + throw SimException(msg, __LINE__); + } + +#ifdef USE_BINARY_TV_FILE + void dump(Memory &port, Output *writer, size_t AESL_transaction) + { + writer->begin(port.depth()); + size_t wbytes = least_nbyte(port.width); + for (size_t i = 0; i < 
port.param.size(); ++i) { + if (port.nbytes[i] == 0) { + error_on_depth_unspecified(port.hbm ? port.name[i] : port.name[0]); + } + writer->from((unsigned char*)port.param[i], wbytes, port.asize, + port.nbytes[i], 0); + } + } + +#endif + void dump(Memory &port, Writer *writer, size_t AESL_transaction) + { + writer->begin(AESL_transaction); + for (size_t i = 0; i < port.param.size(); ++i) { + if (port.nbytes[i] == 0) { + error_on_depth_unspecified(port.hbm ? port.name[i] : port.name[0]); + } + size_t n = divide_ceil(port.nbytes[i], port.asize); + unsigned char *put = (unsigned char*)port.param[i]; + for (size_t j = 0; j < n; ++j) { + std::string &&s { + formatData(put, port.width) + }; + writer->next(s.data()); + put += port.asize; + } + if (port.hbm) { + break; + } + } + writer->end(); + } + + void dump(A2Stream &port, Writer *writer, size_t AESL_transaction) + { + writer->begin(AESL_transaction); + size_t n = divide_ceil(port.nbytes, port.asize); + unsigned char *put = (unsigned char*)port.param; + for (size_t j = 0; j < n; ++j) { + std::string &&s { formatData(put, port.width) }; + writer->next(s.data()); + put += port.asize; + } + writer->end(); + } + + template + void dump(Stream &port, size_t AESL_transaction) + { + if (port.hasWrite) { + port.writer->begin(AESL_transaction); + port.depth = port.param->size()-port.initSize; + for (size_t j = 0; j < port.depth; ++j) { + std::string &&s { + formatData((unsigned char*)&port.buf[port.initSize+j], port.width) + }; + port.writer->next(s.c_str()); + } + port.writer->end(); + + port.swriter->begin(AESL_transaction); + port.swriter->next(std::to_string(port.depth).c_str()); + port.swriter->end(); + } else { + port.writer->begin(AESL_transaction); + port.depth = port.initSize-port.param->size(); + for (size_t j = 0; j < port.depth; ++j) { + std::string &&s { + formatData((unsigned char*)&port.buf[j], port.width) + }; + port.writer->next(s.c_str()); + } + port.writer->end(); + + port.swriter->begin(AESL_transaction); + 
port.swriter->next(std::to_string(port.depth).c_str()); + port.swriter->end(); + + port.gwriter->begin(AESL_transaction); + size_t n = (port.depth ? port.initSize : port.depth); + size_t d = port.depth; + do { + port.gwriter->next(std::to_string(n--).c_str()); + } while (d--); + port.gwriter->end(); + } + } +#endif +} + + + +extern "C" +void div_hw_stub_wrapper(void*, void*, void*); + +extern "C" +void apatb_div_hw(void* __xlx_apatb_param_data_in_0, void* __xlx_apatb_param_data_in_1, void* __xlx_apatb_param_data_out_0) +{ + static hls::sim::Stream> port0 { + .width = 32, + .name = "data_in_0", +#ifdef POST_CHECK + .reader = new hls::sim::Reader(WRAPC_STREAM_SIZE_IN_data_in_0), +#else + .writer = new hls::sim::Writer(AUTOTB_TVIN_data_in_0), + .swriter = new hls::sim::Writer(WRAPC_STREAM_SIZE_IN_data_in_0), + .gwriter = new hls::sim::Writer(WRAPC_STREAM_INGRESS_STATUS_data_in_0), +#endif + }; + port0.param = (hls::stream>*)__xlx_apatb_param_data_in_0; + port0.hasWrite = false; + + static hls::sim::Stream> port1 { + .width = 32, + .name = "data_in_1", +#ifdef POST_CHECK + .reader = new hls::sim::Reader(WRAPC_STREAM_SIZE_IN_data_in_1), +#else + .writer = new hls::sim::Writer(AUTOTB_TVIN_data_in_1), + .swriter = new hls::sim::Writer(WRAPC_STREAM_SIZE_IN_data_in_1), + .gwriter = new hls::sim::Writer(WRAPC_STREAM_INGRESS_STATUS_data_in_1), +#endif + }; + port1.param = (hls::stream>*)__xlx_apatb_param_data_in_1; + port1.hasWrite = false; + + static hls::sim::Stream> port2 { + .width = 16, + .name = "data_out_0", +#ifdef POST_CHECK + .reader = new hls::sim::Reader(AUTOTB_TVOUT_PC_data_out_0), +#else + .writer = new hls::sim::Writer(AUTOTB_TVOUT_data_out_0), + .swriter = new hls::sim::Writer(WRAPC_STREAM_SIZE_OUT_data_out_0), + .gwriter = new hls::sim::Writer(WRAPC_STREAM_EGRESS_STATUS_data_out_0), +#endif + }; + port2.param = (hls::stream>*)__xlx_apatb_param_data_out_0; + port2.hasWrite = true; + + try { +#ifdef POST_CHECK + CodeState = ENTER_WRAPC_PC; + check(port0); + 
check(port1); + check(port2); +#else + static hls::sim::RefTCL tcl("../tv/cdatafile/ref.tcl"); + CodeState = DUMP_INPUTS; + port0.markSize(); + port1.markSize(); + port0.buffer(); + port1.buffer(); + port2.markSize(); + CodeState = CALL_C_DUT; + div_hw_stub_wrapper(__xlx_apatb_param_data_in_0, __xlx_apatb_param_data_in_1, __xlx_apatb_param_data_out_0); + port2.buffer(); + dump(port0, tcl.AESL_transaction); + dump(port1, tcl.AESL_transaction); + port0.doTCL(tcl); + port1.doTCL(tcl); + CodeState = DUMP_OUTPUTS; + dump(port2, tcl.AESL_transaction); + port2.doTCL(tcl); + tcl.AESL_transaction++; +#endif + } catch (const hls::sim::SimException &e) { + hls::sim::errExit(e.line, e.msg); + } +} \ No newline at end of file diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div_ir.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div_ir.bc new file mode 100644 index 000000000..829e85e29 Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div_ir.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div_ir.ll b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div_ir.ll new file mode 100644 index 000000000..295781360 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div_ir.ll @@ -0,0 +1,279 @@ +; ModuleID = '/workspace/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/a.g.ld.5.gdce.bc' +source_filename = "llvm-link" +target datalayout = "e-m:e-i64:64-i128:128-i256:256-i512:512-i1024:1024-i2048:2048-i4096:4096-n8:16:32:64-S128-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "fpga64-xilinx-none" + +%"class.hls::stream, 0>" = type { %"struct.ap_int<32>" } +%"struct.ap_int<32>" = type { %"struct.ap_int_base<32, true>" } +%"struct.ap_int_base<32, true>" = type { 
%"struct.ssdm_int<32, true>" } +%"struct.ssdm_int<32, true>" = type { i32 } +%"class.hls::stream, 0>" = type { %"struct.ap_int<16>" } +%"struct.ap_int<16>" = type { %"struct.ap_int_base<16, true>" } +%"struct.ap_int_base<16, true>" = type { %"struct.ssdm_int<16, true>" } +%"struct.ssdm_int<16, true>" = type { i16 } + +; Function Attrs: inaccessiblememonly nounwind +declare void @llvm.sideeffect() #0 + +; Function Attrs: inaccessiblemem_or_argmemonly noinline +define void @apatb_div_ir(%"class.hls::stream, 0>"* noalias nocapture nonnull dereferenceable(4) %data_in_0, %"class.hls::stream, 0>"* noalias nocapture nonnull dereferenceable(4) %data_in_1, %"class.hls::stream, 0>"* noalias nocapture nonnull dereferenceable(2) %data_out_0) local_unnamed_addr #1 { +entry: + %data_in_0_copy = alloca i32, align 512 + call void @llvm.sideeffect() #7 [ "stream_interface"(i32* %data_in_0_copy, i32 0) ] + %data_in_1_copy = alloca i32, align 512 + call void @llvm.sideeffect() #7 [ "stream_interface"(i32* %data_in_1_copy, i32 0) ] + %data_out_0_copy = alloca i16, align 512 + call void @llvm.sideeffect() #8 [ "stream_interface"(i16* %data_out_0_copy, i32 0) ] + call fastcc void @copy_in(%"class.hls::stream, 0>"* nonnull %data_in_0, i32* nonnull align 512 %data_in_0_copy, %"class.hls::stream, 0>"* nonnull %data_in_1, i32* nonnull align 512 %data_in_1_copy, %"class.hls::stream, 0>"* nonnull %data_out_0, i16* nonnull align 512 %data_out_0_copy) + call void @apatb_div_hw(i32* %data_in_0_copy, i32* %data_in_1_copy, i16* %data_out_0_copy) + call void @copy_back(%"class.hls::stream, 0>"* %data_in_0, i32* %data_in_0_copy, %"class.hls::stream, 0>"* %data_in_1, i32* %data_in_1_copy, %"class.hls::stream, 0>"* %data_out_0, i16* %data_out_0_copy) + ret void +} + +; Function Attrs: argmemonly noinline +define internal fastcc void @copy_in(%"class.hls::stream, 0>"* noalias "unpacked"="0", i32* noalias nocapture align 512 "unpacked"="1.0", %"class.hls::stream, 0>"* noalias "unpacked"="2", i32* 
noalias nocapture align 512 "unpacked"="3.0", %"class.hls::stream, 0>"* noalias "unpacked"="4", i16* noalias nocapture align 512 "unpacked"="5.0") unnamed_addr #2 { +entry: + call fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>"(i32* align 512 %1, %"class.hls::stream, 0>"* %0) + call fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>"(i32* align 512 %3, %"class.hls::stream, 0>"* %2) + call fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>"(i16* align 512 %5, %"class.hls::stream, 0>"* %4) + ret void +} + +; Function Attrs: argmemonly noinline +define internal fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>"(i16* noalias nocapture align 512 "unpacked"="0.0" %dst, %"class.hls::stream, 0>"* noalias "unpacked"="1" %src) unnamed_addr #3 { +entry: + %0 = icmp eq %"class.hls::stream, 0>"* %src, null + br i1 %0, label %ret, label %copy + +copy: ; preds = %entry + call fastcc void @"streamcpy_hls.p0class.hls::stream, 0>"(i16* align 512 %dst, %"class.hls::stream, 0>"* nonnull %src) + br label %ret + +ret: ; preds = %copy, %entry + ret void +} + +; Function Attrs: argmemonly noinline +define internal fastcc void @"streamcpy_hls.p0class.hls::stream, 0>"(i16* noalias nocapture align 512 "unpacked"="0.0", %"class.hls::stream, 0>"* noalias nocapture "unpacked"="1") unnamed_addr #4 { +entry: + %2 = alloca %"class.hls::stream, 0>" + %3 = alloca i16 + br label %empty + +empty: ; preds = %push, %entry + %4 = bitcast %"class.hls::stream, 0>"* %1 to i8* + %5 = call i1 @fpga_fifo_not_empty_2(i8* %4) + br i1 %5, label %push, label %ret + +push: ; preds = %empty + %6 = bitcast %"class.hls::stream, 0>"* %2 to i8* + %7 = bitcast %"class.hls::stream, 0>"* %1 to i8* + call void @fpga_fifo_pop_2(i8* %6, i8* %7) + %8 = load volatile %"class.hls::stream, 0>", %"class.hls::stream, 0>"* %2 + %.evi = extractvalue %"class.hls::stream, 0>" %8, 0, 0, 0, 0 + store i16 %.evi, i16* %3 + %9 = bitcast i16* %3 to i8* + %10 = bitcast i16* %0 to i8* + call void @fpga_fifo_push_2(i8* %9, i8* 
%10) + br label %empty, !llvm.loop !5 + +ret: ; preds = %empty + ret void +} + +; Function Attrs: argmemonly noinline +define internal fastcc void @copy_out(%"class.hls::stream, 0>"* noalias "unpacked"="0", i32* noalias nocapture align 512 "unpacked"="1.0", %"class.hls::stream, 0>"* noalias "unpacked"="2", i32* noalias nocapture align 512 "unpacked"="3.0", %"class.hls::stream, 0>"* noalias "unpacked"="4", i16* noalias nocapture align 512 "unpacked"="5.0") unnamed_addr #5 { +entry: + call fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>.23"(%"class.hls::stream, 0>"* %0, i32* align 512 %1) + call fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>.23"(%"class.hls::stream, 0>"* %2, i32* align 512 %3) + call fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>.4"(%"class.hls::stream, 0>"* %4, i16* align 512 %5) + ret void +} + +; Function Attrs: argmemonly noinline +define internal fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>.4"(%"class.hls::stream, 0>"* noalias "unpacked"="0" %dst, i16* noalias nocapture align 512 "unpacked"="1.0" %src) unnamed_addr #3 { +entry: + %0 = icmp eq %"class.hls::stream, 0>"* %dst, null + br i1 %0, label %ret, label %copy + +copy: ; preds = %entry + call fastcc void @"streamcpy_hls.p0class.hls::stream, 0>.7"(%"class.hls::stream, 0>"* nonnull %dst, i16* align 512 %src) + br label %ret + +ret: ; preds = %copy, %entry + ret void +} + +; Function Attrs: argmemonly noinline +define internal fastcc void @"streamcpy_hls.p0class.hls::stream, 0>.7"(%"class.hls::stream, 0>"* noalias nocapture "unpacked"="0", i16* noalias nocapture align 512 "unpacked"="1.0") unnamed_addr #4 { +entry: + %2 = alloca i16 + %3 = alloca %"class.hls::stream, 0>" + br label %empty + +empty: ; preds = %push, %entry + %4 = bitcast i16* %1 to i8* + %5 = call i1 @fpga_fifo_not_empty_2(i8* %4) + br i1 %5, label %push, label %ret + +push: ; preds = %empty + %6 = bitcast i16* %2 to i8* + %7 = bitcast i16* %1 to i8* + call void @fpga_fifo_pop_2(i8* %6, i8* %7) + %8 
= load volatile i16, i16* %2 + %.ivi = insertvalue %"class.hls::stream, 0>" undef, i16 %8, 0, 0, 0, 0 + store %"class.hls::stream, 0>" %.ivi, %"class.hls::stream, 0>"* %3 + %9 = bitcast %"class.hls::stream, 0>"* %3 to i8* + %10 = bitcast %"class.hls::stream, 0>"* %0 to i8* + call void @fpga_fifo_push_2(i8* %9, i8* %10) + br label %empty, !llvm.loop !7 + +ret: ; preds = %empty + ret void +} + +; Function Attrs: argmemonly noinline +define internal fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>"(i32* noalias nocapture align 512 "unpacked"="0.0" %dst, %"class.hls::stream, 0>"* noalias "unpacked"="1" %src) unnamed_addr #3 { +entry: + %0 = icmp eq %"class.hls::stream, 0>"* %src, null + br i1 %0, label %ret, label %copy + +copy: ; preds = %entry + call fastcc void @"streamcpy_hls.p0class.hls::stream, 0>.18"(i32* align 512 %dst, %"class.hls::stream, 0>"* nonnull %src) + br label %ret + +ret: ; preds = %copy, %entry + ret void +} + +; Function Attrs: argmemonly noinline +define internal fastcc void @"streamcpy_hls.p0class.hls::stream, 0>.18"(i32* noalias nocapture align 512 "unpacked"="0.0", %"class.hls::stream, 0>"* noalias nocapture "unpacked"="1") unnamed_addr #4 { +entry: + %2 = alloca %"class.hls::stream, 0>" + %3 = alloca i32 + br label %empty + +empty: ; preds = %push, %entry + %4 = bitcast %"class.hls::stream, 0>"* %1 to i8* + %5 = call i1 @fpga_fifo_not_empty_4(i8* %4) + br i1 %5, label %push, label %ret + +push: ; preds = %empty + %6 = bitcast %"class.hls::stream, 0>"* %2 to i8* + %7 = bitcast %"class.hls::stream, 0>"* %1 to i8* + call void @fpga_fifo_pop_4(i8* %6, i8* %7) + %8 = load volatile %"class.hls::stream, 0>", %"class.hls::stream, 0>"* %2 + %.evi = extractvalue %"class.hls::stream, 0>" %8, 0, 0, 0, 0 + store i32 %.evi, i32* %3 + %9 = bitcast i32* %3 to i8* + %10 = bitcast i32* %0 to i8* + call void @fpga_fifo_push_4(i8* %9, i8* %10) + br label %empty, !llvm.loop !8 + +ret: ; preds = %empty + ret void +} + +; Function Attrs: argmemonly noinline 
+define internal fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>.23"(%"class.hls::stream, 0>"* noalias "unpacked"="0" %dst, i32* noalias nocapture align 512 "unpacked"="1.0" %src) unnamed_addr #3 { +entry: + %0 = icmp eq %"class.hls::stream, 0>"* %dst, null + br i1 %0, label %ret, label %copy + +copy: ; preds = %entry + call fastcc void @"streamcpy_hls.p0class.hls::stream, 0>.26"(%"class.hls::stream, 0>"* nonnull %dst, i32* align 512 %src) + br label %ret + +ret: ; preds = %copy, %entry + ret void +} + +; Function Attrs: argmemonly noinline +define internal fastcc void @"streamcpy_hls.p0class.hls::stream, 0>.26"(%"class.hls::stream, 0>"* noalias nocapture "unpacked"="0", i32* noalias nocapture align 512 "unpacked"="1.0") unnamed_addr #4 { +entry: + %2 = alloca i32 + %3 = alloca %"class.hls::stream, 0>" + br label %empty + +empty: ; preds = %push, %entry + %4 = bitcast i32* %1 to i8* + %5 = call i1 @fpga_fifo_not_empty_4(i8* %4) + br i1 %5, label %push, label %ret + +push: ; preds = %empty + %6 = bitcast i32* %2 to i8* + %7 = bitcast i32* %1 to i8* + call void @fpga_fifo_pop_4(i8* %6, i8* %7) + %8 = load volatile i32, i32* %2 + %.ivi = insertvalue %"class.hls::stream, 0>" undef, i32 %8, 0, 0, 0, 0 + store %"class.hls::stream, 0>" %.ivi, %"class.hls::stream, 0>"* %3 + %9 = bitcast %"class.hls::stream, 0>"* %3 to i8* + %10 = bitcast %"class.hls::stream, 0>"* %0 to i8* + call void @fpga_fifo_push_4(i8* %9, i8* %10) + br label %empty, !llvm.loop !9 + +ret: ; preds = %empty + ret void +} + +declare void @apatb_div_hw(i32*, i32*, i16*) + +; Function Attrs: argmemonly noinline +define internal fastcc void @copy_back(%"class.hls::stream, 0>"* noalias "unpacked"="0", i32* noalias nocapture align 512 "unpacked"="1.0", %"class.hls::stream, 0>"* noalias "unpacked"="2", i32* noalias nocapture align 512 "unpacked"="3.0", %"class.hls::stream, 0>"* noalias "unpacked"="4", i16* noalias nocapture align 512 "unpacked"="5.0") unnamed_addr #5 { +entry: + call fastcc void 
@"onebyonecpy_hls.p0class.hls::stream, 0>.23"(%"class.hls::stream, 0>"* %0, i32* align 512 %1) + call fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>.23"(%"class.hls::stream, 0>"* %2, i32* align 512 %3) + call fastcc void @"onebyonecpy_hls.p0class.hls::stream, 0>.4"(%"class.hls::stream, 0>"* %4, i16* align 512 %5) + ret void +} + +define void @div_hw_stub_wrapper(i32*, i32*, i16*) #6 { +entry: + %3 = alloca %"class.hls::stream, 0>" + %4 = alloca %"class.hls::stream, 0>" + %5 = alloca %"class.hls::stream, 0>" + call void @copy_out(%"class.hls::stream, 0>"* %3, i32* %0, %"class.hls::stream, 0>"* %4, i32* %1, %"class.hls::stream, 0>"* %5, i16* %2) + call void @div_hw_stub(%"class.hls::stream, 0>"* %3, %"class.hls::stream, 0>"* %4, %"class.hls::stream, 0>"* %5) + call void @copy_in(%"class.hls::stream, 0>"* %3, i32* %0, %"class.hls::stream, 0>"* %4, i32* %1, %"class.hls::stream, 0>"* %5, i16* %2) + ret void +} + +declare void @div_hw_stub(%"class.hls::stream, 0>"*, %"class.hls::stream, 0>"*, %"class.hls::stream, 0>"*) + +declare i1 @fpga_fifo_not_empty_4(i8*) + +declare i1 @fpga_fifo_not_empty_2(i8*) + +declare void @fpga_fifo_pop_4(i8*, i8*) + +declare void @fpga_fifo_pop_2(i8*, i8*) + +declare void @fpga_fifo_push_4(i8*, i8*) + +declare void @fpga_fifo_push_2(i8*, i8*) + +attributes #0 = { inaccessiblememonly nounwind } +attributes #1 = { inaccessiblemem_or_argmemonly noinline "fpga.wrapper.func"="wrapper" } +attributes #2 = { argmemonly noinline "fpga.wrapper.func"="copyin" } +attributes #3 = { argmemonly noinline "fpga.wrapper.func"="onebyonecpy_hls" } +attributes #4 = { argmemonly noinline "fpga.wrapper.func"="streamcpy_hls" } +attributes #5 = { argmemonly noinline "fpga.wrapper.func"="copyout" } +attributes #6 = { "fpga.wrapper.func"="stub" } +attributes #7 = { inaccessiblememonly nounwind "xlx.port.bitwidth"="32" "xlx.source"="user" } +attributes #8 = { inaccessiblememonly nounwind "xlx.port.bitwidth"="16" "xlx.source"="user" } + +!llvm.dbg.cu = !{} 
+!llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, 
!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0} +!llvm.module.flags = !{!1, !2, !3} +!blackbox_cfg = !{!4} + +!0 = !{!"clang version 7.0.0 "} +!1 = !{i32 2, !"Dwarf Version", i32 4} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 1, !"wchar_size", i32 4} +!4 = !{} +!5 = distinct !{!5, !6} +!6 = !{!"llvm.loop.rotate.disable"} +!7 = distinct !{!7, !6} +!8 = distinct !{!8, !6} +!9 = distinct !{!9, !6} diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div_util.cpp b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div_util.cpp new file mode 100644 index 000000000..ab0ae554c --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/apatb_div_util.cpp @@ -0,0 +1,31 @@ +#include +#include "hls_stream.h" + +using namespace std; + +struct __cosim_T_2__ {char data[2];}; +extern "C" void fpga_fifo_push_2(__cosim_T_2__* val, hls::stream<__cosim_T_2__>* fifo) { + fifo->write(*val); +} +extern "C" void fpga_fifo_pop_2(__cosim_T_2__* val, hls::stream<__cosim_T_2__>* fifo) { + *val = fifo->read(); +} +extern "C" bool fpga_fifo_not_empty_2(hls::stream<__cosim_T_2__>* fifo) { + return !fifo->empty(); +} +extern "C" bool fpga_fifo_exist_2(hls::stream<__cosim_T_2__>* fifo) { + return fifo->exist(); +} +struct __cosim_T_4__ {char data[4];}; +extern "C" void fpga_fifo_push_4(__cosim_T_4__* val, hls::stream<__cosim_T_4__>* fifo) { + fifo->write(*val); +} +extern "C" void fpga_fifo_pop_4(__cosim_T_4__* val, hls::stream<__cosim_T_4__>* fifo) { + *val = fifo->read(); +} +extern "C" bool fpga_fifo_not_empty_4(hls::stream<__cosim_T_4__>* fifo) { + return !fifo->empty(); +} 
+extern "C" bool fpga_fifo_exist_4(hls::stream<__cosim_T_4__>* fifo) { + return fifo->exist(); +} diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/autopilot.rtl.models.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/autopilot.rtl.models.tcl new file mode 100644 index 000000000..050bbcc6d --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/autopilot.rtl.models.tcl @@ -0,0 +1,7 @@ +set SynModuleInfo { + {SRCNAME div MODELNAME div RTLNAME div IS_TOP 1 + SUBMODULES { + {MODELNAME div_sdiv_32ns_32ns_16_36_1 RTLNAME div_sdiv_32ns_32ns_16_36_1 BINDTYPE op TYPE sdiv IMPL auto LATENCY 35 ALLOW_PRAGMA 1} + } + } +} diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/autopilot.rtl.models.txt b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/autopilot.rtl.models.txt new file mode 100644 index 000000000..b16da2552 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/autopilot.rtl.models.txt @@ -0,0 +1,2 @@ +div_sdiv_32ns_32ns_16_36_1 +div diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/clang.diag.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/clang.diag.xml new file mode 100644 index 000000000..e69de29bb diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.1.0.fe.opt.json b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.1.0.fe.opt.json new file mode 100644 index 000000000..88b6bfcb1 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.1.0.fe.opt.json @@ -0,0 +1,11 @@ +{ + "0": { + "basicblocks": 5, + "calls": "", + "file": "div.cpp", + "fileloc": 10, + "instructions": 39, + "name": "div", + "total_instructions": 188 + } +} \ No newline at end of file diff --git 
a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.2.0.fe.pragmas.json b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.2.0.fe.pragmas.json new file mode 100644 index 000000000..f63e4f85c --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.2.0.fe.pragmas.json @@ -0,0 +1,11 @@ +{ + "0": { + "basicblocks": 11, + "calls": "", + "file": "div.cpp", + "fileloc": 10, + "instructions": 27, + "name": "div", + "total_instructions": 27 + } +} \ No newline at end of file diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.3.0.fe.perf.json b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.3.0.fe.perf.json new file mode 100644 index 000000000..4a87c31d6 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.3.0.fe.perf.json @@ -0,0 +1,11 @@ +{ + "0": { + "basicblocks": 5, + "calls": "", + "file": "div.cpp", + "fileloc": 10, + "instructions": 17, + "name": "div", + "total_instructions": 17 + } +} \ No newline at end of file diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.4.0.fe.perf.json b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.4.0.fe.perf.json new file mode 100644 index 000000000..21a026432 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.4.0.fe.perf.json @@ -0,0 +1,17 @@ +{ + "0": { + "basicblocks": 5, + "calls": "", + "file": "div.cpp", + "fileloc": 10, + "instructions": 44, + "loads": { + "L": 2 + }, + "name": "div", + "stores": { + "L": 1 + }, + "total_instructions": 44 + } +} \ No newline at end of file diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.5.0.fe.end.json b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.5.0.fe.end.json new file mode 100644 index 
000000000..b0ea10fc6 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/cmplx.5.0.fe.end.json @@ -0,0 +1,11 @@ +{ + "0": { + "name": "div", + "file": "div.cpp", + "fileloc": "10", + "instructions": "22", + "total_instructions": "22", + "calls": "", + "basicblocks": "5" + } +} diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/design.bindinfo.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/design.bindinfo.xml new file mode 100644 index 000000000..666b0915d --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/design.bindinfo.xml @@ -0,0 +1,4 @@ + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.DependenceCheck.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.DependenceCheck.tcl new file mode 100644 index 000000000..df2f7e3cb --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.DependenceCheck.tcl @@ -0,0 +1,6 @@ + + +set RtlHierarchyInfo {[ + {"ID" : "0", "Level" : "0", "Path" : "`AUTOTB_DUT_INST"}, + {"ID" : "1", "Level" : "1", "Path" : "`AUTOTB_DUT_INST.sdiv_32ns_32ns_16_36_1_U1"}]} +set DependenceCheckSize 0 diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.adb b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.adb new file mode 100644 index 000000000..63b8f7543 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.adb @@ -0,0 +1,3922 @@ + + + + + -1 + + + div + Pipeline + 0 + + 3 + 0 + + + + 1 + 1 + data_in_0 + + + 0 + + + + 0 + 0 + + data_in_0 + + + + + FIFO + 0 + 0 + 0 + + + 32 + + 0 + 3 + 0 + + 0 + 0 + + + + + + 1 + 2 + data_in_1 + + + 0 + + + + 0 + 0 + + data_in_1 + + + + + FIFO + 0 + 0 + 0 + + + 32 + + 0 + 3 + 0 + + 0 + 0 + + + + + + 1 + 3 + data_out_0 + + + 0 + + + + 0 + 0 + + data_out_0 + + + + + 
FIFO + 0 + 0 + 2531319821 + + + 16 + + 1 + 3 + 0 + + 0 + 0 + + + + + 14 + 0 + + + + 0 + 12 + tmp + div.cpp + ../. + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + auto + fifo + memory + FIFO + 0 + 0 + 78 + + + 1 + + + 3 + 0 + 32 + 33 + 35 + + nbreadreq + 0 + 0 + 0 + 0 + 0.00 + 1 + -1 + + + + + 0 + 13 + br_ln14 + div.cpp + ../. + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + + + + + 0 + 0 + 0 + + + 0 + + + 3 + 0 + 36 + 37 + 38 + + br + 0 + 0 + 0 + 0 + 0.00 + 2 + -1 + + + + + 0 + 15 + tmp_1 + div.cpp + ../. + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + auto + fifo + memory + FIFO + 0 + 0 + 78 + + + 1 + + + 3 + 0 + 39 + 40 + 41 + + nbreadreq + 0 + 0 + 0 + 0 + 0.00 + 3 + -1 + + + + + 0 + 16 + br_ln14 + div.cpp + ../. + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + + + + + 0 + 0 + 0 + + + 0 + + + 3 + 0 + 42 + 43 + 44 + + br + 0 + 0 + 0 + 0 + 0.00 + 4 + -1 + + + + + 0 + 18 + br_ln15 + div.cpp + ../. + 15 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 15 + + + + + + + + + + + 0 + 0 + 4294967295 + + + 0 + + + 1 + 0 + 45 + + br + 0 + 0 + 0 + 0 + 0.00 + 5 + -1 + + + + + 0 + 20 + data_in_0_read + div.cpp + ../. + 18 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 18 + + + + + + + auto + fifo + memory + FIFO + 0 + 0 + 78 + + + 33 + + + 3 + 0 + 47 + 48 + 147 + + nbread + 0 + 0 + 0 + 0 + 1.42 + 6 + -1 + + + + + 0 + 21 + in0 + div.cpp + ../. 
+ 18 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 18 + + + + + in0 + + + + + + 0 + 0 + 0 + + + 32 + + + 1 + 0 + 49 + + extractvalue + 0 + 0 + 0 + 0 + 0.00 + 7 + -1 + + + + + 0 + 22 + data_in_1_read + div.cpp + ../. + 19 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 19 + + + + + + + auto + fifo + memory + FIFO + 0 + 0 + 78 + + + 33 + + + 3 + 0 + 50 + 51 + 148 + + nbread + 0 + 0 + 0 + 0 + 1.42 + 8 + -1 + + + + + 0 + 23 + in1 + div.cpp + ../. + 19 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 19 + + + + + in1 + + + + + + 0 + 0 + 1764524928 + + + 32 + + + 1 + 0 + 52 + + extractvalue + 0 + 0 + 0 + 0 + 0.00 + 9 + -1 + + + + + 0 + 24 + sdiv_ln20 + div.cpp + ../. + 20 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 20 + + + + + + sdiv_32ns_32ns_16_36_1_U1 + auto + sdiv + auto + Divider + 0 + 0 + 6 + + + 16 + + + 2 + 0 + 53 + 54 + + sdiv + 0 + 0 + 0 + 0 + 1.16 + 10 + -1 + + + + + 0 + 25 + res + div.cpp + ../. + 20 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 20 + + + + + res + data_out_0_din + + + + + 0 + 0 + 0 + + + 16 + + + 1 + 0 + 55 + + trunc + 0 + 0 + 0 + 0 + 0.00 + 11 + -1 + + + + + 0 + 26 + empty + div.cpp + ../. + 22 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 22 + + + + + + + auto + fifo + memory + FIFO + 0 + 0 + 78 + + + 1 + + + 3 + 0 + 57 + 58 + 59 + + nbwrite + 0 + 0 + 0 + 0 + 1.42 + 12 + -1 + + + + + 0 + 27 + br_ln23 + div.cpp + ../. 
+ 23 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 23 + + + + + + + + + + + 0 + 0 + 1775678896 + + + 0 + + + 1 + 0 + 60 + + br + 0 + 0 + 0 + 0 + 0.00 + 13 + -1 + + + + + 0 + 29 + _ln23 + div.cpp + ../. + 23 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 23 + + + + + + + + + + + 0 + 0 + 1872 + + + 0 + + + 0 + 0 + + ret + 0 + 0 + 0 + 0 + 0.00 + 14 + -1 + + + + 1 + 0 + + + + 2 + 34 + empty + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 1775688224 + + + 32 + + 0 + 1 + + + + 5 + 0 + + + 3 + 14 + entry + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 2531278202 + + + + 2 + 0 + 12 + 13 + + + + + 3 + 17 + lor.lhs.false + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 1775671552 + + + + 2 + 0 + 15 + 16 + + + + + 3 + 19 + if.then + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 1775611760 + + + + 1 + 0 + 18 + + + + + 3 + 28 + if.end + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 0 + + + + 8 + 0 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + + + + + 3 + 30 + return + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 0 + + + + 1 + 0 + 29 + + + + + 29 + 0 + + 33 + 1 + 1 + 12 + 0 + + + 35 + 1 + 34 + 12 + 0 + + + 36 + 1 + 12 + 13 + 0 + + + 37 + 2 + 19 + 13 + 0 + + + 38 + 2 + 17 + 13 + 0 + + + 40 + 1 + 2 + 15 + 0 + + + 41 + 1 + 34 + 15 + 0 + + + 42 + 1 + 15 + 16 + 0 + + + 43 + 2 + 19 + 16 + 0 + + + 44 + 2 + 28 + 16 + 0 + + + 45 + 2 + 30 + 18 + 0 + + + 48 + 1 + 1 + 20 + 0 + + + 49 + 1 + 20 + 21 + 0 + + + 51 + 1 + 2 + 22 + 0 + + + 52 + 1 + 22 + 23 + 0 + + + 53 + 1 + 21 + 24 + 0 + + + 54 + 1 + 23 + 24 + 0 + + + 55 + 1 + 24 + 25 + 0 + + + 58 + 1 + 3 + 26 + 0 + + + 59 + 1 + 25 + 26 + 0 + + + 60 + 2 + 30 + 27 + 0 + + + 141 + 2 + 14 + 17 + 0 + + + 142 + 2 + 14 + 19 + 0 + + + 143 + 2 + 17 + 28 + 0 + + + 144 + 2 + 17 + 19 + 0 + + + 145 + 2 + 19 + 30 + 0 + + + 146 + 2 + 28 + 30 + 0 + + + 147 + 4 + 12 + 20 + 0 + + + 148 + 4 + 15 + 22 + 0 + + + + + 1 + 0 + + 1 + div + div + 0 + + 0 
+ 0 + + + 5 + 0 + 14 + 17 + 19 + 28 + 30 + + 1 + 36 + -1 + -1 + 35 + 35 + 0 + + + + + + 36 + 0 + + 1 + + 18 + 0 + + 4 + 1 + 1 + + + 5 + 1 + 1 + + + 6 + 1 + 1 + + + 7 + 1 + 1 + + + 8 + 1 + 1 + + + 9 + 1 + 1 + + + 10 + 1 + 1 + + + 11 + 1 + 1 + + + 12 + 1 + 1 + + + 13 + 1 + 1 + + + 15 + 1 + 1 + + + 16 + 1 + 1 + + + 18 + 1 + 1 + + + 20 + 1 + 1 + + + 21 + 1 + 1 + + + 22 + 1 + 1 + + + 23 + 1 + 1 + + + 24 + 36 + 36 + + + + + 2 + + 1 + 0 + + 24 + 35 + 36 + + + + + 3 + + 1 + 0 + + 24 + 34 + 36 + + + + + 4 + + 1 + 0 + + 24 + 33 + 36 + + + + + 5 + + 1 + 0 + + 24 + 32 + 36 + + + + + 6 + + 1 + 0 + + 24 + 31 + 36 + + + + + 7 + + 1 + 0 + + 24 + 30 + 36 + + + + + 8 + + 1 + 0 + + 24 + 29 + 36 + + + + + 9 + + 1 + 0 + + 24 + 28 + 36 + + + + + 10 + + 1 + 0 + + 24 + 27 + 36 + + + + + 11 + + 1 + 0 + + 24 + 26 + 36 + + + + + 12 + + 1 + 0 + + 24 + 25 + 36 + + + + + 13 + + 1 + 0 + + 24 + 24 + 36 + + + + + 14 + + 1 + 0 + + 24 + 23 + 36 + + + + + 15 + + 1 + 0 + + 24 + 22 + 36 + + + + + 16 + + 1 + 0 + + 24 + 21 + 36 + + + + + 17 + + 1 + 0 + + 24 + 20 + 36 + + + + + 18 + + 1 + 0 + + 24 + 19 + 36 + + + + + 19 + + 1 + 0 + + 24 + 18 + 36 + + + + + 20 + + 1 + 0 + + 24 + 17 + 36 + + + + + 21 + + 1 + 0 + + 24 + 16 + 36 + + + + + 22 + + 1 + 0 + + 24 + 15 + 36 + + + + + 23 + + 1 + 0 + + 24 + 14 + 36 + + + + + 24 + + 1 + 0 + + 24 + 13 + 36 + + + + + 25 + + 1 + 0 + + 24 + 12 + 36 + + + + + 26 + + 1 + 0 + + 24 + 11 + 36 + + + + + 27 + + 1 + 0 + + 24 + 10 + 36 + + + + + 28 + + 1 + 0 + + 24 + 9 + 36 + + + + + 29 + + 1 + 0 + + 24 + 8 + 36 + + + + + 30 + + 1 + 0 + + 24 + 7 + 36 + + + + + 31 + + 1 + 0 + + 24 + 6 + 36 + + + + + 32 + + 1 + 0 + + 24 + 5 + 36 + + + + + 33 + + 1 + 0 + + 24 + 4 + 36 + + + + + 34 + + 1 + 0 + + 24 + 3 + 36 + + + + + 35 + + 1 + 0 + + 24 + 2 + 36 + + + + + 36 + + 5 + 0 + + 24 + 1 + 36 + + + 25 + 1 + 1 + + + 26 + 1 + 1 + + + 27 + 1 + 1 + + + 29 + 1 + 1 + + + + + + 35 + 0 + + 1 + 2 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 2 + 3 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 3 + 4 + + -1 + + 1 + 0 
+ + 0 + 0 + + + + + + 4 + 5 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 5 + 6 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 6 + 7 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 7 + 8 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 8 + 9 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 9 + 10 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 10 + 11 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 11 + 12 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 12 + 13 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 13 + 14 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 14 + 15 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 15 + 16 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 16 + 17 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 17 + 18 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 18 + 19 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 19 + 20 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 20 + 21 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 21 + 22 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 22 + 23 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 23 + 24 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 24 + 25 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 25 + 26 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 26 + 27 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 27 + 28 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 28 + 29 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 29 + 30 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 30 + 31 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 31 + 32 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 32 + 33 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 33 + 34 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 34 + 35 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 35 + 36 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + + + + 1 + 0 + + sdiv_32ns_32ns_16_36_1_U1 (sdiv_32ns_32ns_16_36_1) + + 2 + 0 + + FF + 2283 + + + LUT + 1738 + + + + + + 3 + 0 + + ap_enable_pp0 ( xor ) + + 4 + 0 + + (0P0) + 1 + + + (1P1) + 2 + + + FF + 0 + + + LUT + 2 + + + + + tmp_1_nbreadreq_fu_40_p3 ( and ) + + 3 + 0 + + (0P0) + 1 + + + FF + 0 + + + LUT + 2 + + + + + tmp_nbreadreq_fu_32_p3 ( and ) + + 3 + 0 + + (0P0) + 1 + + + FF + 0 + + + LUT + 2 + + + + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 38 + 0 + + ap_CS_fsm + + 3 + 0 
+ + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter1 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter10 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter11 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter12 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter13 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter14 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter15 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter16 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter17 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter18 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter19 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter2 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter20 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter21 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter22 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter23 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter24 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter25 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter26 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter27 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter28 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + 
ap_enable_reg_pp0_iter29 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter3 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter30 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter31 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter32 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter33 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter34 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter35 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter4 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter5 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter6 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter7 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter8 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + ap_enable_reg_pp0_iter9 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + tmp_1_reg_90 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + tmp_reg_99 + + 3 + 0 + + (Bits) + 1 + + + (Consts) + 0 + + + FF + 1 + + + + + + 1 + 0 + + sdiv_32ns_32ns_16_36_1_U1 + + 0 + 0 + + + + + 1 + 0 + + sdiv_32ns_32ns_16_36_1_U1 (sdiv_32ns_32ns_16_36_1) + + 1 + 0 + 24 + + + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + + 14 + 0 + + 12 + + 0 + 0 + + + + 13 + + 0 + 0 + + + + 15 + + 0 + 0 + + + + 16 + + 0 + 0 + + + + 18 + + 0 + 0 + + + + 20 + + 0 + 0 + + + + 21 + + 0 + 0 + + + + 22 + + 0 + 0 + + + + 23 + + 0 + 0 + + + + 24 + + 0 + 35 + + + + 25 + + 35 + 0 + + + + 26 + + 35 + 0 + + + + 27 + + 35 + 0 + + + + 29 + + 35 + 0 + + + + + 5 + 0 + + 14 + + 0 + 0 + + + + 17 + + 0 + 0 + + + + 19 + + 0 + 0 + 
+ + + 28 + + 0 + 35 + + + + 30 + + 35 + 35 + + + + + 1 + 0 + + div + + 5 + 0 + 14 + 17 + 19 + 28 + 30 + + + 0 + 0 + + -1 + 8 + 1 + 36 + + 0 + 0 + + + + + 9 + 0 + + 32 + + 1 + 0 + 12 + + + + 40 + + 1 + 0 + 15 + + + + 48 + + 1 + 0 + 20 + + + + 54 + + 1 + 0 + 22 + + + + 60 + + 1 + 0 + 26 + + + + 67 + + 1 + 0 + 23 + + + + 71 + + 1 + 0 + 21 + + + + 75 + + 36 + 0 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + + + + 81 + + 1 + 0 + 25 + + + + + 3 + 0 + + in0_fu_71 + + 1 + 0 + 21 + + + + in1_fu_67 + + 1 + 0 + 23 + + + + res_fu_81 + + 1 + 0 + 25 + + + + + 1 + 0 + + grp_fu_75 + + 36 + 0 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + + + + + 5 + 0 + + data_in_0_read_nbread_fu_48 + + 1 + 0 + 20 + + + + data_in_1_read_nbread_fu_54 + + 1 + 0 + 22 + + + + empty_nbwrite_fu_60 + + 1 + 0 + 26 + + + + tmp_1_nbreadreq_fu_40 + + 1 + 0 + 15 + + + + tmp_nbreadreq_fu_32 + + 1 + 0 + 12 + + + + + 0 + 0 + + + 0 + 0 + + + 4 + 0 + + 86 + + 1 + 0 + 23 + + + + 90 + + 1 + 0 + 15 + + + + 94 + + 1 + 0 + 21 + + + + 99 + + 1 + 0 + 12 + + + + + 4 + 0 + + in0_reg_94 + + 1 + 0 + 21 + + + + in1_reg_86 + + 1 + 0 + 23 + + + + tmp_1_reg_90 + + 1 + 0 + 15 + + + + tmp_reg_99 + + 1 + 0 + 12 + + + + + 0 + 0 + + + 0 + 0 + + + 3 + 0 + + data_in_0 + + 2 + 0 + + nbread + + 1 + 0 + 20 + + + + nbreadreq + + 1 + 0 + 12 + + + + + + data_in_1 + + 2 + 0 + + nbread + + 1 + 0 + 22 + + + + nbreadreq + + 1 + 0 + 15 + + + + + + data_out_0 + + 1 + 0 + + nbwrite + + 1 + 0 + 26 + + + + + + + 3 + 0 + + 1 + + 666 + 7 + + + + 2 + + 666 + 7 + + + + 3 + + 666 + 7 + + + + + 6 + 0 + + 12 + + 666 + 7 + + + + 15 + + 666 + 7 + + + + 20 + + 666 + 7 + + + + 22 + + 666 + 7 + + + + 24 + + 15 + 0 + + + + 26 + + 666 + 7 + + + + + diff --git 
a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.adb.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.adb.xml new file mode 100644 index 000000000..a61dc149c --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.adb.xml @@ -0,0 +1,1759 @@ +div + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +FIFO + + + + + + + + + + + +NULL + + + + + + + + + + + +FIFO + + + + + + + + + + + + +NULL + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +FIFO + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +FIFO + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + 
+ + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +FIFO + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + + + +FIFO + + +FIFO + + +FIFO + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.bc 
b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.bc new file mode 100644 index 000000000..07bf8222b Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.bind.adb b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.bind.adb new file mode 100644 index 000000000..6be3e72d9 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.bind.adb @@ -0,0 +1,3065 @@ + + + + + -1 + + + div + Pipeline + 0 + + 3 + 0 + + + + 1 + 1 + data_in_0 + + + 0 + + + + 0 + 0 + + data_in_0 + + + + + FIFO + 0 + 0 + 0 + + + 32 + + 0 + 3 + 0 + + 0 + 0 + + + + + + 1 + 2 + data_in_1 + + + 0 + + + + 0 + 0 + + data_in_1 + + + + + FIFO + 0 + 0 + 0 + + + 32 + + 0 + 3 + 0 + + 0 + 0 + + + + + + 1 + 3 + data_out_0 + + + 0 + + + + 0 + 0 + + data_out_0 + + + + + FIFO + 0 + 0 + 2531319821 + + + 16 + + 1 + 3 + 0 + + 0 + 0 + + + + + 14 + 0 + + + + 0 + 12 + tmp + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + auto + fifo + memory + FIFO + 0 + 0 + 78 + + + 1 + + + 3 + 0 + 32 + 33 + 35 + + nbreadreq + 0 + 0 + 0 + 0 + 0.00 + 1 + -1 + + + + + 0 + 13 + br_ln14 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + + + + + 0 + 0 + 0 + + + 0 + + + 3 + 0 + 36 + 37 + 38 + + br + 0 + 0 + 0 + 0 + 0.00 + 2 + -1 + + + + + 0 + 15 + tmp_1 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + auto + fifo + memory + FIFO + 0 + 0 + 78 + + + 1 + + + 3 + 0 
+ 39 + 40 + 41 + + nbreadreq + 0 + 0 + 0 + 0 + 0.00 + 3 + -1 + + + + + 0 + 16 + br_ln14 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + + + + + 0 + 0 + 0 + + + 0 + + + 3 + 0 + 42 + 43 + 44 + + br + 0 + 0 + 0 + 0 + 0.00 + 4 + -1 + + + + + 0 + 18 + br_ln15 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 15 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 15 + + + + + + + + + + + 0 + 0 + 4294967295 + + + 0 + + + 1 + 0 + 45 + + br + 0 + 0 + 0 + 0 + 0.00 + 5 + -1 + + + + + 0 + 20 + data_in_0_read + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 18 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 18 + + + + + + + auto + fifo + memory + FIFO + 0 + 0 + 78 + + + 33 + + + 3 + 0 + 47 + 48 + 147 + + nbread + 0 + 0 + 0 + 0 + 1.42 + 6 + -1 + + + + + 0 + 21 + in0 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 18 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 18 + + + + + in0 + + + + + + 0 + 0 + 0 + + + 32 + + + 1 + 0 + 49 + + extractvalue + 0 + 0 + 0 + 0 + 0.00 + 7 + -1 + + + + + 0 + 22 + data_in_1_read + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 19 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 19 + + + + + + + auto + fifo + memory + FIFO + 0 + 0 + 78 + + + 33 + + + 3 + 0 + 50 + 51 + 148 + + nbread + 0 + 0 + 0 + 0 + 1.42 + 8 + -1 + + + + + 0 + 23 + in1 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 19 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 19 + + + + + in1 + + + + + + 0 + 0 + 1764524928 + + + 
32 + + + 1 + 0 + 52 + + extractvalue + 0 + 0 + 0 + 0 + 0.00 + 9 + -1 + + + + + 0 + 24 + sdiv_ln20 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 20 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 20 + + + + + + + auto + sdiv + auto + Divider + 0 + 0 + 6 + + + 16 + + + 2 + 0 + 53 + 54 + + sdiv + 0 + 0 + 0 + 0 + 1.16 + 10 + -1 + + + + + 0 + 25 + res + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 20 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 20 + + + + + res + + + + + + 0 + 0 + 0 + + + 16 + + + 1 + 0 + 55 + + trunc + 0 + 0 + 0 + 0 + 0.00 + 11 + -1 + + + + + 0 + 26 + empty + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 22 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 22 + + + + + + + auto + fifo + memory + FIFO + 0 + 0 + 78 + + + 1 + + + 3 + 0 + 57 + 58 + 59 + + nbwrite + 0 + 0 + 0 + 0 + 1.42 + 12 + -1 + + + + + 0 + 27 + br_ln23 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 23 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 23 + + + + + + + + + + + 0 + 0 + 1775678896 + + + 0 + + + 1 + 0 + 60 + + br + 0 + 0 + 0 + 0 + 0.00 + 13 + -1 + + + + + 0 + 29 + _ln23 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 23 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 23 + + + + + + + + + + + 0 + 0 + 1872 + + + 0 + + + 0 + 0 + + ret + 0 + 0 + 0 + 0 + 0.00 + 14 + -1 + + + + 1 + 0 + + + + 2 + 34 + empty + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 1775688224 + + + 32 + + 0 + 1 + + + + 5 + 0 + + + 3 + 14 + entry + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 2531278202 + + + + 2 + 0 + 12 + 13 + + + + + 3 + 17 + lor.lhs.false + + + 0 + + + + 0 + 0 + + + + 
+ + + + 0 + 0 + 1775671552 + + + + 2 + 0 + 15 + 16 + + + + + 3 + 19 + if.then + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 1775611760 + + + + 1 + 0 + 18 + + + + + 3 + 28 + if.end + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 0 + + + + 8 + 0 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + + + + + 3 + 30 + return + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 0 + + + + 1 + 0 + 29 + + + + + 29 + 0 + + 33 + 1 + 1 + 12 + 0 + + + 35 + 1 + 34 + 12 + 0 + + + 36 + 1 + 12 + 13 + 0 + + + 37 + 2 + 19 + 13 + 0 + + + 38 + 2 + 17 + 13 + 0 + + + 40 + 1 + 2 + 15 + 0 + + + 41 + 1 + 34 + 15 + 0 + + + 42 + 1 + 15 + 16 + 0 + + + 43 + 2 + 19 + 16 + 0 + + + 44 + 2 + 28 + 16 + 0 + + + 45 + 2 + 30 + 18 + 0 + + + 48 + 1 + 1 + 20 + 0 + + + 49 + 1 + 20 + 21 + 0 + + + 51 + 1 + 2 + 22 + 0 + + + 52 + 1 + 22 + 23 + 0 + + + 53 + 1 + 21 + 24 + 0 + + + 54 + 1 + 23 + 24 + 0 + + + 55 + 1 + 24 + 25 + 0 + + + 58 + 1 + 3 + 26 + 0 + + + 59 + 1 + 25 + 26 + 0 + + + 60 + 2 + 30 + 27 + 0 + + + 141 + 2 + 14 + 17 + 0 + + + 142 + 2 + 14 + 19 + 0 + + + 143 + 2 + 17 + 28 + 0 + + + 144 + 2 + 17 + 19 + 0 + + + 145 + 2 + 19 + 30 + 0 + + + 146 + 2 + 28 + 30 + 0 + + + 147 + 4 + 12 + 20 + 0 + + + 148 + 4 + 15 + 22 + 0 + + + + + 1 + 0 + + 1 + div + div + 0 + + 0 + 0 + + + 5 + 0 + 14 + 17 + 19 + 28 + 30 + + 1 + 36 + -1 + -1 + 35 + 35 + 0 + + + + + + 36 + 0 + + 1 + + 18 + 0 + + 4 + 1 + 1 + + + 5 + 1 + 1 + + + 6 + 1 + 1 + + + 7 + 1 + 1 + + + 8 + 1 + 1 + + + 9 + 1 + 1 + + + 10 + 1 + 1 + + + 11 + 1 + 1 + + + 12 + 1 + 1 + + + 13 + 1 + 1 + + + 15 + 1 + 1 + + + 16 + 1 + 1 + + + 18 + 1 + 1 + + + 20 + 1 + 1 + + + 21 + 1 + 1 + + + 22 + 1 + 1 + + + 23 + 1 + 1 + + + 24 + 36 + 36 + + + + + 2 + + 1 + 0 + + 24 + 35 + 36 + + + + + 3 + + 1 + 0 + + 24 + 34 + 36 + + + + + 4 + + 1 + 0 + + 24 + 33 + 36 + + + + + 5 + + 1 + 0 + + 24 + 32 + 36 + + + + + 6 + + 1 + 0 + + 24 + 31 + 36 + + + + + 7 + + 1 + 0 + + 24 + 30 + 36 + + + + + 8 + + 1 + 0 + + 24 + 29 + 36 + + + + + 9 + + 1 + 0 + + 24 + 28 + 36 + + + + + 10 + + 1 + 0 + + 24 + 27 + 36 + + + + + 11 + 
+ 1 + 0 + + 24 + 26 + 36 + + + + + 12 + + 1 + 0 + + 24 + 25 + 36 + + + + + 13 + + 1 + 0 + + 24 + 24 + 36 + + + + + 14 + + 1 + 0 + + 24 + 23 + 36 + + + + + 15 + + 1 + 0 + + 24 + 22 + 36 + + + + + 16 + + 1 + 0 + + 24 + 21 + 36 + + + + + 17 + + 1 + 0 + + 24 + 20 + 36 + + + + + 18 + + 1 + 0 + + 24 + 19 + 36 + + + + + 19 + + 1 + 0 + + 24 + 18 + 36 + + + + + 20 + + 1 + 0 + + 24 + 17 + 36 + + + + + 21 + + 1 + 0 + + 24 + 16 + 36 + + + + + 22 + + 1 + 0 + + 24 + 15 + 36 + + + + + 23 + + 1 + 0 + + 24 + 14 + 36 + + + + + 24 + + 1 + 0 + + 24 + 13 + 36 + + + + + 25 + + 1 + 0 + + 24 + 12 + 36 + + + + + 26 + + 1 + 0 + + 24 + 11 + 36 + + + + + 27 + + 1 + 0 + + 24 + 10 + 36 + + + + + 28 + + 1 + 0 + + 24 + 9 + 36 + + + + + 29 + + 1 + 0 + + 24 + 8 + 36 + + + + + 30 + + 1 + 0 + + 24 + 7 + 36 + + + + + 31 + + 1 + 0 + + 24 + 6 + 36 + + + + + 32 + + 1 + 0 + + 24 + 5 + 36 + + + + + 33 + + 1 + 0 + + 24 + 4 + 36 + + + + + 34 + + 1 + 0 + + 24 + 3 + 36 + + + + + 35 + + 1 + 0 + + 24 + 2 + 36 + + + + + 36 + + 5 + 0 + + 24 + 1 + 36 + + + 25 + 1 + 1 + + + 26 + 1 + 1 + + + 27 + 1 + 1 + + + 29 + 1 + 1 + + + + + + 35 + 0 + + 1 + 2 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 2 + 3 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 3 + 4 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 4 + 5 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 5 + 6 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 6 + 7 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 7 + 8 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 8 + 9 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 9 + 10 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 10 + 11 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 11 + 12 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 12 + 13 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 13 + 14 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 14 + 15 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 15 + 16 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 16 + 17 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 17 + 18 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 18 + 19 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 19 + 20 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 20 + 21 + + -1 + 
+ 1 + 0 + + 0 + 0 + + + + + + 21 + 22 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 22 + 23 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 23 + 24 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 24 + 25 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 25 + 26 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 26 + 27 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 27 + 28 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 28 + 29 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 29 + 30 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 30 + 31 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 31 + 32 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 32 + 33 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 33 + 34 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 34 + 35 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + 35 + 36 + + -1 + + 1 + 0 + + 0 + 0 + + + + + + + + + 14 + 0 + + 12 + + 0 + 0 + + + + 13 + + 0 + 0 + + + + 15 + + 0 + 0 + + + + 16 + + 0 + 0 + + + + 18 + + 0 + 0 + + + + 20 + + 0 + 0 + + + + 21 + + 0 + 0 + + + + 22 + + 0 + 0 + + + + 23 + + 0 + 0 + + + + 24 + + 0 + 35 + + + + 25 + + 35 + 0 + + + + 26 + + 35 + 0 + + + + 27 + + 35 + 0 + + + + 29 + + 35 + 0 + + + + + 5 + 0 + + 14 + + 0 + 0 + + + + 17 + + 0 + 0 + + + + 19 + + 0 + 0 + + + + 28 + + 0 + 35 + + + + 30 + + 35 + 35 + + + + + 1 + 0 + + div + + 5 + 0 + 14 + 17 + 19 + 28 + 30 + + + 0 + 0 + + -1 + 8 + 1 + 36 + + 0 + 0 + + + + + 9 + 0 + + 32 + + 1 + 0 + 12 + + + + 40 + + 1 + 0 + 15 + + + + 48 + + 1 + 0 + 20 + + + + 54 + + 1 + 0 + 22 + + + + 60 + + 1 + 0 + 26 + + + + 67 + + 1 + 0 + 23 + + + + 71 + + 1 + 0 + 21 + + + + 75 + + 36 + 0 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + + + + 81 + + 1 + 0 + 25 + + + + + 3 + 0 + + in0_fu_71 + + 1 + 0 + 21 + + + + in1_fu_67 + + 1 + 0 + 23 + + + + res_fu_81 + + 1 + 0 + 25 + + + + + 1 + 0 + + grp_fu_75 + + 36 + 0 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 24 + 
24 + 24 + 24 + 24 + + + + + 5 + 0 + + data_in_0_read_nbread_fu_48 + + 1 + 0 + 20 + + + + data_in_1_read_nbread_fu_54 + + 1 + 0 + 22 + + + + empty_nbwrite_fu_60 + + 1 + 0 + 26 + + + + tmp_1_nbreadreq_fu_40 + + 1 + 0 + 15 + + + + tmp_nbreadreq_fu_32 + + 1 + 0 + 12 + + + + + 0 + 0 + + + 0 + 0 + + + 4 + 0 + + 86 + + 1 + 0 + 23 + + + + 90 + + 1 + 0 + 15 + + + + 94 + + 1 + 0 + 21 + + + + 99 + + 1 + 0 + 12 + + + + + 4 + 0 + + in0_reg_94 + + 1 + 0 + 21 + + + + in1_reg_86 + + 1 + 0 + 23 + + + + tmp_1_reg_90 + + 1 + 0 + 15 + + + + tmp_reg_99 + + 1 + 0 + 12 + + + + + 0 + 0 + + + 0 + 0 + + + 3 + 0 + + data_in_0 + + 2 + 0 + + nbread + + 1 + 0 + 20 + + + + nbreadreq + + 1 + 0 + 12 + + + + + + data_in_1 + + 2 + 0 + + nbread + + 1 + 0 + 22 + + + + nbreadreq + + 1 + 0 + 15 + + + + + + data_out_0 + + 1 + 0 + + nbwrite + + 1 + 0 + 26 + + + + + + + 3 + 0 + + 1 + + 666 + 7 + + + + 2 + + 666 + 7 + + + + 3 + + 666 + 7 + + + + + 6 + 0 + + 12 + + 666 + 7 + + + + 15 + + 666 + 7 + + + + 20 + + 666 + 7 + + + + 22 + + 666 + 7 + + + + 24 + + 15 + 0 + + + + 26 + + 666 + 7 + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.bind.adb.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.bind.adb.xml new file mode 100644 index 000000000..a61dc149c --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.bind.adb.xml @@ -0,0 +1,1759 @@ +div + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + 
+NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +FIFO + + + + + + + + + + + +NULL + + + + + + + + + + + +FIFO + + + + + + + + + + + + +NULL + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +FIFO + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +FIFO + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +FIFO + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + + + +FIFO + + 
+FIFO + + +FIFO + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.compgen.dataonly.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.compgen.dataonly.tcl new file mode 100644 index 000000000..c7221ad12 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.compgen.dataonly.tcl @@ -0,0 +1,3 @@ +# This script segment is generated automatically by AutoPilot + +set axilite_register_dict [dict create] diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.compgen.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.compgen.tcl new file mode 100644 index 000000000..1c64f455d --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.compgen.tcl @@ -0,0 +1,125 @@ +# This script segment is generated automatically by AutoPilot + +set name div_sdiv_32ns_32ns_16_36_1 +if {${::AESL::PGuard_rtl_comp_handler}} { + ::AP::rtl_comp_handler $name 
BINDTYPE {op} TYPE {sdiv} IMPL {auto} LATENCY 35 ALLOW_PRAGMA 1 +} + + +# clear list +if {${::AESL::PGuard_autoexp_gen}} { + cg_default_interface_gen_dc_begin + cg_default_interface_gen_bundle_begin + AESL_LIB_XILADAPTER::native_axis_begin +} + +set axilite_register_dict [dict create] +# Direct connection: +if {${::AESL::PGuard_autoexp_gen}} { +eval "cg_default_interface_gen_dc { \ + id 3 \ + name data_in_0 \ + type fifo \ + dir I \ + reset_level 1 \ + sync_rst true \ + corename dc_data_in_0 \ + op interface \ + ports { data_in_0_dout { I 32 vector } data_in_0_empty_n { I 1 bit } data_in_0_read { O 1 bit } } \ +} " +} + +# Direct connection: +if {${::AESL::PGuard_autoexp_gen}} { +eval "cg_default_interface_gen_dc { \ + id 4 \ + name data_in_1 \ + type fifo \ + dir I \ + reset_level 1 \ + sync_rst true \ + corename dc_data_in_1 \ + op interface \ + ports { data_in_1_dout { I 32 vector } data_in_1_empty_n { I 1 bit } data_in_1_read { O 1 bit } } \ +} " +} + +# Direct connection: +if {${::AESL::PGuard_autoexp_gen}} { +eval "cg_default_interface_gen_dc { \ + id 5 \ + name data_out_0 \ + type fifo \ + dir O \ + reset_level 1 \ + sync_rst true \ + corename dc_data_out_0 \ + op interface \ + ports { data_out_0_din { O 16 vector } data_out_0_full_n { I 1 bit } data_out_0_write { O 1 bit } } \ +} " +} + +# Direct connection: +if {${::AESL::PGuard_autoexp_gen}} { +eval "cg_default_interface_gen_dc { \ + id -1 \ + name ap_ctrl \ + type ap_ctrl \ + reset_level 1 \ + sync_rst true \ + corename ap_ctrl \ + op interface \ + ports { ap_start { I 1 bit } ap_ready { O 1 bit } ap_done { O 1 bit } ap_idle { O 1 bit } } \ +} " +} + + +# Adapter definition: +set PortName ap_clk +set DataWd 1 +if {${::AESL::PGuard_autoexp_gen}} { +if {[info proc cg_default_interface_gen_clock] == "cg_default_interface_gen_clock"} { +eval "cg_default_interface_gen_clock { \ + id -2 \ + name ${PortName} \ + reset_level 1 \ + sync_rst true \ + corename apif_ap_clk \ + data_wd ${DataWd} \ + op interface \ 
+}" +} else { +puts "@W \[IMPL-113\] Cannot find bus interface model in the library. Ignored generation of bus interface for '${PortName}'" +} +} + + +# Adapter definition: +set PortName ap_rst +set DataWd 1 +if {${::AESL::PGuard_autoexp_gen}} { +if {[info proc cg_default_interface_gen_reset] == "cg_default_interface_gen_reset"} { +eval "cg_default_interface_gen_reset { \ + id -3 \ + name ${PortName} \ + reset_level 1 \ + sync_rst true \ + corename apif_ap_rst \ + data_wd ${DataWd} \ + op interface \ +}" +} else { +puts "@W \[IMPL-114\] Cannot find bus interface model in the library. Ignored generation of bus interface for '${PortName}'" +} +} + + + +# merge +if {${::AESL::PGuard_autoexp_gen}} { + cg_default_interface_gen_dc_end + cg_default_interface_gen_bundle_end + AESL_LIB_XILADAPTER::native_axis_end +} + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.constraint.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.constraint.tcl new file mode 100644 index 000000000..0840b847c --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.constraint.tcl @@ -0,0 +1,12 @@ +set clock_constraint { \ + name clk \ + module div \ + port ap_clk \ + period 10 \ + uncertainty 2.7 \ +} + +set all_path {} + +set false_path {} + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.cpp.clang.diag.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.cpp.clang.diag.xml new file mode 100644 index 000000000..e69de29bb diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.design.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.design.xml new file mode 100644 index 000000000..3384bd44b --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.design.xml @@ -0,0 +1,232 @@ + + + + +div + + + + + +div + + +10.00 +2.70 +1 
+2.593 + + +35 +35 +35 +0.350 us +0.350 us +0.350 us +1 +36 +yes + + + + +2449 +3456000 +~0 +1808 +1728000 +~0 +0 +5376 +0 +0 +12288 +0 +0 +1280 +0 + + + + +ap_clk +div +return value + +ap_ctrl_hs + +in +1 +control + + +ap_rst +div +return value + +ap_ctrl_hs + +in +1 +control + + +ap_start +div +return value + +ap_ctrl_hs + +in +1 +control + + +ap_done +div +return value + +ap_ctrl_hs + +out +1 +control + + +ap_idle +div +return value + +ap_ctrl_hs + +out +1 +control + + +ap_ready +div +return value + +ap_ctrl_hs + +out +1 +control + + +data_in_0_dout +data_in_0 +pointer + +ap_fifo + +in +32 +control +int + + +data_in_0_empty_n +data_in_0 +pointer + +ap_fifo + +in +1 +control +int + + +data_in_0_read +data_in_0 +pointer + +ap_fifo + +out +1 +control +int + + +data_in_1_dout +data_in_1 +pointer + +ap_fifo + +in +32 +control +int + + +data_in_1_empty_n +data_in_1 +pointer + +ap_fifo + +in +1 +control +int + + +data_in_1_read +data_in_1 +pointer + +ap_fifo + +out +1 +control +int + + +data_out_0_din +data_out_0 +pointer + +ap_fifo + +out +16 +control +int + + +data_out_0_full_n +data_out_0 +pointer + +ap_fifo + +in +1 +control +int + + +data_out_0_write +data_out_0 +pointer + +ap_fifo + +out +1 +control +int + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.g.bc b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.g.bc new file mode 100644 index 000000000..07bf8222b Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.g.bc differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp new file mode 100644 index 000000000..9a89c6d18 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp @@ -0,0 +1,5864 @@ +# 1 "div.cpp" +# 1 "" 1 +# 1 "" 3 +# 376 "" 3 +# 1 "" 1 +# 1 
"" 2 +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/autopilot_ssdm_op.h" 1 +# 108 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/autopilot_ssdm_op.h" +extern "C" { + + + + + + + void _ssdm_op_IfRead(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_IfWrite(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_op_IfNbRead(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_op_IfNbWrite(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_op_IfCanRead(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_op_IfCanWrite(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + + void _ssdm_StreamRead(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_StreamWrite(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_StreamNbRead(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_StreamNbWrite(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_StreamCanRead(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_StreamCanWrite(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned _ssdm_StreamSize(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_ReadReq(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_Read(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_WriteReq(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_Write(...) 
__attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_op_NbReadReq(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_op_CanReadReq(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_op_NbWriteReq(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + unsigned int __attribute__ ((bitwidth(1))) _ssdm_op_CanWriteReq(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + + + + void _ssdm_op_MemShiftRead(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_PrintNone(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_PrintInt(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_PrintDouble(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_Wait(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_Poll(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_Return(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + + void _ssdm_op_SpecSynModule(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecTopModule(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecProcessDecl(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecProcessDef(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecPort(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecConnection(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecChannel(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecSensitive(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecModuleInst(...) 
__attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecPortMap(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecReset(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecPlatform(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecClockDomain(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecPowerDomain(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + int _ssdm_op_SpecRegionBegin(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + int _ssdm_op_SpecRegionEnd(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecLoopName(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecLoopTripCount(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + int _ssdm_op_SpecStateBegin(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + int _ssdm_op_SpecStateEnd(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecInterface(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecPipeline(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecDataflowPipeline(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + + void _ssdm_op_SpecLatency(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecParallel(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecProtocol(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecOccurrence(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecResource(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecResourceLimit(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecCHCore(...) 
__attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecFUCore(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecIFCore(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecIPCore(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecMemCore(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecExt(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + + + + void _ssdm_SpecArrayDimSize(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_RegionBegin(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_RegionEnd(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_Unroll(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_UnrollRegion(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_InlineAll(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_InlineLoop(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_Inline(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_InlineSelf(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_InlineRegion(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_SpecArrayMap(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_SpecArrayPartition(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_SpecArrayReshape(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_SpecStream(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecStable(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecStableContent(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecBindPort(...) 
__attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecPipoDepth(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_SpecExpr(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_SpecExprBalance(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_SpecDependence(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_SpecLoopMerge(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_SpecLoopFlatten(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_SpecLoopRewind(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_SpecFuncInstantiation(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_SpecFuncBuffer(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_SpecFuncExtract(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_SpecConstant(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_DataPack(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_SpecDataPack(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void _ssdm_op_SpecBitsMap(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + void _ssdm_op_SpecLicense(...) __attribute__ ((nothrow)) __attribute__((overloadable)); + + void __xilinx_ip_top(...) 
__attribute__ ((nothrow)) __attribute__((overloadable)); + + +} +# 2 "" 2 +# 1 "div.cpp" 2 +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" 1 +# 10 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_common.h" 1 +# 41 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_common.h" +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_decl.h" 1 +# 54 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_decl.h" +enum ap_q_mode { + AP_RND, + AP_RND_ZERO, + AP_RND_MIN_INF, + AP_RND_INF, + AP_RND_CONV, + AP_TRN, + AP_TRN_ZERO, +}; +# 76 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_decl.h" +enum ap_o_mode { + AP_SAT, + AP_SAT_ZERO, + AP_SAT_SYM, + AP_WRAP, + AP_WRAP_SM, +}; +# 133 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_decl.h" +template +struct ap_int_base; + +template +struct ap_int; + +template +struct ap_uint; + +template +struct ap_range_ref; + +template +struct ap_bit_ref; + +template +struct ap_concat_ref; + +template +struct ap_fixed_base; + +template +struct ap_fixed; + +template +struct ap_ufixed; + +template +struct af_range_ref; + +template +struct af_bit_ref; + + +enum BaseMode { AP_BIN = 2, AP_OCT = 8, AP_DEC = 10, AP_HEX = 16 }; +# 187 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_decl.h" +typedef signed long long ap_slong; +typedef unsigned long long ap_ulong; + + +enum { + _AP_SIZE_char = 8, + _AP_SIZE_short = sizeof(short) * 8, + _AP_SIZE_int = sizeof(int) * 8, + _AP_SIZE_long = sizeof(long) * 8, + _AP_SIZE_ap_slong = sizeof(ap_slong) * 8 +}; +# 42 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_common.h" 2 +# 164 
"/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_common.h" +enum { CHAR_IS_SIGNED = (char)-1 < 0 }; + + +namespace _ap_type { +template +struct is_signed { + static const bool value = _Tp(-1) < _Tp(1); +}; + +template +struct is_integral { + static const bool value = false; +}; + + + + + +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; +template <> struct is_integral { static const bool value = true; }; + + +template +struct enable_if {}; + +template +struct enable_if { + typedef _Tp type; +}; + +template +struct remove_const { + typedef _Tp type; +}; + +template +struct remove_const<_Tp const> { + typedef _Tp type; +}; +} +# 507 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_common.h" +template +struct ssdm_int; + +template +struct ssdm_int<_AP_N, true> { + typedef int __attribute__((bitwidth(_AP_N))) DataType; + int V __attribute__((bitwidth(_AP_N))); + inline __attribute__((always_inline)) ssdm_int<_AP_N, true>(){}; + inline __attribute__((always_inline)) ssdm_int<_AP_N, true>(int o __attribute__((bitwidth(_AP_N)))):V(o){}; +}; + +template +struct ssdm_int<_AP_N, false> { + typedef unsigned __attribute__((bitwidth(_AP_N))) DataType; + unsigned V __attribute__((bitwidth(_AP_N))); + inline 
__attribute__((always_inline)) ssdm_int<_AP_N, false>(){}; + inline __attribute__((always_inline)) ssdm_int<_AP_N, false>(unsigned o __attribute__((bitwidth(_AP_N)))):V(o){}; +}; +# 575 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_common.h" +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/autopilot_ssdm_bits.h" 1 +# 576 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_common.h" 2 + +extern "C" void _ssdm_string2bits(...); +# 587 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_common.h" +static inline unsigned char guess_radix(const char* s) { + unsigned char rd = 10; + const char* p = s; + + if (p[0] == '-' || p[0] == '+') ++p; + + if (p[0] == '0') { + if (p[1] == 'b' || p[1] == 'B') { + rd = 2; + } else if (p[1] == 'o' || p[1] == 'O') { + rd = 8; + } else if (p[1] == 'x' || p[1] == 'X') { + rd = 16; + } else if (p[1] == 'd' || p[1] == 'D') { + rd = 10; + } + } + return rd; +} + + + + + + + +typedef __fp16 half; +# 718 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_common.h" +inline __attribute__((always_inline)) ap_ulong doubleToRawBits(double pf) { + union { + ap_ulong __L; + double __D; + } LD; + LD.__D = pf; + return LD.__L; +} + +inline __attribute__((always_inline)) unsigned int floatToRawBits(float pf) { + union { + unsigned int __L; + float __D; + } LD; + LD.__D = pf; + return LD.__L; +} + +inline __attribute__((always_inline)) unsigned short halfToRawBits(half pf) { + + union { + unsigned short __L; + half __D; + } LD; + LD.__D = pf; + return LD.__L; + + + +} + + +inline __attribute__((always_inline)) double rawBitsToDouble(ap_ulong pi) { + union { + ap_ulong __L; + double __D; + } LD; + LD.__L = pi; + return LD.__D; +} + + +inline __attribute__((always_inline)) float rawBitsToFloat(unsigned long pi) { + union { + unsigned int __L; + float __D; + } LD; + LD.__L = pi; + return 
LD.__D; +} + + +inline __attribute__((always_inline)) half rawBitsToHalf(unsigned short pi) { + + union { + unsigned short __L; + half __D; + } LD; + LD.__L = pi; + return LD.__D; + + + + + + +} +# 11 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" 2 +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" 1 +# 40 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template +struct retval; + + +template +struct retval<_AP_N, true> { + typedef ap_slong Type; +}; + +template +struct retval<_AP_N, false> { + typedef ap_ulong Type; +}; + + +template <> +struct retval<1, true> { + typedef signed char Type; +}; + +template <> +struct retval<1, false> { + typedef unsigned char Type; +}; + + +template <> +struct retval<2, true> { + typedef short Type; +}; + +template <> +struct retval<2, false> { + typedef unsigned short Type; +}; + + +template <> +struct retval<3, true> { + typedef long Type; +}; + +template <> +struct retval<3, false> { + typedef unsigned long Type; +}; + +template <> +struct retval<4, true> { + typedef long Type; +}; + +template <> +struct retval<4, false> { + typedef unsigned long Type; +}; + + + + +template +struct _ap_int_factory; +template +struct _ap_int_factory<_AP_W2,true> { typedef ap_int<_AP_W2> type; }; +template +struct _ap_int_factory<_AP_W2,false> { typedef ap_uint<_AP_W2> type; }; + +template +struct ap_int_base : public ssdm_int<_AP_W, _AP_S> { + private: + inline __attribute__((always_inline)) __attribute__((nodebug)) int countLeadingOnes() const { + + + + + return 0; + + } + + public: + typedef ssdm_int<_AP_W, _AP_S> Base; + + + + + + + + typedef typename retval<(((_AP_W + 7) / 8) > (8) ? 
((_AP_W + 7) / 8) : (8)), _AP_S>::Type RetType; + + static const int width = _AP_W; + static const bool sign_flag = _AP_S; + + template + struct RType { + enum { + mult_w = _AP_W + _AP_W2, + mult_s = _AP_S || _AP_S2, + plus_w = + ((_AP_W + (_AP_S2 && !_AP_S)) > (_AP_W2 + (_AP_S && !_AP_S2)) ? (_AP_W + (_AP_S2 && !_AP_S)) : (_AP_W2 + (_AP_S && !_AP_S2))) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + ((_AP_W + (_AP_S2 && !_AP_S)) > (_AP_W2 + (_AP_S && !_AP_S2)) ? (_AP_W + (_AP_S2 && !_AP_S)) : (_AP_W2 + (_AP_S && !_AP_S2))) + 1, + minus_s = true, + div_w = _AP_W + _AP_S2, + div_s = _AP_S || _AP_S2, + mod_w = ((_AP_W) < (_AP_W2 + (!_AP_S2 && _AP_S)) ? (_AP_W) : (_AP_W2 + (!_AP_S2 && _AP_S))), + mod_s = _AP_S, + logic_w = ((_AP_W + (_AP_S2 && !_AP_S)) > (_AP_W2 + (_AP_S && !_AP_S2)) ? (_AP_W + (_AP_S2 && !_AP_S)) : (_AP_W2 + (_AP_S && !_AP_S2))), + logic_s = _AP_S || _AP_S2 + }; + + + typedef ap_int_base mult_base; + typedef ap_int_base plus_base; + typedef ap_int_base minus_base; + typedef ap_int_base logic_base; + typedef ap_int_base div_base; + typedef ap_int_base mod_base; + typedef ap_int_base<_AP_W, _AP_S> arg1_base; + + typedef typename _ap_int_factory::type mult; + typedef typename _ap_int_factory::type plus; + typedef typename _ap_int_factory::type minus; + typedef typename _ap_int_factory::type logic; + typedef typename _ap_int_factory::type div; + typedef typename _ap_int_factory::type mod; + typedef typename _ap_int_factory<_AP_W, _AP_S>::type arg1; + typedef bool reduce; + }; + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base() { + + + + + + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const ap_int_base<_AP_W2, _AP_S2>& op):Base(op.V) { + Base::V = op.V; + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } +# 208 
"/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const bool op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const char op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const signed char op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const unsigned char op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const short op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const unsigned short op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const int op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const unsigned int op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const long op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const unsigned long op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const ap_slong op):Base(op) { Base::V = op; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const ap_ulong op):Base(op) { Base::V = op; } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(half op) { + ap_int_base<_AP_W, _AP_S> t((float)op); + Base::V = t.V; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(float op) { + const int BITS = 23 + 8 + 1; + ap_int_base reg; + reg.V = floatToRawBits(op); + bool is_neg = ({ typeof(reg.V) __Val2__ = reg.V; bool __Result__ = 
__builtin_bit_select((void*)(&__Val2__), BITS - 1); __Result__; }); + + ap_int_base<8 + 1, true> exp = 0; + exp.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(reg.V) __Val2__ = reg.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 23, BITS - 2); __Result__; }); + exp = exp - ((1 << (8 - 1)) - 1); + + ap_int_base<23 + 2, true> man; + man.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(reg.V) __Val2__ = reg.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, 23 - 1); __Result__; }); + + (static_cast(0)); + + + man.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(man.V) __Val2__ = man.V; typeof(1) __Repl2__ = !!1; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), 23, 23); __Result__; }); + + + if ((reg.V & 0x7ffffffful) == 0) { + Base::V = 0; + } else { + int sh_amt = 23 - exp.V; + if (sh_amt == 0) { + Base::V = man.V; + } else if (sh_amt > 0) { + if (sh_amt < 23 + 2) { + ap_int_base<23 + 2, true> man_shift; + man_shift.V = (man.V >> sh_amt); + + + + + + + Base::V = man_shift.V; + } else { + + + + Base::V = 0; + } + } else { + sh_amt = -sh_amt; + if (sh_amt < _AP_W) { + Base::V = man.V; + Base::V <<= sh_amt; + } else { + + + + Base::V = 0; + } + } + } + if (is_neg) *this = -(*this); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(double op) { + const int BITS = 52 + 11 + 1; + ap_int_base reg; + reg.V = doubleToRawBits(op); + bool is_neg = ({ typeof(reg.V) __Val2__ = reg.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), BITS - 1); __Result__; }); + + ap_int_base<11 + 1, true> exp, bias = ((1 << (11 - 1)) - 1); + exp.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(reg.V) __Val2__ = reg.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 52, BITS - 2); __Result__; }); + exp = exp - bias; + + ap_int_base<52 + 2, true> man; + man.V = ({ typename 
_ap_type::remove_const::type __Result__ = 0; typeof(reg.V) __Val2__ = reg.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, 52 - 1); __Result__; }); + + (static_cast(0)); + + + man.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(man.V) __Val2__ = man.V; typeof(1) __Repl2__ = !!1; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), 52, 52); __Result__; }); + + + if ((reg.V & 0x7fffffffffffffffull) == 0) { + Base::V = 0; + } else { + int sh_amt = 52 - exp.V; + if (sh_amt == 0) { + Base::V = man.V; + } else if (sh_amt > 0) { + if (sh_amt < 52 + 2) { + ap_int_base<23 + 2, true> man_shift; + man_shift.V = (man.V >> sh_amt); + + + + + + + Base::V = man_shift.V; + } else { + + + + Base::V = 0; + } + } else { + sh_amt = -sh_amt; + if (sh_amt < _AP_W) { + Base::V = man.V; + Base::V <<= sh_amt; + } else { + + + + Base::V = 0; + } + } + } + if (is_neg) *this = -(*this); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + op.checkOverflowCsimFix(_AP_W, _AP_S); + Base::V = op.to_ap_int_base().V; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const ap_range_ref<_AP_W2, _AP_S2>& ref) { + Base::V = (ref.get()).V; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const ap_bit_ref<_AP_W2, _AP_S2>& ref) { + Base::V = ref.operator bool(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { + const ap_int_base::_AP_WR, + false> + tmp = ref.get(); + Base::V = tmp.V; + } +# 387 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const char* s) { + typeof(Base::V) t; + 
_ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_W, _AP_S, + AP_TRN, AP_WRAP, 0, true); + Base::V = t; + } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base(const char* s, signed char rd) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_W, _AP_S, + AP_TRN, AP_WRAP, 0, true); + Base::V = t; + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + Base::V = (val.get()).V; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + Base::V = val.operator bool(); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base read() volatile { + + ap_int_base ret; + ret.V = Base::V; + return ret; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) void write(const ap_int_base<_AP_W, _AP_S>& op2) volatile { + + *((volatile typename Base::DataType *)(&(Base::V))) = op2.V; + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=( + const volatile ap_int_base<_AP_W2, _AP_S2>& op2) volatile { + *((volatile typename Base::DataType *)(&(Base::V))) = op2.V; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=( + const volatile ap_int_base<_AP_W, _AP_S>& op2) volatile { + *((volatile typename Base::DataType *)(&(Base::V))) = op2.V; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) volatile { + *((volatile typename Base::DataType *)(&(Base::V))) = op2.V; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=(const ap_int_base<_AP_W, _AP_S>& op2) volatile { + *((volatile typename Base::DataType *)(&(Base::V))) = op2.V; + } + + template + inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=( + const volatile ap_int_base<_AP_W2, _AP_S2>& op2) { + Base::V = op2.V; + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) { + Base::V = op2.V; + return *this; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(const volatile ap_int_base<_AP_W, _AP_S>& op2) { + Base::V = op2.V; + return *this; + } + + + ap_int_base& operator=(const ap_int_base<_AP_W, _AP_S>& op2) = default; +# 481 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(bool op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(char op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(signed char op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(unsigned char op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(short op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(unsigned short op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(int op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(unsigned int op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(long op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& 
operator=(unsigned long op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(ap_slong op) { Base::V = op; return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(ap_ulong op) { Base::V = op; return *this; } + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& op2) { + Base::V = (bool)op2; + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + Base::V = (ap_int_base<_AP_W2, false>(op2)).V; + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op2) { + Base::V = op2.get().V; + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = op.to_ap_int_base().V; + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = (bool)op; + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = ((const ap_int_base<_AP_W2, false>)(op)).V; + return *this; + } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator RetType() const { return (RetType)(Base::V); } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool to_bool() const { return (bool)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) char to_char() const { return 
(char)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) signed char to_schar() const { return (signed char)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned char to_uchar() const { return (unsigned char)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) short to_short() const { return (short)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned short to_ushort() const { return (unsigned short)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) int to_int() const { return (int)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned to_uint() const { return (unsigned)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) long to_long() const { return (long)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned long to_ulong() const { return (unsigned long)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_slong to_int64() const { return (ap_slong)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ulong to_uint64() const { return (ap_ulong)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) half to_half() const { return (float)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) float to_float() const { return (float)(Base::V); } + inline __attribute__((always_inline)) __attribute__((nodebug)) double to_double() const { return (double)(Base::V); } +# 586 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) int length() const volatile { return _AP_W; } + inline __attribute__((always_inline)) __attribute__((nodebug)) int length() const { return _AP_W; } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) 
bool iszero() const { return Base::V == 0; } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool is_zero() const { return Base::V == 0; } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool sign() const { + if (_AP_S && + ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; })) + return true; + else + return false; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void clear(int i) { + ; + Base::V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(0) __Repl2__ = !!0; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), i, i); __Result__; }); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void invert(int i) { + ; + bool val = ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), i); __Result__; }); + if (val) + Base::V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(0) __Repl2__ = !!0; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), i, i); __Result__; }); + else + Base::V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(1) __Repl2__ = !!1; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), i, i); __Result__; }); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool test(int i) const { + ; + return ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), i); __Result__; }); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& get() { return *this; } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void set(int i) { + ; + Base::V = ({ typename _ap_type::remove_const::type __Result__ = 0; 
typeof(Base::V) __Val2__ = Base::V; typeof(1) __Repl2__ = !!1; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), i, i); __Result__; }); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void set(int i, bool v) { + ; + Base::V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(v) __Repl2__ = !!v; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), i, i); __Result__; }); + } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& lrotate(int n) { + ; + + + typeof(Base::V) l_p = Base::V << n; + typeof(Base::V) r_p = Base::V >> (_AP_W - n); + Base::V = l_p | r_p; + + + + return *this; + } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& rrotate(int n) { + ; + + + typeof(Base::V) l_p = Base::V << (_AP_W - n); + typeof(Base::V) r_p = Base::V >> n; + Base::V = l_p | r_p; + + + + return *this; + } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& reverse() { + Base::V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), _AP_W - 1, 0); __Result__; }); + return *this; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void set_bit(int i, bool v) { + Base::V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(v) __Repl2__ = !!v; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), i, i); __Result__; }); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool get_bit(int i) const { + return (bool)({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), i); __Result__; }); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void b_not() { Base::V = 
~Base::V; } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) void checkOverflowBaseC(T val) { +# 757 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + (void)val; + + return; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool checkOverflowCsim(int _ap_w2, bool _ap_s2, + bool print = true) const { +# 807 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + (void)_ap_w2; + (void)_ap_s2; + (void)print; + + return 0; + } +# 841 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator *=(const ap_int_base<_AP_W2, _AP_S2>& op2) { Base::V *= op2.V; return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator +=(const ap_int_base<_AP_W2, _AP_S2>& op2) { Base::V += op2.V; return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator -=(const ap_int_base<_AP_W2, _AP_S2>& op2) { Base::V -= op2.V; return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator /=(const ap_int_base<_AP_W2, _AP_S2>& op2) { Base::V /= op2.V; return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator %=(const ap_int_base<_AP_W2, _AP_S2>& op2) { Base::V %= op2.V; return *this; } +# 859 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator &=(const ap_int_base<_AP_W2, _AP_S2>& op2) { (static_cast(0)); Base::V &= op2.V; return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator |=(const ap_int_base<_AP_W2, _AP_S2>& op2) { (static_cast(0)); 
Base::V |= op2.V; return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator ^=(const ap_int_base<_AP_W2, _AP_S2>& op2) { (static_cast(0)); Base::V ^= op2.V; return *this; } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator++() { + operator+=((ap_int_base<1, false>)1); + return *this; + } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator--() { + operator-=((ap_int_base<1, false>)1); + return *this; + } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) const typename RType<_AP_W,_AP_S>::arg1 operator++(int) { + ap_int_base t = *this; + operator+=((ap_int_base<1, false>)1); + return t; + } + inline __attribute__((always_inline)) __attribute__((nodebug)) const typename RType<_AP_W,_AP_S>::arg1 operator--(int) { + ap_int_base t = *this; + operator-=((ap_int_base<1, false>)1); + return t; + } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W,_AP_S>::arg1 operator+() const { return *this; } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<1, false>::minus operator-() const { + return ap_int_base<1, false>(0) - *this; + } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator!() const { return Base::V == 0; } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W,_AP_S>::arg1 operator~() const { + ap_int_base<_AP_W, _AP_S> r(0); + r.V = ~Base::V; + return r; + } + + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, true>& op2) const { + bool isNeg = ({ typeof(op2.V) __Val2__ = op2.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W2 - 1); __Result__; }); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator>>(sh); + } 
else + return operator<<(sh); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, false>& op2) const { + ap_int_base r(0); + r.V = Base::V << op2.to_uint(); + return r; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, true>& op2) const { + bool isNeg = ({ typeof(op2.V) __Val2__ = op2.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W2 - 1); __Result__; }); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator<<(sh); + } + return operator>>(sh); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, false>& op2) const { + ap_int_base r(0); + r.V = Base::V >> op2.to_uint(); + return r; + } +# 970 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator<<=(const ap_int_base<_AP_W2, true>& op2) { + bool isNeg = ({ typeof(op2.V) __Val2__ = op2.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W2 - 1); __Result__; }); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator>>=(sh); + } else + return operator<<=(sh); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator<<=(const ap_int_base<_AP_W2, false>& op2) { + Base::V <<= op2.to_uint(); + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator>>=(const ap_int_base<_AP_W2, true>& op2) { + bool isNeg = ({ typeof(op2.V) __Val2__ = op2.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W2 - 1); __Result__; }); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; 
+ return operator<<=(sh); + } + return operator>>=(sh); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base& operator>>=(const ap_int_base<_AP_W2, false>& op2) { + Base::V >>= op2.to_uint(); + return *this; + } +# 1019 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator==(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V == op2.V; + } + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator!=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return !(Base::V == op2.V); + } + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator<(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V < op2.V; + } + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator>=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V >= op2.V; + } + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator>(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V > op2.V; + } + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator<=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V <= op2.V; + } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + (static_cast(0)); + (static_cast(0)); + return ap_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + (static_cast(0)); + (static_cast(0)); + return ap_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const 
ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S> range() { + return this->range(_AP_W - 1, 0); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S> range() const { + return this->range(_AP_W - 1, 0); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return this->range(Hi, Lo); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return this->range(Hi, Lo); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } +# 1128 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W, _AP_S> operator[](int index) { + ; + ; + ap_bit_ref<_AP_W, _AP_S> bvh(this, index); + return bvh; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) 
ap_bit_ref<_AP_W, _AP_S> operator[]( + const ap_int_base<_AP_W2, _AP_S2>& index) { + ; + ; + ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); + return bvh; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator[](int index) const { + ; + ; + ap_bit_ref<_AP_W, _AP_S> br(this, index); + return br.to_bool(); + } + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator[](const ap_int_base<_AP_W2, _AP_S2>& index) const { + ; + ap_bit_ref<_AP_W, _AP_S> br(this, index.to_int()); + return br.to_bool(); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W, _AP_S> bit(int index) { + ; + ; + ap_bit_ref<_AP_W, _AP_S> bvh(this, index); + return bvh; + } + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W, _AP_S> bit( + const ap_int_base<_AP_W2, _AP_S2>& index) { + ; + ; + ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); + return bvh; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool bit(int index) const { + ; + ; + ap_bit_ref<_AP_W, _AP_S> br(this, index); + return br.to_bool(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool bit(const ap_int_base<_AP_W2, _AP_S2>& index) const { + return bit(index.to_int()); + } +# 1195 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) int countLeadingZeros() const { + + if (_AP_W <= 32) { + ap_int_base<32, false> t(-1UL), x; + x.V = ({ typename _ap_type::remove_constV)>::type __Result__ = 0; typeof(this->V) __Val2__ = this->V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), _AP_W - 1, 0); __Result__; }); + t.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(t.V) __Val2__ = t.V; typeof(x.V) __Repl2__ = x.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), 
0, _AP_W - 1); __Result__; }); + return __builtin_ctz(t.V); + } else if (_AP_W <= 64) { + ap_int_base<64, false> t(-1ULL); + ap_int_base<64, false> x; + x.V = ({ typename _ap_type::remove_constV)>::type __Result__ = 0; typeof(this->V) __Val2__ = this->V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), _AP_W - 1, 0); __Result__; }); + t.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(t.V) __Val2__ = t.V; typeof(x.V) __Repl2__ = x.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), 0, _AP_W - 1); __Result__; }); + return __builtin_ctzll(t.V); + } else { + enum { __N = (_AP_W + 63) / 64 }; + int NZeros = 0; + int i = 0; + bool hitNonZero = false; + VITIS_LOOP_1213_1: for (i = 0; i < __N - 1; ++i) { + ap_int_base<64, false> t; + t.V = ({ typename _ap_type::remove_constV)>::type __Result__ = 0; typeof(this->V) __Val2__ = this->V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), _AP_W - i * 64 - 64, _AP_W - i * 64 - 1); __Result__; }); + NZeros += hitNonZero ? 
0 : __builtin_clzll(t.V); + hitNonZero |= (t.V != 0); + } + if (!hitNonZero) { + ap_int_base<64, false> t(-1ULL); + enum { REST = (_AP_W - 1) % 64 }; + ap_int_base<64, false> x; + x.V = ({ typename _ap_type::remove_constV)>::type __Result__ = 0; typeof(this->V) __Val2__ = this->V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, REST); __Result__; }); + t.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(t.V) __Val2__ = t.V; typeof(x.V) __Repl2__ = x.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), 63 - REST, 63); __Result__; }); + NZeros += __builtin_clzll(t.V); + } + return NZeros; + } + + + + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + concat(const ap_int_base<_AP_W2, _AP_S2>& a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + concat(ap_int_base<_AP_W2, _AP_S2>& a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + template + inline __attribute__((always_inline)) + __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) + __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >(*this, a2); + } + + template + inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), a2); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, a2); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + 
ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, + a2); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) const { + return ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + const_cast&>(*this), + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, + a2); + } + + template + inline __attribute__((always_inline)) + __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) const { + return ap_concat_ref< + _AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + const_cast&>(*this), + const_cast&>( + 
a2)); + } + + template + inline __attribute__((always_inline)) + __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + _AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<((_AP_W2 + _AP_W3) > (_AP_W) ? (_AP_W2 + _AP_W3) : (_AP_W)), _AP_S> operator&( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this & a2.get(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<((_AP_W2 + _AP_W3) > (_AP_W) ? (_AP_W2 + _AP_W3) : (_AP_W)), _AP_S> operator|( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this | a2.get(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<((_AP_W2 + _AP_W3) > (_AP_W) ? 
(_AP_W2 + _AP_W3) : (_AP_W)), _AP_S> operator^( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this ^ a2.get(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) void set(const ap_int_base<_AP_W3, false>& val) { + Base::V = val.V; + } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool and_reduce() const { return ({ typeof(Base::V) __what2__ = Base::V; __builtin_bit_and_reduce((void*)(&__what2__)); }); } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool nand_reduce() const { return ({ typeof(Base::V) __what2__ = Base::V; __builtin_bit_nand_reduce((void*)(&__what2__)); }); } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool or_reduce() const { return ({ typeof(Base::V) __what2__ = Base::V; __builtin_bit_or_reduce((void*)(&__what2__)); }); } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool nor_reduce() const { return !(({ typeof(Base::V) __what2__ = Base::V; __builtin_bit_or_reduce((void*)(&__what2__)); })); } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool xor_reduce() const { return ({ typeof(Base::V) __what2__ = Base::V; __builtin_bit_xor_reduce((void*)(&__what2__)); }); } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool xnor_reduce() const { + return !(({ typeof(Base::V) __what2__ = Base::V; __builtin_bit_xor_reduce((void*)(&__what2__)); })); + } +# 1435 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) char* to_string(signed char rd = 2, bool sign = _AP_S) const { + (void)(rd); + (void)(sign); + return 0; + } + +}; +# 1496 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::mult operator 
*(const ap_int_base<_AP_W, _AP_S>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::mult_base lhs(op); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::mult_base rhs(op2); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::mult_base ret(0); ret.V = lhs.V * rhs.V; return ret; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::plus_base lhs(op); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::plus_base rhs(op2); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::plus_base ret(0); ret.V = lhs.V + rhs.V; return ret; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::minus_base lhs(op); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::minus_base rhs(op2); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::minus_base ret(0); ret.V = lhs.V - rhs.V; return ret; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::logic_base lhs(op); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::logic_base rhs(op2); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::logic_base ret(0); ret.V = lhs.V & rhs.V; return ret; 
} +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::logic_base lhs(op); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::logic_base rhs(op2); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::logic_base ret(0); ret.V = lhs.V | rhs.V; return ret; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::logic_base lhs(op); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::logic_base rhs(op2); typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::logic_base ret(0); ret.V = lhs.V ^ rhs.V; return ret; } +# 1515 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::div_base ret(0); ret.V = op.V / op2.V; return ret; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { typename ap_int_base<_AP_W, _AP_S>::template RType< _AP_W2, _AP_S2>::mod_base ret(0); ret.V = op.V % op2.V; return ret; } +# 1543 
"/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) PTR_TYPE* operator +(PTR_TYPE* i_op, const ap_int_base<_AP_W, _AP_S>& op) { ap_slong op2 = op.to_int64(); return i_op + op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) PTR_TYPE* operator +(const ap_int_base<_AP_W, _AP_S>& op, PTR_TYPE* i_op) { ap_slong op2 = op.to_int64(); return op2 + i_op; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) PTR_TYPE* operator -(PTR_TYPE* i_op, const ap_int_base<_AP_W, _AP_S>& op) { ap_slong op2 = op.to_int64(); return i_op - op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) PTR_TYPE* operator -(const ap_int_base<_AP_W, _AP_S>& op, PTR_TYPE* i_op) { ap_slong op2 = op.to_int64(); return op2 - i_op; } +# 1572 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) float operator *(float i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op * op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) float operator *(const ap_int_base<_AP_W, _AP_S>& op, float i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 * i_op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) float operator /(float i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op / op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) float operator /(const ap_int_base<_AP_W, _AP_S>& op, float i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 / i_op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) float operator +(float i_op, const ap_int_base<_AP_W, _AP_S>& op) { 
typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op + op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) float operator +(const ap_int_base<_AP_W, _AP_S>& op, float i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 + i_op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) float operator -(float i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op - op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) float operator -(const ap_int_base<_AP_W, _AP_S>& op, float i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 - i_op; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) double operator *(double i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op * op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) double operator *(const ap_int_base<_AP_W, _AP_S>& op, double i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 * i_op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) double operator /(double i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op / op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) double operator /(const ap_int_base<_AP_W, _AP_S>& op, double i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 / i_op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) double operator +(double i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op + op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) double operator +(const ap_int_base<_AP_W, _AP_S>& op, double i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = 
op; return op2 + i_op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) double operator -(double i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op - op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) double operator -(const ap_int_base<_AP_W, _AP_S>& op, double i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 - i_op; } +# 1597 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) half operator *(half i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op * op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) half operator *(const ap_int_base<_AP_W, _AP_S>& op, half i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 * i_op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) half operator /(half i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op / op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) half operator /(const ap_int_base<_AP_W, _AP_S>& op, half i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 / i_op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) half operator +(half i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op + op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) half operator +(const ap_int_base<_AP_W, _AP_S>& op, half i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 + i_op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) half operator -(half i_op, const ap_int_base<_AP_W, _AP_S>& op) { typename 
ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return i_op - op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) half operator -(const ap_int_base<_AP_W, _AP_S>& op, half i_op) { typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; return op2 - i_op; } +# 1629 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::mult operator *(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, bool i_op) { return op * ap_int_base<1, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::plus operator +(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, bool i_op) { return op + ap_int_base<1, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::minus operator -(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, bool i_op) { return op - ap_int_base<1, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::div operator /(bool 
i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, bool i_op) { return op / ap_int_base<1, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::mod operator %(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, bool i_op) { return op % ap_int_base<1, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::logic operator &(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, bool i_op) { return op & ap_int_base<1, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::logic operator |(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, bool i_op) { return op | ap_int_base<1, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::logic operator ^(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { 
return ap_int_base<1, false>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<1, false>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, bool i_op) { return op ^ ap_int_base<1, false>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::mult operator *(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, char i_op) { return op * ap_int_base<8, CHAR_IS_SIGNED>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::plus operator +(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, char i_op) { return op + ap_int_base<8, CHAR_IS_SIGNED>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::minus operator -(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, char i_op) { return op - ap_int_base<8, CHAR_IS_SIGNED>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, 
CHAR_IS_SIGNED>::div operator /(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, char i_op) { return op / ap_int_base<8, CHAR_IS_SIGNED>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::mod operator %(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, char i_op) { return op % ap_int_base<8, CHAR_IS_SIGNED>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::logic operator &(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, char i_op) { return op & ap_int_base<8, CHAR_IS_SIGNED>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::logic operator |(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, char i_op) { return op | ap_int_base<8, CHAR_IS_SIGNED>(i_op); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::logic operator ^(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, CHAR_IS_SIGNED>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, char i_op) { return op ^ ap_int_base<8, CHAR_IS_SIGNED>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::mult operator *(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, signed char i_op) { return op * ap_int_base<8, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::plus operator +(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, signed char i_op) { return op + ap_int_base<8, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::minus operator -(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, signed char i_op) { return op - ap_int_base<8, 
true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::div operator /(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, signed char i_op) { return op / ap_int_base<8, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::mod operator %(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, signed char i_op) { return op % ap_int_base<8, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::logic operator &(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, signed char i_op) { return op & ap_int_base<8, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::logic operator |(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, signed char i_op) { return op | ap_int_base<8, 
true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::logic operator ^(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, true>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, signed char i_op) { return op ^ ap_int_base<8, true>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::mult operator *(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, unsigned char i_op) { return op * ap_int_base<8, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::plus operator +(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, unsigned char i_op) { return op + ap_int_base<8, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::minus operator -(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, unsigned char i_op) { return op 
- ap_int_base<8, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::div operator /(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, unsigned char i_op) { return op / ap_int_base<8, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::mod operator %(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, unsigned char i_op) { return op % ap_int_base<8, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::logic operator &(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, unsigned char i_op) { return op & ap_int_base<8, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::logic operator |(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, unsigned 
char i_op) { return op | ap_int_base<8, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::logic operator ^(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<8, false>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, unsigned char i_op) { return op ^ ap_int_base<8, false>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::mult operator *(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, short i_op) { return op * ap_int_base<_AP_SIZE_short, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::plus operator +(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, short i_op) { return op + ap_int_base<_AP_SIZE_short, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::minus operator -(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename 
ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, short i_op) { return op - ap_int_base<_AP_SIZE_short, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::div operator /(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, short i_op) { return op / ap_int_base<_AP_SIZE_short, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::mod operator %(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, short i_op) { return op % ap_int_base<_AP_SIZE_short, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::logic operator &(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, short i_op) { return op & ap_int_base<_AP_SIZE_short, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::logic operator |(short i_op, const ap_int_base<_AP_W, 
_AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, short i_op) { return op | ap_int_base<_AP_SIZE_short, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::logic operator ^(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, true>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, short i_op) { return op ^ ap_int_base<_AP_SIZE_short, true>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::mult operator *(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, unsigned short i_op) { return op * ap_int_base<_AP_SIZE_short, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::plus operator +(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, unsigned short i_op) { return op + ap_int_base<_AP_SIZE_short, false>(i_op); } 
template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::minus operator -(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, unsigned short i_op) { return op - ap_int_base<_AP_SIZE_short, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::div operator /(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, unsigned short i_op) { return op / ap_int_base<_AP_SIZE_short, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::mod operator %(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, unsigned short i_op) { return op % ap_int_base<_AP_SIZE_short, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::logic operator &(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) &(op); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, unsigned short i_op) { return op & ap_int_base<_AP_SIZE_short, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::logic operator |(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, unsigned short i_op) { return op | ap_int_base<_AP_SIZE_short, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::logic operator ^(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_short, false>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, unsigned short i_op) { return op ^ ap_int_base<_AP_SIZE_short, false>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::mult operator *(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, int i_op) { return op * ap_int_base<_AP_SIZE_int, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename 
ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::plus operator +(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, int i_op) { return op + ap_int_base<_AP_SIZE_int, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::minus operator -(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, int i_op) { return op - ap_int_base<_AP_SIZE_int, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::div operator /(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, int i_op) { return op / ap_int_base<_AP_SIZE_int, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::mod operator %(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, int i_op) { return op % 
ap_int_base<_AP_SIZE_int, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::logic operator &(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, int i_op) { return op & ap_int_base<_AP_SIZE_int, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::logic operator |(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, int i_op) { return op | ap_int_base<_AP_SIZE_int, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::logic operator ^(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, true>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, int i_op) { return op ^ ap_int_base<_AP_SIZE_int, true>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::mult operator *(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, 
_AP_S>::template RType<_AP_SIZE_int, false>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, unsigned int i_op) { return op * ap_int_base<_AP_SIZE_int, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::plus operator +(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, unsigned int i_op) { return op + ap_int_base<_AP_SIZE_int, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::minus operator -(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, unsigned int i_op) { return op - ap_int_base<_AP_SIZE_int, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::div operator /(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, unsigned int i_op) { return op / ap_int_base<_AP_SIZE_int, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::mod operator %(unsigned int i_op, const 
ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, unsigned int i_op) { return op % ap_int_base<_AP_SIZE_int, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::logic operator &(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, unsigned int i_op) { return op & ap_int_base<_AP_SIZE_int, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::logic operator |(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, unsigned int i_op) { return op | ap_int_base<_AP_SIZE_int, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::logic operator ^(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_int, false>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, unsigned int i_op) { return op ^ ap_int_base<_AP_SIZE_int, 
false>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::mult operator *(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, long i_op) { return op * ap_int_base<_AP_SIZE_long, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::plus operator +(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, long i_op) { return op + ap_int_base<_AP_SIZE_long, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::minus operator -(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, long i_op) { return op - ap_int_base<_AP_SIZE_long, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::div operator /(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template 
RType<_AP_SIZE_long, true>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, long i_op) { return op / ap_int_base<_AP_SIZE_long, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::mod operator %(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, long i_op) { return op % ap_int_base<_AP_SIZE_long, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::logic operator &(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, long i_op) { return op & ap_int_base<_AP_SIZE_long, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::logic operator |(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, long i_op) { return op | ap_int_base<_AP_SIZE_long, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::logic operator ^(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) 
^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, true>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, long i_op) { return op ^ ap_int_base<_AP_SIZE_long, true>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::mult operator *(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, unsigned long i_op) { return op * ap_int_base<_AP_SIZE_long, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::plus operator +(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, unsigned long i_op) { return op + ap_int_base<_AP_SIZE_long, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::minus operator -(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, unsigned long i_op) { return op - ap_int_base<_AP_SIZE_long, false>(i_op); } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::div operator /(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, unsigned long i_op) { return op / ap_int_base<_AP_SIZE_long, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::mod operator %(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, unsigned long i_op) { return op % ap_int_base<_AP_SIZE_long, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::logic operator &(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, unsigned long i_op) { return op & ap_int_base<_AP_SIZE_long, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::logic operator |(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, 
_AP_S>::template RType<_AP_SIZE_long, false>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, unsigned long i_op) { return op | ap_int_base<_AP_SIZE_long, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::logic operator ^(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_long, false>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, unsigned long i_op) { return op ^ ap_int_base<_AP_SIZE_long, false>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::mult operator *(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, ap_slong i_op) { return op * ap_int_base<_AP_SIZE_ap_slong, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::plus operator +(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, ap_slong i_op) { return op + ap_int_base<_AP_SIZE_ap_slong, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::minus 
operator -(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, ap_slong i_op) { return op - ap_int_base<_AP_SIZE_ap_slong, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::div operator /(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, ap_slong i_op) { return op / ap_int_base<_AP_SIZE_ap_slong, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::mod operator %(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, ap_slong i_op) { return op % ap_int_base<_AP_SIZE_ap_slong, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::logic operator &(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, ap_slong 
i_op) { return op & ap_int_base<_AP_SIZE_ap_slong, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::logic operator |(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, ap_slong i_op) { return op | ap_int_base<_AP_SIZE_ap_slong, true>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::logic operator ^(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, true>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, ap_slong i_op) { return op ^ ap_int_base<_AP_SIZE_ap_slong, true>(i_op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::mult operator *(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::mult operator *(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong i_op) { return op * ap_int_base<_AP_SIZE_ap_slong, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::plus operator +(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, 
false>(i_op) +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::plus operator +(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong i_op) { return op + ap_int_base<_AP_SIZE_ap_slong, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::minus operator -(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::minus operator -(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong i_op) { return op - ap_int_base<_AP_SIZE_ap_slong, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::div operator /(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::div operator /(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong i_op) { return op / ap_int_base<_AP_SIZE_ap_slong, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::mod operator %(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) %(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::mod operator %(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong i_op) { return op % ap_int_base<_AP_SIZE_ap_slong, false>(i_op); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::logic operator &(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::logic operator &(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong i_op) { return op & ap_int_base<_AP_SIZE_ap_slong, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::logic operator |(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::logic operator |(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong i_op) { return op | ap_int_base<_AP_SIZE_ap_slong, false>(i_op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::logic operator ^(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_SIZE_ap_slong, false>::logic operator ^(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong i_op) { return op ^ ap_int_base<_AP_SIZE_ap_slong, false>(i_op); } +# 1668 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, char op2) { 
ap_int_base<_AP_W, _AP_S> r(0); if (CHAR_IS_SIGNED) r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); else r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, char op2) { ap_int_base<_AP_W, _AP_S> r(0); if (CHAR_IS_SIGNED) r.V = op2 >= 0 ? (op.V >> op2) : (op.V << (-op2)); else r.V = op.V >> op2; return r; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, signed char op2) { ap_int_base<_AP_W, _AP_S> r(0); if (true) r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); else r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, signed char op2) { ap_int_base<_AP_W, _AP_S> r(0); if (true) r.V = op2 >= 0 ? (op.V >> op2) : (op.V << (-op2)); else r.V = op.V >> op2; return r; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, short op2) { ap_int_base<_AP_W, _AP_S> r(0); if (true) r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); else r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, short op2) { ap_int_base<_AP_W, _AP_S> r(0); if (true) r.V = op2 >= 0 ? 
(op.V >> op2) : (op.V << (-op2)); else r.V = op.V >> op2; return r; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, int op2) { ap_int_base<_AP_W, _AP_S> r(0); if (true) r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); else r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, int op2) { ap_int_base<_AP_W, _AP_S> r(0); if (true) r.V = op2 >= 0 ? (op.V >> op2) : (op.V << (-op2)); else r.V = op.V >> op2; return r; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, long op2) { ap_int_base<_AP_W, _AP_S> r(0); if (true) r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); else r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, long op2) { ap_int_base<_AP_W, _AP_S> r(0); if (true) r.V = op2 >= 0 ? (op.V >> op2) : (op.V << (-op2)); else r.V = op.V >> op2; return r; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { ap_int_base<_AP_W, _AP_S> r(0); if (true) r.V = op2 >= 0 ? 
(op.V << op2) : (op.V >> (-op2)); else r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { ap_int_base<_AP_W, _AP_S> r(0); if (true) r.V = op2 >= 0 ? (op.V >> op2) : (op.V << (-op2)); else r.V = op.V >> op2; return r; } +# 1692 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, bool op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, bool op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V >> op2; return r; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V >> op2; return r; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template 
RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V >> op2; return r; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V >> op2; return r; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V >> op2; return r; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( const ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V << op2; return r; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( const ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { ap_int_base<_AP_W, _AP_S> r(0); r.V = op.V >> op2; return r; } +# 1723 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, 
_AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op += ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op -= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op *= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op /= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op %= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op &= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op |= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op ^= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op >>= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op <<= ap_int_base<1, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& 
operator +=( ap_int_base<_AP_W, _AP_S>& op, char op2) { return op += ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, char op2) { return op -= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, char op2) { return op *= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, char op2) { return op /= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, char op2) { return op %= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, char op2) { return op &= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, char op2) { return op |= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, char op2) { return op ^= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, char op2) { return op >>= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, char op2) { return op <<= ap_int_base<8, CHAR_IS_SIGNED>(op2); } +template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op += ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op -= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op *= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op /= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op %= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op &= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op |= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op ^= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op >>= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op <<= 
ap_int_base<8, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op += ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op -= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op *= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op /= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op %= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op &= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op |= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op ^= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op >>= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& 
operator <<=( ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op <<= ap_int_base<8, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, short op2) { return op += ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, short op2) { return op -= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, short op2) { return op *= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, short op2) { return op /= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, short op2) { return op %= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, short op2) { return op &= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, short op2) { return op |= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, short op2) { return op ^= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, short op2) { return op >>= ap_int_base<_AP_SIZE_short, 
true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, short op2) { return op <<= ap_int_base<_AP_SIZE_short, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op += ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op -= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op *= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op /= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op %= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op &= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op |= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op ^= ap_int_base<_AP_SIZE_short, false>(op2); } 
template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op >>= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op <<= ap_int_base<_AP_SIZE_short, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, int op2) { return op += ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, int op2) { return op -= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, int op2) { return op *= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, int op2) { return op /= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, int op2) { return op %= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, int op2) { return op &= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, int op2) { return op |= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& 
operator ^=( ap_int_base<_AP_W, _AP_S>& op, int op2) { return op ^= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, int op2) { return op >>= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, int op2) { return op <<= ap_int_base<_AP_SIZE_int, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op += ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op -= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op *= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op /= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op %= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op &= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return 
op |= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op ^= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op >>= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op <<= ap_int_base<_AP_SIZE_int, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, long op2) { return op += ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, long op2) { return op -= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, long op2) { return op *= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, long op2) { return op /= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, long op2) { return op %= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, long op2) { return op &= ap_int_base<_AP_SIZE_long, true>(op2); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, long op2) { return op |= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, long op2) { return op ^= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, long op2) { return op >>= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, long op2) { return op <<= ap_int_base<_AP_SIZE_long, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op += ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op -= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op *= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op /= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op %= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) 
ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op &= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op |= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op ^= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op >>= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op <<= ap_int_base<_AP_SIZE_long, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op += ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op -= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op *= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op /= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& 
operator %=( ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op %= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op &= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op |= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op ^= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op >>= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op <<= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator +=( ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op += ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator -=( ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op -= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator *=( ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op *= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator /=( ap_int_base<_AP_W, 
_AP_S>& op, ap_ulong op2) { return op /= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator %=( ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op %= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator &=( ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op &= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator |=( ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op |= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator ^=( ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op ^= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator >>=( ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op >>= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, _AP_S>& operator <<=( ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op <<= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } +# 1756 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) half& operator +=( half& op2, ap_int_base<_AP_W, _AP_S>& op) { half op_rt = op.to_half(); return op2 += op_rt; } template inline __attribute__((always_inline)) __attribute__((nodebug)) half& operator -=( half& op2, ap_int_base<_AP_W, _AP_S>& op) { half op_rt = op.to_half(); return op2 -= op_rt; } template inline __attribute__((always_inline)) __attribute__((nodebug)) half& operator *=( half& op2, ap_int_base<_AP_W, 
_AP_S>& op) { half op_rt = op.to_half(); return op2 *= op_rt; } template inline __attribute__((always_inline)) __attribute__((nodebug)) half& operator /=( half& op2, ap_int_base<_AP_W, _AP_S>& op) { half op_rt = op.to_half(); return op2 /= op_rt; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) float& operator +=( float& op2, ap_int_base<_AP_W, _AP_S>& op) { float op_rt = op.to_float(); return op2 += op_rt; } template inline __attribute__((always_inline)) __attribute__((nodebug)) float& operator -=( float& op2, ap_int_base<_AP_W, _AP_S>& op) { float op_rt = op.to_float(); return op2 -= op_rt; } template inline __attribute__((always_inline)) __attribute__((nodebug)) float& operator *=( float& op2, ap_int_base<_AP_W, _AP_S>& op) { float op_rt = op.to_float(); return op2 *= op_rt; } template inline __attribute__((always_inline)) __attribute__((nodebug)) float& operator /=( float& op2, ap_int_base<_AP_W, _AP_S>& op) { float op_rt = op.to_float(); return op2 /= op_rt; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) double& operator +=( double& op2, ap_int_base<_AP_W, _AP_S>& op) { double op_rt = op.to_double(); return op2 += op_rt; } template inline __attribute__((always_inline)) __attribute__((nodebug)) double& operator -=( double& op2, ap_int_base<_AP_W, _AP_S>& op) { double op_rt = op.to_double(); return op2 -= op_rt; } template inline __attribute__((always_inline)) __attribute__((nodebug)) double& operator *=( double& op2, ap_int_base<_AP_W, _AP_S>& op) { double op_rt = op.to_double(); return op2 *= op_rt; } template inline __attribute__((always_inline)) __attribute__((nodebug)) double& operator /=( double& op2, ap_int_base<_AP_W, _AP_S>& op) { double op_rt = op.to_double(); return op2 /= op_rt; } +# 1784 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(bool i_op, const 
ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op > ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op < ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op >= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op <= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op == ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(bool i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<1, false>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const 
ap_int_base<_AP_W, _AP_S>& op, bool op2) { return op != ap_int_base<1, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, char op2) { return op > ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, char op2) { return op < ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, char op2) { return op >= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, char op2) { return op <= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, char op2) { return op == ap_int_base<8, CHAR_IS_SIGNED>(op2); } template 
inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, CHAR_IS_SIGNED>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, char op2) { return op != ap_int_base<8, CHAR_IS_SIGNED>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op > ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op < ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op >= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op <= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(signed char i_op, const 
ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op == ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(signed char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, true>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, signed char op2) { return op != ap_int_base<8, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op > ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op < ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op >= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) <= op; } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op <= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op == ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(unsigned char i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<8, false>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, unsigned char op2) { return op != ap_int_base<8, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, short op2) { return op > ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, short op2) { return op < ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator 
>=(const ap_int_base<_AP_W, _AP_S>& op, short op2) { return op >= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, short op2) { return op <= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, short op2) { return op == ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, true>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, short op2) { return op != ap_int_base<_AP_SIZE_short, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op > ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const 
ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op < ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op >= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op <= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op == ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(unsigned short i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_short, false>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, unsigned short op2) { return op != ap_int_base<_AP_SIZE_short, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) > op; } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, int op2) { return op > ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, int op2) { return op < ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, int op2) { return op >= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, int op2) { return op <= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, int op2) { return op == ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, true>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, int op2) { 
return op != ap_int_base<_AP_SIZE_int, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op > ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op < ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op >= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op <= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) 
{ return op == ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(unsigned int i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_int, false>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, unsigned int op2) { return op != ap_int_base<_AP_SIZE_int, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, long op2) { return op > ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, long op2) { return op < ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, long op2) { return op >= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, long op2) { return op <= ap_int_base<_AP_SIZE_long, true>(op2); } 
template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, long op2) { return op == ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, true>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, long op2) { return op != ap_int_base<_AP_SIZE_long, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op > ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op < ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op >= ap_int_base<_AP_SIZE_long, false>(op2); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op <= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op == ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(unsigned long i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_long, false>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, unsigned long op2) { return op != ap_int_base<_AP_SIZE_long, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op > ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op < ap_int_base<_AP_SIZE_ap_slong, true>(op2); } 
template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op >= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op <= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op == ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(ap_slong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, true>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, ap_slong op2) { return op != ap_int_base<_AP_SIZE_ap_slong, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) > op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op > ap_int_base<_AP_SIZE_ap_slong, 
false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) < op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op < ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) >= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op >= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) <= op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op <= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) == op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op == ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(ap_ulong i_op, const ap_int_base<_AP_W, _AP_S>& op) { return ap_int_base<_AP_SIZE_ap_slong, false>(i_op) != op; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op, ap_ulong op2) { return op != 
ap_int_base<_AP_SIZE_ap_slong, false>(op2); } +# 1821 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op1, double op2) { return op1.to_double() > op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(double op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 > op2.to_double() ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W, _AP_S>& op1, float op2) { return op1.to_double() > op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(float op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 > op2.to_double() ; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op1, double op2) { return op1.to_double() < op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(double op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 < op2.to_double() ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W, _AP_S>& op1, float op2) { return op1.to_double() < op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(float op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 < op2.to_double() ; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, _AP_S>& op1, double op2) { return op1.to_double() >= op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(double op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 >= op2.to_double() ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W, 
_AP_S>& op1, float op2) { return op1.to_double() >= op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(float op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 >= op2.to_double() ; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op1, double op2) { return op1.to_double() <= op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(double op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 <= op2.to_double() ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W, _AP_S>& op1, float op2) { return op1.to_double() <= op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(float op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 <= op2.to_double() ; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op1, double op2) { return op1.to_double() == op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(double op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 == op2.to_double() ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W, _AP_S>& op1, float op2) { return op1.to_double() == op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(float op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 == op2.to_double() ; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op1, double op2) { return op1.to_double() != op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(double op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 != op2.to_double() ; } template 
inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W, _AP_S>& op1, float op2) { return op1.to_double() != op2 ; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(float op1, const ap_int_base<_AP_W, _AP_S>& op2) { return op1 != op2.to_double() ; } +# 1851 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::plus operator +(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1) + op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::plus operator +(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 + ap_int_base<_AP_W2, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::minus operator -(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1) - op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::minus operator -(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 - ap_int_base<_AP_W2, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::mult operator *(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1) * op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, 
_AP_S1>::template RType<_AP_W2, _AP_S2>::mult operator *(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 * ap_int_base<_AP_W2, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::div operator /(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1) / op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::div operator /(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 / ap_int_base<_AP_W2, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::mod operator %(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1) % op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::mod operator %(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 % ap_int_base<_AP_W2, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::logic operator &(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1) & op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::logic operator &(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 & ap_int_base<_AP_W2, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) 
typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::logic operator |(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1) | op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::logic operator |(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 | ap_int_base<_AP_W2, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::logic operator ^(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1) ^ op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::logic operator ^(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 ^ ap_int_base<_AP_W2, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::arg1 operator >>(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1) >> op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::arg1 operator >>(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 >> ap_int_base<_AP_W2, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::arg1 operator <<(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1) << op2; } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, _AP_S2>::arg1 operator <<(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 << ap_int_base<_AP_W2, false>(op2); } +# 1882 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator +=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 += ap_int_base<_AP_W2, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W1, _AP_S1>& operator +=( ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W1, false> tmp(op1); tmp += op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator -=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 -= ap_int_base<_AP_W2, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W1, _AP_S1>& operator -=( ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W1, false> tmp(op1); tmp -= op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator *=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 *= ap_int_base<_AP_W2, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W1, _AP_S1>& operator *=( ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W1, false> tmp(op1); tmp *= op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, 
_AP_S1>& operator /=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 /= ap_int_base<_AP_W2, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W1, _AP_S1>& operator /=( ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W1, false> tmp(op1); tmp /= op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator %=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 %= ap_int_base<_AP_W2, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W1, _AP_S1>& operator %=( ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W1, false> tmp(op1); tmp %= op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator >>=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 >>= ap_int_base<_AP_W2, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W1, _AP_S1>& operator >>=( ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W1, false> tmp(op1); tmp >>= op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator <<=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1 <<= ap_int_base<_AP_W2, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W1, _AP_S1>& operator <<=( ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W1, false> tmp(op1); tmp <<= op2; op1 = tmp; return op1; } +# 1914 
"/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator &=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W2, false> tmp(op2); op1.V &= tmp.V; return op1; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W1, _AP_S1>& operator &=( ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W1, false> tmp(op1); tmp.V &= op2.V; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator |=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W2, false> tmp(op2); op1.V |= tmp.V; return op1; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W1, _AP_S1>& operator |=( ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W1, false> tmp(op1); tmp.V |= op2.V; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator ^=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W2, false> tmp(op2); op1.V ^= tmp.V; return op1; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W1, _AP_S1>& operator ^=( ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<_AP_W1, false> tmp(op1); tmp.V ^= op2.V; op1 = tmp; return op1; } +# 1933 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, 
false>(op1).operator ==(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1.operator ==(op2.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1).operator !=(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1.operator !=(op2.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1).operator >(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1.operator >(op2.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1).operator >=(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1.operator >=(op2.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1).operator <(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool 
operator <(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1.operator <(op2.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<_AP_W1, false>(op1).operator <=(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_range_ref<_AP_W2, _AP_S2>& op2) { return op1.operator <=(op2.operator ap_int_base<_AP_W2, false>()); } +# 1960 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::plus operator +(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 + ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::plus operator +(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) + op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::minus operator -(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 - ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::minus operator -(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) - op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::mult operator 
*(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 * ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::mult operator *(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) * op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::div operator /(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 / ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::div operator /(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) / op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::mod operator %(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 % ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::mod operator %(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) % op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::logic operator &(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 & ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::logic operator &(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, 
_AP_S2>& op2) { return ap_int_base<1, false>(op1) & op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::logic operator |(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 | ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::logic operator |(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) | op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::logic operator ^(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 ^ ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::logic operator ^(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) ^ op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::arg1 operator >>(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 >> ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::arg1 operator >>(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) >> op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::arg1 operator <<(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 << ap_int_base<1, 
false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::arg1 operator <<(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) << op2; } +# 1991 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator +=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 += ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W1, _AP_S1>& operator +=( ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op1); tmp += op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator -=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 -= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W1, _AP_S1>& operator -=( ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op1); tmp -= op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator *=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 *= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W1, _AP_S1>& operator *=( ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op1); tmp *= op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator /=( 
ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 /= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W1, _AP_S1>& operator /=( ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op1); tmp /= op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator %=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 %= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W1, _AP_S1>& operator %=( ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op1); tmp %= op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator >>=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 >>= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W1, _AP_S1>& operator >>=( ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op1); tmp >>= op2; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator <<=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 <<= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W1, _AP_S1>& operator <<=( ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op1); tmp <<= op2; op1 = tmp; return op1; } +# 2023 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator &=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op2); op1.V &= tmp.V; return op1; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W1, _AP_S1>& operator &=( ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op1); tmp.V &= op2.V; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator |=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op2); op1.V |= tmp.V; return op1; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W1, _AP_S1>& operator |=( ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op1); tmp.V |= op2.V; op1 = tmp; return op1; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W1, _AP_S1>& operator ^=( ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op2); op1.V ^= tmp.V; return op1; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref<_AP_W1, _AP_S1>& operator ^=( ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { ap_int_base<1, false> tmp(op1); tmp.V ^= op2.V; op1 = tmp; return op1; } +# 2042 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 == ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, 
_AP_S2>& op2) { return ap_int_base<1, false>(op1) == op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 != ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) != op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 > ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) > op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 >= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) >= op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 < ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) < op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_int_base<_AP_W1, _AP_S1>& op1, const ap_bit_ref<_AP_W2, _AP_S2>& op2) { return op1 <= ap_int_base<1, false>(op2); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W1, _AP_S1>& op1, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op1) <= op2; } +# 2149 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_int_base<_AP_W3, _AP_S3>& op1, const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { return op1 == op2.get(); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, const ap_int_base<_AP_W3, _AP_S3>& op2) { return op1.get() == op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_int_base<_AP_W3, _AP_S3>& op1, const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { return op1 != op2.get(); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, const ap_int_base<_AP_W3, _AP_S3>& op2) { return op1.get() != op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_int_base<_AP_W3, _AP_S3>& op1, const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { return op1 > op2.get(); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, const ap_int_base<_AP_W3, _AP_S3>& op2) { return op1.get() > op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_int_base<_AP_W3, _AP_S3>& op1, const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { return op1 >= op2.get(); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, const ap_int_base<_AP_W3, 
_AP_S3>& op2) { return op1.get() >= op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_int_base<_AP_W3, _AP_S3>& op1, const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { return op1 < op2.get(); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, const ap_int_base<_AP_W3, _AP_S3>& op2) { return op1.get() < op2; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_int_base<_AP_W3, _AP_S3>& op1, const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { return op1 <= op2.get(); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, const ap_int_base<_AP_W3, _AP_S3>& op2) { return op1.get() <= op2; } +# 12 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" 2 +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" 1 +# 27 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" +template +struct ap_concat_ref { + enum { + _AP_WR = _AP_W1 + _AP_W2, + }; + + _AP_T1& mbv1; + _AP_T2& mbv2; + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& ref) + : mbv1(ref.mbv1), mbv2(ref.mbv2) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> vval(val); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W1, false> Part1; + Part1.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(vval.V) 
__Val2__ = vval.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), W_ref2, W_ref1 + W_ref2 - 1); __Result__; }); + mbv1.set(Part1); + ap_int_base<_AP_W2, false> Part2; + Part2.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(vval.V) __Val2__ = vval.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, W_ref2 - 1); __Result__; }); + mbv2.set(Part2); + return *this; + } +# 70 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(bool val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(char val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(signed char val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(unsigned char val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(short val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(unsigned short val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(int val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(unsigned int val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + 
inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(long val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(unsigned long val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(ap_slong val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(ap_ulong val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(half val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(float val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(double val) { ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); return operator=(tmpVal); } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=( + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=( + const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(const ap_bit_ref<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + 
template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=(const ap_range_ref<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=( + const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { + return operator=((const ap_int_base<_AP_W3, false>)(val)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=( + const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& + val) { + return operator=(val.to_ap_int_base()); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref& operator=( + const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator ap_int_base<_AP_WR, false>() const { return get(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator ap_ulong() const { return get().to_uint64(); } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_range_ref<_AP_W3, _AP_S3> > + operator,(const ap_range_ref<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_range_ref<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >(*this, a2); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(volatile 
ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(const ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(const volatile ap_int_base<_AP_W3, _AP_S3> &a2) { + + ap_int_base<_AP_W3, _AP_S3> op(a2); + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(op)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> > + operator,(const ap_bit_ref<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, + ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > + operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, + ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref< + _AP_WR, ap_concat_ref, _AP_W3, + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > + operator,( + const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) { + return ap_concat_ref< + _AP_WR, ap_concat_ref, _AP_W3, + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( + *this, + const_cast< + 
af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_WR, ap_concat_ref, 1, + af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > + operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> + &a2) { + return ap_concat_ref< + _AP_WR, ap_concat_ref, 1, + af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( + *this, + const_cast&>( + a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<((_AP_WR) > (_AP_W3) ? (_AP_WR) : (_AP_W3)), _AP_S3> operator&( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() & a2; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<((_AP_WR) > (_AP_W3) ? (_AP_WR) : (_AP_W3)), _AP_S3> operator|( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() | a2; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<((_AP_WR) > (_AP_W3) ? 
(_AP_WR) : (_AP_W3)), _AP_S3> operator^( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() ^ a2; + } +# 258 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_WR, false> get() const { + ap_int_base<_AP_WR, false> tmpVal(0); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W2, false> v2(mbv2); + ap_int_base<_AP_W1, false> v1(mbv1); + tmpVal.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(tmpVal.V) __Val2__ = tmpVal.V; typeof(v2.V) __Repl2__ = v2.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), 0, W_ref2 - 1); __Result__; }); + tmpVal.V = + ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(tmpVal.V) __Val2__ = tmpVal.V; typeof(v1.V) __Repl2__ = v1.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), W_ref2, W_ref1 + W_ref2 - 1); __Result__; }); + return tmpVal; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) void set(const ap_int_base<_AP_W3, false>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> vval(val); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W1, false> tmpVal1; + tmpVal1.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(vval.V) __Val2__ = vval.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), W_ref2, W_ref1 + W_ref2 - 1); __Result__; }); + mbv1.set(tmpVal1); + ap_int_base<_AP_W2, false> tmpVal2; + tmpVal2.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(vval.V) __Val2__ = vval.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, W_ref2 - 1); __Result__; }); + mbv2.set(tmpVal2); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int length() const { return mbv1.length() + mbv2.length(); } +}; + + + + +template +struct ap_range_ref 
{ + + + + typedef ap_int_base<_AP_W, _AP_S> ref_type; + ref_type& d_bv; + int l_index; + int h_index; + + public: + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref(const ap_range_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref(ref_type* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref(const ref_type* bv, int h, int l) + : d_bv(*const_cast(bv)), l_index(l), h_index(h) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator ap_int_base<_AP_W, false>() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return ret; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator ap_ulong() const { return to_uint64(); } +# 339 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(bool val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(char val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); 
return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(signed char val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(unsigned char val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(short val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(unsigned short val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(int val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), 
(void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(unsigned int val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(long val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(unsigned long val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(ap_slong val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(ap_ulong val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) 
__Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(half val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(float val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(double val) { ap_int_base<_AP_W, false> tmp(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(const char* val) { + const ap_int_base<_AP_W, false> tmp(val); + d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); + return *this; + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + ap_int_base<_AP_W, false> tmp(val); 
+ d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); + return *this; + } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(const ap_range_ref& val) { + return operator=((const ap_int_base<_AP_W, false>)val); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + val) { + return operator=(val.to_ap_int_base()); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& val) { + return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)(val)); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const 
ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_W, ap_range_ref, _AP_W, ap_int_base<_AP_W, _AP_S> > + operator,(ap_int_base<_AP_W, _AP_S>& a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W, + ap_int_base<_AP_W, _AP_S> >(*this, a2); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, 
const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref< + _AP_W, ap_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> a2) { + return ap_concat_ref< + _AP_W, ap_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_W, ap_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) { + return ap_concat_ref< + _AP_W, ap_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop == hop; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator==(op2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop < hop; + } + + template + 
inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop <= hop; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<=(op2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<(op2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S>& operator|=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V |= (op2.d_bv).V; + return *this; + }; + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S>& operator|=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V |= op2.V; + return *this; + }; + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S>& operator&=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V &= (op2.d_bv).V; + return *this; + }; + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S>& operator&=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V &= op2.V; + return *this; + }; + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S>& operator^=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V ^= (op2.d_bv).V; + return *this; + }; + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_range_ref<_AP_W, _AP_S>& operator^=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V ^= op2.V; + return *this; + }; + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> operator~() const { + ap_int_base<_AP_W, false> ret; + 
ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return (~ret); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> operator!() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return (!ret); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> operator+() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return ret; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> operator-() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return (-ret); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> get() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return ret; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) void set(const ap_int_base<_AP_W2, false>& val) { + d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val.V) __Repl2__ = val.V; 
__builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int length() const { + return h_index >= l_index ? h_index - l_index + 1 : l_index - h_index + 1; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int to_int() const { + return (int)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned to_uint() const { + ap_int_base<_AP_W, false> t; + t.V = d_bv.V; + return (unsigned)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(t.V) __Val2__ = t.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) long to_long() const { + return (long)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned long to_ulong() const { + ap_int_base<_AP_W, false> t; + t.V = d_bv.V; + return (unsigned long)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(t.V) __Val2__ = t.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_slong to_int64() const { + return (ap_slong)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) 
__attribute__((nodebug)) ap_ulong to_uint64() const { + ap_int_base<_AP_W, false> t; + t.V = d_bv.V; + return (ap_ulong)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(t.V) __Val2__ = t.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool and_reduce() const { + bool ret = true; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + VITIS_LOOP_676_1: for (unsigned i = low; i != high; ++i) { + +#pragma HLS unroll + + ret &= ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), i); __Result__; }); + } + return ret; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool or_reduce() const { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + VITIS_LOOP_690_1: for (unsigned i = low; i != high; ++i) { + +#pragma HLS unroll + + ret |= ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), i); __Result__; }); + } + return ret; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool xor_reduce() const { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? 
l_index : h_index; + VITIS_LOOP_704_1: for (unsigned i = low; i != high; ++i) { + +#pragma HLS unroll + + ret ^= ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), i); __Result__; }); + } + return ret; + } +# 720 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) char* to_string(signed char radix = 2) const { + (void)(radix); + return 0; + } + +}; +# 760 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" +template +struct ap_bit_ref { + + + + typedef ap_int_base<_AP_W, _AP_S> ref_type; + ref_type& d_bv; + int d_index; + + public: + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref(const ap_bit_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref(const ref_type* bv, int index = 0) + : d_bv(*const_cast(bv)), d_index(index) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator bool() const { return ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), d_index); __Result__; }); } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool to_bool() const { return ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), d_index); __Result__; }); } +# 798 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(bool val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; 
__builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(char val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(signed char val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(unsigned char val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(short val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(unsigned short val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } 
+ inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(int val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(unsigned int val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(long val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(unsigned long val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(ap_slong val) { d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(ap_ulong val) { d_bv.V = ({ typename 
_ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } +# 820 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(half val) { bool tmp_val = val; d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp_val) __Repl2__ = !!tmp_val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(float val) { bool tmp_val = val; d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp_val) __Repl2__ = !!tmp_val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(double val) { bool tmp_val = val; d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp_val) __Repl2__ = !!tmp_val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); return *this; } + + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(val.V != 0)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_int_base<_AP_W2, false>)val); + } + + + + inline __attribute__((always_inline)) 
__attribute__((nodebug)) ap_bit_ref& operator=(const ap_bit_ref& val) { + return operator=((ap_ulong)(bool)val); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(bool)val); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((ap_ulong)(bool)val); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_bit_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)val); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, a2); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + ap_int_base<_AP_W2, _AP_S2> op(a2); + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + template + inline __attribute__((always_inline)) 
__attribute__((nodebug)) ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + ap_int_base<_AP_W2, _AP_S2> op(a2); + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( + const ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref< + 1, ap_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + 1, ap_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, 
_AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + a2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator==(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + return get() == op.get(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator!=(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + return get() != op.get(); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool get() const { return ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), d_index); __Result__; }); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool get() { return ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), d_index); __Result__; }); } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) void set(const ap_int_base<_AP_W3, false>& val) { + operator=(val); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator~() const { + bool bit = ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), d_index); __Result__; }); + return bit ? 
false : true; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int length() const { return 1; } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) char* to_string() const { return 0; } + +}; +# 1018 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, bool op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(bool op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, bool op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, bool op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(bool op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, bool op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, bool 
op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, bool op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(bool op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, bool op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, bool op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(bool op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, bool op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<1, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, char op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, char op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, char op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, char op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<8, CHAR_IS_SIGNED>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, signed char op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(signed char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, signed char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, signed char op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(signed char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const 
ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, signed char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, signed char op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(signed char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, signed char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, signed char op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(signed char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, signed char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<8, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<8, false>(op2); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned char op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(unsigned char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned char op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(unsigned char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned char op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(unsigned char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) 
bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned char op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(unsigned char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<8, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, short op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<_AP_SIZE_short, true>(op2); } 
template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, short op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, short op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, short op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool 
operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<_AP_SIZE_short, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned short op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(unsigned short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned short op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(unsigned short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return 
ap_int_base<_AP_W, false>(op) >= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned short op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(unsigned short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned short op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(unsigned short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<_AP_SIZE_short, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, int op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(int op2, const 
ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, int op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(int op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, int op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(int op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, int 
op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, int op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(int op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<_AP_SIZE_int, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned int op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(unsigned int op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned int op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(unsigned int op2, const ap_bit_ref<_AP_W, 
_AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned int op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(unsigned int op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned int op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(unsigned int op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<_AP_SIZE_int, false>(op2); } +template inline __attribute__((always_inline)) 
__attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, long op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, long op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, long op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(long 
op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, long op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<_AP_SIZE_long, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned long op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(unsigned long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned long op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(unsigned long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned long op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(unsigned long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned long op2) { return bool(op) 
<= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(unsigned long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<_AP_SIZE_long, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_bit_ref<_AP_W, _AP_S>& op, ap_slong op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(ap_slong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_slong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, ap_slong op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(ap_slong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_slong op2) { return ap_int_base<_AP_W + _AP_W1, 
false>(op) < ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, ap_slong op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(ap_slong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_slong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, ap_slong op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(ap_slong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_slong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<_AP_SIZE_ap_slong, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool 
operator >(const ap_bit_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(ap_ulong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_ulong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) > ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_bit_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(ap_ulong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_ulong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) < ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_bit_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(ap_ulong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_concat_ref<_AP_W, 
_AP_T, _AP_W1, _AP_T1>& op, ap_ulong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) >= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_bit_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(ap_ulong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_ulong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) <= ap_int_base<_AP_SIZE_ap_slong, false>(op2); } + + + + + + +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, bool op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(bool op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, bool op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<1, false>(op2); } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, bool op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(bool op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, bool op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<1, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, char op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<8, CHAR_IS_SIGNED>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, char op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, char 
op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<8, CHAR_IS_SIGNED>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, signed char op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(signed char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, signed char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<8, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, signed char op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(signed char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, signed char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<8, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator 
==(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned char op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(unsigned char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<8, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned char op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(unsigned char op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned char op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<8, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, short op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, 
_AP_T1>& op, short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<_AP_SIZE_short, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, short op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<_AP_SIZE_short, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned short op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(unsigned short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<_AP_SIZE_short, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<_AP_SIZE_short, false>(op2); } template 
inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned short op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(unsigned short op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned short op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<_AP_SIZE_short, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, int op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(int op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<_AP_SIZE_int, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, int op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(int op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) 
bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<_AP_SIZE_int, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned int op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(unsigned int op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<_AP_SIZE_int, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned int op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(unsigned int op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned int op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<_AP_SIZE_int, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, 
false>(op) == ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, long op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<_AP_SIZE_long, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, long op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<_AP_SIZE_long, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned long op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(unsigned long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } 
template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<_AP_SIZE_long, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, unsigned long op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(unsigned long op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, unsigned long op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<_AP_SIZE_long, false>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, ap_slong op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(ap_slong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_slong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator 
!=(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<_AP_SIZE_ap_slong, true>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, ap_slong op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(ap_slong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_slong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<_AP_SIZE_ap_slong, true>(op2); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_bit_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(ap_ulong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_ulong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) == ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<_AP_SIZE_ap_slong, false>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_bit_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) bool operator !=(ap_ulong op2, const ap_bit_ref<_AP_W, _AP_S>& op) { return op2 != bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, ap_ulong op2) { return ap_int_base<_AP_W + _AP_W1, false>(op) != ap_int_base<_AP_SIZE_ap_slong, false>(op2); } +# 1077 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(1), (false)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(1), (false)>::template RType<_AP_W, false>::plus operator +(bool op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(1), (false)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(1), (false)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(1), (false)>::template RType<_AP_W, false>::minus operator -(bool op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(1), (false)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(1), (false)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(1), (false)>::template RType<_AP_W, 
false>::mult operator *(bool op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(1), (false)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(1), (false)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(1), (false)>::template RType<_AP_W, false>::div operator /(bool op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(1), (false)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(1), (false)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(1), (false)>::template RType<_AP_W, false>::mod operator %(bool op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(1), (false)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (CHAR_IS_SIGNED)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (CHAR_IS_SIGNED)>::template RType<_AP_W, false>::plus operator +(char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (CHAR_IS_SIGNED)>::minus 
operator -(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (CHAR_IS_SIGNED)>::template RType<_AP_W, false>::minus operator -(char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (CHAR_IS_SIGNED)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (CHAR_IS_SIGNED)>::template RType<_AP_W, false>::mult operator *(char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (CHAR_IS_SIGNED)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (CHAR_IS_SIGNED)>::template RType<_AP_W, false>::div operator /(char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (CHAR_IS_SIGNED)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename 
ap_int_base<(8), (CHAR_IS_SIGNED)>::template RType<_AP_W, false>::mod operator %(char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (true)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (true)>::template RType<_AP_W, false>::plus operator +(signed char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (true)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (true)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (true)>::template RType<_AP_W, false>::minus operator -(signed char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (true)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (true)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (true)>::template RType<_AP_W, false>::mult operator *(signed char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (true)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename 
ap_int_base<_AP_W, false>::template RType<(8), (true)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (true)>::template RType<_AP_W, false>::div operator /(signed char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (true)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (true)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (true)>::template RType<_AP_W, false>::mod operator %(signed char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (true)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (false)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (false)>::template RType<_AP_W, false>::plus operator +(unsigned char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (false)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (false)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename 
ap_int_base<(8), (false)>::template RType<_AP_W, false>::minus operator -(unsigned char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (false)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (false)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (false)>::template RType<_AP_W, false>::mult operator *(unsigned char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (false)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (false)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (false)>::template RType<_AP_W, false>::div operator /(unsigned char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (false)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (false)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (false)>::template RType<_AP_W, false>::mod operator %(unsigned char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (false)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename 
ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (true)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (true)>::template RType<_AP_W, false>::plus operator +(short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (true)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (true)>::template RType<_AP_W, false>::minus operator -(short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (true)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (true)>::template RType<_AP_W, false>::mult operator *(short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (true)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) / 
ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (true)>::template RType<_AP_W, false>::div operator /(short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (true)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (true)>::template RType<_AP_W, false>::mod operator %(short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (false)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (false)>::template RType<_AP_W, false>::plus operator +(unsigned short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (false)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename 
ap_int_base<(_AP_SIZE_short), (false)>::template RType<_AP_W, false>::minus operator -(unsigned short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (false)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (false)>::template RType<_AP_W, false>::mult operator *(unsigned short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (false)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (false)>::template RType<_AP_W, false>::div operator /(unsigned short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (false)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (false)>::template RType<_AP_W, false>::mod operator %(unsigned short op2, const 
ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (true)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (true)>::template RType<_AP_W, false>::plus operator +(int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (true)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (true)>::template RType<_AP_W, false>::minus operator -(int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (true)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (true)>::template RType<_AP_W, false>::mult operator *(int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, 
false>::template RType<(_AP_SIZE_int), (true)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (true)>::template RType<_AP_W, false>::div operator /(int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (true)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (true)>::template RType<_AP_W, false>::mod operator %(int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (false)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (false)>::template RType<_AP_W, false>::plus operator +(unsigned int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (false)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(_AP_SIZE_int), 
(false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (false)>::template RType<_AP_W, false>::minus operator -(unsigned int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (false)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (false)>::template RType<_AP_W, false>::mult operator *(unsigned int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (false)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (false)>::template RType<_AP_W, false>::div operator /(unsigned int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (false)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (false)>::template RType<_AP_W, 
false>::mod operator %(unsigned int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (true)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (true)>::template RType<_AP_W, false>::plus operator +(long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (true)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (true)>::template RType<_AP_W, false>::minus operator -(long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (true)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (true)>::template RType<_AP_W, false>::mult operator *(long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) * ap_int_base<_AP_W, false>(op); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (true)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (true)>::template RType<_AP_W, false>::div operator /(long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (true)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (true)>::template RType<_AP_W, false>::mod operator %(long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (false)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (false)>::template RType<_AP_W, false>::plus operator +(unsigned long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (false)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& 
op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (false)>::template RType<_AP_W, false>::minus operator -(unsigned long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (false)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (false)>::template RType<_AP_W, false>::mult operator *(unsigned long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (false)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (false)>::template RType<_AP_W, false>::div operator /(unsigned long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (false)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (false)>::template RType<_AP_W, false>::mod operator %(unsigned long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (true)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (true)>::template RType<_AP_W, false>::plus operator +(ap_slong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) + ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (true)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (true)>::template RType<_AP_W, false>::minus operator -(ap_slong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (true)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (true)>::template RType<_AP_W, 
false>::mult operator *(ap_slong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (true)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (true)>::template RType<_AP_W, false>::div operator /(ap_slong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (true)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (true)>::template RType<_AP_W, false>::mod operator %(ap_slong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) % ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (false)>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) + ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (false)>::template RType<_AP_W, false>::plus operator +(ap_ulong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) + 
ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (false)>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) - ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (false)>::template RType<_AP_W, false>::minus operator -(ap_ulong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) - ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (false)>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) * ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (false)>::template RType<_AP_W, false>::mult operator *(ap_ulong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) * ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (false)>::div operator /(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) / ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (false)>::template RType<_AP_W, false>::div operator /(ap_ulong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) / ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename 
ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (false)>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) % ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (false)>::template RType<_AP_W, false>::mod operator %(ap_ulong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) % ap_int_base<_AP_W, false>(op); } +# 1100 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(1), (false)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(1), (false)>::template RType<_AP_W, false>::logic operator &(bool op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(1), (false)>(op2) & ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(1), (false)>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(1), (false)>::template RType<_AP_W, false>::logic operator |(bool op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(1), (false)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(1), (false)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) 
^ ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(1), (false)>::template RType<_AP_W, false>::logic operator ^(bool op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(1), (false)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(1), (false)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(1), (false)>::template RType<_AP_W, false>::arg1 operator >>(bool op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(1), (false)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(1), (false)>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& op, bool op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(1), (false)>::template RType<_AP_W, false>::arg1 operator <<(bool op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(1), (false)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (CHAR_IS_SIGNED)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (CHAR_IS_SIGNED)>::template RType<_AP_W, false>::logic operator &(char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) & 
ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (CHAR_IS_SIGNED)>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (CHAR_IS_SIGNED)>::template RType<_AP_W, false>::logic operator |(char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (CHAR_IS_SIGNED)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (CHAR_IS_SIGNED)>::template RType<_AP_W, false>::logic operator ^(char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (CHAR_IS_SIGNED)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (CHAR_IS_SIGNED)>::template RType<_AP_W, false>::arg1 operator >>(char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (CHAR_IS_SIGNED)>::arg1 operator <<(const 
ap_range_ref<_AP_W, _AP_S>& op, char op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (CHAR_IS_SIGNED)>::template RType<_AP_W, false>::arg1 operator <<(char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (true)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (true)>::template RType<_AP_W, false>::logic operator &(signed char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (true)>(op2) & ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (true)>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (true)>::template RType<_AP_W, false>::logic operator |(signed char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (true)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (true)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (true)>::template RType<_AP_W, false>::logic operator 
^(signed char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (true)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (true)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (true)>::template RType<_AP_W, false>::arg1 operator >>(signed char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (true)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (true)>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (true)>::template RType<_AP_W, false>::arg1 operator <<(signed char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (true)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (false)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (false)>::template RType<_AP_W, false>::logic operator &(unsigned char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (false)>(op2) & ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (false)>::logic operator 
|(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (false)>::template RType<_AP_W, false>::logic operator |(unsigned char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (false)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (false)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (false)>::template RType<_AP_W, false>::logic operator ^(unsigned char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (false)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (false)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (false)>::template RType<_AP_W, false>::arg1 operator >>(unsigned char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (false)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(8), (false)>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(8), (false)>::template RType<_AP_W, 
false>::arg1 operator <<(unsigned char op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(8), (false)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (true)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (true)>::template RType<_AP_W, false>::logic operator &(short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) & ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (true)>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (true)>::template RType<_AP_W, false>::logic operator |(short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (true)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (true)>::template RType<_AP_W, false>::logic operator ^(short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (true)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (true)>::template RType<_AP_W, false>::arg1 operator >>(short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (true)>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& op, short op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (true)>::template RType<_AP_W, false>::arg1 operator <<(short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (false)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (false)>::template RType<_AP_W, false>::logic operator &(unsigned short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) & ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (false)>::logic operator 
|(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (false)>::template RType<_AP_W, false>::logic operator |(unsigned short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (false)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (false)>::template RType<_AP_W, false>::logic operator ^(unsigned short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (false)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (false)>::template RType<_AP_W, false>::arg1 operator >>(unsigned short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_short), (false)>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) << 
ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_short), (false)>::template RType<_AP_W, false>::arg1 operator <<(unsigned short op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (true)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (true)>::template RType<_AP_W, false>::logic operator &(int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) & ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (true)>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (true)>::template RType<_AP_W, false>::logic operator |(int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (true)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (true)>::template RType<_AP_W, 
false>::logic operator ^(int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (true)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (true)>::template RType<_AP_W, false>::arg1 operator >>(int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (true)>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& op, int op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (true)>::template RType<_AP_W, false>::arg1 operator <<(int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (false)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (false)>::template RType<_AP_W, false>::logic operator &(unsigned int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) & ap_int_base<_AP_W, false>(op); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (false)>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (false)>::template RType<_AP_W, false>::logic operator |(unsigned int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (false)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (false)>::template RType<_AP_W, false>::logic operator ^(unsigned int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (false)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (false)>::template RType<_AP_W, false>::arg1 operator >>(unsigned int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_int), (false)>::arg1 operator 
<<(const ap_range_ref<_AP_W, _AP_S>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_int), (false)>::template RType<_AP_W, false>::arg1 operator <<(unsigned int op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (true)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (true)>::template RType<_AP_W, false>::logic operator &(long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) & ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (true)>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (true)>::template RType<_AP_W, false>::logic operator |(long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (true)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (true)>::template RType<_AP_W, false>::logic operator ^(long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (true)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (true)>::template RType<_AP_W, false>::arg1 operator >>(long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (true)>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& op, long op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (true)>::template RType<_AP_W, false>::arg1 operator <<(long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (false)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (false)>::template RType<_AP_W, false>::logic operator &(unsigned long op2, const 
ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) & ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (false)>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (false)>::template RType<_AP_W, false>::logic operator |(unsigned long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (false)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (false)>::template RType<_AP_W, false>::logic operator ^(unsigned long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (false)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (false)>::template RType<_AP_W, false>::arg1 operator >>(unsigned long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) >> ap_int_base<_AP_W, false>(op); } template 
inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_long), (false)>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_long), (false)>::template RType<_AP_W, false>::arg1 operator <<(unsigned long op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (true)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) & ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (true)>::template RType<_AP_W, false>::logic operator &(ap_slong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) & ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (true)>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (true)>::template RType<_AP_W, false>::logic operator |(ap_slong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template 
RType<(_AP_SIZE_ap_slong), (true)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (true)>::template RType<_AP_W, false>::logic operator ^(ap_slong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (true)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (true)>::template RType<_AP_W, false>::arg1 operator >>(ap_slong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (true)>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (true)>::template RType<_AP_W, false>::arg1 operator <<(ap_slong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) << ap_int_base<_AP_W, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (false)>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return 
ap_int_base<_AP_W, false>(op) & ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (false)>::template RType<_AP_W, false>::logic operator &(ap_ulong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) & ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (false)>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) | ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (false)>::template RType<_AP_W, false>::logic operator |(ap_ulong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) | ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (false)>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) ^ ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (false)>::template RType<_AP_W, false>::logic operator ^(ap_ulong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) ^ ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (false)>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) >> ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (false)>::template RType<_AP_W, false>::arg1 operator >>(ap_ulong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) >> ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<(_AP_SIZE_ap_slong), (false)>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) << ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<(_AP_SIZE_ap_slong), (false)>::template RType<_AP_W, false>::arg1 operator <<(ap_ulong op2, const ap_range_ref<_AP_W, _AP_S>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) << ap_int_base<_AP_W, false>(op); } +# 1128 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::plus operator +(const ap_range_ref<_AP_W, _AP_S>& lhs, const ap_range_ref<_AP_W2, _AP_S2>& rhs) { return (lhs.operator ap_int_base<_AP_W, false>())+( rhs.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::minus operator -(const ap_range_ref<_AP_W, _AP_S>& lhs, const ap_range_ref<_AP_W2, _AP_S2>& rhs) { return (lhs.operator ap_int_base<_AP_W, false>())-( rhs.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::mult operator *(const ap_range_ref<_AP_W, _AP_S>& lhs, const ap_range_ref<_AP_W2, _AP_S2>& rhs) { return (lhs.operator ap_int_base<_AP_W, false>())*( 
rhs.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::div operator /(const ap_range_ref<_AP_W, _AP_S>& lhs, const ap_range_ref<_AP_W2, _AP_S2>& rhs) { return (lhs.operator ap_int_base<_AP_W, false>())/( rhs.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::mod operator %(const ap_range_ref<_AP_W, _AP_S>& lhs, const ap_range_ref<_AP_W2, _AP_S2>& rhs) { return (lhs.operator ap_int_base<_AP_W, false>())%( rhs.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::logic operator &(const ap_range_ref<_AP_W, _AP_S>& lhs, const ap_range_ref<_AP_W2, _AP_S2>& rhs) { return (lhs.operator ap_int_base<_AP_W, false>())&( rhs.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::logic operator |(const ap_range_ref<_AP_W, _AP_S>& lhs, const ap_range_ref<_AP_W2, _AP_S2>& rhs) { return (lhs.operator ap_int_base<_AP_W, false>())|( rhs.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::logic operator ^(const ap_range_ref<_AP_W, _AP_S>& lhs, const ap_range_ref<_AP_W2, _AP_S2>& rhs) { return (lhs.operator ap_int_base<_AP_W, false>())^( rhs.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::arg1 operator >>(const ap_range_ref<_AP_W, _AP_S>& lhs, const ap_range_ref<_AP_W2, _AP_S2>& rhs) { return (lhs.operator 
ap_int_base<_AP_W, false>())>>( rhs.operator ap_int_base<_AP_W2, false>()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::arg1 operator <<(const ap_range_ref<_AP_W, _AP_S>& lhs, const ap_range_ref<_AP_W2, _AP_S2>& rhs) { return (lhs.operator ap_int_base<_AP_W, false>())<<( rhs.operator ap_int_base<_AP_W2, false>()); } +# 1177 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< _AP_RW1 + _AP_RW2, false>::plus operator +( const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { return lhs.get() + rhs.get(); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< _AP_RW1 + _AP_RW2, false>::minus operator -( const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { return lhs.get() - rhs.get(); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< _AP_RW1 + _AP_RW2, false>::mult operator *( const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { return lhs.get() * rhs.get(); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< _AP_RW1 + _AP_RW2, false>::div operator /( const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { return lhs.get() / rhs.get(); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_LW1 + _AP_LW2, 
false>::template RType< _AP_RW1 + _AP_RW2, false>::mod operator %( const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { return lhs.get() % rhs.get(); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< _AP_RW1 + _AP_RW2, false>::logic operator &( const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { return lhs.get() & rhs.get(); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< _AP_RW1 + _AP_RW2, false>::logic operator |( const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { return lhs.get() | rhs.get(); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< _AP_RW1 + _AP_RW2, false>::logic operator ^( const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { return lhs.get() ^ rhs.get(); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< _AP_RW1 + _AP_RW2, false>::arg1 operator >>( const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { return lhs.get() >> rhs.get(); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< _AP_RW1 + _AP_RW2, false>::arg1 operator <<( const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { return lhs.get() << rhs.get(); } +# 1324 
"/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 1, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, bool op2) { ap_int_base<1 + _AP_W, false> val(op2); ap_int_base<1 + _AP_W, false> ret(op1); ret <<= 1; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 1, false> operator,( bool op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<1 + _AP_W, false> val(op1); ap_int_base<1 + _AP_W, false> ret(op2); if (_AP_S) { ret <<= 1; ret >>= 1; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 1, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, bool op2) { ap_int_base<1 + _AP_W, false> val(op2); ap_int_base<1 + _AP_W, false> ret(op1); ret <<= 1; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 1, false> operator,( bool op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<1 + _AP_W, false> val(op1); ap_int_base<1 + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, bool op2) { ap_int_base<1 + 1, false> val(op2); val[1] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + 1, false> operator,( bool op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<1 + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + 1, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> 
&op1, bool op2) { ap_int_base<1 + _AP_W + _AP_W2, false> val(op2); ap_int_base<1 + _AP_W + _AP_W2, false> ret(op1); if (false) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= 1; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + 1, false> operator,( bool op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<1 + _AP_W + _AP_W2, false> val(op1); ap_int_base<1 + _AP_W + _AP_W2, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 1, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, bool op2) { ap_int_base<1 + _AP_W, false> val(op2); ap_int_base<1 + _AP_W, false> ret(op1); if (false) { val <<= _AP_W; val >>= _AP_W; } ret <<= 1; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 1, false> operator,( bool op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<1 + _AP_W, false> val(op1); ap_int_base<1 + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + 1, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, bool op2) { ap_int_base<1 + 1, false> val(op2); val[1] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + 1, false> operator,( bool op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<1 + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, char op2) { ap_int_base<8 + _AP_W, false> val(op2); ap_int_base<8 + _AP_W, 
false> ret(op1); ret <<= 8; if (CHAR_IS_SIGNED) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( char op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<8 + _AP_W, false> val(op1); ap_int_base<8 + _AP_W, false> ret(op2); if (_AP_S) { ret <<= 8; ret >>= 8; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, char op2) { ap_int_base<8 + _AP_W, false> val(op2); ap_int_base<8 + _AP_W, false> ret(op1); ret <<= 8; if (CHAR_IS_SIGNED) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( char op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<8 + _AP_W, false> val(op1); ap_int_base<8 + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<8 + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, char op2) { ap_int_base<8 + 1, false> val(op2); val[8] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<8 + 1, false> operator,( char op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<8 + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + 8, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, char op2) { ap_int_base<8 + _AP_W + _AP_W2, CHAR_IS_SIGNED> val(op2); ap_int_base<8 + _AP_W + _AP_W2, CHAR_IS_SIGNED> ret(op1); if (CHAR_IS_SIGNED) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= 8; ret |= val; return ret; } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + 8, false> operator,( char op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<8 + _AP_W + _AP_W2, CHAR_IS_SIGNED> val(op1); ap_int_base<8 + _AP_W + _AP_W2, CHAR_IS_SIGNED> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, char op2) { ap_int_base<8 + _AP_W, false> val(op2); ap_int_base<8 + _AP_W, false> ret(op1); if (CHAR_IS_SIGNED) { val <<= _AP_W; val >>= _AP_W; } ret <<= 8; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( char op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<8 + _AP_W, false> val(op1); ap_int_base<8 + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + 8, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, char op2) { ap_int_base<8 + 1, CHAR_IS_SIGNED> val(op2); val[8] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + 8, false> operator,( char op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<8 + 1, CHAR_IS_SIGNED> val(op1); val <<= 1; val[0] = op2; return val; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, signed char op2) { ap_int_base<8 + _AP_W, false> val(op2); ap_int_base<8 + _AP_W, false> ret(op1); ret <<= 8; if (true) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( 
signed char op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<8 + _AP_W, false> val(op1); ap_int_base<8 + _AP_W, false> ret(op2); if (_AP_S) { ret <<= 8; ret >>= 8; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, signed char op2) { ap_int_base<8 + _AP_W, false> val(op2); ap_int_base<8 + _AP_W, false> ret(op1); ret <<= 8; if (true) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( signed char op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<8 + _AP_W, false> val(op1); ap_int_base<8 + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<8 + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, signed char op2) { ap_int_base<8 + 1, false> val(op2); val[8] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<8 + 1, false> operator,( signed char op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<8 + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + 8, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, signed char op2) { ap_int_base<8 + _AP_W + _AP_W2, true> val(op2); ap_int_base<8 + _AP_W + _AP_W2, true> ret(op1); if (true) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= 8; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + 8, false> operator,( signed char op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<8 + _AP_W + _AP_W2, true> val(op1); ap_int_base<8 + _AP_W + _AP_W2, true> 
ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, signed char op2) { ap_int_base<8 + _AP_W, false> val(op2); ap_int_base<8 + _AP_W, false> ret(op1); if (true) { val <<= _AP_W; val >>= _AP_W; } ret <<= 8; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( signed char op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<8 + _AP_W, false> val(op1); ap_int_base<8 + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + 8, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, signed char op2) { ap_int_base<8 + 1, true> val(op2); val[8] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + 8, false> operator,( signed char op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<8 + 1, true> val(op1); val <<= 1; val[0] = op2; return val; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, unsigned char op2) { ap_int_base<8 + _AP_W, false> val(op2); ap_int_base<8 + _AP_W, false> ret(op1); ret <<= 8; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( unsigned char op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<8 + _AP_W, false> val(op1); ap_int_base<8 + _AP_W, false> ret(op2); if (_AP_S) { ret <<= 8; ret >>= 8; } ret |= val << _AP_W; return ret; } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, unsigned char op2) { ap_int_base<8 + _AP_W, false> val(op2); ap_int_base<8 + _AP_W, false> ret(op1); ret <<= 8; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( unsigned char op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<8 + _AP_W, false> val(op1); ap_int_base<8 + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<8 + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, unsigned char op2) { ap_int_base<8 + 1, false> val(op2); val[8] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<8 + 1, false> operator,( unsigned char op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<8 + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + 8, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, unsigned char op2) { ap_int_base<8 + _AP_W + _AP_W2, false> val(op2); ap_int_base<8 + _AP_W + _AP_W2, false> ret(op1); if (false) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= 8; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + 8, false> operator,( unsigned char op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<8 + _AP_W + _AP_W2, false> val(op1); ap_int_base<8 + _AP_W + _AP_W2, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( const 
af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, unsigned char op2) { ap_int_base<8 + _AP_W, false> val(op2); ap_int_base<8 + _AP_W, false> ret(op1); if (false) { val <<= _AP_W; val >>= _AP_W; } ret <<= 8; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + 8, false> operator,( unsigned char op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<8 + _AP_W, false> val(op1); ap_int_base<8 + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + 8, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, unsigned char op2) { ap_int_base<8 + 1, false> val(op2); val[8] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + 8, false> operator,( unsigned char op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<8 + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, short op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op1); ret <<= _AP_SIZE_short; if (true) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( short op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op2); if (_AP_S) { ret <<= _AP_SIZE_short; ret >>= _AP_SIZE_short; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> 
operator,( const ap_range_ref<_AP_W, _AP_S> &op1, short op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op1); ret <<= _AP_SIZE_short; if (true) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( short op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_short + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, short op2) { ap_int_base<_AP_SIZE_short + 1, false> val(op2); val[_AP_SIZE_short] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_short + 1, false> operator,( short op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_short + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_short, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, short op2) { ap_int_base<_AP_SIZE_short + _AP_W + _AP_W2, true> val(op2); ap_int_base<_AP_SIZE_short + _AP_W + _AP_W2, true> ret(op1); if (true) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= _AP_SIZE_short; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_short, false> operator,( short op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<_AP_SIZE_short + _AP_W + _AP_W2, true> val(op1); ap_int_base<_AP_SIZE_short + _AP_W + _AP_W2, true> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, short op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op1); if (true) { val <<= _AP_W; val >>= _AP_W; } ret <<= _AP_SIZE_short; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( short op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_short, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, short op2) { ap_int_base<_AP_SIZE_short + 1, true> val(op2); val[_AP_SIZE_short] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_short, false> operator,( short op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_short + 1, true> val(op1); val <<= 1; val[0] = op2; return val; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, unsigned short op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op1); ret <<= _AP_SIZE_short; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( unsigned short op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_short + 
_AP_W, false> ret(op2); if (_AP_S) { ret <<= _AP_SIZE_short; ret >>= _AP_SIZE_short; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, unsigned short op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op1); ret <<= _AP_SIZE_short; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( unsigned short op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_short + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, unsigned short op2) { ap_int_base<_AP_SIZE_short + 1, false> val(op2); val[_AP_SIZE_short] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_short + 1, false> operator,( unsigned short op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_short + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_short, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, unsigned short op2) { ap_int_base<_AP_SIZE_short + _AP_W + _AP_W2, false> val(op2); ap_int_base<_AP_SIZE_short + _AP_W + _AP_W2, false> ret(op1); if (false) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= _AP_SIZE_short; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_short, false> operator,( unsigned 
short op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<_AP_SIZE_short + _AP_W + _AP_W2, false> val(op1); ap_int_base<_AP_SIZE_short + _AP_W + _AP_W2, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, unsigned short op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op1); if (false) { val <<= _AP_W; val >>= _AP_W; } ret <<= _AP_SIZE_short; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_short, false> operator,( unsigned short op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_short + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_short + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_short, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, unsigned short op2) { ap_int_base<_AP_SIZE_short + 1, false> val(op2); val[_AP_SIZE_short] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_short, false> operator,( unsigned short op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_short + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, int op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op1); ret <<= _AP_SIZE_int; if (true) { val <<= _AP_W; val >>= _AP_W; } ret 
|= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( int op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op2); if (_AP_S) { ret <<= _AP_SIZE_int; ret >>= _AP_SIZE_int; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, int op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op1); ret <<= _AP_SIZE_int; if (true) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( int op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_int + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, int op2) { ap_int_base<_AP_SIZE_int + 1, false> val(op2); val[_AP_SIZE_int] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_int + 1, false> operator,( int op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_int + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_int, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, int op2) { ap_int_base<_AP_SIZE_int + _AP_W + _AP_W2, true> val(op2); ap_int_base<_AP_SIZE_int + _AP_W + _AP_W2, true> ret(op1); if (true) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= 
_AP_SIZE_int; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_int, false> operator,( int op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<_AP_SIZE_int + _AP_W + _AP_W2, true> val(op1); ap_int_base<_AP_SIZE_int + _AP_W + _AP_W2, true> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, int op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op1); if (true) { val <<= _AP_W; val >>= _AP_W; } ret <<= _AP_SIZE_int; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( int op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_int, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, int op2) { ap_int_base<_AP_SIZE_int + 1, true> val(op2); val[_AP_SIZE_int] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_int, false> operator,( int op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_int + 1, true> val(op1); val <<= 1; val[0] = op2; return val; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, unsigned int op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op2); 
ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op1); ret <<= _AP_SIZE_int; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( unsigned int op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op2); if (_AP_S) { ret <<= _AP_SIZE_int; ret >>= _AP_SIZE_int; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, unsigned int op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op1); ret <<= _AP_SIZE_int; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( unsigned int op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_int + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, unsigned int op2) { ap_int_base<_AP_SIZE_int + 1, false> val(op2); val[_AP_SIZE_int] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_int + 1, false> operator,( unsigned int op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_int + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_int, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, unsigned int op2) { 
ap_int_base<_AP_SIZE_int + _AP_W + _AP_W2, false> val(op2); ap_int_base<_AP_SIZE_int + _AP_W + _AP_W2, false> ret(op1); if (false) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= _AP_SIZE_int; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_int, false> operator,( unsigned int op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<_AP_SIZE_int + _AP_W + _AP_W2, false> val(op1); ap_int_base<_AP_SIZE_int + _AP_W + _AP_W2, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, unsigned int op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op1); if (false) { val <<= _AP_W; val >>= _AP_W; } ret <<= _AP_SIZE_int; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_int, false> operator,( unsigned int op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_int + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_int + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_int, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, unsigned int op2) { ap_int_base<_AP_SIZE_int + 1, false> val(op2); val[_AP_SIZE_int] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_int, false> operator,( unsigned int op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_int + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } +template 
inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, long op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op1); ret <<= _AP_SIZE_long; if (true) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( long op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op2); if (_AP_S) { ret <<= _AP_SIZE_long; ret >>= _AP_SIZE_long; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, long op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op1); ret <<= _AP_SIZE_long; if (true) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( long op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_long + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, long op2) { ap_int_base<_AP_SIZE_long + 1, false> val(op2); val[_AP_SIZE_long] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_long + 1, false> operator,( long op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_long + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_long, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, long op2) { ap_int_base<_AP_SIZE_long + _AP_W + _AP_W2, true> val(op2); ap_int_base<_AP_SIZE_long + _AP_W + _AP_W2, true> ret(op1); if (true) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= _AP_SIZE_long; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_long, false> operator,( long op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<_AP_SIZE_long + _AP_W + _AP_W2, true> val(op1); ap_int_base<_AP_SIZE_long + _AP_W + _AP_W2, true> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, long op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op1); if (true) { val <<= _AP_W; val >>= _AP_W; } ret <<= _AP_SIZE_long; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( long op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_long, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, long op2) { ap_int_base<_AP_SIZE_long + 1, true> val(op2); val[_AP_SIZE_long] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_long, false> operator,( long op1, const 
af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_long + 1, true> val(op1); val <<= 1; val[0] = op2; return val; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, unsigned long op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op1); ret <<= _AP_SIZE_long; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( unsigned long op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op2); if (_AP_S) { ret <<= _AP_SIZE_long; ret >>= _AP_SIZE_long; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, unsigned long op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op1); ret <<= _AP_SIZE_long; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( unsigned long op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_long + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, unsigned long op2) { ap_int_base<_AP_SIZE_long + 1, false> val(op2); val[_AP_SIZE_long] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) 
ap_int_base<_AP_SIZE_long + 1, false> operator,( unsigned long op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_long + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_long, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, unsigned long op2) { ap_int_base<_AP_SIZE_long + _AP_W + _AP_W2, false> val(op2); ap_int_base<_AP_SIZE_long + _AP_W + _AP_W2, false> ret(op1); if (false) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= _AP_SIZE_long; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_long, false> operator,( unsigned long op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<_AP_SIZE_long + _AP_W + _AP_W2, false> val(op1); ap_int_base<_AP_SIZE_long + _AP_W + _AP_W2, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, unsigned long op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op1); if (false) { val <<= _AP_W; val >>= _AP_W; } ret <<= _AP_SIZE_long; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_long, false> operator,( unsigned long op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_long + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_long + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_long, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, 
_AP_N> &op1, unsigned long op2) { ap_int_base<_AP_SIZE_long + 1, false> val(op2); val[_AP_SIZE_long] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_long, false> operator,( unsigned long op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_long + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, ap_slong op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op1); ret <<= _AP_SIZE_ap_slong; if (true) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( ap_slong op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op2); if (_AP_S) { ret <<= _AP_SIZE_ap_slong; ret >>= _AP_SIZE_ap_slong; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, ap_slong op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op1); ret <<= _AP_SIZE_ap_slong; if (true) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( ap_slong op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_ap_slong + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, ap_slong op2) { ap_int_base<_AP_SIZE_ap_slong + 1, false> val(op2); val[_AP_SIZE_ap_slong] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_ap_slong + 1, false> operator,( ap_slong op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_ap_slong + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_ap_slong, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, ap_slong op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W + _AP_W2, true> val(op2); ap_int_base<_AP_SIZE_ap_slong + _AP_W + _AP_W2, true> ret(op1); if (true) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= _AP_SIZE_ap_slong; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_ap_slong, false> operator,( ap_slong op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W + _AP_W2, true> val(op1); ap_int_base<_AP_SIZE_ap_slong + _AP_W + _AP_W2, true> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, ap_slong op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op1); if (true) { val <<= _AP_W; val >>= _AP_W; } ret <<= _AP_SIZE_ap_slong; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( ap_slong op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, 
_AP_N> &op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_ap_slong, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, ap_slong op2) { ap_int_base<_AP_SIZE_ap_slong + 1, true> val(op2); val[_AP_SIZE_ap_slong] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_ap_slong, false> operator,( ap_slong op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_ap_slong + 1, true> val(op1); val <<= 1; val[0] = op2; return val; } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( const ap_int_base<_AP_W, _AP_S> &op1, ap_ulong op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op1); ret <<= _AP_SIZE_ap_slong; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( ap_ulong op1, const ap_int_base<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op2); if (_AP_S) { ret <<= _AP_SIZE_ap_slong; ret >>= _AP_SIZE_ap_slong; } ret |= val << _AP_W; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( const ap_range_ref<_AP_W, _AP_S> &op1, ap_ulong op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op2); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op1); ret <<= _AP_SIZE_ap_slong; if (false) { val <<= _AP_W; val >>= _AP_W; } ret |= val; return ret; } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( ap_ulong op1, const ap_range_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_ap_slong + 1, false> operator,( const ap_bit_ref<_AP_W, _AP_S> &op1, ap_ulong op2) { ap_int_base<_AP_SIZE_ap_slong + 1, false> val(op2); val[_AP_SIZE_ap_slong] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_SIZE_ap_slong + 1, false> operator,( ap_ulong op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { ap_int_base<_AP_SIZE_ap_slong + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_ap_slong, false> operator,( const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, ap_ulong op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W + _AP_W2, false> val(op2); ap_int_base<_AP_SIZE_ap_slong + _AP_W + _AP_W2, false> ret(op1); if (false) { val <<= _AP_W + _AP_W2; val >>= _AP_W + _AP_W2; } ret <<= _AP_SIZE_ap_slong; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_W2 + _AP_SIZE_ap_slong, false> operator,( ap_ulong op1, const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W + _AP_W2, false> val(op1); ap_int_base<_AP_SIZE_ap_slong + _AP_W + _AP_W2, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, ap_ulong op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, 
false> val(op2); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op1); if (false) { val <<= _AP_W; val >>= _AP_W; } ret <<= _AP_SIZE_ap_slong; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W + _AP_SIZE_ap_slong, false> operator,( ap_ulong op1, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> val(op1); ap_int_base<_AP_SIZE_ap_slong + _AP_W, false> ret(op2); int len = op2.length(); val <<= len; ret |= val; return ret; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_ap_slong, false> operator,( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, ap_ulong op2) { ap_int_base<_AP_SIZE_ap_slong + 1, false> val(op2); val[_AP_SIZE_ap_slong] = op1; return val; } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<1 + _AP_SIZE_ap_slong, false> operator,( ap_ulong op1, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { ap_int_base<_AP_SIZE_ap_slong + 1, false> val(op1); val <<= 1; val[0] = op2; return val; } +# 1348 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator <<( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, int rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() << int(rhs); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator <<( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, unsigned int rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() << int(rhs); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator <<( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, long rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() << int(rhs); } +template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator <<( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, unsigned long rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() << int(rhs); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator <<( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, ap_slong rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() << int(rhs); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator <<( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, ap_ulong rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() << int(rhs); } + +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator >>( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, int rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() >> int(rhs); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator >>( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, unsigned int rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() >> int(rhs); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator >>( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, long rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() >> int(rhs); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator >>( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, unsigned long rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() >> int(rhs); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator >>( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, ap_slong rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() >> int(rhs); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint<_AP_W + _AP_W1> operator 
>>( const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, ap_ulong rhs) { return ap_uint<_AP_W + _AP_W1>(lhs).get() >> int(rhs); } +# 13 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" 2 + + + + +template +struct ap_int : ap_int_base<_AP_W, true> { + typedef ap_int_base<_AP_W, true> Base; + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int() {} + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const ap_int<_AP_W2>& op): Base((ap_int_base<_AP_W2, true>)op){ + + + + op.checkOverflowCsim(_AP_W, true); + + Base::V = op.V; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const volatile ap_int<_AP_W2>& op) { + const_cast& >(op).checkOverflowCsim(_AP_W, true); + Base::V = op.V; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const ap_uint<_AP_W2>& op): Base((ap_int_base<_AP_W2, false>)op){ + + + + op.checkOverflowCsim(_AP_W, true); + + Base::V = op.V; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const volatile ap_uint<_AP_W2>& op) { + const_cast& >(op).checkOverflowCsim(_AP_W, true); + Base::V = op.V; + } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(unsigned V __attribute__((bitwidth(_AP_W))), bool raw): Base(0) { + Base::V = V; + (void)(raw); + } + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) + : Base(ref) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& 
op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int( + const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int( + const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const ap_int_base<_AP_W2, _AP_S2>& op):Base(op) { + + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(bool val):Base(val){ this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(char val):Base(val){ this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(signed char val):Base(val){ this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(unsigned char val):Base(val){ this->checkOverflowBaseC(val); } + inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_int(short val):Base(val){ this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(unsigned short val):Base(val){ this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(int val):Base(val){ this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(unsigned int val):Base(val){ this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(long val):Base(val){ this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(unsigned long val):Base(val){ this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(ap_slong val):Base(val){ this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(ap_ulong val):Base(val){ this->checkOverflowBaseC(val); } + + ap_int(double val) : Base(val) {} + ap_int(float val) : Base(val) {} + ap_int(half val) : Base(val) {} + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const char* s) : Base(s) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int(const char* s, signed char rd) : Base(s, rd) {} + + + + + ap_int &operator=(const ap_int<_AP_W> &op2) = default; +# 161 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int& operator=(const volatile ap_int<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=(const ap_int<_AP_W>& op2) volatile { + *((volatile typename Base::Base::DataType *)(&(Base::V))) = op2.V; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=(const volatile ap_int<_AP_W>& op2) 
volatile { + *((volatile typename Base::Base::DataType *)(&(Base::V))) = op2.V; + } + +}; + + + + +template +struct ap_uint : ap_int_base<_AP_W, false> { + typedef ap_int_base<_AP_W, false> Base; + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint() {} + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const ap_uint<_AP_W2>& op):Base((ap_int_base<_AP_W2, false>)op) { + op.checkOverflowCsim(_AP_W, false); + + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const ap_int<_AP_W2>& op):Base((ap_int_base<_AP_W2, true>)op) { + op.checkOverflowCsim(_AP_W, false); + + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const volatile ap_uint<_AP_W2>& op) { + const_cast& >(op).checkOverflowCsim(_AP_W, false); + Base::V = op.V; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const volatile ap_int<_AP_W2>& op) { + const_cast& >(op).checkOverflowCsim(_AP_W, false); + Base::V = op.V; + } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(unsigned V __attribute__((bitwidth(_AP_W))), bool raw):Base(0) { + Base::V = V; + (void)(raw); + } + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) + : Base(ref) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) 
ap_uint(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint( + const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint( + const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const ap_int_base<_AP_W2, _AP_S2>& op):Base(op) { + + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(bool val):Base(val) { this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(char val):Base(val) { this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(signed char val):Base(val) { this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(unsigned char val):Base(val) { this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(short val):Base(val) { this->checkOverflowBaseC(val); } + inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_uint(unsigned short val):Base(val) { this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(int val):Base(val) { this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(unsigned int val):Base(val) { this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(long val):Base(val) { this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(unsigned long val):Base(val) { this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(ap_slong val):Base(val) { this->checkOverflowBaseC(val); } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(ap_ulong val):Base(val) { this->checkOverflowBaseC(val); } + + ap_uint(double val) : Base(val) {} + ap_uint(float val) : Base(val) {} + ap_uint(half val) : Base(val) {} + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const char* s) : Base(s) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint(const char* s, signed char rd) : Base(s, rd) {} + + + + + + ap_uint &operator=(const ap_uint<_AP_W> &op2) = default; +# 317 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_uint& operator=(const volatile ap_uint<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=(const ap_uint<_AP_W>& op2) volatile { + *((volatile typename Base::Base::DataType *)(&(Base::V))) = op2.V; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=(const volatile ap_uint<_AP_W>& op2) volatile { + *((volatile typename Base::Base::DataType *)(&(Base::V))) = op2.V; + } + +}; +# 352 
"/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_special.h" 1 +# 20 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_special.h" +namespace std { +template class complex; +} + + + + + + + +namespace std { +# 48 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_special.h" +template +class complex > { + public: + typedef ap_int<_AP_W> _Tp; + typedef _Tp value_type; + + + + + __attribute__((nodebug)) complex() : _M_real(_Tp()), _M_imag(_Tp()) {} + + + __attribute__((nodebug)) complex(const _Tp &__r, const _Tp &__i = _Tp(0)) + : _M_real(__r), _M_imag(__i) {} + + + template + __attribute__((nodebug)) complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} + + + __attribute__((nodebug)) const _Tp& real() const { return _M_real; } + const _Tp& imag() const { return _M_imag; } + + + + + + + + __attribute__((nodebug)) void real(_Tp __val) { _M_real = __val; } + + __attribute__((nodebug)) void imag(_Tp __val) { _M_imag = __val; } + + + + __attribute__((nodebug)) complex<_Tp> &operator=(const _Tp __t) { + _M_real = __t; + _M_imag = _Tp(0); + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator+=(const _Tp &__t) { + _M_real += __t; + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator-=(const _Tp &__t) { + _M_real -= __t; + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator*=(const _Tp &__t) { + _M_real *= __t; + _M_imag *= __t; + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator/=(const _Tp &__t) { + _M_real /= __t; + _M_imag /= __t; + return *this; + } + + + template + __attribute__((nodebug)) complex<_Tp> &operator=(const complex<_Up> &__z) { + _M_real = __z.real(); + _M_imag = __z.imag(); + return *this; + } + + + + template + __attribute__((nodebug)) 
complex<_Tp> &operator+=(const complex<_Up> &__z) { + _M_real += __z.real(); + _M_imag += __z.imag(); + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator-=(const complex<_Up> &__z) { + _M_real -= __z.real(); + _M_imag -= __z.imag(); + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator*=(const complex<_Up> &__z) { + const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); + _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); + _M_real = __r; + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator/=(const complex<_Up> &__z) { + complex<_Tp> cj (__z.real(), -__z.imag()); + complex<_Tp> a = (*this) * cj; + complex<_Tp> b = cj * __z; + _M_real = a.real() / b.real(); + _M_imag = a.imag() / b.real(); + return *this; + } + + private: + _Tp _M_real; + _Tp _M_imag; + +}; +# 181 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_special.h" +template +inline __attribute__((nodebug)) bool operator==(const complex > &__x, const ap_int<_AP_W> &__y) { + return __x.real() == __y && + __x.imag() == 0; +} + + +template +inline __attribute__((nodebug)) bool operator==(const ap_int<_AP_W> &__x, const complex > &__y) { + return __x == __y.real() && + 0 == __y.imag(); +} + + +template +inline __attribute__((nodebug)) bool operator!=(const complex > &__x, const ap_int<_AP_W> &__y) { + return __x.real() != __y || + __x.imag() != 0; +} + + +template +inline __attribute__((nodebug)) bool operator!=(const ap_int<_AP_W> &__x, const complex > &__y) { + return __x != __y.real() || + 0 != __y.imag(); +} + +} +# 353 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" 2 + + + + + + + +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" 1 +# 10 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" +# 1 
"/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" 1 +# 16 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" 1 +# 17 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" 2 +# 87 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" +template +struct _ap_fixed_factory; +template +struct _ap_fixed_factory<_AP_W2, _AP_I2, true> { + typedef ap_fixed<_AP_W2, _AP_I2> type; +}; +template +struct _ap_fixed_factory<_AP_W2, _AP_I2, false> { + typedef ap_ufixed<_AP_W2, _AP_I2> type; +}; +# 108 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" +template +struct ap_fixed_base : ssdm_int<_AP_W, _AP_S> { + public: + typedef ssdm_int<_AP_W, _AP_S> Base; + static const int width = _AP_W; + static const int iwidth = _AP_I; + static const ap_q_mode qmode = _AP_Q; + static const ap_o_mode omode = _AP_O; + + + template + struct RType { + enum { + _AP_F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2, + mult_w = _AP_W + _AP_W2, + mult_i = _AP_I + _AP_I2, + mult_s = _AP_S || _AP_S2, + plus_w = ((_AP_I + (_AP_S2 && !_AP_S)) > (_AP_I2 + (_AP_S && !_AP_S2)) ? (_AP_I + (_AP_S2 && !_AP_S)) : (_AP_I2 + (_AP_S && !_AP_S2))) + + 1 + ((_AP_F) > (F2) ? (_AP_F) : (F2)), + plus_i = + ((_AP_I + (_AP_S2 && !_AP_S)) > (_AP_I2 + (_AP_S && !_AP_S2)) ? (_AP_I + (_AP_S2 && !_AP_S)) : (_AP_I2 + (_AP_S && !_AP_S2))) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + ((_AP_I + (_AP_S2 && !_AP_S)) > (_AP_I2 + (_AP_S && !_AP_S2)) ? (_AP_I + (_AP_S2 && !_AP_S)) : (_AP_I2 + (_AP_S && !_AP_S2))) + 1 + + ((_AP_F) > (F2) ? (_AP_F) : (F2)), + minus_i = + ((_AP_I + (_AP_S2 && !_AP_S)) > (_AP_I2 + (_AP_S && !_AP_S2)) ? 
(_AP_I + (_AP_S2 && !_AP_S)) : (_AP_I2 + (_AP_S && !_AP_S2))) + 1, + minus_s = true, + + div_w = _AP_S2 + _AP_W + ((F2) > (0) ? (F2) : (0)), + + + + div_i = _AP_S2 + _AP_I + F2, + div_s = _AP_S || _AP_S2, + logic_w = + ((_AP_I + (_AP_S2 && !_AP_S)) > (_AP_I2 + (_AP_S && !_AP_S2)) ? (_AP_I + (_AP_S2 && !_AP_S)) : (_AP_I2 + (_AP_S && !_AP_S2))) + + ((_AP_F) > (F2) ? (_AP_F) : (F2)), + logic_i = ((_AP_I + (_AP_S2 && !_AP_S)) > (_AP_I2 + (_AP_S && !_AP_S2)) ? (_AP_I + (_AP_S2 && !_AP_S)) : (_AP_I2 + (_AP_S && !_AP_S2))), + logic_s = _AP_S || _AP_S2 + }; + + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> lhs; + typedef ap_fixed_base<_AP_W2, _AP_I2, _AP_S2> rhs; + + typedef ap_fixed_base mult_base; + typedef ap_fixed_base plus_base; + typedef ap_fixed_base minus_base; + typedef ap_fixed_base logic_base; + typedef ap_fixed_base div_base; + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> arg1_base; + + typedef typename _ap_fixed_factory::type mult; + typedef typename _ap_fixed_factory::type plus; + typedef typename _ap_fixed_factory::type minus; + typedef typename _ap_fixed_factory::type logic; + typedef typename _ap_fixed_factory::type div; + typedef typename _ap_fixed_factory<_AP_W, _AP_I, _AP_S>::type arg1; + }; + + private: +# 295 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) void report() {} + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void overflow_adjust(bool underflow, bool overflow, bool lD, + bool sign) { + if (!underflow && !overflow) return; + if (_AP_O == AP_WRAP) { + if (_AP_N == 0) return; + if (_AP_S) { + + + Base::V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(sign) __Repl2__ = !!sign; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), _AP_W - 1, _AP_W - 1); __Result__; }); + if (_AP_N > 1) { + + ap_int_base<_AP_W, false> mask(-1); + 
if (sign) mask.V = 0; + Base::V = + ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(mask.V) __Repl2__ = mask.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), _AP_W - _AP_N, _AP_W - 2); __Result__; }); + } + } else { + + ap_int_base<_AP_W, false> mask(-1); + Base::V = + ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(mask.V) __Repl2__ = mask.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), _AP_W - _AP_N, _AP_W - 1); __Result__; }); + } + } else if (_AP_O == AP_SAT_ZERO) { + Base::V = 0; + } else if (_AP_O == AP_WRAP_SM && _AP_S) { + bool Ro = ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; }); + if (_AP_N == 0) { + if (lD != Ro) { + Base::V = ~Base::V; + Base::V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(lD) __Repl2__ = !!lD; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), _AP_W - 1, _AP_W - 1); __Result__; }); + } + } else { + if (_AP_N == 1 && sign != Ro) { + Base::V = ~Base::V; + } else if (_AP_N > 1) { + bool lNo = ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - _AP_N); __Result__; }); + if (lNo == sign) Base::V = ~Base::V; + ap_int_base<_AP_W, false> mask(-1); + if (sign) mask.V = 0; + Base::V = + ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(mask.V) __Repl2__ = mask.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), _AP_W - _AP_N, _AP_W - 2); __Result__; }); + } + Base::V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; typeof(sign) __Repl2__ = !!sign; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), 
(void*)(&__Repl2__), _AP_W - 1, _AP_W - 1); __Result__; }); + } + } else { + if (_AP_S) { + if (overflow) { + Base::V = 1; + Base::V <<= _AP_W - 1; + Base::V = ~Base::V; + } else if (underflow) { + Base::V = 1; + Base::V <<= _AP_W - 1; + if (_AP_O == AP_SAT_SYM) Base::V |= 1; + } + } else { + if (overflow) + Base::V = ~(ap_int_base<_AP_W, false>(0).V); + else if (underflow) + Base::V = 0; + } + } + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool quantization_adjust(bool qb, bool r, bool s) { + bool carry = (bool)({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; }); + if (_AP_Q == AP_TRN) return false; + if (_AP_Q == AP_RND_ZERO) + qb &= s || r; + else if (_AP_Q == AP_RND_MIN_INF) + qb &= r; + else if (_AP_Q == AP_RND_INF) + qb &= !s || r; + else if (_AP_Q == AP_RND_CONV) + qb &= ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), 0); __Result__; }) || r; + else if (_AP_Q == AP_TRN_ZERO) + qb = s && (qb || r); + Base::V += qb; + return carry && (!(bool)({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; })); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) int countLeadingOnes() const { + + + + + return 0; + + } + + public: + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base() {} + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op):Base(op.V) { + + operator=(op); + + + + report(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base( + const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(op); + report(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) 
ap_fixed_base(const ap_int_base<_AP_W2, _AP_S2>& op):Base(op.V) { + ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp(0); + tmp.V = op.V; + + operator=(tmp); + + + + report(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { + ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; + tmp.V = op.V; + operator=(tmp); + report(); + } +# 458 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const char* s) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_I, _AP_S, _AP_Q, + _AP_O, _AP_N, true); + Base::V = t; + } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const char* s, signed char rd) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_I, _AP_S, _AP_Q, + _AP_O, _AP_N, true); + Base::V = t; + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + *this = ((bool)op); + report(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const ap_range_ref<_AP_W2, _AP_S2>& op) { + *this = (ap_int_base<_AP_W2, false>(op)); + report(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) { + *this = (ap_int_base<_AP_W2 + _AP_W3, false>(op)); + report(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + *this = (bool(op)); + report(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + *this = (ap_int_base<_AP_W2, 
false>(op)); + report(); + } +# 526 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const bool x) { ap_fixed_base<(1), (1), (false)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const char x) { ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const signed char x) { ap_fixed_base<(8), (8), (true)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const unsigned char x) { ap_fixed_base<(8), (8), (false)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const short x) { ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const unsigned short x) { ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const int x) { ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const unsigned int x) { ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const long x) { ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const unsigned long x) { ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const 
ap_slong x) { ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)> tmp; tmp.V = x; *this = tmp; } + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(const ap_ulong x) { ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)> tmp; tmp.V = x; *this = tmp; } + + + + + + + + __attribute__((nodebug)) ap_fixed_base(double d) { + ap_int_base<64, false> ireg; + ireg.V = doubleToRawBits(d); + bool isneg = ({ typeof(ireg.V) __Val2__ = ireg.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), 63); __Result__; }); + + ap_int_base<11 + 1, true> exp, bias = ((1 << (11 - 1)) - 1); + ap_int_base<11, false> exp_tmp; + exp_tmp.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(ireg.V) __Val2__ = ireg.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 52, 52 + 11 - 1); __Result__; }); + exp = exp_tmp - bias; + ap_int_base<52 + 2, true> man; + man.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(ireg.V) __Val2__ = ireg.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, 52 - 1); __Result__; }); + + + (static_cast(0)); + + man.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(man.V) __Val2__ = man.V; typeof(1) __Repl2__ = !!1; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), 52, 52); __Result__; }); + if (isneg) man = -man; + if ((ireg.V & 0x7fffffffffffffffLL) == 0) { + Base::V = 0; + } else { + int _AP_W2 = 52 + 2, _AP_I2 = exp.V + 2, _AP_F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2; + bool _AP_S2 = true, + QUAN_INC = F2 > _AP_F && + !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); + bool carry = false; + + unsigned sh_amt = (F2 > _AP_F) ? F2 - _AP_F : _AP_F - F2; + if (F2 == _AP_F) + Base::V = man.V; + else if (F2 > _AP_F) { + if (sh_amt < 52 + 2) + Base::V = man.V >> sh_amt; + else { + Base::V = isneg ? 
-1 : 0; + } + if ((_AP_Q != AP_TRN) && !((_AP_Q == AP_TRN_ZERO) && !_AP_S2)) { + bool qb = (F2 - _AP_F > _AP_W2) ? isneg : (bool)({ typeof(man.V) __Val2__ = man.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), F2 - _AP_F - 1); __Result__; }); + + bool r = + (F2 > _AP_F + 1) + ? ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(man.V) __Val2__ = man.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, (F2 - _AP_F - 2 < _AP_W2) ? (F2 - _AP_F - 2) : (_AP_W2 - 1)); __Result__; }) != 0 + + + : false; + carry = quantization_adjust(qb, r, isneg); + } + } else { + Base::V = man.V; + if (sh_amt < _AP_W) + Base::V = Base::V << sh_amt; + else + Base::V = 0; + } + + if ((_AP_O != AP_WRAP || _AP_N != 0) && + ((!_AP_S && _AP_S2) || + _AP_I - _AP_S < + _AP_I2 - _AP_S2 + + (QUAN_INC || + (_AP_S2 && (_AP_O == AP_SAT_SYM))))) { + bool deleted_zeros = _AP_S2 ? true : !carry, deleted_ones = true; + bool neg_src = isneg; + bool lD = false; + int pos1 = F2 - _AP_F + _AP_W; + int pos2 = F2 - _AP_F + _AP_W + 1; + bool newsignbit = ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; }); + if (pos1 < _AP_W2 && pos1 >= 0) + + lD = (man.V >> pos1) & 1; + if (pos1 < _AP_W2) { + bool Range1_all_ones = true; + bool Range1_all_zeros = true; + bool Range2_all_ones = true; + ap_int_base<52 + 2, false> Range2; + ap_int_base<52 + 2, false> all_ones(-1); + + if (pos2 >= 0 && pos2 < _AP_W2) { + + + Range2.V = man.V; + Range2.V >>= pos2; + Range2_all_ones = Range2 == (all_ones >> pos2); + } else if (pos2 < 0) + Range2_all_ones = false; + if (pos1 >= 0 && pos2 < _AP_W2) { + Range1_all_ones = Range2_all_ones && lD; + Range1_all_zeros = !Range2.V && !lD; + } else if (pos2 == _AP_W2) { + Range1_all_ones = lD; + Range1_all_zeros = !lD; + } else if (pos1 < 0) { + Range1_all_zeros = !man.V; + Range1_all_ones = false; + } + + deleted_zeros = + deleted_zeros && (carry ? 
Range1_all_ones : Range1_all_zeros); + deleted_ones = + carry ? Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; + neg_src = isneg && !(carry && Range1_all_ones); + } else + neg_src = isneg && newsignbit; + bool neg_trg = _AP_S && newsignbit; + bool overflow = (neg_trg || !deleted_zeros) && !isneg; + bool underflow = (!neg_trg || !deleted_ones) && neg_src; + if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) + underflow |= + neg_src && + (_AP_W > 1 ? ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, _AP_W - 2); __Result__; }) == 0 + : true); + overflow_adjust(underflow, overflow, lD, neg_src); + } + } + report(); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(float d) { *this = ap_fixed_base(double(d)); } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base(half d) { *this = ap_fixed_base(double(d)); } +# 800 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + + const int _AP_F = _AP_W - _AP_I; + const int F2 = _AP_W2 - _AP_I2; + const int QUAN_INC = + F2 > _AP_F && !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); + + if (!op) Base::V = 0; + bool carry = false; + bool signbit = ({ typeof(op.V) __Val2__ = op.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W2 - 1); __Result__; }); + bool isneg = signbit && _AP_S2; + if (F2 == _AP_F) + Base::V = op.V; + else if (F2 > _AP_F) { + unsigned int sh_amt = F2 - _AP_F; + + if (sh_amt < _AP_W2) { + Base::V = op.V >> sh_amt; + } else { + Base::V = isneg ? 
-1 : 0; + } + if (_AP_Q != AP_TRN && !(_AP_Q == AP_TRN_ZERO && !_AP_S2)) { + bool qbit = ({ typeof(op.V) __Val2__ = op.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), F2 - _AP_F - 1); __Result__; }); + + bool qb = (F2 - _AP_F > _AP_W2) ? _AP_S2 && signbit : qbit; + enum { hi = ((F2 - _AP_F - 2) < _AP_W2) ? (F2 - _AP_F - 2) : (_AP_W2 - 1) }; + + bool r = (F2 > _AP_F + 1) ? (({ typename _ap_type::remove_const::type __Result__ = 0; typeof(op.V) __Val2__ = op.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, hi); __Result__; }) != 0) : false; + carry = quantization_adjust(qb, r, isneg); + } + } else { + unsigned sh_amt = _AP_F - F2; + + if (sh_amt < _AP_W) { + if (_AP_W > _AP_W2) { + + Base::V = op.V; + Base::V <<= sh_amt; + } else { + + Base::V = op.V << sh_amt; + } + } else { + Base::V = 0; + } + } + + if ((_AP_O != AP_WRAP || _AP_N != 0) && + ((!_AP_S && _AP_S2) || + _AP_I - _AP_S < + _AP_I2 - _AP_S2 + + (QUAN_INC || (_AP_S2 && _AP_O == AP_SAT_SYM)))) { + bool deleted_zeros = _AP_S2 ? true : !carry; + bool deleted_ones = true; + bool neg_src = isneg; + bool newsignbit = ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; }); + enum { pos1 = F2 - _AP_F + _AP_W, pos2 = F2 - _AP_F + _AP_W + 1 }; + bool lD = (pos1 < _AP_W2 && pos1 >= 0) ? 
({ typeof(op.V) __Val2__ = op.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), pos1); __Result__; }) + : false; + if (pos1 < _AP_W2) { + bool Range1_all_ones = true; + bool Range1_all_zeros = true; + bool Range2_all_ones = true; + ap_int_base<_AP_W2, false> all_ones(-1); + + if (pos2 < _AP_W2 && pos2 >= 0) { + ap_int_base<_AP_W2, false> Range2(0); + Range2.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(op.V) __Val2__ = op.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), pos2, _AP_W2 - 1); __Result__; }); + Range2_all_ones = Range2 == (all_ones >> pos2); + } else if (pos2 < 0) { + Range2_all_ones = false; + } + + if (pos1 >= 0 && pos2 < _AP_W2) { + ap_int_base<_AP_W2, false> Range1(0); + Range1.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(op.V) __Val2__ = op.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), pos1, _AP_W2 - 1); __Result__; }); + Range1_all_ones = Range1 == (all_ones >> pos1); + Range1_all_zeros = !Range1.V; + } else if (pos2 == _AP_W2) { + Range1_all_ones = lD; + Range1_all_zeros = !lD; + } else if (pos1 < 0) { + Range1_all_zeros = !op.V; + Range1_all_ones = false; + } + + deleted_zeros = + deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); + deleted_ones = + carry ? Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; + neg_src = isneg && !(carry && Range1_all_ones); + } else + neg_src = isneg && newsignbit; + bool neg_trg = _AP_S && newsignbit; + bool overflow = (neg_trg || !deleted_zeros) && !isneg; + bool underflow = (!neg_trg || !deleted_ones) && neg_src; + if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) + underflow |= + neg_src && + (_AP_W > 1 ? 
({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, _AP_W - 2); __Result__; }) == 0 + : true); + + overflow_adjust(underflow, overflow, lD, neg_src); + } + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator=( + const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(const_cast&>(op)); + return *this; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& setBits(ap_ulong bv) { + + Base::V = bv; + return *this; + } + + + static inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base bitsToFixed(ap_ulong bv) { + + ap_fixed_base t; + + t.V = bv; + + + + return t; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) void checkOverflowCsimFix(int _ap_w2, bool _ap_s2) const { +# 973 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + (void)_ap_w2; + (void)_ap_s2; + + return; + } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<((_AP_I) > (1) ? (_AP_I) : (1)), _AP_S> to_ap_int_base( + bool Cnative = true) const { + ap_int_base<((_AP_I) > (1) ? 
(_AP_I) : (1)), _AP_S> ret(0); + if (_AP_I == 0) { + ret.V = 0; + } else if (_AP_I > 0 && _AP_I <= _AP_W) { + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), _AP_W - _AP_I, _AP_W - 1); __Result__; }); + } else if (_AP_I > _AP_W) { + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, _AP_W - 1); __Result__; }); + ret.V <<= (_AP_I - _AP_W); + } +# 1002 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + if (Cnative && _AP_I < _AP_W) { + + if (_AP_S && ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; }) && (_AP_I < _AP_W) && + (({ typename _ap_type::remove_const::type __Result__ = 0; typeof(Base::V) __Val2__ = Base::V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, _AP_I < 0 ? 
_AP_W - 1 : _AP_W - _AP_I - 1); __Result__; }) != (unsigned long)0)) + + + ret = ret + 1; + } else { + + } + return ret; + }; + + public: + template + inline __attribute__((always_inline)) __attribute__((nodebug)) operator ap_int_base<_AP_W2, _AP_S2>() const { + return ap_int_base<_AP_W2, _AP_S2>(to_ap_int_base()); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) char to_char() const { return to_ap_int_base().to_char(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int to_int() const { return to_ap_int_base().to_int(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned to_uint() const { return to_ap_int_base().to_uint(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_slong to_int64() const { return to_ap_int_base().to_int64(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ulong to_uint64() const { return to_ap_int_base().to_uint64(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int<_AP_I> to_ap_int() const { return ap_int<_AP_I>(to_ap_int_base()); } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) double to_double() const { + + + + + enum { BITS = 52 + 11 + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; }); + ap_int_base<_AP_W, false> tmp(0); + if (s) + tmp.V = -Base::V; + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); + int e = _AP_I - l - 1 + ((1 << (11 - 1)) - 1); + int lsb_index = _AP_W - l - 1 - 52; + + bool a = (lsb_index >=2) ? + (({ typename _ap_type::remove_const::type __Result__ = 0; typeof(tmp.V) __Val2__ = tmp.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, lsb_index - 2); __Result__; }) != 0) : 0; + + a |= (lsb_index >=0) ? 
({ typeof(tmp.V) __Val2__ = tmp.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), lsb_index); __Result__; }) : 0; + + ap_ulong m = 0; + + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (ap_ulong)(tmp.V >> (lsb_index - 1)) + : (ap_ulong)(tmp.V << (1 - lsb_index)); + } else { + m = (ap_ulong)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + + + if (({ typeof(m) __Val2__ = m; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), 52 + 1); __Result__; })) { + e += 1; + } + + m = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(m) __Val2__ = m; typeof(s) __Repl2__ = !!s; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), BITS - 1, BITS - 1); __Result__; }); + + m = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(m) __Val2__ = m; typeof(e) __Repl2__ = e; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), 52, 52 + 11 - 1); __Result__; }); + + + return rawBitsToDouble(m); + } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) float to_float() const { + + + + + enum { BITS = 23 + 8 + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; }); + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); + int e = _AP_I - l - 1 + ((1 << (8 - 1)) - 1); + int lsb_index = _AP_W - l - 1 - 23; + + bool a = (lsb_index >=2) ? + (({ typename _ap_type::remove_const::type __Result__ = 0; typeof(tmp.V) __Val2__ = tmp.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, lsb_index - 2); __Result__; }) != 0) : 0; + + a |= (lsb_index >=0) ? 
({ typeof(tmp.V) __Val2__ = tmp.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), lsb_index); __Result__; }) : 0; + + unsigned long m; + + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (unsigned long)(tmp.V >> (lsb_index - 1)) + : (unsigned long)(tmp.V << (1 - lsb_index)); + } else { + m = (unsigned long)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + + if (({ typeof(m) __Val2__ = m; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), 23 + 1); __Result__; })) { + e += 1; + } + + m = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(m) __Val2__ = m; typeof(s) __Repl2__ = !!s; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), BITS - 1, BITS - 1); __Result__; }); + m = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(m) __Val2__ = m; typeof(e) __Repl2__ = e; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), 23, 23 + 8 - 1); __Result__; }); + + return rawBitsToFloat(m); + } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) half to_half() const { + + + + + enum { BITS = 10 + 5 + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; }); + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); + int e = _AP_I - l - 1 + ((1 << (5 - 1)) - 1); + int lsb_index = _AP_W - l - 1 - 10; + + bool a = (lsb_index >=2) ? + (({ typename _ap_type::remove_const::type __Result__ = 0; typeof(tmp.V) __Val2__ = tmp.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), 0, lsb_index - 2); __Result__; }) != 0) : 0; + + a |= (lsb_index >=0) ? 
({ typeof(tmp.V) __Val2__ = tmp.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), lsb_index); __Result__; }) : 0; + + unsigned short m; + + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (unsigned short)(tmp.V >> (lsb_index - 1)) + : (unsigned short)(tmp.V << (1 - lsb_index)); + } else { + m = (unsigned short)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + + if (({ typeof(m) __Val2__ = m; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), 10 + 1); __Result__; })) { + e += 1; + } + + m = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(m) __Val2__ = m; typeof(s) __Repl2__ = !!s; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), BITS - 1, BITS - 1); __Result__; }); + m = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(m) __Val2__ = m; typeof(e) __Repl2__ = e; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), 10, 10 + 5 - 1); __Result__; }); + + return rawBitsToHalf(m); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator long double() const { return (long double)to_double(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator double() const { return to_double(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator float() const { return to_float(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator half() const { return to_half(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator bool() const { return (bool)Base::V != 0; } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator char() const { return (char)to_int(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator signed char() const { return (signed char)to_int(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator unsigned char() 
const { return (unsigned char)to_uint(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator short() const { return (short)to_int(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator unsigned short() const { return (unsigned short)to_uint(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator int() const { return to_int(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator unsigned int() const { return to_uint(); } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator long() const { return (long)to_int64(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator unsigned long() const { return (unsigned long)to_uint64(); } + + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator ap_ulong() const { return to_uint64(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator ap_slong() const { return to_int64(); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int length() const { return _AP_W; }; +# 1231 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) int countLeadingZeros() const { + + + if (_AP_W <= 32) { + ap_int_base<32, false> t(-1ULL); + t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1); + return __builtin_ctz(t.V); + } else if (_AP_W <= 64) { + ap_int_base<64, false> t(-1ULL); + t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1); + return __builtin_ctzll(t.V); + } else { + enum {__N = (_AP_W + 63) / 64}; + int NZeros = 0; + int i = 0; + bool hitNonZero = false; + VITIS_LOOP_1247_1: for (i = 0; i < __N - 1; ++i) { + ap_int_base<64, false> t; + t.range(0, 63) = this->range(_AP_W - i * 64 - 64, _AP_W - i * 64 - 1); + NZeros += hitNonZero ? 
0 : __builtin_clzll(t.V); + hitNonZero |= (t != 0); + } + if (!hitNonZero) { + ap_int_base<64, false> t(-1ULL); + t.range(63 - (_AP_W - 1) % 64, 63) = this->range(0, (_AP_W - 1) % 64); + NZeros += __builtin_clzll(t.V); + } + return NZeros; + } + + + + } + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W2, _AP_I2, _AP_S2>::mult operator*( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) + const { + typename RType<_AP_W2, _AP_I2, _AP_S2>::mult_base r(0), t(0); + r.V = Base::V; + t.V = op2.V; + r.V *= op2.V; + return r; + } + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W2, _AP_I2, _AP_S2>::div operator/( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) + const { + typename RType<_AP_W2, _AP_I2, _AP_S2>::div_base r; +# 1301 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + ap_fixed_base<_AP_W + ((_AP_W2 - _AP_I2) > (0) ? 
(_AP_W2 - _AP_I2) : (0)),_AP_I, _AP_S> t(*this); + + + + r.V = t.V / op2.V; +# 1334 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + return r; + } +# 1349 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + template inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W2, _AP_I2, _AP_S2>::plus operator +( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { typename RType<_AP_W2, _AP_I2, _AP_S2>::plus_base ret(0), lhs(*this), rhs(op2); ret.V = lhs.V + rhs.V; return ret; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W2, _AP_I2, _AP_S2>::minus operator -( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { typename RType<_AP_W2, _AP_I2, _AP_S2>::minus_base ret(0), lhs(*this), rhs(op2); ret.V = lhs.V - rhs.V; return ret; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W2, _AP_I2, _AP_S2>::logic operator &( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { typename RType<_AP_W2, _AP_I2, _AP_S2>::logic_base ret(0), lhs(*this), rhs(op2); ret.V = lhs.V & rhs.V; return ret; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W2, _AP_I2, _AP_S2>::logic operator |( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { typename RType<_AP_W2, _AP_I2, _AP_S2>::logic_base ret(0), lhs(*this), rhs(op2); ret.V = lhs.V | rhs.V; return ret; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) typename RType<_AP_W2, _AP_I2, _AP_S2>::logic operator ^( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { typename RType<_AP_W2, _AP_I2, _AP_S2>::logic_base ret(0), lhs(*this), rhs(op2); ret.V = lhs.V ^ rhs.V; return ret; } +# 1367 
"/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator *=( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { *this = operator *(op2); return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator /=( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { *this = operator /(op2); return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator +=( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { *this = operator +(op2); return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator -=( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { *this = operator -(op2); return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator &=( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { *this = operator &(op2); return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator |=( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { *this = operator |(op2); return *this; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator ^=( const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { *this = operator ^(op2); return *this; } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator++() { + operator+=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1)); + return *this; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator--() { + operator-=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1)); + return *this; + } + + 
+ inline __attribute__((always_inline)) __attribute__((nodebug)) const ap_fixed_base operator++(int) { + ap_fixed_base r(*this); + operator++(); + return r; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) const ap_fixed_base operator--(int) { + ap_fixed_base r(*this); + operator--(); + return r; + } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) typename _ap_fixed_factory<_AP_W, _AP_I, _AP_S>::type operator+() { return *this; } + + inline __attribute__((always_inline)) __attribute__((nodebug)) typename _ap_fixed_factory<_AP_W + 1, _AP_I + 1, true>::type operator-() const { + ap_fixed_base<_AP_W + 1, _AP_I + 1, true> r(*this); + r.V = -r.V; + return r; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> getNeg() { + ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> r(*this); + r.V = -r.V; + return r; + } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator!() const { return Base::V == (unsigned long)0; } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S> operator~() const { + ap_fixed_base<_AP_W, _AP_I, _AP_S> r(0); + r.V = ~Base::V; + return r; + } + + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> lshift() const { + ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> r(0); + r.V = Base::V; + return r; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> rshift() const { + ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> r(0); + r.V = Base::V; + return r; + } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base operator<<(unsigned int sh) const { + ap_fixed_base r(0); + r.V = Base::V << sh; +# 1485 
"/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + return r; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base operator>>(unsigned int sh) const { + ap_fixed_base r(0); + r.V = Base::V >> sh; +# 1507 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + return r; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base operator<<(int sh) const { + ap_fixed_base r(0); + bool isNeg = sh < 0; + unsigned int ush = isNeg ? -sh : sh; + if (isNeg) { + return operator>>(ush); + } else { + return operator<<(ush); + } + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base operator>>(int sh) const { + bool isNeg = sh < 0; + unsigned int ush = isNeg ? -sh : sh; + if (isNeg) { + return operator<<(ush); + } else { + return operator>>(ush); + } + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base operator<<(const ap_int_base<_AP_W2, true>& op2) const { + + + int sh = op2.to_int(); + return operator<<(sh); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base operator>>(const ap_int_base<_AP_W2, true>& op2) const { + int sh = op2.to_int(); + return operator>>(sh); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base operator<<(const ap_int_base<_AP_W2, false>& op2) const { + unsigned int sh = op2.to_uint(); + return operator<<(sh); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base operator>>(const ap_int_base<_AP_W2, false>& op2) const { + unsigned int sh = op2.to_uint(); + return operator>>(sh); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base operator<<( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + op2) { + return 
operator<<(op2.to_ap_int_base()); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base operator>>( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + op2) { + return operator>>(op2.to_ap_int_base()); + } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator<<=(const int sh) { + *this = operator<<(sh); + return *this; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator<<=(const unsigned int sh) { + *this = operator<<(sh); + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator<<=(const ap_int_base<_AP_W2, _AP_S2>& sh) { + *this = operator<<(sh.to_int()); + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator<<=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + sh) { + *this = operator<<(sh.to_int()); + return *this; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator>>=(const int sh) { + *this = operator>>(sh); + return *this; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator>>=(const unsigned int sh) { + *this = operator>>(sh); + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator>>=(const ap_int_base<_AP_W2, _AP_S2>& sh) { + *this = operator>>(sh.to_int()); + return *this; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base& operator>>=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + sh) { + *this = operator>>(sh.to_int()); + return *this; + } +# 1651 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator 
>(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; if (_AP_F == F2) return Base::V > op2.V; else if (_AP_F > F2) return Base::V > ap_fixed_base<((_AP_W2 + _AP_F - F2) > (1) ? (_AP_W2 + _AP_F - F2) : (1)), _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>(op2).V; else return ap_fixed_base<((_AP_W + F2 - _AP_F + 1) > (1) ? (_AP_W + F2 - _AP_F + 1) : (1)), _AP_I + 1, _AP_S, _AP_Q, _AP_O, _AP_N>(*this).V > op2.V; return false; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; if (_AP_F == F2) return Base::V < op2.V; else if (_AP_F > F2) return Base::V < ap_fixed_base<((_AP_W2 + _AP_F - F2) > (1) ? (_AP_W2 + _AP_F - F2) : (1)), _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>(op2).V; else return ap_fixed_base<((_AP_W + F2 - _AP_F + 1) > (1) ? (_AP_W + F2 - _AP_F + 1) : (1)), _AP_I + 1, _AP_S, _AP_Q, _AP_O, _AP_N>(*this).V < op2.V; return false; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; if (_AP_F == F2) return Base::V >= op2.V; else if (_AP_F > F2) return Base::V >= ap_fixed_base<((_AP_W2 + _AP_F - F2) > (1) ? (_AP_W2 + _AP_F - F2) : (1)), _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>(op2).V; else return ap_fixed_base<((_AP_W + F2 - _AP_F + 1) > (1) ? 
(_AP_W + F2 - _AP_F + 1) : (1)), _AP_I + 1, _AP_S, _AP_Q, _AP_O, _AP_N>(*this).V >= op2.V; return false; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; if (_AP_F == F2) return Base::V <= op2.V; else if (_AP_F > F2) return Base::V <= ap_fixed_base<((_AP_W2 + _AP_F - F2) > (1) ? (_AP_W2 + _AP_F - F2) : (1)), _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>(op2).V; else return ap_fixed_base<((_AP_W + F2 - _AP_F + 1) > (1) ? (_AP_W + F2 - _AP_F + 1) : (1)), _AP_I + 1, _AP_S, _AP_Q, _AP_O, _AP_N>(*this).V <= op2.V; return false; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; if (_AP_F == F2) return Base::V == op2.V; else if (_AP_F > F2) return Base::V == ap_fixed_base<((_AP_W2 + _AP_F - F2) > (1) ? (_AP_W2 + _AP_F - F2) : (1)), _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>(op2).V; else return ap_fixed_base<((_AP_W + F2 - _AP_F + 1) > (1) ? (_AP_W + F2 - _AP_F + 1) : (1)), _AP_I + 1, _AP_S, _AP_Q, _AP_O, _AP_N>(*this).V == op2.V; return false; } + template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) const { enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; if (_AP_F == F2) return Base::V != op2.V; else if (_AP_F > F2) return Base::V != ap_fixed_base<((_AP_W2 + _AP_F - F2) > (1) ? (_AP_W2 + _AP_F - F2) : (1)), _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>(op2).V; else return ap_fixed_base<((_AP_W + F2 - _AP_F + 1) > (1) ? 
(_AP_W + F2 - _AP_F + 1) : (1)), _AP_I + 1, _AP_S, _AP_Q, _AP_O, _AP_N>(*this).V != op2.V; return false; } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >(double d) const { return to_double() > d; } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <(double d) const { return to_double() < d; } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=(double d) const { return to_double() >= d; } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=(double d) const { return to_double() <= d; } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==(double d) const { return to_double() == d; } + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=(double d) const { return to_double() != d; } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( + unsigned index) { + (static_cast(0)); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( + const ap_int_base<_AP_W2, _AP_S2>& index) { + (static_cast(0)); + (static_cast(0)); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, + index.to_int()); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator[](unsigned index) const { + (static_cast(0)); + return ({ typeof(const_cast(this)->V) __Val2__ = const_cast(this)->V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), index); __Result__; }); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( + unsigned index) { + (static_cast(0)); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); + } + + template + 
inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( + const ap_int_base<_AP_W2, _AP_S2>& index) { + (static_cast(0)); + (static_cast(0)); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, + index.to_int()); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool bit(unsigned index) const { + (static_cast(0)); + return ({ typeof(const_cast(this)->V) __Val2__ = const_cast(this)->V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), index); __Result__; }); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( + const ap_int_base<_AP_W2, true>& index) { + (static_cast(0)); + + (static_cast(0)); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + this, index.to_int() + _AP_W - _AP_I); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool get_bit(int index) const { + (static_cast(0)); + (static_cast(0)); + return ({ typeof(const_cast(this)->V) __Val2__ = const_cast(this)->V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), index + _AP_W - _AP_I); __Result__; }); + + } +# 1737 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool get_bit(const ap_int_base<_AP_W2, true>& index) const { + (static_cast(0)); + (static_cast(0)); + return ({ typeof(const_cast(this)->V) __Val2__ = const_cast(this)->V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), index.to_int() + _AP_W - _AP_I); __Result__; }); + + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range(int Hi, + int Lo) { + (static_cast(0)); + return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, Hi, Lo); + } + + + inline __attribute__((always_inline)) 
__attribute__((nodebug)) af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + int Hi, int Lo) const { + (static_cast(0)); + return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + const_cast(this), Hi, Lo); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() { + return this->range(_AP_W - 1, 0); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() const { + return this->range(_AP_W - 1, 0); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + int Hi, int Lo) { + return this->range(Hi, Lo); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + int Hi, int Lo) const { + return this->range(Hi, Lo); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + inline __attribute__((always_inline)) 
__attribute__((nodebug)) af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool is_zero() const { return Base::V == (unsigned long)0; } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool is_neg() const { + if (_AP_S && ({ typeof(Base::V) __Val2__ = Base::V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), _AP_W - 1); __Result__; })) return true; + return false; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int wl() const { return _AP_W; } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int iwl() const { return _AP_I; } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_q_mode q_mode() const { return _AP_Q; } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_o_mode o_mode() const { return _AP_O; } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int n_bits() const { return _AP_N; } +# 1920 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) char* to_string(unsigned char radix = 2, bool sign = _AP_S) const { + (void)(radix); + (void)(sign); + return 0; + } + +}; + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) void b_not( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { + ret.V = ~op.V; +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) void b_and( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V & op2.V; +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) void b_or( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V | op2.V; +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) void b_xor( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V ^ op2.V; +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) void neg( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + ap_fixed_base<_AP_W2 + !_AP_S2, _AP_I2 + !_AP_S2, true, _AP_Q2, _AP_O2, + _AP_N2> + t(0); + t.V = -op.V; + ret = t; +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) void lshift( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op, + int i) { + enum { + F2 = _AP_W2 - _AP_I2, + _AP_I3 = ((_AP_I) > (_AP_I2) ? (_AP_I) : (_AP_I2)), + _AP_W3 = _AP_I3 + F2, + }; + + ap_fixed_base<_AP_W3, _AP_I3, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t(0); + t.V = op.V; + t.V <<= i; + + ret = t; +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) void rshift( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op, + int i) { + enum { + F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2, + F3 = ((F) > (F2) ? 
(F) : (F2)), + _AP_W3 = _AP_I2 + F3, + sh = F - F2, + }; + + ap_fixed_base<_AP_W3, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t(0); + t.V = op.V; + if (sh >= 0) + t.V <<= (int) sh; + t.V >>= i; + + ret = t; +} +# 2372 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator +(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::plus operator +( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator -(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::minus operator -( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator *(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, 
_AP_S>::template RType< (1), (1), (false)>::mult operator *( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator /(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::div operator /( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator &(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::logic operator &( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator |(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::logic operator |( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator ^(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::logic operator ^( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator >>(ap_int_base<(1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (1), (1), (false)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator <<(ap_int_base<(1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator +=(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator -=(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, 
_AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator *=(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator /=(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator &=(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator |=(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator ^=(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator >>=(ap_int_base<(1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator <<=(ap_int_base<(1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, 
_AP_O, _AP_N>& op, bool i_op) { return op.operator >(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator <(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator >=(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator <=(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator ==(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool i_op) { return op.operator !=(ap_fixed_base<(1), (1), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( bool i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(1), (1), (false)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator +(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::plus operator +( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator -(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::minus operator -( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), 
(CHAR_IS_SIGNED)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator *(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::mult operator *( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator /(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::div operator /( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator &(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::logic operator &( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), 
(CHAR_IS_SIGNED)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator |(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::logic operator |( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator ^(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::logic operator ^( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator >>(ap_int_base<(8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (CHAR_IS_SIGNED)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator <<(ap_int_base<(8), 
(CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator +=(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator -=(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator *=(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator /=(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator &=(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator |=(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return 
op.operator ^=(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator >>=(ap_int_base<(8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator <<=(ap_int_base<(8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator >(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator <(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator >=(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, 
_AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator <=(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator ==(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char i_op) { return op.operator !=(ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (CHAR_IS_SIGNED)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator +(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, 
_AP_S>::template RType< (8), (8), (true)>::plus operator +( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator -(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::minus operator -( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator *(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::mult operator *( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator /(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::div operator /( signed char i_op, const 
ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator &(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::logic operator &( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator |(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::logic operator |( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator ^(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::logic operator ^( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return 
ap_fixed_base<(8), (8), (true)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator >>(ap_int_base<(8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (true)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator <<(ap_int_base<(8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator +=(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator -=(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator *=(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator /=(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, 
_AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator &=(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator |=(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator ^=(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator >>=(ap_int_base<(8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator <<=(ap_int_base<(8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator >(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator <(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) bool operator <( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator >=(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator <=(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator ==(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char i_op) { return op.operator !=(ap_fixed_base<(8), (8), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( 
signed char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (true)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator +(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::plus operator +( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator -(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::minus operator -( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator *(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::mult operator *( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator /(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::div operator /( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator &(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::logic operator &( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator |(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::logic operator |( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), 
(false)>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator ^(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::logic operator ^( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator >>(ap_int_base<(8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (8), (8), (false)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator <<(ap_int_base<(8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator +=(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator -=(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, 
_AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator *=(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator /=(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator &=(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator |=(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator ^=(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator >>=(ap_int_base<(8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator <<=(ap_int_base<(8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool 
operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator >(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator <(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator >=(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator <=(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator ==(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char i_op) { return op.operator !=(ap_fixed_base<(8), (8), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned char i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(8), (8), (false)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator +(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::plus operator +( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator -(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::minus operator -( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator *(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::mult operator *( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator /(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::div operator /( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::logic operator &( const ap_fixed_base<_AP_W, 
_AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator &(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::logic operator &( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator |(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::logic operator |( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator ^(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::logic operator ^( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator ^(op); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator >>(ap_int_base<(_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (true)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator <<(ap_int_base<(_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator +=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator -=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator *=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator /=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, 
_AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator &=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator |=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator ^=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator >>=(ap_int_base<(_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator <<=(ap_int_base<(_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator >(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) 
bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator <(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator >=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator <=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator ==(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), 
(true)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short i_op) { return op.operator !=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (true)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator +(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::plus operator +( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator -(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::minus operator -( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return 
ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator *(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::mult operator *( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator /(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::div operator /( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator &(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::logic operator &( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator |(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::logic operator |( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator ^(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::logic operator ^( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template 
RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator >>(ap_int_base<(_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_short), (_AP_SIZE_short), (false)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator <<(ap_int_base<(_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator +=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator -=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator *=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator /=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( 
ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator &=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator |=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator ^=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator >>=(ap_int_base<(_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator <<=(ap_int_base<(_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator >(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator >(op); } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator <(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator >=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator <=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator ==(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short i_op) { return op.operator !=(ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned short i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_short), (_AP_SIZE_short), (false)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator +(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::plus operator +( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator -(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::minus operator -( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { 
return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator *(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::mult operator *( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator /(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::div operator /( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator &(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< 
(_AP_SIZE_int), (_AP_SIZE_int), (true)>::logic operator &( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator |(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::logic operator |( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator ^(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::logic operator ^( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator >>(ap_int_base<(_AP_SIZE_int), (true)>(i_op)); } template 
inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (true)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator <<(ap_int_base<(_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator +=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator -=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator *=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator /=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator &=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, 
_AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator |=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator ^=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator >>=(ap_int_base<(_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator <<=(ap_int_base<(_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator >(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator <(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), 
(true)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator >=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator <=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator ==(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int i_op) { return op.operator !=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return 
ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (true)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator +(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::plus operator +( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator -(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::minus operator -( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator *(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename 
ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::mult operator *( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator /(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::div operator /( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator &(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::logic operator &( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, 
_AP_N>& op, unsigned int i_op) { return op.operator |(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::logic operator |( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator ^(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::logic operator ^( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator >>(ap_int_base<(_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_int), (_AP_SIZE_int), (false)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator <<(ap_int_base<(_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) 
ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator +=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator -=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator *=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator /=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator &=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator |=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, 
unsigned int i_op) { return op.operator ^=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator >>=(ap_int_base<(_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator <<=(ap_int_base<(_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator >(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator <(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator >=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), 
(false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator <=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator ==(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int i_op) { return op.operator !=(ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned int i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_int), (_AP_SIZE_int), (false)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, 
_AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator +(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::plus operator +( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator -(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::minus operator -( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator *(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::mult operator *( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return 
ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator /(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::div operator /( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator &(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::logic operator &( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator |(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, 
_AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::logic operator |( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator ^(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::logic operator ^( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator >>(ap_int_base<(_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (true)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator <<(ap_int_base<(_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator +=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator -=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator *=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator /=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator &=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator |=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator ^=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, 
_AP_N>& op, long i_op) { return op.operator >>=(ap_int_base<(_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator <<=(ap_int_base<(_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator >(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator <(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator >=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool 
operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator <=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator ==(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long i_op) { return op.operator !=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (true)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator +(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), 
(_AP_SIZE_long), (false)>::plus operator +( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator -(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::minus operator -( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator *(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::mult operator *( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return 
op.operator /(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::div operator /( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator &(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::logic operator &( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator |(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::logic operator |( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator |(op); } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator ^(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::logic operator ^( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator >>(ap_int_base<(_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_long), (_AP_SIZE_long), (false)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator <<(ap_int_base<(_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator +=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator -=(ap_fixed_base<(_AP_SIZE_long), 
(_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator *=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator /=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator &=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator |=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator ^=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator >>=(ap_int_base<(_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, 
_AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator <<=(ap_int_base<(_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator >(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator <(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator >=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return 
op.operator <=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator ==(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long i_op) { return op.operator !=(ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned long i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_long), (_AP_SIZE_long), (false)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator +(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::plus 
operator +( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator -(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::minus operator -( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator *(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::mult operator *( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { 
return op.operator /(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::div operator /( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator &(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::logic operator &( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator |(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::logic operator |( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator |(op); } 
template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator ^(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::logic operator ^( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator >>(ap_int_base<(_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator <<(ap_int_base<(_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator +=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, 
ap_slong i_op) { return op.operator -=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator *=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator /=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator &=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator |=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator ^=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator >>=(ap_int_base<(_AP_SIZE_ap_slong), (true)>(i_op)); } template 
inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator <<=(ap_int_base<(_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator >(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator <(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator >=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const 
ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator <=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator ==(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong i_op) { return op.operator !=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( ap_slong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (true)>(i_op).operator !=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator +(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename 
ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::plus operator +( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator -(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::minus operator -( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator *(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::mult operator *( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), 
(_AP_SIZE_ap_slong), (false)>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator /(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::div operator /( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator &(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::logic operator &( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator |(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::logic operator |( ap_ulong i_op, const 
ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator ^(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::logic operator ^( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::lhs operator >>( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator >>(ap_int_base<(_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< (_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>::lhs operator <<( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator <<(ap_int_base<(_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator +=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator -=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator *=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator /=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator &=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator |=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator ^=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, 
_AP_O, _AP_N>& operator >>=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator >>=(ap_int_base<(_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator <<=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator <<=(ap_int_base<(_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator >(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator >(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator <(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator <(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator >=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { 
return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator >=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator <=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator <=(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator ==(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator ==(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong i_op) { return op.operator !=(ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( ap_ulong i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<(_AP_SIZE_ap_slong), (_AP_SIZE_ap_slong), (false)>(i_op).operator !=(op); } +# 2460 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< _AP_W, _AP_I, _AP_S>::plus 
operator +( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator +(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< _AP_W2, _AP_W2, _AP_S2>::plus operator +( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator +(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< _AP_W, _AP_I, _AP_S>::minus operator -( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator -(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< _AP_W2, _AP_W2, _AP_S2>::minus operator -( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator -(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< _AP_W, _AP_I, _AP_S>::mult operator *( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator *(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< _AP_W2, _AP_W2, _AP_S2>::mult operator *( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator *(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } +template inline 
__attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< _AP_W, _AP_I, _AP_S>::div operator /( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator /(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< _AP_W2, _AP_W2, _AP_S2>::div operator /( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator /(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< _AP_W, _AP_I, _AP_S>::logic operator &( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator &(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< _AP_W2, _AP_W2, _AP_S2>::logic operator &( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator &(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< _AP_W, _AP_I, _AP_S>::logic operator |( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator |(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< _AP_W2, _AP_W2, _AP_S2>::logic operator |( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, 
const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator |(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< _AP_W, _AP_I, _AP_S>::logic operator ^( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator ^(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< _AP_W2, _AP_W2, _AP_S2>::logic operator ^( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator ^(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } + + + +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator +=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator +=(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W2, _AP_S2>& operator +=( ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return i_op.operator +=(op.to_ap_int_base()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator -=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator -=(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W2, _AP_S2>& operator -=( ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return i_op.operator -=(op.to_ap_int_base()); } 
+template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator *=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator *=(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W2, _AP_S2>& operator *=( ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return i_op.operator *=(op.to_ap_int_base()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator /=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator /=(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W2, _AP_S2>& operator /=( ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return i_op.operator /=(op.to_ap_int_base()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator &=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator &=(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W2, _AP_S2>& operator &=( ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return i_op.operator &=(op.to_ap_int_base()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator |=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { 
return op.operator |=(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W2, _AP_S2>& operator |=( ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return i_op.operator |=(op.to_ap_int_base()); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& operator ^=( ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator ^=(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W2, _AP_S2>& operator ^=( ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return i_op.operator ^=(op.to_ap_int_base()); } + + + +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator ==(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator ==(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator !=(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator 
!=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator >(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator >(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator >=(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator >=(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator <(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator <(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& i_op) { return op.operator <=(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const 
ap_int_base<_AP_W2, _AP_S2>& i_op, const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator <=(op); } + + + + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator==( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator==(op1); +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator!=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator!=(op1); +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator>( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator<(op1); +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator>=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator<=(op1); +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator<( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator>(op1); +} + +template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator<=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator>=(op1); +} +# 11 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" 2 +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_ref.h" 1 +# 25 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_ref.h" +template +struct af_bit_ref { + + + + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; + ref_type& d_bv; + int d_index; + + public: + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref( + const 
af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) { + + + + + + + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref(const ref_type* bv, int index = 0) + : d_bv(*const_cast(bv)), d_index(index) {} + + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator bool() const { return ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), d_index); __Result__; }); } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref& operator=(bool val) { + d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val) __Repl2__ = !!val; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), d_index, d_index); __Result__; }); + return *this; + } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref& operator=(const af_bit_ref& val) { + return operator=(bool(val)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(bool(val)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=(bool(val)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + return operator=(val != 0); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=(ap_int_base<_AP_W2, false>(val)); + } + + template + inline __attribute__((always_inline)) 
__attribute__((nodebug)) af_bit_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(ap_int_base<_AP_W2, false>(val)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_bit_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=(ap_int_base<_AP_W2 + _AP_W3, false>(val)); + } + + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, op); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( + const ap_bit_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >(*this, + op); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( + *this, op); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, + op); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref< + 1, af_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref< + 1, af_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, 
_AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, + op); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator==( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + return get() == op.get(); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator!=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + return get() != op.get(); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator~() const { + bool bit = ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), d_index); __Result__; }); + return bit ? 
false : true; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool get() const { return ({ typeof(d_bv.V) __Val2__ = d_bv.V; bool __Result__ = __builtin_bit_select((void*)(&__Val2__), d_index); __Result__; }); } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int length() const { return 1; } + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) char* to_string() const { return 0; } + +}; +# 212 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_ref.h" +template +struct af_range_ref { + + + + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; + ref_type& d_bv; + int l_index; + int h_index; + + public: + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref( + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref(ref_type* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) { +# 242 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_ref.h" + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref(const ref_type* bv, int h, int l) + : d_bv(*const_cast(bv)), l_index(l), h_index(h) { +# 254 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_ref.h" + } +# 266 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_ref.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const bool val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + 
inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const char val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const signed char val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const unsigned char val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const short val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const unsigned short val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), 
(void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const int val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const unsigned int val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const long val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const unsigned long val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const ap_slong val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = 
d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const ap_ulong val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const half val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const float val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const double val) { ap_int_base<_AP_W, false> loc(val); d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(loc.V) __Repl2__ = loc.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); return *this; } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const char* val) { + const ap_int_base<_AP_W, false> tmp(val); + 
d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(tmp.V) __Repl2__ = tmp.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); + return *this; + } + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { + d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val.V) __Repl2__ = val.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); + return *this; + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + const ap_int_base<_AP_W2, false> tmp(val); + return operator=(tmp); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + const ap_int_base<1, false> tmp((bool)val); + return operator=(tmp); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + val) { + d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val.V) __Repl2__ = val.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); + return *this; + } + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=(const af_range_ref& val) { + ap_int_base<_AP_W, false> tmp(val); + return operator=(tmp); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + ap_int_base<_AP_W2, 
false> tmp(val); + return operator=(tmp); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + ap_int_base<1, false> tmp((bool)val); + return operator=(tmp); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) af_range_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + const ap_int_base<_AP_W2 + _AP_W3, false> tmp(val); + return operator=(tmp); + } + + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop == rop; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator==(op2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop < rop; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop > rop; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator>(op2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<(op2)); + } + + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator==( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + 
ap_int_base<_AP_W2, false> rop(op2); + return lop == rop; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator!=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator==(op2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator<( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop < rop; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator>( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop > rop; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator<=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator>(op2)); + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator>=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator<(op2)); + } + + + + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, op); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + 
operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(op)); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &op) { + return ap_concat_ref< + _AP_W, af_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) + ap_concat_ref<_AP_W, af_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref< + _AP_W, af_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator ap_ulong() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return ret.to_uint64(); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) operator ap_int_base<_AP_W, false>() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; 
typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return ret; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> to_ap_int_base() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return ret; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) char to_char() const { + return (char)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int to_int() const { + return (int)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned to_uint() const { + return (unsigned)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) long to_long() const { + return (long)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned long to_ulong() const { + return (unsigned long)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; 
__builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_slong to_int64() const { + return (ap_slong)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ulong to_uint64() const { + return (ap_ulong)(({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; })); + } + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> operator~() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return (~ret); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> operator!() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return (!ret); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> operator+() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return ret; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> operator-() const { + ap_int_base<_AP_W, false> ret; + 
ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return (-ret); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_int_base<_AP_W, false> get() const { + ap_int_base<_AP_W, false> ret; + ret.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; __builtin_bit_part_select((void*)(&__Result__), (void*)(&__Val2__), l_index, h_index); __Result__; }); + return ret; + } + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) void set(const ap_int_base<_AP_W2, false>& val) { + d_bv.V = ({ typename _ap_type::remove_const::type __Result__ = 0; typeof(d_bv.V) __Val2__ = d_bv.V; typeof(val.V) __Repl2__ = val.V; __builtin_bit_part_set((void*)(&__Result__), (void*)(&__Val2__), (void*)(&__Repl2__), l_index, h_index); __Result__; }); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) int length() const { + return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; + } +# 615 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_ref.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) char* to_string(signed char rd = 2) const { + (void)(rd); + return 0; + } + +}; +# 679 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_ref.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( bool op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(1), (false)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( bool op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( bool op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(1), (false)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) bool operator <( bool op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( bool op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(1), (false)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( bool op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( bool op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(1), (false)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( bool op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, 
_AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( bool op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(1), (false)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( bool op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(1), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( bool op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(1), (false)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, bool op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( bool op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( char op2, const af_range_ref<_AP_W, 
_AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, 
_AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, 
_AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(8), (CHAR_IS_SIGNED)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (CHAR_IS_SIGNED)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, char op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( signed char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (true)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( signed char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { 
return ap_int_base<_AP_W, false>(op) < ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( signed char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (true)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( signed char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( signed char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (true)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( signed char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( signed char op2, const af_range_ref<_AP_W, _AP_I, 
_AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (true)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( signed char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( signed char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (true)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( signed char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(8), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( signed char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (true)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const 
af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, signed char op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( signed char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (false)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (false)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned char 
op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (false)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (false)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const 
af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (false)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(8), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned char op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(8), (false)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned char op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned char op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) bool operator >( short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) >= 
ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return 
bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(_AP_SIZE_short), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (true)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, short op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, 
_AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const 
af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return ap_int_base<_AP_W, false>(op) != 
ap_int_base<(_AP_SIZE_short), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned short op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_short), (false)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned short op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned short op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, 
_AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, 
_AP_O, _AP_N>& op, int op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(_AP_SIZE_int), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (true)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, int op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 
!= bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, 
unsigned int op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) 
__attribute__((nodebug)) bool operator ==( unsigned int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(_AP_SIZE_int), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned int op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_int), (false)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned int op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned int op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), 
(true)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return bool(op) 
>= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline 
__attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(_AP_SIZE_long), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (true)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, long op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( unsigned long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return ap_int_base<_AP_W, 
false>(op) < ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( unsigned long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( unsigned long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( 
unsigned long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( unsigned long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( unsigned long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(_AP_SIZE_long), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned long op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_long), (false)>(op2) != 
ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, unsigned long op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( unsigned long op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( ap_slong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( ap_slong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( ap_slong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, 
_AP_O, _AP_N>& op, ap_slong op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( ap_slong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( ap_slong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( ap_slong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( ap_slong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( ap_slong op2, const 
af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( ap_slong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( ap_slong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( ap_slong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (true)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_slong op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( ap_slong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool 
operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) > ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( ap_ulong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) > ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return bool(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( ap_ulong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) < ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( ap_ulong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return bool(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( ap_ulong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) >= ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); 
} template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( ap_ulong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return bool(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( ap_ulong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) <= ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( ap_ulong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return bool(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( ap_ulong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) == ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( ap_ulong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& 
op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return bool(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( ap_ulong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == bool(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return ap_int_base<_AP_W, false>(op) != ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( ap_ulong op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return ap_int_base<(_AP_SIZE_ap_slong), (false)>(op2) != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, ap_ulong op2) { return bool(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( ap_ulong op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != bool(op); } +# 725 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_ref.h" +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S>& op2) { return ap_int_base<_AP_W, false>(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > ap_int_base<_AP_W, false>(op); } 
template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op) > op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 > ap_int_base<1, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S>& op2) { return ap_int_base<_AP_W, false>(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op) < op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 < ap_int_base<1, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S>& op2) { return ap_int_base<_AP_W, false>(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) 
bool operator >=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op) >= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator >=( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 >= ap_int_base<1, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S>& op2) { return ap_int_base<_AP_W, false>(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op) <= op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator <=( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 <= ap_int_base<1, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S>& op2) { return ap_int_base<_AP_W, false>(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, 
_AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op) == op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator ==( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 == ap_int_base<1, false>(op); } +template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S>& op2) { return ap_int_base<_AP_W, false>(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != ap_int_base<_AP_W, false>(op); } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, const ap_int_base<_AP_W2, _AP_S2>& op2) { return ap_int_base<1, false>(op) != op2; } template inline __attribute__((always_inline)) __attribute__((nodebug)) bool operator !=( const ap_int_base<_AP_W2, _AP_S2>& op2, const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { return op2 != ap_int_base<1, false>(op); } +# 12 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" 2 + + + + + +template +struct ap_fixed : ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> { + typedef ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> Base; + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed() : Base() {} + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + 
_AP_O2, _AP_N2>& op) + : Base(op) {} +# 66 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(unsigned V __attribute__((bitwidth(_AP_W))), bool raw) { + Base::V = V; + (void)(raw); + } +# 101 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) + : Base(op) {} + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(bool v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(char v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(signed char v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(unsigned char v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(short v) : Base(v) {} + inline 
__attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(unsigned short v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(int v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(unsigned int v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(long v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(unsigned long v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(ap_slong v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(ap_ulong v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(half v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(float v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(double v) : Base(v) {} + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(const char* s) : Base(s) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed(const char* s, signed char rd) : Base(s, rd) {} + + + + + + + + ap_fixed & + operator=(const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &op) = default; +# 168 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_fixed& operator=( + const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=( + const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } +}; + + + + + +template +struct ap_ufixed : 
ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> { + typedef ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> Base; + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed() : Base() {} + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} +# 237 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(unsigned V __attribute__((bitwidth(_AP_W))), bool raw) { + Base::V = V; + (void)(raw); + } +# 269 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) + : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, 
_AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(bool v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(char v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(signed char v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(unsigned char v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(short v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(unsigned short v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(int v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(unsigned int v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(long v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(unsigned long v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(ap_slong v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(ap_ulong v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(half v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(float v) : Base(v) {} + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(double v) : Base(v) {} + + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(const char* s) : Base(s) {} + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed(const char* s, signed char rd) : Base(s, rd) {} + + + + ap_ufixed & + operator=(const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &op) = default; +# 327 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" + inline 
__attribute__((always_inline)) __attribute__((nodebug)) void operator=( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) ap_ufixed& operator=( + const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator=(const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, + _AP_N>& op) volatile { + Base::V = op.V; + } +}; +# 365 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_special.h" 1 +# 20 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_special.h" +namespace std { +template class complex; +} + + + + + + + +namespace std { +# 48 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_special.h" +template +class complex > { + public: + typedef ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> _Tp; + typedef _Tp value_type; + + + + + __attribute__((nodebug)) complex() : _M_real(_Tp()), _M_imag(_Tp()) {} + + + __attribute__((nodebug)) complex(const _Tp &__r, const _Tp &__i = _Tp(0)) + : _M_real(__r), _M_imag(__i) {} + + + template + __attribute__((nodebug)) complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} + + + const __attribute__((nodebug)) _Tp& real() const { return _M_real; } + const __attribute__((nodebug)) _Tp& imag() const { return _M_imag; } + + + + + + + + __attribute__((nodebug)) void real(_Tp __val) { _M_real = __val; } + + __attribute__((nodebug)) void imag(_Tp __val) { _M_imag = __val; } + + + + __attribute__((nodebug)) complex<_Tp> &operator=(const _Tp __t) { + _M_real = __t; + _M_imag = _Tp(0); + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator+=(const _Tp &__t) { + _M_real += __t; 
+ return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator-=(const _Tp &__t) { + _M_real -= __t; + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator*=(const _Tp &__t) { + _M_real *= __t; + _M_imag *= __t; + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator/=(const _Tp &__t) { + _M_real /= __t; + _M_imag /= __t; + return *this; + } + + + template + __attribute__((nodebug)) complex<_Tp> &operator=(const complex<_Up> &__z) { + auto tmp1 = __z; + complex<_Tp> tmp2 = {tmp1.real(), tmp1.imag()}; + *this = tmp2; + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator+=(const complex<_Up> &__z) { + _M_real += __z.real(); + _M_imag += __z.imag(); + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator-=(const complex<_Up> &__z) { + _M_real -= __z.real(); + _M_imag -= __z.imag(); + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator*=(const complex<_Up> &__z) { + const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); + _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); + _M_real = __r; + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator/=(const complex<_Up> &__z) { + complex<_Tp> cj (__z.real(), -__z.imag()); + complex<_Tp> a = (*this) * cj; + complex<_Tp> b = cj * __z; + _M_real = a.real() / b.real(); + _M_imag = a.imag() / b.real(); + return *this; + } + + private: + _Tp _M_real; + _Tp _M_imag; + +}; +# 190 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_special.h" +template +class complex > { + public: + typedef ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> _Tp; + typedef _Tp value_type; + + + + + __attribute__((nodebug)) complex() : _M_real(_Tp()), _M_imag(_Tp()) {} + + + __attribute__((nodebug)) complex(const _Tp &__r, const _Tp &__i = _Tp(0)) + : _M_real(__r), _M_imag(__i) {} + + + template + __attribute__((nodebug)) 
complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} + + + const __attribute__((nodebug)) _Tp& real() const { return _M_real; } + const __attribute__((nodebug)) _Tp& imag() const { return _M_imag; } + + + + + + + + __attribute__((nodebug)) void real(_Tp __val) { _M_real = __val; } + + __attribute__((nodebug)) void imag(_Tp __val) { _M_imag = __val; } + + + + __attribute__((nodebug)) complex<_Tp> &operator=(const _Tp __t) { + _M_real = __t; + _M_imag = _Tp(0); + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator+=(const _Tp &__t) { + _M_real += __t; + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator-=(const _Tp &__t) { + _M_real -= __t; + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator*=(const _Tp &__t) { + _M_real *= __t; + _M_imag *= __t; + return *this; + } + + + + __attribute__((nodebug)) complex<_Tp> &operator/=(const _Tp &__t) { + _M_real /= __t; + _M_imag /= __t; + return *this; + } + + + template + __attribute__((nodebug)) complex<_Tp> &operator=(const complex<_Up> &__z) { + auto tmp1 = __z; + complex<_Tp> tmp2 = {tmp1.real(), tmp1.imag()}; + *this = tmp2; + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator+=(const complex<_Up> &__z) { + _M_real += __z.real(); + _M_imag += __z.imag(); + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator-=(const complex<_Up> &__z) { + _M_real -= __z.real(); + _M_imag -= __z.imag(); + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator*=(const complex<_Up> &__z) { + const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); + _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); + _M_real = __r; + return *this; + } + + + + template + __attribute__((nodebug)) complex<_Tp> &operator/=(const complex<_Up> &__z) { + complex<_Tp> cj (__z.real(), -__z.imag()); + complex<_Tp> a = (*this) * cj; + complex<_Tp> b = cj * __z; + _M_real = 
a.real() / b.real(); + _M_imag = a.imag() / b.real(); + return *this; + } + + private: + _Tp _M_real; + _Tp _M_imag; + +}; +# 323 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_special.h" +template +inline __attribute__((nodebug)) bool operator==( + const complex > &__x, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { + return __x.real() == __y && + __x.imag() == 0; +} + + +template +inline __attribute__((nodebug)) bool operator==( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, + const complex > &__y) { + return __x == __y.real() && + 0 == __y.imag(); +} + + +template +inline __attribute__((nodebug)) bool operator!=( + const complex > &__x, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { + return __x.real() != __y || + __x.imag() != 0; +} + + +template +inline __attribute__((nodebug)) bool operator!=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, + const complex > &__y) { + return __x != __y.real() || + 0 != __y.imag(); +} + + +template +inline __attribute__((nodebug)) bool operator==( + const complex > &__x, + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { + return __x.real() == __y && + __x.imag() == 0; +} + + +template +inline __attribute__((nodebug)) bool operator==( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, + const complex > &__y) { + return __x == __y.real() && + 0 == __y.imag(); +} + + +template +inline __attribute__((nodebug)) bool operator!=( + const complex > &__x, + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { + return __x.real() != __y || + __x.imag() != 0; +} + + +template +inline __attribute__((nodebug)) bool operator!=( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, + const complex > &__y) { + return __x != __y.real() || + 0 != __y.imag(); +} + +} +# 366 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_fixed.h" 2 +# 361 
"/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/ap_int.h" 2 +# 2 "div.cpp" 2 +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/hls_stream.h" 1 +# 15 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/hls_stream.h" +# 1 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/hls_stream_39.h" 1 +# 26 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/hls_stream_39.h" +namespace hls { +# 52 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/hls_stream_39.h" +template +class stream; + +template +class stream<__STREAM_T__, 0> +{ + public: + using value_type = __STREAM_T__; + + inline __attribute__((always_inline)) __attribute__((nodebug)) stream() { + __fpga_set_stream_depth(&this->V, 0); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) stream(const char* name) { + (void)(name); + __fpga_set_stream_depth(&this->V, 0); + } + + + private: + inline __attribute__((always_inline)) __attribute__((nodebug)) stream(const stream< __STREAM_T__ >& chn):V(chn.V) { + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) stream& operator= (const stream< __STREAM_T__ >& chn) { + V = chn.V; + return *this; + } + + public: + + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator >> (__STREAM_T__& rdata) { + read(rdata); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) void operator << (const __STREAM_T__& wdata) { + write(wdata); + } + + + public: + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool empty() const { + return !__fpga_fifo_not_empty(&V); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool full() const { + return !__fpga_fifo_not_full(&V); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void read(__STREAM_T__& dout) { + __fpga_fifo_pop(&V, &dout); + } + + + inline 
__attribute__((noinline)) __attribute__((nodebug)) bool read_dep(__STREAM_T__& dout, volatile bool flag) { + __fpga_fifo_pop(&V, &dout); + return flag; + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) __STREAM_T__ read() { + __STREAM_T__ tmp; + read(tmp); + return tmp; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool read_nb(__STREAM_T__& dout) { + __STREAM_T__ tmp; + + if (__fpga_fifo_nb_pop(&V, &tmp)) { + dout = tmp; + return true; + } else { + return false; + } + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) void write(const __STREAM_T__& din) { + __fpga_fifo_push(&V, &din); + } + + + inline __attribute__((noinline)) __attribute__((nodebug)) bool write_dep(const __STREAM_T__& din, volatile bool flag) { + __fpga_fifo_push(&V, &din); + return flag; + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) bool write_nb(const __STREAM_T__& din) { + return __fpga_fifo_nb_push(&V, &din); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned size() const { + return __fpga_fifo_size(&V); + } + + + inline __attribute__((always_inline)) __attribute__((nodebug)) unsigned capacity() const { + return __fpga_fifo_capacity(&V); + } + + + void set_name(const char* name) { (void)(name); } + + public: + __STREAM_T__ V __attribute__((no_ctor)); +}; + +template +class stream : public stream<__STREAM_T__, 0> { + public: + inline __attribute__((always_inline)) __attribute__((nodebug)) stream() { + __fpga_set_stream_depth(&this->V, DEPTH); + } + + inline __attribute__((always_inline)) __attribute__((nodebug)) stream(const char* name) { + (void)(name); + __fpga_set_stream_depth(&this->V, DEPTH); + } +}; +} +# 16 "/mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/hls_stream.h" 2 +# 3 "div.cpp" 2 + + + + + + + +__attribute__((sdx_kernel("div", 0))) void div(hls::stream> &data_in_0, + hls::stream> &data_in_1, + hls::stream> &data_out_0) { +#line 
8 "/workspace/src/mase_components/hls/scalar_ops/int_div/vhls.tcl" +#pragma HLSDIRECTIVE TOP name=div +# 12 "div.cpp" + +#pragma HLS PIPELINE II = 1 + if (data_in_0.empty() || data_in_1.empty()) + return; + ap_int<32> in0; + ap_int<32> in1; + data_in_0.read_nb(in0); + data_in_1.read_nb(in1); + ap_int<16> res = in0 / in1; + + data_out_0.write_nb(res); +} diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp.clang-tidy.loop-label.diag.yml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp.clang-tidy.loop-label.diag.yml new file mode 100644 index 000000000..cd7aeb657 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp.clang-tidy.loop-label.diag.yml @@ -0,0 +1,49 @@ +--- +MainSourceFile: /workspace/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp +Diagnostics: + - DiagnosticName: xilinx-label-all-loops + Message: 'Added loop label VITIS_LOOP_1213_1: ' + FileOffset: 1213 + FilePath: /mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_base.h + Replacements: + - FilePath: /workspace/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp + Offset: 55884 + Length: 0 + ReplacementText: 'VITIS_LOOP_1213_1: ' + - DiagnosticName: xilinx-label-all-loops + Message: 'Added loop label VITIS_LOOP_676_1: ' + FileOffset: 676 + FilePath: /mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h + Replacements: + - FilePath: /workspace/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp + Offset: 283740 + Length: 0 + ReplacementText: 'VITIS_LOOP_676_1: ' + - DiagnosticName: xilinx-label-all-loops + Message: 'Added loop label VITIS_LOOP_690_1: ' + FileOffset: 690 + FilePath: /mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h + Replacements: + - FilePath: 
/workspace/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp + Offset: 284204 + Length: 0 + ReplacementText: 'VITIS_LOOP_690_1: ' + - DiagnosticName: xilinx-label-all-loops + Message: 'Added loop label VITIS_LOOP_704_1: ' + FileOffset: 704 + FilePath: /mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_int_ref.h + Replacements: + - FilePath: /workspace/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp + Offset: 284669 + Length: 0 + ReplacementText: 'VITIS_LOOP_704_1: ' + - DiagnosticName: xilinx-label-all-loops + Message: 'Added loop label VITIS_LOOP_1247_1: ' + FileOffset: 1247 + FilePath: /mnt/applications/Xilinx/23.1/Vitis_HLS/2023.1/common/technology/autopilot/etc/ap_fixed_base.h + Replacements: + - FilePath: /workspace/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp + Offset: 568011 + Length: 0 + ReplacementText: 'VITIS_LOOP_1247_1: ' +... diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp.clang.diag.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp.clang.diag.xml new file mode 100644 index 000000000..2cd98d6d3 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp.clang.diag.xml @@ -0,0 +1,39 @@ + + main-file + /workspace/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp + diagnostics + + + level + warning + filename + /workspace/src/mase_components/hls/scalar_ops/int_div/vhls.tcl + line + 8 + column + 9 + message + HLS pragma dump _XLX_SEP_ PragmaIsValid=1_XLX_SEP_ PragmaType=top_XLX_SEP_ PragmaContext=directive_XLX_SEP_ PragmaFunction=div_XLX_SEP_ PragmaOptions=name=div_XLX_SEP_ + ID + 5471 + WarningOption + dump-hls-pragmas + + + level + warning + filename + div.cpp + line + 13 + column + 9 + message + HLS pragma dump _XLX_SEP_ PragmaIsValid=1_XLX_SEP_ 
PragmaType=pipeline_XLX_SEP_ PragmaContext=_XLX_SEP_ PragmaFunction=div_XLX_SEP_ PragmaOptions=II = 1_XLX_SEP_ + ID + 5471 + WarningOption + dump-hls-pragmas + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp.clang.diag.yml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.pp.0.cpp.clang.diag.yml new file mode 100644 index 000000000..e69de29bb diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.protoinst b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.protoinst new file mode 100644 index 000000000..9cb1cf8fa --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.protoinst @@ -0,0 +1,19 @@ +{ + "version": "1.0", + "modules": { + "div": { + "proto_instances": { + "/AESL_inst_div_activity": { + "interface": "xilinx.com:interface:internal_hls_dataflow:1.0", + "ports": { + "AP_CLK": { "actual": "ap_clk"}, + "AP_DONE": { "actual": "ap_done"}, + "AP_READY": { "actual": "ap_ready"}, + "AP_RESET": { "actual": "ap_rst"}, + "AP_START": { "actual": "ap_start"} + } + } + } + } + } +} diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.rtl_wrap.cfg.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.rtl_wrap.cfg.tcl new file mode 100644 index 000000000..2e9ca80d3 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.rtl_wrap.cfg.tcl @@ -0,0 +1,54 @@ +set lang "C" +set moduleName "div" +set moduleIsExternC "1" +set rawDecl "" +set globalVariable "" +set PortList "" +set PortName "data_in_0" +set BitWidth "32" +set ArrayOpt "" +set Const "0" +set Volatile "0" +set Pointer "0" +set Reference "0" +set Dims [list 0] +set Interface "[list AP_STREAM 0]" +set DataType "int" +set Port [list $PortName $Interface $DataType $Pointer $Dims $Const $Volatile $ArrayOpt] +lappend PortList $Port +set PortName 
"data_in_1" +set BitWidth "32" +set ArrayOpt "" +set Const "0" +set Volatile "0" +set Pointer "0" +set Reference "0" +set Dims [list 0] +set Interface "[list AP_STREAM 0]" +set DataType "int" +set Port [list $PortName $Interface $DataType $Pointer $Dims $Const $Volatile $ArrayOpt] +lappend PortList $Port +set PortName "data_out_0" +set BitWidth "32" +set ArrayOpt "" +set Const "0" +set Volatile "0" +set Pointer "0" +set Reference "0" +set Dims [list 0] +set Interface "[list AP_STREAM 0]" +set DataType "int" +set Port [list $PortName $Interface $DataType $Pointer $Dims $Const $Volatile $ArrayOpt] +lappend PortList $Port +set globalAPint "" +set returnAPInt "" +set hasCPPAPInt 0 +set argAPInt "" +set hasCPPAPFix 0 +set hasSCFix 0 +set hasCBool 0 +set hasCPPComplex 0 +set isTemplateTop 0 +set hasHalf 0 +set dataPackList "" +set module [list $moduleName $PortList $rawDecl $argAPInt $returnAPInt $dataPackList] diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.sched.adb b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.sched.adb new file mode 100644 index 000000000..cfbd65e2c --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.sched.adb @@ -0,0 +1,1592 @@ + + + + + -1 + + + div + Pipeline + 0 + + 3 + 0 + + + + 1 + 1 + data_in_0 + + + 0 + + + + 0 + 0 + + data_in_0 + + + + + + 0 + 0 + 0 + + + 32 + + 0 + 3 + 0 + + 0 + 0 + + + + + + 1 + 2 + data_in_1 + + + 0 + + + + 0 + 0 + + data_in_1 + + + + + + 0 + 0 + 0 + + + 32 + + 0 + 3 + 0 + + 0 + 0 + + + + + + 1 + 3 + data_out_0 + + + 0 + + + + 0 + 0 + + data_out_0 + + + + + + 0 + 0 + 2531319821 + + + 16 + + 1 + 3 + 0 + + 0 + 0 + + + + + 14 + 0 + + + + 0 + 12 + tmp + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + + + + + 0 + 0 + 0 + + + 1 + + + 3 + 0 + 32 + 33 + 35 + + 
nbreadreq + 0 + 0 + 0 + 0 + 0.00 + 1 + -1 + + + + + 0 + 13 + br_ln14 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + + + + + 0 + 0 + 0 + + + 0 + + + 3 + 0 + 36 + 37 + 38 + + br + 0 + 0 + 0 + 0 + 0.00 + 2 + -1 + + + + + 0 + 15 + tmp_1 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + + + + + 0 + 0 + 0 + + + 1 + + + 3 + 0 + 39 + 40 + 41 + + nbreadreq + 0 + 0 + 0 + 0 + 0.00 + 3 + -1 + + + + + 0 + 16 + br_ln14 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 14 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 14 + + + + + + + + + + + 0 + 0 + 0 + + + 0 + + + 3 + 0 + 42 + 43 + 44 + + br + 0 + 0 + 0 + 0 + 0.00 + 4 + -1 + + + + + 0 + 18 + br_ln15 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 15 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 15 + + + + + + + + + + + 0 + 0 + 4294967295 + + + 0 + + + 1 + 0 + 45 + + br + 0 + 0 + 0 + 0 + 0.00 + 5 + -1 + + + + + 0 + 20 + data_in_0_read + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 18 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 18 + + + + + + + + + + + 0 + 0 + 0 + + + 33 + + + 3 + 0 + 47 + 48 + 147 + + nbread + 0 + 0 + 0 + 0 + 1.42 + 6 + -1 + + + + + 0 + 21 + in0 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 18 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 18 + + + + + in0 + + + + + + 0 + 0 + 0 + + + 32 + + + 1 + 0 + 49 + + extractvalue + 0 + 0 + 0 + 0 + 0.00 + 7 + -1 + + + + + 0 + 
22 + data_in_1_read + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 19 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 19 + + + + + + + + + + + 0 + 0 + 0 + + + 33 + + + 3 + 0 + 50 + 51 + 148 + + nbread + 0 + 0 + 0 + 0 + 1.42 + 8 + -1 + + + + + 0 + 23 + in1 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 19 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 19 + + + + + in1 + + + + + + 0 + 0 + 1764524928 + + + 32 + + + 1 + 0 + 52 + + extractvalue + 0 + 0 + 0 + 0 + 0.00 + 9 + -1 + + + + + 0 + 24 + sdiv_ln20 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 20 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 20 + + + + + + + + + + + 0 + 0 + 1775678400 + + + 16 + + + 2 + 0 + 53 + 54 + + sdiv + 0 + 0 + 0 + 0 + 1.16 + 10 + -1 + + + + + 0 + 25 + res + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 20 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 20 + + + + + res + + + + + + 0 + 0 + 0 + + + 16 + + + 1 + 0 + 55 + + trunc + 0 + 0 + 0 + 0 + 0.00 + 11 + -1 + + + + + 0 + 26 + empty + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 22 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 22 + + + + + + + + + + + 0 + 0 + 1775680256 + + + 1 + + + 3 + 0 + 57 + 58 + 59 + + nbwrite + 0 + 0 + 0 + 0 + 1.42 + 12 + -1 + + + + + 0 + 27 + br_ln23 + div.cpp + /workspace/src/mase_components/hls/scalar_ops/int_div + 23 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 23 + + + + + + + + + + + 0 + 0 + 1775678896 + + + 0 + + + 1 + 0 + 60 + + br + 0 + 0 + 0 + 0 + 0.00 + 13 + -1 + + + + + 0 + 29 + _ln23 + div.cpp + 
/workspace/src/mase_components/hls/scalar_ops/int_div + 23 + div + div + + 1 + 0 + + /workspace/src/mase_components/hls/scalar_ops/int_div + + 1 + 0 + + + div.cpp + div + + 23 + + + + + + + + + + + 0 + 0 + 1872 + + + 0 + + + 0 + 0 + + ret + 0 + 0 + 0 + 0 + 0.00 + 14 + -1 + + + + 1 + 0 + + + + 2 + 34 + empty + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 1775688224 + + + 32 + + 0 + 1 + + + + 5 + 0 + + + 3 + 14 + entry + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 2531278202 + + + + 2 + 0 + 12 + 13 + + + + + 3 + 17 + lor.lhs.false + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 1775671552 + + + + 2 + 0 + 15 + 16 + + + + + 3 + 19 + if.then + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 1775611760 + + + + 1 + 0 + 18 + + + + + 3 + 28 + if.end + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 0 + + + + 8 + 0 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + + + + + 3 + 30 + return + + + 0 + + + + 0 + 0 + + + + + + + + 0 + 0 + 0 + + + + 1 + 0 + 29 + + + + + 29 + 0 + + 33 + 1 + 1 + 12 + 0 + + + 35 + 1 + 34 + 12 + 0 + + + 36 + 1 + 12 + 13 + 0 + + + 37 + 2 + 19 + 13 + 0 + + + 38 + 2 + 17 + 13 + 0 + + + 40 + 1 + 2 + 15 + 0 + + + 41 + 1 + 34 + 15 + 0 + + + 42 + 1 + 15 + 16 + 0 + + + 43 + 2 + 19 + 16 + 0 + + + 44 + 2 + 28 + 16 + 0 + + + 45 + 2 + 30 + 18 + 0 + + + 48 + 1 + 1 + 20 + 0 + + + 49 + 1 + 20 + 21 + 0 + + + 51 + 1 + 2 + 22 + 0 + + + 52 + 1 + 22 + 23 + 0 + + + 53 + 1 + 21 + 24 + 0 + + + 54 + 1 + 23 + 24 + 0 + + + 55 + 1 + 24 + 25 + 0 + + + 58 + 1 + 3 + 26 + 0 + + + 59 + 1 + 25 + 26 + 0 + + + 60 + 2 + 30 + 27 + 0 + + + 141 + 2 + 14 + 17 + 0 + + + 142 + 2 + 14 + 19 + 0 + + + 143 + 2 + 17 + 28 + 0 + + + 144 + 2 + 17 + 19 + 0 + + + 145 + 2 + 19 + 30 + 0 + + + 146 + 2 + 28 + 30 + 0 + + + 147 + 4 + 12 + 20 + 0 + + + 148 + 4 + 15 + 22 + 0 + + + + + 1 + 0 + + 1 + div + div + 0 + + 0 + 0 + + + 5 + 0 + 14 + 17 + 19 + 28 + 30 + + 1 + 36 + -1 + -1 + 35 + 35 + 0 + + + + + + + 14 + 0 + + 12 + + 0 + 0 + + + + 13 + + 0 + 0 + + + + 15 + + 0 + 0 + + + + 16 + + 0 + 0 + + + + 18 + + 0 + 0 + + + + 20 + + 0 
+ 0 + + + + 21 + + 0 + 0 + + + + 22 + + 0 + 0 + + + + 23 + + 0 + 0 + + + + 24 + + 0 + 35 + + + + 25 + + 35 + 0 + + + + 26 + + 35 + 0 + + + + 27 + + 35 + 0 + + + + 29 + + 35 + 0 + + + + + 5 + 0 + + 14 + + 0 + 0 + + + + 17 + + 0 + 0 + + + + 19 + + 0 + 0 + + + + 28 + + 0 + 35 + + + + 30 + + 35 + 35 + + + + + 1 + 0 + + div + + 5 + 0 + 14 + 17 + 19 + 28 + 30 + + + 0 + 0 + + -1 + 8 + 1 + 36 + + 0 + 0 + + + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + 0 + 0 + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.sched.adb.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.sched.adb.xml new file mode 100644 index 000000000..63d0cb936 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.sched.adb.xml @@ -0,0 +1,1284 @@ +div + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +NULL + + + + + + + + + + + +FIFO + + + + + + + + + + + +NULL + + + + + + + + + + + +FIFO + + + + + + + + + + + + +NULL + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +FIFO + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +FIFO + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + 
+ + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + + + + +Divider + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +FIFO + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + +NULL + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.tbgen.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.tbgen.tcl new file mode 100644 index 000000000..25cc254b3 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.tbgen.tcl @@ -0,0 +1,127 @@ +set moduleName div +set isTopModule 1 +set isCombinational 0 +set isDatapathOnly 0 +set isPipelined 1 +set pipeline_type function +set 
FunctionProtocol ap_ctrl_hs +set isOneStateSeq 0 +set ProfileFlag 0 +set StallSigGenFlag 0 +set isEnableWaveformDebug 1 +set hasInterrupt 0 +set DLRegFirstOffset 0 +set DLRegItemOffset 0 +set C_modelName {div} +set C_modelType { void 0 } +set C_modelArgList { + { data_in_0 int 32 regular {fifo 0 volatile } } + { data_in_1 int 32 regular {fifo 0 volatile } } + { data_out_0 int 16 regular {fifo 1 volatile } } +} +set hasAXIMCache 0 +set C_modelArgMapList {[ + { "Name" : "data_in_0", "interface" : "fifo", "bitwidth" : 32, "direction" : "READONLY"} , + { "Name" : "data_in_1", "interface" : "fifo", "bitwidth" : 32, "direction" : "READONLY"} , + { "Name" : "data_out_0", "interface" : "fifo", "bitwidth" : 16, "direction" : "WRITEONLY"} ]} +# RTL Port declarations: +set portNum 15 +set portList { + { ap_clk sc_in sc_logic 1 clock -1 } + { ap_rst sc_in sc_logic 1 reset -1 active_high_sync } + { ap_start sc_in sc_logic 1 start -1 } + { ap_done sc_out sc_logic 1 predone -1 } + { ap_idle sc_out sc_logic 1 done -1 } + { ap_ready sc_out sc_logic 1 ready -1 } + { data_in_0_dout sc_in sc_lv 32 signal 0 } + { data_in_0_empty_n sc_in sc_logic 1 signal 0 } + { data_in_0_read sc_out sc_logic 1 signal 0 } + { data_in_1_dout sc_in sc_lv 32 signal 1 } + { data_in_1_empty_n sc_in sc_logic 1 signal 1 } + { data_in_1_read sc_out sc_logic 1 signal 1 } + { data_out_0_din sc_out sc_lv 16 signal 2 } + { data_out_0_full_n sc_in sc_logic 1 signal 2 } + { data_out_0_write sc_out sc_logic 1 signal 2 } +} +set NewPortList {[ + { "name": "ap_clk", "direction": "in", "datatype": "sc_logic", "bitwidth":1, "type": "clock", "bundle":{"name": "ap_clk", "role": "default" }} , + { "name": "ap_rst", "direction": "in", "datatype": "sc_logic", "bitwidth":1, "type": "reset", "bundle":{"name": "ap_rst", "role": "default" }} , + { "name": "ap_start", "direction": "in", "datatype": "sc_logic", "bitwidth":1, "type": "start", "bundle":{"name": "ap_start", "role": "default" }} , + { "name": "ap_done", "direction": 
"out", "datatype": "sc_logic", "bitwidth":1, "type": "predone", "bundle":{"name": "ap_done", "role": "default" }} , + { "name": "ap_idle", "direction": "out", "datatype": "sc_logic", "bitwidth":1, "type": "done", "bundle":{"name": "ap_idle", "role": "default" }} , + { "name": "ap_ready", "direction": "out", "datatype": "sc_logic", "bitwidth":1, "type": "ready", "bundle":{"name": "ap_ready", "role": "default" }} , + { "name": "data_in_0_dout", "direction": "in", "datatype": "sc_lv", "bitwidth":32, "type": "signal", "bundle":{"name": "data_in_0", "role": "dout" }} , + { "name": "data_in_0_empty_n", "direction": "in", "datatype": "sc_logic", "bitwidth":1, "type": "signal", "bundle":{"name": "data_in_0", "role": "empty_n" }} , + { "name": "data_in_0_read", "direction": "out", "datatype": "sc_logic", "bitwidth":1, "type": "signal", "bundle":{"name": "data_in_0", "role": "read" }} , + { "name": "data_in_1_dout", "direction": "in", "datatype": "sc_lv", "bitwidth":32, "type": "signal", "bundle":{"name": "data_in_1", "role": "dout" }} , + { "name": "data_in_1_empty_n", "direction": "in", "datatype": "sc_logic", "bitwidth":1, "type": "signal", "bundle":{"name": "data_in_1", "role": "empty_n" }} , + { "name": "data_in_1_read", "direction": "out", "datatype": "sc_logic", "bitwidth":1, "type": "signal", "bundle":{"name": "data_in_1", "role": "read" }} , + { "name": "data_out_0_din", "direction": "out", "datatype": "sc_lv", "bitwidth":16, "type": "signal", "bundle":{"name": "data_out_0", "role": "din" }} , + { "name": "data_out_0_full_n", "direction": "in", "datatype": "sc_logic", "bitwidth":1, "type": "signal", "bundle":{"name": "data_out_0", "role": "full_n" }} , + { "name": "data_out_0_write", "direction": "out", "datatype": "sc_logic", "bitwidth":1, "type": "signal", "bundle":{"name": "data_out_0", "role": "write" }} ]} + +set RtlHierarchyInfo {[ + {"ID" : "0", "Level" : "0", "Path" : "`AUTOTB_DUT_INST", "Parent" : "", "Child" : ["1"], + "CDFG" : "div", + "Protocol" : 
"ap_ctrl_hs", + "ControlExist" : "1", "ap_start" : "1", "ap_ready" : "1", "ap_done" : "1", "ap_continue" : "0", "ap_idle" : "1", "real_start" : "0", + "Pipeline" : "Aligned", "UnalignedPipeline" : "0", "RewindPipeline" : "0", "ProcessNetwork" : "0", + "II" : "1", + "VariableLatency" : "0", "ExactLatency" : "35", "EstimateLatencyMin" : "35", "EstimateLatencyMax" : "35", + "Combinational" : "0", + "Datapath" : "0", + "ClockEnable" : "0", + "HasSubDataflow" : "0", + "InDataflowNetwork" : "0", + "HasNonBlockingOperation" : "1", + "IsBlackBox" : "0", + "Port" : [ + {"Name" : "data_in_0", "Type" : "Fifo", "Direction" : "I"}, + {"Name" : "data_in_1", "Type" : "Fifo", "Direction" : "I"}, + {"Name" : "data_out_0", "Type" : "Fifo", "Direction" : "O"}]}, + {"ID" : "1", "Level" : "1", "Path" : "`AUTOTB_DUT_INST.sdiv_32ns_32ns_16_36_1_U1", "Parent" : "0"}]} + + +set ArgLastReadFirstWriteLatency { + div { + data_in_0 {Type I LastRead 0 FirstWrite -1} + data_in_1 {Type I LastRead 0 FirstWrite -1} + data_out_0 {Type O LastRead 35 FirstWrite -1}}} + +set hasDtUnsupportedChannel 0 + +set PerformanceInfo {[ + {"Name" : "Latency", "Min" : "35", "Max" : "35"} + , {"Name" : "Interval", "Min" : "1", "Max" : "1"} +]} + +set PipelineEnableSignalInfo {[ + {"Pipeline" : "0", "EnableSignal" : "ap_enable_pp0"} +]} + +set Spec2ImplPortList { + data_in_0 { ap_fifo { { data_in_0_dout fifo_port_we 0 32 } { data_in_0_empty_n fifo_status 0 1 } { data_in_0_read fifo_data 1 1 } } } + data_in_1 { ap_fifo { { data_in_1_dout fifo_port_we 0 32 } { data_in_1_empty_n fifo_status 0 1 } { data_in_1_read fifo_data 1 1 } } } + data_out_0 { ap_fifo { { data_out_0_din fifo_port_we 1 16 } { data_out_0_full_n fifo_status 0 1 } { data_out_0_write fifo_data 1 1 } } } +} + +set maxi_interface_dict [dict create] + +# RTL port scheduling information: +set fifoSchedulingInfoList { + data_in_0 { fifo_read 2 has_conditional } + data_in_1 { fifo_read 2 has_conditional } + data_out_0 { fifo_write 1 has_conditional } +} + +# 
RTL bus port read request latency information: +set busReadReqLatencyList { +} + +# RTL bus port write response latency information: +set busWriteResLatencyList { +} + +# RTL array port load latency information: +set memoryLoadLatencyList { +} diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.bind.rpt b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.bind.rpt new file mode 100644 index 000000000..e5844c141 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.bind.rpt @@ -0,0 +1,752 @@ + + +================================================================ +== Vitis HLS Report for 'div' +================================================================ +* Date: Sun Aug 4 22:51:44 2024 + +* Version: 2023.1 (Build 3854077 on May 4 2023) +* Project: prj +* Solution: solution1 (Vivado IP Flow Target) +* Product family: virtexuplus +* Target device: xcu250-figd2104-2L-e + + +================================================================ +== Performance Estimates +================================================================ ++ Timing: + * Summary: + +--------+----------+----------+------------+ + | Clock | Target | Estimated| Uncertainty| + +--------+----------+----------+------------+ + |ap_clk | 10.00 ns| 2.593 ns| 2.70 ns| + +--------+----------+----------+------------+ + ++ Latency: + * Summary: + +---------+---------+----------+----------+-----+-----+---------+ + | Latency (cycles) | Latency (absolute) | Interval | Pipeline| + | min | max | min | max | min | max | Type | + +---------+---------+----------+----------+-----+-----+---------+ + | 35| 35| 0.350 us| 0.350 us| 1| 1| yes| + +---------+---------+----------+----------+-----+-----+---------+ + + + Detail: + * Instance: + N/A + + * Loop: + N/A + +============================================================ ++ Verbose Summary: Synthesis Manager 
+============================================================ +InlineROM: 1 +ExposeGlobal: 0 +============================================================ ++ Verbose Summary: CDFG Model +============================================================ +IsTopModel: 1 +ResetActiveHigh: 1 +IsCombinational: 0 +IsDatapathOnly: 0 +HasWiredReturn: 1 +HasMFsm: 2 +HasVarLatency: 0 +IsPipeline: 1 +IsRtlPipelined: 1 +IsInstanceOverlapped: 0 +IsDontTouch: 0 +HasImplIP: 0 +IsGatedGlobalClock: 0 + ++ Individual pipeline summary: + * Pipeline-0: initiation interval (II) = 1, depth = 36 + + +============================================================ ++ Verbose Summary: Schedule +============================================================ +* Number of FSM states : 36 +* Pipeline : 1 + Pipeline-0 : II = 1, D = 36, States = { 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 } +* Dataflow Pipeline: 0 + +* FSM state transitions: +1 --> 2 +2 --> 3 +3 --> 4 +4 --> 5 +5 --> 6 +6 --> 7 +7 --> 8 +8 --> 9 +9 --> 10 +10 --> 11 +11 --> 12 +12 --> 13 +13 --> 14 +14 --> 15 +15 --> 16 +16 --> 17 +17 --> 18 +18 --> 19 +19 --> 20 +20 --> 21 +21 --> 22 +22 --> 23 +23 --> 24 +24 --> 25 +25 --> 26 +26 --> 27 +27 --> 28 +28 --> 29 +29 --> 30 +30 --> 31 +31 --> 32 +32 --> 33 +33 --> 34 +34 --> 35 +35 --> 36 +36 --> + +* FSM state operations: + +State 1 +ST_1 : Operation 37 [1/1] (0.00ns) ---> "%specpipeline_ln13 = specpipeline void @_ssdm_op_SpecPipeline, i32 1, i32 0, i32 0, i32 0, void @empty" [div.cpp:13] ---> Operation 37 'specpipeline' 'specpipeline_ln13' +ST_1 : Operation 38 [1/1] (0.00ns) ---> "%spectopmodule_ln10 = spectopmodule void @_ssdm_op_SpecTopModule, void @empty_1" [div.cpp:10] ---> Operation 38 'spectopmodule' 'spectopmodule_ln10' +ST_1 : Operation 39 [1/1] (0.00ns) ---> "%specinterface_ln0 = specinterface void @_ssdm_op_SpecInterface, i32 %data_in_0, void @empty_0, i32 0, i32 0, void @empty, i32 0, i32 0, void @empty, void @empty, void 
@empty, i32 0, i32 0, i32 0, i32 0, void @empty, void @empty, i32 4294967295, i32 0" ---> Operation 39 'specinterface' 'specinterface_ln0' +ST_1 : Operation 40 [1/1] (0.00ns) ---> "%specbitsmap_ln0 = specbitsmap void @_ssdm_op_SpecBitsMap, i32 %data_in_0" ---> Operation 40 'specbitsmap' 'specbitsmap_ln0' +ST_1 : Operation 41 [1/1] (0.00ns) ---> "%specinterface_ln0 = specinterface void @_ssdm_op_SpecInterface, i32 %data_in_1, void @empty_0, i32 0, i32 0, void @empty, i32 0, i32 0, void @empty, void @empty, void @empty, i32 0, i32 0, i32 0, i32 0, void @empty, void @empty, i32 4294967295, i32 0" ---> Operation 41 'specinterface' 'specinterface_ln0' +ST_1 : Operation 42 [1/1] (0.00ns) ---> "%specbitsmap_ln0 = specbitsmap void @_ssdm_op_SpecBitsMap, i32 %data_in_1" ---> Operation 42 'specbitsmap' 'specbitsmap_ln0' +ST_1 : Operation 43 [1/1] (0.00ns) ---> "%specinterface_ln0 = specinterface void @_ssdm_op_SpecInterface, i16 %data_out_0, void @empty_0, i32 0, i32 0, void @empty, i32 0, i32 0, void @empty, void @empty, void @empty, i32 0, i32 0, i32 0, i32 0, void @empty, void @empty, i32 4294967295, i32 0" ---> Operation 43 'specinterface' 'specinterface_ln0' +ST_1 : Operation 44 [1/1] (0.00ns) ---> "%specbitsmap_ln0 = specbitsmap void @_ssdm_op_SpecBitsMap, i16 %data_out_0" ---> Operation 44 'specbitsmap' 'specbitsmap_ln0' +ST_1 : Operation 45 [1/1] (0.00ns) ---> "%tmp = nbreadreq i1 @_ssdm_op_NbReadReq.ap_fifo.i32P0A, i32 %data_in_0, i32 1" [div.cpp:14] ---> Operation 45 'nbreadreq' 'tmp' ---> Core 78 'FIFO' +ST_1 : Operation 46 [1/1] (0.00ns) ---> "%br_ln14 = br i1 %tmp, void %if.then, void %lor.lhs.false" [div.cpp:14] ---> Operation 46 'br' 'br_ln14' +ST_1 : Operation 47 [1/1] (0.00ns) ---> "%tmp_1 = nbreadreq i1 @_ssdm_op_NbReadReq.ap_fifo.i32P0A, i32 %data_in_1, i32 1" [div.cpp:14] ---> Operation 47 'nbreadreq' 'tmp_1' ---> Core 78 'FIFO' +ST_1 : Operation 48 [1/1] (0.00ns) ---> "%br_ln14 = br i1 %tmp_1, void %if.then, void %if.end" [div.cpp:14] ---> Operation 48 
'br' 'br_ln14' +ST_1 : Operation 49 [1/1] (0.00ns) ---> "%br_ln15 = br void %return" [div.cpp:15] ---> Operation 49 'br' 'br_ln15' +ST_1 : Operation 50 [1/1] (1.42ns) ---> "%data_in_0_read = nbread i33 @_ssdm_op_NbRead.ap_fifo.volatile.i32P0A, i32 %data_in_0" [div.cpp:18] ---> Operation 50 'nbread' 'data_in_0_read' ---> Core 78 'FIFO' +ST_1 : Operation 51 [1/1] (0.00ns) ---> "%in0 = extractvalue i33 %data_in_0_read" [div.cpp:18] ---> Operation 51 'extractvalue' 'in0' +ST_1 : Operation 52 [1/1] (1.42ns) ---> "%data_in_1_read = nbread i33 @_ssdm_op_NbRead.ap_fifo.volatile.i32P0A, i32 %data_in_1" [div.cpp:19] ---> Operation 52 'nbread' 'data_in_1_read' ---> Core 78 'FIFO' +ST_1 : Operation 53 [1/1] (0.00ns) ---> "%in1 = extractvalue i33 %data_in_1_read" [div.cpp:19] ---> Operation 53 'extractvalue' 'in1' +ST_1 : Operation 54 [36/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 54 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 2 +ST_2 : Operation 55 [35/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 55 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 3 +ST_3 : Operation 56 [34/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 56 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 4 +ST_4 : Operation 57 [33/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 57 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 5 +ST_5 : Operation 58 [32/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 58 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 6 +ST_6 : Operation 59 [31/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 59 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 7 +ST_7 : Operation 60 [30/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 60 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 8 +ST_8 : Operation 61 [29/36] 
(1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 61 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 9 +ST_9 : Operation 62 [28/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 62 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 10 +ST_10 : Operation 63 [27/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 63 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 11 +ST_11 : Operation 64 [26/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 64 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 12 +ST_12 : Operation 65 [25/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 65 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 13 +ST_13 : Operation 66 [24/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 66 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 14 +ST_14 : Operation 67 [23/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 67 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 15 +ST_15 : Operation 68 [22/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 68 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 16 +ST_16 : Operation 69 [21/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 69 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 17 +ST_17 : Operation 70 [20/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 70 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 18 +ST_18 : Operation 71 [19/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 71 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 19 +ST_19 : Operation 72 [18/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 72 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 20 +ST_20 : Operation 73 
[17/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 73 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 21 +ST_21 : Operation 74 [16/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 74 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 22 +ST_22 : Operation 75 [15/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 75 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 23 +ST_23 : Operation 76 [14/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 76 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 24 +ST_24 : Operation 77 [13/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 77 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 25 +ST_25 : Operation 78 [12/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 78 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 26 +ST_26 : Operation 79 [11/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 79 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 27 +ST_27 : Operation 80 [10/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 80 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 28 +ST_28 : Operation 81 [9/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 81 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 29 +ST_29 : Operation 82 [8/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 82 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 30 +ST_30 : Operation 83 [7/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 83 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 31 +ST_31 : Operation 84 [6/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 84 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 32 +ST_32 : Operation 85 
[5/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 85 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 33 +ST_33 : Operation 86 [4/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 86 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 34 +ST_34 : Operation 87 [3/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 87 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 35 +ST_35 : Operation 88 [2/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 88 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 36 +ST_36 : Operation 89 [1/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 89 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' +ST_36 : Operation 90 [1/1] (0.00ns) ---> "%res = trunc i16 %sdiv_ln20" [div.cpp:20] ---> Operation 90 'trunc' 'res' +ST_36 : Operation 91 [1/1] (1.42ns) ---> "%empty = nbwrite i1 @_ssdm_op_NbWrite.ap_fifo.volatile.i16P0A, i16 %data_out_0, i16 %res" [div.cpp:22] ---> Operation 91 'nbwrite' 'empty' ---> Core 78 'FIFO' +ST_36 : Operation 92 [1/1] (0.00ns) ---> "%br_ln23 = br void %return" [div.cpp:23] ---> Operation 92 'br' 'br_ln23' +ST_36 : Operation 93 [1/1] (0.00ns) ---> "%ret_ln23 = ret" [div.cpp:23] ---> Operation 93 'ret' 'ret_ln23' + + +============================================================ ++ Verbose Summary: Binding +============================================================ +STG Binding: +---------------- STG Properties BEGIN ---------------- +- Is combinational: 0 +- Is one-state seq: 0 +- Is datapath-only: 0 +- Is pipelined: 1 +- Is top level: 1 +Port [ Return ] is wired: 1; IO mode=ap_ctrl_hs:ce=0 +Port [ data_in_0]: wired=1; compound=1; hidden=0; nouse=0; global=0; static=0; extern=0; dir=0; type=3; pingpong=0; private_global=0; IO mode=ap_fifo:ce=0 +Port [ data_in_1]: wired=1; compound=1; hidden=0; nouse=0; global=0; static=0; extern=0; dir=0; type=3; pingpong=0; 
private_global=0; IO mode=ap_fifo:ce=0 +Port [ data_out_0]: wired=1; compound=1; hidden=0; nouse=0; global=0; static=0; extern=0; dir=1; type=3; pingpong=0; private_global=0; IO mode=ap_fifo:ce=0 +---------------- STG Properties END ------------------ + +---------------- Datapath Model BEGIN ---------------- + + + +specpipeline_ln13 (specpipeline ) [ 0000000000000000000000000000000000000] +spectopmodule_ln10 (spectopmodule) [ 0000000000000000000000000000000000000] +specinterface_ln0 (specinterface) [ 0000000000000000000000000000000000000] +specbitsmap_ln0 (specbitsmap ) [ 0000000000000000000000000000000000000] +specinterface_ln0 (specinterface) [ 0000000000000000000000000000000000000] +specbitsmap_ln0 (specbitsmap ) [ 0000000000000000000000000000000000000] +specinterface_ln0 (specinterface) [ 0000000000000000000000000000000000000] +specbitsmap_ln0 (specbitsmap ) [ 0000000000000000000000000000000000000] +tmp (nbreadreq ) [ 0111111111111111111111111111111111111] +br_ln14 (br ) [ 0000000000000000000000000000000000000] +tmp_1 (nbreadreq ) [ 0111111111111111111111111111111111111] +br_ln14 (br ) [ 0000000000000000000000000000000000000] +br_ln15 (br ) [ 0000000000000000000000000000000000000] +data_in_0_read (nbread ) [ 0000000000000000000000000000000000000] +in0 (extractvalue ) [ 0111111111111111111111111111111111111] +data_in_1_read (nbread ) [ 0000000000000000000000000000000000000] +in1 (extractvalue ) [ 0111111111111111111111111111111111111] +sdiv_ln20 (sdiv ) [ 0000000000000000000000000000000000000] +res (trunc ) [ 0000000000000000000000000000000000000] +empty (nbwrite ) [ 0000000000000000000000000000000000000] +br_ln23 (br ) [ 0000000000000000000000000000000000000] +ret_ln23 (ret ) [ 0000000000000000000000000000000000000] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +---------------- Datapath Model END ------------------ + +* FSMD analyzer results: + - Output states: + Port: data_out_0 | {36 } + - Input state : + Port: div : data_in_0 | {1 } + Port: div : data_in_1 | {1 } + - Chain level: + State 1 + sdiv_ln20 : 1 + State 2 + State 3 + State 4 + State 5 + State 6 + State 7 + State 8 + State 9 + State 10 + State 11 + State 12 + State 13 + State 14 + State 15 + State 16 + State 17 + State 18 + State 19 + State 20 + State 21 + State 22 + State 23 + State 24 + State 25 + State 26 + State 27 + State 28 + State 29 + State 30 + State 31 + State 32 + State 33 + State 34 + State 35 + State 36 + res : 1 + empty : 2 + + +============================================================ ++ Verbose Summary: Datapath Resource usage +============================================================ + +* Functional unit list: +|----------|-----------------------------|---------|---------| +| Operation| Functional Unit | FF | LUT | +|----------|-----------------------------|---------|---------| +| sdiv | grp_fu_75 | 2283 | 1738 | +|----------|-----------------------------|---------|---------| +| nbreadreq| tmp_nbreadreq_fu_32 | 0 | 0 | +| | tmp_1_nbreadreq_fu_40 | 0 | 0 | +|----------|-----------------------------|---------|---------| +| nbread | data_in_0_read_nbread_fu_48 | 0 | 0 | +| | data_in_1_read_nbread_fu_54 | 0 | 0 | +|----------|-----------------------------|---------|---------| +| nbwrite | empty_nbwrite_fu_60 | 0 | 0 | +|----------|-----------------------------|---------|---------| +|extractvalue| 
in1_fu_67 | 0 | 0 | +| | in0_fu_71 | 0 | 0 | +|----------|-----------------------------|---------|---------| +| trunc | res_fu_81 | 0 | 0 | +|----------|-----------------------------|---------|---------| +| Total | | 2283 | 1738 | +|----------|-----------------------------|---------|---------| + +Memories: +N/A + +* Register list: ++------------+--------+ +| | FF | ++------------+--------+ +| in0_reg_94 | 32 | +| in1_reg_86 | 32 | +|tmp_1_reg_90| 1 | +| tmp_reg_99 | 1 | ++------------+--------+ +| Total | 66 | ++------------+--------+ + +* Multiplexer (MUX) list: +|-----------|------|------|------|--------||---------||---------| +| Comp | Pin | Size | BW | S x BW || Delay || LUT | +|-----------|------|------|------|--------||---------||---------| +| grp_fu_75 | p0 | 2 | 32 | 64 || 9 | +| grp_fu_75 | p1 | 2 | 32 | 64 || 9 | +|-----------|------|------|------|--------||---------||---------| +| Total | | | | 128 || 0.774 || 18 | +|-----------|------|------|------|--------||---------||---------| + + + +* Summary: ++-----------+--------+--------+--------+ +| | Delay | FF | LUT | ++-----------+--------+--------+--------+ +| Function | - | 2283 | 1738 | +| Memory | - | - | - | +|Multiplexer| 0 | - | 18 | +| Register | - | 66 | - | ++-----------+--------+--------+--------+ +| Total | 0 | 2349 | 1756 | ++-----------+--------+--------+--------+ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.bind.rpt.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.bind.rpt.xml new file mode 100644 index 000000000..90e0332a8 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.bind.rpt.xml @@ -0,0 +1,45 @@ + + +
+Sun Aug 4 22:51:44 2024 + +2023.1 (Build 3854077 on May 4 2023) +prj +solution1 (Vivado IP Flow Target) +virtexuplus +xcu250-figd2104-2L-e +
+ +
+ +
+ +Clock, Target, Estimated, Uncertainty +10.00 ns, 2.593 ns, 2.70 ns +
+
+
+
+ +
+ +, min, max, min, max, min, max, Type +35, 35, 0.350 us, 0.350 us, 1, 1, yes +
+
+ +
+ +Instance, Module, min, max, min, max, min, max, Type +
+
+ +Loop Name, min, max, Latency, achieved, target, Count, Pipelined +
+
+
+
+
+
+
+
diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.rpt b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.rpt new file mode 100644 index 000000000..407be5c4e --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.rpt @@ -0,0 +1,887 @@ + + +================================================================ +== Vitis HLS Report for 'div' +================================================================ +* Date: Sun Aug 4 22:51:44 2024 + +* Version: 2023.1 (Build 3854077 on May 4 2023) +* Project: prj +* Solution: solution1 (Vivado IP Flow Target) +* Product family: virtexuplus +* Target device: xcu250-figd2104-2L-e + + +================================================================ +== Performance Estimates +================================================================ ++ Timing: + * Summary: + +--------+----------+----------+------------+ + | Clock | Target | Estimated| Uncertainty| + +--------+----------+----------+------------+ + |ap_clk | 10.00 ns| 2.593 ns| 2.70 ns| + +--------+----------+----------+------------+ + ++ Latency: + * Summary: + +---------+---------+----------+----------+-----+-----+---------+ + | Latency (cycles) | Latency (absolute) | Interval | Pipeline| + | min | max | min | max | min | max | Type | + +---------+---------+----------+----------+-----+-----+---------+ + | 35| 35| 0.350 us| 0.350 us| 1| 1| yes| + +---------+---------+----------+----------+-----+-----+---------+ + + + Detail: + * Instance: + N/A + + * Loop: + N/A + + + +================================================================ +== Utilization Estimates +================================================================ +* Summary: ++---------------------+---------+-------+---------+---------+------+ +| Name | BRAM_18K| DSP | FF | LUT | URAM | ++---------------------+---------+-------+---------+---------+------+ +|DSP | -| -| -| -| -| +|Expression | 
-| -| 0| 6| -| +|FIFO | -| -| -| -| -| +|Instance | -| -| 2283| 1738| -| +|Memory | -| -| -| -| -| +|Multiplexer | -| -| -| -| -| +|Register | -| -| 166| 64| -| ++---------------------+---------+-------+---------+---------+------+ +|Total | 0| 0| 2449| 1808| 0| ++---------------------+---------+-------+---------+---------+------+ +|Available SLR | 1344| 3072| 864000| 432000| 320| ++---------------------+---------+-------+---------+---------+------+ +|Utilization SLR (%) | 0| 0| ~0| ~0| 0| ++---------------------+---------+-------+---------+---------+------+ +|Available | 5376| 12288| 3456000| 1728000| 1280| ++---------------------+---------+-------+---------+---------+------+ +|Utilization (%) | 0| 0| ~0| ~0| 0| ++---------------------+---------+-------+---------+---------+------+ + ++ Detail: + * Instance: + +---------------------------+------------------------+---------+----+------+------+-----+ + | Instance | Module | BRAM_18K| DSP| FF | LUT | URAM| + +---------------------------+------------------------+---------+----+------+------+-----+ + |sdiv_32ns_32ns_16_36_1_U1 |sdiv_32ns_32ns_16_36_1 | 0| 0| 2283| 1738| 0| + +---------------------------+------------------------+---------+----+------+------+-----+ + |Total | | 0| 0| 2283| 1738| 0| + +---------------------------+------------------------+---------+----+------+------+-----+ + + * DSP: + N/A + + * Memory: + N/A + + * FIFO: + N/A + + * Expression: + +--------------------------+----------+----+---+----+------------+------------+ + | Variable Name | Operation| DSP| FF| LUT| Bitwidth P0| Bitwidth P1| + +--------------------------+----------+----+---+----+------------+------------+ + |tmp_1_nbreadreq_fu_40_p3 | and| 0| 0| 2| 1| 0| + |tmp_nbreadreq_fu_32_p3 | and| 0| 0| 2| 1| 0| + |ap_enable_pp0 | xor| 0| 0| 2| 1| 2| + +--------------------------+----------+----+---+----+------------+------------+ + |Total | | 0| 0| 6| 3| 2| + +--------------------------+----------+----+---+----+------------+------------+ + + * 
Multiplexer: + N/A + + * Register: + +--------------------------+----+----+-----+-----------+ + | Name | FF | LUT| Bits| Const Bits| + +--------------------------+----+----+-----+-----------+ + |ap_CS_fsm | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter1 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter10 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter11 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter12 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter13 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter14 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter15 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter16 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter17 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter18 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter19 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter2 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter20 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter21 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter22 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter23 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter24 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter25 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter26 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter27 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter28 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter29 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter3 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter30 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter31 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter32 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter33 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter34 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter35 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter4 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter5 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter6 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter7 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter8 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter9 | 1| 0| 1| 0| + |tmp_1_reg_90 | 1| 0| 1| 0| + |tmp_reg_99 | 1| 0| 1| 0| + |tmp_1_reg_90 | 64| 32| 1| 0| + |tmp_reg_99 | 64| 32| 1| 0| + +--------------------------+----+----+-----+-----------+ + |Total | 166| 64| 40| 0| + +--------------------------+----+----+-----+-----------+ + + + 
+================================================================ +== Interface +================================================================ +* Summary: ++-------------------+-----+-----+------------+--------------+--------------+ +| RTL Ports | Dir | Bits| Protocol | Source Object| C Type | ++-------------------+-----+-----+------------+--------------+--------------+ +|ap_clk | in| 1| ap_ctrl_hs| div| return value| +|ap_rst | in| 1| ap_ctrl_hs| div| return value| +|ap_start | in| 1| ap_ctrl_hs| div| return value| +|ap_done | out| 1| ap_ctrl_hs| div| return value| +|ap_idle | out| 1| ap_ctrl_hs| div| return value| +|ap_ready | out| 1| ap_ctrl_hs| div| return value| +|data_in_0_dout | in| 32| ap_fifo| data_in_0| pointer| +|data_in_0_empty_n | in| 1| ap_fifo| data_in_0| pointer| +|data_in_0_read | out| 1| ap_fifo| data_in_0| pointer| +|data_in_1_dout | in| 32| ap_fifo| data_in_1| pointer| +|data_in_1_empty_n | in| 1| ap_fifo| data_in_1| pointer| +|data_in_1_read | out| 1| ap_fifo| data_in_1| pointer| +|data_out_0_din | out| 16| ap_fifo| data_out_0| pointer| +|data_out_0_full_n | in| 1| ap_fifo| data_out_0| pointer| +|data_out_0_write | out| 1| ap_fifo| data_out_0| pointer| ++-------------------+-----+-----+------------+--------------+--------------+ + +============================================================ ++ Verbose Summary: Synthesis Manager +============================================================ +InlineROM: 1 +ExposeGlobal: 0 +============================================================ ++ Verbose Summary: CDFG Model +============================================================ +IsTopModel: 1 +ResetActiveHigh: 1 +IsCombinational: 2 +IsDatapathOnly: 2 +HasWiredReturn: 1 +HasMFsm: 2 +HasVarLatency: 0 +IsPipeline: 1 +IsRtlPipelined: 1 +IsInstanceOverlapped: 0 +IsDontTouch: 0 +HasImplIP: 0 +IsGatedGlobalClock: 0 + ++ Individual pipeline summary: + * Pipeline-0: initiation interval (II) = 1, depth = 36 + + 
+============================================================ ++ Verbose Summary: Schedule +============================================================ +* Number of FSM states : 36 +* Pipeline : 1 + Pipeline-0 : II = 1, D = 36, States = { 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 } +* Dataflow Pipeline: 0 + +* FSM state transitions: +1 --> 2 +2 --> 3 +3 --> 4 +4 --> 5 +5 --> 6 +6 --> 7 +7 --> 8 +8 --> 9 +9 --> 10 +10 --> 11 +11 --> 12 +12 --> 13 +13 --> 14 +14 --> 15 +15 --> 16 +16 --> 17 +17 --> 18 +18 --> 19 +19 --> 20 +20 --> 21 +21 --> 22 +22 --> 23 +23 --> 24 +24 --> 25 +25 --> 26 +26 --> 27 +27 --> 28 +28 --> 29 +29 --> 30 +30 --> 31 +31 --> 32 +32 --> 33 +33 --> 34 +34 --> 35 +35 --> 36 +36 --> + +* FSM state operations: + +State 1 +ST_1 : Operation 37 [1/1] (0.00ns) ---> "%specpipeline_ln13 = specpipeline void @_ssdm_op_SpecPipeline, i32 1, i32 0, i32 0, i32 0, void @empty" [div.cpp:13] ---> Operation 37 'specpipeline' 'specpipeline_ln13' +ST_1 : Operation 38 [1/1] (0.00ns) ---> "%spectopmodule_ln10 = spectopmodule void @_ssdm_op_SpecTopModule, void @empty_1" [div.cpp:10] ---> Operation 38 'spectopmodule' 'spectopmodule_ln10' +ST_1 : Operation 39 [1/1] (0.00ns) ---> "%specinterface_ln0 = specinterface void @_ssdm_op_SpecInterface, i32 %data_in_0, void @empty_0, i32 0, i32 0, void @empty, i32 0, i32 0, void @empty, void @empty, void @empty, i32 0, i32 0, i32 0, i32 0, void @empty, void @empty, i32 4294967295, i32 0" ---> Operation 39 'specinterface' 'specinterface_ln0' +ST_1 : Operation 40 [1/1] (0.00ns) ---> "%specbitsmap_ln0 = specbitsmap void @_ssdm_op_SpecBitsMap, i32 %data_in_0" ---> Operation 40 'specbitsmap' 'specbitsmap_ln0' +ST_1 : Operation 41 [1/1] (0.00ns) ---> "%specinterface_ln0 = specinterface void @_ssdm_op_SpecInterface, i32 %data_in_1, void @empty_0, i32 0, i32 0, void @empty, i32 0, i32 0, void @empty, void @empty, void @empty, i32 0, i32 0, i32 0, i32 0, void @empty, void @empty, 
i32 4294967295, i32 0" ---> Operation 41 'specinterface' 'specinterface_ln0' +ST_1 : Operation 42 [1/1] (0.00ns) ---> "%specbitsmap_ln0 = specbitsmap void @_ssdm_op_SpecBitsMap, i32 %data_in_1" ---> Operation 42 'specbitsmap' 'specbitsmap_ln0' +ST_1 : Operation 43 [1/1] (0.00ns) ---> "%specinterface_ln0 = specinterface void @_ssdm_op_SpecInterface, i16 %data_out_0, void @empty_0, i32 0, i32 0, void @empty, i32 0, i32 0, void @empty, void @empty, void @empty, i32 0, i32 0, i32 0, i32 0, void @empty, void @empty, i32 4294967295, i32 0" ---> Operation 43 'specinterface' 'specinterface_ln0' +ST_1 : Operation 44 [1/1] (0.00ns) ---> "%specbitsmap_ln0 = specbitsmap void @_ssdm_op_SpecBitsMap, i16 %data_out_0" ---> Operation 44 'specbitsmap' 'specbitsmap_ln0' +ST_1 : Operation 45 [1/1] (0.00ns) ---> "%tmp = nbreadreq i1 @_ssdm_op_NbReadReq.ap_fifo.i32P0A, i32 %data_in_0, i32 1" [div.cpp:14] ---> Operation 45 'nbreadreq' 'tmp' ---> Core 78 'FIFO' +ST_1 : Operation 46 [1/1] (0.00ns) ---> "%br_ln14 = br i1 %tmp, void %if.then, void %lor.lhs.false" [div.cpp:14] ---> Operation 46 'br' 'br_ln14' +ST_1 : Operation 47 [1/1] (0.00ns) ---> "%tmp_1 = nbreadreq i1 @_ssdm_op_NbReadReq.ap_fifo.i32P0A, i32 %data_in_1, i32 1" [div.cpp:14] ---> Operation 47 'nbreadreq' 'tmp_1' ---> Core 78 'FIFO' +ST_1 : Operation 48 [1/1] (0.00ns) ---> "%br_ln14 = br i1 %tmp_1, void %if.then, void %if.end" [div.cpp:14] ---> Operation 48 'br' 'br_ln14' +ST_1 : Operation 49 [1/1] (0.00ns) ---> "%br_ln15 = br void %return" [div.cpp:15] ---> Operation 49 'br' 'br_ln15' +ST_1 : Operation 50 [1/1] (1.42ns) ---> "%data_in_0_read = nbread i33 @_ssdm_op_NbRead.ap_fifo.volatile.i32P0A, i32 %data_in_0" [div.cpp:18] ---> Operation 50 'nbread' 'data_in_0_read' ---> Core 78 'FIFO' +ST_1 : Operation 51 [1/1] (0.00ns) ---> "%in0 = extractvalue i33 %data_in_0_read" [div.cpp:18] ---> Operation 51 'extractvalue' 'in0' +ST_1 : Operation 52 [1/1] (1.42ns) ---> "%data_in_1_read = nbread i33 
@_ssdm_op_NbRead.ap_fifo.volatile.i32P0A, i32 %data_in_1" [div.cpp:19] ---> Operation 52 'nbread' 'data_in_1_read' ---> Core 78 'FIFO' +ST_1 : Operation 53 [1/1] (0.00ns) ---> "%in1 = extractvalue i33 %data_in_1_read" [div.cpp:19] ---> Operation 53 'extractvalue' 'in1' +ST_1 : Operation 54 [36/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 54 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 2 +ST_2 : Operation 55 [35/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 55 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 3 +ST_3 : Operation 56 [34/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 56 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 4 +ST_4 : Operation 57 [33/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 57 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 5 +ST_5 : Operation 58 [32/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 58 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 6 +ST_6 : Operation 59 [31/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 59 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 7 +ST_7 : Operation 60 [30/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 60 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 8 +ST_8 : Operation 61 [29/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 61 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 9 +ST_9 : Operation 62 [28/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 62 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 10 +ST_10 : Operation 63 [27/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 63 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 11 +ST_11 : Operation 64 [26/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 
%in1" [div.cpp:20] ---> Operation 64 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 12 +ST_12 : Operation 65 [25/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 65 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 13 +ST_13 : Operation 66 [24/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 66 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 14 +ST_14 : Operation 67 [23/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 67 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 15 +ST_15 : Operation 68 [22/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 68 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 16 +ST_16 : Operation 69 [21/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 69 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 17 +ST_17 : Operation 70 [20/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 70 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 18 +ST_18 : Operation 71 [19/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 71 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 19 +ST_19 : Operation 72 [18/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 72 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 20 +ST_20 : Operation 73 [17/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 73 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 21 +ST_21 : Operation 74 [16/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 74 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 22 +ST_22 : Operation 75 [15/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 75 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 23 +ST_23 : Operation 76 [14/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, 
i32 %in1" [div.cpp:20] ---> Operation 76 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 24 +ST_24 : Operation 77 [13/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 77 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 25 +ST_25 : Operation 78 [12/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 78 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 26 +ST_26 : Operation 79 [11/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 79 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 27 +ST_27 : Operation 80 [10/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 80 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 28 +ST_28 : Operation 81 [9/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 81 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 29 +ST_29 : Operation 82 [8/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 82 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 30 +ST_30 : Operation 83 [7/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 83 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 31 +ST_31 : Operation 84 [6/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 84 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 32 +ST_32 : Operation 85 [5/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 85 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 33 +ST_33 : Operation 86 [4/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 86 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 34 +ST_34 : Operation 87 [3/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 87 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 35 +ST_35 : Operation 88 [2/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 
%in1" [div.cpp:20] ---> Operation 88 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 36 +ST_36 : Operation 89 [1/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 89 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' +ST_36 : Operation 90 [1/1] (0.00ns) ---> "%res = trunc i16 %sdiv_ln20" [div.cpp:20] ---> Operation 90 'trunc' 'res' +ST_36 : Operation 91 [1/1] (1.42ns) ---> "%empty = nbwrite i1 @_ssdm_op_NbWrite.ap_fifo.volatile.i16P0A, i16 %data_out_0, i16 %res" [div.cpp:22] ---> Operation 91 'nbwrite' 'empty' ---> Core 78 'FIFO' +ST_36 : Operation 92 [1/1] (0.00ns) ---> "%br_ln23 = br void %return" [div.cpp:23] ---> Operation 92 'br' 'br_ln23' +ST_36 : Operation 93 [1/1] (0.00ns) ---> "%ret_ln23 = ret" [div.cpp:23] ---> Operation 93 'ret' 'ret_ln23' + + +============================================================ ++ Verbose Summary: Binding +============================================================ +STG Binding: +---------------- STG Properties BEGIN ---------------- +- Is combinational: 0 +- Is one-state seq: 0 +- Is datapath-only: 0 +- Is pipelined: 1 +- Is top level: 1 +Port [ Return ] is wired: 1; IO mode=ap_ctrl_hs:ce=0 +Port [ data_in_0]: wired=1; compound=1; hidden=0; nouse=0; global=0; static=0; extern=0; dir=0; type=3; pingpong=0; private_global=0; IO mode=ap_fifo:ce=0 +Port [ data_in_1]: wired=1; compound=1; hidden=0; nouse=0; global=0; static=0; extern=0; dir=0; type=3; pingpong=0; private_global=0; IO mode=ap_fifo:ce=0 +Port [ data_out_0]: wired=1; compound=1; hidden=0; nouse=0; global=0; static=0; extern=0; dir=1; type=3; pingpong=0; private_global=0; IO mode=ap_fifo:ce=0 +---------------- STG Properties END ------------------ + +---------------- Datapath Model BEGIN ---------------- + + + +specpipeline_ln13 (specpipeline ) [ 0000000000000000000000000000000000000] +spectopmodule_ln10 (spectopmodule) [ 0000000000000000000000000000000000000] +specinterface_ln0 (specinterface) [ 0000000000000000000000000000000000000] 
+specbitsmap_ln0 (specbitsmap ) [ 0000000000000000000000000000000000000] +specinterface_ln0 (specinterface) [ 0000000000000000000000000000000000000] +specbitsmap_ln0 (specbitsmap ) [ 0000000000000000000000000000000000000] +specinterface_ln0 (specinterface) [ 0000000000000000000000000000000000000] +specbitsmap_ln0 (specbitsmap ) [ 0000000000000000000000000000000000000] +tmp (nbreadreq ) [ 0111111111111111111111111111111111111] +br_ln14 (br ) [ 0000000000000000000000000000000000000] +tmp_1 (nbreadreq ) [ 0111111111111111111111111111111111111] +br_ln14 (br ) [ 0000000000000000000000000000000000000] +br_ln15 (br ) [ 0000000000000000000000000000000000000] +data_in_0_read (nbread ) [ 0000000000000000000000000000000000000] +in0 (extractvalue ) [ 0111111111111111111111111111111111111] +data_in_1_read (nbread ) [ 0000000000000000000000000000000000000] +in1 (extractvalue ) [ 0111111111111111111111111111111111111] +sdiv_ln20 (sdiv ) [ 0000000000000000000000000000000000000] +res (trunc ) [ 0000000000000000000000000000000000000] +empty (nbwrite ) [ 0000000000000000000000000000000000000] +br_ln23 (br ) [ 0000000000000000000000000000000000000] +ret_ln23 (ret ) [ 0000000000000000000000000000000000000] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +---------------- Datapath Model END ------------------ + +* FSMD analyzer results: + - 
Output states: + Port: data_out_0 | {36 } + - Input state : + Port: div : data_in_0 | {1 } + Port: div : data_in_1 | {1 } + - Chain level: + State 1 + sdiv_ln20 : 1 + State 2 + State 3 + State 4 + State 5 + State 6 + State 7 + State 8 + State 9 + State 10 + State 11 + State 12 + State 13 + State 14 + State 15 + State 16 + State 17 + State 18 + State 19 + State 20 + State 21 + State 22 + State 23 + State 24 + State 25 + State 26 + State 27 + State 28 + State 29 + State 30 + State 31 + State 32 + State 33 + State 34 + State 35 + State 36 + res : 1 + empty : 2 + + +============================================================ ++ Verbose Summary: Datapath Resource usage +============================================================ + +* Functional unit list: +|----------|-----------------------------|---------|---------| +| Operation| Functional Unit | FF | LUT | +|----------|-----------------------------|---------|---------| +| sdiv | grp_fu_75 | 2283 | 1738 | +|----------|-----------------------------|---------|---------| +| nbreadreq| tmp_nbreadreq_fu_32 | 0 | 0 | +| | tmp_1_nbreadreq_fu_40 | 0 | 0 | +|----------|-----------------------------|---------|---------| +| nbread | data_in_0_read_nbread_fu_48 | 0 | 0 | +| | data_in_1_read_nbread_fu_54 | 0 | 0 | +|----------|-----------------------------|---------|---------| +| nbwrite | empty_nbwrite_fu_60 | 0 | 0 | +|----------|-----------------------------|---------|---------| +|extractvalue| in1_fu_67 | 0 | 0 | +| | in0_fu_71 | 0 | 0 | +|----------|-----------------------------|---------|---------| +| trunc | res_fu_81 | 0 | 0 | +|----------|-----------------------------|---------|---------| +| Total | | 2283 | 1738 | +|----------|-----------------------------|---------|---------| + +Memories: +N/A + +* Register list: ++------------+--------+ +| | FF | ++------------+--------+ +| in0_reg_94 | 32 | +| in1_reg_86 | 32 | +|tmp_1_reg_90| 1 | +| tmp_reg_99 | 1 | ++------------+--------+ +| Total | 66 | ++------------+--------+ 
+ +* Multiplexer (MUX) list: +|-----------|------|------|------|--------||---------||---------| +| Comp | Pin | Size | BW | S x BW || Delay || LUT | +|-----------|------|------|------|--------||---------||---------| +| grp_fu_75 | p0 | 2 | 32 | 64 || 9 | +| grp_fu_75 | p1 | 2 | 32 | 64 || 9 | +|-----------|------|------|------|--------||---------||---------| +| Total | | | | 128 || 0.774 || 18 | +|-----------|------|------|------|--------||---------||---------| + + + +* Summary: ++-----------+--------+--------+--------+ +| | Delay | FF | LUT | ++-----------+--------+--------+--------+ +| Function | - | 2283 | 1738 | +| Memory | - | - | - | +|Multiplexer| 0 | - | 18 | +| Register | - | 66 | - | ++-----------+--------+--------+--------+ +| Total | 0 | 2349 | 1756 | ++-----------+--------+--------+--------+ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.rpt.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.rpt.xml new file mode 100644 index 000000000..d8bc3c9c9 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.rpt.xml @@ -0,0 +1,161 @@ + + +
+Sun Aug 4 22:51:44 2024 + +2023.1 (Build 3854077 on May 4 2023) +prj +solution1 (Vivado IP Flow Target) +virtexuplus +xcu250-figd2104-2L-e +
+ +
+ +
+ +Clock, Target, Estimated, Uncertainty +10.00 ns, 2.593 ns, 2.70 ns +
+
+
+
+ +
+ +, min, max, min, max, min, max, Type +35, 35, 0.350 us, 0.350 us, 1, 1, yes +
+
+ +
+ +Instance, Module, min, max, min, max, min, max, Type +
+
+ +Loop Name, min, max, Latency, achieved, target, Count, Pipelined +
+
+
+
+
+
+
+ +
+ +Name, BRAM_18K, DSP, FF, LUT, URAM +-, -, -, -, - +-, -, 0, 6, - +-, -, -, -, - +-, -, 2283, 1738, - +-, -, -, -, - +-, -, -, -, - +-, -, 166, 64, - +1344, 3072, 864000, 432000, 320 +0, 0, ~0, ~0, 0 +5376, 12288, 3456000, 1728000, 1280 +0, 0, ~0, ~0, 0 +
+
+ +
+ +Instance, Module, BRAM_18K, DSP, FF, LUT, URAM +sdiv_32ns_32ns_16_36_1, 0, 0, 2283, 1738, 0 +
+
+ +Instance, Module, Expression +
+
+ +Memory, Module, BRAM_18K, FF, LUT, URAM, Words, Bits, Banks, W*Bits*Banks +
+
+ +Name, BRAM_18K, FF, LUT, URAM, Depth, Bits, Size:D*B +
+
+ +Variable Name, Operation, DSP, FF, LUT, Bitwidth P0, Bitwidth P1 +and, 0, 0, 2, 1, 0 +and, 0, 0, 2, 1, 0 +xor, 0, 0, 2, 1, 2 +
+
+ +Name, LUT, Input Size, Bits, Total Bits +
+
+ +Name, FF, LUT, Bits, Const Bits +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +1, 0, 1, 0 +64, 32, 1, 0 +64, 32, 1, 0 +
+
+
+
+
+ +
+ +RTL Ports, Dir, Bits, Protocol, Source Object, C Type +in, 1, ap_ctrl_hs, div, return value +in, 1, ap_ctrl_hs, div, return value +in, 1, ap_ctrl_hs, div, return value +out, 1, ap_ctrl_hs, div, return value +out, 1, ap_ctrl_hs, div, return value +out, 1, ap_ctrl_hs, div, return value +in, 32, ap_fifo, data_in_0, pointer +in, 1, ap_fifo, data_in_0, pointer +out, 1, ap_fifo, data_in_0, pointer +in, 32, ap_fifo, data_in_1, pointer +in, 1, ap_fifo, data_in_1, pointer +out, 1, ap_fifo, data_in_1, pointer +out, 16, ap_fifo, data_out_0, pointer +in, 1, ap_fifo, data_out_0, pointer +out, 1, ap_fifo, data_out_0, pointer +
+
+
+
diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.sched.rpt b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.sched.rpt new file mode 100644 index 000000000..bcf936388 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.sched.rpt @@ -0,0 +1,447 @@ + + +================================================================ +== Vitis HLS Report for 'div' +================================================================ +* Date: Sun Aug 4 22:51:43 2024 + +* Version: 2023.1 (Build 3854077 on May 4 2023) +* Project: prj +* Solution: solution1 (Vivado IP Flow Target) +* Product family: virtexuplus +* Target device: xcu250-figd2104-2L-e + + +================================================================ +== Performance Estimates +================================================================ ++ Timing: + * Summary: + +--------+----------+----------+------------+ + | Clock | Target | Estimated| Uncertainty| + +--------+----------+----------+------------+ + |ap_clk | 10.00 ns| 2.593 ns| 2.70 ns| + +--------+----------+----------+------------+ + ++ Latency: + * Summary: + +---------+---------+----------+----------+-----+-----+---------+ + | Latency (cycles) | Latency (absolute) | Interval | Pipeline| + | min | max | min | max | min | max | Type | + +---------+---------+----------+----------+-----+-----+---------+ + | 35| 35| 0.350 us| 0.350 us| 1| 1| yes| + +---------+---------+----------+----------+-----+-----+---------+ + + + Detail: + * Instance: + N/A + + * Loop: + N/A + +============================================================ ++ Verbose Summary: Synthesis Manager +============================================================ +InlineROM: 1 +ExposeGlobal: 0 +============================================================ ++ Verbose Summary: CDFG Model +============================================================ +IsTopModel: 1 
+ResetActiveHigh: 1 +IsCombinational: 0 +IsDatapathOnly: 0 +HasWiredReturn: 1 +HasMFsm: 2 +HasVarLatency: 0 +IsPipeline: 1 +IsRtlPipelined: 1 +IsInstanceOverlapped: 0 +IsDontTouch: 0 +HasImplIP: 0 +IsGatedGlobalClock: 0 + ++ Individual pipeline summary: + * Pipeline-0: initiation interval (II) = 1, depth = 36 + + +============================================================ ++ Verbose Summary: Schedule +============================================================ +* Number of FSM states : 36 +* Pipeline : 1 + Pipeline-0 : II = 1, D = 36, States = { 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 } +* Dataflow Pipeline: 0 + +* FSM state transitions: +1 --> 2 +2 --> 3 +3 --> 4 +4 --> 5 +5 --> 6 +6 --> 7 +7 --> 8 +8 --> 9 +9 --> 10 +10 --> 11 +11 --> 12 +12 --> 13 +13 --> 14 +14 --> 15 +15 --> 16 +16 --> 17 +17 --> 18 +18 --> 19 +19 --> 20 +20 --> 21 +21 --> 22 +22 --> 23 +23 --> 24 +24 --> 25 +25 --> 26 +26 --> 27 +27 --> 28 +28 --> 29 +29 --> 30 +30 --> 31 +31 --> 32 +32 --> 33 +33 --> 34 +34 --> 35 +35 --> 36 +36 --> + +* FSM state operations: + +State 1 +ST_1 : Operation 37 [1/1] (0.00ns) ---> "%specpipeline_ln13 = specpipeline void @_ssdm_op_SpecPipeline, i32 1, i32 0, i32 0, i32 0, void @empty" [div.cpp:13] ---> Operation 37 'specpipeline' 'specpipeline_ln13' +ST_1 : Operation 38 [1/1] (0.00ns) ---> "%spectopmodule_ln10 = spectopmodule void @_ssdm_op_SpecTopModule, void @empty_1" [div.cpp:10] ---> Operation 38 'spectopmodule' 'spectopmodule_ln10' +ST_1 : Operation 39 [1/1] (0.00ns) ---> "%specinterface_ln0 = specinterface void @_ssdm_op_SpecInterface, i32 %data_in_0, void @empty_0, i32 0, i32 0, void @empty, i32 0, i32 0, void @empty, void @empty, void @empty, i32 0, i32 0, i32 0, i32 0, void @empty, void @empty, i32 4294967295, i32 0" ---> Operation 39 'specinterface' 'specinterface_ln0' +ST_1 : Operation 40 [1/1] (0.00ns) ---> "%specbitsmap_ln0 = specbitsmap void @_ssdm_op_SpecBitsMap, i32 %data_in_0" ---> 
Operation 40 'specbitsmap' 'specbitsmap_ln0' +ST_1 : Operation 41 [1/1] (0.00ns) ---> "%specinterface_ln0 = specinterface void @_ssdm_op_SpecInterface, i32 %data_in_1, void @empty_0, i32 0, i32 0, void @empty, i32 0, i32 0, void @empty, void @empty, void @empty, i32 0, i32 0, i32 0, i32 0, void @empty, void @empty, i32 4294967295, i32 0" ---> Operation 41 'specinterface' 'specinterface_ln0' +ST_1 : Operation 42 [1/1] (0.00ns) ---> "%specbitsmap_ln0 = specbitsmap void @_ssdm_op_SpecBitsMap, i32 %data_in_1" ---> Operation 42 'specbitsmap' 'specbitsmap_ln0' +ST_1 : Operation 43 [1/1] (0.00ns) ---> "%specinterface_ln0 = specinterface void @_ssdm_op_SpecInterface, i16 %data_out_0, void @empty_0, i32 0, i32 0, void @empty, i32 0, i32 0, void @empty, void @empty, void @empty, i32 0, i32 0, i32 0, i32 0, void @empty, void @empty, i32 4294967295, i32 0" ---> Operation 43 'specinterface' 'specinterface_ln0' +ST_1 : Operation 44 [1/1] (0.00ns) ---> "%specbitsmap_ln0 = specbitsmap void @_ssdm_op_SpecBitsMap, i16 %data_out_0" ---> Operation 44 'specbitsmap' 'specbitsmap_ln0' +ST_1 : Operation 45 [1/1] (0.00ns) ---> "%tmp = nbreadreq i1 @_ssdm_op_NbReadReq.ap_fifo.i32P0A, i32 %data_in_0, i32 1" [div.cpp:14] ---> Operation 45 'nbreadreq' 'tmp' ---> Core 78 'FIFO' +ST_1 : Operation 46 [1/1] (0.00ns) ---> "%br_ln14 = br i1 %tmp, void %if.then, void %lor.lhs.false" [div.cpp:14] ---> Operation 46 'br' 'br_ln14' +ST_1 : Operation 47 [1/1] (0.00ns) ---> "%tmp_1 = nbreadreq i1 @_ssdm_op_NbReadReq.ap_fifo.i32P0A, i32 %data_in_1, i32 1" [div.cpp:14] ---> Operation 47 'nbreadreq' 'tmp_1' ---> Core 78 'FIFO' +ST_1 : Operation 48 [1/1] (0.00ns) ---> "%br_ln14 = br i1 %tmp_1, void %if.then, void %if.end" [div.cpp:14] ---> Operation 48 'br' 'br_ln14' +ST_1 : Operation 49 [1/1] (0.00ns) ---> "%br_ln15 = br void %return" [div.cpp:15] ---> Operation 49 'br' 'br_ln15' +ST_1 : Operation 50 [1/1] (1.42ns) ---> "%data_in_0_read = nbread i33 @_ssdm_op_NbRead.ap_fifo.volatile.i32P0A, i32 %data_in_0" 
[div.cpp:18] ---> Operation 50 'nbread' 'data_in_0_read' ---> Core 78 'FIFO' +ST_1 : Operation 51 [1/1] (0.00ns) ---> "%in0 = extractvalue i33 %data_in_0_read" [div.cpp:18] ---> Operation 51 'extractvalue' 'in0' +ST_1 : Operation 52 [1/1] (1.42ns) ---> "%data_in_1_read = nbread i33 @_ssdm_op_NbRead.ap_fifo.volatile.i32P0A, i32 %data_in_1" [div.cpp:19] ---> Operation 52 'nbread' 'data_in_1_read' ---> Core 78 'FIFO' +ST_1 : Operation 53 [1/1] (0.00ns) ---> "%in1 = extractvalue i33 %data_in_1_read" [div.cpp:19] ---> Operation 53 'extractvalue' 'in1' +ST_1 : Operation 54 [36/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 54 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 2 +ST_2 : Operation 55 [35/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 55 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 3 +ST_3 : Operation 56 [34/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 56 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 4 +ST_4 : Operation 57 [33/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 57 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 5 +ST_5 : Operation 58 [32/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 58 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 6 +ST_6 : Operation 59 [31/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 59 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 7 +ST_7 : Operation 60 [30/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 60 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 8 +ST_8 : Operation 61 [29/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 61 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 9 +ST_9 : Operation 62 [28/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 62 'sdiv' 
'sdiv_ln20' ---> Core 6 'Divider' + +State 10 +ST_10 : Operation 63 [27/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 63 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 11 +ST_11 : Operation 64 [26/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 64 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 12 +ST_12 : Operation 65 [25/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 65 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 13 +ST_13 : Operation 66 [24/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 66 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 14 +ST_14 : Operation 67 [23/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 67 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 15 +ST_15 : Operation 68 [22/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 68 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 16 +ST_16 : Operation 69 [21/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 69 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 17 +ST_17 : Operation 70 [20/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 70 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 18 +ST_18 : Operation 71 [19/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 71 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 19 +ST_19 : Operation 72 [18/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 72 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 20 +ST_20 : Operation 73 [17/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 73 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 21 +ST_21 : Operation 74 [16/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 74 
'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 22 +ST_22 : Operation 75 [15/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 75 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 23 +ST_23 : Operation 76 [14/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 76 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 24 +ST_24 : Operation 77 [13/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 77 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 25 +ST_25 : Operation 78 [12/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 78 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 26 +ST_26 : Operation 79 [11/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 79 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 27 +ST_27 : Operation 80 [10/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 80 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 28 +ST_28 : Operation 81 [9/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 81 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 29 +ST_29 : Operation 82 [8/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 82 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 30 +ST_30 : Operation 83 [7/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 83 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 31 +ST_31 : Operation 84 [6/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 84 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 32 +ST_32 : Operation 85 [5/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 85 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 33 +ST_33 : Operation 86 [4/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 86 
'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 34 +ST_34 : Operation 87 [3/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 87 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 35 +ST_35 : Operation 88 [2/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 88 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' + +State 36 +ST_36 : Operation 89 [1/36] (1.16ns) ---> "%sdiv_ln20 = sdiv i32 %in0, i32 %in1" [div.cpp:20] ---> Operation 89 'sdiv' 'sdiv_ln20' ---> Core 6 'Divider' +ST_36 : Operation 90 [1/1] (0.00ns) ---> "%res = trunc i16 %sdiv_ln20" [div.cpp:20] ---> Operation 90 'trunc' 'res' +ST_36 : Operation 91 [1/1] (1.42ns) ---> "%empty = nbwrite i1 @_ssdm_op_NbWrite.ap_fifo.volatile.i16P0A, i16 %data_out_0, i16 %res" [div.cpp:22] ---> Operation 91 'nbwrite' 'empty' ---> Core 78 'FIFO' +ST_36 : Operation 92 [1/1] (0.00ns) ---> "%br_ln23 = br void %return" [div.cpp:23] ---> Operation 92 'br' 'br_ln23' +ST_36 : Operation 93 [1/1] (0.00ns) ---> "%ret_ln23 = ret" [div.cpp:23] ---> Operation 93 'ret' 'ret_ln23' + + +============================================================ ++ Verbose Summary: Timing violations +============================================================ +Target clock period: 10.000ns, clock uncertainty: 2.700ns. 
+ + : 2.593ns +The critical path consists of the following: + fifo read operation ('data_in_0_read', div.cpp:18) on port 'data_in_0' (div.cpp:18) [20] (1.428 ns) + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation 
('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The 
critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 1.165ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + + : 2.593ns +The critical path consists of the following: + 'sdiv' operation ('sdiv_ln20', div.cpp:20) [24] (1.165 ns) + fifo write operation ('empty', div.cpp:22) on port 'data_out_0' (div.cpp:22) [26] (1.428 ns) + + +============================================================ ++ Verbose Summary: Binding +============================================================ +N/A +* FSMD analyzer results: + - Output states: + - Input state : + - Chain level: + State 1 + State 2 + State 3 + State 4 + State 5 + State 6 + State 7 + State 8 + State 9 + State 10 + State 11 + State 12 + State 13 + State 14 + State 15 + State 16 + State 17 + State 18 + State 19 + State 20 + State 21 + State 22 + State 23 + State 24 + State 25 + State 26 + State 27 + State 28 + State 29 + State 30 + State 31 + State 32 + State 33 + State 34 + State 35 + State 36 + + +============================================================ ++ Verbose Summary: Datapath Resource usage +============================================================ +N/A diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.sched.rpt.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.sched.rpt.xml new file mode 100644 index 000000000..bcfcc444a --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div.verbose.sched.rpt.xml @@ -0,0 +1,45 @@ + + +
+Sun Aug 4 22:51:43 2024 + +2023.1 (Build 3854077 on May 4 2023) +prj +solution1 (Vivado IP Flow Target) +virtexuplus +xcu250-figd2104-2L-e +
+ +
+ +
+ +Clock, Target, Estimated, Uncertainty +10.00 ns, 2.593 ns, 2.70 ns +
+
+
+
+ +
+ +, min, max, min, max, min, max, Type +35, 35, 0.350 us, 0.350 us, 1, 1, yes +
+
+ +
+ +Instance, Module, min, max, min, max, min, max, Type +
+
+ +Loop Name, min, max, Latency, achieved, target, Count, Pipelined +
+
+
+
+
+
+
+
diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div_dataflow_ana.wcfg b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div_dataflow_ana.wcfg new file mode 100644 index 000000000..fdad874ea --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/div_dataflow_ana.wcfg @@ -0,0 +1,66 @@ + + + + + + + + + + + + + + + + + + + + + + + HLS Process Summary + label + + AESL_inst_div_activity + label + true + AESL_inst_div_activity + AESL_inst_div_activity + UNSIGNEDDECRADIX + STYLE_ENUM_TRANSACTION + 0=blank ffff=blank 0bad=#FF0000 + true + turquoise + 65535=blank + /apatb_div_top/AESL_inst_div//AESL_inst_div_activity.dataflowTxWaveData.Cycles + /apatb_div_top/AESL_inst_div//AESL_inst_div_activity.dataflowTxWaveData.OutStanding + /apatb_div_top/AESL_inst_div//AESL_inst_div_activity.dataflowTxTooltipData + + + + AESL_inst_div_activity + AESL_inst_div_activity + + + AESL_inst_div + label + + Block-level handshake + label + + ap_start + ap_start + + + ap_ready + ap_ready + + + ap_done + ap_done + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/divap_header.systemc.txt.ap_header.txt b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/divap_header.systemc.txt.ap_header.txt new file mode 100644 index 000000000..568afffec --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/divap_header.systemc.txt.ap_header.txt @@ -0,0 +1,6 @@ +// ============================================================== +// Generated by Vitis HLS v2023.1 +// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
+// ============================================================== + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/divap_header.verilog.txt.ap_header.txt b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/divap_header.verilog.txt.ap_header.txt new file mode 100644 index 000000000..568afffec --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/divap_header.verilog.txt.ap_header.txt @@ -0,0 +1,6 @@ +// ============================================================== +// Generated by Vitis HLS v2023.1 +// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. +// ============================================================== + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/divap_header.vhdl.txt.ap_header.txt b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/divap_header.vhdl.txt.ap_header.txt new file mode 100644 index 000000000..584f78604 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/divap_header.vhdl.txt.ap_header.txt @@ -0,0 +1,6 @@ +-- ============================================================== +-- Generated by Vitis HLS v2023.1 +-- Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +-- Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
+-- ============================================================== + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/dsp_style b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/dsp_style new file mode 100644 index 000000000..5103aff30 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/dsp_style @@ -0,0 +1 @@ +DSP48E2 \ No newline at end of file diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/fe_messages.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/fe_messages.xml new file mode 100644 index 000000000..733e6c1e5 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/fe_messages.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/fe_pragma_dump.reflow.0.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/fe_pragma_dump.reflow.0.xml new file mode 100644 index 000000000..ad99778f6 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/fe_pragma_dump.reflow.0.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/global.setting.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/global.setting.tcl new file mode 100644 index 000000000..3437327a2 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/global.setting.tcl @@ -0,0 +1,65 @@ + +set TopModule "div" +set ClockPeriod 10 +set ClockList ap_clk +set HasVivadoClockPeriod 0 +set CombLogicFlag 0 +set PipelineFlag 1 +set DataflowTaskPipelineFlag 1 +set TrivialPipelineFlag 0 +set noPortSwitchingFlag 0 +set FloatingPointFlag 0 +set FftOrFirFlag 0 +set NbRWValue 3 +set intNbAccess 0 +set NewDSPMapping 1 +set HasDSPModule 0 +set ResetLevelFlag 1 +set 
ResetStyle control +set ResetSyncFlag 1 +set ResetRegisterFlag 0 +set ResetVariableFlag 0 +set ResetRegisterNum 0 +set FsmEncStyle onehot +set MaxFanout 0 +set RtlPrefix {} +set RtlSubPrefix div_ +set ExtraCCFlags {} +set ExtraCLdFlags {} +set SynCheckOptions {} +set PresynOptions {} +set PreprocOptions {} +set SchedOptions {} +set BindOptions {} +set RtlGenOptions {} +set RtlWriterOptions {} +set CbcGenFlag {} +set CasGenFlag {} +set CasMonitorFlag {} +set AutoSimOptions {} +set ExportMCPathFlag 0 +set SCTraceFileName mytrace +set SCTraceFileFormat vcd +set SCTraceOption all +set TargetInfo xcu250:-figd2104:-2L-e +set SourceFiles {sc {} c ../../div.cpp} +set SourceFlags {sc {} c {{}}} +set DirectiveFile {} +set TBFiles {bc {} c {} sc {} cas {} vhdl {} verilog {}} +set SpecLanguage C +set TVInFiles {bc {} c {} sc {} cas {} vhdl {} verilog {}} +set TVOutFiles {bc {} c {} sc {} cas {} vhdl {} verilog {}} +set TBTops {bc "" c "" sc "" cas "" vhdl "" verilog ""} +set TBInstNames {bc "" c "" sc "" cas "" vhdl "" verilog ""} +set XDCFiles {} +set ExtraGlobalOptions {"area_timing" 1 "clock_gate" 1 "impl_flow" map "power_gate" 0} +set TBTVFileNotFound {} +set AppFile ../hls.app +set ApsFile solution1.aps +set AvePath ../../. 
+set DefaultPlatform DefaultPlatform +set multiClockList {} +set SCPortClockMap {} +set intNbAccess 0 +set PlatformFiles {{DefaultPlatform {xilinx/virtexuplus/virtexuplus}}} +set HPFPO 0 diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/hls_design_meta.cpp b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/hls_design_meta.cpp new file mode 100644 index 000000000..167b76158 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/hls_design_meta.cpp @@ -0,0 +1,19 @@ +#include "hls_design_meta.h" +const Port_Property HLS_Design_Meta::port_props[]={ + Port_Property("ap_clk", 1, hls_in, -1, "", "", 1), + Port_Property("ap_rst", 1, hls_in, -1, "", "", 1), + Port_Property("ap_start", 1, hls_in, -1, "", "", 1), + Port_Property("ap_done", 1, hls_out, -1, "", "", 1), + Port_Property("ap_idle", 1, hls_out, -1, "", "", 1), + Port_Property("ap_ready", 1, hls_out, -1, "", "", 1), + Port_Property("data_in_0_dout", 32, hls_in, 0, "ap_fifo", "fifo_port_we", 2), + Port_Property("data_in_0_empty_n", 1, hls_in, 0, "ap_fifo", "fifo_status", 2), + Port_Property("data_in_0_read", 1, hls_out, 0, "ap_fifo", "fifo_data", 2), + Port_Property("data_in_1_dout", 32, hls_in, 1, "ap_fifo", "fifo_port_we", 2), + Port_Property("data_in_1_empty_n", 1, hls_in, 1, "ap_fifo", "fifo_status", 2), + Port_Property("data_in_1_read", 1, hls_out, 1, "ap_fifo", "fifo_data", 2), + Port_Property("data_out_0_din", 16, hls_out, 2, "ap_fifo", "fifo_port_we", 1), + Port_Property("data_out_0_full_n", 1, hls_in, 2, "ap_fifo", "fifo_status", 1), + Port_Property("data_out_0_write", 1, hls_out, 2, "ap_fifo", "fifo_data", 1), +}; +const char* HLS_Design_Meta::dut_name = "div"; diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/hls_design_meta.h b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/hls_design_meta.h new file mode 100644 index 000000000..f9b955097 --- /dev/null +++ 
b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/hls_design_meta.h @@ -0,0 +1,13 @@ +#ifndef HLS_DESIGN_META_H +#define HLS_DESIGN_META_H +#include "hls_design.h" + +struct HLS_Design_Meta { +// port data: name, bitwidth, direction (enumerator: hls_in, hls_out, hls_inout), group. +static const Port_Property port_props[15]; +static const HLS_INT32 latency = 35; +static const HLS_INT32 II = 1; +static const char* dut_name; +static bool is_vld_port(HLS_UINT32); +}; +#endif diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/hls_design_meta.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/hls_design_meta.tcl new file mode 100644 index 000000000..7d15fb51c --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/hls_design_meta.tcl @@ -0,0 +1,2 @@ +set design_latency 35 +set design_II 1 diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/kernel_module_hierarchy.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/kernel_module_hierarchy.tcl new file mode 100644 index 000000000..f7bace611 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/kernel_module_hierarchy.tcl @@ -0,0 +1,3 @@ +set ModuleHierarchy {[{ +"Name" : "div","ID" : "0","Type" : "pipeline" +}]} \ No newline at end of file diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/mapper_div.cpp b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/mapper_div.cpp new file mode 100644 index 000000000..f6f9fc86f --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/mapper_div.cpp @@ -0,0 +1,373 @@ +#include "hls_signal_handler.h" +#include +#include +#include +#include +#include +#include +#include +#include "ap_fixed.h" +#include "ap_int.h" +#include "hls_stream.h" +using namespace std; + +namespace hls::sim +{ + 
template + struct Byte { + unsigned char a[n]; + + Byte() + { + for (size_t i = 0; i < n; ++i) { + a[i] = 0; + } + } + + template + Byte& operator= (const T &val) + { + std::memcpy(a, &val, n); + return *this; + } + }; + + struct SimException : public std::exception { + const std::string msg; + const size_t line; + SimException(const std::string &msg, const size_t line) + : msg(msg), line(line) + { + } + }; + + void errExit(const size_t line, const std::string &msg) + { + std::string s; + s += "ERROR"; +// s += '('; +// s += __FILE__; +// s += ":"; +// s += std::to_string(line); +// s += ')'; + s += ": "; + s += msg; + s += "\n"; + fputs(s.c_str(), stderr); + exit(1); + } +} + + +namespace hls::sim +{ + template + void move(void* to, void* from) + { + auto t = (hls::stream>*)to; + auto f = (hls::stream>*)from; + while (!f->empty()) { + t->write(f->read()); + } + } + + template + void task_move(void* to, void* from) + { + auto t = (hls::stream>*)to; + auto f = (hls::stream>*)from; + std::thread( + [=] () { while (true) { t->write(f->read()); } } + ).detach(); + } + + template + struct MoveAXIS + { + struct ST { A data; K keep; S strb; U user; L last; I id; E dest; }; + + static void toSC(void* data, void* keep, void* strb, void* user, void* last, void* id, void* dest, void* axis) + { + ST st; + ((hls::stream*)axis)->read(st); + ((hls::stream
*)data)->write(st.data); + ((hls::stream*)keep)->write(st.keep); + ((hls::stream*)strb)->write(st.strb); + ((hls::stream*)user)->write(st.user); + ((hls::stream*)last)->write(st.last); + ((hls::stream*)id)->write(st.id); + ((hls::stream*)dest)->write(st.dest); + } + + static void fromSC(void* data, void* keep, void* strb, void* user, void* last, void* id, void* dest, void* axis) + { + ST st; + ((hls::stream*)data)->read(st.data); + ((hls::stream*)keep)->read(st.keep); + ((hls::stream*)strb)->read(st.strb); + ((hls::stream*)user)->read(st.user); + ((hls::stream*)last)->read(st.last); + ((hls::stream*)id)->read(st.id); + ((hls::stream*)dest)->read(st.dest); + ((hls::stream*)axis)->write(st); + } + }; + + template + void move_to_SC(void* data, void* keep, void* strb, void* user, void* last, + void* id, void* dest, void* axis) + { + typedef MoveAXIS, ap_uint, ap_uint, + ap_uint, ap_uint, ap_uint, + ap_uint> M; + while (!((hls::stream*)axis)->empty()) { + M::toSC(data, keep, strb, user, last, id, dest, axis); + } + } + + template + void task_move_to_SC(void* data, void* keep, void* strb, void* user, void* last, + void* id, void* dest, void* axis) + { + typedef MoveAXIS, ap_uint, ap_uint, + ap_uint, ap_uint, ap_uint, + ap_uint> M; + std::thread( + [=] () { while (true) M::toSC(data, keep, strb, user, last, id, dest, axis); } + ).detach(); + } + + template + void move_from_SC(void* axis, void* data, void* keep, void* strb, void* user, void* last, + void* id, void* dest) + { + typedef MoveAXIS, ap_uint, ap_uint, + ap_uint, ap_uint, ap_uint, + ap_uint> M; + while (!((hls::stream>*)data)->empty()) { + M::fromSC(data, keep, strb, user, last, id, dest, axis); + } + } + + template + void task_move_from_SC(void* axis, void* data, void* keep, void* strb, void* user, void* last, + void* id, void* dest) + { + typedef MoveAXIS, ap_uint, ap_uint, + ap_uint, ap_uint, ap_uint, + ap_uint> M; + std::thread( + [=] () { while (true) M::fromSC(data, keep, strb, user, last, id, dest, axis); 
} + ).detach(); + } +} + + +namespace hls::sim +{ + struct Buffer { + char *first; + Buffer(char *addr) : first(addr) + { + } + }; + + struct DBuffer : public Buffer { + static const size_t total = 1<<10; + size_t ufree; + + DBuffer(size_t usize) : Buffer(nullptr), ufree(total) + { + first = new char[usize*ufree]; + } + + ~DBuffer() + { + delete[] first; + } + }; + + struct CStream { + char *front; + char *back; + size_t num; + size_t usize; + std::list bufs; + bool dynamic; + + CStream() : front(nullptr), back(nullptr), + num(0), usize(0), dynamic(true) + { + } + + ~CStream() + { + for (Buffer *p : bufs) { + delete p; + } + } + + template + T* data() + { + return (T*)front; + } + + template + void transfer(hls::stream *param) + { + while (!empty()) { + param->write(*(T*)nextRead()); + } + } + + bool empty(); + char* nextRead(); + char* nextWrite(); + }; + + bool CStream::empty() + { + return num == 0; + } + + char* CStream::nextRead() + { + assert(num > 0); + char *res = front; + front += usize; + if (dynamic) { + if (++static_cast(bufs.front())->ufree == DBuffer::total) { + if (bufs.size() > 1) { + bufs.pop_front(); + front = bufs.front()->first; + } else { + front = back = bufs.front()->first; + } + } + } + --num; + return res; + } + + char* CStream::nextWrite() + { + if (dynamic) { + if (static_cast(bufs.back())->ufree == 0) { + bufs.push_back(new DBuffer(usize)); + back = bufs.back()->first; + } + --static_cast(bufs.back())->ufree; + } + char *res = back; + back += usize; + ++num; + return res; + } + + std::list streams; + std::map prebuilt; + + CStream* createStream(size_t usize) + { + streams.emplace_front(); + CStream &s = streams.front(); + { + s.dynamic = true; + s.bufs.push_back(new DBuffer(usize)); + s.front = s.bufs.back()->first; + s.back = s.front; + s.num = 0; + s.usize = usize; + } + return &s; + } + + template + CStream* createStream(hls::stream *param) + { + CStream *s = createStream(sizeof(T)); + { + s->dynamic = true; + while (!param->empty()) 
{ + T data = param->read(); + memcpy(s->nextWrite(), (char*)&data, sizeof(T)); + } + prebuilt[s->front] = s; + } + return s; + } + + template + CStream* createStream(T *param, size_t usize) + { + streams.emplace_front(); + CStream &s = streams.front(); + { + s.dynamic = false; + s.bufs.push_back(new Buffer((char*)param)); + s.front = s.back = s.bufs.back()->first; + s.usize = usize; + s.num = ~0UL; + } + prebuilt[s.front] = &s; + return &s; + } + + CStream* findStream(char *buf) + { + return prebuilt.at(buf); + } +} +class AESL_RUNTIME_BC { + public: + AESL_RUNTIME_BC(const char* name) { + file_token.open( name); + if (!file_token.good()) { + cout << "Failed to open tv file " << name << endl; + exit (1); + } + file_token >> mName;//[[[runtime]]] + } + ~AESL_RUNTIME_BC() { + file_token.close(); + } + int read_size () { + int size = 0; + file_token >> mName;//[[transaction]] + file_token >> mName;//transaction number + file_token >> mName;//pop_size + size = atoi(mName.c_str()); + file_token >> mName;//[[/transaction]] + return size; + } + public: + fstream file_token; + string mName; +}; +unsigned int ap_apatb_data_in_0_cap_bc; +static AESL_RUNTIME_BC __xlx_data_in_0_V_size_Reader("../tv/stream_size/stream_size_in_data_in_0.dat"); +unsigned int ap_apatb_data_in_1_cap_bc; +static AESL_RUNTIME_BC __xlx_data_in_1_V_size_Reader("../tv/stream_size/stream_size_in_data_in_1.dat"); +unsigned int ap_apatb_data_out_0_cap_bc; +static AESL_RUNTIME_BC __xlx_data_out_0_V_size_Reader("../tv/stream_size/stream_size_out_data_out_0.dat"); +using hls::sim::Byte; +extern "C" void div(int*, int*, short*); +extern "C" void apatb_div_hw(volatile void * __xlx_apatb_param_data_in_0, volatile void * __xlx_apatb_param_data_in_1, volatile void * __xlx_apatb_param_data_out_0) { +using hls::sim::createStream; +auto* sdata_in_0 = createStream((hls::stream*)__xlx_apatb_param_data_in_0); +auto* sdata_in_1 = createStream((hls::stream*)__xlx_apatb_param_data_in_1); + //Create input buffer for 
data_out_0 + ap_apatb_data_out_0_cap_bc = __xlx_data_out_0_V_size_Reader.read_size(); + short* __xlx_data_out_0_input_buffer= new short[ap_apatb_data_out_0_cap_bc]; +auto* sdata_out_0 = createStream((hls::stream*)__xlx_apatb_param_data_out_0); + // DUT call + div(sdata_in_0->data(), sdata_in_1->data(), sdata_out_0->data()); +sdata_in_0->transfer((hls::stream*)__xlx_apatb_param_data_in_0); +sdata_in_1->transfer((hls::stream*)__xlx_apatb_param_data_in_1); +sdata_out_0->transfer((hls::stream*)__xlx_apatb_param_data_out_0); +} diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/top-io-be.tcl b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/top-io-be.tcl new file mode 100644 index 000000000..e69de29bb diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/top-io-fe.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/top-io-fe.xml new file mode 100644 index 000000000..d425f6f3d --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/top-io-fe.xml @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/vhdl/div_sdiv_32ns_32ns_16_36_1.vhd b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/vhdl/div_sdiv_32ns_32ns_16_36_1.vhd new file mode 100644 index 000000000..ab2e84837 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/vhdl/div_sdiv_32ns_32ns_16_36_1.vhd @@ -0,0 +1,198 @@ +-- ============================================================== +-- Generated by Vitis HLS v2023.1 +-- Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +-- Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
+-- ============================================================== +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +entity div_sdiv_32ns_32ns_16_36_1_divider is + generic ( + in0_WIDTH : INTEGER :=32; + in1_WIDTH : INTEGER :=32; + out_WIDTH : INTEGER :=32); + port ( + clk : in STD_LOGIC; + reset : in STD_LOGIC; + ce : in STD_LOGIC; + dividend : in STD_LOGIC_VECTOR(in0_WIDTH-1 downto 0); + divisor : in STD_LOGIC_VECTOR(in1_WIDTH-1 downto 0); + sign_i : in STD_LOGIC_VECTOR(1 downto 0); + sign_o : out STD_LOGIC_VECTOR(1 downto 0); + quot : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0); + remd : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0)); + + function max (left, right : INTEGER) return INTEGER is + begin + if left > right then return left; + else return right; + end if; + end max; + +end entity; + +architecture rtl of div_sdiv_32ns_32ns_16_36_1_divider is + constant cal_WIDTH : INTEGER := max(in0_WIDTH, in1_WIDTH); + type in0_vector is array(INTEGER range <>) of UNSIGNED(in0_WIDTH-1 downto 0); + type in1_vector is array(INTEGER range <>) of UNSIGNED(in1_WIDTH-1 downto 0); + type cal_vector is array(INTEGER range <>) of UNSIGNED(cal_WIDTH downto 0); + type sign_vector is array(INTEGER range <>) of UNSIGNED(1 downto 0); + + signal dividend_tmp : in0_vector(0 to in0_WIDTH); + signal divisor_tmp : in1_vector(0 to in0_WIDTH); + signal remd_tmp : in0_vector(0 to in0_WIDTH); + signal comb_tmp : in0_vector(0 to in0_WIDTH-1); + signal cal_tmp : cal_vector(0 to in0_WIDTH-1); + signal sign_tmp : sign_vector(0 to in0_WIDTH); +begin + quot <= STD_LOGIC_VECTOR(RESIZE(dividend_tmp(in0_WIDTH), out_WIDTH)); + remd <= STD_LOGIC_VECTOR(RESIZE(remd_tmp(in0_WIDTH), out_WIDTH)); + sign_o <= STD_LOGIC_VECTOR(sign_tmp(in0_WIDTH)); + + tran_tmp_proc : process (clk) + begin + if (clk'event and clk='1') then + if (ce = '1') then + dividend_tmp(0) <= UNSIGNED(dividend); + divisor_tmp(0) <= UNSIGNED(divisor); + sign_tmp(0) <= UNSIGNED(sign_i); + remd_tmp(0) <= (others => 
'0'); + end if; + end if; + end process tran_tmp_proc; + + run_proc: for i in 0 to in0_WIDTH-1 generate + begin + comb_tmp(i) <= remd_tmp(i)(in0_WIDTH-2 downto 0) & dividend_tmp(i)(in0_WIDTH-1); + cal_tmp(i) <= ('0' & comb_tmp(i)) - ('0' & divisor_tmp(i)); + + process (clk) + begin + if (clk'event and clk='1') then + if (ce = '1') then + dividend_tmp(i+1) <= dividend_tmp(i)(in0_WIDTH-2 downto 0) & (not cal_tmp(i)(cal_WIDTH)); + divisor_tmp(i+1) <= divisor_tmp(i); + sign_tmp(i+1) <= sign_tmp(i); + if cal_tmp(i)(cal_WIDTH) = '1' then + remd_tmp(i+1) <= comb_tmp(i); + else + remd_tmp(i+1) <= cal_tmp(i)(in0_WIDTH-1 downto 0); + end if; + end if; + end if; + end process; + end generate run_proc; + +end architecture; + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +entity div_sdiv_32ns_32ns_16_36_1 is + generic ( + ID : INTEGER :=1; + NUM_STAGE : INTEGER :=2; + din0_WIDTH : INTEGER :=32; + din1_WIDTH : INTEGER :=32; + dout_WIDTH : INTEGER :=32); + port ( + clk : in STD_LOGIC; + reset : in STD_LOGIC; + ce : in STD_LOGIC; + din0 : in STD_LOGIC_VECTOR(din0_WIDTH-1 downto 0); + din1 : in STD_LOGIC_VECTOR(din1_WIDTH-1 downto 0); + dout : out STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0)); +end entity; + +architecture rtl of div_sdiv_32ns_32ns_16_36_1 is + component div_sdiv_32ns_32ns_16_36_1_divider is + generic ( + in0_WIDTH : INTEGER :=32; + in1_WIDTH : INTEGER :=32; + out_WIDTH : INTEGER :=32); + port ( + reset : in STD_LOGIC; + clk : in STD_LOGIC; + ce : in STD_LOGIC; + dividend : in STD_LOGIC_VECTOR(in0_WIDTH-1 downto 0); + divisor : in STD_LOGIC_VECTOR(in1_WIDTH-1 downto 0); + sign_i : in STD_LOGIC_VECTOR(1 downto 0); + sign_o : out STD_LOGIC_VECTOR(1 downto 0); + quot : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0); + remd : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0)); + end component; + + signal dividend0 : STD_LOGIC_VECTOR(din0_WIDTH-1 downto 0); + signal divisor0 : STD_LOGIC_VECTOR(din1_WIDTH-1 downto 0); + signal dividend_u : 
STD_LOGIC_VECTOR(din0_WIDTH-1 downto 0); + signal divisor_u : STD_LOGIC_VECTOR(din1_WIDTH-1 downto 0); + signal quot_u : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal remd_u : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal quot : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal remd : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal sign_i : STD_LOGIC_VECTOR(1 downto 0); + signal sign_o : STD_LOGIC_VECTOR(1 downto 0); +begin + div_sdiv_32ns_32ns_16_36_1_divider_u : div_sdiv_32ns_32ns_16_36_1_divider + generic map( + in0_WIDTH => din0_WIDTH, + in1_WIDTH => din1_WIDTH, + out_WIDTH => dout_WIDTH) + port map( + clk => clk, + reset => reset, + ce => ce, + dividend => dividend_u, + divisor => divisor_u, + sign_i => sign_i, + sign_o => sign_o, + quot => quot_u, + remd => remd_u); + + sign_i <= (dividend0(din0_WIDTH-1) xor divisor0(din1_WIDTH-1)) & dividend0(din0_WIDTH-1); + dividend_u <= STD_LOGIC_VECTOR(UNSIGNED(not dividend0) + 1) when dividend0(din0_WIDTH-1) = '1' else dividend0; + divisor_u <= STD_LOGIC_VECTOR(UNSIGNED(not divisor0) + 1) when divisor0(din1_WIDTH-1) = '1' else divisor0; + +process (clk) +begin + if (clk'event and clk = '1') then + if (ce = '1') then + dividend0 <= din0; + divisor0 <= din1; + end if; + end if; +end process; + +process (clk) +begin + if (clk'event and clk = '1') then + if (ce = '1') then + if (sign_o(1) = '1') then + quot <= STD_LOGIC_VECTOR(UNSIGNED(not quot_u) + 1); + else + quot <= quot_u; + end if; + end if; + end if; +end process; + +process (clk) +begin + if (clk'event and clk = '1') then + if (ce = '1') then + if (sign_o(0) = '1') then + remd <= STD_LOGIC_VECTOR(UNSIGNED(not remd_u) + 1); + else + remd <= remd_u; + end if; + end if; + end if; +end process; + +dout <= quot; + +end architecture; + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/vlog/div_sdiv_32ns_32ns_16_36_1.v b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/vlog/div_sdiv_32ns_32ns_16_36_1.v new 
file mode 100644 index 000000000..765d3dc70 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.autopilot/db/vlog/div_sdiv_32ns_32ns_16_36_1.v @@ -0,0 +1,156 @@ +// ============================================================== +// Generated by Vitis HLS v2023.1 +// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. +// ============================================================== +`timescale 1 ns / 1 ps + +module div_sdiv_32ns_32ns_16_36_1_divider +#(parameter + in0_WIDTH = 32, + in1_WIDTH = 32, + out_WIDTH = 32 +) +( + input clk, + input reset, + input ce, + input [in0_WIDTH-1:0] dividend, + input [in1_WIDTH-1:0] divisor, + input [1:0] sign_i, + output wire [1:0] sign_o, + output wire [out_WIDTH-1:0] quot, + output wire [out_WIDTH-1:0] remd +); + +localparam cal_WIDTH = (in0_WIDTH > in1_WIDTH)? in0_WIDTH : in1_WIDTH; + +//------------------------Local signal------------------- +reg [in0_WIDTH-1:0] dividend_tmp[0:in0_WIDTH]; +reg [in1_WIDTH-1:0] divisor_tmp[0:in0_WIDTH]; +reg [in0_WIDTH-1:0] remd_tmp[0:in0_WIDTH]; +wire [in0_WIDTH-1:0] comb_tmp[0:in0_WIDTH-1]; +wire [cal_WIDTH:0] cal_tmp[0:in0_WIDTH-1]; +reg [1:0] sign_tmp[0:in0_WIDTH]; +//------------------------Body--------------------------- +assign quot = dividend_tmp[in0_WIDTH]; +assign remd = remd_tmp[in0_WIDTH]; +assign sign_o = sign_tmp[in0_WIDTH]; + +// dividend_tmp[0], divisor_tmp[0], remd_tmp[0] +always @(posedge clk) +begin + if (ce) begin + dividend_tmp[0] <= dividend; + divisor_tmp[0] <= divisor; + sign_tmp[0] <= sign_i; + remd_tmp[0] <= 1'b0; + end +end + +genvar i; +generate + for (i = 0; i < in0_WIDTH; i = i + 1) + begin : loop + if (in0_WIDTH == 1) assign comb_tmp[i] = dividend_tmp[i][0]; + else assign comb_tmp[i] = {remd_tmp[i][in0_WIDTH-2:0], dividend_tmp[i][in0_WIDTH-1]}; + assign cal_tmp[i] = {1'b0, comb_tmp[i]} - {1'b0, divisor_tmp[i]}; + + always @(posedge clk) + begin + if (ce) begin + 
if (in0_WIDTH == 1) dividend_tmp[i+1] <= ~cal_tmp[i][cal_WIDTH]; + else dividend_tmp[i+1] <= {dividend_tmp[i][in0_WIDTH-2:0], ~cal_tmp[i][cal_WIDTH]}; + divisor_tmp[i+1] <= divisor_tmp[i]; + remd_tmp[i+1] <= cal_tmp[i][cal_WIDTH]? comb_tmp[i] : cal_tmp[i][in0_WIDTH-1:0]; + sign_tmp[i+1] <= sign_tmp[i]; + end + end + end +endgenerate + +endmodule + +module div_sdiv_32ns_32ns_16_36_1 +#(parameter + ID = 1, + NUM_STAGE = 2, + din0_WIDTH = 32, + din1_WIDTH = 32, + dout_WIDTH = 32 +) +( + input clk, + input reset, + input ce, + input [din0_WIDTH-1:0] din0, + input [din1_WIDTH-1:0] din1, + output [dout_WIDTH-1:0] dout +); +//------------------------Local signal------------------- +reg [din0_WIDTH-1:0] dividend0; +reg [din1_WIDTH-1:0] divisor0; +wire [din0_WIDTH-1:0] dividend_u; +wire [din1_WIDTH-1:0] divisor_u; +wire [dout_WIDTH-1:0] quot_u; +wire [dout_WIDTH-1:0] remd_u; +reg [dout_WIDTH-1:0] quot; +reg [dout_WIDTH-1:0] remd; +wire [1:0] sign_i; +wire [1:0] sign_o; +//------------------------Instantiation------------------ +div_sdiv_32ns_32ns_16_36_1_divider #( + .in0_WIDTH ( din0_WIDTH ), + .in1_WIDTH ( din1_WIDTH ), + .out_WIDTH ( dout_WIDTH ) +) div_sdiv_32ns_32ns_16_36_1_divider_u ( + .clk ( clk ), + .reset ( reset ), + .ce ( ce ), + .dividend ( dividend_u ), + .divisor ( divisor_u ), + .sign_i ( sign_i ), + .sign_o ( sign_o ), + .quot ( quot_u ), + .remd ( remd_u ) +); +//------------------------Body--------------------------- +assign sign_i = {dividend0[din0_WIDTH-1] ^ divisor0[din1_WIDTH-1], dividend0[din0_WIDTH-1]}; +assign dividend_u = dividend0[din0_WIDTH-1]? ~dividend0[din0_WIDTH-1:0] + 1'b1 : + dividend0[din0_WIDTH-1:0]; +assign divisor_u = divisor0[din1_WIDTH-1]? 
~divisor0[din1_WIDTH-1:0] + 1'b1 : + divisor0[din1_WIDTH-1:0]; + +always @(posedge clk) +begin + if (ce) begin + dividend0 <= din0; + divisor0 <= din1; + end +end + +always @(posedge clk) +begin + if (ce) begin + if (sign_o[1]) + quot <= ~quot_u + 1'b1; + else + quot <= quot_u; + end +end + +always @(posedge clk) +begin + if (ce) begin + if (sign_o[0]) + remd <= ~remd_u + 1'b1; + else + remd <= remd_u; + end +end + +assign dout = quot; + +endmodule + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.debug/div.protoinst b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.debug/div.protoinst new file mode 100644 index 000000000..9cb1cf8fa --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/.debug/div.protoinst @@ -0,0 +1,19 @@ +{ + "version": "1.0", + "modules": { + "div": { + "proto_instances": { + "/AESL_inst_div_activity": { + "interface": "xilinx.com:interface:internal_hls_dataflow:1.0", + "ports": { + "AP_CLK": { "actual": "ap_clk"}, + "AP_DONE": { "actual": "ap_done"}, + "AP_READY": { "actual": "ap_ready"}, + "AP_RESET": { "actual": "ap_rst"}, + "AP_START": { "actual": "ap_start"} + } + } + } + } + } +} diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/misc/logo.png b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/misc/logo.png new file mode 100755 index 000000000..f490ef0fb Binary files /dev/null and b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/misc/logo.png differ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/verilog/div.v b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/verilog/div.v new file mode 100644 index 000000000..4476f0432 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/verilog/div.v @@ -0,0 +1,887 @@ +// ============================================================== +// Generated by Vitis HLS v2023.1 +// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. 
+// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. +// ============================================================== + +`timescale 1 ns / 1 ps + +(* CORE_GENERATION_INFO="div_div,hls_ip_2023_1,{HLS_INPUT_TYPE=cxx,HLS_INPUT_FLOAT=0,HLS_INPUT_FIXED=0,HLS_INPUT_PART=xcu250-figd2104-2L-e,HLS_INPUT_CLOCK=10.000000,HLS_INPUT_ARCH=pipeline,HLS_SYN_CLOCK=2.593000,HLS_SYN_LAT=35,HLS_SYN_TPT=1,HLS_SYN_MEM=0,HLS_SYN_DSP=0,HLS_SYN_FF=2449,HLS_SYN_LUT=1808,HLS_VERSION=2023_1}" *) + +module div ( + ap_clk, + ap_rst, + ap_start, + ap_done, + ap_idle, + ap_ready, + data_in_0_dout, + data_in_0_empty_n, + data_in_0_read, + data_in_1_dout, + data_in_1_empty_n, + data_in_1_read, + data_out_0_din, + data_out_0_full_n, + data_out_0_write +); + +parameter ap_ST_fsm_pp0_stage0 = 1'd1; + +input ap_clk; +input ap_rst; +input ap_start; +output ap_done; +output ap_idle; +output ap_ready; +input [31:0] data_in_0_dout; +input data_in_0_empty_n; +output data_in_0_read; +input [31:0] data_in_1_dout; +input data_in_1_empty_n; +output data_in_1_read; +output [15:0] data_out_0_din; +input data_out_0_full_n; +output data_out_0_write; + +reg ap_done; +reg ap_idle; +reg ap_ready; +reg data_in_0_read; +reg data_in_1_read; +reg data_out_0_write; + +(* fsm_encoding = "none" *) reg [0:0] ap_CS_fsm; +wire ap_CS_fsm_pp0_stage0; +wire ap_enable_reg_pp0_iter0; +reg ap_enable_reg_pp0_iter1; +reg ap_enable_reg_pp0_iter2; +reg ap_enable_reg_pp0_iter3; +reg ap_enable_reg_pp0_iter4; +reg ap_enable_reg_pp0_iter5; +reg ap_enable_reg_pp0_iter6; +reg ap_enable_reg_pp0_iter7; +reg ap_enable_reg_pp0_iter8; +reg ap_enable_reg_pp0_iter9; +reg ap_enable_reg_pp0_iter10; +reg ap_enable_reg_pp0_iter11; +reg ap_enable_reg_pp0_iter12; +reg ap_enable_reg_pp0_iter13; +reg ap_enable_reg_pp0_iter14; +reg ap_enable_reg_pp0_iter15; +reg ap_enable_reg_pp0_iter16; +reg ap_enable_reg_pp0_iter17; +reg ap_enable_reg_pp0_iter18; +reg ap_enable_reg_pp0_iter19; +reg ap_enable_reg_pp0_iter20; +reg 
ap_enable_reg_pp0_iter21; +reg ap_enable_reg_pp0_iter22; +reg ap_enable_reg_pp0_iter23; +reg ap_enable_reg_pp0_iter24; +reg ap_enable_reg_pp0_iter25; +reg ap_enable_reg_pp0_iter26; +reg ap_enable_reg_pp0_iter27; +reg ap_enable_reg_pp0_iter28; +reg ap_enable_reg_pp0_iter29; +reg ap_enable_reg_pp0_iter30; +reg ap_enable_reg_pp0_iter31; +reg ap_enable_reg_pp0_iter32; +reg ap_enable_reg_pp0_iter33; +reg ap_enable_reg_pp0_iter34; +reg ap_enable_reg_pp0_iter35; +reg ap_idle_pp0; +wire ap_block_state1_pp0_stage0_iter0; +wire ap_block_state2_pp0_stage0_iter1; +wire ap_block_state3_pp0_stage0_iter2; +wire ap_block_state4_pp0_stage0_iter3; +wire ap_block_state5_pp0_stage0_iter4; +wire ap_block_state6_pp0_stage0_iter5; +wire ap_block_state7_pp0_stage0_iter6; +wire ap_block_state8_pp0_stage0_iter7; +wire ap_block_state9_pp0_stage0_iter8; +wire ap_block_state10_pp0_stage0_iter9; +wire ap_block_state11_pp0_stage0_iter10; +wire ap_block_state12_pp0_stage0_iter11; +wire ap_block_state13_pp0_stage0_iter12; +wire ap_block_state14_pp0_stage0_iter13; +wire ap_block_state15_pp0_stage0_iter14; +wire ap_block_state16_pp0_stage0_iter15; +wire ap_block_state17_pp0_stage0_iter16; +wire ap_block_state18_pp0_stage0_iter17; +wire ap_block_state19_pp0_stage0_iter18; +wire ap_block_state20_pp0_stage0_iter19; +wire ap_block_state21_pp0_stage0_iter20; +wire ap_block_state22_pp0_stage0_iter21; +wire ap_block_state23_pp0_stage0_iter22; +wire ap_block_state24_pp0_stage0_iter23; +wire ap_block_state25_pp0_stage0_iter24; +wire ap_block_state26_pp0_stage0_iter25; +wire ap_block_state27_pp0_stage0_iter26; +wire ap_block_state28_pp0_stage0_iter27; +wire ap_block_state29_pp0_stage0_iter28; +wire ap_block_state30_pp0_stage0_iter29; +wire ap_block_state31_pp0_stage0_iter30; +wire ap_block_state32_pp0_stage0_iter31; +wire ap_block_state33_pp0_stage0_iter32; +wire ap_block_state34_pp0_stage0_iter33; +wire ap_block_state35_pp0_stage0_iter34; +wire ap_block_state36_pp0_stage0_iter35; +wire 
ap_block_pp0_stage0_subdone; +wire ap_block_pp0_stage0_11001; +wire [0:0] tmp_nbreadreq_fu_32_p3; +wire [0:0] tmp_1_nbreadreq_fu_40_p3; +reg [0:0] tmp_1_reg_90; +reg [0:0] tmp_1_reg_90_pp0_iter1_reg; +reg [0:0] tmp_1_reg_90_pp0_iter2_reg; +reg [0:0] tmp_1_reg_90_pp0_iter3_reg; +reg [0:0] tmp_1_reg_90_pp0_iter4_reg; +reg [0:0] tmp_1_reg_90_pp0_iter5_reg; +reg [0:0] tmp_1_reg_90_pp0_iter6_reg; +reg [0:0] tmp_1_reg_90_pp0_iter7_reg; +reg [0:0] tmp_1_reg_90_pp0_iter8_reg; +reg [0:0] tmp_1_reg_90_pp0_iter9_reg; +reg [0:0] tmp_1_reg_90_pp0_iter10_reg; +reg [0:0] tmp_1_reg_90_pp0_iter11_reg; +reg [0:0] tmp_1_reg_90_pp0_iter12_reg; +reg [0:0] tmp_1_reg_90_pp0_iter13_reg; +reg [0:0] tmp_1_reg_90_pp0_iter14_reg; +reg [0:0] tmp_1_reg_90_pp0_iter15_reg; +reg [0:0] tmp_1_reg_90_pp0_iter16_reg; +reg [0:0] tmp_1_reg_90_pp0_iter17_reg; +reg [0:0] tmp_1_reg_90_pp0_iter18_reg; +reg [0:0] tmp_1_reg_90_pp0_iter19_reg; +reg [0:0] tmp_1_reg_90_pp0_iter20_reg; +reg [0:0] tmp_1_reg_90_pp0_iter21_reg; +reg [0:0] tmp_1_reg_90_pp0_iter22_reg; +reg [0:0] tmp_1_reg_90_pp0_iter23_reg; +reg [0:0] tmp_1_reg_90_pp0_iter24_reg; +reg [0:0] tmp_1_reg_90_pp0_iter25_reg; +reg [0:0] tmp_1_reg_90_pp0_iter26_reg; +reg [0:0] tmp_1_reg_90_pp0_iter27_reg; +reg [0:0] tmp_1_reg_90_pp0_iter28_reg; +reg [0:0] tmp_1_reg_90_pp0_iter29_reg; +reg [0:0] tmp_1_reg_90_pp0_iter30_reg; +reg [0:0] tmp_1_reg_90_pp0_iter31_reg; +reg [0:0] tmp_1_reg_90_pp0_iter32_reg; +reg [0:0] tmp_1_reg_90_pp0_iter33_reg; +reg [0:0] tmp_1_reg_90_pp0_iter34_reg; +reg [0:0] tmp_reg_99; +reg [0:0] tmp_reg_99_pp0_iter1_reg; +reg [0:0] tmp_reg_99_pp0_iter2_reg; +reg [0:0] tmp_reg_99_pp0_iter3_reg; +reg [0:0] tmp_reg_99_pp0_iter4_reg; +reg [0:0] tmp_reg_99_pp0_iter5_reg; +reg [0:0] tmp_reg_99_pp0_iter6_reg; +reg [0:0] tmp_reg_99_pp0_iter7_reg; +reg [0:0] tmp_reg_99_pp0_iter8_reg; +reg [0:0] tmp_reg_99_pp0_iter9_reg; +reg [0:0] tmp_reg_99_pp0_iter10_reg; +reg [0:0] tmp_reg_99_pp0_iter11_reg; +reg [0:0] tmp_reg_99_pp0_iter12_reg; +reg [0:0] 
tmp_reg_99_pp0_iter13_reg; +reg [0:0] tmp_reg_99_pp0_iter14_reg; +reg [0:0] tmp_reg_99_pp0_iter15_reg; +reg [0:0] tmp_reg_99_pp0_iter16_reg; +reg [0:0] tmp_reg_99_pp0_iter17_reg; +reg [0:0] tmp_reg_99_pp0_iter18_reg; +reg [0:0] tmp_reg_99_pp0_iter19_reg; +reg [0:0] tmp_reg_99_pp0_iter20_reg; +reg [0:0] tmp_reg_99_pp0_iter21_reg; +reg [0:0] tmp_reg_99_pp0_iter22_reg; +reg [0:0] tmp_reg_99_pp0_iter23_reg; +reg [0:0] tmp_reg_99_pp0_iter24_reg; +reg [0:0] tmp_reg_99_pp0_iter25_reg; +reg [0:0] tmp_reg_99_pp0_iter26_reg; +reg [0:0] tmp_reg_99_pp0_iter27_reg; +reg [0:0] tmp_reg_99_pp0_iter28_reg; +reg [0:0] tmp_reg_99_pp0_iter29_reg; +reg [0:0] tmp_reg_99_pp0_iter30_reg; +reg [0:0] tmp_reg_99_pp0_iter31_reg; +reg [0:0] tmp_reg_99_pp0_iter32_reg; +reg [0:0] tmp_reg_99_pp0_iter33_reg; +reg [0:0] tmp_reg_99_pp0_iter34_reg; +wire ap_block_pp0_stage0_01001; +wire ap_block_pp0_stage0; +wire [15:0] grp_fu_75_p2; +reg [0:0] ap_NS_fsm; +reg ap_idle_pp0_0to34; +reg ap_reset_idle_pp0; +wire ap_enable_pp0; +wire ap_ce_reg; + +// power-on initialization +initial begin +#0 ap_CS_fsm = 1'd1; +#0 ap_enable_reg_pp0_iter1 = 1'b0; +#0 ap_enable_reg_pp0_iter2 = 1'b0; +#0 ap_enable_reg_pp0_iter3 = 1'b0; +#0 ap_enable_reg_pp0_iter4 = 1'b0; +#0 ap_enable_reg_pp0_iter5 = 1'b0; +#0 ap_enable_reg_pp0_iter6 = 1'b0; +#0 ap_enable_reg_pp0_iter7 = 1'b0; +#0 ap_enable_reg_pp0_iter8 = 1'b0; +#0 ap_enable_reg_pp0_iter9 = 1'b0; +#0 ap_enable_reg_pp0_iter10 = 1'b0; +#0 ap_enable_reg_pp0_iter11 = 1'b0; +#0 ap_enable_reg_pp0_iter12 = 1'b0; +#0 ap_enable_reg_pp0_iter13 = 1'b0; +#0 ap_enable_reg_pp0_iter14 = 1'b0; +#0 ap_enable_reg_pp0_iter15 = 1'b0; +#0 ap_enable_reg_pp0_iter16 = 1'b0; +#0 ap_enable_reg_pp0_iter17 = 1'b0; +#0 ap_enable_reg_pp0_iter18 = 1'b0; +#0 ap_enable_reg_pp0_iter19 = 1'b0; +#0 ap_enable_reg_pp0_iter20 = 1'b0; +#0 ap_enable_reg_pp0_iter21 = 1'b0; +#0 ap_enable_reg_pp0_iter22 = 1'b0; +#0 ap_enable_reg_pp0_iter23 = 1'b0; +#0 ap_enable_reg_pp0_iter24 = 1'b0; +#0 ap_enable_reg_pp0_iter25 = 
1'b0; +#0 ap_enable_reg_pp0_iter26 = 1'b0; +#0 ap_enable_reg_pp0_iter27 = 1'b0; +#0 ap_enable_reg_pp0_iter28 = 1'b0; +#0 ap_enable_reg_pp0_iter29 = 1'b0; +#0 ap_enable_reg_pp0_iter30 = 1'b0; +#0 ap_enable_reg_pp0_iter31 = 1'b0; +#0 ap_enable_reg_pp0_iter32 = 1'b0; +#0 ap_enable_reg_pp0_iter33 = 1'b0; +#0 ap_enable_reg_pp0_iter34 = 1'b0; +#0 ap_enable_reg_pp0_iter35 = 1'b0; +end + +div_sdiv_32ns_32ns_16_36_1 #( + .ID( 1 ), + .NUM_STAGE( 36 ), + .din0_WIDTH( 32 ), + .din1_WIDTH( 32 ), + .dout_WIDTH( 16 )) +sdiv_32ns_32ns_16_36_1_U1( + .clk(ap_clk), + .reset(ap_rst), + .din0(data_in_0_dout), + .din1(data_in_1_dout), + .ce(1'b1), + .dout(grp_fu_75_p2) +); + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_CS_fsm <= ap_ST_fsm_pp0_stage0; + end else begin + ap_CS_fsm <= ap_NS_fsm; + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter1 <= 1'b0; + end else begin + if (((1'b1 == ap_CS_fsm_pp0_stage0) & (1'b0 == ap_block_pp0_stage0_subdone))) begin + ap_enable_reg_pp0_iter1 <= ap_start; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter10 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter10 <= ap_enable_reg_pp0_iter9; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter11 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter11 <= ap_enable_reg_pp0_iter10; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter12 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter12 <= ap_enable_reg_pp0_iter11; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter13 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter13 <= 
ap_enable_reg_pp0_iter12; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter14 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter14 <= ap_enable_reg_pp0_iter13; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter15 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter15 <= ap_enable_reg_pp0_iter14; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter16 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter16 <= ap_enable_reg_pp0_iter15; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter17 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter17 <= ap_enable_reg_pp0_iter16; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter18 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter18 <= ap_enable_reg_pp0_iter17; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter19 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter19 <= ap_enable_reg_pp0_iter18; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter2 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter2 <= ap_enable_reg_pp0_iter1; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter20 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter20 <= ap_enable_reg_pp0_iter19; + end + end +end + +always @ (posedge ap_clk) begin + if 
(ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter21 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter21 <= ap_enable_reg_pp0_iter20; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter22 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter22 <= ap_enable_reg_pp0_iter21; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter23 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter23 <= ap_enable_reg_pp0_iter22; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter24 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter24 <= ap_enable_reg_pp0_iter23; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter25 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter25 <= ap_enable_reg_pp0_iter24; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter26 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter26 <= ap_enable_reg_pp0_iter25; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter27 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter27 <= ap_enable_reg_pp0_iter26; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter28 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter28 <= ap_enable_reg_pp0_iter27; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter29 <= 1'b0; + end else begin + 
if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter29 <= ap_enable_reg_pp0_iter28; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter3 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter3 <= ap_enable_reg_pp0_iter2; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter30 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter30 <= ap_enable_reg_pp0_iter29; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter31 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter31 <= ap_enable_reg_pp0_iter30; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter32 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter32 <= ap_enable_reg_pp0_iter31; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter33 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter33 <= ap_enable_reg_pp0_iter32; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter34 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter34 <= ap_enable_reg_pp0_iter33; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter35 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter35 <= ap_enable_reg_pp0_iter34; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter4 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter4 <= 
ap_enable_reg_pp0_iter3; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter5 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter5 <= ap_enable_reg_pp0_iter4; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter6 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter6 <= ap_enable_reg_pp0_iter5; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter7 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter7 <= ap_enable_reg_pp0_iter6; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter8 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter8 <= ap_enable_reg_pp0_iter7; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter9 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter9 <= ap_enable_reg_pp0_iter8; + end + end +end + +always @ (posedge ap_clk) begin + if (((1'b1 == ap_CS_fsm_pp0_stage0) & (tmp_nbreadreq_fu_32_p3 == 1'd1) & (1'b0 == ap_block_pp0_stage0_11001))) begin + tmp_1_reg_90 <= tmp_1_nbreadreq_fu_40_p3; + end +end + +always @ (posedge ap_clk) begin + if ((1'b0 == ap_block_pp0_stage0_11001)) begin + tmp_1_reg_90_pp0_iter10_reg <= tmp_1_reg_90_pp0_iter9_reg; + tmp_1_reg_90_pp0_iter11_reg <= tmp_1_reg_90_pp0_iter10_reg; + tmp_1_reg_90_pp0_iter12_reg <= tmp_1_reg_90_pp0_iter11_reg; + tmp_1_reg_90_pp0_iter13_reg <= tmp_1_reg_90_pp0_iter12_reg; + tmp_1_reg_90_pp0_iter14_reg <= tmp_1_reg_90_pp0_iter13_reg; + tmp_1_reg_90_pp0_iter15_reg <= tmp_1_reg_90_pp0_iter14_reg; + tmp_1_reg_90_pp0_iter16_reg <= tmp_1_reg_90_pp0_iter15_reg; + tmp_1_reg_90_pp0_iter17_reg <= 
tmp_1_reg_90_pp0_iter16_reg; + tmp_1_reg_90_pp0_iter18_reg <= tmp_1_reg_90_pp0_iter17_reg; + tmp_1_reg_90_pp0_iter19_reg <= tmp_1_reg_90_pp0_iter18_reg; + tmp_1_reg_90_pp0_iter20_reg <= tmp_1_reg_90_pp0_iter19_reg; + tmp_1_reg_90_pp0_iter21_reg <= tmp_1_reg_90_pp0_iter20_reg; + tmp_1_reg_90_pp0_iter22_reg <= tmp_1_reg_90_pp0_iter21_reg; + tmp_1_reg_90_pp0_iter23_reg <= tmp_1_reg_90_pp0_iter22_reg; + tmp_1_reg_90_pp0_iter24_reg <= tmp_1_reg_90_pp0_iter23_reg; + tmp_1_reg_90_pp0_iter25_reg <= tmp_1_reg_90_pp0_iter24_reg; + tmp_1_reg_90_pp0_iter26_reg <= tmp_1_reg_90_pp0_iter25_reg; + tmp_1_reg_90_pp0_iter27_reg <= tmp_1_reg_90_pp0_iter26_reg; + tmp_1_reg_90_pp0_iter28_reg <= tmp_1_reg_90_pp0_iter27_reg; + tmp_1_reg_90_pp0_iter29_reg <= tmp_1_reg_90_pp0_iter28_reg; + tmp_1_reg_90_pp0_iter2_reg <= tmp_1_reg_90_pp0_iter1_reg; + tmp_1_reg_90_pp0_iter30_reg <= tmp_1_reg_90_pp0_iter29_reg; + tmp_1_reg_90_pp0_iter31_reg <= tmp_1_reg_90_pp0_iter30_reg; + tmp_1_reg_90_pp0_iter32_reg <= tmp_1_reg_90_pp0_iter31_reg; + tmp_1_reg_90_pp0_iter33_reg <= tmp_1_reg_90_pp0_iter32_reg; + tmp_1_reg_90_pp0_iter34_reg <= tmp_1_reg_90_pp0_iter33_reg; + tmp_1_reg_90_pp0_iter3_reg <= tmp_1_reg_90_pp0_iter2_reg; + tmp_1_reg_90_pp0_iter4_reg <= tmp_1_reg_90_pp0_iter3_reg; + tmp_1_reg_90_pp0_iter5_reg <= tmp_1_reg_90_pp0_iter4_reg; + tmp_1_reg_90_pp0_iter6_reg <= tmp_1_reg_90_pp0_iter5_reg; + tmp_1_reg_90_pp0_iter7_reg <= tmp_1_reg_90_pp0_iter6_reg; + tmp_1_reg_90_pp0_iter8_reg <= tmp_1_reg_90_pp0_iter7_reg; + tmp_1_reg_90_pp0_iter9_reg <= tmp_1_reg_90_pp0_iter8_reg; + tmp_reg_99_pp0_iter10_reg <= tmp_reg_99_pp0_iter9_reg; + tmp_reg_99_pp0_iter11_reg <= tmp_reg_99_pp0_iter10_reg; + tmp_reg_99_pp0_iter12_reg <= tmp_reg_99_pp0_iter11_reg; + tmp_reg_99_pp0_iter13_reg <= tmp_reg_99_pp0_iter12_reg; + tmp_reg_99_pp0_iter14_reg <= tmp_reg_99_pp0_iter13_reg; + tmp_reg_99_pp0_iter15_reg <= tmp_reg_99_pp0_iter14_reg; + tmp_reg_99_pp0_iter16_reg <= tmp_reg_99_pp0_iter15_reg; + tmp_reg_99_pp0_iter17_reg <= 
tmp_reg_99_pp0_iter16_reg; + tmp_reg_99_pp0_iter18_reg <= tmp_reg_99_pp0_iter17_reg; + tmp_reg_99_pp0_iter19_reg <= tmp_reg_99_pp0_iter18_reg; + tmp_reg_99_pp0_iter20_reg <= tmp_reg_99_pp0_iter19_reg; + tmp_reg_99_pp0_iter21_reg <= tmp_reg_99_pp0_iter20_reg; + tmp_reg_99_pp0_iter22_reg <= tmp_reg_99_pp0_iter21_reg; + tmp_reg_99_pp0_iter23_reg <= tmp_reg_99_pp0_iter22_reg; + tmp_reg_99_pp0_iter24_reg <= tmp_reg_99_pp0_iter23_reg; + tmp_reg_99_pp0_iter25_reg <= tmp_reg_99_pp0_iter24_reg; + tmp_reg_99_pp0_iter26_reg <= tmp_reg_99_pp0_iter25_reg; + tmp_reg_99_pp0_iter27_reg <= tmp_reg_99_pp0_iter26_reg; + tmp_reg_99_pp0_iter28_reg <= tmp_reg_99_pp0_iter27_reg; + tmp_reg_99_pp0_iter29_reg <= tmp_reg_99_pp0_iter28_reg; + tmp_reg_99_pp0_iter2_reg <= tmp_reg_99_pp0_iter1_reg; + tmp_reg_99_pp0_iter30_reg <= tmp_reg_99_pp0_iter29_reg; + tmp_reg_99_pp0_iter31_reg <= tmp_reg_99_pp0_iter30_reg; + tmp_reg_99_pp0_iter32_reg <= tmp_reg_99_pp0_iter31_reg; + tmp_reg_99_pp0_iter33_reg <= tmp_reg_99_pp0_iter32_reg; + tmp_reg_99_pp0_iter34_reg <= tmp_reg_99_pp0_iter33_reg; + tmp_reg_99_pp0_iter3_reg <= tmp_reg_99_pp0_iter2_reg; + tmp_reg_99_pp0_iter4_reg <= tmp_reg_99_pp0_iter3_reg; + tmp_reg_99_pp0_iter5_reg <= tmp_reg_99_pp0_iter4_reg; + tmp_reg_99_pp0_iter6_reg <= tmp_reg_99_pp0_iter5_reg; + tmp_reg_99_pp0_iter7_reg <= tmp_reg_99_pp0_iter6_reg; + tmp_reg_99_pp0_iter8_reg <= tmp_reg_99_pp0_iter7_reg; + tmp_reg_99_pp0_iter9_reg <= tmp_reg_99_pp0_iter8_reg; + end +end + +always @ (posedge ap_clk) begin + if (((1'b1 == ap_CS_fsm_pp0_stage0) & (1'b0 == ap_block_pp0_stage0_11001))) begin + tmp_1_reg_90_pp0_iter1_reg <= tmp_1_reg_90; + tmp_reg_99 <= tmp_nbreadreq_fu_32_p3; + tmp_reg_99_pp0_iter1_reg <= tmp_reg_99; + end +end + +always @ (*) begin + if (((1'b0 == ap_block_pp0_stage0_subdone) & (ap_enable_reg_pp0_iter35 == 1'b1))) begin + ap_done = 1'b1; + end else begin + ap_done = 1'b0; + end +end + +always @ (*) begin + if (((ap_start == 1'b0) & (1'b1 == ap_CS_fsm_pp0_stage0) & 
(ap_idle_pp0 == 1'b1))) begin + ap_idle = 1'b1; + end else begin + ap_idle = 1'b0; + end +end + +always @ (*) begin + if (((ap_enable_reg_pp0_iter26 == 1'b0) & (ap_enable_reg_pp0_iter25 == 1'b0) & (ap_enable_reg_pp0_iter24 == 1'b0) & (ap_enable_reg_pp0_iter23 == 1'b0) & (ap_enable_reg_pp0_iter22 == 1'b0) & (ap_enable_reg_pp0_iter21 == 1'b0) & (ap_enable_reg_pp0_iter20 == 1'b0) & (ap_enable_reg_pp0_iter19 == 1'b0) & (ap_enable_reg_pp0_iter18 == 1'b0) & (ap_enable_reg_pp0_iter17 == 1'b0) & (ap_enable_reg_pp0_iter16 == 1'b0) & (ap_enable_reg_pp0_iter15 == 1'b0) & (ap_enable_reg_pp0_iter14 == 1'b0) & (ap_enable_reg_pp0_iter13 == 1'b0) & (ap_enable_reg_pp0_iter12 == 1'b0) & (ap_enable_reg_pp0_iter11 == 1'b0) & (ap_enable_reg_pp0_iter10 == 1'b0) & (ap_enable_reg_pp0_iter9 == 1'b0) & (ap_enable_reg_pp0_iter8 == 1'b0) & (ap_enable_reg_pp0_iter7 == 1'b0) & (ap_enable_reg_pp0_iter6 == 1'b0) & (ap_enable_reg_pp0_iter5 == 1'b0) & (ap_enable_reg_pp0_iter4 == 1'b0) & (ap_enable_reg_pp0_iter3 == 1'b0) & (ap_enable_reg_pp0_iter2 == 1'b0) & (ap_enable_reg_pp0_iter1 == 1'b0) & (ap_enable_reg_pp0_iter0 == 1'b0) & (ap_enable_reg_pp0_iter35 == 1'b0) + & (ap_enable_reg_pp0_iter34 == 1'b0) & (ap_enable_reg_pp0_iter33 == 1'b0) & (ap_enable_reg_pp0_iter32 == 1'b0) & (ap_enable_reg_pp0_iter31 == 1'b0) & (ap_enable_reg_pp0_iter30 == 1'b0) & (ap_enable_reg_pp0_iter29 == 1'b0) & (ap_enable_reg_pp0_iter28 == 1'b0) & (ap_enable_reg_pp0_iter27 == 1'b0))) begin + ap_idle_pp0 = 1'b1; + end else begin + ap_idle_pp0 = 1'b0; + end +end + +always @ (*) begin + if (((ap_enable_reg_pp0_iter26 == 1'b0) & (ap_enable_reg_pp0_iter25 == 1'b0) & (ap_enable_reg_pp0_iter24 == 1'b0) & (ap_enable_reg_pp0_iter23 == 1'b0) & (ap_enable_reg_pp0_iter22 == 1'b0) & (ap_enable_reg_pp0_iter21 == 1'b0) & (ap_enable_reg_pp0_iter20 == 1'b0) & (ap_enable_reg_pp0_iter19 == 1'b0) & (ap_enable_reg_pp0_iter18 == 1'b0) & (ap_enable_reg_pp0_iter17 == 1'b0) & (ap_enable_reg_pp0_iter16 == 1'b0) & (ap_enable_reg_pp0_iter15 == 1'b0) & 
(ap_enable_reg_pp0_iter14 == 1'b0) & (ap_enable_reg_pp0_iter13 == 1'b0) & (ap_enable_reg_pp0_iter12 == 1'b0) & (ap_enable_reg_pp0_iter11 == 1'b0) & (ap_enable_reg_pp0_iter10 == 1'b0) & (ap_enable_reg_pp0_iter9 == 1'b0) & (ap_enable_reg_pp0_iter8 == 1'b0) & (ap_enable_reg_pp0_iter7 == 1'b0) & (ap_enable_reg_pp0_iter6 == 1'b0) & (ap_enable_reg_pp0_iter5 == 1'b0) & (ap_enable_reg_pp0_iter4 == 1'b0) & (ap_enable_reg_pp0_iter3 == 1'b0) & (ap_enable_reg_pp0_iter2 == 1'b0) & (ap_enable_reg_pp0_iter1 == 1'b0) & (ap_enable_reg_pp0_iter0 == 1'b0) & (ap_enable_reg_pp0_iter34 == 1'b0) + & (ap_enable_reg_pp0_iter33 == 1'b0) & (ap_enable_reg_pp0_iter32 == 1'b0) & (ap_enable_reg_pp0_iter31 == 1'b0) & (ap_enable_reg_pp0_iter30 == 1'b0) & (ap_enable_reg_pp0_iter29 == 1'b0) & (ap_enable_reg_pp0_iter28 == 1'b0) & (ap_enable_reg_pp0_iter27 == 1'b0))) begin + ap_idle_pp0_0to34 = 1'b1; + end else begin + ap_idle_pp0_0to34 = 1'b0; + end +end + +always @ (*) begin + if (((ap_enable_reg_pp0_iter0 == 1'b1) & (1'b1 == ap_CS_fsm_pp0_stage0) & (1'b0 == ap_block_pp0_stage0_subdone))) begin + ap_ready = 1'b1; + end else begin + ap_ready = 1'b0; + end +end + +always @ (*) begin + if (((ap_start == 1'b0) & (ap_idle_pp0_0to34 == 1'b1))) begin + ap_reset_idle_pp0 = 1'b1; + end else begin + ap_reset_idle_pp0 = 1'b0; + end +end + +always @ (*) begin + if (((ap_enable_reg_pp0_iter0 == 1'b1) & (1'b1 == ap_CS_fsm_pp0_stage0) & (tmp_1_nbreadreq_fu_40_p3 == 1'd1) & (tmp_nbreadreq_fu_32_p3 == 1'd1) & (data_in_0_empty_n == 1'b1) & (1'b0 == ap_block_pp0_stage0_11001))) begin + data_in_0_read = 1'b1; + end else begin + data_in_0_read = 1'b0; + end +end + +always @ (*) begin + if (((ap_enable_reg_pp0_iter0 == 1'b1) & (1'b1 == ap_CS_fsm_pp0_stage0) & (tmp_1_nbreadreq_fu_40_p3 == 1'd1) & (tmp_nbreadreq_fu_32_p3 == 1'd1) & (data_in_1_empty_n == 1'b1) & (1'b0 == ap_block_pp0_stage0_11001))) begin + data_in_1_read = 1'b1; + end else begin + data_in_1_read = 1'b0; + end +end + +always @ (*) begin + if 
(((tmp_reg_99_pp0_iter34_reg == 1'd1) & (tmp_1_reg_90_pp0_iter34_reg == 1'd1) & (data_out_0_full_n == 1'b1) & (1'b0 == ap_block_pp0_stage0_11001) & (ap_enable_reg_pp0_iter35 == 1'b1))) begin + data_out_0_write = 1'b1; + end else begin + data_out_0_write = 1'b0; + end +end + +always @ (*) begin + case (ap_CS_fsm) + ap_ST_fsm_pp0_stage0 : begin + ap_NS_fsm = ap_ST_fsm_pp0_stage0; + end + default : begin + ap_NS_fsm = 'bx; + end + endcase +end + +assign ap_CS_fsm_pp0_stage0 = ap_CS_fsm[32'd0]; + +assign ap_block_pp0_stage0 = ~(1'b1 == 1'b1); + +assign ap_block_pp0_stage0_01001 = ~(1'b1 == 1'b1); + +assign ap_block_pp0_stage0_11001 = ~(1'b1 == 1'b1); + +assign ap_block_pp0_stage0_subdone = ~(1'b1 == 1'b1); + +assign ap_block_state10_pp0_stage0_iter9 = ~(1'b1 == 1'b1); + +assign ap_block_state11_pp0_stage0_iter10 = ~(1'b1 == 1'b1); + +assign ap_block_state12_pp0_stage0_iter11 = ~(1'b1 == 1'b1); + +assign ap_block_state13_pp0_stage0_iter12 = ~(1'b1 == 1'b1); + +assign ap_block_state14_pp0_stage0_iter13 = ~(1'b1 == 1'b1); + +assign ap_block_state15_pp0_stage0_iter14 = ~(1'b1 == 1'b1); + +assign ap_block_state16_pp0_stage0_iter15 = ~(1'b1 == 1'b1); + +assign ap_block_state17_pp0_stage0_iter16 = ~(1'b1 == 1'b1); + +assign ap_block_state18_pp0_stage0_iter17 = ~(1'b1 == 1'b1); + +assign ap_block_state19_pp0_stage0_iter18 = ~(1'b1 == 1'b1); + +assign ap_block_state1_pp0_stage0_iter0 = ~(1'b1 == 1'b1); + +assign ap_block_state20_pp0_stage0_iter19 = ~(1'b1 == 1'b1); + +assign ap_block_state21_pp0_stage0_iter20 = ~(1'b1 == 1'b1); + +assign ap_block_state22_pp0_stage0_iter21 = ~(1'b1 == 1'b1); + +assign ap_block_state23_pp0_stage0_iter22 = ~(1'b1 == 1'b1); + +assign ap_block_state24_pp0_stage0_iter23 = ~(1'b1 == 1'b1); + +assign ap_block_state25_pp0_stage0_iter24 = ~(1'b1 == 1'b1); + +assign ap_block_state26_pp0_stage0_iter25 = ~(1'b1 == 1'b1); + +assign ap_block_state27_pp0_stage0_iter26 = ~(1'b1 == 1'b1); + +assign ap_block_state28_pp0_stage0_iter27 = ~(1'b1 == 1'b1); + +assign 
ap_block_state29_pp0_stage0_iter28 = ~(1'b1 == 1'b1); + +assign ap_block_state2_pp0_stage0_iter1 = ~(1'b1 == 1'b1); + +assign ap_block_state30_pp0_stage0_iter29 = ~(1'b1 == 1'b1); + +assign ap_block_state31_pp0_stage0_iter30 = ~(1'b1 == 1'b1); + +assign ap_block_state32_pp0_stage0_iter31 = ~(1'b1 == 1'b1); + +assign ap_block_state33_pp0_stage0_iter32 = ~(1'b1 == 1'b1); + +assign ap_block_state34_pp0_stage0_iter33 = ~(1'b1 == 1'b1); + +assign ap_block_state35_pp0_stage0_iter34 = ~(1'b1 == 1'b1); + +assign ap_block_state36_pp0_stage0_iter35 = ~(1'b1 == 1'b1); + +assign ap_block_state3_pp0_stage0_iter2 = ~(1'b1 == 1'b1); + +assign ap_block_state4_pp0_stage0_iter3 = ~(1'b1 == 1'b1); + +assign ap_block_state5_pp0_stage0_iter4 = ~(1'b1 == 1'b1); + +assign ap_block_state6_pp0_stage0_iter5 = ~(1'b1 == 1'b1); + +assign ap_block_state7_pp0_stage0_iter6 = ~(1'b1 == 1'b1); + +assign ap_block_state8_pp0_stage0_iter7 = ~(1'b1 == 1'b1); + +assign ap_block_state9_pp0_stage0_iter8 = ~(1'b1 == 1'b1); + +assign ap_enable_pp0 = (ap_idle_pp0 ^ 1'b1); + +assign ap_enable_reg_pp0_iter0 = ap_start; + +assign data_out_0_din = grp_fu_75_p2[15:0]; + +assign tmp_1_nbreadreq_fu_40_p3 = data_in_1_empty_n; + +assign tmp_nbreadreq_fu_32_p3 = data_in_0_empty_n; + +endmodule //div diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/verilog/div_sdiv_32ns_32ns_16_36_1.v b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/verilog/div_sdiv_32ns_32ns_16_36_1.v new file mode 100644 index 000000000..765d3dc70 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/verilog/div_sdiv_32ns_32ns_16_36_1.v @@ -0,0 +1,156 @@ +// ============================================================== +// Generated by Vitis HLS v2023.1 +// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
+// ============================================================== +`timescale 1 ns / 1 ps + +module div_sdiv_32ns_32ns_16_36_1_divider +#(parameter + in0_WIDTH = 32, + in1_WIDTH = 32, + out_WIDTH = 32 +) +( + input clk, + input reset, + input ce, + input [in0_WIDTH-1:0] dividend, + input [in1_WIDTH-1:0] divisor, + input [1:0] sign_i, + output wire [1:0] sign_o, + output wire [out_WIDTH-1:0] quot, + output wire [out_WIDTH-1:0] remd +); + +localparam cal_WIDTH = (in0_WIDTH > in1_WIDTH)? in0_WIDTH : in1_WIDTH; + +//------------------------Local signal------------------- +reg [in0_WIDTH-1:0] dividend_tmp[0:in0_WIDTH]; +reg [in1_WIDTH-1:0] divisor_tmp[0:in0_WIDTH]; +reg [in0_WIDTH-1:0] remd_tmp[0:in0_WIDTH]; +wire [in0_WIDTH-1:0] comb_tmp[0:in0_WIDTH-1]; +wire [cal_WIDTH:0] cal_tmp[0:in0_WIDTH-1]; +reg [1:0] sign_tmp[0:in0_WIDTH]; +//------------------------Body--------------------------- +assign quot = dividend_tmp[in0_WIDTH]; +assign remd = remd_tmp[in0_WIDTH]; +assign sign_o = sign_tmp[in0_WIDTH]; + +// dividend_tmp[0], divisor_tmp[0], remd_tmp[0] +always @(posedge clk) +begin + if (ce) begin + dividend_tmp[0] <= dividend; + divisor_tmp[0] <= divisor; + sign_tmp[0] <= sign_i; + remd_tmp[0] <= 1'b0; + end +end + +genvar i; +generate + for (i = 0; i < in0_WIDTH; i = i + 1) + begin : loop + if (in0_WIDTH == 1) assign comb_tmp[i] = dividend_tmp[i][0]; + else assign comb_tmp[i] = {remd_tmp[i][in0_WIDTH-2:0], dividend_tmp[i][in0_WIDTH-1]}; + assign cal_tmp[i] = {1'b0, comb_tmp[i]} - {1'b0, divisor_tmp[i]}; + + always @(posedge clk) + begin + if (ce) begin + if (in0_WIDTH == 1) dividend_tmp[i+1] <= ~cal_tmp[i][cal_WIDTH]; + else dividend_tmp[i+1] <= {dividend_tmp[i][in0_WIDTH-2:0], ~cal_tmp[i][cal_WIDTH]}; + divisor_tmp[i+1] <= divisor_tmp[i]; + remd_tmp[i+1] <= cal_tmp[i][cal_WIDTH]? 
comb_tmp[i] : cal_tmp[i][in0_WIDTH-1:0]; + sign_tmp[i+1] <= sign_tmp[i]; + end + end + end +endgenerate + +endmodule + +module div_sdiv_32ns_32ns_16_36_1 +#(parameter + ID = 1, + NUM_STAGE = 2, + din0_WIDTH = 32, + din1_WIDTH = 32, + dout_WIDTH = 32 +) +( + input clk, + input reset, + input ce, + input [din0_WIDTH-1:0] din0, + input [din1_WIDTH-1:0] din1, + output [dout_WIDTH-1:0] dout +); +//------------------------Local signal------------------- +reg [din0_WIDTH-1:0] dividend0; +reg [din1_WIDTH-1:0] divisor0; +wire [din0_WIDTH-1:0] dividend_u; +wire [din1_WIDTH-1:0] divisor_u; +wire [dout_WIDTH-1:0] quot_u; +wire [dout_WIDTH-1:0] remd_u; +reg [dout_WIDTH-1:0] quot; +reg [dout_WIDTH-1:0] remd; +wire [1:0] sign_i; +wire [1:0] sign_o; +//------------------------Instantiation------------------ +div_sdiv_32ns_32ns_16_36_1_divider #( + .in0_WIDTH ( din0_WIDTH ), + .in1_WIDTH ( din1_WIDTH ), + .out_WIDTH ( dout_WIDTH ) +) div_sdiv_32ns_32ns_16_36_1_divider_u ( + .clk ( clk ), + .reset ( reset ), + .ce ( ce ), + .dividend ( dividend_u ), + .divisor ( divisor_u ), + .sign_i ( sign_i ), + .sign_o ( sign_o ), + .quot ( quot_u ), + .remd ( remd_u ) +); +//------------------------Body--------------------------- +assign sign_i = {dividend0[din0_WIDTH-1] ^ divisor0[din1_WIDTH-1], dividend0[din0_WIDTH-1]}; +assign dividend_u = dividend0[din0_WIDTH-1]? ~dividend0[din0_WIDTH-1:0] + 1'b1 : + dividend0[din0_WIDTH-1:0]; +assign divisor_u = divisor0[din1_WIDTH-1]? 
~divisor0[din1_WIDTH-1:0] + 1'b1 : + divisor0[din1_WIDTH-1:0]; + +always @(posedge clk) +begin + if (ce) begin + dividend0 <= din0; + divisor0 <= din1; + end +end + +always @(posedge clk) +begin + if (ce) begin + if (sign_o[1]) + quot <= ~quot_u + 1'b1; + else + quot <= quot_u; + end +end + +always @(posedge clk) +begin + if (ce) begin + if (sign_o[0]) + remd <= ~remd_u + 1'b1; + else + remd <= remd_u; + end +end + +assign dout = quot; + +endmodule + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/vhdl/div.vhd b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/vhdl/div.vhd new file mode 100644 index 000000000..4d56aba1f --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/vhdl/div.vhd @@ -0,0 +1,982 @@ +-- ============================================================== +-- Generated by Vitis HLS v2023.1 +-- Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +-- Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
+-- ============================================================== + +library IEEE; +use IEEE.std_logic_1164.all; +use IEEE.numeric_std.all; + +entity div is +port ( + ap_clk : IN STD_LOGIC; + ap_rst : IN STD_LOGIC; + ap_start : IN STD_LOGIC; + ap_done : OUT STD_LOGIC; + ap_idle : OUT STD_LOGIC; + ap_ready : OUT STD_LOGIC; + data_in_0_dout : IN STD_LOGIC_VECTOR (31 downto 0); + data_in_0_empty_n : IN STD_LOGIC; + data_in_0_read : OUT STD_LOGIC; + data_in_1_dout : IN STD_LOGIC_VECTOR (31 downto 0); + data_in_1_empty_n : IN STD_LOGIC; + data_in_1_read : OUT STD_LOGIC; + data_out_0_din : OUT STD_LOGIC_VECTOR (15 downto 0); + data_out_0_full_n : IN STD_LOGIC; + data_out_0_write : OUT STD_LOGIC ); +end; + + +architecture behav of div is + attribute CORE_GENERATION_INFO : STRING; + attribute CORE_GENERATION_INFO of behav : architecture is + "div_div,hls_ip_2023_1,{HLS_INPUT_TYPE=cxx,HLS_INPUT_FLOAT=0,HLS_INPUT_FIXED=0,HLS_INPUT_PART=xcu250-figd2104-2L-e,HLS_INPUT_CLOCK=10.000000,HLS_INPUT_ARCH=pipeline,HLS_SYN_CLOCK=2.593000,HLS_SYN_LAT=35,HLS_SYN_TPT=1,HLS_SYN_MEM=0,HLS_SYN_DSP=0,HLS_SYN_FF=2449,HLS_SYN_LUT=1808,HLS_VERSION=2023_1}"; + constant ap_const_logic_1 : STD_LOGIC := '1'; + constant ap_const_logic_0 : STD_LOGIC := '0'; + constant ap_ST_fsm_pp0_stage0 : STD_LOGIC_VECTOR (0 downto 0) := "1"; + constant ap_const_lv32_0 : STD_LOGIC_VECTOR (31 downto 0) := "00000000000000000000000000000000"; + constant ap_const_boolean_1 : BOOLEAN := true; + constant ap_const_boolean_0 : BOOLEAN := false; + constant ap_const_lv1_1 : STD_LOGIC_VECTOR (0 downto 0) := "1"; + + signal ap_CS_fsm : STD_LOGIC_VECTOR (0 downto 0) := "1"; + attribute fsm_encoding : string; + attribute fsm_encoding of ap_CS_fsm : signal is "none"; + signal ap_CS_fsm_pp0_stage0 : STD_LOGIC; + attribute fsm_encoding of ap_CS_fsm_pp0_stage0 : signal is "none"; + signal ap_enable_reg_pp0_iter0 : STD_LOGIC; + signal ap_enable_reg_pp0_iter1 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter2 : STD_LOGIC := '0'; + 
signal ap_enable_reg_pp0_iter3 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter4 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter5 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter6 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter7 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter8 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter9 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter10 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter11 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter12 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter13 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter14 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter15 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter16 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter17 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter18 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter19 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter20 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter21 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter22 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter23 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter24 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter25 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter26 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter27 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter28 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter29 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter30 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter31 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter32 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter33 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter34 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter35 : STD_LOGIC := '0'; + signal ap_idle_pp0 : STD_LOGIC; + signal ap_block_state1_pp0_stage0_iter0 : BOOLEAN; + signal ap_block_state2_pp0_stage0_iter1 : BOOLEAN; + signal ap_block_state3_pp0_stage0_iter2 : BOOLEAN; + signal 
ap_block_state4_pp0_stage0_iter3 : BOOLEAN; + signal ap_block_state5_pp0_stage0_iter4 : BOOLEAN; + signal ap_block_state6_pp0_stage0_iter5 : BOOLEAN; + signal ap_block_state7_pp0_stage0_iter6 : BOOLEAN; + signal ap_block_state8_pp0_stage0_iter7 : BOOLEAN; + signal ap_block_state9_pp0_stage0_iter8 : BOOLEAN; + signal ap_block_state10_pp0_stage0_iter9 : BOOLEAN; + signal ap_block_state11_pp0_stage0_iter10 : BOOLEAN; + signal ap_block_state12_pp0_stage0_iter11 : BOOLEAN; + signal ap_block_state13_pp0_stage0_iter12 : BOOLEAN; + signal ap_block_state14_pp0_stage0_iter13 : BOOLEAN; + signal ap_block_state15_pp0_stage0_iter14 : BOOLEAN; + signal ap_block_state16_pp0_stage0_iter15 : BOOLEAN; + signal ap_block_state17_pp0_stage0_iter16 : BOOLEAN; + signal ap_block_state18_pp0_stage0_iter17 : BOOLEAN; + signal ap_block_state19_pp0_stage0_iter18 : BOOLEAN; + signal ap_block_state20_pp0_stage0_iter19 : BOOLEAN; + signal ap_block_state21_pp0_stage0_iter20 : BOOLEAN; + signal ap_block_state22_pp0_stage0_iter21 : BOOLEAN; + signal ap_block_state23_pp0_stage0_iter22 : BOOLEAN; + signal ap_block_state24_pp0_stage0_iter23 : BOOLEAN; + signal ap_block_state25_pp0_stage0_iter24 : BOOLEAN; + signal ap_block_state26_pp0_stage0_iter25 : BOOLEAN; + signal ap_block_state27_pp0_stage0_iter26 : BOOLEAN; + signal ap_block_state28_pp0_stage0_iter27 : BOOLEAN; + signal ap_block_state29_pp0_stage0_iter28 : BOOLEAN; + signal ap_block_state30_pp0_stage0_iter29 : BOOLEAN; + signal ap_block_state31_pp0_stage0_iter30 : BOOLEAN; + signal ap_block_state32_pp0_stage0_iter31 : BOOLEAN; + signal ap_block_state33_pp0_stage0_iter32 : BOOLEAN; + signal ap_block_state34_pp0_stage0_iter33 : BOOLEAN; + signal ap_block_state35_pp0_stage0_iter34 : BOOLEAN; + signal ap_block_state36_pp0_stage0_iter35 : BOOLEAN; + signal ap_block_pp0_stage0_subdone : BOOLEAN; + signal ap_block_pp0_stage0_11001 : BOOLEAN; + signal tmp_nbreadreq_fu_32_p3 : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_nbreadreq_fu_40_p3 : 
STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90 : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter1_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter2_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter3_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter4_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter5_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter6_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter7_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter8_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter9_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter10_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter11_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter12_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter13_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter14_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter15_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter16_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter17_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter18_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter19_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter20_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter21_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter22_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter23_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter24_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter25_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter26_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter27_reg : STD_LOGIC_VECTOR (0 downto 0); + signal 
tmp_1_reg_90_pp0_iter28_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter29_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter30_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter31_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter32_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter33_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter34_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99 : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter1_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter2_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter3_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter4_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter5_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter6_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter7_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter8_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter9_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter10_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter11_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter12_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter13_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter14_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter15_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter16_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter17_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter18_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter19_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter20_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter21_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter22_reg : 
STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter23_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter24_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter25_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter26_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter27_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter28_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter29_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter30_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter31_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter32_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter33_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter34_reg : STD_LOGIC_VECTOR (0 downto 0); + signal ap_block_pp0_stage0_01001 : BOOLEAN; + signal ap_block_pp0_stage0 : BOOLEAN; + signal grp_fu_75_p2 : STD_LOGIC_VECTOR (15 downto 0); + signal ap_NS_fsm : STD_LOGIC_VECTOR (0 downto 0); + signal ap_idle_pp0_0to34 : STD_LOGIC; + signal ap_reset_idle_pp0 : STD_LOGIC; + signal ap_enable_pp0 : STD_LOGIC; + signal ap_ce_reg : STD_LOGIC; + + component div_sdiv_32ns_32ns_16_36_1 IS + generic ( + ID : INTEGER; + NUM_STAGE : INTEGER; + din0_WIDTH : INTEGER; + din1_WIDTH : INTEGER; + dout_WIDTH : INTEGER ); + port ( + clk : IN STD_LOGIC; + reset : IN STD_LOGIC; + din0 : IN STD_LOGIC_VECTOR (31 downto 0); + din1 : IN STD_LOGIC_VECTOR (31 downto 0); + ce : IN STD_LOGIC; + dout : OUT STD_LOGIC_VECTOR (15 downto 0) ); + end component; + + + +begin + sdiv_32ns_32ns_16_36_1_U1 : component div_sdiv_32ns_32ns_16_36_1 + generic map ( + ID => 1, + NUM_STAGE => 36, + din0_WIDTH => 32, + din1_WIDTH => 32, + dout_WIDTH => 16) + port map ( + clk => ap_clk, + reset => ap_rst, + din0 => data_in_0_dout, + din1 => data_in_1_dout, + ce => ap_const_logic_1, + dout => grp_fu_75_p2); + + + + + + ap_CS_fsm_assign_proc : process(ap_clk) + begin + if 
(ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_CS_fsm <= ap_ST_fsm_pp0_stage0; + else + ap_CS_fsm <= ap_NS_fsm; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter1_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter1 <= ap_const_logic_0; + else + if (((ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (ap_const_boolean_0 = ap_block_pp0_stage0_subdone))) then + ap_enable_reg_pp0_iter1 <= ap_start; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter10_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter10 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter10 <= ap_enable_reg_pp0_iter9; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter11_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter11 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter11 <= ap_enable_reg_pp0_iter10; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter12_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter12 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter12 <= ap_enable_reg_pp0_iter11; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter13_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter13 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter13 <= ap_enable_reg_pp0_iter12; + end if; + end if; + end if; + end process; + + + 
ap_enable_reg_pp0_iter14_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter14 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter14 <= ap_enable_reg_pp0_iter13; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter15_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter15 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter15 <= ap_enable_reg_pp0_iter14; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter16_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter16 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter16 <= ap_enable_reg_pp0_iter15; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter17_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter17 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter17 <= ap_enable_reg_pp0_iter16; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter18_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter18 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter18 <= ap_enable_reg_pp0_iter17; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter19_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter19 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = 
ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter19 <= ap_enable_reg_pp0_iter18; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter2_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter2 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter2 <= ap_enable_reg_pp0_iter1; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter20_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter20 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter20 <= ap_enable_reg_pp0_iter19; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter21_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter21 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter21 <= ap_enable_reg_pp0_iter20; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter22_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter22 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter22 <= ap_enable_reg_pp0_iter21; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter23_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter23 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter23 <= ap_enable_reg_pp0_iter22; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter24_assign_proc : process(ap_clk) + begin + if (ap_clk'event and 
ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter24 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter24 <= ap_enable_reg_pp0_iter23; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter25_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter25 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter25 <= ap_enable_reg_pp0_iter24; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter26_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter26 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter26 <= ap_enable_reg_pp0_iter25; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter27_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter27 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter27 <= ap_enable_reg_pp0_iter26; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter28_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter28 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter28 <= ap_enable_reg_pp0_iter27; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter29_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter29 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter29 <= ap_enable_reg_pp0_iter28; + end 
if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter3_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter3 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter3 <= ap_enable_reg_pp0_iter2; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter30_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter30 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter30 <= ap_enable_reg_pp0_iter29; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter31_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter31 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter31 <= ap_enable_reg_pp0_iter30; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter32_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter32 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter32 <= ap_enable_reg_pp0_iter31; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter33_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter33 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter33 <= ap_enable_reg_pp0_iter32; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter34_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter34 <= ap_const_logic_0; + 
else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter34 <= ap_enable_reg_pp0_iter33; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter35_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter35 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter35 <= ap_enable_reg_pp0_iter34; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter4_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter4 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter4 <= ap_enable_reg_pp0_iter3; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter5_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter5 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter5 <= ap_enable_reg_pp0_iter4; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter6_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter6 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter6 <= ap_enable_reg_pp0_iter5; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter7_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter7 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter7 <= ap_enable_reg_pp0_iter6; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter8_assign_proc : process(ap_clk) + begin + 
if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter8 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter8 <= ap_enable_reg_pp0_iter7; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter9_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter9 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter9 <= ap_enable_reg_pp0_iter8; + end if; + end if; + end if; + end process; + + process (ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (((ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (tmp_nbreadreq_fu_32_p3 = ap_const_lv1_1) and (ap_const_boolean_0 = ap_block_pp0_stage0_11001))) then + tmp_1_reg_90 <= tmp_1_nbreadreq_fu_40_p3; + end if; + end if; + end process; + process (ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if ((ap_const_boolean_0 = ap_block_pp0_stage0_11001)) then + tmp_1_reg_90_pp0_iter10_reg <= tmp_1_reg_90_pp0_iter9_reg; + tmp_1_reg_90_pp0_iter11_reg <= tmp_1_reg_90_pp0_iter10_reg; + tmp_1_reg_90_pp0_iter12_reg <= tmp_1_reg_90_pp0_iter11_reg; + tmp_1_reg_90_pp0_iter13_reg <= tmp_1_reg_90_pp0_iter12_reg; + tmp_1_reg_90_pp0_iter14_reg <= tmp_1_reg_90_pp0_iter13_reg; + tmp_1_reg_90_pp0_iter15_reg <= tmp_1_reg_90_pp0_iter14_reg; + tmp_1_reg_90_pp0_iter16_reg <= tmp_1_reg_90_pp0_iter15_reg; + tmp_1_reg_90_pp0_iter17_reg <= tmp_1_reg_90_pp0_iter16_reg; + tmp_1_reg_90_pp0_iter18_reg <= tmp_1_reg_90_pp0_iter17_reg; + tmp_1_reg_90_pp0_iter19_reg <= tmp_1_reg_90_pp0_iter18_reg; + tmp_1_reg_90_pp0_iter20_reg <= tmp_1_reg_90_pp0_iter19_reg; + tmp_1_reg_90_pp0_iter21_reg <= tmp_1_reg_90_pp0_iter20_reg; + tmp_1_reg_90_pp0_iter22_reg <= tmp_1_reg_90_pp0_iter21_reg; + tmp_1_reg_90_pp0_iter23_reg <= tmp_1_reg_90_pp0_iter22_reg; + tmp_1_reg_90_pp0_iter24_reg <= tmp_1_reg_90_pp0_iter23_reg; + 
tmp_1_reg_90_pp0_iter25_reg <= tmp_1_reg_90_pp0_iter24_reg; + tmp_1_reg_90_pp0_iter26_reg <= tmp_1_reg_90_pp0_iter25_reg; + tmp_1_reg_90_pp0_iter27_reg <= tmp_1_reg_90_pp0_iter26_reg; + tmp_1_reg_90_pp0_iter28_reg <= tmp_1_reg_90_pp0_iter27_reg; + tmp_1_reg_90_pp0_iter29_reg <= tmp_1_reg_90_pp0_iter28_reg; + tmp_1_reg_90_pp0_iter2_reg <= tmp_1_reg_90_pp0_iter1_reg; + tmp_1_reg_90_pp0_iter30_reg <= tmp_1_reg_90_pp0_iter29_reg; + tmp_1_reg_90_pp0_iter31_reg <= tmp_1_reg_90_pp0_iter30_reg; + tmp_1_reg_90_pp0_iter32_reg <= tmp_1_reg_90_pp0_iter31_reg; + tmp_1_reg_90_pp0_iter33_reg <= tmp_1_reg_90_pp0_iter32_reg; + tmp_1_reg_90_pp0_iter34_reg <= tmp_1_reg_90_pp0_iter33_reg; + tmp_1_reg_90_pp0_iter3_reg <= tmp_1_reg_90_pp0_iter2_reg; + tmp_1_reg_90_pp0_iter4_reg <= tmp_1_reg_90_pp0_iter3_reg; + tmp_1_reg_90_pp0_iter5_reg <= tmp_1_reg_90_pp0_iter4_reg; + tmp_1_reg_90_pp0_iter6_reg <= tmp_1_reg_90_pp0_iter5_reg; + tmp_1_reg_90_pp0_iter7_reg <= tmp_1_reg_90_pp0_iter6_reg; + tmp_1_reg_90_pp0_iter8_reg <= tmp_1_reg_90_pp0_iter7_reg; + tmp_1_reg_90_pp0_iter9_reg <= tmp_1_reg_90_pp0_iter8_reg; + tmp_reg_99_pp0_iter10_reg <= tmp_reg_99_pp0_iter9_reg; + tmp_reg_99_pp0_iter11_reg <= tmp_reg_99_pp0_iter10_reg; + tmp_reg_99_pp0_iter12_reg <= tmp_reg_99_pp0_iter11_reg; + tmp_reg_99_pp0_iter13_reg <= tmp_reg_99_pp0_iter12_reg; + tmp_reg_99_pp0_iter14_reg <= tmp_reg_99_pp0_iter13_reg; + tmp_reg_99_pp0_iter15_reg <= tmp_reg_99_pp0_iter14_reg; + tmp_reg_99_pp0_iter16_reg <= tmp_reg_99_pp0_iter15_reg; + tmp_reg_99_pp0_iter17_reg <= tmp_reg_99_pp0_iter16_reg; + tmp_reg_99_pp0_iter18_reg <= tmp_reg_99_pp0_iter17_reg; + tmp_reg_99_pp0_iter19_reg <= tmp_reg_99_pp0_iter18_reg; + tmp_reg_99_pp0_iter20_reg <= tmp_reg_99_pp0_iter19_reg; + tmp_reg_99_pp0_iter21_reg <= tmp_reg_99_pp0_iter20_reg; + tmp_reg_99_pp0_iter22_reg <= tmp_reg_99_pp0_iter21_reg; + tmp_reg_99_pp0_iter23_reg <= tmp_reg_99_pp0_iter22_reg; + tmp_reg_99_pp0_iter24_reg <= tmp_reg_99_pp0_iter23_reg; + tmp_reg_99_pp0_iter25_reg <= 
tmp_reg_99_pp0_iter24_reg; + tmp_reg_99_pp0_iter26_reg <= tmp_reg_99_pp0_iter25_reg; + tmp_reg_99_pp0_iter27_reg <= tmp_reg_99_pp0_iter26_reg; + tmp_reg_99_pp0_iter28_reg <= tmp_reg_99_pp0_iter27_reg; + tmp_reg_99_pp0_iter29_reg <= tmp_reg_99_pp0_iter28_reg; + tmp_reg_99_pp0_iter2_reg <= tmp_reg_99_pp0_iter1_reg; + tmp_reg_99_pp0_iter30_reg <= tmp_reg_99_pp0_iter29_reg; + tmp_reg_99_pp0_iter31_reg <= tmp_reg_99_pp0_iter30_reg; + tmp_reg_99_pp0_iter32_reg <= tmp_reg_99_pp0_iter31_reg; + tmp_reg_99_pp0_iter33_reg <= tmp_reg_99_pp0_iter32_reg; + tmp_reg_99_pp0_iter34_reg <= tmp_reg_99_pp0_iter33_reg; + tmp_reg_99_pp0_iter3_reg <= tmp_reg_99_pp0_iter2_reg; + tmp_reg_99_pp0_iter4_reg <= tmp_reg_99_pp0_iter3_reg; + tmp_reg_99_pp0_iter5_reg <= tmp_reg_99_pp0_iter4_reg; + tmp_reg_99_pp0_iter6_reg <= tmp_reg_99_pp0_iter5_reg; + tmp_reg_99_pp0_iter7_reg <= tmp_reg_99_pp0_iter6_reg; + tmp_reg_99_pp0_iter8_reg <= tmp_reg_99_pp0_iter7_reg; + tmp_reg_99_pp0_iter9_reg <= tmp_reg_99_pp0_iter8_reg; + end if; + end if; + end process; + process (ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (((ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (ap_const_boolean_0 = ap_block_pp0_stage0_11001))) then + tmp_1_reg_90_pp0_iter1_reg <= tmp_1_reg_90; + tmp_reg_99 <= tmp_nbreadreq_fu_32_p3; + tmp_reg_99_pp0_iter1_reg <= tmp_reg_99; + end if; + end if; + end process; + + ap_NS_fsm_assign_proc : process (ap_CS_fsm, ap_block_pp0_stage0_subdone, ap_reset_idle_pp0) + begin + case ap_CS_fsm is + when ap_ST_fsm_pp0_stage0 => + ap_NS_fsm <= ap_ST_fsm_pp0_stage0; + when others => + ap_NS_fsm <= "X"; + end case; + end process; + ap_CS_fsm_pp0_stage0 <= ap_CS_fsm(0); + ap_block_pp0_stage0 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_pp0_stage0_01001 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_pp0_stage0_11001 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_pp0_stage0_subdone <= not((ap_const_boolean_1 = ap_const_boolean_1)); + 
ap_block_state10_pp0_stage0_iter9 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state11_pp0_stage0_iter10 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state12_pp0_stage0_iter11 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state13_pp0_stage0_iter12 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state14_pp0_stage0_iter13 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state15_pp0_stage0_iter14 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state16_pp0_stage0_iter15 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state17_pp0_stage0_iter16 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state18_pp0_stage0_iter17 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state19_pp0_stage0_iter18 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state1_pp0_stage0_iter0 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state20_pp0_stage0_iter19 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state21_pp0_stage0_iter20 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state22_pp0_stage0_iter21 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state23_pp0_stage0_iter22 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state24_pp0_stage0_iter23 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state25_pp0_stage0_iter24 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state26_pp0_stage0_iter25 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state27_pp0_stage0_iter26 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state28_pp0_stage0_iter27 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state29_pp0_stage0_iter28 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state2_pp0_stage0_iter1 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state30_pp0_stage0_iter29 <= not((ap_const_boolean_1 = 
ap_const_boolean_1)); + ap_block_state31_pp0_stage0_iter30 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state32_pp0_stage0_iter31 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state33_pp0_stage0_iter32 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state34_pp0_stage0_iter33 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state35_pp0_stage0_iter34 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state36_pp0_stage0_iter35 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state3_pp0_stage0_iter2 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state4_pp0_stage0_iter3 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state5_pp0_stage0_iter4 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state6_pp0_stage0_iter5 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state7_pp0_stage0_iter6 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state8_pp0_stage0_iter7 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state9_pp0_stage0_iter8 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + + ap_done_assign_proc : process(ap_enable_reg_pp0_iter35, ap_block_pp0_stage0_subdone) + begin + if (((ap_const_boolean_0 = ap_block_pp0_stage0_subdone) and (ap_enable_reg_pp0_iter35 = ap_const_logic_1))) then + ap_done <= ap_const_logic_1; + else + ap_done <= ap_const_logic_0; + end if; + end process; + + ap_enable_pp0 <= (ap_idle_pp0 xor ap_const_logic_1); + ap_enable_reg_pp0_iter0 <= ap_start; + + ap_idle_assign_proc : process(ap_start, ap_CS_fsm_pp0_stage0, ap_idle_pp0) + begin + if (((ap_start = ap_const_logic_0) and (ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (ap_idle_pp0 = ap_const_logic_1))) then + ap_idle <= ap_const_logic_1; + else + ap_idle <= ap_const_logic_0; + end if; + end process; + + + ap_idle_pp0_assign_proc : process(ap_enable_reg_pp0_iter0, ap_enable_reg_pp0_iter1, ap_enable_reg_pp0_iter2, 
ap_enable_reg_pp0_iter3, ap_enable_reg_pp0_iter4, ap_enable_reg_pp0_iter5, ap_enable_reg_pp0_iter6, ap_enable_reg_pp0_iter7, ap_enable_reg_pp0_iter8, ap_enable_reg_pp0_iter9, ap_enable_reg_pp0_iter10, ap_enable_reg_pp0_iter11, ap_enable_reg_pp0_iter12, ap_enable_reg_pp0_iter13, ap_enable_reg_pp0_iter14, ap_enable_reg_pp0_iter15, ap_enable_reg_pp0_iter16, ap_enable_reg_pp0_iter17, ap_enable_reg_pp0_iter18, ap_enable_reg_pp0_iter19, ap_enable_reg_pp0_iter20, ap_enable_reg_pp0_iter21, ap_enable_reg_pp0_iter22, ap_enable_reg_pp0_iter23, ap_enable_reg_pp0_iter24, ap_enable_reg_pp0_iter25, ap_enable_reg_pp0_iter26, ap_enable_reg_pp0_iter27, ap_enable_reg_pp0_iter28, ap_enable_reg_pp0_iter29, ap_enable_reg_pp0_iter30, ap_enable_reg_pp0_iter31, ap_enable_reg_pp0_iter32, ap_enable_reg_pp0_iter33, ap_enable_reg_pp0_iter34, ap_enable_reg_pp0_iter35) + begin + if (((ap_enable_reg_pp0_iter26 = ap_const_logic_0) and (ap_enable_reg_pp0_iter25 = ap_const_logic_0) and (ap_enable_reg_pp0_iter24 = ap_const_logic_0) and (ap_enable_reg_pp0_iter23 = ap_const_logic_0) and (ap_enable_reg_pp0_iter22 = ap_const_logic_0) and (ap_enable_reg_pp0_iter21 = ap_const_logic_0) and (ap_enable_reg_pp0_iter20 = ap_const_logic_0) and (ap_enable_reg_pp0_iter19 = ap_const_logic_0) and (ap_enable_reg_pp0_iter18 = ap_const_logic_0) and (ap_enable_reg_pp0_iter17 = ap_const_logic_0) and (ap_enable_reg_pp0_iter16 = ap_const_logic_0) and (ap_enable_reg_pp0_iter15 = ap_const_logic_0) and (ap_enable_reg_pp0_iter14 = ap_const_logic_0) and (ap_enable_reg_pp0_iter13 = ap_const_logic_0) and (ap_enable_reg_pp0_iter12 = ap_const_logic_0) and (ap_enable_reg_pp0_iter11 = ap_const_logic_0) and (ap_enable_reg_pp0_iter10 = ap_const_logic_0) and (ap_enable_reg_pp0_iter9 = ap_const_logic_0) and (ap_enable_reg_pp0_iter8 = ap_const_logic_0) and (ap_enable_reg_pp0_iter7 = ap_const_logic_0) and (ap_enable_reg_pp0_iter6 = + ap_const_logic_0) and (ap_enable_reg_pp0_iter5 = ap_const_logic_0) and (ap_enable_reg_pp0_iter4 = 
ap_const_logic_0) and (ap_enable_reg_pp0_iter3 = ap_const_logic_0) and (ap_enable_reg_pp0_iter2 = ap_const_logic_0) and (ap_enable_reg_pp0_iter1 = ap_const_logic_0) and (ap_enable_reg_pp0_iter0 = ap_const_logic_0) and (ap_enable_reg_pp0_iter35 = ap_const_logic_0) and (ap_enable_reg_pp0_iter34 = ap_const_logic_0) and (ap_enable_reg_pp0_iter33 = ap_const_logic_0) and (ap_enable_reg_pp0_iter32 = ap_const_logic_0) and (ap_enable_reg_pp0_iter31 = ap_const_logic_0) and (ap_enable_reg_pp0_iter30 = ap_const_logic_0) and (ap_enable_reg_pp0_iter29 = ap_const_logic_0) and (ap_enable_reg_pp0_iter28 = ap_const_logic_0) and (ap_enable_reg_pp0_iter27 = ap_const_logic_0))) then + ap_idle_pp0 <= ap_const_logic_1; + else + ap_idle_pp0 <= ap_const_logic_0; + end if; + end process; + + + ap_idle_pp0_0to34_assign_proc : process(ap_enable_reg_pp0_iter0, ap_enable_reg_pp0_iter1, ap_enable_reg_pp0_iter2, ap_enable_reg_pp0_iter3, ap_enable_reg_pp0_iter4, ap_enable_reg_pp0_iter5, ap_enable_reg_pp0_iter6, ap_enable_reg_pp0_iter7, ap_enable_reg_pp0_iter8, ap_enable_reg_pp0_iter9, ap_enable_reg_pp0_iter10, ap_enable_reg_pp0_iter11, ap_enable_reg_pp0_iter12, ap_enable_reg_pp0_iter13, ap_enable_reg_pp0_iter14, ap_enable_reg_pp0_iter15, ap_enable_reg_pp0_iter16, ap_enable_reg_pp0_iter17, ap_enable_reg_pp0_iter18, ap_enable_reg_pp0_iter19, ap_enable_reg_pp0_iter20, ap_enable_reg_pp0_iter21, ap_enable_reg_pp0_iter22, ap_enable_reg_pp0_iter23, ap_enable_reg_pp0_iter24, ap_enable_reg_pp0_iter25, ap_enable_reg_pp0_iter26, ap_enable_reg_pp0_iter27, ap_enable_reg_pp0_iter28, ap_enable_reg_pp0_iter29, ap_enable_reg_pp0_iter30, ap_enable_reg_pp0_iter31, ap_enable_reg_pp0_iter32, ap_enable_reg_pp0_iter33, ap_enable_reg_pp0_iter34) + begin + if (((ap_enable_reg_pp0_iter26 = ap_const_logic_0) and (ap_enable_reg_pp0_iter25 = ap_const_logic_0) and (ap_enable_reg_pp0_iter24 = ap_const_logic_0) and (ap_enable_reg_pp0_iter23 = ap_const_logic_0) and (ap_enable_reg_pp0_iter22 = ap_const_logic_0) and 
(ap_enable_reg_pp0_iter21 = ap_const_logic_0) and (ap_enable_reg_pp0_iter20 = ap_const_logic_0) and (ap_enable_reg_pp0_iter19 = ap_const_logic_0) and (ap_enable_reg_pp0_iter18 = ap_const_logic_0) and (ap_enable_reg_pp0_iter17 = ap_const_logic_0) and (ap_enable_reg_pp0_iter16 = ap_const_logic_0) and (ap_enable_reg_pp0_iter15 = ap_const_logic_0) and (ap_enable_reg_pp0_iter14 = ap_const_logic_0) and (ap_enable_reg_pp0_iter13 = ap_const_logic_0) and (ap_enable_reg_pp0_iter12 = ap_const_logic_0) and (ap_enable_reg_pp0_iter11 = ap_const_logic_0) and (ap_enable_reg_pp0_iter10 = ap_const_logic_0) and (ap_enable_reg_pp0_iter9 = ap_const_logic_0) and (ap_enable_reg_pp0_iter8 = ap_const_logic_0) and (ap_enable_reg_pp0_iter7 = ap_const_logic_0) and (ap_enable_reg_pp0_iter6 = + ap_const_logic_0) and (ap_enable_reg_pp0_iter5 = ap_const_logic_0) and (ap_enable_reg_pp0_iter4 = ap_const_logic_0) and (ap_enable_reg_pp0_iter3 = ap_const_logic_0) and (ap_enable_reg_pp0_iter2 = ap_const_logic_0) and (ap_enable_reg_pp0_iter1 = ap_const_logic_0) and (ap_enable_reg_pp0_iter0 = ap_const_logic_0) and (ap_enable_reg_pp0_iter34 = ap_const_logic_0) and (ap_enable_reg_pp0_iter33 = ap_const_logic_0) and (ap_enable_reg_pp0_iter32 = ap_const_logic_0) and (ap_enable_reg_pp0_iter31 = ap_const_logic_0) and (ap_enable_reg_pp0_iter30 = ap_const_logic_0) and (ap_enable_reg_pp0_iter29 = ap_const_logic_0) and (ap_enable_reg_pp0_iter28 = ap_const_logic_0) and (ap_enable_reg_pp0_iter27 = ap_const_logic_0))) then + ap_idle_pp0_0to34 <= ap_const_logic_1; + else + ap_idle_pp0_0to34 <= ap_const_logic_0; + end if; + end process; + + + ap_ready_assign_proc : process(ap_CS_fsm_pp0_stage0, ap_enable_reg_pp0_iter0, ap_block_pp0_stage0_subdone) + begin + if (((ap_enable_reg_pp0_iter0 = ap_const_logic_1) and (ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (ap_const_boolean_0 = ap_block_pp0_stage0_subdone))) then + ap_ready <= ap_const_logic_1; + else + ap_ready <= ap_const_logic_0; + end if; + end process; + + + 
ap_reset_idle_pp0_assign_proc : process(ap_start, ap_idle_pp0_0to34) + begin + if (((ap_start = ap_const_logic_0) and (ap_idle_pp0_0to34 = ap_const_logic_1))) then + ap_reset_idle_pp0 <= ap_const_logic_1; + else + ap_reset_idle_pp0 <= ap_const_logic_0; + end if; + end process; + + + data_in_0_read_assign_proc : process(ap_CS_fsm_pp0_stage0, ap_enable_reg_pp0_iter0, data_in_0_empty_n, ap_block_pp0_stage0_11001, tmp_nbreadreq_fu_32_p3, tmp_1_nbreadreq_fu_40_p3) + begin + if (((ap_enable_reg_pp0_iter0 = ap_const_logic_1) and (ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (tmp_1_nbreadreq_fu_40_p3 = ap_const_lv1_1) and (tmp_nbreadreq_fu_32_p3 = ap_const_lv1_1) and (data_in_0_empty_n = ap_const_logic_1) and (ap_const_boolean_0 = ap_block_pp0_stage0_11001))) then + data_in_0_read <= ap_const_logic_1; + else + data_in_0_read <= ap_const_logic_0; + end if; + end process; + + + data_in_1_read_assign_proc : process(ap_CS_fsm_pp0_stage0, ap_enable_reg_pp0_iter0, data_in_1_empty_n, ap_block_pp0_stage0_11001, tmp_nbreadreq_fu_32_p3, tmp_1_nbreadreq_fu_40_p3) + begin + if (((ap_enable_reg_pp0_iter0 = ap_const_logic_1) and (ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (tmp_1_nbreadreq_fu_40_p3 = ap_const_lv1_1) and (tmp_nbreadreq_fu_32_p3 = ap_const_lv1_1) and (data_in_1_empty_n = ap_const_logic_1) and (ap_const_boolean_0 = ap_block_pp0_stage0_11001))) then + data_in_1_read <= ap_const_logic_1; + else + data_in_1_read <= ap_const_logic_0; + end if; + end process; + + data_out_0_din <= grp_fu_75_p2(16 - 1 downto 0); + + data_out_0_write_assign_proc : process(ap_enable_reg_pp0_iter35, data_out_0_full_n, ap_block_pp0_stage0_11001, tmp_1_reg_90_pp0_iter34_reg, tmp_reg_99_pp0_iter34_reg) + begin + if (((tmp_reg_99_pp0_iter34_reg = ap_const_lv1_1) and (tmp_1_reg_90_pp0_iter34_reg = ap_const_lv1_1) and (data_out_0_full_n = ap_const_logic_1) and (ap_const_boolean_0 = ap_block_pp0_stage0_11001) and (ap_enable_reg_pp0_iter35 = ap_const_logic_1))) then + data_out_0_write <= ap_const_logic_1; 
+ else + data_out_0_write <= ap_const_logic_0; + end if; + end process; + + tmp_1_nbreadreq_fu_40_p3 <= (0=>(data_in_1_empty_n), others=>'-'); + tmp_nbreadreq_fu_32_p3 <= (0=>(data_in_0_empty_n), others=>'-'); +end behav; diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/vhdl/div_sdiv_32ns_32ns_16_36_1.vhd b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/vhdl/div_sdiv_32ns_32ns_16_36_1.vhd new file mode 100644 index 000000000..ab2e84837 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/impl/vhdl/div_sdiv_32ns_32ns_16_36_1.vhd @@ -0,0 +1,198 @@ +-- ============================================================== +-- Generated by Vitis HLS v2023.1 +-- Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +-- Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. +-- ============================================================== +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +entity div_sdiv_32ns_32ns_16_36_1_divider is + generic ( + in0_WIDTH : INTEGER :=32; + in1_WIDTH : INTEGER :=32; + out_WIDTH : INTEGER :=32); + port ( + clk : in STD_LOGIC; + reset : in STD_LOGIC; + ce : in STD_LOGIC; + dividend : in STD_LOGIC_VECTOR(in0_WIDTH-1 downto 0); + divisor : in STD_LOGIC_VECTOR(in1_WIDTH-1 downto 0); + sign_i : in STD_LOGIC_VECTOR(1 downto 0); + sign_o : out STD_LOGIC_VECTOR(1 downto 0); + quot : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0); + remd : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0)); + + function max (left, right : INTEGER) return INTEGER is + begin + if left > right then return left; + else return right; + end if; + end max; + +end entity; + +architecture rtl of div_sdiv_32ns_32ns_16_36_1_divider is + constant cal_WIDTH : INTEGER := max(in0_WIDTH, in1_WIDTH); + type in0_vector is array(INTEGER range <>) of UNSIGNED(in0_WIDTH-1 downto 0); + type in1_vector is array(INTEGER range <>) of UNSIGNED(in1_WIDTH-1 downto 0); + type cal_vector is array(INTEGER 
range <>) of UNSIGNED(cal_WIDTH downto 0); + type sign_vector is array(INTEGER range <>) of UNSIGNED(1 downto 0); + + signal dividend_tmp : in0_vector(0 to in0_WIDTH); + signal divisor_tmp : in1_vector(0 to in0_WIDTH); + signal remd_tmp : in0_vector(0 to in0_WIDTH); + signal comb_tmp : in0_vector(0 to in0_WIDTH-1); + signal cal_tmp : cal_vector(0 to in0_WIDTH-1); + signal sign_tmp : sign_vector(0 to in0_WIDTH); +begin + quot <= STD_LOGIC_VECTOR(RESIZE(dividend_tmp(in0_WIDTH), out_WIDTH)); + remd <= STD_LOGIC_VECTOR(RESIZE(remd_tmp(in0_WIDTH), out_WIDTH)); + sign_o <= STD_LOGIC_VECTOR(sign_tmp(in0_WIDTH)); + + tran_tmp_proc : process (clk) + begin + if (clk'event and clk='1') then + if (ce = '1') then + dividend_tmp(0) <= UNSIGNED(dividend); + divisor_tmp(0) <= UNSIGNED(divisor); + sign_tmp(0) <= UNSIGNED(sign_i); + remd_tmp(0) <= (others => '0'); + end if; + end if; + end process tran_tmp_proc; + + run_proc: for i in 0 to in0_WIDTH-1 generate + begin + comb_tmp(i) <= remd_tmp(i)(in0_WIDTH-2 downto 0) & dividend_tmp(i)(in0_WIDTH-1); + cal_tmp(i) <= ('0' & comb_tmp(i)) - ('0' & divisor_tmp(i)); + + process (clk) + begin + if (clk'event and clk='1') then + if (ce = '1') then + dividend_tmp(i+1) <= dividend_tmp(i)(in0_WIDTH-2 downto 0) & (not cal_tmp(i)(cal_WIDTH)); + divisor_tmp(i+1) <= divisor_tmp(i); + sign_tmp(i+1) <= sign_tmp(i); + if cal_tmp(i)(cal_WIDTH) = '1' then + remd_tmp(i+1) <= comb_tmp(i); + else + remd_tmp(i+1) <= cal_tmp(i)(in0_WIDTH-1 downto 0); + end if; + end if; + end if; + end process; + end generate run_proc; + +end architecture; + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +entity div_sdiv_32ns_32ns_16_36_1 is + generic ( + ID : INTEGER :=1; + NUM_STAGE : INTEGER :=2; + din0_WIDTH : INTEGER :=32; + din1_WIDTH : INTEGER :=32; + dout_WIDTH : INTEGER :=32); + port ( + clk : in STD_LOGIC; + reset : in STD_LOGIC; + ce : in STD_LOGIC; + din0 : in STD_LOGIC_VECTOR(din0_WIDTH-1 downto 0); + din1 : in 
STD_LOGIC_VECTOR(din1_WIDTH-1 downto 0); + dout : out STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0)); +end entity; + +architecture rtl of div_sdiv_32ns_32ns_16_36_1 is + component div_sdiv_32ns_32ns_16_36_1_divider is + generic ( + in0_WIDTH : INTEGER :=32; + in1_WIDTH : INTEGER :=32; + out_WIDTH : INTEGER :=32); + port ( + reset : in STD_LOGIC; + clk : in STD_LOGIC; + ce : in STD_LOGIC; + dividend : in STD_LOGIC_VECTOR(in0_WIDTH-1 downto 0); + divisor : in STD_LOGIC_VECTOR(in1_WIDTH-1 downto 0); + sign_i : in STD_LOGIC_VECTOR(1 downto 0); + sign_o : out STD_LOGIC_VECTOR(1 downto 0); + quot : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0); + remd : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0)); + end component; + + signal dividend0 : STD_LOGIC_VECTOR(din0_WIDTH-1 downto 0); + signal divisor0 : STD_LOGIC_VECTOR(din1_WIDTH-1 downto 0); + signal dividend_u : STD_LOGIC_VECTOR(din0_WIDTH-1 downto 0); + signal divisor_u : STD_LOGIC_VECTOR(din1_WIDTH-1 downto 0); + signal quot_u : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal remd_u : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal quot : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal remd : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal sign_i : STD_LOGIC_VECTOR(1 downto 0); + signal sign_o : STD_LOGIC_VECTOR(1 downto 0); +begin + div_sdiv_32ns_32ns_16_36_1_divider_u : div_sdiv_32ns_32ns_16_36_1_divider + generic map( + in0_WIDTH => din0_WIDTH, + in1_WIDTH => din1_WIDTH, + out_WIDTH => dout_WIDTH) + port map( + clk => clk, + reset => reset, + ce => ce, + dividend => dividend_u, + divisor => divisor_u, + sign_i => sign_i, + sign_o => sign_o, + quot => quot_u, + remd => remd_u); + + sign_i <= (dividend0(din0_WIDTH-1) xor divisor0(din1_WIDTH-1)) & dividend0(din0_WIDTH-1); + dividend_u <= STD_LOGIC_VECTOR(UNSIGNED(not dividend0) + 1) when dividend0(din0_WIDTH-1) = '1' else dividend0; + divisor_u <= STD_LOGIC_VECTOR(UNSIGNED(not divisor0) + 1) when divisor0(din1_WIDTH-1) = '1' else divisor0; + +process (clk) +begin + if 
(clk'event and clk = '1') then + if (ce = '1') then + dividend0 <= din0; + divisor0 <= din1; + end if; + end if; +end process; + +process (clk) +begin + if (clk'event and clk = '1') then + if (ce = '1') then + if (sign_o(1) = '1') then + quot <= STD_LOGIC_VECTOR(UNSIGNED(not quot_u) + 1); + else + quot <= quot_u; + end if; + end if; + end if; +end process; + +process (clk) +begin + if (clk'event and clk = '1') then + if (ce = '1') then + if (sign_o(0) = '1') then + remd <= STD_LOGIC_VECTOR(UNSIGNED(not remd_u) + 1); + else + remd <= remd_u; + end if; + end if; + end if; +end process; + +dout <= quot; + +end architecture; + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/solution1.aps b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/solution1.aps new file mode 100644 index 000000000..be474c0f7 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/solution1.aps @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/solution1.directive b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/solution1.directive new file mode 100644 index 000000000..256020a24 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/solution1.directive @@ -0,0 +1,10 @@ + + + + + + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/solution1_data.json b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/solution1_data.json new file mode 100644 index 000000000..e7d98e453 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/solution1_data.json @@ -0,0 +1,325 @@ +{ + "Top": "div", + "RtlTop": "div", + "RtlPrefix": "", + "RtlSubPrefix": "div_", + "SourceLanguage": "cpp", + "HostMachineBits": "64", + "FunctionProtocol": "ap_ctrl_hs", + "ResetStyle": "control", + "Target": { + "Family": "virtexuplus", + "Device": "xcu250", + "Package": "-figd2104", + "Speed": "-2L-e", + "Triple": 
"fpga64-xilinx-none" + }, + "Args": { + "data_in_0": { + "index": "0", + "direction": "in", + "srcType": "stream, 0>&", + "srcSize": "32", + "hwRefs": [{ + "type": "interface", + "interface": "data_in_0", + "name": "", + "usage": "data", + "direction": "in" + }] + }, + "data_in_1": { + "index": "1", + "direction": "in", + "srcType": "stream, 0>&", + "srcSize": "32", + "hwRefs": [{ + "type": "interface", + "interface": "data_in_1", + "name": "", + "usage": "data", + "direction": "in" + }] + }, + "data_out_0": { + "index": "2", + "direction": "out", + "srcType": "stream, 0>&", + "srcSize": "16", + "hwRefs": [{ + "type": "interface", + "interface": "data_out_0", + "name": "", + "usage": "data", + "direction": "out" + }] + } + }, + "HlsSolution": { + "FlowTarget": "vivado", + "ConfigTcl": ["config_bind -effort=high"], + "DirectiveTcl": ["set_directive_top div -name div"], + "ProfileOption": "0", + "ProfileType": "none", + "KernelName": "div" + }, + "ClockInfo": { + "ClockName": "ap_clk", + "ClockPeriod": "10", + "Uncertainty": "2.7", + "IsCombinational": "0", + "II": "1", + "Latency": "35" + }, + "Xdc": {"OocClocks": ["create_clock -name ap_clk -period 10.000 [get_ports ap_clk]"]}, + "Ipx": { + "Vendor": "xilinx.com", + "Library": "hls", + "Name": "div", + "Version": "1.0", + "DisplayName": "Div", + "Revision": "2113675971", + "Description": "An IP generated by Vitis HLS", + "Taxonomy": "\/VITIS_HLS_IP", + "AutoFamilySupport": "", + "ZipFile": "xilinx_com_hls_div_1_0.zip" + }, + "Files": { + "CSource": ["..\/..\/div.cpp"], + "Vhdl": [ + "impl\/vhdl\/div_sdiv_32ns_32ns_16_36_1.vhd", + "impl\/vhdl\/div.vhd" + ], + "Verilog": [ + "impl\/verilog\/div_sdiv_32ns_32ns_16_36_1.v", + "impl\/verilog\/div.v" + ], + "IpMisc": ["impl\/misc\/logo.png"], + "CsynthXml": "syn\/report\/csynth.xml", + "DebugDir": ".debug", + "KernelXml": ".autopilot\/db\/kernel.internal.xml", + "Xo": "", + "XoHlsDir": "", + "ProtoInst": [".debug\/div.protoinst"] + }, + "SubcoreInfo": { + "HasXpmMemory": 
false, + "HasClockedDsp": false, + "Ip": [] + }, + "Interfaces": { + "ap_clk": { + "type": "clock", + "busTypeName": "clock", + "mode": "slave", + "busParams": {"ASSOCIATED_RESET": "ap_rst"}, + "portMap": {"ap_clk": "CLK"}, + "ports": ["ap_clk"] + }, + "ap_rst": { + "type": "reset", + "busTypeName": "reset", + "mode": "slave", + "busParams": {"POLARITY": "ACTIVE_HIGH"}, + "portMap": {"ap_rst": "RST"}, + "ports": ["ap_rst"] + }, + "ap_ctrl": { + "type": "ap_ctrl", + "busTypeName": "acc_handshake", + "mode": "slave", + "portMap": { + "ap_start": "start", + "ap_done": "done", + "ap_idle": "idle", + "ap_ready": "ready" + }, + "ports": [ + "ap_done", + "ap_idle", + "ap_ready", + "ap_start" + ] + }, + "data_in_0": { + "type": "ap_fifo", + "busTypeName": "acc_fifo_read", + "mode": "master", + "dataWidth": "32", + "portPrefix": "data_in_0_", + "portMap": { + "data_in_0_dout": "RD_DATA", + "data_in_0_empty_n": "EMPTY_N", + "data_in_0_read": "RD_EN" + }, + "ports": [ + "data_in_0_dout", + "data_in_0_empty_n", + "data_in_0_read" + ], + "constraints": [{ + "constraint_type": "pragma interface", + "mode": "ap_fifo", + "register_option": "0", + "argName": "data_in_0" + }] + }, + "data_in_1": { + "type": "ap_fifo", + "busTypeName": "acc_fifo_read", + "mode": "master", + "dataWidth": "32", + "portPrefix": "data_in_1_", + "portMap": { + "data_in_1_dout": "RD_DATA", + "data_in_1_empty_n": "EMPTY_N", + "data_in_1_read": "RD_EN" + }, + "ports": [ + "data_in_1_dout", + "data_in_1_empty_n", + "data_in_1_read" + ], + "constraints": [{ + "constraint_type": "pragma interface", + "mode": "ap_fifo", + "register_option": "0", + "argName": "data_in_1" + }] + }, + "data_out_0": { + "type": "ap_fifo", + "busTypeName": "acc_fifo_write", + "mode": "master", + "dataWidth": "16", + "portPrefix": "data_out_0_", + "portMap": { + "data_out_0_din": "WR_DATA", + "data_out_0_full_n": "FULL_N", + "data_out_0_write": "WR_EN" + }, + "ports": [ + "data_out_0_din", + "data_out_0_full_n", + "data_out_0_write" + 
], + "constraints": [{ + "constraint_type": "pragma interface", + "mode": "ap_fifo", + "register_option": "0", + "argName": "data_out_0" + }] + } + }, + "RtlPorts": { + "ap_clk": { + "dir": "in", + "width": "1" + }, + "ap_rst": { + "dir": "in", + "width": "1" + }, + "ap_start": { + "dir": "in", + "width": "1" + }, + "ap_done": { + "dir": "out", + "width": "1" + }, + "ap_idle": { + "dir": "out", + "width": "1" + }, + "ap_ready": { + "dir": "out", + "width": "1" + }, + "data_in_0_dout": { + "dir": "in", + "width": "32" + }, + "data_in_0_empty_n": { + "dir": "in", + "width": "1" + }, + "data_in_0_read": { + "dir": "out", + "width": "1" + }, + "data_in_1_dout": { + "dir": "in", + "width": "32" + }, + "data_in_1_empty_n": { + "dir": "in", + "width": "1" + }, + "data_in_1_read": { + "dir": "out", + "width": "1" + }, + "data_out_0_din": { + "dir": "out", + "width": "16" + }, + "data_out_0_full_n": { + "dir": "in", + "width": "1" + }, + "data_out_0_write": { + "dir": "out", + "width": "1" + } + }, + "ModuleInfo": { + "Hierarchy": {"ModuleName": "div"}, + "Info": {"div": { + "FunctionProtocol": "ap_ctrl_hs", + "isTaskLevelControl": "0", + "isPipelined": "1", + "isCombinational": "0", + "isOneStateSeq": "0" + }}, + "Metrics": {"div": { + "Latency": { + "LatencyBest": "35", + "LatencyAvg": "35", + "LatencyWorst": "35", + "PipelineII": "1", + "PipelineDepth": "36", + "PipelineType": "yes" + }, + "Timing": { + "Target": "10.00", + "Uncertainty": "2.70", + "Estimate": "2.593" + }, + "Area": { + "FF": "2449", + "AVAIL_FF": "3456000", + "UTIL_FF": "~0", + "LUT": "1808", + "AVAIL_LUT": "1728000", + "UTIL_LUT": "~0", + "BRAM_18K": "0", + "AVAIL_BRAM": "5376", + "UTIL_BRAM": "0", + "DSP": "0", + "AVAIL_DSP": "12288", + "UTIL_DSP": "0", + "URAM": "0", + "AVAIL_URAM": "1280", + "UTIL_URAM": "0" + } + }} + }, + "GenerateBdFiles": "0", + "GenData": { + "DataVersion": "0.2", + "Time": "2024-08-04 22:51:45 UTC", + "ToolName": "vitis_hls", + "ToolVersion": "2023.1" + } +} diff --git 
a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth.rpt b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth.rpt new file mode 100644 index 000000000..12766fd7f --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth.rpt @@ -0,0 +1,90 @@ + + +================================================================ +== Synthesis Summary Report of 'div' +================================================================ ++ General Information: + * Date: Sun Aug 4 22:51:44 2024 + * Version: 2023.1 (Build 3854077 on May 4 2023) + * Project: prj + * Solution: solution1 (Vivado IP Flow Target) + * Product family: virtexuplus + * Target device: xcu250-figd2104-2L-e + + ++ Performance & Resource Estimates: + + PS: '+' for module; 'o' for loop; '*' for dataflow + +--------+------+------+---------+---------+----------+---------+------+----------+------+----+------------+------------+-----+ + | Modules| Issue| | Latency | Latency | Iteration| | Trip | | | | | | | + | & Loops| Type | Slack| (cycles)| (ns) | Latency | Interval| Count| Pipelined| BRAM | DSP| FF | LUT | URAM| + +--------+------+------+---------+---------+----------+---------+------+----------+------+----+------------+------------+-----+ + |+ div | -| 4.71| 35| 350.000| -| 1| -| yes| -| -| 2449 (~0%)| 1808 (~0%)| -| + +--------+------+------+---------+---------+----------+---------+------+----------+------+----+------------+------------+-----+ + + +================================================================ +== HW Interfaces +================================================================ +* AP_FIFO ++------------+-----------+------------+ +| Interface | Direction | Data Width | ++------------+-----------+------------+ +| data_in_0 | out | 32 | +| data_in_1 | out | 32 | +| data_out_0 | out | 16 | ++------------+-----------+------------+ + +* TOP LEVEL CONTROL ++-----------+------------+-----------------------------------+ +| 
Interface | Type | Ports | ++-----------+------------+-----------------------------------+ +| ap_clk | clock | ap_clk | +| ap_rst | reset | ap_rst | +| ap_ctrl | ap_ctrl_hs | ap_done ap_idle ap_ready ap_start | ++-----------+------------+-----------------------------------+ + + +================================================================ +== SW I/O Information +================================================================ +* Top Function Arguments ++------------+-----------+------------------------+ +| Argument | Direction | Datatype | ++------------+-----------+------------------------+ +| data_in_0 | in | stream, 0>& | +| data_in_1 | in | stream, 0>& | +| data_out_0 | out | stream, 0>& | ++------------+-----------+------------------------+ + +* SW-to-HW Mapping ++------------+--------------+-----------+ +| Argument | HW Interface | HW Type | ++------------+--------------+-----------+ +| data_in_0 | data_in_0 | interface | +| data_in_1 | data_in_1 | interface | +| data_out_0 | data_out_0 | interface | ++------------+--------------+-----------+ + + +================================================================ +== Bind Op Report +================================================================ + No bind op info in design + +================================================================ +== Bind Storage Report +================================================================ + No bind storage info in design + +================================================================ +== Pragma Report +================================================================ +* Valid Pragma Syntax ++----------+---------+-------------------+ +| Type | Options | Location | ++----------+---------+-------------------+ +| pipeline | II = 1 | div.cpp:13 in div | ++----------+---------+-------------------+ + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth.xml 
b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth.xml new file mode 100644 index 000000000..2b5d66fc1 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth.xml @@ -0,0 +1,422 @@ + + + 2023.1 + + + ns + virtexuplus + xcu250-figd2104-2L-e + div + 10.00 + 2.70 + 1 + vivado + + + yes + + ns + 2.593 + + + clock cycles + 35 + 35 + 35 + 0.350 us + 0.350 us + 0.350 us + 1 + 36 + 1 + 1 + + + + + 2449 + 1808 + 0 + 0 + 0 + + + 5376 + 12288 + 3456000 + 1728000 + 1280 + + + + + ap_clk + div + return value + + ap_ctrl_hs + + in + 1 + control + + + ap_rst + div + return value + + ap_ctrl_hs + + in + 1 + control + + + ap_start + div + return value + + ap_ctrl_hs + + in + 1 + control + + + ap_done + div + return value + + ap_ctrl_hs + + out + 1 + control + + + ap_idle + div + return value + + ap_ctrl_hs + + out + 1 + control + + + ap_ready + div + return value + + ap_ctrl_hs + + out + 1 + control + + + data_in_0_dout + data_in_0 + pointer + + ap_fifo + + in + 32 + control + int + + + data_in_0_empty_n + data_in_0 + pointer + + ap_fifo + + in + 1 + control + int + + + data_in_0_read + data_in_0 + pointer + + ap_fifo + + out + 1 + control + int + + + data_in_1_dout + data_in_1 + pointer + + ap_fifo + + in + 32 + control + int + + + data_in_1_empty_n + data_in_1 + pointer + + ap_fifo + + in + 1 + control + int + + + data_in_1_read + data_in_1 + pointer + + ap_fifo + + out + 1 + control + int + + + data_out_0_din + data_out_0 + pointer + + ap_fifo + + out + 16 + control + int + + + data_out_0_full_n + data_out_0 + pointer + + ap_fifo + + in + 1 + control + int + + + data_out_0_write + data_out_0 + pointer + + ap_fifo + + out + 1 + control + int + + + + + div + + + + + div + + + 10.00 + 2.70 + 1 + 2.593 + + + 35 + 35 + 35 + 0.350 us + 0.350 us + 0.350 us + 1 + 36 + yes + + + + + 2449 + 3456000 + ~0 + 1808 + 1728000 + ~0 + 0 + 5376 + 0 + 0 + 12288 + 0 + 0 + 1280 + 0 + + + + + + + + + + + + + + + + + + + + + + + + + + + 
ap_rst + + + CLK + + + ap_clk + + + + + ACTIVE_HIGH + + + RST + + + ap_rst + + + + + start + done + idle + ready + + + ap_done + ap_idle + ap_ready + ap_start + + + + + RD_DATA + EMPTY_N + RD_EN + + + data_in_0_dout + data_in_0_empty_n + data_in_0_read + + + + + + + + RD_DATA + EMPTY_N + RD_EN + + + data_in_1_dout + data_in_1_empty_n + data_in_1_read + + + + + + + + WR_DATA + FULL_N + WR_EN + + + data_out_0_din + data_out_0_full_n + data_out_0_write + + + + + + + +
+ + + Interface, Direction, Data Width + out, 32 + out, 32 + out, 16 +
+
+ + + Interface, Type, Ports + clock, ap_clk + reset, ap_rst + ap_ctrl_hs, ap_done ap_idle ap_ready ap_start +
+
+
+
+ +
+ + + Argument, Direction, Datatype + in, stream<ap_int<32> 0>& + in, stream<ap_int<32> 0>& + out, stream<ap_int<16> 0>& +
+
+ + + Argument, HW Interface, HW Type + data_in_0, interface + data_in_1, interface + data_out_0, interface +
+
+
+
+ www.xilinx.com/cgi-bin/docs/rdoc?v=2023.1;t=hls+guidance;d=ZZZ.html + + + +
+ diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth_design_size.rpt b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth_design_size.rpt new file mode 100644 index 000000000..2b4a087df --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth_design_size.rpt @@ -0,0 +1,29 @@ +================================================================ +== Design Size Report +================================================================ + +* Instructions per Function for each Compilation Phase ++----------+------------+--------------+---------------+----------------------+---------------+ +| Function | Location | Compile/Link | Unroll/Inline | Performance/Pipeline | Optimizations | ++----------+------------+--------------+---------------+----------------------+---------------+ +| div | div.cpp:10 | 188 | 27 | 17 | 44 | ++----------+------------+--------------+---------------+----------------------+---------------+ + +* Description of Compilation Phases ++----------------------+------------------------------------------------------------+ +| Compilation Phase | Description | ++----------------------+------------------------------------------------------------+ +| Compile/Link | All functions are compiled and linked into a single design | +| Unroll/Inline | After user unroll and inline pragmas are applied | +| Performance/Pipeline | After performance and pipeline pragmas are applied | +| Optimizations | After high level synthesis optimizations | ++----------------------+------------------------------------------------------------+ + +* Design Size Message Settings ++---------------------------------------------+--------+------------------------------------------------------------------+ +| Message Setting | Value | Description | ++---------------------------------------------+--------+------------------------------------------------------------------+ +| config_compile 
-design_size_maximum_warning | 100000 | Show a warning when total design instructions exceeds this value | ++---------------------------------------------+--------+------------------------------------------------------------------+ + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth_design_size.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth_design_size.xml new file mode 100644 index 000000000..327784e75 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/csynth_design_size.xml @@ -0,0 +1,27 @@ + + + +
+ + + + + + + + + Compilation Phase, Description + All functions are compiled and linked into a single design + After user unroll and inline pragmas are applied + After performance and pipeline pragmas are applied + After high level synthesis optimizations +
+
+ + + Message Setting, Value, Description + 100000, Show a warning when total design instructions exceeds this value +
+
+ + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/div_csynth.rpt b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/div_csynth.rpt new file mode 100644 index 000000000..43ecb5822 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/div_csynth.rpt @@ -0,0 +1,176 @@ + + +================================================================ +== Vitis HLS Report for 'div' +================================================================ +* Date: Sun Aug 4 22:51:44 2024 + +* Version: 2023.1 (Build 3854077 on May 4 2023) +* Project: prj +* Solution: solution1 (Vivado IP Flow Target) +* Product family: virtexuplus +* Target device: xcu250-figd2104-2L-e + + +================================================================ +== Performance Estimates +================================================================ ++ Timing: + * Summary: + +--------+----------+----------+------------+ + | Clock | Target | Estimated| Uncertainty| + +--------+----------+----------+------------+ + |ap_clk | 10.00 ns| 2.593 ns| 2.70 ns| + +--------+----------+----------+------------+ + ++ Latency: + * Summary: + +---------+---------+----------+----------+-----+-----+---------+ + | Latency (cycles) | Latency (absolute) | Interval | Pipeline| + | min | max | min | max | min | max | Type | + +---------+---------+----------+----------+-----+-----+---------+ + | 35| 35| 0.350 us| 0.350 us| 1| 1| yes| + +---------+---------+----------+----------+-----+-----+---------+ + + + Detail: + * Instance: + N/A + + * Loop: + N/A + + + +================================================================ +== Utilization Estimates +================================================================ +* Summary: ++---------------------+---------+-------+---------+---------+------+ +| Name | BRAM_18K| DSP | FF | LUT | URAM | ++---------------------+---------+-------+---------+---------+------+ +|DSP | -| -| -| -| -| +|Expression | -| -| 0| 
6| -| +|FIFO | -| -| -| -| -| +|Instance | -| -| 2283| 1738| -| +|Memory | -| -| -| -| -| +|Multiplexer | -| -| -| -| -| +|Register | -| -| 166| 64| -| ++---------------------+---------+-------+---------+---------+------+ +|Total | 0| 0| 2449| 1808| 0| ++---------------------+---------+-------+---------+---------+------+ +|Available SLR | 1344| 3072| 864000| 432000| 320| ++---------------------+---------+-------+---------+---------+------+ +|Utilization SLR (%) | 0| 0| ~0| ~0| 0| ++---------------------+---------+-------+---------+---------+------+ +|Available | 5376| 12288| 3456000| 1728000| 1280| ++---------------------+---------+-------+---------+---------+------+ +|Utilization (%) | 0| 0| ~0| ~0| 0| ++---------------------+---------+-------+---------+---------+------+ + ++ Detail: + * Instance: + +---------------------------+------------------------+---------+----+------+------+-----+ + | Instance | Module | BRAM_18K| DSP| FF | LUT | URAM| + +---------------------------+------------------------+---------+----+------+------+-----+ + |sdiv_32ns_32ns_16_36_1_U1 |sdiv_32ns_32ns_16_36_1 | 0| 0| 2283| 1738| 0| + +---------------------------+------------------------+---------+----+------+------+-----+ + |Total | | 0| 0| 2283| 1738| 0| + +---------------------------+------------------------+---------+----+------+------+-----+ + + * DSP: + N/A + + * Memory: + N/A + + * FIFO: + N/A + + * Expression: + +--------------------------+----------+----+---+----+------------+------------+ + | Variable Name | Operation| DSP| FF| LUT| Bitwidth P0| Bitwidth P1| + +--------------------------+----------+----+---+----+------------+------------+ + |tmp_1_nbreadreq_fu_40_p3 | and| 0| 0| 2| 1| 0| + |tmp_nbreadreq_fu_32_p3 | and| 0| 0| 2| 1| 0| + |ap_enable_pp0 | xor| 0| 0| 2| 1| 2| + +--------------------------+----------+----+---+----+------------+------------+ + |Total | | 0| 0| 6| 3| 2| + +--------------------------+----------+----+---+----+------------+------------+ + + * Multiplexer: 
+ N/A + + * Register: + +--------------------------+----+----+-----+-----------+ + | Name | FF | LUT| Bits| Const Bits| + +--------------------------+----+----+-----+-----------+ + |ap_CS_fsm | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter1 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter10 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter11 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter12 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter13 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter14 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter15 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter16 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter17 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter18 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter19 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter2 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter20 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter21 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter22 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter23 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter24 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter25 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter26 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter27 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter28 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter29 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter3 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter30 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter31 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter32 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter33 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter34 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter35 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter4 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter5 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter6 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter7 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter8 | 1| 0| 1| 0| + |ap_enable_reg_pp0_iter9 | 1| 0| 1| 0| + |tmp_1_reg_90 | 1| 0| 1| 0| + |tmp_reg_99 | 1| 0| 1| 0| + |tmp_1_reg_90 | 64| 32| 1| 0| + |tmp_reg_99 | 64| 32| 1| 0| + +--------------------------+----+----+-----+-----------+ + |Total | 166| 64| 40| 0| + +--------------------------+----+----+-----+-----------+ + + + 
+================================================================ +== Interface +================================================================ +* Summary: ++-------------------+-----+-----+------------+--------------+--------------+ +| RTL Ports | Dir | Bits| Protocol | Source Object| C Type | ++-------------------+-----+-----+------------+--------------+--------------+ +|ap_clk | in| 1| ap_ctrl_hs| div| return value| +|ap_rst | in| 1| ap_ctrl_hs| div| return value| +|ap_start | in| 1| ap_ctrl_hs| div| return value| +|ap_done | out| 1| ap_ctrl_hs| div| return value| +|ap_idle | out| 1| ap_ctrl_hs| div| return value| +|ap_ready | out| 1| ap_ctrl_hs| div| return value| +|data_in_0_dout | in| 32| ap_fifo| data_in_0| pointer| +|data_in_0_empty_n | in| 1| ap_fifo| data_in_0| pointer| +|data_in_0_read | out| 1| ap_fifo| data_in_0| pointer| +|data_in_1_dout | in| 32| ap_fifo| data_in_1| pointer| +|data_in_1_empty_n | in| 1| ap_fifo| data_in_1| pointer| +|data_in_1_read | out| 1| ap_fifo| data_in_1| pointer| +|data_out_0_din | out| 16| ap_fifo| data_out_0| pointer| +|data_out_0_full_n | in| 1| ap_fifo| data_out_0| pointer| +|data_out_0_write | out| 1| ap_fifo| data_out_0| pointer| ++-------------------+-----+-----+------------+--------------+--------------+ + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/div_csynth.xml b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/div_csynth.xml new file mode 100644 index 000000000..cac770419 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/report/div_csynth.xml @@ -0,0 +1,233 @@ + + + +2023.1 + + + +ns +virtexuplus +xcu250-figd2104-2L-e +div +10.00 +2.70 +1 +vivado + + + +yes + +ns +2.593 + + +clock cycles +35 +35 +35 +0.350 us +0.350 us +0.350 us +1 +36 +1 +1 + + + + + +2449 +1808 +0 +0 +0 + + +5376 +12288 +3456000 +1728000 +1280 + + + + + +ap_clk +div +return value + +ap_ctrl_hs + +in +1 +control + + +ap_rst +div +return value + +ap_ctrl_hs + 
+in +1 +control + + +ap_start +div +return value + +ap_ctrl_hs + +in +1 +control + + +ap_done +div +return value + +ap_ctrl_hs + +out +1 +control + + +ap_idle +div +return value + +ap_ctrl_hs + +out +1 +control + + +ap_ready +div +return value + +ap_ctrl_hs + +out +1 +control + + +data_in_0_dout +data_in_0 +pointer + +ap_fifo + +in +32 +control +int + + +data_in_0_empty_n +data_in_0 +pointer + +ap_fifo + +in +1 +control +int + + +data_in_0_read +data_in_0 +pointer + +ap_fifo + +out +1 +control +int + + +data_in_1_dout +data_in_1 +pointer + +ap_fifo + +in +32 +control +int + + +data_in_1_empty_n +data_in_1 +pointer + +ap_fifo + +in +1 +control +int + + +data_in_1_read +data_in_1 +pointer + +ap_fifo + +out +1 +control +int + + +data_out_0_din +data_out_0 +pointer + +ap_fifo + +out +16 +control +int + + +data_out_0_full_n +data_out_0 +pointer + +ap_fifo + +in +1 +control +int + + +data_out_0_write +data_out_0 +pointer + +ap_fifo + +out +1 +control +int + + + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/verilog/div.v b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/verilog/div.v new file mode 100644 index 000000000..4476f0432 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/verilog/div.v @@ -0,0 +1,887 @@ +// ============================================================== +// Generated by Vitis HLS v2023.1 +// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
+// ============================================================== + +`timescale 1 ns / 1 ps + +(* CORE_GENERATION_INFO="div_div,hls_ip_2023_1,{HLS_INPUT_TYPE=cxx,HLS_INPUT_FLOAT=0,HLS_INPUT_FIXED=0,HLS_INPUT_PART=xcu250-figd2104-2L-e,HLS_INPUT_CLOCK=10.000000,HLS_INPUT_ARCH=pipeline,HLS_SYN_CLOCK=2.593000,HLS_SYN_LAT=35,HLS_SYN_TPT=1,HLS_SYN_MEM=0,HLS_SYN_DSP=0,HLS_SYN_FF=2449,HLS_SYN_LUT=1808,HLS_VERSION=2023_1}" *) + +module div ( + ap_clk, + ap_rst, + ap_start, + ap_done, + ap_idle, + ap_ready, + data_in_0_dout, + data_in_0_empty_n, + data_in_0_read, + data_in_1_dout, + data_in_1_empty_n, + data_in_1_read, + data_out_0_din, + data_out_0_full_n, + data_out_0_write +); + +parameter ap_ST_fsm_pp0_stage0 = 1'd1; + +input ap_clk; +input ap_rst; +input ap_start; +output ap_done; +output ap_idle; +output ap_ready; +input [31:0] data_in_0_dout; +input data_in_0_empty_n; +output data_in_0_read; +input [31:0] data_in_1_dout; +input data_in_1_empty_n; +output data_in_1_read; +output [15:0] data_out_0_din; +input data_out_0_full_n; +output data_out_0_write; + +reg ap_done; +reg ap_idle; +reg ap_ready; +reg data_in_0_read; +reg data_in_1_read; +reg data_out_0_write; + +(* fsm_encoding = "none" *) reg [0:0] ap_CS_fsm; +wire ap_CS_fsm_pp0_stage0; +wire ap_enable_reg_pp0_iter0; +reg ap_enable_reg_pp0_iter1; +reg ap_enable_reg_pp0_iter2; +reg ap_enable_reg_pp0_iter3; +reg ap_enable_reg_pp0_iter4; +reg ap_enable_reg_pp0_iter5; +reg ap_enable_reg_pp0_iter6; +reg ap_enable_reg_pp0_iter7; +reg ap_enable_reg_pp0_iter8; +reg ap_enable_reg_pp0_iter9; +reg ap_enable_reg_pp0_iter10; +reg ap_enable_reg_pp0_iter11; +reg ap_enable_reg_pp0_iter12; +reg ap_enable_reg_pp0_iter13; +reg ap_enable_reg_pp0_iter14; +reg ap_enable_reg_pp0_iter15; +reg ap_enable_reg_pp0_iter16; +reg ap_enable_reg_pp0_iter17; +reg ap_enable_reg_pp0_iter18; +reg ap_enable_reg_pp0_iter19; +reg ap_enable_reg_pp0_iter20; +reg ap_enable_reg_pp0_iter21; +reg ap_enable_reg_pp0_iter22; +reg ap_enable_reg_pp0_iter23; +reg 
ap_enable_reg_pp0_iter24; +reg ap_enable_reg_pp0_iter25; +reg ap_enable_reg_pp0_iter26; +reg ap_enable_reg_pp0_iter27; +reg ap_enable_reg_pp0_iter28; +reg ap_enable_reg_pp0_iter29; +reg ap_enable_reg_pp0_iter30; +reg ap_enable_reg_pp0_iter31; +reg ap_enable_reg_pp0_iter32; +reg ap_enable_reg_pp0_iter33; +reg ap_enable_reg_pp0_iter34; +reg ap_enable_reg_pp0_iter35; +reg ap_idle_pp0; +wire ap_block_state1_pp0_stage0_iter0; +wire ap_block_state2_pp0_stage0_iter1; +wire ap_block_state3_pp0_stage0_iter2; +wire ap_block_state4_pp0_stage0_iter3; +wire ap_block_state5_pp0_stage0_iter4; +wire ap_block_state6_pp0_stage0_iter5; +wire ap_block_state7_pp0_stage0_iter6; +wire ap_block_state8_pp0_stage0_iter7; +wire ap_block_state9_pp0_stage0_iter8; +wire ap_block_state10_pp0_stage0_iter9; +wire ap_block_state11_pp0_stage0_iter10; +wire ap_block_state12_pp0_stage0_iter11; +wire ap_block_state13_pp0_stage0_iter12; +wire ap_block_state14_pp0_stage0_iter13; +wire ap_block_state15_pp0_stage0_iter14; +wire ap_block_state16_pp0_stage0_iter15; +wire ap_block_state17_pp0_stage0_iter16; +wire ap_block_state18_pp0_stage0_iter17; +wire ap_block_state19_pp0_stage0_iter18; +wire ap_block_state20_pp0_stage0_iter19; +wire ap_block_state21_pp0_stage0_iter20; +wire ap_block_state22_pp0_stage0_iter21; +wire ap_block_state23_pp0_stage0_iter22; +wire ap_block_state24_pp0_stage0_iter23; +wire ap_block_state25_pp0_stage0_iter24; +wire ap_block_state26_pp0_stage0_iter25; +wire ap_block_state27_pp0_stage0_iter26; +wire ap_block_state28_pp0_stage0_iter27; +wire ap_block_state29_pp0_stage0_iter28; +wire ap_block_state30_pp0_stage0_iter29; +wire ap_block_state31_pp0_stage0_iter30; +wire ap_block_state32_pp0_stage0_iter31; +wire ap_block_state33_pp0_stage0_iter32; +wire ap_block_state34_pp0_stage0_iter33; +wire ap_block_state35_pp0_stage0_iter34; +wire ap_block_state36_pp0_stage0_iter35; +wire ap_block_pp0_stage0_subdone; +wire ap_block_pp0_stage0_11001; +wire [0:0] tmp_nbreadreq_fu_32_p3; +wire [0:0] 
tmp_1_nbreadreq_fu_40_p3; +reg [0:0] tmp_1_reg_90; +reg [0:0] tmp_1_reg_90_pp0_iter1_reg; +reg [0:0] tmp_1_reg_90_pp0_iter2_reg; +reg [0:0] tmp_1_reg_90_pp0_iter3_reg; +reg [0:0] tmp_1_reg_90_pp0_iter4_reg; +reg [0:0] tmp_1_reg_90_pp0_iter5_reg; +reg [0:0] tmp_1_reg_90_pp0_iter6_reg; +reg [0:0] tmp_1_reg_90_pp0_iter7_reg; +reg [0:0] tmp_1_reg_90_pp0_iter8_reg; +reg [0:0] tmp_1_reg_90_pp0_iter9_reg; +reg [0:0] tmp_1_reg_90_pp0_iter10_reg; +reg [0:0] tmp_1_reg_90_pp0_iter11_reg; +reg [0:0] tmp_1_reg_90_pp0_iter12_reg; +reg [0:0] tmp_1_reg_90_pp0_iter13_reg; +reg [0:0] tmp_1_reg_90_pp0_iter14_reg; +reg [0:0] tmp_1_reg_90_pp0_iter15_reg; +reg [0:0] tmp_1_reg_90_pp0_iter16_reg; +reg [0:0] tmp_1_reg_90_pp0_iter17_reg; +reg [0:0] tmp_1_reg_90_pp0_iter18_reg; +reg [0:0] tmp_1_reg_90_pp0_iter19_reg; +reg [0:0] tmp_1_reg_90_pp0_iter20_reg; +reg [0:0] tmp_1_reg_90_pp0_iter21_reg; +reg [0:0] tmp_1_reg_90_pp0_iter22_reg; +reg [0:0] tmp_1_reg_90_pp0_iter23_reg; +reg [0:0] tmp_1_reg_90_pp0_iter24_reg; +reg [0:0] tmp_1_reg_90_pp0_iter25_reg; +reg [0:0] tmp_1_reg_90_pp0_iter26_reg; +reg [0:0] tmp_1_reg_90_pp0_iter27_reg; +reg [0:0] tmp_1_reg_90_pp0_iter28_reg; +reg [0:0] tmp_1_reg_90_pp0_iter29_reg; +reg [0:0] tmp_1_reg_90_pp0_iter30_reg; +reg [0:0] tmp_1_reg_90_pp0_iter31_reg; +reg [0:0] tmp_1_reg_90_pp0_iter32_reg; +reg [0:0] tmp_1_reg_90_pp0_iter33_reg; +reg [0:0] tmp_1_reg_90_pp0_iter34_reg; +reg [0:0] tmp_reg_99; +reg [0:0] tmp_reg_99_pp0_iter1_reg; +reg [0:0] tmp_reg_99_pp0_iter2_reg; +reg [0:0] tmp_reg_99_pp0_iter3_reg; +reg [0:0] tmp_reg_99_pp0_iter4_reg; +reg [0:0] tmp_reg_99_pp0_iter5_reg; +reg [0:0] tmp_reg_99_pp0_iter6_reg; +reg [0:0] tmp_reg_99_pp0_iter7_reg; +reg [0:0] tmp_reg_99_pp0_iter8_reg; +reg [0:0] tmp_reg_99_pp0_iter9_reg; +reg [0:0] tmp_reg_99_pp0_iter10_reg; +reg [0:0] tmp_reg_99_pp0_iter11_reg; +reg [0:0] tmp_reg_99_pp0_iter12_reg; +reg [0:0] tmp_reg_99_pp0_iter13_reg; +reg [0:0] tmp_reg_99_pp0_iter14_reg; +reg [0:0] tmp_reg_99_pp0_iter15_reg; +reg [0:0] 
tmp_reg_99_pp0_iter16_reg; +reg [0:0] tmp_reg_99_pp0_iter17_reg; +reg [0:0] tmp_reg_99_pp0_iter18_reg; +reg [0:0] tmp_reg_99_pp0_iter19_reg; +reg [0:0] tmp_reg_99_pp0_iter20_reg; +reg [0:0] tmp_reg_99_pp0_iter21_reg; +reg [0:0] tmp_reg_99_pp0_iter22_reg; +reg [0:0] tmp_reg_99_pp0_iter23_reg; +reg [0:0] tmp_reg_99_pp0_iter24_reg; +reg [0:0] tmp_reg_99_pp0_iter25_reg; +reg [0:0] tmp_reg_99_pp0_iter26_reg; +reg [0:0] tmp_reg_99_pp0_iter27_reg; +reg [0:0] tmp_reg_99_pp0_iter28_reg; +reg [0:0] tmp_reg_99_pp0_iter29_reg; +reg [0:0] tmp_reg_99_pp0_iter30_reg; +reg [0:0] tmp_reg_99_pp0_iter31_reg; +reg [0:0] tmp_reg_99_pp0_iter32_reg; +reg [0:0] tmp_reg_99_pp0_iter33_reg; +reg [0:0] tmp_reg_99_pp0_iter34_reg; +wire ap_block_pp0_stage0_01001; +wire ap_block_pp0_stage0; +wire [15:0] grp_fu_75_p2; +reg [0:0] ap_NS_fsm; +reg ap_idle_pp0_0to34; +reg ap_reset_idle_pp0; +wire ap_enable_pp0; +wire ap_ce_reg; + +// power-on initialization +initial begin +#0 ap_CS_fsm = 1'd1; +#0 ap_enable_reg_pp0_iter1 = 1'b0; +#0 ap_enable_reg_pp0_iter2 = 1'b0; +#0 ap_enable_reg_pp0_iter3 = 1'b0; +#0 ap_enable_reg_pp0_iter4 = 1'b0; +#0 ap_enable_reg_pp0_iter5 = 1'b0; +#0 ap_enable_reg_pp0_iter6 = 1'b0; +#0 ap_enable_reg_pp0_iter7 = 1'b0; +#0 ap_enable_reg_pp0_iter8 = 1'b0; +#0 ap_enable_reg_pp0_iter9 = 1'b0; +#0 ap_enable_reg_pp0_iter10 = 1'b0; +#0 ap_enable_reg_pp0_iter11 = 1'b0; +#0 ap_enable_reg_pp0_iter12 = 1'b0; +#0 ap_enable_reg_pp0_iter13 = 1'b0; +#0 ap_enable_reg_pp0_iter14 = 1'b0; +#0 ap_enable_reg_pp0_iter15 = 1'b0; +#0 ap_enable_reg_pp0_iter16 = 1'b0; +#0 ap_enable_reg_pp0_iter17 = 1'b0; +#0 ap_enable_reg_pp0_iter18 = 1'b0; +#0 ap_enable_reg_pp0_iter19 = 1'b0; +#0 ap_enable_reg_pp0_iter20 = 1'b0; +#0 ap_enable_reg_pp0_iter21 = 1'b0; +#0 ap_enable_reg_pp0_iter22 = 1'b0; +#0 ap_enable_reg_pp0_iter23 = 1'b0; +#0 ap_enable_reg_pp0_iter24 = 1'b0; +#0 ap_enable_reg_pp0_iter25 = 1'b0; +#0 ap_enable_reg_pp0_iter26 = 1'b0; +#0 ap_enable_reg_pp0_iter27 = 1'b0; +#0 ap_enable_reg_pp0_iter28 = 1'b0; 
+#0 ap_enable_reg_pp0_iter29 = 1'b0; +#0 ap_enable_reg_pp0_iter30 = 1'b0; +#0 ap_enable_reg_pp0_iter31 = 1'b0; +#0 ap_enable_reg_pp0_iter32 = 1'b0; +#0 ap_enable_reg_pp0_iter33 = 1'b0; +#0 ap_enable_reg_pp0_iter34 = 1'b0; +#0 ap_enable_reg_pp0_iter35 = 1'b0; +end + +div_sdiv_32ns_32ns_16_36_1 #( + .ID( 1 ), + .NUM_STAGE( 36 ), + .din0_WIDTH( 32 ), + .din1_WIDTH( 32 ), + .dout_WIDTH( 16 )) +sdiv_32ns_32ns_16_36_1_U1( + .clk(ap_clk), + .reset(ap_rst), + .din0(data_in_0_dout), + .din1(data_in_1_dout), + .ce(1'b1), + .dout(grp_fu_75_p2) +); + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_CS_fsm <= ap_ST_fsm_pp0_stage0; + end else begin + ap_CS_fsm <= ap_NS_fsm; + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter1 <= 1'b0; + end else begin + if (((1'b1 == ap_CS_fsm_pp0_stage0) & (1'b0 == ap_block_pp0_stage0_subdone))) begin + ap_enable_reg_pp0_iter1 <= ap_start; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter10 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter10 <= ap_enable_reg_pp0_iter9; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter11 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter11 <= ap_enable_reg_pp0_iter10; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter12 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter12 <= ap_enable_reg_pp0_iter11; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter13 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter13 <= ap_enable_reg_pp0_iter12; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + 
ap_enable_reg_pp0_iter14 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter14 <= ap_enable_reg_pp0_iter13; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter15 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter15 <= ap_enable_reg_pp0_iter14; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter16 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter16 <= ap_enable_reg_pp0_iter15; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter17 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter17 <= ap_enable_reg_pp0_iter16; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter18 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter18 <= ap_enable_reg_pp0_iter17; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter19 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter19 <= ap_enable_reg_pp0_iter18; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter2 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter2 <= ap_enable_reg_pp0_iter1; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter20 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter20 <= ap_enable_reg_pp0_iter19; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter21 <= 1'b0; + end else begin + if ((1'b0 == 
ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter21 <= ap_enable_reg_pp0_iter20; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter22 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter22 <= ap_enable_reg_pp0_iter21; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter23 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter23 <= ap_enable_reg_pp0_iter22; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter24 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter24 <= ap_enable_reg_pp0_iter23; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter25 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter25 <= ap_enable_reg_pp0_iter24; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter26 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter26 <= ap_enable_reg_pp0_iter25; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter27 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter27 <= ap_enable_reg_pp0_iter26; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter28 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter28 <= ap_enable_reg_pp0_iter27; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter29 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter29 <= 
ap_enable_reg_pp0_iter28; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter3 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter3 <= ap_enable_reg_pp0_iter2; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter30 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter30 <= ap_enable_reg_pp0_iter29; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter31 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter31 <= ap_enable_reg_pp0_iter30; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter32 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter32 <= ap_enable_reg_pp0_iter31; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter33 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter33 <= ap_enable_reg_pp0_iter32; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter34 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter34 <= ap_enable_reg_pp0_iter33; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter35 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter35 <= ap_enable_reg_pp0_iter34; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter4 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter4 <= ap_enable_reg_pp0_iter3; + end + end +end + +always @ (posedge ap_clk) begin + if 
(ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter5 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter5 <= ap_enable_reg_pp0_iter4; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter6 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter6 <= ap_enable_reg_pp0_iter5; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter7 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter7 <= ap_enable_reg_pp0_iter6; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter8 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter8 <= ap_enable_reg_pp0_iter7; + end + end +end + +always @ (posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter9 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter9 <= ap_enable_reg_pp0_iter8; + end + end +end + +always @ (posedge ap_clk) begin + if (((1'b1 == ap_CS_fsm_pp0_stage0) & (tmp_nbreadreq_fu_32_p3 == 1'd1) & (1'b0 == ap_block_pp0_stage0_11001))) begin + tmp_1_reg_90 <= tmp_1_nbreadreq_fu_40_p3; + end +end + +always @ (posedge ap_clk) begin + if ((1'b0 == ap_block_pp0_stage0_11001)) begin + tmp_1_reg_90_pp0_iter10_reg <= tmp_1_reg_90_pp0_iter9_reg; + tmp_1_reg_90_pp0_iter11_reg <= tmp_1_reg_90_pp0_iter10_reg; + tmp_1_reg_90_pp0_iter12_reg <= tmp_1_reg_90_pp0_iter11_reg; + tmp_1_reg_90_pp0_iter13_reg <= tmp_1_reg_90_pp0_iter12_reg; + tmp_1_reg_90_pp0_iter14_reg <= tmp_1_reg_90_pp0_iter13_reg; + tmp_1_reg_90_pp0_iter15_reg <= tmp_1_reg_90_pp0_iter14_reg; + tmp_1_reg_90_pp0_iter16_reg <= tmp_1_reg_90_pp0_iter15_reg; + tmp_1_reg_90_pp0_iter17_reg <= tmp_1_reg_90_pp0_iter16_reg; + tmp_1_reg_90_pp0_iter18_reg <= tmp_1_reg_90_pp0_iter17_reg; + 
tmp_1_reg_90_pp0_iter19_reg <= tmp_1_reg_90_pp0_iter18_reg; + tmp_1_reg_90_pp0_iter20_reg <= tmp_1_reg_90_pp0_iter19_reg; + tmp_1_reg_90_pp0_iter21_reg <= tmp_1_reg_90_pp0_iter20_reg; + tmp_1_reg_90_pp0_iter22_reg <= tmp_1_reg_90_pp0_iter21_reg; + tmp_1_reg_90_pp0_iter23_reg <= tmp_1_reg_90_pp0_iter22_reg; + tmp_1_reg_90_pp0_iter24_reg <= tmp_1_reg_90_pp0_iter23_reg; + tmp_1_reg_90_pp0_iter25_reg <= tmp_1_reg_90_pp0_iter24_reg; + tmp_1_reg_90_pp0_iter26_reg <= tmp_1_reg_90_pp0_iter25_reg; + tmp_1_reg_90_pp0_iter27_reg <= tmp_1_reg_90_pp0_iter26_reg; + tmp_1_reg_90_pp0_iter28_reg <= tmp_1_reg_90_pp0_iter27_reg; + tmp_1_reg_90_pp0_iter29_reg <= tmp_1_reg_90_pp0_iter28_reg; + tmp_1_reg_90_pp0_iter2_reg <= tmp_1_reg_90_pp0_iter1_reg; + tmp_1_reg_90_pp0_iter30_reg <= tmp_1_reg_90_pp0_iter29_reg; + tmp_1_reg_90_pp0_iter31_reg <= tmp_1_reg_90_pp0_iter30_reg; + tmp_1_reg_90_pp0_iter32_reg <= tmp_1_reg_90_pp0_iter31_reg; + tmp_1_reg_90_pp0_iter33_reg <= tmp_1_reg_90_pp0_iter32_reg; + tmp_1_reg_90_pp0_iter34_reg <= tmp_1_reg_90_pp0_iter33_reg; + tmp_1_reg_90_pp0_iter3_reg <= tmp_1_reg_90_pp0_iter2_reg; + tmp_1_reg_90_pp0_iter4_reg <= tmp_1_reg_90_pp0_iter3_reg; + tmp_1_reg_90_pp0_iter5_reg <= tmp_1_reg_90_pp0_iter4_reg; + tmp_1_reg_90_pp0_iter6_reg <= tmp_1_reg_90_pp0_iter5_reg; + tmp_1_reg_90_pp0_iter7_reg <= tmp_1_reg_90_pp0_iter6_reg; + tmp_1_reg_90_pp0_iter8_reg <= tmp_1_reg_90_pp0_iter7_reg; + tmp_1_reg_90_pp0_iter9_reg <= tmp_1_reg_90_pp0_iter8_reg; + tmp_reg_99_pp0_iter10_reg <= tmp_reg_99_pp0_iter9_reg; + tmp_reg_99_pp0_iter11_reg <= tmp_reg_99_pp0_iter10_reg; + tmp_reg_99_pp0_iter12_reg <= tmp_reg_99_pp0_iter11_reg; + tmp_reg_99_pp0_iter13_reg <= tmp_reg_99_pp0_iter12_reg; + tmp_reg_99_pp0_iter14_reg <= tmp_reg_99_pp0_iter13_reg; + tmp_reg_99_pp0_iter15_reg <= tmp_reg_99_pp0_iter14_reg; + tmp_reg_99_pp0_iter16_reg <= tmp_reg_99_pp0_iter15_reg; + tmp_reg_99_pp0_iter17_reg <= tmp_reg_99_pp0_iter16_reg; + tmp_reg_99_pp0_iter18_reg <= tmp_reg_99_pp0_iter17_reg; + 
tmp_reg_99_pp0_iter19_reg <= tmp_reg_99_pp0_iter18_reg; + tmp_reg_99_pp0_iter20_reg <= tmp_reg_99_pp0_iter19_reg; + tmp_reg_99_pp0_iter21_reg <= tmp_reg_99_pp0_iter20_reg; + tmp_reg_99_pp0_iter22_reg <= tmp_reg_99_pp0_iter21_reg; + tmp_reg_99_pp0_iter23_reg <= tmp_reg_99_pp0_iter22_reg; + tmp_reg_99_pp0_iter24_reg <= tmp_reg_99_pp0_iter23_reg; + tmp_reg_99_pp0_iter25_reg <= tmp_reg_99_pp0_iter24_reg; + tmp_reg_99_pp0_iter26_reg <= tmp_reg_99_pp0_iter25_reg; + tmp_reg_99_pp0_iter27_reg <= tmp_reg_99_pp0_iter26_reg; + tmp_reg_99_pp0_iter28_reg <= tmp_reg_99_pp0_iter27_reg; + tmp_reg_99_pp0_iter29_reg <= tmp_reg_99_pp0_iter28_reg; + tmp_reg_99_pp0_iter2_reg <= tmp_reg_99_pp0_iter1_reg; + tmp_reg_99_pp0_iter30_reg <= tmp_reg_99_pp0_iter29_reg; + tmp_reg_99_pp0_iter31_reg <= tmp_reg_99_pp0_iter30_reg; + tmp_reg_99_pp0_iter32_reg <= tmp_reg_99_pp0_iter31_reg; + tmp_reg_99_pp0_iter33_reg <= tmp_reg_99_pp0_iter32_reg; + tmp_reg_99_pp0_iter34_reg <= tmp_reg_99_pp0_iter33_reg; + tmp_reg_99_pp0_iter3_reg <= tmp_reg_99_pp0_iter2_reg; + tmp_reg_99_pp0_iter4_reg <= tmp_reg_99_pp0_iter3_reg; + tmp_reg_99_pp0_iter5_reg <= tmp_reg_99_pp0_iter4_reg; + tmp_reg_99_pp0_iter6_reg <= tmp_reg_99_pp0_iter5_reg; + tmp_reg_99_pp0_iter7_reg <= tmp_reg_99_pp0_iter6_reg; + tmp_reg_99_pp0_iter8_reg <= tmp_reg_99_pp0_iter7_reg; + tmp_reg_99_pp0_iter9_reg <= tmp_reg_99_pp0_iter8_reg; + end +end + +always @ (posedge ap_clk) begin + if (((1'b1 == ap_CS_fsm_pp0_stage0) & (1'b0 == ap_block_pp0_stage0_11001))) begin + tmp_1_reg_90_pp0_iter1_reg <= tmp_1_reg_90; + tmp_reg_99 <= tmp_nbreadreq_fu_32_p3; + tmp_reg_99_pp0_iter1_reg <= tmp_reg_99; + end +end + +always @ (*) begin + if (((1'b0 == ap_block_pp0_stage0_subdone) & (ap_enable_reg_pp0_iter35 == 1'b1))) begin + ap_done = 1'b1; + end else begin + ap_done = 1'b0; + end +end + +always @ (*) begin + if (((ap_start == 1'b0) & (1'b1 == ap_CS_fsm_pp0_stage0) & (ap_idle_pp0 == 1'b1))) begin + ap_idle = 1'b1; + end else begin + ap_idle = 1'b0; + end +end + 
+always @ (*) begin + if (((ap_enable_reg_pp0_iter26 == 1'b0) & (ap_enable_reg_pp0_iter25 == 1'b0) & (ap_enable_reg_pp0_iter24 == 1'b0) & (ap_enable_reg_pp0_iter23 == 1'b0) & (ap_enable_reg_pp0_iter22 == 1'b0) & (ap_enable_reg_pp0_iter21 == 1'b0) & (ap_enable_reg_pp0_iter20 == 1'b0) & (ap_enable_reg_pp0_iter19 == 1'b0) & (ap_enable_reg_pp0_iter18 == 1'b0) & (ap_enable_reg_pp0_iter17 == 1'b0) & (ap_enable_reg_pp0_iter16 == 1'b0) & (ap_enable_reg_pp0_iter15 == 1'b0) & (ap_enable_reg_pp0_iter14 == 1'b0) & (ap_enable_reg_pp0_iter13 == 1'b0) & (ap_enable_reg_pp0_iter12 == 1'b0) & (ap_enable_reg_pp0_iter11 == 1'b0) & (ap_enable_reg_pp0_iter10 == 1'b0) & (ap_enable_reg_pp0_iter9 == 1'b0) & (ap_enable_reg_pp0_iter8 == 1'b0) & (ap_enable_reg_pp0_iter7 == 1'b0) & (ap_enable_reg_pp0_iter6 == 1'b0) & (ap_enable_reg_pp0_iter5 == 1'b0) & (ap_enable_reg_pp0_iter4 == 1'b0) & (ap_enable_reg_pp0_iter3 == 1'b0) & (ap_enable_reg_pp0_iter2 == 1'b0) & (ap_enable_reg_pp0_iter1 == 1'b0) & (ap_enable_reg_pp0_iter0 == 1'b0) & (ap_enable_reg_pp0_iter35 == 1'b0) + & (ap_enable_reg_pp0_iter34 == 1'b0) & (ap_enable_reg_pp0_iter33 == 1'b0) & (ap_enable_reg_pp0_iter32 == 1'b0) & (ap_enable_reg_pp0_iter31 == 1'b0) & (ap_enable_reg_pp0_iter30 == 1'b0) & (ap_enable_reg_pp0_iter29 == 1'b0) & (ap_enable_reg_pp0_iter28 == 1'b0) & (ap_enable_reg_pp0_iter27 == 1'b0))) begin + ap_idle_pp0 = 1'b1; + end else begin + ap_idle_pp0 = 1'b0; + end +end + +always @ (*) begin + if (((ap_enable_reg_pp0_iter26 == 1'b0) & (ap_enable_reg_pp0_iter25 == 1'b0) & (ap_enable_reg_pp0_iter24 == 1'b0) & (ap_enable_reg_pp0_iter23 == 1'b0) & (ap_enable_reg_pp0_iter22 == 1'b0) & (ap_enable_reg_pp0_iter21 == 1'b0) & (ap_enable_reg_pp0_iter20 == 1'b0) & (ap_enable_reg_pp0_iter19 == 1'b0) & (ap_enable_reg_pp0_iter18 == 1'b0) & (ap_enable_reg_pp0_iter17 == 1'b0) & (ap_enable_reg_pp0_iter16 == 1'b0) & (ap_enable_reg_pp0_iter15 == 1'b0) & (ap_enable_reg_pp0_iter14 == 1'b0) & (ap_enable_reg_pp0_iter13 == 1'b0) & 
(ap_enable_reg_pp0_iter12 == 1'b0) & (ap_enable_reg_pp0_iter11 == 1'b0) & (ap_enable_reg_pp0_iter10 == 1'b0) & (ap_enable_reg_pp0_iter9 == 1'b0) & (ap_enable_reg_pp0_iter8 == 1'b0) & (ap_enable_reg_pp0_iter7 == 1'b0) & (ap_enable_reg_pp0_iter6 == 1'b0) & (ap_enable_reg_pp0_iter5 == 1'b0) & (ap_enable_reg_pp0_iter4 == 1'b0) & (ap_enable_reg_pp0_iter3 == 1'b0) & (ap_enable_reg_pp0_iter2 == 1'b0) & (ap_enable_reg_pp0_iter1 == 1'b0) & (ap_enable_reg_pp0_iter0 == 1'b0) & (ap_enable_reg_pp0_iter34 == 1'b0) + & (ap_enable_reg_pp0_iter33 == 1'b0) & (ap_enable_reg_pp0_iter32 == 1'b0) & (ap_enable_reg_pp0_iter31 == 1'b0) & (ap_enable_reg_pp0_iter30 == 1'b0) & (ap_enable_reg_pp0_iter29 == 1'b0) & (ap_enable_reg_pp0_iter28 == 1'b0) & (ap_enable_reg_pp0_iter27 == 1'b0))) begin + ap_idle_pp0_0to34 = 1'b1; + end else begin + ap_idle_pp0_0to34 = 1'b0; + end +end + +always @ (*) begin + if (((ap_enable_reg_pp0_iter0 == 1'b1) & (1'b1 == ap_CS_fsm_pp0_stage0) & (1'b0 == ap_block_pp0_stage0_subdone))) begin + ap_ready = 1'b1; + end else begin + ap_ready = 1'b0; + end +end + +always @ (*) begin + if (((ap_start == 1'b0) & (ap_idle_pp0_0to34 == 1'b1))) begin + ap_reset_idle_pp0 = 1'b1; + end else begin + ap_reset_idle_pp0 = 1'b0; + end +end + +always @ (*) begin + if (((ap_enable_reg_pp0_iter0 == 1'b1) & (1'b1 == ap_CS_fsm_pp0_stage0) & (tmp_1_nbreadreq_fu_40_p3 == 1'd1) & (tmp_nbreadreq_fu_32_p3 == 1'd1) & (data_in_0_empty_n == 1'b1) & (1'b0 == ap_block_pp0_stage0_11001))) begin + data_in_0_read = 1'b1; + end else begin + data_in_0_read = 1'b0; + end +end + +always @ (*) begin + if (((ap_enable_reg_pp0_iter0 == 1'b1) & (1'b1 == ap_CS_fsm_pp0_stage0) & (tmp_1_nbreadreq_fu_40_p3 == 1'd1) & (tmp_nbreadreq_fu_32_p3 == 1'd1) & (data_in_1_empty_n == 1'b1) & (1'b0 == ap_block_pp0_stage0_11001))) begin + data_in_1_read = 1'b1; + end else begin + data_in_1_read = 1'b0; + end +end + +always @ (*) begin + if (((tmp_reg_99_pp0_iter34_reg == 1'd1) & (tmp_1_reg_90_pp0_iter34_reg == 1'd1) & 
(data_out_0_full_n == 1'b1) & (1'b0 == ap_block_pp0_stage0_11001) & (ap_enable_reg_pp0_iter35 == 1'b1))) begin + data_out_0_write = 1'b1; + end else begin + data_out_0_write = 1'b0; + end +end + +always @ (*) begin + case (ap_CS_fsm) + ap_ST_fsm_pp0_stage0 : begin + ap_NS_fsm = ap_ST_fsm_pp0_stage0; + end + default : begin + ap_NS_fsm = 'bx; + end + endcase +end + +assign ap_CS_fsm_pp0_stage0 = ap_CS_fsm[32'd0]; + +assign ap_block_pp0_stage0 = ~(1'b1 == 1'b1); + +assign ap_block_pp0_stage0_01001 = ~(1'b1 == 1'b1); + +assign ap_block_pp0_stage0_11001 = ~(1'b1 == 1'b1); + +assign ap_block_pp0_stage0_subdone = ~(1'b1 == 1'b1); + +assign ap_block_state10_pp0_stage0_iter9 = ~(1'b1 == 1'b1); + +assign ap_block_state11_pp0_stage0_iter10 = ~(1'b1 == 1'b1); + +assign ap_block_state12_pp0_stage0_iter11 = ~(1'b1 == 1'b1); + +assign ap_block_state13_pp0_stage0_iter12 = ~(1'b1 == 1'b1); + +assign ap_block_state14_pp0_stage0_iter13 = ~(1'b1 == 1'b1); + +assign ap_block_state15_pp0_stage0_iter14 = ~(1'b1 == 1'b1); + +assign ap_block_state16_pp0_stage0_iter15 = ~(1'b1 == 1'b1); + +assign ap_block_state17_pp0_stage0_iter16 = ~(1'b1 == 1'b1); + +assign ap_block_state18_pp0_stage0_iter17 = ~(1'b1 == 1'b1); + +assign ap_block_state19_pp0_stage0_iter18 = ~(1'b1 == 1'b1); + +assign ap_block_state1_pp0_stage0_iter0 = ~(1'b1 == 1'b1); + +assign ap_block_state20_pp0_stage0_iter19 = ~(1'b1 == 1'b1); + +assign ap_block_state21_pp0_stage0_iter20 = ~(1'b1 == 1'b1); + +assign ap_block_state22_pp0_stage0_iter21 = ~(1'b1 == 1'b1); + +assign ap_block_state23_pp0_stage0_iter22 = ~(1'b1 == 1'b1); + +assign ap_block_state24_pp0_stage0_iter23 = ~(1'b1 == 1'b1); + +assign ap_block_state25_pp0_stage0_iter24 = ~(1'b1 == 1'b1); + +assign ap_block_state26_pp0_stage0_iter25 = ~(1'b1 == 1'b1); + +assign ap_block_state27_pp0_stage0_iter26 = ~(1'b1 == 1'b1); + +assign ap_block_state28_pp0_stage0_iter27 = ~(1'b1 == 1'b1); + +assign ap_block_state29_pp0_stage0_iter28 = ~(1'b1 == 1'b1); + +assign 
ap_block_state2_pp0_stage0_iter1 = ~(1'b1 == 1'b1); + +assign ap_block_state30_pp0_stage0_iter29 = ~(1'b1 == 1'b1); + +assign ap_block_state31_pp0_stage0_iter30 = ~(1'b1 == 1'b1); + +assign ap_block_state32_pp0_stage0_iter31 = ~(1'b1 == 1'b1); + +assign ap_block_state33_pp0_stage0_iter32 = ~(1'b1 == 1'b1); + +assign ap_block_state34_pp0_stage0_iter33 = ~(1'b1 == 1'b1); + +assign ap_block_state35_pp0_stage0_iter34 = ~(1'b1 == 1'b1); + +assign ap_block_state36_pp0_stage0_iter35 = ~(1'b1 == 1'b1); + +assign ap_block_state3_pp0_stage0_iter2 = ~(1'b1 == 1'b1); + +assign ap_block_state4_pp0_stage0_iter3 = ~(1'b1 == 1'b1); + +assign ap_block_state5_pp0_stage0_iter4 = ~(1'b1 == 1'b1); + +assign ap_block_state6_pp0_stage0_iter5 = ~(1'b1 == 1'b1); + +assign ap_block_state7_pp0_stage0_iter6 = ~(1'b1 == 1'b1); + +assign ap_block_state8_pp0_stage0_iter7 = ~(1'b1 == 1'b1); + +assign ap_block_state9_pp0_stage0_iter8 = ~(1'b1 == 1'b1); + +assign ap_enable_pp0 = (ap_idle_pp0 ^ 1'b1); + +assign ap_enable_reg_pp0_iter0 = ap_start; + +assign data_out_0_din = grp_fu_75_p2[15:0]; + +assign tmp_1_nbreadreq_fu_40_p3 = data_in_1_empty_n; + +assign tmp_nbreadreq_fu_32_p3 = data_in_0_empty_n; + +endmodule //div diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/verilog/div_sdiv_32ns_32ns_16_36_1.v b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/verilog/div_sdiv_32ns_32ns_16_36_1.v new file mode 100644 index 000000000..765d3dc70 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/verilog/div_sdiv_32ns_32ns_16_36_1.v @@ -0,0 +1,156 @@ +// ============================================================== +// Generated by Vitis HLS v2023.1 +// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
+// ============================================================== +`timescale 1 ns / 1 ps + +module div_sdiv_32ns_32ns_16_36_1_divider +#(parameter + in0_WIDTH = 32, + in1_WIDTH = 32, + out_WIDTH = 32 +) +( + input clk, + input reset, + input ce, + input [in0_WIDTH-1:0] dividend, + input [in1_WIDTH-1:0] divisor, + input [1:0] sign_i, + output wire [1:0] sign_o, + output wire [out_WIDTH-1:0] quot, + output wire [out_WIDTH-1:0] remd +); + +localparam cal_WIDTH = (in0_WIDTH > in1_WIDTH)? in0_WIDTH : in1_WIDTH; + +//------------------------Local signal------------------- +reg [in0_WIDTH-1:0] dividend_tmp[0:in0_WIDTH]; +reg [in1_WIDTH-1:0] divisor_tmp[0:in0_WIDTH]; +reg [in0_WIDTH-1:0] remd_tmp[0:in0_WIDTH]; +wire [in0_WIDTH-1:0] comb_tmp[0:in0_WIDTH-1]; +wire [cal_WIDTH:0] cal_tmp[0:in0_WIDTH-1]; +reg [1:0] sign_tmp[0:in0_WIDTH]; +//------------------------Body--------------------------- +assign quot = dividend_tmp[in0_WIDTH]; +assign remd = remd_tmp[in0_WIDTH]; +assign sign_o = sign_tmp[in0_WIDTH]; + +// dividend_tmp[0], divisor_tmp[0], remd_tmp[0] +always @(posedge clk) +begin + if (ce) begin + dividend_tmp[0] <= dividend; + divisor_tmp[0] <= divisor; + sign_tmp[0] <= sign_i; + remd_tmp[0] <= 1'b0; + end +end + +genvar i; +generate + for (i = 0; i < in0_WIDTH; i = i + 1) + begin : loop + if (in0_WIDTH == 1) assign comb_tmp[i] = dividend_tmp[i][0]; + else assign comb_tmp[i] = {remd_tmp[i][in0_WIDTH-2:0], dividend_tmp[i][in0_WIDTH-1]}; + assign cal_tmp[i] = {1'b0, comb_tmp[i]} - {1'b0, divisor_tmp[i]}; + + always @(posedge clk) + begin + if (ce) begin + if (in0_WIDTH == 1) dividend_tmp[i+1] <= ~cal_tmp[i][cal_WIDTH]; + else dividend_tmp[i+1] <= {dividend_tmp[i][in0_WIDTH-2:0], ~cal_tmp[i][cal_WIDTH]}; + divisor_tmp[i+1] <= divisor_tmp[i]; + remd_tmp[i+1] <= cal_tmp[i][cal_WIDTH]? 
comb_tmp[i] : cal_tmp[i][in0_WIDTH-1:0]; + sign_tmp[i+1] <= sign_tmp[i]; + end + end + end +endgenerate + +endmodule + +module div_sdiv_32ns_32ns_16_36_1 +#(parameter + ID = 1, + NUM_STAGE = 2, + din0_WIDTH = 32, + din1_WIDTH = 32, + dout_WIDTH = 32 +) +( + input clk, + input reset, + input ce, + input [din0_WIDTH-1:0] din0, + input [din1_WIDTH-1:0] din1, + output [dout_WIDTH-1:0] dout +); +//------------------------Local signal------------------- +reg [din0_WIDTH-1:0] dividend0; +reg [din1_WIDTH-1:0] divisor0; +wire [din0_WIDTH-1:0] dividend_u; +wire [din1_WIDTH-1:0] divisor_u; +wire [dout_WIDTH-1:0] quot_u; +wire [dout_WIDTH-1:0] remd_u; +reg [dout_WIDTH-1:0] quot; +reg [dout_WIDTH-1:0] remd; +wire [1:0] sign_i; +wire [1:0] sign_o; +//------------------------Instantiation------------------ +div_sdiv_32ns_32ns_16_36_1_divider #( + .in0_WIDTH ( din0_WIDTH ), + .in1_WIDTH ( din1_WIDTH ), + .out_WIDTH ( dout_WIDTH ) +) div_sdiv_32ns_32ns_16_36_1_divider_u ( + .clk ( clk ), + .reset ( reset ), + .ce ( ce ), + .dividend ( dividend_u ), + .divisor ( divisor_u ), + .sign_i ( sign_i ), + .sign_o ( sign_o ), + .quot ( quot_u ), + .remd ( remd_u ) +); +//------------------------Body--------------------------- +assign sign_i = {dividend0[din0_WIDTH-1] ^ divisor0[din1_WIDTH-1], dividend0[din0_WIDTH-1]}; +assign dividend_u = dividend0[din0_WIDTH-1]? ~dividend0[din0_WIDTH-1:0] + 1'b1 : + dividend0[din0_WIDTH-1:0]; +assign divisor_u = divisor0[din1_WIDTH-1]? 
~divisor0[din1_WIDTH-1:0] + 1'b1 : + divisor0[din1_WIDTH-1:0]; + +always @(posedge clk) +begin + if (ce) begin + dividend0 <= din0; + divisor0 <= din1; + end +end + +always @(posedge clk) +begin + if (ce) begin + if (sign_o[1]) + quot <= ~quot_u + 1'b1; + else + quot <= quot_u; + end +end + +always @(posedge clk) +begin + if (ce) begin + if (sign_o[0]) + remd <= ~remd_u + 1'b1; + else + remd <= remd_u; + end +end + +assign dout = quot; + +endmodule + + diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/vhdl/div.vhd b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/vhdl/div.vhd new file mode 100644 index 000000000..4d56aba1f --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/vhdl/div.vhd @@ -0,0 +1,982 @@ +-- ============================================================== +-- Generated by Vitis HLS v2023.1 +-- Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +-- Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
+-- ============================================================== + +library IEEE; +use IEEE.std_logic_1164.all; +use IEEE.numeric_std.all; + +entity div is +port ( + ap_clk : IN STD_LOGIC; + ap_rst : IN STD_LOGIC; + ap_start : IN STD_LOGIC; + ap_done : OUT STD_LOGIC; + ap_idle : OUT STD_LOGIC; + ap_ready : OUT STD_LOGIC; + data_in_0_dout : IN STD_LOGIC_VECTOR (31 downto 0); + data_in_0_empty_n : IN STD_LOGIC; + data_in_0_read : OUT STD_LOGIC; + data_in_1_dout : IN STD_LOGIC_VECTOR (31 downto 0); + data_in_1_empty_n : IN STD_LOGIC; + data_in_1_read : OUT STD_LOGIC; + data_out_0_din : OUT STD_LOGIC_VECTOR (15 downto 0); + data_out_0_full_n : IN STD_LOGIC; + data_out_0_write : OUT STD_LOGIC ); +end; + + +architecture behav of div is + attribute CORE_GENERATION_INFO : STRING; + attribute CORE_GENERATION_INFO of behav : architecture is + "div_div,hls_ip_2023_1,{HLS_INPUT_TYPE=cxx,HLS_INPUT_FLOAT=0,HLS_INPUT_FIXED=0,HLS_INPUT_PART=xcu250-figd2104-2L-e,HLS_INPUT_CLOCK=10.000000,HLS_INPUT_ARCH=pipeline,HLS_SYN_CLOCK=2.593000,HLS_SYN_LAT=35,HLS_SYN_TPT=1,HLS_SYN_MEM=0,HLS_SYN_DSP=0,HLS_SYN_FF=2449,HLS_SYN_LUT=1808,HLS_VERSION=2023_1}"; + constant ap_const_logic_1 : STD_LOGIC := '1'; + constant ap_const_logic_0 : STD_LOGIC := '0'; + constant ap_ST_fsm_pp0_stage0 : STD_LOGIC_VECTOR (0 downto 0) := "1"; + constant ap_const_lv32_0 : STD_LOGIC_VECTOR (31 downto 0) := "00000000000000000000000000000000"; + constant ap_const_boolean_1 : BOOLEAN := true; + constant ap_const_boolean_0 : BOOLEAN := false; + constant ap_const_lv1_1 : STD_LOGIC_VECTOR (0 downto 0) := "1"; + + signal ap_CS_fsm : STD_LOGIC_VECTOR (0 downto 0) := "1"; + attribute fsm_encoding : string; + attribute fsm_encoding of ap_CS_fsm : signal is "none"; + signal ap_CS_fsm_pp0_stage0 : STD_LOGIC; + attribute fsm_encoding of ap_CS_fsm_pp0_stage0 : signal is "none"; + signal ap_enable_reg_pp0_iter0 : STD_LOGIC; + signal ap_enable_reg_pp0_iter1 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter2 : STD_LOGIC := '0'; + 
signal ap_enable_reg_pp0_iter3 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter4 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter5 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter6 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter7 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter8 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter9 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter10 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter11 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter12 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter13 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter14 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter15 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter16 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter17 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter18 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter19 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter20 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter21 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter22 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter23 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter24 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter25 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter26 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter27 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter28 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter29 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter30 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter31 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter32 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter33 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter34 : STD_LOGIC := '0'; + signal ap_enable_reg_pp0_iter35 : STD_LOGIC := '0'; + signal ap_idle_pp0 : STD_LOGIC; + signal ap_block_state1_pp0_stage0_iter0 : BOOLEAN; + signal ap_block_state2_pp0_stage0_iter1 : BOOLEAN; + signal ap_block_state3_pp0_stage0_iter2 : BOOLEAN; + signal 
ap_block_state4_pp0_stage0_iter3 : BOOLEAN; + signal ap_block_state5_pp0_stage0_iter4 : BOOLEAN; + signal ap_block_state6_pp0_stage0_iter5 : BOOLEAN; + signal ap_block_state7_pp0_stage0_iter6 : BOOLEAN; + signal ap_block_state8_pp0_stage0_iter7 : BOOLEAN; + signal ap_block_state9_pp0_stage0_iter8 : BOOLEAN; + signal ap_block_state10_pp0_stage0_iter9 : BOOLEAN; + signal ap_block_state11_pp0_stage0_iter10 : BOOLEAN; + signal ap_block_state12_pp0_stage0_iter11 : BOOLEAN; + signal ap_block_state13_pp0_stage0_iter12 : BOOLEAN; + signal ap_block_state14_pp0_stage0_iter13 : BOOLEAN; + signal ap_block_state15_pp0_stage0_iter14 : BOOLEAN; + signal ap_block_state16_pp0_stage0_iter15 : BOOLEAN; + signal ap_block_state17_pp0_stage0_iter16 : BOOLEAN; + signal ap_block_state18_pp0_stage0_iter17 : BOOLEAN; + signal ap_block_state19_pp0_stage0_iter18 : BOOLEAN; + signal ap_block_state20_pp0_stage0_iter19 : BOOLEAN; + signal ap_block_state21_pp0_stage0_iter20 : BOOLEAN; + signal ap_block_state22_pp0_stage0_iter21 : BOOLEAN; + signal ap_block_state23_pp0_stage0_iter22 : BOOLEAN; + signal ap_block_state24_pp0_stage0_iter23 : BOOLEAN; + signal ap_block_state25_pp0_stage0_iter24 : BOOLEAN; + signal ap_block_state26_pp0_stage0_iter25 : BOOLEAN; + signal ap_block_state27_pp0_stage0_iter26 : BOOLEAN; + signal ap_block_state28_pp0_stage0_iter27 : BOOLEAN; + signal ap_block_state29_pp0_stage0_iter28 : BOOLEAN; + signal ap_block_state30_pp0_stage0_iter29 : BOOLEAN; + signal ap_block_state31_pp0_stage0_iter30 : BOOLEAN; + signal ap_block_state32_pp0_stage0_iter31 : BOOLEAN; + signal ap_block_state33_pp0_stage0_iter32 : BOOLEAN; + signal ap_block_state34_pp0_stage0_iter33 : BOOLEAN; + signal ap_block_state35_pp0_stage0_iter34 : BOOLEAN; + signal ap_block_state36_pp0_stage0_iter35 : BOOLEAN; + signal ap_block_pp0_stage0_subdone : BOOLEAN; + signal ap_block_pp0_stage0_11001 : BOOLEAN; + signal tmp_nbreadreq_fu_32_p3 : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_nbreadreq_fu_40_p3 : 
STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90 : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter1_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter2_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter3_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter4_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter5_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter6_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter7_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter8_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter9_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter10_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter11_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter12_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter13_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter14_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter15_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter16_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter17_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter18_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter19_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter20_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter21_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter22_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter23_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter24_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter25_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter26_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter27_reg : STD_LOGIC_VECTOR (0 downto 0); + signal 
tmp_1_reg_90_pp0_iter28_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter29_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter30_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter31_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter32_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter33_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_1_reg_90_pp0_iter34_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99 : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter1_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter2_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter3_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter4_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter5_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter6_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter7_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter8_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter9_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter10_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter11_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter12_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter13_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter14_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter15_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter16_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter17_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter18_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter19_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter20_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter21_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter22_reg : 
STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter23_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter24_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter25_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter26_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter27_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter28_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter29_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter30_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter31_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter32_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter33_reg : STD_LOGIC_VECTOR (0 downto 0); + signal tmp_reg_99_pp0_iter34_reg : STD_LOGIC_VECTOR (0 downto 0); + signal ap_block_pp0_stage0_01001 : BOOLEAN; + signal ap_block_pp0_stage0 : BOOLEAN; + signal grp_fu_75_p2 : STD_LOGIC_VECTOR (15 downto 0); + signal ap_NS_fsm : STD_LOGIC_VECTOR (0 downto 0); + signal ap_idle_pp0_0to34 : STD_LOGIC; + signal ap_reset_idle_pp0 : STD_LOGIC; + signal ap_enable_pp0 : STD_LOGIC; + signal ap_ce_reg : STD_LOGIC; + + component div_sdiv_32ns_32ns_16_36_1 IS + generic ( + ID : INTEGER; + NUM_STAGE : INTEGER; + din0_WIDTH : INTEGER; + din1_WIDTH : INTEGER; + dout_WIDTH : INTEGER ); + port ( + clk : IN STD_LOGIC; + reset : IN STD_LOGIC; + din0 : IN STD_LOGIC_VECTOR (31 downto 0); + din1 : IN STD_LOGIC_VECTOR (31 downto 0); + ce : IN STD_LOGIC; + dout : OUT STD_LOGIC_VECTOR (15 downto 0) ); + end component; + + + +begin + sdiv_32ns_32ns_16_36_1_U1 : component div_sdiv_32ns_32ns_16_36_1 + generic map ( + ID => 1, + NUM_STAGE => 36, + din0_WIDTH => 32, + din1_WIDTH => 32, + dout_WIDTH => 16) + port map ( + clk => ap_clk, + reset => ap_rst, + din0 => data_in_0_dout, + din1 => data_in_1_dout, + ce => ap_const_logic_1, + dout => grp_fu_75_p2); + + + + + + ap_CS_fsm_assign_proc : process(ap_clk) + begin + if 
(ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_CS_fsm <= ap_ST_fsm_pp0_stage0; + else + ap_CS_fsm <= ap_NS_fsm; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter1_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter1 <= ap_const_logic_0; + else + if (((ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (ap_const_boolean_0 = ap_block_pp0_stage0_subdone))) then + ap_enable_reg_pp0_iter1 <= ap_start; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter10_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter10 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter10 <= ap_enable_reg_pp0_iter9; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter11_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter11 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter11 <= ap_enable_reg_pp0_iter10; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter12_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter12 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter12 <= ap_enable_reg_pp0_iter11; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter13_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter13 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter13 <= ap_enable_reg_pp0_iter12; + end if; + end if; + end if; + end process; + + + 
ap_enable_reg_pp0_iter14_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter14 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter14 <= ap_enable_reg_pp0_iter13; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter15_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter15 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter15 <= ap_enable_reg_pp0_iter14; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter16_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter16 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter16 <= ap_enable_reg_pp0_iter15; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter17_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter17 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter17 <= ap_enable_reg_pp0_iter16; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter18_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter18 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter18 <= ap_enable_reg_pp0_iter17; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter19_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter19 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = 
ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter19 <= ap_enable_reg_pp0_iter18; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter2_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter2 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter2 <= ap_enable_reg_pp0_iter1; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter20_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter20 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter20 <= ap_enable_reg_pp0_iter19; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter21_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter21 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter21 <= ap_enable_reg_pp0_iter20; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter22_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter22 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter22 <= ap_enable_reg_pp0_iter21; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter23_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter23 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter23 <= ap_enable_reg_pp0_iter22; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter24_assign_proc : process(ap_clk) + begin + if (ap_clk'event and 
ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter24 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter24 <= ap_enable_reg_pp0_iter23; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter25_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter25 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter25 <= ap_enable_reg_pp0_iter24; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter26_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter26 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter26 <= ap_enable_reg_pp0_iter25; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter27_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter27 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter27 <= ap_enable_reg_pp0_iter26; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter28_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter28 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter28 <= ap_enable_reg_pp0_iter27; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter29_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter29 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter29 <= ap_enable_reg_pp0_iter28; + end 
if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter3_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter3 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter3 <= ap_enable_reg_pp0_iter2; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter30_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter30 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter30 <= ap_enable_reg_pp0_iter29; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter31_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter31 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter31 <= ap_enable_reg_pp0_iter30; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter32_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter32 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter32 <= ap_enable_reg_pp0_iter31; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter33_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter33 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter33 <= ap_enable_reg_pp0_iter32; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter34_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter34 <= ap_const_logic_0; + 
else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter34 <= ap_enable_reg_pp0_iter33; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter35_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter35 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter35 <= ap_enable_reg_pp0_iter34; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter4_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter4 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter4 <= ap_enable_reg_pp0_iter3; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter5_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter5 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter5 <= ap_enable_reg_pp0_iter4; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter6_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter6 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter6 <= ap_enable_reg_pp0_iter5; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter7_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter7 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter7 <= ap_enable_reg_pp0_iter6; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter8_assign_proc : process(ap_clk) + begin + 
if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter8 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter8 <= ap_enable_reg_pp0_iter7; + end if; + end if; + end if; + end process; + + + ap_enable_reg_pp0_iter9_assign_proc : process(ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (ap_rst = '1') then + ap_enable_reg_pp0_iter9 <= ap_const_logic_0; + else + if ((ap_const_boolean_0 = ap_block_pp0_stage0_subdone)) then + ap_enable_reg_pp0_iter9 <= ap_enable_reg_pp0_iter8; + end if; + end if; + end if; + end process; + + process (ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (((ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (tmp_nbreadreq_fu_32_p3 = ap_const_lv1_1) and (ap_const_boolean_0 = ap_block_pp0_stage0_11001))) then + tmp_1_reg_90 <= tmp_1_nbreadreq_fu_40_p3; + end if; + end if; + end process; + process (ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if ((ap_const_boolean_0 = ap_block_pp0_stage0_11001)) then + tmp_1_reg_90_pp0_iter10_reg <= tmp_1_reg_90_pp0_iter9_reg; + tmp_1_reg_90_pp0_iter11_reg <= tmp_1_reg_90_pp0_iter10_reg; + tmp_1_reg_90_pp0_iter12_reg <= tmp_1_reg_90_pp0_iter11_reg; + tmp_1_reg_90_pp0_iter13_reg <= tmp_1_reg_90_pp0_iter12_reg; + tmp_1_reg_90_pp0_iter14_reg <= tmp_1_reg_90_pp0_iter13_reg; + tmp_1_reg_90_pp0_iter15_reg <= tmp_1_reg_90_pp0_iter14_reg; + tmp_1_reg_90_pp0_iter16_reg <= tmp_1_reg_90_pp0_iter15_reg; + tmp_1_reg_90_pp0_iter17_reg <= tmp_1_reg_90_pp0_iter16_reg; + tmp_1_reg_90_pp0_iter18_reg <= tmp_1_reg_90_pp0_iter17_reg; + tmp_1_reg_90_pp0_iter19_reg <= tmp_1_reg_90_pp0_iter18_reg; + tmp_1_reg_90_pp0_iter20_reg <= tmp_1_reg_90_pp0_iter19_reg; + tmp_1_reg_90_pp0_iter21_reg <= tmp_1_reg_90_pp0_iter20_reg; + tmp_1_reg_90_pp0_iter22_reg <= tmp_1_reg_90_pp0_iter21_reg; + tmp_1_reg_90_pp0_iter23_reg <= tmp_1_reg_90_pp0_iter22_reg; + tmp_1_reg_90_pp0_iter24_reg <= tmp_1_reg_90_pp0_iter23_reg; + 
tmp_1_reg_90_pp0_iter25_reg <= tmp_1_reg_90_pp0_iter24_reg; + tmp_1_reg_90_pp0_iter26_reg <= tmp_1_reg_90_pp0_iter25_reg; + tmp_1_reg_90_pp0_iter27_reg <= tmp_1_reg_90_pp0_iter26_reg; + tmp_1_reg_90_pp0_iter28_reg <= tmp_1_reg_90_pp0_iter27_reg; + tmp_1_reg_90_pp0_iter29_reg <= tmp_1_reg_90_pp0_iter28_reg; + tmp_1_reg_90_pp0_iter2_reg <= tmp_1_reg_90_pp0_iter1_reg; + tmp_1_reg_90_pp0_iter30_reg <= tmp_1_reg_90_pp0_iter29_reg; + tmp_1_reg_90_pp0_iter31_reg <= tmp_1_reg_90_pp0_iter30_reg; + tmp_1_reg_90_pp0_iter32_reg <= tmp_1_reg_90_pp0_iter31_reg; + tmp_1_reg_90_pp0_iter33_reg <= tmp_1_reg_90_pp0_iter32_reg; + tmp_1_reg_90_pp0_iter34_reg <= tmp_1_reg_90_pp0_iter33_reg; + tmp_1_reg_90_pp0_iter3_reg <= tmp_1_reg_90_pp0_iter2_reg; + tmp_1_reg_90_pp0_iter4_reg <= tmp_1_reg_90_pp0_iter3_reg; + tmp_1_reg_90_pp0_iter5_reg <= tmp_1_reg_90_pp0_iter4_reg; + tmp_1_reg_90_pp0_iter6_reg <= tmp_1_reg_90_pp0_iter5_reg; + tmp_1_reg_90_pp0_iter7_reg <= tmp_1_reg_90_pp0_iter6_reg; + tmp_1_reg_90_pp0_iter8_reg <= tmp_1_reg_90_pp0_iter7_reg; + tmp_1_reg_90_pp0_iter9_reg <= tmp_1_reg_90_pp0_iter8_reg; + tmp_reg_99_pp0_iter10_reg <= tmp_reg_99_pp0_iter9_reg; + tmp_reg_99_pp0_iter11_reg <= tmp_reg_99_pp0_iter10_reg; + tmp_reg_99_pp0_iter12_reg <= tmp_reg_99_pp0_iter11_reg; + tmp_reg_99_pp0_iter13_reg <= tmp_reg_99_pp0_iter12_reg; + tmp_reg_99_pp0_iter14_reg <= tmp_reg_99_pp0_iter13_reg; + tmp_reg_99_pp0_iter15_reg <= tmp_reg_99_pp0_iter14_reg; + tmp_reg_99_pp0_iter16_reg <= tmp_reg_99_pp0_iter15_reg; + tmp_reg_99_pp0_iter17_reg <= tmp_reg_99_pp0_iter16_reg; + tmp_reg_99_pp0_iter18_reg <= tmp_reg_99_pp0_iter17_reg; + tmp_reg_99_pp0_iter19_reg <= tmp_reg_99_pp0_iter18_reg; + tmp_reg_99_pp0_iter20_reg <= tmp_reg_99_pp0_iter19_reg; + tmp_reg_99_pp0_iter21_reg <= tmp_reg_99_pp0_iter20_reg; + tmp_reg_99_pp0_iter22_reg <= tmp_reg_99_pp0_iter21_reg; + tmp_reg_99_pp0_iter23_reg <= tmp_reg_99_pp0_iter22_reg; + tmp_reg_99_pp0_iter24_reg <= tmp_reg_99_pp0_iter23_reg; + tmp_reg_99_pp0_iter25_reg <= 
tmp_reg_99_pp0_iter24_reg; + tmp_reg_99_pp0_iter26_reg <= tmp_reg_99_pp0_iter25_reg; + tmp_reg_99_pp0_iter27_reg <= tmp_reg_99_pp0_iter26_reg; + tmp_reg_99_pp0_iter28_reg <= tmp_reg_99_pp0_iter27_reg; + tmp_reg_99_pp0_iter29_reg <= tmp_reg_99_pp0_iter28_reg; + tmp_reg_99_pp0_iter2_reg <= tmp_reg_99_pp0_iter1_reg; + tmp_reg_99_pp0_iter30_reg <= tmp_reg_99_pp0_iter29_reg; + tmp_reg_99_pp0_iter31_reg <= tmp_reg_99_pp0_iter30_reg; + tmp_reg_99_pp0_iter32_reg <= tmp_reg_99_pp0_iter31_reg; + tmp_reg_99_pp0_iter33_reg <= tmp_reg_99_pp0_iter32_reg; + tmp_reg_99_pp0_iter34_reg <= tmp_reg_99_pp0_iter33_reg; + tmp_reg_99_pp0_iter3_reg <= tmp_reg_99_pp0_iter2_reg; + tmp_reg_99_pp0_iter4_reg <= tmp_reg_99_pp0_iter3_reg; + tmp_reg_99_pp0_iter5_reg <= tmp_reg_99_pp0_iter4_reg; + tmp_reg_99_pp0_iter6_reg <= tmp_reg_99_pp0_iter5_reg; + tmp_reg_99_pp0_iter7_reg <= tmp_reg_99_pp0_iter6_reg; + tmp_reg_99_pp0_iter8_reg <= tmp_reg_99_pp0_iter7_reg; + tmp_reg_99_pp0_iter9_reg <= tmp_reg_99_pp0_iter8_reg; + end if; + end if; + end process; + process (ap_clk) + begin + if (ap_clk'event and ap_clk = '1') then + if (((ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (ap_const_boolean_0 = ap_block_pp0_stage0_11001))) then + tmp_1_reg_90_pp0_iter1_reg <= tmp_1_reg_90; + tmp_reg_99 <= tmp_nbreadreq_fu_32_p3; + tmp_reg_99_pp0_iter1_reg <= tmp_reg_99; + end if; + end if; + end process; + + ap_NS_fsm_assign_proc : process (ap_CS_fsm, ap_block_pp0_stage0_subdone, ap_reset_idle_pp0) + begin + case ap_CS_fsm is + when ap_ST_fsm_pp0_stage0 => + ap_NS_fsm <= ap_ST_fsm_pp0_stage0; + when others => + ap_NS_fsm <= "X"; + end case; + end process; + ap_CS_fsm_pp0_stage0 <= ap_CS_fsm(0); + ap_block_pp0_stage0 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_pp0_stage0_01001 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_pp0_stage0_11001 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_pp0_stage0_subdone <= not((ap_const_boolean_1 = ap_const_boolean_1)); + 
ap_block_state10_pp0_stage0_iter9 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state11_pp0_stage0_iter10 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state12_pp0_stage0_iter11 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state13_pp0_stage0_iter12 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state14_pp0_stage0_iter13 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state15_pp0_stage0_iter14 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state16_pp0_stage0_iter15 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state17_pp0_stage0_iter16 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state18_pp0_stage0_iter17 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state19_pp0_stage0_iter18 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state1_pp0_stage0_iter0 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state20_pp0_stage0_iter19 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state21_pp0_stage0_iter20 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state22_pp0_stage0_iter21 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state23_pp0_stage0_iter22 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state24_pp0_stage0_iter23 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state25_pp0_stage0_iter24 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state26_pp0_stage0_iter25 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state27_pp0_stage0_iter26 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state28_pp0_stage0_iter27 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state29_pp0_stage0_iter28 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state2_pp0_stage0_iter1 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state30_pp0_stage0_iter29 <= not((ap_const_boolean_1 = 
ap_const_boolean_1)); + ap_block_state31_pp0_stage0_iter30 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state32_pp0_stage0_iter31 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state33_pp0_stage0_iter32 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state34_pp0_stage0_iter33 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state35_pp0_stage0_iter34 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state36_pp0_stage0_iter35 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state3_pp0_stage0_iter2 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state4_pp0_stage0_iter3 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state5_pp0_stage0_iter4 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state6_pp0_stage0_iter5 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state7_pp0_stage0_iter6 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state8_pp0_stage0_iter7 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + ap_block_state9_pp0_stage0_iter8 <= not((ap_const_boolean_1 = ap_const_boolean_1)); + + ap_done_assign_proc : process(ap_enable_reg_pp0_iter35, ap_block_pp0_stage0_subdone) + begin + if (((ap_const_boolean_0 = ap_block_pp0_stage0_subdone) and (ap_enable_reg_pp0_iter35 = ap_const_logic_1))) then + ap_done <= ap_const_logic_1; + else + ap_done <= ap_const_logic_0; + end if; + end process; + + ap_enable_pp0 <= (ap_idle_pp0 xor ap_const_logic_1); + ap_enable_reg_pp0_iter0 <= ap_start; + + ap_idle_assign_proc : process(ap_start, ap_CS_fsm_pp0_stage0, ap_idle_pp0) + begin + if (((ap_start = ap_const_logic_0) and (ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (ap_idle_pp0 = ap_const_logic_1))) then + ap_idle <= ap_const_logic_1; + else + ap_idle <= ap_const_logic_0; + end if; + end process; + + + ap_idle_pp0_assign_proc : process(ap_enable_reg_pp0_iter0, ap_enable_reg_pp0_iter1, ap_enable_reg_pp0_iter2, 
ap_enable_reg_pp0_iter3, ap_enable_reg_pp0_iter4, ap_enable_reg_pp0_iter5, ap_enable_reg_pp0_iter6, ap_enable_reg_pp0_iter7, ap_enable_reg_pp0_iter8, ap_enable_reg_pp0_iter9, ap_enable_reg_pp0_iter10, ap_enable_reg_pp0_iter11, ap_enable_reg_pp0_iter12, ap_enable_reg_pp0_iter13, ap_enable_reg_pp0_iter14, ap_enable_reg_pp0_iter15, ap_enable_reg_pp0_iter16, ap_enable_reg_pp0_iter17, ap_enable_reg_pp0_iter18, ap_enable_reg_pp0_iter19, ap_enable_reg_pp0_iter20, ap_enable_reg_pp0_iter21, ap_enable_reg_pp0_iter22, ap_enable_reg_pp0_iter23, ap_enable_reg_pp0_iter24, ap_enable_reg_pp0_iter25, ap_enable_reg_pp0_iter26, ap_enable_reg_pp0_iter27, ap_enable_reg_pp0_iter28, ap_enable_reg_pp0_iter29, ap_enable_reg_pp0_iter30, ap_enable_reg_pp0_iter31, ap_enable_reg_pp0_iter32, ap_enable_reg_pp0_iter33, ap_enable_reg_pp0_iter34, ap_enable_reg_pp0_iter35) + begin + if (((ap_enable_reg_pp0_iter26 = ap_const_logic_0) and (ap_enable_reg_pp0_iter25 = ap_const_logic_0) and (ap_enable_reg_pp0_iter24 = ap_const_logic_0) and (ap_enable_reg_pp0_iter23 = ap_const_logic_0) and (ap_enable_reg_pp0_iter22 = ap_const_logic_0) and (ap_enable_reg_pp0_iter21 = ap_const_logic_0) and (ap_enable_reg_pp0_iter20 = ap_const_logic_0) and (ap_enable_reg_pp0_iter19 = ap_const_logic_0) and (ap_enable_reg_pp0_iter18 = ap_const_logic_0) and (ap_enable_reg_pp0_iter17 = ap_const_logic_0) and (ap_enable_reg_pp0_iter16 = ap_const_logic_0) and (ap_enable_reg_pp0_iter15 = ap_const_logic_0) and (ap_enable_reg_pp0_iter14 = ap_const_logic_0) and (ap_enable_reg_pp0_iter13 = ap_const_logic_0) and (ap_enable_reg_pp0_iter12 = ap_const_logic_0) and (ap_enable_reg_pp0_iter11 = ap_const_logic_0) and (ap_enable_reg_pp0_iter10 = ap_const_logic_0) and (ap_enable_reg_pp0_iter9 = ap_const_logic_0) and (ap_enable_reg_pp0_iter8 = ap_const_logic_0) and (ap_enable_reg_pp0_iter7 = ap_const_logic_0) and (ap_enable_reg_pp0_iter6 = + ap_const_logic_0) and (ap_enable_reg_pp0_iter5 = ap_const_logic_0) and (ap_enable_reg_pp0_iter4 = 
ap_const_logic_0) and (ap_enable_reg_pp0_iter3 = ap_const_logic_0) and (ap_enable_reg_pp0_iter2 = ap_const_logic_0) and (ap_enable_reg_pp0_iter1 = ap_const_logic_0) and (ap_enable_reg_pp0_iter0 = ap_const_logic_0) and (ap_enable_reg_pp0_iter35 = ap_const_logic_0) and (ap_enable_reg_pp0_iter34 = ap_const_logic_0) and (ap_enable_reg_pp0_iter33 = ap_const_logic_0) and (ap_enable_reg_pp0_iter32 = ap_const_logic_0) and (ap_enable_reg_pp0_iter31 = ap_const_logic_0) and (ap_enable_reg_pp0_iter30 = ap_const_logic_0) and (ap_enable_reg_pp0_iter29 = ap_const_logic_0) and (ap_enable_reg_pp0_iter28 = ap_const_logic_0) and (ap_enable_reg_pp0_iter27 = ap_const_logic_0))) then + ap_idle_pp0 <= ap_const_logic_1; + else + ap_idle_pp0 <= ap_const_logic_0; + end if; + end process; + + + ap_idle_pp0_0to34_assign_proc : process(ap_enable_reg_pp0_iter0, ap_enable_reg_pp0_iter1, ap_enable_reg_pp0_iter2, ap_enable_reg_pp0_iter3, ap_enable_reg_pp0_iter4, ap_enable_reg_pp0_iter5, ap_enable_reg_pp0_iter6, ap_enable_reg_pp0_iter7, ap_enable_reg_pp0_iter8, ap_enable_reg_pp0_iter9, ap_enable_reg_pp0_iter10, ap_enable_reg_pp0_iter11, ap_enable_reg_pp0_iter12, ap_enable_reg_pp0_iter13, ap_enable_reg_pp0_iter14, ap_enable_reg_pp0_iter15, ap_enable_reg_pp0_iter16, ap_enable_reg_pp0_iter17, ap_enable_reg_pp0_iter18, ap_enable_reg_pp0_iter19, ap_enable_reg_pp0_iter20, ap_enable_reg_pp0_iter21, ap_enable_reg_pp0_iter22, ap_enable_reg_pp0_iter23, ap_enable_reg_pp0_iter24, ap_enable_reg_pp0_iter25, ap_enable_reg_pp0_iter26, ap_enable_reg_pp0_iter27, ap_enable_reg_pp0_iter28, ap_enable_reg_pp0_iter29, ap_enable_reg_pp0_iter30, ap_enable_reg_pp0_iter31, ap_enable_reg_pp0_iter32, ap_enable_reg_pp0_iter33, ap_enable_reg_pp0_iter34) + begin + if (((ap_enable_reg_pp0_iter26 = ap_const_logic_0) and (ap_enable_reg_pp0_iter25 = ap_const_logic_0) and (ap_enable_reg_pp0_iter24 = ap_const_logic_0) and (ap_enable_reg_pp0_iter23 = ap_const_logic_0) and (ap_enable_reg_pp0_iter22 = ap_const_logic_0) and 
(ap_enable_reg_pp0_iter21 = ap_const_logic_0) and (ap_enable_reg_pp0_iter20 = ap_const_logic_0) and (ap_enable_reg_pp0_iter19 = ap_const_logic_0) and (ap_enable_reg_pp0_iter18 = ap_const_logic_0) and (ap_enable_reg_pp0_iter17 = ap_const_logic_0) and (ap_enable_reg_pp0_iter16 = ap_const_logic_0) and (ap_enable_reg_pp0_iter15 = ap_const_logic_0) and (ap_enable_reg_pp0_iter14 = ap_const_logic_0) and (ap_enable_reg_pp0_iter13 = ap_const_logic_0) and (ap_enable_reg_pp0_iter12 = ap_const_logic_0) and (ap_enable_reg_pp0_iter11 = ap_const_logic_0) and (ap_enable_reg_pp0_iter10 = ap_const_logic_0) and (ap_enable_reg_pp0_iter9 = ap_const_logic_0) and (ap_enable_reg_pp0_iter8 = ap_const_logic_0) and (ap_enable_reg_pp0_iter7 = ap_const_logic_0) and (ap_enable_reg_pp0_iter6 = + ap_const_logic_0) and (ap_enable_reg_pp0_iter5 = ap_const_logic_0) and (ap_enable_reg_pp0_iter4 = ap_const_logic_0) and (ap_enable_reg_pp0_iter3 = ap_const_logic_0) and (ap_enable_reg_pp0_iter2 = ap_const_logic_0) and (ap_enable_reg_pp0_iter1 = ap_const_logic_0) and (ap_enable_reg_pp0_iter0 = ap_const_logic_0) and (ap_enable_reg_pp0_iter34 = ap_const_logic_0) and (ap_enable_reg_pp0_iter33 = ap_const_logic_0) and (ap_enable_reg_pp0_iter32 = ap_const_logic_0) and (ap_enable_reg_pp0_iter31 = ap_const_logic_0) and (ap_enable_reg_pp0_iter30 = ap_const_logic_0) and (ap_enable_reg_pp0_iter29 = ap_const_logic_0) and (ap_enable_reg_pp0_iter28 = ap_const_logic_0) and (ap_enable_reg_pp0_iter27 = ap_const_logic_0))) then + ap_idle_pp0_0to34 <= ap_const_logic_1; + else + ap_idle_pp0_0to34 <= ap_const_logic_0; + end if; + end process; + + + ap_ready_assign_proc : process(ap_CS_fsm_pp0_stage0, ap_enable_reg_pp0_iter0, ap_block_pp0_stage0_subdone) + begin + if (((ap_enable_reg_pp0_iter0 = ap_const_logic_1) and (ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (ap_const_boolean_0 = ap_block_pp0_stage0_subdone))) then + ap_ready <= ap_const_logic_1; + else + ap_ready <= ap_const_logic_0; + end if; + end process; + + + 
ap_reset_idle_pp0_assign_proc : process(ap_start, ap_idle_pp0_0to34) + begin + if (((ap_start = ap_const_logic_0) and (ap_idle_pp0_0to34 = ap_const_logic_1))) then + ap_reset_idle_pp0 <= ap_const_logic_1; + else + ap_reset_idle_pp0 <= ap_const_logic_0; + end if; + end process; + + + data_in_0_read_assign_proc : process(ap_CS_fsm_pp0_stage0, ap_enable_reg_pp0_iter0, data_in_0_empty_n, ap_block_pp0_stage0_11001, tmp_nbreadreq_fu_32_p3, tmp_1_nbreadreq_fu_40_p3) + begin + if (((ap_enable_reg_pp0_iter0 = ap_const_logic_1) and (ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (tmp_1_nbreadreq_fu_40_p3 = ap_const_lv1_1) and (tmp_nbreadreq_fu_32_p3 = ap_const_lv1_1) and (data_in_0_empty_n = ap_const_logic_1) and (ap_const_boolean_0 = ap_block_pp0_stage0_11001))) then + data_in_0_read <= ap_const_logic_1; + else + data_in_0_read <= ap_const_logic_0; + end if; + end process; + + + data_in_1_read_assign_proc : process(ap_CS_fsm_pp0_stage0, ap_enable_reg_pp0_iter0, data_in_1_empty_n, ap_block_pp0_stage0_11001, tmp_nbreadreq_fu_32_p3, tmp_1_nbreadreq_fu_40_p3) + begin + if (((ap_enable_reg_pp0_iter0 = ap_const_logic_1) and (ap_const_logic_1 = ap_CS_fsm_pp0_stage0) and (tmp_1_nbreadreq_fu_40_p3 = ap_const_lv1_1) and (tmp_nbreadreq_fu_32_p3 = ap_const_lv1_1) and (data_in_1_empty_n = ap_const_logic_1) and (ap_const_boolean_0 = ap_block_pp0_stage0_11001))) then + data_in_1_read <= ap_const_logic_1; + else + data_in_1_read <= ap_const_logic_0; + end if; + end process; + + data_out_0_din <= grp_fu_75_p2(16 - 1 downto 0); + + data_out_0_write_assign_proc : process(ap_enable_reg_pp0_iter35, data_out_0_full_n, ap_block_pp0_stage0_11001, tmp_1_reg_90_pp0_iter34_reg, tmp_reg_99_pp0_iter34_reg) + begin + if (((tmp_reg_99_pp0_iter34_reg = ap_const_lv1_1) and (tmp_1_reg_90_pp0_iter34_reg = ap_const_lv1_1) and (data_out_0_full_n = ap_const_logic_1) and (ap_const_boolean_0 = ap_block_pp0_stage0_11001) and (ap_enable_reg_pp0_iter35 = ap_const_logic_1))) then + data_out_0_write <= ap_const_logic_1; 
+    else
+        data_out_0_write <= ap_const_logic_0;
+    end if;
+    end process;
+
+    tmp_1_nbreadreq_fu_40_p3 <= (0=>(data_in_1_empty_n), others=>'-');
+    tmp_nbreadreq_fu_32_p3 <= (0=>(data_in_0_empty_n), others=>'-');
+end behav;
diff --git a/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/vhdl/div_sdiv_32ns_32ns_16_36_1.vhd b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/vhdl/div_sdiv_32ns_32ns_16_36_1.vhd
new file mode 100644
index 000000000..ab2e84837
--- /dev/null
+++ b/src/mase_components/hls/scalar_ops/int_div/prj/solution1/syn/vhdl/div_sdiv_32ns_32ns_16_36_1.vhd
@@ -0,0 +1,198 @@
+-- ==============================================================
+-- Generated by Vitis HLS v2023.1
+-- Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
+-- Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved.
+-- ==============================================================
+-- NOTE(review): tool-generated RTL — do not hand-edit; regenerate via HLS
+-- if the divider configuration changes.
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+-- Pipelined shift-and-subtract (restoring) divider core operating on the
+-- magnitudes: one pipeline stage per dividend bit, each stage advancing only
+-- while ce = '1'. The arithmetic here is unsigned; the two bits of sign_i are
+-- carried alongside the pipeline untouched and re-emitted on sign_o —
+-- presumably the enclosing sdiv wrapper applies the sign correction (not
+-- visible in this file; verify against the wrapper).
+entity div_sdiv_32ns_32ns_16_36_1_divider is
+    generic (
+        in0_WIDTH : INTEGER :=32;   -- dividend width (also the stage count)
+        in1_WIDTH : INTEGER :=32;   -- divisor width
+        out_WIDTH : INTEGER :=32);  -- quot/remd output width (results are RESIZEd to this)
+    port (
+        clk : in STD_LOGIC;
+        reset : in STD_LOGIC;       -- present in the port list but unused by architecture rtl
+        ce : in STD_LOGIC;          -- clock enable: every pipeline register is gated on it
+        dividend : in STD_LOGIC_VECTOR(in0_WIDTH-1 downto 0);
+        divisor : in STD_LOGIC_VECTOR(in1_WIDTH-1 downto 0);
+        sign_i : in STD_LOGIC_VECTOR(1 downto 0);   -- sign info, pipelined through unchanged
+        sign_o : out STD_LOGIC_VECTOR(1 downto 0);  -- sign_i delayed by the pipeline depth
+        quot : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0);
+        remd : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0));
+
+    -- Elaboration-time helper; declared in the entity so any architecture of
+    -- this entity can size its working vectors with it.
+    function max (left, right : INTEGER) return INTEGER is
+    begin
+        if left > right then return left;
+        else return right;
+        end if;
+    end max;
+
+end entity;
+
+architecture rtl of div_sdiv_32ns_32ns_16_36_1_divider is
+    -- Working width of the trial subtraction (widest operand).
+    constant cal_WIDTH : INTEGER := max(in0_WIDTH, in1_WIDTH);
+    type in0_vector is array(INTEGER range <>) of UNSIGNED(in0_WIDTH-1 downto 0);
+    type in1_vector is array(INTEGER range <>) of UNSIGNED(in1_WIDTH-1 downto 0);
+    type cal_vector is array(INTEGER range <>) of UNSIGNED(cal_WIDTH downto 0);
+    type sign_vector is array(INTEGER range <>) of UNSIGNED(1 downto 0);
+
+    -- Index i is the value entering stage i; index in0_WIDTH is the final
+    -- result, i.e. in0_WIDTH+1 register stages from input capture to output.
+    -- dividend_tmp doubles as the quotient accumulator: as dividend bits shift
+    -- out of its MSB, quotient bits shift into its LSB.
+    signal dividend_tmp : in0_vector(0 to in0_WIDTH);
+    signal divisor_tmp : in1_vector(0 to in0_WIDTH);
+    signal remd_tmp : in0_vector(0 to in0_WIDTH);
+    signal comb_tmp : in0_vector(0 to in0_WIDTH-1);  -- partial remainder shifted left, next dividend MSB appended
+    signal cal_tmp : cal_vector(0 to in0_WIDTH-1);   -- trial subtraction, MSB = borrow flag
+    signal sign_tmp : sign_vector(0 to in0_WIDTH);
+begin
+    -- After in0_WIDTH stages the dividend register holds the quotient.
+    quot <= STD_LOGIC_VECTOR(RESIZE(dividend_tmp(in0_WIDTH), out_WIDTH));
+    remd <= STD_LOGIC_VECTOR(RESIZE(remd_tmp(in0_WIDTH), out_WIDTH));
+    sign_o <= STD_LOGIC_VECTOR(sign_tmp(in0_WIDTH));
+
+    -- Stage-0 input registers; the running remainder starts at zero.
+    tran_tmp_proc : process (clk)
+    begin
+        if (clk'event and clk='1') then
+            if (ce = '1') then
+                dividend_tmp(0) <= UNSIGNED(dividend);
+                divisor_tmp(0) <= UNSIGNED(divisor);
+                sign_tmp(0) <= UNSIGNED(sign_i);
+                remd_tmp(0) <= (others => '0');
+            end if;
+        end if;
+    end process tran_tmp_proc;
+
+    -- One restoring-division step per stage: shift the next dividend bit into
+    -- the partial remainder, try subtracting the divisor, keep the result only
+    -- if it did not borrow, and record (not borrow) as the next quotient bit.
+    run_proc: for i in 0 to in0_WIDTH-1 generate
+    begin
+        comb_tmp(i) <= remd_tmp(i)(in0_WIDTH-2 downto 0) & dividend_tmp(i)(in0_WIDTH-1);
+        -- Extra '0' guard bit so cal_tmp(i)(cal_WIDTH) captures the borrow.
+        cal_tmp(i) <= ('0' & comb_tmp(i)) - ('0' & divisor_tmp(i));
+
+        process (clk)
+        begin
+            if (clk'event and clk='1') then
+                if (ce = '1') then
+                    -- Shift dividend left; LSB takes the new quotient bit.
+                    dividend_tmp(i+1) <= dividend_tmp(i)(in0_WIDTH-2 downto 0) & (not cal_tmp(i)(cal_WIDTH));
+                    divisor_tmp(i+1) <= divisor_tmp(i);
+                    sign_tmp(i+1) <= sign_tmp(i);
+                    if cal_tmp(i)(cal_WIDTH) = '1' then
+                        -- Subtraction borrowed: restore (keep the pre-subtraction value).
+                        remd_tmp(i+1) <= comb_tmp(i);
+                    else
+                        remd_tmp(i+1) <= cal_tmp(i)(in0_WIDTH-1 downto 0);
+                    end if;
+                end if;
+            end if;
+        end process;
+    end generate run_proc;
+
+end architecture;
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+entity div_sdiv_32ns_32ns_16_36_1 is
+    generic (
+        ID : INTEGER :=1;
+        NUM_STAGE : INTEGER :=2;
+        din0_WIDTH : INTEGER :=32;
+        din1_WIDTH : INTEGER :=32;
+        dout_WIDTH : INTEGER :=32);
+    port (
+        clk : in STD_LOGIC;
+        reset : in STD_LOGIC;
+        ce : in STD_LOGIC;
+        din0 : in STD_LOGIC_VECTOR(din0_WIDTH-1 downto 0);
+        din1 : in STD_LOGIC_VECTOR(din1_WIDTH-1
downto 0); + dout : out STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0)); +end entity; + +architecture rtl of div_sdiv_32ns_32ns_16_36_1 is + component div_sdiv_32ns_32ns_16_36_1_divider is + generic ( + in0_WIDTH : INTEGER :=32; + in1_WIDTH : INTEGER :=32; + out_WIDTH : INTEGER :=32); + port ( + reset : in STD_LOGIC; + clk : in STD_LOGIC; + ce : in STD_LOGIC; + dividend : in STD_LOGIC_VECTOR(in0_WIDTH-1 downto 0); + divisor : in STD_LOGIC_VECTOR(in1_WIDTH-1 downto 0); + sign_i : in STD_LOGIC_VECTOR(1 downto 0); + sign_o : out STD_LOGIC_VECTOR(1 downto 0); + quot : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0); + remd : out STD_LOGIC_VECTOR(out_WIDTH-1 downto 0)); + end component; + + signal dividend0 : STD_LOGIC_VECTOR(din0_WIDTH-1 downto 0); + signal divisor0 : STD_LOGIC_VECTOR(din1_WIDTH-1 downto 0); + signal dividend_u : STD_LOGIC_VECTOR(din0_WIDTH-1 downto 0); + signal divisor_u : STD_LOGIC_VECTOR(din1_WIDTH-1 downto 0); + signal quot_u : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal remd_u : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal quot : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal remd : STD_LOGIC_VECTOR(dout_WIDTH-1 downto 0); + signal sign_i : STD_LOGIC_VECTOR(1 downto 0); + signal sign_o : STD_LOGIC_VECTOR(1 downto 0); +begin + div_sdiv_32ns_32ns_16_36_1_divider_u : div_sdiv_32ns_32ns_16_36_1_divider + generic map( + in0_WIDTH => din0_WIDTH, + in1_WIDTH => din1_WIDTH, + out_WIDTH => dout_WIDTH) + port map( + clk => clk, + reset => reset, + ce => ce, + dividend => dividend_u, + divisor => divisor_u, + sign_i => sign_i, + sign_o => sign_o, + quot => quot_u, + remd => remd_u); + + sign_i <= (dividend0(din0_WIDTH-1) xor divisor0(din1_WIDTH-1)) & dividend0(din0_WIDTH-1); + dividend_u <= STD_LOGIC_VECTOR(UNSIGNED(not dividend0) + 1) when dividend0(din0_WIDTH-1) = '1' else dividend0; + divisor_u <= STD_LOGIC_VECTOR(UNSIGNED(not divisor0) + 1) when divisor0(din1_WIDTH-1) = '1' else divisor0; + +process (clk) +begin + if (clk'event and clk = '1') then + if 
(ce = '1') then + dividend0 <= din0; + divisor0 <= din1; + end if; + end if; +end process; + +process (clk) +begin + if (clk'event and clk = '1') then + if (ce = '1') then + if (sign_o(1) = '1') then + quot <= STD_LOGIC_VECTOR(UNSIGNED(not quot_u) + 1); + else + quot <= quot_u; + end if; + end if; + end if; +end process; + +process (clk) +begin + if (clk'event and clk = '1') then + if (ce = '1') then + if (sign_o(0) = '1') then + remd <= STD_LOGIC_VECTOR(UNSIGNED(not remd_u) + 1); + else + remd <= remd_u; + end if; + end if; + end if; +end process; + +dout <= quot; + +end architecture; + + diff --git a/src/mase_components/hls/scalar_ops/int_div/vhls.tcl b/src/mase_components/hls/scalar_ops/int_div/vhls.tcl new file mode 100644 index 000000000..2a53a2366 --- /dev/null +++ b/src/mase_components/hls/scalar_ops/int_div/vhls.tcl @@ -0,0 +1,8 @@ +open_project -reset prj +set_top div +add_files div.cpp +open_solution -reset "solution1" +set_part {xcu250-figd2104-2L-e} +create_clock -period 10 -name default +config_bind -effort high +csynth_design diff --git a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv index 1e0edce53..f6ed504d5 100644 --- a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv +++ b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear.sv @@ -18,7 +18,8 @@ module fixed_linear #( /* verilator lint_off UNUSEDPARAM */ parameter HAS_BIAS = 1, - parameter WEIGHTS_PRE_TRANSPOSED = 0, + parameter WEIGHTS_PRE_TRANSPOSED = 1, + parameter FIFO = 1, parameter DATA_IN_0_PRECISION_0 = 16, parameter DATA_IN_0_PRECISION_1 = 3, @@ -112,7 +113,10 @@ module fixed_linear #( logic [DATA_OUT_0_PRECISION_0 - 1:0] add_bias_in_casted [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; logic add_bias_in_valid; logic add_bias_in_ready; - + + logic [DATA_OUT_0_PRECISION_0 - 1:0] rounding_out 
[DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0]; + logic rounding_out_valid; + logic rounding_out_ready; // * Instances // * --------------------------------------------------------------------------------------------------- @@ -223,9 +227,9 @@ module fixed_linear #( .data_in_valid(add_bias_in_valid), .data_in_ready(add_bias_in_ready), - .data_out(data_out_0), - .data_out_valid(data_out_0_valid), - .data_out_ready(data_out_0_ready) + .data_out(rounding_out), + .data_out_valid(rounding_out_valid), + .data_out_ready(rounding_out_ready) ); end @@ -245,7 +249,7 @@ module fixed_linear #( // * Add bias if (HAS_BIAS == 1) begin - fixed_cast #( + fixed_rounding #( .IN_SIZE (BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1), .IN_WIDTH (BIAS_PRECISION_0), .IN_FRAC_WIDTH (BIAS_PRECISION_1), @@ -275,10 +279,46 @@ module fixed_linear #( .OUT_FRAC_WIDTH(DATA_OUT_0_PRECISION_1) ) output_cast ( .data_in (matmul_out), - .data_out(data_out_0) + .data_out(rounding_out) ); - assign data_out_0_valid = matmul_out_valid; - assign matmul_out_ready = data_out_0_ready; + assign rounding_out_valid = matmul_out_valid; + assign matmul_out_ready = rounding_out_ready; end + if (FIFO == 1) begin + localparam FIFO_DEPTH = DATA_OUT_0_TENSOR_SIZE_DIM_0 / DATA_OUT_0_PARALLELISM_DIM_0; + + fifo_for_autogen #( + .DATA_IN_0_PRECISION_0(DATA_OUT_0_PRECISION_0), // = 8 + .DATA_IN_0_PRECISION_1(DATA_OUT_0_PRECISION_1), // = 4 + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0), // = 20 + .DATA_IN_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0), // = 2 + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1), // = 4 + .DATA_IN_0_PARALLELISM_DIM_1(DATA_OUT_0_PARALLELISM_DIM_1), // = 2 + .DEPTH(FIFO_DEPTH), + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1), + .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0), + .DATA_OUT_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0), + 
.DATA_OUT_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_PARALLELISM_DIM_1(DATA_OUT_0_PARALLELISM_DIM_1) + ) fifo_1_inst ( + .clk(clk), + .rst(rst), + + .data_in_0(rounding_out), + .data_in_0_valid(rounding_out_valid), + .data_in_0_ready(rounding_out_ready), + .data_out_0(data_out_0), + .data_out_0_valid(data_out_0_valid), + .data_out_0_ready(data_out_0_ready) + ); + end + else begin + always_comb begin + data_out_0 = rounding_out; + data_out_0_valid = rounding_out_valid; + rounding_out_ready = data_out_0_ready; + end + end endmodule diff --git a/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv new file mode 100644 index 000000000..eb253bec5 --- /dev/null +++ b/src/mase_components/linear_layers/fixed_linear_layer/rtl/fixed_linear_with_input_circular.sv @@ -0,0 +1,386 @@ +`timescale 1ns / 1ps + +/* + * +*/ + +module fixed_linear_with_input_circular #( + /* verilator lint_off UNUSEDPARAM */ + parameter HAS_BIAS = 1, + parameter FIFO = 1, + + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 20, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 4, // must equal WEIGHT_PARALLELISM_DIM_1 + parameter DATA_IN_0_PARALLELISM_DIM_1 = 4, + localparam IN_0_DEPTH_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0, + localparam IN_0_DEPTH_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1 / DATA_IN_0_PARALLELISM_DIM_1, + + parameter WEIGHT_PRECISION_0 = 16, + parameter WEIGHT_PRECISION_1 = 3, + parameter WEIGHT_TENSOR_SIZE_DIM_0 = 20, + parameter WEIGHT_TENSOR_SIZE_DIM_1 = 20, + parameter WEIGHT_PARALLELISM_DIM_0 = 4, + parameter WEIGHT_PARALLELISM_DIM_1 = 4, + + // Inferred precision of the output data + // if the data out precision will be replaced by the setting + parameter DATA_OUT_0_PRECISION_0 = 
DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( + DATA_IN_0_TENSOR_SIZE_DIM_0 + ) + HAS_BIAS, + parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + + parameter BIAS_PRECISION_0 = 16, + parameter BIAS_PRECISION_1 = 3, + parameter BIAS_TENSOR_SIZE_DIM_0 = DATA_OUT_0_TENSOR_SIZE_DIM_0, + parameter BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter BIAS_PARALLELISM_DIM_0 = DATA_OUT_0_PARALLELISM_DIM_0, + parameter BIAS_PARALLELISM_DIM_1 = 1 +) ( + input clk, + input rst, + + // input port for data_inivations + input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + + // input port for weight + input logic [WEIGHT_PRECISION_0-1:0] weight [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic weight_valid, + output logic weight_ready, + + input logic [BIAS_PRECISION_0-1:0] bias[BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic bias_valid, + output logic bias_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0], + output logic data_out_0_valid, + input logic data_out_0_ready +); + logic [DATA_IN_0_PRECISION_0-1:0]circular_data_in_0[DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1-1:0]; + logic circular_data_in_0_valid, circular_data_in_0_ready; + logic [WEIGHT_PRECISION_0-1:0]circular_weight[WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0]; + logic circular_weight_valid, circular_weight_ready; + logic [BIAS_PRECISION_0-1:0]circular_bias[BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1-1:0]; + logic circular_bias_valid, 
circular_bias_ready; + logic [DATA_IN_0_PRECISION_0-1:0] data_in_0_reg [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0]; + logic data_in_0_reg_valid, data_in_0_reg_ready; + if (FIFO == 1) begin + localparam FIFO_DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0; + + fifo_for_autogen #( + .DATA_IN_0_PRECISION_0(DATA_IN_0_PRECISION_0), // = 8 + .DATA_IN_0_PRECISION_1(DATA_IN_0_PRECISION_1), // = 4 + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), // = 20 + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), // = 2 + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), // = 4 + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), // = 2 + .DEPTH(FIFO_DEPTH), + .DATA_OUT_0_PRECISION_0(DATA_IN_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_IN_0_PRECISION_1), + .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_OUT_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_OUT_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1) + ) fifo_1_inst ( + .clk(clk), + .rst(rst), + .data_in_0(data_in_0), + .data_in_0_valid(data_in_0_valid), + .data_in_0_ready(data_in_0_ready), + .data_out_0(data_in_0_reg), + .data_out_0_valid(data_in_0_reg_valid), + .data_out_0_ready(data_in_0_reg_ready) + ); + end + else begin + always_comb begin + data_in_0_reg = data_in_0; + data_in_0_reg_valid = data_in_0_valid; + data_in_0_ready = data_in_0_reg_ready; + end + end + input_buffer #( + .DATA_WIDTH (DATA_IN_0_PRECISION_0), + .IN_NUM (DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1), + .REPEAT (WEIGHT_TENSOR_SIZE_DIM_1 / WEIGHT_PARALLELISM_DIM_1), + .BUFFER_SIZE(IN_0_DEPTH_DIM_0) + ) data_in_0_buffer ( + .clk, + .rst, + // Input streaming port + .data_in(data_in_0_reg), + .data_in_valid(data_in_0_reg_valid), + .data_in_ready(data_in_0_reg_ready), + // Output streaming port + .data_out(circular_data_in_0), + .data_out_valid(circular_data_in_0_valid), + 
.data_out_ready(circular_data_in_0_ready) + ); + input_buffer #( + .DATA_WIDTH(WEIGHT_PRECISION_0), + .IN_NUM(WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1), + .REPEAT(IN_0_DEPTH_DIM_1), + .BUFFER_SIZE(WEIGHT_TENSOR_SIZE_DIM_0*WEIGHT_TENSOR_SIZE_DIM_1 / (WEIGHT_PARALLELISM_DIM_0*WEIGHT_PARALLELISM_DIM_1)) + ) weight_buffer ( + .clk, + .rst, + // Input streaming port + .data_in(weight), + .data_in_valid(weight_valid), + .data_in_ready(weight_ready), + // Output streaming port + .data_out(circular_weight), + .data_out_valid(circular_weight_valid), + .data_out_ready(circular_weight_ready) + ); + input_buffer #( + .DATA_WIDTH (BIAS_PRECISION_0), + .IN_NUM (BIAS_PARALLELISM_DIM_0), + .REPEAT (IN_0_DEPTH_DIM_1), + .BUFFER_SIZE(BIAS_TENSOR_SIZE_DIM_0 / (BIAS_PARALLELISM_DIM_0)) + ) bias_buffer ( + .clk, + .rst, + // Input streaming port + .data_in(bias), + .data_in_valid(bias_valid), + .data_in_ready(bias_ready), + // Output streaming port + .data_out(circular_bias), + .data_out_valid(circular_bias_valid), + .data_out_ready(circular_bias_ready) + ); + logic [DATA_OUT_0_PARALLELISM_DIM_1 - 1:0] + linear_1d_data_in_0_ready, + linear_1d_weight_ready, + linear_1d_bias_ready, + linear_1d_data_out_valid; + always_comb begin + circular_data_in_0_ready = linear_1d_data_in_0_ready[0]; + circular_weight_ready = linear_1d_weight_ready[0]; + circular_bias_ready = linear_1d_bias_ready[0]; + data_out_0_valid = linear_1d_data_out_valid[0]; + end + for (genvar i = 0; i < DATA_OUT_0_PARALLELISM_DIM_1; i = i + 1) begin + linear_1d #( + .HAS_BIAS(HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + 
.WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + ) fixed_linear ( + .clk, + .rst, + .data_in_0 (circular_data_in_0[(i+1) * DATA_IN_0_PARALLELISM_DIM_0 - 1:i * DATA_IN_0_PARALLELISM_DIM_0]), + .data_in_0_valid(circular_data_in_0_valid), + .data_in_0_ready(linear_1d_data_in_0_ready[i]), + .weight(circular_weight), + .weight_valid(circular_weight_valid), + .weight_ready(linear_1d_weight_ready[i]), + .bias(circular_bias), + .bias_valid(circular_bias_valid), + .bias_ready(linear_1d_bias_ready[i]), + .data_out_0 (data_out_0[(i+1) * DATA_OUT_0_PARALLELISM_DIM_0- 1:i * DATA_OUT_0_PARALLELISM_DIM_0]), + .data_out_0_valid(linear_1d_data_out_valid[i]), + .data_out_0_ready(data_out_0_ready) + ); + end +endmodule +module linear_1d #( + /* verilator lint_off UNUSEDPARAM */ + parameter HAS_BIAS = 0, + + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 1, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 1, + parameter IN_0_DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0, + + parameter WEIGHT_PRECISION_0 = 16, + parameter WEIGHT_PRECISION_1 = 3, + parameter WEIGHT_TENSOR_SIZE_DIM_1 = 32, + parameter WEIGHT_TENSOR_SIZE_DIM_0 = 1, + parameter WEIGHT_PARALLELISM_DIM_1 = 4, + parameter WEIGHT_PARALLELISM_DIM_0 = 1, + + parameter DATA_OUT_0_PRECISION_0 = 8, + parameter DATA_OUT_0_PRECISION_1 = 4, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1, + + parameter BIAS_PRECISION_0 = 16, + parameter BIAS_PRECISION_1 = 3, + parameter BIAS_TENSOR_SIZE_DIM_0 = 
DATA_OUT_0_TENSOR_SIZE_DIM_0, + parameter BIAS_PARALLELISM_DIM_0 = DATA_OUT_0_PARALLELISM_DIM_0 + +) ( + input clk, + input rst, + + // input port for data_inivations + input [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0-1:0], + input data_in_0_valid, + output data_in_0_ready, + + // input port for weight + input [WEIGHT_PRECISION_0-1:0] weight [WEIGHT_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_0-1:0], + input weight_valid, + output weight_ready, + + /* verilator lint_off UNUSEDSIGNAL */ + input [BIAS_PRECISION_0-1:0] bias[BIAS_PARALLELISM_DIM_0-1:0], + input bias_valid, + /* verilator lint_on UNUSEDSIGNAL */ + output bias_ready, + + output [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0-1:0], + output data_out_0_valid, + input data_out_0_ready +); + + localparam FDP_WIDTH = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( + DATA_IN_0_PARALLELISM_DIM_0 + ); + localparam ACC_WIDTH = FDP_WIDTH + $clog2(IN_0_DEPTH); + localparam LOSSLESS_OUT_WIDTH = ACC_WIDTH + HAS_BIAS; + logic fdp_join_valid, fdp_join_ready; + join2 #() fdp_join_inst ( + .data_in_ready ({weight_ready, data_in_0_ready}), + .data_in_valid ({weight_valid, data_in_0_valid}), + .data_out_valid(fdp_join_valid), + .data_out_ready(fdp_join_ready) + ); + + /* verilator lint_off UNUSEDSIGNAL */ + // Assume the parallelised hardware above have the same arrival time + // which means that they always have the same state. So we can just + // pick one of the valid signal to use. 
+ logic [WEIGHT_PARALLELISM_DIM_1-1:0] fdp_data_ready, fdp_weight_ready; + assign fdp_join_ready = fdp_data_ready[0]; + /* verilator lint_on UNUSEDSIGNAL */ + + logic acc_ready; + logic [ ACC_WIDTH-1:0] acc_data_out [DATA_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [LOSSLESS_OUT_WIDTH-1:0] cast_data_out_0[DATA_OUT_0_PARALLELISM_DIM_0-1:0]; + // There are WEIGHT_PARALLELISM_DIM_0 number of dot product instances with DATA_IN_0_TENSOR_SIZE_DIM_0 inputs + // and each one computes for IN_0_DEPTH iterations for each inputs. + for (genvar i = 0; i < WEIGHT_PARALLELISM_DIM_1; i = i + 1) begin : linear + // Assume the weight are transposed and partitioned + logic [WEIGHT_PRECISION_0-1:0] current_weight[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + assign current_weight = weight[DATA_IN_0_PARALLELISM_DIM_0*(i+1)-1:DATA_IN_0_PARALLELISM_DIM_0*i]; + + logic [FDP_WIDTH-1:0] fdp_data_out; + logic fdp_data_out_valid, fdp_data_out_ready; + + // The inputs are already sync-ed by the previous join + fixed_dot_product #( + .IN_WIDTH(DATA_IN_0_PRECISION_0), + .WEIGHT_WIDTH(WEIGHT_PRECISION_0), + .IN_SIZE(DATA_IN_0_PARALLELISM_DIM_0) + ) fdp_inst ( + .clk(clk), + .rst(rst), + .data_in(data_in_0), + .data_in_valid(fdp_join_valid), + .data_in_ready(fdp_data_ready[i]), + .weight(current_weight), + .weight_valid(fdp_join_valid), + .weight_ready(fdp_weight_ready[i]), + .data_out(fdp_data_out), + .data_out_valid(fdp_data_out_valid), + .data_out_ready(fdp_data_out_ready) + ); + + /* verilator lint_off UNUSEDSIGNAL */ + logic acc_data_out_valid, acc_data_out_ready; + /* verilator lint_on UNUSEDSIGNAL */ + + fixed_accumulator #( + .IN_WIDTH(FDP_WIDTH), + .IN_DEPTH(IN_0_DEPTH) + ) fixed_accumulator_inst ( + .clk(clk), + .rst(rst), + .data_in(fdp_data_out), + .data_in_valid(fdp_data_out_valid), + .data_in_ready(fdp_data_out_ready), + .data_out(acc_data_out[i]), + .data_out_valid(acc_data_out_valid), + .data_out_ready(acc_data_out_ready) + ); + + // Assume the parallelised hardware above have the same arrival 
time + // which means that they always have the same state. So we can just + // pick one of the valid signal to use. + assign acc_data_out_ready = acc_ready; + end + + + if (HAS_BIAS == 1) begin + logic [ACC_WIDTH-1:0] bias_sext[BIAS_PARALLELISM_DIM_0-1:0]; + logic acc_join_valid, acc_join_ready; + logic [DATA_OUT_0_PARALLELISM_DIM_0-1:0] reg_ready; + + join2 #() acc_join_inst ( + .data_in_ready ({bias_ready, acc_ready}), + .data_in_valid ({bias_valid, linear[0].acc_data_out_valid}), + .data_out_valid(data_out_0_valid), + .data_out_ready(data_out_0_ready) + ); + + fixed_rounding #( + .IN_SIZE(DATA_OUT_0_PARALLELISM_DIM_0), + .IN_WIDTH(BIAS_PRECISION_0), + .IN_FRAC_WIDTH(BIAS_PRECISION_1), + .OUT_WIDTH(ACC_WIDTH), + .OUT_FRAC_WIDTH(DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1) + ) bias_cast ( + .data_in (bias), + .data_out(bias_sext) + ); + + for (genvar i = 0; i < DATA_OUT_0_PARALLELISM_DIM_0; i = i + 1) begin : add_bias + assign cast_data_out_0[i] = $signed(acc_data_out[i]) + $signed(bias_sext[i]); + end + end else begin + assign acc_ready = data_out_0_ready; + assign data_out_0_valid = linear[0].acc_data_out_valid; + assign cast_data_out_0 = acc_data_out; + assign bias_ready = 1; + end + fixed_rounding #( + .IN_SIZE(DATA_OUT_0_PARALLELISM_DIM_0), + .IN_WIDTH(LOSSLESS_OUT_WIDTH), + .IN_FRAC_WIDTH(DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1), + .OUT_WIDTH(DATA_OUT_0_PRECISION_0), + .OUT_FRAC_WIDTH(DATA_OUT_0_PRECISION_1) + ) bias_cast ( + .data_in (cast_data_out_0), + .data_out(data_out_0) + ); + +endmodule diff --git a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py index c7683cbe3..d263edd94 100644 --- a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py +++ b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_tb.py @@ -52,6 +52,17 @@ def __init__(self, dut) -> None: check=True, ) + # self.data_out_0_monitor = 
ErrorThresholdStreamMonitor( + # dut.clk, + # dut.data_out_0, + # dut.data_out_0_valid, + # dut.data_out_0_ready, + # width=self.get_parameter("DATA_OUT_0_PRECISION_0"), + # signed=True, + # error_bits=1, + # check=True, + # ) + # Model self.model = LinearInteger( in_features=self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), @@ -109,80 +120,80 @@ def preprocess_tensor(self, tensor, config, parallelism): blocks.append(dim_1_split[i][j].flatten().tolist()) return blocks - async def run_test(self, us): + async def run_test(self, batches=1, us=100): await self.reset() self.log.info(f"Reset finished") self.data_out_0_monitor.ready.value = 1 + for _ in range(batches): + inputs = self.generate_inputs() + exp_out = self.model(inputs) + + # * Load the inputs driver + self.log.info(f"Processing inputs: {inputs}") + inputs = self.preprocess_tensor( + tensor=inputs, + config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], + ) + self.data_in_0_driver.load_driver(inputs) - inputs = self.generate_inputs() - exp_out = self.model(inputs) - - # * Load the inputs driver - self.log.info(f"Processing inputs: {inputs}") - inputs = self.preprocess_tensor( - tensor=inputs, - config={ - "width": self.get_parameter("DATA_IN_0_PRECISION_0"), - "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), - self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), - ], - ) - self.data_in_0_driver.load_driver(inputs) - - # * Load the weights driver - if self.get_parameter("WEIGHTS_PRE_TRANSPOSED") == 1: - weights = self.model.weight.transpose(0, 1) - else: - weights = self.model.weight - - self.log.info(f"Processing weights: {weights}") - weights = self.preprocess_tensor( - tensor=weights, - config={ - "width": 
self.get_parameter("WEIGHT_PRECISION_0"), - "frac_width": self.get_parameter("WEIGHT_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), - self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), - ], - ) - self.weight_driver.load_driver(weights) + # * Load the weights driver + if self.get_parameter("WEIGHTS_PRE_TRANSPOSED") == 1: + weights = self.model.weight.transpose(0, 1) + else: + weights = self.model.weight - # * Load the bias driver - if self.get_parameter("HAS_BIAS") == 1: - bias = self.model.bias - self.log.info(f"Processing bias: {bias}") - bias = self.preprocess_tensor( - tensor=bias, + self.log.info(f"Processing weights: {weights}") + weights = self.preprocess_tensor( + tensor=weights, config={ - "width": self.get_parameter("BIAS_PRECISION_0"), - "frac_width": self.get_parameter("BIAS_PRECISION_1"), + "width": self.get_parameter("WEIGHT_PRECISION_0"), + "frac_width": self.get_parameter("WEIGHT_PRECISION_1"), }, parallelism=[ - self.get_parameter("BIAS_PARALLELISM_DIM_1"), - self.get_parameter("BIAS_PARALLELISM_DIM_0"), + self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), + self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), ], ) - self.bias_driver.load_driver(bias) - - # * Load the output monitor - self.log.info(f"Processing outputs: {exp_out}") - outs = self.preprocess_tensor( - tensor=exp_out, - config={ - "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), - "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), - self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), - ], - ) - self.data_out_0_monitor.load_monitor(outs) + self.weight_driver.load_driver(weights) + + # * Load the bias driver + if self.get_parameter("HAS_BIAS") == 1: + bias = self.model.bias + self.log.info(f"Processing bias: {bias}") + bias = self.preprocess_tensor( + tensor=bias, + config={ + "width": self.get_parameter("BIAS_PRECISION_0"), + "frac_width": 
self.get_parameter("BIAS_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("BIAS_PARALLELISM_DIM_1"), + self.get_parameter("BIAS_PARALLELISM_DIM_0"), + ], + ) + self.bias_driver.load_driver(bias) + + # * Load the output monitor + self.log.info(f"Processing outputs: {exp_out}") + outs = self.preprocess_tensor( + tensor=exp_out, + config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ], + ) + self.data_out_0_monitor.load_monitor(outs) await Timer(us, units="us") assert self.data_out_0_monitor.exp_queue.empty() @@ -191,9 +202,20 @@ async def run_test(self, us): @cocotb.test() async def cocotb_test(dut): tb = LinearTB(dut) - await tb.run_test(us=100) + await tb.run_test(batches=10, us=100) +async def check_signal(dut, log): + num = {"data_out_0": 0, "data_in_0": 0} + while True: + await RisingEdge(dut.clk) + + +# verified case +# weight per transpoed = 0 +# weight pre transposed = 1 +# has bias = 0 +# has bias = 1 def get_fixed_linear_config(kwargs={}): # if pretranspose # weight1 = in0 @@ -201,22 +223,22 @@ def get_fixed_linear_config(kwargs={}): # weight0 = in0 config = { "HAS_BIAS": 1, - "WEIGHTS_PRE_TRANSPOSED": 1, + "WEIGHTS_PRE_TRANSPOSED": 0, "DATA_IN_0_TENSOR_SIZE_DIM_0": 32, "DATA_IN_0_TENSOR_SIZE_DIM_1": 16, - "DATA_IN_0_PARALLELISM_DIM_0": 4, + "DATA_IN_0_PARALLELISM_DIM_0": 8, "DATA_IN_0_PARALLELISM_DIM_1": 4, - "WEIGHT_TENSOR_SIZE_DIM_0": 16, - "WEIGHT_TENSOR_SIZE_DIM_1": 32, - "WEIGHT_PARALLELISM_DIM_0": 2, + "WEIGHT_TENSOR_SIZE_DIM_0": 32, + "WEIGHT_TENSOR_SIZE_DIM_1": 16, + "WEIGHT_PARALLELISM_DIM_0": 8, "WEIGHT_PARALLELISM_DIM_1": 4, "DATA_IN_0_PRECISION_0": 8, "DATA_IN_0_PRECISION_1": 4, - "WEIGHT_PRECISION_0": 8, - "WEIGHT_PRECISION_1": 4, - "BIAS_PRECISION_0": 8, - "BIAS_PRECISION_1": 4, - "DATA_OUT_0_PRECISION_0": 10, + 
"WEIGHT_PRECISION_0": 10, + "WEIGHT_PRECISION_1": 3, + "BIAS_PRECISION_0": 5, + "BIAS_PRECISION_1": 2, + "DATA_OUT_0_PRECISION_0": 8, "DATA_OUT_0_PRECISION_1": 4, } config.update(kwargs) @@ -246,44 +268,44 @@ def test_fixed_linear_smoke(): ) -@pytest.mark.dev -def test_fixed_linear_regression(): - """ - More extensive tests to check realistic parameter sizes. - """ - mase_runner( - trace=True, - module_param_list=[ - get_fixed_linear_config( - { - "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, - "DATA_IN_0_PARALLELISM_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_0": 768, - "WEIGHT_TENSOR_SIZE_DIM_1": 768, - "WEIGHT_PARALLELISM_DIM_0": 32, - "WEIGHT_PARALLELISM_DIM_1": 32, - "BIAS_TENSOR_SIZE_DIM_0": 768, - "BIAS_PARALLELISM_DIM_0": 32, - } - ), - get_fixed_linear_config( - { - "HAS_BIAS": 1, - "WEIGHTS_PRE_TRANSPOSED": 0, - "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, - "DATA_IN_0_PARALLELISM_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_0": 768, - "WEIGHT_TENSOR_SIZE_DIM_1": 768, - "WEIGHT_PARALLELISM_DIM_0": 32, - "WEIGHT_PARALLELISM_DIM_1": 32, - "BIAS_TENSOR_SIZE_DIM_0": 768, - "BIAS_PARALLELISM_DIM_0": 32, - } - ), - ], - ) - - +# @pytest.mark.dev +# def test_fixed_linear_regression(): +# """ +# More extensive tests to check realistic parameter sizes. 
+# """ +# mase_runner( +# trace=True, +# module_param_list=[ +# get_fixed_linear_config( +# { +# "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, +# "DATA_IN_0_PARALLELISM_DIM_0": 32, +# "WEIGHT_TENSOR_SIZE_DIM_0": 768, +# "WEIGHT_TENSOR_SIZE_DIM_1": 768, +# "WEIGHT_PARALLELISM_DIM_0": 32, +# "WEIGHT_PARALLELISM_DIM_1": 32, +# "BIAS_TENSOR_SIZE_DIM_0": 768, +# "BIAS_PARALLELISM_DIM_0": 32, +# } +# ), +# get_fixed_linear_config( +# { +# "HAS_BIAS": 1, +# "WEIGHTS_PRE_TRANSPOSED": 0, +# "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, +# "DATA_IN_0_PARALLELISM_DIM_0": 32, +# "WEIGHT_TENSOR_SIZE_DIM_0": 768, +# "WEIGHT_TENSOR_SIZE_DIM_1": 768, +# "WEIGHT_PARALLELISM_DIM_0": 32, +# "WEIGHT_PARALLELISM_DIM_1": 32, +# "BIAS_TENSOR_SIZE_DIM_0": 768, +# "BIAS_PARALLELISM_DIM_0": 32, +# } +# ), +# ], +# ) + +torch.manual_seed(3) if __name__ == "__main__": test_fixed_linear_smoke() # test_fixed_linear_regression() diff --git a/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py new file mode 100644 index 000000000..9976fe97b --- /dev/null +++ b/src/mase_components/linear_layers/fixed_linear_layer/test/fixed_linear_with_input_circular_tb.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 + +import os, pytest + +import torch +import logging +from functools import partial + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer, RisingEdge + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import ( + StreamDriver, + StreamMonitor, + ErrorThresholdStreamMonitor, +) +from mase_cocotb.runner import mase_runner + +from mase_cocotb.utils import bit_driver + +# from mase_cocotb import Testbench, StreamDriver, StreamMonitor, mase_runner +from chop.nn.quantized.modules.linear import LinearInteger +from chop.nn.quantizers import integer_floor_quantizer + + +class LinearTB(Testbench): + def __init__(self, dut) -> 
None: + super().__init__(dut, dut.clk, dut.rst) + + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + self.log.setLevel(logging.DEBUG) + + self.data_in_0_driver = StreamDriver( + dut.clk, dut.data_in_0, dut.data_in_0_valid, dut.data_in_0_ready + ) + self.weight_driver = StreamDriver( + dut.clk, dut.weight, dut.weight_valid, dut.weight_ready + ) + + if self.get_parameter("HAS_BIAS") == 1: + self.bias_driver = StreamDriver( + dut.clk, dut.bias, dut.bias_valid, dut.bias_ready + ) + self.bias_driver.log.setLevel(logging.DEBUG) + + self.data_out_0_monitor = StreamMonitor( + dut.clk, + dut.data_out_0, + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True, + ) + + # Model + self.model = LinearInteger( + in_features=self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), + out_features=self.get_parameter("DATA_OUT_0_TENSOR_SIZE_DIM_0"), + bias=True if self.get_parameter("HAS_BIAS") == 1 else False, + config={ + "data_in_width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "data_in_frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + "weight_width": self.get_parameter("WEIGHT_PRECISION_0"), + "weight_frac_width": self.get_parameter("WEIGHT_PRECISION_1"), + "bias_width": self.get_parameter("BIAS_PRECISION_0"), + "bias_frac_width": self.get_parameter("BIAS_PRECISION_1"), + }, + out_config={ + "data_out_width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "data_out_frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + floor=True, + ) + + # Set verbosity of driver and monitor loggers to debug + self.data_in_0_driver.log.setLevel(logging.DEBUG) + self.weight_driver.log.setLevel(logging.DEBUG) + self.data_out_0_monitor.log.setLevel(logging.DEBUG) + + def generate_inputs(self): + return torch.randn( + ( + self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_1"), + self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), + ) + ) + + def preprocess_tensor(self, tensor, config, parallelism): + if len(tensor.shape) == 1: + tensor = 
tensor.unsqueeze(0) + + # Quantize + quantizer = partial(integer_floor_quantizer, **config) + q_tensor = quantizer(tensor) + self.log.debug(f"Quantized tensor: {q_tensor}") + + # Convert to integer format + q_tensor = (q_tensor * 2 ** config["frac_width"]).int() + self.log.debug(f"Tensor in integer format: {q_tensor}") + + # Split into chunks according to parallelism in each dimension + # parallelism[0]: along rows, parallelism[1]: along columns + dim_0_split = q_tensor.split(parallelism[0], dim=0) + dim_1_split = [x.split(parallelism[1], dim=1) for x in dim_0_split] + blocks = [] + # Flatten the list of blocks + for i in range(len(dim_1_split)): + for j in range(len(dim_1_split[i])): + blocks.append(dim_1_split[i][j].flatten().tolist()) + return blocks + + async def run_test(self, us): + await self.reset() + self.log.info(f"Reset finished") + self.data_out_0_monitor.ready.value = 1 + + inputs = self.generate_inputs() + exp_out = self.model(inputs) + + # * Load the inputs driver + self.log.info(f"Processing inputs: {inputs}") + inputs = self.preprocess_tensor( + tensor=inputs, + config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], + ) + self.data_in_0_driver.load_driver(inputs) + + # * Load the weights driver + weights = self.model.weight + + self.log.info(f"Processing weights: {weights}") + weights = self.preprocess_tensor( + tensor=weights, + config={ + "width": self.get_parameter("WEIGHT_PRECISION_0"), + "frac_width": self.get_parameter("WEIGHT_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), + self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), + ], + ) + self.weight_driver.load_driver(weights) + + # * Load the bias driver + if self.get_parameter("HAS_BIAS") == 1: + bias = self.model.bias + self.log.info(f"Processing bias: 
{bias}") + bias = self.preprocess_tensor( + tensor=bias, + config={ + "width": self.get_parameter("BIAS_PRECISION_0"), + "frac_width": self.get_parameter("BIAS_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("BIAS_PARALLELISM_DIM_1"), + self.get_parameter("BIAS_PARALLELISM_DIM_0"), + ], + ) + self.bias_driver.load_driver(bias) + + # * Load the output monitor + self.log.info(f"Processing outputs: {exp_out}") + outs = self.preprocess_tensor( + tensor=exp_out, + config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ], + ) + self.data_out_0_monitor.load_monitor(outs) + + await Timer(us, units="us") + assert self.data_out_0_monitor.exp_queue.empty() + + +@cocotb.test() +async def cocotb_test(dut): + tb = LinearTB(dut) + await tb.run_test(us=100) + + +@cocotb.test() +async def repeated_mult_valid_backpressure(dut): + tb = LinearTB(dut) + tb.data_in_0_driver.set_valid_prob(0.7) + tb.weight_driver.set_valid_prob(0.7) + cocotb.start_soon(bit_driver(dut.data_out_0_ready, dut.clk, 0.6)) + await tb.run_test(us=200) + + +def get_fixed_linear_config(kwargs={}): + # if pretranspose + # weight1 = in0 + # else + # weight0 = in0 + config = { + "HAS_BIAS": 1, + "DATA_IN_0_TENSOR_SIZE_DIM_0": 32, + "DATA_IN_0_TENSOR_SIZE_DIM_1": 16, + "DATA_IN_0_PARALLELISM_DIM_0": 8, + "DATA_IN_0_PARALLELISM_DIM_1": 4, + "WEIGHT_TENSOR_SIZE_DIM_0": 32, + "WEIGHT_TENSOR_SIZE_DIM_1": 24, + "WEIGHT_PARALLELISM_DIM_0": 8, + "WEIGHT_PARALLELISM_DIM_1": 2, + "DATA_IN_0_PRECISION_0": 8, + "DATA_IN_0_PRECISION_1": 4, + "WEIGHT_PRECISION_0": 10, + "WEIGHT_PRECISION_1": 3, + "BIAS_PRECISION_0": 5, + "BIAS_PRECISION_1": 2, + "DATA_OUT_0_PRECISION_0": 8, + "DATA_OUT_0_PRECISION_1": 4, + } + config.update(kwargs) + return config + + +@pytest.mark.dev +def test_fixed_linear_smoke(): + """ + Some quick 
tests to check if the module is working. + """ + mase_runner( + trace=True, + module_param_list=[ + get_fixed_linear_config(), + # noticed here if change WEIGHT_PRE_TRANSPOSED also need to change the DIM_SIZE to match ACTIVATION + ], + ) + + +@pytest.mark.dev +def test_fixed_linear_regression(): + """ + More extensive tests to check realistic parameter sizes. + """ + mase_runner( + trace=True, + module_param_list=[ + get_fixed_linear_config( + { + "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, + "DATA_IN_0_PARALLELISM_DIM_0": 32, + "WEIGHT_TENSOR_SIZE_DIM_0": 768, + "WEIGHT_TENSOR_SIZE_DIM_1": 768, + "WEIGHT_PARALLELISM_DIM_0": 32, + "WEIGHT_PARALLELISM_DIM_1": 32, + "BIAS_TENSOR_SIZE_DIM_0": 768, + "BIAS_PARALLELISM_DIM_0": 32, + } + ), + get_fixed_linear_config( + { + "HAS_BIAS": 1, + "WEIGHTS_PRE_TRANSPOSED": 0, + "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, + "DATA_IN_0_PARALLELISM_DIM_0": 32, + "WEIGHT_TENSOR_SIZE_DIM_0": 768, + "WEIGHT_TENSOR_SIZE_DIM_1": 768, + "WEIGHT_PARALLELISM_DIM_0": 32, + "WEIGHT_PARALLELISM_DIM_1": 32, + "BIAS_TENSOR_SIZE_DIM_0": 768, + "BIAS_PARALLELISM_DIM_0": 32, + } + ), + ], + ) + + +if __name__ == "__main__": + test_fixed_linear_smoke() + # test_fixed_linear_regression() diff --git a/src/mase_components/linear_layers/fixed_operators/rtl/fixed_accumulator.sv b/src/mase_components/linear_layers/fixed_operators/rtl/fixed_accumulator.sv index 1f6f48c64..c99523b6e 100644 --- a/src/mase_components/linear_layers/fixed_operators/rtl/fixed_accumulator.sv +++ b/src/mase_components/linear_layers/fixed_operators/rtl/fixed_accumulator.sv @@ -1,4 +1,5 @@ `timescale 1ns / 1ps +`timescale 1ns / 1ps module fixed_accumulator #( parameter IN_DEPTH = 4, parameter IN_WIDTH = 32, @@ -65,3 +66,59 @@ module fixed_accumulator #( endmodule + +// module fixed_accumulator #( +// parameter IN_DEPTH = 4, +// parameter IN_WIDTH = 32, +// parameter OUT_WIDTH = $clog2(IN_DEPTH) + IN_WIDTH +// ) ( +// input logic clk, +// input logic rst, + +// input logic [IN_WIDTH-1:0] data_in, 
+// input logic data_in_valid, +// output logic data_in_ready, + +// output logic [OUT_WIDTH-1:0] data_out, +// output logic data_out_valid, +// input logic data_out_ready +// ); +// // 1-bit wider so IN_DEPTH also fits. +// localparam COUNTER_WIDTH = $clog2(IN_DEPTH); +// logic [COUNTER_WIDTH:0] counter; + +// // Sign extension before feeding into the accumulator +// logic [ OUT_WIDTH-1:0] data_in_sext; +// assign data_in_sext = {{(OUT_WIDTH - IN_WIDTH) {data_in[IN_WIDTH-1]}}, data_in}; + +// /* verilator lint_off WIDTH */ +// assign data_in_ready = (counter != IN_DEPTH) || data_out_ready; +// assign data_out_valid = (counter == IN_DEPTH); +// /* verilator lint_on WIDTH */ + +// // counter +// always_ff @(posedge clk) +// if (rst) counter <= 0; +// else begin +// if (data_out_valid) begin +// if (data_out_ready) begin +// if (data_in_valid) counter <= 1; +// else counter <= 0; +// end +// end else if (data_in_valid && data_in_ready) counter <= counter + 1; +// end + +// // data_out +// always_ff @(posedge clk) +// if (rst) data_out <= '0; +// else begin +// if (data_out_valid) begin +// if (data_out_ready) begin +// if (data_in_valid) data_out <= data_in_sext; +// else data_out <= '0; +// end +// end else if (data_in_valid && data_in_ready) data_out <= data_out + data_in_sext; +// end + + +// endmodule diff --git a/src/mase_components/linear_layers/fixed_operators/rtl/fixed_adder.sv b/src/mase_components/linear_layers/fixed_operators/rtl/fixed_adder.sv index ab49b5049..79c70abff 100644 --- a/src/mase_components/linear_layers/fixed_operators/rtl/fixed_adder.sv +++ b/src/mase_components/linear_layers/fixed_operators/rtl/fixed_adder.sv @@ -76,17 +76,18 @@ module fixed_adder #( ); // * Cast the sum to the requested output precision - fixed_cast #( + fixed_rounding #( .IN_SIZE (DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1), .IN_WIDTH (SUM_PRECISION_0), .IN_FRAC_WIDTH (SUM_PRECISION_1), .OUT_WIDTH (DATA_OUT_0_PRECISION_0), 
.OUT_FRAC_WIDTH(DATA_OUT_0_PRECISION_1) - ) bias_cast_i ( + ) output_cast ( .data_in (add_result), .data_out(cast_out) ); + // * Register the output unpacked_register_slice #( .DATA_WIDTH(DATA_OUT_0_PRECISION_0), @@ -109,7 +110,7 @@ module fixed_adder #( // * Do the sum for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1; i++) begin - assign add_result[i] = data_in_0[i] + data_in_1[i]; + assign add_result[i] = $signed(data_in_0[i]) + $signed(data_in_1[i]); end endmodule diff --git a/src/mase_components/linear_layers/fixed_operators/rtl/fixed_adder_tree.sv b/src/mase_components/linear_layers/fixed_operators/rtl/fixed_adder_tree.sv index 99d6f409f..f52400f26 100644 --- a/src/mase_components/linear_layers/fixed_operators/rtl/fixed_adder_tree.sv +++ b/src/mase_components/linear_layers/fixed_operators/rtl/fixed_adder_tree.sv @@ -56,7 +56,7 @@ module fixed_adder_tree #( .data_out(sum[i]) // flattened LEVEL_OUT_SIZE * LEVEL_OUT_WIDTH ); - skid_buffer #( + register_slice #( .DATA_WIDTH(LEVEL_OUT_SIZE * LEVEL_OUT_WIDTH) ) register_slice ( .clk (clk), diff --git a/src/mase_components/linear_layers/fixed_operators/rtl/fixed_vector_mult.sv b/src/mase_components/linear_layers/fixed_operators/rtl/fixed_vector_mult.sv index 2a6b0757a..2123c26e3 100644 --- a/src/mase_components/linear_layers/fixed_operators/rtl/fixed_vector_mult.sv +++ b/src/mase_components/linear_layers/fixed_operators/rtl/fixed_vector_mult.sv @@ -68,7 +68,7 @@ module fixed_vector_mult #( assign product_data_in[PRODUCT_WIDTH*i+PRODUCT_WIDTH-1:PRODUCT_WIDTH*i] = product_vector[i]; end - skid_buffer #( + register_slice #( .DATA_WIDTH($bits(product_vector)) ) register_slice ( .clk (clk), diff --git a/src/mase_components/linear_layers/matmul/rtl/matmul.sv b/src/mase_components/linear_layers/matmul/rtl/matmul.sv index e8443426a..ba418bd55 100644 --- a/src/mase_components/linear_layers/matmul/rtl/matmul.sv +++ b/src/mase_components/linear_layers/matmul/rtl/matmul.sv @@ -134,6 +134,9 @@ 
module matmul #( // Matrix unflatten output logic [B_WIDTH-1:0] b_buffer_out_data[B_COMPUTE_DIM0*B_COMPUTE_DIM1-1:0]; + logic [SM_OUT_WIDTH-1:0] buffered_sm_out_data[C_COMPUTE_DIM0*C_COMPUTE_DIM1]; + logic buffered_sm_out_valid, buffered_sm_out_ready; + logic [SM_OUT_WIDTH-1:0] sm_out_data[C_COMPUTE_DIM0*C_COMPUTE_DIM1]; logic sm_out_valid, sm_out_ready; @@ -275,10 +278,24 @@ module matmul #( .y_data (b_buffer_out_data), .y_valid (b_buffer_out_valid), .y_ready (b_buffer_out_ready), - .out_data (sm_out_data), - .out_valid(sm_out_valid), - .out_ready(sm_out_ready) + .out_data (buffered_sm_out_data), + .out_valid(buffered_sm_out_valid), + .out_ready(buffered_sm_out_ready) ); + //cut the long ready path + unpacked_skid_buffer #( + .DATA_WIDTH(SM_OUT_WIDTH), + .IN_NUM (C_COMPUTE_DIM0 * C_COMPUTE_DIM1) + ) sm_out_reg_slice ( + .clk (clk), + .rst (rst), + .data_in (buffered_sm_out_data), + .data_in_valid (buffered_sm_out_valid), + .data_in_ready (buffered_sm_out_ready), + .data_out (sm_out_data), + .data_out_valid(sm_out_valid), + .data_out_ready(sm_out_ready) + ); // Direct the result of the simple matmul to the correct matrix_accumulator diff --git a/src/mase_components/linear_layers/mxint_operators/doc/accumulator.drawio b/src/mase_components/linear_layers/mxint_operators/doc/accumulator.drawio new file mode 100644 index 000000000..0b0e04599 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/accumulator.drawio @@ -0,0 +1,465 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/doc/cast.drawio b/src/mase_components/linear_layers/mxint_operators/doc/cast.drawio new file mode 100644 index 000000000..63fd3d989 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/cast.drawio @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/doc/conv.drawio b/src/mase_components/linear_layers/mxint_operators/doc/conv.drawio new file mode 100644 index 000000000..40f4a156e --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/conv.drawio @@ -0,0 +1,403 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No 
newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/doc/dot_product.drawio b/src/mase_components/linear_layers/mxint_operators/doc/dot_product.drawio new file mode 100644 index 000000000..16ab30ca7 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/dot_product.drawio @@ -0,0 +1,439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/doc/exp.drawio b/src/mase_components/linear_layers/mxint_operators/doc/exp.drawio new file mode 100644 index 000000000..a1b9f0e0b --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/exp.drawio @@ -0,0 +1,160 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/doc/gelu.drawio 
b/src/mase_components/linear_layers/mxint_operators/doc/gelu.drawio new file mode 100644 index 000000000..24c048080 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/gelu.drawio @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/doc/head.drawio b/src/mase_components/linear_layers/mxint_operators/doc/head.drawio new file mode 100644 index 000000000..107f23d99 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/head.drawio @@ -0,0 +1,706 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No 
newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/doc/matmul.drawio b/src/mase_components/linear_layers/mxint_operators/doc/matmul.drawio new file mode 100644 index 000000000..c6ce9b553 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/matmul.drawio @@ -0,0 +1,652 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/doc/mxint_exp.drawio b/src/mase_components/linear_layers/mxint_operators/doc/mxint_exp.drawio new file mode 100644 index 000000000..cfdb97300 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/mxint_exp.drawio @@ -0,0 +1,13 @@ + + + + + + + + + + + + + 
\ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/doc/range_reduction.drawio b/src/mase_components/linear_layers/mxint_operators/doc/range_reduction.drawio new file mode 100644 index 000000000..ff639368f --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/range_reduction.drawio @@ -0,0 +1,4528 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/doc/self_attention.drawio b/src/mase_components/linear_layers/mxint_operators/doc/self_attention.drawio new file mode 100644 index 000000000..e69de29bb diff --git a/src/mase_components/linear_layers/mxint_operators/doc/softmax.drawio b/src/mase_components/linear_layers/mxint_operators/doc/softmax.drawio new file mode 100644 index 000000000..3b9d88ec9 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/doc/softmax.drawio @@ -0,0 +1,244 @@ + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/fixed_taylor_exp.sv b/src/mase_components/linear_layers/mxint_operators/rtl/fixed_taylor_exp.sv new file mode 100644 index 000000000..77e16dc1b --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/fixed_taylor_exp.sv @@ -0,0 +1,92 @@ +`timescale 1ns / 1ps +module fixed_taylor_exp #( + /* verilator lint_off UNUSEDPARAM */ + parameter DATA_IN_WIDTH = 4, + parameter DATA_IN_FRAC_WIDTH = 4, + parameter DATA_OUT_WIDTH = 8, + parameter DATA_OUT_FRAC_WIDTH = 8 +) ( + /* verilator lint_off UNUSEDSIGNAL */ + input rst, + input clk, + input logic [DATA_IN_WIDTH-1:0] data_in_0, + input logic data_in_0_valid, + output logic data_in_0_ready, + + output logic [DATA_OUT_WIDTH-1:0] data_out_0, + output logic data_out_0_valid, + input logic data_out_0_ready +); + + localparam ORDERS = 4; + logic [DATA_OUT_WIDTH-1:0] powers[ORDERS - 1:0]; + logic [DATA_OUT_WIDTH-1:0] powers_register_in[ORDERS - 1:0]; + logic [DATA_OUT_WIDTH-1:0] powers_with_coefficient[ORDERS - 1:0]; + logic powers_valid, powers_ready; + + power #( + .DATA_IN_WIDTH(DATA_IN_WIDTH), + .DATA_IN_FRAC_WIDTH(DATA_IN_FRAC_WIDTH), + .DATA_OUT_WIDTH(DATA_OUT_WIDTH), + .DATA_OUT_FRAC_WIDTH(DATA_OUT_FRAC_WIDTH), + .ORDERS(ORDERS) + ) power_inst ( + .data_in (data_in_0), + .data_out(powers_register_in) + ); + unpacked_register_slice #( + .DATA_WIDTH(DATA_OUT_WIDTH), + .IN_SIZE (ORDERS) + ) register_slice_i ( + .clk(clk), + .rst(rst), + + 
.data_in(powers_register_in), + .data_in_valid(data_in_0_valid), + .data_in_ready(data_in_0_ready), + + .data_out(powers), + .data_out_valid(powers_valid), + .data_out_ready(powers_ready) + ); + assign powers_with_coefficient[0] = powers[0]; + assign powers_with_coefficient[1] = powers[1]; + assign powers_with_coefficient[2] = powers[2] >>> 1; + assign powers_with_coefficient[3] = $signed(powers[3]) * 4'b0111 >>> 5; + + assign data_out_0 = powers_with_coefficient[0] + powers_with_coefficient[1] + powers_with_coefficient[2] + powers_with_coefficient[3]; + assign data_out_0_valid = powers_valid; + assign powers_ready = data_out_0_ready; + + +endmodule + +module power #( + parameter DATA_IN_WIDTH = 8, + parameter DATA_IN_FRAC_WIDTH = 4, + parameter DATA_OUT_WIDTH = 8, + parameter DATA_OUT_FRAC_WIDTH = 4, + parameter ORDERS = 4 +) ( + input logic [ DATA_IN_WIDTH-1:0] data_in, + output logic [DATA_OUT_WIDTH-1:0] data_out[ORDERS - 1:0] +); + + assign data_out[0] = 1 << DATA_OUT_FRAC_WIDTH; + + for (genvar i = 0; i < ORDERS - 1; i++) begin + logic [DATA_IN_WIDTH * 2 - 1:0] intermediate_data_out; + assign intermediate_data_out = $signed(data_out[i]) * $signed(data_in); + fixed_signed_cast #( + .IN_WIDTH(DATA_IN_WIDTH * 2), + .IN_FRAC_WIDTH(DATA_IN_FRAC_WIDTH * 2), + .OUT_WIDTH(DATA_OUT_WIDTH), + .OUT_FRAC_WIDTH(DATA_OUT_FRAC_WIDTH), + .ROUND_FLOOR(1) + ) fr_inst ( + .in_data (intermediate_data_out), + .out_data(data_out[i+1]) + ); + end + +endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/log2_max_abs.sv b/src/mase_components/linear_layers/mxint_operators/rtl/log2_max_abs.sv index f3ed5beac..52a3f2656 100644 --- a/src/mase_components/linear_layers/mxint_operators/rtl/log2_max_abs.sv +++ b/src/mase_components/linear_layers/mxint_operators/rtl/log2_max_abs.sv @@ -13,12 +13,12 @@ module log2_max_abs #( input logic clk, input logic rst, /* verilator lint_on UNUSEDSIGNAL */ - input logic [ IN_WIDTH-1:0] data_in [IN_SIZE-1:0], - input logic 
data_in_valid, - output logic data_in_ready, - output logic [OUT_WIDTH-1:0] data_out, - output logic data_out_valid, - input logic data_out_ready + input logic [ IN_WIDTH-1:0] data_in_0 [IN_SIZE-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + output logic [OUT_WIDTH-1:0] data_out_0, + output logic data_out_0_valid, + input logic data_out_0_ready ); logic [IN_WIDTH - 1:0] or_result; logic [IN_WIDTH - 1:0] abs_data_in[IN_SIZE - 1:0]; @@ -26,28 +26,28 @@ module log2_max_abs #( abs #( .IN_WIDTH(IN_WIDTH) ) abs_i ( - .data_in (data_in[i]), + .data_in (data_in_0[i]), .data_out(abs_data_in[i]) ); end or_tree #( .IN_SIZE (IN_SIZE), - .IN_WIDTH(IN_WIDTH), - ) max_bas_i ( + .IN_WIDTH(IN_WIDTH) + ) or_tree_i ( .clk, .rst, .data_in(abs_data_in), - .data_in_valid(data_in_valid), - .data_in_ready(data_in_ready), + .data_in_valid(data_in_0_valid), + .data_in_ready(data_in_0_ready), .data_out(or_result), - .data_out_valid(data_out_valid), - .data_out_ready(data_out_ready) + .data_out_valid(data_out_0_valid), + .data_out_ready(data_out_0_ready) ); log2_value #( - .IN_WIDTH(IN_WIDTH), + .IN_WIDTH(IN_WIDTH) ) log2_i ( .data_in (or_result), - .data_out(data_out) + .data_out(data_out_0) ); endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_accumulator.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_accumulator.sv index ccbf0ddf3..7e7eedfea 100644 --- a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_accumulator.sv +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_accumulator.sv @@ -5,14 +5,15 @@ Description : The accumulator for mxint. When inputing different exponent, the mantissa will cast to the same bitwidth then accumulate. 
*/ module mxint_accumulator #( - parameter DATA_IN_0_PRECISION_0 = 8, - parameter DATA_IN_0_PRECISION_1 = 4, + // precision_0 = mantissa_width + // precision_1 = exponent_width + parameter DATA_IN_0_PRECISION_0 = 4, + parameter DATA_IN_0_PRECISION_1 = 8, + parameter UNDERFLOW_BITS = 0, // This parameter represents the number of bits that will be used to allow underflow. parameter BLOCK_SIZE = 4, parameter IN_DEPTH = 2, - parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0 + 2 ** DATA_IN_0_PRECISION_1 + $clog2( - IN_DEPTH - ), - parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1 + localparam DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0 + $clog2(IN_DEPTH) + UNDERFLOW_BITS, + localparam DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1 ) ( input logic clk, input logic rst, @@ -37,15 +38,30 @@ module mxint_accumulator #( assign data_out_0_valid = (counter == IN_DEPTH); /* verilator lint_on WIDTH */ - // mantissa shift - logic [DATA_OUT_0_PRECISION_0 - 1:0] shifted_mdata_in_0[BLOCK_SIZE - 1:0]; - logic [DATA_OUT_0_PRECISION_0 - 1:0] shifted_mdata_out_0[BLOCK_SIZE - 1:0]; + localparam DATA_IN_0_PRECISION_0_EXT = DATA_IN_0_PRECISION_0 + UNDERFLOW_BITS; + localparam DATA_OUT_0_PRECISION_0_EXT = DATA_OUT_0_PRECISION_0 + UNDERFLOW_BITS; + // lossless shift + logic [DATA_IN_0_PRECISION_0_EXT - 1:0] shifted_mdata_in_0[BLOCK_SIZE - 1:0]; + logic [DATA_OUT_0_PRECISION_0_EXT - 1:0] shifted_mdata_out_0[BLOCK_SIZE - 1:0]; + + logic [DATA_IN_0_PRECISION_0_EXT - 1:0] extended_mdata_in_0[BLOCK_SIZE - 1:0]; + logic [DATA_OUT_0_PRECISION_0_EXT - 1:0] extended_mdata_out_0[BLOCK_SIZE - 1:0]; + + logic [DATA_IN_0_PRECISION_0_EXT - 1:0] shifted_mdata_in_list [BLOCK_SIZE - 1:0][DATA_IN_0_PRECISION_0_EXT - 1:0]; + logic [DATA_OUT_0_PRECISION_0_EXT - 1:0] shifted_mdata_out_list [BLOCK_SIZE - 1:0][DATA_OUT_0_PRECISION_0_EXT - 1:0]; logic no_value_in_register; - logic [DATA_IN_0_PRECISION_1 - 1:0] exp_min; + logic [DATA_IN_0_PRECISION_1 - 1:0] exp_max; + + localparam SHIFT_WIDTH = 
DATA_IN_0_PRECISION_1 + 1; + logic [SHIFT_WIDTH - 1:0] mdata_in_shift_value; + logic [SHIFT_WIDTH - 1:0] mdata_in_real_shift_value; + logic [SHIFT_WIDTH - 1:0] mdata_out_shift_value; + logic [SHIFT_WIDTH - 1:0] mdata_out_real_shift_value; + assign no_value_in_register =(counter == 0 || (data_out_0_valid && data_out_0_ready && data_in_0_valid)); - assign exp_min = ($signed(edata_out_0) > $signed(edata_in_0)) ? edata_in_0 : edata_out_0; + assign exp_max = ($signed(edata_out_0) < $signed(edata_in_0)) ? edata_in_0 : edata_out_0; // counter always_ff @(posedge clk) if (rst) counter <= 0; @@ -58,43 +74,55 @@ module mxint_accumulator #( end else if (data_in_0_valid && data_in_0_ready) counter <= counter + 1; end // mantissa + always_comb begin + mdata_in_shift_value = $signed(exp_max) - $signed(edata_in_0); + mdata_out_shift_value = $signed(exp_max) - $signed(edata_out_0); + end - for (genvar i = 0; i < BLOCK_SIZE; i++) begin : mantissa_block - // mantissa shift - for (genvar j = 0; j < 2 ** DATA_IN_0_PRECISION_1; j++) begin : static_shift + for (genvar i = 0; i < BLOCK_SIZE; i++) begin : underflow + always_comb begin + extended_mdata_in_0[i] = $signed(mdata_in_0[i]) <<< UNDERFLOW_BITS; + extended_mdata_out_0[i] = $signed(mdata_out_0[i]); + end + end + for (genvar i = 0; i < BLOCK_SIZE; i++) begin : optimize_variable_shift + for (genvar j = 0; j < DATA_IN_0_PRECISION_0_EXT; j++) begin : data_in_shift + always_comb begin + shifted_mdata_in_list[i][j] = no_value_in_register ? $signed(extended_mdata_in_0[i]) : + $signed(extended_mdata_in_0[i]) >>> j; + end + end + for (genvar k = 0; k < DATA_OUT_0_PRECISION_0_EXT; k++) begin : data_out_shift always_comb begin - if (($signed(edata_in_0) - $signed(exp_min)) == j) - shifted_mdata_in_0[i] = no_value_in_register ? 
$signed( - mdata_in_0[i] - ) : $signed( - mdata_in_0[i] - ) <<< j; - if (($signed(edata_out_0) - $signed(exp_min)) == j) - shifted_mdata_out_0[i] = $signed(mdata_out_0[i]) <<< j; + shifted_mdata_out_list[i][k] = $signed(extended_mdata_out_0[i]) >>> k; end end - // mantissa out + assign shifted_mdata_in_0[i] = shifted_mdata_in_list[i][mdata_in_shift_value]; + assign shifted_mdata_out_0[i] = shifted_mdata_out_list[i][mdata_out_shift_value]; + end + + for (genvar i = 0; i < BLOCK_SIZE; i++) begin : mantissa_block always_ff @(posedge clk) if (rst) mdata_out_0[i] <= '0; else begin if (data_out_0_valid) begin if (data_out_0_ready) begin - if (data_in_0_valid) mdata_out_0[i] <= shifted_mdata_in_0[i]; + if (data_in_0_valid) mdata_out_0[i] <= $signed(shifted_mdata_in_0[i]); else mdata_out_0[i] <= '0; end end else if (data_in_0_valid && data_in_0_ready) mdata_out_0[i] <= $signed(shifted_mdata_out_0[i]) + $signed(shifted_mdata_in_0[i]); end end - localparam signed [DATA_IN_0_PRECISION_1 - 1:0] MAXIMUM_EXPONENTIAL = 2**(DATA_IN_0_PRECISION_1 - 1) - 1; + localparam signed [DATA_IN_0_PRECISION_1 - 1:0] MINIMUM_EXPONENTIAL = - 2**(DATA_IN_0_PRECISION_1 - 1); // exponent always_ff @(posedge clk) - if (rst) edata_out_0 <= MAXIMUM_EXPONENTIAL; + if (rst) edata_out_0 <= MINIMUM_EXPONENTIAL; else if (data_out_0_valid) begin if (data_out_0_ready) begin if (data_in_0_valid) edata_out_0 <= edata_in_0; - else edata_out_0 <= MAXIMUM_EXPONENTIAL; + else edata_out_0 <= MINIMUM_EXPONENTIAL; end - end else if (data_in_0_valid && data_in_0_ready) edata_out_0 <= exp_min; + end else if (data_in_0_valid && data_in_0_ready) edata_out_0 <= exp_max; endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_addition.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_addition.sv new file mode 100644 index 000000000..25eb2af0d --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_addition.sv @@ -0,0 +1,138 @@ +`timescale 1ns / 1ps +module 
mxint_addition #( + // precision_0 represent mantissa width + // precision_1 represent exponent width + // + parameter DATA_IN_0_PRECISION_0 = 8, + parameter DATA_IN_0_PRECISION_1 = 8, + parameter DATA_IN_1_PRECISION_0 = 8, + parameter DATA_IN_1_PRECISION_1 = 8, + + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 20, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20, + parameter DATA_IN_0_TENSOR_SIZE_DIM_2 = 1, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 20, + parameter DATA_IN_0_PARALLELISM_DIM_1 = 20, + parameter DATA_IN_0_PARALLELISM_DIM_2 = 1, + + parameter DATA_IN_1_TENSOR_SIZE_DIM_0 = 20, + parameter DATA_IN_1_TENSOR_SIZE_DIM_1 = 20, + parameter DATA_IN_1_TENSOR_SIZE_DIM_2 = 1, + parameter DATA_IN_1_PARALLELISM_DIM_0 = 20, + parameter DATA_IN_1_PARALLELISM_DIM_1 = 20, + parameter DATA_IN_1_PARALLELISM_DIM_2 = 1, + + parameter DATA_OUT_0_PRECISION_0 = 8, + parameter DATA_OUT_0_PRECISION_1 = 4, + + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = 20, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = 20, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_2 = 1, + + parameter DATA_OUT_0_PARALLELISM_DIM_0 = 20, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = 20, + parameter DATA_OUT_0_PARALLELISM_DIM_2 = 1, + localparam BLOCK_SIZE = DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1 +) ( + input clk, + input rst, + // m -> mantissa, e -> exponent + input logic [DATA_IN_0_PRECISION_0-1:0] mdata_in_0[BLOCK_SIZE - 1:0], + input logic [DATA_IN_0_PRECISION_1-1:0] edata_in_0, + input data_in_0_valid, + output data_in_0_ready, + + input logic [DATA_IN_1_PRECISION_0-1:0] mdata_in_1[BLOCK_SIZE - 1:0], + input logic [DATA_IN_1_PRECISION_1-1:0] edata_in_1, + input data_in_1_valid, + output data_in_1_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_0 [BLOCK_SIZE - 1:0], + output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_0, + output data_out_0_valid, + input data_out_0_ready +); +localparam ADD_OUT_WIDTH = DATA_OUT_0_PRECISION_0 + 1; +localparam ADD_OUT_FRAC_WIDTH = DATA_OUT_0_PRECISION_0; + +// 
Internal signals for addition pipeline +logic add_out_valid, add_out_ready; + +// Signals for shift value calculation +logic [DATA_IN_0_PRECISION_1-1:0] max_value; +logic [DATA_IN_0_PRECISION_1-1:0] shift_value_0; +logic [DATA_IN_0_PRECISION_1-1:0] shift_value_1; + +// Shifted mantissa signals +logic [DATA_OUT_0_PRECISION_0-1:0] shifted_mdata_in_0[BLOCK_SIZE-1:0]; +logic [DATA_OUT_0_PRECISION_0-1:0] shifted_mdata_in_1[BLOCK_SIZE-1:0]; + +// Addition output signals +logic [ADD_OUT_WIDTH-1:0] madd_out_0[BLOCK_SIZE-1:0]; +logic [DATA_IN_0_PRECISION_1-1:0] eadd_out_0; + + initial begin + assert( + (DATA_IN_0_PRECISION_0==DATA_IN_1_PRECISION_0) & + (DATA_IN_0_PRECISION_1==DATA_IN_1_PRECISION_1) + ) else $fatal("Precision of input data 0 and input data 1 must be the same"); + end + + + join2 join_inst ( + .data_in_ready ({data_in_0_ready, data_in_1_ready}), + .data_in_valid ({data_in_0_valid, data_in_1_valid}), + .data_out_valid(add_out_valid), + .data_out_ready(add_out_ready) + ); + assign max_value = ($signed(edata_in_0) > $signed(edata_in_1))? 
edata_in_0 : edata_in_1; + assign shift_value_0 = max_value - edata_in_0; + assign shift_value_1 = max_value - edata_in_1; + optimized_right_shift #( + .IN_WIDTH(DATA_IN_0_PRECISION_0), + .SHIFT_WIDTH(DATA_IN_0_PRECISION_1), + .OUT_WIDTH(DATA_OUT_0_PRECISION_0), + .BLOCK_SIZE(BLOCK_SIZE) + ) ovshift_0_inst ( + .data_in(mdata_in_0), + .shift_value(shift_value), + .data_out(shifted_mdata_in_0) + ); + + optimized_right_shift #( + .IN_WIDTH(DATA_IN_1_PRECISION_0), + .SHIFT_WIDTH(DATA_IN_1_PRECISION_1), + .OUT_WIDTH(DATA_OUT_0_PRECISION_0), + .BLOCK_SIZE(BLOCK_SIZE) + ) ovshift_1_inst ( + .data_in(mdata_in_1), + .shift_value(shift_value_1), + .data_out(shifted_mdata_in_1) + ); + + for (genvar i = 0; i < BLOCK_SIZE; i++) begin + assign madd_out_0[i] = shifted_mdata_in_0[i] + shifted_mdata_in_1[i]; + end + assign eadd_out_0 = max_value; + + mxint_cast #( + .IN_MAN_WIDTH(ADD_OUT_WIDTH), + .IN_MAN_FRAC_WIDTH(ADD_OUT_FRAC_WIDTH), + .IN_EXP_WIDTH(DATA_IN_0_PRECISION_1), + .OUT_MAN_WIDTH(DATA_OUT_0_PRECISION_0), + .OUT_EXP_WIDTH(DATA_OUT_0_PRECISION_1), + .BLOCK_SIZE(BLOCK_SIZE) + ) cast_i ( + .clk(clk), + .rst(rst), + .mdata_in(madd_out_0), // Changed from skid_mdata_out + .edata_in(eadd_out_0), // Changed from skid_edata_out + .data_in_valid(add_out_valid), // Changed from skid_data_out_valid + .data_in_ready(add_out_ready), // Changed from skid_data_out_ready + .mdata_out(mdata_out_0), + .edata_out(edata_out_0), + .data_out_valid(data_out_0_valid), + .data_out_ready(data_out_0_ready) + ); + +endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_cast.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_cast.sv index c5d02830f..00655bbb9 100644 --- a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_cast.sv +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_cast.sv @@ -5,9 +5,11 @@ Description : MxInt Cast between Layers. 
*/ module mxint_cast #( parameter IN_MAN_WIDTH = 1, + parameter IN_MAN_FRAC_WIDTH = IN_MAN_WIDTH - 1, parameter IN_EXP_WIDTH = 1, parameter OUT_MAN_WIDTH = 1, parameter OUT_EXP_WIDTH = 1, + parameter ROUND_BITS = 4, parameter BLOCK_SIZE = 1 ) ( /* verilator lint_off UNUSEDSIGNAL */ @@ -24,110 +26,161 @@ module mxint_cast #( input logic data_out_ready ); //get max_abs_value of input - logic data_for_max_valid, data_for_max_ready, data_for_out_valid, data_for_out_ready; - split2 #() split_i ( - .data_in_valid (data_in_valid), - .data_in_ready (data_in_ready), - .data_out_valid({data_for_max_valid, data_for_out_valid}), - .data_out_ready({data_for_max_ready, data_for_out_ready}) - ); - logic [IN_MAN_WIDTH-1:0] mbuffer_data_for_out [BLOCK_SIZE-1:0]; - logic [IN_EXP_WIDTH-1:0] ebuffer_data_for_out; - logic buffer_data_for_out_valid, buffer_data_for_out_ready; - localparam LOG2_WIDTH = $clog2(IN_MAN_WIDTH) + 1; + + localparam LOSSLESSS_EDATA_WIDTH = + (LOG2_WIDTH > IN_EXP_WIDTH && LOG2_WIDTH > OUT_EXP_WIDTH) ? LOG2_WIDTH + 2 : + (IN_EXP_WIDTH > OUT_EXP_WIDTH) ? IN_EXP_WIDTH + 2: + OUT_EXP_WIDTH + 2; + + localparam SHIFT_WIDTH = (OUT_EXP_WIDTH > IN_EXP_WIDTH) ? 
OUT_EXP_WIDTH + 1 : IN_EXP_WIDTH + 1; + localparam SHIFT_DATA_WIDTH = OUT_MAN_WIDTH + 1; + + localparam CAST_WIDTH = OUT_MAN_WIDTH + ROUND_BITS; + + logic [IN_MAN_WIDTH - 1:0] mdata_for_max [BLOCK_SIZE - 1:0]; + logic data_for_max_valid, data_for_max_ready; + + logic [IN_MAN_WIDTH-1:0] mdata_for_out [BLOCK_SIZE-1:0]; + logic [IN_EXP_WIDTH-1:0] edata_for_out; + logic data_for_out_valid, data_for_out_ready; + + logic [CAST_WIDTH-1:0] mdata_for_cast [BLOCK_SIZE-1:0]; + logic [LOG2_WIDTH - 1:0] log2_max_value; logic log2_max_value_valid, log2_max_value_ready; - localparam EBIAS = 2 ** (OUT_EXP_WIDTH - 1); - localparam LOSSLESSS_EDATA_WIDTH = max(LOG2_WIDTH, IN_EXP_WIDTH, OUT_EXP_WIDTH) + 2; - localparam FIFO_DEPTH = $clog2(BLOCK_SIZE); logic [LOSSLESSS_EDATA_WIDTH - 1:0] edata_out_full; + logic [SHIFT_WIDTH - 1:0] shift_value; + // we dont need to implement full shift here, because we'll clamp in the final. + // in order to avoid shift loss, we set the shift_data_width = OUT_MAN_WIDTH + 1. + + logic [SHIFT_DATA_WIDTH - 1:0] shift_buffer_data_for_out[BLOCK_SIZE - 1:0]; + logic [SHIFT_DATA_WIDTH - 1:0] shift_data[BLOCK_SIZE - 1:0][SHIFT_DATA_WIDTH - 1:0]; + + // Add intermediate signals + logic [OUT_MAN_WIDTH-1:0] mdata_out_unreg [BLOCK_SIZE-1:0]; + logic [OUT_EXP_WIDTH-1:0] edata_out_unreg; + logic data_out_unreg_valid; + logic data_out_unreg_ready; + unpacked_mx_split2_with_data #( + .DEPTH($clog2(BLOCK_SIZE) + 1), + .MAN_WIDTH(IN_MAN_WIDTH), + .EXP_WIDTH(IN_EXP_WIDTH), + .IN_SIZE(BLOCK_SIZE) + ) data_in_0_unpacked_mx_split2_with_data_i ( + .clk(clk), + .rst(rst), + .mdata_in(mdata_in), + .edata_in(edata_in), + .data_in_valid(data_in_valid), + .data_in_ready(data_in_ready), + .fifo_mdata_out(mdata_for_out), + .fifo_edata_out(edata_for_out), + .fifo_data_out_valid(data_for_out_valid), + .fifo_data_out_ready(data_for_out_ready), + .straight_mdata_out(mdata_for_max), + .straight_edata_out(), + .straight_data_out_valid(data_for_max_valid), + 
.straight_data_out_ready(data_for_max_ready) + ); + log2_max_abs #( .IN_SIZE (BLOCK_SIZE), - .IN_WIDTH(IN_MAN_WIDTH), + .IN_WIDTH(IN_MAN_WIDTH) ) max_bas_i ( .clk, .rst, - .data_in(mdata_in), - .data_in_valid(data_for_max_valid), - .data_in_ready(data_for_max_ready), + .data_in_0(mdata_for_max), + .data_in_0_valid(data_for_max_valid), + .data_in_0_ready(data_for_max_ready), + .data_out_0(log2_max_value_unreg), + .data_out_0_valid(log2_max_value_valid_unreg), + .data_out_0_ready(log2_max_value_ready_unreg) + ); + + // Add register slice after log2_max_abs + logic [LOG2_WIDTH-1:0] log2_max_value_unreg; + logic log2_max_value_valid_unreg, log2_max_value_ready_unreg; + + skid_buffer #( + .DATA_WIDTH(LOG2_WIDTH) + ) log2_reg_slice ( + .clk(clk), + .rst(rst), + .data_in(log2_max_value_unreg), + .data_in_valid(log2_max_value_valid_unreg), + .data_in_ready(log2_max_value_ready_unreg), .data_out(log2_max_value), .data_out_valid(log2_max_value_valid), .data_out_ready(log2_max_value_ready) ); - if (FIFO_DEPTH == 0) begin - always_comb begin - mbuffer_data_for_out = mdata_in; - ebuffer_data_for_out = edata_in; - buffer_data_for_out_valid = data_for_out_valid; - data_for_out_ready = buffer_data_for_out_ready; - end - end else begin - unpacked_mx_fifo #( - .DEPTH(FIFO_DEPTH), - .MAN_WIDTH(IN_MAN_WIDTH), - .EXP_WIDTH(IN_EXP_WIDTH), - .IN_SIZE(BLOCK_SIZE) - ) ff_inst ( - .clk(clk), - .rst(rst), - .mdata_in(mdata_in), - .edata_in(edata_in), - .data_in_valid(data_for_out_valid), - .data_in_ready(data_for_out_ready), - .mdata_out(mbuffer_data_for_out), - .edata_out(ebuffer_data_for_out), - .data_out_valid(buffer_data_for_out_valid), - .data_out_ready(buffer_data_for_out_ready) - ); - end join2 #() join_inst ( - .data_in_ready ({buffer_data_for_out_ready, log2_max_value_ready}), - .data_in_valid ({buffer_data_for_out_valid, log2_max_value_valid}), - .data_out_valid(data_out_valid), - .data_out_ready(data_out_ready) + .data_in_ready ({data_for_out_ready, log2_max_value_ready}), + 
.data_in_valid ({data_for_out_valid, log2_max_value_valid}), + .data_out_valid(data_out_unreg_valid), + .data_out_ready(data_out_unreg_ready) ); - assign edata_out_full = $signed(log2_max_value) + $signed(ebuffer_data_for_out) - EBIAS; + + assign edata_out_full = $signed( + log2_max_value + ) + $signed( + edata_for_out + ) - IN_MAN_FRAC_WIDTH; + // clamp signed_clamp #( .IN_WIDTH (LOSSLESSS_EDATA_WIDTH), .OUT_WIDTH(OUT_EXP_WIDTH) ) exp_clamp ( .in_data (edata_out_full), - .out_data(edata_out) + .out_data(edata_out_unreg) ); - localparam SHIFT_WIDTH = max(OUT_EXP_WIDTH, IN_EXP_WIDTH, 0) + 1; - logic [SHIFT_WIDTH - 1:0] shift_value; - assign shift_value = $signed(edata_out) - $signed(ebuffer_data_for_out); - logic [SHIFT_WIDTH - 1:0] abs_shift_value; - assign abs_shift_value = (shift_value[SHIFT_WIDTH-1]) ? (~shift_value + 1) : shift_value; - - logic [IN_MAN_WIDTH + EBIAS - 1:0] shift_buffer_data_for_out[BLOCK_SIZE - 1:0]; - for (genvar i = 0; i < BLOCK_SIZE; i++) begin - for (genvar j = 0; j < 2 ** SHIFT_WIDTH; j++) - always_comb - if (abs_shift_value == j) - shift_buffer_data_for_out[i] = (shift_value[SHIFT_WIDTH-1]) ? 
$signed( - mbuffer_data_for_out[i] - ) <<< j : $signed( - mbuffer_data_for_out[i] - ) >>> j; - signed_clamp #( - .IN_WIDTH (IN_MAN_WIDTH + EBIAS), - .OUT_WIDTH(OUT_MAN_WIDTH) - ) exp_clamp ( - .in_data (shift_buffer_data_for_out[i]), - .out_data(mdata_out[i]) + + optimized_right_shift #( + .IN_WIDTH(IN_MAN_WIDTH), + .SHIFT_WIDTH(SHIFT_WIDTH), + .OUT_WIDTH(CAST_WIDTH), + .BLOCK_SIZE(BLOCK_SIZE) + ) ovshift_inst ( + .data_in(mdata_for_out), + .shift_value(shift_value), + .data_out(mdata_for_cast) + ); + + assign shift_value = $signed( + edata_out_unreg + ) - $signed( + edata_for_out + ) + IN_MAN_FRAC_WIDTH - (CAST_WIDTH - 1); + fixed_rounding #( + .IN_SIZE(BLOCK_SIZE), + .IN_WIDTH(CAST_WIDTH), + .IN_FRAC_WIDTH(CAST_WIDTH - 1), + .OUT_WIDTH(OUT_MAN_WIDTH), + .OUT_FRAC_WIDTH(OUT_MAN_WIDTH - 1) + ) fixed_cast_inst ( + .data_in(mdata_for_cast), + .data_out(mdata_out_unreg) // Changed to feed into skid buffer + ); + + // Add skid buffer at the end + mxint_skid_buffer #( + .DATA_PRECISION_0(OUT_MAN_WIDTH), + .DATA_PRECISION_1(OUT_EXP_WIDTH), + .IN_NUM(BLOCK_SIZE) + ) output_skid_buffer ( + .clk(clk), + .rst(rst), + .mdata_in(mdata_out_unreg), + .edata_in(edata_out_unreg), + .data_in_valid(data_out_unreg_valid), + .data_in_ready(data_out_unreg_ready), + .mdata_out(mdata_out), + .edata_out(edata_out), + .data_out_valid(data_out_valid), + .data_out_ready(data_out_ready) ); - end -endmodule -function [31:0] max; - input [31:0] x, y, z; - begin - if (x > y && x > z) max = x; - else if (y > z) max = y; - else max = z; - end -endfunction + +endmodule \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_circular.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_circular.sv index 563b2be5b..0396255ff 100644 --- a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_circular.sv +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_circular.sv @@ -27,33 +27,36 @@ module mxint_circular #( output 
data_out_valid, input data_out_ready ); - initial begin - assert (DATA_PRECISION_0 >= DATA_PRECISION_1) - else $fatal("DATA_PRECISION_0 must larger than PRECISION_1"); - end - logic [DATA_PRECISION_0 - 1:0] packed_data_in [IN_NUM:0]; - logic [DATA_PRECISION_0 - 1:0] packed_data_out[IN_NUM:0]; - always_comb begin : data_pack - packed_data_in[IN_NUM-1:0] = mdata_in; - packed_data_in[IN_NUM] = $signed(edata_in); - mdata_out = packed_data_out[IN_NUM-1:0]; - edata_out = packed_data_out[IN_NUM]; + logic [DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1 - 1:0] data_in_flatten[0:0]; + logic [DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1 - 1:0] data_out_flatten[0:0]; + logic [DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1 - 1:0] packed_data_out_flatten; + logic [DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1 - 1:0] packed_data_in_flatten; + assign data_in_flatten[0] = packed_data_in_flatten; + assign packed_data_out_flatten = data_out_flatten[0]; + for (genvar i = 0; i < IN_NUM; i++) begin : reshape + assign packed_data_in_flatten[(i+1)*DATA_PRECISION_0-1:i*DATA_PRECISION_0] = mdata_in[i]; end + assign packed_data_in_flatten[DATA_PRECISION_0*IN_NUM+DATA_PRECISION_1-1:DATA_PRECISION_0*IN_NUM] = edata_in; input_buffer #( - .DATA_WIDTH (DATA_PRECISION_0), - .IN_NUM (IN_NUM + 1), + .DATA_WIDTH (DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1), + .IN_NUM (1), .REPEAT (REPEAT), .BUFFER_SIZE(BUFFER_SIZE) ) mdata_in_0_buffer ( .clk, .rst, // Input streaming port - .data_in(packed_data_in), + .data_in(data_in_flatten), .data_in_valid(data_in_valid), .data_in_ready(data_in_ready), // Output streaming port - .data_out(packed_data_out), + .data_out(data_out_flatten), .data_out_valid(data_out_valid), .data_out_ready(data_out_ready) ); + for (genvar i = 0; i < IN_NUM; i++) begin : unreshape + assign mdata_out[i] = packed_data_out_flatten[(i+1)*DATA_PRECISION_0-1:i*DATA_PRECISION_0]; + end + assign edata_out = 
packed_data_out_flatten[DATA_PRECISION_0*IN_NUM+DATA_PRECISION_1-1:DATA_PRECISION_0*IN_NUM]; + endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_div.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_div.sv new file mode 100644 index 000000000..a58d1d326 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_div.sv @@ -0,0 +1,129 @@ +`timescale 1 ns / 1 ps +module mxint_div #( + parameter DATA_DIVIDEND_PRECISION_0 = 8, + parameter DATA_DIVIDEND_PRECISION_1 = 8, + parameter DATA_DIVISOR_PRECISION_0 = 8, + parameter DATA_DIVISOR_PRECISION_1 = 8, + parameter DATA_QUOTIENT_PRECISION_0 = 8, + parameter DATA_QUOTIENT_PRECISION_1 = 8, + parameter BLOCK_SIZE = 4, + parameter DATA_IN_0_DIM = 8 // Add this parameter +) ( + input logic clk, + input logic rst, + input logic [DATA_DIVIDEND_PRECISION_0-1:0] mdividend_data[BLOCK_SIZE - 1:0], + input logic [DATA_DIVIDEND_PRECISION_1-1:0] edividend_data, + input logic dividend_data_valid, + output logic dividend_data_ready, + input logic [DATA_DIVISOR_PRECISION_0-1:0] mdivisor_data[BLOCK_SIZE - 1:0], + input logic [DATA_DIVISOR_PRECISION_1-1:0] edivisor_data, + input logic divisor_data_valid, + output logic divisor_data_ready, + output logic [DATA_QUOTIENT_PRECISION_0-1:0] mquotient_data[BLOCK_SIZE - 1:0], + output logic [DATA_QUOTIENT_PRECISION_1-1:0] equotient_data, + output logic quotient_data_valid, + input logic quotient_data_ready +); + // Signal declarations + logic [DATA_DIVIDEND_PRECISION_0-1:0] straight_mdividend_data[BLOCK_SIZE - 1:0]; + logic straight_mdividend_data_valid; + logic straight_mdividend_data_ready; + + logic [DATA_DIVISOR_PRECISION_0-1:0] straight_mdivisor_data[BLOCK_SIZE - 1:0]; + logic straight_mdivisor_data_valid; + logic straight_mdivisor_data_ready; + + logic [DATA_DIVIDEND_PRECISION_1-1:0] fifo_edividend_data; + logic fifo_edividend_data_valid; + logic fifo_edividend_data_ready; + + logic [DATA_DIVISOR_PRECISION_1-1:0] 
fifo_edivisor_data; + logic fifo_edivisor_data_valid; + logic fifo_edivisor_data_ready; + + logic mquotient_data_valid; + logic mquotient_data_ready; + + // First split2 instance (for dividend) + unpacked_mx_split2_with_data #( + .DEPTH(DATA_IN_0_DIM), + .MAN_WIDTH(DATA_DIVIDEND_PRECISION_0), + .EXP_WIDTH(DATA_DIVIDEND_PRECISION_1), + .IN_SIZE(BLOCK_SIZE) + ) split2_dividend ( // Renamed instance + .clk(clk), + .rst(rst), + // Input from circular buffer + .mdata_in(mdividend_data), + .edata_in(edividend_data), + .data_in_valid(dividend_data_valid), + .data_in_ready(dividend_data_ready), + .fifo_mdata_out(), + .fifo_edata_out(fifo_edividend_data), + .fifo_data_out_valid(fifo_edividend_data_valid), + .fifo_data_out_ready(fifo_edividend_data_ready), + // Straight output path + .straight_mdata_out(straight_mdividend_data), // Connect to the same signals previously used + .straight_edata_out(), + .straight_data_out_valid(straight_mdividend_data_valid), + .straight_data_out_ready(straight_mdividend_data_ready) + ); + + // Second split2 instance (for divisor) + unpacked_mx_split2_with_data #( + .DEPTH(DATA_IN_0_DIM), + .MAN_WIDTH(DATA_DIVISOR_PRECISION_0), + .EXP_WIDTH(DATA_DIVISOR_PRECISION_1), + .IN_SIZE(BLOCK_SIZE) + ) split2_divisor ( // Renamed instance + .clk(clk), + .rst(rst), + // Input from circular buffer + .mdata_in(mdivisor_data), + .edata_in(edivisor_data), + .data_in_valid(divisor_data_valid), + .data_in_ready(divisor_data_ready), + .fifo_mdata_out(), + .fifo_edata_out(fifo_edivisor_data), + .fifo_data_out_valid(fifo_edivisor_data_valid), + .fifo_data_out_ready(fifo_edivisor_data_ready), + // Straight output path + .straight_mdata_out(straight_mdivisor_data), // Connect to the same signals previously used + .straight_edata_out(), + .straight_data_out_valid(straight_mdivisor_data_valid), + .straight_data_out_ready(straight_mdivisor_data_ready) + ); + // Integer division instance + int_div #( + .IN_NUM(BLOCK_SIZE), + .DIVIDEND_WIDTH(DATA_DIVIDEND_PRECISION_0), 
+ .DIVISOR_WIDTH(DATA_DIVISOR_PRECISION_0), + .QUOTIENT_WIDTH(DATA_QUOTIENT_PRECISION_0) + ) div_inst ( + .clk(clk), + .rst(rst), + .dividend_data(straight_mdividend_data), + .dividend_data_valid(straight_mdividend_data_valid), // Updated to use skid buffer + .dividend_data_ready(straight_mdividend_data_ready), // Updated to use skid buffer + .divisor_data(straight_mdivisor_data), + .divisor_data_valid(straight_mdivisor_data_valid), // Updated to use skid buffer + .divisor_data_ready(straight_mdivisor_data_ready), // Updated to use skid buffer + .quotient_data(mquotient_data), + .quotient_data_valid(mquotient_data_valid), + .quotient_data_ready(mquotient_data_ready) + ); + + // Exponent calculation and join logic + assign equotient_data = $signed(fifo_edividend_data) - $signed(fifo_edivisor_data); + + // Join the handshake signals + join_n #( + .NUM_HANDSHAKES(3) + ) join_n_inst ( + .data_in_valid({mquotient_data_valid, fifo_edividend_data_valid, fifo_edivisor_data_valid}), + .data_in_ready({mquotient_data_ready, fifo_edividend_data_ready, fifo_edivisor_data_ready}), + .data_out_valid(quotient_data_valid), + .data_out_ready(quotient_data_ready) + ); + +endmodule \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_dot_product.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_dot_product.sv index 8b24e9009..1ad76d17f 100644 --- a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_dot_product.sv +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_dot_product.sv @@ -31,89 +31,57 @@ module mxint_dot_product #( output data_out_0_valid, input data_out_0_ready ); - logic [DATA_OUT_0_PRECISION_0-1:0] mdp [BLOCK_SIZE-1:0]; - logic [DATA_OUT_0_PRECISION_1-1:0] edp; - logic mdp_valid, mdp_ready; - logic mdata_in_0_valid, mdata_in_0_ready; - logic edata_in_0_valid, edata_in_0_ready; + + logic [DATA_IN_0_PRECISION_0 - 1:0] mdata_in_0_reg_out[BLOCK_SIZE - 1:0]; + logic mdata_in_0_reg_out_valid, 
mdata_in_0_reg_out_ready; logic [DATA_IN_0_PRECISION_1 - 1:0] buffer_edata_in_0; logic buffer_edata_in_0_valid, buffer_edata_in_0_ready; - logic mweight_valid, mweight_ready; - logic eweight_valid, eweight_ready; + + logic [WEIGHT_PRECISION_0 - 1:0] mweight_reg_out[BLOCK_SIZE - 1:0]; + logic mweight_reg_out_valid, mweight_reg_out_ready; + logic [WEIGHT_PRECISION_1-1:0] buffer_eweight; logic buffer_eweight_valid, buffer_eweight_ready; + logic mdata_out_0_valid, mdata_out_0_ready; - split2 #() data_in_split_i ( - .data_in_valid (data_in_0_valid), - .data_in_ready (data_in_0_ready), - .data_out_valid({mdata_in_0_valid, edata_in_0_valid}), - .data_out_ready({mdata_in_0_ready, edata_in_0_ready}) - ); - logic [DATA_IN_0_PRECISION_0 - 1:0] mdata_in_0_reg_out[BLOCK_SIZE - 1:0]; - logic mdata_in_0_reg_out_valid, mdata_in_0_reg_out_ready; - logic [WEIGHT_PRECISION_0 - 1:0] mweight_reg_out[BLOCK_SIZE - 1:0]; - logic mweight_reg_out_valid, mweight_reg_out_ready; - unpacked_skid_buffer #( - .DATA_WIDTH(DATA_IN_0_PRECISION_0), - .IN_NUM(BLOCK_SIZE) - ) mdata_in_register_slice ( - .clk (clk), - .rst (rst), - .data_in (mdata_in_0), - .data_in_valid (mdata_in_0_valid), - .data_in_ready (mdata_in_0_ready), - .data_out (mdata_in_0_reg_out), - .data_out_valid(mdata_in_0_reg_out_valid), - .data_out_ready(mdata_in_0_reg_out_ready) - ); - unpacked_skid_buffer #( - .DATA_WIDTH(WEIGHT_PRECISION_0), - .IN_NUM(BLOCK_SIZE) - ) mweight_register_slice ( - .clk (clk), - .rst (rst), - .data_in (mweight), - .data_in_valid (mweight_valid), - .data_in_ready (mweight_ready), - .data_out (mweight_reg_out), - .data_out_valid(mweight_reg_out_valid), - .data_out_ready(mweight_reg_out_ready) - ); - split2 #() weight_split_i ( - .data_in_valid (weight_valid), - .data_in_ready (weight_ready), - .data_out_valid({mweight_valid, eweight_valid}), - .data_out_ready({mweight_ready, eweight_ready}) - ); - fifo #( - .DEPTH($clog2(BLOCK_SIZE)), - .DATA_WIDTH(DATA_IN_0_PRECISION_1) - ) data_in_0_ff_inst ( + 
mxint_straightm_fifoe #( + .DEPTH($clog2(BLOCK_SIZE) + 1), + .MAN_WIDTH(DATA_IN_0_PRECISION_0), + .EXP_WIDTH(DATA_IN_0_PRECISION_1), + .IN_SIZE(BLOCK_SIZE) + ) data_in_0_split_m_e ( .clk(clk), .rst(rst), - .in_data(edata_in_0), - .in_valid(edata_in_0_valid), - .in_ready(edata_in_0_ready), - .out_data(buffer_edata_in_0), - .out_valid(buffer_edata_in_0_valid), - .out_ready(buffer_edata_in_0_ready), - .empty(), - .full() + .mdata_in(mdata_in_0), + .edata_in(edata_in_0), + .data_in_valid(data_in_0_valid), + .data_in_ready(data_in_0_ready), + .fifo_edata_out(buffer_edata_in_0), + .fifo_edata_out_valid(buffer_edata_in_0_valid), + .fifo_edata_out_ready(buffer_edata_in_0_ready), + .straight_mdata_out(mdata_in_0_reg_out), + .straight_mdata_out_valid(mdata_in_0_reg_out_valid), + .straight_mdata_out_ready(mdata_in_0_reg_out_ready) ); - fifo #( - .DEPTH($clog2(BLOCK_SIZE)), - .DATA_WIDTH(WEIGHT_PRECISION_1) - ) weight_ff_inst ( + + mxint_straightm_fifoe #( + .DEPTH($clog2(BLOCK_SIZE) + 1), + .MAN_WIDTH(WEIGHT_PRECISION_0), + .EXP_WIDTH(WEIGHT_PRECISION_1), + .IN_SIZE(BLOCK_SIZE) + ) weight_split_m_e ( .clk(clk), .rst(rst), - .in_data(eweight), - .in_valid(eweight_valid), - .in_ready(eweight_ready), - .out_data(buffer_eweight), - .out_valid(buffer_eweight_valid), - .out_ready(buffer_eweight_ready), - .empty(), - .full() + .mdata_in(mweight), + .edata_in(eweight), + .data_in_valid(weight_valid), + .data_in_ready(weight_ready), + .fifo_edata_out(buffer_eweight), + .fifo_edata_out_valid(buffer_eweight_valid), + .fifo_edata_out_ready(buffer_eweight_ready), + .straight_mdata_out(mweight_reg_out), + .straight_mdata_out_valid(mweight_reg_out_valid), + .straight_mdata_out_ready(mweight_reg_out_ready) ); assign edata_out_0 = $signed(buffer_eweight) + $signed(buffer_edata_in_0); fixed_dot_product #( @@ -133,6 +101,7 @@ module mxint_dot_product #( .data_out_valid(mdata_out_0_valid), .data_out_ready(mdata_out_0_ready) ); + join_n #( .NUM_HANDSHAKES(3) ) join_inst ( diff --git 
`timescale 1ns / 1ps
/*
 * mxint_exp: exponential front-end for an MXINT block.
 *
 * Takes BLOCK_SIZE mantissas that share one exponent (MXINT) and, per
 * element, produces a power-of-two exponent n (edata_out_0) and a mantissa
 * looked up from a 2^r table (mdata_out_0), using
 *   e^x = 2^(x * log2(e)) = 2^n * 2^r   (n integer, r fractional).
 * In this first version the width of n is fixed by DATA_OUT_EXP_WIDTH
 * (default gives an n range of roughly [-8:7]).
 *
 * The valid/ready handshake is passed straight through combinationally;
 * this module contains no buffering.
 */
module mxint_exp #(
    parameter DATA_IN_MAN_WIDTH = 8,
    parameter DATA_IN_EXP_WIDTH = 3,
    parameter BLOCK_SIZE = 16,
    parameter DATA_OUT_MAN_WIDTH = 10,
    parameter DATA_OUT_EXP_WIDTH = 4,
    parameter DATA_R_WIDTH = 7
) (
    /* verilator lint_off UNUSEDSIGNAL */
    input rst,
    input clk,
    input logic [DATA_IN_MAN_WIDTH-1:0] mdata_in_0[BLOCK_SIZE - 1:0],
    input logic [DATA_IN_EXP_WIDTH-1:0] edata_in_0,
    input logic data_in_0_valid,
    output logic data_in_0_ready,

    output logic [DATA_OUT_MAN_WIDTH-1:0] mdata_out_0[BLOCK_SIZE - 1 : 0],
    output logic [DATA_OUT_EXP_WIDTH-1:0] edata_out_0[BLOCK_SIZE - 1 : 0],
    output logic data_out_0_valid,
    input logic data_out_0_ready
);

  // log2(e) ~= 1.4427 encoded as an MXINT constant: 92/2^7 * 2^1 = 1.4375.
  localparam signed MLOG2_E = 8'd92;
  localparam signed ELOG2_E = 4'd1;

  localparam LOG2_E_MAN_WIDTH = 8;
  localparam LOG2_E_EXP_WIDTH = 4;

  // Widths of the product x * log2(e).
  localparam DATA_LOG2_E_MAN_WIDTH = DATA_IN_MAN_WIDTH + LOG2_E_MAN_WIDTH;
  localparam DATA_LOG2_E_MAN_FRAC_WIDTH = DATA_IN_MAN_WIDTH - 1 + LOG2_E_MAN_WIDTH - 1;
  localparam DATA_LOG2_E_EXP_WIDTH = DATA_IN_EXP_WIDTH;

  localparam SHIFT_WIDTH = $clog2(DATA_LOG2_E_MAN_WIDTH) + 2;
  localparam DATA_N_WIDTH = DATA_OUT_EXP_WIDTH;

  // Fixed-point layout of the aligned product: DATA_N_WIDTH integer bits
  // stacked on DATA_R_WIDTH-1 fractional bits.
  localparam CASTED_DATA_LOG2_E_WIDTH = DATA_N_WIDTH + DATA_R_WIDTH - 1;
  localparam CASTED_DATA_LOG2_E_FRAC_WIDTH = DATA_R_WIDTH - 1;

  logic [DATA_LOG2_E_MAN_WIDTH - 1:0] mdata_in_0_log2_e[BLOCK_SIZE - 1:0];
  logic [DATA_LOG2_E_EXP_WIDTH - 1:0] edata_in_0_log2_e;

  logic signed [SHIFT_WIDTH - 1:0] shift_value;
  logic [CASTED_DATA_LOG2_E_WIDTH - 1:0] casted_data_in_0_log2_e[BLOCK_SIZE - 1:0];

  logic [DATA_N_WIDTH - 1:0] temp_data_out_n[BLOCK_SIZE - 1 : 0];
  logic [DATA_R_WIDTH - 1:0] temp_data_out_r[BLOCK_SIZE - 1 : 0];

  // x * log2(e): multiply each mantissa, add the exponents once per block.
  for (genvar i = 0; i < BLOCK_SIZE; i++) begin
    assign mdata_in_0_log2_e[i] = $signed(mdata_in_0[i]) * MLOG2_E;
  end
  assign edata_in_0_log2_e = $signed(edata_in_0) + ELOG2_E;

  // The product has frac width DATA_LOG2_E_MAN_FRAC_WIDTH; align it to the
  // fixed CASTED_DATA_LOG2_E_FRAC_WIDTH layout, folding in the exponent
  // (real value = man * 2**exp, so a positive exponent reduces the shift).
  assign shift_value = DATA_LOG2_E_MAN_FRAC_WIDTH - CASTED_DATA_LOG2_E_FRAC_WIDTH - $signed(edata_in_0_log2_e);

  optimized_right_shift #(
      .IN_WIDTH(DATA_LOG2_E_MAN_WIDTH),
      .SHIFT_WIDTH(SHIFT_WIDTH),
      .OUT_WIDTH(CASTED_DATA_LOG2_E_WIDTH),
      .BLOCK_SIZE(BLOCK_SIZE)
  ) ovshift_inst (
      .data_in(mdata_in_0_log2_e),
      .shift_value(shift_value),
      .data_out(casted_data_in_0_log2_e)
  );

  // Split the aligned value into integer part n and fractional part r,
  // then look r up in the 2^r table.
  logic [DATA_OUT_MAN_WIDTH - 1:0] mexp[BLOCK_SIZE - 1:0];
  for (genvar i = 0; i < BLOCK_SIZE; i++) begin : power2_lut_inst
    // Integer part: the top DATA_N_WIDTH bits (truncation == floor in
    // two's complement).
    assign temp_data_out_n[i] = casted_data_in_0_log2_e[i][CASTED_DATA_LOG2_E_WIDTH - 1 : DATA_R_WIDTH - 1];
    // FIX: the original indexed bit [DATA_N_WIDTH + DATA_R_WIDTH - 1],
    // which is one past the MSB of the (DATA_N_WIDTH + DATA_R_WIDTH - 1)-wide
    // vector — an out-of-range bit-select that reads 'x' in simulation.
    // Use the actual MSB. NOTE(review): this top bit is presumably meant as
    // the sign of the fractional term fed to power2_lut — confirm against
    // power2_lut's expected input range.
    assign temp_data_out_r[i] = {
      casted_data_in_0_log2_e[i][CASTED_DATA_LOG2_E_WIDTH - 1],
      casted_data_in_0_log2_e[i][DATA_R_WIDTH - 2 : 0]
    };
    power2_lut #(
        .DATA_IN_0_PRECISION_0(DATA_R_WIDTH),
        .DATA_IN_0_PRECISION_1(DATA_R_WIDTH - 1),
        .DATA_OUT_0_PRECISION_0(DATA_OUT_MAN_WIDTH),
        .DATA_OUT_0_PRECISION_1(DATA_OUT_MAN_WIDTH - 2)
    ) power2_lut_inst (
        .data_in_0(temp_data_out_r[i]),
        .data_out_0(mexp[i])
    );
    assign mdata_out_0[i] = mexp[i];
    assign edata_out_0[i] = temp_data_out_n[i];
  end

  // Combinational handshake pass-through.
  assign data_out_0_valid = data_in_0_valid;
  assign data_in_0_ready  = data_out_0_ready;

endmodule
`timescale 1ns / 1ps
/*
 * mxint_fork2: fork one MXINT stream (BLOCK_SIZE mantissas + one shared
 * exponent) into two consumers.  Output 0 goes through a FIFO deep enough
 * to hold one whole tensor; output 1 is the straight (combinational-ish)
 * branch.  Both carry identical data; split2_with_data handles the
 * two-consumer handshake.
 */
module mxint_fork2 #(
    parameter DATA_IN_0_PRECISION_0 = 8,  // mantissa width
    parameter DATA_IN_0_PRECISION_1 = 8,  // exponent width

    parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 20,
    parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20,
    parameter DATA_IN_0_TENSOR_SIZE_DIM_2 = 1,
    parameter DATA_IN_0_PARALLELISM_DIM_0 = 20,
    parameter DATA_IN_0_PARALLELISM_DIM_1 = 20,
    parameter DATA_IN_0_PARALLELISM_DIM_2 = 1,

    parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0,
    parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1,
    parameter DATA_OUT_1_PRECISION_0 = DATA_IN_0_PRECISION_0,
    parameter DATA_OUT_1_PRECISION_1 = DATA_IN_0_PRECISION_1,

    parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0,
    parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1,
    parameter DATA_OUT_0_TENSOR_SIZE_DIM_2 = DATA_IN_0_TENSOR_SIZE_DIM_2,
    parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0,
    parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1,
    parameter DATA_OUT_0_PARALLELISM_DIM_2 = DATA_IN_0_PARALLELISM_DIM_2,

    parameter DATA_OUT_1_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0,
    parameter DATA_OUT_1_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1,
    parameter DATA_OUT_1_TENSOR_SIZE_DIM_2 = DATA_IN_0_TENSOR_SIZE_DIM_2,
    parameter DATA_OUT_1_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0,
    parameter DATA_OUT_1_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1,
    parameter DATA_OUT_1_PARALLELISM_DIM_2 = DATA_IN_0_PARALLELISM_DIM_2,

    // One tensor's worth of transfers on the FIFO branch.
    localparam FIFO_DEPTH = DATA_OUT_0_TENSOR_SIZE_DIM_0 * DATA_OUT_0_TENSOR_SIZE_DIM_1 / (DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1),
    localparam BLOCK_SIZE = DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1
) (
    input wire clk,
    input wire rst,

    // Input interface
    input logic [DATA_IN_0_PRECISION_0-1:0] mdata_in_0[BLOCK_SIZE-1:0],
    input logic [DATA_IN_0_PRECISION_1-1:0] edata_in_0,
    input logic data_in_0_valid,
    output logic data_in_0_ready,

    // FIFO output interface (output 0)
    output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_0[BLOCK_SIZE-1:0],
    output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_0,
    output logic data_out_0_valid,
    input logic data_out_0_ready,

    // Straight output interface (output 1)
    output logic [DATA_OUT_1_PRECISION_0-1:0] mdata_out_1[BLOCK_SIZE-1:0],
    output logic [DATA_OUT_1_PRECISION_1-1:0] edata_out_1,
    output logic data_out_1_valid,
    input logic data_out_1_ready
);

  // Single flat payload: mantissa i at [i*W +: W], exponent on top.
  localparam PAYLOAD_WIDTH = DATA_IN_0_PRECISION_0 * BLOCK_SIZE + DATA_IN_0_PRECISION_1;

  logic [PAYLOAD_WIDTH-1:0] packed_in;
  logic [PAYLOAD_WIDTH-1:0] packed_fifo_out;
  logic [PAYLOAD_WIDTH-1:0] packed_straight_out;

  // Pack the input block into one vector.
  for (genvar gi = 0; gi < BLOCK_SIZE; gi++) begin : reshape
    assign packed_in[gi*DATA_IN_0_PRECISION_0+:DATA_IN_0_PRECISION_0] = mdata_in_0[gi];
  end
  assign packed_in[DATA_IN_0_PRECISION_0*BLOCK_SIZE+:DATA_IN_0_PRECISION_1] = edata_in_0;

  // Two-way split with buffering on branch 0.
  split2_with_data #(
      .DATA_WIDTH(PAYLOAD_WIDTH),
      .FIFO_DEPTH(FIFO_DEPTH)
  ) split2_with_data_i (
      .clk(clk),
      .rst(rst),
      .data_in(packed_in),
      .data_in_valid(data_in_0_valid),
      .data_in_ready(data_in_0_ready),
      .fifo_data_out(packed_fifo_out),
      .fifo_data_out_valid(data_out_0_valid),
      .fifo_data_out_ready(data_out_0_ready),
      .straight_data_out(packed_straight_out),
      .straight_data_out_valid(data_out_1_valid),
      .straight_data_out_ready(data_out_1_ready)
  );

  // Unpack both branches with the same layout used when packing.
  for (genvar gi = 0; gi < BLOCK_SIZE; gi++) begin : unreshape_fifo
    assign mdata_out_0[gi] = packed_fifo_out[gi*DATA_OUT_0_PRECISION_0+:DATA_OUT_0_PRECISION_0];
  end
  assign edata_out_0 = packed_fifo_out[DATA_OUT_0_PRECISION_0*BLOCK_SIZE+:DATA_OUT_0_PRECISION_1];

  for (genvar gi = 0; gi < BLOCK_SIZE; gi++) begin : unreshape_straight
    assign mdata_out_1[gi] = packed_straight_out[gi*DATA_OUT_1_PRECISION_0+:DATA_OUT_1_PRECISION_0];
  end
  assign edata_out_1 = packed_straight_out[DATA_OUT_1_PRECISION_0*BLOCK_SIZE+:DATA_OUT_1_PRECISION_1];

endmodule
`timescale 1ns / 1ps
/*
 Currently we don't support parallelism here, because in attention this
 operator is not actually applied in parallel.
*/
// Elementwise GELU on one MXINT value.  The mantissa is shifted into a
// fixed-point window, passed through a GELU lookup table, and the table
// output is shifted back; the shared exponent is forwarded unchanged.
module mxint_gelu_element #(
    parameter IN_MAN_WIDTH = 4,    // input mantissa width
    parameter IN_EXP_WIDTH = 8,    // shared exponent width
    parameter OUT_MAN_WIDTH = 4,   // LUT/output mantissa width
    parameter OUT_EXP_WIDTH = 8    // output exponent width
) (
    input logic[IN_MAN_WIDTH-1:0] mdata_in_0,
    input logic[IN_EXP_WIDTH-1:0] edata_in_0,
    output logic[OUT_MAN_WIDTH-1:0] mdata_out_0,
    output logic[OUT_EXP_WIDTH-1:0] edata_out_0
);
  // Fixed-point window addressed by the LUT: two extra integer bits.
  localparam VALID_WIDTH = IN_MAN_WIDTH + 2;
  // Two's-complement saturation limits of that window.
  localparam logic[VALID_WIDTH-1:0] MIN_VAL = -(2 ** (VALID_WIDTH - 1));
  localparam logic[VALID_WIDTH-1:0] MAX_VAL = (2 ** (VALID_WIDTH - 1)) - 1;

  logic[VALID_WIDTH - 1:0] real_x ;
  logic[VALID_WIDTH - 1:0] real_x_v [0:0];

  logic[OUT_MAN_WIDTH-1:0] lut_out;
  logic [OUT_MAN_WIDTH-1:0] shifted_lut_out_v[0:0];
  logic [OUT_MAN_WIDTH-1:0] shifted_lut_out;

  // data_in_shift = -e maps the mantissa into the LUT's fixed-point domain;
  // hash_out_shift = e - 2 undoes that scaling on the LUT output.
  // NOTE(review): for negative shift amounts the behaviour depends on
  // optimized_right_shift — confirm it implements a left shift there.
  logic signed [IN_EXP_WIDTH-1:0] data_in_shift;
  logic signed [IN_EXP_WIDTH-1:0] hash_out_shift;
  assign data_in_shift = - edata_in_0;
  assign hash_out_shift = edata_in_0 - 2;

  optimized_right_shift #(
      .IN_WIDTH(IN_MAN_WIDTH),
      .SHIFT_WIDTH(IN_EXP_WIDTH),
      .OUT_WIDTH(VALID_WIDTH),
      .BLOCK_SIZE(1)
  ) data_in_shift_inst (
      .data_in({mdata_in_0}),
      .shift_value(data_in_shift),
      .data_out(real_x_v)
  );

  assign real_x = real_x_v[0];

  gelu_lut #(
      .DATA_IN_0_PRECISION_0(VALID_WIDTH),
      .DATA_IN_0_PRECISION_1(VALID_WIDTH - 3),
      .DATA_OUT_0_PRECISION_0(OUT_MAN_WIDTH),
      .DATA_OUT_0_PRECISION_1(OUT_MAN_WIDTH - 3)
  ) gelu_lut_inst (
      .data_in_0(real_x),
      .data_out_0(lut_out)
  );

  optimized_right_shift #(
      .IN_WIDTH(OUT_MAN_WIDTH),
      .SHIFT_WIDTH(IN_EXP_WIDTH),
      .OUT_WIDTH(OUT_MAN_WIDTH),
      .BLOCK_SIZE(1)
  ) lut_out_shift_inst (
      .data_in({lut_out}),
      .shift_value(hash_out_shift),
      .data_out(shifted_lut_out_v)
  );
  assign shifted_lut_out = shifted_lut_out_v[0];

  // Saturation handling: at the positive rail GELU(x) ~= x, so pass the
  // input through; at the negative rail GELU(x) ~= 0.  Otherwise use the
  // shifted LUT value.  The exponent is forwarded unchanged in all cases.
  always_comb begin
    if (real_x == MAX_VAL) begin
      mdata_out_0 = mdata_in_0;
      edata_out_0 = edata_in_0;
    end else if (real_x == MIN_VAL) begin
      mdata_out_0 = 0;
      edata_out_0 = edata_in_0;
    end else begin
      mdata_out_0 = shifted_lut_out;
      edata_out_0 = edata_in_0;
    end
  end
endmodule

// Block-level GELU: input register slice -> one mxint_gelu_element per
// parallel lane (all lanes share the exponent) -> mxint_cast back to the
// output MXINT precision.
module mxint_gelu #(
    /* verilator lint_off UNUSEDPARAM */
    parameter DATA_IN_0_PRECISION_0 = 8,
    parameter DATA_IN_0_PRECISION_1 = 4,
    parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 10,
    parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 1,
    parameter DATA_IN_0_TENSOR_SIZE_DIM_2 = 1,
    parameter DATA_IN_0_PARALLELISM_DIM_0 = 1,
    parameter DATA_IN_0_PARALLELISM_DIM_1 = 1,
    parameter DATA_IN_0_PARALLELISM_DIM_2 = 1,

    parameter IN_0_DEPTH = $rtoi($ceil(DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0)),

    parameter HASH_OUT_WIDTH = 8,   // mantissa width out of the GELU elements
    parameter DATA_OUT_0_PRECISION_0 = 8,
    parameter DATA_OUT_0_PRECISION_1 = 4,
    parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0,
    parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1,
    parameter DATA_OUT_0_TENSOR_SIZE_DIM_2 = DATA_IN_0_TENSOR_SIZE_DIM_2,
    parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0,
    parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1,
    parameter DATA_OUT_0_PARALLELISM_DIM_2 = DATA_IN_0_PARALLELISM_DIM_2
) (
    /* verilator lint_off UNUSEDSIGNAL */
    input clk,
    input rst,

    input logic data_in_0_valid,
    output logic data_in_0_ready,
    input logic [DATA_IN_0_PRECISION_0-1:0] mdata_in_0[DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0],
    input logic [DATA_IN_0_PRECISION_1-1:0] edata_in_0,

    output logic data_out_0_valid,
    input logic data_out_0_ready,
    output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_0[DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0],
    output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_0
);
  localparam HASH_OUT_FRAC_WIDTH = HASH_OUT_WIDTH - 1;

  // Registered copy of the input bundle (timing isolation at the boundary).
  logic [DATA_IN_0_PRECISION_0-1:0] reg_mdata_in[DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0];
  logic [DATA_IN_0_PRECISION_1-1:0] reg_edata_in;
  logic reg_data_in_valid, reg_data_in_ready;

  // Register slice at the input.
  mxint_register_slice #(
      .DATA_PRECISION_0(DATA_IN_0_PRECISION_0),
      .DATA_PRECISION_1(DATA_IN_0_PRECISION_1),
      .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1)
  ) input_reg_slice (
      .clk(clk),
      .rst(rst),
      .mdata_in(mdata_in_0),
      .edata_in(edata_in_0),
      .data_in_valid(data_in_0_valid),
      .data_in_ready(data_in_0_ready),
      .mdata_out(reg_mdata_in),
      .edata_out(reg_edata_in),
      .data_out_valid(reg_data_in_valid),
      .data_out_ready(reg_data_in_ready)
  );

  // One combinational GELU element per lane; the per-element exponent
  // output is unused because all lanes share reg_edata_in.
  logic [HASH_OUT_WIDTH-1:0] gelu_mdata_out [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0];
  logic [DATA_IN_0_PRECISION_1-1:0] gelu_edata_out;

  for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1; i++) begin : gelu
    mxint_gelu_element #(
        .IN_MAN_WIDTH(DATA_IN_0_PRECISION_0),
        .IN_EXP_WIDTH(DATA_IN_0_PRECISION_1),
        .OUT_MAN_WIDTH(HASH_OUT_WIDTH),
        .OUT_EXP_WIDTH(DATA_IN_0_PRECISION_1)
    ) gelu_inst (
        .mdata_in_0(reg_mdata_in[i]),
        .edata_in_0(reg_edata_in),
        .mdata_out_0(gelu_mdata_out[i]),
        .edata_out_0()
    );
  end
  assign gelu_edata_out = reg_edata_in;

  // Renormalize the LUT results back to the output MXINT precision; this
  // stage also carries the handshake to the output.
  mxint_cast #(
      .IN_MAN_WIDTH(HASH_OUT_WIDTH),
      .IN_MAN_FRAC_WIDTH(HASH_OUT_FRAC_WIDTH),
      .IN_EXP_WIDTH(DATA_IN_0_PRECISION_1),
      .OUT_MAN_WIDTH(DATA_OUT_0_PRECISION_0),
      .OUT_EXP_WIDTH(DATA_OUT_0_PRECISION_1),
      .BLOCK_SIZE(DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1)
  ) cast_inst (
      .clk(clk),
      .rst(rst),
      .mdata_in(gelu_mdata_out),
      .edata_in(gelu_edata_out),
      .data_in_valid(reg_data_in_valid),
      .data_in_ready(reg_data_in_ready),
      .mdata_out(mdata_out_0),
      .edata_out(edata_out_0),
      .data_out_valid(data_out_0_valid),
      .data_out_ready(data_out_0_ready)
  );

endmodule
`timescale 1ns / 1ps
/*
 This module takes an MXINT block (mantissas plus one shared exponent) and
 emits each element as a plain rounded integer.
 In this first version the output width is fixed by DATA_OUT_WIDTH
 (default 8, i.e. an output range of [-128:127]).
*/
module mxint_hardware_round #(
    /* verilator lint_off UNUSEDPARAM */
    parameter DATA_IN_MAN_WIDTH = 4,
    parameter DATA_IN_MAN_FRAC_WIDTH = 4,  // fractional bits of the input mantissa
    parameter DATA_IN_EXP_WIDTH = 8,
    parameter BLOCK_SIZE = 16,
    parameter DATA_OUT_WIDTH = 8
) (
    /* verilator lint_off UNUSEDSIGNAL */
    input rst,
    input clk,
    input logic [DATA_IN_MAN_WIDTH-1:0] mdata_in_0[BLOCK_SIZE - 1:0],
    input logic [DATA_IN_EXP_WIDTH-1:0] edata_in_0,
    input logic data_in_0_valid,
    output logic data_in_0_ready,

    output logic [DATA_OUT_WIDTH-1:0] data_out_0[BLOCK_SIZE - 1 : 0],
    output logic data_out_0_valid,
    input logic data_out_0_ready
);

  localparam SHIFT_WIDTH = DATA_IN_EXP_WIDTH + 1;

  logic [SHIFT_WIDTH - 1:0] shift_value;
  logic [DATA_IN_MAN_WIDTH - 1:0] mid_n[BLOCK_SIZE - 1:0];
  logic [DATA_IN_MAN_WIDTH-1:0] shift_result[BLOCK_SIZE-1:0];

  // Integer value = mantissa * 2^(e - frac_width), realised as a right
  // shift by (frac_width - e).
  assign shift_value = DATA_IN_MAN_FRAC_WIDTH - $signed(edata_in_0);
  optimized_right_shift #(
      .IN_WIDTH(DATA_IN_MAN_WIDTH),
      .SHIFT_WIDTH(SHIFT_WIDTH),
      .OUT_WIDTH(DATA_IN_MAN_WIDTH),
      .BLOCK_SIZE(BLOCK_SIZE)
  ) ovshift_inst (
      .data_in(mdata_in_0),
      .shift_value(shift_value),
      .data_out(shift_result)
  );

  for (genvar i = 0; i < BLOCK_SIZE; i++) begin
    always_comb begin
      // shift >= frac_width means the exponent is non-positive, so the
      // value's magnitude is below 1: force the arithmetic-shift limits
      // (-1 for negative mantissas, 0 otherwise) instead of relying on
      // the shifter's behaviour at large shifts.
      if ($signed(shift_value) >= DATA_IN_MAN_FRAC_WIDTH) begin
        mid_n[i] = (mdata_in_0[i][DATA_IN_MAN_WIDTH-1]) ? -1 : 0;
      end else begin
        mid_n[i] = shift_result[i];
      end
    end
  end

  // Saturate each integer into the DATA_OUT_WIDTH output range.
  logic [DATA_OUT_WIDTH - 1:0] clamped_n[BLOCK_SIZE - 1:0];
  for (genvar i = 0; i < BLOCK_SIZE; i++) begin
    signed_clamp #(
        .IN_WIDTH (DATA_IN_MAN_WIDTH),
        .OUT_WIDTH(DATA_OUT_WIDTH)
    ) n_clamp (
        .in_data (mid_n[i]),
        .out_data(clamped_n[i])
    );
  end

  // Output register slice; carries the stream handshake.
  unpacked_register_slice #(
      .DATA_WIDTH(DATA_OUT_WIDTH),
      .IN_SIZE (BLOCK_SIZE)
  ) register_slice_i (
      .clk(clk),
      .rst(rst),

      .data_in(clamped_n),
      .data_in_valid(data_in_0_valid),
      .data_in_ready(data_in_0_ready),

      .data_out(data_out_0),
      .data_out_valid(data_out_0_valid),
      .data_out_ready(data_out_0_ready)
  );
endmodule
`timescale 1ns / 1ps
/*
 * mxint_layernorm: LayerNorm over dimension 0 of an MXINT tensor.
 *
 * Each row (dim-1 lane) is normalised by mxint_layernorm_1d; when
 * ELEMENTWISE_AFFINE == 1 the normalised value is multiplied by a
 * per-channel weight and a per-channel bias is added (both streamed as
 * MXINT and replayed per row via mxint_circular), then everything is cast
 * back to the output MXINT precision.
 */
module mxint_layernorm #(
    // Dimensions
    parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 4,
    parameter DATA_IN_0_PARALLELISM_DIM_0 = 2,
    parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 4,
    parameter DATA_IN_0_TENSOR_SIZE_DIM_2 = 1,
    parameter DATA_IN_0_PARALLELISM_DIM_1 = 2,
    parameter DATA_IN_0_PARALLELISM_DIM_2 = 1,

    // Data widths
    parameter DATA_IN_0_PRECISION_0 = 8,
    parameter DATA_IN_0_PRECISION_1 = 4,
    parameter WEIGHT_PRECISION_0 = 8,
    parameter WEIGHT_PRECISION_1 = 4,
    parameter BIAS_PRECISION_0 = 8,
    parameter BIAS_PRECISION_1 = 4,
    parameter ELEMENTWISE_AFFINE = 0,
    parameter HAS_BIAS = 1,

    parameter ISQRT_IN_PRECISION_0 = 8,   // PRECISION_0 for ISQRT is the integer width
    parameter ISQRT_IN_PRECISION_1 = 8,   // PRECISION_1 for ISQRT is the integer frac width
    parameter ISQRT_OUT_PRECISION_0 = 8,
    parameter ISQRT_OUT_PRECISION_1 = 4,
    parameter NORM_OUT_PRECISION_0 = 8,
    parameter NORM_OUT_FRAC_WIDTH = 4,
    parameter NORM_OUT_PRECISION_1 = 4,

    parameter BIAS_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0,
    parameter BIAS_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0,
    parameter BIAS_TENSOR_SIZE_DIM_1 = 1,
    parameter BIAS_PARALLELISM_DIM_1 = 1,
    parameter WEIGHT_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0,
    parameter WEIGHT_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0,
    parameter WEIGHT_TENSOR_SIZE_DIM_1 = 1,
    parameter WEIGHT_PARALLELISM_DIM_1 = 1,
    parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0,
    parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0,
    parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1,
    parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1,
    parameter DATA_OUT_0_TENSOR_SIZE_DIM_2 = DATA_IN_0_TENSOR_SIZE_DIM_2,
    parameter DATA_OUT_0_PARALLELISM_DIM_2 = DATA_IN_0_PARALLELISM_DIM_2,
    parameter DATA_OUT_0_PRECISION_0 = 8,
    parameter DATA_OUT_0_PRECISION_1 = 4
) (
    input logic clk,
    input logic rst,

    input logic [DATA_IN_0_PRECISION_0-1:0] mdata_in_0 [DATA_IN_0_PARALLELISM_DIM_1*DATA_IN_0_PARALLELISM_DIM_0-1:0],
    input logic [DATA_IN_0_PRECISION_1-1:0] edata_in_0,
    input logic data_in_0_valid,
    output logic data_in_0_ready,

    input logic [WEIGHT_PRECISION_0-1:0] mweight [DATA_IN_0_PARALLELISM_DIM_0 - 1 : 0],
    input logic [WEIGHT_PRECISION_1-1:0] eweight,
    input logic weight_valid,
    output logic weight_ready,

    input logic [BIAS_PRECISION_0-1:0] mbias [DATA_IN_0_PARALLELISM_DIM_0 - 1 : 0],
    input logic [BIAS_PRECISION_1-1:0] ebias,
    input logic bias_valid,
    output logic bias_ready,

    output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_0 [DATA_IN_0_PARALLELISM_DIM_1*DATA_IN_0_PARALLELISM_DIM_0-1:0],
    output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_0,
    output logic data_out_0_valid,
    input logic data_out_0_ready
);

  // Widths of the weight * normalised-data product.
  localparam WD_PRECISION_0 = NORM_OUT_PRECISION_0 + WEIGHT_PRECISION_0 + 1;
  // FIX: the product's frac width follows NORM_OUT_FRAC_WIDTH (the mantissa
  // fraction), not NORM_OUT_PRECISION_1 (the exponent width).  This now
  // matches MWD_FRAC_WIDTH below; both parameters default to 4, so a
  // default build is unchanged.
  localparam WD_MAN_FRAC_WIDTH = NORM_OUT_FRAC_WIDTH + WEIGHT_PRECISION_0 - 1;
  localparam WD_PRECISION_1 = NORM_OUT_PRECISION_1 + 1;

  logic [NORM_OUT_PRECISION_0 - 1:0] mnorm_out [DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1 - 1:0];
  logic [NORM_OUT_PRECISION_1 - 1:0] enorm_out;
  logic [DATA_IN_0_PARALLELISM_DIM_1 - 1:0] parallel_norm_in_valid, parallel_norm_in_ready;
  logic [DATA_IN_0_PARALLELISM_DIM_1 - 1:0] parallel_norm_out_valid, parallel_norm_out_ready;
  logic [DATA_OUT_0_PRECISION_1 - 1:0] parallel_enorm_out [DATA_IN_0_PARALLELISM_DIM_1 - 1:0];
  logic norm_out_valid, norm_out_ready;

  logic [BIAS_PRECISION_0-1:0] mbias_buffered [DATA_IN_0_PARALLELISM_DIM_0 - 1 : 0];
  logic [BIAS_PRECISION_1-1:0] ebias_buffered;
  logic bias_buffered_valid, bias_buffered_ready;

  logic [WEIGHT_PRECISION_0-1:0] mweight_buffered [DATA_IN_0_PARALLELISM_DIM_0 - 1 : 0];
  logic [WEIGHT_PRECISION_1-1:0] eweight_buffered;
  logic weight_buffered_ready, weight_buffered_valid;

  logic [WD_PRECISION_0 - 1:0] mwd_out [DATA_OUT_0_PARALLELISM_DIM_1*DATA_OUT_0_PARALLELISM_DIM_0 - 1 : 0];
  logic [WD_PRECISION_1 - 1:0] ewd_out;
  logic wd_out_valid, wd_out_ready;

  logic affine_out_ready, affine_out_valid;
  localparam SHIFT_WIDTH = WEIGHT_PRECISION_1 + 2;
  logic [SHIFT_WIDTH - 1:0] shift_value;
  logic [WD_PRECISION_0 - 1:0] casted_bias [DATA_OUT_0_PARALLELISM_DIM_0 - 1:0];

  logic [WD_PRECISION_0 - 1:0] maffine_out [DATA_OUT_0_PARALLELISM_DIM_1*DATA_OUT_0_PARALLELISM_DIM_0 - 1:0];
  logic [WD_PRECISION_1 - 1:0] eaffine_out;

  // One 1-D layernorm per dim-1 lane; all lanes share the input handshake.
  for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_1; i++) begin : parallel_dim_1
    assign parallel_norm_in_valid[i] = data_in_0_valid;
    assign parallel_norm_out_ready[i] = norm_out_ready;
    mxint_layernorm_1d #(
        .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0),
        .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0),
        .DATA_IN_0_MAN_WIDTH(DATA_IN_0_PRECISION_0),
        .DATA_IN_0_EXP_WIDTH(DATA_IN_0_PRECISION_1),
        .ISQRT_IN_MAN_WIDTH(ISQRT_IN_PRECISION_0),
        .ISQRT_IN_MAN_FRAC_WIDTH(ISQRT_IN_PRECISION_1),
        .ISQRT_OUT_MAN_WIDTH(ISQRT_OUT_PRECISION_0),
        .ISQRT_OUT_MAN_FRAC_WIDTH(ISQRT_OUT_PRECISION_1),
        .DATA_OUT_0_MAN_WIDTH(NORM_OUT_PRECISION_0),
        .DATA_OUT_0_MAN_FRAC_WIDTH(NORM_OUT_FRAC_WIDTH),
        .DATA_OUT_0_EXP_WIDTH(NORM_OUT_PRECISION_1)
    ) layer_norm_inst (
        .clk,
        .rst,
        .mdata_in_0(mdata_in_0[i*DATA_IN_0_PARALLELISM_DIM_0 + DATA_IN_0_PARALLELISM_DIM_0 - 1: i*DATA_IN_0_PARALLELISM_DIM_0]),
        .edata_in_0(edata_in_0),
        .data_in_0_valid(parallel_norm_in_valid[i]),
        .data_in_0_ready(parallel_norm_in_ready[i]),
        .mdata_out_0(mnorm_out[i*DATA_IN_0_PARALLELISM_DIM_0 + DATA_IN_0_PARALLELISM_DIM_0 - 1: i*DATA_IN_0_PARALLELISM_DIM_0]),
        .edata_out_0(parallel_enorm_out[i]),
        .data_out_0_valid(parallel_norm_out_valid[i]),
        .data_out_0_ready(parallel_norm_out_ready[i])
    );
  end
  //TODO: Bug here, notice, our module currently can only support parallel in the dimension 0;
  // lane 0's exponent/handshake stand in for all lanes.
  assign enorm_out = parallel_enorm_out[0];
  assign data_in_0_ready = parallel_norm_in_ready[0];
  assign norm_out_valid = parallel_norm_out_valid[0];

  if (ELEMENTWISE_AFFINE == 1) begin
    // Replay the channel bias once per row.
    mxint_circular #(
        .DATA_PRECISION_0(BIAS_PRECISION_0),
        .DATA_PRECISION_1(BIAS_PRECISION_1),
        .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0),
        .REPEAT(DATA_IN_0_TENSOR_SIZE_DIM_1 / DATA_IN_0_PARALLELISM_DIM_1),
        .BUFFER_SIZE(DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0)
    ) bias_buffer_inst (
        .clk(clk),
        .rst(rst),
        .mdata_in(mbias),
        .edata_in(ebias),
        .data_in_valid(bias_valid),
        .data_in_ready(bias_ready),
        .mdata_out(mbias_buffered),
        .edata_out(ebias_buffered),
        .data_out_valid(bias_buffered_valid),
        .data_out_ready(bias_buffered_ready)
    );

    // Replay the channel weight once per row.
    mxint_circular #(
        .DATA_PRECISION_0(WEIGHT_PRECISION_0),
        .DATA_PRECISION_1(WEIGHT_PRECISION_1),
        .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0),
        .REPEAT(DATA_IN_0_TENSOR_SIZE_DIM_1 / DATA_IN_0_PARALLELISM_DIM_1),
        .BUFFER_SIZE(DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0)
    ) weight_buffer_inst (
        .clk(clk),
        .rst(rst),
        .mdata_in(mweight),
        .edata_in(eweight),
        .data_in_valid(weight_valid),
        .data_in_ready(weight_ready),
        .mdata_out(mweight_buffered),
        .edata_out(eweight_buffered),
        .data_out_valid(weight_buffered_valid),
        .data_out_ready(weight_buffered_ready)
    );

    join2 weight_data_join_inst (
        .data_in_valid ({weight_buffered_valid, norm_out_valid}),
        .data_in_ready ({weight_buffered_ready, norm_out_ready}),
        .data_out_valid(wd_out_valid),
        .data_out_ready(wd_out_ready)
    );

    // weight * normalised data (mantissas); exponents add.
    // NOTE: inner/outer generate labels were both "affine_bias_parallel";
    // renamed to match the _dim_1/_dim_0 convention used below.
    for (genvar i = 0; i < DATA_OUT_0_PARALLELISM_DIM_1; i++) begin : affine_mul_dim_1
      for (genvar j = 0; j < DATA_OUT_0_PARALLELISM_DIM_0; j++) begin : affine_mul_dim_0
        localparam int k = i * DATA_IN_0_PARALLELISM_DIM_0 + j;
        assign mwd_out[k] = $signed(mweight_buffered[j]) * $signed(mnorm_out[k]);
      end
    end
    assign ewd_out = $signed(eweight_buffered) + $signed(enorm_out);

    join2 wd_bias_join_inst (
        .data_in_valid ({wd_out_valid, bias_buffered_valid}),
        .data_in_ready ({wd_out_ready, bias_buffered_ready}),
        .data_out_valid(affine_out_valid),
        .data_out_ready(affine_out_ready)
    );

    // Align the bias mantissa to the product's fixed-point position before
    // the add: shift by the exponent difference plus the frac-width delta.
    localparam MWD_FRAC_WIDTH = NORM_OUT_FRAC_WIDTH + WEIGHT_PRECISION_0 - 1;
    localparam BIAS_FRAC_WIDTH = BIAS_PRECISION_0 - 1;
    assign shift_value = $signed(ewd_out) - $signed(ebias_buffered) - MWD_FRAC_WIDTH + BIAS_FRAC_WIDTH;
    optimized_right_shift #(
        .IN_WIDTH(BIAS_PRECISION_0),
        .SHIFT_WIDTH(SHIFT_WIDTH),
        .OUT_WIDTH(WD_PRECISION_0),
        .BLOCK_SIZE(DATA_OUT_0_PARALLELISM_DIM_0)
    ) ovshift_inst (
        .data_in(mbias_buffered),
        .shift_value(shift_value),
        .data_out(casted_bias)
    );
    for (genvar i = 0; i < DATA_OUT_0_PARALLELISM_DIM_1; i++) begin : affine_bias_parallel_dim_1
      for (genvar j = 0; j < DATA_OUT_0_PARALLELISM_DIM_0; j++) begin : affine_bias_parallel_dim_0
        localparam int k = i * DATA_IN_0_PARALLELISM_DIM_0 + j;
        assign maffine_out[k] = $signed(casted_bias[j]) + $signed(mwd_out[k]);
      end
    end
    assign eaffine_out = ewd_out;

    mxint_cast #(
        .IN_MAN_WIDTH(WD_PRECISION_0),
        .IN_MAN_FRAC_WIDTH(WD_MAN_FRAC_WIDTH),
        // FIX: eaffine_out is WD_PRECISION_1 bits wide; the original wired
        // WD_PRECISION_0 (the mantissa width) here, zero-extending the
        // signed exponent and corrupting negative exponents.
        .IN_EXP_WIDTH(WD_PRECISION_1),
        .OUT_MAN_WIDTH(DATA_OUT_0_PRECISION_0),
        .OUT_EXP_WIDTH(DATA_OUT_0_PRECISION_1),
        .BLOCK_SIZE(DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1)
    ) u_mxint_cast (
        .clk(clk),
        .rst(rst),
        .mdata_in(maffine_out),
        .edata_in(eaffine_out),
        .data_in_valid(affine_out_valid),
        .data_in_ready(affine_out_ready),
        .mdata_out(mdata_out_0),
        .edata_out(edata_out_0),
        .data_out_valid(data_out_0_valid),
        .data_out_ready(data_out_0_ready)
    );
  end else begin
    mxint_cast #(
        .IN_MAN_WIDTH(NORM_OUT_PRECISION_0),
        // FIX: frac width is NORM_OUT_FRAC_WIDTH and the exponent width is
        // NORM_OUT_PRECISION_1 (enorm_out's width); the original wired
        // exponent-width parameters into both slots, which is only correct
        // at the default values.
        .IN_MAN_FRAC_WIDTH(NORM_OUT_FRAC_WIDTH),
        .IN_EXP_WIDTH(NORM_OUT_PRECISION_1),
        .OUT_MAN_WIDTH(DATA_OUT_0_PRECISION_0),
        .OUT_EXP_WIDTH(DATA_OUT_0_PRECISION_1),
        .BLOCK_SIZE(DATA_OUT_0_PARALLELISM_DIM_1*DATA_OUT_0_PARALLELISM_DIM_0)
    ) u_mxint_cast (
        .clk(clk),
        .rst(rst),
        .mdata_in(mnorm_out),
        .edata_in(enorm_out),
        .data_in_valid(norm_out_valid),
        .data_in_ready(norm_out_ready),
        .mdata_out(mdata_out_0),
        .edata_out(edata_out_0),
        .data_out_valid(data_out_0_valid),
        .data_out_ready(data_out_0_ready)
    );
  end
endmodule
`timescale 1ns / 1ps
/*
 * mxint_layernorm_1d: 1-D LayerNorm over an MXINT row.
 *
 * The MXINT input is first converted to a common fixed-point frame along
 * dimension 0 (dim_0_cast), then the integer layernorm_core computes the
 * mean/variance normalisation and re-emits MXINT (mantissa + exponent).
 */
module mxint_layernorm_1d #(
    /* verilator lint_off UNUSEDPARAM */
    parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 10,
    parameter DATA_IN_0_PARALLELISM_DIM_0 = 1,
    parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0,
    parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0,

    parameter DATA_IN_0_MAN_WIDTH = 8,
    parameter DATA_IN_0_MAN_FRAC_WIDTH = DATA_IN_0_MAN_WIDTH - 1,
    parameter DATA_IN_0_EXP_WIDTH = 4,

    parameter DATA_OUT_0_MAN_WIDTH = 8,
    parameter DATA_OUT_0_MAN_FRAC_WIDTH = DATA_OUT_0_MAN_WIDTH - 1,
    parameter DATA_OUT_0_EXP_WIDTH = 4,

    parameter ISQRT_IN_MAN_WIDTH = 8,
    parameter ISQRT_IN_MAN_FRAC_WIDTH = 4,
    parameter ISQRT_OUT_MAN_WIDTH = 8,
    parameter ISQRT_OUT_MAN_FRAC_WIDTH = 4
) (
    /* verilator lint_off UNUSEDSIGNAL */
    input clk,
    input rst,

    input logic data_in_0_valid,
    output logic data_in_0_ready,
    input logic [DATA_IN_0_MAN_WIDTH-1:0] mdata_in_0[DATA_OUT_0_PARALLELISM_DIM_0-1:0],
    input logic [DATA_IN_0_EXP_WIDTH-1:0] edata_in_0,

    output logic data_out_0_valid,
    input logic data_out_0_ready,
    output logic [DATA_OUT_0_MAN_WIDTH-1:0] mdata_out_0[DATA_OUT_0_PARALLELISM_DIM_0-1:0],
    output logic [DATA_OUT_0_EXP_WIDTH-1:0] edata_out_0
);
  // MXINT input re-aligned to a shared fixed-point frame along dim 0.
  logic [DATA_IN_0_MAN_WIDTH-1:0] casted_mdata_in[DATA_IN_0_PARALLELISM_DIM_0-1:0];
  logic [DATA_IN_0_EXP_WIDTH-1:0] casted_edata_in;
  logic casted_data_in_valid;
  logic casted_data_in_ready;

  dim_0_cast #(
      .MAN_WIDTH(DATA_IN_0_MAN_WIDTH),
      .EXP_WIDTH(DATA_IN_0_EXP_WIDTH),
      .IN_DEPTH(DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0),
      .BLOCK_SIZE(DATA_IN_0_PARALLELISM_DIM_0)
  ) u_dim_0_cast (
      .clk(clk),
      .rst(rst),
      .data_in_0_valid(data_in_0_valid),
      .data_in_0_ready(data_in_0_ready),
      .mdata_in_0(mdata_in_0),
      .edata_in_0(edata_in_0),
      .data_out_0_valid(casted_data_in_valid),
      .data_out_0_ready(casted_data_in_ready),
      .mdata_out_0(casted_mdata_in),
      .edata_out_0(casted_edata_in)
  );

  layernorm_core #(
      .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0),
      .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0),
      .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0),
      .DATA_OUT_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0),
      // Data widths
      .DATA_IN_0_WIDTH(DATA_IN_0_MAN_WIDTH),
      .DATA_IN_0_FRAC_WIDTH(DATA_IN_0_MAN_FRAC_WIDTH),
      .ISQRT_IN_WIDTH(ISQRT_IN_MAN_WIDTH),
      .ISQRT_IN_FRAC_WIDTH(ISQRT_IN_MAN_FRAC_WIDTH),
      .ISQRT_OUT_WIDTH(ISQRT_OUT_MAN_WIDTH),
      .ISQRT_OUT_FRAC_WIDTH(ISQRT_OUT_MAN_FRAC_WIDTH),
      .DATA_OUT_0_WIDTH(DATA_OUT_0_MAN_WIDTH),
      .DATA_OUT_0_FRAC_WIDTH(DATA_OUT_0_MAN_FRAC_WIDTH),
      // FIX: forward the exponent width; previously layernorm_core silently
      // fell back to its default (4), so the edata_out_0 port width was
      // wrong for any build with DATA_OUT_0_EXP_WIDTH != 4.
      .DATA_OUT_0_EXP_WIDTH(DATA_OUT_0_EXP_WIDTH)
  ) u_layer_norm_1d (
      .clk(clk),
      .rst(rst),
      .data_in_0(casted_mdata_in),
      .data_in_0_valid(casted_data_in_valid),
      .data_in_0_ready(casted_data_in_ready),
      .mdata_out_0(mdata_out_0),
      .edata_out_0(edata_out_0),
      .data_out_0_valid(data_out_0_valid),
      .data_out_0_ready(data_out_0_ready)
  );

endmodule
/*
layernorm 1d
*/
module layernorm_core #(
    // Dimensions
    parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 4,
    parameter DATA_IN_0_PARALLELISM_DIM_0 = 2,
    parameter
DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0,
    parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_1,
    // Data widths (fixed-point mantissa widths and fractional bit counts)
    parameter DATA_IN_0_WIDTH = 8,
    parameter DATA_IN_0_FRAC_WIDTH = 4,
    parameter ISQRT_IN_WIDTH = 8,
    parameter ISQRT_IN_FRAC_WIDTH = 8,

    parameter ISQRT_OUT_WIDTH = 8,
    parameter ISQRT_OUT_FRAC_WIDTH = 4,
    parameter DATA_OUT_0_WIDTH = 8,
    parameter DATA_OUT_0_FRAC_WIDTH = 4,
    parameter DATA_OUT_0_EXP_WIDTH = 4
) (
    input logic clk,
    input logic rst,

    // Input stream: one block of DATA_IN_0_PARALLELISM_DIM_0 fixed-point values
    // per valid/ready handshake.
    input logic [DATA_IN_0_WIDTH-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0-1:0],
    input logic data_in_0_valid,
    output logic data_in_0_ready,

    // Output stream: normalized mantissas plus one shared exponent per block
    // (MXInt-style block floating point).
    output logic [DATA_OUT_0_WIDTH-1:0] mdata_out_0 [DATA_IN_0_PARALLELISM_DIM_0-1:0],
    output logic [DATA_OUT_0_EXP_WIDTH-1:0] edata_out_0,
    output logic data_out_0_valid,
    input logic data_out_0_ready
);

    // Derived params
    // Number of handshakes needed to stream one full normalization group.
    localparam DEPTH_DIM0 = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0;

    // Total element count N over which mean/variance are taken.
    localparam NUM_VALUES = DATA_IN_0_TENSOR_SIZE_DIM_0;

    localparam NUM_ITERS = DEPTH_DIM0;
    localparam ITER_WIDTH = $clog2(NUM_ITERS);

    // Compute Pipeline Widths — each stage grows the word so intermediate
    // results stay lossless until the final output cast.

    localparam ADDER_TREE_IN_SIZE = DATA_IN_0_PARALLELISM_DIM_0;
    localparam ADDER_TREE_OUT_WIDTH = $clog2(ADDER_TREE_IN_SIZE) + DATA_IN_0_WIDTH;

    localparam ACC_OUT_WIDTH = ITER_WIDTH + ADDER_TREE_OUT_WIDTH;

    // x - mu needs one extra sign bit over the input width.
    localparam DIFF_WIDTH = DATA_IN_0_WIDTH + 1;
    localparam DIFF_FRAC_WIDTH = DATA_IN_0_FRAC_WIDTH;

    // (x - mu)^2: widths and frac bits double under multiplication.
    localparam SQUARE_WIDTH = DIFF_WIDTH * 2;
    localparam SQUARE_FRAC_WIDTH = DIFF_FRAC_WIDTH * 2;

    localparam SQUARES_ADDER_TREE_IN_SIZE = DATA_IN_0_PARALLELISM_DIM_0;
    localparam SQUARES_ADDER_TREE_OUT_WIDTH = $clog2(SQUARES_ADDER_TREE_IN_SIZE) + SQUARE_WIDTH;
    localparam SQUARES_ADDER_TREE_OUT_FRAC_WIDTH = SQUARE_FRAC_WIDTH;

    localparam VARIANCE_WIDTH = ITER_WIDTH + SQUARES_ADDER_TREE_OUT_WIDTH;
    localparam VARIANCE_FRAC_WIDTH = SQUARES_ADDER_TREE_OUT_FRAC_WIDTH;


    // (x - mu) * isqrt(var): product width of the two factors.
    localparam NORM_WIDTH = ISQRT_OUT_WIDTH + DIFF_WIDTH;
    localparam NORM_FRAC_WIDTH = ISQRT_OUT_FRAC_WIDTH + DIFF_FRAC_WIDTH;

    /* verilator lint_off UNUSEDSIGNAL */
    // Input FIFO — holds raw samples until the mean is available so that
    // (x - mu) can be formed on a second pass over the same data.
    logic [DATA_IN_0_WIDTH-1:0] fifo_data[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic fifo_out_valid, fifo_out_ready;
    logic fifo_in_valid, fifo_in_ready;

    // Input Adder Tree
    logic [ADDER_TREE_OUT_WIDTH-1:0] adder_tree_data;
    logic adder_tree_out_valid, adder_tree_out_ready;
    logic adder_tree_in_valid, adder_tree_in_ready;

    // Running sum of the input blocks; feeds the mean computation.
    logic [ACC_OUT_WIDTH-1:0] mu_acc;
    logic mu_acc_valid, mu_acc_ready;

    logic [DATA_IN_0_WIDTH-1:0] mu_in, mu_out;
    logic mu_out_valid, mu_out_ready;

    // Full-precision product of the sum and the reciprocal constant.
    logic [ACC_OUT_WIDTH + ACC_OUT_WIDTH:0] mu_acc_div;

    logic mu_fifo_valid, mu_fifo_ready;

    logic signed [DIFF_WIDTH-1:0] diff_in[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic signed [DIFF_WIDTH-1:0] diff_out[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic diff_in_ready[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic diff_out_valid[DATA_IN_0_PARALLELISM_DIM_0-1:0];

    logic [SQUARE_WIDTH-1:0] square_in[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic square_in_ready[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic square_out_valid[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic [SQUARE_WIDTH-1:0] square_out[DATA_IN_0_PARALLELISM_DIM_0-1:0];

    // Split2 for the fork in the pipeline after the diff stage: one branch
    // buffers (x - mu), the other squares it for the variance path.
    logic fifo_diff_in_valid, fifo_diff_in_ready;
    logic fifo_diff_out_valid;

    // Squares adder tree
    logic [SQUARES_ADDER_TREE_OUT_WIDTH-1:0] squares_adder_tree_data;
    logic squares_adder_tree_out_valid, squares_adder_tree_out_ready;
    logic squares_adder_tree_in_valid, squares_adder_tree_in_ready;

    // Squares Accumulator
    logic [VARIANCE_WIDTH-1:0] squares_acc;
    logic squares_acc_valid, squares_acc_ready;

    // Take the accumulated squares and divide by N to get the variance.
    logic [SQUARES_ADDER_TREE_OUT_WIDTH+VARIANCE_WIDTH:0] variance_buffer;
    logic [VARIANCE_WIDTH-1:0] variance_in, variance_out;
    logic variance_out_valid, variance_out_ready;

    // From this point on the datapath carries MXInt (mantissa + shared
    // exponent) values produced by the isqrt LUT.
    logic [ISQRT_OUT_WIDTH-1:0] minv_sqrt_out;
    logic [DATA_OUT_0_EXP_WIDTH-1:0] einv_sqrt_out;
    logic inv_sqrt_out_valid, inv_sqrt_out_ready;

    // Repeat circular buffer holding 1/sqrt(var) so it can be reused for
    // every block of the group during the final multiply.
    logic [ISQRT_OUT_WIDTH-1:0] misqrt_circ;
    logic [DATA_OUT_0_EXP_WIDTH-1:0] eisqrt_circ;
    logic isqrt_circ_valid, isqrt_circ_ready;
    logic norm_in_valid;

    // FIFO for storing X-mu differences while the variance is computed.
    logic [DIFF_WIDTH-1:0] diff_batch_in[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic diff_batch_in_valid, diff_batch_in_ready;
    logic [DIFF_WIDTH-1:0] diff_batch_out[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic diff_batch_out_valid, diff_batch_out_ready;

    logic [NORM_WIDTH-1:0] mnorm_in_data[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic [DATA_OUT_0_EXP_WIDTH-1:0] enorm_in_data[DATA_IN_0_PARALLELISM_DIM_0-1:0];

    logic [NORM_WIDTH-1:0] mnorm_out_data[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic [DATA_OUT_0_EXP_WIDTH-1:0] enorm_out_data[DATA_IN_0_PARALLELISM_DIM_0-1:0];

    logic [DATA_OUT_0_WIDTH-1:0] mnorm_round_out[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic [DATA_OUT_0_EXP_WIDTH-1:0] enorm_round_out[DATA_IN_0_PARALLELISM_DIM_0-1:0];

    logic [DATA_OUT_0_WIDTH-1:0] mnorm_batch_data[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic [DATA_OUT_0_EXP_WIDTH-1:0] enorm_batch_data[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic output_reg_ready;

    logic norm_in_ready[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic norm_out_valid[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic norm_batch_ready[DATA_IN_0_PARALLELISM_DIM_0-1:0];
    logic output_reg_valid[DATA_IN_0_PARALLELISM_DIM_0-1:0];

    /* verilator lint_on UNUSEDSIGNAL */

    // Buffer the raw input so it can be replayed once mu is known.
    matrix_fifo #(
        .DATA_WIDTH(DATA_IN_0_WIDTH),
        .DIM0 (DATA_IN_0_PARALLELISM_DIM_0),
        .DIM1 (1),
        .FIFO_SIZE (4 * NUM_ITERS)
    ) input_fifo_inst (
        .clk(clk),
        .rst(rst),
        .in_data(data_in_0),
        .in_valid(fifo_in_valid),
        .in_ready(fifo_in_ready),
        .out_data(fifo_data),
        .out_valid(fifo_out_valid),
        .out_ready(fifo_out_ready)
    );

    // Input Adder Tree — per-block partial sum for the mean.
    fixed_adder_tree #(
        .IN_SIZE (DATA_IN_0_PARALLELISM_DIM_0),
        .IN_WIDTH(DATA_IN_0_WIDTH)
    ) sum_adder_tree (
        .clk(clk),
        .rst(rst),
        .data_in(data_in_0),
        .data_in_valid(adder_tree_in_valid),
        .data_in_ready(adder_tree_in_ready),
        .data_out(adder_tree_data),
        .data_out_valid(adder_tree_out_valid),
        .data_out_ready(adder_tree_out_ready)
    );

    // Split2 for input to FIFO & Adder Tree: the same input block is
    // consumed by both consumers in lockstep.
    split2 input_fifo_adder_split (
        .data_in_valid (data_in_0_valid),
        .data_in_ready (data_in_0_ready),
        .data_out_valid({adder_tree_in_valid, fifo_in_valid}),
        .data_out_ready({adder_tree_in_ready, fifo_in_ready})
    );
    // Accumulator for mu — sums the partial sums over NUM_ITERS blocks.
    fixed_accumulator #(
        .IN_DEPTH(NUM_ITERS),
        .IN_WIDTH(ADDER_TREE_OUT_WIDTH)
    ) mu_accumulator (
        .clk(clk),
        .rst(rst),
        .data_in(adder_tree_data),
        .data_in_valid(adder_tree_out_valid),
        .data_in_ready(adder_tree_out_ready),
        .data_out(mu_acc),
        .data_out_valid(mu_acc_valid),
        .data_out_ready(mu_acc_ready)
    );


    // Division by NUM_VALUES, implemented as a multiply by the precomputed
    // reciprocal (1 << ACC_OUT_WIDTH) / NUM_VALUES followed by a cast that
    // drops the extra ACC_OUT_WIDTH fractional bits.
    // ACC_WIDTH = DATA_IN_WIDTH + $clog2(NUM_VALUES)
    // BASICALLY the same thing
    localparam bit [ACC_OUT_WIDTH+1:0] INV_NUMVALUES_0 = ((1 << ACC_OUT_WIDTH) / NUM_VALUES);
    assign mu_acc_div = ($signed(mu_acc) * $signed(INV_NUMVALUES_0));

    fixed_signed_cast #(
        .IN_WIDTH(ACC_OUT_WIDTH + ACC_OUT_WIDTH + 1),
        .IN_FRAC_WIDTH(DATA_IN_0_FRAC_WIDTH + ACC_OUT_WIDTH),
        .OUT_WIDTH(DATA_IN_0_WIDTH),
        .OUT_FRAC_WIDTH(DATA_IN_0_FRAC_WIDTH),
        .SYMMETRIC(0),
        .ROUND_FLOOR(1)
    ) acc_div_cast_i (
        .in_data (mu_acc_div),
        .out_data(mu_in)
    );

    // Replay the single mean value NUM_ITERS times, once per data block.
    single_element_repeat #(
        .DATA_WIDTH(DATA_IN_0_WIDTH),
        .REPEAT(NUM_ITERS)
    ) mu_buffer (
        .clk(clk),
        .rst(rst),
        .in_data(mu_in),
        .in_valid(mu_acc_valid),
        .in_ready(mu_acc_ready),
        .out_data(mu_out),
        .out_valid(mu_out_valid),
        .out_ready(mu_out_ready)
    );

    // Join 2 for combining fifo and mu buffer handshakes; all parallel
    // subtract lanes share one ready (lane 0), assuming lockstep arrival.
    assign mu_fifo_ready = diff_in_ready[0];

    join2 mu_fifo_join2 (
        .data_in_valid ({mu_out_valid, fifo_out_valid}),
        .data_in_ready ({mu_out_ready, fifo_out_ready}),
        .data_out_valid(mu_fifo_valid),
        .data_out_ready(mu_fifo_ready)
    );

    // Compute pipeline — one lane per parallel element.

    for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_0; i++) begin : compute_pipe

        // Take the difference between input and mean: (X - mu)
        assign diff_in[i] = $signed(fifo_data[i]) - $signed(mu_out);

        skid_buffer #(
            .DATA_WIDTH(DIFF_WIDTH)
        ) subtract_reg (
            .clk(clk),
            .rst(rst),
            .data_in(diff_in[i]),
            .data_in_valid(mu_fifo_valid),
            .data_in_ready(diff_in_ready[i]),
            .data_out(diff_out[i]),
            .data_out_valid(diff_out_valid[i]),
            .data_out_ready(fifo_diff_in_ready)
        );

        // Assign the output of diff int batch to be buffered
        assign diff_batch_in[i] = diff_out[i];

        // There will be a split in the pipeline here, split2 is down below.

        // Take the difference and square it: (X - mu) ^ 2

        assign square_in[i] = $signed(diff_batch_in[i]) * $signed(diff_batch_in[i]);

        skid_buffer #(
            .DATA_WIDTH(SQUARE_WIDTH)
        ) square_reg (
            .clk(clk),
            .rst(rst),
            .data_in(square_in[i]),
            .data_in_valid(fifo_diff_out_valid),
            .data_in_ready(square_in_ready[i]),
            .data_out(square_out[i]),
            .data_out_valid(square_out_valid[i]),
            .data_out_ready(squares_adder_tree_in_ready)
        );
    end

    // Fork after the subtract stage: lane-0 handshake represents all lanes.
    assign fifo_diff_in_valid = diff_out_valid[0];
    split2 fifo_diff_split (
        .data_in_valid (fifo_diff_in_valid),
        .data_in_ready (fifo_diff_in_ready),
        .data_out_valid({diff_batch_in_valid, fifo_diff_out_valid}),
        .data_out_ready({diff_batch_in_ready, square_in_ready[0]})
    );

    assign squares_adder_tree_in_valid = square_out_valid[0];

    // Sum the per-lane squares of one block.
    fixed_adder_tree #(
        .IN_SIZE (SQUARES_ADDER_TREE_IN_SIZE),
        .IN_WIDTH(SQUARE_WIDTH)
    ) squares_adder_tree (
        .clk(clk),
        .rst(rst),
        .data_in(square_out),
        .data_in_valid(squares_adder_tree_in_valid),
        .data_in_ready(squares_adder_tree_in_ready),
        .data_out(squares_adder_tree_data),
        .data_out_valid(squares_adder_tree_out_valid),
        .data_out_ready(squares_adder_tree_out_ready)
    );

    // Accumulate the squared sums over the whole group.
    fixed_accumulator #(
        .IN_DEPTH(NUM_ITERS),
        .IN_WIDTH(SQUARES_ADDER_TREE_OUT_WIDTH)
    ) squares_accumulator (
        .clk(clk),
        .rst(rst),
        .data_in(squares_adder_tree_data),
        .data_in_valid(squares_adder_tree_out_valid),
        .data_in_ready(squares_adder_tree_out_ready),
        .data_out(squares_acc),
        .data_out_valid(squares_acc_valid),
        .data_out_ready(squares_acc_ready)
    );

    // Division by NUM_VALUES — same reciprocal-multiply trick as for mu.
    localparam bit [SQUARES_ADDER_TREE_OUT_WIDTH+1:0] INV_NUMVALUES_1 = ((1 << SQUARES_ADDER_TREE_OUT_WIDTH) / NUM_VALUES);
    assign variance_buffer = (squares_acc * INV_NUMVALUES_1) >> SQUARES_ADDER_TREE_OUT_WIDTH;
    assign variance_in = variance_buffer[VARIANCE_WIDTH-1:0];

    skid_buffer #(
        .DATA_WIDTH(VARIANCE_WIDTH)
    ) variance_reg (
        .clk(clk),
        .rst(rst),
        .data_in(variance_in),
        .data_in_valid(squares_acc_valid),
        .data_in_ready(squares_acc_ready),
        .data_out(variance_out),
        .data_out_valid(variance_out_valid),
        .data_out_ready(variance_out_ready)
    );

    // LUT-based inverse square root: fixed-point variance in, MXInt
    // (mantissa + exponent) 1/sqrt(var) out.
    mxint_isqrt_lut #(
        .IN_WIDTH(VARIANCE_WIDTH),
        .IN_FRAC_WIDTH(VARIANCE_FRAC_WIDTH),
        .VARIANCE_MAN_WIDTH(ISQRT_IN_WIDTH),
        .OUT_MAN_WIDTH(ISQRT_OUT_WIDTH),
        .OUT_MAN_FRAC_WIDTH(ISQRT_OUT_FRAC_WIDTH),
        .EXP_WIDTH(DATA_OUT_0_EXP_WIDTH)
    ) isqrt_lut_inst (
        .clk(clk),
        .rst(rst),
        .data_in_0 (variance_out),
        .data_in_0_valid(variance_out_valid),
        .data_in_0_ready(variance_out_ready),
        .mdata_out_0(minv_sqrt_out),
        .edata_out_0(einv_sqrt_out),
        .data_out_0_valid(inv_sqrt_out_valid),
        .data_out_0_ready(inv_sqrt_out_ready)
    );


    // Replay 1/sqrt(var) for each of the NUM_ITERS blocks of the group.
    single_element_repeat #(
        .DATA_WIDTH(ISQRT_OUT_WIDTH + DATA_OUT_0_EXP_WIDTH),
        .REPEAT(NUM_ITERS)
    ) isqrt_var_circ_buffer (
        .clk(clk),
        .rst(rst),
        .in_data({minv_sqrt_out, einv_sqrt_out}),
        .in_valid(inv_sqrt_out_valid),
        .in_ready(inv_sqrt_out_ready),
        .out_data({misqrt_circ, eisqrt_circ}),
        .out_valid(isqrt_circ_valid),
        .out_ready(isqrt_circ_ready)
    );

    // Join2 for pipeline join at sqrt and diff fifo
    // logic inv_sqrt_ready;
    join2 diff_fifo_isqrt_join (
        .data_in_valid ({diff_batch_out_valid, isqrt_circ_valid}),
        .data_in_ready ({diff_batch_out_ready, isqrt_circ_ready}),
        .data_out_valid(norm_in_valid),
        .data_out_ready(norm_in_ready[0])
    );

    // Buffer (x - mu) while the variance/isqrt path catches up.
    matrix_fifo #(
        .DATA_WIDTH(DIFF_WIDTH),
        .DIM0(DATA_IN_0_PARALLELISM_DIM_0),
        .DIM1(1),
        .FIFO_SIZE(4 * NUM_ITERS)
    ) diff_fifo_inst (
        .clk(clk),
        .rst(rst),
        .in_data(diff_batch_in),
        .in_valid(diff_batch_in_valid),
        .in_ready(diff_batch_in_ready),
        .out_data(diff_batch_out),
        .out_valid(diff_batch_out_valid),
        .out_ready(diff_batch_out_ready)
    );


    // Output chunks compute pipeline: final multiply and output cast

    for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_0; i++) begin : out_mult_pipe

        // Multiply difference with 1/sqrt(var) to get the normalized result;
        // the isqrt mantissa is zero-extended since it is unsigned.
        assign mnorm_in_data[i] = $signed({1'b0, misqrt_circ}) * $signed(diff_batch_out[i]);
        assign enorm_in_data[i] = eisqrt_circ;

        skid_buffer #(
            .DATA_WIDTH(NORM_WIDTH + DATA_OUT_0_EXP_WIDTH)
        ) norm_reg (
            .clk(clk),
            .rst(rst),
            .data_in({mnorm_in_data[i], enorm_in_data[i]}),
            .data_in_valid(norm_in_valid),
            .data_in_ready(norm_in_ready[i]),
            .data_out({mnorm_out_data[i], enorm_out_data[i]}),
            .data_out_valid(norm_out_valid[i]),
            .data_out_ready(norm_batch_ready[i])
        );

        // Output Rounding Stage — narrow the mantissa to the output width;
        // the exponent passes through unchanged.
        fixed_signed_cast #(
            .IN_WIDTH(NORM_WIDTH),
            .IN_FRAC_WIDTH(NORM_FRAC_WIDTH),
            .OUT_WIDTH(DATA_OUT_0_WIDTH),
            .OUT_FRAC_WIDTH(DATA_OUT_0_FRAC_WIDTH),
            .SYMMETRIC(0),
            .ROUND_FLOOR(1)
        ) output_cast (
            .in_data (mnorm_out_data[i]),
            .out_data(mnorm_round_out[i])
        );
        assign enorm_round_out[i] = enorm_out_data[i];

        skid_buffer #(
            .DATA_WIDTH(DATA_OUT_0_WIDTH + DATA_OUT_0_EXP_WIDTH)
        ) output_reg (
            .clk(clk),
            .rst(rst),
            .data_in({mnorm_round_out[i], enorm_round_out[i]}),
            .data_in_valid(norm_out_valid[i]),
            .data_in_ready(norm_batch_ready[i]),
.data_out({mnorm_batch_data[i], enorm_batch_data[i]}), + .data_out_valid(output_reg_valid[i]), + .data_out_ready(output_reg_ready) + ); + end + + // Final connection to output + assign mdata_out_0 = mnorm_batch_data; + assign edata_out_0 = enorm_batch_data[0]; + assign data_out_0_valid = output_reg_valid[0]; + assign output_reg_ready = data_out_0_ready; + +endmodule + +module dim_0_cast #( + parameter MAN_WIDTH = 8, + parameter EXP_WIDTH = 4, + parameter IN_DEPTH = 10, + parameter BLOCK_SIZE = 4 +) ( + input logic clk, + input logic rst, + input logic data_in_0_valid, + output logic data_in_0_ready, + input logic [MAN_WIDTH-1:0] mdata_in_0[BLOCK_SIZE-1:0], + input logic [EXP_WIDTH-1:0] edata_in_0, + + output logic data_out_0_valid, + input logic data_out_0_ready, + output logic [MAN_WIDTH-1:0] mdata_out_0[BLOCK_SIZE-1:0], + output logic [EXP_WIDTH-1:0] edata_out_0 +); + + // Internal signals + logic [MAN_WIDTH-1:0] mdata_in_0_fifo[BLOCK_SIZE-1:0]; + logic [EXP_WIDTH-1:0] edata_in_0_fifo; + logic data_in_0_fifo_valid; + logic data_in_0_fifo_ready; + + logic [EXP_WIDTH-1:0] edata_in_0_straight; + logic data_in_0_straight_valid; + logic data_in_0_straight_ready; + + logic [EXP_WIDTH-1:0] max_edata_in_0; + logic max_edata_in_0_valid; + logic max_edata_in_0_ready; + + logic [EXP_WIDTH-1:0] circular_max_edata_in_0 [0:0]; + logic circular_max_edata_in_0_valid; + logic circular_max_edata_in_0_ready; + + logic signed [EXP_WIDTH:0] shift_value; + + // Split2 circuit for parallel processing + unpacked_mx_split2_with_data #( + .DEPTH(IN_DEPTH), + .MAN_WIDTH(MAN_WIDTH), + .EXP_WIDTH(EXP_WIDTH), + .IN_SIZE(BLOCK_SIZE) + ) split2_circ ( + .clk(clk), + .rst(rst), + // Input from circular buffer + .mdata_in(mdata_in_0), + .edata_in(edata_in_0), + .data_in_valid(data_in_0_valid), + .data_in_ready(data_in_0_ready), + // FIFO output path (not used) + .fifo_mdata_out(mdata_in_0_fifo), + .fifo_edata_out(edata_in_0_fifo), + .fifo_data_out_valid(data_in_0_fifo_valid), + 
.fifo_data_out_ready(data_in_0_fifo_ready),
      // Straight output path
      .straight_mdata_out(), // Connect to the same signals previously used
      .straight_edata_out(edata_in_0_straight),
      .straight_data_out_valid(data_in_0_straight_valid),
      .straight_data_out_ready(data_in_0_straight_ready)
  );

  // Sequential max finder — scans the IN_DEPTH streamed exponents and emits
  // the maximum once per group; this becomes the shared output exponent.
  sequential_max #(
      .IN_DEPTH(IN_DEPTH),
      .IN_WIDTH(EXP_WIDTH)
  ) sequential_max_inst (
      .clk (clk), // input
      .rst (rst), // input
      .data_in (edata_in_0_straight), // input [IN_WIDTH-1:0]
      .data_in_valid (data_in_0_straight_valid), // input
      .data_in_ready (data_in_0_straight_ready), // output
      .data_out (max_edata_in_0), // output [IN_WIDTH-1:0]
      .data_out_valid (max_edata_in_0_valid), // output
      .data_out_ready (max_edata_in_0_ready) // input
  );

  // Replay the group maximum IN_DEPTH times so it pairs with each buffered
  // block coming out of the FIFO path.
  input_buffer #(
      .DATA_WIDTH (EXP_WIDTH),
      .IN_NUM (1),
      .REPEAT (IN_DEPTH),
      .BUFFER_SIZE(1)
  ) mdata_in_0_buffer (
      .clk,
      .rst,
      // Input streaming port
      .data_in({max_edata_in_0}),
      .data_in_valid(max_edata_in_0_valid),
      .data_in_ready(max_edata_in_0_ready),
      // Output streaming port
      .data_out(circular_max_edata_in_0),
      .data_out_valid(circular_max_edata_in_0_valid),
      .data_out_ready(circular_max_edata_in_0_ready)
  );

  // Join circuit for output synchronization: a block is emitted only when
  // both the buffered mantissas and the replayed max exponent are valid.
  join2 data_out_join_inst (
      .data_in_ready({circular_max_edata_in_0_ready, data_in_0_fifo_ready}),
      .data_in_valid({circular_max_edata_in_0_valid, data_in_0_fifo_valid}),
      .data_out_valid(data_out_0_valid),
      .data_out_ready(data_out_0_ready)
  );

  // Calculate shift value and perform optimized right shift.
  // NOTE(review): this subtracts the replayed group max from the live
  // sequential_max output — presumably the intent is (group max) minus
  // (this block's original exponent, i.e. the FIFO-side edata); as written
  // the two operands look like the same value replayed, giving shift 0.
  // TODO confirm against the mxint_cast reference behavior.
  assign shift_value = $signed(max_edata_in_0) - $signed(circular_max_edata_in_0[0]);

  // Align each mantissa block to the shared (max) exponent by right-shifting.
  optimized_right_shift #(
      .IN_WIDTH(MAN_WIDTH),
      .SHIFT_WIDTH(EXP_WIDTH),
      .OUT_WIDTH(MAN_WIDTH),
      .BLOCK_SIZE(BLOCK_SIZE)
  ) ovshift_inst (
      .data_in(mdata_in_0_fifo),
      .shift_value(shift_value),
      .data_out(mdata_out_0)
  );
  // Assign final exponent output
  assign edata_out_0 =
max_edata_in_0; +endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_linear.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_linear.sv index b345241db..e201c8ec0 100644 --- a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_linear.sv +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_linear.sv @@ -16,21 +16,23 @@ Description : This module does a matrix multiplcation between matrices X & Y. module mxint_linear #( /* verilator lint_off UNUSEDPARAM */ parameter HAS_BIAS = 1, - + parameter CIRCULAR_WEIGHT = 0, parameter DATA_IN_0_PRECISION_0 = 16, parameter DATA_IN_0_PRECISION_1 = 3, parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 20, parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20, + parameter DATA_IN_0_TENSOR_SIZE_DIM_2 = 1, parameter DATA_IN_0_PARALLELISM_DIM_0 = 4, // must equal WEIGHT_PARALLELISM_DIM_1 parameter DATA_IN_0_PARALLELISM_DIM_1 = 4, + parameter DATA_IN_0_PARALLELISM_DIM_2 = 1, localparam IN_0_DEPTH_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0, localparam IN_0_DEPTH_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1 / DATA_IN_0_PARALLELISM_DIM_1, parameter WEIGHT_PRECISION_0 = 16, parameter WEIGHT_PRECISION_1 = 3, - parameter WEIGHT_TENSOR_SIZE_DIM_0 = 20, + parameter WEIGHT_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, parameter WEIGHT_TENSOR_SIZE_DIM_1 = 20, - parameter WEIGHT_PARALLELISM_DIM_0 = 4, + parameter WEIGHT_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, parameter WEIGHT_PARALLELISM_DIM_1 = 4, // Inferred precision of the output data @@ -41,9 +43,13 @@ module mxint_linear #( parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1, parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1, parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_2 = 1, parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1, parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + 
parameter DATA_OUT_0_PARALLELISM_DIM_2 = 1, + parameter UNDERFLOW_BITS = 0, + parameter ROUND_BITS = 4, parameter BIAS_PRECISION_0 = 16, parameter BIAS_PRECISION_1 = 3, parameter BIAS_TENSOR_SIZE_DIM_0 = DATA_OUT_0_TENSOR_SIZE_DIM_0, @@ -76,6 +82,17 @@ module mxint_linear #( output logic data_out_0_valid, input logic data_out_0_ready ); + localparam FDP_WIDTH = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( + DATA_IN_0_PARALLELISM_DIM_0 + ); + localparam FDP_EXP_WIDTH = (WEIGHT_PRECISION_1 > DATA_IN_0_PRECISION_1)? WEIGHT_PRECISION_1 + 1: DATA_IN_0_PRECISION_1 + 1; + localparam ACC_WIDTH = FDP_WIDTH + $clog2(IN_0_DEPTH_DIM_0) + UNDERFLOW_BITS; + localparam ACC_FRAC_WIDTH = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + UNDERFLOW_BITS - 2; + localparam ACC_EXP_WIDTH = FDP_EXP_WIDTH; + localparam LOSSLESS_OUT_WIDTH = ACC_WIDTH + HAS_BIAS; + localparam LOSSLESS_OUT_EXP_WIDTH = ACC_EXP_WIDTH; + localparam LOSSLESS_OUT_FRAC_WIDTH = ACC_FRAC_WIDTH; + logic [DATA_IN_0_PRECISION_0-1:0]circular_mdata_in_0[DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1-1:0]; logic [DATA_IN_0_PRECISION_1-1:0] circular_edata_in_0; logic circular_data_in_0_valid, circular_data_in_0_ready; @@ -87,6 +104,33 @@ module mxint_linear #( logic [BIAS_PRECISION_0-1:0] circular_mbias [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1-1:0]; logic [BIAS_PRECISION_1-1:0] circular_ebias; logic circular_bias_valid, circular_bias_ready; + + logic [FDP_EXP_WIDTH-1:0] fdp_edata_out [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_1 - 1:0]; + logic [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_1 - 1:0] fdp_data_out_valid; + + // Change signal names to reflect data flow: dp -> skid -> acc + logic [FDP_WIDTH-1:0] dp_mdata_out [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_1 - 1:0]; + logic [FDP_EXP_WIDTH-1:0] dp_edata_out; + logic dp_data_out_valid, dp_data_out_ready; + + logic [FDP_WIDTH-1:0] skid_mdata_out[DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_1 - 1:0]; + logic 
[FDP_EXP_WIDTH-1:0] skid_edata_out; + logic skid_data_out_valid, skid_data_out_ready; + + logic [ACC_WIDTH-1:0] acc_mdata_out [DATA_IN_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [FDP_EXP_WIDTH-1:0] acc_edata_out; + logic acc_data_out_valid, acc_data_out_ready; + logic [LOSSLESS_OUT_WIDTH-1:0] cast_mdata_out_0[DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [LOSSLESS_OUT_EXP_WIDTH-1:0] cast_edata_out_0; + logic cast_data_out_0_valid, cast_data_out_0_ready; + + logic [LOSSLESS_OUT_WIDTH-1:0] fifo_mdata_in[DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [LOSSLESS_OUT_EXP_WIDTH-1:0] fifo_edata_in; + logic fifo_data_in_valid, fifo_data_in_ready; + // Add signals for FIFO + logic [LOSSLESS_OUT_WIDTH-1:0] fifo_mdata_out[DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [LOSSLESS_OUT_EXP_WIDTH-1:0] fifo_edata_out; + logic fifo_data_out_valid, fifo_data_out_ready; mxint_circular #( .DATA_PRECISION_0(DATA_IN_0_PRECISION_0), .DATA_PRECISION_1(DATA_IN_0_PRECISION_1), @@ -107,54 +151,82 @@ module mxint_linear #( .data_out_valid(circular_data_in_0_valid), .data_out_ready(circular_data_in_0_ready) ); - mxint_circular #( - .DATA_PRECISION_0(WEIGHT_PRECISION_0), - .DATA_PRECISION_1(WEIGHT_PRECISION_1), - .IN_NUM(WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1), - .REPEAT(IN_0_DEPTH_DIM_1), - .BUFFER_SIZE(WEIGHT_TENSOR_SIZE_DIM_0*WEIGHT_TENSOR_SIZE_DIM_1 / (WEIGHT_PARALLELISM_DIM_0*WEIGHT_PARALLELISM_DIM_1)) - ) weight_buffer ( - .clk, - .rst, - // Input streaming port - .mdata_in(mweight), - .edata_in(eweight), - .data_in_valid(weight_valid), - .data_in_ready(weight_ready), - // Output streaming port - .mdata_out(circular_mweight), - .edata_out(circular_eweight), - .data_out_valid(circular_weight_valid), - .data_out_ready(circular_weight_ready) - ); - mxint_circular #( - .DATA_PRECISION_0(BIAS_PRECISION_0), - .DATA_PRECISION_1(BIAS_PRECISION_1), - .IN_NUM (BIAS_PARALLELISM_DIM_0), 
- .REPEAT (IN_0_DEPTH_DIM_1), - .BUFFER_SIZE (BIAS_TENSOR_SIZE_DIM_0 / (BIAS_PARALLELISM_DIM_0)) - ) bias_buffer ( - .clk, - .rst, - // Input streaming port - .mdata_in(mbias), - .edata_in(ebias), - .data_in_valid(bias_valid), - .data_in_ready(bias_ready), - // Output streaming port - .mdata_out(circular_mbias), - .edata_out(circular_ebias), - .data_out_valid(circular_bias_valid), - .data_out_ready(circular_bias_ready) - ); - localparam FDP_WIDTH = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( - DATA_IN_0_PARALLELISM_DIM_0 - ); - localparam FDP_EXP_WIDTH = (WEIGHT_PRECISION_1 > DATA_IN_0_PRECISION_1)? WEIGHT_PRECISION_1 + 1: DATA_IN_0_PRECISION_1 + 1; - localparam ACC_WIDTH = FDP_WIDTH + $clog2(IN_0_DEPTH_DIM_0) + 2 ** FDP_EXP_WIDTH; - localparam ACC_EXP_WIDTH = FDP_EXP_WIDTH; - localparam LOSSLESS_OUT_WIDTH = ACC_WIDTH + HAS_BIAS; - localparam LOSSLESS_OUT_EXP_WIDTH = ACC_EXP_WIDTH; + if (CIRCULAR_WEIGHT == 1) begin + mxint_circular #( + .DATA_PRECISION_0(WEIGHT_PRECISION_0), + .DATA_PRECISION_1(WEIGHT_PRECISION_1), + .IN_NUM(WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1), + .REPEAT(IN_0_DEPTH_DIM_1), + .BUFFER_SIZE(WEIGHT_TENSOR_SIZE_DIM_0*WEIGHT_TENSOR_SIZE_DIM_1 / (WEIGHT_PARALLELISM_DIM_0*WEIGHT_PARALLELISM_DIM_1)) + ) weight_buffer ( + .clk, + .rst, + // Input streaming port + .mdata_in(mweight), + .edata_in(eweight), + .data_in_valid(weight_valid), + .data_in_ready(weight_ready), + // Output streaming port + .mdata_out(circular_mweight), + .edata_out(circular_eweight), + .data_out_valid(circular_weight_valid), + .data_out_ready(circular_weight_ready) + ); + + mxint_circular #( + .DATA_PRECISION_0(BIAS_PRECISION_0), + .DATA_PRECISION_1(BIAS_PRECISION_1), + .IN_NUM (BIAS_PARALLELISM_DIM_0), + .REPEAT (IN_0_DEPTH_DIM_1), + .BUFFER_SIZE (BIAS_TENSOR_SIZE_DIM_0 / (BIAS_PARALLELISM_DIM_0)) + ) bias_buffer ( + .clk, + .rst, + // Input streaming port + .mdata_in(mbias), + .edata_in(ebias), + .data_in_valid(bias_valid), + .data_in_ready(bias_ready), + // 
Output streaming port + .mdata_out(circular_mbias), + .edata_out(circular_ebias), + .data_out_valid(circular_bias_valid), + .data_out_ready(circular_bias_ready) + ); + end else begin + mxint_skid_buffer #( + .DATA_PRECISION_0(BIAS_PRECISION_0), + .DATA_PRECISION_1(BIAS_PRECISION_1), + .IN_NUM(WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1) + ) weight_reg_inst ( + .clk(clk), + .rst(rst), + .mdata_in(mweight), + .edata_in(eweight), + .data_in_valid(weight_valid), + .data_in_ready(weight_ready), + .mdata_out(circular_mweight), + .edata_out(circular_eweight), + .data_out_valid(circular_weight_valid), + .data_out_ready(circular_weight_ready) + ); + mxint_skid_buffer #( + .DATA_PRECISION_0(BIAS_PRECISION_0), + .DATA_PRECISION_1(BIAS_PRECISION_1), + .IN_NUM(DATA_OUT_0_PARALLELISM_DIM_0) + ) bias_reg_inst ( + .clk(clk), + .rst(rst), + .mdata_in(mbias), + .edata_in(ebias), + .data_in_valid(bias_valid), + .data_in_ready(bias_ready), + .mdata_out(circular_mbias), + .edata_out(circular_ebias), + .data_out_valid(circular_bias_valid), + .data_out_ready(circular_bias_ready) + ); + end /* verilator lint_off UNUSEDSIGNAL */ // Assume the parallelised hardware above have the same arrival time // which means that they always have the same state. 
So we can just @@ -163,24 +235,12 @@ module mxint_linear #( fdp_data_ready, fdp_weight_ready; assign circular_weight_ready = fdp_weight_ready[0]; assign circular_data_in_0_ready = fdp_data_ready[0]; - logic [FDP_EXP_WIDTH-1:0] fdp_edata_out [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_1 - 1:0]; - logic [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_1 - 1:0] fdp_data_out_valid; - logic [FDP_WIDTH-1:0] acc_mdata_in [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_1 - 1:0]; - logic [FDP_EXP_WIDTH-1:0] acc_edata_in; - logic acc_data_in_valid, acc_data_in_ready; - - logic [ ACC_WIDTH-1:0] acc_mdata_out [DATA_IN_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0]; - logic [FDP_EXP_WIDTH-1:0] acc_edata_out; - logic acc_data_out_valid, acc_data_out_ready; - logic [LOSSLESS_OUT_WIDTH-1:0] cast_mdata_out_0[DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0]; - logic [LOSSLESS_OUT_EXP_WIDTH-1:0] cast_edata_out_0; - logic cast_data_out_0_valid, cast_data_out_0_ready; // There are WEIGHT_PARALLELISM_DIM_0 number of dot product instances with DATA_IN_0_TENSOR_SIZE_DIM_0 inputs // and each one computes for IN_0_DEPTH iterations for each inputs. 
for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_1; i = i + 1) begin : out_dim_1 for (genvar j = 0; j < WEIGHT_PARALLELISM_DIM_1; j = j + 1) begin : out_dim_0 - // Assume the weight are transposed and partitioned + // Assume the weight are transposed and partitioned logic [WEIGHT_PRECISION_0-1:0] current_mweight[WEIGHT_PARALLELISM_DIM_0-1:0]; logic [WEIGHT_PRECISION_1-1:0] current_eweight; logic [DATA_IN_0_PRECISION_0-1:0] current_mdata[WEIGHT_PARALLELISM_DIM_0-1:0]; @@ -196,7 +256,9 @@ module mxint_linear #( .DATA_IN_0_PRECISION_1(DATA_IN_0_PRECISION_1), .WEIGHT_PRECISION_0(WEIGHT_PRECISION_0), .WEIGHT_PRECISION_1(WEIGHT_PRECISION_1), - .BLOCK_SIZE(DATA_IN_0_PARALLELISM_DIM_0) + .BLOCK_SIZE(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_OUT_0_PRECISION_0(FDP_WIDTH), + .DATA_OUT_0_PRECISION_1(FDP_EXP_WIDTH) ) mxdp_inst ( .clk(clk), .rst(rst), @@ -208,35 +270,53 @@ module mxint_linear #( .eweight(current_eweight), .weight_valid(circular_weight_valid), .weight_ready(fdp_weight_ready[i*WEIGHT_PARALLELISM_DIM_1+j]), - .mdata_out_0(acc_mdata_in[i*WEIGHT_PARALLELISM_DIM_1+j]), + .mdata_out_0(dp_mdata_out[i*WEIGHT_PARALLELISM_DIM_1+j]), .edata_out_0(fdp_edata_out[i*WEIGHT_PARALLELISM_DIM_1+j]), .data_out_0_valid(fdp_data_out_valid[i*WEIGHT_PARALLELISM_DIM_1+j]), - .data_out_0_ready(acc_data_in_ready) + .data_out_0_ready(dp_data_out_ready) ); end end - assign acc_data_in_valid = fdp_data_out_valid[0]; - assign acc_edata_in = fdp_edata_out[0]; + assign dp_data_out_valid = fdp_data_out_valid[0]; + assign dp_edata_out = fdp_edata_out[0]; + + // Insert skid buffer between dot product and accumulator + mxint_skid_buffer #( + .DATA_PRECISION_0(FDP_WIDTH), + .DATA_PRECISION_1(FDP_EXP_WIDTH), + .IN_NUM(DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_1) + ) skid_buffer_inst ( + .clk(clk), + .rst(rst), + .mdata_in(dp_mdata_out), + .edata_in(dp_edata_out), + .data_in_valid(dp_data_out_valid), + .data_in_ready(dp_data_out_ready), + .mdata_out(skid_mdata_out), + .edata_out(skid_edata_out), 
+ .data_out_valid(skid_data_out_valid), + .data_out_ready(skid_data_out_ready) + ); mxint_accumulator #( .DATA_IN_0_PRECISION_0(FDP_WIDTH), .DATA_IN_0_PRECISION_1(FDP_EXP_WIDTH), + .UNDERFLOW_BITS(UNDERFLOW_BITS), .IN_DEPTH(IN_0_DEPTH_DIM_0), .BLOCK_SIZE(DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0) ) accumulator_inst ( .clk(clk), .rst(rst), - .mdata_in_0(acc_mdata_in), - .edata_in_0(acc_edata_in), - .data_in_0_valid(acc_data_in_valid), - .data_in_0_ready(acc_data_in_ready), + .mdata_in_0(skid_mdata_out), // Changed from acc_mdata_in + .edata_in_0(skid_edata_out), // Changed from acc_edata_in + .data_in_0_valid(skid_data_out_valid), // Changed from acc_data_in_valid + .data_in_0_ready(skid_data_out_ready), // Changed from acc_data_in_ready .mdata_out_0(acc_mdata_out), .edata_out_0(acc_edata_out), .data_out_0_valid(acc_data_out_valid), .data_out_0_ready(acc_data_out_ready) ); - logic [BIAS_PRECISION_0-1:0] mbias_sext[DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0]; logic [LOSSLESS_OUT_WIDTH-1:0] shifted_mbias[DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0]; logic [FDP_EXP_WIDTH - 1:0] exp_difference; @@ -250,14 +330,23 @@ module mxint_linear #( .data_out_valid(cast_data_out_0_valid), .data_out_ready(cast_data_out_0_ready) ); - assign exp_difference = $signed(circular_ebias) - $signed(acc_edata_out); - assign abs_shift_value = exp_difference[FDP_EXP_WIDTH - 1]? 
(~exp_difference + 1): exp_difference; + assign exp_difference = -($signed( + circular_ebias + ) - $signed( + acc_edata_out + ) + LOSSLESS_OUT_FRAC_WIDTH - (BIAS_PRECISION_0 - 1)); + + optimized_right_shift #( + .IN_WIDTH(BIAS_PRECISION_0), + .SHIFT_WIDTH(FDP_EXP_WIDTH), + .OUT_WIDTH(LOSSLESS_OUT_WIDTH), + .BLOCK_SIZE(DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1) + ) ovshift_inst ( + .data_in(mbias_sext), + .shift_value(exp_difference), + .data_out(shifted_mbias) + ); for (genvar m = 0; m < DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1; m++) begin - assign shifted_mbias[m] = exp_difference[FDP_EXP_WIDTH-1] ? $signed( - mbias_sext[m] - ) >>> abs_shift_value : $signed( - mbias_sext[m] - ) <<< abs_shift_value; assign cast_mdata_out_0[m] = $signed(shifted_mbias[m]) + $signed(acc_mdata_out[m]); end assign cast_edata_out_0 = acc_edata_out; @@ -266,21 +355,60 @@ module mxint_linear #( assign cast_data_out_0_valid = acc_data_out_valid; assign cast_mdata_out_0 = acc_mdata_out; assign cast_edata_out_0 = acc_edata_out; - assign bias_ready = 1; + assign circular_bias_ready = 1; end + + mxint_skid_buffer #( + .DATA_PRECISION_0(LOSSLESS_OUT_WIDTH), + .DATA_PRECISION_1(LOSSLESS_OUT_EXP_WIDTH), + .IN_NUM(DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0) + ) fifo_reg_inst ( + .clk(clk), + .rst(rst), + .mdata_in(cast_mdata_out_0), + .edata_in(cast_edata_out_0), + .data_in_valid(cast_data_out_0_valid), + .data_in_ready(cast_data_out_0_ready), + .mdata_out(fifo_mdata_in), + .edata_out(fifo_edata_in), + .data_out_valid(fifo_data_in_valid), + .data_out_ready(fifo_data_in_ready) + ); + // We need a fifo here to make match the throughput between different layers + unpacked_mx_fifo #( + .MAN_WIDTH(LOSSLESS_OUT_WIDTH), + .EXP_WIDTH(LOSSLESS_OUT_EXP_WIDTH), + .IN_SIZE(DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0), + .DEPTH(DATA_OUT_0_TENSOR_SIZE_DIM_0 / DATA_OUT_0_PARALLELISM_DIM_0) + ) cast_fifo ( + .clk(clk), + .rst(rst), + 
.mdata_in(fifo_mdata_in), + .edata_in(fifo_edata_in), + .data_in_valid(fifo_data_in_valid), + .data_in_ready(fifo_data_in_ready), + .mdata_out(fifo_mdata_out), + .edata_out(fifo_edata_out), + .data_out_valid(fifo_data_out_valid), + .data_out_ready(fifo_data_out_ready) + ); + + // Update cast instance to use FIFO outputs mxint_cast #( .IN_MAN_WIDTH(LOSSLESS_OUT_WIDTH), + .IN_MAN_FRAC_WIDTH(LOSSLESS_OUT_FRAC_WIDTH), .IN_EXP_WIDTH(LOSSLESS_OUT_EXP_WIDTH), .OUT_MAN_WIDTH(DATA_OUT_0_PRECISION_0), .OUT_EXP_WIDTH(DATA_OUT_0_PRECISION_1), + .ROUND_BITS(ROUND_BITS), .BLOCK_SIZE(DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0) ) cast_i ( - .clk, - .rst, - .mdata_in(cast_mdata_out_0), - .edata_in(cast_edata_out_0), - .data_in_valid(cast_data_out_0_valid), - .data_in_ready(cast_data_out_0_ready), + .clk(clk), + .rst(rst), + .mdata_in(fifo_mdata_out), // Changed from skid_mdata_out + .edata_in(fifo_edata_out), // Changed from skid_edata_out + .data_in_valid(fifo_data_out_valid), // Changed from skid_data_out_valid + .data_in_ready(fifo_data_out_ready), // Changed from skid_data_out_ready .mdata_out(mdata_out_0), .edata_out(edata_out_0), .data_out_valid(data_out_0_valid), diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_matmul.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_matmul.sv index 325060743..9461f6b5f 100644 --- a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_matmul.sv +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_matmul.sv @@ -141,7 +141,7 @@ module mxint_matmul #( logic [C_DEPTH_DIM0-1:0] acc_out_valid; logic [C_DEPTH_DIM0-1:0] acc_out_ready; localparam MAT_ACC_EXP_WIDTH = SM_EXP_WIDTH; - localparam MAT_ACC_OUT_WIDTH = SM_OUT_WIDTH + 2 ** SM_EXP_WIDTH + $clog2(B_DEPTH_DIM1); + localparam MAT_ACC_OUT_WIDTH = SM_OUT_WIDTH + $clog2(B_DEPTH_DIM1); logic [MAT_ACC_OUT_WIDTH-1:0] macc_out_data[C_DEPTH_DIM0-1:0][C_COMPUTE_DIM0*C_COMPUTE_DIM1-1:0]; logic [MAT_ACC_EXP_WIDTH-1:0] 
eacc_out_data[C_DEPTH_DIM0-1:0]; @@ -358,14 +358,14 @@ module mxint_matmul #( simple_matmul #( .N (A_COMPUTE_DIM1), - .M (A_COMPUTE_DIM0), // == B_COMPUTE_DIM1 + .M (A_COMPUTE_DIM0), // == B_COMPUTE_DIM1 .K (B_COMPUTE_DIM0), .X_WIDTH (A_MAN_WIDTH), - .X_FRAC_WIDTH (0), + .X_FRAC_WIDTH (A_MAN_WIDTH - 1), .Y_WIDTH (B_MAN_WIDTH), - .Y_FRAC_WIDTH (0), + .Y_FRAC_WIDTH (B_MAN_WIDTH - 1), .OUT_WIDTH (SM_OUT_WIDTH), - .OUT_FRAC_WIDTH(0) + .OUT_FRAC_WIDTH(A_MAN_WIDTH + B_MAN_WIDTH - 2) ) simple_matmul_inst ( .clk (clk), .rst (rst), @@ -414,6 +414,7 @@ module mxint_matmul #( mxint_cast #( .IN_MAN_WIDTH(MAT_ACC_OUT_WIDTH), + .IN_MAN_FRAC_WIDTH(A_MAN_WIDTH + B_MAN_WIDTH - 2), .IN_EXP_WIDTH(MAT_ACC_EXP_WIDTH), .OUT_MAN_WIDTH(OUT_MAN_WIDTH), .OUT_EXP_WIDTH(OUT_EXP_WIDTH), diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_patch_embed.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_patch_embed.sv new file mode 100644 index 000000000..b7e4567bd --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_patch_embed.sv @@ -0,0 +1,437 @@ +`timescale 1ns / 1ps +module mxint_patch_embed #( + // + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + parameter CONV_WEIGHT_PRECISION_0 = 8, + parameter CONV_WEIGHT_PRECISION_1 = 4, + parameter CONV_BIAS_PRECISION_0 = 8, + parameter CONV_BIAS_PRECISION_1 = 4, + + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 224, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 224, + parameter DATA_IN_0_TENSOR_SIZE_DIM_2 = 3, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 1, + parameter DATA_IN_0_PARALLELISM_DIM_1 = 1, + parameter DATA_IN_0_PARALLELISM_DIM_2 = 3, + + parameter CONV_WEIGHT_TENSOR_SIZE_DIM_0 = 224, + parameter CONV_WEIGHT_TENSOR_SIZE_DIM_1 = 224, + parameter CONV_WEIGHT_PARALLELISM_DIM_0 = 1, + parameter CONV_WEIGHT_PARALLELISM_DIM_1 = 1, + + parameter CONV_BIAS_TENSOR_SIZE_DIM_0 = 224, + parameter CONV_BIAS_TENSOR_SIZE_DIM_1 = 224, + parameter CONV_BIAS_PARALLELISM_DIM_0 = 
1, + parameter CONV_BIAS_PARALLELISM_DIM_1 = 1, + + parameter CLS_TOKEN_PRECISION_0 = DATA_IN_0_PRECISION_0, + parameter CLS_TOKEN_PRECISION_1 = DATA_IN_0_PRECISION_1, + + parameter DISTILL_TOKEN_PRECISION_0 = DATA_IN_0_PRECISION_0, + parameter DISTILL_TOKEN_PRECISION_1 = DATA_IN_0_PRECISION_1, + + parameter CLS_TOKEN_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter CLS_TOKEN_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter CLS_TOKEN_TENSOR_SIZE_DIM_2 = DATA_IN_0_TENSOR_SIZE_DIM_2, + + parameter CLS_TOKEN_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, + parameter CLS_TOKEN_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter CLS_TOKEN_PARALLELISM_DIM_2 = DATA_IN_0_PARALLELISM_DIM_2, + + parameter DISTILL_TOKEN_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter DISTILL_TOKEN_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DISTILL_TOKEN_TENSOR_SIZE_DIM_2 = DATA_IN_0_TENSOR_SIZE_DIM_2, + + parameter DISTILL_TOKEN_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, + parameter DISTILL_TOKEN_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter DISTILL_TOKEN_PARALLELISM_DIM_2 = DATA_IN_0_PARALLELISM_DIM_2, + + parameter PATCH_SIZE = 16, + + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = 192, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = 4, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = (DATA_IN_0_TENSOR_SIZE_DIM_0 * DATA_IN_0_TENSOR_SIZE_DIM_1) / (PATCH_SIZE*PATCH_SIZE), + parameter DATA_OUT_0_PARALLELISM_DIM_1 = 1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_2 = 1, + parameter DATA_OUT_0_PARALLELISM_DIM_2 = 1, + + parameter IN_X = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter IN_Y = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter IN_C = DATA_IN_0_TENSOR_SIZE_DIM_2, + + parameter KERNEL_X = PATCH_SIZE, + parameter KERNEL_Y = PATCH_SIZE, + parameter OUT_C = DATA_OUT_0_TENSOR_SIZE_DIM_0, + parameter UNROLL_OUT_C = DATA_OUT_0_PARALLELISM_DIM_0, + + parameter BIAS_SIZE = UNROLL_OUT_C, + + parameter HAS_BIAS = 1, + + parameter 
DATA_OUT_0_PRECISION_0 = 8, + parameter DATA_OUT_0_PRECISION_1 = 4 +) ( + input clk, + input rst, + + input [DATA_IN_0_PRECISION_0 - 1:0] mdata_in_0 [IN_C - 1 : 0], + input [DATA_IN_0_PRECISION_1 - 1:0] edata_in_0, + input data_in_0_valid, + output data_in_0_ready, + + input [CONV_WEIGHT_PRECISION_0-1:0] mconv_weight [UNROLL_OUT_C * IN_C -1:0], + input [CONV_WEIGHT_PRECISION_1-1:0] econv_weight, + input conv_weight_valid, + output conv_weight_ready, + + input [CONV_BIAS_PRECISION_0-1:0] mconv_bias [UNROLL_OUT_C-1:0], + input [CONV_BIAS_PRECISION_1-1:0] econv_bias, + input conv_bias_valid, + output conv_bias_ready, + + input [DATA_OUT_0_PRECISION_0 - 1:0] mcls_token [UNROLL_OUT_C - 1 : 0], + input [DATA_OUT_0_PRECISION_1 - 1:0] ecls_token, + input cls_token_valid, + output logic cls_token_ready, + + input [DATA_OUT_0_PRECISION_0 - 1:0] mdistill_token [UNROLL_OUT_C - 1 : 0], + input [DATA_OUT_0_PRECISION_1 - 1:0] edistill_token, + input distill_token_valid, + output logic distill_token_ready, + + output logic [DATA_OUT_0_PRECISION_0 - 1:0] mdata_out_0 [UNROLL_OUT_C - 1:0], + output logic [DATA_OUT_0_PRECISION_1 - 1:0] edata_out_0, + output logic data_out_0_valid, + input logic data_out_0_ready +); + localparam OUT_Y = (IN_Y) / (KERNEL_Y); + localparam OUT_X = (IN_X) / (KERNEL_X); + localparam SLIDING_NUM = OUT_Y * OUT_X; + localparam MAXIMUM_OUT = (IN_X * IN_Y / (KERNEL_X * KERNEL_Y) + 2)* (OUT_C / UNROLL_OUT_C); + localparam COUNT_WIDTH = $clog2(MAXIMUM_OUT); + + logic [CONV_WEIGHT_PRECISION_0-1:0] circular_mweight [UNROLL_OUT_C * IN_C -1:0]; + logic [CONV_WEIGHT_PRECISION_1-1:0] circular_eweight; + logic circular_weight_valid; + logic circular_weight_ready; + + logic [CONV_BIAS_PRECISION_0-1:0] circular_mbias [UNROLL_OUT_C-1:0]; + logic [CONV_BIAS_PRECISION_1-1:0] circular_ebias; + logic circular_bias_valid; + logic circular_bias_ready; + + logic [COUNT_WIDTH - 1:0] count; + + enum {CLS_TOKEN, DISTILL_TOKEN, CONV_OUT} state; + + logic [DATA_OUT_0_PRECISION_0 - 1:0] 
mconv_out [UNROLL_OUT_C - 1:0]; + logic [DATA_OUT_0_PRECISION_1 - 1:0] econv_out; + logic conv_out_valid; + logic conv_out_ready; + mxint_circular #( + .DATA_PRECISION_0(CONV_WEIGHT_PRECISION_0), + .DATA_PRECISION_1(CONV_WEIGHT_PRECISION_1), + .IN_NUM (UNROLL_OUT_C * IN_C), + .REPEAT (SLIDING_NUM), + .BUFFER_SIZE (OUT_C / UNROLL_OUT_C) + ) weight_buffer ( + .clk, + .rst, + // Input streaming port + .mdata_in(mconv_weight), + .edata_in(econv_weight), + .data_in_valid(conv_weight_valid), + .data_in_ready(conv_weight_ready), + // Output streaming port + .mdata_out(circular_mweight), + .edata_out(circular_eweight), + .data_out_valid(circular_weight_valid), + .data_out_ready(circular_weight_ready) + ); + mxint_circular #( + .DATA_PRECISION_0(CONV_BIAS_PRECISION_0), + .DATA_PRECISION_1(CONV_BIAS_PRECISION_1), + .IN_NUM (UNROLL_OUT_C), + .REPEAT (SLIDING_NUM), + .BUFFER_SIZE (OUT_C / UNROLL_OUT_C) + ) bias_buffer ( + .clk, + .rst, + // Input streaming port + .mdata_in(mconv_bias), + .edata_in(econv_bias), + .data_in_valid(conv_bias_valid), + .data_in_ready(conv_bias_ready), + // Output streaming port + .mdata_out(circular_mbias), + .edata_out(circular_ebias), + .data_out_valid(circular_bias_valid), + .data_out_ready(circular_bias_ready) + ); + +mxint_patch_embed_conv #( + .DATA_IN_0_PRECISION_0(DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1(DATA_IN_0_PRECISION_1), + .WEIGHT_PRECISION_0(CONV_WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1(CONV_WEIGHT_PRECISION_1), + .BIAS_PRECISION_0(CONV_BIAS_PRECISION_0), + .BIAS_PRECISION_1(CONV_BIAS_PRECISION_1), + .IN_X(IN_X), + .IN_Y(IN_Y), + .IN_C(IN_C), + .KERNEL_X(KERNEL_X), + .KERNEL_Y(KERNEL_Y), + .OUT_C(OUT_C), + .UNROLL_OUT_C(UNROLL_OUT_C), + .HAS_BIAS(HAS_BIAS), + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) +) conv_inst ( + .clk(clk), + .rst(rst), + .mdata_in_0(mdata_in_0), + .edata_in_0(edata_in_0), + .data_in_0_valid(data_in_0_valid), + .data_in_0_ready(data_in_0_ready), + 
.mweight(circular_mweight), + .eweight(circular_eweight), + .weight_valid(circular_weight_valid), + .weight_ready(circular_weight_ready), + .mbias(circular_mbias), + .ebias(circular_ebias), + .bias_valid(circular_bias_valid), + .bias_ready(circular_bias_ready), + .mdata_out_0(mconv_out), + .edata_out_0(econv_out), + .data_out_0_valid(conv_out_valid), + .data_out_0_ready(conv_out_ready) +); + + always_ff @(posedge clk) begin + if (rst) count <= 0; + else if (data_out_0_valid && data_out_0_ready) + if (count == MAXIMUM_OUT - 1) count <= 0; + else count <= count + 1; + else count <= count; + end + + always_comb begin + if (count < OUT_C/UNROLL_OUT_C) state = CLS_TOKEN; + else if (count < 2 * OUT_C/UNROLL_OUT_C) state = DISTILL_TOKEN; + else state = CONV_OUT; + case (state) + CLS_TOKEN: begin + mdata_out_0 = mcls_token; + edata_out_0 = ecls_token; + data_out_0_valid = cls_token_valid; + cls_token_ready = data_out_0_ready; + distill_token_ready = 0; + conv_out_ready = 0; + end + DISTILL_TOKEN: begin + mdata_out_0 = mdistill_token; + edata_out_0 = edistill_token; + data_out_0_valid = distill_token_valid; + cls_token_ready = 0; + distill_token_ready = data_out_0_ready; + conv_out_ready = 0; + end + CONV_OUT: begin + mdata_out_0 = mconv_out; + edata_out_0 = econv_out; + data_out_0_valid = conv_out_valid; + cls_token_ready = 0; + distill_token_ready = 0; + conv_out_ready = data_out_0_ready; + end + default: begin + mdata_out_0 = '{default:0}; + edata_out_0 = '0; + data_out_0_valid = 0; + cls_token_ready = 0; + distill_token_ready = 0; + conv_out_ready = 0; + end + endcase + end + +endmodule + +module mxint_patch_embed_conv #( + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + parameter WEIGHT_PRECISION_0 = 8, + parameter WEIGHT_PRECISION_1 = 4, + parameter BIAS_PRECISION_0 = 8, + parameter BIAS_PRECISION_1 = 4, + + parameter IN_X = 3, + parameter IN_Y = 2, + parameter IN_C = 4, + + parameter KERNEL_X = 2, + parameter KERNEL_Y = 2, + parameter 
OUT_C = 4, + + parameter UNROLL_OUT_C = 2, + + parameter BIAS_SIZE = UNROLL_OUT_C, + + parameter HAS_BIAS = 1, + + parameter OUT_Y = (IN_Y) / (KERNEL_Y), + parameter OUT_X = (IN_X) / (KERNEL_X), + parameter SLIDING_NUM = OUT_Y * OUT_X, + + parameter DATA_OUT_0_PRECISION_0 = 8, + parameter DATA_OUT_0_PRECISION_1 = 4 +) ( + input clk, + input rst, + + input [DATA_IN_0_PRECISION_0 - 1:0] mdata_in_0 [IN_C - 1 : 0], + input [DATA_IN_0_PRECISION_1 - 1:0] edata_in_0, + input data_in_0_valid, + output data_in_0_ready, + + input [WEIGHT_PRECISION_0-1:0] mweight [UNROLL_OUT_C * IN_C -1:0], + input [WEIGHT_PRECISION_1-1:0] eweight, + input weight_valid, + output weight_ready, + + input [BIAS_PRECISION_0-1:0] mbias [UNROLL_OUT_C-1:0], + input [BIAS_PRECISION_1-1:0] ebias, + input bias_valid, + output bias_ready, + + output [DATA_OUT_0_PRECISION_0 - 1:0] mdata_out_0 [UNROLL_OUT_C - 1:0], + output [DATA_OUT_0_PRECISION_1 - 1:0] edata_out_0, + output data_out_0_valid, + input data_out_0_ready +); + initial begin + assert ( + (KERNEL_X==KERNEL_Y) + ) else $fatal("UNROLL parameter not set correctly"); + end + + localparam STRIDE = KERNEL_X; + localparam UNCAST_OUT_WIDTH = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( + KERNEL_Y * KERNEL_X * IN_C + ) + 1; + localparam UNCAST_OUT_FRAC_WIDTH = DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1; + localparam ROUND_PRECISION_0 = DATA_IN_0_PRECISION_0 + WEIGHT_PRECISION_0 + $clog2( + KERNEL_X * KERNEL_Y * IN_C + ); + localparam ROUND_PRECISION_1 = DATA_IN_0_PRECISION_1 + WEIGHT_PRECISION_1; + + logic [DATA_IN_0_PRECISION_0 * IN_C + DATA_IN_0_PRECISION_1 - 1:0] packed_data_in; + logic [UNCAST_OUT_WIDTH - 1:0] uncast_data_out[UNROLL_OUT_C - 1:0]; + + logic [DATA_IN_0_PRECISION_0 * IN_C + DATA_IN_0_PRECISION_1 - 1:0] packed_kernel[KERNEL_Y * KERNEL_X - 1:0]; + logic kernel_valid; + logic kernel_ready; + + logic [DATA_IN_0_PRECISION_0 * IN_C + DATA_IN_0_PRECISION_1 - 1:0] packed_rolled_k[0:0]; + logic [DATA_IN_0_PRECISION_0 - 1:0] mrolled_k 
[IN_C - 1:0]; + logic [DATA_IN_0_PRECISION_1 - 1:0] erolled_k; + logic rolled_k_valid; + logic rolled_k_ready; + + logic [ROUND_PRECISION_0 -1:0] round_in[UNROLL_OUT_C-1:0]; + + for (genvar i = 0; i < IN_C; i++) + for (genvar j = 0; j < DATA_IN_0_PRECISION_0; j++) + assign packed_data_in[i*DATA_IN_0_PRECISION_0+j] = mdata_in_0[i][j]; + assign packed_data_in[IN_C * DATA_IN_0_PRECISION_0 + DATA_IN_0_PRECISION_1 - 1 : IN_C * DATA_IN_0_PRECISION_0] = edata_in_0; + + sliding_window #( + .IMG_WIDTH (IN_X), + .IMG_HEIGHT (IN_Y), + .KERNEL_WIDTH (KERNEL_X), + .KERNEL_HEIGHT (KERNEL_Y), + .PADDING_WIDTH (0), + .PADDING_HEIGHT(0), + .CHANNELS (1), + .DATA_WIDTH (IN_C * DATA_IN_0_PRECISION_0 + DATA_IN_0_PRECISION_1), + .STRIDE (STRIDE) + /* verilator lint_off PINMISSING */ + ) sw_inst ( + .clk(clk), + .rst(rst), + .data_in(packed_data_in), + .data_in_valid(data_in_0_valid), + .data_in_ready(data_in_0_ready), + + .data_out(packed_kernel), + .data_out_valid(kernel_valid), + .data_out_ready(kernel_ready) + ); + + roller #( + .DATA_WIDTH(IN_C * DATA_IN_0_PRECISION_0 + DATA_IN_0_PRECISION_1), + .NUM(KERNEL_X * KERNEL_Y), + .ROLL_NUM(1) // actually with only roll_num == 1, + ) roller_inst ( + .clk(clk), + .rst(rst), + .data_in(packed_kernel), + .data_in_valid(kernel_valid), + .data_in_ready(kernel_ready), + .data_out(packed_rolled_k), + .data_out_valid(rolled_k_valid), + .data_out_ready(rolled_k_ready) + ); + for (genvar i = 0; i < IN_C; i++) + assign mrolled_k[i] = packed_rolled_k[0][(i+1)*DATA_IN_0_PRECISION_0 - 1 : i * DATA_IN_0_PRECISION_0]; + assign erolled_k = packed_rolled_k[0][IN_C * DATA_IN_0_PRECISION_0 + DATA_IN_0_PRECISION_1 - 1 : IN_C * DATA_IN_0_PRECISION_0]; + + mxint_linear #( + .HAS_BIAS (HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(KERNEL_X * KERNEL_Y * IN_C), + .DATA_IN_0_TENSOR_SIZE_DIM_1(SLIDING_NUM), + .DATA_IN_0_PARALLELISM_DIM_0(IN_C), + 
.DATA_IN_0_PARALLELISM_DIM_1(1), + + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(KERNEL_X * KERNEL_Y * IN_C), + .WEIGHT_TENSOR_SIZE_DIM_1(OUT_C), + .WEIGHT_PARALLELISM_DIM_0(IN_C), + .WEIGHT_PARALLELISM_DIM_1(UNROLL_OUT_C), + + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + .BIAS_TENSOR_SIZE_DIM_0(OUT_C), + .BIAS_TENSOR_SIZE_DIM_1(1), + .BIAS_PARALLELISM_DIM_0(UNROLL_OUT_C), + .BIAS_PARALLELISM_DIM_1(1), + + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + ) linear_inst ( + .clk(clk), + .rst(rst), + .mdata_in_0(mrolled_k), + .edata_in_0(erolled_k), + .data_in_0_valid(rolled_k_valid), + .data_in_0_ready(rolled_k_ready), + .mweight(mweight), + .eweight(eweight), + .weight_valid(weight_valid), + .weight_ready(weight_ready), + .mbias(mbias), + .ebias(ebias), + .bias_valid(bias_valid), + .bias_ready(bias_ready), + .mdata_out_0(mdata_out_0), + .edata_out_0(edata_out_0), + .data_out_0_valid(data_out_0_valid), + .data_out_0_ready(data_out_0_ready) + ); + +endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_range_reduction.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_range_reduction.sv new file mode 100644 index 000000000..611d179e9 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_range_reduction.sv @@ -0,0 +1,188 @@ +`timescale 1ns / 1ps +/* + This code actually input mxint and then output rounded integer n, + In the first version, we just keep the width of n is 8 + which means like output n range from [-128:127] +*/ +module mxint_range_reduction #( + /* verilator lint_off UNUSEDPARAM */ + parameter DATA_IN_MAN_WIDTH = 4, + parameter DATA_IN_EXP_WIDTH = 8, + parameter BLOCK_SIZE = 16, + parameter DATA_OUT_N_WIDTH = 8 +) ( + /* verilator lint_off UNUSEDSIGNAL */ + input rst, + input clk, + input logic [DATA_IN_MAN_WIDTH-1:0] 
mdata_in_0[BLOCK_SIZE - 1:0], + input logic [DATA_IN_EXP_WIDTH-1:0] edata_in_0, + input logic data_in_0_valid, + output logic data_in_0_ready, + + output logic [DATA_OUT_N_WIDTH-1:0] data_out_n[BLOCK_SIZE - 1 : 0], + output logic data_out_n_valid, + input logic data_out_n_ready, + + output logic [9-1:0] data_out_r[BLOCK_SIZE - 1 : 0], + output logic data_out_r_valid, + input logic data_out_r_ready +); + localparam signed MLOG2_E = 8'd92; + localparam signed ELOG2_E = 4'd1; + localparam signed MLN_2 = 8'd88; + localparam signed ELN_2 = 4'd0; + + localparam DATA_LOG2_E_MAN_WIDTH = DATA_IN_MAN_WIDTH + 8; + localparam DATA_LOG2_E_MAN_FRAC_WIDTH = DATA_IN_MAN_WIDTH - 1 + 8 - 1; + localparam DATA_LOG2_E_EXP_WIDTH = DATA_IN_EXP_WIDTH; + + localparam DATA_LN_2_MAN_WIDTH = DATA_OUT_N_WIDTH + 8; + localparam DATA_LN_2_MAN_FRAC_WIDTH = 8 - 1; // N is integer + localparam DATA_LN_2_EXP_WIDTH = DATA_IN_EXP_WIDTH; + + localparam SHIFT_WIDTH = DATA_IN_EXP_WIDTH; + + logic [DATA_IN_MAN_WIDTH-1:0] fifo_mdata_in[BLOCK_SIZE - 1:0]; + logic [DATA_IN_EXP_WIDTH-1:0] fifo_edata_in; + logic fifo_data_in_valid; + logic fifo_data_in_ready; + + logic [DATA_IN_MAN_WIDTH-1:0] straight_mdata_in[BLOCK_SIZE - 1:0]; + logic [DATA_IN_EXP_WIDTH-1:0] straight_edata_in; + logic straight_data_in_valid; + logic straight_data_in_ready; + + logic [DATA_LOG2_E_MAN_WIDTH - 1:0] mdata_in_0_log2_e[BLOCK_SIZE - 1:0]; + logic [DATA_LOG2_E_EXP_WIDTH - 1:0] edata_in_0_log2_e; + + logic [DATA_OUT_N_WIDTH-1:0] temp_data_out_n[BLOCK_SIZE - 1 : 0]; + logic temp_data_out_n_valid, temp_data_out_n_ready; + + logic [DATA_OUT_N_WIDTH-1:0] straight_data_out_n[BLOCK_SIZE - 1 : 0]; + logic straight_data_out_n_valid, straight_data_out_n_ready; + + logic [DATA_LN_2_MAN_WIDTH - 1:0] mn_ln_2[BLOCK_SIZE - 1:0]; + logic [DATA_LN_2_EXP_WIDTH - 1:0] en_ln_2; + + logic [DATA_LN_2_MAN_WIDTH - 1:0] shifted_fifo_mdata_in[BLOCK_SIZE - 1:0]; + logic [SHIFT_WIDTH - 1:0] shift_value; + + logic [DATA_LN_2_MAN_WIDTH - 1:0] 
clamped_in[BLOCK_SIZE - 1:0]; + logic [9 - 1:0] regi_r_in[BLOCK_SIZE - 1:0]; + logic regi_r_in_valid, regi_r_in_ready; + + unpacked_mx_split2_with_data #( + .DEPTH(2), + .MAN_WIDTH(DATA_IN_MAN_WIDTH), + .EXP_WIDTH(DATA_IN_EXP_WIDTH), + .IN_SIZE(BLOCK_SIZE) + ) unpacked_mx_split2_with_data_i ( + .clk(clk), + .rst(rst), + .mdata_in(mdata_in_0), + .edata_in(edata_in_0), + .data_in_valid(data_in_0_valid), + .data_in_ready(data_in_0_ready), + .fifo_mdata_out(fifo_mdata_in), + .fifo_edata_out(fifo_edata_in), + .fifo_data_out_valid(fifo_data_in_valid), + .fifo_data_out_ready(fifo_data_in_ready), + .straight_mdata_out(straight_mdata_in), + .straight_edata_out(straight_edata_in), + .straight_data_out_valid(straight_data_in_valid), + .straight_data_out_ready(straight_data_in_ready) + ); + + for (genvar i = 0; i < BLOCK_SIZE; i++) begin + assign mdata_in_0_log2_e[i] = $signed(straight_mdata_in[i]) * MLOG2_E; + end + assign edata_in_0_log2_e = $signed(straight_edata_in) + ELOG2_E; + + mxint_hardware_round #( + .DATA_IN_MAN_WIDTH(DATA_LOG2_E_MAN_WIDTH), + .DATA_IN_MAN_FRAC_WIDTH(DATA_LOG2_E_MAN_FRAC_WIDTH), + .DATA_IN_EXP_WIDTH(DATA_LOG2_E_EXP_WIDTH), + .BLOCK_SIZE(BLOCK_SIZE), + .DATA_OUT_WIDTH(DATA_OUT_N_WIDTH) + ) mxint_hardware_round_i ( + .rst(rst), + .clk(clk), + .mdata_in_0(mdata_in_0_log2_e), + .edata_in_0(edata_in_0_log2_e), + .data_in_0_valid(straight_data_in_valid), + .data_in_0_ready(straight_data_in_ready), + .data_out_0(temp_data_out_n), + .data_out_0_valid(temp_data_out_n_valid), + .data_out_0_ready(temp_data_out_n_ready) + ); + + unpacked_split2_with_data #( + .DEPTH(3), + .DATA_WIDTH(DATA_OUT_N_WIDTH), + .IN_SIZE(BLOCK_SIZE) + ) unpacked_split2_with_data_i ( + .clk(clk), + .rst(rst), + .data_in(temp_data_out_n), + .data_in_valid(temp_data_out_n_valid), + .data_in_ready(temp_data_out_n_ready), + .fifo_data_out(data_out_n), + .fifo_data_out_valid(data_out_n_valid), + .fifo_data_out_ready(data_out_n_ready), + .straight_data_out(straight_data_out_n), + 
.straight_data_out_valid(straight_data_out_n_valid), + .straight_data_out_ready(straight_data_out_n_ready) + ); + for (genvar i = 0; i < BLOCK_SIZE; i++) begin + assign mn_ln_2[i] = $signed(straight_data_out_n[i]) * MLN_2; + end + assign en_ln_2 = ELN_2; + + assign shift_value = en_ln_2 - $signed(fifo_edata_in) + DATA_IN_MAN_WIDTH - 1 - 7; + join2 #() acc_join_inst ( + .data_in_ready ({straight_data_out_n_ready, fifo_data_in_ready}), + .data_in_valid ({straight_data_out_n_valid, fifo_data_in_valid}), + .data_out_valid(regi_r_in_valid), + .data_out_ready(regi_r_in_ready) + ); + optimized_right_shift #( + .IN_WIDTH(DATA_IN_MAN_WIDTH), + .SHIFT_WIDTH(SHIFT_WIDTH), + .OUT_WIDTH(DATA_LN_2_MAN_WIDTH), + .BLOCK_SIZE(BLOCK_SIZE) + ) ovshift_inst ( + .data_in(fifo_mdata_in), + .shift_value(shift_value), + .data_out(shifted_fifo_mdata_in) + ); + + for (genvar i = 0; i < BLOCK_SIZE; i++) begin + assign clamped_in[i] = $signed(shifted_fifo_mdata_in[i]) - $signed(mn_ln_2[i]); + end + + for (genvar i = 0; i < BLOCK_SIZE; i++) begin + signed_clamp #( + .IN_WIDTH (DATA_LN_2_MAN_WIDTH), + .OUT_WIDTH(9) + ) data_clamp ( + .in_data (clamped_in[i]), + .out_data(regi_r_in[i]) + ); + end + unpacked_register_slice #( + .DATA_WIDTH(9), + .IN_SIZE (BLOCK_SIZE) + ) register_slice_i ( + .clk(clk), + .rst(rst), + + .data_in(regi_r_in), + .data_in_valid(regi_r_in_valid), + .data_in_ready(regi_r_in_ready), + + .data_out(data_out_r), + .data_out_valid(data_out_r_valid), + .data_out_ready(data_out_r_ready) + ); +endmodule + diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_register_slice.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_register_slice.sv index 3cbbbfa66..070bdd4a1 100644 --- a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_register_slice.sv +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_register_slice.sv @@ -27,31 +27,28 @@ module mxint_register_slice #( output data_out_valid, input data_out_ready ); - initial 
begin - assert (DATA_PRECISION_0 >= DATA_PRECISION_1) - else $fatal("DATA_PRECISION_0 must larger than PRECISION_1"); - end - logic [DATA_PRECISION_0 - 1:0] packed_data_in [IN_NUM:0]; - logic [DATA_PRECISION_0 - 1:0] packed_data_out[IN_NUM:0]; - always_comb begin : data_pack - packed_data_in[IN_NUM-1:0] = mdata_in; - packed_data_in[IN_NUM] = $signed(edata_in); - mdata_out = packed_data_out[IN_NUM-1:0]; - edata_out = packed_data_out[IN_NUM]; + logic [DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1 - 1:0] data_in_flatten; + logic [DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1 - 1:0] data_out_flatten; + for (genvar i = 0; i < IN_NUM; i++) begin : reshape + assign data_in_flatten[(i+1)*DATA_PRECISION_0-1:i*DATA_PRECISION_0] = mdata_in[i]; end + assign data_in_flatten[DATA_PRECISION_0*IN_NUM+DATA_PRECISION_1-1:DATA_PRECISION_0*IN_NUM] = edata_in; - unpacked_register_slice #( - .DATA_WIDTH(DATA_PRECISION_0), - .IN_SIZE(IN_NUM + 1) - ) register_slice ( + register_slice #( + .DATA_WIDTH(DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1) + ) register_slice_i ( .clk (clk), .rst (rst), - .data_in (packed_data_in), + .data_in (data_in_flatten), .data_in_valid (data_in_valid), .data_in_ready (data_in_ready), - .data_out (packed_data_out), + .data_out (data_out_flatten), .data_out_valid(data_out_valid), .data_out_ready(data_out_ready) ); + for (genvar i = 0; i < IN_NUM; i++) begin : unreshape + assign mdata_out[i] = data_out_flatten[(i+1)*DATA_PRECISION_0-1:i*DATA_PRECISION_0]; + end + assign edata_out = data_out_flatten[DATA_PRECISION_0*IN_NUM+DATA_PRECISION_1-1:DATA_PRECISION_0*IN_NUM]; endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_skid_buffer.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_skid_buffer.sv new file mode 100644 index 000000000..59a975739 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_skid_buffer.sv @@ -0,0 +1,54 @@ +`timescale 1ns / 1ps +/* +Module : mxint_register_slice +Description : 
This module does the same function as register slice + But for datatype mxint. +*/ + +module mxint_skid_buffer #( + // precision represent mantissa width + // precision_1 represent exponent width + // + parameter DATA_PRECISION_0 = 8, + parameter DATA_PRECISION_1 = 8, + parameter IN_NUM = 6 + +) ( + input clk, + input rst, + // m -> mantissa, e -> exponent + input logic [DATA_PRECISION_0-1:0] mdata_in[IN_NUM - 1:0], + input logic [DATA_PRECISION_1-1:0] edata_in, + input data_in_valid, + output data_in_ready, + + output logic [DATA_PRECISION_0-1:0] mdata_out[IN_NUM - 1:0], + output logic [DATA_PRECISION_1-1:0] edata_out, + output data_out_valid, + input data_out_ready +); + logic [DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1 - 1:0] data_in_flatten; + logic [DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1 - 1:0] data_out_flatten; + for (genvar i = 0; i < IN_NUM; i++) begin : reshape + assign data_in_flatten[(i+1)*DATA_PRECISION_0-1:i*DATA_PRECISION_0] = mdata_in[i]; + end + assign data_in_flatten[DATA_PRECISION_0*IN_NUM+DATA_PRECISION_1-1:DATA_PRECISION_0*IN_NUM] = edata_in; + + skid_buffer #( + .DATA_WIDTH(DATA_PRECISION_0 * IN_NUM + DATA_PRECISION_1) + ) register_slice_i ( + .clk (clk), + .rst (rst), + .data_in (data_in_flatten), + .data_in_valid (data_in_valid), + .data_in_ready (data_in_ready), + .data_out (data_out_flatten), + .data_out_valid(data_out_valid), + .data_out_ready(data_out_ready) + ); + for (genvar i = 0; i < IN_NUM; i++) begin : unreshape + assign mdata_out[i] = data_out_flatten[(i+1)*DATA_PRECISION_0-1:i*DATA_PRECISION_0]; + end + assign edata_out = data_out_flatten[DATA_PRECISION_0*IN_NUM+DATA_PRECISION_1-1:DATA_PRECISION_0*IN_NUM]; + +endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_softmax.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_softmax.sv new file mode 100644 index 000000000..820e73544 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_softmax.sv @@ -0,0 
+1,240 @@ +`timescale 1ns / 1ps +/* + Currently, we dont' want to support parallelism + Cause in attention, it's actually not in parallel +*/ +module mxint_softmax #( + /* verilator lint_off UNUSEDPARAM */ + + parameter DATA_IN_0_PRECISION_0 = 4, + parameter DATA_IN_0_PRECISION_1 = 8, + parameter DATA_IN_0_DIM = 8, // input vector size + parameter DATA_IN_0_PARALLELISM = 1, // batch size + parameter DATA_R_WIDTH = 2, + + parameter IN_0_DEPTH = DATA_IN_0_DIM, + parameter DATA_OUT_0_PRECISION_0 = 4, + parameter DATA_OUT_0_PRECISION_1 = 8, + parameter DATA_OUT_0_DIM = DATA_IN_0_DIM, + parameter DATA_OUT_0_PARALLELISM = DATA_IN_0_PARALLELISM, + parameter EXP_SUM_UNDERFLOW_BITS = 4, + parameter DIVISION_UNDERFLOW_BITS = 4 +) ( + /* verilator lint_off UNUSEDSIGNAL */ + input rst, + input clk, + input logic [DATA_IN_0_PRECISION_0-1:0] mdata_in_0[DATA_IN_0_PARALLELISM-1:0], + input logic [DATA_IN_0_PRECISION_1-1:0] edata_in_0, + output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_0[DATA_OUT_0_PARALLELISM-1:0], + output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_0, + + input logic data_in_0_valid, + output logic data_in_0_ready, + output logic data_out_0_valid, + input logic data_out_0_ready +); + + // softmax over a vector + // each vector might be split into block of elements + // Can handle multiple batches at once + // each iteration recieves a batch of blocks + + // The current version only support precision of taylor_exp output to be the same with data_out_r + localparam DATA_EXP_0_PRECISION_0 = DATA_IN_0_PRECISION_0; + localparam DATA_EXP_0_FRAC_WIDTH = DATA_EXP_0_PRECISION_0 - 2; + localparam DATA_EXP_0_PRECISION_1 = DATA_IN_0_PRECISION_1; + + localparam ACC_WIDTH = $clog2(IN_0_DEPTH) + DATA_EXP_0_PRECISION_0 + EXP_SUM_UNDERFLOW_BITS; + localparam ACC_FRAC_WIDTH = DATA_EXP_0_FRAC_WIDTH + EXP_SUM_UNDERFLOW_BITS; + + localparam DATA_DIVIDEND_PRECISION_0 = DATA_EXP_0_PRECISION_0 + EXP_SUM_UNDERFLOW_BITS + DIVISION_UNDERFLOW_BITS; + localparam 
DATA_DIVIDEND_PRECISION_1 = DATA_EXP_0_PRECISION_1; + localparam DATA_DIVISOR_PRECISION_0 = ACC_WIDTH; + localparam DATA_DIVISOR_PRECISION_1 = DATA_EXP_0_PRECISION_1; + localparam DATA_QUOTIENT_PRECISION_0 = DATA_DIVIDEND_PRECISION_0; + localparam DATA_QUOTIENT_FRAC_WIDTH = DIVISION_UNDERFLOW_BITS; + localparam DATA_QUOTIENT_PRECISION_1 =DATA_EXP_0_PRECISION_1 + 1; + + + localparam BLOCK_SIZE = DATA_IN_0_PARALLELISM; + initial begin + assert (BLOCK_SIZE == 1) + else $fatal("Currently only BLOCK_SIZE of 1 is supported."); + end + + // Add missing signals for mxint_exp interface + logic [DATA_EXP_0_PRECISION_0-1:0] mdata_exp[BLOCK_SIZE - 1:0]; + logic [DATA_EXP_0_PRECISION_1-1:0] edata_exp[BLOCK_SIZE - 1:0]; + logic data_exp_valid, data_exp_ready; + + // Split2 and FF signals for exp path + logic [DATA_EXP_0_PRECISION_0-1:0] ff_exp_mdata_out[DATA_IN_0_PARALLELISM-1:0]; + logic [DATA_EXP_0_PRECISION_1-1:0] ff_exp_edata_out; + logic ff_exp_data_valid, ff_exp_data_ready; + + // Straight path signals + logic [DATA_EXP_0_PRECISION_0-1:0] straight_exp_mdata_out[DATA_IN_0_PARALLELISM-1:0]; + logic [DATA_EXP_0_PRECISION_1-1:0] straight_exp_edata_out; + logic straight_exp_data_out_valid, straight_exp_data_out_ready; + + // Accumulator signals + logic [ACC_WIDTH-1:0] acc_mdata_out[BLOCK_SIZE-1:0]; + logic [DATA_EXP_0_PRECISION_1-1:0] acc_edata_out; + logic acc_data_out_valid, acc_data_out_ready; + + // Circular buffer signals + logic [ACC_WIDTH-1:0] circ_mdata_out[DATA_OUT_0_PARALLELISM-1:0]; + logic [DATA_EXP_0_PRECISION_1-1:0] circ_edata_out; + logic circ_data_out_valid, circ_data_out_ready; + + logic [DATA_DIVIDEND_PRECISION_0 - 1:0] mdata_dividend [BLOCK_SIZE - 1:0]; + logic [DATA_DIVIDEND_PRECISION_1 - 1:0] edata_dividend; + // Division signals + logic [DATA_QUOTIENT_PRECISION_0 - 1:0] mquotient_data[BLOCK_SIZE - 1:0]; + logic [DATA_QUOTIENT_PRECISION_1 - 1:0] equotient_data; + logic quotient_data_valid, quotient_data_ready; + + // Updated mxint_exp instantiation with all 
parameters and proper signal connections + mxint_exp #( + .DATA_IN_MAN_WIDTH(DATA_IN_0_PRECISION_0), + .DATA_IN_EXP_WIDTH(DATA_IN_0_PRECISION_1), + .BLOCK_SIZE(BLOCK_SIZE), + .DATA_R_WIDTH(DATA_R_WIDTH), + .DATA_OUT_MAN_WIDTH(DATA_EXP_0_PRECISION_0), + .DATA_OUT_EXP_WIDTH(DATA_EXP_0_PRECISION_1) + ) mxint_exp_inst ( + .rst(rst), + .clk(clk), + // Input interface + .mdata_in_0(mdata_in_0), + .edata_in_0(edata_in_0), + .data_in_0_valid(data_in_0_valid), + .data_in_0_ready(data_in_0_ready), + // Output interface + .mdata_out_0(mdata_exp), + .edata_out_0(edata_exp), + .data_out_0_valid(data_exp_valid), + .data_out_0_ready(data_exp_ready) + ); + + unpacked_mx_split2_with_data #( + .DEPTH(DATA_IN_0_DIM * 2), + .MAN_WIDTH(DATA_EXP_0_PRECISION_0), + .EXP_WIDTH(DATA_EXP_0_PRECISION_1), + .IN_SIZE(DATA_IN_0_PARALLELISM) + ) split2_mxint_exp_inst ( + .clk(clk), + .rst(rst), + // Input from mxint exp + .mdata_in(mdata_exp), + .edata_in(edata_exp[0]), + .data_in_valid(data_exp_valid), + .data_in_ready(data_exp_ready), + // FIFO output path + .fifo_mdata_out(ff_exp_mdata_out), + .fifo_edata_out(ff_exp_edata_out), // Not used + .fifo_data_out_valid(ff_exp_data_valid), + .fifo_data_out_ready(ff_exp_data_ready), + // Straight output path + .straight_mdata_out(straight_exp_mdata_out), + .straight_edata_out(straight_exp_edata_out), + .straight_data_out_valid(straight_exp_data_out_valid), + .straight_data_out_ready(straight_exp_data_out_ready) + ); + + mxint_accumulator #( + .DATA_IN_0_PRECISION_0(DATA_EXP_0_PRECISION_0), + .DATA_IN_0_PRECISION_1(DATA_EXP_0_PRECISION_1), + .BLOCK_SIZE(DATA_OUT_0_PARALLELISM), + .IN_DEPTH(IN_0_DEPTH), + .UNDERFLOW_BITS(EXP_SUM_UNDERFLOW_BITS) + ) mxint_accumulator_inst ( + .clk(clk), + .rst(rst), + .mdata_in_0(straight_exp_mdata_out), // From split2 straight output + .edata_in_0(straight_exp_edata_out), // From split2 straight output + .data_in_0_valid(straight_exp_data_out_valid), + .data_in_0_ready(straight_exp_data_out_ready), + 
.mdata_out_0(acc_mdata_out), + .edata_out_0(acc_edata_out), + .data_out_0_valid(acc_data_out_valid), + .data_out_0_ready(acc_data_out_ready) + ); + // Replace existing signals + // Replace input_buffer with mxint_circular + mxint_circular #( + .DATA_PRECISION_0(ACC_WIDTH), + .DATA_PRECISION_1(DATA_EXP_0_PRECISION_1), + .IN_NUM(DATA_OUT_0_PARALLELISM), + .REPEAT(IN_0_DEPTH), + .BUFFER_SIZE(1) + ) acc_circular ( + .clk(clk), + .rst(rst), + .mdata_in(acc_mdata_out), + .edata_in(acc_edata_out), + .data_in_valid(acc_data_out_valid), + .data_in_ready(acc_data_out_ready), + .mdata_out(circ_mdata_out), + .edata_out(circ_edata_out), + .data_out_valid(circ_data_out_valid), + .data_out_ready(circ_data_out_ready) + ); + + for (genvar i = 0; i < BLOCK_SIZE; i++) begin : dividend + assign mdata_dividend[i] = ff_exp_mdata_out[i] << EXP_SUM_UNDERFLOW_BITS + DIVISION_UNDERFLOW_BITS; + end + assign edata_dividend = ff_exp_edata_out; + // Add after mxint_circular instance + mxint_div #( + .DATA_DIVIDEND_PRECISION_0(DATA_DIVIDEND_PRECISION_0), + .DATA_DIVIDEND_PRECISION_1(DATA_DIVIDEND_PRECISION_1), + .DATA_DIVISOR_PRECISION_0(DATA_DIVISOR_PRECISION_0), + .DATA_DIVISOR_PRECISION_1(DATA_DIVISOR_PRECISION_1), + .DATA_QUOTIENT_PRECISION_0(DATA_QUOTIENT_PRECISION_0), + .DATA_QUOTIENT_PRECISION_1(DATA_QUOTIENT_PRECISION_1), + .BLOCK_SIZE(DATA_OUT_0_PARALLELISM) + ) div_inst ( + .clk(clk), + .rst(rst), + // Connect dividend (ff_exp_data) + .mdividend_data(mdata_dividend), + .edividend_data(edata_dividend), + .dividend_data_valid(ff_exp_data_valid), + .dividend_data_ready(ff_exp_data_ready), + // Connect divisor (circ_data) + .mdivisor_data(circ_mdata_out), + .edivisor_data(circ_edata_out), + .divisor_data_valid(circ_data_out_valid), + .divisor_data_ready(circ_data_out_ready), + // Connect quotient output + .mquotient_data(mquotient_data), + .equotient_data(equotient_data), + .quotient_data_valid(quotient_data_valid), + .quotient_data_ready(quotient_data_ready) + ); + + + // Add mxint_cast 
instance
+  mxint_cast #(
+      .IN_MAN_WIDTH(DATA_QUOTIENT_PRECISION_0),
+      .IN_MAN_FRAC_WIDTH(DATA_QUOTIENT_FRAC_WIDTH),
+      .IN_EXP_WIDTH(DATA_QUOTIENT_PRECISION_1),
+      .OUT_MAN_WIDTH(DATA_OUT_0_PRECISION_0),
+      .OUT_EXP_WIDTH(DATA_OUT_0_PRECISION_1),
+      .BLOCK_SIZE(DATA_OUT_0_PARALLELISM),
+      .ROUND_BITS(4)
+  ) cast_inst (
+      .clk(clk),
+      .rst(rst),
+      .mdata_in(mquotient_data),
+      .edata_in(equotient_data),
+      .data_in_valid(quotient_data_valid), // Updated
+      .data_in_ready(quotient_data_ready), // Updated
+      .mdata_out(mdata_out_0),
+      .edata_out(edata_out_0),
+      .data_out_valid(data_out_0_valid),
+      .data_out_ready(data_out_0_ready)
+  );
+
+endmodule
diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_straightm_fifoe.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_straightm_fifoe.sv
new file mode 100644
index 000000000..8f9fba918
--- /dev/null
+++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_straightm_fifoe.sv
@@ -0,0 +1,55 @@
+`timescale 1 ns / 1 ps
+// mxint_straightm_fifoe: forks one MXINT block (mantissa vector + shared
+// exponent) into two independent streams — the exponent is buffered through
+// a FIFO while the mantissas pass straight through with no storage.
+module mxint_straightm_fifoe #(
+    parameter DEPTH = 8,      // exponent FIFO depth
+    parameter MAN_WIDTH = 8,  // width of each mantissa element
+    parameter EXP_WIDTH = 8,  // width of the shared exponent
+    parameter IN_SIZE = 8     // number of mantissa elements per block
+) (
+    input clk,
+    input rst,
+    // Input interface
+    input [MAN_WIDTH-1:0] mdata_in[IN_SIZE - 1:0],
+    input [EXP_WIDTH-1:0] edata_in,
+    input logic data_in_valid,
+    output logic data_in_ready,
+    // FIFO output interface
+    output [EXP_WIDTH-1:0] fifo_edata_out,
+    output logic fifo_edata_out_valid,
+    input logic fifo_edata_out_ready,
+    // Straight output interface
+    output [MAN_WIDTH-1:0] straight_mdata_out[IN_SIZE - 1:0],
+    output logic straight_mdata_out_valid,
+    input logic straight_mdata_out_ready
+);
+    logic mdata_in_ready, edata_in_ready;
+    logic mdata_in_valid, edata_in_valid;
+    // Fork the single input handshake into independent mantissa/exponent
+    // handshakes; an input beat completes only when both branches accept it.
+    // NOTE(review): relies on split2's default parameterization being a
+    // 2-way split — confirm against split2.sv.
+    split2 #() data_out_n_split_i (
+        .data_in_valid (data_in_valid),
+        .data_in_ready (data_in_ready),
+        .data_out_valid({mdata_in_valid, edata_in_valid}),
+        .data_out_ready({mdata_in_ready, edata_in_ready})
+    );
+    // Exponent branch: FIFO-buffered so the exponent can be consumed later
+    // than (and independently of) the mantissa stream.
+    fifo #(
+        .DEPTH(DEPTH),
+        .DATA_WIDTH(EXP_WIDTH)
+    ) ff_inst (
+        .clk(clk),
+        .rst(rst),
+        .in_data(edata_in),
+        .in_valid(edata_in_valid),
+        .in_ready(edata_in_ready),
+        .out_data(fifo_edata_out),
+        .out_valid(fifo_edata_out_valid),
+        .out_ready(fifo_edata_out_ready),
+        .empty(),
+        .full()
+    );
+    // Mantissa branch: pure wiring with no buffering — backpressure on
+    // straight_mdata_out stalls the whole input via split2.
+    for (genvar i = 0; i < IN_SIZE; i++) begin
+        assign straight_mdata_out[i] = mdata_in[i];
+    end
+    always_comb begin
+        straight_mdata_out_valid = mdata_in_valid;
+        mdata_in_ready = straight_mdata_out_ready;
+    end
+
+endmodule
diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention.sv
new file mode 100644
index 000000000..2fd6c812e
--- /dev/null
+++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention.sv
@@ -0,0 +1,411 @@
+`timescale 1 ns / 1 ps
+module mxint_vit_attention #(
+    parameter NUM_HEADS = 4,
+
+    // Input data parameters
+    parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 8,
+    parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 2,
+    parameter DATA_IN_0_PARALLELISM_DIM_0 = 4,
+    parameter DATA_IN_0_PARALLELISM_DIM_1 = 2,
+    parameter DATA_IN_0_PRECISION_0 = 8,
+    parameter DATA_IN_0_PRECISION_1 = 3,
+
+    // Weight parameters (shared by Q,K,V)
+    parameter WEIGHT_TENSOR_SIZE_DIM_0 = 8,
+    parameter WEIGHT_TENSOR_SIZE_DIM_1 = 8,
+    parameter WEIGHT_PARALLELISM_DIM_0 = 4,
+    parameter WEIGHT_PARALLELISM_DIM_1 = 4,
+    parameter WEIGHT_PRECISION_0 = 8,
+    parameter WEIGHT_PRECISION_1 = 3,
+
+    // Bias parameters (shared by Q,K,V)
+    parameter HAS_BIAS = 1,
+    parameter BIAS_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1,
+    parameter BIAS_TENSOR_SIZE_DIM_1 = 1,
+    parameter BIAS_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1,
+    parameter BIAS_PARALLELISM_DIM_1 = 1,
+    parameter BIAS_PRECISION_0 = 8,
+    parameter BIAS_PRECISION_1 = 3,
+
+    // Internal precision parameters
+    parameter QKV_PRECISION_0 = 16,
+    parameter QKV_PRECISION_1 = 3,
+
+    // Projection parameters
+    parameter WEIGHT_PROJ_PRECISION_0 = 12,
+    parameter 
WEIGHT_PROJ_PRECISION_1 = 3, + parameter BIAS_PROJ_PRECISION_0 = 8, + parameter BIAS_PROJ_PRECISION_1 = 3, + + // Derived parameters for projection + parameter WEIGHT_PROJ_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_0, + parameter WEIGHT_PROJ_TENSOR_SIZE_DIM_1 = WEIGHT_TENSOR_SIZE_DIM_1, + parameter WEIGHT_PROJ_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_0, + parameter WEIGHT_PROJ_PARALLELISM_DIM_1 = WEIGHT_PARALLELISM_DIM_1, + + parameter BIAS_PROJ_TENSOR_SIZE_DIM_0 = WEIGHT_PROJ_TENSOR_SIZE_DIM_1, + parameter BIAS_PROJ_TENSOR_SIZE_DIM_1 = 1, + parameter BIAS_PROJ_PARALLELISM_DIM_0 = WEIGHT_PROJ_PARALLELISM_DIM_1, + parameter BIAS_PROJ_PARALLELISM_DIM_1 = 1, + + // Derived parameters for output + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_PROJ_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PROJ_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0, + parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1 +) ( + input logic clk, + input logic rst, + + input logic [DATA_IN_0_PRECISION_0-1:0] mdata_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic [DATA_IN_0_PRECISION_1-1:0] edata_in_0, + input logic data_in_0_valid, + output logic data_in_0_ready, + + // Query weights + input logic [WEIGHT_PRECISION_0-1:0] mweight_query [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PRECISION_1-1:0] eweight_query, + input logic query_weight_valid, + output logic query_weight_ready, + + // Query bias + input logic [BIAS_PRECISION_0-1:0] mquery_bias [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PRECISION_1-1:0] equery_bias, + input logic query_bias_valid, + output logic query_bias_ready, + + // Key weights + input logic [WEIGHT_PRECISION_0-1:0] mkey_weight [WEIGHT_PARALLELISM_DIM_0 * 
WEIGHT_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PRECISION_1-1:0] ekey_weight, + input logic key_weight_valid, + output logic key_weight_ready, + + // Key bias + input logic [BIAS_PRECISION_0-1:0] mkey_bias [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PRECISION_1-1:0] ekey_bias, + input logic key_bias_valid, + output logic key_bias_ready, + + // Value weights + input logic [WEIGHT_PRECISION_0-1:0] mvalue_weight [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PRECISION_1-1:0] evalue_weight, + input logic value_weight_valid, + output logic value_weight_ready, + + // Value bias + input logic [BIAS_PRECISION_0-1:0] mvalue_bias [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PRECISION_1-1:0] evalue_bias, + input logic value_bias_valid, + output logic value_bias_ready, + + // Proj weights + input logic [WEIGHT_PROJ_PRECISION_0-1:0] mproj_weight [WEIGHT_PROJ_PARALLELISM_DIM_0 * WEIGHT_PROJ_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PROJ_PRECISION_1-1:0] eproj_weight, + input logic proj_weight_valid, + output logic proj_weight_ready, + + // Proj bias + input logic [BIAS_PROJ_PRECISION_0-1:0] mproj_bias [BIAS_PROJ_PARALLELISM_DIM_0 * BIAS_PROJ_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PROJ_PRECISION_1-1:0] eproj_bias, + input logic proj_bias_valid, + output logic proj_bias_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_0 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0], + output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_0, + output logic data_out_0_valid, + input logic data_out_0_ready +); + + // * Declarations + // * ================================================================= + + localparam HEAD_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1; + localparam HEAD_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1; + localparam HEAD_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1; + localparam HEAD_OUT_0_PARALLELISM_DIM_1 = 
DATA_IN_0_PARALLELISM_DIM_1; + // Query + logic [QKV_PRECISION_0-1:0] query[DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [QKV_PRECISION_1-1:0] equery; + logic joint_query_valid, joint_query_ready; + logic [NUM_HEADS-1:0] split_query_valid, split_query_ready; + + // Key + logic [QKV_PRECISION_0-1:0] key[DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [QKV_PRECISION_1-1:0] ekey; + logic joint_key_valid, joint_key_ready; + logic [NUM_HEADS-1:0] split_key_valid, split_key_ready; + + // Value + logic [QKV_PRECISION_0-1:0] value[DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [QKV_PRECISION_1-1:0] evalue; + logic joint_value_valid, joint_value_ready; + logic [NUM_HEADS-1:0] split_value_valid, split_value_ready; + + logic [QKV_PRECISION_0-1:0] fifo_key[DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0-1:0]; + logic fifo_key_valid, fifo_key_ready; + logic [QKV_PRECISION_0-1:0] mfifo_value[DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [QKV_PRECISION_1-1:0] efifo_value; + logic fifo_value_valid, fifo_value_ready; + + // Head output + logic [QKV_PRECISION_0-1:0] mhead_out [NUM_HEADS-1:0] [HEAD_OUT_0_PARALLELISM_DIM_0 * HEAD_OUT_0_PARALLELISM_DIM_1-1:0]; + logic [QKV_PRECISION_1-1:0] ehead_out [NUM_HEADS-1:0]; + logic [NUM_HEADS-1:0] head_out_valid; + logic [NUM_HEADS-1:0] head_out_ready; + + logic [QKV_PRECISION_0-1:0] mproj_in [HEAD_OUT_0_PARALLELISM_DIM_0 * HEAD_OUT_0_PARALLELISM_DIM_1-1:0]; + logic [QKV_PRECISION_1-1:0] eproj_in; // Add this signal declaration + logic proj_in_valid, proj_in_ready; + + + // * Instances + // * ================================================================= + + mxint_vit_attention_input_block_batched #( + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + 
.DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + + .HAS_BIAS (HAS_BIAS), + .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_TENSOR_SIZE_DIM_1(BIAS_TENSOR_SIZE_DIM_1), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + .BIAS_PARALLELISM_DIM_1(BIAS_PARALLELISM_DIM_1), + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + + .DATA_OUT_0_PRECISION_0(QKV_PRECISION_0), + .DATA_OUT_0_PRECISION_1(QKV_PRECISION_1) + ) batched_input_block_i ( + .clk(clk), + .rst(rst), + + .mdata_in_0(mdata_in_0), + .edata_in_0(edata_in_0), + .data_in_0_valid(data_in_0_valid), + .data_in_0_ready(data_in_0_ready), + + // Query parameters + .mweight_query(mweight_query), + .eweight_query(eweight_query), + .weight_query_valid(query_weight_valid), + .weight_query_ready(query_weight_ready), + + .mbias_query(mquery_bias), + .ebias_query(equery_bias), + .bias_query_valid(query_bias_valid), + .bias_query_ready(query_bias_ready), + + // Key parameters + .mweight_key(mkey_weight), + .eweight_key(ekey_weight), + .weight_key_valid(key_weight_valid), + .weight_key_ready(key_weight_ready), + + .mbias_key(mkey_bias), + .ebias_key(ekey_bias), + .bias_key_valid(key_bias_valid), + .bias_key_ready(key_bias_ready), + + // Value parameters + .mweight_value(mvalue_weight), + .eweight_value(evalue_weight), + .weight_value_valid(value_weight_valid), + .weight_value_ready(value_weight_ready), + + .mbias_value(mvalue_bias), + .ebias_value(evalue_bias), + .bias_value_valid(value_bias_valid), + .bias_value_ready(value_bias_ready), + + // Query output + 
.mdata_out_query(query), + .edata_out_query(equery), + .data_out_query_valid(joint_query_valid), + .data_out_query_ready(joint_query_ready), + + // Key output + .mdata_out_key(key), + .edata_out_key(ekey), + .data_out_key_valid(joint_key_valid), + .data_out_key_ready(joint_key_ready), + + // Value output + .mdata_out_value(mfifo_value), + .edata_out_value(efifo_value), + .data_out_value_valid(fifo_value_valid), + .data_out_value_ready(fifo_value_ready) + ); + + unpacked_mx_fifo #( + .DEPTH(DATA_IN_0_TENSOR_SIZE_DIM_0 * DATA_IN_0_TENSOR_SIZE_DIM_1 / (DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0)), + .MAN_WIDTH(QKV_PRECISION_0), + .EXP_WIDTH(QKV_PRECISION_1), + .IN_SIZE(DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0) + ) value_in_buffer ( + .clk(clk), + .rst(rst), + .mdata_in(mfifo_value), + .edata_in(efifo_value), + .data_in_valid(fifo_value_valid), + .data_in_ready(fifo_value_ready), + .mdata_out(value), + .edata_out(evalue), + .data_out_valid(joint_value_valid), + .data_out_ready(joint_value_ready) + ); + + // * Scatter query, key, value + + self_attention_head_scatter #( + .NUM_HEADS(NUM_HEADS), + + // Fix parameter names to match module definition + .IN_DATA_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .IN_DATA_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .IN_DATA_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .IN_DATA_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1) + ) scatter_qkv_i ( + .clk, + .rst, + + .query_valid(joint_query_valid), + .query_ready(joint_query_ready), + + .key_valid(joint_key_valid), + .key_ready(joint_key_ready), + + .value_valid(joint_value_valid), + .value_ready(joint_value_ready), + + .split_query_valid(split_query_valid), + .split_query_ready(split_query_ready), + + .split_key_valid(split_key_valid), + .split_key_ready(split_key_ready), + + .split_value_valid(split_value_valid), + .split_value_ready(split_value_ready) + ); + + // * Heads + + for (genvar head = 0; head < NUM_HEADS; head++) begin : 
g_attention_head + mxint_vit_attention_head #( + .IN_DATA_TENSOR_SIZE_DIM_0 (HEAD_OUT_0_TENSOR_SIZE_DIM_0 / NUM_HEADS), + .IN_DATA_TENSOR_SIZE_DIM_1 (HEAD_OUT_0_TENSOR_SIZE_DIM_1), + .IN_DATA_PARALLELISM_DIM_0 (HEAD_OUT_0_PARALLELISM_DIM_0), + .IN_DATA_PARALLELISM_DIM_1 (HEAD_OUT_0_PARALLELISM_DIM_1), + .IN_DATA_PRECISION_0 (QKV_PRECISION_0), + .IN_DATA_PRECISION_1 (QKV_PRECISION_1), + .OUT_DATA_PRECISION_0 (QKV_PRECISION_0), + .OUT_DATA_PRECISION_1 (QKV_PRECISION_1) + ) head_i ( + .clk, + .rst, + + .mquery (query), + .equery (equery), + .query_valid(split_query_valid[head]), + .query_ready(split_query_ready[head]), + + .mkey (key), + .ekey (ekey), + .key_valid(split_key_valid[head]), + .key_ready(split_key_ready[head]), + + .mvalue (value), + .evalue (evalue), + .value_valid(split_value_valid[head]), + .value_ready(split_value_ready[head]), + + .mout (mhead_out[head]), + .eout (ehead_out[head]), + .out_valid(head_out_valid[head]), + .out_ready(head_out_ready[head]) + ); + end + + // * Gather heads + + mxint_vit_attention_head_gather #( + .NUM_HEADS(NUM_HEADS), + .IN_DATA_TENSOR_SIZE_DIM_0(HEAD_OUT_0_TENSOR_SIZE_DIM_0), + .IN_DATA_TENSOR_SIZE_DIM_1(HEAD_OUT_0_TENSOR_SIZE_DIM_1), + .IN_DATA_PARALLELISM_DIM_0(HEAD_OUT_0_PARALLELISM_DIM_0), + .IN_DATA_PARALLELISM_DIM_1(HEAD_OUT_0_PARALLELISM_DIM_1), + .MAN_WIDTH(QKV_PRECISION_0), + .EXP_WIDTH(QKV_PRECISION_1) + ) gather_qkv_i ( + .clk, + .rst, + .msplit_head_out(mhead_out), + .esplit_head_out(ehead_out), + .split_head_out_valid(head_out_valid), + .split_head_out_ready(head_out_ready), + .mupdated_tokens(mproj_in), + .eupdated_tokens(eproj_in), + .updated_tokens_valid(proj_in_valid), + .updated_tokens_ready(proj_in_ready) + ); + + mxint_linear #( + .HAS_BIAS (HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (QKV_PRECISION_0), + .DATA_IN_0_PRECISION_1 (QKV_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(HEAD_OUT_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(HEAD_OUT_0_TENSOR_SIZE_DIM_1), + 
.DATA_IN_0_PARALLELISM_DIM_0(HEAD_OUT_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(HEAD_OUT_0_PARALLELISM_DIM_1), + + .WEIGHT_PRECISION_0 (WEIGHT_PROJ_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PROJ_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_PROJ_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_PROJ_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PROJ_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PROJ_PARALLELISM_DIM_1), + + .BIAS_PRECISION_0 (BIAS_PROJ_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PROJ_PRECISION_1), + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + ) proj ( + .clk(clk), + .rst(rst), + + // input port for data_inivations + .mdata_in_0 (mproj_in), + .edata_in_0 (eproj_in), + .data_in_0_valid (proj_in_valid), + .data_in_0_ready (proj_in_ready), + + // input port for weight + .mweight (mproj_weight), + .eweight(eproj_weight), + .weight_valid(proj_weight_valid), + .weight_ready(proj_weight_ready), + + .mbias (mproj_bias), + .ebias(eproj_bias), + .bias_valid(proj_bias_valid), + .bias_ready(proj_bias_ready), + + .mdata_out_0(mdata_out_0), + .edata_out_0(edata_out_0), + .data_out_0_valid(data_out_0_valid), + .data_out_0_ready(data_out_0_ready) + ); +endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_head.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_head.sv new file mode 100644 index 000000000..8b8402303 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_head.sv @@ -0,0 +1,153 @@ +`timescale 1ns / 1ps +module mxint_vit_attention_head #( + // Input dimensions and parallelism + parameter IN_DATA_TENSOR_SIZE_DIM_0 = 32, + parameter IN_DATA_TENSOR_SIZE_DIM_1 = 10, + parameter IN_DATA_PARALLELISM_DIM_0 = 2, + parameter IN_DATA_PARALLELISM_DIM_1 = 2, + parameter IN_DATA_PRECISION_0 = 16, + parameter IN_DATA_PRECISION_1 = 3, + + // Output dimensions + 
parameter OUT_DATA_TENSOR_SIZE_DIM_0 = IN_DATA_TENSOR_SIZE_DIM_0, + parameter OUT_DATA_TENSOR_SIZE_DIM_1 = IN_DATA_TENSOR_SIZE_DIM_1, + parameter OUT_DATA_PARALLELISM_DIM_0 = IN_DATA_PARALLELISM_DIM_0, + parameter OUT_DATA_PARALLELISM_DIM_1 = IN_DATA_PARALLELISM_DIM_1, + parameter OUT_DATA_PRECISION_0 = IN_DATA_PRECISION_0, + parameter OUT_DATA_PRECISION_1 = IN_DATA_PRECISION_1 +) ( + input logic clk, + input logic rst, + + // Query inputs + input logic [IN_DATA_PRECISION_0-1:0] mquery [IN_DATA_PARALLELISM_DIM_0*IN_DATA_PARALLELISM_DIM_1-1:0], + input logic [IN_DATA_PRECISION_1-1:0] equery, + input logic query_valid, + output logic query_ready, + + // Key inputs + input logic [IN_DATA_PRECISION_0-1:0] mkey [IN_DATA_PARALLELISM_DIM_0*IN_DATA_PARALLELISM_DIM_1-1:0], + input logic [IN_DATA_PRECISION_1-1:0] ekey, + input logic key_valid, + output logic key_ready, + + // Value inputs + input logic [IN_DATA_PRECISION_0-1:0] mvalue [IN_DATA_PARALLELISM_DIM_0*IN_DATA_PARALLELISM_DIM_1-1:0], + input logic [IN_DATA_PRECISION_1-1:0] evalue, + input logic value_valid, + output logic value_ready, + + // Outputs + output logic [OUT_DATA_PRECISION_0-1:0] mout [OUT_DATA_PARALLELISM_DIM_0*OUT_DATA_PARALLELISM_DIM_1-1:0], + output logic [OUT_DATA_PRECISION_1-1:0] eout, + output logic out_valid, + input logic out_ready +); + + // QK matmul signals + logic [IN_DATA_PRECISION_0-1:0] qk_mout [IN_DATA_PARALLELISM_DIM_1 * IN_DATA_PARALLELISM_DIM_1-1:0]; + logic [IN_DATA_PRECISION_1-1:0] qk_eout; + logic qk_valid, qk_ready; + + // Softmax signals + logic [IN_DATA_PRECISION_0-1:0] sm_mout [IN_DATA_PARALLELISM_DIM_1 * IN_DATA_PARALLELISM_DIM_1-1:0]; + logic [IN_DATA_PRECISION_1-1:0] sm_eout; + logic sm_valid, sm_ready; + + // First compute Q * K^T using mxint_linear + mxint_linear #( + .DATA_IN_0_PRECISION_0(IN_DATA_PRECISION_0), + .DATA_IN_0_PRECISION_1(IN_DATA_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(IN_DATA_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(IN_DATA_TENSOR_SIZE_DIM_1), 
+ .DATA_IN_0_PARALLELISM_DIM_0(IN_DATA_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(IN_DATA_PARALLELISM_DIM_1), + + .WEIGHT_PRECISION_0(IN_DATA_PRECISION_0), + .WEIGHT_PRECISION_1(IN_DATA_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(IN_DATA_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(IN_DATA_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(IN_DATA_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(IN_DATA_PARALLELISM_DIM_1), + + .DATA_OUT_0_PRECISION_0(IN_DATA_PRECISION_0), + .DATA_OUT_0_PRECISION_1(IN_DATA_PRECISION_1), + .HAS_BIAS(0) + ) query_key_linear ( + .clk(clk), + .rst(rst), + .mdata_in_0(mquery), + .edata_in_0(equery), + .data_in_0_valid(query_valid), + .data_in_0_ready(query_ready), + .mweight(mkey), + .eweight(ekey), + .weight_valid(key_valid), + .weight_ready(key_ready), + .mbias(), // Not used since HAS_BIAS=0 + .ebias(), + .bias_valid(1'b1), + .bias_ready(), + .mdata_out_0(qk_mout), + .edata_out_0(qk_eout), + .data_out_0_valid(qk_valid), + .data_out_0_ready(qk_ready) + ); + + // Apply softmax to QK^T result + mxint_softmax #( + .DATA_IN_0_PRECISION_0(IN_DATA_PRECISION_0), + .DATA_IN_0_PRECISION_1(IN_DATA_PRECISION_1), + .DATA_IN_0_DIM(IN_DATA_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM(IN_DATA_PARALLELISM_DIM_1), + .DATA_OUT_0_PRECISION_0(OUT_DATA_PRECISION_0), + .DATA_OUT_0_PRECISION_1(OUT_DATA_PRECISION_1), + .DATA_OUT_0_DIM(IN_DATA_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_PARALLELISM(IN_DATA_PARALLELISM_DIM_1) + ) attention_softmax ( + .clk(clk), + .rst(rst), + .mdata_in_0(qk_mout), + .edata_in_0(qk_eout), + .data_in_0_valid(qk_valid), + .data_in_0_ready(qk_ready), + .mdata_out_0(sm_mout), + .edata_out_0(sm_eout), + .data_out_0_valid(sm_valid), + .data_out_0_ready(sm_ready) + ); + + // Compute softmax(QK^T)V + mxint_matmul #( + .A_TOTAL_DIM0(IN_DATA_TENSOR_SIZE_DIM_1), + .A_TOTAL_DIM1(IN_DATA_TENSOR_SIZE_DIM_1), + .B_TOTAL_DIM0(IN_DATA_TENSOR_SIZE_DIM_0), + .B_TOTAL_DIM1(IN_DATA_TENSOR_SIZE_DIM_1), + .A_COMPUTE_DIM0(IN_DATA_PARALLELISM_DIM_1), + 
.A_COMPUTE_DIM1(IN_DATA_PARALLELISM_DIM_1),
+      .B_COMPUTE_DIM0(IN_DATA_PARALLELISM_DIM_0),
+      .B_COMPUTE_DIM1(IN_DATA_PARALLELISM_DIM_1),
+      .A_MAN_WIDTH(OUT_DATA_PRECISION_0),
+      .A_EXP_WIDTH(OUT_DATA_PRECISION_1),
+      .B_MAN_WIDTH(IN_DATA_PRECISION_0),
+      .B_EXP_WIDTH(IN_DATA_PRECISION_1),
+      .OUT_MAN_WIDTH(OUT_DATA_PRECISION_0),
+      .OUT_EXP_WIDTH(OUT_DATA_PRECISION_1)
+  ) attention_value_matmul (
+      .clk(clk),
+      .rst(rst),
+      .ma_data(sm_mout),
+      .ea_data(sm_eout),
+      .a_valid(sm_valid),
+      .a_ready(sm_ready),
+      .mb_data(mvalue),
+      .eb_data(evalue),
+      .b_valid(value_valid),
+      .b_ready(value_ready),
+      .mout_data(mout),
+      .eout_data(eout),
+      .out_valid(out_valid),
+      .out_ready(out_ready)
+  );
+
+endmodule
diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_head_gather.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_head_gather.sv
new file mode 100644
index 000000000..b0b63ea60
--- /dev/null
+++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_head_gather.sv
@@ -0,0 +1,74 @@
+`timescale 1ns / 1ps
+// mxint_vit_attention_head_gather: serializes NUM_HEADS per-head MXINT output
+// streams onto a single stream. The lowest-indexed head that has not yet
+// delivered all of its blocks is selected and drained; once every head has
+// been flushed, the per-head counters reset and the next round begins.
+module mxint_vit_attention_head_gather #(
+    parameter NUM_HEADS = 12,
+
+    parameter IN_DATA_TENSOR_SIZE_DIM_0 = 64,
+    parameter IN_DATA_TENSOR_SIZE_DIM_1 = 32,
+    parameter IN_DATA_PARALLELISM_DIM_0 = 4,
+    parameter IN_DATA_PARALLELISM_DIM_1 = 4,
+    parameter MAN_WIDTH = 16,
+    parameter EXP_WIDTH = 3
+
+) (
+    input logic clk,
+    input logic rst,
+
+    // Per-head input streams: one mantissa block + shared exponent per head
+    input logic [MAN_WIDTH-1:0] msplit_head_out [NUM_HEADS-1:0] [IN_DATA_PARALLELISM_DIM_0*IN_DATA_PARALLELISM_DIM_1-1:0],
+    input logic [EXP_WIDTH-1:0] esplit_head_out [NUM_HEADS-1:0],
+    input logic [NUM_HEADS-1:0] split_head_out_valid,
+    output logic [NUM_HEADS-1:0] split_head_out_ready,
+
+    // Gathered single output stream
+    output logic [MAN_WIDTH-1:0] mupdated_tokens [IN_DATA_PARALLELISM_DIM_0*IN_DATA_PARALLELISM_DIM_1-1:0],
+    output logic [EXP_WIDTH-1:0] eupdated_tokens,
+    output logic updated_tokens_valid,
+    input logic updated_tokens_ready
+);
+
+    // Derived sizes. NOTE(review): these are derived values — `localparam`
+    // would prevent accidental override at instantiation; TODO confirm intent.
+    parameter IN_DATA_DEPTH = IN_DATA_TENSOR_SIZE_DIM_0 / IN_DATA_PARALLELISM_DIM_0;
+    parameter BLOCKS_PER_HEAD = IN_DATA_DEPTH / NUM_HEADS;
+
+    // Block counters
+    // One extra bit so each counter can hold the terminal value BLOCKS_PER_HEAD.
+    logic [NUM_HEADS-1:0][$clog2(BLOCKS_PER_HEAD):0] block_counter;
+    logic [NUM_HEADS-1:0] heads_flushed;
+    logic [$clog2(NUM_HEADS)-1:0] head_flushing_idx;
+
+    // * Count the number of blocks received for each head
+    for (genvar head = 0; head < NUM_HEADS; head++) begin
+        always_ff @(posedge clk) begin
+            if (rst) begin
+                block_counter[head] <= '0;
+            end else if (split_head_out_valid[head] & split_head_out_ready[head]) begin
+                if (block_counter[head] != BLOCKS_PER_HEAD) begin
+                    block_counter[head] <= block_counter[head] + 1'b1;
+                end else begin
+                    // Beat accepted while already flushed: restart the count at 1.
+                    // NOTE(review): ready is gated off for flushed heads below,
+                    // so this branch looks unreachable — confirm.
+                    block_counter[head] <= 1'b1;
+                end
+            end else if (heads_flushed == '1) begin
+                // All heads drained for this round: clear counters to start over.
+                block_counter[head] <= '0;
+            end
+        end
+
+        // A head is "flushed" once it has delivered all its blocks this round.
+        assign heads_flushed[head] = (block_counter[head] == BLOCKS_PER_HEAD);
+    end
+
+    // * Find index of first head that hasn't finished
+    find_first_arbiter #(
+        .NUM_REQUESTERS(NUM_HEADS)
+    ) ff_arb_i (
+        .request (~heads_flushed),
+        .grant_oh (),
+        .grant_bin(head_flushing_idx)
+    );
+
+    // * Drive output interfaces with mantissa and exponent
+    // Mux the selected head's data and valid onto the gathered stream.
+    assign mupdated_tokens = msplit_head_out[head_flushing_idx];
+    assign eupdated_tokens = esplit_head_out[head_flushing_idx];
+    assign updated_tokens_valid = split_head_out_valid[head_flushing_idx];
+
+    // Only the currently-selected head sees downstream ready.
+    for (genvar head = 0; head < NUM_HEADS; head++) begin
+        always_comb begin
+            split_head_out_ready[head] = updated_tokens_ready && (head_flushing_idx == head);
+        end
+    end
+
+endmodule
diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_input_block_batched.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_input_block_batched.sv
new file mode 100644
index 000000000..5e8c04af7
--- /dev/null
+++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_input_block_batched.sv
@@ -0,0 +1,315 @@
+`timescale 1ns / 1ps
+module mxint_vit_attention_input_block_batched
#( + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 768, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 4, + parameter DATA_IN_0_PARALLELISM_DIM_1 = 4, + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + + parameter WEIGHT_TENSOR_SIZE_DIM_0 = 768, + parameter WEIGHT_TENSOR_SIZE_DIM_1 = 768, + parameter WEIGHT_PARALLELISM_DIM_0 = 4, + parameter WEIGHT_PARALLELISM_DIM_1 = 4, + parameter WEIGHT_PRECISION_0 = 16, + parameter WEIGHT_PRECISION_1 = 3, + + parameter HAS_BIAS = 1, + parameter BIAS_TENSOR_SIZE_DIM_0 = 64, + parameter BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter BIAS_PARALLELISM_DIM_0 = 4, + parameter BIAS_PARALLELISM_DIM_1 = 1, + parameter BIAS_PRECISION_0 = 16, + parameter BIAS_PRECISION_1 = 3, + + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PRECISION_0 = 16, + parameter DATA_OUT_0_PRECISION_1 = 3 + +) ( + input logic clk, + input logic rst, + + input logic [DATA_IN_0_PRECISION_0-1:0] mdata_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic [DATA_IN_0_PRECISION_1-1:0] edata_in_0, + input logic data_in_0_valid, + output logic data_in_0_ready, + + // Query weights + input logic [WEIGHT_PRECISION_0-1:0] mweight_query [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PRECISION_1-1:0] eweight_query, + input logic weight_query_valid, + output logic weight_query_ready, + + // Query bias + input logic [BIAS_PRECISION_0-1:0] mbias_query [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PRECISION_1-1:0] ebias_query, + input logic bias_query_valid, + output logic bias_query_ready, + + // Key weights + input logic [WEIGHT_PRECISION_0-1:0] mweight_key 
[WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PRECISION_1-1:0] eweight_key, + input logic weight_key_valid, + output logic weight_key_ready, + + // Key bias + input logic [BIAS_PRECISION_0-1:0] mbias_key [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PRECISION_1-1:0] ebias_key, + input logic bias_key_valid, + output logic bias_key_ready, + + // Value weights + input logic [WEIGHT_PRECISION_0-1:0] mweight_value [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PRECISION_1-1:0] eweight_value, + input logic weight_value_valid, + output logic weight_value_ready, + + // Value bias + input logic [BIAS_PRECISION_0-1:0] mbias_value [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PRECISION_1-1:0] ebias_value, + input logic bias_value_valid, + output logic bias_value_ready, + + // Query + output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_query [DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0], + output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_query, + output logic data_out_query_valid, + input logic data_out_query_ready, + + // Key + output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_key [DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0], + output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_key, + output logic data_out_key_valid, + input logic data_out_key_ready, + + // Value + output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_value [DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0], + output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_value, + output logic data_out_value_valid, + input logic data_out_value_ready +); + + // ! 
TO DO: add assertions about bias parallelism matching weight parallelism + + // * Inferred parameters + parameter DATA_IN_0_DEPTH_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1 / DATA_IN_0_PARALLELISM_DIM_1; + parameter WEIGHT_DEPTH_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_0 / WEIGHT_PARALLELISM_DIM_0; + + // * Declarations + // * ================================================================= + + logic query_data_in_valid, query_data_in_ready; + logic key_data_in_valid, key_data_in_ready; + logic value_data_in_valid, value_data_in_ready; + + logic [DATA_OUT_0_PRECISION_0-1:0] query_buffer_mdata [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_1-1:0]; + logic [DATA_OUT_0_PRECISION_1-1:0] query_buffer_edata; + logic query_buffer_valid; + logic query_buffer_ready; + + // * Instances + // * ================================================================= + + // * Split the incoming data over the QKV projections + split_n #( + .N(3) + ) split_i ( + .data_in_valid (data_in_0_valid), + .data_in_ready (data_in_0_ready), + .data_out_valid({query_data_in_valid, key_data_in_valid, value_data_in_valid}), + .data_out_ready({query_data_in_ready, key_data_in_ready, value_data_in_ready}) + ); + + // * Query linear + + mxint_linear #( + .HAS_BIAS (HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), + + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + 
.BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_TENSOR_SIZE_DIM_1(BIAS_TENSOR_SIZE_DIM_1), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + .BIAS_PARALLELISM_DIM_1(BIAS_PARALLELISM_DIM_1), + + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + + ) mxint_linear_query ( + .clk, + .rst, + + // input port for data_inivations + .mdata_in_0 (mdata_in_0), + .edata_in_0 (edata_in_0), + .data_in_0_valid(query_data_in_valid), + .data_in_0_ready(query_data_in_ready), + + // input port for weight + .mweight (mweight_query), + .eweight (eweight_query), + .weight_valid(weight_query_valid), + .weight_ready(weight_query_ready), + + .mbias (mbias_query), + .ebias (ebias_query), + .bias_valid(bias_query_valid), + .bias_ready(bias_query_ready), + + .mdata_out_0 (query_buffer_mdata), + .edata_out_0 (query_buffer_edata), + .data_out_0_valid(query_buffer_valid), + .data_out_0_ready(query_buffer_ready) + ); + + // * We must buffer the queries to latency match the key transpose path + // * since the matmul for QK^T buffers K^T but streams Q + unpacked_mx_fifo #( + .MAN_WIDTH(DATA_OUT_0_PRECISION_0), + .EXP_WIDTH(DATA_OUT_0_PRECISION_1), + .IN_SIZE(DATA_OUT_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1), + .DEPTH(DATA_IN_0_DEPTH_DIM_1 * DATA_OUT_0_TENSOR_SIZE_DIM_0 / DATA_OUT_0_PARALLELISM_DIM_0) + ) query_buffer_i ( + .clk(clk), + .rst(rst), + .mdata_in(query_buffer_mdata), + .edata_in(query_buffer_edata), + .data_in_valid(query_buffer_valid), + .data_in_ready(query_buffer_ready), + .mdata_out(mdata_out_query), + .edata_out(edata_out_query), + .data_out_valid(data_out_query_valid), + .data_out_ready(data_out_query_ready) + ); + + // * Key linear + + mxint_linear #( + .HAS_BIAS (HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + 
.DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), + + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_TENSOR_SIZE_DIM_1(BIAS_TENSOR_SIZE_DIM_1), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + .BIAS_PARALLELISM_DIM_1(BIAS_PARALLELISM_DIM_1), + + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + + ) mxint_linear_key ( + .clk, + .rst, + + // input port for data_inivations + .mdata_in_0 (mdata_in_0), + .edata_in_0 (edata_in_0), + .data_in_0_valid(key_data_in_valid), + .data_in_0_ready(key_data_in_ready), + + // input port for weight + .mweight (mweight_key), + .eweight (eweight_key), + .weight_valid(weight_key_valid), + .weight_ready(weight_key_ready), + + .mbias (mbias_key), + .ebias (ebias_key), + .bias_valid(bias_key_valid), + .bias_ready(bias_key_ready), + + .mdata_out_0 (mdata_out_key), + .edata_out_0 (edata_out_key), + .data_out_0_valid(data_out_key_valid), + .data_out_0_ready(data_out_key_ready) + ); + + // * Value linear + + mxint_linear #( + .HAS_BIAS (HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), + + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), 
+ .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_TENSOR_SIZE_DIM_1(BIAS_TENSOR_SIZE_DIM_1), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + .BIAS_PARALLELISM_DIM_1(BIAS_PARALLELISM_DIM_1), + + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + + ) mxint_linear_value ( + .clk, + .rst, + + // input port for data_inivations + .mdata_in_0 (mdata_in_0), + .edata_in_0 (edata_in_0), + .data_in_0_valid(value_data_in_valid), + .data_in_0_ready(value_data_in_ready), + + // input port for weight + .mweight (mweight_value), + .eweight (eweight_value), + .weight_valid(weight_value_valid), + .weight_ready(weight_value_ready), + + .mbias (mbias_value), + .ebias (ebias_value), + .bias_valid(bias_value_valid), + .bias_ready(bias_value_ready), + + .mdata_out_0 (mdata_out_value), + .edata_out_0 (edata_out_value), + .data_out_0_valid(data_out_value_valid), + .data_out_0_ready(data_out_value_ready) + ); + +endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_wrap.sv b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_wrap.sv new file mode 100644 index 000000000..2a1af9c8b --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/mxint_vit_attention_wrap.sv @@ -0,0 +1,261 @@ +// File: mxint_vit_attention.sv +module mxint_vit_attention_wrap #( + parameter DATA_IN_0_PRECISION_0 = 8, + parameter DATA_IN_0_PRECISION_1 = 8, + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 12, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 2, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 10, + parameter DATA_IN_0_PARALLELISM_DIM_1 = 1, + parameter DATA_IN_0_TENSOR_SIZE_DIM_2 = 1, + parameter DATA_IN_0_PARALLELISM_DIM_2 = 1, + parameter 
QUERY_WEIGHT_PRECISION_0 = 8, + parameter QUERY_WEIGHT_PRECISION_1 = 4, + parameter QUERY_WEIGHT_TENSOR_SIZE_DIM_0 = 12, + parameter QUERY_WEIGHT_PARALLELISM_DIM_0 = 2, + parameter QUERY_WEIGHT_TENSOR_SIZE_DIM_1 = 12, + parameter QUERY_WEIGHT_PARALLELISM_DIM_1 = 2, + parameter QUERY_BIAS_PRECISION_0 = 4, + parameter QUERY_BIAS_PRECISION_1 = 8, + parameter QUERY_BIAS_TENSOR_SIZE_DIM_0 = 12, + parameter QUERY_BIAS_PARALLELISM_DIM_0 = 2, + parameter QUERY_BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter QUERY_BIAS_PARALLELISM_DIM_1 = 1, + parameter KEY_WEIGHT_PRECISION_0 = 8, + parameter KEY_WEIGHT_PRECISION_1 = 4, + parameter KEY_WEIGHT_TENSOR_SIZE_DIM_0 = 12, + parameter KEY_WEIGHT_PARALLELISM_DIM_0 = 2, + parameter KEY_WEIGHT_TENSOR_SIZE_DIM_1 = 12, + parameter KEY_WEIGHT_PARALLELISM_DIM_1 = 2, + parameter KEY_BIAS_PRECISION_0 = 4, + parameter KEY_BIAS_PRECISION_1 = 8, + parameter KEY_BIAS_TENSOR_SIZE_DIM_0 = 12, + parameter KEY_BIAS_PARALLELISM_DIM_0 = 2, + parameter KEY_BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter KEY_BIAS_PARALLELISM_DIM_1 = 1, + parameter VALUE_WEIGHT_PRECISION_0 = 8, + parameter VALUE_WEIGHT_PRECISION_1 = 4, + parameter VALUE_WEIGHT_TENSOR_SIZE_DIM_0 = 12, + parameter VALUE_WEIGHT_PARALLELISM_DIM_0 = 2, + parameter VALUE_WEIGHT_TENSOR_SIZE_DIM_1 = 12, + parameter VALUE_WEIGHT_PARALLELISM_DIM_1 = 2, + parameter VALUE_BIAS_PRECISION_0 = 4, + parameter VALUE_BIAS_PRECISION_1 = 8, + parameter VALUE_BIAS_TENSOR_SIZE_DIM_0 = 12, + parameter VALUE_BIAS_PARALLELISM_DIM_0 = 2, + parameter VALUE_BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter VALUE_BIAS_PARALLELISM_DIM_1 = 1, + parameter PROJ_WEIGHT_PRECISION_0 = 8, + parameter PROJ_WEIGHT_PRECISION_1 = 4, + parameter PROJ_WEIGHT_TENSOR_SIZE_DIM_0 = 12, + parameter PROJ_WEIGHT_PARALLELISM_DIM_0 = 2, + parameter PROJ_WEIGHT_TENSOR_SIZE_DIM_1 = 12, + parameter PROJ_WEIGHT_PARALLELISM_DIM_1 = 2, + parameter PROJ_BIAS_PRECISION_0 = 4, + parameter PROJ_BIAS_PRECISION_1 = 8, + parameter PROJ_BIAS_TENSOR_SIZE_DIM_0 = 12, + 
parameter PROJ_BIAS_PARALLELISM_DIM_0 = 2, + parameter PROJ_BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter PROJ_BIAS_PARALLELISM_DIM_1 = 1, + parameter DATA_OUT_0_PRECISION_0 = 4, + parameter DATA_OUT_0_PRECISION_1 = 8, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = 12, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = 2, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = 10, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = 1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_2 = 1, + parameter DATA_OUT_0_PARALLELISM_DIM_2 = 1, + parameter NUM_HEADS = 3, + parameter HAS_BIAS = 1, + localparam WEIGHT_PRECISION_0 = QUERY_WEIGHT_PRECISION_0, + localparam WEIGHT_PRECISION_1 = QUERY_WEIGHT_PRECISION_1, + localparam BIAS_PRECISION_0 = QUERY_BIAS_PRECISION_0, + localparam BIAS_PRECISION_1 = QUERY_BIAS_PRECISION_1, + localparam WEIGHT_PROJ_PRECISION_0 = PROJ_WEIGHT_PRECISION_0, + localparam WEIGHT_PROJ_PRECISION_1 = PROJ_WEIGHT_PRECISION_1, + localparam BIAS_PROJ_PRECISION_0 = PROJ_BIAS_PRECISION_0, + localparam BIAS_PROJ_PRECISION_1 = PROJ_BIAS_PRECISION_1, + localparam WEIGHT_TENSOR_SIZE_DIM_0 = QUERY_WEIGHT_TENSOR_SIZE_DIM_0, + localparam WEIGHT_TENSOR_SIZE_DIM_1 = QUERY_WEIGHT_TENSOR_SIZE_DIM_1, + localparam WEIGHT_PARALLELISM_DIM_0 = QUERY_WEIGHT_PARALLELISM_DIM_0, + localparam WEIGHT_PARALLELISM_DIM_1 = QUERY_WEIGHT_PARALLELISM_DIM_1, + localparam BIAS_TENSOR_SIZE_DIM_0 = QUERY_BIAS_TENSOR_SIZE_DIM_0, + localparam BIAS_TENSOR_SIZE_DIM_1 = QUERY_BIAS_TENSOR_SIZE_DIM_1, + localparam BIAS_PARALLELISM_DIM_0 = QUERY_BIAS_PARALLELISM_DIM_0, + localparam BIAS_PARALLELISM_DIM_1 = QUERY_BIAS_PARALLELISM_DIM_1, + localparam WEIGHT_PROJ_TENSOR_SIZE_DIM_0 = PROJ_WEIGHT_TENSOR_SIZE_DIM_0, + localparam WEIGHT_PROJ_TENSOR_SIZE_DIM_1 = PROJ_WEIGHT_TENSOR_SIZE_DIM_1, + localparam WEIGHT_PROJ_PARALLELISM_DIM_0 = PROJ_WEIGHT_PARALLELISM_DIM_0, + localparam WEIGHT_PROJ_PARALLELISM_DIM_1 = PROJ_WEIGHT_PARALLELISM_DIM_1, + localparam BIAS_PROJ_TENSOR_SIZE_DIM_0 = PROJ_BIAS_TENSOR_SIZE_DIM_0, + localparam 
BIAS_PROJ_TENSOR_SIZE_DIM_1 = PROJ_BIAS_TENSOR_SIZE_DIM_1, + localparam BIAS_PROJ_PARALLELISM_DIM_0 = PROJ_BIAS_PARALLELISM_DIM_0, + localparam BIAS_PROJ_PARALLELISM_DIM_1 = PROJ_BIAS_PARALLELISM_DIM_1, + localparam QKV_PRECISION_0 = DATA_IN_0_PRECISION_0, + localparam QKV_PRECISION_1 = DATA_IN_0_PRECISION_1 +) ( + input logic clk, + input logic rst, + + input logic [DATA_IN_0_PRECISION_0-1:0] mdata_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic [DATA_IN_0_PRECISION_1-1:0] edata_in_0, + input logic data_in_0_valid, + output logic data_in_0_ready, + + // Query weights + input logic [WEIGHT_PRECISION_0-1:0] mquery_weight [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PRECISION_1-1:0] equery_weight, + input logic query_weight_valid, + output logic query_weight_ready, + + // Query bias + input logic [BIAS_PRECISION_0-1:0] mquery_bias [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PRECISION_1-1:0] equery_bias, + input logic query_bias_valid, + output logic query_bias_ready, + + // Key weights + input logic [WEIGHT_PRECISION_0-1:0] mkey_weight [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PRECISION_1-1:0] ekey_weight, + input logic key_weight_valid, + output logic key_weight_ready, + + // Key bias + input logic [BIAS_PRECISION_0-1:0] mkey_bias [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PRECISION_1-1:0] ekey_bias, + input logic key_bias_valid, + output logic key_bias_ready, + + // Value weights + input logic [WEIGHT_PRECISION_0-1:0] mvalue_weight [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PRECISION_1-1:0] evalue_weight, + input logic value_weight_valid, + output logic value_weight_ready, + + // Value bias + input logic [BIAS_PRECISION_0-1:0] mvalue_bias [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PRECISION_1-1:0] evalue_bias, + input logic 
value_bias_valid, + output logic value_bias_ready, + + // Proj weights + input logic [WEIGHT_PROJ_PRECISION_0-1:0] mproj_weight [WEIGHT_PROJ_PARALLELISM_DIM_0 * WEIGHT_PROJ_PARALLELISM_DIM_1-1:0], + input logic [WEIGHT_PROJ_PRECISION_1-1:0] eproj_weight, + input logic proj_weight_valid, + output logic proj_weight_ready, + + // Proj bias + input logic [BIAS_PROJ_PRECISION_0-1:0] mproj_bias [BIAS_PROJ_PARALLELISM_DIM_0 * BIAS_PROJ_PARALLELISM_DIM_1 -1:0], + input logic [BIAS_PROJ_PRECISION_1-1:0] eproj_bias, + input logic proj_bias_valid, + output logic proj_bias_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] mdata_out_0 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0], + output logic [DATA_OUT_0_PRECISION_1-1:0] edata_out_0, + output logic data_out_0_valid, + input logic data_out_0_ready +); + // Internal logic here (if any) + mxint_vit_attention #( + .NUM_HEADS(NUM_HEADS), + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), + .DATA_IN_0_PRECISION_0(DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1(DATA_IN_0_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + .WEIGHT_PRECISION_0(WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1(WEIGHT_PRECISION_1), + .HAS_BIAS(HAS_BIAS), + .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_TENSOR_SIZE_DIM_1(BIAS_TENSOR_SIZE_DIM_1), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + .BIAS_PARALLELISM_DIM_1(BIAS_PARALLELISM_DIM_1), + .BIAS_PRECISION_0(BIAS_PRECISION_0), + .BIAS_PRECISION_1(BIAS_PRECISION_1), + .QKV_PRECISION_0(QKV_PRECISION_0), + .QKV_PRECISION_1(QKV_PRECISION_1), + .WEIGHT_PROJ_PRECISION_0(WEIGHT_PROJ_PRECISION_0), + 
.WEIGHT_PROJ_PRECISION_1(WEIGHT_PROJ_PRECISION_1), + .BIAS_PROJ_PRECISION_0(BIAS_PROJ_PRECISION_0), + .BIAS_PROJ_PRECISION_1(BIAS_PROJ_PRECISION_1), + .WEIGHT_PROJ_TENSOR_SIZE_DIM_0(WEIGHT_PROJ_TENSOR_SIZE_DIM_0), + .WEIGHT_PROJ_TENSOR_SIZE_DIM_1(WEIGHT_PROJ_TENSOR_SIZE_DIM_1), + .WEIGHT_PROJ_PARALLELISM_DIM_0(WEIGHT_PROJ_PARALLELISM_DIM_0), + .WEIGHT_PROJ_PARALLELISM_DIM_1(WEIGHT_PROJ_PARALLELISM_DIM_1), + .BIAS_PROJ_TENSOR_SIZE_DIM_0(BIAS_PROJ_TENSOR_SIZE_DIM_0), + .BIAS_PROJ_TENSOR_SIZE_DIM_1(BIAS_PROJ_TENSOR_SIZE_DIM_1), + .BIAS_PROJ_PARALLELISM_DIM_0(BIAS_PROJ_PARALLELISM_DIM_0), + .BIAS_PROJ_PARALLELISM_DIM_1(BIAS_PROJ_PARALLELISM_DIM_1), + .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0), + .DATA_OUT_0_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0), + .DATA_OUT_0_PARALLELISM_DIM_1(DATA_OUT_0_PARALLELISM_DIM_1), + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + ) mxint_vit_attention_inst ( + .clk(clk), + .rst(rst), + .mdata_in_0(mdata_in_0), + .edata_in_0(edata_in_0), + .data_in_0_valid(data_in_0_valid), + .data_in_0_ready(data_in_0_ready), + + // Query weights + .mweight_query(mquery_weight), + .eweight_query(equery_weight), + .query_weight_valid(query_weight_valid), + .query_weight_ready(query_weight_ready), + + // Query bias + .mquery_bias(mquery_bias), + .equery_bias(equery_bias), + .query_bias_valid(query_bias_valid), + .query_bias_ready(query_bias_ready), + + // Key weights + .mkey_weight(mkey_weight), + .ekey_weight(ekey_weight), + .key_weight_valid(key_weight_valid), + .key_weight_ready(key_weight_ready), + + // Key bias + .mkey_bias(mkey_bias), + .ekey_bias(ekey_bias), + .key_bias_valid(key_bias_valid), + .key_bias_ready(key_bias_ready), + + // Value weights + .mvalue_weight(mvalue_weight), + .evalue_weight(evalue_weight), + .value_weight_valid(value_weight_valid), + .value_weight_ready(value_weight_ready), + + // Value bias 
+ .mvalue_bias(mvalue_bias), + .evalue_bias(evalue_bias), + .value_bias_valid(value_bias_valid), + .value_bias_ready(value_bias_ready), + + // Proj weights + .mproj_weight(mproj_weight), + .eproj_weight(eproj_weight), + .proj_weight_valid(proj_weight_valid), + .proj_weight_ready(proj_weight_ready), + + // Proj bias + .mproj_bias(mproj_bias), + .eproj_bias(eproj_bias), + .proj_bias_valid(proj_bias_valid), + .proj_bias_ready(proj_bias_ready), + + .mdata_out_0(mdata_out_0), + .edata_out_0(edata_out_0), + .data_out_0_valid(data_out_0_valid), + .data_out_0_ready(data_out_0_ready) + ); + +endmodule \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/optimized_right_shift.sv b/src/mase_components/linear_layers/mxint_operators/rtl/optimized_right_shift.sv new file mode 100644 index 000000000..7ce5c19cb --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/optimized_right_shift.sv @@ -0,0 +1,77 @@ +`timescale 1ns / 1ps +/* +Module : optimized_variable_shift +Description : + optimized version of variable shift. 
+ if shift_value > 0, this module will implement right shift + if left shift exceeding output range, + it will automatically clamp into maximum; +*/ +module optimized_right_shift #( + parameter IN_WIDTH = -1, + parameter BLOCK_SIZE = -1, + parameter SHIFT_WIDTH = -1, + parameter OUT_WIDTH = -1 +) ( + input logic [IN_WIDTH - 1:0] data_in[BLOCK_SIZE - 1:0], + input logic [SHIFT_WIDTH - 1:0] shift_value, + output logic [OUT_WIDTH - 1:0] data_out[BLOCK_SIZE - 1:0] +); + localparam SHIFT_DATA_WIDTH = IN_WIDTH + OUT_WIDTH - 1; // The maximum left shift value is out_width - 1 + + localparam logic signed [OUT_WIDTH-1:0] MIN_VAL = -(2 ** (OUT_WIDTH - 1)); + localparam logic signed [OUT_WIDTH-1:0] MAX_VAL = (2 ** (OUT_WIDTH - 1)) - 1; + + logic [SHIFT_WIDTH - 1:0] abs_shift_value, real_shift_value; + logic shift_sign; + + logic [SHIFT_DATA_WIDTH - 1:0] shift_data_list[BLOCK_SIZE - 1:0][SHIFT_DATA_WIDTH -1 : 0]; + + logic [OUT_WIDTH - 1:0] clamped_out[BLOCK_SIZE - 1:0]; + + enum { + SHIFT_OUT_RANGE, + SHIFT_IN_RANGE + } mode; + + assign shift_sign = shift_value[SHIFT_WIDTH-1]; + + assign abs_shift_value = (shift_sign) ? (~shift_value + 1) : shift_value; + assign real_shift_value = (abs_shift_value < SHIFT_DATA_WIDTH - 1) ? abs_shift_value : SHIFT_DATA_WIDTH - 1; + + // There is several things need to be considered + always_comb begin + if ((abs_shift_value >= OUT_WIDTH) && (shift_sign)) mode = SHIFT_OUT_RANGE; + else mode = SHIFT_IN_RANGE; + end + + for (genvar i = 0; i < BLOCK_SIZE; i++) begin + for (genvar j = 0; j < SHIFT_DATA_WIDTH; j++) begin + always_comb begin + shift_data_list[i][j] = (shift_sign) ? 
$signed(data_in[i]) <<< j : + $signed(data_in[i]) >>> j; + end + end + end + for (genvar i = 0; i < BLOCK_SIZE; i++) begin + signed_clamp #( + .IN_WIDTH (SHIFT_DATA_WIDTH), + .OUT_WIDTH(OUT_WIDTH) + ) data_clamp ( + .in_data (shift_data_list[i][real_shift_value]), + .out_data(clamped_out[i]) + ); + end + + for (genvar i = 0; i < BLOCK_SIZE; i++) begin + always_comb begin + if (data_in[i] == 0) data_out[i] = 0; + else + case (mode) + SHIFT_OUT_RANGE: data_out[i] = (data_in[i][IN_WIDTH-1]) ? MIN_VAL : MAX_VAL; + SHIFT_IN_RANGE: data_out[i] = clamped_out[i]; + default: data_out[i] = clamped_out[i]; + endcase + end + end +endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/or_tree.sv b/src/mase_components/linear_layers/mxint_operators/rtl/or_tree.sv index 570928f5d..522d27628 100644 --- a/src/mase_components/linear_layers/mxint_operators/rtl/or_tree.sv +++ b/src/mase_components/linear_layers/mxint_operators/rtl/or_tree.sv @@ -5,23 +5,22 @@ Description : This module actually implement the tree structure of or logic. 
*/ module or_tree #( - parameter IN_SIZE = 2, - parameter IN_WIDTH = 32, - parameter OUT_WIDTH = IN_WIDTH + parameter IN_SIZE = 2, + parameter IN_WIDTH = 32 ) ( /* verilator lint_off UNUSEDSIGNAL */ - input logic clk, - input logic rst, + input logic clk, + input logic rst, /* verilator lint_on UNUSEDSIGNAL */ - input logic [ IN_WIDTH-1:0] data_in [IN_SIZE-1:0], - input logic data_in_valid, - output logic data_in_ready, - output logic [OUT_WIDTH-1:0] data_out, - output logic data_out_valid, - input logic data_out_ready + input logic [IN_WIDTH-1:0] data_in [IN_SIZE-1:0], + input logic data_in_valid, + output logic data_in_ready, + output logic [IN_WIDTH-1:0] data_out, + output logic data_out_valid, + input logic data_out_ready ); - localparam LEVELS = $clog2(IN_SIZE); + localparam LEVELS = $clog2(IN_SIZE) + 1; initial begin assert (IN_SIZE > 0); @@ -30,17 +29,26 @@ module or_tree #( generate if (LEVELS == 0) begin : gen_skip_adder_tree - assign data_out = data_in[0][IN_WIDTH-1] ? ~data_in[0] + 1 : data_in[0]; - assign data_out_valid = data_in_valid; - assign data_in_ready = data_out_ready; + register_slice #( + .DATA_WIDTH(IN_WIDTH) + ) register_slice_i ( + .clk (clk), + .rst (rst), + .data_in_valid (data_in_valid), + .data_in_ready (data_in_ready), + .data_in (data_in[0][IN_WIDTH-1] ? ~data_in[0] + 1 : data_in[0]), + .data_out_valid(data_out_valid), + .data_out_ready(data_out_ready), + .data_out (data_out) + ); end else begin : gen_adder_tree // data & sum wires are oversized on purpose for vivado. 
- logic [OUT_WIDTH*IN_SIZE-1:0] data[LEVELS:0]; - logic [OUT_WIDTH*IN_SIZE-1:0] or_result[LEVELS-1:0]; - logic valid[IN_SIZE-1:0]; - logic ready[IN_SIZE-1:0]; + logic [IN_WIDTH*IN_SIZE-1:0] data[LEVELS:0]; + logic [IN_WIDTH*IN_SIZE-1:0] or_result[LEVELS-1:0]; + logic valid[LEVELS:0]; + logic ready[LEVELS:0]; // Generate adder for each layer for (genvar i = 0; i < LEVELS; i++) begin : level @@ -60,7 +68,7 @@ module or_tree #( register_slice #( .DATA_WIDTH(LEVEL_OUT_SIZE * LEVEL_OUT_WIDTH) - ) register_slice ( + ) register_slice_i ( .clk (clk), .rst (rst), .data_in (or_result[i]), @@ -80,7 +88,7 @@ module or_tree #( assign valid[0] = data_in_valid; assign data_in_ready = ready[0]; - assign data_out = data[LEVELS][OUT_WIDTH-1:0]; + assign data_out = data[LEVELS][IN_WIDTH-1:0]; assign data_out_valid = valid[LEVELS]; assign ready[LEVELS] = data_out_ready; diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/pack_data.sv b/src/mase_components/linear_layers/mxint_operators/rtl/pack_data.sv new file mode 100644 index 000000000..be14f0360 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/pack_data.sv @@ -0,0 +1,15 @@ +`timescale 1ns / 1ps +module pack_data #( + parameter IN_WIDTH = 1, + parameter IN_SIZE = 8 +) ( + input logic [IN_WIDTH - 1:0] data_in [IN_SIZE - 1:0], + output logic [IN_WIDTH*IN_SIZE-1:0] data_out +); + + // Pack the array into a single vector + for (genvar i = 0; i < IN_SIZE; i++) begin : reshape + assign data_out[i*IN_WIDTH +: IN_WIDTH] = data_in[i]; + end + +endmodule \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/sequential_max.sv b/src/mase_components/linear_layers/mxint_operators/rtl/sequential_max.sv new file mode 100644 index 000000000..fe064d3d1 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/sequential_max.sv @@ -0,0 +1,68 @@ +`timescale 1ns / 1ps + +module sequential_max #( + parameter IN_DEPTH = 4, + parameter IN_WIDTH = 32 +) ( + 
input logic clk, + input logic rst, + + input logic [IN_WIDTH-1:0] data_in, + input logic data_in_valid, + output logic data_in_ready, + + output logic [IN_WIDTH-1:0] data_out, + output logic data_out_valid, + input logic data_out_ready +); + logic [IN_WIDTH-1:0] reg_in; + logic reg_in_valid, reg_in_ready; + + skid_buffer #( + .DATA_WIDTH(IN_WIDTH) + ) register_slice ( + .data_in(reg_in), + .data_in_valid(reg_in_valid), + .data_in_ready(reg_in_ready), + .* + ); + // 1-bit wider so IN_DEPTH also fits. + localparam COUNTER_WIDTH = $clog2(IN_DEPTH); + logic [COUNTER_WIDTH:0] counter; + + /* verilator lint_off WIDTH */ + assign data_in_ready = (counter != IN_DEPTH) || reg_in_ready; + assign reg_in_valid = (counter == IN_DEPTH); + /* verilator lint_on WIDTH */ + + // counter logic + always_ff @(posedge clk) begin + if (rst) begin + counter <= '0; + end else begin + if (reg_in_valid && reg_in_ready) begin + // Reset counter or start new sequence + counter <= data_in_valid ? 1'b1 : '0; + end else if (data_in_valid && data_in_ready) begin + // Continue counting inputs + counter <= counter + 1'b1; + end + end + end + + // max value tracking logic + always_ff @(posedge clk) begin + if (rst) begin + reg_in <= '0; + end else begin + if (reg_in_valid && reg_in_ready) begin + // Reset or start new maximum tracking + reg_in <= data_in_valid ? data_in : '0; + end else if (data_in_valid && data_in_ready) begin + // Update maximum if new value is larger + reg_in <= ($signed(data_in) > $signed(reg_in)) ? 
data_in : reg_in; + end + end + end + +endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/unpack_data.sv b/src/mase_components/linear_layers/mxint_operators/rtl/unpack_data.sv new file mode 100644 index 000000000..044dffff4 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/unpack_data.sv @@ -0,0 +1,15 @@ +`timescale 1ns / 1ps +module unpack_data #( + parameter IN_WIDTH = 1, + parameter IN_SIZE = 8 +) ( + input logic [IN_WIDTH*IN_SIZE-1:0] data_in, + output logic [IN_WIDTH - 1:0] data_out [IN_SIZE - 1:0] +); + + // Unpack the vector into an array + for (genvar i = 0; i < IN_SIZE; i++) begin : reshape + assign data_out[i] = data_in[i*IN_WIDTH +: IN_WIDTH]; + end + +endmodule \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/rtl/unpacked_mx_split2_with_data.sv b/src/mase_components/linear_layers/mxint_operators/rtl/unpacked_mx_split2_with_data.sv new file mode 100644 index 000000000..4b7a22377 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/rtl/unpacked_mx_split2_with_data.sv @@ -0,0 +1,88 @@ +`timescale 1 ns / 1 ps +module unpacked_mx_split2_with_data #( + parameter DEPTH = 8, + parameter MAN_WIDTH = 8, + parameter EXP_WIDTH = 8, + parameter IN_SIZE = 8 +) ( + input clk, + input rst, + // Input interface + input [MAN_WIDTH-1:0] mdata_in[IN_SIZE - 1:0], + input [EXP_WIDTH-1:0] edata_in, + input logic data_in_valid, + output logic data_in_ready, + // FIFO output interface + output [MAN_WIDTH-1:0] fifo_mdata_out[IN_SIZE - 1:0], + output [EXP_WIDTH-1:0] fifo_edata_out, + output logic fifo_data_out_valid, + input logic fifo_data_out_ready, + // Straight output interface + output [MAN_WIDTH-1:0] straight_mdata_out[IN_SIZE - 1:0], + output [EXP_WIDTH-1:0] straight_edata_out, + output logic straight_data_out_valid, + input logic straight_data_out_ready +); + // Flatten the input data + logic [MAN_WIDTH * IN_SIZE + EXP_WIDTH - 1:0] data_in_flatten; + logic 
[MAN_WIDTH * IN_SIZE + EXP_WIDTH - 1:0] fifo_data_out_flatten; + logic [MAN_WIDTH * IN_SIZE + EXP_WIDTH - 1:0] straight_data_out_flatten; + // Add register slice at the end for FIFO outputs + logic [MAN_WIDTH-1:0] fifo_mdata_out_unreg[IN_SIZE - 1:0]; + logic [EXP_WIDTH-1:0] fifo_edata_out_unreg; + logic fifo_data_out_unreg_valid, fifo_data_out_unreg_ready; + + // Input flattening + for (genvar i = 0; i < IN_SIZE; i++) begin : reshape + assign data_in_flatten[i*MAN_WIDTH+MAN_WIDTH-1:i*MAN_WIDTH] = mdata_in[i]; + end + assign data_in_flatten[MAN_WIDTH*IN_SIZE+EXP_WIDTH-1:MAN_WIDTH*IN_SIZE] = edata_in; + + // Split2 instance + split2_with_data #( + .DATA_WIDTH(MAN_WIDTH * IN_SIZE + EXP_WIDTH), + .FIFO_DEPTH(DEPTH) + ) split2_with_data_i ( + .clk(clk), + .rst(rst), + .data_in(data_in_flatten), + .data_in_valid(data_in_valid), + .data_in_ready(data_in_ready), + .fifo_data_out(fifo_data_out_flatten), + .fifo_data_out_valid(fifo_data_out_unreg_valid), + .fifo_data_out_ready(fifo_data_out_unreg_ready), + .straight_data_out(straight_data_out_flatten), + .straight_data_out_valid(straight_data_out_valid), + .straight_data_out_ready(straight_data_out_ready) + ); + + // Unflatten FIFO output + for (genvar i = 0; i < IN_SIZE; i++) begin : unreshape_fifo + assign fifo_mdata_out_unreg[i] = fifo_data_out_flatten[i*MAN_WIDTH+MAN_WIDTH-1:i*MAN_WIDTH]; + end + assign fifo_edata_out_unreg = fifo_data_out_flatten[MAN_WIDTH*IN_SIZE+EXP_WIDTH-1:MAN_WIDTH*IN_SIZE]; + + mxint_skid_buffer #( + .DATA_PRECISION_0(MAN_WIDTH), + .DATA_PRECISION_1(EXP_WIDTH), + .IN_NUM(IN_SIZE) + ) fifo_out_reg_slice ( + .clk(clk), + .rst(rst), + .mdata_in(fifo_mdata_out_unreg), + .edata_in(fifo_edata_out_unreg), + .data_in_valid(fifo_data_out_unreg_valid), + .data_in_ready(fifo_data_out_unreg_ready), + .mdata_out(fifo_mdata_out), + .edata_out(fifo_edata_out), + .data_out_valid(fifo_data_out_valid), + .data_out_ready(fifo_data_out_ready) + ); + + // Unflatten straight output + for (genvar i = 0; i < IN_SIZE; i++) 
begin : unreshape_straight + assign straight_mdata_out[i] = straight_data_out_flatten[i*MAN_WIDTH+MAN_WIDTH-1:i*MAN_WIDTH]; + end + assign straight_edata_out = straight_data_out_flatten[MAN_WIDTH*IN_SIZE+EXP_WIDTH-1:MAN_WIDTH*IN_SIZE]; + +endmodule diff --git a/src/mase_components/linear_layers/mxint_operators/test/log2_max_abs_tb.py b/src/mase_components/linear_layers/mxint_operators/test/log2_max_abs_tb.py index e66d2e0f2..c2e09a318 100644 --- a/src/mase_components/linear_layers/mxint_operators/test/log2_max_abs_tb.py +++ b/src/mase_components/linear_layers/mxint_operators/test/log2_max_abs_tb.py @@ -1,164 +1,147 @@ #!/usr/bin/env python3 -# This script tests the fixed point adder tree -import os, math, logging, pytest +# This script tests the fixed point linear +import os, logging + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import * -from mase_cocotb.random_test import RandomSource, RandomSink, check_results from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import ( + StreamDriver, + StreamMonitor, +) + from mase_cocotb.runner import mase_runner +from utils import mxint_quantize -import cocotb -from cocotb.triggers import Timer -from cocotb.triggers import FallingEdge -from cocotb.clock import Clock +import torch +from math import ceil, log2 +import random +from mase_cocotb.utils import bit_driver -debug = False +logger = logging.getLogger("testbench") +logger.setLevel(logging.DEBUG) -logger = logging.getLogger("tb_signals") -if debug: - logger.setLevel(logging.DEBUG) +torch.manual_seed(10) -# DUT test specifications -class VerificationCase(Testbench): - def __init__(self, dut, samples=10): +class Log2_max_abs_tb(Testbench): + def __init__(self, dut, num=1) -> None: super().__init__(dut, dut.clk, dut.rst) - self.assign_self_params(["IN_SIZE", "IN_WIDTH"]) - self.data_in_width = self.IN_WIDTH - self.num = self.IN_SIZE - self.inputs = RandomSource( - samples=samples, num=self.num, max_stalls=2 * samples, 
debug=debug + self.num = num + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + + cocotb.start_soon(check_signal(dut, self.log)) + self.data_in_0_driver = StreamDriver( + dut.clk, + dut.data_in_0, + dut.data_in_0_valid, + dut.data_in_0_ready, ) - self.outputs = RandomSink( - samples=samples, num=self.num, max_stalls=2 * samples, debug=debug + self.data_out_0_monitor = StreamMonitor( + dut.clk, + dut.data_out_0, + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True, ) - self.samples = samples - self.ref = self.sw_compute() - - def sw_compute(self): - ref = [] - for i in range(self.samples): - breakpoint() - ref.append( - math.ceil(math.log2(max([abs(data) for data in self.inputs.data[i]]))) - ) - print(self.inputs.data[i]) - ref.reverse() - return ref - - -# Check if an impossible state is reached -def is_impossible_state(data_in_ready, data_in_valid, data_out_ready, data_out_valid): - # (0, X, 0, 0) - # (0, X, 1, 0) - # (0, X, 1, 1) - if (not data_in_ready) and not ((not data_out_ready) and data_out_valid): - return True - return False + self.input_drivers = {"in": self.data_in_0_driver} + self.output_monitors = {"out": self.data_out_0_monitor} + self.data_in_0_driver.log.setLevel(logging.DEBUG) + self.data_out_0_monitor.log.setLevel(logging.DEBUG) + + def generate_inputs(self): + from math import ceil, log2 + + data_in = torch.randint(-20, 20, size=(self.get_parameter("IN_SIZE"),)) + log2_max = ceil(log2((int(data_in.abs().max()) + 1e-6))) + inputs = [data_in.tolist()] + outputs = [log2_max] + return inputs, outputs + + async def run_test(self, samples, us): + await self.reset() + logger.info(f"Reset finished") + self.data_out_0_monitor.ready.value = 1 + self.data_in_0_driver.valid.value = 0 + for _ in range(samples): + logger.info(f"generating inputs") + inputs, exp_outputs = self.generate_inputs() + + # Load the inputs driver + print(inputs) + self.data_in_0_driver.load_driver(inputs) + # Load the output monitor + 
self.data_out_0_monitor.load_monitor(exp_outputs) + + await Timer(us, units="us") + assert self.data_out_0_monitor.exp_queue.empty() + + +async def check_signal(dut, log): + # await Timer(20, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + if str(dut.data_out_0_valid) == "1" and str(dut.data_out_0_ready) == "1": + print(dut.or_result.value) + # print("end") + + +# @cocotb.test() +# async def test(dut): +# tb = Log2_max_abs_tb(dut, 1) +# await tb.run_test(samples=10, us=5) + +# @cocotb.test() +# async def single_mult(dut): +# tb = MXIntMatmulTB(dut) +# tb.output_monitor.ready.value = 1 +# await tb.run_test(batches=1, us=100) + + +# @cocotb.test() +# async def repeated_mult(dut): +# tb = MXIntMatmulTB(dut) +# tb.output_monitor.ready.value = 1 +# await tb.run_test(batches=1000, us=2000) @cocotb.test() -async def cocotb_test_fixed_adder_tree(dut): - """Test integer based adder tree""" - samples = 1 - test_case = VerificationCase(dut, samples=samples) - - # Reset cycle - await Timer(20, units="ns") - dut.rst.value = 1 - await Timer(100, units="ns") - dut.rst.value = 0 - - # Create a 10ns-period clock on port clk - clock = Clock(dut.clk, 10, units="ns") - # Start the clock - cocotb.start_soon(clock.start()) - await Timer(500, units="ns") - - # Synchronize with the clock - dut.data_in_valid.value = 0 - dut.data_out_ready.value = 1 - logger.debug( - "Pre-clk State: (data_in_ready,data_in_valid,data_out_ready,data_out_valid) = ({},{},{},{})".format( - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - await FallingEdge(dut.clk) - logger.debug( - "Post-clk State: (data_in_ready,data_in_valid,data_out_ready,data_out_valid) = ({},{},{},{})".format( - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - logger.debug( - "Pre-clk State: (data_in_ready,data_in_valid,data_out_ready,data_out_valid) = ({},{},{},{})".format( - 
dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - await FallingEdge(dut.clk) - logger.debug( - "Post-clk State: (data_in_ready,data_in_valid,data_out_ready,data_out_valid) = ({},{},{},{})".format( - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - - done = False - while not done: - await FallingEdge(dut.clk) - logger.debug( - "Post-clk State: (data_in_ready,data_in_valid,data_out_ready,data_out_valid) = ({},{},{},{})".format( - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - dut.data_in_valid.value = test_case.inputs.pre_compute() - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.pre_compute( - dut.data_out_valid.value - ) - await Timer(1, units="ns") - dut.data_in_valid.value, dut.data_in.value = test_case.inputs.compute( - dut.data_in_ready.value - ) - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.compute( - dut.data_out_valid.value, dut.data_out.value - ) - logger.debug( - "Pre-clk State: (data_in_ready,data_in_valid,data_out_ready,data_out_valid) = ({},{},{},{})".format( - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - done = test_case.inputs.is_empty() and test_case.outputs.is_full() - check_results([i.signed_integer for i in test_case.outputs.data], test_case.ref) +async def repeated_mult_valid_backpressure(dut): + tb = Log2_max_abs_tb(dut, 1) + tb.data_in_0_driver.set_valid_prob(0.7) + cocotb.start_soon(bit_driver(dut.data_out_0_ready, dut.clk, 0.6)) + await tb.run_test(samples=20, us=200) -@pytest.mark.dev -def test_abs_max_tree(): +if __name__ == "__main__": mase_runner( + trace=True, module_param_list=[ - # Power of 2's - {"IN_SIZE": 2, "IN_WIDTH": 8}, + # { + # "DATA_IN_0_PRECISION_0": 8, + # "DATA_IN_0_PRECISION_1": 4, + # 
"BLOCK_SIZE": 1, + # "IN_DEPTH": 1, + # }, + # { + # "DATA_IN_0_PRECISION_0": 8, + # "DATA_IN_0_PRECISION_1": 4, + # "BLOCK_SIZE": 4, + # }, + { + "IN_WIDTH": 8, + "IN_SIZE": 16, + }, + { + "IN_WIDTH": 8, + "IN_SIZE": 4, + }, ], - trace=True, + # sim="questa", + # gui=True ) - - -if __name__ == "__main__": - test_abs_max_tree() diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_accumulator_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_accumulator_tb.py index aaa16a70f..6c0152df0 100644 --- a/src/mase_components/linear_layers/mxint_operators/test/mxint_accumulator_tb.py +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_accumulator_tb.py @@ -11,10 +11,11 @@ from mase_cocotb.interfaces.streaming import ( MultiSignalStreamDriver, MultiSignalStreamMonitor, + MultiSignalErrorThresholdStreamMonitor ) from mase_cocotb.runner import mase_runner -from utils import mxint_quantize +from utils import mxint_quantize, MxIntAccumulator import torch from math import ceil, log2 @@ -33,28 +34,36 @@ def __init__(self, dut, num=1) -> None: self.num = num if not hasattr(self, "log"): self.log = SimLog("%s" % (type(self).__qualname__)) - self.data_in_0_driver = MultiSignalStreamDriver( dut.clk, (dut.mdata_in_0, dut.edata_in_0), dut.data_in_0_valid, dut.data_in_0_ready, ) - self.data_out_0_monitor = MultiSignalStreamMonitor( + self.data_out_0_monitor = MultiSignalErrorThresholdStreamMonitor( dut.clk, (dut.mdata_out_0, dut.edata_out_0), dut.data_out_0_valid, dut.data_out_0_ready, + width=self.get_parameter("DATA_IN_0_PRECISION_0"), + signed=True, check=True, + error_bits=1, ) + self.input_drivers = {"in0": self.data_in_0_driver} + self.output_monitors = {"out": self.data_out_0_monitor} def generate_inputs(self): from utils import block_mxint_quant, pack_tensor_to_mx_listed_chunk from utils import mxint_quantize from math import ceil, log2 - data_in = 20 * torch.rand( - self.get_parameter("IN_DEPTH"), 
self.get_parameter("BLOCK_SIZE") + data_in = ( + 20 + * torch.rand( + self.get_parameter("IN_DEPTH"), self.get_parameter("BLOCK_SIZE") + ) + - 10 ) config = { "width": self.get_parameter("DATA_IN_0_PRECISION_0"), @@ -62,15 +71,22 @@ def generate_inputs(self): } parallelism = [1, self.get_parameter("BLOCK_SIZE")] (qtensor, mtensor, etensor) = block_mxint_quant(data_in, config, parallelism) - - qout, mout, eout = mxint_quantize( - qtensor.sum(dim=0), - width=config["width"] - + 2 ** config["exponent_width"] - + ceil(log2(self.get_parameter("IN_DEPTH"))), - exponent_width=config["exponent_width"], - exponent=int(etensor.min()), + mtensor = mtensor.reshape( + self.get_parameter("IN_DEPTH"), self.get_parameter("BLOCK_SIZE") ) + etensor = etensor.reshape(self.get_parameter("IN_DEPTH")) + mout, eout = MxIntAccumulator(mtensor, etensor) + _, mout, eout = block_mxint_quant(qtensor.sum(dim=0), config, parallelism) + + qout = mout * 2**(eout - config["width"] + 1) + new_config = { + "width": 8, + "exponent_width": config["exponent_width"], + } + # print(block_mxint_quant(mid_out, new_config, parallelism)) + # print(block_mxint_quant(qout, new_config, parallelism)) + # breakpoint() + tensor_inputs = pack_tensor_to_mx_listed_chunk(mtensor, etensor, parallelism) exp_outs = [(mout.int().tolist(), int(eout))] @@ -94,10 +110,27 @@ async def run_test(self, samples, us): assert self.data_out_0_monitor.exp_queue.empty() +async def check_signal(dut): + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + if dut.data_in_0_valid.value == 1 and dut.data_in_0_valid.value == 1: + print( + "data_in_0 = ", [x.signed_integer for x in dut.shifted_mdata_in_0.value] + ) + print( + "data_out_0 = ", + [x.signed_integer for x in dut.shifted_mdata_out_0.value], + ) + print("end") + + # @cocotb.test() # async def test(dut): # tb = MXIntAccumulatorTB(dut, 1) -# await tb.run_test(samples=20, us=5) +# cocotb.start_soon(check_signal(dut)) +# await 
tb.run_test(samples=10, us=5) # @cocotb.test() # async def single_mult(dut): @@ -125,11 +158,24 @@ async def repeated_mult_valid_backpressure(dut): mase_runner( trace=True, module_param_list=[ + # { + # "DATA_IN_0_PRECISION_0": 8, + # "DATA_IN_0_PRECISION_1": 4, + # "BLOCK_SIZE": 1, + # "IN_DEPTH": 1, + # }, + # { + # "DATA_IN_0_PRECISION_0": 8, + # "DATA_IN_0_PRECISION_1": 4, + # "BLOCK_SIZE": 4, + # "IN_DEPTH": 1, + # }, { - "DATA_IN_0_PRECISION_0": 8, + "DATA_IN_0_PRECISION_0": 16, "DATA_IN_0_PRECISION_1": 4, - "BLOCK_SIZE": 1, - "IN_DEPTH": 1, + "BLOCK_SIZE": 4, + "IN_DEPTH": 4, }, ], + # sim="questa", ) diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_addition_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_addition_tb.py new file mode 100644 index 000000000..6758640b2 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_addition_tb.py @@ -0,0 +1,161 @@ + +#!/usr/bin/env python3 + +import os, pytest +import torch +import logging +from functools import partial + +import cocotb +from cocotb.triggers import Timer + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import ( + MultiSignalStreamDriver, + MultiSignalStreamMonitor, +) +from mase_cocotb.runner import mase_runner + +class AdditionTB(Testbench): + def __init__(self, dut) -> None: + super().__init__(dut, dut.clk, dut.rst) + + # Data drivers + self.data_in_0_driver = MultiSignalStreamDriver( + dut.clk, (dut.mdata_in_0, dut.edata_in_0), + dut.data_in_0_valid, dut.data_in_0_ready + ) + + self.data_in_1_driver = MultiSignalStreamDriver( + dut.clk, (dut.mdata_in_1, dut.edata_in_1), + dut.data_in_1_valid, dut.data_in_1_ready + ) + + self.input_drivers = { + "data_0": self.data_in_0_driver, + "data_1": self.data_in_1_driver, + } + + # Output monitor + self.data_out_0_monitor = MultiSignalStreamMonitor( + dut.clk, (dut.mdata_out_0, dut.edata_out_0), + dut.data_out_0_valid, dut.data_out_0_ready, + 
check=False + ) + self.output_monitors = {"out": self.data_out_0_monitor} + + def preprocess_tensor_for_mxint(self, tensor, config, parallelism): + from utils import block_mxint_quant + from utils import pack_tensor_to_mx_listed_chunk + + (qtensor, mtensor, etensor) = block_mxint_quant(tensor, config, parallelism) + tensor_inputs = pack_tensor_to_mx_listed_chunk(mtensor, etensor, parallelism) + return tensor_inputs + + def generate_inputs(self): + block_size = self.get_parameter("BLOCK_SIZE") + return { + "data_0": torch.randn((block_size)), + "data_1": torch.randn((block_size)), + } + + def generate_exp_outputs(self): + block_size = self.get_parameter("BLOCK_SIZE") + return torch.randn((block_size)) + + async def run_test(self, us, num=1): + await self.reset() + self.data_out_0_monitor.ready.value = 1 + + for i in range(num): + # Generate random inputs + inputs = self.generate_inputs() + # Generate random expected outputs + exp_out = self.generate_exp_outputs() + + # Process input data 0 + data_0_inputs = self.preprocess_tensor_for_mxint( + tensor=inputs["data_0"], + config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1") + }, + parallelism=[self.get_parameter("BLOCK_SIZE")] + ) + self.data_in_0_driver.load_driver(data_0_inputs) + + # Process input data 1 + data_1_inputs = self.preprocess_tensor_for_mxint( + tensor=inputs["data_1"], + config={ + "width": self.get_parameter("DATA_IN_1_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_IN_1_PRECISION_1") + }, + parallelism=[self.get_parameter("BLOCK_SIZE")] + ) + self.data_in_1_driver.load_driver(data_1_inputs) + + # Load output monitor + outs = self.preprocess_tensor_for_mxint( + tensor=exp_out, + config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + parallelism=[self.get_parameter("BLOCK_SIZE")] + ) + self.data_out_0_monitor.load_monitor(outs) + + 
await Timer(us, units="us") + assert self.data_out_0_monitor.exp_queue.empty() + +@cocotb.test() +async def test_addition(dut): + tb = AdditionTB(dut) + await tb.run_test(us=10, num=5) + +def get_addition_config(kwargs={}): + """ + Default configuration for addition test + """ + config = { + # Input 0 precision + "DATA_IN_0_PRECISION_0": 8, + "DATA_IN_0_PRECISION_1": 4, + + # Input 1 precision (same as input 0) + "DATA_IN_1_PRECISION_0": 8, + "DATA_IN_1_PRECISION_1": 4, + + # Output precision + "DATA_OUT_0_PRECISION_0": 9, # One extra bit for addition + "DATA_OUT_0_PRECISION_1": 4, + + # Tensor dimensions + "DATA_IN_0_TENSOR_SIZE_DIM_0": 20, + "DATA_IN_0_TENSOR_SIZE_DIM_1": 20, + "DATA_IN_0_TENSOR_SIZE_DIM_2": 1, + + # Parallelism configuration + "DATA_IN_0_PARALLELISM_DIM_0": 20, + "DATA_IN_0_PARALLELISM_DIM_1": 20, + "DATA_IN_0_PARALLELISM_DIM_2": 1, + } + + config.update(kwargs) + return config + +def test_addition_regression(): + """ + Run regression tests with different configurations + """ + mase_runner( + trace=True, + module_param_list=[ + # Basic test with default config + get_addition_config(), + ] + ) + +if __name__ == "__main__": + test_addition_regression() \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_cast_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_cast_tb.py index 963ae40df..b2fa06539 100644 --- a/src/mase_components/linear_layers/mxint_operators/test/mxint_cast_tb.py +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_cast_tb.py @@ -13,7 +13,8 @@ MultiSignalStreamMonitor, ) from mase_cocotb.runner import mase_runner -from utils import mxint_quantize +# from utils import mxint_quantize +# from utils import MxIntCast import torch @@ -21,10 +22,9 @@ logger.setLevel(logging.DEBUG) -class MXINTVectorMultTB(Testbench): - def __init__(self, dut, num) -> None: +class MxIntCastTB(Testbench): + def __init__(self, dut) -> None: super().__init__(dut, dut.clk, 
dut.rst) - self.num = num if not hasattr(self, "log"): self.log = SimLog("%s" % (type(self).__qualname__)) @@ -42,64 +42,90 @@ def __init__(self, dut, num) -> None: dut.data_out_ready, check=True, ) + self.input_drivers = {"in0": self.data_in_0_driver} + self.output_monitors = {"out": self.data_out_0_monitor} self.data_in_0_driver.log.setLevel(logging.DEBUG) self.data_out_0_monitor.log.setLevel(logging.DEBUG) - def generate_inputs(self): + def generate_inputs(self, num): inputs = [] exp_outputs = [] - for _ in range(self.num): + from a_cx_mxint_quant import mxint_quant_block + for _ in range(num): data = 20 * torch.rand(int(self.dut.BLOCK_SIZE)) - (data_in, mdata_in, edata_in) = mxint_quantize( + (data_in, mdata_in, edata_in) = mxint_quant_block( data, int(self.dut.IN_MAN_WIDTH), int(self.dut.IN_EXP_WIDTH), ) - exp_out, mexp_out, eexp_out = mxint_quantize( + (exp_out, mexp_out, eexp_out) = mxint_quant_block( data_in, int(self.dut.OUT_MAN_WIDTH), int(self.dut.OUT_EXP_WIDTH), + round_bits = int(self.dut.ROUND_BITS), ) - breakpoint() - inputs.append((mdata_in.int().tolist(), edata_in.int().tolist())) - exp_outputs.append((mexp_out.int().tolist(), eexp_out.int().tolist())) + # mexp_out, eexp_out = MxIntCast( + # mdata_in, + # edata_in, + # { + # "in_width": int(self.dut.IN_MAN_WIDTH), + # "in_frac_width": int(self.dut.IN_MAN_FRAC_WIDTH), + # "in_exponent_width": int(self.dut.IN_EXP_WIDTH), + # "out_width": int(self.dut.OUT_MAN_WIDTH), + # "out_exponent_width": int(self.dut.OUT_EXP_WIDTH), + # }, + # ) + inputs.append((mdata_in.int().tolist(), int(edata_in))) + exp_outputs.append((mexp_out.int().tolist(), int(eexp_out))) return inputs, exp_outputs - async def run_test(self): + async def run_test(self, us = 1, num = 10): await self.reset() logger.info(f"Reset finished") - self.data_out_0_monitor.ready.value = 1 logger.info(f"generating inputs") - inputs, exp_outputs = self.generate_inputs() + inputs, exp_outputs = self.generate_inputs(num) # Load the inputs driver 
self.data_in_0_driver.load_driver(inputs) # Load the output monitor self.data_out_0_monitor.load_monitor(exp_outputs) - - await Timer(5, units="us") + await Timer(us, units="us") assert self.data_out_0_monitor.exp_queue.empty() @cocotb.test() async def test(dut): - tb = MXINTVectorMultTB(dut, num=1) - await tb.run_test() + # cocotb.start_soon(check_signal(dut)) + tb = MxIntCastTB(dut) + await tb.run_test(us = 10, num = 50) + + +async def check_signal(dut): + num = {"data_out_0": 0, "data_in_0": 0} + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + if dut.data_out_valid.value == 1 and dut.data_out_ready.value == 1: + print(dut.edata_out_full) + print("end") if __name__ == "__main__": mase_runner( trace=True, module_param_list=[ - # { - # "IN_MAN_WIDTH": 6, - # "IN_EXP_WIDTH": 3, - # "OUT_MAN_WIDTH": 12, - # "OUT_EXP_WIDTH": 4, - # "BLOCK_SIZE": 4, - # }, + { + "IN_MAN_WIDTH": 13, + "IN_MAN_FRAC_WIDTH": 12, + "IN_EXP_WIDTH": 8, + "OUT_MAN_WIDTH": 8, + "OUT_EXP_WIDTH": 8, + "ROUND_BITS": 2, + "BLOCK_SIZE": 4, + }, # { # "IN_MAN_WIDTH": 8, # "IN_EXP_WIDTH": 3, @@ -107,13 +133,14 @@ async def test(dut): # "OUT_EXP_WIDTH": 3, # "BLOCK_SIZE": 4, # }, - { - "IN_MAN_WIDTH": 8, - "IN_EXP_WIDTH": 4, - "OUT_MAN_WIDTH": 49, - "OUT_EXP_WIDTH": 5, - "BLOCK_SIZE": 4, - }, + # { + # "IN_MAN_WIDTH": 8, + # "IN_MAN_FRAC_WIDTH": 7, + # "IN_EXP_WIDTH": 4, + # "OUT_MAN_WIDTH": 16, + # "OUT_EXP_WIDTH": 5, + # "BLOCK_SIZE": 1, + # }, # { # "IN_MAN_WIDTH": 12, # "IN_EXP_WIDTH": 3, @@ -122,4 +149,6 @@ async def test(dut): # "BLOCK_SIZE": 4, # }, ], + # sim="questa", + # gui=True ) diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_dot_product_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_dot_product_tb.py index 43e70adcf..062596565 100644 --- a/src/mase_components/linear_layers/mxint_operators/test/mxint_dot_product_tb.py +++ 
b/src/mase_components/linear_layers/mxint_operators/test/mxint_dot_product_tb.py @@ -76,9 +76,6 @@ def generate_inputs(self): inputs.append((mdata_in.int().tolist(), edata_in.int().tolist())) weights.append((mweight.int().tolist(), eweight.int().tolist())) exp_outputs.append((mdp_out.int().tolist(), edp_out.int().tolist())) - print(inputs) - print(weights) - print(exp_outputs) return inputs, weights, exp_outputs async def run_test(self): @@ -118,4 +115,5 @@ async def test(dut): "BLOCK_SIZE": 4, }, ], + # sim="questa", ) diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_exp_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_exp_tb.py new file mode 100644 index 000000000..8571d1d3d --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_exp_tb.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 + +# This script tests the fixed point linear +import os, logging + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import * + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import ( + MultiSignalStreamDriver, + MultiSignalStreamMonitor, +) + +from mase_cocotb.runner import mase_runner +from chop.nn.quantizers.integer import _integer_floor_quantize +from typing import Literal, Optional, Tuple, Union, Dict, List +import torch +import math +from functools import partial +import random +from a_cx_mxint_quant.softmax import MXIntHardwareExp + +logger = logging.getLogger("testbench") +logger.setLevel(logging.DEBUG) + +torch.manual_seed(10) +class MXIntExpTB(Testbench): + def __init__(self, dut, num) -> None: + super().__init__(dut, dut.clk, dut.rst) + self.num = num + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + + self.data_in_0_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mdata_in_0, dut.edata_in_0), + dut.data_in_0_valid, + dut.data_in_0_ready, + ) + + self.data_out_0_monitor = MultiSignalStreamMonitor( + dut.clk, + 
(dut.mdata_out_0, dut.edata_out_0), + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True, + ) + self.input_drivers = { + "a": self.data_in_0_driver, + } + self.output_monitors = { + "out": self.data_out_0_monitor, + } + self.data_out_0_monitor.log.setLevel(logging.DEBUG) + def generate_inputs(self): + inputs = [] + expected_outputs = [] + + #cx: Test the code with software + q_config = { + "data_in_width": int(self.dut.DATA_IN_MAN_WIDTH), + "data_in_exponent_width": int(self.dut.DATA_IN_EXP_WIDTH), + "data_r_width": int(self.dut.DATA_R_WIDTH), + "block_size": int(self.dut.BLOCK_SIZE), + "data_out_width": int(self.dut.DATA_OUT_MAN_WIDTH), + "data_out_exponent_width": int(self.dut.DATA_OUT_EXP_WIDTH), + } + + for _ in range(self.num): + data = 49 * torch.rand(q_config["block_size"]) - 24.5 + from a_cx_mxint_quant.quantizers import mxint_quant_block + from a_cx_mxint_quant.softmax import MXIntHardwareExp + (qdata_in, mdata_in, edata_in) = mxint_quant_block( + data, + q_config["data_in_width"], + q_config["data_in_exponent_width"], + ) + + module = MXIntHardwareExp(q_config) + # Calculate expected output using software model + qout, mout, eout = module(qdata_in) + + inputs.append((mdata_in.int().tolist(), int(edata_in.int()))) + expected_outputs.append((mout.reshape(-1).int().tolist(), eout.reshape(-1).int().tolist())) + + return inputs, expected_outputs + + async def run_test(self): + await self.reset() + logger.info(f"Reset finished") + self.data_out_0_monitor.ready.value = 1 + + inputs, expected_outputs = self.generate_inputs() + + self.data_in_0_driver.load_driver(inputs) + self.data_out_0_monitor.load_monitor(expected_outputs) + + await Timer(500, units="us") + assert self.data_out_0_monitor.exp_queue.empty() + +@cocotb.test() +async def test(dut): + # cocotb.start_soon(check_signal(dut)) + tb = MXIntExpTB(dut, num=20) + await tb.run_test() + +async def check_signal(dut): + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await 
ReadOnly() + print(dut.data_in_0_valid.value, dut.data_in_0_ready.value) + if dut.data_in_0_valid.value == 1 and dut.data_in_0_ready.value == 1: + print( + "data_in_0 = ", [x.signed_integer for x in dut.mdata_in_0.value] + ) + print("end") + +from mase_components.helper import generate_memory +from pathlib import Path + +default_config = { + "DATA_IN_MAN_WIDTH": 8, + "DATA_IN_EXP_WIDTH": 4, + "BLOCK_SIZE": 2, + "DATA_R_WIDTH": 2, + "DATA_OUT_MAN_WIDTH": 10, + "DATA_OUT_EXP_WIDTH": 4, +} +if __name__ == "__main__": + valid_width = default_config["DATA_R_WIDTH"] + valid_frac_width = default_config["DATA_R_WIDTH"] - 1 + + hash_out_width = default_config["DATA_OUT_MAN_WIDTH"] + hash_out_frac_width = default_config["DATA_OUT_MAN_WIDTH"] - 2 + + generate_memory.generate_sv_lut( + "power2", + valid_width, + valid_frac_width, + hash_out_width, + hash_out_frac_width, + path=Path(__file__).parents[1] / "rtl", + constant_mult=1, + floor=False, + ) + mase_runner( + trace=True, + module_param_list=[{ + "DATA_IN_MAN_WIDTH": default_config["DATA_IN_MAN_WIDTH"], + "DATA_IN_EXP_WIDTH": default_config["DATA_IN_EXP_WIDTH"], + "BLOCK_SIZE": default_config["BLOCK_SIZE"], + "DATA_R_WIDTH": default_config["DATA_R_WIDTH"], + "DATA_OUT_MAN_WIDTH": default_config["DATA_OUT_MAN_WIDTH"], + "DATA_OUT_EXP_WIDTH": default_config["DATA_OUT_EXP_WIDTH"], + }], + # sim="questa", + ) diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_gelu_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_gelu_tb.py new file mode 100644 index 000000000..befa71d9d --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_gelu_tb.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 + +# This script tests the fixed point linear +import os, logging + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import * + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import ( + MultiSignalStreamDriver, + 
MultiSignalStreamMonitor, +) + +from mase_cocotb.runner import mase_runner +from chop.nn.quantizers.integer import _integer_floor_quantize +from typing import Literal, Optional, Tuple, Union, Dict, List +import torch +import math +from functools import partial +import random + +logger = logging.getLogger("testbench") +logger.setLevel(logging.DEBUG) + +torch.manual_seed(10) + + +def mxint_gelu(mx, ex, q_config): + """Vectorized range reduction""" + in_man_width = q_config["in_width"] + in_exp_width = q_config["in_exponent_width"] + out_man_width = q_config["out_width"] + out_exp_width = q_config["out_exponent_width"] + # first + real_x = mx * 2**(ex - in_man_width + 1) + if real_x >= 3: + out_mx, out_ex = mx, ex + elif real_x <= -3: + out_mx, out_ex = 0, ex + else: + quant_out_x = _integer_floor_quantize(torch.nn.GELU()(real_x), out_man_width, out_man_width - 1) + out_mx = quant_out_x * 2**(out_man_width - 1 - ex) + out_ex = ex + + return out_mx + +class MXIntGeluTB(Testbench): + def __init__(self, dut, num) -> None: + super().__init__(dut, dut.clk, dut.rst) + self.num = num + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + + self.data_in_0_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mdata_in_0, dut.edata_in_0), + dut.data_in_0_valid, + dut.data_in_0_ready, + ) + + self.data_out_0_monitor = MultiSignalStreamMonitor( + dut.clk, + (dut.mdata_out_0, dut.edata_out_0), + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True, + ) + self.input_drivers = { + "a": self.data_in_0_driver, + } + self.output_monitors = { + "out": self.data_out_0_monitor, + } + self.data_out_0_monitor.log.setLevel(logging.DEBUG) + def generate_inputs(self): + inputs = [] + expected_outputs = [] + + q_config = { + "data_in_width": int(self.dut.DATA_IN_0_PRECISION_0), + "data_in_exponent_width": int(self.dut.DATA_IN_0_PRECISION_1), + "data_in_parallelism": [int(self.dut.DATA_IN_0_PARALLELISM_DIM_1), int(self.dut.DATA_IN_0_PARALLELISM_DIM_0)], + 
"hash_out_width": int(self.dut.HASH_OUT_WIDTH), + "data_out_width": int(self.dut.DATA_OUT_0_PRECISION_0), + "data_out_exponent_width": int(self.dut.DATA_OUT_0_PRECISION_1), + "data_out_parallelism": [int(self.dut.DATA_OUT_0_PARALLELISM_DIM_1), int(self.dut.DATA_OUT_0_PARALLELISM_DIM_0)], + } + + for _ in range(self.num): + data = 10 * torch.rand(int(self.dut.DATA_IN_0_PARALLELISM_DIM_0)) - 5 # Generate data between -3 and 3 + from utils import mxint_quant_block + from mxint_module import mxint_gelu + (qdata_in, mdata_in, edata_in) = mxint_quant_block( + data, + q_config["data_in_width"], + q_config["data_in_exponent_width"], + ) + + # Calculate expected output using software model + qout, mout, eout = mxint_gelu( + qdata_in, + q_config + ) + + inputs.append((mdata_in.int().tolist(), int(edata_in.int()))) + expected_outputs.append((mout.reshape(-1).int().tolist(), int(eout.reshape(-1).int()))) + + return inputs, expected_outputs + + async def run_test(self): + await self.reset() + logger.info(f"Reset finished") + self.data_out_0_monitor.ready.value = 1 + + inputs, expected_outputs = self.generate_inputs() + + self.data_in_0_driver.load_driver(inputs) + self.data_out_0_monitor.load_monitor(expected_outputs) + + await Timer(500, units="us") + assert self.data_out_0_monitor.exp_queue.empty() + +@cocotb.test() +async def test(dut): + # cocotb.start_soon(check_signal(dut)) + tb = MXIntGeluTB(dut, num=20) + await tb.run_test() + +async def check_signal(dut): + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + print(dut.data_in_0_valid.value, dut.data_in_0_ready.value) + if dut.data_in_0_valid.value == 1 and dut.data_in_0_ready.value == 1: + print( + "data_in_0 = ", [x.signed_integer for x in dut.mdata_in_0.value] + ) + print("end") + +from mase_components.helper import generate_memory +from pathlib import Path +default_config = { + "data_in_width": 8, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 4], + 
"hash_out_width": 8, + "data_out_width": 8, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 4], +} +if __name__ == "__main__": + valid_width = default_config["data_in_width"] + 2 + valid_frac_width = default_config["data_in_width"] - 1 + + hash_out_width = default_config["hash_out_width"] + hash_out_frac_width = hash_out_width - 3 + + generate_memory.generate_sv_lut( + "gelu", + valid_width, + valid_frac_width, + hash_out_width, + hash_out_frac_width, + path=Path(__file__).parents[1] / "rtl", + constant_mult=1, + floor=False, + ) + mase_runner( + trace=True, + module_param_list=[{ + "DATA_IN_0_PRECISION_0": default_config["data_in_width"], + "DATA_IN_0_PRECISION_1": default_config["data_in_exponent_width"], + "DATA_IN_0_TENSOR_SIZE_DIM_0": 16, + "DATA_IN_0_PARALLELISM_DIM_0": default_config["data_in_parallelism"][1], + "HASH_OUT_WIDTH": default_config["hash_out_width"], + "DATA_OUT_0_PRECISION_0": default_config["data_out_width"], + "DATA_OUT_0_PRECISION_1": default_config["data_out_exponent_width"], + }], + # sim="questa", + ) diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_hardware_round_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_hardware_round_tb.py new file mode 100644 index 000000000..6287a4e47 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_hardware_round_tb.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 + +# This script tests the fixed point linear +import os, logging + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import * + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import ( + MultiSignalStreamDriver, + StreamMonitor, +) + +from mase_cocotb.runner import mase_runner +from utils import mxint_quantize + +import torch +from math import ceil, log2 +import random + +logger = logging.getLogger("testbench") +logger.setLevel(logging.DEBUG) + +torch.manual_seed(10) + +def hardware_round(mx, ex, 
in_man_frac_width): + round_max = 2**(8-1) - 1 + round_min = -2**(8-1) + round_x = mx.reshape(-1) // 2**((in_man_frac_width-ex).reshape(-1)) + return torch.clamp(round_x, round_min, round_max) + +class MXIntHardwareRoundTB(Testbench): + def __init__(self, dut, num) -> None: + super().__init__(dut, dut.clk, dut.rst) + self.num = num + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + + self.data_in_0_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mdata_in_0, dut.edata_in_0), + dut.data_in_0_valid, + dut.data_in_0_ready, + ) + + self.data_out_0_monitor = StreamMonitor( + dut.clk, + dut.data_out_0, + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True, + ) + + self.input_drivers = { + "a": self.data_in_0_driver, + } + self.output_drivers = { + "out": self.data_out_0_monitor, + } + def generate_inputs(self): + inputs = [] + exp_outputs = [] + for _ in range(self.num): + def hardware_round(mx, ex, in_man_frac_width): + round_max = 2**(8-1) - 1 + round_min = -2**(8-1) + round_x = mx.reshape(-1) // 2**((in_man_frac_width-ex).reshape(-1)) + print(mx.reshape(-1)) + print((in_man_frac_width-ex).reshape(-1)) + return torch.clamp(round_x, round_min, round_max) + data = 49 * torch.rand(int(self.dut.BLOCK_SIZE)) - 24.5 + (data_in, mdata_in, edata_in) = mxint_quantize( + data, + int(self.dut.DATA_IN_MAN_WIDTH), + int(self.dut.DATA_IN_EXP_WIDTH), + ) + n = hardware_round(mdata_in, edata_in, int(self.dut.DATA_IN_MAN_FRAC_WIDTH)) + print(n) + inputs.append((mdata_in.int().tolist(), edata_in.int().tolist())) + exp_outputs.append(n.int().tolist()) + return inputs, exp_outputs + + async def run_test(self): + await self.reset() + logger.info(f"Reset finished") + self.data_out_0_monitor.ready.value = 1 + + logger.info(f"generating inputs") + inputs, exp_outputs = self.generate_inputs() + + # Load the inputs driver + self.data_in_0_driver.load_driver(inputs) + # Load the output monitor + self.data_out_0_monitor.load_monitor(exp_outputs) + + 
await Timer(5, units="us") + assert self.data_out_0_monitor.exp_queue.empty() + + +@cocotb.test() +async def test(dut): + cocotb.start_soon(check_signal(dut)) + tb = MXIntHardwareRoundTB(dut, num=20) + await tb.run_test() + +async def check_signal(dut): + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + print(dut.data_in_0_valid.value, dut.data_in_0_ready.value) + if dut.data_in_0_valid.value == 1 and dut.data_in_0_ready.value == 1: + print( + "data_in_0 = ", [x.signed_integer for x in dut.mdata_in_0.value] + ) + print( + "shift_result = ", [x.signed_integer for x in dut.shift_result.value] + ) + print( + "clamped_n = ", [x.signed_integer for x in dut.clamped_n.value] + ) + # print( + # "data_out_0 = ", + # [x.signed_integer for x in dut.data_out_0.value], + # ) + print("end") +if __name__ == "__main__": + mase_runner( + trace=True, + module_param_list=[ + { + "DATA_IN_MAN_WIDTH": 8, + "DATA_IN_MAN_FRAC_WIDTH": 6, + "DATA_IN_EXP_WIDTH": 4, + "BLOCK_SIZE": 4, + "DATA_OUT_WIDTH": 8, + }, + ], + sim="questa", + ) diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_layernorm_1d_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_layernorm_1d_tb.py new file mode 100644 index 000000000..4541111eb --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_layernorm_1d_tb.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 + +import os +import torch +import logging +from functools import partial +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer, RisingEdge, ReadOnly +from pathlib import Path + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import MultiSignalStreamDriver, MultiSignalStreamMonitor +from mase_cocotb.runner import mase_runner +from a_cx_mxint_quant import mxint_quant_block, mxint_hardware +from utils import pack_tensor_to_mx_listed_chunk + +class MxIntLayerNorm1DTB(Testbench): + def __init__(self, 
dut) -> None: + super().__init__(dut, dut.clk, dut.rst) + + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + self.log.setLevel(logging.DEBUG) + + # Input data driver + self.data_in_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mdata_in_0, dut.edata_in_0), + dut.data_in_0_valid, + dut.data_in_0_ready + ) + + # Output monitor + self.out_monitor = MultiSignalStreamMonitor( + dut.clk, + (dut.mdata_out_0, dut.edata_out_0), + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True, + ) + + self.input_drivers = { + "data_in": self.data_in_driver, + } + self.output_monitors = {"out": self.out_monitor} + + # Model parameters + self.tensor_size_dim_0 = self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0") + self.parallelism_dim_0 = self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0") + + def preprocess_tensor_for_mxint(self, tensor, config, parallelism): + (qtensor, mtensor, etensor) = mxint_hardware(tensor, config, parallelism) + tensor_inputs = pack_tensor_to_mx_listed_chunk(mtensor, etensor, parallelism) + return tensor_inputs + + async def run_test(self): + await self.reset() + self.log.info("Reset finished") + self.out_monitor.ready.value = 1 + + input_data = torch.randn((1, self.tensor_size_dim_0)) + # Update config to match RTL parameter names + input_config = { + "width": self.get_parameter("DATA_IN_0_MAN_WIDTH"), + "exponent_width": self.get_parameter("DATA_IN_0_EXP_WIDTH"), + "round_bits": 4, + } + + input_parallelism = [ + 1, + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ] + (qtensor, mtensor, etensor) = mxint_hardware(input_data, input_config, input_parallelism) + shape = mtensor.shape + mtensor = mtensor.reshape(-1, self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0")).unsqueeze(0) + mtensor = mtensor // 2**(etensor.max() - etensor).unsqueeze(-1) + etensor = etensor.max().repeat(etensor.shape) + input_data_processed = pack_tensor_to_mx_listed_chunk(mtensor, etensor, input_parallelism) + 
self.data_in_driver.load_driver(input_data_processed) + + from a_cx_mxint_quant.layernorm import mxint_layer_norm + qinput = mtensor * 2**(etensor.unsqueeze(-1) - input_config["width"] - 1) + qinput = qinput.reshape(shape) + layer_norm_config = { + "name": "mxint_hardware", + # data + "data_in_width": self.get_parameter("DATA_IN_0_MAN_WIDTH"), + "data_in_exponent_width": self.get_parameter("DATA_IN_0_EXP_WIDTH"), + "data_in_parallelism": [1, self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0")], + "data_out_width": self.get_parameter("DATA_OUT_0_MAN_WIDTH"), + "data_out_exponent_width": self.get_parameter("DATA_OUT_0_EXP_WIDTH"), + "data_out_parallelism": [1, self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0")], + } + int_config = { + "qx_lossy": True, + "num_val_0_lossy": True, + "num_val_1_lossy": True, + "mean_lossy": True, + "var_lossy": True, + "isqrt_lossy": True, + "data_in_width": layer_norm_config["data_in_width"], + "data_in_frac_width": layer_norm_config["data_in_width"] - 1, + "isqrt_in_width": self.get_parameter("ISQRT_IN_MAN_WIDTH"), + "isqrt_in_exponent_width": 6, + "isqrt_out_width": self.get_parameter("ISQRT_OUT_MAN_WIDTH"), + "isqrt_out_frac_width": self.get_parameter("ISQRT_OUT_MAN_FRAC_WIDTH"), + "isqrt_out_exponent_width": 6, + "weight_width": 8, + "weight_frac_width": 6, + "bias_width": 8, + "bias_frac_width": 6, + "data_out_width": self.get_parameter("DATA_OUT_0_MAN_WIDTH"), + "data_out_frac_width": self.get_parameter("DATA_OUT_0_MAN_FRAC_WIDTH"), + } + qout_data, mout_data, eout_data = mxint_layer_norm(qinput, (self.tensor_size_dim_0,), None, None, q_config=int_config) + eout_data = eout_data.repeat(etensor.shape) + + # Simplified parallelism config since RTL only has one dimension + out_parallelism = [ + 1, + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ] + out_processed = pack_tensor_to_mx_listed_chunk(mout_data, eout_data, out_parallelism) + + self.out_monitor.load_monitor(out_processed) + + await Timer(100, units="us") + if not 
self.out_monitor.exp_queue.empty(): + raise RuntimeError("Output monitor is not empty at end of test") + +@cocotb.test() +async def test_mxint_layer_norm(dut): + cocotb.start_soon(check_signal(dut)) + tb = MxIntLayerNorm1DTB(dut) + await tb.run_test() + +async def check_signal(dut): + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + print(dut.data_in_0_valid.value, dut.data_in_0_ready.value) + print("end") + +default_config = { + # Input/output dimensions + "DATA_IN_0_TENSOR_SIZE_DIM_0": 10, # Changed from 8 to match RTL + "DATA_IN_0_PARALLELISM_DIM_0": 2, # Changed from 2 to match RTL + + # Data width parameters + "DATA_IN_0_MAN_WIDTH": 8, # Added to match RTL + "DATA_IN_0_MAN_FRAC_WIDTH": 7, # Added to match RTL + "DATA_IN_0_EXP_WIDTH": 4, # Added to match RTL + + "DATA_OUT_0_MAN_WIDTH": 8, # Added to match RTL + "DATA_OUT_0_MAN_FRAC_WIDTH": 7, # Added to match RTL + "DATA_OUT_0_EXP_WIDTH": 4, # Added to match RTL + + # ISQRT parameters + "ISQRT_IN_MAN_WIDTH": 8, # Added to match RTL + "ISQRT_IN_MAN_FRAC_WIDTH": 7, # Added to match RTL + "ISQRT_OUT_MAN_WIDTH": 8, # Added to match RTL + "ISQRT_OUT_MAN_FRAC_WIDTH": 4, # Added to match RTL +} + +def test_layer_norm_smoke(): + valid_width = default_config["ISQRT_IN_MAN_WIDTH"] + 1 + valid_frac_width = default_config["ISQRT_IN_MAN_WIDTH"] - 1 + + out_width = default_config["ISQRT_OUT_MAN_WIDTH"] + out_frac_width = default_config["ISQRT_OUT_MAN_FRAC_WIDTH"] + + from mase_components.helper import generate_memory + generate_memory.generate_sv_lut( + "isqrt", + valid_width, + valid_frac_width, + out_width, + out_frac_width, + path=Path(__file__).parents[1] / "rtl", + constant_mult=1, + floor=False, + ) + mase_runner( + trace=True, + module_param_list=[default_config], + skip_build=False, + sim="verilator", + + ) + +if __name__ == "__main__": + test_layer_norm_smoke() diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_layernorm_tb.py 
b/src/mase_components/linear_layers/mxint_operators/test/mxint_layernorm_tb.py new file mode 100644 index 000000000..a5b892d64 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_layernorm_tb.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 + +import os +import torch +import logging +from functools import partial +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer, RisingEdge, ReadOnly +from pathlib import Path + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import MultiSignalStreamDriver, MultiSignalStreamMonitor +from mase_cocotb.runner import mase_runner +from a_cx_mxint_quant import mxint_quant_block, mxint_hardware +from utils import pack_tensor_to_mx_listed_chunk + +class MxIntLayerNormTB(Testbench): + def __init__(self, dut) -> None: + super().__init__(dut, dut.clk, dut.rst) + + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + self.log.setLevel(logging.DEBUG) + + # Input data driver + self.data_in_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mdata_in_0, dut.edata_in_0), + dut.data_in_0_valid, + dut.data_in_0_ready + ) + + # Weight driver + self.weight_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mweight, dut.eweight), + dut.weight_valid, + dut.weight_ready + ) + + # Bias driver + self.bias_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mbias, dut.ebias), + dut.bias_valid, + dut.bias_ready + ) + + # Output monitor + self.out_monitor = MultiSignalStreamMonitor( + dut.clk, + (dut.mdata_out_0, dut.edata_out_0), + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True + ) + + self.input_drivers = { + "data_in": self.data_in_driver, + "weight": self.weight_driver, + "bias": self.bias_driver, + } + self.output_monitors = {"out": self.out_monitor} + self.out_monitor.log.setLevel(logging.DEBUG) + + # Model parameters + self.tensor_size_dim_0 = self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0") + self.tensor_size_dim_1 = 
self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_1") + self.parallelism_dim_0 = self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0") + self.parallelism_dim_1 = self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1") + + def preprocess_tensor_for_mxint(self, tensor, config, parallelism): + (qtensor, mtensor, etensor) = mxint_hardware(tensor, config, parallelism) + tensor_inputs = pack_tensor_to_mx_listed_chunk(mtensor, etensor, parallelism) + return tensor_inputs + async def run_test(self): + await self.reset() + self.log.info("Reset finished") + self.out_monitor.ready.value = 1 + + # Generate random tensors for testing + input_data = torch.randn((self.tensor_size_dim_1, self.tensor_size_dim_0)) + weight = torch.randn((self.tensor_size_dim_0,)) + bias = torch.randn((self.tensor_size_dim_0,)) + + # Input data processing + input_config = { + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + "round_bits": 4, + } + input_parallelism = [ + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ] + (qinput, minput, einput) = mxint_hardware(input_data, input_config, input_parallelism) + # Save original shape + shape = minput.shape + + # Reshape to match parallelism structure like in pack_tensor_to_mx_listed_chunk + reshaped_mtensor = minput.reshape(-1, shape[-2] // self.parallelism_dim_1, + self.parallelism_dim_1, + shape[-1] // self.parallelism_dim_0, + self.parallelism_dim_0)\ + .permute(0, 1, 3, 2, 4)\ + .reshape(-1, self.parallelism_dim_1 * self.parallelism_dim_0) + + # Get max exponent per block and adjust mantissa + reshaped_etensor = einput.reshape(-1) + emax = reshaped_etensor.max() + reshaped_mtensor = reshaped_mtensor // 2**(emax - reshaped_etensor).unsqueeze(-1) + + # Reshape back to original shape + minput = reshaped_mtensor.reshape(-1, shape[-2] // self.parallelism_dim_1, + shape[-1] // self.parallelism_dim_0, + self.parallelism_dim_1, + 
self.parallelism_dim_0)\ + .permute(0, 1, 3, 2, 4)\ + .reshape(shape) + einput = einput.max().repeat(einput.shape) + input_data_processed = pack_tensor_to_mx_listed_chunk(minput, einput, input_parallelism) + + # Weight processing + weight_config = { + "width": self.get_parameter("WEIGHT_PRECISION_0"), + "exponent_width": self.get_parameter("WEIGHT_PRECISION_1"), + "round_bits": 4, + } + # Weight has shape (tensor_size_dim_0,) + weight_parallelism = [1, self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0")] + weight_processed = self.preprocess_tensor_for_mxint(weight, weight_config, weight_parallelism) + + # Bias processing + bias_config = { + "width": self.get_parameter("BIAS_PRECISION_0"), + "exponent_width": self.get_parameter("BIAS_PRECISION_1"), + "round_bits": 4, + } + # Bias has shape (tensor_size_dim_0,) + bias_parallelism = [1, self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0")] + bias_processed = self.preprocess_tensor_for_mxint(bias, bias_config, bias_parallelism) + + # Load drivers + self.data_in_driver.load_driver(input_data_processed) + self.weight_driver.load_driver(weight_processed) + self.bias_driver.load_driver(bias_processed) + + # Generate expected output + from a_cx_mxint_quant.layernorm import mxint_layer_norm + qinput = minput * 2**(einput.reshape(-1)[0] - input_config["width"] - 1) + int_config = { + "qx_lossy": True, + "num_val_0_lossy": True, + "num_val_1_lossy": True, + "mean_lossy": True, + "var_lossy": True, + "isqrt_lossy": True, + "data_in_width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "data_in_frac_width": self.get_parameter("DATA_IN_0_PRECISION_0") - 1, + "isqrt_in_width": self.get_parameter("ISQRT_IN_PRECISION_0"), + "isqrt_in_exponent_width": 6, + "isqrt_out_width": self.get_parameter("ISQRT_OUT_PRECISION_0"), + "isqrt_out_frac_width": self.get_parameter("ISQRT_OUT_PRECISION_1"), + "isqrt_out_exponent_width": 6, + "weight_width": self.get_parameter("WEIGHT_PRECISION_0"), + "weight_exponent_width": 
self.get_parameter("WEIGHT_PRECISION_1"), + "weight_parallelism": [1, self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0")], + "bias_width": self.get_parameter("BIAS_PRECISION_0"), + "bias_exponent_width": self.get_parameter("BIAS_PRECISION_1"), + "bias_parallelism": [1, self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0")], + "data_out_width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "data_out_exponent_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + "data_out_parallelism": [self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0")], + } + qout_data, mout_data, eout_data = mxint_layer_norm(qinput, (self.tensor_size_dim_0,), weight, bias, q_config=int_config) + eout_data = eout_data + + out_parallelism = [ + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ] + out_processed = pack_tensor_to_mx_listed_chunk(mout_data, eout_data, out_parallelism) + self.out_monitor.load_monitor(out_processed) + + await Timer(100, units="us") + if not self.out_monitor.exp_queue.empty(): + raise RuntimeError("Output monitor is not empty at end of test") + +@cocotb.test() +async def test_mxint_layer_norm(dut): + # cocotb.start_soon(check_signal(dut)) + tb = MxIntLayerNormTB(dut) + await tb.run_test() + +async def check_signal(dut): + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + print(dut.data_in_0_valid.value, dut.data_in_0_ready.value) + print("end") + +default_config = { + # Input/output dimensions + "DATA_IN_0_TENSOR_SIZE_DIM_0": 4, + "DATA_IN_0_TENSOR_SIZE_DIM_1": 4, + "DATA_IN_0_TENSOR_SIZE_DIM_2": 1, + "DATA_IN_0_PARALLELISM_DIM_0": 2, + "DATA_IN_0_PARALLELISM_DIM_1": 1, + "DATA_IN_0_PARALLELISM_DIM_2": 1, + + # Data width parameters + "DATA_IN_0_PRECISION_0": 8, + "DATA_IN_0_PRECISION_1": 4, + + "WEIGHT_PRECISION_0": 8, + "WEIGHT_PRECISION_1": 4, + + "BIAS_PRECISION_0": 8, + "BIAS_PRECISION_1": 4, + + 
"DATA_OUT_0_PRECISION_0": 8, + "DATA_OUT_0_PRECISION_1": 4, + + # ISQRT parameters + "ISQRT_IN_PRECISION_0": 8, + "ISQRT_IN_PRECISION_1": 8, + "ISQRT_OUT_PRECISION_0": 8, + "ISQRT_OUT_PRECISION_1": 4, + + # Norm parameters + "NORM_OUT_PRECISION_0": 8, + "NORM_OUT_PRECISION_1": 4, + + # Other parameters + "ELEMENTWISE_AFFINE": 1, + "HAS_BIAS": 1 +} + +def test_layer_norm_smoke(): + valid_width = default_config["ISQRT_IN_PRECISION_0"] + 1 + valid_frac_width = default_config["ISQRT_IN_PRECISION_0"] - 1 + + out_width = default_config["ISQRT_OUT_PRECISION_0"] + out_frac_width = default_config["ISQRT_OUT_PRECISION_1"] + + from mase_components.helper import generate_memory + generate_memory.generate_sv_lut( + "isqrt", + valid_width, + valid_frac_width, + out_width, + out_frac_width, + path=Path(__file__).parents[1] / "rtl", + constant_mult=1, + floor=False, + ) + mase_runner( + trace=True, + module_param_list=[default_config], + skip_build=False, + sim="verilator", + ) + +if __name__ == "__main__": + test_layer_norm_smoke() diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_linear_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_linear_tb.py index 036c5a2e3..8a85922b2 100644 --- a/src/mase_components/linear_layers/mxint_operators/test/mxint_linear_tb.py +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_linear_tb.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 -import os, pytest +import os, sys, logging, traceback, pdb +import pytest import torch import logging @@ -8,18 +9,25 @@ import cocotb from cocotb.log import SimLog -from cocotb.triggers import Timer, RisingEdge +from cocotb.triggers import Timer, RisingEdge, ReadOnly from mase_cocotb.testbench import Testbench from mase_cocotb.interfaces.streaming import ( MultiSignalStreamDriver, MultiSignalStreamMonitor, + MultiSignalErrorThresholdStreamMonitor, ) from mase_cocotb.runner import mase_runner +def excepthook(exc_type, exc_value, exc_traceback): + 
traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) -torch.manual_seed(0) + +sys.excepthook = excepthook +# torch.manual_seed(0) # from mase_cocotb import Testbench, StreamDriver, StreamMonitor, mase_runner -from utils import MXIntLinear +from utils import MXIntLinearHardware class LinearTB(Testbench): @@ -28,7 +36,7 @@ def __init__(self, dut) -> None: if not hasattr(self, "log"): self.log = SimLog("%s" % (type(self).__qualname__)) - self.log.setLevel(logging.DEBUG) + # self.log.setLevel(logging.DEBUG) self.data_in_0_driver = MultiSignalStreamDriver( dut.clk, @@ -40,11 +48,16 @@ def __init__(self, dut) -> None: dut.clk, (dut.mweight, dut.eweight), dut.weight_valid, dut.weight_ready ) + self.input_drivers = { + "a": self.data_in_0_driver, + "b": self.weight_driver, + } if self.get_parameter("HAS_BIAS") == 1: self.bias_driver = MultiSignalStreamDriver( dut.clk, (dut.mbias, dut.ebias), dut.bias_valid, dut.bias_ready ) self.bias_driver.log.setLevel(logging.DEBUG) + self.input_drivers["bias"] = self.bias_driver self.data_out_0_monitor = MultiSignalStreamMonitor( dut.clk, @@ -54,137 +67,146 @@ def __init__(self, dut) -> None: check=True, ) + self.output_monitors = {"out": self.data_out_0_monitor} # Model - self.model = MXIntLinear( + self.model = MXIntLinearHardware( in_features=self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), out_features=self.get_parameter("DATA_OUT_0_TENSOR_SIZE_DIM_0"), bias=True if self.get_parameter("HAS_BIAS") == 1 else False, - config={ + q_config={ "data_in_width": self.get_parameter("DATA_IN_0_PRECISION_0"), "data_in_exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1"), - "data_in_parallelism_dim_1": self.get_parameter( - "DATA_IN_0_PARALLELISM_DIM_1" - ), - "data_in_parallelism_dim_0": self.get_parameter( - "DATA_IN_0_PARALLELISM_DIM_0" - ), + "data_in_parallelism": [ + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), + 
self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], "weight_width": self.get_parameter("WEIGHT_PRECISION_0"), "weight_exponent_width": self.get_parameter("WEIGHT_PRECISION_1"), - "weight_parallelism_dim_1": self.get_parameter( - "WEIGHT_PARALLELISM_DIM_1" - ), - "weight_parallelism_dim_0": self.get_parameter( - "WEIGHT_PARALLELISM_DIM_0" - ), + "weight_parallelism": [ + self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), + self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), + ], "bias_width": self.get_parameter("BIAS_PRECISION_0"), "bias_exponent_width": self.get_parameter("BIAS_PRECISION_1"), - "bias_parallelism_dim_1": self.get_parameter("BIAS_PARALLELISM_DIM_1"), - "bias_parallelism_dim_0": self.get_parameter("BIAS_PARALLELISM_DIM_0"), + "bias_parallelism": [ + self.get_parameter("BIAS_PARALLELISM_DIM_1"), + self.get_parameter("BIAS_PARALLELISM_DIM_0"), + ], "data_out_width": self.get_parameter("DATA_OUT_0_PRECISION_0"), "data_out_exponent_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), - "data_out_parallelism_dim_1": self.get_parameter( - "DATA_OUT_0_PARALLELISM_DIM_1" - ), - "data_out_parallelism_dim_0": self.get_parameter( - "DATA_OUT_0_PARALLELISM_DIM_0" - ), + "data_out_parallelism": [ + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ], + "round_bits": self.get_parameter("ROUND_BITS"), }, ) # Set verbosity of driver and monitor loggers to debug - self.data_in_0_driver.log.setLevel(logging.DEBUG) - self.weight_driver.log.setLevel(logging.DEBUG) + self.data_in_0_driver.log.setLevel(logging.INFO) + self.weight_driver.log.setLevel(logging.INFO) self.data_out_0_monitor.log.setLevel(logging.DEBUG) def preprocess_tensor_for_mxint(self, tensor, config, parallelism): - from utils import block_mxint_quant + from utils import mxint_hardware from utils import pack_tensor_to_mx_listed_chunk - (qtensor, mtensor, etensor) = block_mxint_quant(tensor, config, parallelism) + (qtensor, mtensor, etensor) = 
mxint_hardware(tensor, config, parallelism) tensor_inputs = pack_tensor_to_mx_listed_chunk(mtensor, etensor, parallelism) return tensor_inputs def generate_inputs(self): return torch.randn( ( + 1, self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_1"), self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), ) ) - async def run_test(self, us): + async def run_test(self, us, num=1): await self.reset() self.log.info(f"Reset finished") self.data_out_0_monitor.ready.value = 1 - inputs = self.generate_inputs() - exp_out = self.model(inputs) - - # * Load the inputs driver - self.log.info(f"Processing inputs: {inputs}") - inputs = self.preprocess_tensor_for_mxint( - tensor=inputs, - config={ - "width": self.get_parameter("DATA_IN_0_PRECISION_0"), - "exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), - self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), - ], - ) - self.data_in_0_driver.load_driver(inputs) + for i in range(num): + inputs = self.generate_inputs() + exp_out = self.model(inputs) - # * Load the weights driver - weights = self.model.weight + # * Load the inputs driver + self.log.info(f"Processing inputs: {inputs}") + inputs = self.preprocess_tensor_for_mxint( + tensor=inputs, + config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + "round_bits": self.get_parameter("ROUND_BITS"), + }, + parallelism=[ + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], + ) + self.data_in_0_driver.load_driver(inputs) - self.log.info(f"Processing weights: {weights}") - weights = self.preprocess_tensor_for_mxint( - tensor=weights, - config={ - "width": self.get_parameter("WEIGHT_PRECISION_0"), - "exponent_width": self.get_parameter("WEIGHT_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), - self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), - ], 
- ) - self.weight_driver.load_driver(weights) + # * Load the weights driver + weights = self.model.weight - # * Load the bias driver - if self.get_parameter("HAS_BIAS") == 1: - bias = self.model.bias - self.log.info(f"Processing bias: {bias}") - bias = self.preprocess_tensor_for_mxint( - tensor=bias, + self.log.info(f"Processing weights: {weights}") + weights = self.preprocess_tensor_for_mxint( + tensor=weights, config={ - "width": self.get_parameter("BIAS_PRECISION_0"), - "exponent_width": self.get_parameter("BIAS_PRECISION_1"), + "width": self.get_parameter("WEIGHT_PRECISION_0"), + "exponent_width": self.get_parameter("WEIGHT_PRECISION_1"), + "round_bits": 8, }, parallelism=[ - self.get_parameter("BIAS_PARALLELISM_DIM_1"), - self.get_parameter("BIAS_PARALLELISM_DIM_0"), + self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), + self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), ], ) - self.bias_driver.load_driver(bias) - - # * Load the output monitor - self.log.info(f"Processing outputs: {exp_out}") - outs = self.preprocess_tensor_for_mxint( - tensor=exp_out, - config={ - "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), - "exponent_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), - }, - parallelism=[ - self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), - self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), - ], - ) - breakpoint() - self.data_out_0_monitor.load_monitor(outs) - + if self.get_parameter("CIRCULAR_WEIGHT") == 0: + weights = weights * self.get_parameter("IN_0_DEPTH_DIM_1") + self.weight_driver.load_driver(weights) + + self.input_drivers = {"in0": self.data_in_0_driver, "in1": self.weight_driver} + # * Load the bias driver + if self.get_parameter("HAS_BIAS") == 1: + bias = self.model.bias + self.log.info(f"Processing bias: {bias}") + bias = self.preprocess_tensor_for_mxint( + tensor=bias, + config={ + "width": self.get_parameter("BIAS_PRECISION_0"), + "exponent_width": self.get_parameter("BIAS_PRECISION_1"), + "round_bits": 8, + }, + parallelism=[ + 
self.get_parameter("BIAS_PARALLELISM_DIM_1"), + self.get_parameter("BIAS_PARALLELISM_DIM_0"), + ], + ) + if self.get_parameter("CIRCULAR_WEIGHT") == 0: + bias = bias * self.get_parameter("IN_0_DEPTH_DIM_1") + self.bias_driver.load_driver(bias) + self.input_drivers["in2"] = self.bias_driver + + # * Load the output monitor + self.log.info(f"Processing outputs: {exp_out}") + outs = self.preprocess_tensor_for_mxint( + tensor=exp_out, + config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + "round_bits": self.get_parameter("ROUND_BITS"), + }, + parallelism=[ + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ], + ) + self.data_out_0_monitor.load_monitor(outs) + self.output_monitors = {"out": self.data_out_0_monitor} await Timer(us, units="us") assert self.data_out_0_monitor.exp_queue.empty() @@ -192,65 +214,48 @@ async def run_test(self, us): @cocotb.test() async def cocotb_test(dut): tb = LinearTB(dut) - await tb.run_test(us=100) + # cocotb.start_soon(check_signal(dut)) + await tb.run_test(us=1000, num=5) + + +async def check_signal(dut): + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + print("fifo_data_out = ", dut.fifo_data_out_valid.value, dut.fifo_data_out_ready.value) + # print("end") def get_fixed_linear_config(kwargs={}): - # if pretranspose - # weight1 = in0 - # else - # weight0 = in0 # currently, we only consider the transposed situation - # config = { - # "HAS_BIAS": 1, - # "DATA_IN_0_TENSOR_SIZE_DIM_0": 2, - # "DATA_IN_0_TENSOR_SIZE_DIM_1": 2, - # "DATA_IN_0_PARALLELISM_DIM_0": 2, - # "DATA_IN_0_PARALLELISM_DIM_1": 1, - # "WEIGHT_TENSOR_SIZE_DIM_0": 2, - # "WEIGHT_TENSOR_SIZE_DIM_1": 2, - # "WEIGHT_PARALLELISM_DIM_0": 2, - # "WEIGHT_PARALLELISM_DIM_1": 1, - # "DATA_IN_0_PRECISION_0": 8, - # "DATA_IN_0_PRECISION_1": 4, - # "WEIGHT_PRECISION_0": 8, - # "WEIGHT_PRECISION_1": 4, - # 
"BIAS_PRECISION_0": 8, - # "BIAS_PRECISION_1": 4, - # "DATA_OUT_0_PRECISION_0": 10, - # "DATA_OUT_0_PRECISION_1": 4, - # } config = { "HAS_BIAS": 1, "DATA_IN_0_TENSOR_SIZE_DIM_0": 32, - "DATA_IN_0_TENSOR_SIZE_DIM_1": 16, "DATA_IN_0_PARALLELISM_DIM_0": 4, - "DATA_IN_0_PARALLELISM_DIM_1": 4, - "WEIGHT_TENSOR_SIZE_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_1": 16, - "WEIGHT_PARALLELISM_DIM_0": 4, + "DATA_IN_0_TENSOR_SIZE_DIM_1": 16, + "DATA_IN_0_PARALLELISM_DIM_1": 1, + "WEIGHT_TENSOR_SIZE_DIM_1": 32, "WEIGHT_PARALLELISM_DIM_1": 4, - "DATA_IN_0_PRECISION_0": 9, + "DATA_IN_0_PRECISION_0": 8, "DATA_IN_0_PRECISION_1": 4, - "WEIGHT_PRECISION_0": 8, + "WEIGHT_PRECISION_0": 6, "WEIGHT_PRECISION_1": 3, - "BIAS_PRECISION_0": 8, + "BIAS_PRECISION_0": 6, "BIAS_PRECISION_1": 4, - "DATA_OUT_0_PRECISION_0": 12, + "DATA_OUT_0_PRECISION_0": 8, "DATA_OUT_0_PRECISION_1": 4, } config.update(kwargs) return config -@pytest.mark.dev def test_fixed_linear_smoke(): """ Some quick tests to check if the module is working. """ mase_runner( trace=True, - extra_build_args=["--trace-depth", "8"], module_param_list=[ get_fixed_linear_config(), # noticed here if change WEIGHT_PRE_TRANSPOSED also need to change the DIM_SIZE to match ACTIVATION @@ -263,10 +268,11 @@ def test_fixed_linear_smoke(): # }, # ), ], + sim="questa", + # gui=True, ) -@pytest.mark.dev def test_fixed_linear_regression(): """ More extensive tests to check realistic parameter sizes. 
@@ -274,35 +280,32 @@ def test_fixed_linear_regression(): mase_runner( trace=True, module_param_list=[ - get_fixed_linear_config( - { - "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, - "DATA_IN_0_PARALLELISM_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_0": 768, - "WEIGHT_TENSOR_SIZE_DIM_1": 768, - "WEIGHT_PARALLELISM_DIM_0": 32, - "WEIGHT_PARALLELISM_DIM_1": 32, - "BIAS_TENSOR_SIZE_DIM_0": 768, - "BIAS_PARALLELISM_DIM_0": 32, - } - ), get_fixed_linear_config( { "HAS_BIAS": 1, - "DATA_IN_0_TENSOR_SIZE_DIM_0": 768, + "DATA_IN_0_TENSOR_SIZE_DIM_0": 192, "DATA_IN_0_PARALLELISM_DIM_0": 32, - "WEIGHT_TENSOR_SIZE_DIM_0": 768, - "WEIGHT_TENSOR_SIZE_DIM_1": 768, - "WEIGHT_PARALLELISM_DIM_0": 32, + "DATA_IN_0_TENSOR_SIZE_DIM_1": 196, + "DATA_IN_0_PARALLELISM_DIM_1": 1, + "WEIGHT_TENSOR_SIZE_DIM_1": 192, "WEIGHT_PARALLELISM_DIM_1": 32, - "BIAS_TENSOR_SIZE_DIM_0": 768, - "BIAS_PARALLELISM_DIM_0": 32, - } + "ROUND_BITS": 4, + }, + # { + # "HAS_BIAS": 1, + # "DATA_IN_0_TENSOR_SIZE_DIM_0": 2, + # "DATA_IN_0_PARALLELISM_DIM_0": 2, + # "DATA_IN_0_TENSOR_SIZE_DIM_1": 2, + # "DATA_IN_0_PARALLELISM_DIM_1": 1, + # "WEIGHT_TENSOR_SIZE_DIM_1": 2, + # "WEIGHT_PARALLELISM_DIM_1": 1, + # } ), ], + sim = "verilator", ) if __name__ == "__main__": - test_fixed_linear_smoke() - # test_fixed_linear_regression() + # test_fixed_linear_smoke() + test_fixed_linear_regression() diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_matmul_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_matmul_tb.py index f31d1ab61..322ce3d41 100644 --- a/src/mase_components/linear_layers/mxint_operators/test/mxint_matmul_tb.py +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_matmul_tb.py @@ -14,7 +14,7 @@ ) from mase_cocotb.runner import mase_runner -from utils import block_mxint_quant +from utils import block_mxint_quant, MXIntMatmulHardware from mase_cocotb.matrix_tools import gen_random_matrix_input, matrix_mult_model from mase_cocotb.utils import bit_driver @@ -67,6 +67,12 
@@ def __init__(self, dut) -> None: dut.out_ready, check=True, ) + self.input_drivers = { + "a": self.a_driver, + "b": self.b_driver, + } + self.output_monitors = {"out": self.output_monitor} + self.output_monitor.log.setLevel(logging.DEBUG) def generate_inputs(self): for _ in range(self.num): @@ -98,18 +104,43 @@ def generate_inputs(self): self.get_parameter("B_COMPUTE_DIM0"), ], ) - matmul_out = qa @ qb + self.log.debug(f"hardware_out = {ma @ mb}") - (qout, mout, eout) = block_mxint_quant( - matmul_out, - q_config={ + (mout, eout) = MXIntMatmulHardware( + ma, + ea, + mb, + eb, + { + "width": self.get_parameter("A_MAN_WIDTH"), + "exponent_width": self.get_parameter("A_EXP_WIDTH"), + "parallism_dim_0": self.get_parameter("A_COMPUTE_DIM0"), + "parallism_dim_1": self.get_parameter("A_COMPUTE_DIM1"), + "depth_dim_0": self.get_parameter("A_DEPTH_DIM0"), + "depth_dim_1": self.get_parameter("A_DEPTH_DIM1"), + "dim_0": self.get_parameter("A_TOTAL_DIM0"), + "dim_1": self.get_parameter("A_TOTAL_DIM1"), + }, + { + "width": self.get_parameter("B_MAN_WIDTH"), + "exponent_width": self.get_parameter("B_EXP_WIDTH"), + "parallism_dim_0": self.get_parameter("B_COMPUTE_DIM0"), + "parallism_dim_1": self.get_parameter("B_COMPUTE_DIM1"), + "depth_dim_0": self.get_parameter("B_DEPTH_DIM0"), + "depth_dim_1": self.get_parameter("B_DEPTH_DIM1"), + "dim_0": self.get_parameter("B_TOTAL_DIM0"), + "dim_1": self.get_parameter("B_TOTAL_DIM1"), + }, + { "width": self.get_parameter("OUT_MAN_WIDTH"), "exponent_width": self.get_parameter("OUT_EXP_WIDTH"), + "parallism_dim_0": self.get_parameter("C_COMPUTE_DIM0"), + "parallism_dim_1": self.get_parameter("C_COMPUTE_DIM1"), + "depth_dim_0": self.get_parameter("C_DEPTH_DIM0"), + "depth_dim_1": self.get_parameter("C_DEPTH_DIM1"), + "dim_0": self.get_parameter("C_TOTAL_DIM0"), + "dim_1": self.get_parameter("C_TOTAL_DIM1"), }, - parallelism=[ - self.get_parameter("C_COMPUTE_DIM1"), - self.get_parameter("C_COMPUTE_DIM0"), - ], ) from utils import 
pack_tensor_to_mx_listed_chunk @@ -166,11 +197,11 @@ async def run_test(self, batches, us): # await tb.run_test(batches=1, us=100) -# @cocotb.test() -# async def repeated_mult(dut): -# tb = MXIntMatmulTB(dut) -# tb.output_monitor.ready.value = 1 -# await tb.run_test(batches=1000, us=2000) +@cocotb.test() +async def repeated_mult(dut): + tb = MXIntMatmulTB(dut) + tb.output_monitor.ready.value = 1 + await tb.run_test(batches=20, us=20) # @cocotb.test() @@ -180,13 +211,13 @@ async def run_test(self, batches, us): # await tb.run_test(batches=500, us=2000) -@cocotb.test() -async def repeated_mult_valid_backpressure(dut): - tb = MXIntMatmulTB(dut) - tb.a_driver.set_valid_prob(0.7) - tb.b_driver.set_valid_prob(0.7) - cocotb.start_soon(bit_driver(dut.out_ready, dut.clk, 0.6)) - await tb.run_test(batches=20, us=200) +# @cocotb.test() +# async def repeated_mult_valid_backpressure(dut): +# tb = MXIntMatmulTB(dut) +# tb.a_driver.set_valid_prob(0.7) +# tb.b_driver.set_valid_prob(0.7) +# cocotb.start_soon(bit_driver(dut.out_ready, dut.clk, 0.6)) +# await tb.run_test(batches=20, us=200) def gen_random_dimensions(): @@ -236,19 +267,18 @@ def test_matmul(): "A_COMPUTE_DIM1": 2, "B_COMPUTE_DIM0": 2, "B_COMPUTE_DIM1": 2, # Must equal A_COMPUTE_DIM0 - "A_MAN_WIDTH": 8, - "A_EXP_WIDTH": 3, - "B_MAN_WIDTH": 8, - "B_EXP_WIDTH": 3, - "OUT_MAN_WIDTH": 8, - "OUT_EXP_WIDTH": 3, + "A_MAN_WIDTH": 4, + "A_EXP_WIDTH": 8, + "B_MAN_WIDTH": 4, + "B_EXP_WIDTH": 8, + "OUT_MAN_WIDTH": 4, + "OUT_EXP_WIDTH": 8, } mase_runner( module_param_list=[ # Default Square - DEFAULT_CONFIG, - # + # DEFAULT_CONFIG, # { # **DEFAULT_CONFIG, # "A_MAN_WIDTH": 9, @@ -258,7 +288,7 @@ def test_matmul(): # "OUT_MAN_WIDTH": 12, # "OUT_EXP_WIDTH": 4, # }, - # # Long Rectangle, should saturate many values + # Long Rectangle, should saturate many values { **DEFAULT_CONFIG, "A_TOTAL_DIM0": 16, @@ -270,20 +300,20 @@ def test_matmul(): "B_COMPUTE_DIM0": 4, "B_COMPUTE_DIM1": 4, # Must equal A_COMPUTE_DIM0 }, - # # # Change window 
to full size - { - **DEFAULT_CONFIG, - "A_COMPUTE_DIM0": 4, - "A_COMPUTE_DIM1": 4, - "B_COMPUTE_DIM0": 4, - "B_COMPUTE_DIM1": 4, - }, + # Change window to full size + # { + # **DEFAULT_CONFIG, + # "A_COMPUTE_DIM0": 4, + # "A_COMPUTE_DIM1": 4, + # "B_COMPUTE_DIM0": 4, + # "B_COMPUTE_DIM1": 4, + # }, # # Dimensions # *generate_random_dimension_cfg([DEFAULT_CONFIG]), ], trace=True, jobs=12, - extra_build_args=["--trace-depth", "5"], + # sim="questa", ) diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_module.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_module.py new file mode 100644 index 000000000..a64a25c83 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_module.py @@ -0,0 +1,77 @@ +# models.py +import torch +import torch.nn as nn +import math +from typing import List, Union, Optional +from pathlib import Path +import torch +import torch.nn as nn +from torch import Tensor +import math +from typing import Literal, Optional, Tuple, Union, Dict +from enum import Enum +from functools import partial +from tqdm import tqdm +from chop.nn.quantizers.integer import _integer_floor_quantize, _integer_quantize +from utils import mxint_quant_block, mxint_hardware +from utils import reshape_to_block, reshape_back + +def mxint_gelu(x, q_config): + """Vectorized range reduction""" + qx, mx, ex = mxint_hardware( + x, + { + "width": q_config["data_in_width"], + "exponent_width": q_config["data_in_exponent_width"], + "round_bits": 4, + }, + parallelism=q_config["data_in_parallelism"] + ) + # first + + original_shape = qx.shape + t1, t0 = mx.shape[-2:] + p1, p0 = q_config["data_in_parallelism"] + qx = reshape_to_block(qx, t1,t0, p1, p0) + mx = reshape_to_block(mx, t1, t0, p1, p0) + ex = ex.unsqueeze(-1).unsqueeze(-1) + + qout = torch.relu(qx) + eout = ex + remaining = (qx > -3) & (qx < 3) + + # data_width_loss + # avoid quant_loss here + # we will need to shift it to + # in hardware qx is lossless + 
VALID_WIDTH = q_config["data_in_width"] + 2 + HASH_OUT_WIDTH = q_config["hash_out_width"] + HASH_OUT_FRAC_WIDTH = HASH_OUT_WIDTH - 3 + # hash loss + qgelu = _integer_quantize(torch.nn.GELU()(qx), HASH_OUT_WIDTH, HASH_OUT_FRAC_WIDTH) + mgelu = qgelu * 2**(HASH_OUT_WIDTH - 1) // 2**ex + qgelu = mgelu * 2**ex / 2**(HASH_OUT_WIDTH - 1) + + qout[remaining] = qgelu[remaining] + qout = reshape_back(qout, t1, t0, p1, p0) + qout = qout.reshape(original_shape) + qx, mx, ex = mxint_hardware( + qout, + { + "width": q_config["data_out_width"], + "exponent_width": q_config["data_out_exponent_width"], + "round_bits": 4, + }, + parallelism=q_config["data_out_parallelism"] + ) + return qx, mx, ex + +class MXIntGELU(nn.Module): + def __init__(self, q_config: Dict = {}): + super().__init__() + self.q_config = q_config + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out, _, _ = mxint_gelu(x, self.q_config) + return out + diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_patch_embed_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_patch_embed_tb.py new file mode 100644 index 000000000..b17ff8dce --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_patch_embed_tb.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 + +import os, pytest + +import torch +import logging +from functools import partial + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer, RisingEdge, ReadOnly + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import ( + MultiSignalStreamDriver, + MultiSignalStreamMonitor, +) +from mase_cocotb.runner import mase_runner + +# torch.manual_seed(0) +# from mase_cocotb import Testbench, StreamDriver, StreamMonitor, mase_runner + + +class PatchEmbedTB(Testbench): + def __init__(self, dut) -> None: + super().__init__(dut, dut.clk, dut.rst) + + # Data drivers + self.data_in_0_driver = MultiSignalStreamDriver( + dut.clk, (dut.mdata_in_0, 
dut.edata_in_0), + dut.data_in_0_valid, dut.data_in_0_ready + ) + + self.cls_token_driver = MultiSignalStreamDriver( + dut.clk, (dut.mcls_token, dut.ecls_token), + dut.cls_token_valid, dut.cls_token_ready + ) + + self.distill_token_driver = MultiSignalStreamDriver( + dut.clk, (dut.mdistill_token, dut.edistill_token), + dut.distill_token_valid, dut.distill_token_ready + ) + + self.weight_driver = MultiSignalStreamDriver( + dut.clk, (dut.mweight, dut.eweight), + dut.weight_valid, dut.weight_ready + ) + + self.input_drivers = { + "data": self.data_in_0_driver, + "cls": self.cls_token_driver, + "distill": self.distill_token_driver, + "weight": self.weight_driver, + } + + if self.get_parameter("HAS_BIAS") == 1: + self.bias_driver = MultiSignalStreamDriver( + dut.clk, (dut.mbias, dut.ebias), + dut.bias_valid, dut.bias_ready + ) + self.input_drivers["bias"] = self.bias_driver + + # Output monitor + self.data_out_0_monitor = MultiSignalStreamMonitor( + dut.clk, (dut.mdata_out_0, dut.edata_out_0), + dut.data_out_0_valid, dut.data_out_0_ready, + check=False + ) + self.output_monitors = {"out": self.data_out_0_monitor} + + def preprocess_tensor_for_mxint(self, tensor, config, parallelism): + from utils import block_mxint_quant + from utils import pack_tensor_to_mx_listed_chunk + + (qtensor, mtensor, etensor) = block_mxint_quant(tensor, config, parallelism) + tensor_inputs = pack_tensor_to_mx_listed_chunk(mtensor, etensor, parallelism) + return tensor_inputs + + def generate_inputs(self): + return { + "data": torch.randn(( + self.get_parameter("IN_X"), + self.get_parameter("IN_Y"), + self.get_parameter("IN_C") + )), + "cls_token": torch.randn(( + self.get_parameter("OUT_C") + )), + "distill_token": torch.randn(( + self.get_parameter("OUT_C") + )), + "weight": torch.randn(( + self.get_parameter("OUT_C"), + self.get_parameter("KERNEL_X") * self.get_parameter("KERNEL_Y") * self.get_parameter("IN_C") + )), + "bias": torch.randn(( + self.get_parameter("OUT_C") + )) if 
self.get_parameter("HAS_BIAS") == 1 else None + } + + def generate_exp_outputs(self): + return torch.randn( + (self.get_parameter("SLIDING_NUM") + 2, + self.get_parameter("OUT_C")) + ) + + async def run_test(self, us, num=1): + await self.reset() + self.data_out_0_monitor.ready.value = 1 + + for i in range(num): + # Generate all random inputs + inputs = self.generate_inputs() + # Generate random expected outputs instead of using model + exp_out = self.generate_exp_outputs() + + # Process input data + data_inputs = self.preprocess_tensor_for_mxint( + tensor=inputs["data"], + config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1") + }, + parallelism=[ + 1, + self.get_parameter("IN_C"), + ] + ) + self.data_in_0_driver.load_driver(data_inputs) + + # Process cls token + cls_inputs = self.preprocess_tensor_for_mxint( + tensor=inputs["cls_token"], + config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1") + }, + parallelism=[1,self.get_parameter("UNROLL_OUT_C")] + ) + self.cls_token_driver.load_driver(cls_inputs) + + # Process distill token + distill_inputs = self.preprocess_tensor_for_mxint( + tensor=inputs["distill_token"], + config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1") + }, + parallelism=[1,self.get_parameter("UNROLL_OUT_C")] + ) + self.distill_token_driver.load_driver(distill_inputs) + + # Process weights + weight_inputs = self.preprocess_tensor_for_mxint( + tensor=inputs["weight"], + config={ + "width": self.get_parameter("WEIGHT_PRECISION_0"), + "exponent_width": self.get_parameter("WEIGHT_PRECISION_1") + }, + parallelism=[ + self.get_parameter("UNROLL_OUT_C"), + self.get_parameter("IN_C"), + ] + ) + self.weight_driver.load_driver(weight_inputs) + + # Process bias if needed + if self.get_parameter("HAS_BIAS") == 1: + bias_inputs = 
self.preprocess_tensor_for_mxint( + tensor=inputs["bias"], + config={ + "width": self.get_parameter("BIAS_PRECISION_0"), + "exponent_width": self.get_parameter("BIAS_PRECISION_1") + }, + parallelism=[1,self.get_parameter("UNROLL_OUT_C")] + ) + self.bias_driver.load_driver(bias_inputs) + + # Load output monitor with random exp_out + outs = self.preprocess_tensor_for_mxint( + tensor=exp_out, + config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + parallelism=[ + 1, + self.get_parameter("UNROLL_OUT_C"), + ] + ) + self.data_out_0_monitor.load_monitor(outs) + + await Timer(us, units="us") + assert self.data_out_0_monitor.exp_queue.empty() + +@cocotb.test() +async def test_patch_embed(dut): + tb = PatchEmbedTB(dut) + await tb.run_test(us=10, num=5) + +def get_patch_embed_config(kwargs={}): + """ + Default configuration for patch embedding test + """ + config = { + # Basic parameters + "HAS_BIAS": 1, + + # Input dimensions + "IN_X": 4, # Input feature map height + "IN_Y": 4, # Input feature map width + "IN_C": 3, # Input channels + + # Kernel dimensions + "KERNEL_X": 2, # Kernel height + "KERNEL_Y": 2, # Kernel width + "OUT_C": 4, # Output channels + + # Parallelism + "UNROLL_OUT_C": 2, # Parallel output channels + + # Precision configurations + "DATA_IN_0_PRECISION_0": 8, # Input mantissa width + "DATA_IN_0_PRECISION_1": 4, # Input exponent width + + "WEIGHT_PRECISION_0": 8, # Weight mantissa width + "WEIGHT_PRECISION_1": 4, # Weight exponent width + + "BIAS_PRECISION_0": 8, # Bias mantissa width + "BIAS_PRECISION_1": 4, # Bias exponent width + + "DATA_OUT_0_PRECISION_0": 10, # Output mantissa width + "DATA_OUT_0_PRECISION_1": 4, # Output exponent width + } + + # Allow overriding with custom parameters + config.update(kwargs) + return config + +def test_patch_embed_regression(): + """ + More extensive tests with different parameter configurations + """ + mase_runner( + trace=True, + 
module_param_list=[ + # Basic test with default config + get_patch_embed_config(), + + # Test with larger dimensions + # get_patch_embed_config({ + # "IN_X": 28, + # "IN_Y": 28, + # "IN_C": 16, + # "OUT_C": 128, + # "UNROLL_OUT_C": 16 + # }), + + # # Test with different precision + # get_patch_embed_config({ + # "DATA_IN_0_PRECISION_0": 16, + # "DATA_IN_0_PRECISION_1": 6, + # "WEIGHT_PRECISION_0": 12, + # "WEIGHT_PRECISION_1": 5, + # "DATA_OUT_0_PRECISION_0": 16, + # "DATA_OUT_0_PRECISION_1": 6 + # }) + ], + # sim="questa" + ) + +if __name__ == "__main__": + test_patch_embed_regression() # Use regression test instead of smoke test \ No newline at end of file diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_range_reduction_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_range_reduction_tb.py new file mode 100644 index 000000000..2e5d97fb0 --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_range_reduction_tb.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 + +# This script tests the fixed point linear +import os, logging + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import * + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import ( + MultiSignalStreamDriver, + StreamMonitor, +) + +from mase_cocotb.runner import mase_runner +from a_cx_mxint_quant import mxint_quant_block, mxint_hardware +from typing import Literal, Optional, Tuple, Union, Dict, List +import torch +import math +from functools import partial +import random + +logger = logging.getLogger("testbench") +logger.setLevel(logging.DEBUG) + +torch.manual_seed(10) + +def quantized_range_reduction(mx, ex, in_man_width, data_out_n_width): + """Vectorized range reduction""" + def hardware_round(mx, ex, in_man_frac_width, data_out_width): + round_max = 2**(data_out_width-1) - 1 + round_min = -2**(data_out_width-1) + round_x = mx.reshape(-1) // 2**((in_man_frac_width-ex).reshape(-1)) + 
return torch.clamp(round_x, round_min, round_max) + coefficient_quant_block = partial( + mxint_quant_block, + width=8, + exponent_width=4, + round_bits=4) + _, mlog2_e, elog2_e = coefficient_quant_block(torch.log2(torch.tensor(math.e))) + _, mln_2, eln_2 = coefficient_quant_block(torch.log(torch.tensor(2.0))) + n = hardware_round(mx * mlog2_e, ex + elog2_e, (in_man_width - 1 + 7), data_out_n_width) + print(n) + _mx = n * mln_2 + _ex = eln_2 + shifted_mx = mx // 2**(_ex - ex + (in_man_width - 1) - 7) + print(shifted_mx) + print(_ex - ex + (in_man_width - 1) - 7) + mr = shifted_mx - _mx + breakpoint() + # return mr as an fixedpoint ?.7 we can make it 2.7 + # return n as an integer number with width = data_out_width + return mr, n + +class MXIntRangeReductionTB(Testbench): + def __init__(self, dut, num) -> None: + super().__init__(dut, dut.clk, dut.rst) + self.num = num + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + + self.data_in_0_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mdata_in_0, dut.edata_in_0), + dut.data_in_0_valid, + dut.data_in_0_ready, + ) + + self.data_out_n_monitor = StreamMonitor( + dut.clk, + dut.data_out_n, + dut.data_out_n_valid, + dut.data_out_n_ready, + check=True, + ) + + self.data_out_r_monitor = StreamMonitor( + dut.clk, + dut.data_out_r, + dut.data_out_r_valid, + dut.data_out_r_ready, + check=True, + ) + + self.input_drivers = { + "a": self.data_in_0_driver, + } + self.output_monitors = { + "n": self.data_out_n_monitor, + "r": self.data_out_r_monitor, + } + + def generate_inputs(self): + inputs = [] + exp_r_outputs = [] + exp_n_outputs = [] + for _ in range(self.num): + torch.manual_seed(0) + data = 49 * torch.rand(int(self.dut.BLOCK_SIZE)) - 24.5 + (data_in, mdata_in, edata_in) = mxint_quant_block( + data, + int(self.dut.DATA_IN_MAN_WIDTH), + int(self.dut.DATA_IN_EXP_WIDTH), + 4, + ) + r,n = quantized_range_reduction(mdata_in, edata_in, int(self.dut.DATA_IN_MAN_WIDTH), 
int(self.dut.DATA_OUT_N_WIDTH)) + inputs.append((mdata_in.int().tolist(), int(edata_in))) + exp_r_outputs.append(r.int().tolist()) + exp_n_outputs.append(n.int().tolist()) + return inputs, exp_r_outputs, exp_n_outputs + + async def run_test(self): + await self.reset() + logger.info(f"Reset finished") + self.data_out_n_monitor.ready.value = 1 + self.data_out_r_monitor.ready.value = 1 + + logger.info(f"generating inputs") + inputs, exp_r_outputs, exp_n_outputs = self.generate_inputs() + + # Load the inputs driver + self.data_in_0_driver.load_driver(inputs) + # Load the output monitors + print(exp_n_outputs) + self.data_out_n_monitor.load_monitor(exp_n_outputs) + self.data_out_r_monitor.load_monitor(exp_r_outputs) + + await Timer(5, units="us") + assert self.data_out_n_monitor.exp_queue.empty() + assert self.data_out_r_monitor.exp_queue.empty() + +@cocotb.test() +async def test(dut): + cocotb.start_soon(check_signal(dut)) + tb = MXIntRangeReductionTB(dut, num=20) + await tb.run_test() + +async def check_signal(dut): + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + print(dut.data_in_0_valid.value, dut.data_in_0_ready.value) + if dut.data_in_0_valid.value == 1 and dut.data_in_0_ready.value == 1: + print( + "data_in_0 = ", [x.signed_integer for x in dut.mdata_in_0.value] + ) + if dut.data_out_n_valid.value == 1 and dut.data_out_n_ready.value == 1: + print( + "data_out_n = ", [x.signed_integer for x in dut.data_out_n.value] + ) + # "straight_data_out_n = ", [x for x in dut.straight_data_out_n.value] + # ) + # print( + # "mdata_in_0_log2_e = ", [x for x in dut.mdata_in_0_log2_e.value] + # ) + print("end") + +if __name__ == "__main__": + mase_runner( + trace=True, + module_param_list=[ + { + "DATA_IN_MAN_WIDTH": 8, + "DATA_IN_EXP_WIDTH": 4, + "BLOCK_SIZE": 4, + "DATA_OUT_N_WIDTH": 8, + }, + ], + sim="verilator", + ) diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_softmax_tb.py 
b/src/mase_components/linear_layers/mxint_operators/test/mxint_softmax_tb.py new file mode 100644 index 000000000..127b866fd --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_softmax_tb.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 + +# This script tests the fixed point linear +import os, logging + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import * + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import ( + MultiSignalStreamDriver, + MultiSignalStreamMonitor, +) + +from mase_cocotb.runner import mase_runner +from typing import Literal, Optional, Tuple, Union, Dict, List +import torch +import math +from functools import partial +import random + +logger = logging.getLogger("testbench") +logger.setLevel(logging.DEBUG) + +torch.manual_seed(10) + +class MXIntSoftmaxTB(Testbench): + def __init__(self, dut, num) -> None: + super().__init__(dut, dut.clk, dut.rst) + self.num = num + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + + self.data_in_0_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mdata_in_0, dut.edata_in_0), + dut.data_in_0_valid, + dut.data_in_0_ready, + ) + + self.data_out_0_monitor = MultiSignalStreamMonitor( + dut.clk, + (dut.mdata_out_0, dut.edata_out_0), + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True, + ) + + self.input_drivers = { + "a": self.data_in_0_driver, + } + self.output_monitors = { + "out": self.data_out_0_monitor, + } + self.data_out_0_monitor.log.setLevel(logging.DEBUG) + def generate_inputs(self): + inputs = [] + exp_outputs = [] + torch.manual_seed(0) + from a_cx_mxint_quant.softmax import MXIntSoftmax + from a_cx_mxint_quant.quantizers import mxint_hardware + for _ in range(self.num): + data = 49 * torch.rand(int(self.dut.DATA_IN_0_DIM)) - 24.5 + q_config = { + "data_in_width": int(self.dut.DATA_IN_0_PRECISION_0), + "data_in_exponent_width": int(self.dut.DATA_IN_0_PRECISION_1), + "block_size": 
int(self.dut.BLOCK_SIZE), + "data_out_width": int(self.dut.DATA_OUT_0_PRECISION_0), + "data_out_exponent_width": int(self.dut.DATA_OUT_0_PRECISION_1), + "data_width": int(self.dut.DATA_IN_0_PRECISION_0), + "data_exponent_width": int(self.dut.DATA_IN_0_PRECISION_1), + "data_r_width": int(self.dut.DATA_R_WIDTH), + "exp_sum_underflow_bits": int(self.dut.EXP_SUM_UNDERFLOW_BITS), + "division_underflow_bits": int(self.dut.DIVISION_UNDERFLOW_BITS), + } + qdata_in, mdata_in, edata_in = mxint_hardware( + data, + q_config = { + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + "round_bits": 4, + }, + parallelism=[1, 1], + ) + + module = MXIntSoftmax(q_config) + qout, mout, eout = module(data) + qout, mout, eout = mxint_hardware( + qout, + q_config = { + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "exponent_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + "round_bits": 4, + }, + parallelism=[1, 1], + ) + + mdata_in = mdata_in.reshape(-1) + edata_in = edata_in.reshape(-1) + mout = mout.reshape(-1) + eout = eout.reshape(-1) + shape = mdata_in.shape[0] + for i in range(shape): + inputs.append(([int(mdata_in[i])], int(edata_in[i]))) + exp_outputs.append(([int(mout[i])], int(eout[i]))) + return inputs, exp_outputs + + async def run_test(self): + await self.reset() + logger.info(f"Reset finished") + + logger.info(f"generating inputs") + inputs, exp_outputs = self.generate_inputs() + + # Load the inputs driver + self.data_in_0_driver.load_driver(inputs) + # Load the output monitors + self.data_out_0_monitor.load_monitor(exp_outputs) + + await Timer(20, units="us") + assert self.data_out_0_monitor.exp_queue.empty() + +@cocotb.test() +async def test(dut): + # cocotb.start_soon(check_signal(dut)) + tb = MXIntSoftmaxTB(dut, num=20) + await tb.run_test() + +async def check_signal(dut): + await Timer(40, units="ns") + while True: + await RisingEdge(dut.clk) + await ReadOnly() + + # Print all 
valid/ready signals + print("\nValid/Ready Signals:") + print(f"data_out_0: {dut.data_out_0_valid.value}/{dut.data_out_0_ready.value}") + print("---") + +from mase_components.helper import generate_memory +from pathlib import Path + +default_config = { + "DATA_IN_0_PRECISION_0": 8, + "DATA_IN_0_PRECISION_1": 4, + "DATA_IN_0_DIM": 8, + "DATA_OUT_0_PRECISION_0": 8, + "DATA_OUT_0_PRECISION_1": 4, + "DATA_R_WIDTH": 2, + "EXP_SUM_UNDERFLOW_BITS": 1, + "DIVISION_UNDERFLOW_BITS": 6, +} +if __name__ == "__main__": + valid_width = default_config["DATA_R_WIDTH"] + valid_frac_width = default_config["DATA_R_WIDTH"] - 1 + hash_out_width = default_config["DATA_IN_0_PRECISION_0"] + hash_out_frac_width = default_config["DATA_IN_0_PRECISION_0"] - 2 + generate_memory.generate_sv_lut( + "power2", + valid_width, + valid_frac_width, + hash_out_width, + hash_out_frac_width, + path=Path(__file__).parents[1] / "rtl", + constant_mult=1, + floor=False, + ) + mase_runner( + trace=True, + module_param_list=[ + default_config, + ], + sim="verilator", + ) diff --git a/src/mase_components/linear_layers/mxint_operators/test/mxint_vit_attention_head_tb.py b/src/mase_components/linear_layers/mxint_operators/test/mxint_vit_attention_head_tb.py new file mode 100644 index 000000000..f132a3eca --- /dev/null +++ b/src/mase_components/linear_layers/mxint_operators/test/mxint_vit_attention_head_tb.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 + +import os + +import torch +import logging +from functools import partial + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer, RisingEdge, ReadOnly +from pathlib import Path + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import MultiSignalStreamDriver, MultiSignalStreamMonitor +from mase_cocotb.runner import mase_runner + +# from mase_cocotb import Testbench, StreamDriver, StreamMonitor, mase_runner +from chop.nn.quantized import ViTSelfAttentionHeadInteger +from chop.nn.quantizers import 
integer_quantizer, integer_floor_quantizer + +from mase_components.helper import generate_memory + +import pytest +import math + +import torch +from torch import Tensor +import torch.nn as nn +import math + +from typing import Optional, Tuple +from functools import partial + +from mase_components.linear_layers.mxint_operators.test.utils import MXIntLinearHardware, MXIntMatmulHardware +from mase_components.linear_layers.mxint_operators.test.mxint_softmax_tb import mxint_softmax + +class MxIntViTSelfAttentionHeadTB(Testbench): + def __init__(self, dut) -> None: + super().__init__(dut, dut.clk, dut.rst) + + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + self.log.setLevel(logging.DEBUG) + + # * QKV drivers with MxInt format + self.query_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mquery, dut.equery), + dut.query_valid, + dut.query_ready + ) + self.key_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mkey, dut.ekey), + dut.key_valid, + dut.key_ready + ) + self.value_driver = MultiSignalStreamDriver( + dut.clk, + (dut.mvalue, dut.evalue), + dut.value_valid, + dut.value_ready + ) + + self.out_monitor = MultiSignalStreamMonitor( + dut.clk, + (dut.mout, dut.eout), + dut.out_valid, + dut.out_ready, + check=False, + ) + + self.input_drivers = {"in0": self.query_driver, "in1": self.key_driver, "in2": self.value_driver} + self.output_monitors = {"out": self.out_monitor} + # Model parameters + self.head_size = self.get_parameter("IN_DATA_TENSOR_SIZE_DIM_0") + self.seq_len = self.get_parameter("IN_DATA_TENSOR_SIZE_DIM_1") + + # Configure logging + # self.query_driver.log.setLevel(logging.DEBUG) + # self.key_driver.log.setLevel(logging.DEBUG) + # self.value_driver.log.setLevel(logging.DEBUG) + # self.out_monitor.log.setLevel(logging.DEBUG) + + def preprocess_tensor_for_mxint(self, tensor, config, parallelism): + from utils import block_mxint_quant + from utils import pack_tensor_to_mx_listed_chunk + + (qtensor, mtensor, etensor) = 
block_mxint_quant(tensor, config, parallelism) + tensor_inputs = pack_tensor_to_mx_listed_chunk(mtensor, etensor, parallelism) + return tensor_inputs + + async def run_test(self): + await self.reset() + self.log.info("Reset finished") + self.out_monitor.ready.value = 1 + + # Generate random inputs + query = torch.randn((self.seq_len, self.head_size)) + key = torch.randn((self.seq_len, self.head_size)) + value = torch.randn((self.seq_len, self.head_size)) + + # Process and load inputs + config = { + "width": self.get_parameter("IN_DATA_PRECISION_0"), + "exponent_width": self.get_parameter("IN_DATA_PRECISION_1"), + } + parallelism = [ + self.get_parameter("IN_DATA_PARALLELISM_DIM_1"), + self.get_parameter("IN_DATA_PARALLELISM_DIM_0"), + ] + + query_inputs = self.preprocess_tensor_for_mxint(query, config, parallelism) + key_inputs = self.preprocess_tensor_for_mxint(key, config, parallelism) + value_inputs = self.preprocess_tensor_for_mxint(value, config, parallelism) + + self.query_driver.load_driver(query_inputs) + self.key_driver.load_driver(key_inputs) + self.value_driver.load_driver(value_inputs) + + # Generate expected outputs (using random values for this example) + exp_out = torch.randn((self.seq_len, self.head_size)) + out_config = { + "width": self.get_parameter("OUT_DATA_PRECISION_0"), + "exponent_width": self.get_parameter("OUT_DATA_PRECISION_1"), + } + out_parallelism = [ + self.get_parameter("OUT_DATA_PARALLELISM_DIM_1"), + self.get_parameter("OUT_DATA_PARALLELISM_DIM_0"), + ] + outs = self.preprocess_tensor_for_mxint(exp_out, out_config, out_parallelism) + self.out_monitor.load_monitor(outs) + + await Timer(1, units="ms") + if not self.out_monitor.exp_queue.empty(): + raise RuntimeError("Output monitor is not empty at end of test") + + +@cocotb.test() +async def cocotb_test(dut): + cocotb.start_soon(check_signal(dut)) + tb = MxIntViTSelfAttentionHeadTB(dut) + await tb.run_test() + +async def check_signal(dut): + await Timer(40, units="ns") + while True: 
+ await RisingEdge(dut.clk) + await ReadOnly() + + # Print all valid/ready signals + print("\nValid/Ready Signals:") + # print(f"query: {dut.query_valid.value}/{dut.query_ready.value}") + # print(f"key: {dut.key_valid.value}/{dut.key_ready.value}") + print(f"qk: {dut.qk_valid.value}/{dut.qk_ready.value}") + print(f"query_key_linear_acc: {dut.query_key_linear.acc_data_out_valid.value}/{dut.query_key_linear.acc_data_out_ready.value}") + print(f"query_key_linear_fifo: {dut.query_key_linear.fifo_data_out_valid.value}/{dut.query_key_linear.fifo_data_out_ready.value}") + print(f"query_key_linear_cast_buffer: {dut.query_key_linear.cast_i.buffer_data_for_out_valid.value}/{dut.query_key_linear.cast_i.buffer_data_for_out_ready.value}") + print(f"log2_max_value: {dut.query_key_linear.cast_i.log2_max_value_valid.value}/{dut.query_key_linear.cast_i.log2_max_value_ready.value}") + # Print data values when valid and ready + # if dut.query_valid.value == 1 and dut.query_ready.value == 1: + # print("query_mout = ", [x.signed_integer for x in dut.mquery.value]) + # print("query_eout = ", dut.equery.value.signed_integer) + + # if dut.key_valid.value == 1 and dut.key_ready.value == 1: + # print("key_mout = ", [x.signed_integer for x in dut.mkey.value]) + # print("key_eout = ", dut.ekey.value.signed_integer) + + if dut.query_key_linear.cast_i.log2_max_value_valid.value == 1 and dut.query_key_linear.cast_i.log2_max_value_ready.value == 1: + print("log2_max_value = ", dut.query_key_linear.cast_i.log2_max_value.value.signed_integer) + + if dut.query_key_linear.cast_i.buffer_data_for_out_valid.value == 1 and dut.query_key_linear.cast_i.buffer_data_for_out_ready.value == 1: + print("query_key_linear_cast_buffer_mdata = ", [x.signed_integer for x in dut.query_key_linear.cast_i.mbuffer_data_for_out.value]) + print("query_key_linear_cast_buffer_edata = ", dut.query_key_linear.cast_i.ebuffer_data_for_out.value.signed_integer) + + if dut.query_key_linear.cast_i.buffer_data_for_out_valid.value == 
1 and dut.query_key_linear.cast_i.buffer_data_for_out_ready.value == 1: + print("query_key_linear_cast_buffer_mdata = ", [x.signed_integer for x in dut.query_key_linear.cast_i.mbuffer_data_for_out.value]) + print("query_key_linear_cast_buffer_edata = ", dut.query_key_linear.cast_i.ebuffer_data_for_out.value.signed_integer) + + if dut.query_key_linear.acc_data_out_valid.value == 1 and dut.query_key_linear.acc_data_out_ready.value == 1: + print("query_key_linear_acc_mdata_out = ", [x.signed_integer for x in dut.query_key_linear.acc_mdata_out.value]) + print("query_key_linear_acc_edata_out = ", dut.query_key_linear.acc_edata_out.value.signed_integer) + + if dut.qk_valid.value == 1 and dut.qk_ready.value == 1: + print("qk_mout = ", [x.signed_integer for x in dut.qk_mout.value]) + print("qk_eout = ", dut.qk_eout.value.signed_integer) + + if dut.query_key_linear.fifo_data_out_valid.value == 1 and dut.query_key_linear.fifo_data_out_ready.value == 1: + print("query_key_linear_fifo_mdata_out = ", [x.signed_integer for x in dut.query_key_linear.fifo_mdata_out.value]) + print("query_key_linear_fifo_edata_out = ", dut.query_key_linear.fifo_edata_out.value.signed_integer) + + print("---") + +default_config = { + "IN_DATA_TENSOR_SIZE_DIM_0": 4, + "IN_DATA_TENSOR_SIZE_DIM_1": 12, + "IN_DATA_PARALLELISM_DIM_0": 4, + "IN_DATA_PARALLELISM_DIM_1": 1, + "IN_DATA_PRECISION_0": 8, + "IN_DATA_PRECISION_1": 4, + "OUT_DATA_PRECISION_0": 8, + "OUT_DATA_PRECISION_1": 4, +} +def get_fixed_self_attention_head_config(kwargs={}): + config = default_config + config.update(kwargs) + return config + + +torch.manual_seed(1) + + +@pytest.mark.dev +def test_fixed_self_attention_head_smoke(): + """ + Some quick tests to check if the module is working. 
class MxIntViTAttentionTB(Testbench):
    """Cocotb testbench for the MXInt ViT attention wrapper.

    Drives MXInt (mantissa + shared exponent) stimulus into the DUT's
    activation stream and into the Q/K/V/projection weight and bias streams,
    and monitors the output stream.

    NOTE(review): the output monitor is created with ``check=False`` and is
    loaded with a randomly generated "expected" tensor, so this bench only
    exercises stream handshaking/liveness, not numerical correctness.
    """

    def __init__(self, dut) -> None:
        super().__init__(dut, dut.clk, dut.rst)

        if not hasattr(self, "log"):
            self.log = SimLog("%s" % (type(self).__qualname__))
            self.log.setLevel(logging.DEBUG)

        # Input activation stream (mantissa + shared exponent).
        self.data_in_driver = MultiSignalStreamDriver(
            dut.clk,
            (dut.mdata_in_0, dut.edata_in_0),
            dut.data_in_0_valid,
            dut.data_in_0_ready,
        )

        # Query parameter streams.
        # NOTE(review): the query weight ports are named mweight_query /
        # eweight_query while key/value use mkey_weight / mvalue_weight —
        # inconsistent but must match the RTL port names; confirm against it.
        self.query_weight_driver = MultiSignalStreamDriver(
            dut.clk,
            (dut.mweight_query, dut.eweight_query),
            dut.query_weight_valid,
            dut.query_weight_ready,
        )
        self.query_bias_driver = MultiSignalStreamDriver(
            dut.clk,
            (dut.mquery_bias, dut.equery_bias),
            dut.query_bias_valid,
            dut.query_bias_ready,
        )

        # Key parameter streams.
        self.key_weight_driver = MultiSignalStreamDriver(
            dut.clk,
            (dut.mkey_weight, dut.ekey_weight),
            dut.key_weight_valid,
            dut.key_weight_ready,
        )
        self.key_bias_driver = MultiSignalStreamDriver(
            dut.clk,
            (dut.mkey_bias, dut.ekey_bias),
            dut.key_bias_valid,
            dut.key_bias_ready,
        )

        # Value parameter streams.
        self.value_weight_driver = MultiSignalStreamDriver(
            dut.clk,
            (dut.mvalue_weight, dut.evalue_weight),
            dut.value_weight_valid,
            dut.value_weight_ready,
        )
        self.value_bias_driver = MultiSignalStreamDriver(
            dut.clk,
            (dut.mvalue_bias, dut.evalue_bias),
            dut.value_bias_valid,
            dut.value_bias_ready,
        )

        # Projection parameter streams (mantissa + exponent, like the others;
        # an earlier comment claiming "no exponent for proj bias" was wrong).
        self.proj_weight_driver = MultiSignalStreamDriver(
            dut.clk,
            (dut.mproj_weight, dut.eproj_weight),
            dut.proj_weight_valid,
            dut.proj_weight_ready,
        )
        self.proj_bias_driver = MultiSignalStreamDriver(
            dut.clk,
            (dut.mproj_bias, dut.eproj_bias),
            dut.proj_bias_valid,
            dut.proj_bias_ready,
        )

        # Output stream monitor; check=False because no golden model is attached.
        self.out_monitor = MultiSignalStreamMonitor(
            dut.clk,
            (dut.mdata_out_0, dut.edata_out_0),
            dut.data_out_0_valid,
            dut.data_out_0_ready,
            check=False,
        )

        self.input_drivers = {
            "data_in": self.data_in_driver,
            "query_weight": self.query_weight_driver,
            "query_bias": self.query_bias_driver,
            "key_weight": self.key_weight_driver,
            "key_bias": self.key_bias_driver,
            "value_weight": self.value_weight_driver,
            "value_bias": self.value_bias_driver,
            "proj_weight": self.proj_weight_driver,
            "proj_bias": self.proj_bias_driver,
        }
        self.output_monitors = {"out": self.out_monitor}

        # Model dimensions, read from the DUT generics.
        self.num_heads = self.get_parameter("NUM_HEADS")
        self.hidden_size = self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0")
        self.seq_len = self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_1")
        self.head_size = self.hidden_size // self.num_heads

    def preprocess_tensor_for_mxint(self, tensor, config, parallelism):
        """Block-quantize ``tensor`` to MXInt and pack it into driver-ready chunks."""
        from utils import block_mxint_quant
        from utils import pack_tensor_to_mx_listed_chunk

        (qtensor, mtensor, etensor) = block_mxint_quant(tensor, config, parallelism)
        tensor_inputs = pack_tensor_to_mx_listed_chunk(mtensor, etensor, parallelism)
        return tensor_inputs

    async def run_test(self):
        """Drive random stimulus through every stream and wait for outputs.

        Raises:
            RuntimeError: if the output monitor still holds expected beats
                when the fixed 1 ms watchdog expires.
        """
        await self.reset()
        self.log.info("Reset finished")
        self.out_monitor.ready.value = 1

        # Generate random tensors for all inputs.
        batch_size = self.seq_len
        hidden_size = self.hidden_size

        # Input data.
        input_data = torch.randn((batch_size, hidden_size))

        # Query/Key/Value weights and biases.
        qkv_weight_shape = (hidden_size, hidden_size)
        qkv_bias_shape = (hidden_size,)

        query_weight = torch.randn(qkv_weight_shape)
        query_bias = torch.randn(qkv_bias_shape)
        key_weight = torch.randn(qkv_weight_shape)
        key_bias = torch.randn(qkv_bias_shape)
        value_weight = torch.randn(qkv_weight_shape)
        value_bias = torch.randn(qkv_bias_shape)

        # Projection weights and biases.
        proj_weight = torch.randn(qkv_weight_shape)
        proj_bias = torch.randn(qkv_bias_shape)

        # Quantization configs for the different parameter types.
        input_config = {
            "width": self.get_parameter("DATA_IN_0_PRECISION_0"),
            "exponent_width": self.get_parameter("DATA_IN_0_PRECISION_1"),
        }
        weight_config = {
            "width": self.get_parameter("WEIGHT_PRECISION_0"),
            "exponent_width": self.get_parameter("WEIGHT_PRECISION_1"),
        }
        bias_config = {
            "width": self.get_parameter("BIAS_PRECISION_0"),
            "exponent_width": self.get_parameter("BIAS_PRECISION_1"),
        }
        proj_config = {
            "width": self.get_parameter("WEIGHT_PROJ_PRECISION_0"),
            "exponent_width": self.get_parameter("WEIGHT_PROJ_PRECISION_1"),
        }

        # Parallelism configurations ([dim_1, dim_0] ordering).
        input_parallelism = [
            self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"),
            self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"),
        ]
        weight_parallelism = [
            self.get_parameter("WEIGHT_PARALLELISM_DIM_1"),
            self.get_parameter("WEIGHT_PARALLELISM_DIM_0"),
        ]
        bias_parallelism = [
            self.get_parameter("BIAS_PARALLELISM_DIM_1"),
            self.get_parameter("BIAS_PARALLELISM_DIM_0"),
        ]
        proj_parallelism = [
            self.get_parameter("WEIGHT_PROJ_PARALLELISM_DIM_1"),
            self.get_parameter("WEIGHT_PROJ_PARALLELISM_DIM_0"),
        ]

        # Preprocess all inputs.
        input_data_processed = self.preprocess_tensor_for_mxint(input_data, input_config, input_parallelism)

        query_weight_processed = self.preprocess_tensor_for_mxint(query_weight, weight_config, weight_parallelism)
        query_bias_processed = self.preprocess_tensor_for_mxint(query_bias, bias_config, bias_parallelism)

        key_weight_processed = self.preprocess_tensor_for_mxint(key_weight, weight_config, weight_parallelism)
        key_bias_processed = self.preprocess_tensor_for_mxint(key_bias, bias_config, bias_parallelism)

        value_weight_processed = self.preprocess_tensor_for_mxint(value_weight, weight_config, weight_parallelism)
        value_bias_processed = self.preprocess_tensor_for_mxint(value_bias, bias_config, bias_parallelism)

        proj_weight_processed = self.preprocess_tensor_for_mxint(proj_weight, proj_config, proj_parallelism)
        proj_bias_processed = self.preprocess_tensor_for_mxint(proj_bias, bias_config, bias_parallelism)

        # Load all drivers.
        self.data_in_driver.load_driver(input_data_processed)

        self.query_weight_driver.load_driver(query_weight_processed)
        self.query_bias_driver.load_driver(query_bias_processed)

        self.key_weight_driver.load_driver(key_weight_processed)
        self.key_bias_driver.load_driver(key_bias_processed)

        self.value_weight_driver.load_driver(value_weight_processed)
        self.value_bias_driver.load_driver(value_bias_processed)

        self.proj_weight_driver.load_driver(proj_weight_processed)
        self.proj_bias_driver.load_driver(proj_bias_processed)
        # Bug fix: removed a leftover `breakpoint()` here — it dropped every
        # (non-interactive) simulation run into pdb and hung CI.

        # Generate a placeholder "expected" output (monitor runs with
        # check=False, so this only sets the number of beats to wait for).
        exp_out = torch.randn((batch_size, hidden_size))
        out_config = {
            "width": self.get_parameter("DATA_OUT_0_PRECISION_0"),
            "exponent_width": self.get_parameter("DATA_OUT_0_PRECISION_1"),
        }
        out_parallelism = [
            self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"),
            self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"),
        ]
        out_processed = self.preprocess_tensor_for_mxint(exp_out, out_config, out_parallelism)
        self.out_monitor.load_monitor(out_processed)

        await Timer(1, units="ms")
        if not self.out_monitor.exp_queue.empty():
            raise RuntimeError("Output monitor is not empty at end of test")
@cocotb.test()
async def cocotb_test(dut):
    """Top-level cocotb entry point: start signal monitoring, then run the TB."""
    cocotb.start_soon(check_signal(dut))  # Enable signal monitoring
    tb = MxIntViTAttentionTB(dut)
    await tb.run_test()


default_config = {
    # Number of attention heads
    "NUM_HEADS": 2,
    # Input data parameters
    "DATA_IN_0_TENSOR_SIZE_DIM_0": 8,
    "DATA_IN_0_TENSOR_SIZE_DIM_1": 2,
    "DATA_IN_0_PARALLELISM_DIM_0": 2,
    "DATA_IN_0_PARALLELISM_DIM_1": 1,
    "DATA_IN_0_PRECISION_0": 8,
    "DATA_IN_0_PRECISION_1": 3,
    # Weight parameters (shared by Q,K,V)
    "WEIGHT_TENSOR_SIZE_DIM_0": 8,
    "WEIGHT_TENSOR_SIZE_DIM_1": 8,
    "WEIGHT_PARALLELISM_DIM_0": 2,
    "WEIGHT_PARALLELISM_DIM_1": 2,
    "WEIGHT_PRECISION_0": 8,
    "WEIGHT_PRECISION_1": 3,
    # Bias parameters (shared by Q,K,V)
    "HAS_BIAS": 1,
    "BIAS_PRECISION_0": 8,
    "BIAS_PRECISION_1": 3,
    # Internal precision parameters
    "QKV_PRECISION_0": 16,
    "QKV_PRECISION_1": 3,
    # Projection parameters
    "WEIGHT_PROJ_PRECISION_0": 12,
    "WEIGHT_PROJ_PRECISION_1": 3,
    "BIAS_PROJ_PRECISION_0": 8,
    "BIAS_PROJ_PRECISION_1": 3,
}


def get_fixed_self_attention_head_config(kwargs=None):
    """Return a fresh copy of ``default_config`` with ``kwargs`` overrides applied.

    Bug fix: the previous version used a mutable default argument and mutated
    the shared module-level ``default_config`` dict in place, so overrides
    leaked between calls.
    """
    config = dict(default_config)
    config.update(kwargs or {})
    return config


torch.manual_seed(1)


@pytest.mark.dev
def test_fixed_self_attention_head_smoke():
    """
    Some quick tests to check if the module is working.
    """
    mase_runner(
        trace=True,
        module_param_list=[
            get_fixed_self_attention_head_config(),
        ],
        skip_build=False,
        # sim="questa",
    )


if __name__ == "__main__":
    test_fixed_self_attention_head_smoke()
class MxIntViTSelfAttentionHead(torch.nn.Module):
    """Software model of a single MXInt ViT self-attention head.

    Operates directly on (mantissa, exponent) pairs produced by MXInt block
    quantization: computes Q·K^T, applies an MXInt softmax, then multiplies
    by V, mirroring the hardware dataflow.

    NOTE(review): ``attn_drop`` is accepted but never used, and
    ``self.linear`` is assigned the *class* ``MXIntLinearHardware`` without
    being instantiated or called anywhere in this class — confirm whether
    both are leftovers.
    """

    def __init__(self, dim, attn_drop, q_config) -> None:
        super().__init__()
        self.dim = dim

        # Extract configs for different components
        self.linear_config = q_config["linear"]
        self.matmul_config = q_config["matmul"]
        self.softmax_config = q_config["softmax"]

        # Initialize hardware components with configs
        # NOTE(review): these are class/function objects, not instances;
        # `self.matmul(...)` below therefore relies on them being callable
        # with the full argument list — verify against utils.
        self.linear = MXIntLinearHardware
        self.matmul = MXIntMatmulHardware
        self.act = partial(mxint_softmax, q_config=self.softmax_config)

    def self_attention_head(
        self,
        mquery: torch.Tensor, equery: torch.Tensor,
        mkey: torch.Tensor, ekey: torch.Tensor,
        mvalue: torch.Tensor, evalue: torch.Tensor,
    ) -> Tensor:
        """Run one attention head on quantized (mantissa, exponent) inputs.

        Args:
            mquery/equery: query mantissas and per-block shared exponents.
            mkey/ekey: key mantissas and exponents.
            mvalue/evalue: value mantissas and exponents.

        Returns:
            The dequantized context tensor (mantissa rescaled by exponent).
        """
        # Configure configs for the Q*K^T matrix multiplication.
        # Note the intentional "parallism" spelling: these keys must match
        # the consumer's expected config schema.
        qk_x_config = {
            "width": self.matmul_config["A_MAN_WIDTH"],
            "exponent_width": self.matmul_config["A_EXP_WIDTH"],
            "parallism_dim_0": self.matmul_config["A_COMPUTE_DIM0"],
            "parallism_dim_1": self.matmul_config["A_COMPUTE_DIM1"],
            "depth_dim_0": mquery.shape[-1] // self.matmul_config["A_COMPUTE_DIM0"],
            "depth_dim_1": mquery.shape[-2] // self.matmul_config["A_COMPUTE_DIM1"],
            "dim_0": mquery.shape[-1],
            "dim_1": mquery.shape[-2],
        }

        qk_y_config = {
            "width": self.matmul_config["B_MAN_WIDTH"],
            "exponent_width": self.matmul_config["B_EXP_WIDTH"],
            "parallism_dim_0": self.matmul_config["B_COMPUTE_DIM0"],
            "parallism_dim_1": self.matmul_config["B_COMPUTE_DIM1"],
            "depth_dim_0": mkey.shape[-1] // self.matmul_config["B_COMPUTE_DIM0"],
            "depth_dim_1": mkey.shape[-2] // self.matmul_config["B_COMPUTE_DIM1"],
            "dim_0": mkey.shape[-1],
            "dim_1": mkey.shape[-2],
        }

        qk_out_config = {
            "width": self.matmul_config["OUT_MAN_WIDTH"],
            "exponent_width": self.matmul_config["OUT_EXP_WIDTH"],
            "parallism_dim_0": self.matmul_config["C_COMPUTE_DIM0"],
            "parallism_dim_1": self.matmul_config["C_COMPUTE_DIM1"],
            "depth_dim_0": mquery.shape[-2] // self.matmul_config["C_COMPUTE_DIM0"],
            "depth_dim_1": mkey.shape[-2] // self.matmul_config["C_COMPUTE_DIM1"],
            "dim_0": mquery.shape[-2],
            "dim_1": mkey.shape[-2],
        }

        print("\n=== Self Attention Head Debug Info ===")
        print("Query shape:", mquery.shape)
        print("Key shape:", mkey.shape)
        print("Value shape:", mvalue.shape)

        # First matmul: Q*K^T
        # NOTE(review): only mkey is transposed; ekey is passed untransposed —
        # presumably the exponent grid layout already matches; confirm.
        matt_scores, eatt_scores = self.matmul(
            mquery, equery,
            mkey.transpose(-1, -2), ekey,
            qk_x_config, qk_y_config, qk_out_config
        )

        print("\n--- Attention Scores ---")
        print("Shape:", matt_scores.shape)
        print("Sample values:", matt_scores[0, 0, :5])

        # Apply softmax
        # NOTE(review): the score exponent eatt_scores is not passed to the
        # softmax — verify mxint_softmax expects mantissas only.
        mprobs, eprobs = self.act(matt_scores)

        print("\n--- Attention Probabilities ---")
        print("Shape:", mprobs.shape)
        print("Sample values:", mprobs[0, 0, :5])

        # Configure matmul for attn*V
        av_x_config = {
            "width": self.matmul_config["A_MAN_WIDTH"],
            "exponent_width": self.matmul_config["A_EXP_WIDTH"],
            "parallism_dim_0": self.matmul_config["A_COMPUTE_DIM0"],
            "parallism_dim_1": self.matmul_config["A_COMPUTE_DIM1"],
            "depth_dim_0": mprobs.shape[-1] // self.matmul_config["A_COMPUTE_DIM0"],
            "depth_dim_1": mprobs.shape[-2] // self.matmul_config["A_COMPUTE_DIM1"],
            "dim_0": mprobs.shape[-1],
            "dim_1": mprobs.shape[-2],
        }

        av_y_config = {
            "width": self.matmul_config["B_MAN_WIDTH"],
            "exponent_width": self.matmul_config["B_EXP_WIDTH"],
            "parallism_dim_0": self.matmul_config["B_COMPUTE_DIM0"],
            "parallism_dim_1": self.matmul_config["B_COMPUTE_DIM1"],
            "depth_dim_0": mvalue.shape[-1] // self.matmul_config["B_COMPUTE_DIM0"],
            "depth_dim_1": mvalue.shape[-2] // self.matmul_config["B_COMPUTE_DIM1"],
            "dim_0": mvalue.shape[-1],
            "dim_1": mvalue.shape[-2],
        }

        av_out_config = {
            "width": self.matmul_config["OUT_MAN_WIDTH"],
            "exponent_width": self.matmul_config["OUT_EXP_WIDTH"],
            "parallism_dim_0": self.matmul_config["C_COMPUTE_DIM0"],
            "parallism_dim_1": self.matmul_config["C_COMPUTE_DIM1"],
            "depth_dim_0": mvalue.shape[-1] // self.matmul_config["C_COMPUTE_DIM0"],
            "depth_dim_1": mprobs.shape[-2] // self.matmul_config["C_COMPUTE_DIM1"],
            "dim_0": mvalue.shape[-1],
            "dim_1": mprobs.shape[-2],
        }

        # Second matmul: attn*V
        mcontext, econtext = self.matmul(
            mprobs, eprobs,
            mvalue, evalue,
            av_x_config, av_y_config, av_out_config
        )

        print("\n--- Context Layer ---")
        print("Shape:", mcontext.shape)
        print("Sample values:", mcontext[0, 0, :5])
        print("===============================\n")

        # Reconstruct output: rescale mantissas by the per-block output
        # exponent, shifted down by (OUT_MAN_WIDTH - 1) fractional bits.
        return mcontext * (2 ** (econtext.unsqueeze(-1) - self.matmul_config["OUT_MAN_WIDTH"] + 1))

    def forward(
        self,
        mquery: torch.Tensor, equery: torch.Tensor,
        mkey: torch.Tensor, ekey: torch.Tensor,
        mvalue: torch.Tensor, evalue: torch.Tensor,
    ) -> Tensor:
        """Delegate to :meth:`self_attention_head`."""
        return self.self_attention_head(
            mquery, equery, mkey, ekey, mvalue, evalue
        )
def test_mxint_vit_self_attention():
    """Smoke-test MxIntViTSelfAttentionHead on random quantized Q/K/V.

    Quantizes random query/key/value tensors to MXInt, runs a forward pass
    through the attention head model, and checks the output shape.

    NOTE(review): this imports ``mxint_quantize`` from utils, but the same
    patch renames that function to ``mxint_quant_block`` in utils.py —
    verify the import still resolves.
    """
    print("\n=== Starting MxIntViTSelfAttention Test ===")

    # Test parameters
    batch_size = 1
    num_heads = 8  # NOTE(review): unused — the head model takes no num_heads
    seq_length = 16
    head_dim = 64
    attn_drop = 0.1

    print("\nTest Configuration:")
    print(f"Batch size: {batch_size}")
    print(f"Sequence length: {seq_length}")
    print(f"Head dimension: {head_dim}")

    # Create sample input tensors
    query = torch.randn(batch_size, seq_length, head_dim)
    key = torch.randn(batch_size, seq_length, head_dim)
    value = torch.randn(batch_size, seq_length, head_dim)

    print("\nInput Tensor Shapes:")
    print(f"Query: {query.shape}")
    print(f"Key: {key.shape}")
    print(f"Value: {value.shape}")

    # Comprehensive configuration for all components
    q_config = {
        "linear": {
            "data_in_width": 8,
            "data_in_exponent_width": 4,
            "data_in_parallelism": [2, 2],
            "weight_width": 8,
            "weight_exponent_width": 4,
            "weight_parallelism": [2, 2],
            "bias_width": 8,
            "bias_exponent_width": 4,
            "bias_parallelism": [2, 1],
            "data_out_width": 8,
            "data_out_exponent_width": 4,
            "data_out_parallelism": [2, 2],
        },
        "matmul": {
            "A_MAN_WIDTH": 8,
            "A_EXP_WIDTH": 4,
            "B_MAN_WIDTH": 8,
            "B_EXP_WIDTH": 4,
            "OUT_MAN_WIDTH": 8,
            "OUT_EXP_WIDTH": 4,
            "A_COMPUTE_DIM0": 2,
            "A_COMPUTE_DIM1": 2,
            "B_COMPUTE_DIM0": 2,
            "B_COMPUTE_DIM1": 2,
            "C_COMPUTE_DIM0": 2,
            "C_COMPUTE_DIM1": 2,
        },
        "softmax": {
            "in_man_width": 8,
            "in_exp_width": 4,
            "data_out_n_width": 4,
            "data_out_man_width": 8,
            "data_out_exp_width": 4,
        }
    }

    # Initialize the attention head
    attention = MxIntViTSelfAttentionHead(
        dim=head_dim,
        attn_drop=attn_drop,  # Remove num_heads as it's not used
        q_config=q_config
    )

    # Quantize inputs before passing to attention
    _, mquery, equery = mxint_quantize(
        query,
        q_config["matmul"]["A_MAN_WIDTH"],
        q_config["matmul"]["A_EXP_WIDTH"]
    )
    _, mkey, ekey = mxint_quantize(
        key,
        q_config["matmul"]["B_MAN_WIDTH"],
        q_config["matmul"]["B_EXP_WIDTH"]
    )
    _, mvalue, evalue = mxint_quantize(
        value,
        q_config["matmul"]["B_MAN_WIDTH"],
        q_config["matmul"]["B_EXP_WIDTH"]
    )

    # Run forward pass with quantized inputs
    try:
        print("\nRunning forward pass...")
        output = attention(
            mquery, equery,
            mkey, ekey,
            mvalue, evalue
        )
        print("\nResults:")
        print(f"Output shape: {output.shape}")
        print(f"Expected shape: ({batch_size}, {seq_length}, {head_dim})")
        print(f"Sample output values (first 5):")
        for i, val in enumerate(output[0,0,:5].tolist()):
            print(f"  [{i}]: {val:10.6f}")

        assert output.shape == (batch_size, seq_length, head_dim)
        print("\n✓ Test passed successfully!")
    except Exception as e:
        # Re-raised below, so failures still surface to the test runner.
        print(f"\n✗ Test failed with error:")
        print(f"  {str(e)}")
        raise
    finally:
        print("\n=== Test Completed ===\n")


if __name__ == "__main__":
    test_mxint_vit_self_attention()
def mxint_quant_block(
    x, width: int = 12, exponent_width: int = 6, exponent: int = None, round_bits: int = 0,
):
    """
    - Idea from https://arxiv.org/pdf/2310.10537
    - Convert IEEE FP32/64 to Integer with sharing scale
    - The main difference between is the sharing scale do not support NAN representation
    ---
    - `width`: The number of mantissa bits + 1 (the sign bit)
    - `exponent_width`: the number of exponent bits, which is shared over a block
    - `exponent`: optional externally supplied shared exponent (tensor shaped
      like the per-block max). Bug fix: this parameter was previously accepted
      but silently ignored; it is now honored, matching the old mxint_quantize
      behavior.
    - `round_bits`: extra fractional bits kept through the floor before the
      final round (0 means plain floor).
    """
    exponent_bias = 2 ** (exponent_width - 1)
    exponent_max = 2**exponent_width - 1 - exponent_bias
    exponent_min = -exponent_bias

    # Per-block max over the last dim; tiny avoids log2(0).
    abs_max = x.abs().max(dim=-1, keepdim=True).values
    log2 = torch.log2(abs_max + torch.finfo(torch.float32).tiny)

    if exponent is None:
        exponent = torch.ceil(log2)
        # Exact powers of two need one extra bit of headroom so the max
        # mantissa stays representable.
        exponent[exponent == log2] += 1
        exponent = torch.clamp(exponent, exponent_min, exponent_max)

    # Vectorized mantissa calculation.
    int_min = -(2 ** (width - 1))
    int_max = 2 ** (width - 1) - 1
    mantissa = x * (2 ** (width - 1)) / 2**exponent
    mantissa = mantissa * 2**round_bits
    mantissa = torch.floor(mantissa)
    mantissa = mantissa / 2**round_bits
    mantissa = torch.round(mantissa)
    mantissa = torch.clamp(mantissa, int_min, int_max)
    q_x = (2**exponent) * mantissa / (2 ** (width - 1))
    return q_x, mantissa, exponent


def reshape_to_block(tensor, t1, t0, p1, p0):
    """Tile the trailing (t1, t0) dims into (p1, p0) blocks.

    Returns shape (batch, t1//p1, t0//p0, p1, p0) so each block's elements
    are contiguous in the last two dims.
    """
    return tensor.reshape(-1, t1 // p1, p1, t0 // p0, p0)\
        .permute(0, 1, 3, 2, 4)


def reshape_back(tensor, t1, t0, p1, p0):
    """Inverse of `reshape_to_block` for data stored as flattened blocks.

    Interprets the flat data as (batch, t1//p1, t0//p0, p1, p0) and permutes
    so a final `.reshape(..., t1, t0)` restores the original element order.
    """
    return tensor.reshape(-1, t1 // p1, t0 // p0, p1, p0)\
        .permute(0, 1, 3, 2, 4)


def mxint_hardware(tensor, q_config, parallelism):
    """
    Vectorized hardware-aware quantization implementation.

    Splits `tensor` into (p1, p0) blocks, quantizes each block with a shared
    exponent via `mxint_quant_block`, and returns:
      - qtensor: the dequantized tensor, same shape as the (possibly
        unsqueezed) input,
      - mantissa: per-element integer mantissas, same shape,
      - exponent: per-block exponents, shape (..., t1//p1, t0//p0).
    """
    if len(tensor.shape) == 1:
        tensor = tensor.unsqueeze(0)
    if isinstance(parallelism, int):
        parallelism = [1, parallelism]
    elif len(parallelism) == 1:
        parallelism = [1, parallelism[0]]

    p1, p0 = parallelism
    t1, t0 = tensor.shape[-2:]

    original_mshape = tensor.shape
    original_eshape = torch.Size([t1//p1, t0//p0]) if len(tensor.shape) <= 2 else torch.Size([*tensor.shape[:-2], t1//p1, t0//p0])
    assert (t1 % p1 == 0 and t0 % p0 == 0), \
        f"Block size mismatch: t1={t1}, p1={p1}, t0={t0}, p0={p0}"

    # Single reshape and permute operation; rows of block_tensor are blocks.
    block_tensor = reshape_to_block(tensor, t1, t0, p1, p0).reshape(-1, p1*p0)
    qtensor, mantissa, exponent = mxint_quant_block(block_tensor, **q_config)

    # Bug fix: qtensor was previously restored with reshape_to_block, which
    # is NOT the inverse of the blocking above and scrambled elements
    # whenever p1 != t0//p0. Use reshape_back, exactly as for mantissa.
    qtensor = reshape_back(qtensor, t1, t0, p1, p0)
    mantissa = reshape_back(mantissa, t1, t0, p1, p0)
    qtensor = qtensor.reshape(original_mshape)
    mantissa = mantissa.reshape(original_mshape)
    exponent = exponent.reshape(original_eshape)
    return qtensor, mantissa, exponent
def fast_linear(x, w, b, config):
    """Blockwise MXInt linear layer: y = x @ w.T + b on quantized blocks.

    Quantizes x, w and b with `mxint_hardware`, then accumulates the matmul
    block-by-block along the in_features axis, aligning partial sums to a
    running shared exponent via floor shifts (mirroring the hardware
    accumulator), and finally adds the bias and dequantizes.

    Args:
        x: activations, shape (batch_size, n, in_features) — assumed 3D;
           TODO confirm against callers.
        w: weights, shape (out_features, in_features).
        b: bias, shape (out_features,).
        config: dict with "x_config", "w_config", "bias_config",
            "out_config" (width / exponent_width / parallism_* / dim_* /
            depth_* entries — note the intentional "parallism" spelling)
            and a top-level "round_bits" used only for x.

    Returns:
        Dequantized output of shape (batch_size, n, out_features).
    """
    batch_size, n = x.shape[:2]
    out_features = w.shape[0]
    qx, mx, ex = mxint_hardware(x, **{
        "parallelism": [config["x_config"]["parallism_dim_1"], config["x_config"]["parallism_dim_0"]],
        "q_config": {
            "width": config["x_config"]["width"],
            "exponent_width": config["x_config"]["exponent_width"],
            "round_bits": config["round_bits"],

        },
    })
    # NOTE(review): weights and bias are quantized with a hard-coded
    # round_bits=8 rather than config["round_bits"] — confirm intentional.
    qw, mw, ew = mxint_hardware(w, **{
        "parallelism": [config["w_config"]["parallism_dim_1"], config["w_config"]["parallism_dim_0"]],
        "q_config": {
            "width": config["w_config"]["width"],
            "exponent_width": config["w_config"]["exponent_width"],
            "round_bits": 8,
        }
    })
    qb, mb, eb = mxint_hardware(b, **{
        "parallelism": [config["bias_config"]["parallism_dim_1"], config["bias_config"]["parallism_dim_0"]],
        "q_config": {
            "width": config["bias_config"]["width"],
            "exponent_width": config["bias_config"]["exponent_width"],
            "round_bits": 8,
        }
    })
    x_config = config["x_config"]
    w_config = config["w_config"]
    reshaped_mx = reshape_to_block(mx, x_config["dim_1"], x_config["dim_0"], x_config["parallism_dim_1"], x_config["parallism_dim_0"])
    reshaped_mw = reshape_to_block(mw, w_config["dim_1"], w_config["dim_0"], w_config["parallism_dim_1"], w_config["parallism_dim_0"])

    # move the infeatures depth to the front
    mx_for_accumulation = reshaped_mx.permute(2, 0, 1, 3, 4)
    # The dimension will be [depth_in_features, batch_size, depth_n, parallism_n, parallism_in_features]
    # For every parallelised block, we will have a unique exponent
    # Original shape of ex is [batch_size, depth_n, depth_in_features]
    # We will permute it to [depth_in_features, batch_size, depth_n]
    ex_for_accumulation = ex.permute(2, 0, 1)

    # Same for mw, the shape of mw is [depth_out_features, depth_in_features, parallism_out_features, parallism_in_features]
    mw_for_accumulation = reshaped_mw.squeeze(0)
    mw_for_accumulation = mw_for_accumulation.permute(1, 0, 2, 3)
    ew_for_accumulation = ew.transpose(0, 1)

    # We are trying to do the matmul based on the block partition
    # mx is [depth_in_features, batch_size, depth_n, parallism_n, parallism_in_features]
    # mw is [depth_in_features, depth_out_features, parallism_out_features, parallism_in_features]
    # merge depth_out_features and parallelism_out_features
    # mw = [depth_in_features, out_features, parallism_in_features]
    mw_for_accumulation = mw_for_accumulation.reshape(mw_for_accumulation.shape[0], -1, mw_for_accumulation.shape[-1])

    # First in_features block seeds the integer accumulator.
    mout = mx_for_accumulation[0] @ mw_for_accumulation[0].transpose(-2, -1)
    mout = reshape_to_block(mout, x_config["dim_1"], w_config["dim_1"], x_config["parallism_dim_1"], w_config["parallism_dim_1"])
    # shape of mout is [batch_size, depth_n, parallism_n, out_features]
    ex_expanded = ex_for_accumulation.unsqueeze(-1)  # [depth_in_features, batch_size, depth_n, 1]
    ew_expanded = ew_for_accumulation.unsqueeze(1).unsqueeze(2)  # [depth_in_features, 1, 1, depth_out_features]
    eout = (ex_expanded[0] + ew_expanded[0]).unsqueeze(-1).unsqueeze(-1)
    # Accumulate remaining in_features blocks, right-shifting (floor-div)
    # whichever side has the smaller exponent so both share max_exponent —
    # this models the hardware's truncating alignment shifter.
    for i in range(1, mx_for_accumulation.shape[0]):
        new_exponent = (ex_expanded[i] + ew_expanded[i]).unsqueeze(-1).unsqueeze(-1)
        max_exponent = torch.max(eout, new_exponent)
        mout = mout // 2 ** (max_exponent - eout)
        current_result = mx_for_accumulation[i] @ mw_for_accumulation[i].transpose(-2, -1)
        current_result = reshape_to_block(current_result, x_config["dim_1"], w_config["dim_1"], x_config["parallism_dim_1"], w_config["parallism_dim_1"])
        current_result = current_result // 2 ** (max_exponent - new_exponent)
        mout += current_result
        eout = max_exponent

    # the shape of qout will be [batch_size, depth_in_n, depth_out_features, paral_n, paral_out_features]
    # the shape of mb will be [1, 1, out_features]
    # reshape mb to [1, 1, depth_out_features, 1, paral_out_features]
    # broad cast to [batch_size, depth_in_n, depth_out_features, paral_n, paral_out_features]

    # the shape of eout willbe [batch_size, depth_n, depth_out_features]
    # the shape of eb will be [1, 1, depth_out_featuers]

    # so i wish eb can map back to
    out_config = config["out_config"]
    b_config = config["bias_config"]
    # Fractional-bit difference between the product (width_x-1 + width_w-1
    # fraction bits) and the bias (width_b-1), used to align the bias
    # mantissa into the accumulator's fixed-point format.
    width_difference = x_config["width"] + w_config["width"] - 2 - (b_config["width"] - 1)
    reshaped_mb = mb.reshape(1, 1, out_config["depth_dim_0"], 1, out_config["parallism_dim_0"])
    reshaped_eb = eb.reshape(1, 1, out_config["depth_dim_0"], 1, 1)
    mb_for_out = reshaped_mb // 2**(eout - reshaped_eb - width_difference)
    mout = mout + mb_for_out

    # Dequantize: divide out the accumulated fraction bits, then map the
    # blocked layout back to (batch, n, out_features).
    qout = reshape_back((mout / 2 ** (x_config["width"] + w_config["width"] - 2 - eout)), x_config["dim_1"], w_config["dim_1"], x_config["parallism_dim_1"], w_config["parallism_dim_1"])
    qout = qout.reshape(batch_size, n, out_features)

    return qout
+ self.in_features = in_features + self.out_features = out_features + self.q_config = q_config + self.bypass = q_config.get("bypass", False) if self.bypass: return # establish quantizer - w_width, w_exponent_width = ( - config["weight_width"], - config["weight_exponent_width"], - ) - w_p1, w_p0 = ( - config["weight_parallelism_dim_1"], - config["weight_parallelism_dim_0"], - ) - x_width, x_exponent_width = ( - config["data_in_width"], - config["data_in_exponent_width"], - ) - x_p1, x_p0 = ( - config["data_in_parallelism_dim_1"], - config["data_in_parallelism_dim_0"], - ) - # check bias quantizer, if not, use weight quantizer - b_width, b_exponent_width = config["bias_width"], config["bias_exponent_width"] - b_p1, b_p0 = config["bias_parallelism_dim_1"], config["bias_parallelism_dim_0"] - base_quantizer = block_mxint_quant - if out_config is not None: - out_width, out_exponent_width = ( - config["data_out_width"], - config["data_out_exponent_width"], - ) - out_p1, out_p0 = ( - config["data_out_parallelism_dim_1"], - config["data_out_parallelism_dim_0"], - ) - self.out_quantizer = partial( - base_quantizer, - q_config={"width": out_width, "exponent_width": out_exponent_width}, - parallelism=[out_p1, out_p0], - ) - self.w_quantizer = partial( - base_quantizer, - q_config={"width": w_width, "exponent_width": w_exponent_width}, - parallelism=[w_p1, w_p0], - ) - self.x_quantizer = partial( - base_quantizer, - q_config={"width": x_width, "exponent_width": x_exponent_width}, - parallelism=[x_p1, x_p0], - ) - self.b_quantizer = partial( - base_quantizer, - q_config={"width": b_width, "exponent_width": b_exponent_width}, - parallelism=[b_p1, b_p0], - ) def forward(self, x: Tensor) -> Tensor: - if self.bypass: - return F.linear(x, self.weight, self.bias) - else: - x, mx, ex = self.x_quantizer(x) - w, mw, ew = self.w_quantizer(self.weight) - print((mx @ mw.transpose(0, 1)).int()) - if self.bias is not None: - bias, mb, eb = self.b_quantizer(self.bias) - else: - bias = None - 
breakpoint() - out = F.linear(x, w, bias) - # print(f"mout = {F.linear(mx, mw, mb*2**(ex+ew - eb).floor())}") - if self.out_quantizer is None: - return out - return self.out_quantizer(out) + # an example of config + unroll_in_features = self.q_config["data_in_parallelism"][1] + unroll_out_features = self.q_config["data_out_parallelism"][1] + unroll_n = self.q_config["data_in_parallelism"][0] + in_features = self.in_features + out_features = self.out_features + n = x.shape[1] + batch_size = x.shape[0] + assert x.shape[2] == in_features, f"Input shape mismatch: {x.shape[2]} != {in_features}" + + self.config = { + "x_config": { + "width": self.q_config["data_in_width"], + "exponent_width": self.q_config["data_in_exponent_width"], + "parallism_dim_0": unroll_in_features, + "parallism_dim_1": unroll_n, + "depth_dim_0": in_features // unroll_in_features, + "depth_dim_1": n // unroll_n, + "dim_0": in_features, + "dim_1": n, + }, + "w_config": { + "width": self.q_config["weight_width"], + "exponent_width": self.q_config["weight_exponent_width"], + "parallism_dim_0": unroll_in_features, + "parallism_dim_1": unroll_out_features, + "depth_dim_0": in_features // unroll_in_features, + "depth_dim_1": out_features // unroll_out_features, + "dim_0": in_features, + "dim_1": out_features, + }, + "bias_config": { + "width": self.q_config["bias_width"], + "exponent_width": self.q_config["bias_exponent_width"], + "parallism_dim_0": unroll_out_features, + "parallism_dim_1": 1, + "depth_dim_0": out_features // unroll_out_features, + "depth_dim_1": 1, + "dim_0": out_features, + "dim_1": 1, + }, + "out_config": { + "width": self.q_config["data_out_width"], + "exponent_width": self.q_config["data_out_exponent_width"], + "parallism_dim_0": unroll_out_features, + "parallism_dim_1": unroll_n, + "depth_dim_0": out_features // unroll_out_features, + "depth_dim_1": n // unroll_n, + "dim_0": out_features, + "dim_1": n, + }, + "round_bits": self.q_config["round_bits"], + } + out = fast_linear(x, 
self.weight, self.bias, self.config) + return out diff --git a/src/mase_components/memory/rtl/blk_mem_gen_0.sv b/src/mase_components/memory/rtl/blk_mem_gen_0.sv index 55ca60c9c..20dad70f7 100644 --- a/src/mase_components/memory/rtl/blk_mem_gen_0.sv +++ b/src/mase_components/memory/rtl/blk_mem_gen_0.sv @@ -3,21 +3,13 @@ module blk_mem_gen_0 #( parameter DATA_WIDTH = 8, parameter MEM_SIZE = 1 ) ( - clka, - ena, - wea, - addra, - dina, - douta -) -/* synthesis syn_black_box black_box_pad_pin="ena,wea[0:0],addra[9:0],dina[7:0],douta[7:0]" */ -/* synthesis syn_force_seq_prim="clka" */; - input logic clka /* synthesis syn_isclock = 1 */; - input logic ena; - input logic [0:0] wea; - input logic [$clog2(MEM_SIZE):0] addra; - input logic [DATA_WIDTH - 1:0] dina; - output logic [DATA_WIDTH - 1:0] douta; + input logic clka, + input logic ena, + input logic wea, + input logic [$clog2(MEM_SIZE):0] addra, + input logic [DATA_WIDTH - 1:0] dina, + output logic [DATA_WIDTH - 1:0] douta +); logic [DATA_WIDTH - 1:0] ram[0:MEM_SIZE-1]; logic [DATA_WIDTH - 1:0] douta_t1; diff --git a/src/mase_components/memory/rtl/fifo_for_autogen.sv b/src/mase_components/memory/rtl/fifo_for_autogen.sv new file mode 100644 index 000000000..d0e11f93d --- /dev/null +++ b/src/mase_components/memory/rtl/fifo_for_autogen.sv @@ -0,0 +1,42 @@ +`timescale 1 ns / 1 ps +/* verilator lint_off PINMISSING */ +module fifo_for_autogen #( + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 20, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 4, // must equal WEIGHT_PARALLELISM_DIM_1 + parameter DATA_IN_0_PARALLELISM_DIM_1 = 4, + parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0, + parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter 
DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter DEPTH = DATA_IN_0_TENSOR_SIZE_DIM_0 * DATA_IN_0_TENSOR_SIZE_DIM_1 / (DATA_IN_0_PARALLELISM_DIM_0 * DATA_IN_0_PARALLELISM_DIM_1) +) ( + input clk, + input rst, + input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0], + output logic data_out_0_valid, + input logic data_out_0_ready +); + unpacked_fifo #( + .DEPTH(DEPTH), + .DATA_WIDTH(DATA_IN_0_PRECISION_0), + .IN_NUM(DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1) + ) ff_inst ( + .clk(clk), + .rst(rst), + .data_in(data_in_0), + .data_in_valid(data_in_0_valid), + .data_in_ready(data_in_0_ready), + .data_out(data_out_0), + .data_out_valid(data_out_0_valid), + .data_out_ready(data_out_0_ready) + ); +endmodule diff --git a/src/mase_components/memory/rtl/input_buffer.sv b/src/mase_components/memory/rtl/input_buffer.sv index 66c1c4c3c..fc6311810 100644 --- a/src/mase_components/memory/rtl/input_buffer.sv +++ b/src/mase_components/memory/rtl/input_buffer.sv @@ -101,9 +101,9 @@ module input_buffer #( end assign reg_in_valid = (mode == STRAIGHT) ? data_in_valid : (!(delay2_bos || delay1_bos)); assign data_in_ready = (mode == STRAIGHT) ? 
reg_in_ready : 0; - unpacked_skid_buffer #( + unpacked_register_slice #( .DATA_WIDTH(DATA_WIDTH), - .IN_NUM(IN_NUM) + .IN_SIZE(IN_NUM) ) reg_inst ( .data_in(reg_in), .data_in_valid(reg_in_valid), diff --git a/src/mase_components/memory/rtl/matrix_bank.sv b/src/mase_components/memory/rtl/matrix_bank.sv index 81d2abc22..47ebce0b7 100644 --- a/src/mase_components/memory/rtl/matrix_bank.sv +++ b/src/mase_components/memory/rtl/matrix_bank.sv @@ -5,8 +5,65 @@ Features to implement: clear all counters and flags and transition to REQ_FETCH */ - -import matrix_bank_pkg::*; +package matrix_bank_pkg; + + parameter AXI_ADDRESS_WIDTH = 32; + parameter MAX_DIMENSION = 1024; + parameter MAX_FEATURE_COUNT = 16; + + + typedef struct packed { + logic [AXI_ADDRESS_WIDTH-1:0] start_address; + logic [$clog2(MAX_DIMENSION):0] columns; + logic [$clog2(MAX_DIMENSION):0] rows; + } REQ_t; + + typedef struct packed {logic partial;} RESP_t; + + typedef struct packed { + // Check request payloads match NSB payloads + logic [$clog2(MAX_FEATURE_COUNT):0] columns; + logic [$clog2(MAX_FEATURE_COUNT):0] rows; + } ROW_CHANNEL_REQ_t; + + typedef struct packed { + logic [MAX_FEATURE_COUNT-1:0][31:0] data; + logic [MAX_FEATURE_COUNT-1:0] valid_mask; + logic done; + } ROW_CHANNEL_RESP_t; + +endpackage + +// package matrix_bank_pkg; + +parameter AXI_ADDRESS_WIDTH = 32; +parameter MAX_DIMENSION = 1024; +parameter MAX_FEATURE_COUNT = 32; + +typedef struct packed { + logic [AXI_ADDRESS_WIDTH-1:0] start_address; + logic [$clog2(MAX_DIMENSION):0] columns; + logic [$clog2(MAX_DIMENSION):0] rows; +} REQ_t; + +typedef struct packed {logic partial;} RESP_t; + +typedef struct packed { + // Check request payloads match NSB payloads + logic [$clog2( +MAX_FEATURE_COUNT +):0] columns; + logic [$clog2(MAX_FEATURE_COUNT):0] rows; +} ROW_CHANNEL_REQ_t; + +typedef struct packed { + logic [MAX_FEATURE_COUNT-1:0][31:0] data; + logic [MAX_FEATURE_COUNT-1:0] valid_mask; + logic done; +} ROW_CHANNEL_RESP_t; + +// endpackage 
+// import matrix_bank_pkg::*; module matrix_bank #( parameter PRECISION = 0, // 0 = FP32, 1 = FP16 @@ -221,8 +278,9 @@ module matrix_bank #( axi_rm_fetch_byte_count = matrix_bank_req_q.columns * 4; bytes_per_row = matrix_bank_req_q.columns * 4; - bytes_per_row_padded = {bytes_per_row[$clog2(MAX_DIMENSION*4)-1:6], 6'b0} + - (|bytes_per_row[5:0] ? 'd64 : '0); // round up to nearest multiple of 64 + bytes_per_row_padded = { + bytes_per_row[$clog2(MAX_DIMENSION*4)-1:6], 6'b0 + } + (|bytes_per_row[5:0] ? 'd64 : '0); // round up to nearest multiple of 64 axi_rm_fetch_start_address = matrix_bank_req_q.start_address + rows_fetched * bytes_per_row_padded; end @@ -316,9 +374,9 @@ module matrix_bank #( end // Round up in features to the nearest multiple of 16 - assign required_pulses = {matrix_bank_req_q.columns[$clog2( - MAX_DIMENSION - )-1:4], 4'd0} + (|matrix_bank_req_q.columns[3:0] ? 'd16 : '0); + assign required_pulses = { + matrix_bank_req_q.columns[$clog2(MAX_DIMENSION)-1:4], 4'd0 + } + (|matrix_bank_req_q.columns[3:0] ? 
'd16 : '0); always_ff @(posedge core_clk or negedge resetn) begin if (!resetn) begin diff --git a/src/mase_components/memory/rtl/matrix_bank_pkg.sv b/src/mase_components/memory/rtl/matrix_bank_pkg.sv index 791d48871..e56bac906 100644 --- a/src/mase_components/memory/rtl/matrix_bank_pkg.sv +++ b/src/mase_components/memory/rtl/matrix_bank_pkg.sv @@ -2,6 +2,7 @@ package matrix_bank_pkg; parameter AXI_ADDRESS_WIDTH = 32; parameter MAX_DIMENSION = 1024; + parameter MAX_FEATURE_COUNT = 32; typedef struct packed { logic [AXI_ADDRESS_WIDTH-1:0] start_address; diff --git a/src/mase_components/normalization_layers/rtl/layer_norm_1d.sv b/src/mase_components/normalization_layers/rtl/layer_norm_1d.sv new file mode 100644 index 000000000..4e8d40fac --- /dev/null +++ b/src/mase_components/normalization_layers/rtl/layer_norm_1d.sv @@ -0,0 +1,473 @@ +/* +layernorm 1d +*/ +module layer_norm_1d #( + // Dimensions + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 4, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 2, + // Data widths + parameter DATA_IN_0_PRECISION_0 = 8, + parameter DATA_IN_0_PRECISION_1 = 4, + parameter ISQRT_IN_PRECISION_0 = 8, + parameter ISQRT_IN_PRECISION_1 = 8, + parameter ISQRT_OUT_PRECISION_0 = 8, + parameter ISQRT_OUT_PRECISION_1 = 4, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, + parameter DATA_OUT_0_PRECISION_0 = 8, + parameter DATA_OUT_0_PRECISION_1 = 4 +) ( + input logic clk, + input logic rst, + + input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_IN_0_PARALLELISM_DIM_0-1:0], + output logic data_out_0_valid, + input logic data_out_0_ready +); + + // Derived params + localparam DEPTH_DIM0 = DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0; + + localparam NUM_VALUES = DATA_IN_0_TENSOR_SIZE_DIM_0; + + 
localparam NUM_ITERS = DEPTH_DIM0; + localparam ITER_WIDTH = $clog2(NUM_ITERS); + + // Compute Pipeline Widths + + localparam ADDER_TREE_IN_SIZE = DATA_IN_0_PARALLELISM_DIM_0; + localparam ADDER_TREE_OUT_WIDTH = $clog2(ADDER_TREE_IN_SIZE) + DATA_IN_0_PRECISION_0; + + localparam ACC_OUT_WIDTH = ITER_WIDTH + ADDER_TREE_OUT_WIDTH; + + localparam DIFF_WIDTH = DATA_IN_0_PRECISION_0 + 1; + localparam DIFF_FRAC_WIDTH = DATA_IN_0_PRECISION_1; + + localparam SQUARE_WIDTH = DIFF_WIDTH * 2; + localparam SQUARE_FRAC_WIDTH = DIFF_FRAC_WIDTH * 2; + + localparam SQUARES_ADDER_TREE_IN_SIZE = DATA_IN_0_PARALLELISM_DIM_0; + localparam SQUARES_ADDER_TREE_OUT_WIDTH = $clog2(SQUARES_ADDER_TREE_IN_SIZE) + SQUARE_WIDTH; + localparam SQUARES_ADDER_TREE_OUT_FRAC_WIDTH = SQUARE_FRAC_WIDTH; + + localparam VARIANCE_WIDTH = ITER_WIDTH + SQUARES_ADDER_TREE_OUT_WIDTH; + localparam VARIANCE_FRAC_WIDTH = SQUARES_ADDER_TREE_OUT_FRAC_WIDTH; + + + localparam NORM_WIDTH = ISQRT_OUT_PRECISION_0 + DIFF_WIDTH; + localparam NORM_FRAC_WIDTH = ISQRT_OUT_PRECISION_1 + DIFF_FRAC_WIDTH; + + /* verilator lint_off UNUSEDSIGNAL */ + // Input FIFO + logic [DATA_IN_0_PRECISION_0-1:0] fifo_data[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic fifo_out_valid, fifo_out_ready; + logic fifo_in_valid, fifo_in_ready; + + // Input Adder Tree + logic [ADDER_TREE_OUT_WIDTH-1:0] adder_tree_data; + logic adder_tree_out_valid, adder_tree_out_ready; + logic adder_tree_in_valid, adder_tree_in_ready; + + + logic [ACC_OUT_WIDTH-1:0] mu_acc; + logic mu_acc_valid, mu_acc_ready; + + logic [DATA_IN_0_PRECISION_0-1:0] mu_in, mu_out; + logic mu_out_valid, mu_out_ready; + + logic [ACC_OUT_WIDTH + ACC_OUT_WIDTH:0] mu_acc_div; + + logic mu_fifo_valid, mu_fifo_ready; + + logic signed [DIFF_WIDTH-1:0] diff_in[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic signed [DIFF_WIDTH-1:0] diff_out[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic diff_in_ready[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic diff_out_valid[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + + logic 
[SQUARE_WIDTH-1:0] square_in[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic square_in_ready[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic square_out_valid[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic [SQUARE_WIDTH-1:0] square_out[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + + // Split2 for split in pipeline from diff + logic fifo_diff_in_valid, fifo_diff_in_ready; + logic fifo_diff_out_valid; + + // Squares adder tree + logic [SQUARES_ADDER_TREE_OUT_WIDTH-1:0] squares_adder_tree_data; + logic squares_adder_tree_out_valid, squares_adder_tree_out_ready; + logic squares_adder_tree_in_valid, squares_adder_tree_in_ready; + + // Squares Accumulator + logic [VARIANCE_WIDTH-1:0] squares_acc; + logic squares_acc_valid, squares_acc_ready; + + // Take the accumulated squares and divide it to get variance + logic [SQUARES_ADDER_TREE_OUT_WIDTH+VARIANCE_WIDTH:0] variance_buffer; + logic [VARIANCE_WIDTH-1:0] variance_in, variance_out; + logic variance_out_valid, variance_out_ready; + + logic [ ISQRT_IN_PRECISION_0-1:0] variance_cast; + logic [ ISQRT_IN_PRECISION_0-1:0] inv_sqrt_in; + logic [ISQRT_OUT_PRECISION_0-1:0] inv_sqrt_out; + // Take inverse square root of variance + logic [ISQRT_OUT_PRECISION_0-1:0] inv_sqrt_data; + logic inv_sqrt_valid, inv_sqrt_ready; + + // Repeat circular buffer to hold inverse square root of variance during mult + logic [ISQRT_OUT_PRECISION_0-1:0] isqrt_circ_data; + logic isqrt_circ_valid, isqrt_circ_ready; + logic norm_in_valid; + + // FIFO for storing X-mu differences + logic [DIFF_WIDTH-1:0] diff_batch_in[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic diff_batch_in_valid, diff_batch_in_ready; + logic [DIFF_WIDTH-1:0] diff_batch_out[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic diff_batch_out_valid, diff_batch_out_ready; + + logic [NORM_WIDTH-1:0] norm_in_data[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic [NORM_WIDTH-1:0] norm_out_data[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic [DATA_OUT_0_PRECISION_0-1:0] norm_round_out[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + + logic 
[DATA_OUT_0_PRECISION_0-1:0] norm_batch_data[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic output_reg_ready; + + logic norm_in_ready[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic norm_out_valid[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic norm_batch_ready[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + logic output_reg_valid[DATA_IN_0_PARALLELISM_DIM_0-1:0]; + + /* verilator lint_on UNUSEDSIGNAL */ + + matrix_fifo #( + .DATA_WIDTH(DATA_IN_0_PRECISION_0), + .DIM0 (DATA_IN_0_PARALLELISM_DIM_0), + .DIM1 (1), + .FIFO_SIZE (4 * NUM_ITERS) + ) input_fifo_inst ( + .clk(clk), + .rst(rst), + .in_data(data_in_0), + .in_valid(fifo_in_valid), + .in_ready(fifo_in_ready), + .out_data(fifo_data), + .out_valid(fifo_out_valid), + .out_ready(fifo_out_ready) + ); + + // Input Adder Tree + fixed_adder_tree #( + .IN_SIZE (DATA_IN_0_PARALLELISM_DIM_0), + .IN_WIDTH(DATA_IN_0_PRECISION_0) + ) sum_adder_tree ( + .clk(clk), + .rst(rst), + .data_in(data_in_0), + .data_in_valid(adder_tree_in_valid), + .data_in_ready(adder_tree_in_ready), + .data_out(adder_tree_data), + .data_out_valid(adder_tree_out_valid), + .data_out_ready(adder_tree_out_ready) + ); + + // Split2 for input to FIFO & Adder Tree + split2 input_fifo_adder_split ( + .data_in_valid (data_in_0_valid), + .data_in_ready (data_in_0_ready), + .data_out_valid({adder_tree_in_valid, fifo_in_valid}), + .data_out_ready({adder_tree_in_ready, fifo_in_ready}) + ); + // Accumulator for mu + fixed_accumulator #( + .IN_DEPTH(NUM_ITERS), + .IN_WIDTH(ADDER_TREE_OUT_WIDTH) + ) mu_accumulator ( + .clk(clk), + .rst(rst), + .data_in(adder_tree_data), + .data_in_valid(adder_tree_out_valid), + .data_in_ready(adder_tree_out_ready), + .data_out(mu_acc), + .data_out_valid(mu_acc_valid), + .data_out_ready(mu_acc_ready) + ); + + + // Division by NUM_VALUES + // ACC_WIDTH = DATA_IN_WIDTH + $clog2(NUM_VALUES) + // BASICALLY the same thing + localparam bit [ACC_OUT_WIDTH+1:0] INV_NUMVALUES_0 = ((1 << ACC_OUT_WIDTH) / NUM_VALUES); + assign mu_acc_div = ($signed(mu_acc) * 
$signed(INV_NUMVALUES_0)); + + fixed_signed_cast #( + .IN_WIDTH(ACC_OUT_WIDTH + ACC_OUT_WIDTH + 1), + .IN_FRAC_WIDTH(DATA_IN_0_PRECISION_1 + ACC_OUT_WIDTH), + .OUT_WIDTH(DATA_IN_0_PRECISION_0), + .OUT_FRAC_WIDTH(DATA_IN_0_PRECISION_1), + .SYMMETRIC(0), + .ROUND_FLOOR(1) + ) acc_div_cast_i ( + .in_data (mu_acc_div), + .out_data(mu_in) + ); + + single_element_repeat #( + .DATA_WIDTH(DATA_IN_0_PRECISION_0), + .REPEAT(NUM_ITERS) + ) mu_buffer ( + .clk(clk), + .rst(rst), + .in_data(mu_in), + .in_valid(mu_acc_valid), + .in_ready(mu_acc_ready), + .out_data(mu_out), + .out_valid(mu_out_valid), + .out_ready(mu_out_ready) + ); + + // Join 2 for combining fifo and mu buffer signals + assign mu_fifo_ready = diff_in_ready[0]; + + join2 mu_fifo_join2 ( + .data_in_valid ({mu_out_valid, fifo_out_valid}), + .data_in_ready ({mu_out_ready, fifo_out_ready}), + .data_out_valid(mu_fifo_valid), + .data_out_ready(mu_fifo_ready) + ); + + // Compute pipeline + + for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_0; i++) begin : compute_pipe + + // Take the difference between input and mean: (X - mu) + assign diff_in[i] = $signed(fifo_data[i]) - $signed(mu_out); + + skid_buffer #( + .DATA_WIDTH(DIFF_WIDTH) + ) subtract_reg ( + .clk(clk), + .rst(rst), + .data_in(diff_in[i]), + .data_in_valid(mu_fifo_valid), + .data_in_ready(diff_in_ready[i]), + .data_out(diff_out[i]), + .data_out_valid(diff_out_valid[i]), + .data_out_ready(fifo_diff_in_ready) + ); + + // Assign the output of diff int batch to be buffered + assign diff_batch_in[i] = diff_out[i]; + + // There will be a split in the pipline here, split2 is down below. 
+ + // Take the difference and square it: (X - mu) ^ 2 + + assign square_in[i] = $signed(diff_batch_in[i]) * $signed(diff_batch_in[i]); + + skid_buffer #( + .DATA_WIDTH(SQUARE_WIDTH) + ) square_reg ( + .clk(clk), + .rst(rst), + .data_in(square_in[i]), + .data_in_valid(fifo_diff_out_valid), + .data_in_ready(square_in_ready[i]), + .data_out(square_out[i]), + .data_out_valid(square_out_valid[i]), + .data_out_ready(squares_adder_tree_in_ready) + ); + end + + assign fifo_diff_in_valid = diff_out_valid[0]; + split2 fifo_diff_split ( + .data_in_valid (fifo_diff_in_valid), + .data_in_ready (fifo_diff_in_ready), + .data_out_valid({diff_batch_in_valid, fifo_diff_out_valid}), + .data_out_ready({diff_batch_in_ready, square_in_ready[0]}) + ); + + assign squares_adder_tree_in_valid = square_out_valid[0]; + + fixed_adder_tree #( + .IN_SIZE (SQUARES_ADDER_TREE_IN_SIZE), + .IN_WIDTH(SQUARE_WIDTH) + ) squares_adder_tree ( + .clk(clk), + .rst(rst), + .data_in(square_out), + .data_in_valid(squares_adder_tree_in_valid), + .data_in_ready(squares_adder_tree_in_ready), + .data_out(squares_adder_tree_data), + .data_out_valid(squares_adder_tree_out_valid), + .data_out_ready(squares_adder_tree_out_ready) + ); + + fixed_accumulator #( + .IN_DEPTH(NUM_ITERS), + .IN_WIDTH(SQUARES_ADDER_TREE_OUT_WIDTH) + ) squares_accumulator ( + .clk(clk), + .rst(rst), + .data_in(squares_adder_tree_data), + .data_in_valid(squares_adder_tree_out_valid), + .data_in_ready(squares_adder_tree_out_ready), + .data_out(squares_acc), + .data_out_valid(squares_acc_valid), + .data_out_ready(squares_acc_ready) + ); + + // Division by NUM_VALUES + localparam bit [SQUARES_ADDER_TREE_OUT_WIDTH+1:0] INV_NUMVALUES_1 = ((1 << SQUARES_ADDER_TREE_OUT_WIDTH) / NUM_VALUES); + assign variance_buffer = (squares_acc * INV_NUMVALUES_1) >> SQUARES_ADDER_TREE_OUT_WIDTH; + assign variance_in = variance_buffer[VARIANCE_WIDTH-1:0]; + + skid_buffer #( + .DATA_WIDTH(VARIANCE_WIDTH) + ) variance_reg ( + .clk(clk), + .rst(rst), + 
.data_in(variance_in), + .data_in_valid(squares_acc_valid), + .data_in_ready(squares_acc_ready), + .data_out(variance_out), + .data_out_valid(variance_out_valid), + .data_out_ready(variance_out_ready) + ); + + + fixed_signed_cast #( + .IN_WIDTH(VARIANCE_WIDTH), + .IN_FRAC_WIDTH(VARIANCE_FRAC_WIDTH), + .OUT_WIDTH(ISQRT_IN_PRECISION_0), + .OUT_FRAC_WIDTH(ISQRT_IN_PRECISION_1), + .SYMMETRIC(0), + .ROUND_FLOOR(1) + ) variance_cast_i ( + .in_data (variance_out), + .out_data(variance_cast) + ); + register_slice #( + .DATA_WIDTH(ISQRT_IN_PRECISION_0) + ) register_slice ( + .clk (clk), + .rst (rst), + .data_in_valid (variance_out_valid), + .data_in_ready (variance_out_ready), + .data_in (variance_cast), + .data_out_valid(inv_sqrt_valid), + .data_out_ready(inv_sqrt_ready), + .data_out (inv_sqrt_in) + ); + + isqrt_lut #( + .DATA_IN_0_PRECISION_0 (ISQRT_IN_PRECISION_0), + .DATA_IN_0_PRECISION_1 (ISQRT_IN_PRECISION_1), + .DATA_OUT_0_PRECISION_0(ISQRT_OUT_PRECISION_0), + .DATA_OUT_0_PRECISION_1(ISQRT_OUT_PRECISION_1) + ) exp_map ( + .data_in_0 (inv_sqrt_in), + .data_out_0(inv_sqrt_data) + ); + + + single_element_repeat #( + .DATA_WIDTH(ISQRT_OUT_PRECISION_0), + .REPEAT(NUM_ITERS) + ) isqrt_var_circ_buffer ( + .clk(clk), + .rst(rst), + .in_data(inv_sqrt_data), + .in_valid(inv_sqrt_valid), + .in_ready(inv_sqrt_ready), + .out_data(isqrt_circ_data), + .out_valid(isqrt_circ_valid), + .out_ready(isqrt_circ_ready) + ); + + // Join2 for pipeline join at sqrt and diff fifo + // logic inv_sqrt_ready; + join2 diff_fifo_isqrt_join ( + .data_in_valid ({diff_batch_out_valid, isqrt_circ_valid}), + .data_in_ready ({diff_batch_out_ready, isqrt_circ_ready}), + .data_out_valid(norm_in_valid), + .data_out_ready(norm_in_ready[0]) + ); + + + + matrix_fifo #( + .DATA_WIDTH(DIFF_WIDTH), + .DIM0(DATA_IN_0_PARALLELISM_DIM_0), + .DIM1(1), + .FIFO_SIZE(4 * NUM_ITERS) + ) diff_fifo_inst ( + .clk(clk), + .rst(rst), + .in_data(diff_batch_in), + .in_valid(diff_batch_in_valid), + 
.in_ready(diff_batch_in_ready), + .out_data(diff_batch_out), + .out_valid(diff_batch_out_valid), + .out_ready(diff_batch_out_ready) + ); + + + + + // Output chunks compute pipeline: final multiply and output cast + + for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_0; i++) begin : out_mult_pipe + + // Multiply difference with 1/sqrt(var) to get normalized result + assign norm_in_data[i] = $signed({1'b0, isqrt_circ_data}) * $signed(diff_batch_out[i]); + + skid_buffer #( + .DATA_WIDTH(NORM_WIDTH) + ) norm_reg ( + .clk(clk), + .rst(rst), + .data_in(norm_in_data[i]), + .data_in_valid(norm_in_valid), + .data_in_ready(norm_in_ready[i]), + .data_out(norm_out_data[i]), + .data_out_valid(norm_out_valid[i]), + .data_out_ready(norm_batch_ready[i]) + ); + + // Output Rounding Stage + fixed_signed_cast #( + .IN_WIDTH(NORM_WIDTH), + .IN_FRAC_WIDTH(NORM_FRAC_WIDTH), + .OUT_WIDTH(DATA_OUT_0_PRECISION_0), + .OUT_FRAC_WIDTH(DATA_OUT_0_PRECISION_1), + .SYMMETRIC(0), + .ROUND_FLOOR(1) + ) output_cast ( + .in_data (norm_out_data[i]), + .out_data(norm_round_out[i]) + ); + + skid_buffer #( + .DATA_WIDTH(DATA_OUT_0_PRECISION_0) + ) output_reg ( + .clk(clk), + .rst(rst), + .data_in(norm_round_out[i]), + .data_in_valid(norm_out_valid[i]), + .data_in_ready(norm_batch_ready[i]), + .data_out(norm_batch_data[i]), + .data_out_valid(output_reg_valid[i]), + .data_out_ready(output_reg_ready) + ); + end + + // Final connection to output + assign data_out_0 = norm_batch_data; + assign data_out_0_valid = output_reg_valid[0]; + assign output_reg_ready = data_out_0_ready; + +endmodule \ No newline at end of file diff --git a/src/mase_components/normalization_layers/rtl/layer_norm_2d.sv b/src/mase_components/normalization_layers/rtl/layer_norm_2d.sv new file mode 100644 index 000000000..d73505516 --- /dev/null +++ b/src/mase_components/normalization_layers/rtl/layer_norm_2d.sv @@ -0,0 +1,215 @@ +/* +Module : group_norm_2d +Description : This module calculates the generalised group norm. 
+ https://arxiv.org/abs/1803.08494v3 + + This module can be easily trivially specialised into layer norm or + instance norm by setting the GROUP_CHANNELS param to equal C or 1 + respectively. + + Group norm is independent of batch size, so the input shape is: + (GROUP, DEPTH_DIM1 * DEPTH_DIM0, COMPUTE_DIM1 * COMPUTE_DIM0) + assume we flatten layernorm.normalized_shape to, and then calculate it + so it actually is dim_0 = prod(normalized_shape), x.reshape(-1, dim0), out = norm(dim_0)(x) + 2d means parallelism here +*/ + +`timescale 1ns / 1ps +module layer_norm_2d #( + // Dimensions + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 4, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 2, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 4, + parameter DATA_IN_0_PARALLELISM_DIM_1 = 2, + + // Data widths + parameter DATA_IN_0_PRECISION_0 = 8, + parameter DATA_IN_0_PRECISION_1 = 4, + parameter WEIGHT_PRECISION_0 = 8, + parameter WEIGHT_PRECISION_1 = 4, + parameter BIAS_PRECISION_0 = 8, + parameter BIAS_PRECISION_1 = 4, + parameter ELEMENTWISE_AFFINE = 0, + parameter HAS_BIAS = 0, + parameter ISQRT_IN_PRECISION_0 = 8, + parameter ISQRT_IN_PRECISION_1 = 8, + parameter ISQRT_OUT_PRECISION_0 = 8, + parameter ISQRT_OUT_PRECISION_1 = 4, + parameter BIAS_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter BIAS_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, + parameter BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter BIAS_PARALLELISM_DIM_1 = 1, + parameter WEIGHT_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter WEIGHT_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, + parameter WEIGHT_TENSOR_SIZE_DIM_1 = 1, + parameter WEIGHT_PARALLELISM_DIM_1 = 1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = DATA_IN_0_TENSOR_SIZE_DIM_0, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = DATA_IN_0_PARALLELISM_DIM_0, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PRECISION_0 = 8, + parameter 
DATA_OUT_0_PRECISION_1 = 4 +) ( + input logic clk, + input logic rst, + + input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_1*DATA_IN_0_PARALLELISM_DIM_0-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + + input logic [WEIGHT_PRECISION_0-1:0] weight [DATA_IN_0_PARALLELISM_DIM_0 - 1 : 0], + input logic weight_valid, + output logic weight_ready, + + input logic [BIAS_PRECISION_0-1:0] bias [DATA_IN_0_PARALLELISM_DIM_0 - 1 : 0], + input logic bias_valid, + output logic bias_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_IN_0_PARALLELISM_DIM_1*DATA_IN_0_PARALLELISM_DIM_0-1:0], + output logic data_out_0_valid, + input logic data_out_0_ready +); + logic [DATA_IN_0_PARALLELISM_DIM_1 - 1:0] parallel_norm_in_ready, parallel_norm_out_valid; + logic join_out_valid, join_out_ready; + logic [DATA_OUT_0_PRECISION_0 - 1:0] norm_out [DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1 - 1:0]; + localparam AFFINE_PRECISION_0 = DATA_OUT_0_PRECISION_0 + WEIGHT_PRECISION_0 + 1; + localparam AFFINE_PRECISION_1 = DATA_OUT_0_PRECISION_1 + WEIGHT_PRECISION_1; + logic [AFFINE_PRECISION_0 -1:0] uncast_data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0 * DATA_OUT_0_PARALLELISM_DIM_1 - 1:0]; + logic [AFFINE_PRECISION_0 - 1:0] casted_bias[DATA_OUT_0_PARALLELISM_DIM_0-1:0]; + logic [ BIAS_PRECISION_0 - 1:0] bias_buffered [DATA_IN_0_PARALLELISM_DIM_0 - 1 : 0]; + logic [WEIGHT_PRECISION_0 - 1:0] weight_buffered[DATA_IN_0_PARALLELISM_DIM_0 - 1 : 0]; + logic bias_buffered_valid, bias_buffered_ready, weight_buffered_ready, weight_buffered_valid; + for (genvar i = 0; i < DATA_IN_0_PARALLELISM_DIM_1; i++) begin : parallel_dim_1 + layer_norm_1d #( + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + // Data widths + .DATA_IN_0_PRECISION_0(DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1(DATA_IN_0_PRECISION_1), + .ISQRT_IN_PRECISION_0(ISQRT_IN_PRECISION_0), + 
.ISQRT_IN_PRECISION_1(ISQRT_IN_PRECISION_1), + .ISQRT_OUT_PRECISION_0(ISQRT_OUT_PRECISION_0), + .ISQRT_OUT_PRECISION_1(ISQRT_OUT_PRECISION_1), + .DATA_OUT_0_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0), + .DATA_OUT_0_PARALLELISM_DIM_0(DATA_OUT_0_PARALLELISM_DIM_0), + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + ) layer_norm_inst ( + .clk, + .rst, + .data_in_0(data_in_0[i*DATA_IN_0_PARALLELISM_DIM_0 + DATA_IN_0_PARALLELISM_DIM_0 - 1: i*DATA_IN_0_PARALLELISM_DIM_0]), + .data_in_0_valid(data_in_0_valid), + .data_in_0_ready(parallel_norm_in_ready[i]), + .data_out_0(norm_out[i*DATA_IN_0_PARALLELISM_DIM_0 + DATA_IN_0_PARALLELISM_DIM_0 - 1: i*DATA_IN_0_PARALLELISM_DIM_0]), + .data_out_0_valid(parallel_norm_out_valid[i]), + .data_out_0_ready(join_out_ready) + ); + end + assign data_in_0_ready = parallel_norm_in_ready[0]; + assign join_out_valid = parallel_norm_out_valid[0]; + input_buffer #( + .DATA_WIDTH (BIAS_PRECISION_0), + .IN_NUM (DATA_IN_0_PARALLELISM_DIM_0), + .REPEAT (DATA_IN_0_TENSOR_SIZE_DIM_1 / DATA_IN_0_PARALLELISM_DIM_1), + .BUFFER_SIZE(DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0) + ) bias_buffer_inst ( + .clk, + .rst, + + // Input streaming port + .data_in(bias), + .data_in_valid(bias_valid), + .data_in_ready(bias_ready), + + // Output streaming port + .data_out(bias_buffered), + .data_out_valid(bias_buffered_valid), + .data_out_ready(bias_buffered_ready) + ); + input_buffer #( + .DATA_WIDTH (WEIGHT_PRECISION_0), + .IN_NUM (DATA_IN_0_PARALLELISM_DIM_0), + .REPEAT (DATA_IN_0_TENSOR_SIZE_DIM_1 / DATA_IN_0_PARALLELISM_DIM_1), + .BUFFER_SIZE(DATA_IN_0_TENSOR_SIZE_DIM_0 / DATA_IN_0_PARALLELISM_DIM_0) + ) weight_buffer_inst ( + .clk, + .rst, + + // Input streaming port + .data_in(weight), + .data_in_valid(weight_valid), + .data_in_ready(weight_ready), + + // Output streaming port + .data_out(weight_buffered), + .data_out_valid(weight_buffered_valid), + .data_out_ready(weight_buffered_ready) + 
); + if (ELEMENTWISE_AFFINE == 1) begin + logic wd_valid, wd_ready; + join2 weight_data_join_inst ( + .data_in_valid ({weight_buffered_valid, join_out_valid}), + .data_in_ready ({weight_buffered_ready, join_out_ready}), + .data_out_valid(wd_valid), + .data_out_ready(wd_ready) + ); + logic [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1 - 1:0] + parallel_wd_ready, parallel_bias_ready, parallel_data_out_0_valid; + assign bias_buffered_ready = parallel_bias_ready[0]; + assign wd_ready = parallel_wd_ready[0]; + assign data_out_0_valid = parallel_data_out_0_valid[0]; + for (genvar i = 0; i < DATA_OUT_0_PARALLELISM_DIM_1; i++) begin : affine_parallel_dim1 + for (genvar j = 0; j < DATA_OUT_0_PARALLELISM_DIM_0; j++) begin : affine_parallel_dim0 + localparam int k = i * DATA_IN_0_PARALLELISM_DIM_0 + j; + if (HAS_BIAS == 1) begin + join2 wd_bias_join_inst ( + .data_in_valid ({wd_valid, bias_buffered_valid}), + .data_in_ready ({parallel_wd_ready[k], parallel_bias_ready[k]}), + .data_out_valid(parallel_data_out_0_valid[k]), + .data_out_ready(data_out_0_ready) + ); + if (i==0) begin + fixed_signed_cast #( + .IN_WIDTH(BIAS_PRECISION_0), + .IN_FRAC_WIDTH(BIAS_PRECISION_1), + .OUT_WIDTH(AFFINE_PRECISION_0), + .OUT_FRAC_WIDTH(AFFINE_PRECISION_1), + .SYMMETRIC(0), + .ROUND_FLOOR(1) + ) variance_cast_i ( + .in_data (bias_buffered[j]), + .out_data(casted_bias[j]) + ); + end + assign uncast_data_out_0[k] = $signed( + norm_out[k] + ) * $signed( + weight_buffered[j] + ) + $signed( + casted_bias[j] + ); + end else begin + assign parallel_wd_ready[k] = data_out_0_ready; + assign parallel_data_out_0_valid[k] = wd_valid; + assign parallel_bias_ready[k] = 1; + assign uncast_data_out_0[k] = $signed(norm_out[k]) * $signed(weight_buffered[j]); + end + fixed_signed_cast #( + .IN_WIDTH(AFFINE_PRECISION_0), + .IN_FRAC_WIDTH(AFFINE_PRECISION_1), + .OUT_WIDTH(DATA_OUT_0_PRECISION_0), + .OUT_FRAC_WIDTH(DATA_OUT_0_PRECISION_1), + .SYMMETRIC(0), + .ROUND_FLOOR(1) + ) variance_cast_i ( + 
.in_data (uncast_data_out_0[k]), + .out_data(data_out_0[k]) + ); + end + end + end else begin + assign join_out_ready = data_out_0_ready; + assign data_out_0_valid = join_out_valid; + assign data_out_0 = norm_out; + end +endmodule diff --git a/src/mase_components/normalization_layers/test/layer_norm_2d_tb.py b/src/mase_components/normalization_layers/test/layer_norm_2d_tb.py new file mode 100644 index 000000000..95e70b299 --- /dev/null +++ b/src/mase_components/normalization_layers/test/layer_norm_2d_tb.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python3 + +import os +import pytest + +import os, sys, logging, traceback, pdb +import torch +import logging +from functools import partial +from mase_components.helper import generate_memory +from pathlib import Path +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer, RisingEdge +import logging + +logger = logging.getLogger("norm.models") +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setLevel(logging.DEBUG) +logger.addHandler(handler) + +from chop.tools.logger import set_logging_verbosity + +set_logging_verbosity("debug") + + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import StreamDriver, StreamMonitor +from mase_cocotb.runner import mase_runner +from mase_cocotb.utils import fixed_preprocess_tensor + +from chop.nn.quantized.modules import LayerNormIntegerFloor + + +class LayerNormTB(Testbench): + def __init__(self, dut) -> None: + super().__init__(dut, dut.clk, dut.rst) + + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + self.log.setLevel(logging.DEBUG) + + self.in_data_driver = StreamDriver( + dut.clk, dut.data_in_0, dut.data_in_0_valid, dut.data_in_0_ready + ) + if self.get_parameter("ELEMENTWISE_AFFINE"): 
+ self.weight_driver = StreamDriver( + dut.clk, dut.weight, dut.weight_valid, dut.weight_ready + ) + if self.get_parameter("ELEMENTWISE_AFFINE"): + self.bias_driver = StreamDriver( + dut.clk, dut.bias, dut.bias_valid, dut.bias_ready + ) + self.out_data_monitor = StreamMonitor( + dut.clk, + dut.data_out_0, + dut.data_out_0_valid, + dut.data_out_0_ready, + check=True, + ) + # Model + self.model = LayerNormIntegerFloor( + normalized_shape=self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), + config={ + "data_in_width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "data_in_frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + "isqrt_in_width": self.get_parameter("ISQRT_IN_PRECISION_0"), + "isqrt_in_frac_width": self.get_parameter("ISQRT_IN_PRECISION_1"), + "isqrt_out_width": self.get_parameter("ISQRT_OUT_PRECISION_0"), + "isqrt_out_frac_width": self.get_parameter("ISQRT_OUT_PRECISION_1"), + "weight_width": self.get_parameter("WEIGHT_PRECISION_0"), + "weight_frac_width": self.get_parameter("WEIGHT_PRECISION_1"), + "bias_width": self.get_parameter("BIAS_PRECISION_0"), + "bias_frac_width": self.get_parameter("BIAS_PRECISION_1"), + "data_out_width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "data_out_frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + "by_pass": False, + }, + elementwise_affine=( + True if self.get_parameter("ELEMENTWISE_AFFINE") == 1 else False + ), + bias=True if self.get_parameter("HAS_BIAS") == 1 else False, + ) + if self.get_parameter("ELEMENTWISE_AFFINE") == 1: + self.model.weight = torch.nn.Parameter( + 5 * torch.rand(self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0")) + ) + if self.get_parameter("HAS_BIAS") == 1: + self.model.bias = torch.nn.Parameter( + 5 * torch.rand(self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0")) + ) + # Set verbosity of driver and monitor loggers to debug + self.in_data_driver.log.setLevel(logging.DEBUG) + self.out_data_monitor.log.setLevel(logging.DEBUG) + + def generate_inputs(self): + return 
torch.randn( + ( + self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_1"), + self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), + ) + ) + + async def run_test(self, batches, us): + await self.reset() + self.log.info(f"Reset finished") + + for _ in range(batches): + inputs = self.generate_inputs() + exp_out = self.model(inputs) + + # * Load the inputs driver + self.log.info(f"Processing inputs: {inputs}") + inputs = fixed_preprocess_tensor( + tensor=inputs, + q_config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], + floor=True, + ) + self.in_data_driver.load_driver(inputs) + if self.get_parameter("ELEMENTWISE_AFFINE"): + weights = fixed_preprocess_tensor( + tensor=self.model.weight, + q_config={ + "width": self.get_parameter("WEIGHT_PRECISION_0"), + "frac_width": self.get_parameter("WEIGHT_PRECISION_1"), + }, + parallelism=[ + 1, + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], + floor=True, + ) + self.weight_driver.load_driver(weights) + if self.get_parameter("HAS_BIAS"): + biases = fixed_preprocess_tensor( + tensor=self.model.bias, + q_config={ + "width": self.get_parameter("BIAS_PRECISION_0"), + "frac_width": self.get_parameter("BIAS_PRECISION_1"), + }, + parallelism=[ + 1, + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], + floor=True, + ) + self.bias_driver.load_driver(biases) + # * Load the output monitor + self.log.info(f"Processing outputs: {exp_out}") + outs = fixed_preprocess_tensor( + tensor=exp_out, + q_config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ], + ) + self.out_data_monitor.load_monitor(outs) + + # from mase_cocotb.utils import 
check_signal + # cocotb.start_soon(check_signal(self.dut, self.log, ["mu_out"])) + await Timer(us, units="us") + assert self.out_data_monitor.exp_queue.empty() + + +@cocotb.test() +async def single_test(dut): + tb = LayerNormTB(dut) + tb.out_data_monitor.ready.value = 1 + await tb.run_test(batches=1, us=100) + + +# @cocotb.test() +# async def repeated_mult(dut): +# tb = LayerNormTB(dut) +# tb.out_data_monitor.ready.value = 1 +# await tb.run_test(batches=100, us=2000) + + +# @cocotb.test() +# async def repeated_mult_backpressure(dut): +# tb = LayerNormTB(dut) +# cocotb.start_soon(bit_driver(dut.data_out_0_ready, dut.clk, 0.6)) +# await tb.run_test(batches=10, us=500) + + +# @cocotb.test() +# async def repeated_mult_valid_backpressure(dut): +# tb = LayerNormTB(dut) +# tb.in_data_driver.set_valid_prob(0.7) +# cocotb.start_soon(bit_driver(dut.data_out_0_ready, dut.clk, 0.6)) +# await tb.run_test(batches=50, us=200) + +# Don't support : +# 1. DATA_IN_0_PARALLELISM_DIM_0 ==DATA_IN_0_TENSOR_SIZE_DIM_0 +# +dut_params = { + "ELEMENTWISE_AFFINE": 0, + "HAS_BIAS": 0, + "DATA_IN_0_TENSOR_SIZE_DIM_0": 12, + "DATA_IN_0_PARALLELISM_DIM_0": 2, + "DATA_IN_0_TENSOR_SIZE_DIM_1": 4, + "DATA_IN_0_PARALLELISM_DIM_1": 2, + "DATA_IN_0_PRECISION_0": 8, + "DATA_IN_0_PRECISION_1": 4, + "WEIGHT_PRECISION_0": 8, + "WEIGHT_PRECISION_1": 4, + "BIAS_PRECISION_0": 8, + "BIAS_PRECISION_1": 4, + "ISQRT_IN_PRECISION_0": 7, + "ISQRT_IN_PRECISION_1": 4, + "ISQRT_OUT_PRECISION_0": 12, + "ISQRT_OUT_PRECISION_1": 4, + "DATA_OUT_0_PRECISION_0": 10, + "DATA_OUT_0_PRECISION_1": 4, +} + + +def get_fixed_softmax_config(kwargs={}): + config = dut_params + config.update(kwargs) + return config + + +torch.manual_seed(1) + + +@pytest.mark.dev +def test_fixed_softmax_smoke(): + """ + Some quick tests to check if the module is working. 
+ """ + path = Path(__file__).parents[1] / "rtl" + generate_memory.generate_sv_lut( + "isqrt", + dut_params["ISQRT_IN_PRECISION_0"], + dut_params["ISQRT_IN_PRECISION_1"], + dut_params["ISQRT_OUT_PRECISION_0"], + dut_params["ISQRT_OUT_PRECISION_1"], + path=path, + floor=True, + ) + mase_runner( + trace=True, + module_param_list=[ + get_fixed_softmax_config(), + ], + sim="verilator", + # skip_build=True, + ) + + +if __name__ == "__main__": + test_fixed_softmax_smoke() diff --git a/src/mase_components/scalar_operators/fixed/rtl/fixed_div.sv b/src/mase_components/scalar_operators/fixed/rtl/fixed_div.sv new file mode 100644 index 000000000..0eb6e19e7 --- /dev/null +++ b/src/mase_components/scalar_operators/fixed/rtl/fixed_div.sv @@ -0,0 +1,1155 @@ +// ============================================================== +// Generated by Vitis HLS v2023.1 +// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. +// ============================================================== +`timescale 1 ns / 1 ps +(* CORE_GENERATION_INFO="div_div,hls_ip_2023_1,{HLS_INPUT_TYPE=cxx,HLS_INPUT_FLOAT=0,HLS_INPUT_FIXED=0,HLS_INPUT_PART=xcu250-figd2104-2L-e,HLS_INPUT_CLOCK=10.000000,HLS_INPUT_ARCH=pipeline,HLS_SYN_CLOCK=2.594500,HLS_SYN_LAT=27,HLS_SYN_TPT=1,HLS_SYN_MEM=0,HLS_SYN_DSP=0,HLS_SYN_FF=1487,HLS_SYN_LUT=1130,HLS_VERSION=2023_1}" *) +// NOTE!! This div is based on the int div generated based on hls, which can only handle the 16/16 division +// NOTE!! When divisor_data == 0, quotient_data will be set to 111..111 which is -1, +// But at this point, this div is only used for softmax, so negelect this bug. 
// Elementwise fixed-point divider: wraps IN_NUM copies of the HLS-generated
// `div` core (a 36-stage pipelined signed 32/32 -> 16 divider) behind
// valid/ready streaming handshakes. Inputs are width-adjusted to the core's
// expected widths via fixed_rounding, and each lane's result is buffered in a
// FIFO to absorb the core's pipeline latency before the output handshake.
// NOTE(review): all lanes share one dividend/divisor valid, and the combined
// ready/valid is taken from lane 0 — assumes all lanes advance in lockstep.
module fixed_div #(
    parameter IN_NUM = 8,           // number of parallel divide lanes
    parameter FIFO_DEPTH = 32,      // per-lane result FIFO depth (>= pipeline latency)
    parameter DIVIDEND_WIDTH = 8,
    parameter DIVISOR_WIDTH = 8,
    parameter QUOTIENT_WIDTH = 8
) (
    input logic clk,
    input logic rst,
    input logic [DIVIDEND_WIDTH-1:0] dividend_data[IN_NUM - 1:0],
    input logic dividend_data_valid,
    output logic dividend_data_ready,
    input logic [DIVISOR_WIDTH-1:0] divisor_data[IN_NUM - 1:0],
    input logic divisor_data_valid,
    output logic divisor_data_ready,
    output logic [QUOTIENT_WIDTH-1:0] quotient_data[IN_NUM - 1:0],
    output logic quotient_data_valid,
    input logic quotient_data_ready
);
  // Parameter sanity checks. $fatal requires the finish number (0/1/2) as its
  // first argument when a message is supplied (IEEE 1800-2017 §20.10); the
  // previous `$fatal("...")` form is non-conformant and rejected by strict tools.
  initial begin
    assert (DIVIDEND_WIDTH <= 32)
    else $fatal(1, "DIVIDEND_WIDTH Set may cause resolution loss.");
    assert (DIVISOR_WIDTH <= 32)
    else $fatal(1, "DIVISOR_WIDTH Set may cause resolution loss.");
    assert (QUOTIENT_WIDTH <= 16)
    else $fatal(1, "QUOTIENT_WIDTH Set may cause resolution loss.");
  end

  // All lanes handshake together; lane 0 is representative of the group.
  logic [IN_NUM - 1:0]
      dividend_data_ready_expand, divisor_data_ready_expand, quotient_data_valid_expand;
  always_comb begin
    dividend_data_ready = dividend_data_ready_expand[0];
    divisor_data_ready  = divisor_data_ready_expand[0];
    quotient_data_valid = quotient_data_valid_expand[0];
  end

  // Width-adjusted operands/results for the fixed-width HLS core.
  // NOTE(review): dividend is rounded to 16 bits while the core port is 32 —
  // consistent with the "16/16 division only" limitation noted in the header.
  logic [15:0] rounding_dividend[IN_NUM - 1:0];
  logic [31:0] rounding_divisor[IN_NUM - 1:0];
  logic [15:0] rounding_quotient[IN_NUM - 1:0];

  logic [15:0] fifo_quotient[IN_NUM - 1:0];
  logic [IN_NUM-1:0] fifo_in_valid, fifo_in_ready;

  fixed_rounding #(
      .IN_SIZE(IN_NUM),
      .IN_WIDTH(DIVIDEND_WIDTH),
      .IN_FRAC_WIDTH(0),
      .OUT_WIDTH(16),
      .OUT_FRAC_WIDTH(0)
  ) dividend_round_inst (
      .data_in (dividend_data),
      .data_out(rounding_dividend)
  );
  fixed_rounding #(
      .IN_SIZE(IN_NUM),
      .IN_WIDTH(DIVISOR_WIDTH),
      .IN_FRAC_WIDTH(0),
      .OUT_WIDTH(32),
      .OUT_FRAC_WIDTH(0)
  ) divisor_round_inst (
      .data_in (divisor_data),
      .data_out(rounding_divisor)
  );

  fixed_rounding #(
      .IN_SIZE(IN_NUM),
      .IN_WIDTH(16),
      .IN_FRAC_WIDTH(0),
      .OUT_WIDTH(QUOTIENT_WIDTH),
      .OUT_FRAC_WIDTH(0)
  ) quotient_round_inst (
      .data_in (rounding_quotient),
      .data_out(quotient_data)
  );

  // One HLS divider + result FIFO per lane. The core's FIFO-style ports map
  // directly onto the streaming handshake (empty_n<->valid, read<->ready).
  for (genvar i = 0; i < IN_NUM; i++) begin
    div div (
        .ap_clk(clk),
        .ap_rst(rst),
        .ap_start(1'b1),
        .ap_done(),
        .ap_idle(),
        .ap_ready(),
        .data_in_0_dout(rounding_dividend[i]),
        .data_in_0_empty_n(dividend_data_valid),
        .data_in_0_read(dividend_data_ready_expand[i]),
        .data_in_1_dout(rounding_divisor[i]),
        .data_in_1_empty_n(divisor_data_valid),
        .data_in_1_read(divisor_data_ready_expand[i]),
        .data_out_0_din(fifo_quotient[i]),
        .data_out_0_write(fifo_in_valid[i]),
        .data_out_0_full_n(fifo_in_ready[i])
    );
    fifo #(
        .DEPTH(FIFO_DEPTH),
        .DATA_WIDTH(16)
    ) ff_inst (
        .clk(clk),
        .rst(rst),
        .in_data(fifo_quotient[i]),
        .in_valid(fifo_in_valid[i]),
        .in_ready(fifo_in_ready[i]),
        .out_data(rounding_quotient[i]),
        .out_valid(quotient_data_valid_expand[i]),
        .out_ready(quotient_data_ready),
        .empty(),
        .full()
    );
  end
endmodule

// ==============================================================
// Generated by Vitis HLS v2023.1
// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved.
// ==============================================================

/* verilator lint_off DECLFILENAME */
// ==============================================================
// Generated by Vitis HLS v2023.1
// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved.
+// ============================================================== + +`timescale 1 ns / 1 ps + +(* CORE_GENERATION_INFO="div_div,hls_ip_2023_1,{HLS_INPUT_TYPE=cxx,HLS_INPUT_FLOAT=0,HLS_INPUT_FIXED=0,HLS_INPUT_PART=xcu250-figd2104-2L-e,HLS_INPUT_CLOCK=10.000000,HLS_INPUT_ARCH=pipeline,HLS_SYN_CLOCK=2.593000,HLS_SYN_LAT=35,HLS_SYN_TPT=1,HLS_SYN_MEM=0,HLS_SYN_DSP=0,HLS_SYN_FF=2449,HLS_SYN_LUT=1808,HLS_VERSION=2023_1}" *) + +module div ( + ap_clk, + ap_rst, + ap_start, + ap_done, + ap_idle, + ap_ready, + data_in_0_dout, + data_in_0_empty_n, + data_in_0_read, + data_in_1_dout, + data_in_1_empty_n, + data_in_1_read, + data_out_0_din, + data_out_0_full_n, + data_out_0_write +); + + parameter ap_ST_fsm_pp0_stage0 = 1'd1; + + input ap_clk; + input ap_rst; + input ap_start; + output ap_done; + output ap_idle; + output ap_ready; + input [31:0] data_in_0_dout; + input data_in_0_empty_n; + output data_in_0_read; + input [31:0] data_in_1_dout; + input data_in_1_empty_n; + output data_in_1_read; + output [15:0] data_out_0_din; + input data_out_0_full_n; + output data_out_0_write; + + reg ap_done; + reg ap_idle; + reg ap_ready; + reg data_in_0_read; + reg data_in_1_read; + reg data_out_0_write; + + (* fsm_encoding = "none" *) reg [0:0] ap_CS_fsm; + wire ap_CS_fsm_pp0_stage0; + wire ap_enable_reg_pp0_iter0; + reg ap_enable_reg_pp0_iter1; + reg ap_enable_reg_pp0_iter2; + reg ap_enable_reg_pp0_iter3; + reg ap_enable_reg_pp0_iter4; + reg ap_enable_reg_pp0_iter5; + reg ap_enable_reg_pp0_iter6; + reg ap_enable_reg_pp0_iter7; + reg ap_enable_reg_pp0_iter8; + reg ap_enable_reg_pp0_iter9; + reg ap_enable_reg_pp0_iter10; + reg ap_enable_reg_pp0_iter11; + reg ap_enable_reg_pp0_iter12; + reg ap_enable_reg_pp0_iter13; + reg ap_enable_reg_pp0_iter14; + reg ap_enable_reg_pp0_iter15; + reg ap_enable_reg_pp0_iter16; + reg ap_enable_reg_pp0_iter17; + reg ap_enable_reg_pp0_iter18; + reg ap_enable_reg_pp0_iter19; + reg ap_enable_reg_pp0_iter20; + reg ap_enable_reg_pp0_iter21; + reg 
ap_enable_reg_pp0_iter22; + reg ap_enable_reg_pp0_iter23; + reg ap_enable_reg_pp0_iter24; + reg ap_enable_reg_pp0_iter25; + reg ap_enable_reg_pp0_iter26; + reg ap_enable_reg_pp0_iter27; + reg ap_enable_reg_pp0_iter28; + reg ap_enable_reg_pp0_iter29; + reg ap_enable_reg_pp0_iter30; + reg ap_enable_reg_pp0_iter31; + reg ap_enable_reg_pp0_iter32; + reg ap_enable_reg_pp0_iter33; + reg ap_enable_reg_pp0_iter34; + reg ap_enable_reg_pp0_iter35; + reg ap_idle_pp0; + wire ap_block_state1_pp0_stage0_iter0; + wire ap_block_state2_pp0_stage0_iter1; + wire ap_block_state3_pp0_stage0_iter2; + wire ap_block_state4_pp0_stage0_iter3; + wire ap_block_state5_pp0_stage0_iter4; + wire ap_block_state6_pp0_stage0_iter5; + wire ap_block_state7_pp0_stage0_iter6; + wire ap_block_state8_pp0_stage0_iter7; + wire ap_block_state9_pp0_stage0_iter8; + wire ap_block_state10_pp0_stage0_iter9; + wire ap_block_state11_pp0_stage0_iter10; + wire ap_block_state12_pp0_stage0_iter11; + wire ap_block_state13_pp0_stage0_iter12; + wire ap_block_state14_pp0_stage0_iter13; + wire ap_block_state15_pp0_stage0_iter14; + wire ap_block_state16_pp0_stage0_iter15; + wire ap_block_state17_pp0_stage0_iter16; + wire ap_block_state18_pp0_stage0_iter17; + wire ap_block_state19_pp0_stage0_iter18; + wire ap_block_state20_pp0_stage0_iter19; + wire ap_block_state21_pp0_stage0_iter20; + wire ap_block_state22_pp0_stage0_iter21; + wire ap_block_state23_pp0_stage0_iter22; + wire ap_block_state24_pp0_stage0_iter23; + wire ap_block_state25_pp0_stage0_iter24; + wire ap_block_state26_pp0_stage0_iter25; + wire ap_block_state27_pp0_stage0_iter26; + wire ap_block_state28_pp0_stage0_iter27; + wire ap_block_state29_pp0_stage0_iter28; + wire ap_block_state30_pp0_stage0_iter29; + wire ap_block_state31_pp0_stage0_iter30; + wire ap_block_state32_pp0_stage0_iter31; + wire ap_block_state33_pp0_stage0_iter32; + wire ap_block_state34_pp0_stage0_iter33; + wire ap_block_state35_pp0_stage0_iter34; + wire ap_block_state36_pp0_stage0_iter35; + wire 
ap_block_pp0_stage0_subdone; + wire ap_block_pp0_stage0_11001; + wire [0:0] tmp_nbreadreq_fu_32_p3; + wire [0:0] tmp_1_nbreadreq_fu_40_p3; + reg [0:0] tmp_1_reg_90; + reg [0:0] tmp_1_reg_90_pp0_iter1_reg; + reg [0:0] tmp_1_reg_90_pp0_iter2_reg; + reg [0:0] tmp_1_reg_90_pp0_iter3_reg; + reg [0:0] tmp_1_reg_90_pp0_iter4_reg; + reg [0:0] tmp_1_reg_90_pp0_iter5_reg; + reg [0:0] tmp_1_reg_90_pp0_iter6_reg; + reg [0:0] tmp_1_reg_90_pp0_iter7_reg; + reg [0:0] tmp_1_reg_90_pp0_iter8_reg; + reg [0:0] tmp_1_reg_90_pp0_iter9_reg; + reg [0:0] tmp_1_reg_90_pp0_iter10_reg; + reg [0:0] tmp_1_reg_90_pp0_iter11_reg; + reg [0:0] tmp_1_reg_90_pp0_iter12_reg; + reg [0:0] tmp_1_reg_90_pp0_iter13_reg; + reg [0:0] tmp_1_reg_90_pp0_iter14_reg; + reg [0:0] tmp_1_reg_90_pp0_iter15_reg; + reg [0:0] tmp_1_reg_90_pp0_iter16_reg; + reg [0:0] tmp_1_reg_90_pp0_iter17_reg; + reg [0:0] tmp_1_reg_90_pp0_iter18_reg; + reg [0:0] tmp_1_reg_90_pp0_iter19_reg; + reg [0:0] tmp_1_reg_90_pp0_iter20_reg; + reg [0:0] tmp_1_reg_90_pp0_iter21_reg; + reg [0:0] tmp_1_reg_90_pp0_iter22_reg; + reg [0:0] tmp_1_reg_90_pp0_iter23_reg; + reg [0:0] tmp_1_reg_90_pp0_iter24_reg; + reg [0:0] tmp_1_reg_90_pp0_iter25_reg; + reg [0:0] tmp_1_reg_90_pp0_iter26_reg; + reg [0:0] tmp_1_reg_90_pp0_iter27_reg; + reg [0:0] tmp_1_reg_90_pp0_iter28_reg; + reg [0:0] tmp_1_reg_90_pp0_iter29_reg; + reg [0:0] tmp_1_reg_90_pp0_iter30_reg; + reg [0:0] tmp_1_reg_90_pp0_iter31_reg; + reg [0:0] tmp_1_reg_90_pp0_iter32_reg; + reg [0:0] tmp_1_reg_90_pp0_iter33_reg; + reg [0:0] tmp_1_reg_90_pp0_iter34_reg; + reg [0:0] tmp_reg_99; + reg [0:0] tmp_reg_99_pp0_iter1_reg; + reg [0:0] tmp_reg_99_pp0_iter2_reg; + reg [0:0] tmp_reg_99_pp0_iter3_reg; + reg [0:0] tmp_reg_99_pp0_iter4_reg; + reg [0:0] tmp_reg_99_pp0_iter5_reg; + reg [0:0] tmp_reg_99_pp0_iter6_reg; + reg [0:0] tmp_reg_99_pp0_iter7_reg; + reg [0:0] tmp_reg_99_pp0_iter8_reg; + reg [0:0] tmp_reg_99_pp0_iter9_reg; + reg [0:0] tmp_reg_99_pp0_iter10_reg; + reg [0:0] tmp_reg_99_pp0_iter11_reg; + reg 
[0:0] tmp_reg_99_pp0_iter12_reg; + reg [0:0] tmp_reg_99_pp0_iter13_reg; + reg [0:0] tmp_reg_99_pp0_iter14_reg; + reg [0:0] tmp_reg_99_pp0_iter15_reg; + reg [0:0] tmp_reg_99_pp0_iter16_reg; + reg [0:0] tmp_reg_99_pp0_iter17_reg; + reg [0:0] tmp_reg_99_pp0_iter18_reg; + reg [0:0] tmp_reg_99_pp0_iter19_reg; + reg [0:0] tmp_reg_99_pp0_iter20_reg; + reg [0:0] tmp_reg_99_pp0_iter21_reg; + reg [0:0] tmp_reg_99_pp0_iter22_reg; + reg [0:0] tmp_reg_99_pp0_iter23_reg; + reg [0:0] tmp_reg_99_pp0_iter24_reg; + reg [0:0] tmp_reg_99_pp0_iter25_reg; + reg [0:0] tmp_reg_99_pp0_iter26_reg; + reg [0:0] tmp_reg_99_pp0_iter27_reg; + reg [0:0] tmp_reg_99_pp0_iter28_reg; + reg [0:0] tmp_reg_99_pp0_iter29_reg; + reg [0:0] tmp_reg_99_pp0_iter30_reg; + reg [0:0] tmp_reg_99_pp0_iter31_reg; + reg [0:0] tmp_reg_99_pp0_iter32_reg; + reg [0:0] tmp_reg_99_pp0_iter33_reg; + reg [0:0] tmp_reg_99_pp0_iter34_reg; + wire ap_block_pp0_stage0_01001; + wire ap_block_pp0_stage0; + wire [15:0] grp_fu_75_p2; + reg [0:0] ap_NS_fsm; + reg ap_idle_pp0_0to34; + reg ap_reset_idle_pp0; + wire ap_enable_pp0; + wire ap_ce_reg; + + // power-on initialization + initial begin + ap_CS_fsm = 1'd1; + ap_enable_reg_pp0_iter1 = 1'b0; + ap_enable_reg_pp0_iter2 = 1'b0; + ap_enable_reg_pp0_iter3 = 1'b0; + ap_enable_reg_pp0_iter4 = 1'b0; + ap_enable_reg_pp0_iter5 = 1'b0; + ap_enable_reg_pp0_iter6 = 1'b0; + ap_enable_reg_pp0_iter7 = 1'b0; + ap_enable_reg_pp0_iter8 = 1'b0; + ap_enable_reg_pp0_iter9 = 1'b0; + ap_enable_reg_pp0_iter10 = 1'b0; + ap_enable_reg_pp0_iter11 = 1'b0; + ap_enable_reg_pp0_iter12 = 1'b0; + ap_enable_reg_pp0_iter13 = 1'b0; + ap_enable_reg_pp0_iter14 = 1'b0; + ap_enable_reg_pp0_iter15 = 1'b0; + ap_enable_reg_pp0_iter16 = 1'b0; + ap_enable_reg_pp0_iter17 = 1'b0; + ap_enable_reg_pp0_iter18 = 1'b0; + ap_enable_reg_pp0_iter19 = 1'b0; + ap_enable_reg_pp0_iter20 = 1'b0; + ap_enable_reg_pp0_iter21 = 1'b0; + ap_enable_reg_pp0_iter22 = 1'b0; + ap_enable_reg_pp0_iter23 = 1'b0; + ap_enable_reg_pp0_iter24 = 1'b0; + 
ap_enable_reg_pp0_iter25 = 1'b0; + ap_enable_reg_pp0_iter26 = 1'b0; + ap_enable_reg_pp0_iter27 = 1'b0; + ap_enable_reg_pp0_iter28 = 1'b0; + ap_enable_reg_pp0_iter29 = 1'b0; + ap_enable_reg_pp0_iter30 = 1'b0; + ap_enable_reg_pp0_iter31 = 1'b0; + ap_enable_reg_pp0_iter32 = 1'b0; + ap_enable_reg_pp0_iter33 = 1'b0; + ap_enable_reg_pp0_iter34 = 1'b0; + ap_enable_reg_pp0_iter35 = 1'b0; + end + + div_sdiv_32ns_32ns_16_36_1 #( + .ID(1), + .NUM_STAGE(36), + .din0_WIDTH(32), + .din1_WIDTH(32), + .dout_WIDTH(16) + ) sdiv_32ns_32ns_16_36_1_U1 ( + .clk(ap_clk), + .reset(ap_rst), + .din0(data_in_0_dout), + .din1(data_in_1_dout), + .ce(1'b1), + .dout(grp_fu_75_p2) + ); + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_CS_fsm <= ap_ST_fsm_pp0_stage0; + end else begin + ap_CS_fsm <= ap_NS_fsm; + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter1 <= 1'b0; + end else begin + if (((1'b1 == ap_CS_fsm_pp0_stage0) & (1'b0 == ap_block_pp0_stage0_subdone))) begin + ap_enable_reg_pp0_iter1 <= ap_start; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter10 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter10 <= ap_enable_reg_pp0_iter9; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter11 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter11 <= ap_enable_reg_pp0_iter10; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter12 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter12 <= ap_enable_reg_pp0_iter11; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter13 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter13 <= 
ap_enable_reg_pp0_iter12; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter14 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter14 <= ap_enable_reg_pp0_iter13; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter15 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter15 <= ap_enable_reg_pp0_iter14; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter16 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter16 <= ap_enable_reg_pp0_iter15; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter17 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter17 <= ap_enable_reg_pp0_iter16; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter18 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter18 <= ap_enable_reg_pp0_iter17; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter19 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter19 <= ap_enable_reg_pp0_iter18; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter2 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter2 <= ap_enable_reg_pp0_iter1; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter20 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter20 <= ap_enable_reg_pp0_iter19; + end + end + end + + always @(posedge ap_clk) 
begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter21 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter21 <= ap_enable_reg_pp0_iter20; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter22 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter22 <= ap_enable_reg_pp0_iter21; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter23 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter23 <= ap_enable_reg_pp0_iter22; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter24 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter24 <= ap_enable_reg_pp0_iter23; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter25 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter25 <= ap_enable_reg_pp0_iter24; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter26 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter26 <= ap_enable_reg_pp0_iter25; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter27 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter27 <= ap_enable_reg_pp0_iter26; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter28 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter28 <= ap_enable_reg_pp0_iter27; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter29 <= 1'b0; + 
end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter29 <= ap_enable_reg_pp0_iter28; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter3 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter3 <= ap_enable_reg_pp0_iter2; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter30 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter30 <= ap_enable_reg_pp0_iter29; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter31 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter31 <= ap_enable_reg_pp0_iter30; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter32 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter32 <= ap_enable_reg_pp0_iter31; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter33 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter33 <= ap_enable_reg_pp0_iter32; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter34 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter34 <= ap_enable_reg_pp0_iter33; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter35 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter35 <= ap_enable_reg_pp0_iter34; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter4 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + 
ap_enable_reg_pp0_iter4 <= ap_enable_reg_pp0_iter3; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter5 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter5 <= ap_enable_reg_pp0_iter4; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter6 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter6 <= ap_enable_reg_pp0_iter5; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter7 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter7 <= ap_enable_reg_pp0_iter6; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter8 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter8 <= ap_enable_reg_pp0_iter7; + end + end + end + + always @(posedge ap_clk) begin + if (ap_rst == 1'b1) begin + ap_enable_reg_pp0_iter9 <= 1'b0; + end else begin + if ((1'b0 == ap_block_pp0_stage0_subdone)) begin + ap_enable_reg_pp0_iter9 <= ap_enable_reg_pp0_iter8; + end + end + end + + always @(posedge ap_clk) begin + if (((1'b1 == ap_CS_fsm_pp0_stage0) & (tmp_nbreadreq_fu_32_p3 == 1'd1) & (1'b0 == ap_block_pp0_stage0_11001))) begin + tmp_1_reg_90 <= tmp_1_nbreadreq_fu_40_p3; + end + end + + always @(posedge ap_clk) begin + if ((1'b0 == ap_block_pp0_stage0_11001)) begin + tmp_1_reg_90_pp0_iter10_reg <= tmp_1_reg_90_pp0_iter9_reg; + tmp_1_reg_90_pp0_iter11_reg <= tmp_1_reg_90_pp0_iter10_reg; + tmp_1_reg_90_pp0_iter12_reg <= tmp_1_reg_90_pp0_iter11_reg; + tmp_1_reg_90_pp0_iter13_reg <= tmp_1_reg_90_pp0_iter12_reg; + tmp_1_reg_90_pp0_iter14_reg <= tmp_1_reg_90_pp0_iter13_reg; + tmp_1_reg_90_pp0_iter15_reg <= tmp_1_reg_90_pp0_iter14_reg; + tmp_1_reg_90_pp0_iter16_reg <= tmp_1_reg_90_pp0_iter15_reg; + 
tmp_1_reg_90_pp0_iter17_reg <= tmp_1_reg_90_pp0_iter16_reg; + tmp_1_reg_90_pp0_iter18_reg <= tmp_1_reg_90_pp0_iter17_reg; + tmp_1_reg_90_pp0_iter19_reg <= tmp_1_reg_90_pp0_iter18_reg; + tmp_1_reg_90_pp0_iter20_reg <= tmp_1_reg_90_pp0_iter19_reg; + tmp_1_reg_90_pp0_iter21_reg <= tmp_1_reg_90_pp0_iter20_reg; + tmp_1_reg_90_pp0_iter22_reg <= tmp_1_reg_90_pp0_iter21_reg; + tmp_1_reg_90_pp0_iter23_reg <= tmp_1_reg_90_pp0_iter22_reg; + tmp_1_reg_90_pp0_iter24_reg <= tmp_1_reg_90_pp0_iter23_reg; + tmp_1_reg_90_pp0_iter25_reg <= tmp_1_reg_90_pp0_iter24_reg; + tmp_1_reg_90_pp0_iter26_reg <= tmp_1_reg_90_pp0_iter25_reg; + tmp_1_reg_90_pp0_iter27_reg <= tmp_1_reg_90_pp0_iter26_reg; + tmp_1_reg_90_pp0_iter28_reg <= tmp_1_reg_90_pp0_iter27_reg; + tmp_1_reg_90_pp0_iter29_reg <= tmp_1_reg_90_pp0_iter28_reg; + tmp_1_reg_90_pp0_iter2_reg <= tmp_1_reg_90_pp0_iter1_reg; + tmp_1_reg_90_pp0_iter30_reg <= tmp_1_reg_90_pp0_iter29_reg; + tmp_1_reg_90_pp0_iter31_reg <= tmp_1_reg_90_pp0_iter30_reg; + tmp_1_reg_90_pp0_iter32_reg <= tmp_1_reg_90_pp0_iter31_reg; + tmp_1_reg_90_pp0_iter33_reg <= tmp_1_reg_90_pp0_iter32_reg; + tmp_1_reg_90_pp0_iter34_reg <= tmp_1_reg_90_pp0_iter33_reg; + tmp_1_reg_90_pp0_iter3_reg <= tmp_1_reg_90_pp0_iter2_reg; + tmp_1_reg_90_pp0_iter4_reg <= tmp_1_reg_90_pp0_iter3_reg; + tmp_1_reg_90_pp0_iter5_reg <= tmp_1_reg_90_pp0_iter4_reg; + tmp_1_reg_90_pp0_iter6_reg <= tmp_1_reg_90_pp0_iter5_reg; + tmp_1_reg_90_pp0_iter7_reg <= tmp_1_reg_90_pp0_iter6_reg; + tmp_1_reg_90_pp0_iter8_reg <= tmp_1_reg_90_pp0_iter7_reg; + tmp_1_reg_90_pp0_iter9_reg <= tmp_1_reg_90_pp0_iter8_reg; + tmp_reg_99_pp0_iter10_reg <= tmp_reg_99_pp0_iter9_reg; + tmp_reg_99_pp0_iter11_reg <= tmp_reg_99_pp0_iter10_reg; + tmp_reg_99_pp0_iter12_reg <= tmp_reg_99_pp0_iter11_reg; + tmp_reg_99_pp0_iter13_reg <= tmp_reg_99_pp0_iter12_reg; + tmp_reg_99_pp0_iter14_reg <= tmp_reg_99_pp0_iter13_reg; + tmp_reg_99_pp0_iter15_reg <= tmp_reg_99_pp0_iter14_reg; + tmp_reg_99_pp0_iter16_reg <= tmp_reg_99_pp0_iter15_reg; 
+ tmp_reg_99_pp0_iter17_reg <= tmp_reg_99_pp0_iter16_reg; + tmp_reg_99_pp0_iter18_reg <= tmp_reg_99_pp0_iter17_reg; + tmp_reg_99_pp0_iter19_reg <= tmp_reg_99_pp0_iter18_reg; + tmp_reg_99_pp0_iter20_reg <= tmp_reg_99_pp0_iter19_reg; + tmp_reg_99_pp0_iter21_reg <= tmp_reg_99_pp0_iter20_reg; + tmp_reg_99_pp0_iter22_reg <= tmp_reg_99_pp0_iter21_reg; + tmp_reg_99_pp0_iter23_reg <= tmp_reg_99_pp0_iter22_reg; + tmp_reg_99_pp0_iter24_reg <= tmp_reg_99_pp0_iter23_reg; + tmp_reg_99_pp0_iter25_reg <= tmp_reg_99_pp0_iter24_reg; + tmp_reg_99_pp0_iter26_reg <= tmp_reg_99_pp0_iter25_reg; + tmp_reg_99_pp0_iter27_reg <= tmp_reg_99_pp0_iter26_reg; + tmp_reg_99_pp0_iter28_reg <= tmp_reg_99_pp0_iter27_reg; + tmp_reg_99_pp0_iter29_reg <= tmp_reg_99_pp0_iter28_reg; + tmp_reg_99_pp0_iter2_reg <= tmp_reg_99_pp0_iter1_reg; + tmp_reg_99_pp0_iter30_reg <= tmp_reg_99_pp0_iter29_reg; + tmp_reg_99_pp0_iter31_reg <= tmp_reg_99_pp0_iter30_reg; + tmp_reg_99_pp0_iter32_reg <= tmp_reg_99_pp0_iter31_reg; + tmp_reg_99_pp0_iter33_reg <= tmp_reg_99_pp0_iter32_reg; + tmp_reg_99_pp0_iter34_reg <= tmp_reg_99_pp0_iter33_reg; + tmp_reg_99_pp0_iter3_reg <= tmp_reg_99_pp0_iter2_reg; + tmp_reg_99_pp0_iter4_reg <= tmp_reg_99_pp0_iter3_reg; + tmp_reg_99_pp0_iter5_reg <= tmp_reg_99_pp0_iter4_reg; + tmp_reg_99_pp0_iter6_reg <= tmp_reg_99_pp0_iter5_reg; + tmp_reg_99_pp0_iter7_reg <= tmp_reg_99_pp0_iter6_reg; + tmp_reg_99_pp0_iter8_reg <= tmp_reg_99_pp0_iter7_reg; + tmp_reg_99_pp0_iter9_reg <= tmp_reg_99_pp0_iter8_reg; + end + end + + always @(posedge ap_clk) begin + if (((1'b1 == ap_CS_fsm_pp0_stage0) & (1'b0 == ap_block_pp0_stage0_11001))) begin + tmp_1_reg_90_pp0_iter1_reg <= tmp_1_reg_90; + tmp_reg_99 <= tmp_nbreadreq_fu_32_p3; + tmp_reg_99_pp0_iter1_reg <= tmp_reg_99; + end + end + + always @(*) begin + if (((1'b0 == ap_block_pp0_stage0_subdone) & (ap_enable_reg_pp0_iter35 == 1'b1))) begin + ap_done = 1'b1; + end else begin + ap_done = 1'b0; + end + end + + always @(*) begin + if (((ap_start == 1'b0) & (1'b1 == 
ap_CS_fsm_pp0_stage0) & (ap_idle_pp0 == 1'b1))) begin + ap_idle = 1'b1; + end else begin + ap_idle = 1'b0; + end + end + + always @(*) begin + if (((ap_enable_reg_pp0_iter26 == 1'b0) & (ap_enable_reg_pp0_iter25 == 1'b0) & (ap_enable_reg_pp0_iter24 == 1'b0) & (ap_enable_reg_pp0_iter23 == 1'b0) & (ap_enable_reg_pp0_iter22 == 1'b0) & (ap_enable_reg_pp0_iter21 == 1'b0) & (ap_enable_reg_pp0_iter20 == 1'b0) & (ap_enable_reg_pp0_iter19 == 1'b0) & (ap_enable_reg_pp0_iter18 == 1'b0) & (ap_enable_reg_pp0_iter17 == 1'b0) & (ap_enable_reg_pp0_iter16 == 1'b0) & (ap_enable_reg_pp0_iter15 == 1'b0) & (ap_enable_reg_pp0_iter14 == 1'b0) & (ap_enable_reg_pp0_iter13 == 1'b0) & (ap_enable_reg_pp0_iter12 == 1'b0) & (ap_enable_reg_pp0_iter11 == 1'b0) & (ap_enable_reg_pp0_iter10 == 1'b0) & (ap_enable_reg_pp0_iter9 == 1'b0) & (ap_enable_reg_pp0_iter8 == 1'b0) & (ap_enable_reg_pp0_iter7 == 1'b0) & (ap_enable_reg_pp0_iter6 == 1'b0) & (ap_enable_reg_pp0_iter5 == 1'b0) & (ap_enable_reg_pp0_iter4 == 1'b0) & (ap_enable_reg_pp0_iter3 == 1'b0) & (ap_enable_reg_pp0_iter2 == 1'b0) & (ap_enable_reg_pp0_iter1 == 1'b0) & (ap_enable_reg_pp0_iter0 == 1'b0) & (ap_enable_reg_pp0_iter35 == 1'b0) + & (ap_enable_reg_pp0_iter34 == 1'b0) & (ap_enable_reg_pp0_iter33 == 1'b0) & (ap_enable_reg_pp0_iter32 == 1'b0) & (ap_enable_reg_pp0_iter31 == 1'b0) & (ap_enable_reg_pp0_iter30 == 1'b0) & (ap_enable_reg_pp0_iter29 == 1'b0) & (ap_enable_reg_pp0_iter28 == 1'b0) & (ap_enable_reg_pp0_iter27 == 1'b0))) begin + ap_idle_pp0 = 1'b1; + end else begin + ap_idle_pp0 = 1'b0; + end + end + + always @(*) begin + if (((ap_enable_reg_pp0_iter26 == 1'b0) & (ap_enable_reg_pp0_iter25 == 1'b0) & (ap_enable_reg_pp0_iter24 == 1'b0) & (ap_enable_reg_pp0_iter23 == 1'b0) & (ap_enable_reg_pp0_iter22 == 1'b0) & (ap_enable_reg_pp0_iter21 == 1'b0) & (ap_enable_reg_pp0_iter20 == 1'b0) & (ap_enable_reg_pp0_iter19 == 1'b0) & (ap_enable_reg_pp0_iter18 == 1'b0) & (ap_enable_reg_pp0_iter17 == 1'b0) & (ap_enable_reg_pp0_iter16 == 1'b0) & 
(ap_enable_reg_pp0_iter15 == 1'b0) & (ap_enable_reg_pp0_iter14 == 1'b0) & (ap_enable_reg_pp0_iter13 == 1'b0) & (ap_enable_reg_pp0_iter12 == 1'b0) & (ap_enable_reg_pp0_iter11 == 1'b0) & (ap_enable_reg_pp0_iter10 == 1'b0) & (ap_enable_reg_pp0_iter9 == 1'b0) & (ap_enable_reg_pp0_iter8 == 1'b0) & (ap_enable_reg_pp0_iter7 == 1'b0) & (ap_enable_reg_pp0_iter6 == 1'b0) & (ap_enable_reg_pp0_iter5 == 1'b0) & (ap_enable_reg_pp0_iter4 == 1'b0) & (ap_enable_reg_pp0_iter3 == 1'b0) & (ap_enable_reg_pp0_iter2 == 1'b0) & (ap_enable_reg_pp0_iter1 == 1'b0) & (ap_enable_reg_pp0_iter0 == 1'b0) & (ap_enable_reg_pp0_iter34 == 1'b0) + & (ap_enable_reg_pp0_iter33 == 1'b0) & (ap_enable_reg_pp0_iter32 == 1'b0) & (ap_enable_reg_pp0_iter31 == 1'b0) & (ap_enable_reg_pp0_iter30 == 1'b0) & (ap_enable_reg_pp0_iter29 == 1'b0) & (ap_enable_reg_pp0_iter28 == 1'b0) & (ap_enable_reg_pp0_iter27 == 1'b0))) begin + ap_idle_pp0_0to34 = 1'b1; + end else begin + ap_idle_pp0_0to34 = 1'b0; + end + end + + always @(*) begin + if (((ap_enable_reg_pp0_iter0 == 1'b1) & (1'b1 == ap_CS_fsm_pp0_stage0) & (1'b0 == ap_block_pp0_stage0_subdone))) begin + ap_ready = 1'b1; + end else begin + ap_ready = 1'b0; + end + end + + always @(*) begin + if (((ap_start == 1'b0) & (ap_idle_pp0_0to34 == 1'b1))) begin + ap_reset_idle_pp0 = 1'b1; + end else begin + ap_reset_idle_pp0 = 1'b0; + end + end + + always @(*) begin + if (((ap_enable_reg_pp0_iter0 == 1'b1) & (1'b1 == ap_CS_fsm_pp0_stage0) & (tmp_1_nbreadreq_fu_40_p3 == 1'd1) & (tmp_nbreadreq_fu_32_p3 == 1'd1) & (data_in_0_empty_n == 1'b1) & (1'b0 == ap_block_pp0_stage0_11001))) begin + data_in_0_read = 1'b1; + end else begin + data_in_0_read = 1'b0; + end + end + + always @(*) begin + if (((ap_enable_reg_pp0_iter0 == 1'b1) & (1'b1 == ap_CS_fsm_pp0_stage0) & (tmp_1_nbreadreq_fu_40_p3 == 1'd1) & (tmp_nbreadreq_fu_32_p3 == 1'd1) & (data_in_1_empty_n == 1'b1) & (1'b0 == ap_block_pp0_stage0_11001))) begin + data_in_1_read = 1'b1; + end else begin + data_in_1_read = 1'b0; + end + end 
+ + always @(*) begin + if (((tmp_reg_99_pp0_iter34_reg == 1'd1) & (tmp_1_reg_90_pp0_iter34_reg == 1'd1) & (data_out_0_full_n == 1'b1) & (1'b0 == ap_block_pp0_stage0_11001) & (ap_enable_reg_pp0_iter35 == 1'b1))) begin + data_out_0_write = 1'b1; + end else begin + data_out_0_write = 1'b0; + end + end + + always @(*) begin + case (ap_CS_fsm) + ap_ST_fsm_pp0_stage0: begin + ap_NS_fsm = ap_ST_fsm_pp0_stage0; + end + default: begin + ap_NS_fsm = 'bx; + end + endcase + end + + assign ap_CS_fsm_pp0_stage0 = ap_CS_fsm[32'd0]; + + assign ap_block_pp0_stage0 = ~(1'b1 == 1'b1); + + assign ap_block_pp0_stage0_01001 = ~(1'b1 == 1'b1); + + assign ap_block_pp0_stage0_11001 = ~(1'b1 == 1'b1); + + assign ap_block_pp0_stage0_subdone = ~(1'b1 == 1'b1); + + assign ap_block_state10_pp0_stage0_iter9 = ~(1'b1 == 1'b1); + + assign ap_block_state11_pp0_stage0_iter10 = ~(1'b1 == 1'b1); + + assign ap_block_state12_pp0_stage0_iter11 = ~(1'b1 == 1'b1); + + assign ap_block_state13_pp0_stage0_iter12 = ~(1'b1 == 1'b1); + + assign ap_block_state14_pp0_stage0_iter13 = ~(1'b1 == 1'b1); + + assign ap_block_state15_pp0_stage0_iter14 = ~(1'b1 == 1'b1); + + assign ap_block_state16_pp0_stage0_iter15 = ~(1'b1 == 1'b1); + + assign ap_block_state17_pp0_stage0_iter16 = ~(1'b1 == 1'b1); + + assign ap_block_state18_pp0_stage0_iter17 = ~(1'b1 == 1'b1); + + assign ap_block_state19_pp0_stage0_iter18 = ~(1'b1 == 1'b1); + + assign ap_block_state1_pp0_stage0_iter0 = ~(1'b1 == 1'b1); + + assign ap_block_state20_pp0_stage0_iter19 = ~(1'b1 == 1'b1); + + assign ap_block_state21_pp0_stage0_iter20 = ~(1'b1 == 1'b1); + + assign ap_block_state22_pp0_stage0_iter21 = ~(1'b1 == 1'b1); + + assign ap_block_state23_pp0_stage0_iter22 = ~(1'b1 == 1'b1); + + assign ap_block_state24_pp0_stage0_iter23 = ~(1'b1 == 1'b1); + + assign ap_block_state25_pp0_stage0_iter24 = ~(1'b1 == 1'b1); + + assign ap_block_state26_pp0_stage0_iter25 = ~(1'b1 == 1'b1); + + assign ap_block_state27_pp0_stage0_iter26 = ~(1'b1 == 1'b1); + + assign 
ap_block_state28_pp0_stage0_iter27 = ~(1'b1 == 1'b1); + + assign ap_block_state29_pp0_stage0_iter28 = ~(1'b1 == 1'b1); + + assign ap_block_state2_pp0_stage0_iter1 = ~(1'b1 == 1'b1); + + assign ap_block_state30_pp0_stage0_iter29 = ~(1'b1 == 1'b1); + + assign ap_block_state31_pp0_stage0_iter30 = ~(1'b1 == 1'b1); + + assign ap_block_state32_pp0_stage0_iter31 = ~(1'b1 == 1'b1); + + assign ap_block_state33_pp0_stage0_iter32 = ~(1'b1 == 1'b1); + + assign ap_block_state34_pp0_stage0_iter33 = ~(1'b1 == 1'b1); + + assign ap_block_state35_pp0_stage0_iter34 = ~(1'b1 == 1'b1); + + assign ap_block_state36_pp0_stage0_iter35 = ~(1'b1 == 1'b1); + + assign ap_block_state3_pp0_stage0_iter2 = ~(1'b1 == 1'b1); + + assign ap_block_state4_pp0_stage0_iter3 = ~(1'b1 == 1'b1); + + assign ap_block_state5_pp0_stage0_iter4 = ~(1'b1 == 1'b1); + + assign ap_block_state6_pp0_stage0_iter5 = ~(1'b1 == 1'b1); + + assign ap_block_state7_pp0_stage0_iter6 = ~(1'b1 == 1'b1); + + assign ap_block_state8_pp0_stage0_iter7 = ~(1'b1 == 1'b1); + + assign ap_block_state9_pp0_stage0_iter8 = ~(1'b1 == 1'b1); + + assign ap_enable_pp0 = (ap_idle_pp0 ^ 1'b1); + + assign ap_enable_reg_pp0_iter0 = ap_start; + + assign data_out_0_din = grp_fu_75_p2[15:0]; + + assign tmp_1_nbreadreq_fu_40_p3 = data_in_1_empty_n; + + assign tmp_nbreadreq_fu_32_p3 = data_in_0_empty_n; + +endmodule //div +// ============================================================== +// Generated by Vitis HLS v2023.1 +// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
+// ============================================================== +`timescale 1 ns / 1 ps + +module div_sdiv_32ns_32ns_16_36_1_divider #( + parameter in0_WIDTH = 32, + in1_WIDTH = 32, + out_WIDTH = 32 +) ( + input clk, + input reset, + input ce, + input [in0_WIDTH-1:0] dividend, + input [in1_WIDTH-1:0] divisor, + input [ 1:0] sign_i, + output wire [ 1:0] sign_o, + output wire [out_WIDTH-1:0] quot, + output wire [out_WIDTH-1:0] remd +); + + localparam cal_WIDTH = (in0_WIDTH > in1_WIDTH) ? in0_WIDTH : in1_WIDTH; + + //------------------------Local signal------------------- + reg [in0_WIDTH-1:0] dividend_tmp[ 0:in0_WIDTH]; + reg [in1_WIDTH-1:0] divisor_tmp [ 0:in0_WIDTH]; + reg [in0_WIDTH-1:0] remd_tmp [ 0:in0_WIDTH]; + wire [in0_WIDTH-1:0] comb_tmp [0:in0_WIDTH-1]; + wire [ cal_WIDTH:0] cal_tmp [0:in0_WIDTH-1]; + reg [ 1:0] sign_tmp [ 0:in0_WIDTH]; + //------------------------Body--------------------------- + assign quot = dividend_tmp[in0_WIDTH]; + assign remd = remd_tmp[in0_WIDTH]; + assign sign_o = sign_tmp[in0_WIDTH]; + + // dividend_tmp[0], divisor_tmp[0], remd_tmp[0] + always @(posedge clk) begin + if (ce) begin + dividend_tmp[0] <= dividend; + divisor_tmp[0] <= divisor; + sign_tmp[0] <= sign_i; + remd_tmp[0] <= 1'b0; + end + end + + genvar i; + generate + for (i = 0; i < in0_WIDTH; i = i + 1) begin : loop + if (in0_WIDTH == 1) assign comb_tmp[i] = dividend_tmp[i][0]; + else assign comb_tmp[i] = {remd_tmp[i][in0_WIDTH-2:0], dividend_tmp[i][in0_WIDTH-1]}; + assign cal_tmp[i] = {1'b0, comb_tmp[i]} - {1'b0, divisor_tmp[i]}; + + always @(posedge clk) begin + if (ce) begin + if (in0_WIDTH == 1) dividend_tmp[i+1] <= ~cal_tmp[i][cal_WIDTH]; + else dividend_tmp[i+1] <= {dividend_tmp[i][in0_WIDTH-2:0], ~cal_tmp[i][cal_WIDTH]}; + divisor_tmp[i+1] <= divisor_tmp[i]; + remd_tmp[i+1] <= cal_tmp[i][cal_WIDTH] ? 
comb_tmp[i] : cal_tmp[i][in0_WIDTH-1:0]; + sign_tmp[i+1] <= sign_tmp[i]; + end + end + end + endgenerate + +endmodule + +module div_sdiv_32ns_32ns_16_36_1 #( + parameter ID = 1, + NUM_STAGE = 2, + din0_WIDTH = 32, + din1_WIDTH = 32, + dout_WIDTH = 32 +) ( + input clk, + input reset, + input ce, + input [din0_WIDTH-1:0] din0, + input [din1_WIDTH-1:0] din1, + output [dout_WIDTH-1:0] dout +); + //------------------------Local signal------------------- + reg [din0_WIDTH-1:0] dividend0; + reg [din1_WIDTH-1:0] divisor0; + wire [din0_WIDTH-1:0] dividend_u; + wire [din1_WIDTH-1:0] divisor_u; + wire [dout_WIDTH-1:0] quot_u; + wire [dout_WIDTH-1:0] remd_u; + reg [dout_WIDTH-1:0] quot; + reg [dout_WIDTH-1:0] remd; + wire [ 1:0] sign_i; + wire [ 1:0] sign_o; + //------------------------Instantiation------------------ + div_sdiv_32ns_32ns_16_36_1_divider #( + .in0_WIDTH(din0_WIDTH), + .in1_WIDTH(din1_WIDTH), + .out_WIDTH(dout_WIDTH) + ) div_sdiv_32ns_32ns_16_36_1_divider_u ( + .clk (clk), + .reset (reset), + .ce (ce), + .dividend(dividend_u), + .divisor (divisor_u), + .sign_i (sign_i), + .sign_o (sign_o), + .quot (quot_u), + .remd (remd_u) + ); + //------------------------Body--------------------------- + assign sign_i = {dividend0[din0_WIDTH-1] ^ divisor0[din1_WIDTH-1], dividend0[din0_WIDTH-1]}; + assign dividend_u = dividend0[din0_WIDTH-1]? ~dividend0[din0_WIDTH-1:0] + 1'b1 : + dividend0[din0_WIDTH-1:0]; + assign divisor_u = divisor0[din1_WIDTH-1]? 
~divisor0[din1_WIDTH-1:0] + 1'b1 : + divisor0[din1_WIDTH-1:0]; + + always @(posedge clk) begin + if (ce) begin + dividend0 <= din0; + divisor0 <= din1; + end + end + + always @(posedge clk) begin + if (ce) begin + if (sign_o[1]) quot <= ~quot_u + 1'b1; + else quot <= quot_u; + end + end + + always @(posedge clk) begin + if (ce) begin + if (sign_o[0]) remd <= ~remd_u + 1'b1; + else remd <= remd_u; + end + end + + assign dout = quot; + +endmodule + + diff --git a/src/mase_components/scalar_operators/fixed/rtl/int_div.sv b/src/mase_components/scalar_operators/fixed/rtl/int_div.sv new file mode 100644 index 000000000..9d0512f47 --- /dev/null +++ b/src/mase_components/scalar_operators/fixed/rtl/int_div.sv @@ -0,0 +1,74 @@ +`timescale 1 ns / 1 ps +module int_div #( + parameter IN_NUM = 8, + parameter FIFO_DEPTH = 8, + parameter DIVIDEND_WIDTH = 8, + parameter DIVISOR_WIDTH = 8, + parameter QUOTIENT_WIDTH = 8 +) ( + input logic clk, + input logic rst, + input logic [DIVIDEND_WIDTH-1:0] dividend_data[IN_NUM - 1:0], + input logic dividend_data_valid, + output logic dividend_data_ready, + input logic [DIVISOR_WIDTH-1:0] divisor_data[IN_NUM - 1:0], + input logic divisor_data_valid, + output logic divisor_data_ready, + output logic [QUOTIENT_WIDTH-1:0] quotient_data[IN_NUM - 1:0], + output logic quotient_data_valid, + input logic quotient_data_ready +); + + // Add signals for skid buffers + logic [DIVIDEND_WIDTH-1:0] dividend_data_skid[IN_NUM - 1:0]; + logic dividend_valid_skid; + logic dividend_ready_skid; + + logic [DIVISOR_WIDTH-1:0] divisor_data_skid[IN_NUM - 1:0]; + logic divisor_valid_skid; + logic divisor_ready_skid; + + // Replace skid buffers with unpacked versions + unpacked_skid_buffer #( + .DATA_WIDTH(DIVIDEND_WIDTH), + .IN_NUM(IN_NUM) + ) dividend_skid ( + .clk(clk), + .rst(rst), + .data_in(dividend_data), + .data_in_valid(dividend_data_valid), + .data_in_ready(dividend_data_ready), + .data_out(dividend_data_skid), + .data_out_valid(dividend_valid_skid), + 
.data_out_ready(dividend_ready_skid) + ); + + unpacked_skid_buffer #( + .DATA_WIDTH(DIVISOR_WIDTH), + .IN_NUM(IN_NUM) + ) divisor_skid ( + .clk(clk), + .rst(rst), + .data_in(divisor_data), + .data_in_valid(divisor_data_valid), + .data_in_ready(divisor_data_ready), + .data_out(divisor_data_skid), + .data_out_valid(divisor_valid_skid), + .data_out_ready(divisor_ready_skid) + ); + + // Update join2 instance to use skid buffer outputs + join2 #( + ) join2_inst ( + .data_in_valid({dividend_valid_skid, divisor_valid_skid}), + .data_in_ready({dividend_ready_skid, divisor_ready_skid}), + .data_out_valid(quotient_data_valid), + .data_out_ready(quotient_data_ready) + ); + + // Update division operation to use skid buffer outputs + for(genvar i = 0; i < IN_NUM; i++) begin + assign quotient_data[i] = dividend_data_skid[i] / divisor_data_skid[i]; + end + +endmodule \ No newline at end of file diff --git a/src/mase_components/scalar_operators/fixed/test/fixed_div_tb.py b/src/mase_components/scalar_operators/fixed/test/fixed_div_tb.py new file mode 100644 index 000000000..0a04d6eb3 --- /dev/null +++ b/src/mase_components/scalar_operators/fixed/test/fixed_div_tb.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 + +import os +import pytest + +import torch +import logging +from functools import partial +from src.mase_components.helper import generate_memory +from pathlib import Path +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import StreamDriver, StreamMonitor +from mase_cocotb.utils import bit_driver +from mase_cocotb.runner import mase_runner + +from mase_cocotb.z_qlayers import quantize_to_int as q2i + + +class FixedDivTB(Testbench): + def __init__(self, dut) -> None: + super().__init__(dut, dut.clk, dut.rst) + + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + self.log.setLevel(logging.DEBUG) + + self.dividend_driver = 
StreamDriver( + dut.clk, dut.dividend_data, dut.dividend_data_valid, dut.dividend_data_ready + ) + self.divisor_driver = StreamDriver( + dut.clk, dut.divisor_data, dut.divisor_data_valid, dut.divisor_data_ready + ) + + self.quotient_monitor = StreamMonitor( + dut.clk, + dut.quotient_data, + dut.quotient_data_valid, + dut.quotient_data_ready, + check=True, + ) + + # Set verbosity of driver and monitor loggers to debug + self.dividend_driver.log.setLevel(logging.DEBUG) + self.divisor_driver.log.setLevel(logging.DEBUG) + self.quotient_monitor.log.setLevel(logging.DEBUG) + + def generate_inputs(self): + return torch.randint( + low=1, + high=100, + size=( + 1, + self.get_parameter("IN_NUM"), + ), + ) + + async def run_test(self, batches, us): + await self.reset() + self.log.info(f"Reset finished") + for _ in range(batches): + dividend = self.generate_inputs() + # * Load the inputs driver + self.log.info(f"Processing dividend: {dividend}") + qdividend = dividend + self.dividend_driver.load_driver(qdividend.tolist()) + + divisor = self.generate_inputs() + qdivisor = divisor + # breakpoint() + self.log.info(f"Processing divisor: {divisor}") + self.divisor_driver.load_driver(qdivisor.tolist()) + safe_divisor = torch.where(divisor == 0, torch.tensor(0.000001), divisor) + result = dividend // safe_divisor + qresult = result + + self.log.info(f"Processing outputs: {result}") + self.quotient_monitor.load_monitor(qresult.tolist()) + + await Timer(us, units="us") + assert self.quotient_monitor.exp_queue.empty() + + +# @cocotb.test() +# async def single_test(dut): +# tb = FixedDivTB(dut) +# tb.quotient_monitor.ready.value = 1 +# await tb.run_test(batches=1, us=100) + + +# @cocotb.test() +# async def repeated_test(dut): +# tb = FixedDivTB(dut) +# tb.quotient_monitor.ready.value = 1 +# await tb.run_test(batches=100, us=200) + + +@cocotb.test() +async def repeated_backpressure(dut): + tb = FixedDivTB(dut) + tb.dividend_driver.set_valid_prob(0.2) + 
tb.divisor_driver.set_valid_prob(0.2) + # tb.quotient_monitor.ready.value = 1 + + cocotb.start_soon(bit_driver(dut.quotient_data_ready, dut.clk, 0.5)) + await tb.run_test(batches=10, us=200) + + +dut_params = { + "IN_NUM": 1, + "DIVIDEND_WIDTH": 13, + "DIVISOR_WIDTH": 20, + "QUOTIENT_WIDTH": 9, +} + + +def get_fixed_div_config(kwargs={}): + config = dut_params + config.update(kwargs) + return config + + +torch.manual_seed(1) + + +@pytest.mark.dev +def test_fixed_div_smoke(): + """ + Some quick tests to check if the module is working. + """ + + mase_runner( + trace=True, + module_param_list=[ + get_fixed_div_config(), + ], + # skip_build=True, + ) + + +if __name__ == "__main__": + test_fixed_div_smoke() diff --git a/src/mase_components/scalar_operators/fixed/test/test_synth_fixed_math.py b/src/mase_components/scalar_operators/fixed/test/test_synth_fixed_math.py index 4855fc8e7..677e53cfb 100644 --- a/src/mase_components/scalar_operators/fixed/test/test_synth_fixed_math.py +++ b/src/mase_components/scalar_operators/fixed/test/test_synth_fixed_math.py @@ -1,10 +1,14 @@ import pytest from mase_components.synth_runner import run_synth +import logging + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(f"linter") @pytest.mark.vivado def test_synth_fixed_math(): - run_synth("fixed_math") + run_synth("scalar_operators/fixed", "fixed_div.sv") if __name__ == "__main__": diff --git a/src/mase_components/synth_runner.py b/src/mase_components/synth_runner.py index 3381d016c..40d3de2cf 100644 --- a/src/mase_components/synth_runner.py +++ b/src/mase_components/synth_runner.py @@ -14,83 +14,91 @@ def generate_tcl_script(group, module_name, include_groups, synth_project_path): os.makedirs(synth_project_path, exist_ok=True) tcl_script_template = f""" -set_param board.repoPaths {{{str(Path.home())}/shared/board-files}} -create_project synth_project_{group}_{module_name} {synth_project_path} -part xcu280-fsvh2892-2L-e -set_property board_part 
xilinx.com:au280:part0:1.1 [current_project] +# set_param board.repoPaths {{{str(Path.home())}/shared/board-files}} +create_project -force synth_project_{group}_{module_name} {synth_project_path} -part xcu280-fsvh2892-2L-e """ for include_group in include_groups: tcl_script_template += f"""\nadd_files {include_group}""" tcl_script_template += f"\n\nset_property top {module_name} [current_fileset]" - tcl_script_template += """ + tcl_script_template += f""" +add_files {COMPONENTS_PATH/"vivado/constraints.xdc"} +read_xdc {COMPONENTS_PATH/"vivado/constraints.xdc"} update_compile_order -fileset sources_1 launch_runs synth_1 -wait_on_runs synth_1 +wait_on_run synth_1 + +launch_runs impl_1 +wait_on_run impl_1 +report_timing_summary -file timing_summary.txt +report_timing -file detailed_timing.txt """ with open(f"{synth_project_path}/build.tcl", "w") as file: file.write(tcl_script_template) -def run_synth(group): +def run_synth(group, specified_name=None): comp_path = COMPONENTS_PATH / group / "rtl" rtl_files = [ file for file in os.listdir(comp_path) if file.endswith(".sv") or file.endswith(".v") ] - successes = [] failures = [] for rtl_file in rtl_files: + if specified_name != None: + if rtl_file != specified_name: + continue file_path = comp_path / rtl_file logger.info(f"Synthesizing {file_path}") logger.info(f"----------------------------") module_name = rtl_file.replace(".sv", "") module_path = f"{group}/{module_name}" - if module_path not in MASE_HW_DEPS.keys(): logger.warning( f"Module {module_path} is not included in dependencies file." 
) # * List include files + # include_groups = [ + # f"{COMPONENTS_PATH / group / 'rtl'}" + # for group in mase_components.get_modules() + # if group != "vivado" + # ] include_groups = [ - f"{COMPONENTS_PATH / group / 'rtl'}" - for group in mase_components.get_modules() - if group != "vivado" + f"{COMPONENTS_PATH / group / 'rtl'}" for group in MASE_HW_DEPS[module_path] ] - synth_project_path = ( - f"{COMPONENTS_PATH}/{group}/synth/synth_project_{group}_{module_name}" - ) + synth_project_path = f"{COMPONENTS_PATH}/{group}/synth/" logger.debug(f"Include files: {include_groups}") logger.info(f"Generating build TCL script for module: {module_path}") generate_tcl_script(group, module_name, include_groups, synth_project_path) - logger.info(f"Launching Vivado synthesis for module: {module_path}") - cmd = [ - "vivado", - "-mode", - "batch", - "-log", - f"{synth_project_path}/vivado.log", - "-source", - f"{synth_project_path}/build.tcl", - ] - result = subprocess.run(cmd, capture_output=True, text=True) - - # * Process result - if result.stderr == "": - successes.append(rtl_file) - else: - logger.error(result.stderr) - failures.append(rtl_file) + # logger.info(f"Launching Vivado synthesis for module: {module_path}") + # cmd = [ + # "vivado", + # "-mode", + # "batch", + # "-log", + # f"{synth_project_path}/vivado.log", + # "-source", + # f"{synth_project_path}/build.tcl", + # ] + # result = subprocess.run(cmd, capture_output=True, text=True) + + # # * Process result + # if result.stderr == "": + # successes.append(rtl_file) + # else: + # logger.error(result.stderr) + # failures.append(rtl_file) # * Print summary logger.info(f"=========== SUMMARY ===========") diff --git a/src/mase_components/transformer_layers/rtl/fixed_self_attention.sv b/src/mase_components/transformer_layers/rtl/fixed_self_attention.sv index a54aec306..3b784d078 100644 --- a/src/mase_components/transformer_layers/rtl/fixed_self_attention.sv +++ 
b/src/mase_components/transformer_layers/rtl/fixed_self_attention.sv @@ -223,6 +223,7 @@ module fixed_self_attention #( .IN_DATA_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), .IN_DATA_PRECISION_0 (DATA_OUT_0_PRECISION_0), .IN_DATA_PRECISION_1 (DATA_OUT_0_PRECISION_1), + .ACTIVATION (ACTIVATION), .OUT_DATA_TENSOR_SIZE_DIM_0(DATA_OUT_0_TENSOR_SIZE_DIM_0 / NUM_HEADS), .OUT_DATA_TENSOR_SIZE_DIM_1(DATA_OUT_0_TENSOR_SIZE_DIM_1), diff --git a/src/mase_components/transformer_layers/rtl/fixed_self_attention_head.sv b/src/mase_components/transformer_layers/rtl/fixed_self_attention_head.sv index 18dfccfea..c2969517d 100644 --- a/src/mase_components/transformer_layers/rtl/fixed_self_attention_head.sv +++ b/src/mase_components/transformer_layers/rtl/fixed_self_attention_head.sv @@ -11,6 +11,7 @@ module fixed_self_attention_head #( parameter IN_DATA_PRECISION_1 = 3, // * Output tokens are casted to requested precision + parameter ACTIVATION = 0, parameter OUT_DATA_TENSOR_SIZE_DIM_0 = 64, parameter OUT_DATA_TENSOR_SIZE_DIM_1 = 32, parameter OUT_DATA_PARALLELISM_DIM_0 = IN_DATA_PARALLELISM_DIM_0, @@ -212,34 +213,58 @@ module fixed_self_attention_head #( // ! 
TO DO: normalize query_key_transpose // * Attention scores: softmax(Query x Key^T) - - fixed_softermax #( - .DATA_IN_0_PRECISION_0 (OUT_DATA_PRECISION_0), - .DATA_IN_0_PRECISION_1 (OUT_DATA_PRECISION_1), - .DATA_IN_0_TENSOR_SIZE_DIM_0(IN_DATA_TENSOR_SIZE_DIM_1), - .DATA_IN_0_TENSOR_SIZE_DIM_1(IN_DATA_TENSOR_SIZE_DIM_1), - .DATA_IN_0_PARALLELISM_DIM_0(IN_DATA_PARALLELISM_DIM_1), - .DATA_IN_0_PARALLELISM_DIM_1(IN_DATA_PARALLELISM_DIM_1), - - .DATA_OUT_0_PRECISION_0 (OUT_DATA_PRECISION_0), - .DATA_OUT_0_PRECISION_1 (OUT_DATA_PRECISION_1), - .DATA_OUT_0_TENSOR_SIZE_DIM_0(IN_DATA_TENSOR_SIZE_DIM_1), - .DATA_OUT_0_TENSOR_SIZE_DIM_1(IN_DATA_TENSOR_SIZE_DIM_1), - .DATA_OUT_0_PARALLELISM_DIM_0(IN_DATA_PARALLELISM_DIM_1), - .DATA_OUT_0_PARALLELISM_DIM_1(IN_DATA_PARALLELISM_DIM_1) - - ) fixed_softermax_i ( - .clk, - .rst, - - .data_in_0 (query_key_transpose), - .data_in_0_valid(query_key_transpose_valid), - .data_in_0_ready(query_key_transpose_ready), - - .data_out_0 (attention_scores), - .data_out_0_valid(attention_scores_valid), - .data_out_0_ready(attention_scores_ready) - ); + if (ACTIVATION == 0) begin + fixed_softermax #( + .DATA_IN_0_PRECISION_0 (OUT_DATA_PRECISION_0), + .DATA_IN_0_PRECISION_1 (OUT_DATA_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(IN_DATA_TENSOR_SIZE_DIM_1), + .DATA_IN_0_TENSOR_SIZE_DIM_1(IN_DATA_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(IN_DATA_PARALLELISM_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_1(IN_DATA_PARALLELISM_DIM_1), + + .DATA_OUT_0_PRECISION_0 (OUT_DATA_PRECISION_0), + .DATA_OUT_0_PRECISION_1 (OUT_DATA_PRECISION_1), + .DATA_OUT_0_TENSOR_SIZE_DIM_0(IN_DATA_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_TENSOR_SIZE_DIM_1(IN_DATA_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_PARALLELISM_DIM_0(IN_DATA_PARALLELISM_DIM_1), + .DATA_OUT_0_PARALLELISM_DIM_1(IN_DATA_PARALLELISM_DIM_1) + ) fixed_softermax_i ( + .clk, + .rst, + + .data_in_0 (query_key_transpose), + .data_in_0_valid(query_key_transpose_valid), + .data_in_0_ready(query_key_transpose_ready), + + .data_out_0 
(attention_scores), + .data_out_0_valid(attention_scores_valid), + .data_out_0_ready(attention_scores_ready) + ); + end else begin + fixed_softmax #( + .DATA_IN_0_PRECISION_0 (OUT_DATA_PRECISION_0), + .DATA_IN_0_PRECISION_1 (OUT_DATA_PRECISION_1), + .DATA_EXP_0_PRECISION_0 (OUT_DATA_PRECISION_0), + .DATA_EXP_0_PRECISION_1 (OUT_DATA_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(IN_DATA_TENSOR_SIZE_DIM_1), + .DATA_IN_0_TENSOR_SIZE_DIM_1(IN_DATA_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(IN_DATA_PARALLELISM_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_1(IN_DATA_PARALLELISM_DIM_1), + .DATA_OUT_0_PRECISION_0 (OUT_DATA_PRECISION_0), + .DATA_OUT_0_PRECISION_1 (OUT_DATA_PRECISION_1) + ) fixed_softmax_i ( + .clk, + .rst, + + .data_in_0 (query_key_transpose), + .data_in_0_valid(query_key_transpose_valid), + .data_in_0_ready(query_key_transpose_ready), + + .data_out_0 (attention_scores), + .data_out_0_valid(attention_scores_valid), + .data_out_0_ready(attention_scores_ready) + ); + end // * Output: Attention scores x Value diff --git a/src/mase_components/transformer_layers/rtl/fixed_self_attention_input_block_batched.sv b/src/mase_components/transformer_layers/rtl/fixed_self_attention_input_block_batched.sv index d81cfcc09..e084f7af7 100644 --- a/src/mase_components/transformer_layers/rtl/fixed_self_attention_input_block_batched.sv +++ b/src/mase_components/transformer_layers/rtl/fixed_self_attention_input_block_batched.sv @@ -23,9 +23,9 @@ module fixed_self_attention_input_block_batched #( parameter BIAS_PRECISION_0 = 16, parameter BIAS_PRECISION_1 = 3, - parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_0, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = (WEIGHTS_PRE_TRANSPOSED == 0)? WEIGHT_TENSOR_SIZE_DIM_1: WEIGHT_TENSOR_SIZE_DIM_0, parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, - parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_0, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = (WEIGHTS_PRE_TRANSPOSED == 0)? 
WEIGHT_PARALLELISM_DIM_1: WEIGHT_PARALLELISM_DIM_0, parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, parameter DATA_OUT_0_PRECISION_0 = 16, parameter DATA_OUT_0_PRECISION_1 = 3 @@ -69,17 +69,17 @@ module fixed_self_attention_input_block_batched #( output logic bias_value_ready, // Query - output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_query [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_0-1:0], + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_query [DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0], output logic data_out_query_valid, input logic data_out_query_ready, // Key - output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_key [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_0-1:0], + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_key [DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0], output logic data_out_key_valid, input logic data_out_key_ready, // Value - output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_value [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_0-1:0], + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_value [DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0], output logic data_out_value_valid, input logic data_out_value_ready ); @@ -116,7 +116,7 @@ module fixed_self_attention_input_block_batched #( // * Query linear - fixed_linear #( + fixed_linear_with_input_circular #( .HAS_BIAS (HAS_BIAS), .WEIGHTS_PRE_TRANSPOSED(WEIGHTS_PRE_TRANSPOSED), @@ -171,9 +171,9 @@ module fixed_self_attention_input_block_batched #( // * since the matmul for QK^T buffers K^T but streams Q matrix_fifo #( .DATA_WIDTH(DATA_OUT_0_PRECISION_0), - .DIM0 (WEIGHT_PARALLELISM_DIM_0), + .DIM0 (DATA_OUT_0_PARALLELISM_DIM_0), .DIM1 (DATA_IN_0_PARALLELISM_DIM_1), - .FIFO_SIZE (DATA_IN_0_DEPTH_DIM_1 * WEIGHT_DEPTH_DIM_0) + .FIFO_SIZE (DATA_IN_0_DEPTH_DIM_1 * DATA_OUT_0_TENSOR_SIZE_DIM_0 / DATA_OUT_0_PARALLELISM_DIM_0) ) query_buffer_i ( .clk, .rst, diff --git 
a/src/mase_components/transformer_layers/rtl/self_attention_head_gather.sv b/src/mase_components/transformer_layers/rtl/self_attention_head_gather.sv index e6be4f36e..ea7b8650f 100644 --- a/src/mase_components/transformer_layers/rtl/self_attention_head_gather.sv +++ b/src/mase_components/transformer_layers/rtl/self_attention_head_gather.sv @@ -48,6 +48,8 @@ module self_attention_head_gather #( if (block_counter[head] != BLOCKS_PER_HEAD) begin block_counter[head] <= block_counter[head] + 1'b1; + end else begin + block_counter[head] <= 1'b1; end // * Reset counter when all heads done diff --git a/src/mase_components/transformer_layers/test/fixed_self_attention_tb.py b/src/mase_components/transformer_layers/test/fixed_self_attention_tb.py index 0642aeb9f..0fe43efc4 100644 --- a/src/mase_components/transformer_layers/test/fixed_self_attention_tb.py +++ b/src/mase_components/transformer_layers/test/fixed_self_attention_tb.py @@ -11,19 +11,23 @@ from cocotb.triggers import Timer from transformers.models.bert.configuration_bert import BertConfig +from transformers.models.vit.configuration_vit import ViTConfig from mase_cocotb.testbench import Testbench from mase_cocotb.interfaces.streaming import StreamDriver, StreamMonitor from mase_cocotb.runner import mase_runner # from mase_cocotb import Testbench, StreamDriver, StreamMonitor, mase_runner -from chop.nn.quantized import BertSelfAttentionInteger, fixed_softermax - +from chop.nn.quantized import ( + BertSelfAttentionInteger, + fixed_softermax, +) +from chop.nn.quantized.functional import softmax_integer from mase_cocotb.utils import fixed_preprocess_tensor class FixedSelfAttentionTB(Testbench): - def __init__(self, dut) -> None: + def __init__(self, dut, model_name) -> None: super().__init__(dut, dut.clk, dut.rst) if not hasattr(self, "log"): @@ -68,7 +72,7 @@ def __init__(self, dut) -> None: ) # Model - self.config = BertConfig() + self.config = ViTConfig() if "vit" in model_name else BertConfig() 
self.config.hidden_size = self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0") self.config.num_attention_heads = self.get_parameter("NUM_HEADS") self.q_config = { @@ -103,6 +107,22 @@ def __init__(self, dut) -> None: "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), }, ) + else: + self.model.softmax = partial( + softmax_integer, + config={ + "data_in_width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "data_in_frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + "data_in_exp_width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "data_in_exp_frac_width": self.get_parameter( + "DATA_OUT_0_PRECISION_1" + ), + "data_in_div_frac_width": self.get_parameter( + "DATA_OUT_0_PRECISION_1" + ), + }, + dim=-1, + ) # Set verbosity of driver and monitor loggers to debug self.data_in_0_driver.log.setLevel(logging.DEBUG) @@ -203,40 +223,40 @@ async def run_test(self): @cocotb.test() async def cocotb_test(dut): - tb = FixedSelfAttentionTB(dut) + tb = FixedSelfAttentionTB(dut, "vit") await tb.run_test() def get_config(kwargs={}): config = { - "NUM_HEADS": 1, + "NUM_HEADS": 3, "ACTIVATION": 0, - "DATA_IN_0_TENSOR_SIZE_DIM_0": 4, - "DATA_IN_0_TENSOR_SIZE_DIM_1": 4, - "DATA_IN_0_PARALLELISM_DIM_0": 2, - "DATA_IN_0_PARALLELISM_DIM_1": 2, - "DATA_IN_0_PRECISION_0": 16, - "DATA_IN_0_PRECISION_1": 8, + "DATA_IN_0_TENSOR_SIZE_DIM_0": 32, + "DATA_IN_0_TENSOR_SIZE_DIM_1": 16, + "DATA_IN_0_PARALLELISM_DIM_0": 4, + "DATA_IN_0_PARALLELISM_DIM_1": 4, + "DATA_IN_0_PRECISION_0": 8, + "DATA_IN_0_PRECISION_1": 4, "WEIGHTS_PRE_TRANSPOSED": 1, - "WEIGHT_TENSOR_SIZE_DIM_0": 4, - "WEIGHT_TENSOR_SIZE_DIM_1": 4, - "WEIGHT_PARALLELISM_DIM_0": 2, - "WEIGHT_PARALLELISM_DIM_1": 2, + "WEIGHT_TENSOR_SIZE_DIM_0": 32, + "WEIGHT_TENSOR_SIZE_DIM_1": 32, + "WEIGHT_PARALLELISM_DIM_0": 4, + "WEIGHT_PARALLELISM_DIM_1": 4, "WEIGHT_PRECISION_0": 16, "WEIGHT_PRECISION_1": 8, - "HAS_BIAS": 0, - "BIAS_TENSOR_SIZE_DIM_0": 4, - "BIAS_TENSOR_SIZE_DIM_1": 4, - "BIAS_PARALLELISM_DIM_0": 2, - 
"BIAS_PARALLELISM_DIM_1": 2, + "HAS_BIAS": 1, + "BIAS_TENSOR_SIZE_DIM_0": 32, + "BIAS_TENSOR_SIZE_DIM_1": 1, + "BIAS_PARALLELISM_DIM_0": 4, + "BIAS_PARALLELISM_DIM_1": 1, "BIAS_PRECISION_0": 16, "BIAS_PRECISION_1": 8, - "DATA_OUT_0_TENSOR_SIZE_DIM_0": 4, - "DATA_OUT_0_TENSOR_SIZE_DIM_1": 4, - "DATA_OUT_0_PARALLELISM_DIM_0": 2, - "DATA_OUT_0_PARALLELISM_DIM_1": 2, - "DATA_OUT_0_PRECISION_0": 16, - "DATA_OUT_0_PRECISION_1": 8, + "DATA_OUT_0_TENSOR_SIZE_DIM_0": 32, + "DATA_OUT_0_TENSOR_SIZE_DIM_1": 16, + "DATA_OUT_0_PARALLELISM_DIM_0": 4, + "DATA_OUT_0_PARALLELISM_DIM_1": 4, + "DATA_OUT_0_PRECISION_0": 10, + "DATA_OUT_0_PRECISION_1": 4, } config.update(kwargs) return config diff --git a/src/mase_components/vision_models/vit/rtl/affine_layernorm.sv b/src/mase_components/vision_models/vit/rtl/affine_layernorm.sv deleted file mode 100644 index 975ec15f7..000000000 --- a/src/mase_components/vision_models/vit/rtl/affine_layernorm.sv +++ /dev/null @@ -1,89 +0,0 @@ -`timescale 1ns / 1ps -module affine_layernorm #( - parameter IN_WIDTH = 32, - parameter IN_FRAC_WIDTH = 0, - parameter OUT_WIDTH = 6, - parameter OUT_FRAC_WIDTH = 4, - parameter BIAS_WIDTH = 8, - parameter BIAS_FRAC_WIDTH = 4, - parameter IN_SIZE = 4 -) ( - input clk, - input rst, - - // input port for data_inivations - input [IN_WIDTH-1:0] data_in [IN_SIZE-1:0], - input data_in_valid, - output data_in_ready, - - // input port for weight - input [ IN_WIDTH-1:0] weight [IN_SIZE-1:0], - input [BIAS_WIDTH-1:0] bias [IN_SIZE-1:0], - input weight_valid, - input bias_valid, - output weight_ready, - output bias_ready, - - output [OUT_WIDTH-1:0] data_out [IN_SIZE-1:0], - output data_out_valid, - input data_out_ready -); - - localparam PROD_WIDTH = IN_WIDTH + IN_WIDTH; - localparam PROD_FRAC_WIDTH = IN_FRAC_WIDTH + IN_FRAC_WIDTH; - logic [PROD_WIDTH - 1:0] prod[IN_SIZE - 1:0]; - logic pv_valid, pv_ready; - logic [BIAS_WIDTH - 1:0] round_prod[IN_SIZE - 1:0]; - logic [BIAS_WIDTH:0] round_in[IN_SIZE - 1:0]; - logic wb_valid, 
wb_ready; - - fixed_vector_mult #( - .IN_WIDTH(IN_WIDTH), - .WEIGHT_WIDTH(IN_WIDTH), - .IN_SIZE(IN_SIZE) - ) fixed_vector_mult_inst ( - .clk(clk), - .rst(rst), - .data_in(data_in), - .data_in_valid(data_in_valid), - .data_in_ready(data_in_ready), - .weight(weight), - .weight_valid(weight_valid), - .weight_ready(weight_ready), - .data_out(prod), - .data_out_valid(pv_valid), - .data_out_ready(pv_ready) - ); - join2 #() join_inst2 ( - .data_in_ready ({pv_ready, bias_ready}), - .data_in_valid ({pv_valid, bias_valid}), - .data_out_valid(data_out_valid), - .data_out_ready(data_out_ready) - ); - - fixed_rounding #( - .IN_SIZE(IN_SIZE), - .IN_WIDTH(PROD_WIDTH), - .IN_FRAC_WIDTH(PROD_FRAC_WIDTH), - .OUT_WIDTH(BIAS_WIDTH), - .OUT_FRAC_WIDTH(BIAS_FRAC_WIDTH) - ) bias_cast ( - .data_in (prod), - .data_out(round_prod) - ); - for (genvar i = 0; i < IN_SIZE; i++) begin - assign round_in[i] = {bias[i][BIAS_WIDTH-1], bias[i]} + {round_prod[i][BIAS_WIDTH-1], round_prod[i]}; - end - - fixed_rounding #( - .IN_SIZE(IN_SIZE), - .IN_WIDTH(BIAS_WIDTH + 1), - .IN_FRAC_WIDTH(BIAS_FRAC_WIDTH), - .OUT_WIDTH(OUT_WIDTH), - .OUT_FRAC_WIDTH(OUT_FRAC_WIDTH) - ) out_cast ( - .data_in (round_in), - .data_out(data_out) - ); - -endmodule diff --git a/src/mase_components/vision_models/vit/rtl/fixed_ViT.sv b/src/mase_components/vision_models/vit/rtl/fixed_ViT.sv deleted file mode 100644 index 0135ea2f5..000000000 --- a/src/mase_components/vision_models/vit/rtl/fixed_ViT.sv +++ /dev/null @@ -1,282 +0,0 @@ -`timescale 1ns / 1ps -module fixed_ViT #( - parameter IN_WIDTH = 6, - parameter IN_FRAC_WIDTH = 1, - //patch - parameter WC_WIDTH = 4, - parameter WC_FRAC_WIDTH = 1, - parameter BC_WIDTH = 4, - parameter BC_FRAC_WIDTH = 1, - parameter CONV_OUT_WIDTH = 6, - parameter CONV_OUT_FRAC_WIDTH = 1, - //msa - parameter WQ_WIDTH = 4, - parameter WQ_FRAC_WIDTH = 1, - parameter WK_WIDTH = 4, - parameter WK_FRAC_WIDTH = 1, - parameter WV_WIDTH = 4, - parameter WV_FRAC_WIDTH = 1, - - parameter BQ_WIDTH = 4, - 
parameter BQ_FRAC_WIDTH = 1, - parameter BK_WIDTH = 4, - parameter BK_FRAC_WIDTH = 1, - parameter BV_WIDTH = 4, - parameter BV_FRAC_WIDTH = 1, - - parameter WP_WIDTH = 4, - parameter WP_FRAC_WIDTH = 1, - parameter BP_WIDTH = 4, - parameter BP_FRAC_WIDTH = 1, - - parameter DQ_WIDTH = 6, - parameter DQ_FRAC_WIDTH = 1, - parameter DK_WIDTH = 6, - parameter DK_FRAC_WIDTH = 1, - parameter DV_WIDTH = 6, - parameter DV_FRAC_WIDTH = 1, - - parameter DS_WIDTH = 6, - parameter DS_FRAC_WIDTH = 1, - parameter DZ_WIDTH = 6, - parameter DZ_FRAC_WIDTH = 1, - - parameter MSA_OUT_WIDTH = 6, - parameter MSA_OUT_FRAC_WIDTH = 1, - // mlp - - parameter WEIGHT_I2H_WIDTH = 4, - parameter WEIGHT_I2H_FRAC_WIDTH = 1, - parameter WEIGHT_H2O_WIDTH = 4, - parameter WEIGHT_H2O_FRAC_WIDTH = 1, - parameter MLP_HAS_BIAS = 1, - parameter BIAS_I2H_WIDTH = 4, - parameter BIAS_I2H_FRAC_WIDTH = 1, - parameter BIAS_H2O_WIDTH = 4, - parameter BIAS_H2O_FRAC_WIDTH = 1, - - parameter HIDDEN_WIDTH = 6, - parameter HIDDEN_FRAC_WIDTH = 1, - - parameter OUT_WIDTH = 6, - parameter OUT_FRAC_WIDTH = 1, - // conv - parameter IN_C = 3, - parameter IN_Y = 16, - parameter IN_X = 16, - - parameter OUT_C = 4, - parameter KERNEL_C = IN_C, - parameter KERNEL_SIZE = 2, - parameter KERNEL_Y = KERNEL_SIZE, - parameter KERNEL_X = KERNEL_SIZE, - - parameter PADDING_Y = KERNEL_Y / 2, - parameter PADDING_X = KERNEL_Y / 2, - - parameter UNROLL_KERNEL_OUT = 2, - parameter UNROLL_OUT_C = 2, - parameter UNROLL_IN_C = 2, - - parameter SLIDING_NUM = 73, - - parameter STRIDE = KERNEL_SIZE, - // patch embedding - // TODO: IN_NUM = SLLIDING_NUM needs to be discussed - parameter IN_NUM = SLIDING_NUM, - parameter UNROLL_IN_NUM = 1, - - parameter IN_DIM = OUT_C, - parameter UNROLL_IN_DIM = UNROLL_IN_C, - // num_heads * wqkv_dim = IN_DIM - parameter NUM_HEADS = 2, - parameter WQKV_DIM = IN_DIM / NUM_HEADS, - parameter UNROLL_WQKV_DIM = 1, - parameter WP_DIM = IN_DIM, - - // set it with unroll_in_dim for residual matching - parameter 
UNROLL_WP_DIM = UNROLL_IN_DIM, - - parameter OUT_NUM = IN_NUM, - parameter OUT_DIM = IN_DIM, - parameter UNROLL_OUT_NUM = UNROLL_IN_NUM, - parameter UNROLL_OUT_DIM = UNROLL_WP_DIM, - // get attention output after 4 * 3 cycles - // mlp - parameter IN_FEATURES = IN_DIM, - parameter HIDDEN_FEATURES = 2 * IN_FEATURES, - parameter OUT_FEATURES = IN_FEATURES, - - parameter UNROLL_IN_FEATURES = UNROLL_IN_DIM, - parameter UNROLL_HIDDEN_FEATURES = 3, - parameter UNROLL_OUT_FEATURES = UNROLL_IN_DIM -) ( - input clk, - input rst, - - input [WC_WIDTH-1:0] weight_c [UNROLL_KERNEL_OUT * UNROLL_OUT_C -1:0], - input weight_c_valid, - output weight_c_ready, - - input [BC_WIDTH-1:0] bias_c [UNROLL_OUT_C-1:0], - input bias_c_valid, - output bias_c_ready, - - input [WQ_WIDTH - 1:0] weight_q[NUM_HEADS * UNROLL_WQKV_DIM * UNROLL_IN_DIM -1 : 0], - input weight_q_valid, - output weight_q_ready, - - input [WK_WIDTH - 1:0] weight_k[NUM_HEADS * UNROLL_WQKV_DIM * UNROLL_IN_DIM -1 : 0], - input weight_k_valid, - output weight_k_ready, - - input [WV_WIDTH - 1:0] weight_v[NUM_HEADS * UNROLL_WQKV_DIM * UNROLL_IN_DIM -1 : 0], - input weight_v_valid, - output weight_v_ready, - - input [WP_WIDTH - 1:0] weight_p[UNROLL_WP_DIM * NUM_HEADS * UNROLL_WQKV_DIM -1 : 0], - input weight_p_valid, - output weight_p_ready, - - input [BQ_WIDTH - 1:0] bias_q[NUM_HEADS * UNROLL_WQKV_DIM -1 : 0], - input bias_q_valid, - output bias_q_ready, - - input [BK_WIDTH - 1:0] bias_k[NUM_HEADS * UNROLL_WQKV_DIM -1 : 0], - input bias_k_valid, - output bias_k_ready, - - input [BV_WIDTH - 1:0] bias_v[NUM_HEADS * UNROLL_WQKV_DIM -1 : 0], - input bias_v_valid, - output bias_v_ready, - - input [BP_WIDTH - 1:0] bias_p[UNROLL_WP_DIM -1 : 0], - input bias_p_valid, - output bias_p_ready, - - input [WEIGHT_I2H_WIDTH-1:0] weight_in2hidden[UNROLL_HIDDEN_FEATURES * UNROLL_IN_FEATURES - 1:0], - input weight_in2hidden_valid, - output weight_in2hidden_ready, - - input [WEIGHT_H2O_WIDTH-1:0] weight_hidden2out[UNROLL_OUT_FEATURES * 
UNROLL_HIDDEN_FEATURES - 1:0], - input weight_hidden2out_valid, - output weight_hidden2out_ready, - //input bias - input [BIAS_I2H_WIDTH-1:0] bias_in2hidden[UNROLL_HIDDEN_FEATURES - 1:0], - input bias_in2hidden_valid, - output bias_in2hidden_ready, - - input [BIAS_H2O_WIDTH-1:0] bias_hidden2out[UNROLL_OUT_FEATURES - 1:0], - input bias_hidden2out_valid, - output bias_hidden2out_ready, - - input [IN_WIDTH -1:0] data_in[UNROLL_IN_NUM * UNROLL_IN_DIM - 1 : 0], - input data_in_valid, - output data_in_ready, - - output [OUT_WIDTH -1:0] data_out[UNROLL_OUT_NUM * UNROLL_OUT_FEATURES - 1:0], - output data_out_valid, - input data_out_ready -); - logic [CONV_OUT_WIDTH - 1:0] conv_out[UNROLL_OUT_C - 1:0]; - logic conv_out_valid, conv_out_ready; - fixed_patch_embed #( - .IN_WIDTH(IN_WIDTH), - .IN_FRAC_WIDTH(IN_FRAC_WIDTH), - .W_WIDTH(WC_WIDTH), - .W_FRAC_WIDTH(WC_FRAC_WIDTH), - .BIAS_WIDTH(BC_WIDTH), - .BIAS_FRAC_WIDTH(BC_FRAC_WIDTH), - .OUT_WIDTH(CONV_OUT_WIDTH), - .OUT_FRAC_WIDTH(CONV_OUT_FRAC_WIDTH), - .IN_C(IN_C), - .IN_Y(IN_Y), - .IN_X(IN_X), - .OUT_C(OUT_C), - .KERNEL_SIZE(KERNEL_SIZE), - .UNROLL_KERNEL_OUT(UNROLL_KERNEL_OUT), - .UNROLL_OUT_C(UNROLL_OUT_C), - .UNROLL_IN_C(UNROLL_IN_C), - .SLIDING_NUM(SLIDING_NUM) - ) patemb_inst ( - .weight(weight_c), - .weight_valid(weight_c_valid), - .weight_ready(weight_c_ready), - - .bias(bias_c), - .bias_valid(bias_c_valid), - .bias_ready(bias_c_ready), - - .data_out(conv_out), - .data_out_valid(conv_out_valid), - .data_out_ready(conv_out_ready), - .* - ); - - fixed_block #( - .IN_WIDTH(CONV_OUT_WIDTH), - .IN_FRAC_WIDTH(CONV_OUT_FRAC_WIDTH), - .WQ_WIDTH(WQ_WIDTH), - .WQ_FRAC_WIDTH(WQ_FRAC_WIDTH), - .WK_WIDTH(WK_WIDTH), - .WK_FRAC_WIDTH(WK_FRAC_WIDTH), - .WV_WIDTH(WV_WIDTH), - .WV_FRAC_WIDTH(WV_FRAC_WIDTH), - - .BQ_WIDTH(BQ_WIDTH), - .BQ_FRAC_WIDTH(BQ_FRAC_WIDTH), - .BK_WIDTH(BK_WIDTH), - .BK_FRAC_WIDTH(BK_FRAC_WIDTH), - .BV_WIDTH(BV_WIDTH), - .BV_FRAC_WIDTH(BV_FRAC_WIDTH), - - .WP_WIDTH(WP_WIDTH), - .WP_FRAC_WIDTH(WP_FRAC_WIDTH), - 
.BP_WIDTH(BP_WIDTH), - .BP_FRAC_WIDTH(BP_FRAC_WIDTH), - - .DQ_WIDTH(DQ_WIDTH), - .DQ_FRAC_WIDTH(DQ_FRAC_WIDTH), - .DK_WIDTH(DK_WIDTH), - .DK_FRAC_WIDTH(DK_FRAC_WIDTH), - .DV_WIDTH(DV_WIDTH), - .DV_FRAC_WIDTH(DV_FRAC_WIDTH), - - .DS_WIDTH(DS_WIDTH), - .DS_FRAC_WIDTH(DS_FRAC_WIDTH), - .DZ_WIDTH(DZ_WIDTH), - .DZ_FRAC_WIDTH(DZ_FRAC_WIDTH), - - .MSA_OUT_WIDTH(MSA_OUT_WIDTH), - .MSA_OUT_FRAC_WIDTH(MSA_OUT_FRAC_WIDTH), - - .WEIGHT_I2H_WIDTH(WEIGHT_I2H_WIDTH), - .WEIGHT_I2H_FRAC_WIDTH(WEIGHT_I2H_FRAC_WIDTH), - .WEIGHT_H2O_WIDTH(WEIGHT_H2O_WIDTH), - .WEIGHT_H2O_FRAC_WIDTH(WEIGHT_H2O_FRAC_WIDTH), - .MLP_HAS_BIAS(MLP_HAS_BIAS), - .BIAS_I2H_WIDTH(BIAS_I2H_WIDTH), - .BIAS_I2H_FRAC_WIDTH(BIAS_I2H_FRAC_WIDTH), - .BIAS_H2O_WIDTH(BIAS_H2O_WIDTH), - .BIAS_H2O_FRAC_WIDTH(BIAS_H2O_FRAC_WIDTH), - - .HIDDEN_WIDTH(HIDDEN_WIDTH), - .HIDDEN_FRAC_WIDTH(HIDDEN_FRAC_WIDTH), - - .OUT_WIDTH(OUT_WIDTH), - .OUT_FRAC_WIDTH(OUT_FRAC_WIDTH), - - .IN_NUM(IN_NUM), - .IN_DIM(IN_DIM), - .NUM_HEADS(NUM_HEADS), - .UNROLL_IN_NUM(UNROLL_IN_NUM), - .UNROLL_IN_DIM(UNROLL_IN_DIM), - .UNROLL_WQKV_DIM(UNROLL_WQKV_DIM), - .UNROLL_HIDDEN_FEATURES(UNROLL_HIDDEN_FEATURES) - ) block_inst ( - .data_in(conv_out), - .data_in_valid(conv_out_valid), - .data_in_ready(conv_out_ready), - .* - ); -endmodule diff --git a/src/mase_components/vision_models/vit/rtl/fixed_block.sv b/src/mase_components/vision_models/vit/rtl/fixed_block.sv deleted file mode 100644 index 9b2adfa31..000000000 --- a/src/mase_components/vision_models/vit/rtl/fixed_block.sv +++ /dev/null @@ -1,455 +0,0 @@ -`timescale 1ns / 1ps -module fixed_block #( - parameter IN_WIDTH = 6, - parameter IN_FRAC_WIDTH = 1, - - parameter AF_MSA_ADD_WIDTH = 6, - parameter AF_MSA_ADD_FRAC_WIDTH = 1, - parameter MSA_IN_WIDTH = 6, - parameter MSA_IN_FRAC_WIDTH = 6, - - parameter WQ_WIDTH = 4, - parameter WQ_FRAC_WIDTH = 1, - parameter WK_WIDTH = 4, - parameter WK_FRAC_WIDTH = 1, - parameter WV_WIDTH = 4, - parameter WV_FRAC_WIDTH = 1, - - parameter BQ_WIDTH = 4, - parameter 
BQ_FRAC_WIDTH = 1, - parameter BK_WIDTH = 4, - parameter BK_FRAC_WIDTH = 1, - parameter BV_WIDTH = 4, - parameter BV_FRAC_WIDTH = 1, - - parameter WP_WIDTH = 4, - parameter WP_FRAC_WIDTH = 1, - parameter BP_WIDTH = 4, - parameter BP_FRAC_WIDTH = 1, - - parameter DQ_WIDTH = 6, - parameter DQ_FRAC_WIDTH = 1, - parameter DK_WIDTH = 6, - parameter DK_FRAC_WIDTH = 1, - parameter DV_WIDTH = 6, - parameter DV_FRAC_WIDTH = 1, - - parameter DS_WIDTH = 6, - parameter DS_FRAC_WIDTH = 1, - parameter EXP_WIDTH = 8, - parameter EXP_FRAC_WIDTH = 4, - parameter DIV_WIDTH = 10, - parameter DS_SOFTMAX_WIDTH = 8, - parameter DS_SOFTMAX_FRAC_WIDTH = 7, - parameter DZ_WIDTH = 6, - parameter DZ_FRAC_WIDTH = 1, - - - parameter AF_MLP_IN_WIDTH = 7, - parameter AF_MLP_IN_FRAC_WIDTH = 1, - parameter AF_MLP_ADD_WIDTH = 6, - parameter AF_MLP_ADD_FRAC_WIDTH = 1, - - parameter MLP_IN_WIDTH = 6, - parameter MLP_IN_FRAC_WIDTH = 6, - - parameter MSA_OUT_WIDTH = AF_MLP_IN_WIDTH - 1, - parameter MSA_OUT_FRAC_WIDTH = AF_MLP_IN_FRAC_WIDTH, - parameter WEIGHT_I2H_WIDTH = 4, - parameter WEIGHT_I2H_FRAC_WIDTH = 1, - parameter WEIGHT_H2O_WIDTH = 4, - parameter WEIGHT_H2O_FRAC_WIDTH = 1, - parameter MLP_HAS_BIAS = 1, - parameter BIAS_I2H_WIDTH = 4, - parameter BIAS_I2H_FRAC_WIDTH = 1, - parameter BIAS_H2O_WIDTH = 4, - parameter BIAS_H2O_FRAC_WIDTH = 1, - - parameter HIDDEN_WIDTH = 6, - parameter HIDDEN_FRAC_WIDTH = 1, - - parameter OUT_WIDTH = 6, - parameter OUT_FRAC_WIDTH = 1, - - parameter IN_NUM = 16, - parameter IN_DIM = 6, - // num_heads * wqkv_dim = IN_DIM - parameter NUM_HEADS = 2, - parameter WQKV_DIM = IN_DIM / NUM_HEADS, - parameter WP_DIM = IN_DIM, - parameter MLP_RATIO = 2, - parameter UNROLL_IN_NUM = 2, - parameter UNROLL_IN_DIM = 3, - parameter UNROLL_WQKV_DIM = 3, - // set it with unroll_in_dim for residual matching - parameter UNROLL_WP_DIM = UNROLL_IN_DIM, - - parameter OUT_NUM = IN_NUM, - parameter OUT_DIM = IN_DIM, - parameter UNROLL_OUT_NUM = UNROLL_IN_NUM, - parameter UNROLL_OUT_DIM = 
UNROLL_WP_DIM, - // get attention output after 4 * 3 cycles - // mlp - parameter IN_FEATURES = IN_DIM, - parameter HIDDEN_FEATURES = MLP_RATIO * IN_FEATURES, - parameter OUT_FEATURES = IN_FEATURES, - - parameter UNROLL_IN_FEATURES = UNROLL_IN_DIM, - parameter UNROLL_HIDDEN_FEATURES = 3, - parameter UNROLL_OUT_FEATURES = UNROLL_IN_DIM -) ( - input clk, - input rst, - //msa - input [IN_WIDTH - 1:0] af_msa_weight[UNROLL_IN_NUM * UNROLL_IN_DIM - 1:0], - input af_msa_weight_valid, - output af_msa_weight_ready, - input [AF_MSA_ADD_WIDTH - 1:0] af_msa_bias[UNROLL_IN_NUM * UNROLL_IN_DIM - 1:0], - input af_msa_bias_valid, - output af_msa_bias_ready, - - input [WQ_WIDTH - 1:0] weight_q[NUM_HEADS * UNROLL_WQKV_DIM * UNROLL_IN_DIM -1 : 0], - input weight_q_valid, - output weight_q_ready, - input [WK_WIDTH - 1:0] weight_k[NUM_HEADS * UNROLL_WQKV_DIM * UNROLL_IN_DIM -1 : 0], - input weight_k_valid, - output weight_k_ready, - input [WV_WIDTH - 1:0] weight_v[NUM_HEADS * UNROLL_WQKV_DIM * UNROLL_IN_DIM -1 : 0], - input weight_v_valid, - output weight_v_ready, - input [WP_WIDTH - 1:0] weight_p[UNROLL_WP_DIM * NUM_HEADS * UNROLL_WQKV_DIM -1 : 0], - input weight_p_valid, - output weight_p_ready, - input [BQ_WIDTH - 1:0] bias_q[NUM_HEADS * UNROLL_WQKV_DIM -1 : 0], - input bias_q_valid, - output bias_q_ready, - input [BK_WIDTH - 1:0] bias_k[NUM_HEADS * UNROLL_WQKV_DIM -1 : 0], - input bias_k_valid, - output bias_k_ready, - input [BV_WIDTH - 1:0] bias_v[NUM_HEADS * UNROLL_WQKV_DIM -1 : 0], - input bias_v_valid, - output bias_v_ready, - input [BP_WIDTH - 1:0] bias_p[UNROLL_WP_DIM -1 : 0], - input bias_p_valid, - output bias_p_ready, - //mlp - input [AF_MLP_IN_WIDTH - 1:0] af_mlp_weight[UNROLL_IN_NUM * UNROLL_IN_FEATURES - 1:0], - input af_mlp_weight_valid, - output af_mlp_weight_ready, - input [AF_MLP_ADD_WIDTH - 1:0] af_mlp_bias[UNROLL_IN_NUM * UNROLL_IN_FEATURES - 1:0], - input af_mlp_bias_valid, - output af_mlp_bias_ready, - - input [WEIGHT_I2H_WIDTH-1:0] 
weight_in2hidden[UNROLL_HIDDEN_FEATURES * UNROLL_IN_FEATURES - 1:0], - input weight_in2hidden_valid, - output weight_in2hidden_ready, - input [WEIGHT_H2O_WIDTH-1:0] weight_hidden2out[UNROLL_OUT_FEATURES * UNROLL_HIDDEN_FEATURES - 1:0], - input weight_hidden2out_valid, - output weight_hidden2out_ready, - //input bias - input [BIAS_I2H_WIDTH-1:0] bias_in2hidden[UNROLL_HIDDEN_FEATURES - 1:0], - input bias_in2hidden_valid, - output bias_in2hidden_ready, - - input [BIAS_H2O_WIDTH-1:0] bias_hidden2out[UNROLL_OUT_FEATURES - 1:0], - input bias_hidden2out_valid, - output bias_hidden2out_ready, - - input [IN_WIDTH -1:0] data_in[UNROLL_IN_NUM * UNROLL_IN_DIM - 1 : 0], - input data_in_valid, - output data_in_ready, - - output [OUT_WIDTH -1:0] data_out[UNROLL_OUT_NUM * UNROLL_OUT_FEATURES - 1:0], - output data_out_valid, - input data_out_ready -); - logic [MSA_IN_WIDTH - 1:0] af_msa_out[UNROLL_IN_NUM * UNROLL_IN_DIM - 1:0]; - logic af_msa_out_valid, af_msa_out_ready; - logic [MSA_OUT_WIDTH - 1:0] msa_out[UNROLL_IN_NUM * UNROLL_IN_DIM - 1:0]; - logic msa_out_valid, msa_out_ready; - logic [AF_MLP_IN_WIDTH - 1:0] res_msa[UNROLL_IN_NUM * UNROLL_IN_DIM - 1:0]; - logic res_msa_valid, res_msa_ready; - //msa - logic ff_in_valid, ra_msa_in_valid; - logic ff_in_ready, ra_msa_in_ready; - logic [IN_WIDTH -1:0] ff_data_in[UNROLL_IN_NUM * UNROLL_IN_DIM - 1 : 0]; - logic ff_data_in_valid, ff_data_in_ready; - split2 split2_inst ( - .data_in_valid (data_in_valid), - .data_in_ready (data_in_ready), - .data_out_valid({ff_in_valid, ra_msa_in_valid}), - .data_out_ready({ff_in_ready, ra_msa_in_ready}) - ); - unpacked_fifo #( - .DEPTH(IN_NUM * IN_DIM / (UNROLL_IN_DIM * UNROLL_IN_NUM)), - .DATA_WIDTH(IN_WIDTH), - .IN_NUM(UNROLL_IN_NUM * UNROLL_IN_DIM) - ) fifo_in_inst ( - .data_out(ff_data_in), - .data_out_valid(ff_data_in_valid), - .data_out_ready(ff_data_in_ready), - .data_in_valid(ff_in_valid), - .data_in_ready(ff_in_ready), - .* - ); - affine_layernorm #( - .IN_WIDTH(IN_WIDTH), - 
.IN_FRAC_WIDTH(IN_FRAC_WIDTH), - .OUT_WIDTH(MSA_IN_WIDTH), - .OUT_FRAC_WIDTH(MSA_IN_FRAC_WIDTH), - .BIAS_WIDTH(AF_MSA_ADD_WIDTH), - .BIAS_FRAC_WIDTH(AF_MSA_ADD_FRAC_WIDTH), - .IN_SIZE(UNROLL_IN_NUM * UNROLL_IN_DIM) - ) aff_att ( - .weight(af_msa_weight), - .weight_valid(af_msa_weight_valid), - .weight_ready(af_msa_weight_ready), - .bias(af_msa_bias), - .bias_valid(af_msa_bias_valid), - .bias_ready(af_msa_bias_ready), - .data_in(ff_data_in), - .data_in_valid(ff_data_in_valid), - .data_in_ready(ff_data_in_ready), - .data_out(af_msa_out), - .data_out_valid(af_msa_out_valid), - .data_out_ready(af_msa_out_ready), - .* - ); - - //TODO: NORM here - // msa - fixed_msa #( - .IN_WIDTH(MSA_IN_WIDTH), - .IN_FRAC_WIDTH(MSA_IN_FRAC_WIDTH), - - .WQ_WIDTH(WQ_WIDTH), - .WQ_FRAC_WIDTH(WQ_FRAC_WIDTH), - .WK_WIDTH(WK_WIDTH), - .WK_FRAC_WIDTH(WK_FRAC_WIDTH), - .WV_WIDTH(WV_WIDTH), - .WV_FRAC_WIDTH(WV_FRAC_WIDTH), - - .BQ_WIDTH(BQ_WIDTH), - .BQ_FRAC_WIDTH(BQ_FRAC_WIDTH), - .BK_WIDTH(BK_WIDTH), - .BK_FRAC_WIDTH(BK_FRAC_WIDTH), - .BV_WIDTH(BV_WIDTH), - .BV_FRAC_WIDTH(BV_FRAC_WIDTH), - - .WP_WIDTH(WP_WIDTH), - .WP_FRAC_WIDTH(WP_FRAC_WIDTH), - .BP_WIDTH(BP_WIDTH), - .BP_FRAC_WIDTH(BP_FRAC_WIDTH), - - .DQ_WIDTH(DQ_WIDTH), - .DQ_FRAC_WIDTH(DQ_FRAC_WIDTH), - .DK_WIDTH(DK_WIDTH), - .DK_FRAC_WIDTH(DK_FRAC_WIDTH), - .DV_WIDTH(DV_WIDTH), - .DV_FRAC_WIDTH(DV_FRAC_WIDTH), - - .DS_WIDTH(DS_WIDTH), - .DS_FRAC_WIDTH(DS_FRAC_WIDTH), - - .EXP_WIDTH(EXP_WIDTH), - .EXP_FRAC_WIDTH(EXP_FRAC_WIDTH), - .DIV_WIDTH(DIV_WIDTH), - .DS_SOFTMAX_WIDTH(DS_SOFTMAX_WIDTH), - .DS_SOFTMAX_FRAC_WIDTH(DS_SOFTMAX_FRAC_WIDTH), - .DZ_WIDTH(DZ_WIDTH), - .DZ_FRAC_WIDTH(DZ_FRAC_WIDTH), - - .OUT_WIDTH(MSA_OUT_WIDTH), - .OUT_FRAC_WIDTH(MSA_OUT_FRAC_WIDTH), - - .IN_Y(IN_NUM), - .IN_X(IN_DIM), - .NUM_HEADS(NUM_HEADS), - .UNROLL_IN_Y(UNROLL_IN_NUM), - .UNROLL_IN_X(UNROLL_IN_DIM), - .UNROLL_WQKV_Y(UNROLL_WQKV_DIM), - .WP_Y(WP_DIM), - .UNROLL_WP_Y(UNROLL_WP_DIM) - ) msa_inst ( - .data_in(af_msa_out), - .data_in_valid(af_msa_out_valid), 
- .data_in_ready(af_msa_out_ready), - .data_out(msa_out), - .data_out_valid(msa_out_valid), - .data_out_ready(msa_out_ready), - .* - ); - res_add #( - .IN_WIDTH(IN_WIDTH), - .IN_FRAC_WIDTH(IN_FRAC_WIDTH), - .MODULE_WIDTH(MSA_OUT_WIDTH), - .MODULE_FRAC_WIDTH(MSA_OUT_FRAC_WIDTH), - .IN_SIZE(IN_NUM * IN_DIM), - .UNROLL_IN_SIZE(UNROLL_IN_NUM * UNROLL_IN_DIM) - ) ra_msa_inst ( - .data_in(data_in), - .data_in_valid(ra_msa_in_valid), - .data_in_ready(ra_msa_in_ready), - .module_in(msa_out), - .module_in_valid(msa_out_valid), - .module_in_ready(msa_out_ready), - .data_out(res_msa), - .data_out_valid(res_msa_valid), - .data_out_ready(res_msa_ready), - .* - ); - - logic [MLP_IN_WIDTH - 1:0] af_mlp_out[UNROLL_IN_NUM * UNROLL_IN_FEATURES - 1:0]; - logic af_mlp_out_valid, af_mlp_out_ready; - logic ra_mlp_in_valid; - assign ra_mlp_in_valid = res_msa_ready && res_msa_valid; - localparam MLP_OUT_WIDTH = OUT_WIDTH - 1; - localparam MLP_OUT_FRAC_WIDTH = OUT_FRAC_WIDTH; - logic [MLP_OUT_WIDTH-1:0] mlp_out[UNROLL_IN_NUM * UNROLL_IN_DIM - 1:0]; - logic mlp_out_valid, mlp_out_ready; - // mlp - affine_layernorm #( - .IN_WIDTH(AF_MLP_IN_WIDTH), - .IN_FRAC_WIDTH(AF_MLP_IN_FRAC_WIDTH), - .OUT_WIDTH(MLP_IN_WIDTH), - .OUT_FRAC_WIDTH(MLP_IN_FRAC_WIDTH), - .BIAS_WIDTH(AF_MLP_ADD_WIDTH), - .BIAS_FRAC_WIDTH(AF_MLP_ADD_FRAC_WIDTH), - .IN_SIZE(UNROLL_IN_NUM * UNROLL_IN_FEATURES) - ) aff_mlp ( - .data_in(res_msa), - .data_in_valid(res_msa_valid), - .data_in_ready(res_msa_ready), - .weight(af_mlp_weight), - .weight_valid(af_mlp_weight_valid), - .weight_ready(af_mlp_weight_ready), - .bias(af_mlp_bias), - .bias_valid(af_mlp_bias_valid), - .bias_ready(af_mlp_bias_ready), - .data_out(af_mlp_out), - .data_out_valid(af_mlp_out_valid), - .data_out_ready(af_mlp_out_ready), - .* - ); - fixed_mlp #( - .IN_WIDTH(MLP_IN_WIDTH), - .IN_FRAC_WIDTH(MLP_IN_FRAC_WIDTH), - .HIDDEN_WIDTH(HIDDEN_WIDTH), - .HIDDEN_FRAC_WIDTH(HIDDEN_FRAC_WIDTH), - .OUT_WIDTH(MLP_OUT_WIDTH), - .OUT_FRAC_WIDTH(MLP_OUT_FRAC_WIDTH), - - 
.WEIGHT_I2H_WIDTH(WEIGHT_I2H_WIDTH), - .WEIGHT_I2H_FRAC_WIDTH(WEIGHT_I2H_FRAC_WIDTH), - .BIAS_I2H_WIDTH(BIAS_I2H_WIDTH), - .BIAS_I2H_FRAC_WIDTH(BIAS_I2H_FRAC_WIDTH), - - .WEIGHT_H2O_WIDTH(WEIGHT_H2O_WIDTH), - .WEIGHT_H2O_FRAC_WIDTH(WEIGHT_H2O_FRAC_WIDTH), - .BIAS_H2O_WIDTH(BIAS_H2O_WIDTH), - .BIAS_H2O_FRAC_WIDTH(BIAS_H2O_FRAC_WIDTH), - - .IN_NUM(IN_NUM), - .IN_FEATURES(IN_FEATURES), - .HIDDEN_FEATURES(HIDDEN_FEATURES), - .UNROLL_IN_NUM(UNROLL_IN_NUM), - .UNROLL_IN_FEATURES(UNROLL_IN_FEATURES), - .UNROLL_HIDDEN_FEATURES(UNROLL_HIDDEN_FEATURES), - .UNROLL_OUT_FEATURES(UNROLL_OUT_FEATURES) - ) mlp_inst ( - .data_in(af_mlp_out), - .data_in_valid(af_mlp_out_valid), - .data_in_ready(af_mlp_out_ready), - .data_out(mlp_out), - .data_out_valid(mlp_out_valid), - .data_out_ready(mlp_out_ready), - .* - - ); - - res_add #( - .IN_WIDTH(AF_MLP_IN_WIDTH), - .IN_FRAC_WIDTH(AF_MLP_IN_FRAC_WIDTH), - .MODULE_WIDTH(MLP_OUT_WIDTH), - .MODULE_FRAC_WIDTH(MLP_OUT_FRAC_WIDTH), - .IN_SIZE(IN_NUM * IN_DIM), - .UNROLL_IN_SIZE(UNROLL_IN_NUM * UNROLL_IN_DIM) - ) ra_mlp_inst ( - .data_in(res_msa), - .data_in_valid(ra_mlp_in_valid), - .data_in_ready(), - .module_in(mlp_out), - .module_in_valid(mlp_out_valid), - .module_in_ready(mlp_out_ready), - .* - ); -endmodule - -module res_add #( - parameter IN_WIDTH = 32, - parameter IN_FRAC_WIDTH = 1, - parameter MODULE_WIDTH = 16, - parameter MODULE_FRAC_WIDTH = 1, - parameter OUT_WIDTH = MODULE_WIDTH + 1, - parameter UNROLL_IN_SIZE = 3, - parameter IN_SIZE = 3 -) ( - input logic clk, - input logic rst, - input logic [IN_WIDTH-1:0] data_in[UNROLL_IN_SIZE - 1:0], - input logic data_in_valid, - output logic data_in_ready, - input logic [MODULE_WIDTH-1:0] module_in[UNROLL_IN_SIZE - 1:0], - input logic module_in_valid, - output logic module_in_ready, - output logic [OUT_WIDTH-1:0] data_out[UNROLL_IN_SIZE-1:0], - output logic data_out_valid, - input logic data_out_ready -); - - logic [OUT_WIDTH-1:0] reg_in[UNROLL_IN_SIZE - 1:0]; - logic reg_in_valid, 
reg_in_ready; - unpacked_skid_buffer #( - .DATA_WIDTH(OUT_WIDTH), - .IN_NUM(UNROLL_IN_SIZE) - ) reg_inst ( - .data_in(reg_in), - .data_in_valid(reg_in_valid), - .data_in_ready(reg_in_ready), - .* - ); - logic [IN_WIDTH-1:0] ff_data_in[UNROLL_IN_SIZE - 1:0]; - logic ff_data_in_valid, ff_data_in_ready; - unpacked_fifo #( - .DEPTH(IN_SIZE), - .DATA_WIDTH(IN_WIDTH), - .IN_NUM(UNROLL_IN_SIZE) - ) fifo_in_inst ( - .data_out(ff_data_in), - .data_out_valid(ff_data_in_valid), - .data_out_ready(ff_data_in_ready), - .* - ); - logic [MODULE_WIDTH - 1:0] cast_in[UNROLL_IN_SIZE - 1:0]; - logic cast_in_valid, cast_in_ready; - fixed_rounding #( - .IN_SIZE(UNROLL_IN_SIZE), - .IN_WIDTH(IN_WIDTH), - .IN_FRAC_WIDTH(IN_FRAC_WIDTH), - .OUT_WIDTH(MODULE_WIDTH), - .OUT_FRAC_WIDTH(MODULE_FRAC_WIDTH) - ) msa_in_cast ( - .data_in (ff_data_in), - .data_out(cast_in) - ); - for (genvar i = 0; i < UNROLL_IN_SIZE; i++) - assign reg_in[i] = {cast_in[i][MODULE_WIDTH-1],cast_in[i]}+ {module_in[i][MODULE_WIDTH-1],module_in[i]}; - - join2 #() resadd_msa_join_inst ( - .data_in_ready ({module_in_ready, ff_data_in_ready}), - .data_in_valid ({module_in_valid, ff_data_in_valid}), - .data_out_valid(reg_in_valid), - .data_out_ready(reg_in_ready) - ); -endmodule - diff --git a/src/mase_components/vision_models/vit/rtl/fixed_mlp.sv b/src/mase_components/vision_models/vit/rtl/fixed_mlp.sv deleted file mode 100644 index 89ea7b795..000000000 --- a/src/mase_components/vision_models/vit/rtl/fixed_mlp.sv +++ /dev/null @@ -1,131 +0,0 @@ -`timescale 1ns / 1ps -module fixed_mlp #( - parameter IN_WIDTH = 32, - parameter IN_FRAC_WIDTH = 8, - parameter WEIGHT_I2H_WIDTH = 16, - parameter WEIGHT_I2H_FRAC_WIDTH = 8, - parameter WEIGHT_H2O_WIDTH = 16, - parameter WEIGHT_H2O_FRAC_WIDTH = 8, - parameter HAS_BIAS = 1, - parameter BIAS_I2H_WIDTH = 16, - parameter BIAS_I2H_FRAC_WIDTH = 4, - parameter BIAS_H2O_WIDTH = 16, - parameter BIAS_H2O_FRAC_WIDTH = 4, - parameter HIDDEN_WIDTH = 32, - parameter HIDDEN_FRAC_WIDTH = 8, - 
parameter OUT_WIDTH = 32, - parameter OUT_FRAC_WIDTH = 8, - - parameter IN_NUM = 16, - parameter IN_FEATURES = 4, - parameter HIDDEN_FEATURES = 8, - parameter OUT_FEATURES = IN_FEATURES, - - parameter UNROLL_IN_NUM = 4, - parameter UNROLL_IN_FEATURES = 2, - parameter UNROLL_HIDDEN_FEATURES = 4, - parameter UNROLL_OUT_FEATURES = 8 -) ( - input clk, - input rst, - //input data - input [IN_WIDTH-1:0] data_in[UNROLL_IN_NUM * UNROLL_IN_FEATURES - 1:0], - input data_in_valid, - output data_in_ready, - //input weight - input [WEIGHT_I2H_WIDTH-1:0] weight_in2hidden[UNROLL_HIDDEN_FEATURES * UNROLL_IN_FEATURES - 1:0], - input weight_in2hidden_valid, - output weight_in2hidden_ready, - - input [WEIGHT_H2O_WIDTH-1:0] weight_hidden2out[UNROLL_OUT_FEATURES * UNROLL_HIDDEN_FEATURES - 1:0], - input weight_hidden2out_valid, - output weight_hidden2out_ready, - //input bias - input [BIAS_I2H_WIDTH-1:0] bias_in2hidden[UNROLL_HIDDEN_FEATURES - 1:0], - input bias_in2hidden_valid, - output bias_in2hidden_ready, - - input [BIAS_H2O_WIDTH-1:0] bias_hidden2out[UNROLL_OUT_FEATURES - 1:0], - input bias_hidden2out_valid, - output bias_hidden2out_ready, - //output data - output [OUT_WIDTH-1:0] data_out[UNROLL_IN_NUM * UNROLL_OUT_FEATURES - 1:0], - output data_out_valid, - input data_out_ready -); - logic [HIDDEN_WIDTH-1:0] hidden_data[UNROLL_IN_NUM * UNROLL_HIDDEN_FEATURES - 1:0]; - logic hidden_data_valid, hidden_data_ready; - logic [HIDDEN_WIDTH-1:0] relu_data[UNROLL_IN_NUM * UNROLL_HIDDEN_FEATURES - 1:0]; - logic relu_data_valid, relu_data_ready; - fixed_2d_linear #( - .IN_WIDTH(IN_WIDTH), - .IN_FRAC_WIDTH(IN_FRAC_WIDTH), - .WEIGHT_WIDTH(WEIGHT_I2H_WIDTH), - .WEIGHT_FRAC_WIDTH(WEIGHT_I2H_FRAC_WIDTH), - .BIAS_WIDTH(BIAS_I2H_WIDTH), - .BIAS_FRAC_WIDTH(BIAS_I2H_FRAC_WIDTH), - .OUT_WIDTH(HIDDEN_WIDTH), - .OUT_FRAC_WIDTH(HIDDEN_FRAC_WIDTH), - .IN_Y(IN_NUM), - .IN_X(IN_FEATURES), - .W_Y(HIDDEN_FEATURES), - .UNROLL_IN_Y(UNROLL_IN_NUM), - .UNROLL_IN_X(UNROLL_IN_FEATURES), - 
.UNROLL_W_Y(UNROLL_HIDDEN_FEATURES) - ) in2hidden_linear ( - .weight(weight_in2hidden), - .weight_valid(weight_in2hidden_valid), - .weight_ready(weight_in2hidden_ready), - .bias(bias_in2hidden), - .bias_valid(bias_in2hidden_valid), - .bias_ready(bias_in2hidden_ready), - .data_out(hidden_data), - .data_out_valid(hidden_data_valid), - .data_out_ready(hidden_data_ready), - .* - ); - - fixed_relu #( - .IN_WIDTH(HIDDEN_WIDTH), - .IN_FRAC_WIDTH(HIDDEN_FRAC_WIDTH), - .OUT_WIDTH(HIDDEN_WIDTH), - .OUT_FRAC_WIDTH(HIDDEN_FRAC_WIDTH), - .IN_SIZE(UNROLL_IN_NUM * UNROLL_HIDDEN_FEATURES) - ) act_inst ( - .data_in(hidden_data), - .data_in_valid(hidden_data_valid), - .data_in_ready(hidden_data_ready), - .data_out(relu_data), - .data_out_valid(relu_data_valid), - .data_out_ready(relu_data_ready), - .* - ); - - fixed_2d_linear #( - .IN_WIDTH(HIDDEN_WIDTH), - .IN_FRAC_WIDTH(HIDDEN_FRAC_WIDTH), - .WEIGHT_WIDTH(WEIGHT_H2O_WIDTH), - .WEIGHT_FRAC_WIDTH(WEIGHT_H2O_FRAC_WIDTH), - .BIAS_WIDTH(BIAS_H2O_WIDTH), - .BIAS_FRAC_WIDTH(BIAS_H2O_FRAC_WIDTH), - .OUT_WIDTH(OUT_WIDTH), - .OUT_FRAC_WIDTH(OUT_FRAC_WIDTH), - .IN_Y(IN_NUM), - .IN_X(HIDDEN_FEATURES), - .W_Y(IN_FEATURES), - .UNROLL_IN_Y(UNROLL_IN_NUM), - .UNROLL_IN_X(UNROLL_HIDDEN_FEATURES), - .UNROLL_W_Y(UNROLL_OUT_FEATURES) - ) hidden2in_linear ( - .data_in(relu_data), - .data_in_valid(relu_data_valid), - .data_in_ready(relu_data_ready), - .weight(weight_hidden2out), - .weight_valid(weight_hidden2out_valid), - .weight_ready(weight_hidden2out_ready), - .bias(bias_hidden2out), - .bias_valid(bias_hidden2out_valid), - .bias_ready(bias_hidden2out_ready), - .* - ); -endmodule diff --git a/src/mase_components/vision_models/vit/rtl/fixed_msa.sv b/src/mase_components/vision_models/vit/rtl/fixed_msa.sv deleted file mode 100644 index 837c4f357..000000000 --- a/src/mase_components/vision_models/vit/rtl/fixed_msa.sv +++ /dev/null @@ -1,274 +0,0 @@ -`timescale 1ns / 1ps -module fixed_msa #( - parameter IN_WIDTH = 8, - parameter IN_FRAC_WIDTH = 1, - - 
parameter WQ_WIDTH = 8, - parameter WQ_FRAC_WIDTH = 1, - parameter WK_WIDTH = 8, - parameter WK_FRAC_WIDTH = 1, - parameter WV_WIDTH = 8, - parameter WV_FRAC_WIDTH = 1, - - parameter BQ_WIDTH = 8, - parameter BQ_FRAC_WIDTH = 1, - parameter BK_WIDTH = 8, - parameter BK_FRAC_WIDTH = 1, - parameter BV_WIDTH = 8, - parameter BV_FRAC_WIDTH = 1, - - parameter WP_WIDTH = 8, - parameter WP_FRAC_WIDTH = 1, - parameter BP_WIDTH = 8, - parameter BP_FRAC_WIDTH = 1, - - parameter DQ_WIDTH = 8, - parameter DQ_FRAC_WIDTH = 1, - parameter DK_WIDTH = 8, - parameter DK_FRAC_WIDTH = 1, - parameter DV_WIDTH = 8, - parameter DV_FRAC_WIDTH = 1, - - parameter DS_WIDTH = 8, - parameter DS_FRAC_WIDTH = 1, - parameter EXP_WIDTH = 8, - parameter EXP_FRAC_WIDTH = 4, - parameter DIV_WIDTH = 10, - parameter DS_SOFTMAX_WIDTH = 8, - parameter DS_SOFTMAX_FRAC_WIDTH = 7, - parameter DZ_WIDTH = 8, - parameter DZ_FRAC_WIDTH = 1, - - parameter OUT_WIDTH = 8, - parameter OUT_FRAC_WIDTH = 1, - - parameter IN_Y = 6, - parameter UNROLL_IN_Y = 1, - parameter ITER_IN_Y = IN_Y / UNROLL_IN_Y, - - parameter IN_X = 12, - parameter UNROLL_IN_X = 2, - parameter ITER_IN_X = IN_X / UNROLL_IN_X, - - // make sure NUM_HEADS * WQKV_Y = IN_X - parameter NUM_HEADS = 1, - parameter WQKV_Y = IN_X / NUM_HEADS, - parameter UNROLL_WQKV_Y = 1, - parameter ITER_WQKV_Y = WQKV_Y / UNROLL_WQKV_Y, - - // WP_Y = IN_X - parameter WP_Y = 12, - parameter UNROLL_WP_Y = 2, - parameter WP_SIZE = NUM_HEADS * UNROLL_WQKV_Y, - // and WH_PARALLEL - parameter WQKV_SIZE = UNROLL_IN_X, - parameter OUT_PARALLELISM = UNROLL_IN_Y, - parameter OUT_SIZE = UNROLL_WP_Y -) ( - input clk, - input rst, - - input [WQ_WIDTH - 1:0] weight_q[NUM_HEADS * UNROLL_WQKV_Y * WQKV_SIZE -1 : 0], - input weight_q_valid, - output weight_q_ready, - - input [WK_WIDTH - 1:0] weight_k[NUM_HEADS * UNROLL_WQKV_Y * WQKV_SIZE -1 : 0], - input weight_k_valid, - output weight_k_ready, - - input [WV_WIDTH - 1:0] weight_v[NUM_HEADS * UNROLL_WQKV_Y * WQKV_SIZE -1 : 0], - input 
weight_v_valid, - output weight_v_ready, - - input [WP_WIDTH - 1:0] weight_p[UNROLL_WP_Y * WP_SIZE -1 : 0], - input weight_p_valid, - output weight_p_ready, - - input [BQ_WIDTH - 1:0] bias_q[NUM_HEADS * UNROLL_WQKV_Y -1 : 0], - input bias_q_valid, - output bias_q_ready, - - input [BK_WIDTH - 1:0] bias_k[NUM_HEADS * UNROLL_WQKV_Y -1 : 0], - input bias_k_valid, - output bias_k_ready, - - input [BV_WIDTH - 1:0] bias_v[NUM_HEADS * UNROLL_WQKV_Y -1 : 0], - input bias_v_valid, - output bias_v_ready, - - input [BP_WIDTH - 1:0] bias_p[UNROLL_WP_Y -1 : 0], - input bias_p_valid, - output bias_p_ready, - - - input [IN_WIDTH -1:0] data_in[UNROLL_IN_Y * UNROLL_IN_X - 1 : 0], - input data_in_valid, - output data_in_ready, - - output [OUT_WIDTH -1:0] data_out[OUT_PARALLELISM * OUT_SIZE - 1:0], - output data_out_valid, - input data_out_ready -); - // define head in size - localparam H_IN_SIZE = UNROLL_WQKV_Y * WQKV_SIZE; - logic [DZ_WIDTH - 1:0] sa_out[UNROLL_IN_Y * NUM_HEADS * UNROLL_WQKV_Y - 1:0]; - logic sa_out_valid, sa_out_ready; - for (genvar i = 0; i < NUM_HEADS; i++) begin : head - /* verilator lint_off UNUSEDSIGNAL */ - // define each head data_out - logic [DZ_WIDTH - 1:0] h_sa_out[UNROLL_IN_Y * UNROLL_WQKV_Y - 1:0]; - logic h_sa_out_valid; - logic h_sa_out_ready; - assign h_sa_out_ready = sa_out_ready; - // define each head data_in - logic [IN_WIDTH - 1:0] h_data_in[UNROLL_IN_Y * UNROLL_IN_X - 1:0]; - logic h_data_in_valid, h_data_in_ready; - assign h_data_in = data_in; - assign h_data_in_valid = data_in_valid; - // define each head weight - logic [WQ_WIDTH - 1:0] h_weight_q[UNROLL_WQKV_Y * WQKV_SIZE - 1:0]; - logic h_weight_q_valid, h_weight_q_ready; - logic [WK_WIDTH - 1:0] h_weight_k[UNROLL_WQKV_Y * WQKV_SIZE - 1:0]; - logic h_weight_k_valid, h_weight_k_ready; - logic [WV_WIDTH - 1:0] h_weight_v[UNROLL_WQKV_Y * WQKV_SIZE - 1:0]; - logic h_weight_v_valid, h_weight_v_ready; - - logic [BQ_WIDTH-1:0] h_bias_q[UNROLL_WQKV_Y - 1:0]; - logic h_bias_q_valid, h_bias_q_ready; - 
logic [BK_WIDTH-1:0] h_bias_k[UNROLL_WQKV_Y - 1:0]; - logic h_bias_k_valid, h_bias_k_ready; - logic [BV_WIDTH-1:0] h_bias_v[UNROLL_WQKV_Y - 1:0]; - logic h_bias_v_valid, h_bias_v_ready; - - - assign h_weight_q = weight_q[H_IN_SIZE*i+H_IN_SIZE-1:H_IN_SIZE*i]; - assign h_weight_k = weight_k[H_IN_SIZE*i+H_IN_SIZE-1:H_IN_SIZE*i]; - assign h_weight_v = weight_v[H_IN_SIZE*i+H_IN_SIZE-1:H_IN_SIZE*i]; - assign h_weight_q_valid = weight_q_valid; - assign h_weight_k_valid = weight_k_valid; - assign h_weight_v_valid = weight_v_valid; - - assign h_bias_q = bias_q[UNROLL_WQKV_Y*i+UNROLL_WQKV_Y-1:UNROLL_WQKV_Y*i]; - assign h_bias_k = bias_k[UNROLL_WQKV_Y*i+UNROLL_WQKV_Y-1:UNROLL_WQKV_Y*i]; - assign h_bias_v = bias_v[UNROLL_WQKV_Y*i+UNROLL_WQKV_Y-1:UNROLL_WQKV_Y*i]; - assign h_bias_q_valid = bias_q_valid; - assign h_bias_k_valid = bias_k_valid; - assign h_bias_v_valid = bias_v_valid; - - fixed_self_att #( - .DATA_WIDTH(IN_WIDTH), - .DATA_FRAC_WIDTH(IN_FRAC_WIDTH), - - .WQ_WIDTH(WQ_WIDTH), - .WQ_FRAC_WIDTH(WQ_FRAC_WIDTH), - .WK_WIDTH(WK_WIDTH), - .WK_FRAC_WIDTH(WK_FRAC_WIDTH), - .WV_WIDTH(WV_WIDTH), - .WV_FRAC_WIDTH(WV_FRAC_WIDTH), - - .BQ_WIDTH(BQ_WIDTH), - .BQ_FRAC_WIDTH(BQ_FRAC_WIDTH), - .BK_WIDTH(BK_WIDTH), - .BK_FRAC_WIDTH(BK_FRAC_WIDTH), - .BV_WIDTH(BV_WIDTH), - .BV_FRAC_WIDTH(BV_FRAC_WIDTH), - - .DQ_WIDTH(DQ_WIDTH), - .DQ_FRAC_WIDTH(DQ_FRAC_WIDTH), - .DK_WIDTH(DK_WIDTH), - .DK_FRAC_WIDTH(DK_FRAC_WIDTH), - .DV_WIDTH(DV_WIDTH), - .DV_FRAC_WIDTH(DV_FRAC_WIDTH), - - .DS_WIDTH(DS_WIDTH), - .DS_FRAC_WIDTH(DS_FRAC_WIDTH), - .EXP_WIDTH(EXP_WIDTH), - .EXP_FRAC_WIDTH(EXP_FRAC_WIDTH), - .DIV_WIDTH(DIV_WIDTH), - .DS_SOFTMAX_WIDTH(DS_SOFTMAX_WIDTH), - .DS_SOFTMAX_FRAC_WIDTH(DS_SOFTMAX_FRAC_WIDTH), - .DZ_WIDTH(DZ_WIDTH), - .DZ_FRAC_WIDTH(DZ_FRAC_WIDTH), - .IN_PARALLELISM(UNROLL_IN_Y), - .IN_NUM_PARALLELISM(ITER_IN_Y), - .IN_SIZE(UNROLL_IN_X), - .IN_DEPTH(ITER_IN_X), - .W_PARALLELISM(UNROLL_WQKV_Y), - .W_NUM_PARALLELISM(ITER_WQKV_Y) - ) satt_inst ( - .data_in(h_data_in), - 
.data_in_valid(h_data_in_valid), - .data_in_ready(h_data_in_ready), - .weight_q(h_weight_q), - .weight_q_valid(h_weight_q_valid), - .weight_q_ready(h_weight_q_ready), - .weight_k(h_weight_k), - .weight_k_valid(h_weight_k_valid), - .weight_k_ready(h_weight_k_ready), - .weight_v(h_weight_v), - .weight_v_valid(h_weight_v_valid), - .weight_v_ready(h_weight_v_ready), - .bias_q(h_bias_q), - .bias_q_valid(h_bias_q_valid), - .bias_q_ready(h_bias_q_ready), - .bias_k(h_bias_k), - .bias_k_valid(h_bias_k_valid), - .bias_k_ready(h_bias_k_ready), - .bias_v(h_bias_v), - .bias_v_valid(h_bias_v_valid), - .bias_v_ready(h_bias_v_ready), - .data_out(h_sa_out), - .data_out_valid(h_sa_out_valid), - .data_out_ready(h_sa_out_ready), - .* - ); - end - assign weight_q_ready = head[0].h_weight_q_ready; - assign weight_k_ready = head[0].h_weight_k_ready; - assign weight_v_ready = head[0].h_weight_v_ready; - assign bias_q_ready = head[0].h_bias_q_ready; - assign bias_k_ready = head[0].h_bias_k_ready; - assign bias_v_ready = head[0].h_bias_v_ready; - assign data_in_ready = head[0].h_data_in_ready; - - //transpose here - for (genvar i = 0; i < UNROLL_IN_Y; i++) - for (genvar j = 0; j < NUM_HEADS; j++) - for (genvar k = 0; k < UNROLL_WQKV_Y; k++) - assign sa_out[(i*NUM_HEADS+j)*UNROLL_WQKV_Y+k] = head[j].h_sa_out[i*UNROLL_WQKV_Y+k]; - - assign sa_out_valid = head[0].h_sa_out_valid; - fixed_2d_linear #( - .IN_WIDTH(DZ_WIDTH), - .IN_FRAC_WIDTH(DZ_FRAC_WIDTH), - .WEIGHT_WIDTH(WP_WIDTH), - .WEIGHT_FRAC_WIDTH(WP_FRAC_WIDTH), - .BIAS_WIDTH(BP_WIDTH), - .BIAS_FRAC_WIDTH(BP_FRAC_WIDTH), - .OUT_WIDTH(OUT_WIDTH), - .OUT_FRAC_WIDTH(OUT_FRAC_WIDTH), - - .IN_Y(IN_Y), - .UNROLL_IN_Y(UNROLL_IN_Y), - .IN_X(NUM_HEADS * WQKV_Y), - .UNROLL_IN_X(NUM_HEADS * UNROLL_WQKV_Y), - .W_Y(WP_Y), - .UNROLL_W_Y(UNROLL_WP_Y) - ) inst_fmmc_k ( - .clk(clk), - .rst(rst), - .data_in(sa_out), - .data_in_valid(sa_out_valid), - .data_in_ready(sa_out_ready), - .weight(weight_p), - .weight_valid(weight_p_valid), - 
.weight_ready(weight_p_ready), - .bias(bias_p), - .bias_valid(bias_p_valid), - .bias_ready(bias_p_ready), - .data_out(data_out), - .data_out_valid(data_out_valid), - .data_out_ready(data_out_ready) - ); -endmodule - diff --git a/src/mase_components/vision_models/vit/rtl/fixed_patch_embed.sv b/src/mase_components/vision_models/vit/rtl/fixed_patch_embed.sv deleted file mode 100644 index 8294e157f..000000000 --- a/src/mase_components/vision_models/vit/rtl/fixed_patch_embed.sv +++ /dev/null @@ -1,77 +0,0 @@ -`timescale 1ns / 1ps -module fixed_patch_embed #( - parameter IN_WIDTH = 6, - parameter IN_FRAC_WIDTH = 1, - parameter W_WIDTH = 4, - parameter W_FRAC_WIDTH = 1, - parameter BIAS_WIDTH = 4, - parameter BIAS_FRAC_WIDTH = 1, - parameter OUT_WIDTH = 6, - parameter OUT_FRAC_WIDTH = 1, - - parameter IN_C = 3, - parameter IN_Y = 16, - parameter IN_X = 16, - - parameter OUT_C = 4, - parameter KERNEL_SIZE = 2, - parameter KERNEL_Y = KERNEL_SIZE, - parameter KERNEL_X = KERNEL_SIZE, - - parameter PADDING_Y = 0, - parameter PADDING_X = 0, - - parameter UNROLL_KERNEL_OUT = 2, - parameter UNROLL_OUT_C = 2, - parameter UNROLL_IN_C = 2, - - parameter SLIDING_NUM = 8, - - parameter STRIDE = KERNEL_SIZE -) ( - input clk, - input rst, - - input [IN_WIDTH - 1:0] data_in_0 [UNROLL_IN_C - 1 : 0], - input data_in_0_valid, - output data_in_0_ready, - - input [W_WIDTH-1:0] weight [UNROLL_KERNEL_OUT * UNROLL_OUT_C -1:0], - input weight_valid, - output weight_ready, - - input [BIAS_WIDTH-1:0] bias [UNROLL_OUT_C-1:0], - input bias_valid, - output bias_ready, - - output [OUT_WIDTH - 1:0] data_out_0 [UNROLL_OUT_C - 1:0], - output data_out_0_valid, - input data_out_0_ready -); - - convolution #( - .DATA_WIDTH(IN_WIDTH), - .DATA_FRAC_WIDTH(IN_FRAC_WIDTH), - .W_WIDTH(W_WIDTH), - .W_FRAC_WIDTH(W_FRAC_WIDTH), - .BIAS_WIDTH(BIAS_WIDTH), - .BIAS_FRAC_WIDTH(BIAS_FRAC_WIDTH), - .OUT_WIDTH(OUT_WIDTH), - .OUT_FRAC_WIDTH(OUT_FRAC_WIDTH), - .IN_X(IN_X), - .IN_Y(IN_Y), - .IN_C(IN_C), - .KERNEL_X(KERNEL_X), 
- .KERNEL_Y(KERNEL_Y), - .OUT_C(OUT_C), - .UNROLL_IN_C(UNROLL_IN_C), - .UNROLL_KERNEL_OUT(UNROLL_KERNEL_OUT), - .UNROLL_OUT_C(UNROLL_OUT_C), - .SLIDING_NUM(SLIDING_NUM), - .STRIDE(STRIDE), - .PADDING_Y(PADDING_Y), - .PADDING_X(PADDING_X) - ) conv_inst ( - .* - ); -endmodule diff --git a/src/mase_components/vision_models/vit/rtl/fixed_pvt.sv b/src/mase_components/vision_models/vit/rtl/fixed_pvt.sv deleted file mode 100644 index 8e9c840f7..000000000 --- a/src/mase_components/vision_models/vit/rtl/fixed_pvt.sv +++ /dev/null @@ -1,411 +0,0 @@ -`timescale 1ns / 1ps -module fixed_pvt #( - parameter IN_WIDTH = 8, - parameter IN_FRAC_WIDTH = 3, - parameter OUT_WIDTH = 8, - parameter OUT_FRAC_WIDTH = 3, - //patch has bias - parameter PATCH_EMBED_W_WIDTH_3 = 8, - parameter PATCH_EMBED_W_FRAC_WIDTH_3 = 6, - parameter PATCH_EMBED_B_WIDTH_3 = 8, - parameter PATCH_EMBED_B_FRAC_WIDTH_3 = 5, - parameter POS_ADD_IN_WIDTH_3 = 8, - parameter POS_ADD_IN_FRAC_WIDTH_3 = 5, - - // block has bias - parameter BLOCK_IN_WIDTH = 8, - parameter BLOCK_IN_FRAC_WIDTH = 3, - - parameter BLOCK_AF_MSA_ADD_WIDTH = 8, - parameter BLOCK_AF_MSA_ADD_FRAC_WIDTH = 3, - parameter BLOCK_MSA_IN_WIDTH = 8, - parameter BLOCK_MSA_IN_FRAC_WIDTH = 3, - - parameter BLOCK_WQ_WIDTH = 6, - parameter BLOCK_WQ_FRAC_WIDTH = 4, - parameter BLOCK_WK_WIDTH = 6, - parameter BLOCK_WK_FRAC_WIDTH = 4, - parameter BLOCK_WV_WIDTH = 6, - parameter BLOCK_WV_FRAC_WIDTH = 4, - - parameter BLOCK_BQ_WIDTH = 6, - parameter BLOCK_BQ_FRAC_WIDTH = 4, - parameter BLOCK_BK_WIDTH = 6, - parameter BLOCK_BK_FRAC_WIDTH = 4, - parameter BLOCK_BV_WIDTH = 6, - parameter BLOCK_BV_FRAC_WIDTH = 4, - - parameter BLOCK_WP_WIDTH = 6, - parameter BLOCK_WP_FRAC_WIDTH = 4, - parameter BLOCK_BP_WIDTH = 6, - parameter BLOCK_BP_FRAC_WIDTH = 4, - - parameter BLOCK_DQ_WIDTH = 8, - parameter BLOCK_DQ_FRAC_WIDTH = 3, - parameter BLOCK_DK_WIDTH = 8, - parameter BLOCK_DK_FRAC_WIDTH = 3, - parameter BLOCK_DV_WIDTH = 8, - parameter BLOCK_DV_FRAC_WIDTH = 3, - - 
parameter BLOCK_DS_WIDTH = 8, - parameter BLOCK_DS_FRAC_WIDTH = 3, - parameter BLOCK_EXP_WIDTH = 8, - parameter BLOCK_EXP_FRAC_WIDTH = 5, - parameter BLOCK_DIV_WIDTH = 9, - parameter BLOCK_DS_SOFTMAX_WIDTH = 8, - parameter BLOCK_DS_SOFTMAX_FRAC_WIDTH = 3, - parameter BLOCK_DZ_WIDTH = 8, - parameter BLOCK_DZ_FRAC_WIDTH = 3, - - parameter BLOCK_AF_MLP_IN_WIDTH = 9, - parameter BLOCK_AF_MLP_IN_FRAC_WIDTH = 3, - parameter BLOCK_AF_MLP_ADD_WIDTH = 8, - parameter BLOCK_AF_MLP_ADD_FRAC_WIDTH = 3, - - parameter BLOCK_MLP_IN_WIDTH = 8, - parameter BLOCK_MLP_IN_FRAC_WIDTH = 3, - - parameter BLOCK_WEIGHT_I2H_WIDTH = 6, - parameter BLOCK_WEIGHT_I2H_FRAC_WIDTH = 4, - parameter BLOCK_WEIGHT_H2O_WIDTH = 6, - parameter BLOCK_WEIGHT_H2O_FRAC_WIDTH = 4, - parameter BLOCK_MLP_HIDDEN_WIDTH = 8, - parameter BLOCK_MLP_HIDDEN_FRAC_WIDTH = 3, - parameter BLOCK_MLP_HAS_BIAS = 1, - parameter BLOCK_BIAS_I2H_WIDTH = 6, - parameter BLOCK_BIAS_I2H_FRAC_WIDTH = 4, - parameter BLOCK_BIAS_H2O_WIDTH = 6, - parameter BLOCK_BIAS_H2O_FRAC_WIDTH = 4, - //head has bias - parameter HEAD_IN_WIDTH = 8, - parameter HEAD_IN_FRAC_WIDTH = 3, - parameter HEAD_W_WIDTH = 8, - parameter HEAD_W_FRAC_WIDTH = 4, - parameter HEAD_B_WIDTH = 8, - parameter HEAD_B_FRAC_WIDTH = 4, - - // patch embedding - // INPUT = IN_C * IN_Y * IN_X - // output = OUT_Y * OUT_X( OUT_Y is NUM_PATCH, OUT_X is EMBED_DIM) - parameter PATCH_EMBED_IN_C_3 = 3, - parameter PATCH_EMBED_IN_Y_3 = 224, - parameter PATCH_EMEBD_IN_X_3 = 224, - parameter PATCH_SIZE_3 = 16, - parameter PATCH_EMEBD_NUM_PATCH_3 = PATCH_EMBED_IN_Y_3*PATCH_EMEBD_IN_X_3/(PATCH_SIZE_3*PATCH_SIZE_3), - parameter PATCH_EMBED_EMBED_DIM_3 = 384, - - parameter NUM_HEADS = 6, - parameter MLP_RATIO = 2, - parameter NUM_CLASSES = 10, - - parameter PATCH_EMEBD_UNROLL_KERNEL_OUT_3 = 24, - parameter PATCH_EMEBD_UNROLL_IN_C_3 = 3, - parameter PATCH_EMBED_UNROLL_EMBED_DIM_3 = 8, - parameter BLOCK_UNROLL_WQKV_DIM = 2, - parameter BLOCK_UNROLL_HIDDEN_FEATURES = 2, - parameter 
HEAD_UNROLL_OUT_X = 1, - - parameter BLOCK_IN_NUM = PATCH_EMEBD_NUM_PATCH_3 + 1, // cls token - parameter BLOCK_IN_DIM = PATCH_EMBED_EMBED_DIM_3, - // num_heads * wqkv_dim = IN_DIM - - parameter BLOCK_UNROLL_IN_NUM = 1, - parameter BLOCK_UNROLL_IN_DIM = PATCH_EMBED_UNROLL_EMBED_DIM_3, - // head - parameter HEAD_IN_Y = BLOCK_IN_NUM, - parameter HEAD_IN_X = BLOCK_IN_DIM, - parameter HEAD_OUT_X = NUM_CLASSES, - parameter HEAD_UNROLL_IN_Y = 1, - parameter HEAD_UNROLL_IN_X = BLOCK_UNROLL_IN_DIM - -) ( - input clk, - input rst, - // patch embedding - input [PATCH_EMBED_W_WIDTH_3-1:0] patch_embed_weight_3 [PATCH_EMEBD_UNROLL_KERNEL_OUT_3 * PATCH_EMBED_UNROLL_EMBED_DIM_3 -1:0], - input patch_embed_weight_3_valid, - output patch_embed_weight_3_ready, - - input [PATCH_EMBED_B_WIDTH_3-1:0] patch_embed_bias_3 [PATCH_EMBED_UNROLL_EMBED_DIM_3-1:0], - input patch_embed_bias_3_valid, - output patch_embed_bias_3_ready, - - // position embedding - input [POS_ADD_IN_WIDTH_3-1:0] cls_token[PATCH_EMBED_UNROLL_EMBED_DIM_3-1:0], - input cls_token_valid, - output cls_token_ready, - - input [POS_ADD_IN_WIDTH_3-1:0] pos_embed_in[PATCH_EMBED_UNROLL_EMBED_DIM_3-1:0], - input pos_embed_in_valid, - output pos_embed_in_ready, - //msa - input [BLOCK_IN_WIDTH - 1:0] af_msa_weight[BLOCK_UNROLL_IN_NUM * BLOCK_UNROLL_IN_DIM - 1:0], - input af_msa_weight_valid, - output af_msa_weight_ready, - input [BLOCK_AF_MSA_ADD_WIDTH - 1:0] af_msa_bias [BLOCK_UNROLL_IN_NUM * BLOCK_UNROLL_IN_DIM - 1:0], - input af_msa_bias_valid, - output af_msa_bias_ready, - - input [BLOCK_WQ_WIDTH - 1:0] weight_q[NUM_HEADS * BLOCK_UNROLL_WQKV_DIM * BLOCK_UNROLL_IN_DIM -1 : 0], - input weight_q_valid, - output weight_q_ready, - input [BLOCK_WK_WIDTH - 1:0] weight_k[NUM_HEADS * BLOCK_UNROLL_WQKV_DIM * BLOCK_UNROLL_IN_DIM -1 : 0], - input weight_k_valid, - output weight_k_ready, - input [BLOCK_WV_WIDTH - 1:0] weight_v[NUM_HEADS * BLOCK_UNROLL_WQKV_DIM * BLOCK_UNROLL_IN_DIM -1 : 0], - input weight_v_valid, - output weight_v_ready, - 
input [BLOCK_WP_WIDTH - 1:0] weight_p[BLOCK_UNROLL_IN_DIM * NUM_HEADS * BLOCK_UNROLL_WQKV_DIM -1 : 0], - input weight_p_valid, - output weight_p_ready, - input [BLOCK_BQ_WIDTH - 1:0] bias_q[NUM_HEADS * BLOCK_UNROLL_WQKV_DIM -1 : 0], - input bias_q_valid, - output bias_q_ready, - input [BLOCK_BK_WIDTH - 1:0] bias_k[NUM_HEADS * BLOCK_UNROLL_WQKV_DIM -1 : 0], - input bias_k_valid, - output bias_k_ready, - input [BLOCK_BV_WIDTH - 1:0] bias_v[NUM_HEADS * BLOCK_UNROLL_WQKV_DIM -1 : 0], - input bias_v_valid, - output bias_v_ready, - input [BLOCK_BP_WIDTH - 1:0] bias_p[BLOCK_UNROLL_IN_DIM -1 : 0], - input bias_p_valid, - output bias_p_ready, - //mlp - input [BLOCK_AF_MLP_IN_WIDTH - 1:0] af_mlp_weight [BLOCK_UNROLL_IN_NUM * BLOCK_UNROLL_IN_DIM - 1:0], - input af_mlp_weight_valid, - output af_mlp_weight_ready, - input [BLOCK_AF_MLP_ADD_WIDTH - 1:0] af_mlp_bias [BLOCK_UNROLL_IN_NUM * BLOCK_UNROLL_IN_DIM - 1:0], - input af_mlp_bias_valid, - output af_mlp_bias_ready, - - input [BLOCK_WEIGHT_I2H_WIDTH-1:0] weight_in2hidden[BLOCK_UNROLL_HIDDEN_FEATURES * BLOCK_UNROLL_IN_DIM - 1:0], - input weight_in2hidden_valid, - output weight_in2hidden_ready, - input [BLOCK_WEIGHT_H2O_WIDTH-1:0] weight_hidden2out[BLOCK_UNROLL_IN_DIM * BLOCK_UNROLL_HIDDEN_FEATURES - 1:0], - input weight_hidden2out_valid, - output weight_hidden2out_ready, - input [BLOCK_BIAS_I2H_WIDTH-1:0] bias_in2hidden[BLOCK_UNROLL_HIDDEN_FEATURES - 1:0], - input bias_in2hidden_valid, - output bias_in2hidden_ready, - input [BLOCK_BIAS_H2O_WIDTH-1:0] bias_hidden2out[BLOCK_UNROLL_IN_DIM - 1:0], - input bias_hidden2out_valid, - output bias_hidden2out_ready, - // head - input [HEAD_W_WIDTH-1:0] head_weight[HEAD_UNROLL_OUT_X * HEAD_UNROLL_IN_X - 1:0], - input head_weight_valid, - output head_weight_ready, - input [HEAD_B_WIDTH-1:0] head_bias[HEAD_UNROLL_OUT_X - 1:0], - input head_bias_valid, - output head_bias_ready, - - - input [IN_WIDTH -1:0] data_in[PATCH_EMEBD_UNROLL_IN_C_3 - 1 : 0], - input data_in_valid, - output 
data_in_ready, - - output [OUT_WIDTH -1:0] data_out[HEAD_UNROLL_OUT_X - 1:0], - output data_out_valid, - input data_out_ready -); - logic [POS_ADD_IN_WIDTH_3 - 1:0] patch_embed_out_3[PATCH_EMBED_UNROLL_EMBED_DIM_3 - 1:0]; - logic patch_embed_out_3_valid, patch_embed_out_3_ready; - fixed_patch_embed #( - .IN_WIDTH(IN_WIDTH), - .IN_FRAC_WIDTH(IN_FRAC_WIDTH), - .W_WIDTH(PATCH_EMBED_W_WIDTH_3), - .W_FRAC_WIDTH(PATCH_EMBED_W_FRAC_WIDTH_3), - .BIAS_WIDTH(PATCH_EMBED_B_WIDTH_3), - .BIAS_FRAC_WIDTH(PATCH_EMBED_B_FRAC_WIDTH_3), - .OUT_WIDTH(POS_ADD_IN_WIDTH_3), - .OUT_FRAC_WIDTH(POS_ADD_IN_FRAC_WIDTH_3), - .IN_C(PATCH_EMBED_IN_C_3), - .IN_Y(PATCH_EMBED_IN_Y_3), - .IN_X(PATCH_EMEBD_IN_X_3), - .OUT_C(PATCH_EMBED_EMBED_DIM_3), - .KERNEL_SIZE(PATCH_SIZE_3), - .UNROLL_KERNEL_OUT(PATCH_EMEBD_UNROLL_KERNEL_OUT_3), - .UNROLL_OUT_C(PATCH_EMBED_UNROLL_EMBED_DIM_3), - .UNROLL_IN_C(PATCH_EMEBD_UNROLL_IN_C_3), - .SLIDING_NUM(PATCH_EMEBD_NUM_PATCH_3) - ) patemb_inst ( - .weight(patch_embed_weight_3), - .weight_valid(patch_embed_weight_3_valid), - .weight_ready(patch_embed_weight_3_ready), - - .bias(patch_embed_bias_3), - .bias_valid(patch_embed_bias_3_valid), - .bias_ready(patch_embed_bias_3_ready), - - .data_out(patch_embed_out_3), - .data_out_valid(patch_embed_out_3_valid), - .data_out_ready(patch_embed_out_3_ready), - .* - ); - logic [POS_ADD_IN_WIDTH_3 - 1:0] pos_data_in[PATCH_EMBED_UNROLL_EMBED_DIM_3 - 1:0]; - logic pos_data_in_valid, pos_data_in_ready; - // cls token - wrap_data #( - .IN_WIDTH(POS_ADD_IN_WIDTH_3), - .WRAP_Y(1), - .IN_Y(PATCH_EMEBD_NUM_PATCH_3), - .IN_X(PATCH_EMBED_EMBED_DIM_3), - .UNROLL_IN_X(PATCH_EMBED_UNROLL_EMBED_DIM_3) - ) cls_inst ( - .wrap_in(cls_token), - .wrap_in_valid(cls_token_valid), - .wrap_in_ready(cls_token_ready), - .data_in(patch_embed_out_3), - .data_in_valid(patch_embed_out_3_valid), - .data_in_ready(patch_embed_out_3_ready), - .data_out(pos_data_in), - .data_out_valid(pos_data_in_valid), - .data_out_ready(pos_data_in_ready), - .* - ); - // 
position embedding - logic [POS_ADD_IN_WIDTH_3 + 1 - 1:0] pos_embed_out[PATCH_EMBED_UNROLL_EMBED_DIM_3 - 1:0]; - for (genvar i = 0; i < PATCH_EMBED_UNROLL_EMBED_DIM_3; i++) - assign pos_embed_out[i] = {pos_embed_in[i][IN_WIDTH-1], pos_embed_in[i]} + {pos_data_in[i][IN_WIDTH-1], pos_data_in[i]}; - logic pos_embed_out_valid, pos_embed_out_ready; - - join2 #() fmm_join_inst ( - .data_in_ready ({pos_data_in_ready, pos_embed_in_ready}), - .data_in_valid ({pos_data_in_valid, pos_embed_in_valid}), - .data_out_valid(pos_embed_out_valid), - .data_out_ready(pos_embed_out_ready) - ); - - logic [BLOCK_IN_WIDTH - 1:0] block_in [PATCH_EMBED_UNROLL_EMBED_DIM_3 - 1:0]; - - logic [ HEAD_IN_WIDTH - 1:0] block_out[PATCH_EMBED_UNROLL_EMBED_DIM_3 - 1:0]; - logic block_out_valid, block_out_ready; - fixed_rounding #( - .IN_SIZE(PATCH_EMBED_UNROLL_EMBED_DIM_3), - .IN_WIDTH(POS_ADD_IN_WIDTH_3 + 1), - .IN_FRAC_WIDTH(POS_ADD_IN_FRAC_WIDTH_3), - .OUT_WIDTH(BLOCK_IN_WIDTH), - .OUT_FRAC_WIDTH(BLOCK_IN_FRAC_WIDTH) - ) head_cast ( - .data_in (pos_embed_out), - .data_out(block_in) - ); - fixed_block #( - .IN_WIDTH (BLOCK_IN_WIDTH), - .IN_FRAC_WIDTH (BLOCK_IN_FRAC_WIDTH), - .AF_MSA_ADD_WIDTH (BLOCK_AF_MSA_ADD_WIDTH), - .AF_MSA_ADD_FRAC_WIDTH (BLOCK_AF_MSA_ADD_FRAC_WIDTH), - .MSA_IN_WIDTH (BLOCK_MSA_IN_WIDTH), - .MSA_IN_FRAC_WIDTH (BLOCK_MSA_IN_FRAC_WIDTH), - .WQ_WIDTH (BLOCK_WQ_WIDTH), - .WQ_FRAC_WIDTH (BLOCK_WQ_FRAC_WIDTH), - .WK_WIDTH (BLOCK_WK_WIDTH), - .WK_FRAC_WIDTH (BLOCK_WK_FRAC_WIDTH), - .WV_WIDTH (BLOCK_WV_WIDTH), - .WV_FRAC_WIDTH (BLOCK_WV_FRAC_WIDTH), - .BQ_WIDTH (BLOCK_BQ_WIDTH), - .BQ_FRAC_WIDTH (BLOCK_BQ_FRAC_WIDTH), - .BK_WIDTH (BLOCK_BK_WIDTH), - .BK_FRAC_WIDTH (BLOCK_BK_FRAC_WIDTH), - .BV_WIDTH (BLOCK_BV_WIDTH), - .BV_FRAC_WIDTH (BLOCK_BV_FRAC_WIDTH), - .WP_WIDTH (BLOCK_WP_WIDTH), - .WP_FRAC_WIDTH (BLOCK_WP_FRAC_WIDTH), - .BP_WIDTH (BLOCK_BP_WIDTH), - .BP_FRAC_WIDTH (BLOCK_BP_FRAC_WIDTH), - .DQ_WIDTH (BLOCK_DQ_WIDTH), - .DQ_FRAC_WIDTH (BLOCK_DQ_FRAC_WIDTH), - .DK_WIDTH 
(BLOCK_DK_WIDTH), - .DK_FRAC_WIDTH (BLOCK_DK_FRAC_WIDTH), - .DV_WIDTH (BLOCK_DV_WIDTH), - .DV_FRAC_WIDTH (BLOCK_DV_FRAC_WIDTH), - .DS_WIDTH (BLOCK_DS_WIDTH), - .DS_FRAC_WIDTH (BLOCK_DS_FRAC_WIDTH), - .EXP_WIDTH (BLOCK_EXP_WIDTH), - .EXP_FRAC_WIDTH (BLOCK_EXP_FRAC_WIDTH), - .DIV_WIDTH (BLOCK_DIV_WIDTH), - .DS_SOFTMAX_WIDTH (BLOCK_DS_SOFTMAX_WIDTH), - .DS_SOFTMAX_FRAC_WIDTH (BLOCK_DS_SOFTMAX_FRAC_WIDTH), - .DZ_WIDTH (BLOCK_DZ_WIDTH), - .DZ_FRAC_WIDTH (BLOCK_DZ_FRAC_WIDTH), - .AF_MLP_IN_WIDTH (BLOCK_AF_MLP_IN_WIDTH), - .AF_MLP_IN_FRAC_WIDTH (BLOCK_AF_MLP_IN_FRAC_WIDTH), - .AF_MLP_ADD_WIDTH (BLOCK_AF_MLP_ADD_WIDTH), - .AF_MLP_ADD_FRAC_WIDTH (BLOCK_AF_MLP_ADD_FRAC_WIDTH), - .MLP_IN_WIDTH (BLOCK_MLP_IN_WIDTH), - .MLP_IN_FRAC_WIDTH (BLOCK_MLP_IN_FRAC_WIDTH), - .WEIGHT_I2H_WIDTH (BLOCK_WEIGHT_I2H_WIDTH), - .WEIGHT_I2H_FRAC_WIDTH (BLOCK_WEIGHT_I2H_FRAC_WIDTH), - .WEIGHT_H2O_WIDTH (BLOCK_WEIGHT_H2O_WIDTH), - .WEIGHT_H2O_FRAC_WIDTH (BLOCK_WEIGHT_H2O_FRAC_WIDTH), - .MLP_HAS_BIAS (BLOCK_MLP_HAS_BIAS), - .BIAS_I2H_WIDTH (BLOCK_BIAS_I2H_WIDTH), - .BIAS_I2H_FRAC_WIDTH (BLOCK_BIAS_I2H_FRAC_WIDTH), - .BIAS_H2O_WIDTH (BLOCK_BIAS_H2O_WIDTH), - .BIAS_H2O_FRAC_WIDTH (BLOCK_BIAS_H2O_FRAC_WIDTH), - .HIDDEN_WIDTH (BLOCK_MLP_HIDDEN_WIDTH), - .HIDDEN_FRAC_WIDTH (BLOCK_MLP_HIDDEN_FRAC_WIDTH), - .OUT_WIDTH (HEAD_IN_WIDTH), - .OUT_FRAC_WIDTH (HEAD_IN_FRAC_WIDTH), - .IN_NUM (BLOCK_IN_NUM), - .IN_DIM (BLOCK_IN_DIM), - .NUM_HEADS (NUM_HEADS), - .MLP_RATIO (MLP_RATIO), - .UNROLL_IN_NUM (BLOCK_UNROLL_IN_NUM), - .UNROLL_IN_DIM (BLOCK_UNROLL_IN_DIM), - .UNROLL_WQKV_DIM (BLOCK_UNROLL_WQKV_DIM), - .UNROLL_HIDDEN_FEATURES(BLOCK_UNROLL_HIDDEN_FEATURES) - ) block_inst ( - .data_in(block_in), - .data_in_valid(pos_embed_out_valid), - .data_in_ready(pos_embed_out_ready), - .data_out(block_out), - .data_out_valid(block_out_valid), - .data_out_ready(block_out_ready), - .* - ); - - logic [HEAD_IN_WIDTH - 1:0] head_in[PATCH_EMBED_UNROLL_EMBED_DIM_3 - 1:0]; - logic head_in_valid, head_in_ready; - cut_data #( - 
.IN_WIDTH(HEAD_IN_WIDTH), - .IN_Y(HEAD_IN_Y), - .IN_X(HEAD_IN_X), - .UNROLL_IN_X(HEAD_UNROLL_IN_X) - ) cut_inst ( - .data_in(block_out), - .data_in_valid(block_out_valid), - .data_in_ready(block_out_ready), - .data_out(head_in), - .data_out_valid(head_in_valid), - .data_out_ready(head_in_ready), - .* - ); - fixed_2d_linear #( - .IN_WIDTH(HEAD_IN_WIDTH), - .IN_FRAC_WIDTH(HEAD_IN_FRAC_WIDTH), - .WEIGHT_WIDTH(HEAD_W_WIDTH), - .WEIGHT_FRAC_WIDTH(HEAD_W_FRAC_WIDTH), - .HAS_BIAS(1), - .BIAS_WIDTH(HEAD_B_WIDTH), - .BIAS_FRAC_WIDTH(HEAD_B_FRAC_WIDTH), - .OUT_WIDTH(OUT_WIDTH), - .OUT_FRAC_WIDTH(OUT_FRAC_WIDTH), - .IN_Y(1), - .UNROLL_IN_Y(1), - .IN_X(HEAD_IN_X), - .UNROLL_IN_X(HEAD_UNROLL_IN_X), - .W_Y(NUM_CLASSES), - .UNROLL_W_Y(HEAD_UNROLL_OUT_X) - ) head_inst ( - .data_in(head_in), - .data_in_valid(head_in_valid), - .data_in_ready(head_in_ready), - .weight(head_weight), - .weight_valid(head_weight_valid), - .weight_ready(head_weight_ready), - .bias(head_bias), - .bias_valid(head_bias_valid), - .bias_ready(head_bias_ready), - .* - ); -endmodule diff --git a/src/mase_components/vision_models/vit/rtl/fixed_vit_attention.sv b/src/mase_components/vision_models/vit/rtl/fixed_vit_attention.sv new file mode 100644 index 000000000..387df19b1 --- /dev/null +++ b/src/mase_components/vision_models/vit/rtl/fixed_vit_attention.sv @@ -0,0 +1,385 @@ +`timescale 1ns / 1ps +module fixed_vit_attention #( + // currently assume weights are all transposed + // currently force weight dim keep same + + parameter NUM_HEADS = 4, + + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 8, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 2, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 4, + parameter DATA_IN_0_PARALLELISM_DIM_1 = 2, + parameter DATA_IN_0_PRECISION_0 = 8, + parameter DATA_IN_0_PRECISION_1 = 3, + + parameter WEIGHT_TENSOR_SIZE_DIM_0 = 8, + parameter WEIGHT_TENSOR_SIZE_DIM_1 = 8, + parameter WEIGHT_PARALLELISM_DIM_0 = 4, + parameter WEIGHT_PARALLELISM_DIM_1 = 4, + parameter WEIGHT_PRECISION_0 = 8, + parameter 
WEIGHT_PRECISION_1 = 3, + + parameter HAS_BIAS = 1, + parameter BIAS_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1, + parameter BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter BIAS_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1, + parameter BIAS_PARALLELISM_DIM_1 = 1, + parameter BIAS_PRECISION_0 = 8, + parameter BIAS_PRECISION_1 = 3, + + parameter QKV_PRECISION_0 = 16, + parameter QKV_PRECISION_1 = 3, + parameter QKMM_OUT_PRECISION_0 = 16, + parameter QKMM_OUT_PRECISION_1 = 3, + parameter SOFTMAX_EXP_PRECISION_0 = 16, + parameter SOFTMAX_EXP_PRECISION_1 = 3, + parameter SOFTMAX_OUT_DATA_PRECISION_1 = 3, + parameter SVMM_OUT_PRECISION_0 = 8, + parameter SVMM_OUT_PRECISION_1 = 3, + + parameter WEIGHT_PROJ_PRECISION_0 = 12, + parameter WEIGHT_PROJ_PRECISION_1 = 3, + + parameter WEIGHT_PROJ_TENSOR_SIZE_DIM_0 = 8, + parameter WEIGHT_PROJ_TENSOR_SIZE_DIM_1 = 8, + parameter WEIGHT_PROJ_PARALLELISM_DIM_0 = 4, + parameter WEIGHT_PROJ_PARALLELISM_DIM_1 = 4, + + parameter BIAS_PROJ_PRECISION_0 = 8, + parameter BIAS_PROJ_PRECISION_1 = 3, + parameter BIAS_PROJ_TENSOR_SIZE_DIM_0 = WEIGHT_PROJ_TENSOR_SIZE_DIM_1, + parameter BIAS_PROJ_TENSOR_SIZE_DIM_1 = 1, + parameter BIAS_PROJ_PARALLELISM_DIM_0 = WEIGHT_PROJ_PARALLELISM_DIM_1, + parameter BIAS_PROJ_PARALLELISM_DIM_1 = 1, + + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_PROJ_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PROJ_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0, + parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1 + +) ( + input logic clk, + input logic rst, + + input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + + // Query weights + input logic [WEIGHT_PRECISION_0-1:0] query_weight 
[WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic query_weight_valid, + output logic query_weight_ready, + + // Query bias + input logic [BIAS_PRECISION_0-1:0] query_bias [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic query_bias_valid, + output logic query_bias_ready, + + // Key weights + input logic [WEIGHT_PRECISION_0-1:0] key_weight [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic key_weight_valid, + output logic key_weight_ready, + + // Key bias + input logic [BIAS_PRECISION_0-1:0] key_bias [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic key_bias_valid, + output logic key_bias_ready, + + // Value weights + input logic [WEIGHT_PRECISION_0-1:0] value_weight [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic value_weight_valid, + output logic value_weight_ready, + + // Value bias + input logic [BIAS_PRECISION_0-1:0] value_bias [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic value_bias_valid, + output logic value_bias_ready, + + // Proj weights + input logic [WEIGHT_PROJ_PRECISION_0-1:0] proj_weight [WEIGHT_PROJ_PARALLELISM_DIM_0 * WEIGHT_PROJ_PARALLELISM_DIM_1-1:0], + input logic proj_weight_valid, + output logic proj_weight_ready, + + // Proj bias + input logic [BIAS_PROJ_PRECISION_0-1:0] proj_bias [BIAS_PROJ_PARALLELISM_DIM_0 * BIAS_PROJ_PARALLELISM_DIM_1 -1:0], + input logic proj_bias_valid, + output logic proj_bias_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0], + output logic data_out_0_valid, + input logic data_out_0_ready +); + + // * Declarations + // * ================================================================= + + localparam HEAD_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1; + localparam HEAD_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1; + localparam HEAD_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1; + localparam 
HEAD_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1; + // Query + logic [QKV_PRECISION_0-1:0] query[DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0-1:0]; + logic joint_query_valid, joint_query_ready; + logic [NUM_HEADS-1:0] split_query_valid, split_query_ready; + + // Key + logic [QKV_PRECISION_0-1:0] key[DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0-1:0]; + logic joint_key_valid, joint_key_ready; + logic [NUM_HEADS-1:0] split_key_valid, split_key_ready; + + // Value + logic [QKV_PRECISION_0-1:0] value[DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0-1:0]; + logic joint_value_valid, joint_value_ready; + logic [NUM_HEADS-1:0] split_value_valid, split_value_ready; + + logic [QKV_PRECISION_0-1:0] fifo_key[DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0-1:0]; + logic fifo_key_valid, fifo_key_ready; + logic [QKV_PRECISION_0-1:0] fifo_value[DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0-1:0]; + logic fifo_value_valid, fifo_value_ready; + + // Head output + logic [SVMM_OUT_PRECISION_0-1:0] head_out [NUM_HEADS-1:0] [HEAD_OUT_0_PARALLELISM_DIM_0 * HEAD_OUT_0_PARALLELISM_DIM_1-1:0]; + logic [NUM_HEADS-1:0] head_out_valid; + logic [NUM_HEADS-1:0] head_out_ready; + + logic [SVMM_OUT_PRECISION_0-1:0] proj_in [HEAD_OUT_0_PARALLELISM_DIM_0 * HEAD_OUT_0_PARALLELISM_DIM_1-1:0]; + logic proj_in_valid, proj_in_ready; + + // * Instances + // * ================================================================= + + fixed_vit_attention_input_block_batched #( + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + 
.WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + + .HAS_BIAS (HAS_BIAS), + .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_TENSOR_SIZE_DIM_1(BIAS_TENSOR_SIZE_DIM_1), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + .BIAS_PARALLELISM_DIM_1(BIAS_PARALLELISM_DIM_1), + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + + .DATA_OUT_0_PRECISION_0(QKV_PRECISION_0), + .DATA_OUT_0_PRECISION_1(QKV_PRECISION_1) + ) batched_input_block_i ( + .clk(clk), + .rst(rst), + + .data_in_0(data_in_0), + .data_in_0_valid(data_in_0_valid), + .data_in_0_ready(data_in_0_ready), + + // Query parameters + .weight_query(query_weight), + .weight_query_valid(query_weight_valid), + .weight_query_ready(query_weight_ready), + + .bias_query(query_bias), + .bias_query_valid(query_bias_valid), + .bias_query_ready(query_bias_ready), + + // Key parameters + .weight_key(key_weight), + .weight_key_valid(key_weight_valid), + .weight_key_ready(key_weight_ready), + + .bias_key(key_bias), + .bias_key_valid(key_bias_valid), + .bias_key_ready(key_bias_ready), + + // Value parameters + .weight_value(value_weight), + .weight_value_valid(value_weight_valid), + .weight_value_ready(value_weight_ready), + + .bias_value(value_bias), + .bias_value_valid(value_bias_valid), + .bias_value_ready(value_bias_ready), + + // Query output + .data_out_query(query), + .data_out_query_valid(joint_query_valid), + .data_out_query_ready(joint_query_ready), + + // Key output + .data_out_key(key), + .data_out_key_valid(joint_key_valid), + .data_out_key_ready(joint_key_ready), + + // Value output + .data_out_value(fifo_value), + .data_out_value_valid(fifo_value_valid), + .data_out_value_ready(fifo_value_ready) + ); + + unpacked_fifo #( + .DEPTH(DATA_IN_0_TENSOR_SIZE_DIM_0 * DATA_IN_0_TENSOR_SIZE_DIM_1 / (DATA_IN_0_PARALLELISM_DIM_1 * 
HEAD_OUT_0_PARALLELISM_DIM_0)), + .DATA_WIDTH(QKV_PRECISION_0), + .IN_NUM(DATA_IN_0_PARALLELISM_DIM_1 * HEAD_OUT_0_PARALLELISM_DIM_0) + ) value_in_buffer ( + .clk(clk), + .rst(rst), + .data_in(fifo_value), + .data_in_valid(fifo_value_valid), + .data_in_ready(fifo_value_ready), // write enable + .data_out(value), + .data_out_valid(joint_value_valid), + .data_out_ready(joint_value_ready) // read enable + ); + // * Scatter query, key, value + + self_attention_head_scatter #( + .NUM_HEADS(NUM_HEADS), + + .IN_DATA_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .IN_DATA_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .IN_DATA_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .IN_DATA_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1) + + ) scatter_qkv_i ( + .clk, + .rst, + + .query_valid(joint_query_valid), + .query_ready(joint_query_ready), + + .key_valid(joint_key_valid), + .key_ready(joint_key_ready), + + .value_valid(joint_value_valid), + .value_ready(joint_value_ready), + + .split_query_valid(split_query_valid), + .split_query_ready(split_query_ready), + + .split_key_valid(split_key_valid), + .split_key_ready(split_key_ready), + + .split_value_valid(split_value_valid), + .split_value_ready(split_value_ready) + ); + + // * Heads + + for (genvar head = 0; head < NUM_HEADS; head++) begin : g_attention_head + + fixed_vit_attention_head #( + .IN_DATA_TENSOR_SIZE_DIM_0 (HEAD_OUT_0_TENSOR_SIZE_DIM_0 / NUM_HEADS), + .IN_DATA_TENSOR_SIZE_DIM_1 (HEAD_OUT_0_TENSOR_SIZE_DIM_1), + .IN_DATA_PARALLELISM_DIM_0 (HEAD_OUT_0_PARALLELISM_DIM_0), + .IN_DATA_PARALLELISM_DIM_1 (HEAD_OUT_0_PARALLELISM_DIM_1), + .IN_DATA_PRECISION_0 (QKV_PRECISION_0), + .IN_DATA_PRECISION_1 (QKV_PRECISION_1), + .QKMM_OUT_PRECISION_0 (QKMM_OUT_PRECISION_0), + .QKMM_OUT_PRECISION_1 (QKMM_OUT_PRECISION_1), + .SOFTMAX_EXP_PRECISION_0 (SOFTMAX_EXP_PRECISION_0), + .SOFTMAX_EXP_PRECISION_1 (SOFTMAX_EXP_PRECISION_1), + .SOFTMAX_OUT_DATA_PRECISION_1(SOFTMAX_OUT_DATA_PRECISION_1), + .OUT_DATA_PRECISION_0 
(SVMM_OUT_PRECISION_0), + .OUT_DATA_PRECISION_1 (SVMM_OUT_PRECISION_1) + + ) head_i ( + .clk, + .rst, + + .query (query), + .query_valid(split_query_valid[head]), + .query_ready(split_query_ready[head]), + + .key (key), + .key_valid(split_key_valid[head]), + .key_ready(split_key_ready[head]), + + .value (value), + .value_valid(split_value_valid[head]), + .value_ready(split_value_ready[head]), + + .out (head_out[head]), + .out_valid(head_out_valid[head]), + .out_ready(head_out_ready[head]) + ); + + end + + // * Gather heads + + self_attention_head_gather #( + .NUM_HEADS(NUM_HEADS), + + .IN_DATA_TENSOR_SIZE_DIM_0(HEAD_OUT_0_TENSOR_SIZE_DIM_0), + .IN_DATA_TENSOR_SIZE_DIM_1(HEAD_OUT_0_TENSOR_SIZE_DIM_1), + .IN_DATA_PARALLELISM_DIM_0(HEAD_OUT_0_PARALLELISM_DIM_0), + .IN_DATA_PARALLELISM_DIM_1(HEAD_OUT_0_PARALLELISM_DIM_1), + .IN_DATA_PRECISION_0 (SVMM_OUT_PRECISION_0), + .IN_DATA_PRECISION_1 (SVMM_OUT_PRECISION_1) + ) gather_qkv_i ( + .clk, + .rst, + + .split_head_out (head_out), + .split_head_out_valid(head_out_valid), + .split_head_out_ready(head_out_ready), + + .updated_tokens (proj_in), + .updated_tokens_valid(proj_in_valid), + .updated_tokens_ready(proj_in_ready) + ); + + fixed_linear_with_input_circular #( + .HAS_BIAS (HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (SVMM_OUT_PRECISION_0), + .DATA_IN_0_PRECISION_1 (SVMM_OUT_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(HEAD_OUT_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(HEAD_OUT_0_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(HEAD_OUT_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(HEAD_OUT_0_PARALLELISM_DIM_1), + + .WEIGHT_PRECISION_0 (WEIGHT_PROJ_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PROJ_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_PROJ_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_PROJ_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PROJ_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PROJ_PARALLELISM_DIM_1), + + .BIAS_PRECISION_0 (BIAS_PROJ_PRECISION_0), + 
.BIAS_PRECISION_1 (BIAS_PROJ_PRECISION_1), + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + ) proj ( + .clk(clk), + .rst(rst), + + // input port for data_inivations + .data_in_0 (proj_in), + .data_in_0_valid(proj_in_valid), + .data_in_0_ready(proj_in_ready), + + // input port for weight + .weight (proj_weight), + .weight_valid(proj_weight_valid), + .weight_ready(proj_weight_ready), + + .bias (proj_bias), + .bias_valid(proj_bias_valid), + .bias_ready(proj_bias_ready), + + .data_out_0(data_out_0), + .data_out_0_valid(data_out_0_valid), + .data_out_0_ready(data_out_0_ready) + ); +endmodule diff --git a/src/mase_components/vision_models/vit/rtl/fixed_vit_attention_head.sv b/src/mase_components/vision_models/vit/rtl/fixed_vit_attention_head.sv new file mode 100644 index 000000000..5692318d9 --- /dev/null +++ b/src/mase_components/vision_models/vit/rtl/fixed_vit_attention_head.sv @@ -0,0 +1,298 @@ +`timescale 1ns / 1ps +module fixed_vit_attention_head #( + + // * Queries, keys and values are assumed to have the same + // * precision, dimensions and parallelism + parameter IN_DATA_TENSOR_SIZE_DIM_0 = 32, + parameter IN_DATA_TENSOR_SIZE_DIM_1 = 10, + parameter IN_DATA_PARALLELISM_DIM_0 = 2, + parameter IN_DATA_PARALLELISM_DIM_1 = 2, + parameter IN_DATA_PRECISION_0 = 16, + parameter IN_DATA_PRECISION_1 = 3, + + // * Output tokens are casted to requested precision + parameter OUT_DATA_TENSOR_SIZE_DIM_0 = IN_DATA_TENSOR_SIZE_DIM_0, + parameter OUT_DATA_TENSOR_SIZE_DIM_1 = IN_DATA_TENSOR_SIZE_DIM_1, + parameter OUT_DATA_PARALLELISM_DIM_0 = IN_DATA_PARALLELISM_DIM_0, + parameter OUT_DATA_PARALLELISM_DIM_1 = IN_DATA_PARALLELISM_DIM_1, + + parameter QKMM_OUT_PRECISION_0 = 16, + parameter QKMM_OUT_PRECISION_1 = 16, + parameter SOFTMAX_EXP_PRECISION_0 = 16, + parameter SOFTMAX_EXP_PRECISION_1 = 16, + parameter SOFTMAX_OUT_DATA_PRECISION_1 = 7, + parameter SOFTMAX_OUT_DATA_PRECISION_0 = SOFTMAX_OUT_DATA_PRECISION_1 + 2, + 
parameter OUT_DATA_PRECISION_0 = 16, + parameter OUT_DATA_PRECISION_1 = 3 + +) ( + input logic clk, + input logic rst, + + input logic [IN_DATA_PRECISION_0-1:0] query [IN_DATA_PARALLELISM_DIM_0*IN_DATA_PARALLELISM_DIM_1-1:0], + input logic query_valid, + output logic query_ready, + + input logic [IN_DATA_PRECISION_0-1:0] key [IN_DATA_PARALLELISM_DIM_0*IN_DATA_PARALLELISM_DIM_1-1:0], + input logic key_valid, + output logic key_ready, + + input logic [IN_DATA_PRECISION_0-1:0] value [IN_DATA_PARALLELISM_DIM_0*IN_DATA_PARALLELISM_DIM_1-1:0], + input logic value_valid, + output logic value_ready, + + output logic [OUT_DATA_PRECISION_0-1:0] out [OUT_DATA_PARALLELISM_DIM_0*OUT_DATA_PARALLELISM_DIM_1-1:0], + output logic out_valid, + input logic out_ready +); + + initial begin + assert (OUT_DATA_TENSOR_SIZE_DIM_0 == IN_DATA_TENSOR_SIZE_DIM_0) + else + $fatal( + "Module incorrectly parametrized. OUT_DATA_TENSOR_SIZE_DIM_0 != IN_DATA_TENSOR_SIZE_DIM_0" + ); + + assert (OUT_DATA_TENSOR_SIZE_DIM_1 == IN_DATA_TENSOR_SIZE_DIM_1) + else + $fatal( + "Module incorrectly parametrized. OUT_DATA_TENSOR_SIZE_DIM_1 != IN_DATA_TENSOR_SIZE_DIM_1" + ); + + assert (OUT_DATA_PARALLELISM_DIM_0 == IN_DATA_PARALLELISM_DIM_0) + else + $fatal( + "Parallelism conversion not yet supported. OUT_DATA_PARALLELISM_DIM_0 != IN_DATA_PARALLELISM_DIM_0" + ); + + assert (OUT_DATA_PARALLELISM_DIM_1 == IN_DATA_PARALLELISM_DIM_1) + else + $fatal( + "Parallelism conversion not yet supported. OUT_DATA_PARALLELISM_DIM_1 != IN_DATA_PARALLELISM_DIM_1" + ); + end + + parameter IN_DATA_DEPTH_0 = IN_DATA_TENSOR_SIZE_DIM_0 / IN_DATA_PARALLELISM_DIM_0; + parameter IN_DATA_DEPTH_1 = IN_DATA_TENSOR_SIZE_DIM_1 / IN_DATA_PARALLELISM_DIM_1; + + // Query key transpose + parameter QUERY_TRANSPOSE_PRECISION_0 = 2 * IN_DATA_PRECISION_0 + $clog2( + IN_DATA_PARALLELISM_DIM_0 + ) + $clog2( + IN_DATA_DEPTH_1 + ); + parameter QUERY_TRANSPOSE_PRECISION_1 = 2 * IN_DATA_PRECISION_1; + + // Attention scores + // ! 
TO DO: check precision transformation post softmax + parameter ATTENTION_SCORES_PRECISION_0 = QUERY_TRANSPOSE_PRECISION_0; + parameter ATTENTION_SCORES_PRECISION_1 = QUERY_TRANSPOSE_PRECISION_1; + + parameter OUT_PRE_CAST_PRECISION_0 = IN_DATA_PRECISION_0 + ATTENTION_SCORES_PRECISION_0 + $clog2( + IN_DATA_PARALLELISM_DIM_1 + ) + $clog2( + IN_DATA_TENSOR_SIZE_DIM_1 / IN_DATA_PARALLELISM_DIM_1 + ); + parameter OUT_PRE_CAST_PRECISION_1 = IN_DATA_PRECISION_1 + ATTENTION_SCORES_PRECISION_1; + + // * Declarations + // * ================================================================= + + logic [IN_DATA_PRECISION_0-1:0] key_transpose [IN_DATA_PARALLELISM_DIM_0*IN_DATA_PARALLELISM_DIM_1-1:0]; + logic key_transpose_valid; + logic key_transpose_ready; + + logic [QKMM_OUT_PRECISION_0-1:0] query_key_transpose [IN_DATA_PARALLELISM_DIM_1 * IN_DATA_PARALLELISM_DIM_1-1:0]; + logic query_key_transpose_valid; + logic query_key_transpose_ready; + + logic [QKMM_OUT_PRECISION_0-1:0] buffered_query_key_transpose [IN_DATA_PARALLELISM_DIM_1 * IN_DATA_PARALLELISM_DIM_1-1:0]; + logic buffered_query_key_transpose_valid; + logic buffered_query_key_transpose_ready; + + logic [SOFTMAX_OUT_DATA_PRECISION_0 - 1:0] attention_scores [IN_DATA_PARALLELISM_DIM_1 * IN_DATA_PARALLELISM_DIM_1-1:0]; + logic attention_scores_valid; + logic attention_scores_ready; + + logic [OUT_DATA_PRECISION_0-1:0] out_casted [OUT_DATA_PARALLELISM_DIM_0*OUT_DATA_PARALLELISM_DIM_1-1:0]; + logic out_cast_valid; + logic out_cast_ready; + + + logic [OUT_DATA_PRECISION_0-1:0] buffer_out [OUT_DATA_PARALLELISM_DIM_0*OUT_DATA_PARALLELISM_DIM_1-1:0]; + logic buffer_out_valid; + logic buffer_out_ready; + // * Instances + // * ================================================================= + + // * Transpose projected keys + + matrix_stream_transpose #( + .TOTAL_DIM0 (IN_DATA_TENSOR_SIZE_DIM_0), + .TOTAL_DIM1 (IN_DATA_TENSOR_SIZE_DIM_1), + .COMPUTE_DIM0(IN_DATA_PARALLELISM_DIM_0), + .COMPUTE_DIM1(IN_DATA_PARALLELISM_DIM_1), + + 
.DATA_WIDTH(IN_DATA_PRECISION_0) + ) key_transpose_i ( + .clk, + .rst, + + // In Matrix + .in_data (key), + .in_valid(key_valid), + .in_ready(key_ready), + + // Out Matrix + .out_data (key_transpose), + .out_valid(key_transpose_valid), + .out_ready(key_transpose_ready) + ); + + // * Query x Key^T + + matmul #( + .A_TOTAL_DIM0(IN_DATA_TENSOR_SIZE_DIM_0), + .A_TOTAL_DIM1(IN_DATA_TENSOR_SIZE_DIM_1), + + .B_TOTAL_DIM0(IN_DATA_TENSOR_SIZE_DIM_1), + .B_TOTAL_DIM1(IN_DATA_TENSOR_SIZE_DIM_0), + + .A_COMPUTE_DIM0(IN_DATA_PARALLELISM_DIM_0), + .A_COMPUTE_DIM1(IN_DATA_PARALLELISM_DIM_1), + .B_COMPUTE_DIM0(IN_DATA_PARALLELISM_DIM_1), + .B_COMPUTE_DIM1(IN_DATA_PARALLELISM_DIM_0), + + .A_WIDTH (IN_DATA_PRECISION_0), + .A_FRAC_WIDTH(IN_DATA_PRECISION_1), + + .B_WIDTH (IN_DATA_PRECISION_0), + .B_FRAC_WIDTH(IN_DATA_PRECISION_1), + + .OUT_WIDTH (QKMM_OUT_PRECISION_0), + .OUT_FRAC_WIDTH(QKMM_OUT_PRECISION_1) + + ) query_key_transpose_matmul_i ( + .clk, + .rst, + + .a_data (query), + .a_valid(query_valid), + .a_ready(query_ready), + + .b_data (key_transpose), + .b_valid(key_transpose_valid), + .b_ready(key_transpose_ready), + + .out_data (query_key_transpose), + .out_valid(query_key_transpose_valid), + .out_ready(query_key_transpose_ready) + ); + + + //cut the long ready path + unpacked_skid_buffer #( + .DATA_WIDTH(QKMM_OUT_PRECISION_0), + .IN_NUM (IN_DATA_PARALLELISM_DIM_1 * IN_DATA_PARALLELISM_DIM_1) + ) input_stream_reg_slice ( + .clk (clk), + .rst (rst), + .data_in (query_key_transpose), + .data_in_valid (query_key_transpose_valid), + .data_in_ready (query_key_transpose_ready), + .data_out (buffered_query_key_transpose), + .data_out_valid(buffered_query_key_transpose_valid), + .data_out_ready(buffered_query_key_transpose_ready) + ); + fixed_softmax #( + .DATA_IN_0_PRECISION_0 (QKMM_OUT_PRECISION_0), + .DATA_IN_0_PRECISION_1 (QKMM_OUT_PRECISION_1), + .DATA_EXP_0_PRECISION_0 (SOFTMAX_EXP_PRECISION_0), + .DATA_EXP_0_PRECISION_1 (SOFTMAX_EXP_PRECISION_1), + .DATA_OUT_0_PRECISION_0 
(SOFTMAX_OUT_DATA_PRECISION_0), + .DATA_OUT_0_PRECISION_1 (SOFTMAX_OUT_DATA_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(IN_DATA_TENSOR_SIZE_DIM_1), + .DATA_IN_0_TENSOR_SIZE_DIM_1(IN_DATA_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(IN_DATA_PARALLELISM_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_1(IN_DATA_PARALLELISM_DIM_1) + ) fixed_softmax_i ( + .clk, + .rst, + + .data_in_0 (buffered_query_key_transpose), + .data_in_0_valid(buffered_query_key_transpose_valid), + .data_in_0_ready(buffered_query_key_transpose_ready), + + .data_out_0 (attention_scores), + .data_out_0_valid(attention_scores_valid), + .data_out_0_ready(attention_scores_ready) + ); + // end + + // * Output: Attention scores x Value + + matmul #( + .A_TOTAL_DIM0(IN_DATA_TENSOR_SIZE_DIM_1), + .A_TOTAL_DIM1(IN_DATA_TENSOR_SIZE_DIM_1), + + .B_TOTAL_DIM0(IN_DATA_TENSOR_SIZE_DIM_0), + .B_TOTAL_DIM1(IN_DATA_TENSOR_SIZE_DIM_1), + + .A_COMPUTE_DIM0(IN_DATA_PARALLELISM_DIM_1), + .A_COMPUTE_DIM1(IN_DATA_PARALLELISM_DIM_1), + .B_COMPUTE_DIM0(IN_DATA_PARALLELISM_DIM_0), + .B_COMPUTE_DIM1(IN_DATA_PARALLELISM_DIM_1), + + .A_WIDTH (SOFTMAX_OUT_DATA_PRECISION_0), + .A_FRAC_WIDTH(SOFTMAX_OUT_DATA_PRECISION_1), + + .B_WIDTH (IN_DATA_PRECISION_0), + .B_FRAC_WIDTH(IN_DATA_PRECISION_1), + + .OUT_WIDTH (OUT_DATA_PRECISION_0), + .OUT_FRAC_WIDTH(OUT_DATA_PRECISION_1) + + ) attention_scores_values_matmul_i ( + .clk, + .rst, + + .a_data (attention_scores), + .a_valid(attention_scores_valid), + .a_ready(attention_scores_ready), + + .b_data (value), + .b_valid(value_valid), + .b_ready(value_ready), + + .out_data (out_casted), + .out_valid(out_cast_valid), + .out_ready(out_cast_ready) + ); + + + fifo_for_autogen #( + .DATA_IN_0_PRECISION_0(OUT_DATA_PRECISION_0), // = 8 + .DATA_IN_0_PRECISION_1(OUT_DATA_PRECISION_1), // = 4 + .DATA_IN_0_TENSOR_SIZE_DIM_0(OUT_DATA_TENSOR_SIZE_DIM_0), // = 20 + .DATA_IN_0_PARALLELISM_DIM_0(OUT_DATA_PARALLELISM_DIM_0), // = 2 + .DATA_IN_0_TENSOR_SIZE_DIM_1(OUT_DATA_TENSOR_SIZE_DIM_1), // = 4 + 
.DATA_IN_0_PARALLELISM_DIM_1(OUT_DATA_PARALLELISM_DIM_1), // = 2 + .DEPTH(OUT_DATA_TENSOR_SIZE_DIM_0/OUT_DATA_PARALLELISM_DIM_0), + .DATA_OUT_0_PRECISION_0(OUT_DATA_PRECISION_0), + .DATA_OUT_0_PRECISION_1(OUT_DATA_PRECISION_1), + .DATA_OUT_0_TENSOR_SIZE_DIM_0(OUT_DATA_TENSOR_SIZE_DIM_0), + .DATA_OUT_0_PARALLELISM_DIM_0(OUT_DATA_PARALLELISM_DIM_0), + .DATA_OUT_0_TENSOR_SIZE_DIM_1(OUT_DATA_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_PARALLELISM_DIM_1(OUT_DATA_PARALLELISM_DIM_1) + ) fifo_1_inst ( + .clk(clk), + .rst(rst), + + .data_in_0(out_casted), + .data_in_0_valid(out_cast_valid), + .data_in_0_ready(out_cast_ready), + .data_out_0(out), + .data_out_0_valid(out_valid), + .data_out_0_ready(out_ready) + ); +endmodule diff --git a/src/mase_components/vision_models/vit/rtl/fixed_vit_attention_input_block_batched.sv b/src/mase_components/vision_models/vit/rtl/fixed_vit_attention_input_block_batched.sv new file mode 100644 index 000000000..c374d12b3 --- /dev/null +++ b/src/mase_components/vision_models/vit/rtl/fixed_vit_attention_input_block_batched.sv @@ -0,0 +1,290 @@ +`timescale 1ns / 1ps +module fixed_vit_attention_input_block_batched #( + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 768, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 20, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 4, + parameter DATA_IN_0_PARALLELISM_DIM_1 = 4, + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + + parameter WEIGHT_TENSOR_SIZE_DIM_0 = 768, + parameter WEIGHT_TENSOR_SIZE_DIM_1 = 768, + parameter WEIGHT_PARALLELISM_DIM_0 = 4, + parameter WEIGHT_PARALLELISM_DIM_1 = 4, + parameter WEIGHT_PRECISION_0 = 16, + parameter WEIGHT_PRECISION_1 = 3, + + parameter HAS_BIAS = 1, + parameter BIAS_TENSOR_SIZE_DIM_0 = 64, + parameter BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter BIAS_PARALLELISM_DIM_0 = 4, + parameter BIAS_PARALLELISM_DIM_1 = 1, + parameter BIAS_PRECISION_0 = 16, + parameter BIAS_PRECISION_1 = 3, + + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_1, + parameter 
DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = WEIGHT_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PRECISION_0 = 16, + parameter DATA_OUT_0_PRECISION_1 = 3 + +) ( + input logic clk, + input logic rst, + + input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + + // Query weights + input logic [WEIGHT_PRECISION_0-1:0] weight_query [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic weight_query_valid, + output logic weight_query_ready, + + // Query bias + input logic [BIAS_PRECISION_0-1:0] bias_query [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic bias_query_valid, + output logic bias_query_ready, + + // Key weights + input logic [WEIGHT_PRECISION_0-1:0] weight_key [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic weight_key_valid, + output logic weight_key_ready, + + // Key bias + input logic [BIAS_PRECISION_0-1:0] bias_key [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic bias_key_valid, + output logic bias_key_ready, + + // Value weights + input logic [WEIGHT_PRECISION_0-1:0] weight_value [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0], + input logic weight_value_valid, + output logic weight_value_ready, + + // Value bias + input logic [BIAS_PRECISION_0-1:0] bias_value [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1 -1:0], + input logic bias_value_valid, + output logic bias_value_ready, + + // Query + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_query [DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0], + output logic data_out_query_valid, + input logic data_out_query_ready, + + // Key + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_key [DATA_OUT_0_PARALLELISM_DIM_1 * 
DATA_OUT_0_PARALLELISM_DIM_0-1:0], + output logic data_out_key_valid, + input logic data_out_key_ready, + + // Value + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_value [DATA_OUT_0_PARALLELISM_DIM_1 * DATA_OUT_0_PARALLELISM_DIM_0-1:0], + output logic data_out_value_valid, + input logic data_out_value_ready +); + + // ! TO DO: add assertions about bias parallelism matching weight parallelism + + // * Inferred parameters + parameter DATA_IN_0_DEPTH_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1 / DATA_IN_0_PARALLELISM_DIM_1; + parameter WEIGHT_DEPTH_DIM_0 = WEIGHT_TENSOR_SIZE_DIM_0 / WEIGHT_PARALLELISM_DIM_0; + + // * Declarations + // * ================================================================= + + logic query_data_in_valid, query_data_in_ready; + logic key_data_in_valid, key_data_in_ready; + logic value_data_in_valid, value_data_in_ready; + + logic [DATA_OUT_0_PRECISION_0-1:0] query_buffer [DATA_IN_0_PARALLELISM_DIM_1 * WEIGHT_PARALLELISM_DIM_1-1:0]; + logic query_buffer_valid; + logic query_buffer_ready; + + // * Instances + // * ================================================================= + + // * Split the incoming data over the QKV projections + split_n #( + .N(3) + ) split_i ( + .data_in_valid (data_in_0_valid), + .data_in_ready (data_in_0_ready), + .data_out_valid({query_data_in_valid, key_data_in_valid, value_data_in_valid}), + .data_out_ready({query_data_in_ready, key_data_in_ready, value_data_in_ready}) + ); + + // * Query linear + + fixed_linear_with_input_circular #( + .HAS_BIAS (HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), + + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + 
.WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_TENSOR_SIZE_DIM_1(BIAS_TENSOR_SIZE_DIM_1), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + .BIAS_PARALLELISM_DIM_1(BIAS_PARALLELISM_DIM_1), + + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + + ) fixed_linear_query ( + .clk, + .rst, + + // input port for data_inivations + .data_in_0 (data_in_0), + .data_in_0_valid(query_data_in_valid), + .data_in_0_ready(query_data_in_ready), + + // input port for weight + .weight (weight_query), + .weight_valid(weight_query_valid), + .weight_ready(weight_query_ready), + + .bias (bias_query), + .bias_valid(bias_query_valid), + .bias_ready(bias_query_ready), + + .data_out_0 (query_buffer), + .data_out_0_valid(query_buffer_valid), + .data_out_0_ready(query_buffer_ready) + ); + + // * We must buffer the queries to latency match the key transpose path + // * since the matmul for QK^T buffers K^T but streams Q + matrix_fifo #( + .DATA_WIDTH(DATA_OUT_0_PRECISION_0), + .DIM0 (DATA_OUT_0_PARALLELISM_DIM_0), + .DIM1 (DATA_IN_0_PARALLELISM_DIM_1), + .FIFO_SIZE (DATA_IN_0_DEPTH_DIM_1 * DATA_OUT_0_TENSOR_SIZE_DIM_0 / DATA_OUT_0_PARALLELISM_DIM_0) + ) query_buffer_i ( + .clk, + .rst, + .in_data (query_buffer), + .in_valid (query_buffer_valid), + .in_ready (query_buffer_ready), + .out_data (data_out_query), + .out_valid(data_out_query_valid), + .out_ready(data_out_query_ready) + ); + + // * Key linear + + fixed_linear_with_input_circular #( + .HAS_BIAS (HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + 
.DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), + + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_TENSOR_SIZE_DIM_1(BIAS_TENSOR_SIZE_DIM_1), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + .BIAS_PARALLELISM_DIM_1(BIAS_PARALLELISM_DIM_1), + + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + + ) fixed_linear_key ( + .clk, + .rst, + + // input port for data_inivations + .data_in_0 (data_in_0), + .data_in_0_valid(key_data_in_valid), + .data_in_0_ready(key_data_in_ready), + + // input port for weight + .weight (weight_key), + .weight_valid(weight_key_valid), + .weight_ready(weight_key_ready), + + .bias (bias_key), + .bias_valid(bias_key_valid), + .bias_ready(bias_key_ready), + + .data_out_0 (data_out_key), + .data_out_0_valid(data_out_key_valid), + .data_out_0_ready(data_out_key_ready) + ); + + // * Value linear + + fixed_linear_with_input_circular #( + .HAS_BIAS (HAS_BIAS), + + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), + + .WEIGHT_PRECISION_0 (WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1 (WEIGHT_PRECISION_1), + .WEIGHT_TENSOR_SIZE_DIM_0(WEIGHT_TENSOR_SIZE_DIM_0), + 
.WEIGHT_TENSOR_SIZE_DIM_1(WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(WEIGHT_PARALLELISM_DIM_1), + + .BIAS_PRECISION_0 (BIAS_PRECISION_0), + .BIAS_PRECISION_1 (BIAS_PRECISION_1), + .BIAS_TENSOR_SIZE_DIM_0(BIAS_TENSOR_SIZE_DIM_0), + .BIAS_TENSOR_SIZE_DIM_1(BIAS_TENSOR_SIZE_DIM_1), + .BIAS_PARALLELISM_DIM_0(BIAS_PARALLELISM_DIM_0), + .BIAS_PARALLELISM_DIM_1(BIAS_PARALLELISM_DIM_1), + + .DATA_OUT_0_PRECISION_0(DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1(DATA_OUT_0_PRECISION_1) + + ) fixed_linear_value ( + .clk, + .rst, + + // input port for data_inivations + .data_in_0 (data_in_0), + .data_in_0_valid(value_data_in_valid), + .data_in_0_ready(value_data_in_ready), + + // input port for weight + .weight (weight_value), + .weight_valid(weight_value_valid), + .weight_ready(weight_value_ready), + + .bias (bias_value), + .bias_valid(bias_value_valid), + .bias_ready(bias_value_ready), + + .data_out_0 (data_out_value), + .data_out_0_valid(data_out_value_valid), + .data_out_0_ready(data_out_value_ready) + ); + +endmodule diff --git a/src/mase_components/vision_models/vit/rtl/fixed_vit_attention_single_precision_wrapper.sv b/src/mase_components/vision_models/vit/rtl/fixed_vit_attention_single_precision_wrapper.sv new file mode 100644 index 000000000..780d66664 --- /dev/null +++ b/src/mase_components/vision_models/vit/rtl/fixed_vit_attention_single_precision_wrapper.sv @@ -0,0 +1,257 @@ +`timescale 1ns / 1ps + +/* + * This is a workaround to use attention in single precision + * in emitted verilog, where separate precision parameters are + * emitted for each model submodule. 
+ */ + +module fixed_vit_attention_single_precision_wrapper #( + parameter NUM_HEADS = 12, + parameter CHOSEN_PRECISION = "QUERY", + + parameter DATA_IN_0_TENSOR_SIZE_DIM_0 = 768, + parameter DATA_IN_0_TENSOR_SIZE_DIM_1 = 128, + parameter DATA_IN_0_TENSOR_SIZE_DIM_2 = 1, + parameter DATA_IN_0_PARALLELISM_DIM_0 = 4, + parameter DATA_IN_0_PARALLELISM_DIM_1 = 4, + parameter DATA_IN_0_PARALLELISM_DIM_2 = 1, + parameter DATA_IN_0_PRECISION_0 = 16, + parameter DATA_IN_0_PRECISION_1 = 3, + + parameter QUERY_WEIGHT_TENSOR_SIZE_DIM_0 = 768, + parameter QUERY_WEIGHT_TENSOR_SIZE_DIM_1 = 768, + parameter QUERY_WEIGHT_PARALLELISM_DIM_0 = 4, + parameter QUERY_WEIGHT_PARALLELISM_DIM_1 = 4, + parameter QUERY_WEIGHT_PRECISION_0 = 16, + parameter QUERY_WEIGHT_PRECISION_1 = 3, + + parameter KEY_WEIGHT_TENSOR_SIZE_DIM_0 = 768, + parameter KEY_WEIGHT_TENSOR_SIZE_DIM_1 = 768, + parameter KEY_WEIGHT_PARALLELISM_DIM_0 = 4, + parameter KEY_WEIGHT_PARALLELISM_DIM_1 = 4, + parameter KEY_WEIGHT_PRECISION_0 = 16, + parameter KEY_WEIGHT_PRECISION_1 = 3, + + parameter VALUE_WEIGHT_TENSOR_SIZE_DIM_0 = 768, + parameter VALUE_WEIGHT_TENSOR_SIZE_DIM_1 = 768, + parameter VALUE_WEIGHT_PARALLELISM_DIM_0 = 4, + parameter VALUE_WEIGHT_PARALLELISM_DIM_1 = 4, + parameter VALUE_WEIGHT_PRECISION_0 = 16, + parameter VALUE_WEIGHT_PRECISION_1 = 3, + + parameter QUERY_HAS_BIAS = 1, + parameter QUERY_BIAS_TENSOR_SIZE_DIM_0 = 64, + parameter QUERY_BIAS_TENSOR_SIZE_DIM_1 = 1, + parameter QUERY_BIAS_PARALLELISM_DIM_0 = 4, + parameter QUERY_BIAS_PARALLELISM_DIM_1 = 4, + parameter QUERY_BIAS_PRECISION_0 = 16, + parameter QUERY_BIAS_PRECISION_1 = 3, + + parameter KEY_HAS_BIAS = 1, + parameter KEY_BIAS_TENSOR_SIZE_DIM_0 = 64, + parameter KEY_BIAS_TENSOR_SIZE_DIM_1 = 20, + parameter KEY_BIAS_PARALLELISM_DIM_0 = 4, + parameter KEY_BIAS_PARALLELISM_DIM_1 = 4, + parameter KEY_BIAS_PRECISION_0 = 16, + parameter KEY_BIAS_PRECISION_1 = 3, + + parameter VALUE_HAS_BIAS = 1, + parameter VALUE_BIAS_TENSOR_SIZE_DIM_0 = 64, + 
parameter VALUE_BIAS_TENSOR_SIZE_DIM_1 = 20, + parameter VALUE_BIAS_PARALLELISM_DIM_0 = 4, + parameter VALUE_BIAS_PARALLELISM_DIM_1 = 4, + parameter VALUE_BIAS_PRECISION_0 = 16, + parameter VALUE_BIAS_PRECISION_1 = 3, + + parameter CHOSEN_WEIGHT_TENSOR_SIZE_DIM_0 = QUERY_WEIGHT_TENSOR_SIZE_DIM_0, + parameter CHOSEN_WEIGHT_TENSOR_SIZE_DIM_1 = QUERY_WEIGHT_TENSOR_SIZE_DIM_1, + parameter CHOSEN_WEIGHT_PARALLELISM_DIM_0 = QUERY_WEIGHT_PARALLELISM_DIM_0, + parameter CHOSEN_WEIGHT_PARALLELISM_DIM_1 = QUERY_WEIGHT_PARALLELISM_DIM_1, + parameter CHOSEN_WEIGHT_PRECISION_0 = QUERY_WEIGHT_PRECISION_0, + parameter CHOSEN_WEIGHT_PRECISION_1 = QUERY_WEIGHT_PRECISION_1, + parameter CHOSEN_HAS_BIAS = QUERY_HAS_BIAS, + parameter CHOSEN_BIAS_TENSOR_SIZE_DIM_0 = QUERY_BIAS_TENSOR_SIZE_DIM_0, + parameter CHOSEN_BIAS_TENSOR_SIZE_DIM_1 = QUERY_BIAS_TENSOR_SIZE_DIM_1, + parameter CHOSEN_BIAS_PARALLELISM_DIM_0 = QUERY_BIAS_PARALLELISM_DIM_0, + parameter CHOSEN_BIAS_PARALLELISM_DIM_1 = QUERY_BIAS_PARALLELISM_DIM_1, + parameter CHOSEN_BIAS_PRECISION_0 = QUERY_BIAS_PRECISION_0, + parameter CHOSEN_BIAS_PRECISION_1 = QUERY_BIAS_PRECISION_1, + + parameter QKV_PRECISION_0 = -1, + parameter QKV_PRECISION_1 = -1, + parameter QKMM_OUT_PRECISION_0 = -1, + parameter QKMM_OUT_PRECISION_1 = -1, + parameter SOFTMAX_EXP_PRECISION_0 = -1, + parameter SOFTMAX_EXP_PRECISION_1 = -1, + parameter SOFTMAX_OUT_PRECISION_1 = -1, + parameter SVMM_OUT_PRECISION_0 = -1, + parameter SVMM_OUT_PRECISION_1 = -1, + + parameter PROJ_WEIGHT_TENSOR_SIZE_DIM_0 = -1, + parameter PROJ_WEIGHT_TENSOR_SIZE_DIM_1 = -1, + parameter PROJ_WEIGHT_PARALLELISM_DIM_0 = -1, + parameter PROJ_WEIGHT_PARALLELISM_DIM_1 = -1, + parameter PROJ_WEIGHT_PRECISION_0 = -1, + parameter PROJ_WEIGHT_PRECISION_1 = -1, + parameter PROJ_BIAS_TENSOR_SIZE_DIM_0 = -1, + parameter PROJ_BIAS_TENSOR_SIZE_DIM_1 = -1, + parameter PROJ_BIAS_PARALLELISM_DIM_0 = -1, + parameter PROJ_BIAS_PARALLELISM_DIM_1 = -1, + parameter PROJ_BIAS_PRECISION_0 = -1, + parameter 
PROJ_BIAS_PRECISION_1 = -1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_0 = CHOSEN_WEIGHT_TENSOR_SIZE_DIM_0, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_1 = DATA_IN_0_TENSOR_SIZE_DIM_1, + parameter DATA_OUT_0_TENSOR_SIZE_DIM_2 = DATA_IN_0_TENSOR_SIZE_DIM_2, + parameter DATA_OUT_0_PARALLELISM_DIM_0 = CHOSEN_WEIGHT_PARALLELISM_DIM_0, + parameter DATA_OUT_0_PARALLELISM_DIM_1 = DATA_IN_0_PARALLELISM_DIM_1, + parameter DATA_OUT_0_PARALLELISM_DIM_2 = DATA_IN_0_PARALLELISM_DIM_2, + parameter DATA_OUT_0_PRECISION_0 = DATA_IN_0_PRECISION_0, + parameter DATA_OUT_0_PRECISION_1 = DATA_IN_0_PRECISION_1 + +) ( + input logic clk, + input logic rst, + + input logic [DATA_IN_0_PRECISION_0-1:0] data_in_0 [DATA_IN_0_PARALLELISM_DIM_0*DATA_IN_0_PARALLELISM_DIM_1-1:0], + input logic data_in_0_valid, + output logic data_in_0_ready, + + // Query weights + input logic [QUERY_WEIGHT_PRECISION_0-1:0] query_weight [QUERY_WEIGHT_PARALLELISM_DIM_0 * QUERY_WEIGHT_PARALLELISM_DIM_1-1:0], + input logic query_weight_valid, + output logic query_weight_ready, + + // Query bias + input logic [QUERY_BIAS_PRECISION_0-1:0] query_bias [QUERY_BIAS_PARALLELISM_DIM_0 * QUERY_BIAS_PARALLELISM_DIM_1 -1:0], + input logic query_bias_valid, + output logic query_bias_ready, + + // Key weights + input logic [KEY_WEIGHT_PRECISION_0-1:0] key_weight [KEY_WEIGHT_PARALLELISM_DIM_0 * KEY_WEIGHT_PARALLELISM_DIM_1-1:0], + input logic key_weight_valid, + output logic key_weight_ready, + + // Key bias + input logic [KEY_BIAS_PRECISION_0-1:0] key_bias [KEY_BIAS_PARALLELISM_DIM_0 * KEY_BIAS_PARALLELISM_DIM_1 -1:0], + input logic key_bias_valid, + output logic key_bias_ready, + + // Value weights + input logic [VALUE_WEIGHT_PRECISION_0-1:0] value_weight [VALUE_WEIGHT_PARALLELISM_DIM_0 * VALUE_WEIGHT_PARALLELISM_DIM_1-1:0], + input logic value_weight_valid, + output logic value_weight_ready, + + // Value bias + input logic [VALUE_BIAS_PRECISION_0-1:0] value_bias [VALUE_BIAS_PARALLELISM_DIM_0 * VALUE_BIAS_PARALLELISM_DIM_1 -1:0], + input 
logic value_bias_valid, + output logic value_bias_ready, + + // Proj weights + input logic [PROJ_WEIGHT_PRECISION_0-1:0] proj_weight [PROJ_WEIGHT_PARALLELISM_DIM_0 * PROJ_WEIGHT_PARALLELISM_DIM_1-1:0], + input logic proj_weight_valid, + output logic proj_weight_ready, + + // Proj bias + input logic [PROJ_BIAS_PRECISION_0-1:0] proj_bias [PROJ_BIAS_PARALLELISM_DIM_0 * PROJ_BIAS_PARALLELISM_DIM_1 -1:0], + input logic proj_bias_valid, + output logic proj_bias_ready, + + output logic [DATA_OUT_0_PRECISION_0-1:0] data_out_0 [DATA_OUT_0_PARALLELISM_DIM_0*DATA_OUT_0_PARALLELISM_DIM_1-1:0], + output logic data_out_0_valid, + input logic data_out_0_ready +); + + fixed_vit_attention #( + .NUM_HEADS(NUM_HEADS), + + .DATA_IN_0_TENSOR_SIZE_DIM_0(DATA_IN_0_TENSOR_SIZE_DIM_0), + .DATA_IN_0_TENSOR_SIZE_DIM_1(DATA_IN_0_TENSOR_SIZE_DIM_1), + .DATA_IN_0_PARALLELISM_DIM_0(DATA_IN_0_PARALLELISM_DIM_0), + .DATA_IN_0_PARALLELISM_DIM_1(DATA_IN_0_PARALLELISM_DIM_1), + .DATA_IN_0_PRECISION_0 (DATA_IN_0_PRECISION_0), + .DATA_IN_0_PRECISION_1 (DATA_IN_0_PRECISION_1), + + .WEIGHT_TENSOR_SIZE_DIM_0(CHOSEN_WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_TENSOR_SIZE_DIM_1(CHOSEN_WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PARALLELISM_DIM_0(CHOSEN_WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PARALLELISM_DIM_1(CHOSEN_WEIGHT_PARALLELISM_DIM_1), + .WEIGHT_PRECISION_0(CHOSEN_WEIGHT_PRECISION_0), + .WEIGHT_PRECISION_1(CHOSEN_WEIGHT_PRECISION_1), + + .HAS_BIAS (CHOSEN_HAS_BIAS), + .BIAS_PRECISION_0 (CHOSEN_BIAS_PRECISION_0), + .BIAS_PRECISION_1 (CHOSEN_BIAS_PRECISION_1), + + .QKV_PRECISION_0(QKV_PRECISION_0), + .QKV_PRECISION_1(QKV_PRECISION_1), + .QKMM_OUT_PRECISION_0(QKMM_OUT_PRECISION_0), + .QKMM_OUT_PRECISION_1(QKMM_OUT_PRECISION_1), + .SOFTMAX_EXP_PRECISION_0(SOFTMAX_EXP_PRECISION_0), + .SOFTMAX_EXP_PRECISION_1(SOFTMAX_EXP_PRECISION_1), + .SOFTMAX_OUT_DATA_PRECISION_1(SOFTMAX_OUT_PRECISION_1), + .SVMM_OUT_PRECISION_0(SVMM_OUT_PRECISION_0), + .SVMM_OUT_PRECISION_1(SVMM_OUT_PRECISION_1), + + 
.WEIGHT_PROJ_TENSOR_SIZE_DIM_0(PROJ_WEIGHT_TENSOR_SIZE_DIM_0), + .WEIGHT_PROJ_TENSOR_SIZE_DIM_1(PROJ_WEIGHT_TENSOR_SIZE_DIM_1), + .WEIGHT_PROJ_PARALLELISM_DIM_0(PROJ_WEIGHT_PARALLELISM_DIM_0), + .WEIGHT_PROJ_PARALLELISM_DIM_1(PROJ_WEIGHT_PARALLELISM_DIM_1), + .WEIGHT_PROJ_PRECISION_0(PROJ_WEIGHT_PRECISION_0), + .WEIGHT_PROJ_PRECISION_1(PROJ_WEIGHT_PRECISION_1), + .BIAS_PROJ_PRECISION_0(PROJ_BIAS_PRECISION_0), + .BIAS_PROJ_PRECISION_1(PROJ_BIAS_PRECISION_1), + + + .DATA_OUT_0_TENSOR_SIZE_DIM_0 (DATA_OUT_0_TENSOR_SIZE_DIM_0), + .DATA_OUT_0_TENSOR_SIZE_DIM_1 (DATA_OUT_0_TENSOR_SIZE_DIM_1), + .DATA_OUT_0_PARALLELISM_DIM_0 (DATA_OUT_0_PARALLELISM_DIM_0), + .DATA_OUT_0_PARALLELISM_DIM_1 (DATA_OUT_0_PARALLELISM_DIM_1), + .DATA_OUT_0_PRECISION_0 (DATA_OUT_0_PRECISION_0), + .DATA_OUT_0_PRECISION_1 (DATA_OUT_0_PRECISION_1) + ) encoder_layer_0_attention_self_inst ( + .clk(clk), + .rst(rst), + + .data_in_0 (data_in_0), + .data_in_0_valid(data_in_0_valid), + .data_in_0_ready(data_in_0_ready), + + .query_weight (query_weight), + .query_weight_valid(query_weight_valid), + .query_weight_ready(query_weight_ready), + + .query_bias (query_bias), + .query_bias_valid(query_bias_valid), + .query_bias_ready(query_bias_ready), + + .key_weight (key_weight), + .key_weight_valid(key_weight_valid), + .key_weight_ready(key_weight_ready), + + .key_bias (key_bias), + .key_bias_valid(key_bias_valid), + .key_bias_ready(key_bias_ready), + + .value_weight (value_weight), + .value_weight_valid(value_weight_valid), + .value_weight_ready(value_weight_ready), + + .value_bias (value_bias), + .value_bias_valid(value_bias_valid), + .value_bias_ready(value_bias_ready), + + .proj_weight(proj_weight), + .proj_weight_valid(proj_weight_valid), + .proj_weight_ready(proj_weight_ready), + + .proj_bias(proj_bias), + .proj_bias_valid(proj_bias_valid), + .proj_bias_ready(proj_bias_ready), + + .data_out_0 (data_out_0), + .data_out_0_valid(data_out_0_valid), + .data_out_0_ready(data_out_0_ready) + ); + +endmodule diff 
--git a/src/mase_components/vision_models/vit/rtl/hash_softmax.sv b/src/mase_components/vision_models/vit/rtl/hash_softmax.sv deleted file mode 100644 index a88950462..000000000 --- a/src/mase_components/vision_models/vit/rtl/hash_softmax.sv +++ /dev/null @@ -1,354 +0,0 @@ -`timescale 1ns / 1ps -/* verilator lint_off UNUSEDPARAM */ -module hash_softmax #( - parameter IN_WIDTH = 8, - parameter IN_FRAC_WIDTH = 4, - parameter EXP_WIDTH = 8, - parameter EXP_FRAC_WIDTH = 4, - parameter DIV_WIDTH = 8, - parameter OUT_WIDTH = 8, - parameter OUT_FRAC_WIDTH = 4, - parameter IN_SIZE = 4, - parameter OUT_SIZE = 2, - parameter IN_DEPTH = 3 -) ( - input clk, - rst, - input [IN_WIDTH - 1:0] data_in[IN_SIZE - 1:0], - input data_in_valid, - output data_in_ready, - output [OUT_WIDTH - 1:0] data_out[OUT_SIZE - 1:0], - output data_out_valid, - input data_out_ready -); - logic [EXP_WIDTH - 1:0] exp[IN_SIZE - 1:0]; - logic exp_valid, exp_ready; - for (genvar i = 0; i < IN_SIZE; i++) begin : exp_parallel - /* verilator lint_off UNUSEDSIGNAL */ - logic exp_in_valid, exp_in_ready; - logic exp_out_valid, exp_out_ready; - assign exp_out_ready = exp_ready; - assign exp_in_valid = data_in_valid; - hash_exp #( - .IN_WIDTH (IN_WIDTH), - .OUT_WIDTH(EXP_WIDTH) - ) exp_inst ( - .data_in(data_in[i]), - .data_in_valid(exp_in_valid), - .data_in_ready(exp_in_ready), - .data_out(exp[i]), - .data_out_valid(exp_out_valid), - .data_out_ready(exp_out_ready), - .* - ); - end - assign exp_valid = exp_parallel[0].exp_out_valid; - assign data_in_ready = exp_parallel[0].exp_in_ready; - localparam SUM_WIDTH = EXP_WIDTH + $clog2(IN_SIZE); - logic [SUM_WIDTH - 1:0] sum; - logic sum_valid, sum_ready; - logic exp2sum_ready, exp2roller_ready; - logic exp2sum_valid, exp2roller_valid; - - split2 split2_inst ( - .data_in_valid (exp_valid), - .data_in_ready (exp_ready), - .data_out_valid({exp2sum_valid, exp2roller_valid}), - .data_out_ready({exp2sum_ready, exp2roller_ready}) - ); - - fixed_adder_tree #( - .IN_SIZE 
(IN_SIZE), - .IN_WIDTH(IN_WIDTH) - ) sum_inst ( - .data_in(exp), - .data_in_valid(exp2sum_valid), - .data_in_ready(exp2sum_ready), - .data_out(sum), - .data_out_valid(sum_valid), - .data_out_ready(sum_ready), - .* - ); - // roller part - logic [EXP_WIDTH - 1:0] roller_exp[OUT_SIZE - 1:0]; - logic roller_exp_valid, roller_exp_ready; - logic [EXP_WIDTH - 1:0] ff_exp[IN_SIZE - 1:0]; - logic ff_exp_valid, ff_exp_ready; - unpacked_fifo #( - .DEPTH(IN_DEPTH), - .DATA_WIDTH(EXP_WIDTH), - .IN_NUM(IN_SIZE) - ) roller_buffer ( - .data_in(exp), - .data_in_valid(exp2roller_valid), - .data_in_ready(exp2roller_ready), - .data_out(ff_exp), - .data_out_valid(ff_exp_valid), - .data_out_ready(ff_exp_ready), - .* - ); - roller #( - .DATA_WIDTH(EXP_WIDTH), - .NUM(IN_SIZE), - .IN_SIZE(IN_SIZE), - .ROLL_NUM(OUT_SIZE) - ) roller_inst ( - .data_in(ff_exp), - .data_in_valid(ff_exp_valid), - .data_in_ready(ff_exp_ready), - .data_out(roller_exp), - .data_out_valid(roller_exp_valid), - .data_out_ready(roller_exp_ready), - .* - ); - - localparam ACC_WIDTH = SUM_WIDTH + $clog2(IN_DEPTH); - logic [ACC_WIDTH - 1:0] acc; - logic [ACC_WIDTH - 1:0] acc_duplicate[OUT_SIZE - 1:0]; - logic acc_valid, acc_ready; - - fixed_accumulator #( - .IN_WIDTH(SUM_WIDTH), - .IN_DEPTH(IN_DEPTH) - ) fixed_accumulator_inst ( - .clk(clk), - .rst(rst), - .data_in(sum), - .data_in_valid(sum_valid), - .data_in_ready(sum_ready), - .data_out(acc), - .data_out_valid(acc_valid), - .data_out_ready(acc_ready) - ); - - logic [ACC_WIDTH - 1:0] ib_acc[OUT_SIZE - 1:0]; - logic ib_acc_valid, ib_acc_ready; - circular_buffer #( - .IN_WIDTH(ACC_WIDTH), - .IN_SIZE (OUT_SIZE), - .REPEAT (IN_DEPTH * IN_SIZE / OUT_SIZE) - ) acc_circular ( - .clk(clk), - .rst(rst), - .data_in(acc_duplicate), - .data_in_valid(acc_valid), - .data_in_ready(acc_ready), - .data_out(ib_acc), - .data_out_valid(ib_acc_valid), - .data_out_ready(ib_acc_ready) - ); - - logic [ACC_WIDTH - 1:0] one_over_div[OUT_SIZE - 1:0]; - logic [DIV_WIDTH - 1:0] div_in[OUT_SIZE - 
1:0]; - - fixed_rounding #( - .IN_SIZE(OUT_SIZE), - .IN_WIDTH(ACC_WIDTH), - .IN_FRAC_WIDTH(0), - .OUT_WIDTH(DIV_WIDTH), - .OUT_FRAC_WIDTH(0) - ) div_round ( - .data_in (one_over_div), - .data_out(div_in) - ); - logic div_join_valid, div_join_ready; - - join2 #() div_join_inst ( - .data_in_ready ({roller_exp_ready, ib_acc_ready}), - .data_in_valid ({roller_exp_valid, ib_acc_valid}), - .data_out_valid(div_join_valid), - .data_out_ready(div_join_ready) - ); - for (genvar i = 0; i < OUT_SIZE; i++) begin : div_parallel - /* verilator lint_off UNUSEDSIGNAL */ - logic [OUT_WIDTH - 1:0] div_out; - logic div_out_valid, div_out_ready; - assign acc_duplicate[i] = acc; - /* verilator lint_off WIDTH */ - assign one_over_div[i] = (roller_exp[i] == 0) ? 2**(DIV_WIDTH-1) - 1:ib_acc[i] / roller_exp[i]; - /* verilator lint_on WIDTH */ - logic div_in_valid, div_in_ready; - assign div_in_valid = div_join_valid; - hash_div #( - .IN_WIDTH (DIV_WIDTH), - .OUT_WIDTH(OUT_WIDTH) - ) div_inst ( - .data_in(div_in[i]), - .data_in_valid(div_in_valid), - .data_in_ready(div_in_ready), - .data_out(div_out), - .data_out_valid(div_out_valid), - .data_out_ready(div_out_ready), - .* - ); - assign data_out[i] = div_out; - assign div_out_ready = data_out_ready; - end - assign div_join_ready = div_parallel[0].div_in_ready; - assign data_out_valid = div_parallel[0].div_out_valid; -endmodule -/* verilator lint_off DECLFILENAME */ - -module hash_exp #( - parameter IN_WIDTH = 8, - parameter OUT_WIDTH = 8 -) ( - input clk, - input rst, - input [IN_WIDTH - 1:0] data_in, - input data_in_valid, - output data_in_ready, - output logic [OUT_WIDTH - 1:0] data_out, - output data_out_valid, - input data_out_ready -); - localparam MEM_NUM = 2 ** (IN_WIDTH + 1) - 1; - logic [OUT_WIDTH - 1:0] mem[MEM_NUM - 1:0]; - initial begin - $readmemh("../exp_init.mem", mem); - end - - // The shift register stores the validity of the data in the buffer - logic shift_reg; - // The buffer stores the intermeidate data being computed in 
the register slice - logic [OUT_WIDTH-1:0] buffer; - - always_ff @(posedge clk) begin - if (rst) shift_reg <= 1'b0; - else begin - // no backpressure or buffer empty - if (data_out_ready || !shift_reg) shift_reg <= data_in_valid; - else shift_reg <= shift_reg; - end - end - - // buffer - always_ff @(posedge clk) begin - if (rst) buffer <= 0; - // backpressure && valid output - if (!data_out_ready && data_out_valid) buffer <= buffer; - /* verilator lint_off WIDTH */ - else - buffer <= mem[data_in]; - /* verilator lint_on WIDTH */ - end - - // empty buffer or no back pressure - assign data_in_ready = (~shift_reg) | data_out_ready; - // dummy data_iniring - assign data_out_valid = shift_reg; - assign data_out = buffer; -endmodule - - -module hash_div #( - parameter IN_WIDTH = 8, - parameter OUT_WIDTH = 8 -) ( - input clk, - input rst, - input [IN_WIDTH - 1:0] data_in, - input data_in_valid, - output logic data_in_ready, - output logic [OUT_WIDTH - 1:0] data_out, - output logic data_out_valid, - input data_out_ready -); - localparam MEM_NUM = 2 ** (IN_WIDTH + 1) - 1; - logic [OUT_WIDTH - 1:0] mem[MEM_NUM - 1:0]; - initial begin - $readmemh("../div_init.mem", mem); - end - - // The shift register stores the validity of the data in the buffer - logic shift_reg; - // The buffer stores the intermeidate data being computed in the register slice - logic [OUT_WIDTH-1:0] buffer; - - always_ff @(posedge clk) begin - if (rst) shift_reg <= 1'b0; - else begin - // no backpressure or buffer empty - if (data_out_ready || !shift_reg) shift_reg <= data_in_valid; - else shift_reg <= shift_reg; - end - end - - // buffer - always_ff @(posedge clk) begin - if (rst) buffer <= 0; - // backpressure && valid output - if (!data_out_ready && data_out_valid) buffer <= buffer; - /* verilator lint_off WIDTH */ - else - buffer <= mem[data_in]; - /* verilator lint_on WIDTH */ - end - - // empty buffer or no back pressure - assign data_in_ready = (~shift_reg) | data_out_ready; - // dummy data_iniring 
- assign data_out_valid = shift_reg; - assign data_out = buffer; -endmodule - -module circular_buffer #( - // input - parameter IN_WIDTH = 8, - // define as nm * mk - // rows refers to n, columns refers to m - - //in parallelism in the row dimension - parameter IN_SIZE = 1, - parameter REPEAT = 8, - parameter OUT_WIDTH = IN_WIDTH, - parameter OUT_SIZE = IN_SIZE -) ( - input clk, - input rst, - //input data - input [IN_WIDTH-1:0] data_in[IN_SIZE - 1:0], - input logic data_in_valid, - output logic data_in_ready, - - output [OUT_WIDTH-1:0] data_out[OUT_SIZE - 1:0], - output logic data_out_valid, - input logic data_out_ready -); - localparam COUNT_SIZE = $clog2(REPEAT) + 1; - logic [IN_WIDTH-1:0] buffer[IN_SIZE - 1:0]; - // The shift register stores the validity of the data in the buffer - logic circular_mode; - logic [COUNT_SIZE - 1:0] circular_count; - assign circular_mode = circular_count != 0; - logic insert, remove; - always_comb begin - insert = data_in_ready && data_in_valid; - remove = data_out_ready && data_out_valid; - end - - always_ff @(posedge clk) begin - /* verilator lint_off WIDTH */ - if (rst) circular_count <= 0; - else if (insert && (~circular_mode)) circular_count <= REPEAT; - else if (remove && circular_mode) circular_count <= circular_count - 1; - else circular_count <= circular_count; - /* verilator lint_on WIDTH */ - end - for (genvar i = 0; i < IN_SIZE; i++) begin - always_ff @(posedge clk) begin - if (rst) buffer[i] <= 0; - else if (insert && (~circular_mode)) buffer[i] <= data_in[i]; - else buffer[i] <= buffer[i]; - end - assign data_out[i] = buffer[i]; - end - - always_comb begin - // empty buffer or no back pressure - data_in_ready = (~circular_mode); - // dummy data_iniring - data_out_valid = circular_mode; - end -endmodule - diff --git a/src/mase_components/vision_models/vit/test/affine_layernorm_tb.py b/src/mase_components/vision_models/vit/test/affine_layernorm_tb.py deleted file mode 100644 index 44e99401b..000000000 --- 
a/src/mase_components/vision_models/vit/test/affine_layernorm_tb.py +++ /dev/null @@ -1,299 +0,0 @@ -#!/usr/bin/env python3 - -# This script tests the register slice -import random, os, math, logging, sys - -sys.path.append( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -print(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) - -sys.path.append("/workspace/components/testbench/ViT/") -sys.path.append("/workspace/machop/") -from mase_cocotb.random_test import RandomSource -from mase_cocotb.random_test import RandomSink -from mase_cocotb.random_test import check_results - -import cocotb -from cocotb.triggers import Timer -from cocotb.triggers import FallingEdge -from cocotb.clock import Clock -from cocotb.runner import get_runner -import torch -from .helpers.pvt_quant import fixed_affine -from mase_cocotb.z_qlayers import quantize_to_int as q2i - -debug = True -logger = logging.getLogger("tb_signals") -if debug: - logger.setLevel(logging.DEBUG) - -import pytest - - -# DUT test specifications -class VerificationCase: - def __init__(self, samples=1): - self.samples = samples - self.data_width = 8 - self.data_frac_width = 3 - self.b_width = 8 - self.b_frac_width = 5 - self.data_out_width = 8 - self.data_out_frac_width = 4 - self.w_config = { - "affine": { - "mul": { - "name": "integer", - "data_in_width": self.data_width, - "data_in_frac_width": self.data_frac_width, - }, - "add": { - "name": "integer", - "data_in_width": self.b_width, - "data_in_frac_width": self.b_frac_width, - }, - } - } - self.in_size = 4 - self.data_generate() - self.inputs = RandomSource( - samples=samples, - max_stalls=2 * samples, - num=self.in_size, - is_data_vector=True, - debug=debug, - data_specify=self.data_in, - ) - self.outputs = RandomSink(samples=samples, max_stalls=2 * samples, debug=debug) - self.samples = samples - self.ref = self.sw_compute() - - def data_generate(self): - torch.manual_seed(0) - samples = self.samples - 
self.fixed_aff = fixed_affine(self.w_config["affine"]) - self.x = 3 * torch.randn(self.samples, self.in_size) - w = self.fixed_aff.weight - b = self.fixed_aff.bias - self.weight_in = ( - q2i( - w, - self.w_config["affine"]["mul"]["data_in_width"], - self.w_config["affine"]["mul"]["data_in_frac_width"], - ) - .repeat(self.samples, self.in_size) - .tolist() - ) - - self.bias_in = ( - q2i( - b, - self.w_config["affine"]["add"]["data_in_width"], - self.w_config["affine"]["add"]["data_in_frac_width"], - ) - .repeat(self.samples, self.in_size) - .tolist() - ) - self.data_in = q2i( - self.x, - self.w_config["affine"]["mul"]["data_in_width"], - self.w_config["affine"]["mul"]["data_in_frac_width"], - ).tolist() - self.data_in.reverse() - self.weight_in.reverse() - self.bias_in.reverse() - self.weight = RandomSource( - samples=samples, - max_stalls=2 * samples, - num=self.in_size, - is_data_vector=True, - debug=debug, - data_specify=self.weight_in, - ) - self.bias = RandomSource( - samples=samples, - max_stalls=2 * samples, - num=self.in_size, - is_data_vector=True, - debug=debug, - data_specify=self.bias_in, - ) - - def sw_compute(self): - data_out = self.fixed_aff(self.x) - output = q2i(data_out, self.data_out_width, self.data_out_frac_width).tolist() - return output - - def get_dut_parameters(self): - return { - "IN_WIDTH": self.w_config["affine"]["mul"]["data_in_width"], - "IN_FRAC_WIDTH": self.w_config["affine"]["mul"]["data_in_frac_width"], - "BIAS_WIDTH": self.w_config["affine"]["add"]["data_in_width"], - "BIAS_FRAC_WIDTH": self.w_config["affine"]["add"]["data_in_frac_width"], - "OUT_WIDTH": self.data_out_width, - "OUT_FRAC_WIDTH": self.data_out_frac_width, - "IN_SIZE": self.in_size, - } - - -def in_out_wave(dut, name): - logger.debug( - "{} State: (in_valid,in_ready,out_valid,out_ready) = ({},{},{},{})".format( - name, - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - - -@cocotb.test() -async def 
cocotb_test_register_slice(dut): - """Test register slice""" - samples = 20 - test_case = VerificationCase(samples=samples) - - # Reset cycle - await Timer(20, units="ns") - dut.rst.value = 1 - await Timer(100, units="ns") - dut.rst.value = 0 - - # Create a 10ns-period clock on port clk - clock = Clock(dut.clk, 10, units="ns") - # Start the clock - cocotb.start_soon(clock.start()) - await Timer(500, units="ns") - - # Synchronize with the clock - dut.data_in_valid.value = 0 - dut.data_out_ready.value = 1 - in_out_wave(dut, "Pre-clk") - await FallingEdge(dut.clk) - in_out_wave(dut, "Post-clk") - - in_out_wave(dut, "Pre-clk") - await FallingEdge(dut.clk) - in_out_wave(dut, "Post-clk") - - done = False - while not done: - await FallingEdge(dut.clk) - in_out_wave(dut, "Post-clk") - - ## Pre_compute - dut.data_in_valid.value = test_case.inputs.pre_compute() - dut.weight_valid.value = test_case.weight.pre_compute() - dut.bias_valid.value = test_case.bias.pre_compute() - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.pre_compute( - dut.data_out_valid.value - ) - await Timer(1, units="ns") - - ## Compute - dut.data_in_valid.value, dut.data_in.value = test_case.inputs.compute( - dut.data_in_ready.value - ) - dut.weight_valid.value, dut.weight.value = test_case.weight.compute( - dut.weight_ready.value - ) - dut.bias_valid.value, dut.bias.value = test_case.bias.compute( - dut.bias_ready.value - ) - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.compute( - dut.data_out_valid.value, dut.data_out.value - ) - # wave_check(dut) - logger.debug("\n") - # breakpoint() - done = ( - test_case.outputs.is_full() - and test_case.inputs.is_empty() - and test_case.weight.is_empty() - and test_case.bias.is_empty() - ) - - check_results(test_case.outputs.data, test_case.ref) - - -def wave_check(dut): - logger.debug( - "wave of in_out:\n\ - {},{},data_in = {} \n\ - {},{},weight_in = {} \n\ - {},{},bias = {} \n\ - {},{},data_out = {}\n\ - 
".format( - dut.data_in_valid.value, - dut.data_in_ready.value, - [int(i) for i in dut.data_in.value], - dut.weight_valid.value, - dut.weight_ready.value, - [int(i) for i in dut.weight.value], - dut.bias_valid.value, - dut.bias_ready.value, - [int(i) for i in dut.bias.value], - dut.data_out_valid.value, - dut.data_out_ready.value, - [int(i) for i in dut.data_out.value], - ) - ) - - logger.debug( - "wave of sa_out:\n\ - {},{},prod = {} \n\ - {},{},add = {} \n\ - ".format( - dut.data_out_valid.value, - dut.data_out_ready.value, - [int(i) for i in dut.prod.value], - dut.data_out_valid.value, - dut.data_out_ready.value, - [int(i) for i in dut.round_prod.value], - dut.data_out_valid.value, - dut.data_out_ready.value, - [int(i) for i in dut.round_in.value], - ) - ) - breakpoint() - - -def runner(): - sim = os.getenv("SIM", "verilator") - - verilog_sources = [ - "../../../../components/ViT/affine_layernorm.sv", - "../../../../components/cast/fixed_rounding.sv", - "../../../../components/common/join2.sv", - "../../../../components/fixed_arithmetic/fixed_vector_mult.sv", - "../../../../components/fixed_arithmetic/fixed_mult.sv", - "../../../../components/common/fifo.sv", - ] - test_case = VerificationCase() - - # set parameters - extra_args = [] - for k, v in test_case.get_dut_parameters().items(): - extra_args.append(f"-G{k}={v}") - print(extra_args) - runner = get_runner(sim) - runner.build( - verilog_sources=verilog_sources, - hdl_toplevel="affine_layernorm", - build_args=extra_args, - ) - - runner.test(hdl_toplevel="affine_layernorm", test_module="affine_layernorm_tb") - - -@pytest.mark.skip(reason="Needs to be fixed.") -def test_affine_layernorm(): - runner() - - -if __name__ == "__main__": - test_affine_layernorm() diff --git a/src/mase_components/vision_models/vit/test/fixed_block_tb.py b/src/mase_components/vision_models/vit/test/fixed_block_tb.py deleted file mode 100644 index 51b2baa67..000000000 --- a/src/mase_components/vision_models/vit/test/fixed_block_tb.py 
+++ /dev/null @@ -1,1149 +0,0 @@ -#!/usr/bin/env python3 - -import random, os, math, logging, sys - -sys.path.append( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -import random, os, math, logging, sys -import numpy as np -import torch -import torch.nn as nn -from torch import Tensor - -# from torchsummary import summary -from einops import rearrange, reduce, repeat - - -from mase_cocotb.random_test import RandomSource -from mase_cocotb.random_test import RandomSink -from mase_cocotb.random_test import check_results - -import cocotb -from cocotb.triggers import Timer -from cocotb.triggers import FallingEdge -from cocotb.clock import Clock -from cocotb.runner import get_runner - -from .helpers.pvt_quant import QuantizedBlock -from .helpers.ha_softmax import generate_table_hardware, generate_table_div_hardware -from mase_cocotb.z_qlayers import quantize_to_int as q2i -from mase_cocotb.z_qlayers import linear_data_pack - -debug = True - -logger = logging.getLogger("tb_signals") -if debug: - logger.setLevel(logging.DEBUG) - - -# DUT test specifications -class VerificationCase: - def __init__(self, samples=1): - self.samples = samples - self.width_generate() - - self.has_bias = 1 - self.in_num = 2 - self.in_dim = 12 - self.num_heads = 2 - self.wqkv_dim = self.in_dim / self.num_heads - self.wp_dim = self.in_dim - self.unroll_in_num = 1 - self.unroll_in_dim = 2 - self.unroll_wqkv_dim = 3 - - self.in_parallelism = self.unroll_in_num - self.in_num_parallelism = int(self.in_num / self.unroll_in_num) - - self.in_size = self.unroll_in_dim - self.in_depth = int(self.in_dim / self.unroll_in_dim) - - # noted num_heads * wqkv_p * wqkv_np should be = in_s * in_d - self.wqkv_parallelism = self.unroll_wqkv_dim - self.wqkv_num_parallelism = int(self.wqkv_dim / self.unroll_wqkv_dim) - - self.wp_parallelism = self.in_size - self.wp_num_parallelism = self.in_depth - assert 
( - self.num_heads * self.wqkv_parallelism * self.wqkv_num_parallelism - == self.in_size * self.in_depth - ), "should have num_heads * wqkv_p * wqkv_np == in_s * in_d" - assert ( - (self.in_num % self.unroll_in_num == 0) - and (self.in_dim % self.unroll_in_dim == 0) - and (self.wqkv_dim % self.unroll_wqkv_dim == 0) - ), "unroll parameter should be exact division of all" - - self.wp_size = self.num_heads * self.wqkv_parallelism - self.wp_depth = self.wqkv_num_parallelism - - self.in_num = self.in_num - self.in_features = self.in_dim - self.mlp_ratio = 2 - self.hidden_features = self.mlp_ratio * self.in_features - self.out_features = self.in_features - - self.unroll_in_num = self.unroll_in_num - self.unroll_in_features = self.unroll_in_dim - self.unroll_hidden_features = 2 - self.unroll_out_features = self.unroll_in_features - # data_generate - self.source_generate() - ## remain modification - self.outputs = RandomSink( - samples=samples * self.in_num_parallelism * self.wp_num_parallelism, - max_stalls=2 * samples * self.in_num_parallelism * self.wp_parallelism, - debug=debug, - ) - self.samples = samples - self.ref = self.sw_compute() - - def width_generate(self): - din, din_f = 8, 3 - - aff_msa_w, aff_msa_w_f = din, din_f - aff_msa_b, aff_msa_b_f = 8, 3 - msa_din, msa_din_f = 8, 3 - wq, wq_f = 6, 4 - wkv, wkv_f = 6, 4 - wp, wp_f = 6, 4 - - bq, bq_f = 6, 4 - bkv, bkv_f = 6, 4 - bp, bp_f = 6, 4 - - dq, dq_f = 8, 3 - dk, dk_f = 8, 3 - dv, dv_f = 8, 3 - ds, ds_f = 8, 3 - softmax_exp, softmax_exp_f = 8, 5 - softmax_ds, softmax_ds_f = 8, 3 - div = 9 - dz, dz_f = 8, 3 - - msa_o, msa_o_f = 8, 3 - - aff_mlp_w, aff_mlp_w_f = msa_o + 1, msa_o_f - aff_mlp_b, aff_mlp_b_f = 8, 3 - - mlp_din, mlp_din_f = 8, 3 - fc1_w, fc1_w_f = 6, 4 - fc1_b, fc1_b_f = 6, 4 - mlp_hidden, mlp_hidden_f = 8, 3 - fc2_w, fc2_w_f = 6, 4 - fc2_b, fc2_b_f = 6, 4 - mlp_o, mlp_o_f = 8, 3 - self.w_config = { - "head_proj": { - "name": "integer", - "data_in_width": 8, - "data_in_frac_width": 5, - 
"weight_width": 8, - "weight_frac_width": 6, - "bias_width": 8, - "bias_frac_width": 4, - }, - "patch_embed": { - "patch_proj": { - "name": "integer", - "data_in_width": 8, - "data_in_frac_width": 5, - "weight_width": 8, - "weight_frac_width": 6, - "bias_width": 8, - "bias_frac_width": 5, - }, - }, - "pos_add": { - "name": "integer", - "data_in_width": 8, - "data_in_frac_width": 5, - }, - "block": { - "affine_att": { - "mul": { - "name": "integer", - "data_in_width": aff_msa_w, - "data_in_frac_width": aff_msa_w_f, - }, - "add": { - "name": "integer", - "data_in_width": aff_msa_b, - "data_in_frac_width": aff_msa_b_f, - }, - }, - "msa": { - "q_proj": { - "name": "integer", - "weight_width": wq, - "weight_frac_width": wq_f, - "data_in_width": msa_din, - "data_in_frac_width": msa_din_f, - "bias_width": bq, - "bias_frac_width": bq_f, - }, - "kv_proj": { - "name": "integer", - "weight_width": wkv, - "weight_frac_width": wkv_f, - "data_in_width": msa_din, - "data_in_frac_width": msa_din_f, - "bias_width": bkv, - "bias_frac_width": bkv_f, - }, - "z_proj": { - "name": "integer", - "weight_width": wp, - "weight_frac_width": wp_f, - "data_in_width": dz, - "data_in_frac_width": dz_f, - "bias_width": bp, - "bias_frac_width": bp_f, - }, - "softmax": { - "name": "integer", - "exp_width": softmax_exp, - "exp_frac_width": softmax_exp_f, - "data_in_width": ds, - "data_in_frac_width": ds_f, - "data_out_width": softmax_ds, - "data_out_frac_width": softmax_ds_f, - "div_width": div, - }, - "attn_matmul": { - "name": "integer", - "data_in_width": dq, - "data_in_frac_width": dq_f, - "weight_width": dk, - "weight_frac_width": dk_f, - }, - "z_matmul": { - "name": "integer", - "data_in_width": softmax_ds, - "data_in_frac_width": softmax_ds_f, - "weight_width": dv, - "weight_frac_width": dv_f, - }, - }, - "add1": { - "name": "integer", - "data_in_width": msa_o, - "data_in_frac_width": msa_o_f, - }, - "affine_mlp": { - "mul": { - "name": "integer", - "data_in_width": aff_mlp_w, - 
"data_in_frac_width": aff_mlp_w_f, - }, - "add": { - "name": "integer", - "data_in_width": aff_mlp_b, - "data_in_frac_width": aff_mlp_b_f, - }, - }, - "mlp": { - "fc1_proj": { - "name": "integer", - "weight_width": fc1_w, - "weight_frac_width": fc1_w_f, - "data_in_width": mlp_din, - "data_in_frac_width": mlp_din_f, - "bias_width": fc1_b, - "bias_frac_width": fc1_b_f, - }, - "mlp_relu": { - "name": "integer", - "bypass": True, - "data_in_width": mlp_hidden, - "data_in_frac_width": mlp_hidden_f, - }, - "fc2_proj": { - "name": "integer", - "weight_width": fc2_w, - "weight_frac_width": fc2_w_f, - "data_in_width": mlp_hidden, - "data_in_frac_width": mlp_hidden_f, - "bias_width": fc2_b, - "bias_frac_width": fc2_b_f, - }, - }, - "add2": { - "name": "integer", - "data_in_width": mlp_o, - "data_in_frac_width": mlp_o_f, - }, - }, - "pvt_norm": { - "mul": { - "name": "integer", - "data_in_width": aff_mlp_w, - "data_in_frac_width": aff_mlp_w_f, - }, - "add": { - "name": "integer", - "data_in_width": aff_mlp_b, - "data_in_frac_width": aff_mlp_b_f, - }, - }, - } - self.ow, self.ow_f = mlp_o + 1, mlp_o_f - - def get_dut_parameters(self): - return { - "IN_WIDTH": self.w_config["block"]["affine_att"]["mul"]["data_in_width"], - "IN_FRAC_WIDTH": self.w_config["block"]["affine_att"]["mul"][ - "data_in_frac_width" - ], - "AF_MSA_ADD_WIDTH": self.w_config["block"]["affine_att"]["add"][ - "data_in_width" - ], - "AF_MSA_ADD_FRAC_WIDTH": self.w_config["block"]["affine_att"]["add"][ - "data_in_frac_width" - ], - "MSA_IN_WIDTH": self.w_config["block"]["msa"]["q_proj"]["data_in_width"], - "MSA_IN_FRAC_WIDTH": self.w_config["block"]["msa"]["q_proj"][ - "data_in_frac_width" - ], - "WQ_WIDTH": self.w_config["block"]["msa"]["q_proj"]["weight_width"], - "WQ_FRAC_WIDTH": self.w_config["block"]["msa"]["q_proj"][ - "weight_frac_width" - ], - "WK_WIDTH": self.w_config["block"]["msa"]["kv_proj"]["weight_width"], - "WK_FRAC_WIDTH": self.w_config["block"]["msa"]["kv_proj"][ - "weight_frac_width" - ], - 
"WV_WIDTH": self.w_config["block"]["msa"]["kv_proj"]["weight_width"], - "WV_FRAC_WIDTH": self.w_config["block"]["msa"]["kv_proj"][ - "weight_frac_width" - ], - "WP_WIDTH": self.w_config["block"]["msa"]["z_proj"]["weight_width"], - "WP_FRAC_WIDTH": self.w_config["block"]["msa"]["z_proj"][ - "weight_frac_width" - ], - "BQ_WIDTH": self.w_config["block"]["msa"]["q_proj"]["bias_width"], - "BQ_FRAC_WIDTH": self.w_config["block"]["msa"]["q_proj"]["bias_frac_width"], - "BK_WIDTH": self.w_config["block"]["msa"]["kv_proj"]["bias_width"], - "BK_FRAC_WIDTH": self.w_config["block"]["msa"]["kv_proj"][ - "bias_frac_width" - ], - "BV_WIDTH": self.w_config["block"]["msa"]["kv_proj"]["bias_width"], - "BV_FRAC_WIDTH": self.w_config["block"]["msa"]["kv_proj"][ - "bias_frac_width" - ], - "BP_WIDTH": self.w_config["block"]["msa"]["z_proj"]["bias_width"], - "BP_FRAC_WIDTH": self.w_config["block"]["msa"]["z_proj"]["bias_frac_width"], - "DQ_WIDTH": self.w_config["block"]["msa"]["attn_matmul"]["data_in_width"], - "DQ_FRAC_WIDTH": self.w_config["block"]["msa"]["attn_matmul"][ - "data_in_frac_width" - ], - "DK_WIDTH": self.w_config["block"]["msa"]["attn_matmul"]["weight_width"], - "DK_FRAC_WIDTH": self.w_config["block"]["msa"]["attn_matmul"][ - "weight_frac_width" - ], - "DS_WIDTH": self.w_config["block"]["msa"]["z_matmul"]["data_in_width"], - "DS_FRAC_WIDTH": self.w_config["block"]["msa"]["z_matmul"][ - "data_in_frac_width" - ], - "DV_WIDTH": self.w_config["block"]["msa"]["z_matmul"]["weight_width"], - "DV_FRAC_WIDTH": self.w_config["block"]["msa"]["z_matmul"][ - "weight_frac_width" - ], - "EXP_WIDTH": self.w_config["block"]["msa"]["softmax"]["exp_width"], - "EXP_FRAC_WIDTH": self.w_config["block"]["msa"]["softmax"][ - "exp_frac_width" - ], - "DIV_WIDTH": self.w_config["block"]["msa"]["softmax"]["div_width"], - "DS_SOFTMAX_WIDTH": self.w_config["block"]["msa"]["softmax"][ - "data_out_width" - ], - "DS_SOFTMAX_FRAC_WIDTH": self.w_config["block"]["msa"]["softmax"][ - "data_out_frac_width" - ], 
- "DZ_WIDTH": self.w_config["block"]["msa"]["z_proj"]["data_in_width"], - "DZ_FRAC_WIDTH": self.w_config["block"]["msa"]["z_proj"][ - "data_in_frac_width" - ], - "AF_MLP_IN_WIDTH": self.w_config["block"]["affine_mlp"]["mul"][ - "data_in_width" - ], - "AF_MLP_IN_FRAC_WIDTH": self.w_config["block"]["affine_mlp"]["mul"][ - "data_in_frac_width" - ], - "AF_MLP_ADD_WIDTH": self.w_config["block"]["affine_mlp"]["add"][ - "data_in_width" - ], - "AF_MLP_ADD_FRAC_WIDTH": self.w_config["block"]["affine_mlp"]["add"][ - "data_in_frac_width" - ], - # mlp - "MLP_IN_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"]["data_in_width"], - "MLP_IN_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"][ - "data_in_frac_width" - ], - "WEIGHT_I2H_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"][ - "weight_width" - ], - "WEIGHT_I2H_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"][ - "weight_frac_width" - ], - "BIAS_I2H_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"]["bias_width"], - "BIAS_I2H_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"][ - "bias_frac_width" - ], - "MLP_HAS_BIAS": self.has_bias, - "HIDDEN_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"]["data_in_width"], - "HIDDEN_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"][ - "data_in_frac_width" - ], - "WEIGHT_H2O_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"][ - "weight_width" - ], - "WEIGHT_H2O_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"][ - "weight_frac_width" - ], - "BIAS_H2O_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"]["bias_width"], - "BIAS_H2O_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"][ - "bias_frac_width" - ], - "OUT_WIDTH": self.ow, - "OUT_FRAC_WIDTH": self.ow_f, - "IN_NUM": self.in_num, - "IN_DIM": self.in_dim, - "MLP_RATIO": self.mlp_ratio, - "NUM_HEADS": self.num_heads, - "UNROLL_IN_NUM": self.unroll_in_num, - "UNROLL_IN_DIM": self.unroll_in_dim, - "UNROLL_WQKV_DIM": self.unroll_wqkv_dim, - "UNROLL_HIDDEN_FEATURES": self.unroll_hidden_features, - } - - def 
source_generate(self): - samples = self.samples - torch.manual_seed(2) - self.x = 3 * torch.randn((samples, self.in_num, self.in_dim)) - w_config = self.w_config["block"] - self.block = QuantizedBlock( - self.in_dim, - self.num_heads, - w_config, - mlp_ratio=self.hidden_features / self.in_features, - qkv_bias=True, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - ) - input_tensor = q2i( - self.x, - w_config["msa"]["q_proj"]["data_in_width"], - w_config["msa"]["q_proj"]["data_in_frac_width"], - ) - self.data_in = self.data_pack( - input_tensor, - self.in_num_parallelism, - self.in_depth, - self.in_parallelism, - self.in_size, - ) - self.data_in.reverse() - self.inputs = RandomSource( - name="data_in", - samples=samples * self.in_depth * self.in_num_parallelism, - num=self.in_parallelism * self.in_size, - max_stalls=2 * samples * self.in_depth * self.in_num_parallelism, - data_specify=self.data_in, - debug=debug, - ) - att = self.block.attn - self.msa_data_generate(att) - mlp = self.block.mlp - self.mlp_data_generate(mlp) - - aff_att = self.block.norm1 - num = self.in_num_parallelism * self.in_depth - in_size = self.in_parallelism * self.in_size - aff_att_w, aff_att_b = self.aff_data_generate( - w_config["affine_att"], aff_att, num, in_size - ) - aff_mlp = self.block.norm2 - aff_mlp_w, aff_mlp_b = self.aff_data_generate( - w_config["affine_mlp"], aff_mlp, num, in_size - ) - # breakpoint() - self.aff_att_weight = RandomSource( - samples=samples * num, - max_stalls=2 * samples, - num=in_size, - is_data_vector=True, - debug=debug, - data_specify=aff_att_w, - ) - self.aff_att_bias = RandomSource( - samples=samples * num, - max_stalls=2 * samples, - num=in_size, - is_data_vector=True, - debug=debug, - data_specify=aff_att_b, - ) - - self.aff_mlp_weight = RandomSource( - samples=samples * num, - max_stalls=2 * samples, - num=in_size, - is_data_vector=True, - debug=debug, - data_specify=aff_mlp_w, - ) - self.aff_mlp_bias = RandomSource( - samples=samples * num, - 
max_stalls=2 * samples, - num=in_size, - is_data_vector=True, - debug=debug, - data_specify=aff_mlp_b, - ) - - def aff_data_generate(self, config, qaff, num, in_size): - fixed_aff = qaff - w = fixed_aff.weight - b = fixed_aff.bias - weight_in = ( - q2i(w, config["mul"]["data_in_width"], config["mul"]["data_in_frac_width"]) - .repeat(self.samples * num, in_size) - .tolist() - ) - - bias_in = ( - q2i(b, config["add"]["data_in_width"], config["add"]["data_in_frac_width"]) - .repeat(self.samples * num, in_size) - .tolist() - ) - weight_in.reverse() - bias_in.reverse() - return weight_in, bias_in - - def msa_data_generate(self, qatt): - # generate data - samples = self.samples - config = self.w_config["block"]["msa"] - in_x = self.in_dim - att = qatt - att_wq = q2i( - att.q.weight, - config["q_proj"]["weight_width"], - config["q_proj"]["weight_frac_width"], - ) - att_wkv = q2i( - att.kv.weight, - config["kv_proj"]["weight_width"], - config["kv_proj"]["weight_frac_width"], - ) - wqkv_tensor = torch.cat((att_wq, att_wkv), 0) - wqkv_tensor = wqkv_tensor.reshape(3, in_x, in_x) - wqkv_tensor = wqkv_tensor.reshape(in_x * 3, in_x).repeat(samples, 1, 1) - - att_bq = q2i( - att.q.bias, - config["q_proj"]["bias_width"], - config["q_proj"]["bias_frac_width"], - ) - att_bkv = q2i( - att.kv.bias, - config["kv_proj"]["bias_width"], - config["kv_proj"]["bias_frac_width"], - ) - bqkv_tensor = torch.cat((att_bq, att_bkv), 0) - bqkv_tensor = bqkv_tensor.reshape(3, in_x) - bqkv_tensor = bqkv_tensor.reshape(-1).repeat(samples, 1) - - wp_tensor = q2i( - att.proj.weight, - config["z_proj"]["weight_width"], - config["z_proj"]["weight_frac_width"], - ).repeat(samples, 1, 1) - bp_tensor = q2i( - att.proj.bias, - config["z_proj"]["bias_width"], - config["z_proj"]["bias_frac_width"], - ).repeat(samples, 1) - - logger.debug( - "input data: \n\ - wqkv_tensor = \n{}\n\ - bqkv_tensor = \n{}\n\ - wp_tensor = \n{}\n\ - bp_tensor = \n{}\n\ - ".format( - wqkv_tensor, bqkv_tensor, wp_tensor, bp_tensor - ) 
- ) - # generate hash table - exp_table = generate_table_hardware( - att.scale, - config["softmax"]["data_in_width"], - config["softmax"]["data_in_frac_width"], - config["softmax"]["exp_width"], - config["softmax"]["exp_frac_width"], - ).tolist() - div_table = generate_table_div_hardware( - config["softmax"]["div_width"], - config["softmax"]["data_out_width"], - config["softmax"]["data_out_frac_width"], - ).tolist() - with open(r"exp_init.mem", "w") as fp: - for item in exp_table: - # write each item on a new lineformat(addr[i] ,f'0{width}b' - fp.write( - "%s\n" % format(item, f'0{config["softmax"]["exp_width"]//4}x') - ) - with open(r"div_init.mem", "w") as fp: - for item in div_table: - # write each item on a new line - fp.write( - "%s\n" % format(item, f'0{config["softmax"]["data_out_width"]//4}x') - ) - # data_pack - in_depth = self.in_depth - in_size = self.in_size - wqkv_parallelism = self.wqkv_parallelism - wqkv_num_parallelism = self.wqkv_num_parallelism - num_heads = self.num_heads - wp_parallelism = self.wp_parallelism - wp_num_parallelism = self.wp_num_parallelism - wp_depth = wqkv_num_parallelism - wp_size = num_heads * wqkv_parallelism - dim = in_size * in_depth - - wqkv = wqkv_tensor.reshape( - samples, 3, num_heads, wqkv_num_parallelism, wqkv_parallelism, dim - ).permute(1, 0, 3, 2, 4, 5) - wqkv = wqkv.reshape(3, samples, dim, dim) - bqkv = bqkv_tensor.reshape( - samples, 3, num_heads, wqkv_num_parallelism, wqkv_parallelism - ).permute(1, 0, 3, 2, 4) - bqkv = bqkv.reshape(3, samples, dim) - - wp = wp_tensor.reshape( - samples * dim, num_heads, wqkv_num_parallelism, wqkv_parallelism - ) - wp = wp.permute(0, 2, 1, 3).reshape(samples, dim, dim) - - wq = wqkv[0] - wk = wqkv[1] - wv = wqkv[2] - - bq = bqkv[0] - bk = bqkv[1] - bv = bqkv[2] - wq_in = self.data_pack( - wq, wqkv_num_parallelism, in_depth, num_heads * wqkv_parallelism, in_size - ) - wk_in = self.data_pack( - wk, wqkv_num_parallelism, in_depth, num_heads * wqkv_parallelism, in_size - ) - wv_in 
= self.data_pack( - wv, wqkv_num_parallelism, in_depth, num_heads * wqkv_parallelism, in_size - ) - wp_in = self.data_pack( - wp, - wp_num_parallelism, - wqkv_num_parallelism, - wp_parallelism, - num_heads * wqkv_parallelism, - ) - - bq_in = self.data_pack( - bq, 1, wqkv_num_parallelism, 1, num_heads * wqkv_parallelism - ) - bk_in = self.data_pack( - bk, 1, wqkv_num_parallelism, 1, num_heads * wqkv_parallelism - ) - bv_in = self.data_pack( - bv, 1, wqkv_num_parallelism, 1, num_heads * wqkv_parallelism - ) - bp_in = self.data_pack(bp_tensor, 1, wp_num_parallelism, 1, wp_parallelism) - - wq_in.reverse() - wk_in.reverse() - wv_in.reverse() - wp_in.reverse() - bq_in.reverse() - bk_in.reverse() - bv_in.reverse() - bp_in.reverse() - - self.weight_q = RandomSource( - name="weight_q", - samples=samples * in_depth * wqkv_num_parallelism, - num=num_heads * wqkv_parallelism * in_size, - max_stalls=2 * samples * in_depth * wqkv_num_parallelism, - data_specify=wq_in, - debug=debug, - ) - self.weight_k = RandomSource( - name="weight_k", - samples=samples * in_depth * wqkv_num_parallelism, - num=num_heads * wqkv_parallelism * in_size, - max_stalls=2 * samples * in_depth * wqkv_num_parallelism, - data_specify=wk_in, - debug=debug, - ) - self.weight_v = RandomSource( - name="weight_v", - samples=samples * in_depth * wqkv_num_parallelism, - num=num_heads * wqkv_parallelism * in_size, - max_stalls=2 * samples * in_depth * wqkv_num_parallelism, - data_specify=wv_in, - debug=debug, - ) - self.weight_p = RandomSource( - name="weight_p", - samples=samples * wp_depth * wp_num_parallelism, - num=wp_parallelism * wp_size, - max_stalls=2 * samples * wp_depth * wp_num_parallelism, - data_specify=wp_in, - debug=debug, - ) - self.bias_q = RandomSource( - name="bias_q", - samples=samples * wqkv_num_parallelism, - num=num_heads * wqkv_parallelism, - max_stalls=2 * samples, - data_specify=bq_in, - debug=debug, - ) - self.bias_k = RandomSource( - name="bias_k", - samples=samples * 
wqkv_num_parallelism, - num=num_heads * wqkv_parallelism, - max_stalls=2 * samples, - data_specify=bk_in, - debug=debug, - ) - self.bias_v = RandomSource( - name="bias_v", - samples=samples * wqkv_num_parallelism, - num=num_heads * wqkv_parallelism, - max_stalls=2 * samples, - data_specify=bv_in, - debug=debug, - ) - self.bias_p = RandomSource( - name="bias_p", - samples=samples * wp_num_parallelism, - num=wp_parallelism, - max_stalls=2 * samples, - data_specify=bp_in, - debug=debug, - ) - - def mlp_data_generate(self, qmlp): - samples = self.samples - w_config = self.w_config["block"]["mlp"] - in_features = self.in_features - hidden_features = self.hidden_features - out_features = self.out_features - unroll_in_features = self.unroll_in_features - unroll_hidden_features = self.unroll_hidden_features - unroll_out_features = self.unroll_out_features - depth_in_features = in_features // unroll_in_features - depth_hidden_features = hidden_features // unroll_hidden_features - depth_out_features = out_features // unroll_out_features - mlp = qmlp - weight1_tensor = q2i( - mlp.fc1.weight, - w_config["fc1_proj"]["weight_width"], - w_config["fc1_proj"]["weight_frac_width"], - ) - - bias1_tensor = q2i( - mlp.fc1.bias, - w_config["fc1_proj"]["bias_width"], - w_config["fc1_proj"]["bias_frac_width"], - ) - - weight2_tensor = q2i( - mlp.fc2.weight, - w_config["fc2_proj"]["weight_width"], - w_config["fc2_proj"]["weight_frac_width"], - ) - - bias2_tensor = q2i( - mlp.fc2.bias, - w_config["fc2_proj"]["bias_width"], - w_config["fc2_proj"]["bias_frac_width"], - ) - weight1_in = linear_data_pack( - samples, - weight1_tensor.repeat(samples, 1, 1), - hidden_features, - in_features, - unroll_hidden_features, - unroll_in_features, - ) - bias1_in = linear_data_pack( - samples, - bias1_tensor.repeat(samples, 1, 1), - hidden_features, - 1, - unroll_hidden_features, - 1, - ) - weight2_in = linear_data_pack( - samples, - weight2_tensor.repeat(samples, 1, 1), - out_features, - hidden_features, - 
unroll_out_features, - unroll_hidden_features, - ) - bias2_in = linear_data_pack( - samples, - bias2_tensor.repeat(samples, 1, 1), - out_features, - 1, - unroll_out_features, - 1, - ) - weight1_in.reverse() - bias1_in.reverse() - weight2_in.reverse() - bias2_in.reverse() - self.bias1 = RandomSource( - name="bias1", - samples=samples * depth_hidden_features, - num=unroll_hidden_features, - max_stalls=2 * samples * depth_hidden_features, - data_specify=bias1_in, - debug=debug, - ) - self.bias2 = RandomSource( - name="bias2", - samples=samples * depth_out_features, - num=unroll_out_features, - max_stalls=2 * samples * depth_out_features, - data_specify=bias2_in, - debug=debug, - ) - self.weight1 = RandomSource( - name="weight1", - samples=samples * depth_hidden_features * depth_in_features, - num=unroll_hidden_features * unroll_in_features, - max_stalls=2 * samples * depth_hidden_features * depth_in_features, - data_specify=weight1_in, - debug=debug, - ) - self.weight2 = RandomSource( - name="weight2", - samples=samples * depth_out_features * depth_hidden_features, - num=unroll_out_features * unroll_hidden_features, - max_stalls=2 * samples * depth_hidden_features * depth_out_features, - data_specify=weight2_in, - debug=debug, - ) - - def data_pack(self, in_temp, np, d, p, s): - # assum in_temp.shape = (samples, batch = 1, N,dim) - in_temp = in_temp.to(torch.int).reshape(self.samples, np * p, d * s) - ref = [] - for i in range(self.samples): - re_tensor = rearrange( - in_temp[i], "(np p) (d s) -> np (p d) s", np=np, d=d, p=p, s=s - ) - ex_tensor = torch.zeros(np, d * p, s, dtype=int) - for b in range(np): - for i in range(d): - for j in range(p): - ex_tensor[b][i * p + j] = re_tensor[b][j * d + i] - output_tensor = rearrange( - ex_tensor, "np (d p) s -> (np d) (p s)", np=np, d=d, p=p, s=s - ) - output = output_tensor.tolist() - ref = ref + output - return ref - - def sw_compute(self): - output = self.block(self.x) - out_data = self.data_pack( - q2i(output, self.ow, 
self.ow_f), - self.in_num_parallelism, - self.out_features // self.unroll_out_features, - self.in_parallelism, - self.unroll_out_features, - ) - return out_data - - -def wave_check(dut): - logger.debug( - "wave of in_out:\n\ - {},{},data_in = {} \n\ - {},{},af_msa = {} \n\ - {},{},msa_out = {} \n\ - {},{},res_msa = {} \n\ - {},{},af_mlp = {} \n\ - {},{},mlp_out = {} \n\ - {},{},data_out = {}\n\ - ".format( - dut.data_in_valid.value, - dut.data_in_ready.value, - [int(i) for i in dut.data_in.value], - dut.af_msa_out_valid.value, - dut.af_msa_out_ready.value, - [int(i) for i in dut.af_msa_out.value], - dut.msa_out_valid.value, - dut.msa_out_ready.value, - [int(i) for i in dut.msa_out.value], - dut.res_msa_valid.value, - dut.res_msa_ready.value, - [int(i) for i in dut.res_msa.value], - dut.af_mlp_out_valid.value, - dut.af_mlp_out_ready.value, - [int(i) for i in dut.af_mlp_out.value], - dut.mlp_out_valid.value, - dut.mlp_out_ready.value, - [int(i) for i in dut.mlp_out.value], - dut.data_out_valid.value, - dut.data_out_ready.value, - [int(i) for i in dut.data_out.value], - ) - ) - - -@cocotb.test() -async def cocotb_test_att(dut): - """Test integer based vector mult""" - samples = 20 - test_case = VerificationCase(samples=samples) - # Reset cycle - await Timer(20, units="ns") - dut.rst.value = 1 - await Timer(100, units="ns") - dut.rst.value = 0 - - # Create a 10ns-period clock on port clk - clock = Clock(dut.clk, 10, units="ns") - # Start the clock - cocotb.start_soon(clock.start()) - await Timer(500, units="ns") - - # Synchronize with the clock - dut.weight_q_valid.value = 0 - dut.weight_k_valid.value = 0 - dut.weight_v_valid.value = 0 - dut.weight_p_valid.value = 0 - dut.bias_q_valid.value = 0 - dut.bias_k_valid.value = 0 - dut.bias_v_valid.value = 0 - dut.bias_p_valid.value = 0 - dut.data_in_valid.value = 0 - dut.data_out_ready.value = 1 - await FallingEdge(dut.clk) - await FallingEdge(dut.clk) - done = False - count_af_msa = 0 - count_msa = 0 - count_mlp = 0 - 
count_mlp_hidden = 0 - count_mlp_out = 0 - # Set a timeout to avoid deadlock - for i in range(samples * 6000): - await FallingEdge(dut.clk) - # breakpoint() - dut.af_msa_weight_valid.value = test_case.aff_att_weight.pre_compute() - dut.af_msa_bias_valid.value = test_case.aff_att_bias.pre_compute() - dut.weight_q_valid.value = test_case.weight_q.pre_compute() - dut.weight_k_valid.value = test_case.weight_k.pre_compute() - dut.weight_v_valid.value = test_case.weight_v.pre_compute() - dut.weight_p_valid.value = test_case.weight_p.pre_compute() - dut.bias_q_valid.value = test_case.bias_q.pre_compute() - dut.bias_k_valid.value = test_case.bias_k.pre_compute() - dut.bias_v_valid.value = test_case.bias_v.pre_compute() - dut.bias_p_valid.value = test_case.bias_p.pre_compute() - dut.data_in_valid.value = test_case.inputs.pre_compute() - - dut.af_mlp_weight_valid.value = test_case.aff_mlp_weight.pre_compute() - dut.af_mlp_bias_valid.value = test_case.aff_mlp_bias.pre_compute() - dut.weight_in2hidden_valid.value = test_case.weight1.pre_compute() - dut.bias_in2hidden_valid.value = test_case.bias1.pre_compute() - dut.weight_hidden2out_valid.value = test_case.weight2.pre_compute() - dut.bias_hidden2out_valid.value = test_case.bias2.pre_compute() - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.pre_compute( - dut.data_out_valid.value - ) - await Timer(1, units="ns") - ( - dut.af_msa_weight_valid.value, - dut.af_msa_weight.value, - ) = test_case.aff_att_weight.compute(dut.af_msa_weight_ready.value) - ( - dut.af_msa_bias_valid.value, - dut.af_msa_bias.value, - ) = test_case.aff_att_bias.compute(dut.af_msa_bias_ready.value) - dut.weight_q_valid.value, dut.weight_q.value = test_case.weight_q.compute( - dut.weight_q_ready.value - ) - dut.weight_k_valid.value, dut.weight_k.value = test_case.weight_k.compute( - dut.weight_k_ready.value - ) - dut.weight_v_valid.value, dut.weight_v.value = test_case.weight_v.compute( - dut.weight_v_ready.value - ) - 
dut.weight_p_valid.value, dut.weight_p.value = test_case.weight_p.compute( - dut.weight_p_ready.value - ) - - dut.bias_q_valid.value, dut.bias_q.value = test_case.bias_q.compute( - dut.bias_q_ready.value - ) - dut.bias_k_valid.value, dut.bias_k.value = test_case.bias_k.compute( - dut.bias_k_ready.value - ) - dut.bias_v_valid.value, dut.bias_v.value = test_case.bias_v.compute( - dut.bias_v_ready.value - ) - dut.bias_p_valid.value, dut.bias_p.value = test_case.bias_p.compute( - dut.bias_p_ready.value - ) - - ( - dut.af_mlp_weight_valid.value, - dut.af_mlp_weight.value, - ) = test_case.aff_mlp_weight.compute(dut.af_mlp_weight_ready.value) - ( - dut.af_mlp_bias_valid.value, - dut.af_mlp_bias.value, - ) = test_case.aff_mlp_bias.compute(dut.af_mlp_bias_ready.value) - ( - dut.weight_in2hidden_valid.value, - dut.weight_in2hidden.value, - ) = test_case.weight1.compute(dut.weight_in2hidden_ready.value) - ( - dut.weight_hidden2out_valid.value, - dut.weight_hidden2out.value, - ) = test_case.weight2.compute(dut.weight_hidden2out_ready.value) - ( - dut.bias_in2hidden_valid.value, - dut.bias_in2hidden.value, - ) = test_case.bias1.compute(dut.bias_in2hidden_ready.value) - ( - dut.bias_hidden2out_valid.value, - dut.bias_hidden2out.value, - ) = test_case.bias2.compute(dut.bias_hidden2out_ready.value) - - dut.data_in_valid.value, dut.data_in.value = test_case.inputs.compute( - dut.data_in_ready.value - ) - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.compute( - dut.data_out_valid.value, dut.data_out.value - ) - await Timer(1, units="ns") - if dut.af_msa_out_valid.value == 1 and dut.af_msa_out_ready.value == 1: - count_af_msa += 1 - if dut.msa_out_valid.value == 1 and dut.msa_out_ready.value == 1: - count_msa += 1 - if ( - dut.mlp_inst.hidden_data_valid.value == 1 - and dut.mlp_inst.hidden_data_ready.value == 1 - ): - count_mlp_hidden += 1 - if dut.mlp_out_valid.value == 1 and dut.mlp_out_ready.value == 1: - count_mlp += 1 - - print("count_af_msa = ", 
count_af_msa) - print("count_msa = ", count_msa) - print("count_mlp_hidden = ", count_mlp_hidden) - print("count_mlp = ", count_mlp) - wave_check(dut) - if ( - test_case.weight1.is_empty() - and test_case.bias1.is_empty() - and test_case.weight2.is_empty() - and test_case.bias2.is_empty() - and test_case.weight_q.is_empty() - and test_case.weight_k.is_empty() - and test_case.weight_v.is_empty() - and test_case.weight_p.is_empty() - and test_case.bias_q.is_empty() - and test_case.bias_k.is_empty() - and test_case.bias_v.is_empty() - and test_case.bias_p.is_empty() - and test_case.inputs.is_empty() - and test_case.outputs.is_full() - ): - done = True - break - assert ( - done - ), "Deadlock detected or the simulation reaches the maximum cycle limit (fixed it by adjusting the loop trip count)" - - check_results(test_case.outputs.data, test_case.ref) - - -def runner(): - sim = os.getenv("SIM", "verilator") - - verilog_sources = [ - "../../../../components/ViT/fixed_block.sv", - "../../../../components/ViT/hash_softmax.sv", - "../../../../components/ViT/affine_layernorm.sv", - "../../../../components/ViT/fixed_mlp.sv", - "../../../../components/ViT/fixed_msa.sv", - "../../../../components/attention/fixed_self_att.sv", - "../../../../components/attention/fixed_att.sv", - "../../../../components/conv/roller.sv", - "../../../../components/common/fifo.sv", - "../../../../components/common/unpacked_fifo.sv", - "../../../../components/common/input_buffer.sv", - "../../../../components/common/blk_mem_gen_0.sv", - "../../../../components/common/skid_buffer.sv", - "../../../../components/common/unpacked_skid_buffer.sv", - "../../../../components/common/register_slice.sv", - "../../../../components/common/split2.sv", - "../../../../components/common/join2.sv", - "../../../../components/matmul/fixed_matmul.sv", - "../../../../components/linear/fixed_linear.sv", - "../../../../components/linear/fixed_2d_linear.sv", - "../../../../components/cast/fixed_rounding.sv", - 
"../../../../components/activations/fixed_relu.sv", - "../../../../components/fixed_arithmetic/fixed_matmul_core.sv", - "../../../../components/fixed_arithmetic/fixed_dot_product.sv", - "../../../../components/fixed_arithmetic/fixed_accumulator.sv", - "../../../../components/fixed_arithmetic/fixed_vector_mult.sv", - "../../../../components/fixed_arithmetic/fixed_adder_tree.sv", - "../../../../components/fixed_arithmetic/fixed_adder_tree_layer.sv", - "../../../../components/fixed_arithmetic/fixed_mult.sv", - ] - test_case = VerificationCase() - - # set parameters - extra_args = [] - for k, v in test_case.get_dut_parameters().items(): - extra_args.append(f"-G{k}={v}") - print(extra_args) - runner = get_runner(sim) - runner.build( - verilog_sources=verilog_sources, - hdl_toplevel="fixed_block", - build_args=extra_args, - ) - - runner.test(hdl_toplevel="fixed_block", test_module="fixed_block_tb") - - -import pytest - - -@pytest.mark.skip(reason="Needs to be fixed.") -def test_fixed_block(): - runner() - - -if __name__ == "__main__": - test_fixed_block() diff --git a/src/mase_components/vision_models/vit/test/fixed_mlp_tb.py b/src/mase_components/vision_models/vit/test/fixed_mlp_tb.py deleted file mode 100644 index 2e19bed4b..000000000 --- a/src/mase_components/vision_models/vit/test/fixed_mlp_tb.py +++ /dev/null @@ -1,488 +0,0 @@ -#!/usr/bin/env python3 - -# This script tests the fixed point linear -import random, os, math, logging, sys -import numpy as np - -sys.path.append( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.append("/workspace/components/testbench/ViT/") -sys.path.append("/workspace/machop/") - -from mase_cocotb.random_test import RandomSource -from mase_cocotb.random_test import RandomSink -from mase_cocotb.random_test import check_results - -import cocotb -from cocotb.triggers import Timer -from cocotb.triggers import FallingEdge 
-from cocotb.clock import Clock -from cocotb.runner import get_runner - -from einops import rearrange, reduce, repeat -import torch -import torch.nn as nn -from .helpers.pvt_quant import QuantizedMlp -from mase_cocotb.z_qlayers import quantize_to_int as q2i -from mase_cocotb.z_qlayers import linear_data_pack - -debug = True - -logger = logging.getLogger("tb_signals") -if debug: - logger.setLevel(logging.DEBUG) - - -# DUT test specifications -class VerificationCase: - def __init__(self, samples=1): - self.samples = samples - self.data_in_width = 8 - self.data_in_frac_width = 3 - self.weight_i2h_width = 6 - self.weight_i2h_frac_width = 3 - self.weight_h2o_width = 6 - self.weight_h2o_frac_width = 3 - self.has_bias = 1 - self.bias_i2h_width = 6 - self.bias_i2h_frac_width = 4 - self.bias_h2o_width = 6 - self.bias_h2o_frac_width = 4 - self.hidden_width = 8 - self.hidden_frac_width = 4 - self.data_out_width = 8 - self.data_out_frac_width = 4 - self.w_config = { - "mlp": { - "fc1_proj": { - "name": "integer", - "weight_width": self.weight_i2h_width, - "weight_frac_width": self.weight_i2h_frac_width, - "data_in_width": self.data_in_width, - "data_in_frac_width": self.data_in_frac_width, - "bias_width": self.bias_i2h_width, - "bias_frac_width": self.bias_i2h_frac_width, - }, - "fc2_proj": { - "name": "integer", - "weight_width": self.weight_h2o_width, - "weight_frac_width": self.weight_h2o_frac_width, - "data_in_width": self.data_in_width, - "data_in_frac_width": self.data_in_frac_width, - "bias_width": self.bias_h2o_width, - "bias_frac_width": self.bias_h2o_frac_width, - }, - "mlp_relu": { - "name": "integer", - "bypass": True, - "data_in_width": self.data_in_width, - "data_in_frac_width": self.data_in_frac_width, - }, - }, - } - self.in_num = 2 - self.in_features = 16 - self.hidden_features = 2 * self.in_features - self.out_features = self.in_features - - self.tile_in_num = 1 - self.tile_in_features = 2 - self.tile_hidden_features = 1 - self.tile_out_features = 
self.tile_in_features - self.d_config = { - "mlp": { - "in_num": self.in_num, - "in_features": self.in_features, - "hidden_features": self.hidden_features, - "out_features": self.in_features, - "unroll_in_num": self.tile_in_num, - "unroll_in_features": self.tile_in_features, - "unroll_hidden_features": self.tile_hidden_features, - "unroll_out_features": self.tile_out_features, - }, - } - - self.data_generate() - depth_in_num = int(self.in_num / self.tile_in_num) - depth_out_features = int(self.out_features / self.tile_out_features) - self.outputs = RandomSink( - samples=samples * depth_out_features * depth_in_num, - debug=debug, - ) - self.ref = self.sw_compute() - - def get_dut_parameters(self): - return { - "IN_WIDTH": self.w_config["mlp"]["fc1_proj"]["data_in_width"], - "IN_FRAC_WIDTH": self.w_config["mlp"]["fc1_proj"]["data_in_frac_width"], - "WEIGHT_I2H_WIDTH": self.w_config["mlp"]["fc1_proj"]["weight_width"], - "WEIGHT_I2H_FRAC_WIDTH": self.w_config["mlp"]["fc1_proj"][ - "weight_frac_width" - ], - "BIAS_I2H_WIDTH": self.w_config["mlp"]["fc1_proj"]["bias_width"], - "BIAS_I2H_FRAC_WIDTH": self.w_config["mlp"]["fc1_proj"]["bias_frac_width"], - "HAS_BIAS": self.has_bias, - "HIDDEN_WIDTH": self.w_config["mlp"]["fc2_proj"]["data_in_width"], - "HIDDEN_FRAC_WIDTH": self.w_config["mlp"]["fc2_proj"]["data_in_frac_width"], - "WEIGHT_H2O_WIDTH": self.w_config["mlp"]["fc2_proj"]["weight_width"], - "WEIGHT_H2O_FRAC_WIDTH": self.w_config["mlp"]["fc2_proj"][ - "weight_frac_width" - ], - "BIAS_H2O_WIDTH": self.w_config["mlp"]["fc2_proj"]["bias_width"], - "BIAS_H2O_FRAC_WIDTH": self.w_config["mlp"]["fc2_proj"]["bias_frac_width"], - "OUT_WIDTH": self.data_out_width, - "OUT_FRAC_WIDTH": self.data_out_frac_width, - "IN_NUM": self.in_num, - "IN_FEATURES": self.in_features, - "HIDDEN_FEATURES": self.hidden_features, - "UNROLL_IN_NUM": self.tile_in_num, - "UNROLL_IN_FEATURES": self.tile_in_features, - "UNROLL_HIDDEN_FEATURES": self.tile_hidden_features, - "UNROLL_OUT_FEATURES": 
self.tile_out_features, - } - - def data_generate(self): - torch.manual_seed(0) - w_config = self.w_config["mlp"] - self.mlp = QuantizedMlp( - in_features=self.d_config["mlp"]["in_features"], - hidden_features=self.d_config["mlp"]["hidden_features"], - drop=0.0, - config=self.w_config["mlp"], - ) - self.x = 5 * torch.randn((self.samples, self.in_num, self.in_features)) - weight1_tensor = q2i( - self.mlp.fc1.weight, - w_config["fc1_proj"]["weight_width"], - w_config["fc1_proj"]["weight_frac_width"], - ) - - bias1_tensor = q2i( - self.mlp.fc1.bias, - w_config["fc1_proj"]["bias_width"], - w_config["fc1_proj"]["bias_frac_width"], - ) - - weight2_tensor = q2i( - self.mlp.fc2.weight, - w_config["fc2_proj"]["weight_width"], - w_config["fc2_proj"]["weight_frac_width"], - ) - bias2_tensor = q2i( - self.mlp.fc2.bias, - w_config["fc2_proj"]["bias_width"], - w_config["fc2_proj"]["bias_frac_width"], - ) - x_tensor = q2i( - self.x, - w_config["fc1_proj"]["data_in_width"], - w_config["fc1_proj"]["data_in_frac_width"], - ) - self.inputs = linear_data_pack( - self.samples, - x_tensor, - self.in_num, - self.in_features, - self.tile_in_num, - self.tile_in_features, - ) - self.weight1_in = linear_data_pack( - self.samples, - weight1_tensor.repeat(self.samples, 1, 1), - self.hidden_features, - self.in_features, - self.tile_hidden_features, - self.tile_in_features, - ) - self.bias1_in = linear_data_pack( - self.samples, - bias1_tensor.repeat(self.samples, 1, 1), - self.hidden_features, - 1, - self.tile_hidden_features, - 1, - ) - self.weight2_in = linear_data_pack( - self.samples, - weight2_tensor.repeat(self.samples, 1, 1), - self.out_features, - self.hidden_features, - self.tile_out_features, - self.tile_hidden_features, - ) - self.bias2_in = linear_data_pack( - self.samples, - bias2_tensor.repeat(self.samples, 1, 1), - self.out_features, - 1, - self.tile_out_features, - 1, - ) - self.inputs.reverse() - self.weight1_in.reverse() - self.bias1_in.reverse() - self.weight2_in.reverse() - 
self.bias2_in.reverse() - samples = self.samples - depth_in_features = int(self.in_features / self.tile_in_features) - depth_in_num = int(self.in_num / self.tile_in_num) - depth_hidden_features = int(self.hidden_features / self.tile_hidden_features) - depth_out_features = int(self.out_features / self.tile_out_features) - self.data_in = RandomSource( - name="data_in", - samples=samples * depth_in_features * depth_in_num, - num=self.tile_in_features * self.tile_in_num, - max_stalls=2 * samples, - data_specify=self.inputs, - debug=debug, - ) - self.bias1 = RandomSource( - name="bias1", - samples=samples * depth_hidden_features, - num=self.tile_hidden_features, - max_stalls=2 * samples * depth_hidden_features, - data_specify=self.bias1_in, - debug=debug, - ) - self.bias2 = RandomSource( - name="bias2", - samples=samples * depth_out_features, - num=self.tile_out_features, - max_stalls=2 * samples * depth_out_features, - data_specify=self.bias2_in, - debug=debug, - ) - self.weight1 = RandomSource( - name="weight1", - samples=samples * depth_hidden_features * depth_in_features, - num=self.tile_hidden_features * self.tile_in_features, - max_stalls=2 * samples * depth_hidden_features * depth_in_features, - data_specify=self.weight1_in, - debug=debug, - ) - self.weight2 = RandomSource( - name="weight2", - samples=samples * depth_out_features * depth_hidden_features, - num=self.tile_out_features * self.tile_hidden_features, - max_stalls=2 * samples * depth_hidden_features * depth_out_features, - data_specify=self.weight2_in, - debug=debug, - ) - - def sw_compute(self): - data_out = self.mlp(self.x) - output = linear_data_pack( - self.samples, - q2i(data_out, self.data_out_width, self.data_out_frac_width), - self.in_num, - self.out_features, - self.tile_in_num, - self.tile_out_features, - ) - return output - - -def debug_state(dut, state): - logger.debug( - "{} State: (data_in2_ready,data_in2_valid,data_in1_ready,data_in1_valid,data_out_ready,data_out_valid) = 
({},{},{},{},{},{})".format( - state, - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - - -@cocotb.test() -async def cocotb_test_fixed_linear(dut): - """Test integer based vector mult""" - samples = 10 - test_case = VerificationCase(samples=samples) - - # Reset cycle - await Timer(20, units="ns") - dut.rst.value = 1 - await Timer(100, units="ns") - dut.rst.value = 0 - - # Create a 10ns-period clock on port clk - clock = Clock(dut.clk, 10, units="ns") - # Start the clock - cocotb.start_soon(clock.start()) - await Timer(500, units="ns") - - # Synchronize with the clock - dut.data_in_valid.value = 0 - dut.weight_in2hidden_valid.value = 0 - dut.weight_hidden2out_valid.value = 0 - dut.bias_in2hidden_valid.value = 0 - dut.bias_hidden2out_valid.value = 0 - dut.data_out_ready.value = 1 - debug_state(dut, "Pre-clk") - await FallingEdge(dut.clk) - debug_state(dut, "Post-clk") - debug_state(dut, "Pre-clk") - await FallingEdge(dut.clk) - debug_state(dut, "Post-clk") - - done = False - # Set a timeout to avoid deadlock - # breakpoint() - cdin = 0 - cdata_out = 0 - chidden_data = 0 - for i in range(samples * 8000): - await FallingEdge(dut.clk) - debug_state(dut, "Post-clk") - dut.weight_in2hidden_valid.value = test_case.weight1.pre_compute() - dut.bias_in2hidden_valid.value = test_case.bias1.pre_compute() - dut.weight_hidden2out_valid.value = test_case.weight2.pre_compute() - dut.bias_hidden2out_valid.value = test_case.bias2.pre_compute() - dut.data_in_valid.value = test_case.data_in.pre_compute() - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.pre_compute(dut.data_out_valid) - await Timer(1, units="ns") - - # start input data - # - ( - dut.weight_in2hidden_valid.value, - dut.weight_in2hidden.value, - ) = test_case.weight1.compute(dut.weight_in2hidden_ready.value) - ( - dut.weight_hidden2out_valid.value, - 
dut.weight_hidden2out.value, - ) = test_case.weight2.compute(dut.weight_hidden2out_ready.value) - ( - dut.bias_in2hidden_valid.value, - dut.bias_in2hidden.value, - ) = test_case.bias1.compute(dut.bias_in2hidden_ready.value) - ( - dut.bias_hidden2out_valid.value, - dut.bias_hidden2out.value, - ) = test_case.bias2.compute(dut.bias_hidden2out_ready.value) - dut.data_in_valid.value, dut.data_in.value = test_case.data_in.compute( - dut.data_in_ready.value - ) - - await Timer(1, units="ns") - - dut.data_out_ready.value = test_case.outputs.compute( - dut.data_out_valid.value, dut.data_out.value - ) - await Timer(1, units="ns") - debug_state(dut, "Pre-clk") - wave_check(dut) - if dut.data_in_valid.value == 1 and dut.data_in_ready.value == 1: - cdin += 1 - if dut.data_out_valid.value == 1 and dut.data_out_ready.value == 1: - cdata_out += 1 - if dut.hidden_data_valid.value == 1 and dut.hidden_data_ready.value == 1: - chidden_data += 1 - print("cdin = ", cdin) - print("cdata_out = ", cdata_out) - print("chidden_data = ", chidden_data) - if ( - # test_case.weight1.is_empty() - # and test_case.bias1.is_empty() - # and test_case.weight2.is_empty() - # and test_case.bias2.is_empty() - test_case.data_in.is_empty() - and test_case.outputs.is_full() - ): - done = True - break - assert ( - done - ), "Deadlock detected or the simulation reaches the maximum cycle limit (fixed it by adjusting the loop trip count)" - - check_results(test_case.outputs.data, test_case.ref) - - -def wave_check(dut): - logger.debug( - "wave of in_out:\n\ - {},{},data_in = {} \n\ - {},{},weight_in2hidden = {} \n\ - {},{},hidden_data = {} \n\ - {},{},relu_data = {} \n\ - {},{},data_out = {}\n\ - ".format( - dut.data_in_valid.value, - dut.data_in_ready.value, - [int(i) for i in dut.data_in.value], - dut.weight_in2hidden_valid.value, - dut.weight_in2hidden_ready.value, - [int(i) for i in dut.weight_in2hidden.value], - dut.hidden_data_valid.value, - dut.hidden_data_ready.value, - [int(i) for i in 
dut.hidden_data.value], - dut.relu_data_valid.value, - dut.relu_data_ready.value, - [int(i) for i in dut.relu_data.value], - dut.data_out_valid.value, - dut.data_out_ready.value, - [int(i) for i in dut.data_out.value], - ) - ) - - -def runner(): - sim = os.getenv("SIM", "verilator") - - verilog_sources = [ - "../../../../components/ViT/fixed_mlp.sv", - "../../../../components/matmul/fixed_matmul.sv", - "../../../../components/common/input_buffer.sv", - "../../../../components/common/blk_mem_gen_0.sv", - "../../../../components/common/skid_buffer.sv", - "../../../../components/common/unpacked_skid_buffer.sv", - "../../../../components/common/register_slice.sv", - "../../../../components/common/join2.sv", - "../../../../components/linear/fixed_2d_linear.sv", - "../../../../components/linear/fixed_linear.sv", - "../../../../components/cast/fixed_rounding.sv", - "../../../../components/activations/fixed_relu.sv", - "../../../../components/fixed_arithmetic/fixed_matmul_core.sv", - "../../../../components/fixed_arithmetic/fixed_dot_product.sv", - "../../../../components/fixed_arithmetic/fixed_accumulator.sv", - "../../../../components/fixed_arithmetic/fixed_vector_mult.sv", - "../../../../components/fixed_arithmetic/fixed_adder_tree.sv", - "../../../../components/fixed_arithmetic/fixed_adder_tree_layer.sv", - "../../../../components/fixed_arithmetic/fixed_mult.sv", - ] - test_case = VerificationCase() - - # set parameters - extra_args = [] - for k, v in test_case.get_dut_parameters().items(): - extra_args.append(f"-G{k}={v}") - print(extra_args) - runner = get_runner(sim) - runner.build( - verilog_sources=verilog_sources, - hdl_toplevel="fixed_mlp", - build_args=extra_args, - ) - for _ in range(1): - runner.test( - hdl_toplevel="fixed_mlp", - test_module="fixed_mlp_tb", - ) - - -import pytest - - -@pytest.mark.skip(reason="Needs to be fixed.") -def test_fixed_mlp(): - runner() - - -if __name__ == "__main__": - test_fixed_mlp() diff --git 
a/src/mase_components/vision_models/vit/test/fixed_msa_tb.py b/src/mase_components/vision_models/vit/test/fixed_msa_tb.py deleted file mode 100644 index 7e59d05dc..000000000 --- a/src/mase_components/vision_models/vit/test/fixed_msa_tb.py +++ /dev/null @@ -1,655 +0,0 @@ -#!/usr/bin/env python3 - -import os, logging -import torch - -# from torchsummary import summary -from einops import rearrange - -from mase_cocotb.random_test import RandomSource, RandomSink, check_results -from mase_cocotb.runner import mase_runner - -import cocotb -from cocotb.triggers import Timer -from cocotb.triggers import FallingEdge -from cocotb.clock import Clock - -from .helpers.pvt_quant import QuantizedAttention -from .helpers.ha_softmax import generate_table_hardware, generate_table_div_hardware -from mase_cocotb.z_qlayers import quantize_to_int as q2i - -debug = False - -logger = logging.getLogger("tb_signals") -if debug: - logger.setLevel(logging.DEBUG) - - -# DUT test specifications -class VerificationCase: - def __init__(self, samples=1): - self.samples = samples - # self.seeds = random.randint(0,1000) - self.data_in_width = 8 - self.data_in_frac_width = 5 - self.weight_q_width = 8 - self.weight_q_frac_width = 4 - self.weight_k_width = 8 - self.weight_k_frac_width = 4 - self.weight_v_width = self.weight_k_width - self.weight_v_frac_width = self.weight_k_frac_width - self.weight_p_width = 8 - self.weight_p_frac_width = 4 - - self.bias_q_width = 8 - self.bias_q_frac_width = 5 - self.bias_k_width = 8 - self.bias_k_frac_width = 5 - self.bias_v_width = self.bias_k_width - self.bias_v_frac_width = self.bias_k_frac_width - self.bias_p_width = 8 - self.bias_p_frac_width = 5 - - self.data_q_width = 8 - self.data_q_frac_width = 5 - self.data_k_width = 8 - self.data_k_frac_width = 5 - self.data_v_width = 8 - self.data_v_frac_width = 5 - self.data_s_width = 8 - self.data_s_frac_width = 5 - self.exp_width = 8 - self.exp_frac_width = 5 - self.div_width = 10 - self.data_s_softmax_width = 8 - 
self.data_s_softmax_frac_width = 4 - self.data_z_width = 8 - self.data_z_frac_width = 3 - - self.div_width = 10 - - self.w_config = { - "q_proj": { - "name": "integer", - "weight_width": self.weight_q_width, - "weight_frac_width": self.weight_q_frac_width, - "data_in_width": self.data_in_width, - "data_in_frac_width": self.data_in_frac_width, - "bias_width": self.bias_q_width, - "bias_frac_width": self.bias_q_frac_width, - }, - "kv_proj": { - "name": "integer", - "weight_width": self.weight_k_width, - "weight_frac_width": self.weight_k_frac_width, - "data_in_width": self.data_in_width, - "data_in_frac_width": self.data_in_frac_width, - "bias_width": self.bias_k_width, - "bias_frac_width": self.bias_k_frac_width, - }, - "z_proj": { - "name": "integer", - "weight_width": self.weight_p_width, - "weight_frac_width": self.weight_p_frac_width, - "data_in_width": self.data_z_width, - "data_in_frac_width": self.data_z_frac_width, - "bias_width": self.bias_p_width, - "bias_frac_width": self.bias_p_frac_width, - }, - "softmax": { - "exp_width": self.exp_width, - "exp_frac_width": self.exp_frac_width, - "div_width": self.div_width, - "data_in_width": self.data_s_width, - "data_in_frac_width": self.data_s_frac_width, - "data_out_width": self.data_s_softmax_width, - "data_out_frac_width": self.data_s_softmax_frac_width, - }, - "attn_matmul": { - "name": "integer", - "data_in_width": self.data_q_width, - "data_in_frac_width": self.data_q_frac_width, - "weight_width": self.data_k_width, - "weight_frac_width": self.data_k_frac_width, - }, - "z_matmul": { - "name": "integer", - "data_in_width": self.data_s_width, - "data_in_frac_width": self.data_s_frac_width, - "weight_width": self.data_v_width, - "weight_frac_width": self.data_v_frac_width, - }, - } - self.out_width = 8 - self.out_frac_width = 5 - - self.in_y = 8 - self.in_x = 8 - self.unroll_in_x = 2 - self.unroll_w_y = 4 - self.num_heads = 2 - self.w_y = self.in_x - self.unroll_in_y = 1 - self.wp_y = self.in_x - 
self.unroll_wp_y = self.unroll_in_x - - self.in_parallelism = self.unroll_in_y - self.in_num_parallelism = self.in_y // self.unroll_in_y - - self.in_size = self.unroll_in_x - self.in_depth = self.in_x // self.unroll_in_x - - # noted num_heads * wqkv_p * wqkv_np should be = in_s * in_d - self.wqkv_parallelism = self.unroll_w_y - self.wqkv_num_parallelism = self.w_y // (self.unroll_w_y * self.num_heads) - - assert ( - self.num_heads * self.wqkv_parallelism * self.wqkv_num_parallelism - == self.in_size * self.in_depth - ), "should have num_heads * wqkv_p * wqkv_np == in_s * in_d" - - self.wp_parallelism = self.unroll_wp_y - self.wp_num_parallelism = self.wp_y // self.unroll_wp_y - - assert ( - self.wp_parallelism * self.wp_num_parallelism - == self.in_size * self.in_depth - ), "should have wp_p * wp_np == in_s * in_d" - - self.wp_size = self.num_heads * self.wqkv_parallelism - self.wp_depth = self.wqkv_num_parallelism - # data_generate - self.data_generate() - ## remain modification - self.outputs = RandomSink( - samples=samples * self.in_num_parallelism * self.wp_num_parallelism, - max_stalls=2 * samples * self.in_num_parallelism * self.wp_parallelism, - debug=debug, - ) - self.samples = samples - self.ref = self.sw_compute() - - def get_dut_parameters(self): - return { - "IN_WIDTH": self.data_in_width, - "IN_FRAC_WIDTH": self.data_in_frac_width, - "WQ_WIDTH": self.weight_q_width, - "WQ_FRAC_WIDTH": self.weight_q_frac_width, - "WK_WIDTH": self.weight_k_width, - "WK_FRAC_WIDTH": self.weight_k_frac_width, - "WV_WIDTH": self.weight_v_width, - "WV_FRAC_WIDTH": self.weight_v_frac_width, - "WP_WIDTH": self.weight_p_width, - "WP_FRAC_WIDTH": self.weight_p_frac_width, - "BQ_WIDTH": self.bias_q_width, - "BQ_FRAC_WIDTH": self.bias_q_frac_width, - "BK_WIDTH": self.bias_k_width, - "BK_FRAC_WIDTH": self.bias_k_frac_width, - "BV_WIDTH": self.bias_v_width, - "BV_FRAC_WIDTH": self.bias_v_frac_width, - "BP_WIDTH": self.bias_p_width, - "BP_FRAC_WIDTH": self.bias_p_frac_width, - 
"DQ_WIDTH": self.data_q_width, - "DQ_FRAC_WIDTH": self.data_q_frac_width, - "DK_WIDTH": self.data_k_width, - "DK_FRAC_WIDTH": self.data_k_frac_width, - "DV_WIDTH": self.data_v_width, - "DV_FRAC_WIDTH": self.data_v_frac_width, - "DS_WIDTH": self.data_s_width, - "DS_FRAC_WIDTH": self.data_s_frac_width, - "EXP_WIDTH": self.w_config["softmax"]["exp_width"], - "EXP_FRAC_WIDTH": self.w_config["softmax"]["exp_frac_width"], - "DIV_WIDTH": self.w_config["softmax"]["div_width"], - "DS_SOFTMAX_WIDTH": self.w_config["softmax"]["data_out_width"], - "DS_SOFTMAX_FRAC_WIDTH": self.w_config["softmax"]["data_out_frac_width"], - "DZ_WIDTH": self.data_z_width, - "DZ_FRAC_WIDTH": self.data_z_frac_width, - "OUT_WIDTH": self.out_width, - "OUT_FRAC_WIDTH": self.out_frac_width, - "UNROLL_IN_Y": self.in_parallelism, - "IN_Y": self.in_num_parallelism * self.in_parallelism, - "UNROLL_IN_X": self.in_size, - "IN_X": self.in_depth * self.in_size, - "NUM_HEADS": self.num_heads, - "UNROLL_WQKV_Y": self.wqkv_parallelism, - "WQKV_Y": self.wqkv_parallelism * self.wqkv_num_parallelism, - "UNROLL_WP_Y": self.wp_parallelism, - "WP_Y": self.wp_parallelism * self.wp_num_parallelism, - } - - def data_generate(self): - # generate data - samples = self.samples - torch.manual_seed(2) - # breakpoint() - self.x = torch.randn((samples, self.in_y, self.in_x)) - self.att = QuantizedAttention( - dim=self.in_x, - num_heads=self.num_heads, - qkv_bias=True, - attn_drop=0.0, - proj_drop=0.0, - config=self.w_config, - ) - input_tensor = q2i(self.x, self.data_in_width, self.data_in_frac_width) - att_wq = q2i(self.att.q.weight, self.weight_q_width, self.weight_q_frac_width) - att_wkv = q2i(self.att.kv.weight, self.weight_k_width, self.weight_k_frac_width) - wqkv_tensor = torch.cat((att_wq, att_wkv), 0) - wqkv_tensor = wqkv_tensor.reshape(3, self.in_x, self.in_x) - wqkv_tensor = wqkv_tensor.reshape(self.in_x * 3, self.in_x).repeat( - samples, 1, 1 - ) - - att_bq = q2i(self.att.q.bias, self.bias_q_width, 
self.bias_q_frac_width) - att_bkv = q2i(self.att.kv.bias, self.bias_k_width, self.bias_k_frac_width) - bqkv_tensor = torch.cat((att_bq, att_bkv), 0) - bqkv_tensor = bqkv_tensor.reshape(3, self.in_x) - bqkv_tensor = bqkv_tensor.reshape(-1).repeat(samples, 1) - - wp_tensor = q2i( - self.att.proj.weight, self.weight_p_width, self.weight_p_frac_width - ).repeat(samples, 1, 1) - bp_tensor = q2i( - self.att.proj.bias, self.bias_p_width, self.bias_p_frac_width - ).repeat(samples, 1) - - logger.debug( - "input data: \n\ - d_tensor = \n{}\n\ - wqkv_tensor = \n{}\n\ - bqkv_tensor = \n{}\n\ - wp_tensor = \n{}\n\ - bp_tensor = \n{}\n\ - ".format( - input_tensor, wqkv_tensor, bqkv_tensor, wp_tensor, bp_tensor - ) - ) - # generate hash table - exp_table = generate_table_hardware( - self.att.scale, - self.w_config["softmax"]["data_in_width"], - self.w_config["softmax"]["data_in_frac_width"], - self.w_config["softmax"]["exp_width"], - self.w_config["softmax"]["exp_frac_width"], - ).tolist() - div_table = generate_table_div_hardware( - self.w_config["softmax"]["div_width"], - self.w_config["softmax"]["data_out_width"], - self.w_config["softmax"]["data_out_frac_width"], - ).tolist() - with open(r"exp_init.mem", "w") as fp: - for item in exp_table: - # write each item on a new lineformat(addr[i] ,f'0{width}b' - fp.write( - "%s\n" - % format(item, f'0{self.w_config["softmax"]["exp_width"]//4}x') - ) - with open(r"div_init.mem", "w") as fp: - for item in div_table: - # write each item on a new line - fp.write( - "%s\n" - % format(item, f'0{self.w_config["softmax"]["data_out_width"]//4}x') - ) - # data_pack - in_num_parallelism = self.in_num_parallelism - in_depth = self.in_depth - in_parallelism = self.in_parallelism - in_size = self.in_size - wqkv_parallelism = self.wqkv_parallelism - wqkv_num_parallelism = self.wqkv_num_parallelism - num_heads = self.num_heads - wp_parallelism = self.wp_parallelism - wp_num_parallelism = self.wp_num_parallelism - B = 1 - N = in_parallelism * 
in_num_parallelism - dim = in_size * in_depth - - wqkv = wqkv_tensor.reshape( - samples, 3, num_heads, wqkv_num_parallelism, wqkv_parallelism, dim - ).permute(1, 0, 3, 2, 4, 5) - wqkv = wqkv.reshape(3, samples, dim, dim) - bqkv = bqkv_tensor.reshape( - samples, 3, num_heads, wqkv_num_parallelism, wqkv_parallelism - ).permute(1, 0, 3, 2, 4) - bqkv = bqkv.reshape(3, samples, dim) - - wp = wp_tensor.reshape( - samples * dim, num_heads, wqkv_num_parallelism, wqkv_parallelism - ) - wp = wp.permute(0, 2, 1, 3).reshape(samples, dim, dim) - - wq = wqkv[0] - wk = wqkv[1] - wv = wqkv[2] - - bq = bqkv[0] - bk = bqkv[1] - bv = bqkv[2] - self.data_in = self.data_pack( - input_tensor, in_num_parallelism, in_depth, in_parallelism, in_size - ) - self.wq_in = self.data_pack( - wq, wqkv_num_parallelism, in_depth, num_heads * wqkv_parallelism, in_size - ) - self.wk_in = self.data_pack( - wk, wqkv_num_parallelism, in_depth, num_heads * wqkv_parallelism, in_size - ) - self.wv_in = self.data_pack( - wv, wqkv_num_parallelism, in_depth, num_heads * wqkv_parallelism, in_size - ) - self.wp_in = self.data_pack( - wp, - wp_num_parallelism, - wqkv_num_parallelism, - wp_parallelism, - num_heads * wqkv_parallelism, - ) - - self.bq_in = self.data_pack( - bq, 1, wqkv_num_parallelism, 1, num_heads * wqkv_parallelism - ) - self.bk_in = self.data_pack( - bk, 1, wqkv_num_parallelism, 1, num_heads * wqkv_parallelism - ) - self.bv_in = self.data_pack( - bv, 1, wqkv_num_parallelism, 1, num_heads * wqkv_parallelism - ) - self.bp_in = self.data_pack(bp_tensor, 1, wp_num_parallelism, 1, wp_parallelism) - - self.data_in.reverse() - self.wq_in.reverse() - self.wk_in.reverse() - self.wv_in.reverse() - self.wp_in.reverse() - self.bq_in.reverse() - self.bk_in.reverse() - self.bv_in.reverse() - self.bp_in.reverse() - - self.data_in = RandomSource( - name="data_in", - samples=samples * self.in_depth * self.in_num_parallelism, - num=self.in_parallelism * self.in_size, - max_stalls=2 * samples * self.in_depth * 
self.in_num_parallelism, - data_specify=self.data_in, - debug=debug, - ) - self.weight_q = RandomSource( - name="weight_q", - samples=samples * self.in_depth * self.wqkv_num_parallelism, - num=self.num_heads * self.wqkv_parallelism * self.in_size, - max_stalls=2 * samples * self.in_depth * self.wqkv_num_parallelism, - data_specify=self.wq_in, - debug=debug, - ) - self.weight_k = RandomSource( - name="weight_k", - samples=samples * self.in_depth * self.wqkv_num_parallelism, - num=self.num_heads * self.wqkv_parallelism * self.in_size, - max_stalls=2 * samples * self.in_depth * self.wqkv_num_parallelism, - data_specify=self.wk_in, - debug=debug, - ) - self.weight_v = RandomSource( - name="weight_v", - samples=samples * self.in_depth * self.wqkv_num_parallelism, - num=self.num_heads * self.wqkv_parallelism * self.in_size, - max_stalls=2 * samples * self.in_depth * self.wqkv_num_parallelism, - data_specify=self.wv_in, - debug=debug, - ) - self.weight_p = RandomSource( - name="weight_p", - samples=samples * self.wp_depth * self.wp_num_parallelism, - num=self.wp_parallelism * self.wp_size, - max_stalls=2 * samples * self.wp_depth * self.wp_num_parallelism, - data_specify=self.wp_in, - debug=debug, - ) - self.bias_q = RandomSource( - name="bias_q", - samples=samples * self.wqkv_num_parallelism, - num=self.num_heads * self.wqkv_parallelism, - max_stalls=2 * samples, - data_specify=self.bq_in, - debug=debug, - ) - self.bias_k = RandomSource( - name="bias_k", - samples=samples * self.wqkv_num_parallelism, - num=self.num_heads * self.wqkv_parallelism, - max_stalls=2 * samples, - data_specify=self.bk_in, - debug=debug, - ) - self.bias_v = RandomSource( - name="bias_v", - samples=samples * self.wqkv_num_parallelism, - num=self.num_heads * self.wqkv_parallelism, - max_stalls=2 * samples, - data_specify=self.bv_in, - debug=debug, - ) - self.bias_p = RandomSource( - name="bias_p", - samples=samples * self.wp_num_parallelism, - num=self.wp_parallelism, - max_stalls=2 * samples, - 
data_specify=self.bp_in, - debug=debug, - ) - - def sw_compute(self): - # get the matrix out result - # from M[num_parallelism][depth], - # and the element in M is m[parallelism][size] - # to M_out[in1_num_parallelism][in2_num_parallelism] - # the element in M_out is m_out[in1_parallelism][in2_parallelism] - - # collect all the input - # calculate the output - # cut the output to smaller sets - data_out = self.att(self.x) - output = self.data_pack( - q2i(data_out, self.out_width, self.out_frac_width), - self.in_num_parallelism, - self.wp_num_parallelism, - self.in_parallelism, - self.wp_parallelism, - ) - return output - - def data_pack(self, in_temp, np, d, p, s): - # assum in_temp.shape = (samples, batch = 1, N,dim) - in_temp = in_temp.to(torch.int).reshape(self.samples, np * p, d * s) - ref = [] - for i in range(self.samples): - re_tensor = rearrange( - in_temp[i], "(np p) (d s) -> np (p d) s", np=np, d=d, p=p, s=s - ) - ex_tensor = torch.zeros(np, d * p, s, dtype=int) - for b in range(np): - for i in range(d): - for j in range(p): - ex_tensor[b][i * p + j] = re_tensor[b][j * d + i] - output_tensor = rearrange( - ex_tensor, "np (d p) s -> (np d) (p s)", np=np, d=d, p=p, s=s - ) - output = output_tensor.tolist() - ref = ref + output - return ref - - -def debug_state(dut, state): - logger.debug( - "{} State: (wq_ready,wq_valid,wk_ready,wk_valid,wv_ready,wv_valid,in_ready,in_valid,data_out_ready,data_out_valid) = ({},{},{},{},{},{},{},{},{},{})".format( - state, - dut.weight_q_ready.value, - dut.weight_q_valid.value, - dut.weight_k_ready.value, - dut.weight_k_valid.value, - dut.weight_v_ready.value, - dut.weight_v_valid.value, - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - - -@cocotb.test() -async def cocotb_test_msa(dut): - """Test integer based vector mult""" - samples = 100 - test_case = VerificationCase(samples=samples) - # Reset cycle - await Timer(20, units="ns") - dut.rst.value = 1 - 
await Timer(100, units="ns") - dut.rst.value = 0 - - # Create a 10ns-period clock on port clk - clock = Clock(dut.clk, 10, units="ns") - # Start the clock - cocotb.start_soon(clock.start()) - await Timer(500, units="ns") - - # Synchronize with the clock - dut.weight_q_valid.value = 0 - dut.weight_k_valid.value = 0 - dut.weight_v_valid.value = 0 - dut.weight_p_valid.value = 0 - dut.bias_q_valid.value = 0 - dut.bias_k_valid.value = 0 - dut.bias_v_valid.value = 0 - dut.bias_p_valid.value = 0 - dut.data_in_valid.value = 0 - dut.data_out_ready.value = 1 - # debug_state(dut, "Pre-clk") - await FallingEdge(dut.clk) - # debug_state(dut, "Post-clk") - # debug_state(dut, "Pre-clk") - await FallingEdge(dut.clk) - # debug_state(dut, "Post-clk") - done = False - # Set a timeout to avoid deadlock - cdin = 0 - cdata_out = 0 - for i in range(samples * 15000): - await FallingEdge(dut.clk) - dut.weight_q_valid.value = test_case.weight_q.pre_compute() - dut.weight_k_valid.value = test_case.weight_k.pre_compute() - dut.weight_v_valid.value = test_case.weight_v.pre_compute() - dut.weight_p_valid.value = test_case.weight_p.pre_compute() - dut.bias_q_valid.value = test_case.bias_q.pre_compute() - dut.bias_k_valid.value = test_case.bias_k.pre_compute() - dut.bias_v_valid.value = test_case.bias_v.pre_compute() - dut.bias_p_valid.value = test_case.bias_p.pre_compute() - dut.data_in_valid.value = test_case.data_in.pre_compute() - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.pre_compute( - dut.data_out_valid.value - ) - await Timer(1, units="ns") - # debug_state(dut, "in compute") - dut.weight_q_valid.value, dut.weight_q.value = test_case.weight_q.compute( - dut.weight_q_ready.value - ) - dut.weight_k_valid.value, dut.weight_k.value = test_case.weight_k.compute( - dut.weight_k_ready.value - ) - dut.weight_v_valid.value, dut.weight_v.value = test_case.weight_v.compute( - dut.weight_v_ready.value - ) - dut.weight_p_valid.value, dut.weight_p.value = 
test_case.weight_p.compute( - dut.weight_p_ready.value - ) - - dut.bias_q_valid.value, dut.bias_q.value = test_case.bias_q.compute( - dut.bias_q_ready.value - ) - dut.bias_k_valid.value, dut.bias_k.value = test_case.bias_k.compute( - dut.bias_k_ready.value - ) - dut.bias_v_valid.value, dut.bias_v.value = test_case.bias_v.compute( - dut.bias_v_ready.value - ) - dut.bias_p_valid.value, dut.bias_p.value = test_case.bias_p.compute( - dut.bias_p_ready.value - ) - - dut.data_in_valid.value, dut.data_in.value = test_case.data_in.compute( - dut.data_in_ready.value - ) - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.compute( - dut.data_out_valid.value, dut.data_out.value - ) - await Timer(1, units="ns") - wave_check(dut) - if dut.data_in_valid.value == 1 and dut.data_in_ready.value == 1: - cdin += 1 - if dut.data_out_valid.value == 1 and dut.data_out_ready.value == 1: - cdata_out += 1 - print("cdin = ", cdin) - print("cdata_out = ", cdata_out) - if ( - test_case.weight_q.is_empty() - and test_case.weight_k.is_empty() - and test_case.weight_v.is_empty() - and test_case.weight_p.is_empty() - and test_case.bias_q.is_empty() - and test_case.bias_k.is_empty() - and test_case.bias_v.is_empty() - and test_case.bias_p.is_empty() - and test_case.data_in.is_empty() - and test_case.outputs.is_full() - ): - done = True - break - assert ( - done - ), "Deadlock detected or the simulation reaches the maximum cycle limit (fixed it by adjusting the loop trip count)" - - check_results(test_case.outputs.data, test_case.ref) - - -def wave_check(dut): - logger.debug( - "wave of in_out:\n\ - {},{},data_in = {} \n\ - {},{},data_out = {}\n\ - ".format( - dut.data_in_valid.value, - dut.data_in_ready.value, - [int(i) for i in dut.data_in.value], - dut.data_out_valid.value, - dut.data_out_ready.value, - [int(i) for i in dut.data_out.value], - ) - ) - - logger.debug( - "wave of sa_out:\n\ - {},{},sa_out = {} \n\ - ".format( - dut.sa_out_valid.value, - 
dut.sa_out_ready.value, - [int(i) for i in dut.sa_out.value], - ) - ) - - -import pytest - - -@pytest.mark.skip(reason="Needs to be fixed.") -def test_fixed_msa(): - tb = VerificationCase() - mase_runner(module_param_list=[tb.get_dut_parameters()]) - - -if __name__ == "__main__": - test_fixed_msa() diff --git a/src/mase_components/vision_models/vit/test/fixed_patch_embed_tb.py b/src/mase_components/vision_models/vit/test/fixed_patch_embed_tb.py deleted file mode 100644 index 05cf0f2cf..000000000 --- a/src/mase_components/vision_models/vit/test/fixed_patch_embed_tb.py +++ /dev/null @@ -1,376 +0,0 @@ -#!/usr/bin/env python3 - -# This script tests the fixed point linear -import logging - -from mase_cocotb.random_test import RandomSource, RandomSink, check_results -from mase_cocotb.runner import mase_runner - -import cocotb -from cocotb.triggers import Timer -from cocotb.triggers import FallingEdge -from cocotb.clock import Clock - -from einops import rearrange -import torch -from mase_cocotb.z_qlayers import quantize_to_int as q2i - -from .helpers.pvt_quant import QuantizedPatchEmbed - -debug = False - -logger = logging.getLogger("tb_signals") -if debug: - logger.setLevel(logging.DEBUG) - - -# DUT test specifications -class VerificationCase: - def __init__(self, samples=1): - # width config - self.w_config = { - "patch_proj": { - "name": "integer", - "data_in_width": 8, - "data_in_frac_width": 5, - "weight_width": 8, - "weight_frac_width": 6, - "bias_width": 8, - "bias_frac_width": 5, - } - } - self.data_out_width = 6 - self.data_out_frac_width = 2 - # parameters config - self.in_c = 3 - self.in_y = 32 - self.in_x = 32 - self.patch_size = 8 - self.embed_dim = 64 - self.pe_unroll_kernel_out = 3 - self.pe_unroll_in_c = 3 - self.pe_unroll_embed_dim = 8 - self.num_patch = int(self.in_y * self.in_x // (self.patch_size**2)) - - # self.num_classes = 10 - # self.head_unroll_out_x = 5 - self.samples = samples - - self.pe_iter_weight = int( - (self.patch_size**2) - * self.in_c 
- * self.embed_dim - / self.pe_unroll_kernel_out - / self.pe_unroll_embed_dim - ) - self.data_generate() - # TODO: here - self.data_in = RandomSource( - name="data_in", - samples=samples - * int(self.in_x * self.in_y * self.in_c / self.pe_unroll_in_c), - num=self.pe_unroll_in_c, - max_stalls=0, - # max_stalls=2 * samples * int(self.in_x * self.in_y * self.in_c / self.pe_unroll_in_c), - data_specify=self.x_in, - debug=debug, - ) - self.patch_embed_bias = RandomSource( - name="patch_embed_bias", - samples=samples * int(self.embed_dim / self.pe_unroll_embed_dim), - num=self.pe_unroll_embed_dim, - # max_stalls=2 * samples * int(self.embed_dim/self.pe_unroll_embed_dim), - data_specify=self.pe_b_in, - debug=debug, - ) - self.patch_embed_weight = RandomSource( - name="patch_embed_weight", - samples=samples * self.pe_iter_weight, - num=self.pe_unroll_kernel_out * self.pe_unroll_embed_dim, - # max_stalls=2 * samples * self.pe_iter_weight, - data_specify=self.pe_w_in, - debug=debug, - ) - - self.outputs = RandomSink( - samples=samples - * self.num_patch - * int(self.embed_dim / self.pe_unroll_embed_dim), - max_stalls=0, - # max_stalls=2 * samples * int(self.num_classes/self.head_unroll_out_x), - debug=debug, - ) - self.samples = samples - self.ref = self.sw_compute() - - def get_dut_parameters(self): - return { - "IN_WIDTH": self.w_config["patch_proj"]["data_in_width"], - "IN_FRAC_WIDTH": self.w_config["patch_proj"]["data_in_frac_width"], - "W_WIDTH": self.w_config["patch_proj"]["weight_width"], - "W_FRAC_WIDTH": self.w_config["patch_proj"]["weight_frac_width"], - "BIAS_WIDTH": self.w_config["patch_proj"]["bias_width"], - "BIAS_FRAC_WIDTH": self.w_config["patch_proj"]["bias_frac_width"], - "OUT_WIDTH": self.data_out_width, - "OUT_FRAC_WIDTH": self.data_out_frac_width, - "IN_C": self.in_c, - "IN_Y": self.in_y, - "IN_X": self.in_x, - "KERNEL_SIZE": self.patch_size, - "OUT_C": self.embed_dim, - "SLIDING_NUM": self.num_patch, - "UNROLL_KERNEL_OUT": self.pe_unroll_kernel_out, - 
"UNROLL_IN_C": self.pe_unroll_in_c, - "UNROLL_OUT_C": self.pe_unroll_embed_dim, - } - - def data_generate(self): - torch.manual_seed(0) - self.patch_embed = QuantizedPatchEmbed( - img_size=self.in_x, - patch_size=self.patch_size, - embed_dim=self.embed_dim, - in_chans=self.in_c, - config=self.w_config, - ) - # get parameters with integer format - patch_w_1 = q2i( - self.patch_embed.proj.weight, - self.w_config["patch_proj"]["weight_width"], - self.w_config["patch_proj"]["weight_frac_width"], - ) - print("weight = ", self.patch_embed.proj.weight) - patch_b_1 = q2i( - self.patch_embed.proj.bias, - self.w_config["patch_proj"]["bias_width"], - self.w_config["patch_proj"]["bias_frac_width"], - ) - print("patch_b_1 = ", patch_b_1) - print("bias = ", self.patch_embed.proj.bias) - self.x = 5 * torch.randn(self.samples, self.in_c, self.in_y, self.in_x) - self.x_in = q2i( - self.x, - self.w_config["patch_proj"]["data_in_width"], - self.w_config["patch_proj"]["data_in_frac_width"], - ) - print("x = ", self.x) - # parameters packs - self.pe_w_in, self.pe_b_in = self.conv_pack( - weight=patch_w_1, - bias=patch_b_1, - in_channels=self.in_c, - kernel_size=[self.patch_size, self.patch_size], - out_channels=self.embed_dim, - unroll_in_channels=self.pe_unroll_in_c, - unroll_kernel_out=self.pe_unroll_kernel_out, - unroll_out_channels=self.pe_unroll_embed_dim, - ) - - self.x_in = self.x_in.permute(0, 2, 3, 1).reshape(-1, self.pe_unroll_in_c) - - self.x_in = self.x_in.flip(0).tolist() - - def sw_compute(self): - data_out, _ = self.patch_embed(self.x) - # breakpoint() - print(data_out) - output = self.linear_data_pack( - q2i(data_out, self.data_out_width, self.data_out_frac_width), - in_y=self.num_patch, - in_x=self.embed_dim, - unroll_in_y=1, - unroll_in_x=self.pe_unroll_embed_dim, - ) - return output - - def linear_data_pack(self, in_temp, in_y, in_x, unroll_in_y, unroll_in_x): - ## just what to make a matrix with [np*p][s*d] to tile [np*d][p*s] - ## assume the in_temp as torch.float 
- np = int(in_y / unroll_in_y) - d = int(in_x / unroll_in_x) - p = unroll_in_y - s = unroll_in_x - - in_temp = in_temp.to(torch.int).reshape(self.samples, np * p, d * s) - ref = [] - for i in range(self.samples): - re_tensor = rearrange( - in_temp[i], "(np p) (d s) -> np (p d) s", np=np, d=d, p=p, s=s - ) - ex_tensor = torch.zeros(np, d * p, s, dtype=int) - for b in range(np): - for i in range(d): - for j in range(p): - ex_tensor[b][i * p + j] = re_tensor[b][j * d + i] - output_tensor = rearrange( - ex_tensor, "np (d p) s -> (np d) (p s)", np=np, d=d, p=p, s=s - ) - output = output_tensor.tolist() - ref = ref + output - return ref - - def conv_pack( - self, - weight, - bias, - in_channels, - kernel_size, - out_channels, - unroll_in_channels, - unroll_kernel_out, - unroll_out_channels, - ): - samples = self.samples - # requires input as a quantized int format - # weight_pack - # from (oc,ic/u_ic,u_ic,h,w) to (ic/u_ic,h*w,u_ic,oc) - reorder_w_tensor = ( - weight.repeat(samples, 1, 1, 1, 1) - .reshape( - samples, - out_channels, - int(in_channels / unroll_in_channels), - unroll_in_channels, - kernel_size[0] * kernel_size[1], - ) - .permute(0, 2, 4, 3, 1) - ) - - # reverse the final 2 dimension - # from(samples, int(kernel_height * kernel_width * in_channels / unroll_kernel_out), unroll_kernel_out, int(out_channels/unroll_out_channels), unroll_out_channels) - # to (samples, int(out_channels/unroll_out_channels), int(kernel_height * kernel_width * in_channels / unroll_kernel_out), unroll_out_channels, unroll_kernel_out) - w_tensor = reorder_w_tensor.reshape( - samples, - int(kernel_size[0] * kernel_size[1] * in_channels / unroll_kernel_out), - unroll_kernel_out, - int(out_channels / unroll_out_channels), - unroll_out_channels, - ).permute(0, 3, 1, 4, 2) - - w_tensor = w_tensor.reshape( - -1, - unroll_out_channels * unroll_kernel_out, - ) - w_in = w_tensor.type(torch.int).flip(0).tolist() - # bias_pack - bias_tensor = bias.repeat(samples, 1).reshape(-1, 
unroll_out_channels) - b_in = bias_tensor.type(torch.int).flip(0).tolist() - return w_in, b_in - - -@cocotb.test() -async def cocotb_test_fixed_linear(dut): - # TODO: - """Test integer based vector mult""" - samples = 10 - test_case = VerificationCase(samples=samples) - - # Reset cycle - await Timer(20, units="ns") - dut.rst.value = 1 - await Timer(100, units="ns") - dut.rst.value = 0 - - # Create a 10ns-period clock on port clk - clock = Clock(dut.clk, 10, units="ns") - # Start the clock - cocotb.start_soon(clock.start()) - await Timer(500, units="ns") - - # Synchronize with the clock - dut.weight_valid.value = 0 - dut.bias_valid.value = 0 - dut.data_in_valid.value = 0 - dut.data_out_ready.value = 1 - # debug_state(dut, "Pre-clk") - await FallingEdge(dut.clk) - # debug_state(dut, "Post-clk") - # debug_state(dut, "Pre-clk") - await FallingEdge(dut.clk) - # debug_state(dut, "Post-clk") - - done = False - cdin = 0 - cpatch_out = 0 - # Set a timeout to avoid deadlock - for i in range(samples * 400000): - await FallingEdge(dut.clk) - # debug_state(dut, "Post-clk") - dut.rst.value = 0 - dut.bias_valid.value = test_case.patch_embed_bias.pre_compute() - dut.weight_valid.value = test_case.patch_embed_weight.pre_compute() - dut.data_in_valid.value = test_case.data_in.pre_compute() - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.pre_compute(dut.data_out_valid) - await Timer(1, units="ns") - # start input data - dut.weight_valid.value, dut.weight.value = test_case.patch_embed_weight.compute( - dut.weight_ready.value - ) - dut.bias_valid.value, dut.bias.value = test_case.patch_embed_bias.compute( - dut.bias_ready.value - ) - dut.data_in_valid.value, dut.data_in.value = test_case.data_in.compute( - dut.data_in_ready.value - ) - - await Timer(1, units="ns") - - dut.data_out_ready.value = test_case.outputs.compute( - dut.data_out_valid.value, dut.data_out.value - ) - await Timer(1, units="ns") - # if(dut.data_out_ready.value and 
dut.data_out_valid.value): - # if() - # dut.rst.value = 1 - # dut.data_in_ready.value = 0 - wave_check(dut) - if dut.data_in_valid.value == 1 and dut.data_in_ready.value == 1: - cdin += 1 - if dut.data_out_valid.value == 1 and dut.data_out_ready.value == 1: - cpatch_out += 1 - print("cdin = ", cdin) - print("cpatch_out = ", cpatch_out) - # breakpoint() - if ( - test_case.outputs.is_full() - and test_case.patch_embed_bias.is_empty() - and test_case.patch_embed_weight.is_empty() - and test_case.data_in.is_empty() - ): - done = True - break - assert ( - done - ), "Deadlock detected or the simulation reaches the maximum cycle limit (fixed it by adjusting the loop trip count)" - - check_results(test_case.outputs.data, test_case.ref) - - -def wave_check(dut): - logger.debug( - "wave_check:\n\ - {},{} data_in = {}\n\ - {},{} data_out = {}\n\ - ".format( - dut.conv_inst.fl_instance.data_in_valid.value, - dut.conv_inst.fl_instance.data_in_ready.value, - [int(i) for i in dut.conv_inst.fl_instance.data_in.value], - dut.conv_inst.fl_instance.data_out_valid.value, - dut.conv_inst.fl_instance.data_out_ready.value, - [int(i) for i in dut.conv_inst.fl_instance.data_out.value], - ) - ) - - -import pytest - - -@pytest.mark.skip(reason="Needs to be fixed.") -def test_fixed_patch_embed(): - tb = VerificationCase() - mase_runner(module_param_list=[tb.get_dut_parameters()]) - - -if __name__ == "__main__": - test_fixed_patch_embed() diff --git a/src/mase_components/vision_models/vit/test/fixed_pvt_tb.py b/src/mase_components/vision_models/vit/test/fixed_pvt_tb.py deleted file mode 100644 index 06b921e81..000000000 --- a/src/mase_components/vision_models/vit/test/fixed_pvt_tb.py +++ /dev/null @@ -1,1531 +0,0 @@ -#!/usr/bin/env python3 - -# This script tests the fixed point linear -import random, os, math, logging, sys -import numpy as np - -sys.path.append( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -sys.path.append("/workspace/machop/") 
-sys.path.append("/workspace/components/testbench/ViT/") - -from mase_cocotb.random_test import RandomSource -from mase_cocotb.random_test import RandomSink -from mase_cocotb.random_test import check_results - -import cocotb -from cocotb.triggers import Timer -from cocotb.triggers import FallingEdge -from cocotb.clock import Clock -from cocotb.runner import get_runner - -from einops import rearrange -import torch -import torch.nn as nn -import torch.nn.functional as F -from mase_cocotb.z_qlayers import quantize_to_int as q2i -from chop.models.manual.quant_utils import get_quantized_cls -from .helpers.pvt_quant import QuantizedPyramidVisionTransformer -from mase_cocotb.z_qlayers import linear_data_pack -from .helpers.ha_softmax import generate_table_hardware, generate_table_div_hardware - -debug = False - -logger = logging.getLogger("tb_signals") -if debug: - logger.setLevel(logging.DEBUG) - - -# DUT test specifications -class VerificationCase: - def __init__(self, samples=1): - # width config - self.samples = samples - self.width_generate() - # parameters config - self.in_c = 3 - self.in_y = 224 - self.in_x = 224 - self.embed_dim = 384 - self.patch_size = 16 - self.num_patch = self.in_y * self.in_x // (self.patch_size**2) - - self.num_heads = 6 - self.mlp_ratio = 2 - - self.pe_unroll_kernel_out = 24 - self.pe_unroll_in_c = 3 - self.pe_unroll_embed_dim = 8 - self.blk_unroll_qkv_dim = 2 - self.blk_unroll_hidden_features = 4 - - self.num_classes = 10 - self.head_unroll_out_x = 1 - - self.pe_iter_weight = int( - (self.patch_size**2) - * self.in_c - * self.embed_dim - / self.pe_unroll_kernel_out - / self.pe_unroll_embed_dim - ) - self.source_generate() - self.outputs = RandomSink( - samples=samples * int(self.num_classes / self.head_unroll_out_x), - debug=debug, - ) - self.samples = samples - self.ref = self.sw_compute() - - def width_generate(self): - din, din_f = 8, 3 - - aff_msa_w, aff_msa_w_f = din, din_f - aff_msa_b, aff_msa_b_f = 8, 3 - msa_din, msa_din_f = 8, 3 - 
wq, wq_f = 6, 4 - wkv, wkv_f = 6, 4 - wp, wp_f = 6, 4 - - bq, bq_f = 6, 4 - bkv, bkv_f = 6, 4 - bp, bp_f = 6, 4 - - dq, dq_f = 8, 3 - dk, dk_f = 8, 3 - dv, dv_f = 8, 3 - ds, ds_f = 8, 3 - softmax_exp, softmax_exp_f = 8, 5 - softmax_ds, softmax_ds_f = 8, 3 - div = 9 - dz, dz_f = 8, 3 - - msa_o, msa_o_f = 8, 3 - - aff_mlp_w, aff_mlp_w_f = msa_o + 1, msa_o_f - aff_mlp_b, aff_mlp_b_f = 8, 3 - - mlp_din, mlp_din_f = 8, 3 - fc1_w, fc1_w_f = 6, 4 - fc1_b, fc1_b_f = 6, 4 - mlp_hidden, mlp_hidden_f = 8, 3 - fc2_w, fc2_w_f = 6, 4 - fc2_b, fc2_b_f = 6, 4 - mlp_o, mlp_o_f = 8, 3 - self.w_config = { - "patch_embed": { - "patch_proj": { - "name": "integer", - "data_in_width": 8, - "data_in_frac_width": 5, - "weight_width": 8, - "weight_frac_width": 6, - "bias_width": 8, - "bias_frac_width": 5, - }, - }, - "pos_add": { - "name": "integer", - "data_in_width": 8, - "data_in_frac_width": 5, - }, - "block": { - "affine_att": { - "mul": { - "name": "integer", - "data_in_width": aff_msa_w, - "data_in_frac_width": aff_msa_w_f, - }, - "add": { - "name": "integer", - "data_in_width": aff_msa_b, - "data_in_frac_width": aff_msa_b_f, - }, - }, - "msa": { - "q_proj": { - "name": "integer", - "weight_width": wq, - "weight_frac_width": wq_f, - "data_in_width": msa_din, - "data_in_frac_width": msa_din_f, - "bias_width": bq, - "bias_frac_width": bq_f, - }, - "kv_proj": { - "name": "integer", - "weight_width": wkv, - "weight_frac_width": wkv_f, - "data_in_width": msa_din, - "data_in_frac_width": msa_din_f, - "bias_width": bkv, - "bias_frac_width": bkv_f, - }, - "z_proj": { - "name": "integer", - "weight_width": wp, - "weight_frac_width": wp_f, - "data_in_width": dz, - "data_in_frac_width": dz_f, - "bias_width": bp, - "bias_frac_width": bp_f, - }, - "softmax": { - "name": "integer", - "exp_width": softmax_exp, - "exp_frac_width": softmax_exp_f, - "data_in_width": ds, - "data_in_frac_width": ds_f, - "data_out_width": softmax_ds, - "data_out_frac_width": softmax_ds_f, - "div_width": div, - }, - 
"attn_matmul": { - "name": "integer", - "data_in_width": dq, - "data_in_frac_width": dq_f, - "weight_width": dk, - "weight_frac_width": dk_f, - }, - "z_matmul": { - "name": "integer", - "data_in_width": softmax_ds, - "data_in_frac_width": softmax_ds_f, - "weight_width": dv, - "weight_frac_width": dv_f, - }, - }, - "add1": { - "name": "integer", - "data_in_width": msa_o, - "data_in_frac_width": msa_o_f, - }, - "affine_mlp": { - "mul": { - "name": "integer", - "data_in_width": aff_mlp_w, - "data_in_frac_width": aff_mlp_w_f, - }, - "add": { - "name": "integer", - "data_in_width": aff_mlp_b, - "data_in_frac_width": aff_mlp_b_f, - }, - }, - "mlp": { - "fc1_proj": { - "name": "integer", - "weight_width": fc1_w, - "weight_frac_width": fc1_w_f, - "data_in_width": mlp_din, - "data_in_frac_width": mlp_din_f, - "bias_width": fc1_b, - "bias_frac_width": fc1_b_f, - }, - "mlp_relu": { - "name": "integer", - "bypass": True, - "data_in_width": mlp_hidden, - "data_in_frac_width": mlp_hidden_f, - }, - "fc2_proj": { - "name": "integer", - "weight_width": fc2_w, - "weight_frac_width": fc2_w_f, - "data_in_width": mlp_hidden, - "data_in_frac_width": mlp_hidden_f, - "bias_width": fc2_b, - "bias_frac_width": fc2_b_f, - }, - }, - "add2": { - "name": "integer", - "data_in_width": mlp_o - 1, - "data_in_frac_width": mlp_o_f, - }, - }, - # "pvt_norm":{ - # "mul":{ - # "name": "integer", - # "data_in_width": aff_mlp_w, - # "data_in_frac_width": aff_mlp_w_f, - # }, - # "add":{ - # "name": "integer", - # "data_in_width": aff_mlp_b, - # "data_in_frac_width": aff_mlp_b_f, - # }, - # }, - "head_proj": { - "name": "integer", - "data_in_width": mlp_o, - "data_in_frac_width": mlp_o_f, - "weight_width": 8, - "weight_frac_width": 4, - "bias_width": 8, - "bias_frac_width": 4, - }, - } - self.ow, self.ow_f = 8, 3 - - def get_dut_parameters(self): - return { - "IN_WIDTH": self.w_config["patch_embed"]["patch_proj"]["data_in_width"], - "IN_FRAC_WIDTH": self.w_config["patch_embed"]["patch_proj"][ - 
"data_in_frac_width" - ], - "PATCH_EMBED_W_WIDTH_3": self.w_config["patch_embed"]["patch_proj"][ - "weight_width" - ], - "PATCH_EMBED_W_FRAC_WIDTH_3": self.w_config["patch_embed"]["patch_proj"][ - "weight_frac_width" - ], - "PATCH_EMBED_B_WIDTH_3": self.w_config["patch_embed"]["patch_proj"][ - "bias_width" - ], - "PATCH_EMBED_B_FRAC_WIDTH_3": self.w_config["patch_embed"]["patch_proj"][ - "bias_frac_width" - ], - "POS_ADD_IN_WIDTH_3": self.w_config["pos_add"]["data_in_width"], - "POS_ADD_IN_FRAC_WIDTH_3": self.w_config["pos_add"]["data_in_frac_width"], - "BLOCK_IN_WIDTH": self.w_config["block"]["affine_att"]["mul"][ - "data_in_width" - ], - "BLOCK_IN_FRAC_WIDTH": self.w_config["block"]["affine_att"]["mul"][ - "data_in_frac_width" - ], - "BLOCK_AF_MSA_ADD_WIDTH": self.w_config["block"]["affine_att"]["add"][ - "data_in_width" - ], - "BLOCK_AF_MSA_ADD_FRAC_WIDTH": self.w_config["block"]["affine_att"]["add"][ - "data_in_frac_width" - ], - "BLOCK_MSA_IN_WIDTH": self.w_config["block"]["msa"]["q_proj"][ - "data_in_width" - ], - "BLOCK_MSA_IN_FRAC_WIDTH": self.w_config["block"]["msa"]["q_proj"][ - "data_in_frac_width" - ], - "BLOCK_WQ_WIDTH": self.w_config["block"]["msa"]["q_proj"]["weight_width"], - "BLOCK_WQ_FRAC_WIDTH": self.w_config["block"]["msa"]["q_proj"][ - "weight_frac_width" - ], - "BLOCK_WK_WIDTH": self.w_config["block"]["msa"]["kv_proj"]["weight_width"], - "BLOCK_WK_FRAC_WIDTH": self.w_config["block"]["msa"]["kv_proj"][ - "weight_frac_width" - ], - "BLOCK_WV_WIDTH": self.w_config["block"]["msa"]["kv_proj"]["weight_width"], - "BLOCK_WV_FRAC_WIDTH": self.w_config["block"]["msa"]["kv_proj"][ - "weight_frac_width" - ], - "BLOCK_WP_WIDTH": self.w_config["block"]["msa"]["z_proj"]["weight_width"], - "BLOCK_WP_FRAC_WIDTH": self.w_config["block"]["msa"]["z_proj"][ - "weight_frac_width" - ], - "BLOCK_BQ_WIDTH": self.w_config["block"]["msa"]["q_proj"]["bias_width"], - "BLOCK_BQ_FRAC_WIDTH": self.w_config["block"]["msa"]["q_proj"][ - "bias_frac_width" - ], - 
"BLOCK_BK_WIDTH": self.w_config["block"]["msa"]["kv_proj"]["bias_width"], - "BLOCK_BK_FRAC_WIDTH": self.w_config["block"]["msa"]["kv_proj"][ - "bias_frac_width" - ], - "BLOCK_BV_WIDTH": self.w_config["block"]["msa"]["kv_proj"]["bias_width"], - "BLOCK_BV_FRAC_WIDTH": self.w_config["block"]["msa"]["kv_proj"][ - "bias_frac_width" - ], - "BLOCK_BP_WIDTH": self.w_config["block"]["msa"]["z_proj"]["bias_width"], - "BLOCK_BP_FRAC_WIDTH": self.w_config["block"]["msa"]["z_proj"][ - "bias_frac_width" - ], - "BLOCK_DQ_WIDTH": self.w_config["block"]["msa"]["attn_matmul"][ - "data_in_width" - ], - "BLOCK_DQ_FRAC_WIDTH": self.w_config["block"]["msa"]["attn_matmul"][ - "data_in_frac_width" - ], - "BLOCK_DK_WIDTH": self.w_config["block"]["msa"]["attn_matmul"][ - "weight_width" - ], - "BLOCK_DK_FRAC_WIDTH": self.w_config["block"]["msa"]["attn_matmul"][ - "weight_frac_width" - ], - "BLOCK_DS_WIDTH": self.w_config["block"]["msa"]["z_matmul"][ - "data_in_width" - ], - "BLOCK_DS_FRAC_WIDTH": self.w_config["block"]["msa"]["z_matmul"][ - "data_in_frac_width" - ], - "BLOCK_DV_WIDTH": self.w_config["block"]["msa"]["z_matmul"]["weight_width"], - "BLOCK_DV_FRAC_WIDTH": self.w_config["block"]["msa"]["z_matmul"][ - "weight_frac_width" - ], - "BLOCK_EXP_WIDTH": self.w_config["block"]["msa"]["softmax"]["exp_width"], - "BLOCK_EXP_FRAC_WIDTH": self.w_config["block"]["msa"]["softmax"][ - "exp_frac_width" - ], - "BLOCK_DIV_WIDTH": self.w_config["block"]["msa"]["softmax"]["div_width"], - "BLOCK_DS_SOFTMAX_WIDTH": self.w_config["block"]["msa"]["softmax"][ - "data_out_width" - ], - "BLOCK_DS_SOFTMAX_FRAC_WIDTH": self.w_config["block"]["msa"]["softmax"][ - "data_out_frac_width" - ], - "BLOCK_DZ_WIDTH": self.w_config["block"]["msa"]["z_proj"]["data_in_width"], - "BLOCK_DZ_FRAC_WIDTH": self.w_config["block"]["msa"]["z_proj"][ - "data_in_frac_width" - ], - "BLOCK_AF_MLP_IN_WIDTH": self.w_config["block"]["affine_mlp"]["mul"][ - "data_in_width" - ], - "BLOCK_AF_MLP_IN_FRAC_WIDTH": 
self.w_config["block"]["affine_mlp"]["mul"][ - "data_in_frac_width" - ], - "BLOCK_AF_MLP_ADD_WIDTH": self.w_config["block"]["affine_mlp"]["add"][ - "data_in_width" - ], - "BLOCK_AF_MLP_ADD_FRAC_WIDTH": self.w_config["block"]["affine_mlp"]["add"][ - "data_in_frac_width" - ], - # mlp - "BLOCK_MLP_IN_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"][ - "data_in_width" - ], - "BLOCK_MLP_IN_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"][ - "data_in_frac_width" - ], - "BLOCK_WEIGHT_I2H_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"][ - "weight_width" - ], - "BLOCK_WEIGHT_I2H_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"][ - "weight_frac_width" - ], - "BLOCK_BIAS_I2H_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"][ - "bias_width" - ], - "BLOCK_BIAS_I2H_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc1_proj"][ - "bias_frac_width" - ], - "BLOCK_MLP_HAS_BIAS": 1, - "BLOCK_MLP_HIDDEN_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"][ - "data_in_width" - ], - "BLOCK_MLP_HIDDEN_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"][ - "data_in_frac_width" - ], - "BLOCK_WEIGHT_H2O_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"][ - "weight_width" - ], - "BLOCK_WEIGHT_H2O_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"][ - "weight_frac_width" - ], - "BLOCK_BIAS_H2O_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"][ - "bias_width" - ], - "BLOCK_BIAS_H2O_FRAC_WIDTH": self.w_config["block"]["mlp"]["fc2_proj"][ - "bias_frac_width" - ], - "HEAD_IN_WIDTH": self.w_config["head_proj"]["data_in_width"], - "HEAD_IN_FRAC_WIDTH": self.w_config["head_proj"]["data_in_frac_width"], - "HEAD_W_WIDTH": self.w_config["head_proj"]["weight_width"], - "HEAD_W_FRAC_WIDTH": self.w_config["head_proj"]["weight_frac_width"], - "HEAD_B_WIDTH": self.w_config["head_proj"]["bias_width"], - "HEAD_B_FRAC_WIDTH": self.w_config["head_proj"]["bias_frac_width"], - "OUT_WIDTH": self.ow, - "OUT_FRAC_WIDTH": self.ow_f, - "PATCH_EMBED_IN_C_3": self.in_c, - "PATCH_EMBED_IN_Y_3": self.in_y, - 
"PATCH_EMEBD_IN_X_3": self.in_x, - "PATCH_SIZE_3": self.patch_size, - "PATCH_EMBED_EMBED_DIM_3": self.embed_dim, - "PATCH_EMEBD_NUM_PATCH_3": self.num_patch, - "PATCH_EMEBD_UNROLL_KERNEL_OUT_3": self.pe_unroll_kernel_out, - "PATCH_EMEBD_UNROLL_IN_C_3": self.pe_unroll_in_c, - "PATCH_EMBED_UNROLL_EMBED_DIM_3": self.pe_unroll_embed_dim, - "NUM_HEADS": self.num_heads, - "MLP_RATIO": self.mlp_ratio, - "BLOCK_UNROLL_WQKV_DIM": self.blk_unroll_qkv_dim, - "BLOCK_UNROLL_HIDDEN_FEATURES": self.blk_unroll_hidden_features, - "NUM_CLASSES": self.num_classes, - "HEAD_UNROLL_OUT_X": self.head_unroll_out_x, - } - - def source_generate(self): - samples = self.samples - torch.manual_seed(2) - self.x = torch.randn((samples, self.in_c, self.in_y, self.in_x)) - input_tensor = q2i( - self.x, - self.w_config["patch_embed"]["patch_proj"]["data_in_width"], - self.w_config["patch_embed"]["patch_proj"]["data_in_frac_width"], - ) - x_in = input_tensor.permute(0, 2, 3, 1).reshape(-1, self.pe_unroll_in_c) - - x_in = x_in.flip(0).tolist() - self.inputs = RandomSource( - max_stalls=0, - name="data_in", - samples=samples - * int(self.in_x * self.in_y * self.in_c / self.pe_unroll_in_c), - num=self.pe_unroll_in_c, - data_specify=x_in, - debug=debug, - ) - self.pvt = QuantizedPyramidVisionTransformer( - img_size=self.in_y, - in_chans=self.in_c, - num_classes=self.num_classes, - patch_size=self.patch_size, - embed_dims=[self.embed_dim, self.embed_dim, self.embed_dim, self.embed_dim], - num_heads=[self.num_heads, self.num_heads, self.num_heads, self.num_heads], - mlp_ratios=[self.mlp_ratio, self.mlp_ratio, self.mlp_ratio, self.mlp_ratio], - qkv_bias=True, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - depths=[1, 1, 1, 1], - num_stages=1, - config=self.w_config, - ) - self.patch_source_generate(self.pvt) - self.block_source_generate(self.pvt.block1[0]) - - def patch_source_generate(self, pvt): - samples = self.samples - patch_w_1 = q2i( - pvt.patch_embed1.proj.weight, - 
self.w_config["patch_embed"]["patch_proj"]["weight_width"], - self.w_config["patch_embed"]["patch_proj"]["weight_frac_width"], - ) - patch_b_1 = q2i( - pvt.patch_embed1.proj.bias, - self.w_config["patch_embed"]["patch_proj"]["bias_width"], - self.w_config["patch_embed"]["patch_proj"]["bias_frac_width"], - ) - cls_in = q2i( - self.pvt.cls_token, - self.w_config["pos_add"]["data_in_width"], - self.w_config["pos_add"]["data_in_frac_width"], - ) - # NOTE: only 1 layer patch - # position embed - patch_embed_H = self.in_y // self.patch_size - patch_embed_W = patch_embed_H - H = patch_embed_H - W = patch_embed_W - pos_embed = self.pvt.pos_embed1 - pos_embed_ = ( - F.interpolate( - pos_embed[:, 1:] - .reshape(1, patch_embed_H, patch_embed_W, -1) - .permute(0, 3, 1, 2), - size=(H, W), - mode="bilinear", - ) - .reshape(1, -1, H * W) - .permute(0, 2, 1) - ) - pos_embed = torch.cat((pos_embed[:, 0:1], pos_embed_), dim=1) - pos_in = q2i( - pos_embed, - self.w_config["pos_add"]["data_in_width"], - self.w_config["pos_add"]["data_in_frac_width"], - ) - head_w = q2i( - self.pvt.head.weight, - self.w_config["head_proj"]["weight_width"], - self.w_config["head_proj"]["weight_frac_width"], - ) - head_b = q2i( - self.pvt.head.bias, - self.w_config["head_proj"]["bias_width"], - self.w_config["head_proj"]["bias_frac_width"], - ) - # parameters packs - pe_w_in, pe_b_in = self.conv_pack( - weight=patch_w_1, - bias=patch_b_1, - in_channels=self.in_c, - kernel_size=[self.patch_size, self.patch_size], - out_channels=self.embed_dim, - unroll_in_channels=self.pe_unroll_in_c, - unroll_kernel_out=self.pe_unroll_kernel_out, - unroll_out_channels=self.pe_unroll_embed_dim, - ) - - cls_in = linear_data_pack( - samples, - cls_in.repeat(self.samples, 1, 1), - in_y=1, - unroll_in_y=1, - in_x=self.embed_dim, - unroll_in_x=self.pe_unroll_embed_dim, - ) - pos_in = linear_data_pack( - samples, - pos_in.repeat(self.samples, 1, 1), - in_y=self.num_patch + 1, - unroll_in_y=1, - in_x=self.embed_dim, - 
unroll_in_x=self.pe_unroll_embed_dim, - ) - h_w_in = linear_data_pack( - samples, - head_w.repeat(self.samples, 1, 1), - in_y=self.num_classes, - unroll_in_y=self.head_unroll_out_x, - in_x=self.embed_dim, - unroll_in_x=self.pe_unroll_embed_dim, - ) - - h_b_in = linear_data_pack( - samples, - head_b.repeat(self.samples, 1, 1), - in_y=self.num_classes, - unroll_in_y=self.head_unroll_out_x, - in_x=1, - unroll_in_x=1, - ) - - cls_in.reverse() - pos_in.reverse() - h_w_in.reverse() - h_b_in.reverse() - - self.patch_embed_bias = RandomSource( - max_stalls=0, - name="patch_embed_bias", - samples=samples * int(self.embed_dim / self.pe_unroll_embed_dim), - num=self.pe_unroll_embed_dim, - data_specify=pe_b_in, - debug=debug, - ) - self.patch_embed_weight = RandomSource( - max_stalls=0, - name="patch_embed_weight", - samples=samples * self.pe_iter_weight, - num=self.pe_unroll_kernel_out * self.pe_unroll_embed_dim, - data_specify=pe_w_in, - debug=debug, - ) - self.cls_token = RandomSource( - max_stalls=0, - name="cls_token_data", - samples=samples * (self.embed_dim // self.pe_unroll_embed_dim), - num=self.pe_unroll_embed_dim, - data_specify=cls_in, - debug=debug, - ) - self.pos_embed = RandomSource( - max_stalls=0, - name="pos_embed_data", - samples=samples - * (self.num_patch + 1) - * (self.embed_dim // self.pe_unroll_embed_dim), - num=self.pe_unroll_embed_dim, - data_specify=pos_in, - debug=debug, - ) - self.head_bias = RandomSource( - max_stalls=0, - name="head_bias", - samples=samples * int(self.num_classes / self.head_unroll_out_x), - num=self.head_unroll_out_x, - data_specify=h_b_in, - debug=debug, - ) - self.head_weight = RandomSource( - max_stalls=0, - name="head_weight", - samples=samples - * int(self.num_classes / self.head_unroll_out_x) - * int(self.embed_dim / self.pe_unroll_embed_dim), - num=self.head_unroll_out_x * self.pe_unroll_embed_dim, - data_specify=h_w_in, - debug=debug, - ) - - def data_pack(self, in_temp, np, d, p, s): - # assum in_temp.shape = (samples, 
batch = 1, N,dim) - in_temp = in_temp.to(torch.int).reshape(self.samples, np * p, d * s) - ref = [] - for i in range(self.samples): - re_tensor = rearrange( - in_temp[i], "(np p) (d s) -> np (p d) s", np=np, d=d, p=p, s=s - ) - ex_tensor = torch.zeros(np, d * p, s, dtype=int) - for b in range(np): - for i in range(d): - for j in range(p): - ex_tensor[b][i * p + j] = re_tensor[b][j * d + i] - output_tensor = rearrange( - ex_tensor, "np (d p) s -> (np d) (p s)", np=np, d=d, p=p, s=s - ) - output = output_tensor.tolist() - ref = ref + output - return ref - - def block_source_generate(self, qblock): - samples = self.samples - w_config = self.w_config["block"] - att = qblock.attn - aff_att = qblock.norm1 - in_y = self.num_patch + 1 - in_x = self.embed_dim - qkv_x = in_x - out_x = in_x - unroll_in_y = 1 - unroll_in_x = self.pe_unroll_embed_dim - unroll_qkv_x = self.blk_unroll_qkv_dim - unroll_out_x = unroll_in_x - self.msa_data_generate( - att, - in_x, - unroll_in_x, - qkv_x, - unroll_qkv_x, - self.num_heads, - out_x, - unroll_out_x, - ) - mlp = qblock.mlp - in_features = in_x - unroll_in_features = unroll_in_x - hidden_features = self.mlp_ratio * in_features - unroll_hidden_features = self.blk_unroll_hidden_features - out_features = in_x - unroll_out_features = unroll_in_x - self.mlp_data_generate( - mlp, - in_features, - hidden_features, - out_features, - unroll_in_features, - unroll_hidden_features, - unroll_out_features, - ) - - num = in_y * in_x // (unroll_in_x * unroll_in_y) - in_size = unroll_in_x * unroll_in_y - aff_att_w, aff_att_b = self.aff_data_generate( - w_config["affine_att"], aff_att, num, in_size - ) - aff_mlp = qblock.norm2 - aff_mlp_w, aff_mlp_b = self.aff_data_generate( - w_config["affine_mlp"], aff_mlp, num, in_size - ) - self.aff_att_weight = RandomSource( - max_stalls=0, - samples=samples * num, - num=in_size, - is_data_vector=True, - debug=debug, - data_specify=aff_att_w, - ) - self.aff_att_bias = RandomSource( - max_stalls=0, - samples=samples * 
num, - num=in_size, - is_data_vector=True, - debug=debug, - data_specify=aff_att_b, - ) - - self.aff_mlp_weight = RandomSource( - max_stalls=0, - samples=samples * num, - num=in_size, - is_data_vector=True, - debug=debug, - data_specify=aff_mlp_w, - ) - self.aff_mlp_bias = RandomSource( - max_stalls=0, - samples=samples * num, - num=in_size, - is_data_vector=True, - debug=debug, - data_specify=aff_mlp_b, - ) - - def aff_data_generate(self, config, qaff, num, in_size): - fixed_aff = qaff - w = fixed_aff.weight - b = fixed_aff.bias - weight_in = ( - q2i(w, config["mul"]["data_in_width"], config["mul"]["data_in_frac_width"]) - .repeat(self.samples * num, in_size) - .tolist() - ) - - bias_in = ( - q2i(b, config["add"]["data_in_width"], config["add"]["data_in_frac_width"]) - .repeat(self.samples * num, in_size) - .tolist() - ) - weight_in.reverse() - bias_in.reverse() - return weight_in, bias_in - - def msa_data_generate( - self, - qatt, - in_x, - unroll_in_x, - qkv_x, - unroll_qkv_x, - num_heads, - out_x, - unroll_out_x, - ): - # generate data - samples = self.samples - in_depth = in_x // unroll_in_x - in_size = unroll_in_x - wqkv_parallelism = unroll_qkv_x - wqkv_num_parallelism = qkv_x // (unroll_qkv_x * num_heads) - wp_parallelism = unroll_out_x - wp_num_parallelism = out_x // unroll_out_x - wp_depth = qkv_x // (unroll_qkv_x * num_heads) - wp_size = num_heads * unroll_qkv_x - dim = in_size * in_depth - config = self.w_config["block"]["msa"] - att = qatt - att_wq = q2i( - att.q.weight, - config["q_proj"]["weight_width"], - config["q_proj"]["weight_frac_width"], - ) - att_wkv = q2i( - att.kv.weight, - config["kv_proj"]["weight_width"], - config["kv_proj"]["weight_frac_width"], - ) - wqkv_tensor = torch.cat((att_wq, att_wkv), 0) - wqkv_tensor = wqkv_tensor.reshape(3, in_x, in_x) - wqkv_tensor = wqkv_tensor.reshape(in_x * 3, in_x).repeat(samples, 1, 1) - - att_bq = q2i( - att.q.bias, - config["q_proj"]["bias_width"], - config["q_proj"]["bias_frac_width"], - ) - att_bkv 
= q2i( - att.kv.bias, - config["kv_proj"]["bias_width"], - config["kv_proj"]["bias_frac_width"], - ) - bqkv_tensor = torch.cat((att_bq, att_bkv), 0) - bqkv_tensor = bqkv_tensor.reshape(3, in_x) - bqkv_tensor = bqkv_tensor.reshape(-1).repeat(samples, 1) - - wp_tensor = q2i( - att.proj.weight, - config["z_proj"]["weight_width"], - config["z_proj"]["weight_frac_width"], - ).repeat(samples, 1, 1) - bp_tensor = q2i( - att.proj.bias, - config["z_proj"]["bias_width"], - config["z_proj"]["bias_frac_width"], - ).repeat(samples, 1) - - logger.debug( - "input data: \n\ - wqkv_tensor = \n{}\n\ - bqkv_tensor = \n{}\n\ - wp_tensor = \n{}\n\ - bp_tensor = \n{}\n\ - ".format( - wqkv_tensor, bqkv_tensor, wp_tensor, bp_tensor - ) - ) - # generate hash table - exp_table = generate_table_hardware( - att.scale, - config["softmax"]["data_in_width"], - config["softmax"]["data_in_frac_width"], - config["softmax"]["exp_width"], - config["softmax"]["exp_frac_width"], - ).tolist() - div_table = generate_table_div_hardware( - config["softmax"]["div_width"], - config["softmax"]["data_out_width"], - config["softmax"]["data_out_frac_width"], - ).tolist() - with open(r"exp_init.mem", "w") as fp: - for item in exp_table: - # write each item on a new lineformat(addr[i] ,f'0{width}b' - fp.write( - "%s\n" % format(item, f'0{config["softmax"]["exp_width"]//4}x') - ) - with open(r"div_init.mem", "w") as fp: - for item in div_table: - # write each item on a new line - fp.write( - "%s\n" % format(item, f'0{config["softmax"]["data_out_width"]//4}x') - ) - # data_pack - wqkv = wqkv_tensor.reshape( - samples, 3, num_heads, wqkv_num_parallelism, wqkv_parallelism, dim - ).permute(1, 0, 3, 2, 4, 5) - wqkv = wqkv.reshape(3, samples, dim, dim) - bqkv = bqkv_tensor.reshape( - samples, 3, num_heads, wqkv_num_parallelism, wqkv_parallelism - ).permute(1, 0, 3, 2, 4) - bqkv = bqkv.reshape(3, samples, dim) - - wp = wp_tensor.reshape( - samples * dim, num_heads, wqkv_num_parallelism, wqkv_parallelism - ) - wp = 
wp.permute(0, 2, 1, 3).reshape(samples, dim, dim) - - wq = wqkv[0] - wk = wqkv[1] - wv = wqkv[2] - - bq = bqkv[0] - bk = bqkv[1] - bv = bqkv[2] - wq_in = self.data_pack( - wq, wqkv_num_parallelism, in_depth, num_heads * wqkv_parallelism, in_size - ) - wk_in = self.data_pack( - wk, wqkv_num_parallelism, in_depth, num_heads * wqkv_parallelism, in_size - ) - wv_in = self.data_pack( - wv, wqkv_num_parallelism, in_depth, num_heads * wqkv_parallelism, in_size - ) - wp_in = self.data_pack( - wp, - wp_num_parallelism, - wqkv_num_parallelism, - wp_parallelism, - num_heads * wqkv_parallelism, - ) - - bq_in = self.data_pack( - bq, 1, wqkv_num_parallelism, 1, num_heads * wqkv_parallelism - ) - bk_in = self.data_pack( - bk, 1, wqkv_num_parallelism, 1, num_heads * wqkv_parallelism - ) - bv_in = self.data_pack( - bv, 1, wqkv_num_parallelism, 1, num_heads * wqkv_parallelism - ) - bp_in = self.data_pack(bp_tensor, 1, wp_num_parallelism, 1, wp_parallelism) - - wq_in.reverse() - wk_in.reverse() - wv_in.reverse() - wp_in.reverse() - bq_in.reverse() - bk_in.reverse() - bv_in.reverse() - bp_in.reverse() - - self.weight_q = RandomSource( - max_stalls=0, - name="weight_q", - samples=samples * in_depth * wqkv_num_parallelism, - num=num_heads * wqkv_parallelism * in_size, - data_specify=wq_in, - debug=debug, - ) - self.weight_k = RandomSource( - max_stalls=0, - name="weight_k", - samples=samples * in_depth * wqkv_num_parallelism, - num=num_heads * wqkv_parallelism * in_size, - data_specify=wk_in, - debug=debug, - ) - self.weight_v = RandomSource( - max_stalls=0, - name="weight_v", - samples=samples * in_depth * wqkv_num_parallelism, - num=num_heads * wqkv_parallelism * in_size, - data_specify=wv_in, - debug=debug, - ) - self.weight_p = RandomSource( - max_stalls=0, - name="weight_p", - samples=samples * wp_depth * wp_num_parallelism, - num=wp_parallelism * wp_size, - data_specify=wp_in, - debug=debug, - ) - self.bias_q = RandomSource( - max_stalls=0, - name="bias_q", - samples=samples * 
wqkv_num_parallelism, - num=num_heads * wqkv_parallelism, - data_specify=bq_in, - debug=debug, - ) - self.bias_k = RandomSource( - max_stalls=0, - name="bias_k", - samples=samples * wqkv_num_parallelism, - num=num_heads * wqkv_parallelism, - data_specify=bk_in, - debug=debug, - ) - self.bias_v = RandomSource( - max_stalls=0, - name="bias_v", - samples=samples * wqkv_num_parallelism, - num=num_heads * wqkv_parallelism, - data_specify=bv_in, - debug=debug, - ) - self.bias_p = RandomSource( - max_stalls=0, - name="bias_p", - samples=samples * wp_num_parallelism, - num=wp_parallelism, - data_specify=bp_in, - debug=debug, - ) - - def mlp_data_generate( - self, - qmlp, - in_features, - hidden_features, - out_features, - unroll_in_features, - unroll_hidden_features, - unroll_out_features, - ): - samples = self.samples - depth_in_features = in_features // unroll_in_features - depth_hidden_features = hidden_features // unroll_hidden_features - depth_out_features = out_features // unroll_out_features - w_config = self.w_config["block"]["mlp"] - mlp = qmlp - weight1_tensor = q2i( - mlp.fc1.weight, - w_config["fc1_proj"]["weight_width"], - w_config["fc1_proj"]["weight_frac_width"], - ) - - bias1_tensor = q2i( - mlp.fc1.bias, - w_config["fc1_proj"]["bias_width"], - w_config["fc1_proj"]["bias_frac_width"], - ) - - weight2_tensor = q2i( - mlp.fc2.weight, - w_config["fc2_proj"]["weight_width"], - w_config["fc2_proj"]["weight_frac_width"], - ) - - bias2_tensor = q2i( - mlp.fc2.bias, - w_config["fc2_proj"]["bias_width"], - w_config["fc2_proj"]["bias_frac_width"], - ) - weight1_in = linear_data_pack( - samples, - weight1_tensor.repeat(samples, 1, 1), - in_y=hidden_features, - in_x=in_features, - unroll_in_y=unroll_hidden_features, - unroll_in_x=unroll_in_features, - ) - bias1_in = linear_data_pack( - samples, - bias1_tensor.repeat(samples, 1, 1), - in_y=hidden_features, - in_x=1, - unroll_in_y=unroll_hidden_features, - unroll_in_x=1, - ) - weight2_in = linear_data_pack( - samples, - 
weight2_tensor.repeat(samples, 1, 1), - in_y=out_features, - in_x=hidden_features, - unroll_in_y=unroll_out_features, - unroll_in_x=unroll_hidden_features, - ) - bias2_in = linear_data_pack( - samples, - bias2_tensor.repeat(samples, 1, 1), - in_y=out_features, - in_x=1, - unroll_in_y=unroll_out_features, - unroll_in_x=1, - ) - weight1_in.reverse() - bias1_in.reverse() - weight2_in.reverse() - bias2_in.reverse() - self.bias1 = RandomSource( - max_stalls=0, - name="bias1", - samples=samples * depth_hidden_features, - num=unroll_hidden_features, - data_specify=bias1_in, - debug=debug, - ) - self.bias2 = RandomSource( - max_stalls=0, - name="bias2", - samples=samples * depth_out_features, - num=unroll_out_features, - data_specify=bias2_in, - debug=debug, - ) - self.weight1 = RandomSource( - max_stalls=0, - name="weight1", - samples=samples * depth_hidden_features * depth_in_features, - num=unroll_hidden_features * unroll_in_features, - data_specify=weight1_in, - debug=debug, - ) - self.weight2 = RandomSource( - max_stalls=0, - name="weight2", - samples=samples * depth_out_features * depth_hidden_features, - num=unroll_out_features * unroll_hidden_features, - data_specify=weight2_in, - debug=debug, - ) - - def sw_compute(self): - data_out = self.pvt(self.x) - output = linear_data_pack( - self.samples, - q2i(data_out, self.ow, self.ow_f), - in_y=1, - in_x=self.num_classes, - unroll_in_y=1, - unroll_in_x=self.head_unroll_out_x, - ) - return output - - def conv_pack( - self, - weight, - bias, - in_channels, - kernel_size, - out_channels, - unroll_in_channels, - unroll_kernel_out, - unroll_out_channels, - ): - samples = self.samples - # requires input as a quantized int format - # weight_pack - # from (oc,ic/u_ic,u_ic,h,w) to (ic/u_ic,h*w,u_ic,oc) - reorder_w_tensor = ( - weight.repeat(samples, 1, 1, 1, 1) - .reshape( - samples, - out_channels, - int(in_channels / unroll_in_channels), - unroll_in_channels, - kernel_size[0] * kernel_size[1], - ) - .permute(0, 2, 4, 3, 1) - ) 
- - # reverse the final 2 dimension - # from(samples, int(kernel_height * kernel_width * in_channels / unroll_kernel_out), unroll_kernel_out, int(out_channels/unroll_out_channels), unroll_out_channels) - # to (samples, int(out_channels/unroll_out_channels), int(kernel_height * kernel_width * in_channels / unroll_kernel_out), unroll_out_channels, unroll_kernel_out) - w_tensor = reorder_w_tensor.reshape( - samples, - int(kernel_size[0] * kernel_size[1] * in_channels / unroll_kernel_out), - unroll_kernel_out, - int(out_channels / unroll_out_channels), - unroll_out_channels, - ).permute(0, 3, 1, 4, 2) - - w_tensor = w_tensor.reshape( - -1, - unroll_out_channels * unroll_kernel_out, - ) - w_in = w_tensor.type(torch.int).flip(0).tolist() - # bias_pack - bias_tensor = bias.repeat(samples, 1).reshape(-1, unroll_out_channels) - b_in = bias_tensor.type(torch.int).flip(0).tolist() - return w_in, b_in - - -@cocotb.test() -async def cocotb_test_fixed_linear(dut): - # TODO: - """Test integer based vector mult""" - samples = 1 - test_case = VerificationCase(samples=samples) - - # Reset cycle - await Timer(20, units="ns") - dut.rst.value = 1 - await Timer(100, units="ns") - dut.rst.value = 0 - - # Create a 10ns-period clock on port clk - clock = Clock(dut.clk, 10, units="ns") - # Start the clock - cocotb.start_soon(clock.start()) - await Timer(500, units="ns") - - # Synchronize with the clock - dut.patch_embed_weight_3_valid.value = 0 - dut.patch_embed_bias_3_valid.value = 0 - dut.cls_token_valid.value = 0 - dut.pos_embed_in_valid.value = 0 - - dut.af_msa_weight_valid.value = 0 - dut.af_msa_bias_valid.value = 0 - dut.weight_q_valid.value = 0 - dut.weight_k_valid.value = 0 - dut.weight_v_valid.value = 0 - dut.weight_p_valid.value = 0 - dut.bias_q_valid.value = 0 - dut.bias_k_valid.value = 0 - dut.bias_v_valid.value = 0 - dut.bias_p_valid.value = 0 - - dut.af_mlp_weight_valid.value = 0 - dut.af_mlp_bias_valid.value = 0 - dut.weight_in2hidden_valid.value = 0 - 
dut.weight_hidden2out_valid.value = 0 - dut.bias_in2hidden_valid.value = 0 - dut.bias_hidden2out_valid.value = 0 - - dut.head_weight_valid.value = 0 - dut.head_bias_valid.value = 0 - - dut.data_in_valid.value = 0 - dut.data_out_ready.value = 1 - # debug_state(dut, "Pre-clk") - await FallingEdge(dut.clk) - # debug_state(dut, "Post-clk") - # debug_state(dut, "Pre-clk") - await FallingEdge(dut.clk) - # debug_state(dut, "Post-clk") - - done = False - cdin = 0 - cpatch_out = 0 - cpose_out = 0 - cmsa_out = 0 - cblock_out = 0 - chead_in = 0 - cafmsa_out = 0 - cmsa_sa_out = 0 - for i in range(samples * 10000000): - await FallingEdge(dut.clk) - # debug_state(dut, "Post-clk") - dut.rst.value = 0 - - dut.patch_embed_bias_3_valid.value = test_case.patch_embed_bias.pre_compute() - dut.patch_embed_weight_3_valid.value = ( - test_case.patch_embed_weight.pre_compute() - ) - dut.cls_token_valid.value = test_case.cls_token.pre_compute() - dut.pos_embed_in_valid.value = test_case.pos_embed.pre_compute() - - dut.af_msa_weight_valid.value = test_case.aff_att_weight.pre_compute() - dut.af_msa_bias_valid.value = test_case.aff_att_bias.pre_compute() - dut.weight_q_valid.value = test_case.weight_q.pre_compute() - dut.weight_k_valid.value = test_case.weight_k.pre_compute() - dut.weight_v_valid.value = test_case.weight_v.pre_compute() - dut.weight_p_valid.value = test_case.weight_p.pre_compute() - dut.bias_q_valid.value = test_case.bias_q.pre_compute() - dut.bias_k_valid.value = test_case.bias_k.pre_compute() - dut.bias_v_valid.value = test_case.bias_v.pre_compute() - dut.bias_p_valid.value = test_case.bias_p.pre_compute() - - dut.af_mlp_weight_valid.value = test_case.aff_mlp_weight.pre_compute() - dut.af_mlp_bias_valid.value = test_case.aff_mlp_bias.pre_compute() - dut.weight_in2hidden_valid.value = test_case.weight1.pre_compute() - dut.bias_in2hidden_valid.value = test_case.bias1.pre_compute() - dut.weight_hidden2out_valid.value = test_case.weight2.pre_compute() - 
dut.bias_hidden2out_valid.value = test_case.bias2.pre_compute() - - dut.head_weight_valid.value = test_case.head_weight.pre_compute() - dut.head_bias_valid.value = test_case.head_bias.pre_compute() - - dut.data_in_valid.value = test_case.inputs.pre_compute() - - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.pre_compute(dut.data_out_valid) - await Timer(1, units="ns") - # start input data - ( - dut.patch_embed_weight_3_valid.value, - dut.patch_embed_weight_3.value, - ) = test_case.patch_embed_weight.compute(dut.patch_embed_weight_3_ready.value) - ( - dut.patch_embed_bias_3_valid.value, - dut.patch_embed_bias_3.value, - ) = test_case.patch_embed_bias.compute(dut.patch_embed_bias_3_ready.value) - dut.cls_token_valid.value, dut.cls_token.value = test_case.cls_token.compute( - dut.cls_token_ready.value - ) - ( - dut.pos_embed_in_valid.value, - dut.pos_embed_in.value, - ) = test_case.pos_embed.compute(dut.pos_embed_in_ready.value) - - ( - dut.af_msa_weight_valid.value, - dut.af_msa_weight.value, - ) = test_case.aff_att_weight.compute(dut.af_msa_weight_ready.value) - ( - dut.af_msa_bias_valid.value, - dut.af_msa_bias.value, - ) = test_case.aff_att_bias.compute(dut.af_msa_bias_ready.value) - dut.weight_q_valid.value, dut.weight_q.value = test_case.weight_q.compute( - dut.weight_q_ready.value - ) - dut.weight_k_valid.value, dut.weight_k.value = test_case.weight_k.compute( - dut.weight_k_ready.value - ) - dut.weight_v_valid.value, dut.weight_v.value = test_case.weight_v.compute( - dut.weight_v_ready.value - ) - dut.weight_p_valid.value, dut.weight_p.value = test_case.weight_p.compute( - dut.weight_p_ready.value - ) - - dut.bias_q_valid.value, dut.bias_q.value = test_case.bias_q.compute( - dut.bias_q_ready.value - ) - dut.bias_k_valid.value, dut.bias_k.value = test_case.bias_k.compute( - dut.bias_k_ready.value - ) - dut.bias_v_valid.value, dut.bias_v.value = test_case.bias_v.compute( - dut.bias_v_ready.value - ) - dut.bias_p_valid.value, 
dut.bias_p.value = test_case.bias_p.compute( - dut.bias_p_ready.value - ) - - ( - dut.af_mlp_weight_valid.value, - dut.af_mlp_weight.value, - ) = test_case.aff_mlp_weight.compute(dut.af_mlp_weight_ready.value) - ( - dut.af_mlp_bias_valid.value, - dut.af_mlp_bias.value, - ) = test_case.aff_mlp_bias.compute(dut.af_mlp_bias_ready.value) - ( - dut.weight_in2hidden_valid.value, - dut.weight_in2hidden.value, - ) = test_case.weight1.compute(dut.weight_in2hidden_ready.value) - ( - dut.weight_hidden2out_valid.value, - dut.weight_hidden2out.value, - ) = test_case.weight2.compute(dut.weight_hidden2out_ready.value) - ( - dut.bias_in2hidden_valid.value, - dut.bias_in2hidden.value, - ) = test_case.bias1.compute(dut.bias_in2hidden_ready.value) - ( - dut.bias_hidden2out_valid.value, - dut.bias_hidden2out.value, - ) = test_case.bias2.compute(dut.bias_hidden2out_ready.value) - - ( - dut.head_weight_valid.value, - dut.head_weight.value, - ) = test_case.head_weight.compute(dut.head_weight_ready.value) - dut.head_bias_valid.value, dut.head_bias.value = test_case.head_bias.compute( - dut.head_bias_ready.value - ) - dut.data_in_valid.value, dut.data_in.value = test_case.inputs.compute( - dut.data_in_ready.value - ) - - await Timer(1, units="ns") - - dut.data_out_ready.value = test_case.outputs.compute( - dut.data_out_valid.value, dut.data_out.value - ) - await Timer(1, units="ns") - # wave_check(dut) - if dut.data_in_valid.value == 1 and dut.data_in_ready.value == 1: - cdin += 1 - if ( - dut.patch_embed_out_3_valid.value == 1 - and dut.patch_embed_out_3_ready.value == 1 - ): - cpatch_out += 1 - if dut.pos_embed_out_valid.value == 1 and dut.pos_embed_out_ready.value == 1: - cpose_out += 1 - if ( - dut.block_inst.af_msa_out_valid.value == 1 - and dut.block_inst.af_msa_out_ready.value == 1 - ): - cafmsa_out += 1 - if ( - dut.block_inst.msa_inst.sa_out_valid.value == 1 - and dut.block_inst.msa_inst.sa_out_ready.value == 1 - ): - cmsa_sa_out += 1 - if ( - dut.block_inst.msa_out_valid.value == 
1 - and dut.block_inst.msa_out_ready.value == 1 - ): - cmsa_out += 1 - if dut.block_out_valid.value == 1 and dut.block_out_ready.value == 1: - cblock_out += 1 - if dut.head_in_valid.value == 1 and dut.head_in_ready.value == 1: - chead_in += 1 - print("cdin = ", cdin) - print("cpatch_out = ", cpatch_out) - print("cpose_out = ", cpose_out) - print( - "{},{},cafmsa_out = {}".format( - dut.block_inst.af_msa_out_valid.value, - dut.block_inst.af_msa_out_ready.value, - cafmsa_out, - ) - ) - print("cmsa_sa_out = ", cmsa_sa_out) - print("cmsa_out = ", cmsa_out) - print("cblock_out = ", cblock_out) - print("chead_in = ", chead_in) - # if i % 1000 == 0: - if ( - test_case.outputs.is_full() - # and test_case.head_bias.is_empty() - # and test_case.head_weight.is_empty() - # and test_case.cls_token.is_empty() - # and test_case.pos_embed.is_empty() - # and test_case.patch_embed_bias.is_empty() - # and test_case.patch_embed_weight.is_empty() - # and test_case.weight1.is_empty() - # and test_case.bias1.is_empty() - # and test_case.weight2.is_empty() - # and test_case.bias2.is_empty() - # and test_case.weight_q.is_empty() - # and test_case.weight_k.is_empty() - # and test_case.weight_v.is_empty() - # and test_case.weight_p.is_empty() - # and test_case.bias_q.is_empty() - # and test_case.bias_k.is_empty() - # and test_case.bias_v.is_empty() - # and test_case.bias_p.is_empty() - # and test_case.inputs.is_empty() - ): - done = True - break - assert ( - done - ), "Deadlock detected or the simulation reaches the maximum cycle limit (fixed it by adjusting the loop trip count)" - - check_results(test_case.outputs.data, test_case.ref) - - -def wave_check(dut): - logger.debug( - "wave_check:\n\ - {},{} pos_embed_out\n\ - {},{} block = {}\n\ - {},{} res_msa = {}\n\ - {},{} mlp_out = {}\n\ - {},{} head_in\n\ - {},{} data_out\n\ - ".format( - dut.pos_embed_out_valid.value, - dut.pos_embed_out_ready.value, - dut.block_out_valid.value, - dut.block_out_ready.value, - [int(i) for i in 
dut.block_out.value], - dut.block_inst.res_msa_valid.value, - dut.block_inst.res_msa_ready.value, - [int(i) for i in dut.block_inst.res_msa.value], - dut.block_inst.mlp_out_valid.value, - dut.block_inst.mlp_out_ready.value, - [int(i) for i in dut.block_inst.mlp_out.value], - dut.head_in_valid.value, - dut.head_in_ready.value, - dut.data_out_valid.value, - dut.data_out_ready.value, - ) - ) - - -def runner(): - sim = os.getenv("SIM", "verilator") - - verilog_sources = [ - "../../../../components/ViT/fixed_pvt.sv", - "../../../../components/ViT/fixed_block.sv", - "../../../../components/ViT/hash_softmax.sv", - "../../../../components/ViT/affine_layernorm.sv", - "../../../../components/ViT/fixed_mlp.sv", - "../../../../components/ViT/fixed_msa.sv", - "../../../../components/ViT/fixed_patch_embed.sv", - "../../../../components/attention/fixed_self_att.sv", - "../../../../components/attention/fixed_att.sv", - "../../../../components/matmul/fixed_matmul.sv", - "../../../../components/cast/fixed_rounding.sv", - "../../../../components/activations/fixed_relu.sv", - "../../../../components/linear/fixed_2d_linear.sv", - "../../../../components/linear/fixed_linear.sv", - "../../../../components/conv/convolution.sv", - "../../../../components/conv/padding.sv", - "../../../../components/conv/roller.sv", - "../../../../components/conv/sliding_window.sv", - "../../../../components/common/wrap_data.sv", - "../../../../components/common/cut_data.sv", - "../../../../components/common/fifo.sv", - "../../../../components/common/unpacked_fifo.sv", - "../../../../components/common/input_buffer.sv", - "../../../../components/common/blk_mem_gen_0.sv", - "../../../../components/common/skid_buffer.sv", - "../../../../components/common/unpacked_skid_buffer.sv", - "../../../../components/common/join2.sv", - "../../../../components/common/split2.sv", - "../../../../components/fixed_arithmetic/fixed_matmul_core.sv", - "../../../../components/fixed_arithmetic/fixed_dot_product.sv", - 
"../../../../components/fixed_arithmetic/fixed_accumulator.sv", - "../../../../components/fixed_arithmetic/fixed_vector_mult.sv", - "../../../../components/fixed_arithmetic/fixed_adder_tree.sv", - "../../../../components/fixed_arithmetic/fixed_adder_tree_layer.sv", - "../../../../components/fixed_arithmetic/fixed_mult.sv", - ] - test_case = VerificationCase() - - # set parameters - extra_args = [] - extra_args.append(f"--unroll-count") - extra_args.append(f"3000") - for k, v in test_case.get_dut_parameters().items(): - extra_args.append(f"-G{k}={v}") - print(extra_args) - runner = get_runner(sim) - runner.build( - verilog_sources=verilog_sources, - hdl_toplevel="fixed_pvt", - build_args=extra_args, - ) - runner.test( - hdl_toplevel="fixed_pvt", - test_module="fixed_pvt_tb", - ) - - -import pytest - - -@pytest.mark.skip(reason="Needs to be fixed.") -def test_fixed_pvt(): - runner() - - -if __name__ == "__main__": - test_fixed_pvt() diff --git a/src/mase_components/vision_models/vit/test/fixed_vit_attention_head_tb.py b/src/mase_components/vision_models/vit/test/fixed_vit_attention_head_tb.py new file mode 100644 index 000000000..b26ef611d --- /dev/null +++ b/src/mase_components/vision_models/vit/test/fixed_vit_attention_head_tb.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 + +import os + +import torch +import logging +from functools import partial + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer, RisingEdge +from pathlib import Path + +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import StreamDriver, StreamMonitor +from mase_cocotb.runner import mase_runner + +# from mase_cocotb import Testbench, StreamDriver, StreamMonitor, mase_runner +from chop.nn.quantized import ViTSelfAttentionHeadInteger +from chop.nn.quantizers import integer_quantizer, integer_floor_quantizer + +from mase_components.helper import generate_memory + +import pytest +import math + + +class 
FixedSelfAttentionHeadTB(Testbench): + def __init__(self, dut) -> None: + super().__init__(dut, dut.clk, dut.rst) + + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + self.log.setLevel(logging.DEBUG) + + # * QKV drivers + self.query_driver = StreamDriver( + dut.clk, dut.query, dut.query_valid, dut.query_ready + ) + self.key_driver = StreamDriver(dut.clk, dut.key, dut.key_valid, dut.key_ready) + self.value_driver = StreamDriver( + dut.clk, dut.value, dut.value_valid, dut.value_ready + ) + + self.out_monitor = StreamMonitor( + dut.clk, + dut.out, + dut.out_valid, + dut.out_ready, + check=True, + ) + + # Model + self.head_size = self.get_parameter("IN_DATA_TENSOR_SIZE_DIM_0") + + self.q_config = { + "query_width": self.get_parameter("IN_DATA_PRECISION_0"), + "query_frac_width": self.get_parameter("IN_DATA_PRECISION_1"), + "key_width": self.get_parameter("IN_DATA_PRECISION_0"), + "key_frac_width": self.get_parameter("IN_DATA_PRECISION_1"), + "value_width": self.get_parameter("IN_DATA_PRECISION_0"), + "value_frac_width": self.get_parameter("IN_DATA_PRECISION_1"), + "qkmm_out_width": self.get_parameter("QKMM_OUT_PRECISION_0"), + "qkmm_out_frac_width": self.get_parameter("QKMM_OUT_PRECISION_1"), + "softmax_exp_width": self.get_parameter("SOFTMAX_EXP_PRECISION_0"), + "softmax_exp_frac_width": self.get_parameter("SOFTMAX_EXP_PRECISION_1"), + "softmax_out_frac_width": self.get_parameter( + "SOFTMAX_OUT_DATA_PRECISION_1" + ), + "svmm_out_width": self.get_parameter("OUT_DATA_PRECISION_0"), + "svmm_out_frac_width": self.get_parameter("OUT_DATA_PRECISION_1"), + } + self.model = ViTSelfAttentionHeadInteger( + dim=self.get_parameter("IN_DATA_TENSOR_SIZE_DIM_0"), + num_heads=1, + q_config=self.q_config, + floor=True, + ) + # assert self.model.mult_data == torch.tensor(MULT_DATA), f"running set mult data {self.model.mult_data} != {MULT_DATA}" + # Set verbosity of driver and monitor loggers to debug + self.query_driver.log.setLevel(logging.DEBUG) + 
self.key_driver.log.setLevel(logging.DEBUG) + self.value_driver.log.setLevel(logging.DEBUG) + self.out_monitor.log.setLevel(logging.DEBUG) + + def generate_inputs(self, seq_len=1): + return { + "query_layer": torch.randn((seq_len, self.head_size)), + "key_layer": torch.randn((seq_len, self.head_size)), + "value_layer": torch.randn((seq_len, self.head_size)), + } + + def preprocess_tensor(self, tensor, config, parallelism, floor=True): + if len(tensor.shape) == 1: + tensor = tensor.unsqueeze(0) + + # Quantize + base_quantizer = integer_floor_quantizer if floor else integer_quantizer + quantizer = partial(base_quantizer, **config) + q_tensor = quantizer(tensor) + self.log.debug(f"Quantized tensor: {q_tensor}") + + # Convert to integer format + q_tensor = (q_tensor * 2 ** config["frac_width"]).int() + self.log.debug(f"Tensor in integer format: {q_tensor}") + + # Split into chunks according to parallelism in each dimension + # parallelism[0]: along rows, parallelism[1]: along columns + dim_0_split = q_tensor.split(parallelism[0], dim=0) + dim_1_split = [x.split(parallelism[1], dim=1) for x in dim_0_split] + blocks = [] + # Flatten the list of blocks + for i in range(len(dim_1_split)): + for j in range(len(dim_1_split[i])): + blocks.append(dim_1_split[i][j].flatten().tolist()) + return blocks + + async def run_test(self): + await self.reset() + self.log.info(f"Reset finished") + self.out_monitor.ready.value = 1 + inputs = self.generate_inputs( + seq_len=self.get_parameter("IN_DATA_TENSOR_SIZE_DIM_1") + ) + exp_out = self.model(**inputs) + # breakpoint() + parallelism = [ + self.get_parameter("IN_DATA_PARALLELISM_DIM_1"), + self.get_parameter("IN_DATA_PARALLELISM_DIM_0"), + ] + + # * Load the query driver + self.log.info(f"Processing query inputs: {inputs['query_layer']}") + query_inputs = self.preprocess_tensor( + tensor=inputs["query_layer"], + config={ + "width": self.q_config["query_width"], + "frac_width": self.q_config["query_frac_width"], + }, + 
parallelism=parallelism, + ) + self.query_driver.load_driver(query_inputs) + + # * Load the key driver + self.log.info(f"Processing key inputs: {inputs['key_layer']}") + key_inputs = self.preprocess_tensor( + tensor=inputs["key_layer"], + config={ + "width": self.q_config["key_width"], + "frac_width": self.q_config["key_frac_width"], + }, + parallelism=parallelism, + ) + self.key_driver.load_driver(key_inputs) + + # * Load the value driver + self.log.info(f"Processing value inputs: {inputs['value_layer']}") + value_inputs = self.preprocess_tensor( + tensor=inputs["value_layer"], + config={ + "width": self.q_config["value_width"], + "frac_width": self.q_config["value_frac_width"], + }, + parallelism=parallelism, + ) + self.value_driver.load_driver(value_inputs) + + # * Load the output monitor + self.log.info(f"Processing outputs: {exp_out}") + outs = self.preprocess_tensor( + tensor=exp_out, + config={ + "width": self.get_parameter("OUT_DATA_PRECISION_0"), + "frac_width": self.get_parameter("OUT_DATA_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("OUT_DATA_PARALLELISM_DIM_1"), + self.get_parameter("OUT_DATA_PARALLELISM_DIM_0"), + ], + ) + self.out_monitor.load_monitor(outs) + + # cocotb.start_soon(check_signal(self.dut, self.log)) + await Timer(1, units="ms") + if not self.out_monitor.exp_queue.empty(): + raise RuntimeError( + "Reached the end of the test, but the output monitor is not empty." 
+ ) + + +@cocotb.test() +async def cocotb_test(dut): + tb = FixedSelfAttentionHeadTB(dut) + await tb.run_test() + + +default_config = { + "IN_DATA_TENSOR_SIZE_DIM_0": 4, + "IN_DATA_TENSOR_SIZE_DIM_1": 12, + "IN_DATA_PARALLELISM_DIM_0": 2, + "IN_DATA_PARALLELISM_DIM_1": 2, + "IN_DATA_PRECISION_0": 8, + "IN_DATA_PRECISION_1": 4, + "QKMM_OUT_PRECISION_0": 8, + "QKMM_OUT_PRECISION_1": 4, + "SOFTMAX_EXP_PRECISION_0": 16, + "SOFTMAX_EXP_PRECISION_1": 4, + "SOFTMAX_OUT_DATA_PRECISION_1": 7, + "OUT_DATA_PRECISION_0": 12, + "OUT_DATA_PRECISION_1": 4, +} +# default_config = { +# "IN_DATA_TENSOR_SIZE_DIM_0": 4, +# "IN_DATA_TENSOR_SIZE_DIM_1": 2, +# "IN_DATA_PARALLELISM_DIM_0": 2, +# "IN_DATA_PARALLELISM_DIM_1": 1, +# "ACTIVATION": 1, +# "IN_DATA_PRECISION_0": 8, +# "IN_DATA_PRECISION_1": 4, +# "QKMM_OUT_PRECISION_0": 8, +# "QKMM_OUT_PRECISION_1": 4, +# "SOFTMAX_EXP_PRECISION_0": 16, +# "SOFTMAX_EXP_PRECISION_1": 4, + +# "SOFTMAX_OUT_DATA_PRECISION_1": 7, + + +# "OUT_DATA_PRECISION_0": 12, +# "OUT_DATA_PRECISION_1": 4, +# } +def get_fixed_self_attention_head_config(kwargs={}): + config = default_config + config.update(kwargs) + return config + + +torch.manual_seed(1) + + +async def check_signal(dut, log): + while True: + await RisingEdge(dut.clk) + handshake_signal_check( + dut.attention_scores_valid, + dut.attention_scores_ready, + dut.attention_scores, + log, + ) + # handshake_signal_check(dut.rolled_k_valid, dut.rolled_k_ready, dut.rolled_k, log) + # handshake_signal_check(dut.bias_valid, + # dut.bias_ready, + # dut.bias, log) + + +def handshake_signal_check(valid, ready, signal, log): + svalue = [i.signed_integer for i in signal.value] + if valid.value & ready.value: + log.debug(f"handshake {signal} = {svalue}") + + +MULT_DATA = 1 / math.sqrt(default_config["IN_DATA_TENSOR_SIZE_DIM_0"]) + + +@pytest.mark.dev +def test_fixed_self_attention_head_smoke(): + """ + Some quick tests to check if the module is working. 
+ """ + + # * Generate exponential LUT for softmax + generate_memory.generate_sv_lut( + "exp", + default_config["QKMM_OUT_PRECISION_0"], + default_config["QKMM_OUT_PRECISION_1"], + default_config["SOFTMAX_EXP_PRECISION_0"], + default_config["SOFTMAX_EXP_PRECISION_1"], + path=Path(__file__).parents[1] / "rtl", + constant_mult=MULT_DATA, + floor=True, + ) + mase_runner( + trace=True, + module_param_list=[ + get_fixed_self_attention_head_config(), + ], + skip_build=False, + ) + + +if __name__ == "__main__": + test_fixed_self_attention_head_smoke() diff --git a/src/mase_components/vision_models/vit/test/fixed_vit_attention_tb.py b/src/mase_components/vision_models/vit/test/fixed_vit_attention_tb.py new file mode 100644 index 000000000..1309e774d --- /dev/null +++ b/src/mase_components/vision_models/vit/test/fixed_vit_attention_tb.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python3 + +import os + +import torch +import logging +from functools import partial + +import cocotb +from cocotb.log import SimLog +from cocotb.triggers import Timer, RisingEdge + +import math + +from mase_components.helper import generate_memory +from mase_cocotb.testbench import Testbench +from mase_cocotb.interfaces.streaming import StreamDriver, StreamMonitor +from mase_cocotb.runner import mase_runner + +# from mase_cocotb import Testbench, StreamDriver, StreamMonitor, mase_runner +from chop.nn.quantized import ( + ViTAttentionInteger, +) +from mase_cocotb.utils import fixed_preprocess_tensor + + +class FixedSelfAttentionTB(Testbench): + def __init__(self, dut) -> None: + super().__init__(dut, dut.clk, dut.rst) + + if not hasattr(self, "log"): + self.log = SimLog("%s" % (type(self).__qualname__)) + self.log.setLevel(logging.DEBUG) + + self.data_in_0_driver = StreamDriver( + dut.clk, dut.data_in_0, dut.data_in_0_valid, dut.data_in_0_ready + ) + + # * Weight drivers + self.query_weight_driver = StreamDriver( + dut.clk, dut.query_weight, dut.query_weight_valid, dut.query_weight_ready + ) + 
self.key_weight_driver = StreamDriver( + dut.clk, dut.key_weight, dut.key_weight_valid, dut.key_weight_ready + ) + self.value_weight_driver = StreamDriver( + dut.clk, dut.value_weight, dut.value_weight_valid, dut.value_weight_ready + ) + self.proj_weight_driver = StreamDriver( + dut.clk, dut.proj_weight, dut.proj_weight_valid, dut.proj_weight_ready + ) + + if self.get_parameter("HAS_BIAS") == 1: + self.query_bias_driver = StreamDriver( + dut.clk, dut.query_bias, dut.query_bias_valid, dut.query_bias_ready + ) + self.key_bias_driver = StreamDriver( + dut.clk, dut.key_bias, dut.key_bias_valid, dut.key_bias_ready + ) + self.value_bias_driver = StreamDriver( + dut.clk, dut.value_bias, dut.value_bias_valid, dut.value_bias_ready + ) + self.proj_bias_driver = StreamDriver( + dut.clk, dut.proj_bias, dut.proj_bias_valid, dut.proj_bias_ready + ) + self.query_bias_driver.log.setLevel(logging.DEBUG) + self.key_bias_driver.log.setLevel(logging.DEBUG) + self.value_bias_driver.log.setLevel(logging.DEBUG) + self.proj_bias_driver.log.setLevel(logging.DEBUG) + + self.data_out_0_monitor = StreamMonitor( + dut.clk, + dut.data_out_0, + dut.data_out_0_valid, + dut.data_out_0_ready, + check=False, + ) + + # Model + self.q_config = { + "data_in_width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "data_in_frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + "qkv_weight_width": self.get_parameter("WEIGHT_PRECISION_0"), + "qkv_weight_frac_width": self.get_parameter("WEIGHT_PRECISION_1"), + "qkv_bias_width": self.get_parameter("BIAS_PRECISION_0"), + "qkv_bias_frac_width": self.get_parameter("BIAS_PRECISION_1"), + "qkv_width": self.get_parameter("QKV_PRECISION_0"), + "qkv_frac_width": self.get_parameter("QKV_PRECISION_1"), + "qkmm_out_width": self.get_parameter("QKMM_OUT_PRECISION_0"), + "qkmm_out_frac_width": self.get_parameter("QKMM_OUT_PRECISION_1"), + "softmax_exp_width": self.get_parameter("SOFTMAX_EXP_PRECISION_0"), + "softmax_exp_frac_width": 
self.get_parameter("SOFTMAX_EXP_PRECISION_1"), + "softmax_out_frac_width": self.get_parameter( + "SOFTMAX_OUT_DATA_PRECISION_1" + ), + "svmm_out_width": self.get_parameter("SVMM_OUT_PRECISION_0"), + "svmm_out_frac_width": self.get_parameter("SVMM_OUT_PRECISION_1"), + "proj_weight_width": self.get_parameter("WEIGHT_PROJ_PRECISION_0"), + "proj_weight_frac_width": self.get_parameter("WEIGHT_PROJ_PRECISION_1"), + "proj_bias_width": self.get_parameter("BIAS_PROJ_PRECISION_0"), + "proj_bias_frac_width": self.get_parameter("BIAS_PROJ_PRECISION_1"), + "data_out_width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "data_out_frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + } + self.model = ViTAttentionInteger( + dim=self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), + num_heads=self.get_parameter("NUM_HEADS"), + qkv_bias=True if self.get_parameter("HAS_BIAS") else False, + q_config=self.q_config, + floor=True, + ) + + # Set verbosity of driver and monitor loggers to debug + self.data_in_0_driver.log.setLevel(logging.DEBUG) + self.query_weight_driver.log.setLevel(logging.INFO) + self.key_weight_driver.log.setLevel(logging.DEBUG) + self.value_weight_driver.log.setLevel(logging.DEBUG) + self.proj_weight_driver.log.setLevel(logging.DEBUG) + self.data_out_0_monitor.log.setLevel(logging.DEBUG) + + def generate_inputs(self, batch_size=1): + return torch.randn( + ( + batch_size, + self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_1"), + self.get_parameter("DATA_IN_0_TENSOR_SIZE_DIM_0"), + ) + ) + + async def run_test(self, batches=1, us=100): + await self.reset() + self.log.info(f"Reset finished") + self.data_out_0_monitor.ready.value = 1 + for _ in range(batches): + inputs = self.generate_inputs() + exp_out = self.model(inputs)[0] + + # * Load the inputs driver + inputs = fixed_preprocess_tensor( + tensor=inputs, + q_config={ + "width": self.get_parameter("DATA_IN_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_IN_0_PRECISION_1"), + }, + parallelism=[ + 
self.get_parameter("DATA_IN_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_IN_0_PARALLELISM_DIM_0"), + ], + floor=True, + ) + # self.log.info(f"Processing inputs: {inputs}") + self.data_in_0_driver.load_driver(inputs) + + # * Load the qkv weight driver + + for projection in ["query", "key", "value"]: + layer = getattr(self.model, f"{projection}") + weights = layer.weight + weights = fixed_preprocess_tensor( + tensor=weights, + q_config={ + "width": self.get_parameter("WEIGHT_PRECISION_0"), + "frac_width": self.get_parameter("WEIGHT_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("WEIGHT_PARALLELISM_DIM_1"), + self.get_parameter("WEIGHT_PARALLELISM_DIM_0"), + ], + floor=True, + ) + # self.log.info(f"Processing {projection} weights: {weights}") + getattr(self, f"{projection}_weight_driver").load_driver(weights) + + # * Load the bias driver + if self.get_parameter("HAS_BIAS") == 1: + bias = getattr(self.model, f"{projection}").bias + bias = fixed_preprocess_tensor( + tensor=bias, + q_config={ + "width": self.get_parameter("BIAS_PRECISION_0"), + "frac_width": self.get_parameter("BIAS_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("BIAS_PARALLELISM_DIM_1"), + self.get_parameter("BIAS_PARALLELISM_DIM_0"), + ], + floor=True, + ) + # self.log.info(f"Processing {projection} bias: {bias}") + getattr(self, f"{projection}_bias_driver").load_driver(bias) + + # * Load the proj weight driver + proj_weight = self.model.proj.weight + proj_bias = self.model.proj.bias + # self.log.info(f"Processing projection weights: {proj_weight}") + proj_weight = fixed_preprocess_tensor( + tensor=proj_weight, + q_config={ + "width": self.get_parameter("WEIGHT_PRECISION_0"), + "frac_width": self.get_parameter("WEIGHT_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("WEIGHT_PROJ_PARALLELISM_DIM_1"), + self.get_parameter("WEIGHT_PROJ_PARALLELISM_DIM_0"), + ], + floor=True, + ) + self.proj_weight_driver.load_driver(proj_weight) + + # * Load the bias driver + if 
self.get_parameter("HAS_BIAS") == 1: + self.log.info(f"Processing projection bias: {proj_bias}") + proj_bias = fixed_preprocess_tensor( + tensor=proj_bias, + q_config={ + "width": self.get_parameter("BIAS_PROJ_PRECISION_0"), + "frac_width": self.get_parameter("BIAS_PROJ_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("BIAS_PROJ_PARALLELISM_DIM_1"), + self.get_parameter("BIAS_PROJ_PARALLELISM_DIM_0"), + ], + floor=True, + ) + self.proj_bias_driver.load_driver(proj_bias) + # * Load the output monitor + self.log.info(f"Processing outputs: {exp_out}") + outs = fixed_preprocess_tensor( + tensor=exp_out, + q_config={ + "width": self.get_parameter("DATA_OUT_0_PRECISION_0"), + "frac_width": self.get_parameter("DATA_OUT_0_PRECISION_1"), + }, + parallelism=[ + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_1"), + self.get_parameter("DATA_OUT_0_PARALLELISM_DIM_0"), + ], + floor=True, + ) + count = [0] + cocotb.scheduler.add(check_signal(count, self.dut, self.log)) + self.data_out_0_monitor.load_monitor(outs) + + await Timer(us, units="us") + assert self.data_out_0_monitor.exp_queue.empty() + + +@cocotb.test() +async def cocotb_test(dut): + tb = FixedSelfAttentionTB(dut) + await tb.run_test(batches=1, us=400) + + +async def check_signal(count, dut, log): + while True: + await RisingEdge(dut.clk) + # handshake_signal_check( + # count, + # dut.split_query_valid[0], + # dut.split_query_ready[0], + # dut.query, + # log, + # ) + # handshake_signal_check( + # count, + # dut.g_attention_head[0].head_i.out_valid, + # dut.g_attention_head[0].head_i.out_ready, + # dut.g_attention_head[0].head_i.out, + # log, + # ) + + +def handshake_signal_check(count, valid, ready, signal, log): + svalue = [i.signed_integer for i in signal.value] + if valid.value[0] & ready.value[0]: + count[0] += 1 + log.debug(f"handshake {signal} count= {count}") + + +default_config = { + "NUM_HEADS": 3, + "HAS_BIAS": 1, + "DATA_IN_0_TENSOR_SIZE_DIM_0": 12, + "DATA_IN_0_PARALLELISM_DIM_0": 2, + 
"DATA_IN_0_TENSOR_SIZE_DIM_1": 12, + "DATA_IN_0_PARALLELISM_DIM_1": 2, + "WEIGHT_TENSOR_SIZE_DIM_0": 12, + "WEIGHT_PARALLELISM_DIM_0": 2, + "WEIGHT_TENSOR_SIZE_DIM_1": 12, + "WEIGHT_PARALLELISM_DIM_1": 2, + "WEIGHT_PROJ_TENSOR_SIZE_DIM_0": 12, + "WEIGHT_PROJ_PARALLELISM_DIM_0": 2, + "WEIGHT_PROJ_TENSOR_SIZE_DIM_1": 12, + "WEIGHT_PROJ_PARALLELISM_DIM_1": 2, + "DATA_IN_0_PRECISION_0": 8, + "DATA_IN_0_PRECISION_1": 3, + "WEIGHT_PRECISION_0": 16, + "WEIGHT_PRECISION_1": 8, + "BIAS_PRECISION_0": 16, + "BIAS_PRECISION_1": 8, + "QKV_PRECISION_0": 8, + "QKV_PRECISION_1": 3, + "QKMM_OUT_PRECISION_0": 8, + "QKMM_OUT_PRECISION_1": 3, + "SOFTMAX_EXP_PRECISION_0": 12, + "SOFTMAX_EXP_PRECISION_1": 4, + "SOFTMAX_OUT_DATA_PRECISION_1": 6, + "SVMM_OUT_PRECISION_0": 10, + "SVMM_OUT_PRECISION_1": 4, + "WEIGHT_PROJ_PRECISION_0": 16, + "WEIGHT_PROJ_PRECISION_1": 8, + "BIAS_PROJ_PRECISION_0": 16, + "BIAS_PROJ_PRECISION_1": 8, + "DATA_OUT_0_PRECISION_0": 10, + "DATA_OUT_0_PRECISION_1": 4, +} +# default_config = { +# "NUM_HEADS": 4, +# "ACTIVATION": 1, +# "HAS_BIAS": 1, +# "WEIGHTS_PRE_TRANSPOSED": 1, +# "DATA_IN_0_TENSOR_SIZE_DIM_0": 128, +# "DATA_IN_0_TENSOR_SIZE_DIM_1": 64, +# "DATA_IN_0_PARALLELISM_DIM_0": 4, +# "DATA_IN_0_PARALLELISM_DIM_1": 2, +# "WEIGHT_TENSOR_SIZE_DIM_0": 128, +# "WEIGHT_TENSOR_SIZE_DIM_1": 128, +# "WEIGHT_PARALLELISM_DIM_0": 2, +# "WEIGHT_PARALLELISM_DIM_1": 4, + +# "WEIGHT_PROJ_TENSOR_SIZE_DIM_0": 128, +# "WEIGHT_PROJ_TENSOR_SIZE_DIM_1": 128, +# "WEIGHT_PROJ_PARALLELISM_DIM_0": 4, +# "WEIGHT_PROJ_PARALLELISM_DIM_1": 2, + +# "DATA_IN_0_PRECISION_0": 8, +# "DATA_IN_0_PRECISION_1": 3, +# "WEIGHT_PRECISION_0": 16, +# "WEIGHT_PRECISION_1": 8, +# "BIAS_PRECISION_0": 16, +# "BIAS_PRECISION_1": 8, +# "QKV_PRECISION_0": 8, +# "QKV_PRECISION_1": 3, +# "QKMM_OUT_PRECISION_0": 8, +# "QKMM_OUT_PRECISION_1": 3, +# "SOFTMAX_EXP_PRECISION_0": 12, +# "SOFTMAX_EXP_PRECISION_1": 4, +# "SOFTMAX_OUT_DATA_PRECISION_1": 6, +# "SVMM_OUT_PRECISION_0": 10, +# "SVMM_OUT_PRECISION_1": 4, 
+# "WEIGHT_PROJ_PRECISION_0": 16, +# "WEIGHT_PROJ_PRECISION_1": 8, +# "BIAS_PROJ_PRECISION_0": 16, +# "BIAS_PROJ_PRECISION_1": 8, +# "DATA_OUT_0_PRECISION_0": 10, +# "DATA_OUT_0_PRECISION_1": 4, +# } +MULT_DATA = 1 / math.sqrt( + default_config["DATA_IN_0_TENSOR_SIZE_DIM_0"] // default_config["NUM_HEADS"] +) + + +def get_config(kwargs={}): + config = default_config + config.update(kwargs) + return config + + +torch.manual_seed(1) + + +def test_fixed_linear_smoke(): + """ + Some quick tests to check if the module is working. + """ + generate_memory.generate_sv_lut( + "exp", + default_config["QKMM_OUT_PRECISION_0"], + default_config["QKMM_OUT_PRECISION_1"], + default_config["SOFTMAX_EXP_PRECISION_0"], + default_config["SOFTMAX_EXP_PRECISION_1"], + constant_mult=MULT_DATA, + floor=True, + ) + mase_runner( + trace=True, + module_param_list=[ + get_config(), + # get_config(), + ], + sim="verilator", + skip_build=False, + # trace=True, + ) + + +torch.manual_seed(0) +if __name__ == "__main__": + test_fixed_linear_smoke() diff --git a/src/mase_components/vision_models/vit/test/hash_exp_tb.py b/src/mase_components/vision_models/vit/test/hash_exp_tb.py deleted file mode 100644 index e8b107503..000000000 --- a/src/mase_components/vision_models/vit/test/hash_exp_tb.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env python3 - -# This script tests the register slice -import random, os, math, logging, sys - -sys.path.append( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -print(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) - -from mase_cocotb.random_test import RandomSource -from mase_cocotb.random_test import RandomSink -from mase_cocotb.random_test import check_results - -import cocotb -from cocotb.triggers import Timer -from cocotb.triggers import FallingEdge -from cocotb.clock import Clock -from cocotb.runner import get_runner - -debug = True -logger = logging.getLogger("tb_signals") -if debug: - 
logger.setLevel(logging.DEBUG) - - -# DUT test specifications -class VerificationCase: - def __init__(self, samples=10): - self.data_width = 8 - self.out_width = 8 - self.inputs = RandomSource( - samples=samples, max_stalls=0, is_data_vector=False, debug=debug - ) - self.outputs = RandomSink(samples=samples, max_stalls=0, debug=debug) - self.samples = samples - self.ref = self.sw_compute() - - def get_dut_parameters(self): - return { - "IN_WIDTH": self.data_width, - "OUT_WIDTH": self.out_width, - } - - def sw_compute(self): - ref = [] - for i in range(self.samples): - ref.append(1) - ref.reverse() - return ref - - -def in_out_wave(dut, name): - logger.debug( - "{} State: (in_valid,in_ready,out_valid,out_ready) = ({},{},{},{})".format( - name, - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - - -@cocotb.test() -async def cocotb_test_register_slice(dut): - """Test register slice""" - samples = 30 - test_case = VerificationCase(samples=samples) - - # Reset cycle - await Timer(20, units="ns") - dut.rst.value = 1 - await Timer(100, units="ns") - dut.rst.value = 0 - - # Create a 10ns-period clock on port clk - clock = Clock(dut.clk, 10, units="ns") - # Start the clock - cocotb.start_soon(clock.start()) - await Timer(500, units="ns") - - # Synchronize with the clock - dut.data_in_valid.value = 0 - dut.data_out_ready.value = 1 - in_out_wave(dut, "Pre-clk") - await FallingEdge(dut.clk) - in_out_wave(dut, "Post-clk") - - in_out_wave(dut, "Pre-clk") - await FallingEdge(dut.clk) - in_out_wave(dut, "Post-clk") - - done = False - while not done: - await FallingEdge(dut.clk) - in_out_wave(dut, "Post-clk") - - ## Pre_compute - dut.data_in_valid.value = test_case.inputs.pre_compute() - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.pre_compute( - dut.data_out_valid.value - ) - await Timer(1, units="ns") - - ## Compute - dut.data_in_valid.value, dut.data_in.value = 
test_case.inputs.compute( - dut.data_in_ready.value - ) - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.compute( - dut.data_out_valid.value, dut.data_out.value - ) - in_out_wave(dut, "Pre-clk") - logger.debug( - "\n\ - {}{}in = {}\n\ - {}{}out= {}\n\ - ".format( - dut.data_in_valid.value, - dut.data_in_ready.value, - [int(dut.data_in.value)], - dut.data_out_valid.value, - dut.data_out_ready.value, - [int(dut.data_out.value)], - ) - ) - breakpoint() - done = test_case.inputs.is_empty() and test_case.outputs.is_full() - - check_results(test_case.outputs.data, test_case.ref) - - -def runner(): - sim = os.getenv("SIM", "verilator") - - verilog_sources = [ - "../../../../components/ViT/hash_softmax.sv", - ] - test_case = VerificationCase() - - # set parameters - extra_args = [] - for k, v in test_case.get_dut_parameters().items(): - extra_args.append(f"-G{k}={v}") - print(extra_args) - runner = get_runner(sim) - runner.build( - verilog_sources=verilog_sources, - hdl_toplevel="hash_exp", - build_args=extra_args, - ) - - runner.test(hdl_toplevel="hash_exp", test_module="hash_exp_tb") - - -import pytest - - -@pytest.mark.skip(reason="Needs to be fixed.") -def test_hash_exp(): - runner() - - -if __name__ == "__main__": - test_hash_exp() diff --git a/src/mase_components/vision_models/vit/test/hash_softmax_tb.py b/src/mase_components/vision_models/vit/test/hash_softmax_tb.py deleted file mode 100644 index 98ceea63f..000000000 --- a/src/mase_components/vision_models/vit/test/hash_softmax_tb.py +++ /dev/null @@ -1,303 +0,0 @@ -#!/usr/bin/env python3 - -# This script tests the register slice -import random, os, math, logging, sys, torch - -sys.path.append("/workspace/components/testbench/ViT") -sys.path.append( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -print(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) - -from mase_cocotb.z_qlayers import quantize_to_int as q2i -from einops 
import rearrange -from mase_cocotb.random_test import RandomSource, RandomSink, check_results - -import cocotb -from cocotb.triggers import Timer -from cocotb.triggers import FallingEdge -from cocotb.clock import Clock -from cocotb.runner import get_runner -from .helpers.ha_softmax import ( - generate_table_div_hardware, - generate_table_hardware, - QHashSoftmax, -) - -debug = False -logger = logging.getLogger("tb_signals") -if debug: - logger.setLevel(logging.DEBUG) - - -# DUT test specifications -class VerificationCase: - def __init__(self, samples=1): - self.w_config = { - "softmax": { - "exp_width": 8, - "exp_frac_width": 4, - "div_width": 10, - "data_in_width": 8, - "data_in_frac_width": 4, - "data_out_width": 8, - "data_out_frac_width": 6, - }, - } - self.d_config = { - "softmax": { - "in_size": 1, - "out_size": 1, - "in_depth": 4, - }, - } - in_size = self.d_config["softmax"]["in_size"] - out_size = self.d_config["softmax"]["out_size"] - in_depth = self.d_config["softmax"]["in_depth"] - self.samples = samples - self.data_generate() - self.inputs = RandomSource( - name="data_in", - samples=samples * in_depth, - max_stalls=0, - num=in_size, - data_specify=self.d_in, - debug=debug, - ) - self.outputs = RandomSink( - samples=samples * in_size * in_depth // out_size, max_stalls=0, debug=debug - ) - self.ref = self.sw_compute() - - def data_generate(self): - B = self.samples - C = self.d_config["softmax"]["in_size"] * self.d_config["softmax"]["in_depth"] - - torch.manual_seed(0) - self.x = 5 * torch.randn(B, C) - x_in = q2i( - self.x, - self.w_config["softmax"]["data_in_width"], - self.w_config["softmax"]["data_in_frac_width"], - ) - exp_table = generate_table_hardware( - 1, - self.w_config["softmax"]["data_in_width"], - self.w_config["softmax"]["data_in_frac_width"], - self.w_config["softmax"]["exp_width"], - self.w_config["softmax"]["exp_frac_width"], - ).tolist() - div_table = generate_table_div_hardware( - self.w_config["softmax"]["div_width"], - 
self.w_config["softmax"]["data_out_width"], - self.w_config["softmax"]["data_out_frac_width"], - ).tolist() - with open(r"exp_init.mem", "w") as fp: - for item in exp_table: - # write each item on a new lineformat(addr[i] ,f'0{width}b' - fp.write( - "%s\n" - % format(item, f'0{self.w_config["softmax"]["exp_width"]//4}x') - ) - with open(r"div_init.mem", "w") as fp: - for item in div_table: - # write each item on a new line - fp.write( - "%s\n" - % format(item, f'0{self.w_config["softmax"]["data_out_width"]//4}x') - ) - self.qhsoftmax = QHashSoftmax(self.w_config["softmax"]) - # data_pack - self.d_in = self.linear_data_pack( - x_in, 1, C, 1, self.d_config["softmax"]["in_size"] - ) - self.d_in.reverse() - - def sw_compute(self): - data_out = self.qhsoftmax(self.x, 1) - output = q2i( - data_out, - self.w_config["softmax"]["data_out_width"], - self.w_config["softmax"]["data_out_frac_width"], - ) - - C = self.d_config["softmax"]["in_size"] * self.d_config["softmax"]["in_depth"] - output = self.linear_data_pack( - output, 1, C, 1, self.d_config["softmax"]["out_size"] - ) - return output - - def linear_data_pack(self, in_temp, in_y, in_x, unroll_in_y, unroll_in_x): - ## just what to make a matrix with [np*p][s*d] to tile [np*d][p*s] - ## assume the in_temp as torch.float - np = int(in_y / unroll_in_y) - d = int(in_x / unroll_in_x) - p = unroll_in_y - s = unroll_in_x - - in_temp = in_temp.to(torch.int).reshape(self.samples, np * p, d * s) - ref = [] - for i in range(self.samples): - re_tensor = rearrange( - in_temp[i], "(np p) (d s) -> np (p d) s", np=np, d=d, p=p, s=s - ) - ex_tensor = torch.zeros(np, d * p, s, dtype=int) - for b in range(np): - for i in range(d): - for j in range(p): - ex_tensor[b][i * p + j] = re_tensor[b][j * d + i] - output_tensor = rearrange( - ex_tensor, "np (d p) s -> (np d) (p s)", np=np, d=d, p=p, s=s - ) - output = output_tensor.tolist() - ref = ref + output - return ref - - def get_dut_parameters(self): - return { - "IN_WIDTH": 
self.w_config["softmax"]["data_in_width"], - "IN_FRAC_WIDTH": self.w_config["softmax"]["data_in_frac_width"], - "EXP_WIDTH": self.w_config["softmax"]["exp_width"], - "EXP_FRAC_WIDTH": self.w_config["softmax"]["exp_frac_width"], - "DIV_WIDTH": self.w_config["softmax"]["div_width"], - "OUT_WIDTH": self.w_config["softmax"]["data_out_width"], - "OUT_FRAC_WIDTH": self.w_config["softmax"]["data_out_frac_width"], - "IN_SIZE": self.d_config["softmax"]["in_size"], - "OUT_SIZE": self.d_config["softmax"]["out_size"], - "IN_DEPTH": self.d_config["softmax"]["in_depth"], - } - - -def in_out_wave(dut, name): - logger.debug( - "{} State: (in_valid,in_ready,out_valid,out_ready) = ({},{},{},{})".format( - name, - dut.data_in_ready.value, - dut.data_in_valid.value, - dut.data_out_ready.value, - dut.data_out_valid.value, - ) - ) - - -@cocotb.test() -async def cocotb_test_register_slice(dut): - """Test register slice""" - samples = 100 - test_case = VerificationCase(samples=samples) - - # Reset cycle - await Timer(20, units="ns") - dut.rst.value = 1 - await Timer(100, units="ns") - dut.rst.value = 0 - - # Create a 10ns-period clock on port clk - clock = Clock(dut.clk, 10, units="ns") - # Start the clock - cocotb.start_soon(clock.start()) - await Timer(500, units="ns") - - # Synchronize with the clock - dut.data_in_valid.value = 0 - dut.data_out_ready.value = 1 - await FallingEdge(dut.clk) - - await FallingEdge(dut.clk) - - done = False - while not done: - await FallingEdge(dut.clk) - - ## Pre_compute - dut.data_in_valid.value = test_case.inputs.pre_compute() - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.pre_compute( - dut.data_out_valid.value - ) - await Timer(1, units="ns") - - ## Compute - dut.data_in_valid.value, dut.data_in.value = test_case.inputs.compute( - dut.data_in_ready.value - ) - await Timer(1, units="ns") - dut.data_out_ready.value = test_case.outputs.compute( - dut.data_out_valid.value, dut.data_out.value - ) - logger.debug( - "\n\ - {}{}in 
= {}\n\ - {}{}sum={}\n\ - {}{}acc={}\n\ - circular_count={}\n\ - {}{}ib_acc={}\n\ - {}{}out= {}\n\ - ".format( - dut.data_in_valid.value, - dut.data_in_ready.value, - [int(i) for i in dut.data_in.value], - dut.sum_valid.value, - dut.sum_ready.value, - int(dut.sum.value), - dut.acc_valid.value, - dut.acc_ready.value, - [int(i) for i in dut.acc_duplicate.value], - int(dut.acc_circular.circular_count.value), - dut.ib_acc_valid.value, - dut.ib_acc_ready.value, - [int(i) for i in dut.ib_acc.value], - dut.data_out_valid.value, - dut.data_out_ready.value, - [int(i) for i in dut.data_out.value], - ) - ) - done = test_case.inputs.is_empty() and test_case.outputs.is_full() - - check_results(test_case.outputs.data, test_case.ref) - - -def runner(): - sim = os.getenv("SIM", "verilator") - - verilog_sources = [ - "../../../../components/ViT/hash_softmax.sv", - "../../../../components/conv/roller.sv", - "../../../../components/fixed_arithmetic/fixed_adder_tree.sv", - "../../../../components/fixed_arithmetic/fixed_adder_tree_layer.sv", - "../../../../components/fixed_arithmetic/fixed_accumulator.sv", - "../../../../components/common/join2.sv", - "../../../../components/common/split2.sv", - "../../../../components/common/fifo.sv", - "../../../../components/common/unpacked_fifo.sv", - "../../../../components/common/skid_buffer.sv", - "../../../../components/common/unpacked_skid_buffer.sv", - "../../../../components/cast/fixed_rounding.sv", - ] - test_case = VerificationCase() - - # set parameters - extra_args = [] - for k, v in test_case.get_dut_parameters().items(): - extra_args.append(f"-G{k}={v}") - print(extra_args) - runner = get_runner(sim) - runner.build( - verilog_sources=verilog_sources, - hdl_toplevel="hash_softmax", - build_args=extra_args, - ) - - runner.test(hdl_toplevel="hash_softmax", test_module="hash_softmax_tb") - - -import pytest - - -@pytest.mark.skip(reason="Needs to be fixed.") -def test_hash_softmax(): - runner() - - -if __name__ == "__main__": - 
test_hash_softmax() diff --git a/src/mase_components/vision_models/vit/test/helpers/ha_softmax.py b/src/mase_components/vision_models/vit/test/helpers/ha_softmax.py deleted file mode 100644 index 62e5b840d..000000000 --- a/src/mase_components/vision_models/vit/test/helpers/ha_softmax.py +++ /dev/null @@ -1,117 +0,0 @@ -import torch - -from torch import Tensor -from chop.nn.quantizers import integer_quantizer as _integer_quantize - - -def quantize_to_int(x: Tensor, width: int, frac_width: int): - x = _integer_quantize(x, width, frac_width) * (2**frac_width) - x = x.int() & (2**width - 1) - return x - - -def twos_complement_to_float(binary_string: str, width: int, frac_width: int): - # Determine the sign - sign_bit = binary_string[0] - - # Extract integer and fractional parts - integer_part = binary_string[1 : 1 + width] - - # Calculate integer magnitude - integer_magnitude = int(integer_part, 2) - - # Apply two's complement conversion for negative numbers - if sign_bit == "1": - integer_magnitude = -(2 ** (width - 1)) + integer_magnitude - - # Calculate scaling factor - scaling_factor = 2**frac_width - - # Calculate floating-point value - float_value = integer_magnitude / scaling_factor - - return float_value - - -def generate_table_software(scale, width, frac_width, out_width, out_frac_width): - addr = torch.tensor(range(0, 2 ** (width)), dtype=int) - table = torch.zeros(2 ** (width)) - for i in range(2 ** (width)): - element = twos_complement_to_float( - format(addr[i], f"0{width}b"), width, frac_width - ) - table[i] = element * scale - table = _integer_quantize(table.exp(), out_width, out_frac_width) - return table - - -def generate_table_hardware(scale, width, frac_width, out_width, out_frac_width): - addr = torch.tensor(range(0, 2 ** (width)), dtype=int) - table = torch.zeros(2 ** (width)) - for i in range(2 ** (width)): - element = twos_complement_to_float( - format(addr[i], f"0{width}b"), width, frac_width - ) - table[i] = element * scale - table = 
quantize_to_int(table.exp(), out_width, out_frac_width) - return table - - -def generate_table_div_hardware(width, out_width, out_frac_width): - addr = torch.tensor(range(0, 2 ** (width - 1)), dtype=int) - table = torch.zeros(2 ** (width - 1)) - for i in range(2 ** (width - 1)): - element = twos_complement_to_float(format(addr[i], f"0{width}b"), width, 0) - table[i] = element - table = quantize_to_int(1 / table, out_width, out_frac_width) - return table - - -def generate_table_div_software(width, out_width, out_frac_width): - addr = torch.tensor(range(0, 2 ** (width - 1)), dtype=int) - table = torch.zeros(2 ** (width - 1)) - for i in range(2 ** (width - 1)): - element = twos_complement_to_float(format(addr[i], f"0{width}b"), width, 0) - table[i] = element - table = _integer_quantize(1 / table, out_width, out_frac_width) - return table - - -class QHashSoftmax(torch.nn.Module): - def __init__( - self, - config, - ): - super(QHashSoftmax, self).__init__() - self.in_width = config["data_in_width"] - self.in_frac_width = config["data_in_frac_width"] - self.exp_width = config["exp_width"] - self.exp_frac_width = config["exp_frac_width"] - self.out_width = config["data_out_width"] - self.out_frac_width = config["data_out_frac_width"] - self.div_width = config["div_width"] - - def forward(self, x, scale): - table_exp = generate_table_software( - scale, - self.in_width, - self.in_frac_width, - self.exp_width, - self.exp_frac_width, - ) - - table_div = generate_table_div_software( - self.div_width + 1, self.out_width, self.out_frac_width - ) - x = quantize_to_int(x, self.in_width, self.in_frac_width) - exp = table_exp[x] - exp_sum = exp.sum(dim=-1, keepdim=True) - # quantize to div_width - one_over_div = _integer_quantize(exp_sum // exp, self.div_width + 1, 0) - one_over_div = torch.where( - exp == 0, torch.tensor(2**self.div_width - 1), one_over_div - ) - one_over_div = torch.tensor(one_over_div, dtype=int) - - div = table_div[one_over_div] - return div diff --git 
a/src/mase_components/vision_models/vit/test/helpers/pvt_quant.py b/src/mase_components/vision_models/vit/test/helpers/pvt_quant.py deleted file mode 100644 index b3eb1ab3e..000000000 --- a/src/mase_components/vision_models/vit/test/helpers/pvt_quant.py +++ /dev/null @@ -1,397 +0,0 @@ -from logging import getLogger - -import torch -import torch.nn as nn -import toml -from timm.models.layers import to_2tuple, trunc_normal_ -import torch.nn.functional as F - -__all__ = ["get_pvt_quant"] - -from chop.models.manual.quant_utils import get_quantized_cls, get_quantized_func -from chop.nn.quantizers import integer_quantizer as _integer_quantize -from .ha_softmax import QHashSoftmax - -logger = getLogger(__name__) - - -class fixed_affine(nn.Module): - def __init__(self, config): - super(fixed_affine, self).__init__() - self.weight = torch.randn(1) - self.bias = torch.randn(1) - self.mult = get_quantized_func("mul", config["mul"]) - self.add = get_quantized_func("add", config["add"]) - self.config = config - - def forward(self, x): - x = self.mult(x, self.weight, config=self.config["mul"]) - x = self.add(x, self.bias, config=self.config["add"]) - return x - - -class QuantizedAttention(nn.Module): - def __init__( - self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, - config=None, - # sr_ratio=1, - ): - super().__init__() - assert ( - dim % num_heads == 0 - ), f"dim {dim} should be divided by num_heads {num_heads}." 
- self.config = config - self.dim = dim - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.q = get_quantized_cls("linear", config["q_proj"])( - dim, dim, bias=qkv_bias, config=config["q_proj"] - ) - self.kv = get_quantized_cls("linear", config["q_proj"])( - dim, dim * 2, bias=qkv_bias, config=config["kv_proj"] - ) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = get_quantized_cls("linear", config["z_proj"])( - dim, dim, bias=True, config=config["z_proj"] - ) - self.proj_drop = nn.Dropout(proj_drop) - - # self.sr_ratio = sr_ratio - # if sr_ratio > 1: - # self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) - # self.norm = nn.LayerNorm(dim) - - # def forward(self, x, H, W): - def forward(self, x): - B, N, C = x.shape - q = ( - self.q(x) - .reshape(B, N, self.num_heads, C // self.num_heads) - .permute(0, 2, 1, 3) - ) - - # if self.sr_ratio > 1: - # x_ = x.permute(0, 2, 1).reshape(B, C, H, W) - # x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) - # x_ = self.norm(x_) - # kv = ( - # self.kv(x_) - # .reshape(B, -1, 2, self.num_heads, C // self.num_heads) - # .permute(2, 0, 3, 1, 4) - # ) - # else: - kv = ( - self.kv(x) - .reshape(B, -1, 2, self.num_heads, C // self.num_heads) - .permute(2, 0, 3, 1, 4) - ) - k, v = kv[0], kv[1] - attn = get_quantized_func("matmul", self.config["attn_matmul"])( - q, k.transpose(-2, -1), self.config["attn_matmul"] - ) - attn = QHashSoftmax(self.config["softmax"])(attn, self.scale) - attn = self.attn_drop(attn) - x = get_quantized_func("matmul", self.config["z_matmul"])( - attn, v, self.config["z_matmul"] - ) - x = x.transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class QuantizedMlp(nn.Module): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - config=None, - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or 
in_features - self.fc1 = get_quantized_cls("linear", config["fc1_proj"])( - in_features, hidden_features, bias=True, config=config["fc1_proj"] - ) - self.act = get_quantized_func("relu", config["mlp_relu"]) - self.fc2 = get_quantized_cls("linear", config["fc2_proj"])( - hidden_features, out_features, bias=True, config=config["fc2_proj"] - ) - self.drop = nn.Dropout(drop) - self.config = config - - def forward(self, x): - x = self.fc1(x) - x = self.act(x, config=self.config["mlp_relu"]) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -# TODO: here -class QuantizedBlock(nn.Module): - def __init__( - self, - dim, - num_heads, - config, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - # sr_ratio=1, - ): - super().__init__() - self.norm1 = fixed_affine(config["affine_att"]) - self.attn = QuantizedAttention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - # sr_ratio=sr_ratio, - config=config["msa"], - ) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - # self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = fixed_affine(config["affine_mlp"]) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = QuantizedMlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - drop=drop, - config=config["mlp"], - ) - self.add1 = get_quantized_func("add", config["add1"]) - self.add2 = get_quantized_func("add", config["add2"]) - - self.config = config - - def forward(self, x): - x = self.add1(x, self.attn(self.norm1(x)), self.config["add1"]) - x = self.add2(x, self.mlp(self.norm2(x)), self.config["add2"]) - return x - - -class QuantizedPatchEmbed(nn.Module): - """Image to Patch Embedding""" - - def __init__( - self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, config=None - ): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - - 
self.img_size = img_size - self.patch_size = patch_size - # assert img_size[0] % patch_size[0] == 0 and img_size[1] % patch_size[1] == 0, \ - # f"img_size {img_size} should be divided by patch_size {patch_size}." - self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1] - self.num_patches = self.H * self.W - self.proj = get_quantized_cls("conv2d", config["patch_proj"])( - in_chans, - embed_dim, - kernel_size=patch_size, - stride=patch_size, - config=config["patch_proj"], - ) - - # TODO: layer NORM - # self.norm = nn.LayerNorm(embed_dim) - - def forward(self, x): - B, C, H, W = x.shape - - x = self.proj(x).flatten(2).transpose(1, 2) - # x = self.norm(x) - H, W = H // self.patch_size[0], W // self.patch_size[1] - - return x, (H, W) - - -class QuantizedPyramidVisionTransformer(nn.Module): - def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - num_classes=1000, - embed_dims=[64, 128, 256, 512], - num_heads=[1, 2, 4, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=False, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - # norm_layer=nn.LayerNorm, - depths=[3, 4, 6, 3], - num_stages=4, - # pretrained_cfg=None, - config=None, - ): - super().__init__() - self.config = config - self.num_classes = num_classes - self.num_stages = num_stages - for i in range(num_stages): - patch_embed = QuantizedPatchEmbed( - img_size=img_size if i == 0 else img_size // (2 ** (i + 1)), - patch_size=patch_size if i == 0 else 2, - in_chans=in_chans if i == 0 else embed_dims[i - 1], - embed_dim=embed_dims[i], - config=config["patch_embed"], - ) - num_patches = ( - patch_embed.num_patches - if i != num_stages - 1 - else patch_embed.num_patches + 1 - ) - pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dims[i])) - pos_drop = nn.Dropout(p=drop_rate) - - block = nn.ModuleList( - [ - QuantizedBlock( - dim=embed_dims[i], - num_heads=num_heads[i], - mlp_ratio=mlp_ratios[i], - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - 
attn_drop=attn_drop_rate, - config=config["block"], - # sr_ratio=sr_ratios[i], - ) - for j in range(depths[i]) - ] - ) - setattr(self, f"patch_embed{i + 1}", patch_embed) - setattr(self, f"pos_embed{i + 1}", pos_embed) - setattr(self, f"pos_drop{i + 1}", pos_drop) - setattr(self, f"block{i + 1}", block) - # self.norm = fixed_affine(config["pvt_norm"]) - - # cls_token - self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims[3])) - # classification head - self.head = get_quantized_cls("linear", config["head_proj"])( - in_features=embed_dims[3], - out_features=num_classes, - bias=True, - config=config["head_proj"], - ) - - # init weights - for i in range(num_stages): - pos_embed = getattr(self, f"pos_embed{i + 1}") - trunc_normal_(pos_embed, std=0.02) - trunc_normal_(self.cls_token, std=0.02) - - def _get_pos_embed(self, pos_embed, patch_embed, H, W): - if H * W == self.patch_embed1.num_patches: - return pos_embed - else: - return ( - F.interpolate( - pos_embed.reshape(1, patch_embed.H, patch_embed.W, -1).permute( - 0, 3, 1, 2 - ), - size=(H, W), - mode="bilinear", - ) - .reshape(1, -1, H * W) - .permute(0, 2, 1) - ) - - def forward_features(self, x): - B = x.shape[0] - - for i in range(self.num_stages): - patch_embed = getattr(self, f"patch_embed{i + 1}") - pos_embed = getattr(self, f"pos_embed{i + 1}") - pos_drop = getattr(self, f"pos_drop{i + 1}") - block = getattr(self, f"block{i + 1}") - x, (H, W) = patch_embed(x) - if i == self.num_stages - 1: - cls_tokens = self.cls_token.expand(B, -1, -1) - cls_tokens = _integer_quantize( - cls_tokens, - self.config["pos_add"]["data_in_width"], - self.config["pos_add"]["data_in_frac_width"], - ) - x = torch.cat((cls_tokens, x), dim=1) - pos_embed_ = self._get_pos_embed(pos_embed[:, 1:], patch_embed, H, W) - pos_embed = torch.cat((pos_embed[:, 0:1], pos_embed_), dim=1) - else: - pos_embed = self._get_pos_embed(pos_embed, patch_embed, H, W) - pos_add = get_quantized_func("add", self.config["pos_add"])( - x, pos_embed, 
self.config["pos_add"] - ) - x = pos_drop(pos_add) - for blk in block: - x = blk(x) - if i != self.num_stages - 1: - x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2) - - # x = self.norm(x) - - return x[:, 0] - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -def get_pvt_quant(info, config, pretrained=False): - num_classes = info["num_classes"] - img_size = info["image_size"][2] - in_chans = info["image_size"][0] - config = toml.load(config) - model = QuantizedPyramidVisionTransformer( - num_classes=num_classes, - img_size=img_size, - in_chans=in_chans, - config=config, - patch_size=4, - embed_dims=[64, 128, 320, 512], - # num_heads=[1, 2, 5, 8], - # mlp_ratios=[8, 8, 4, 4], - # qkv_bias=True, - # norm_layer=partial(nn.LayerNorm, eps=1e-6), - # depths=[2, 2, 2, 2], - # sr_ratios=[8, 4, 2, 1], - ) - # TODO: pretrained - # if pretrained: - # checkpoint = torch.hub.load_state_dict_from_url( - # url="https://github.com/whai362/PVT/releases/download/v2/pvt_tiny.pth", - # map_location="cpu", - # check_hash=True, - # ) - # if num_classes != 1000: - # _ = checkpoint.pop("head.weight") - # _ = checkpoint.pop("head.bias") - # logger.warning( - # f"num_classes (={num_classes}) != 1000. 
The last classifier layer (head) is randomly initialized" - # ) - # model.load_state_dict(checkpoint, strict=False) - # logger.info("Pretrained weights loaded into pvt_tiny") - # else: - # logger.info("pvt_tiny randomly initialized") - - return model diff --git a/src/mase_components/vision_models/vit/test/helpers/qlayer.py b/src/mase_components/vision_models/vit/test/helpers/qlayer.py deleted file mode 100644 index 7ccdffd9f..000000000 --- a/src/mase_components/vision_models/vit/test/helpers/qlayer.py +++ /dev/null @@ -1,400 +0,0 @@ -import torch -import torch.nn as nn -import math -from math import log2 -import sys, os - -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - -from z_qlayers.qlinear import QuantizedLinear -from z_qlayers.qconv import QuantizedConvolution -from z_qlayers.qatt import QPartAttention -from z_qlayers.qmm import QuantizedMatmulBias -from z_qlayers.tensor_cast import tensor_cast - - -class QOverlapPatchEmbed(nn.Module): - """Image to Patch Embedding""" - - def __init__( - self, - weights, - bias, - img_size=224, - patch_size=7, - stride=4, - in_chans=3, - embed_dim=768, - data_width=32, - data_frac_width=8, - weight_width=16, - weight_frac_width=8, - bias_width=32, - bias_frac_width=8, - out_width=32, - out_frac_width=1, - ): - super().__init__() - - img_size = (img_size, img_size) - patch_size = (patch_size, patch_size) - - assert max(patch_size) >= stride, "Set larger patch_size than stride" - - self.img_size = img_size - self.patch_size = patch_size - self.H, self.W = img_size[0] // stride, img_size[1] // stride - self.num_patches = self.H * self.W - self.proj = QuantizedConvolution( - in_chans, - embed_dim, - patch_size, - weights, - bias, - stride=stride, - padding=(patch_size[0] // 2, patch_size[1] // 2), - data_width=data_width, - data_frac_width=data_frac_width, - weight_width=weight_width, - weight_frac_width=weight_frac_width, - bias_width=bias_width, - bias_frac_width=bias_frac_width, - out_width=out_width, - 
out_frac_width=out_frac_width, - ) - - def forward(self, x): - x = self.proj(x) - _, _, H, W = x.shape - x = x.flatten(2).transpose(1, 2) - - return x, H, W - - -class QuantizedMSA(nn.Module): - def __init__( - self, - dim, - num_heads, - wqkv, - wp, - bqkv, - bp, - data_width=32, - data_frac_width=1, - weight_q_width=16, - weight_q_frac_width=1, - weight_k_width=16, - weight_k_frac_width=1, - weight_v_width=16, - weight_v_frac_width=1, - weight_p_width=16, - weight_p_frac_width=1, - bias_q_width=16, - bias_q_frac_width=1, - bias_k_width=16, - bias_k_frac_width=1, - bias_v_width=16, - bias_v_frac_width=1, - bias_p_width=16, - bias_p_frac_width=1, - data_q_width=16, - data_q_frac_width=1, - data_k_width=16, - data_k_frac_width=1, - data_v_width=16, - data_v_frac_width=1, - data_s_width=16, - data_s_frac_width=1, - data_z_width=16, - data_z_frac_width=1, - out_width=32, - out_frac_width=1, - ): - super().__init__() - assert ( - dim % num_heads == 0 - ), f"dim {dim} should be divided by num_heads {num_heads}." 
- self.dim = dim - self.num_heads = num_heads - dim_out = int(dim / num_heads) - wqkv = wqkv.reshape(num_heads, int(dim * 3 / num_heads), dim) - bqkv = bqkv.reshape(num_heads, int(dim * 3 / num_heads)) - self.att_list = [] - for i in range(num_heads): - self.qatt = QPartAttention( - dim, - dim_out, - wqkv[i], - bqkv[i], - data_width, - data_frac_width, - weight_q_width, - weight_q_frac_width, - weight_k_width, - weight_k_frac_width, - weight_v_width, - weight_v_frac_width, - bias_q_width, - bias_q_frac_width, - bias_k_width, - bias_k_frac_width, - bias_v_width, - bias_v_frac_width, - data_q_width, - data_q_frac_width, - data_k_width, - data_k_frac_width, - data_v_width, - data_v_frac_width, - data_s_width, - data_s_frac_width, - data_z_width, - data_z_frac_width, - ) - self.att_list.append(self.qatt) - - self.projection = QuantizedLinear( - dim, - dim, - wp, - data_width=data_z_width, - data_frac_width=data_z_frac_width, - weight_width=weight_p_width, - weight_frac_width=weight_p_frac_width, - bias_in=bp, - bias_width=bias_p_width, - bias_frac_width=bias_p_frac_width, - out_width=out_width, - out_frac_width=out_frac_width, - ) - - def forward(self, q_in): - result = self.att_list[0](q_in) - for i in range(1, self.num_heads): - other = self.att_list[i](q_in) - result = torch.cat((result, other), 2) - print("result = ", result) - out = self.projection(result) - return out - - -class QuantizedMlp(nn.Module): - """MLP as used in Vision Transformer, MLP-Mixer and related networks""" - - def __init__( - self, - weights1, - weights2, - in_features, - hidden_features, - bias=True, - bias_in1=[], - bias_in2=[], - data_width=32, - data_frac_width=8, - weight_i2h_width=16, - weight_i2h_frac_width=8, - weight_h2o_width=16, - weight_h2o_frac_width=8, - bias_i2h_width=32, - bias_i2h_frac_width=8, - bias_h2o_width=32, - bias_h2o_frac_width=8, - hidden_width=32, - hidden_frac_width=2, - out_width=32, - out_frac_width=1, - ): - super().__init__() - self.fc1 = QuantizedLinear( - 
in_features, - hidden_features, - weights=weights1, - bias=bias, - bias_in=bias_in1, - data_width=data_width, - data_frac_width=data_frac_width, - weight_width=weight_i2h_width, - weight_frac_width=weight_i2h_frac_width, - bias_width=bias_i2h_width, - bias_frac_width=bias_i2h_frac_width, - out_width=hidden_width, - out_frac_width=hidden_frac_width, - ) - # dont need it in the integer mode - # self.act = act_layer() - self.fc2 = QuantizedLinear( - hidden_features, - in_features, - weights=weights2, - bias=bias, - bias_in=bias_in2, - data_width=hidden_width, - data_frac_width=hidden_frac_width, - weight_width=weight_h2o_width, - weight_frac_width=weight_h2o_frac_width, - bias_width=bias_h2o_width, - bias_frac_width=bias_h2o_frac_width, - out_width=out_width, - out_frac_width=out_frac_width, - ) - - def forward(self, x): - x = self.fc1(x) - # x = self.act(x) - x = self.fc2(x) - return x - - -class QuantizedBlock(nn.Module): - def __init__( - self, - dim, - num_heads, - wqkv, - wp, - bqkv, - bp, - weights1, - weights2, - in_features, - hidden_features, - bias=True, - bias_in1=[], - bias_in2=[], - in_width=32, - in_frac_width=1, - weight_q_width=16, - weight_q_frac_width=1, - weight_k_width=16, - weight_k_frac_width=1, - weight_v_width=16, - weight_v_frac_width=1, - weight_p_width=16, - weight_p_frac_width=1, - bias_q_width=16, - bias_q_frac_width=1, - bias_k_width=16, - bias_k_frac_width=1, - bias_v_width=16, - bias_v_frac_width=1, - bias_p_width=16, - bias_p_frac_width=1, - data_q_width=16, - data_q_frac_width=1, - data_k_width=16, - data_k_frac_width=1, - data_v_width=16, - data_v_frac_width=1, - data_s_width=16, - data_s_frac_width=1, - data_z_width=16, - data_z_frac_width=1, - msa_out_width=16, - msa_out_frac_width=1, - weight_i2h_width=16, - weight_i2h_frac_width=8, - weight_h2o_width=16, - weight_h2o_frac_width=8, - bias_i2h_width=32, - bias_i2h_frac_width=8, - bias_h2o_width=32, - bias_h2o_frac_width=8, - hidden_width=32, - hidden_frac_width=2, - out_width=32, - 
out_frac_width=1, - ): - super().__init__() - self.qmsa = QuantizedMSA( - dim, - num_heads, - wqkv, - wp, - bqkv, - bp, - in_width, - in_frac_width, - weight_q_width, - weight_q_frac_width, - weight_k_width, - weight_k_frac_width, - weight_v_width, - weight_v_frac_width, - weight_p_width, - weight_p_frac_width, - bias_q_width, - bias_q_frac_width, - bias_k_width, - bias_k_frac_width, - bias_v_width, - bias_v_frac_width, - bias_p_width, - bias_p_frac_width, - data_q_width, - data_q_frac_width, - data_k_width, - data_k_frac_width, - data_v_width, - data_v_frac_width, - data_s_width, - data_s_frac_width, - data_z_width, - data_z_frac_width, - msa_out_width, - msa_out_frac_width, - ) - # dont need it in the integer mode - # self.act = act_layer() - self.qmlp = QuantizedMlp( - weights1, - weights2, - in_features, - hidden_features, - bias, - bias_in1, - bias_in2, - msa_out_width + 1, - msa_out_frac_width, - weight_i2h_width, - weight_i2h_frac_width, - weight_h2o_width, - weight_h2o_frac_width, - bias_i2h_width, - bias_i2h_frac_width, - bias_h2o_width, - bias_h2o_frac_width, - hidden_width, - hidden_frac_width, - out_width - 1, - out_frac_width, - ) - self.in_width = in_width - self.in_frac_width = in_frac_width - self.msa_out_width = msa_out_width - self.msa_out_frac_width = msa_out_frac_width - self.out_width = out_width - self.out_frac_width = out_frac_width - - def forward(self, x): - qmsa_x = self.qmsa(x) - res_msa_x = tensor_cast( - tensor_in=x, - in_width=self.in_width, - in_frac_width=self.in_frac_width, - out_width=self.msa_out_width, - out_frac_width=self.msa_out_frac_width, - ) - x = qmsa_x + res_msa_x - - qmlp_x = self.qmlp(x) - res_mlp_x = tensor_cast( - tensor_in=x, - in_width=self.msa_out_width, - in_frac_width=self.msa_out_frac_width, - out_width=self.out_width - 1, - out_frac_width=self.out_frac_width, - ) - x = qmlp_x + res_mlp_x - return x diff --git a/src/mase_components/vision_models/vit/test/test_lint_ViT.py 
b/src/mase_components/vision_models/vit/test/test_lint_ViT.py deleted file mode 100644 index fd2d555db..000000000 --- a/src/mase_components/vision_models/vit/test/test_lint_ViT.py +++ /dev/null @@ -1,12 +0,0 @@ -from mase_components.linter import run_lint - -import pytest - - -@pytest.mark.skip(reason="Needs to be fixed.") -def test_lint_ViT(): - run_lint("ViT") - - -if __name__ == "__main__": - test_lint_ViT() diff --git a/src/mase_components/vision_models/vit/test/test_lint_attention.py b/src/mase_components/vision_models/vit/test/test_lint_attention.py new file mode 100644 index 000000000..04c1788cc --- /dev/null +++ b/src/mase_components/vision_models/vit/test/test_lint_attention.py @@ -0,0 +1,12 @@ +from mase_components.linter import run_lint + +import pytest + + +@pytest.mark.dev +def test_lint_attention(): + run_lint("transformer_layers") + + +if __name__ == "__main__": + test_lint_attention() diff --git a/src/mase_components/vision_models/vit/test/test_synth_ViT.py b/src/mase_components/vision_models/vit/test/test_synth_ViT.py deleted file mode 100644 index 195bd379a..000000000 --- a/src/mase_components/vision_models/vit/test/test_synth_ViT.py +++ /dev/null @@ -1,11 +0,0 @@ -import pytest -from mase_components.synth_runner import run_synth - - -@pytest.mark.vivado -def test_synth_ViT(): - run_synth("ViT") - - -if __name__ == "__main__": - test_synth_ViT() diff --git a/src/mase_components/vision_models/vit/test/test_synth_attention.py b/src/mase_components/vision_models/vit/test/test_synth_attention.py new file mode 100644 index 000000000..105bb2c2e --- /dev/null +++ b/src/mase_components/vision_models/vit/test/test_synth_attention.py @@ -0,0 +1,11 @@ +import pytest +from mase_components.synth_runner import run_synth + + +@pytest.mark.vivado +def test_synth_attention(): + run_synth("vision_models/vit", "fixed_self_attention.sv") + + +if __name__ == "__main__": + test_synth_attention() diff --git a/src/mase_components/vivado/constraints.xdc 
b/src/mase_components/vivado/constraints.xdc index 60faaa465..f3fd76f71 100644 --- a/src/mase_components/vivado/constraints.xdc +++ b/src/mase_components/vivado/constraints.xdc @@ -1 +1 @@ -create_clock -period 20.000 -name clk -waveform {0.000 10.000} [get_ports clk] \ No newline at end of file +create_clock -period 10.000 -name clk -waveform {0.000 5.000} [get_ports clk] \ No newline at end of file diff --git a/test/passes/graph/transforms/verilog/gen_hardware_bram.drawio b/test/passes/graph/transforms/verilog/gen_hardware_bram.drawio new file mode 100644 index 000000000..693ef7a1a --- /dev/null +++ b/test/passes/graph/transforms/verilog/gen_hardware_bram.drawio @@ -0,0 +1,260 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/test/passes/graph/transforms/verilog/graph_automation.drawio b/test/passes/graph/transforms/verilog/graph_automation.drawio new file mode 100644 index 000000000..d9f2b076a --- /dev/null +++ b/test/passes/graph/transforms/verilog/graph_automation.drawio @@ -0,0 +1,333 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/test/passes/graph/transforms/verilog/test_emit_activation_gelu.py b/test/passes/graph/transforms/verilog/test_emit_activation_gelu.py deleted file mode 100644 index d78ac8525..000000000 --- a/test/passes/graph/transforms/verilog/test_emit_activation_gelu.py +++ /dev/null @@ -1,111 +0,0 @@ -import os, sys - -from chop.ir.graph.mase_graph import MaseGraph - -from chop.passes.graph.analysis import ( - init_metadata_analysis_pass, - add_common_metadata_analysis_pass, - add_hardware_metadata_analysis_pass, - report_node_type_analysis_pass, -) - -from chop.passes.graph.transforms import ( - emit_verilog_top_transform_pass, - emit_internal_rtl_transform_pass, - emit_bram_transform_pass, - emit_cocotb_transform_pass, - quantize_transform_pass, -) - -from chop.tools.logger import set_logging_verbosity - -set_logging_verbosity("debug") - -import toml -import torch -import torch.nn as nn - -# TO DO: remove -import os - -os.environ["PATH"] = "/opt/homebrew/bin:" + os.environ["PATH"] - -import subprocess - -# Example command to invoke Verilator -verilator_cmd = ["verilator", "--version"] - -# Execute the command and capture output -try: - output = subprocess.check_output(verilator_cmd, stderr=subprocess.STDOUT, text=True) - print("Verilator output:", output) -except subprocess.CalledProcessError as e: - print("Error running Verilator command:", e) - -import pytest - - -class MLP(torch.nn.Module): - """ - Toy FC model for digit recognition on MNIST - """ - - def __init__(self) -> None: - super().__init__() - self.fc1 = nn.Linear(4, 10) - - def forward(self, x): - x = torch.flatten(x, start_dim=1, end_dim=-1) - x = torch.nn.functional.gelu(self.fc1(x)) - return x - - -@pytest.mark.skip(reason="Not working") -def test_emit_activation_gelu(): - mlp = 
MLP() - mg = MaseGraph(model=mlp) - - # Provide a dummy input for the graph so it can use for tracing - batch_size = 1 - x = torch.randn((batch_size, 2, 2)) - dummy_in = {"x": x} - - mg, _ = init_metadata_analysis_pass(mg, None) - mg, _ = add_common_metadata_analysis_pass( - mg, {"dummy_in": dummy_in, "add_value": False} - ) - - config_file = os.path.join( - os.path.abspath(""), - "configs", - "tests", - "quantize", - "fixed.toml", - ) - with open(config_file, "r") as f: - quan_args = toml.load(f)["passes"]["quantize"] - mg, _ = quantize_transform_pass(mg, quan_args) - - _ = report_node_type_analysis_pass(mg) - - # Update the metadata - for node in mg.fx_graph.nodes: - for arg, arg_info in node.meta["mase"]["common"]["args"].items(): - if isinstance(arg_info, dict): - arg_info["type"] = "fixed" - arg_info["precision"] = [8, 3] - for result, result_info in node.meta["mase"]["common"]["results"].items(): - if isinstance(result_info, dict): - result_info["type"] = "fixed" - result_info["precision"] = [8, 3] - - mg, _ = add_hardware_metadata_analysis_pass(mg, None) - - mg, _ = emit_verilog_top_transform_pass(mg) - mg, _ = emit_internal_rtl_transform_pass(mg) - - mg, _ = emit_bram_transform_pass(mg) - - -if __name__ == "__main__": - test_emit_activation_gelu() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_folded_block.py b/test/passes/graph/transforms/verilog/test_emit_verilog_folded_block.py new file mode 100644 index 000000000..157d3a263 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_folded_block.py @@ -0,0 +1,371 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger +from 
test_emit_verilog_layernorm import ( + update_common_metadata_pass, + update_hardware_precision_param, +) + +set_logging_verbosity("debug") + + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) +from chop.models.vision.vit.vit import Attention + +from chop.nn.quantized.modules.attention import ViTAttentionInteger +from mase_components import get_module_dependencies + +VIT_CUSTOM_OPS = {"modules": {ViTAttentionInteger: {}}} + + +class MLP(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, in_features, hidden_features) -> None: + super().__init__() + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + self.act = nn.GELU() + self.fc2 = nn.Linear(hidden_features, in_features, bias=True) + + def forward(self, x): + x = self.fc2(self.act(self.fc1(x))) + return x + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_drop: float = 0.0, + attn_drop: float = 0.0, + drop_path: float = 0.0, + act_layer: nn.Module = nn.GELU, + norm_layer: nn.Module = nn.LayerNorm, + mlp_layer: nn.Module = MLP, + ) -> None: + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=proj_drop, + ) + + self.norm2 = norm_layer(dim) + self.mlp = mlp_layer( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.norm1(x)) + x = x + self.mlp(self.norm2(x)) + # 
x = self.attn(x) + return x + + +class ViTAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_drop: float = 0.0, + attn_drop: float = 0.0, + depth: int = 12, + drop_path: float = 0.0, + act_layer: nn.Module = nn.GELU, + norm_layer: nn.Module = nn.LayerNorm, + mlp_layer: nn.Module = MLP, + ) -> None: + super().__init__() + + self.blocks = nn.Sequential( + *[ + Block( + dim=dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_drop=proj_drop, + attn_drop=attn_drop, + drop_path=drop_path, + norm_layer=norm_layer, + act_layer=act_layer, + ) + for i in range(depth) + ] + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.blocks(x) + return x + + +from chop.passes.graph.utils import deepsetattr + + +def vit_module_level_quantize(model, model_config, q_config): + for module in model.named_modules(): + if isinstance(module[1], Attention): + ori_module = module[1] + new_module = ViTAttentionInteger( + model_config["dim"], + model_config["num_heads"], + qkv_bias=model_config["query_has_bias"], + q_config=q_config, + ) + logger.info(f"Replacing module: {module[0]}") + dim = ori_module.head_dim * ori_module.num_heads + + qkv_weight = ori_module.qkv.weight.reshape(3, dim, dim) + new_module.query.weight = nn.Parameter(qkv_weight[0]) + new_module.key.weight = nn.Parameter(qkv_weight[1]) + new_module.value.weight = nn.Parameter(qkv_weight[2]) + + has_bias = False if ori_module.qkv.bias == None else True + if has_bias: + qkv_bias = ori_module.qkv.bias.reshape(3, 1, dim) + new_module.query.bias = nn.Parameter(qkv_bias[0]) + new_module.key.bias = nn.Parameter(qkv_bias[1]) + new_module.value.bias = nn.Parameter(qkv_bias[2]) + + new_module.proj.weight = ori_module.proj.weight + new_module.proj.bias = ori_module.proj.bias + deepsetattr(model, module[0], new_module) + return model + + +attention_quant_config = { + "name": "integer_floor", + "data_in_width": 8, + 
"data_in_frac_width": 4, + "qkv_weight_width": 6, + "qkv_weight_frac_width": 4, + "qkv_bias_width": 6, + "qkv_bias_frac_width": 4, + "qkv_width": 8, + "qkv_frac_width": 4, + "qkmm_out_width": 8, + "qkmm_out_frac_width": 5, + "softmax_exp_width": 8, + "softmax_exp_frac_width": 3, + "softmax_out_frac_width": 7, + "svmm_out_width": 8, + "svmm_out_frac_width": 4, + "proj_weight_width": 6, + "proj_weight_frac_width": 4, + "proj_bias_width": 8, + "proj_bias_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, +} + +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "fork2": [8, 4], + "linear": { + "config": { + "name": "integer_floor", # quantization scheme name supported are ["integer", "fixed" (equivalent to integer), "lutnet" (dev mode), "logicnets" (dev mode), "binary", "binary_residual", "ternary", "minifloat_ieee", "minifloat_denorm", "log", "block_fp", "block_minifloat", "block_log"] + # data + "data_in_width": 8, + "data_in_frac_width": 4, + # weight + "weight_width": 10, + "weight_frac_width": 3, + # bias + "bias_width": 5, + "bias_frac_width": 2, + # optional + "data_out_width": 8, + "data_out_frac_width": 4, + }, + }, + "gelu": { + "config": { + "name": "integer_floor", + # data + "data_in_width": 8, + "data_in_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, + } + }, + "layer_norm": { + "config": { + "name": "integer_floor", + # data + "data_in_width": 8, + "data_in_frac_width": 4, + "weight_width": 8, + "weight_frac_width": 4, + "bias_width": 8, + "bias_frac_width": 4, + "isqrt_in_width": 8, + "isqrt_in_frac_width": 3, + "isqrt_out_width": 8, + "isqrt_out_frac_width": 7, + "data_out_width": 8, + "data_out_frac_width": 4, + "bypass": False, + "noparse": True, + } + }, + "add": { + "config": { + "name": "integer_floor", + # data + "data_in_width": 8, + "data_in_frac_width": 4, 
+ "data_out_width": 8, + "data_out_frac_width": 4, + }, + }, + "vit_self_attention_integer": {"config": attention_quant_config}, +} + + +@pytest.mark.dev +def test_emit_verilog_vit(): + # vit_tiny dim 192, n 196, num_heads = 3 + # + dim = 192 + num_heads = 3 + batch_size = 1 + n = 196 + parallelism = 8 + depth = 12 + px = 2 + pqkv = 64 + p_proj = px + p_w1 = 128 + p_w2 = px + model_config_for_quantize = { + "dim": dim, + "num_heads": num_heads, + "query_has_bias": True, + } + model_args_for_hardware_param = { + "vit_self_attention_integer": { + "num_heads": num_heads, + "query_has_bias": True, + } + } + layer = ViTAttention(dim, num_heads, mlp_ratio=4, qkv_bias=True, depth=depth) + qlayer = vit_module_level_quantize( + layer, model_config_for_quantize, attention_quant_config + ) + mg = chop.MaseGraph(model=qlayer) + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, n, dim)) + dummy_in = {"x": x} + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.quantize_transform_pass(mg, quan_args) + mg, _ = passes.graph.transforms.insert_fork_transform_pass(mg, quan_args) + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [parallelism] * 4} + ) + update_hardware_precision_param(mg, quan_args, model_args_for_hardware_param) + # mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + + layer_for_block = ViTAttention(dim, num_heads, mlp_ratio=4, qkv_bias=True, depth=1) + qlayer_for_block = vit_module_level_quantize( + layer_for_block, model_config_for_quantize, attention_quant_config + ) + mg_for_block = chop.MaseGraph(qlayer_for_block) + mg_for_block, _ = passes.init_metadata_analysis_pass(mg_for_block, None) + mg_for_block, _ = passes.add_common_metadata_analysis_pass( + mg_for_block, {"dummy_in": dummy_in} + ) 
+ mg_for_block, _ = passes.quantize_transform_pass(mg_for_block, quan_args) + mg_for_block, _ = passes.graph.transforms.insert_fork_transform_pass( + mg_for_block, quan_args + ) + update_common_metadata_pass(mg_for_block, quan_args) + mg_for_block, _ = passes.add_hardware_metadata_analysis_pass( + mg_for_block, pass_args={"max_parallelism": [parallelism] * 4} + ) + update_hardware_precision_param( + mg_for_block, quan_args, model_args_for_hardware_param + ) + + pqkv = pqkv * num_heads + pass_args = { + "fork2": {"din": [1, px], "dout": ([1, px], [1, px])}, + "norm1": {"din": [1, px], "dout": [1, px]}, + "attn": {"din": [1, px], "dattn": [1, pqkv], "dout": [1, p_proj]}, + "fifo": {"din": [1, px], "dout": [1, px]}, + "add": {"din": ([1, px], [1, px]), "dout": [1, px]}, + "norm2": {"din": [1, px], "dout": [1, px]}, + "mlp_fc1": {"din": [1, px], "dout": [1, p_w1]}, + "mlp_act": {"din": [1, p_w1], "dout": [1, p_w1]}, + "mlp_fc2": {"din": [1, p_w1], "dout": [1, px]}, + "fifo": {"din": [1, px], "dout": [1, px]}, + } + from utils import manually_update_hardware_parallelism_param + + manually_update_hardware_parallelism_param(mg, pass_args) + manually_update_hardware_parallelism_param(mg_for_block, pass_args) + mg, _ = passes.emit_verilog_top_transform_pass( + mg, + pass_args={ + "folded_graph": mg_for_block, + "folded_node_name": "blocks", + "reuse_times": depth, + }, + ) + # mg, _ = passes.emit_bram_transform_pass(mg) + mg, _ = passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_cocotb_transform_pass( + mg, pass_args={"wait_time": 100, "wait_unit": "us", "batch_size": batch_size} + ) + mg, _ = passes.emit_vivado_project_transform_pass(mg) + + # simulate(skip_build=False, skip_test=False, simulator="questa", waves=True, gui=True) + + +if __name__ == "__main__": + test_emit_verilog_vit() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_folded_linear.py b/test/passes/graph/transforms/verilog/test_emit_verilog_folded_linear.py new file mode 
100644 index 000000000..f0ffb98d1 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_folded_linear.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog + +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") +from utils import update_common_metadata_pass, update_hardware_precision_param + + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nentering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + + +class MLP(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, in_features, depth=3) -> None: + super().__init__() + self.linears = nn.Sequential( + *[nn.Linear(in_features, in_features, bias=True) for i in range(depth)] + ) + + def forward(self, x): + out = self.linears(x) + return out + + +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "linear": { + "config": { + "name": "integer_floor", # quantization scheme name supported are ["integer", "fixed" (equivalent to integer), "lutnet" (dev mode), "logicnets" (dev mode), "binary", "binary_residual", "ternary", "minifloat_ieee", 
"minifloat_denorm", "log", "block_fp", "block_minifloat", "block_log"] + # data + "data_in_width": 8, + "data_in_frac_width": 4, + # weight + "weight_width": 10, + "weight_frac_width": 3, + # bias + "bias_width": 5, + "bias_frac_width": 2, + "data_out_width": 8, + "data_out_frac_width": 4, + }, + }, +} + + +@pytest.mark.dev +def test_emit_verilog_folded_linear(): + in_features = 10 + n = 10 + batch_size = 2 + depth = 3 + linear = MLP(in_features, depth=depth) + mg = chop.MaseGraph(model=linear) + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, n, in_features)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + # Increase weight range + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.quantize_transform_pass(mg, quan_args) + + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + update_hardware_precision_param(mg, quan_args) + + linear_for_block = MLP(in_features, depth=1) + mg_for_block = chop.MaseGraph(linear_for_block) + mg_for_block, _ = passes.init_metadata_analysis_pass(mg_for_block, None) + # Increase weight range + mg_for_block, _ = passes.add_common_metadata_analysis_pass( + mg_for_block, {"dummy_in": dummy_in} + ) + + mg_for_block, _ = passes.quantize_transform_pass(mg_for_block, quan_args) + + update_common_metadata_pass(mg_for_block, quan_args) + mg_for_block, _ = passes.add_hardware_metadata_analysis_pass( + mg_for_block, pass_args={"max_parallelism": [2] * 4} + ) + update_hardware_precision_param(mg_for_block, quan_args) + + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + mg, _ = passes.emit_verilog_top_transform_pass( + mg, + pass_args={ + "folded_graph": mg_for_block, + "folded_node_name": "linears", + "reuse_times": depth, + }, + ) + mg, _ = passes.emit_bram_transform_pass(mg) + mg, _ 
= passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_cocotb_transform_pass( + mg, pass_args={"wait_time": 100, "wait_unit": "ms", "batch_size": batch_size} + ) + mg, _ = passes.emit_vivado_project_transform_pass(mg) + + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) + + +if __name__ == "__main__": + test_emit_verilog_folded_linear() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_fork_add.py b/test/passes/graph/transforms/verilog/test_emit_verilog_fork_add.py new file mode 100644 index 000000000..ec211ed1d --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_fork_add.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger +from test_emit_verilog_layernorm import ( + update_common_metadata_pass, + update_hardware_precision_param, +) +import operator +from utils import update_common_metadata_pass + +set_logging_verbosity("debug") +from chop.passes.graph.transforms.verilog.insert_fork import insert_fork_transform_pass + +from mase_components import get_module_dependencies + + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) +class MLP(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + 
""" + + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(8, 8) + + def forward(self, x): + a = self.linear(x) + b = a + x + return b + + +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "fork2": [8, 4], + "linear": { + "config": { + "name": "integer_floor", # quantization scheme name supported are ["integer", "fixed" (equivalent to integer), "lutnet" (dev mode), "logicnets" (dev mode), "binary", "binary_residual", "ternary", "minifloat_ieee", "minifloat_denorm", "log", "block_fp", "block_minifloat", "block_log"] + # data + "data_in_width": 8, + "data_in_frac_width": 4, + # weight + "weight_width": 10, + "weight_frac_width": 3, + # bias + "bias_width": 5, + "bias_frac_width": 2, + "data_out_width": 8, + "data_out_frac_width": 4, + }, + }, + "add": { + "config": { + "name": "integer_floor", # quantization scheme name supported are ["integer", "fixed" (equivalent to integer), "lutnet" (dev mode), "logicnets" (dev mode), "binary", "binary_residual", "ternary", "minifloat_ieee", "minifloat_denorm", "log", "block_fp", "block_minifloat", "block_log"] + # data + "data_in_width": 8, + "data_in_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, + }, + }, +} + + +@pytest.mark.dev +def test_emit_verilog_fork_add(): + model = MLP() + mg = chop.MaseGraph(model=model) + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + batch_size = 4 + x = torch.randn((batch_size, 8)) + dummy_in = {"x": x} + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.quantize_transform_pass(mg, quan_args) + + mg, _ = insert_fork_transform_pass(mg, quan_args) + + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, 
pass_args={"max_parallelism": [2] * 4} + ) + mg, _ = passes.report_node_hardware_type_analysis_pass( + mg, + pass_args={ + "which": ["hardware"], + "save_path": "graph_meta_params.txt", + }, + ) # pretty print + mg, _ = passes.emit_verilog_top_transform_pass(mg) + mg, _ = passes.emit_bram_transform_pass(mg) + mg, _ = passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_cocotb_transform_pass( + mg, pass_args={"wait_time": 100, "wait_units": "us", "batch_size": batch_size} + ) + mg, _ = passes.emit_vivado_project_transform_pass(mg) + + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) + + +if __name__ == "__main__": + test_emit_verilog_fork_add() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_gelu.py b/test/passes/graph/transforms/verilog/test_emit_verilog_gelu.py new file mode 100644 index 000000000..62d1263e9 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_gelu.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") + + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + + +class GELU_MODULE(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + 
def __init__(self) -> None: + super().__init__() + + self.act = nn.GELU() + + def forward(self, x): + x = self.act(x) + return x + + +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "gelu": { + "config": { + "name": "integer_floor", + # data + "data_in_width": 8, + "data_in_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, + } + }, +} + + +@pytest.mark.dev +def test_emit_verilog_mlp(): + in_size = 4 + batch_size = 4 + linear = GELU_MODULE() + mg = chop.MaseGraph(model=linear) + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, in_size)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.quantize_transform_pass(mg, quan_args) + # There is a bug in the current quantization pass, where the results metadata is not uppdated with the precision. + # Here we temporarily update the metadata here so we can test the hardware back end. 
+ for node in mg.fx_graph.nodes: + for arg, _ in node.meta["mase"].parameters["common"]["args"].items(): + if ( + type(node.meta["mase"].parameters["common"]["args"][arg]) == dict + and "type" in node.meta["mase"].parameters["common"]["args"][arg].keys() + ): + node.meta["mase"].parameters["common"]["args"][arg]["type"] = "fixed" + for result, _ in node.meta["mase"].parameters["common"]["results"].items(): + if ( + type(node.meta["mase"].parameters["common"]["results"][result]) == dict + and "type" + in node.meta["mase"].parameters["common"]["results"][result].keys() + ): + node.meta["mase"].parameters["common"]["results"][result][ + "type" + ] = "fixed" + node.meta["mase"].parameters["common"]["results"][result][ + "precision" + ] = [ + quan_args["gelu"]["config"]["data_out_width"], + quan_args["gelu"]["config"]["data_out_frac_width"], + ] + + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + + mg, _ = passes.emit_verilog_top_transform_pass(mg) + mg, _ = passes.emit_bram_transform_pass(mg) + mg, _ = passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_cocotb_transform_pass( + mg, pass_args={"wait_time": 100, "wait_unit": "ms", "batch_size": batch_size} + ) + mg, _ = passes.emit_vivado_project_transform_pass(mg) + + simulate(skip_build=False, skip_test=False, simulator="verilator", waves=True) + + +if __name__ == "__main__": + test_emit_verilog_mlp() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_layernorm.py b/test/passes/graph/transforms/verilog/test_emit_verilog_layernorm.py new file mode 100644 index 000000000..d3fe2d0a5 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_layernorm.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import 
torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger +from chop.passes.graph.transforms.quantize import QUANTIZEABLE_OP +from utils import update_common_metadata_pass, update_hardware_precision_param + +set_logging_verbosity("debug") + + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + +torch.manual_seed(0) +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + + +class LAYERNORM_MODULE(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, norm_dim) -> None: + super().__init__() + + self.norm = nn.LayerNorm(norm_dim, elementwise_affine=True) + if self.norm.elementwise_affine: + self.norm.weight = torch.nn.Parameter(torch.rand(norm_dim)) + if self.norm.bias is not None: + self.norm.bias = torch.nn.Parameter(torch.rand(norm_dim)) + + def forward(self, x): + x = self.norm(x) + return x + + +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "layer_norm": { + "config": { + "name": "integer_floor", + # data + "data_in_width": 8, + "data_in_frac_width": 4, + "weight_width": 8, + "weight_frac_width": 4, + "bias_width": 8, + "bias_frac_width": 4, + "isqrt_in_width": 8, + "isqrt_in_frac_width": 4, + "isqrt_out_width": 8, + "isqrt_out_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, + "bypass": False, + "noparse": True, + } + }, +} + + 
+@pytest.mark.dev +def test_emit_verilog_layernorm(): + + batch_size = 1 + norm_dim = 12 + norm_layer = LAYERNORM_MODULE(norm_dim) + mg = chop.MaseGraph(model=norm_layer) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, norm_dim)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + mg, _ = passes.quantize_transform_pass(mg, quan_args) + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + update_hardware_precision_param(mg, quan_args) + print(mg.meta["mase"]["common"]["args"]) + # mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + + mg, _ = passes.emit_verilog_top_transform_pass(mg) + mg, _ = passes.emit_bram_transform_pass(mg) + mg, _ = passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_cocotb_transform_pass( + mg, pass_args={"wait_time": 100, "wait_units": "ms", "batch_size": batch_size} + ) + mg, _ = passes.emit_vivado_project_transform_pass(mg) + + simulate(skip_build=False, skip_test=False, simulator="questa", waves=True) + + +if __name__ == "__main__": + test_emit_verilog_layernorm() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_linear.py b/test/passes/graph/transforms/verilog/test_emit_verilog_linear.py index e10e61c3e..e675cbcd4 100644 --- a/test/passes/graph/transforms/verilog/test_emit_verilog_linear.py +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_linear.py @@ -17,6 +17,11 @@ from chop.tools import get_logger set_logging_verbosity("debug") +from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, + manually_update_hardware_parallelism_param, +) def excepthook(exc_type, exc_value, exc_traceback): @@ -33,95 +38,109 @@ def excepthook(exc_type, exc_value, exc_traceback): # Model specifications # prefer small 
models for fast test # -------------------------------------------------- +# verified test case linear(2,4) + + class MLP(torch.nn.Module): """ Toy quantized FC model for digit recognition on MNIST """ - def __init__(self) -> None: + def __init__(self, in_features, hidden_features, out_features) -> None: super().__init__() - self.fc1 = nn.Linear(10, 10, bias=True) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + # self.fc2 = nn.Linear(hidden_features, out_features, bias=True) def forward(self, x): - x = torch.nn.functional.relu(self.fc1(x)) + x = self.fc1(x) + # x = self.fc2(x) return x +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "linear": { + "config": { + "name": "integer_floor", # quantization scheme name supported are ["integer", "fixed" (equivalent to integer), "lutnet" (dev mode), "logicnets" (dev mode), "binary", "binary_residual", "ternary", "minifloat_ieee", "minifloat_denorm", "log", "block_fp", "block_minifloat", "block_log"] + # data + "data_in_width": 8, + "data_in_frac_width": 4, + # weight + "weight_width": 8, + "weight_frac_width": 3, + # bias + "bias_width": 8, + "bias_frac_width": 2, + "data_out_width": 8, + "data_out_frac_width": 4, + }, + }, +} + + @pytest.mark.dev def test_emit_verilog_linear(): - mlp = MLP() - mg = chop.MaseGraph(model=mlp) - + in_features = 192 + hidden_features = 192*4 + out_features = 192 + n = 196 + batch_size = 10 + linear = MLP(in_features, hidden_features, out_features) + mg = chop.MaseGraph(model=linear) + torch.manual_seed(0) # Provide a dummy input for the graph so it can use for tracing - batch_size = 2 - x = torch.randn((batch_size, 10)) + x = torch.randn((batch_size, n, in_features)) dummy_in = {"x": x} mg, _ = passes.init_metadata_analysis_pass(mg, None) - mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) - - 
# Quantize to int - config_file = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "..", - "..", - "..", - "..", - "..", - "configs", - "tests", - "quantize", - "integer.toml", - ) - - # load toml config file - with open(config_file, "r") as f: - quan_args = toml.load(f)["passes"]["quantize"] - mg, _ = passes.quantize_transform_pass(mg, quan_args) - - # There is a bug in the current quantizzation pass, where the results metadata is not uppdated with the precision. - # Here we temporarily update the metadata here so we can test the hardware back end. - for node in mg.fx_graph.nodes: - for arg, _ in node.meta["mase"].parameters["common"]["args"].items(): - if ( - type(node.meta["mase"].parameters["common"]["args"][arg]) == dict - and "type" in node.meta["mase"].parameters["common"]["args"][arg].keys() - ): - node.meta["mase"].parameters["common"]["args"][arg]["type"] = "fixed" - for result, _ in node.meta["mase"].parameters["common"]["results"].items(): - if ( - type(node.meta["mase"].parameters["common"]["results"][result]) == dict - and "type" - in node.meta["mase"].parameters["common"]["results"][result].keys() - ): - node.meta["mase"].parameters["common"]["results"][result][ - "type" - ] = "fixed" - node.meta["mase"].parameters["common"]["results"][result][ - "precision" - ] = [8, 3] - # Increase weight range mg.model.fc1.weight = torch.nn.Parameter( 10 * torch.randn(mg.model.fc1.weight.shape) ) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + mg, _ = passes.quantize_transform_pass(mg, quan_args) + + update_common_metadata_pass(mg, quan_args) + # from chop.passes.graph.transforms.verilog.insert_fork import insert_fifo_after_specified_modules + # mg, _ = insert_fifo_after_specified_modules( + # mg, pass_args = { + # "insert_fifo": ["linear"], + # "max_parallelism": 2 # used for generating the fifo depth + # } + # ) mg, _ = passes.add_hardware_metadata_analysis_pass( mg, pass_args={"max_parallelism": [2] * 4} ) + 
update_hardware_precision_param(mg, quan_args) + wp1 = 32 + wp2 = 32 + manually_update_hardware_parallelism_param( + mg, + pass_args={ + "fc1": {"din": [1, wp1], "dout": [1, wp2]}, + # "fc2": {"din": [1, wp1], "dout": [1, wp2]}, + }, + ) + pass_args = { + "project_dir": Path("/scratch/cx922/mase/int_linear"), + } mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print - - mg, _ = passes.emit_verilog_top_transform_pass(mg) - mg, _ = passes.emit_bram_transform_pass(mg) - mg, _ = passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_verilog_top_transform_pass(mg, pass_args) + mg, _ = passes.emit_bram_transform_pass(mg, pass_args) + mg, _ = passes.emit_internal_rtl_transform_pass(mg, pass_args) mg, _ = passes.emit_cocotb_transform_pass( - mg, pass_args={"wait_time": 100, "wait_unit": "ms", "batch_size": batch_size} + mg, pass_args={"wait_time": 100, "wait_unit": "us", "batch_size": batch_size, "project_dir": pass_args["project_dir"]} ) - mg, _ = passes.emit_vivado_project_transform_pass(mg) - - simulate(skip_build=False, skip_test=False, simulator="verilator") + mg, _ = passes.emit_vivado_project_transform_pass(mg, pass_args) if __name__ == "__main__": + pass_args = { + "project_dir": Path("/scratch/cx922/mase/int_linear"), + } test_emit_verilog_linear() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_mlp.py b/test/passes/graph/transforms/verilog/test_emit_verilog_mlp.py new file mode 100644 index 000000000..744f08786 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_mlp.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + 
+set_logging_verbosity("debug") + + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + + +class MLP(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, in_features, hidden_features, out_features) -> None: + super().__init__() + + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + self.act = nn.GELU() + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + return x + + +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "linear": { + "config": { + "name": "integer_floor", # quantization scheme name supported are ["integer", "fixed" (equivalent to integer), "lutnet" (dev mode), "logicnets" (dev mode), "binary", "binary_residual", "ternary", "minifloat_ieee", "minifloat_denorm", "log", "block_fp", "block_minifloat", "block_log"] + # data + "data_in_width": 8, + "data_in_frac_width": 4, + # weight + "weight_width": 10, + "weight_frac_width": 3, + # bias + "bias_width": 5, + "bias_frac_width": 2, + # optional + "data_out_width": 8, + "data_out_frac_width": 4, + }, + }, + "gelu": { + "config": { + "name": "integer_floor", + # data + "data_in_width": 8, + "data_in_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, + } + }, +} + + +@pytest.mark.dev +def test_emit_verilog_mlp(): + in_features = 4 + hidden_features = 20 + out_features = 10 + 
batch_size = 4 + linear = MLP(in_features, hidden_features, out_features) + mg = chop.MaseGraph(model=linear) + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, in_features)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.quantize_transform_pass(mg, quan_args) + # There is a bug in the current quantization pass, where the results metadata is not uppdated with the precision. + # Here we temporarily update the metadata here so we can test the hardware back end. + for node in mg.fx_graph.nodes: + for arg, _ in node.meta["mase"].parameters["common"]["args"].items(): + if ( + type(node.meta["mase"].parameters["common"]["args"][arg]) == dict + and "type" in node.meta["mase"].parameters["common"]["args"][arg].keys() + ): + node.meta["mase"].parameters["common"]["args"][arg]["type"] = "fixed" + for result, _ in node.meta["mase"].parameters["common"]["results"].items(): + if ( + type(node.meta["mase"].parameters["common"]["results"][result]) == dict + and "type" + in node.meta["mase"].parameters["common"]["results"][result].keys() + ): + node.meta["mase"].parameters["common"]["results"][result][ + "type" + ] = "fixed" + node.meta["mase"].parameters["common"]["results"][result][ + "precision" + ] = [ + quan_args["linear"]["config"]["data_out_width"], + quan_args["linear"]["config"]["data_out_frac_width"], + ] + + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + + mg, _ = passes.emit_verilog_top_transform_pass(mg) + mg, _ = passes.emit_bram_transform_pass(mg) + mg, _ = passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_cocotb_transform_pass( + mg, pass_args={"wait_time": 100, "wait_unit": "ms", "batch_size": batch_size} + ) + mg, _ = 
passes.emit_vivado_project_transform_pass(mg) + + simulate(skip_build=False, skip_test=False, simulator="verilator", waves=True) + + +if __name__ == "__main__": + test_emit_verilog_mlp() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_gelu.py b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_gelu.py new file mode 100644 index 000000000..6351f3ef7 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_gelu.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") +from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, + manually_update_hardware_parallelism_param, +) +from chop.models.vision.vit.vit import Attention + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + +from chop.nn.quantized.modules.attention import _ViTAttentionBase +class ViTAttentionMxInt(_ViTAttentionBase): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + q_config: dict = None, + ) -> None: + super().__init__(dim, num_heads, qkv_bias, qk_norm, attn_drop, proj_drop) + self.q_config = q_config + +class MxIntGELU(nn.GELU): + def __init__( + self, + q_config, + ) -> None: + super().__init__() + self.q_config = q_config + +def vit_module_level_quantize(model, model_config, 
q_config): + from chop.passes.graph.utils import deepsetattr + for module in model.named_modules(): + if isinstance(module[1], nn.GELU): + ori_module = module[1] + new_module = MxIntGELU( + q_config=q_config["gelu"], + ) + logger.info(f"Replacing module: {module[0]}") + deepsetattr(model, module[0], new_module) + return model + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + +class CustomModel(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, dim, num_heads) -> None: + super().__init__() + + # self.attention = Attention(dim, num_heads, qkv_bias=True) + self.act = torch.nn.GELU() + + def forward(self, x): + # x = self.attention(x) + x = self.act(x) + return x + +quan_args = { + "gelu": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 48], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [48, 48], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 48], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 48], + } + } +} + +from mase_components import get_module_dependencies +VIT_CUSTOM_OPS = { + "modules": { + MxIntGELU: { + "args": { + "q_config": "config", + }, + "toolchain": "INTERNAL_RTL", + "module": "mxint_gelu", + "dependence_files": get_module_dependencies( + "linear_layers/mxint_operators/mxint_gelu" + ), + }, + }, +} +@pytest.mark.dev +def test_emit_verilog_linear(): + dim = 192 + num_heads = 3 + batch_size = 1 + n = 196 + model_config = { + "dim": dim, + "num_heads": num_heads, + "query_has_bias": True, + } + layer = CustomModel(dim, num_heads) + qlayer = vit_module_level_quantize(layer, model_config, quan_args) + mg = chop.MaseGraph(model=qlayer, custom_ops=VIT_CUSTOM_OPS) + 
mg.model.custom_ops = VIT_CUSTOM_OPS + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, n, dim)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + model_config.pop("dim") + model_args = {"vit_self_attention_integer": model_config} + update_hardware_precision_param(mg, quan_args, model_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + # wp1 = 8 + # wp2 = 1 + # manually_update_hardware_parallelism_param( + # mg, + # pass_args={ + # "fc1": {"din": [1, 2], "dout": [1, wp1]}, + # "fc2": {"din": [1, wp1], "dout": [1, wp2]}, + # }, + # ) + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + mg, _ = passes.emit_verilog_top_transform_pass(mg) + mg, _ = passes.emit_bram_transform_pass(mg) + mg, _ = passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_cocotb_transform_pass( + mg, pass_args={"wait_time": 100, "wait_unit": "ms", "batch_size": batch_size} + ) + mg, _ = passes.emit_vivado_project_transform_pass(mg) + + +def _simulate(): + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) + + +if __name__ == "__main__": + test_emit_verilog_linear() + # _simulate() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_layernorm.py b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_layernorm.py new file mode 100644 index 000000000..88fa311e8 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_layernorm.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest 
+import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") +from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, + manually_update_hardware_parallelism_param, +) +from chop.models.vision.vit.vit import Attention + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + +from chop.nn.quantized.modules.attention import _ViTAttentionBase +class ViTAttentionMxInt(_ViTAttentionBase): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + q_config: dict = None, + ) -> None: + super().__init__(dim, num_heads, qkv_bias, qk_norm, attn_drop, proj_drop) + self.q_config = q_config + +class MxIntGELU(nn.GELU): + def __init__( + self, + q_config, + ) -> None: + super().__init__() + self.q_config = q_config + +class MxIntLayerNorm(nn.LayerNorm): + def __init__( + self, + normlized_shape, + q_config, + eps=1e-5, + elementwise_affine=True, + bias=True, + ) -> None: + super().__init__(normlized_shape, eps, elementwise_affine, bias) + self.q_config = q_config + +def vit_module_level_quantize(model, model_config, q_config): + from chop.passes.graph.utils import deepsetattr + for module in model.named_modules(): + if isinstance(module[1], nn.LayerNorm): + ori_module = module[1] + if ori_module.bias is not None: + bias = True + new_module = MxIntLayerNorm( + ori_module.normalized_shape, + eps=ori_module.eps, + elementwise_affine=ori_module.elementwise_affine, + bias=bias, + 
q_config=q_config, + ) + new_module.weight = ori_module.weight + new_module.bias = ori_module.bias + print(f"LayerNorm {module[0]} was replaced") + logger.info(f"Replacing module: {module[0]}") + + deepsetattr(model, module[0], new_module) + return model +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + +class CustomModel(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, dim, num_heads) -> None: + super().__init__() + + self.linear1 = nn.Linear(dim, 4*dim) + self.act = torch.nn.GELU() + self.linear2 = nn.Linear(4*dim, dim) + self.norm1 = torch.nn.LayerNorm(dim) + + self.attention = Attention(dim, num_heads, qkv_bias=True) + self.norm2 = torch.nn.LayerNorm(dim) + + def forward(self, x): + x = self.linear1(x) + x = self.act(x) + x = self.linear2(x) + x = self.norm1(x) + x = self.attention(x) + x = self.norm2(x) + return x + +quan_args = { + "layer_norm": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 48], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [1, 48], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 48], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 48], + } + }, + "gelu": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 48], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [48, 48], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 48], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 48], + }, + }, + "linear": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + 
"data_in_parallelism": [1, 48], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [48, 48], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 48], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 48], + }, + }, + "user_defined_module": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 48], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [48, 48], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 48], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 48], + } + } +} + +from mase_components import get_module_dependencies +VIT_CUSTOM_OPS = { + "modules": { + MxIntLayerNorm: { + "args": { + "q_config": "config", + }, + "toolchain": "INTERNAL_RTL", + "module": "mxint_layernorm", + "dependence_files": get_module_dependencies( + "linear_layers/mxint_operators/mxint_layernorm" + ), + }, + }, +} +@pytest.mark.dev +def test_emit_verilog_linear(): + dim = 192 + num_heads = 3 + batch_size = 1 + n = 196 + model_config = { + "dim": dim, + "num_heads": num_heads, + "query_has_bias": True, + } + layer = CustomModel(dim, num_heads) + qlayer = vit_module_level_quantize(layer, model_config, quan_args) + mg = chop.MaseGraph(model=qlayer, custom_ops=VIT_CUSTOM_OPS) + mg.model.custom_ops = VIT_CUSTOM_OPS + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, n, dim)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + model_config.pop("dim") + model_args = {"vit_self_attention_integer": model_config} + 
update_hardware_precision_param(mg, quan_args, model_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + # wp1 = 8 + # wp2 = 1 + # manually_update_hardware_parallelism_param( + # mg, + # pass_args={ + # "fc1": {"din": [1, 2], "dout": [1, wp1]}, + # "fc2": {"din": [1, wp1], "dout": [1, wp2]}, + # }, + # ) + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + mg, _ = passes.emit_verilog_top_transform_pass(mg) + mg, _ = passes.emit_bram_transform_pass(mg) + mg, _ = passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_cocotb_transform_pass( + mg, pass_args={"wait_time": 100, "wait_unit": "ms", "batch_size": batch_size} + ) + mg, _ = passes.emit_vivado_project_transform_pass(mg) + + +def _simulate(): + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) + + +if __name__ == "__main__": + test_emit_verilog_linear() + # _simulate() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_linear.py b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_linear.py new file mode 100644 index 000000000..2e422570f --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_linear.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") +from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, + manually_update_hardware_parallelism_param, +) + + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering 
debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + + +class MLP(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, in_features, hidden_features, out_features) -> None: + super().__init__() + + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + # self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, x): + x = self.fc1(x) + # x = self.fc2(x) + return x + +parallelism = 32 +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "linear": { + "config": { + "name": "mxint_hardware", + # data + "data_in_width": 8, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, parallelism], + # weight + "weight_width": 6, + "weight_exponent_width": 8, + "weight_parallelism": [parallelism, parallelism], + # bias + "bias_width": 6, + "bias_exponent_width": 8, + "bias_parallelism": [1, parallelism], + "data_out_width": 8, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, parallelism], + "round_bits": 4, + } + }, +} + + + +from a_cx_mxint_quant import vit_module_level_quantize, VIT_CUSTOM_OPS +@pytest.mark.dev +def test_emit_verilog_linear(): + in_features = 192 + hidden_features = 192*4 + out_features = 192 + n = 196 + batch_size = 10 + linear = MLP(in_features, hidden_features, out_features) + qlinear = vit_module_level_quantize(linear, q_config=quan_args) + qlinear.fc1.weight = torch.nn.Parameter( + 10 * torch.randn(qlinear.fc1.weight.shape) - 5 + ) + model_path = "/scratch/cx922/mase/mlp_model.pth" + torch.save(qlinear.state_dict(), model_path) + 
print(f"Model saved to {model_path}") + + mg = chop.MaseGraph(model=qlinear, custom_ops=VIT_CUSTOM_OPS) + # Save the whole model to a file + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, n, in_features)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + # Increase weight range + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + update_common_metadata_pass(mg, quan_args) + + mg, _ = passes.add_hardware_metadata_analysis_pass( mg, pass_args={"max_parallelism": [2] * 4}) + update_hardware_precision_param(mg, quan_args) + + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + pass_args = { + "project_dir": Path("/scratch/cx922/mase/mxint_linear_m8e6"), + } + + mg, _ = passes.emit_verilog_top_transform_pass(mg, pass_args) + mg, _ = passes.emit_bram_transform_pass(mg, pass_args) + mg, _ = passes.emit_internal_rtl_transform_pass(mg, pass_args) + mg, _ = passes.emit_vivado_project_transform_pass(mg, pass_args) + + +def _simulate(): + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) + + +if __name__ == "__main__": + test_emit_verilog_linear() + # _simulate() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_mlp.py b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_mlp.py new file mode 100644 index 000000000..6642b8733 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_mlp.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") 
+from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, + manually_update_hardware_parallelism_param, +) +sys.path.append(Path(__file__).resolve().parents[5].as_posix()) +from a_cx_mxint_quant import ( + vit_module_level_quantize, + VIT_CUSTOM_OPS +) + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + +class MLP(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, in_features, hidden_features, out_features) -> None: + super().__init__() + + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + self.act = torch.nn.GELU() + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + return x + +parallelism = 64 +parallelism2 = 12 +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "linear": { + "config": { + "name": "mxint_hardware", + # data + "data_in_width": 6, + "data_in_exponent_width": 4, + "data_in_parallelism": [1, parallelism], + # weight + "weight_width": 6, + "weight_exponent_width": 4, + "weight_parallelism": [parallelism, parallelism], + # bias + "bias_width": 6, + "bias_exponent_width": 4, + "bias_parallelism": [1, parallelism], + "data_out_width": 6, + "data_out_exponent_width": 4, + "data_out_parallelism": [1, parallelism], + "round_bits": 4, + } + }, + "gelu": { + "config": { + "name": "mxint_hardware", + 
"data_in_width": 6, + "data_in_exponent_width": 4, + "data_in_parallelism": [1, parallelism], + "hash_out_width": 10, + "data_out_width": 6, + "data_out_exponent_width": 4, + "data_out_parallelism": [1, parallelism], + } + } +} + +@pytest.mark.dev +def test_emit_verilog_linear(): + in_features = 192 + hidden_features = 192*4 + out_features = 192 + n = 196 + batch_size = 10 + layer = MLP(in_features, hidden_features, out_features) + qlayer = vit_module_level_quantize(layer, q_config=quan_args) + model_path = "/scratch/cx922/mase/mlp_model.pth" + torch.save(qlayer.state_dict(), model_path) + print(f"Model saved to {model_path}") + mg = chop.MaseGraph(model=qlayer, custom_ops=VIT_CUSTOM_OPS) + mg.model.custom_ops = VIT_CUSTOM_OPS + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, n, in_features)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + update_hardware_precision_param(mg, quan_args) + + from utils import updating_hardware_metadata_pass + from functools import partial + updating_hardware_metadata_pass(mg, { + "updating_funcs_list": [ + partial(updating_for_mlp, quan_args=quan_args), + ], + }) + + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + pass_args = { + "project_dir": Path("/scratch/cx922/mase/mxint_mlp"), + } + mg, _ = passes.emit_verilog_top_transform_pass(mg, pass_args) + mg, _ = passes.emit_bram_transform_pass(mg, pass_args) + mg, _ = passes.emit_internal_rtl_transform_pass(mg, pass_args) + # mg, _ = passes.emit_cocotb_transform_pass( + # mg, pass_args={"wait_time": 100, "wait_unit": "ms", "batch_size": batch_size} + # ) + mg, _ = passes.emit_vivado_project_transform_pass(mg, pass_args) + + +def 
_simulate(): + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) +def updating_for_mlp(node, quan_args): + mase_op = node.meta["mase"].parameters["common"]["mase_op"] + vp = node.meta["mase"]["hardware"].get("verilog_param") + if mase_op == "gelu": + vp["HASH_OUT_WIDTH"] = quan_args["gelu"]["config"]["hash_out_width"] + +if __name__ == "__main__": + test_emit_verilog_linear() + # _simulate() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_patch_embed.py b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_patch_embed.py new file mode 100644 index 000000000..88418aeee --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_patch_embed.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") +from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, + manually_update_hardware_parallelism_param, +) +from chop.models.vision.vit.vit import Attention + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + + +from quantize_modules import MxIntPatchEmbed, VIT_CUSTOM_OPS + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + + +class CustomModel(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on 
MNIST + """ + + def __init__(self, + img_size: int, + patch_size: int, + in_chans: int, + embed_dim: int, + q_config: dict = None, + norm_layer: nn.Module = nn.LayerNorm + ) -> None: + super().__init__() + self.MxIntPatchEmbed = MxIntPatchEmbed( + img_size, patch_size, in_chans, embed_dim, q_config, norm_layer + ) + + + def forward(self, x): + x = self.MxIntPatchEmbed(x) + return x + +quan_args = { + "mx_int_patch_embed": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [3, 1, 1], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [32, 3], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 32], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 32], + } + }, +} + +@pytest.mark.dev +def test_emit_verilog_linear(): + img_size = 224 + patch_size = 16 + embed_dim = 192 + in_chans = 3 + + model_config = { + } + layer = CustomModel( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + q_config=quan_args, + ) + + + + mg = chop.MaseGraph(model=layer, custom_ops=VIT_CUSTOM_OPS) + mg.model.custom_ops = VIT_CUSTOM_OPS + # we have to have this batch size in advance + x = torch.randn((1, in_chans, img_size, img_size)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.graph.transforms.insert_fork_transform_pass(mg, quan_args) + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + # model_config.pop("dim") + model_args = {"vit_self_attention_integer": model_config} + + from functools import partial + from utils import updating_hardware_metadata_pass + update_hardware_precision_param(mg, quan_args, model_args) + updating_hardware_metadata_pass(mg, { + 
"updating_funcs_list": [ + updating_for_patch_embed, + ], + }) + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + pass_args = { + "project_dir": Path("/scratch/cx922/mase/mxint_patch_embed"), + } + mg, _ = passes.emit_verilog_top_transform_pass(mg, pass_args) + mg, _ = passes.emit_bram_transform_pass(mg, pass_args) + mg, _ = passes.emit_internal_rtl_transform_pass(mg, pass_args) + mg, _ = passes.emit_vivado_project_transform_pass(mg, pass_args) + + +def _simulate(): + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) + +def updating_for_patch_embed(node): + mase_op = node.meta["mase"].parameters["common"]["mase_op"] + vp = node.meta["mase"]["hardware"].get("verilog_param") + if mase_op == "mx_int_patch_embed": + vp["DATA_IN_0_PARALLELISM_DIM_2"] = vp["DATA_IN_0_TENSOR_SIZE_DIM_2"] + del vp["CLS_TOKEN_TENSOR_SIZE_DIM_2"] + del vp["CLS_TOKEN_PARALLELISM_DIM_2"] + del vp["DISTILL_TOKEN_TENSOR_SIZE_DIM_2"] + del vp["DISTILL_TOKEN_PARALLELISM_DIM_2"] + for dim in ["CONV_WEIGHT_PARALLELISM", "CONV_WEIGHT_TENSOR_SIZE"]: + dim_0 = f"{dim}_DIM_0" + dim_1 = f"{dim}_DIM_1" + dim_2 = f"{dim}_DIM_2" + dim_3 = f"{dim}_DIM_3" + if dim_0 in vp and dim_1 in vp and dim_2 in vp and dim_3 in vp: + vp[dim_0] = vp[dim_0] * vp[dim_1] * vp[dim_2] + vp[dim_1] = vp[dim_3] + del vp[dim_2] + del vp[dim_3] + else: + raise ValueError(f"Cannot find {dim} in {vp}") + + + + + + + +if __name__ == "__main__": + test_emit_verilog_linear() + # _simulate() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_real_top.py b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_real_top.py new file mode 100644 index 000000000..006781103 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_real_top.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch 
+import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") +from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, + manually_update_hardware_parallelism_param, +) +from chop.models.vision.vit.vit import Attention + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + +sys.path.append(Path(__file__).resolve().parents[5].as_posix()) +logger = get_logger(__name__) +sys.excepthook = excepthook + + +from a_cx_mxint_quant.modules import MXIntPatchEmbed +from a_cx_mxint_quant import VIT_CUSTOM_OPS + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + + +from a_cx_mxint_quant import MXIntLinear, MXIntGELU + +class MXIntFoldedTop(torch.nn.Module): + def __init__(self, q_config): + super().__init__() + self.q_config = q_config + def forward(self, x): + return x + +class CustomModel(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, + img_size: int, + patch_size: int, + in_chans: int, + embed_dim: int, + q_config: dict = None, + norm_layer: nn.Module = nn.LayerNorm + ) -> None: + super().__init__() + self.MXIntPatchEmbed = MXIntPatchEmbed( + img_size, patch_size, in_chans, embed_dim, q_config, norm_layer + ) + self.folded_block = MXIntGELU(q_config=q_config["folded_block"]["config"]) + self.head = MXIntLinear( + embed_dim, 10, q_config=q_config["head"]["config"] + ) + def forward(self, x): + x = self.MXIntPatchEmbed(x) + x = self.folded_block(x) + x = self.head(x) + return x + +def get_parallelism(config, 
parallelism, mlp_parallelism): + quan_args = { + "by": "name", + "mxint_patch_embed": { + "config": { + "name": "mxint_hardware", + "data_in_width": config["data_width"], + "data_in_exponent_width": config["data_exponent_width"], + "data_in_parallelism": [3, 1, 1], + + "weight_width": config["weight_width"], + "weight_exponent_width": config["weight_exponent_width"], + "weight_parallelism": [parallelism, 3, 1, 1], + + "bias_width": config["bias_width"], + "bias_exponent_width": config["bias_exponent_width"], + "bias_parallelism": [1, parallelism], + + "data_out_width": config["data_width"], + "data_out_exponent_width": config["data_exponent_width"], + "data_out_parallelism": [1, parallelism], + } + }, + "folded_block": { + "config": { + "name": "mxint_hardware", + "data_in_width": config["data_width"], + "data_in_exponent_width": config["data_exponent_width"], + "data_in_parallelism": [1, parallelism], + + "hash_out_width": 5, + + "data_out_width": config["data_width"], + "data_out_exponent_width": config["data_exponent_width"], + "data_out_parallelism": [1, parallelism], + } + }, + "head": { + "config": { + "name": "mxint_hardware", + "data_in_width": config["data_width"], + "data_in_exponent_width": config["data_exponent_width"], + "data_in_parallelism": [1, parallelism], + + "weight_width": config["weight_width"], + "weight_exponent_width": config["weight_exponent_width"], + "weight_parallelism": [1, parallelism], + + "bias_width": config["bias_width"], + "bias_exponent_width": config["bias_exponent_width"], + "bias_parallelism": [1, 1], + + "data_out_width": config["data_width"], + "data_out_exponent_width": config["data_exponent_width"], + "data_out_parallelism": [1, 1], + } + }, + } + return quan_args + +@pytest.mark.dev +def test_emit_verilog_linear(): + import yaml + config_path = os.environ.get("CONFIG_PATH") + args = yaml.safe_load(open(config_path)) + config = args["config"] + parallelism = args["parallelism"] + mlp_parallelism = args["mlp_parallelism"] 
+ quan_args = get_parallelism(config, parallelism, mlp_parallelism) + + img_size = int(args["img_size"]) + patch_size = int(args["patch_size"]) + embed_dim = int(args["embed_dim"]) + in_chans = int(args["in_chans"]) + project_dir = Path(args["project_dir"]) + + + layer = CustomModel( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + q_config=quan_args, + ) + + mg = chop.MaseGraph(model=layer, custom_ops=VIT_CUSTOM_OPS) + mg.model.custom_ops = VIT_CUSTOM_OPS + # we have to have this batch size in advance + x = torch.randn((1, in_chans, img_size, img_size)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.graph.transforms.insert_fork_transform_pass(mg, quan_args) + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + model_args = {"vit_self_attention_integer": {}} + + from functools import partial + from utils import updating_hardware_metadata_pass + update_hardware_precision_param(mg, quan_args, model_args) + updating_hardware_metadata_pass(mg, { + "updating_funcs_list": [ + updating_for_patch_embed, + ], + }) + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + pass_args = { + "project_dir": Path(project_dir), + "real_top": True, + } + mg, _ = passes.emit_verilog_top_transform_pass(mg, pass_args) + mg, _ = passes.emit_bram_transform_pass(mg, pass_args) + mg, _ = passes.emit_internal_rtl_transform_pass(mg, pass_args) + with open(project_dir / "config.yaml", "w") as f: + yaml.dump(args, f) + # mg, _ = passes.emit_vivado_project_transform_pass(mg, pass_args) + + +def _simulate(): + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) + +def updating_for_patch_embed(node): + mase_op = node.meta["mase"].parameters["common"]["mase_op"] + vp = 
node.meta["mase"]["hardware"].get("verilog_param") + if mase_op == "mxint_patch_embed": + vp["DATA_IN_0_TENSOR_SIZE_DIM_2"] = 3 + vp["DATA_IN_0_PARALLELISM_DIM_2"] = 3 + del vp["DATA_IN_0_TENSOR_SIZE_DIM_3"] + del vp["DATA_IN_0_PARALLELISM_DIM_3"] + del vp["CLS_TOKEN_TENSOR_SIZE_DIM_2"] + del vp["CLS_TOKEN_PARALLELISM_DIM_2"] + del vp["DISTILL_TOKEN_TENSOR_SIZE_DIM_2"] + del vp["DISTILL_TOKEN_PARALLELISM_DIM_2"] + for dim in ["CONV_WEIGHT_PARALLELISM", "CONV_WEIGHT_TENSOR_SIZE"]: + dim_0 = f"{dim}_DIM_0" + dim_1 = f"{dim}_DIM_1" + dim_2 = f"{dim}_DIM_2" + dim_3 = f"{dim}_DIM_3" + if dim_0 in vp and dim_1 in vp and dim_2 in vp and dim_3 in vp: + vp[dim_0] = vp[dim_0] * vp[dim_1] * vp[dim_2] + vp[dim_1] = vp[dim_3] + del vp[dim_2] + del vp[dim_3] + else: + raise ValueError(f"Cannot find {dim} in {vp}") + vp["CLS_TOKEN_PRECISION_0"] = vp["DATA_IN_0_PRECISION_0"] + vp["CLS_TOKEN_PRECISION_1"] = vp["DATA_IN_0_PRECISION_1"] + vp["DISTILL_TOKEN_PRECISION_0"] = vp["DATA_IN_0_PRECISION_0"] + vp["DISTILL_TOKEN_PRECISION_1"] = vp["DATA_IN_0_PRECISION_1"] + + + + + + + +if __name__ == "__main__": + test_emit_verilog_linear() + # _simulate() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_attention.py b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_attention.py new file mode 100644 index 000000000..c3f0a61b9 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_attention.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") +from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, + 
manually_update_hardware_parallelism_param, +) +from chop.models.vision.vit.vit import Attention + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + +from chop.nn.quantized.modules.attention import _ViTAttentionBase +class ViTAttentionMxInt(_ViTAttentionBase): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + q_config: dict = None, + ) -> None: + super().__init__(dim, num_heads, qkv_bias, qk_norm, attn_drop, proj_drop) + self.q_config = q_config + +def vit_module_level_quantize(model, model_config, q_config): + from chop.passes.graph.utils import deepsetattr + for module in model.named_modules(): + if isinstance(module[1], Attention): + ori_module = module[1] + new_module = ViTAttentionMxInt( + model_config["dim"], + model_config["num_heads"], + qkv_bias=model_config["query_has_bias"], + q_config=q_config, + ) + logger.info(f"Replacing module: {module[0]}") + dim = ori_module.head_dim * ori_module.num_heads + + qkv_weight = ori_module.qkv.weight.reshape(3, dim, dim) + new_module.query.weight = nn.Parameter(qkv_weight[0]) + new_module.key.weight = nn.Parameter(qkv_weight[1]) + new_module.value.weight = nn.Parameter(qkv_weight[2]) + + has_bias = False if ori_module.qkv.bias == None else True + if has_bias: + qkv_bias = ori_module.qkv.bias.reshape(3, 1, dim) + new_module.query.bias = nn.Parameter(qkv_bias[0]) + new_module.key.bias = nn.Parameter(qkv_bias[1]) + new_module.value.bias = nn.Parameter(qkv_bias[2]) + + new_module.proj.weight = ori_module.proj.weight + new_module.proj.bias = ori_module.proj.bias + deepsetattr(model, module[0], new_module) + return model + +# -------------------------------------------------- +# 
Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + + +class CustomModel(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, dim, num_heads) -> None: + super().__init__() + + self.attention = Attention(dim, num_heads, qkv_bias=True) + + def forward(self, x): + x = self.attention(x) + return x + +attention_quant_config = { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 48], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [48, 48], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 48], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 48], +} + +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "user_defined_module": {"config": attention_quant_config}, +} +from mase_components import get_module_dependencies +VIT_CUSTOM_OPS = { + "modules": { + ViTAttentionMxInt: { + "args": { + "dim": "data_in", + "num_heads": "config", + "qkv_bias": "config", + "qk_norm": None, + "attn_drop": None, + "proj_drop": None, + "norm_layer": None, + "q_config": "config", + }, + "toolchain": "INTERNAL_RTL", + "module": "mxint_vit_attention_wrap", + "dependence_files": get_module_dependencies( + "linear_layers/mxint_operators/mxint_vit_attention_wrap" + ), + }, + }, +} +@pytest.mark.dev +def test_emit_verilog_linear(): + dim = 192 + num_heads = 3 + batch_size = 1 + n = 196 + model_config = { + "dim": dim, + "num_heads": num_heads, + "query_has_bias": True, + } + layer = CustomModel(dim, num_heads) + qlayer = vit_module_level_quantize(layer, model_config, attention_quant_config) + mg = chop.MaseGraph(model=qlayer, 
custom_ops=VIT_CUSTOM_OPS) + mg.model.custom_ops = VIT_CUSTOM_OPS + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, n, dim)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.quantize_transform_pass(mg, quan_args) + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + model_config.pop("dim") + model_args = {"vit_self_attention_integer": model_config} + update_hardware_precision_param(mg, quan_args, model_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + pass_args = { + "project_dir": Path("/scratch/cx922/mase/mxint_vit_attention"), + } + mg, _ = passes.emit_verilog_top_transform_pass(mg, pass_args) + mg, _ = passes.emit_bram_transform_pass(mg, pass_args) + mg, _ = passes.emit_internal_rtl_transform_pass(mg, pass_args) + # mg, _ = passes.emit_cocotb_transform_pass( + # mg, pass_args={"wait_time": 100, "wait_unit": "ms", "batch_size": batch_size} + # ) + mg, _ = passes.emit_vivado_project_transform_pass(mg, pass_args) + + +def _simulate(): + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) + + +if __name__ == "__main__": + test_emit_verilog_linear() + # _simulate() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_block.py b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_block.py new file mode 100644 index 000000000..e9d6c759d --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_block.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb 
+import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") +from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, + manually_update_hardware_parallelism_param, +) +from chop.models.vision.vit.vit import Attention + +from mxint_quant import ( + vit_module_level_quantize, + MXIntAddition, + VIT_CUSTOM_OPS +) +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) +logger = get_logger(__name__) +sys.excepthook = excepthook + +class CustomModel(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, dim, num_heads) -> None: + super().__init__() + + # self.linear = nn.Linear(dim, dim) + self.linear1 = nn.Linear(dim, 4*dim) + self.act = torch.nn.GELU() + self.linear2 = nn.Linear(4*dim, dim) + + self.add = MXIntAddition({}) + self.norm1 = torch.nn.LayerNorm(dim) + + self.attention = Attention(dim, num_heads, qkv_bias=True) + self.norm2 = torch.nn.LayerNorm(dim) + + def forward(self, x): + x1 = self.linear1(x) + x1 = self.act(x1) + x1 = self.linear2(x1) + x1 = self.norm1(x1) + + mlp = self.add(x1, x) + + attn = self.attention(mlp) + attn = self.norm2(attn) + result = self.add(attn, mlp) + return result + +parallelism = 16 +quan_args = { + "by": "type", + "layer_norm": { + "config": { + "name": "mxint_hardware", + "data_in_width": 8, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, parallelism], + + "weight_width": 6, + "weight_exponent_width": 8, + "weight_parallelism": [1, parallelism], + + "bias_width": 6, + "bias_exponent_width": 8, + "bias_parallelism": [1, parallelism], + + "data_out_width": 8, + 
"data_out_exponent_width": 8, + "data_out_parallelism": [1, parallelism], + } + }, + "gelu": { + "config": { + "name": "mxint_hardware", + "data_in_width": 8, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, parallelism], + + "weight_width": 6, + "weight_exponent_width": 8, + "weight_parallelism": [parallelism, parallelism], + + "bias_width": 6, + "bias_exponent_width": 8, + "bias_parallelism": [1, parallelism], + + "data_out_width": 8, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, parallelism], + }, + }, + "linear": { + "config": { + "name": "mxint_hardware", + "data_in_width": 8, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, parallelism], + + "weight_width": 6, + "weight_exponent_width": 8, + "weight_parallelism": [parallelism, parallelism], + + "bias_width": 6, + "bias_exponent_width": 8, + "bias_parallelism": [1, parallelism], + + "data_out_width": 8, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, parallelism], + }, + }, + "attention": { + "config": { + "name": "mxint_hardware", + "data_in_width": 8, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, parallelism], + + "weight_width": 6, + "weight_exponent_width": 8, + "weight_parallelism": [parallelism, parallelism], + + "bias_width": 6, + "bias_exponent_width": 8, + "bias_parallelism": [1, parallelism], + + "data_out_width": 8, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, parallelism], + } + }, + "fork2": { + "config": { + "name": "mxint_hardware", + "data_in_width": 8, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, parallelism], + "data_out_width": 8, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, parallelism], + } + }, + "add": { + "config": { + "name": "mxint_hardware", + "data_in_width": 8, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, parallelism], + "data_out_width": 8, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, parallelism], + } + }, +} + +@pytest.mark.dev +def 
test_emit_verilog_linear(): + dim = 192 + num_heads = 3 + batch_size = 1 + n = 196 + model_config = { + "dim": dim, + "num_heads": num_heads, + "query_has_bias": True, + } + layer = CustomModel(dim, num_heads) + qlayer = vit_module_level_quantize(layer, model_config, quan_args) + mg = chop.MaseGraph(model=qlayer, custom_ops=VIT_CUSTOM_OPS) + mg.model.custom_ops = VIT_CUSTOM_OPS + # torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, n, dim)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.graph.transforms.insert_fork_transform_pass(mg, quan_args) + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + model_config.pop("dim") + model_args = {"vit_self_attention_integer": model_config} + update_hardware_precision_param(mg, quan_args, model_args) + from utils import updating_hardware_metadata_pass + updating_hardware_metadata_pass(mg, {"updating_funcs_list": [del_layernorm_args]}) + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + pass_args = { + "project_dir": Path("mxint_vit_block"), + } + mg, _ = passes.emit_verilog_top_transform_pass(mg, pass_args) + mg, _ = passes.emit_bram_transform_pass(mg, pass_args) + mg, _ = passes.emit_internal_rtl_transform_pass(mg, pass_args) + mg, _ = passes.emit_vivado_project_transform_pass(mg, pass_args) + + +def del_layernorm_args(node): + print(node.name) + if "layernorm" in node.name: + del node.meta["mase"].parameters["hardware"]["verilog_param"]["NORMALIZED_SHAPE"] + del node.meta["mase"].parameters["hardware"]["verilog_param"]["EPS"] + + +def _simulate(): + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) + + +if __name__ == "__main__": + test_emit_verilog_linear() + # 
_simulate() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_folded_top.py b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_folded_top.py new file mode 100644 index 000000000..56e066070 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_mxint_vit_folded_top.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger + +set_logging_verbosity("debug") +from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, + manually_update_hardware_parallelism_param, +) +from chop.models.vision.vit.vit import Attention + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + +from chop.nn.quantized.modules.attention import _ViTAttentionBase + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + +from a_cx_mxint_quant import MXIntAddition, vit_module_level_quantize, VIT_CUSTOM_OPS + +class MxIntBlock(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, dim, num_heads) -> None: + super().__init__() + + self.linear1 = nn.Linear(dim, 4*dim) + self.act = torch.nn.GELU() + self.linear2 = nn.Linear(4*dim, dim) + + self.add = MXIntAddition({}) + self.norm1 = torch.nn.LayerNorm(dim) + + self.attention = Attention(dim, num_heads, qkv_bias=True) + 
self.norm2 = torch.nn.LayerNorm(dim) + + def forward(self, x): + x1 = self.linear1(x) + x1 = self.act(x1) + x1 = self.linear2(x1) + x1 = self.norm1(x1) + + mlp = self.add(x1, x) + + attn = self.attention(mlp) + attn = self.norm2(attn) + result = self.add(attn, mlp) + return result + +class MxIntStreamBlocks(torch.nn.Module): + + def __init__(self, dim, num_heads, stream_depth) -> None: + super().__init__() + + self.stream_depth = stream_depth + self.stream_blocks = nn.Sequential( + *[ + MxIntBlock( + dim=dim, + num_heads=num_heads, + ) + for i in range(stream_depth) + ] + ) + def forward(self, x): + return self.stream_blocks(x) + +class MxIntFoldedTopBlocks(torch.nn.Module): + + def __init__(self, dim, num_heads, stream_depth, folded_depth) -> None: + super().__init__() + + self.folded_depth = folded_depth + self.folded_blocks = nn.Sequential( + *[ + MxIntStreamBlocks( + dim=dim, + num_heads=num_heads, + stream_depth=stream_depth + ) + for i in range(folded_depth) + ] + ) + def forward(self, x): + return self.folded_blocks(x) + +quan_args = { + "by": "type", + "layer_norm": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 32], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [1, 32], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 32], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 32], + } + }, + "gelu": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 32], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [32, 32], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 32], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 32], + }, + }, + "linear": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + 
"data_in_exponent_width": 8, + "data_in_parallelism": [1, 32], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [32, 32], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 32], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 32], + }, + }, + "user_defined_module": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 32], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [32, 32], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 32], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 32], + } + }, + "fork2": { + "config": { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 32], + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 32], + } + }, +} +attention_quant_config = { + "name": "mxint_hardware", + "data_in_width": 4, + "data_in_exponent_width": 8, + "data_in_parallelism": [1, 32], + + "weight_width": 4, + "weight_exponent_width": 8, + "weight_parallelism": [32, 32], + + "bias_width": 4, + "bias_exponent_width": 8, + "bias_parallelism": [1, 32], + + "data_out_width": 4, + "data_out_exponent_width": 8, + "data_out_parallelism": [1, 32], +} + + +def graph_generation(model, pass_args): + model_config = pass_args["model_config"] + quan_args = pass_args["quan_args"] + + batch_size = pass_args["model_config"]["batch_size"] + dim = pass_args["model_config"]["dim"] + n = pass_args["model_config"]["n"] + x = torch.randn((batch_size, n, dim)) + dummy_in = {"x": x} + + qmodel = vit_module_level_quantize(model, model_config, quan_args) + mg = chop.MaseGraph(model=qmodel, custom_ops=VIT_CUSTOM_OPS) + mg.model.custom_ops = VIT_CUSTOM_OPS + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = 
passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.graph.transforms.insert_fork_transform_pass(mg, quan_args) + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + model_args = {"vit_self_attention_integer": model_config} + update_hardware_precision_param(mg, quan_args, model_args) + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + return mg, _ + + +@pytest.mark.dev +def test_emit_verilog_linear(): + batch_size = 1 + n = 196 + dim = 192 + num_heads = 3 + # notice: + stream_depth = 2 + folded_depth = 6 + + model_config = { + "batch_size": batch_size, + "n": n, + "dim": dim, + "num_heads": num_heads, + } + + stream_layer = MxIntStreamBlocks(dim, num_heads, stream_depth=stream_depth) + top_layer = MxIntFoldedTopBlocks(dim, num_heads, stream_depth=stream_depth, folded_depth=folded_depth) + stream_mg, _ = graph_generation(stream_layer, {"model_config": model_config, "quan_args": quan_args}) + top_mg, _ = graph_generation(top_layer, {"model_config": model_config, "quan_args": quan_args}) + + + pass_args = { + "project_dir": Path("/scratch/cx922/mase/mxint_vit_folded_top"), + } + + from utils_mxint_folded_top_generation import mxint_folded_top_generation + mxint_folded_top_generation( + top_mg, + pass_args={ + "stream_graph": stream_mg, + "stream_name": "stream_blocks", + "folded_name": "folded_blocks", + "reuse_times": folded_depth, + "project_dir": Path("/scratch/cx922/mase/mxint_vit_folded_top") + } + ) + # top_mg, _ = passes.emit_bram_transform_pass(top_mg, pass_args) + top_mg, _ = passes.emit_internal_rtl_transform_pass(top_mg, pass_args) + top_mg, _ = passes.emit_vivado_project_transform_pass(top_mg, pass_args) + + # mg, _ = passes.emit_cocotb_transform_pass( + # mg, pass_args={"wait_time": 100, "wait_unit": "us", "batch_size": batch_size} + # ) + top_mg, _ = passes.emit_vivado_project_transform_pass(top_mg) 
+def _simulate(): + simulate( + skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + ) + + +if __name__ == "__main__": + test_emit_verilog_linear() + # _simulate() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_vit.py b/test/passes/graph/transforms/verilog/test_emit_verilog_vit.py new file mode 100644 index 000000000..a69268e1e --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_vit.py @@ -0,0 +1,348 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger +from test_emit_verilog_layernorm import ( + update_common_metadata_pass, + update_hardware_precision_param, +) + +set_logging_verbosity("debug") + + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook + + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) +from chop.models.vision.vit.vit import Attention + +from chop.nn.quantized.modules.attention import ViTAttentionInteger +from mase_components import get_module_dependencies + +VIT_CUSTOM_OPS = {"modules": {ViTAttentionInteger: {}}} + + +class MLP(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, in_features, hidden_features) -> None: + super().__init__() + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + self.act = nn.GELU() + self.fc2 = 
nn.Linear(hidden_features, in_features, bias=True) + + def forward(self, x): + x = self.fc2(self.act(self.fc1(x))) + return x + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_drop: float = 0.0, + attn_drop: float = 0.0, + drop_path: float = 0.0, + act_layer: nn.Module = nn.GELU, + norm_layer: nn.Module = nn.LayerNorm, + mlp_layer: nn.Module = MLP, + ) -> None: + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=proj_drop, + ) + + self.norm2 = norm_layer(dim) + self.mlp = mlp_layer( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # x = x + self.attn(self.norm1(x)) + x = x + self.mlp(self.norm2(x)) + # x = self.attn(x) + return x + + +class ViTAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_drop: float = 0.0, + attn_drop: float = 0.0, + depth: int = 12, + drop_path: float = 0.0, + act_layer: nn.Module = nn.GELU, + norm_layer: nn.Module = nn.LayerNorm, + mlp_layer: nn.Module = MLP, + ) -> None: + super().__init__() + + self.blocks = nn.Sequential( + *[ + Block( + dim=dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_drop=proj_drop, + attn_drop=attn_drop, + drop_path=drop_path, + norm_layer=norm_layer, + act_layer=act_layer, + ) + for i in range(depth) + ] + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.blocks(x) + return x + + +from chop.passes.graph.utils import deepsetattr + + +def vit_module_level_quantize(model, model_config, q_config): + for module in model.named_modules(): + if isinstance(module[1], Attention): + ori_module = module[1] + new_module = ViTAttentionInteger( + model_config["dim"], + model_config["num_heads"], + 
qkv_bias=model_config["query_has_bias"], + q_config=q_config, + ) + logger.info(f"Replacing module: {module[0]}") + dim = ori_module.head_dim * ori_module.num_heads + + qkv_weight = ori_module.qkv.weight.reshape(3, dim, dim) + new_module.query.weight = nn.Parameter(qkv_weight[0]) + new_module.key.weight = nn.Parameter(qkv_weight[1]) + new_module.value.weight = nn.Parameter(qkv_weight[2]) + + has_bias = False if ori_module.qkv.bias == None else True + if has_bias: + qkv_bias = ori_module.qkv.bias.reshape(3, 1, dim) + new_module.query.bias = nn.Parameter(qkv_bias[0]) + new_module.key.bias = nn.Parameter(qkv_bias[1]) + new_module.value.bias = nn.Parameter(qkv_bias[2]) + + new_module.proj.weight = ori_module.proj.weight + new_module.proj.bias = ori_module.proj.bias + deepsetattr(model, module[0], new_module) + return model + + +attention_quant_config = { + "name": "integer_floor", + "data_in_width": 8, + "data_in_frac_width": 4, + "qkv_weight_width": 6, + "qkv_weight_frac_width": 4, + "qkv_bias_width": 6, + "qkv_bias_frac_width": 4, + "qkv_width": 8, + "qkv_frac_width": 4, + "qkmm_out_width": 8, + "qkmm_out_frac_width": 5, + "softmax_exp_width": 8, + "softmax_exp_frac_width": 3, + "softmax_out_frac_width": 7, + "svmm_out_width": 8, + "svmm_out_frac_width": 4, + "proj_weight_width": 6, + "proj_weight_frac_width": 4, + "proj_bias_width": 8, + "proj_bias_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, +} + +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "fork2": [8, 4], + "linear": { + "config": { + "name": "integer_floor", # quantization scheme name supported are ["integer", "fixed" (equivalent to integer), "lutnet" (dev mode), "logicnets" (dev mode), "binary", "binary_residual", "ternary", "minifloat_ieee", "minifloat_denorm", "log", "block_fp", "block_minifloat", "block_log"] + # data + 
"data_in_width": 8, + "data_in_frac_width": 4, + # weight + "weight_width": 10, + "weight_frac_width": 3, + # bias + "bias_width": 5, + "bias_frac_width": 2, + # optional + "data_out_width": 8, + "data_out_frac_width": 4, + }, + }, + "gelu": { + "config": { + "name": "integer_floor", + # data + "data_in_width": 8, + "data_in_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, + } + }, + "layer_norm": { + "config": { + "name": "integer_floor", + # data + "data_in_width": 8, + "data_in_frac_width": 4, + "weight_width": 8, + "weight_frac_width": 4, + "bias_width": 8, + "bias_frac_width": 4, + "isqrt_in_width": 8, + "isqrt_in_frac_width": 3, + "isqrt_out_width": 8, + "isqrt_out_frac_width": 7, + "data_out_width": 8, + "data_out_frac_width": 4, + "bypass": False, + "noparse": True, + } + }, + "add": { + "config": { + "name": "integer_floor", + # data + "data_in_width": 8, + "data_in_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, + }, + }, + "vit_self_attention_integer": {"config": attention_quant_config}, +} + + +@pytest.mark.dev +def test_emit_verilog_vit(): + # vit_tiny dim 192, n 196, num_heads = 3 + # + dim = 192 + num_heads = 3 + batch_size = 1 + n = 196 + layer = ViTAttention(dim, num_heads, mlp_ratio=4, qkv_bias=True, depth=1) + model_config_for_quantize = { + "dim": dim, + "num_heads": num_heads, + "query_has_bias": True, + } + model_args_for_hardware_param = { + "vit_self_attention_integer": { + "num_heads": num_heads, + "query_has_bias": True, + } + } + qlayer = vit_module_level_quantize( + layer, model_config_for_quantize, attention_quant_config + ) + mg = chop.MaseGraph(model=qlayer) + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, n, dim)) + dummy_in = {"x": x} + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.quantize_transform_pass(mg, quan_args) + 
mg, _ = passes.graph.transforms.insert_fork_transform_pass(mg, quan_args) + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [1] * 4} + ) + update_hardware_precision_param(mg, quan_args, model_args_for_hardware_param) + mg, _ = passes.report_node_hardware_type_analysis_pass(mg) # pretty print + + px = 2 + pqkv = 32 + p_proj = px + p_w1 = 64 + p_w2 = px + + pqkv = pqkv * num_heads + + from utils import manually_update_hardware_parallelism_param + + manually_update_hardware_parallelism_param( + mg, + pass_args={ + "fork2": {"din": [1, px], "dout": ([1, px], [1, px])}, + "blocks_0_norm1": {"din": [1, px], "dout": [1, px]}, + "blocks_0_attn": {"din": [1, px], "dattn": [1, pqkv], "dout": [1, p_proj]}, + "fifo": {"din": [1, px], "dout": [1, px]}, + "add": {"din": ([1, px], [1, px]), "dout": [1, px]}, + "fork2_1": {"din": [1, px], "dout": ([1, px], [1, px])}, + "blocks_0_norm2": {"din": [1, px], "dout": [1, px]}, + "blocks_0_mlp_fc1": {"din": [1, px], "dout": [1, p_w1]}, + "blocks_0_mlp_act": {"din": [1, p_w1], "dout": [1, p_w1]}, + "blocks_0_mlp_fc2": {"din": [1, p_w1], "dout": [1, px]}, + "fifo_1": {"din": [1, px], "dout": [1, px]}, + "add_1": {"din": ([1, px], [1, px]), "dout": [1, px]}, + }, + ) + mg, _ = passes.emit_verilog_top_transform_pass(mg) + mg, _ = passes.emit_bram_transform_pass(mg) + mg, _ = passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_cocotb_transform_pass( + mg, pass_args={"wait_time": 100, "wait_units": "ms", "batch_size": batch_size} + ) + mg, _ = passes.emit_vivado_project_transform_pass(mg) + + # simulate( + # skip_build=False, skip_test=False, simulator="questa", waves=True, gui=False + # ) + + +if __name__ == "__main__": + test_emit_verilog_vit() diff --git a/test/passes/graph/transforms/verilog/test_emit_verilog_vit_attention.py b/test/passes/graph/transforms/verilog/test_emit_verilog_vit_attention.py new file mode 100644 index 
000000000..e0dbb15f0 --- /dev/null +++ b/test/passes/graph/transforms/verilog/test_emit_verilog_vit_attention.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +# This example converts a simple MLP model to Verilog +import os, sys, logging, traceback, pdb +import pytest +import toml + +import torch +import torch.nn as nn + +import chop as chop +import chop.passes as passes + +from pathlib import Path + +from chop.actions import simulate +from chop.tools.logger import set_logging_verbosity +from chop.tools import get_logger +from utils import ( + update_common_metadata_pass, + update_hardware_precision_param, +) +from chop.nn.quantized.modules.attention import ViTAttentionInteger +from mase_components import get_module_dependencies +from chop.models.vision.vit.vit import Attention + +set_logging_verbosity("debug") + + +def excepthook(exc_type, exc_value, exc_traceback): + traceback.print_exception(exc_type, exc_value, exc_traceback) + print("\nEntering debugger...") + pdb.post_mortem(exc_traceback) + + +logger = get_logger(__name__) +sys.excepthook = excepthook +VIT_CUSTOM_OPS = {"modules": {ViTAttentionInteger: {}}} + +# -------------------------------------------------- +# Model specifications +# prefer small models for fast test +# -------------------------------------------------- +# verified test case linear(2,4) + + +class Layer(torch.nn.Module): + """ + Toy quantized FC model for digit recognition on MNIST + """ + + def __init__(self, dim, num_heads) -> None: + super().__init__() + self.attention = Attention(dim, num_heads, qkv_bias=True) + + def forward(self, x): + x = self.attention(x) + return x + + +attention_quant_config = { + "name": "integer_floor", + "data_in_width": 8, + "data_in_frac_width": 4, + "qkv_weight_width": 6, + "qkv_weight_frac_width": 4, + "qkv_bias_width": 6, + "qkv_bias_frac_width": 4, + "qkv_width": 8, + "qkv_frac_width": 4, + "qkmm_out_width": 8, + "qkmm_out_frac_width": 5, + "softmax_exp_width": 8, + "softmax_exp_frac_width": 3, + 
"softmax_out_frac_width": 7, + "svmm_out_width": 8, + "svmm_out_frac_width": 4, + "proj_weight_width": 6, + "proj_weight_frac_width": 4, + "proj_bias_width": 8, + "proj_bias_frac_width": 4, + "data_out_width": 8, + "data_out_frac_width": 4, +} + +quan_args = { + "by": "type", # quantize by type, name, or regex_name + "default": { + "config": {"name": None} + }, # default config, this would be used for any node that does not have a specific config + "vit_self_attention_integer": {"config": attention_quant_config}, +} + + +from chop.passes.graph.utils import deepsetattr + + +def vit_module_level_quantize(model, model_config, q_config): + for module in model.named_modules(): + if isinstance(module[1], Attention): + ori_module = module[1] + new_module = ViTAttentionInteger( + model_config["dim"], + model_config["num_heads"], + qkv_bias=model_config["query_has_bias"], + q_config=q_config, + ) + logger.info(f"Replacing module: {module[0]}") + dim = ori_module.head_dim * ori_module.num_heads + + qkv_weight = ori_module.qkv.weight.reshape(3, dim, dim) + new_module.query.weight = nn.Parameter(qkv_weight[0]) + new_module.key.weight = nn.Parameter(qkv_weight[1]) + new_module.value.weight = nn.Parameter(qkv_weight[2]) + + has_bias = False if ori_module.qkv.bias == None else True + if has_bias: + qkv_bias = ori_module.qkv.bias.reshape(3, 1, dim) + new_module.query.bias = nn.Parameter(qkv_bias[0]) + new_module.key.bias = nn.Parameter(qkv_bias[1]) + new_module.value.bias = nn.Parameter(qkv_bias[2]) + + new_module.proj.weight = ori_module.proj.weight + new_module.proj.bias = ori_module.proj.bias + deepsetattr(model, module[0], new_module) + return model + + +@pytest.mark.dev +def test_emit_verilog_vit_attention(): + dim = 12 + num_heads = 3 + batch_size = 1 + n = 10 + model_config = { + "dim": dim, + "num_heads": num_heads, + "query_has_bias": True, + } + layer = Layer(dim, num_heads) + qlayer = vit_module_level_quantize(layer, model_config, attention_quant_config) + mg = 
chop.MaseGraph(model=qlayer, custom_ops=VIT_CUSTOM_OPS) + torch.manual_seed(0) + # Provide a dummy input for the graph so it can use for tracing + x = torch.randn((batch_size, n, dim)) + dummy_in = {"x": x} + + mg, _ = passes.init_metadata_analysis_pass(mg, None) + mg, _ = passes.add_common_metadata_analysis_pass(mg, {"dummy_in": dummy_in}) + + mg, _ = passes.quantize_transform_pass(mg, quan_args) + update_common_metadata_pass(mg, quan_args) + mg, _ = passes.add_hardware_metadata_analysis_pass( + mg, pass_args={"max_parallelism": [2] * 4} + ) + model_config.pop("dim") + model_args = {"vit_self_attention_integer": model_config} + update_hardware_precision_param(mg, quan_args, model_args) + # TODO: + # Currently, the common metadata pass does not support extracting extra arguments except data in. + # So we need to have it directly after adding hardware metadata + mg, _ = passes.report_node_hardware_type_analysis_pass( + mg, + pass_args={ + "which": ["common", "hardware"], + "save_path": "graph_meta_params.txt", + }, + ) # pretty print + mg, _ = passes.emit_verilog_top_transform_pass(mg) + mg, _ = passes.emit_bram_transform_pass(mg) + mg, _ = passes.emit_internal_rtl_transform_pass(mg) + mg, _ = passes.emit_cocotb_transform_pass( + mg, pass_args={"wait_time": 10, "wait_units": "ms", "batch_size": batch_size} + ) + mg, _ = passes.emit_vivado_project_transform_pass(mg) + + simulate( + skip_build=False, skip_test=False, simulator="questa", trace_depth=3, waves=True + ) + + +if __name__ == "__main__": + test_emit_verilog_vit_attention() diff --git a/test/passes/graph/transforms/verilog/utils.py b/test/passes/graph/transforms/verilog/utils.py new file mode 100644 index 000000000..7c9176409 --- /dev/null +++ b/test/passes/graph/transforms/verilog/utils.py @@ -0,0 +1,287 @@ +import chop as chop + + +from chop.tools.logger import set_logging_verbosity +from chop.passes.graph.transforms.quantize import QUANTIZEABLE_OP + +set_logging_verbosity("debug") + + +def _cap(name): + 
""" + capitalize a string + """ + return str(name).upper() + + +def parse_arg(arg): + if "data_in" in arg: + new_arg = "data_in" + else: + new_arg = arg + return new_arg + + +def parse_q_config(node, mase_op, q_config): + if q_config.get("by") == "name": + return q_config.get(node.name)["config"] + elif q_config.get("by") == "type": + if mase_op == "user_defined_module": + if "add" in node.name: + return q_config.get("add")["config"] + elif "attention" in node.name: + return q_config.get("attention")["config"] + elif "gelu" in node.name or "act" in node.name: + return q_config.get("gelu")["config"] + return q_config.get(mase_op)["config"] + +def update_common_metadata_pass(mg, quan_args): + # There is a bug in the current quantization pass, where the results metadata is not updated with the precision. + # # Here we update the metadata here so we can test the hardware back end. + # update precision + for node in mg.fx_graph.nodes: + mase_op = node.meta["mase"].parameters["common"]["mase_op"] + if mase_op not in QUANTIZEABLE_OP + ("user_defined_module","fork2"): + print(mase_op) + continue + if mase_op == "user_defined_module": + if "mx_int_patch_embed" in node.name: + node.meta["mase"].parameters["common"]["mase_op"] = "mx_int_patch_embed" + mase_op = "mx_int_patch_embed" + # elif "act" in node.name or "gelu" in node.name: + # node.meta["mase"].parameters["common"]["mase_op"] = "gelu" + # mase_op = "gelu" + # elif "add" in node.name: + # node.meta["mase"].parameters["common"]["mase_op"] = "add" + # mase_op = "add" + # elif "attention" in node.name: + # node.meta["mase"].parameters["common"]["mase_op"] = "attention" + # mase_op = "attention" + node_quan_config = parse_q_config(node, mase_op, quan_args) + for arg, _ in node.meta["mase"].parameters["common"]["args"].items(): + if ( + type(node.meta["mase"].parameters["common"]["args"][arg]) == dict + and "type" in node.meta["mase"].parameters["common"]["args"][arg].keys() + ): + if node_quan_config["name"] == 
"mxint_hardware": + # if mase_op == "user_defined_module": + if "weight" in arg: + parallelism = node_quan_config["weight_parallelism"] + precision = node_quan_config["weight_width"], node_quan_config["weight_exponent_width"] + elif "data_in" in arg: + parallelism = node_quan_config["data_in_parallelism"] + precision = node_quan_config["data_in_width"], node_quan_config["data_in_exponent_width"] + else: + parallelism = node_quan_config["bias_parallelism"] + precision = node_quan_config["bias_width"], node_quan_config["bias_exponent_width"] + node.meta["mase"].parameters["common"]["args"][arg][ + "type" + ] = "mxint_hardware" + node.meta["mase"].parameters["common"]["args"][arg][ + "precision" + ] = precision + node.meta["mase"].parameters["common"]["args"][arg][ + "parallelism"] = parallelism + # else: + # node.meta["mase"].parameters["common"]["args"][arg][ + # "parallelism" + # ] = node_quan_config[parse_arg(arg) + "_parallelism"] + else: + node.meta["mase"].parameters["common"]["args"][arg][ + "type" + ] = "fixed" + for result, _ in node.meta["mase"].parameters["common"]["results"].items(): + if ( + type(node.meta["mase"].parameters["common"]["results"][result]) == dict + and "type" + in node.meta["mase"].parameters["common"]["results"][result].keys() + ): + if node_quan_config["name"] == "mxint_hardware": + node.meta["mase"].parameters["common"]["results"][result][ + "type" + ] = "mxint_hardware" + node.meta["mase"].parameters["common"]["results"][result][ + "precision" + ] = [ + node_quan_config["data_out_width"], + node_quan_config["data_out_exponent_width"], + ] + node.meta["mase"].parameters["common"]["results"][result][ + "parallelism" + ] = node_quan_config["data_out_parallelism"] + else: + node.meta["mase"].parameters["common"]["results"][result][ + "type" + ] = "fixed" + node.meta["mase"].parameters["common"]["results"][result][ + "precision" + ] = [ + node_quan_config["data_out_width"], + node_quan_config["data_out_frac_width"], + ] + 
node.meta["mase"].parameters["common"]["quant_type"] = node_quan_config["name"] + # update parameters + for node in mg.fx_graph.nodes: + mase_op = node.meta["mase"].parameters["common"]["mase_op"] + if mase_op in ["layer_norm"]: + if node.meta["mase"].parameters["common"]["args"].get("weight") != None: + node.meta["mase"].parameters["common"]["args"][ + "elementwise_affine" + ] = True + if node.meta["mase"].parameters["common"]["args"].get("bias") != None: + node.meta["mase"].parameters["common"]["args"]["has_bias"] = True + + +def manually_update_hardware_parallelism_param(graph, pass_args: dict = {}): + # The quantization pass currently don't support any inlayer precision automatically generate + # we only have data_in, weight.. param in common metadata + # in order to support in layer fine grained precision tuning + # we just update the hardware metadata directly. + for node in list(graph.fx_graph.nodes) + graph.nodes_in + graph.nodes_out: + mase_op = node.meta["mase"].parameters["common"]["mase_op"] + vp = node.meta["mase"]["hardware"].get("verilog_param") + if vp == None: + continue + for key, value in pass_args.items(): + if key in node.name: + if mase_op == "linear": + # weight1 = in0 + vp["DATA_IN_0_PARALLELISM_DIM_0"] = value["din"][1] + vp["DATA_IN_0_PARALLELISM_DIM_1"] = value["din"][0] + vp["WEIGHT_PARALLELISM_DIM_0"] = value["din"][1] + vp["WEIGHT_PARALLELISM_DIM_1"] = value["dout"][1] + vp["BIAS_PARALLELISM_DIM_0"] = value["dout"][1] + vp["BIAS_PARALLELISM_DIM_1"] = 1 + vp["DATA_OUT_0_PARALLELISM_DIM_0"] = value["dout"][1] + vp["DATA_OUT_0_PARALLELISM_DIM_1"] = value["dout"][0] + elif mase_op == "fork2": + vp["DATA_IN_0_PARALLELISM_DIM_0"] = value["din"][1] + vp["DATA_IN_0_PARALLELISM_DIM_1"] = value["din"][0] + vp["DATA_OUT_0_PARALLELISM_DIM_0"] = value["dout"][0][1] + vp["DATA_OUT_0_PARALLELISM_DIM_1"] = value["dout"][0][0] + vp["DATA_OUT_1_PARALLELISM_DIM_0"] = value["dout"][1][1] + vp["DATA_OUT_1_PARALLELISM_DIM_1"] = value["dout"][1][0] + elif 
mase_op == "add": + vp["DATA_IN_0_PARALLELISM_DIM_0"] = value["din"][0][1] + vp["DATA_IN_0_PARALLELISM_DIM_1"] = value["din"][0][0] + vp["DATA_IN_1_PARALLELISM_DIM_0"] = value["din"][1][1] + vp["DATA_IN_1_PARALLELISM_DIM_1"] = value["din"][1][0] + vp["DATA_OUT_0_PARALLELISM_DIM_0"] = value["dout"][1] + vp["DATA_OUT_0_PARALLELISM_DIM_1"] = value["dout"][0] + elif mase_op == "vit_self_attention_integer": + num_heads = vp["NUM_HEADS"] + vp["DATA_IN_0_PARALLELISM_DIM_0"] = value["din"][1] + vp["DATA_IN_0_PARALLELISM_DIM_1"] = value["din"][0] + vp["QUERY_WEIGHT_PARALLELISM_DIM_0"] = ( + value["dattn"][1] // num_heads + ) + vp["QUERY_WEIGHT_PARALLELISM_DIM_1"] = value["din"][1] + vp["QUERY_BIAS_PARALLELISM_DIM_0"] = value["dattn"][1] // num_heads + vp["QUERY_BIAS_PARALLELISM_DIM_1"] = 1 + vp["KEY_WEIGHT_PARALLELISM_DIM_0"] = value["dattn"][1] // num_heads + vp["KEY_WEIGHT_PARALLELISM_DIM_1"] = value["din"][1] + vp["KEY_BIAS_PARALLELISM_DIM_0"] = value["dattn"][1] // num_heads + vp["KEY_BIAS_PARALLELISM_DIM_1"] = 1 + vp["VALUE_WEIGHT_PARALLELISM_DIM_0"] = ( + value["dattn"][1] // num_heads + ) + vp["VALUE_WEIGHT_PARALLELISM_DIM_1"] = value["din"][1] + vp["VALUE_BIAS_PARALLELISM_DIM_0"] = value["dattn"][1] // num_heads + vp["VALUE_BIAS_PARALLELISM_DIM_1"] = 1 + vp["PROJ_WEIGHT_PARALLELISM_DIM_0"] = value["dout"][1] + vp["PROJ_WEIGHT_PARALLELISM_DIM_1"] = value["dattn"][1] // num_heads + vp["PROJ_BIAS_PARALLELISM_DIM_0"] = value["dout"][1] + vp["PROJ_BIAS_PARALLELISM_DIM_1"] = 1 + vp["DATA_OUT_0_PARALLELISM_DIM_0"] = value["dout"][1] + vp["DATA_OUT_0_PARALLELISM_DIM_1"] = value["dout"][0] + elif mase_op == "layer_norm": + vp["DATA_IN_0_PARALLELISM_DIM_0"] = value["din"][1] + vp["DATA_IN_0_PARALLELISM_DIM_1"] = value["din"][0] + vp["WEIGHT_PARALLELISM_DIM_0"] = value["dout"][1] + vp["WEIGHT_PARALLELISM_DIM_1"] = 1 + vp["BIAS_PARALLELISM_DIM_0"] = value["dout"][1] + vp["BIAS_PARALLELISM_DIM_1"] = 1 + vp["DATA_OUT_0_PARALLELISM_DIM_0"] = value["dout"][1] + 
vp["DATA_OUT_0_PARALLELISM_DIM_1"] = value["dout"][0] + else: + vp["DATA_IN_0_PARALLELISM_DIM_0"] = value["din"][1] + vp["DATA_IN_0_PARALLELISM_DIM_1"] = value["din"][0] + vp["DATA_OUT_0_PARALLELISM_DIM_0"] = value["dout"][1] + vp["DATA_OUT_0_PARALLELISM_DIM_1"] = value["dout"][0] + + return graph, {} + + +def update_hardware_precision_param(mg, quan_args, model_args: dict = {}): + # The quantization pass currently don't support any inlayer precision automatically generate + # we only have data_in, weight.. param in common metadata + # in order to support in layer fine grained precision tuning + # we just update the hardware metadata directly. + def _cap(name): + """ + capitalize a string + """ + return str(name).upper() + + for node in mg.fx_graph.nodes: + mase_op = node.meta["mase"].parameters["common"]["mase_op"] + vp = node.meta["mase"]["hardware"].get("verilog_param") + if vp == None: + continue + delete_dim_of_batch_size(vp, node_name=node.name) + if mase_op not in (QUANTIZEABLE_OP + ("vit_self_attention_integer","user_defined_module")): + continue + node_quan_args = parse_q_config(node, mase_op, quan_args) + node_model_args = model_args.get(mase_op) + if mase_op in ["vit_self_attention_integer", "layer_norm"]: + for arg_name, arg_info in node_quan_args.items(): + _list = ["data_in", "data_out", "weight", "bias"] + if any( + keyword in arg_name + for keyword in ["data_in", "data_out", "weight", "bias"] + ): + continue + if "width" not in arg_name: + continue + cofig_str = arg_name.replace("frac_width", "precision_1") + cofig_str = cofig_str.replace("width", "precision_0") + vp[_cap(cofig_str)] = arg_info + if node_model_args == None: + continue + for arg_name, arg_info in node_model_args.items(): + if type(arg_info) == bool: + vp[_cap(arg_name)] = 1 if arg_info else 0 + else: + vp[_cap(arg_name)] = arg_info + if mase_op == "mx_int_patch_embed": + for arg_name, arg_info in node_quan_args.items(): + if "width" not in arg_name: + continue + cofig_str = 
arg_name.replace("frac_width", "precision_1") + cofig_str = cofig_str.replace("width", "precision_0") + vp[_cap(cofig_str)] = arg_info + if node_model_args == None: + continue + for arg_name, arg_info in node_model_args.items(): + if type(arg_info) == bool: + vp[_cap(arg_name)] = 1 if arg_info else 0 + else: + vp[_cap(arg_name)] = arg_info + + +def delete_dim_of_batch_size(vp, node_name=None): + pop_list = [] + for key, item in vp.items(): + if any(keyword in key for keyword in ["DATA_IN", "DATA_OUT"]): + if node_name != 'mx_int_patch_embed': + if key.endswith("2"): + pop_list.append(key) + else: + if key.endswith("3"): + pop_list.append(key) + [vp.pop(key) for key in pop_list] + +def updating_hardware_metadata_pass(mg, pass_args): + for func in pass_args["updating_funcs_list"]: + for node in mg.fx_graph.nodes: + node = func(node) \ No newline at end of file diff --git a/test/passes/graph/transforms/verilog/utils_mxint_folded_top_generation.py b/test/passes/graph/transforms/verilog/utils_mxint_folded_top_generation.py new file mode 100644 index 000000000..ca8ff4032 --- /dev/null +++ b/test/passes/graph/transforms/verilog/utils_mxint_folded_top_generation.py @@ -0,0 +1,328 @@ +import logging +from typing import Tuple, Dict +import math +import os +import time +from multiprocessing import Process, Queue + +import torch.fx as fx +from chop.passes.graph.utils import vf, v2p, init_project +import mase_components.helper.generate_memory as gen_lut +import torch.nn as nn +logger = logging.getLogger(__name__) +from chop.nn.quantized.modules.layer_norm import LayerNormIntegerFloor +from chop.nn.quantized.modules.attention import ViTAttentionInteger +from pathlib import Path +from chop.passes.graph.transforms.verilog.emit_top import _cap, _remove_last_comma, get_verilog_parameters, VerilogParameterEmitter, VerilogEmitter + +def emit_folded_bram(folded_gragh, stream_name, folded_name, reuse_times): + def _emit_module_parameters_top_internal(key, node, stream_name, folded_name, 
reuse_times): + node_name = vf(node.name) + component_name = f"{node_name}_{key}_source" + component_name_inst = f"{component_name}_0" + + # verilog_param = node_name+"_"+_cap(key) + def get_image_depth(key, param_list, node_name): + if "weight" in key: + image_depth = ( + param_list[f"{_cap(key)}_TENSOR_SIZE_DIM_0"] + * param_list[f"{_cap(key)}_TENSOR_SIZE_DIM_1"] + / ( + param_list[f"{_cap(key)}_PARALLELISM_DIM_0"] + * param_list[f"{_cap(key)}_PARALLELISM_DIM_1"] + ) + ) + elif "bias" in key: + if "norm" in node_name: + image_depth = ( + param_list[f"{_cap(key)}_TENSOR_SIZE_DIM_0"] + * param_list[f"{_cap(key)}_TENSOR_SIZE_DIM_1"] + / ( + param_list[f"{_cap(key)}_PARALLELISM_DIM_0"] + * param_list[f"{_cap(key)}_PARALLELISM_DIM_1"] + ) + ) + else: + image_depth = ( + param_list[f"{_cap(key)}_TENSOR_SIZE_DIM_0"] + / param_list[f"{_cap(key)}_PARALLELISM_DIM_0"] + ) + else: + raise NotImplementedError + return image_depth + + image_depth = get_image_depth( + key, node.meta["mase"].parameters["hardware"]["verilog_param"], node.name + ) + parameters = "" + for param in node.meta["mase"].parameters["hardware"]["verilog_param"].keys(): + if f"{_cap(key)}_" in param: + parameters += f" .{param}({param}),\n" + parameters = _remove_last_comma(parameters) + modules = "" + signal = "" + for i in range(reuse_times): + new_node_name = node_name.replace(stream_name, folded_name + f"_{i}_" + stream_name) + new_componet_name = component_name.replace(stream_name, folded_name + f"_{i}_" + stream_name) + new_component_name_inst = component_name_inst.replace( + stream_name, folded_name + f"_{i}_" + stream_name + ) + signal += f""" +logic [{_cap(key)}_PRECISION_0 - 1:0] {new_node_name}_m{key} [{_cap(key)}_PARALLELISM_DIM_0*{_cap(key)}_PARALLELISM_DIM_1 - 1:0]; +logic [{_cap(key)}_PRECISION_1 - 1:0] {new_node_name}_e{key}; +logic {new_node_name}_{key}_valid, {new_node_name}_{key}_ready; +""" + modules += f""" +{new_componet_name} #( +{parameters} +) {new_component_name_inst} ( + 
.clk(clk), + .rst(rst), + .mdata_out({new_node_name}_m{key}), + .edata_out({new_node_name}_e{key}), + .data_out_ready({new_node_name}_{key}_ready), + .data_out_valid({new_node_name}_{key}_valid) +); + + """ + + output_connections = f""" +always_comb begin""" + + for element in ["m", "e"]: + output_connections += f""" + {element}data_out = (counter= (REPEAT_TIMES - 1)*IMAGE_DEPTH)? data_out_0_ready: (counter_in < IMAGE_DEPTH) ? 0 : top_block_data_in_0_ready; +end +endmodule + """ + return top + + +def emit_mxint_folded_top_file(graph, top_name, pass_args): + stream_graph = pass_args["stream_graph"] + folded_name = pass_args["folded_name"] + stream_name = pass_args["stream_name"] + reuse_times = pass_args["reuse_times"] + top_block = VerilogEmitter(stream_graph).emit(stream_graph, "top_block") + top_bram = emit_folded_bram(stream_graph, stream_name, folded_name, reuse_times) + top = emit_verilog_folded_top(graph, reuse_times, top_name) + top_file = f""" + {top} + {top_block} + {top_bram} + """ + return top_file + +def mxint_folded_top_generation(graph, pass_args={}): + """Emit the top-level model design in Verilog + + :param graph: a MaseGraph + :type graph: MaseGraph + :param pass_args: this pass requires additional arguments which is explained below, defaults to {} + :type pass_args: _type_, optional + :return: return a tuple of a MaseGraph and an empty dict (no additional info to return) + :rtype: tuple(MaseGraph, Dict) + + + - pass_args + - project_dir -> str : the directory of the project for cosimulation + - top_name -> str : top-level name + """ + + logger.info("Emitting Verilog...") + + # Create project directory, and the verilog is emmited to {project_name}/hardware/rtl + project_dir = ( + pass_args["project_dir"] + if "project_dir" in pass_args.keys() + else Path.home() / ".mase" / "top" + ) + top_name = pass_args["top_name"] if "top_name" in pass_args.keys() else "top" + init_project(project_dir) + rtl_dir = os.path.join(project_dir, "hardware", "rtl") + 
top = emit_mxint_folded_top_file(graph, top_name, pass_args) + + top_file = os.path.join(rtl_dir, f"{top_name}.sv") + with open(top_file, "w") as top_design: + top_design.write(top) \ No newline at end of file