diff --git a/configs/hf_model_configs/configs/roberta_base.json b/configs/hf_model_configs/configs/roberta_base.json
new file mode 100644
index 000000000..44da6b85b
--- /dev/null
+++ b/configs/hf_model_configs/configs/roberta_base.json
@@ -0,0 +1,21 @@
+{
+    "architectures": [
+      "RobertaForMaskedLM"
+    ],
+    "attention_probs_dropout_prob": 0.1,
+    "bos_token_id": 0,
+    "eos_token_id": 2,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 768,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-05,
+    "max_position_embeddings": 514,
+    "model_type": "roberta",
+    "num_attention_heads": 12,
+    "num_hidden_layers": 12,
+    "pad_token_id": 1,
+    "type_vocab_size": 1,
+    "vocab_size": 50265
+  }
\ No newline at end of file
diff --git a/configs/hf_model_configs/configs/roberta_large.json b/configs/hf_model_configs/configs/roberta_large.json
new file mode 100644
index 000000000..42c6bd047
--- /dev/null
+++ b/configs/hf_model_configs/configs/roberta_large.json
@@ -0,0 +1,21 @@
+{
+    "architectures": [
+      "RobertaForMaskedLM"
+    ],
+    "attention_probs_dropout_prob": 0.1,
+    "bos_token_id": 0,
+    "eos_token_id": 2,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-05,
+    "max_position_embeddings": 514,
+    "model_type": "roberta",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "pad_token_id": 1,
+    "type_vocab_size": 1,
+    "vocab_size": 50265
+  }
\ No newline at end of file
diff --git a/configs/hf_model_configs/configs/roberta_medium.json b/configs/hf_model_configs/configs/roberta_medium.json
new file mode 100644
index 000000000..31412d614
--- /dev/null
+++ b/configs/hf_model_configs/configs/roberta_medium.json
@@ -0,0 +1,21 @@
+{
+    "architectures": [
+      "RobertaForMaskedLM"
+    ],
+    "attention_probs_dropout_prob": 0.1,
+    "bos_token_id": 0,
+    "eos_token_id": 2,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 512,
+    "initializer_range": 0.02,
+    "intermediate_size": 2048,
+    "layer_norm_eps": 1e-05,
+    "max_position_embeddings": 514,
+    "model_type": "roberta",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 8,
+    "pad_token_id": 1,
+    "type_vocab_size": 1,
+    "vocab_size": 50265
+  }
\ No newline at end of file
diff --git a/configs/hf_model_configs/configs/roberta_mini.json b/configs/hf_model_configs/configs/roberta_mini.json
new file mode 100644
index 000000000..8b2cd4ed9
--- /dev/null
+++ b/configs/hf_model_configs/configs/roberta_mini.json
@@ -0,0 +1,21 @@
+{
+    "architectures": [
+      "RobertaForMaskedLM"
+    ],
+    "attention_probs_dropout_prob": 0.1,
+    "bos_token_id": 0,
+    "eos_token_id": 2,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 256,
+    "initializer_range": 0.02,
+    "intermediate_size": 1024,
+    "layer_norm_eps": 1e-05,
+    "max_position_embeddings": 514,
+    "model_type": "roberta",
+    "num_attention_heads": 4,
+    "num_hidden_layers": 4,
+    "pad_token_id": 1,
+    "type_vocab_size": 1,
+    "vocab_size": 50265
+  }
\ No newline at end of file
diff --git a/configs/hf_model_configs/configs/roberta_small.json b/configs/hf_model_configs/configs/roberta_small.json
new file mode 100644
index 000000000..0c092dd98
--- /dev/null
+++ b/configs/hf_model_configs/configs/roberta_small.json
@@ -0,0 +1,21 @@
+{
+    "architectures": [
+      "RobertaForMaskedLM"
+    ],
+    "attention_probs_dropout_prob": 0.1,
+    "bos_token_id": 0,
+    "eos_token_id": 2,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 512,
+    "initializer_range": 0.02,
+    "intermediate_size": 2048,
+    "layer_norm_eps": 1e-05,
+    "max_position_embeddings": 514,
+    "model_type": "roberta",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 4,
+    "pad_token_id": 1,
+    "type_vocab_size": 1,
+    "vocab_size": 50265
+  }
\ No newline at end of file
diff --git a/configs/hf_model_configs/configs/roberta_tiny.json b/configs/hf_model_configs/configs/roberta_tiny.json
new file mode 100644
index 000000000..f88903200
--- /dev/null
+++ b/configs/hf_model_configs/configs/roberta_tiny.json
@@ -0,0 +1,21 @@
+{
+    "architectures": [
+      "RobertaForMaskedLM"
+    ],
+    "attention_probs_dropout_prob": 0.1,
+    "bos_token_id": 0,
+    "eos_token_id": 2,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 128,
+    "initializer_range": 0.02,
+    "intermediate_size": 512,
+    "layer_norm_eps": 1e-05,
+    "max_position_embeddings": 514,
+    "model_type": "roberta",
+    "num_attention_heads": 2,
+    "num_hidden_layers": 2,
+    "pad_token_id": 1,
+    "type_vocab_size": 1,
+    "vocab_size": 50265
+  }
\ No newline at end of file
diff --git a/src/chop/nn/quantizers/SNN/LSQ.py b/src/chop/nn/quantizers/SNN/LSQ.py
index 776e148e7..4d2cc9d7a 100644
--- a/src/chop/nn/quantizers/SNN/LSQ.py
+++ b/src/chop/nn/quantizers/SNN/LSQ.py
@@ -16,6 +16,9 @@ def floor_pass(x):
     return (y - y_grad).detach() + y_grad
 
 
+# ========================================================================================================
+# SNN quantization from SpikeZIP-TF
+# ========================================================================================================
 class LSQInteger(nn.Module):
     """
     LSQInteger is a PyTorch module for Learned Step Size Quantization (LSQ) with integer levels.
@@ -140,3 +143,149 @@ def forward(self, x):
             self.global_step = 0.0
 
         return output
+
+
+# ========================================================================================================
+# SNN quantization from SpikeLM
+# ========================================================================================================
+class AlphaInit(nn.Parameter):
+    """Step-size parameter that is initialized lazily on first use.
+
+    The ``initialized`` flag starts as ``False``; ``ElasticBiSpiking.forward``
+    checks it and calls :meth:`initialize_wrapper` to set the value from data.
+    """
+
+    def __init__(self, tensor, requires_grad=True):
+        # NOTE(review): the object already exists when ``__init__`` runs, so the
+        # result of this ``__new__`` call is discarded; kept from the original.
+        super(AlphaInit, self).__new__(
+            nn.Parameter, data=tensor, requires_grad=requires_grad
+        )
+        self.initialized = False
+
+    def _initialize(self, init_tensor):
+        """Copy ``init_tensor`` into the parameter and mark it initialized."""
+        assert not self.initialized, "already initialized."
+        # The "uniform" init method yields a Python float, which ``copy_`` does
+        # not accept directly, so convert to a tensor first (no-op for tensors).
+        self.data.copy_(torch.as_tensor(init_tensor))
+        self.initialized = True
+
+    def initialize_wrapper(self, tensor, num_bits, symmetric, init_method="default"):
+        """Compute the initial step size and store it via ``_initialize``.
+
+        :param tensor: data whose mean absolute value drives the "default" method
+        :param num_bits: quantization bit-width used to derive ``Qp``
+        :param symmetric: selects the symmetric or asymmetric formula
+        :param init_method: ``"default"`` or ``"uniform"``
+        :raises ValueError: if ``init_method`` is not a supported name
+        """
+        Qp = 2 ** (num_bits - 1) - 1 if symmetric else 2 ** (num_bits) - 1
+        if Qp == 0:
+            # e.g. num_bits == 1 with symmetric=True; avoid dividing by zero.
+            Qp = 1.0
+        if init_method == "default":
+            init_val = (
+                2 * tensor.abs().mean() / math.sqrt(Qp)
+                if symmetric
+                else 4 * tensor.abs().mean() / math.sqrt(Qp)
+            )
+        elif init_method == "uniform":
+            init_val = 1.0 / (2 * Qp + 1) if symmetric else 1.0 / Qp
+        else:
+            # Fail loudly instead of hitting an unbound ``init_val`` below.
+            raise ValueError(f"unsupported init_method: {init_method!r}")
+
+        self._initialize(init_val)
+
+
+class ElasticBiSpiking(torch.autograd.Function):
+    """
+    Modified from Learned Step-size Quantization.
+    https://arxiv.org/abs/1902.08153
+    """
+
+    @staticmethod
+    def forward(ctx, input, alpha, num_bits, layerwise):
+        """
+        :param input: input to be quantized
+        :param alpha: the step size (an ``AlphaInit``; initialized here on first use)
+        :param num_bits: quantization bits; 1 (binary), 2 (ternary) or 32 (bypass)
+        :param layerwise: must be truthy; the non-layerwise path is not implemented
+        :return: quantized output
+        :raises NotImplementedError: for unsupported ``layerwise``/``num_bits``
+        """
+        if not layerwise:
+            # TODO
+            raise NotImplementedError
+        ctx.num_bits = num_bits
+        if num_bits == 32:
+            return input
+        elif num_bits == 1 or num_bits == 2:
+            Qn = -1
+            Qp = 1
+        else:
+            # Qn/Qp are only defined for 1 and 2 bits; reject anything else here
+            # rather than crashing later on an unbound local.
+            raise NotImplementedError(
+                f"num_bits={num_bits} is not supported (expected 1, 2 or 32)"
+            )
+
+        eps = torch.tensor(0.00001).float().to(alpha.device)
+        if alpha.item() == 1.0 and (not alpha.initialized):
+            alpha.initialize_wrapper(
+                input, num_bits, symmetric=True, init_method="default"
+            )
+        # Keep the step size strictly positive.
+        alpha = torch.where(alpha > eps, alpha, eps)
+        assert alpha > 0, "alpha = {:.6f} becomes non-positive".format(alpha)
+
+        grad_scale = (
+            1.0 / math.sqrt(input.numel())
+            if not Qp
+            else 1.0 / math.sqrt(input.numel() * Qp)
+        )
+        ctx.save_for_backward(input, alpha)
+        ctx.other = grad_scale, Qn, Qp
+        if num_bits == 1:
+            q_w = input.sign()  # binary
+        else:
+            q_w = (input / alpha).round().clamp(Qn, Qp)  # ternary
+        w_q = q_w * alpha
+        return w_q
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return gradients for ``(input, alpha, num_bits, layerwise)``."""
+        if ctx.num_bits == 32:
+            return grad_output, None, None, None
+
+        input_, alpha = ctx.saved_tensors
+        grad_scale, Qn, Qp = ctx.other
+        q_w = input_ / alpha
+        indicate_small = (q_w < Qn).float()
+        indicate_big = (q_w > Qp).float()
+        indicate_middle = (
+            1.0 - indicate_small - indicate_big
+        )  # this is more cpu-friendly than torch.ones(input_.shape)
+        if ctx.num_bits == 1:
+            grad_alpha = (
+                ((input_.sign()) * grad_output * grad_scale).sum().unsqueeze(dim=0)
+            )
+        else:
+            grad_alpha = (
+                (
+                    (
+                        indicate_small * Qn
+                        + indicate_big * Qp
+                        + indicate_middle * (-q_w + q_w.round())
+                    )
+                    * grad_output
+                    * grad_scale
+                )
+                .sum()
+                .unsqueeze(dim=0)
+            )
+        # Straight-through: pass the gradient only where input/alpha is in [Qn, Qp].
+        grad_input = indicate_middle * grad_output
+        return grad_input, grad_alpha, None, None
diff --git a/src/chop/nn/snn/modules/__init__.py b/src/chop/nn/snn/modules/__init__.py
index 6b6efd229..a129d9c08 100644
--- a/src/chop/nn/snn/modules/__init__.py
+++ b/src/chop/nn/snn/modules/__init__.py
@@ -12,7 +12,7 @@
 
 from .conv3d import Conv3d
 
-from .linear import Linear, LinearUnfoldBias
+from .linear import Linear, LinearUnfoldBias, LinearElasticBiSpiking
 
 from .pool1d import MaxPool1d, AvgPool1d, AdaptiveAvgPool1d
 
@@ -62,6 +62,7 @@
 from .embedding import EmbeddingZIPTF
 from .roberta import (
     RobertaSelfAttentionZIPTF,
+    RobertaSelfAttentionSpikeLM,
 )
 
 spiking_basic_module_map = {
@@ -70,6 +71,7 @@
     "conv3d": Conv3d,
     "linear": Linear,
     "linear_unfold_bias": LinearUnfoldBias,
+    "linear_elastic_bi_spiking": LinearElasticBiSpiking,
     "max_pool1d": MaxPool1d,
     "avg_pool1d": AvgPool1d,
     "adaptive_avg_pool1d": AdaptiveAvgPool1d,
@@ -105,6 +107,7 @@
 
 spiking_roberta_module_map = {
     "roberta_self_attention_zip_tf": RobertaSelfAttentionZIPTF,
+    "roberta_self_attention_spikeLM": RobertaSelfAttentionSpikeLM,
 }
 
 spiking_module_map = {
diff --git a/src/chop/nn/snn/modules/linear.py b/src/chop/nn/snn/modules/linear.py
index 44cfb4f92..de727574f 100644
--- a/src/chop/nn/snn/modules/linear.py
+++ b/src/chop/nn/snn/modules/linear.py
@@ -2,6 +2,8 @@
 import chop.nn.snn.base as base
 import torch
 
+from chop.nn.quantizers.SNN.LSQ import AlphaInit, ElasticBiSpiking
+
 
 class Linear(nn.Linear, base.StepModule):
     def __init__(
@@ -105,3 +107,83 @@ def forward(self, input):
         self.first = False
 
         return output
+
+
+class LinearElasticBiSpiking(nn.Linear):
+    """Linear layer that spike-quantizes its input with ``ElasticBiSpiking``.
+
+    ``forward`` treats dim 0 of the input as ``T`` time steps, keeps a membrane
+    value across steps, quantizes it per step with a dedicated step size and
+    finally applies the (unquantized) linear projection to the spikes.
+
+    ``config`` must provide ``"T"`` (number of time steps) and ``"input_bits"``
+    (bit-width passed to the quantizer).
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+        symmetric=True,
+        config=None,
+    ) -> None:
+        super().__init__(
+            in_features,
+            out_features,
+            bias,
+            device,
+            dtype,
+        )
+        if config is None:
+            raise ValueError(
+                'LinearElasticBiSpiking requires a config with "T" and "input_bits"'
+            )
+        # NOTE: dead code from the original implementation (maybe useful in future reference)
+        # self.weight_bits = config["weight_bits"]
+        # self.quantize_act = config["quantize_act"]
+        # self.register_buffer('weight_clip_val', torch.tensor([config["clip_val"]]))
+
+        self.T = config["T"]
+        # ``forward`` passes this to the quantizer; it was previously never set,
+        # so every call failed with AttributeError.
+        self.input_bits = config["input_bits"]
+        self.act_clip_val = nn.ParameterList(
+            [AlphaInit(torch.tensor(1.0), requires_grad=False) for i in range(self.T)]
+        )
+        self.act_quantizer = ElasticBiSpiking
+
+    def forward(self, input):
+        """Spike-quantize ``input`` step by step, then apply the linear map."""
+        assert len(self.weight.size()) == 2
+
+        weight = self.weight
+        # ``zeros_like`` already matches the input's device and dtype, so the
+        # module also works without CUDA.
+        output = torch.zeros_like(input)
+        mem_old = 0
+        for i in range(self.T):
+            if i == 0:
+                mem = input[0]
+            else:
+                # v = beta * mem_old (alpha - spike) + v_reset(which is 0) + input
+                mem = (
+                    mem_old
+                    * 0.25
+                    * (self.act_clip_val[i - 1].detach() - output[i - 1].detach())
+                    + input[i]
+                )
+
+            # spike
+            output[i] = self.act_quantizer.apply(
+                mem, self.act_clip_val[i], self.input_bits, True
+            )
+            mem_old = mem.clone()
+
+        out = nn.functional.linear(output, weight)
+        if self.bias is not None:
+            out += self.bias.view(1, -1).expand_as(out)
+
+        return out
diff --git a/src/chop/nn/snn/modules/roberta/__init__.py b/src/chop/nn/snn/modules/roberta/__init__.py
index cda86a74d..ac056f851 100644
--- a/src/chop/nn/snn/modules/roberta/__init__.py
+++ b/src/chop/nn/snn/modules/roberta/__init__.py
@@ -1 +1 @@
-from .attention import RobertaSelfAttentionZIPTF
+from .attention import RobertaSelfAttentionZIPTF, RobertaSelfAttentionSpikeLM
diff --git a/src/chop/nn/snn/modules/roberta/attention.py b/src/chop/nn/snn/modules/roberta/attention.py
index b8253d4a1..5ae718e41 100644
--- a/src/chop/nn/snn/modules/roberta/attention.py
+++ b/src/chop/nn/snn/modules/roberta/attention.py
@@ -8,9 +8,10 @@
 import math
 
 
-from chop.nn.snn.modules.linear import LinearUnfoldBias
+from chop.nn.snn.modules.linear import LinearUnfoldBias, LinearElasticBiSpiking
 from chop.nn.snn.modules.neuron import ST_BIFNode
 from chop.nn.snn.modules.softmax import SoftmaxZIPTF
+from chop.nn.quantizers.SNN.LSQ import ElasticBiSpiking, AlphaInit
 
 
 def multi(x1_t, x2_t, x1_sum_t, x2_sum_t):
@@ -263,3 +264,233 @@ def forward(
         if self.is_decoder:
             outputs = outputs + (past_key_value,)
         return outputs
+
+
+class RobertaSelfAttentionSpikeLM(nn.Module):
+    """RoBERTa self-attention with SpikeLM-style spiking projections.
+
+    Query, key and value are produced by ``LinearElasticBiSpiking``; keys and
+    values are additionally quantized per time step with ``ElasticBiSpiking``.
+    Activations are 5-D here (``transpose_for_scores`` permutes five axes): the
+    first axis is sliced per time step ``T`` and the token axis is second to last.
+
+    ``q_config`` must provide ``"T"`` and ``"input_bits"``.
+    """
+
+    def __init__(self, config, q_config, position_embedding_type=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
+            config, "embedding_size"
+        ):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = LinearElasticBiSpiking(
+            config.hidden_size, self.all_head_size, config=q_config
+        )
+        self.key = LinearElasticBiSpiking(
+            config.hidden_size, self.all_head_size, config=q_config
+        )
+        self.value = LinearElasticBiSpiking(
+            config.hidden_size, self.all_head_size, config=q_config
+        )
+
+        self.act_quantizer_k = ElasticBiSpiking
+        self.act_quantizer_v = ElasticBiSpiking
+
+        self.T = q_config["T"]
+        self.input_bits = q_config["input_bits"]
+        # One step size per time step, for keys and values respectively.
+        self.clip_key = nn.ParameterList(
+            [AlphaInit(torch.tensor(1.0), requires_grad=False) for i in range(self.T)]
+        )
+        self.clip_value = nn.ParameterList(
+            [AlphaInit(torch.tensor(1.0), requires_grad=False) for i in range(self.T)]
+        )
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        if (
+            self.position_embedding_type == "relative_key"
+            or self.position_embedding_type == "relative_key_query"
+        ):
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(
+                2 * config.max_position_embeddings - 1, self.attention_head_size
+            )
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        """Split the last axis into heads and swap the head and token axes."""
+        new_x_shape = x.size()[:-1] + (
+            self.num_attention_heads,
+            self.attention_head_size,
+        )  # B T N h C/h
+        x = x.view(new_x_shape)
+        return x.permute(0, 1, 3, 2, 4)  # B T h N C/h
+
+    def _quantize_per_step(self, quantizer, tensor, clip_vals):
+        """Quantize each of the ``T`` leading slices with its own step size."""
+        return torch.cat(
+            [
+                quantizer.apply(tensor[i : i + 1], clip_vals[i], self.input_bits, True)
+                for i in range(self.T)
+            ],
+            dim=0,
+        )
+
+    def _project_key_value(self, states):
+        """Project ``states`` to keys/values, quantize them and split the heads."""
+        mixed_key_layer = self.key(states)
+        mixed_value_layer = self.value(states)
+        mixed_key_layer = self._quantize_per_step(
+            self.act_quantizer_k, mixed_key_layer, self.clip_key
+        )
+        mixed_value_layer = self._quantize_per_step(
+            self.act_quantizer_v, mixed_value_layer, self.clip_value
+        )
+        return (
+            self.transpose_for_scores(mixed_key_layer),
+            self.transpose_for_scores(mixed_value_layer),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor]:
+        """Compute attention; returns ``(context[, probs][, past_key_value])``.
+
+        ``probs`` is included when ``output_attentions`` is truthy and
+        ``past_key_value`` when the module is a decoder.
+        """
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer, value_layer = self._project_key_value(encoder_hidden_states)
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer, value_layer = self._project_key_value(hidden_states)
+            # Concatenate along the token axis (second to last); with the extra
+            # time axis, dim 2 is the head axis.
+            key_layer = torch.cat([past_key_value[0], key_layer], dim=-2)
+            value_layer = torch.cat([past_key_value[1], value_layer], dim=-2)
+        else:
+            key_layer, value_layer = self._project_key_value(hidden_states)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        use_cache = past_key_value is not None
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if (
+            self.position_embedding_type == "relative_key"
+            or self.position_embedding_type == "relative_key_query"
+        ):
+            # Token counts live on the second-to-last axis.
+            query_length, key_length = query_layer.shape[-2], key_layer.shape[-2]
+            if use_cache:
+                position_ids_l = torch.tensor(
+                    key_length - 1, dtype=torch.long, device=hidden_states.device
+                ).view(-1, 1)
+            else:
+                position_ids_l = torch.arange(
+                    query_length, dtype=torch.long, device=hidden_states.device
+                ).view(-1, 1)
+            position_ids_r = torch.arange(
+                key_length, dtype=torch.long, device=hidden_states.device
+            ).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(
+                distance + self.max_position_embeddings - 1
+            )
+            positional_embedding = positional_embedding.to(
+                dtype=query_layer.dtype
+            )  # fp16 compatibility
+
+            # "..." covers all leading axes, so the extra time axis is accepted.
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum(
+                    "...ld,lrd->...lr", query_layer, positional_embedding
+                )
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum(
+                    "...ld,lrd->...lr", query_layer, positional_embedding
+                )
+                relative_position_scores_key = torch.einsum(
+                    "...rd,lrd->...lr", key_layer, positional_embedding
+                )
+                attention_scores = (
+                    attention_scores
+                    + relative_position_scores_query
+                    + relative_position_scores_key
+                )
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        # NOTE: SpikeLM has time dimension in the input/output
+        context_layer = context_layer.permute(0, 1, 3, 2, 4).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (
+            (context_layer, attention_probs) if output_attentions else (context_layer,)
+        )
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+
+        return outputs
diff --git a/src/chop/passes/module/module_modify_helper.py b/src/chop/passes/module/module_modify_helper.py
index 2e6a577e0..953707e80 100644
--- a/src/chop/passes/module/module_modify_helper.py
+++ b/src/chop/passes/module/module_modify_helper.py
@@ -102,7 +102,7 @@ def replace_by_name(network, name, module):
 """
 
 
-def instantiate_linear(module, postfix, module_map, additional_module_args):
+def instantiate_linear(module, postfix, module_map, additional_module_args: dict):
     linear_cls = module_map[f"linear_{postfix}"]
     has_bias = not (module.bias is None)
 
@@ -126,7 +126,7 @@ def instantiate_linear(module, postfix, module_map, additional_module_args):
     return linear
 
 
-def instantiate_conv2d(module, postfix, module_map, additional_module_args):
+def instantiate_conv2d(module, postfix, module_map, additional_module_args: dict):
     conv2d_cls = module_map[f"conv2d_{postfix}"]
     has_bias = not (module.bias is None)
     # TODO: some transformed modules have "config" as an argument then extract the additional_module_args from it. Some directly take the additional_module_args.
diff --git a/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeLM.py b/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeLM.py
new file mode 100644
index 000000000..255e12d79
--- /dev/null
+++ b/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeLM.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# This example converts a RoBERTa model (built from a local config) into a
+# spiking model with the SpikeLM ann2snn module transforms.
+import logging
+import os
+import sys
+from pathlib import Path
+
+import torch
+import torch.nn as nn
+from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer
+
+from chop.passes.module.transforms import quantize_module_transform_pass
+from chop.passes.module.transforms.snn.ann2snn import ann2snn_module_transform_pass
+
+# Repository root, resolved relative to this file so the script is not tied
+# to one developer's checkout location.
+REPO_ROOT = Path(__file__).resolve().parents[5]
+sys.path.append(REPO_ROOT.as_posix())
+
+logger = logging.getLogger(__name__)
+
+mode_config_path = (
+    REPO_ROOT / "configs" / "hf_model_configs" / "configs" / "roberta_base.json"
+)
+
+# Load model directly
+tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
+config = AutoConfig.from_pretrained(mode_config_path.as_posix())
+logger.info("Training new model from scratch")
+model = AutoModelForMaskedLM.from_config(config)
+Path("roberta_base_arch.txt").write_text(str(model))
+
+convert_pass_args = {
+    "by": "regex_name",
+    r"roberta\.encoder\.layer\.\d+\.attention\.self": {
+        "config": {
+            "name": "spikeLM",
+            "input_bits": 2,
+            "T": 4,
+        },
+    },
+}
+
+mg, _ = ann2snn_module_transform_pass(model, convert_pass_args)
+
+
+convert_pass_args = {
+    "by": "type",
+    "linear": {
+        "config": {
+            "name": "elastic_bi_spiking",
+            "input_bits": 2,
+            "T": 4,
+        },
+    },
+}
+
+mg, _ = ann2snn_module_transform_pass(mg, convert_pass_args)
+
+Path("roberta_base_arch_snn.txt").write_text(str(mg))
diff --git a/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeZIP.py b/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeZIP.py
new file mode 100644
index 000000000..b7236b43c
--- /dev/null
+++ b/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeZIP.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# This example quantizes a pretrained RoBERTa classifier and then runs the ann2snn module transform passes on it
+import logging
+import os
+import sys
+
+from chop.passes.module.transforms.snn.ann2snn import ann2snn_module_transform_pass
+import torch
+import torch.nn as nn
+
+from pathlib import Path
+
+sys.path.append(Path(__file__).resolve().parents[5].as_posix())
+
+
+from chop.passes.module.transforms import quantize_module_transform_pass
+
+
+import torch
+from torch import nn
+from transformers import RobertaForSequenceClassification, AutoTokenizer
+
+# One checkpoint id is used for both the classifier weights and the tokenizer.
+pretrained = "XianYiyk/roberta-relu-pretrained-sst2"
+bert = RobertaForSequenceClassification.from_pretrained(pretrained, num_labels=2)
+tokenizer = AutoTokenizer.from_pretrained(pretrained, do_lower_case=True)
+bert.requires_grad_(True)  # QAT training: mark every parameter as trainable
+
+
+# Runs at import time (no test function wraps it): apply the `lsqinteger` config (level 32) to modules matched by name.
+quan_pass_args = {
+    "by": "regex_name",
+    r"roberta\.encoder\.layer\.\d+\.attention\.self": {
+        "config": {
+            "name": "lsqinteger",
+            "level": 32,
+        }
+    },
+    r"roberta\.encoder\.layer\.\d+\.attention\.output": {
+        "config": {
+            "name": "lsqinteger",
+            "level": 32,
+        }
+    },
+    r"roberta\.encoder\.layer\.\d+\.output": {
+        "config": {
+            "name": "lsqinteger",
+            "level": 32,
+        }
+    },
+    r"roberta\.encoder\.layer\.\d+\.intermediate": {
+        "config": {
+            "name": "lsqinteger",
+            "level": 32,
+        }
+    },
+    "classifier": {
+        "config": {
+            "name": "lsqinteger",
+            "level": 32,
+        }
+    },
+}
+mg, _ = quantize_module_transform_pass(bert, quan_pass_args)
+# Uncomment to dump the quantized architecture for inspection:
+# with open("qann_model_arch.txt", "w") as f:
+#     f.write(str(mg))
+
+convert_pass_args = {
+    "by": "regex_name",  # pass 1: select the self-attention submodules by qualified-name regex
+    r"roberta\.encoder\.layer\.\d+\.attention\.self": {
+        "config": {
+            "name": "zip_tf",
+            "level": 32,
+            "neuron_type": "ST-BIF",
+        },
+    },
+}
+mg, _ = ann2snn_module_transform_pass(mg, convert_pass_args)
+
+convert_pass_args = {
+    "by": "type",  # pass 2: select the remaining modules by type, one config per type key
+    "embedding": {
+        "config": {
+            "name": "zip_tf",
+        },
+    },
+    "linear": {
+        "config": {
+            "name": "unfold_bias",
+            "level": 32,
+            "neuron_type": "ST-BIF",
+        },
+    },
+    "conv2d": {
+        "config": {
+            "name": "zip_tf",
+            "level": 32,
+            "neuron_type": "ST-BIF",
+        },
+    },
+    "layernorm": {
+        "config": {
+            "name": "zip_tf",
+        },
+    },
+    "relu": {
+        "manual_instantiate": True,
+        "config": {
+            "name": "identity",
+        },
+    },
+    "lsqinteger": {
+        "manual_instantiate": True,
+        "config": {
+            "name": "st_bif",
+            # Default values. They would be replaced by the values from the LSQInteger module, so they have no effect.
+            # "q_threshold": 1,
+            # "level": 32,
+            # "sym": True,
+        },
+    },
+}
+mg, _ = ann2snn_module_transform_pass(mg, convert_pass_args)
+
+# Uncomment to dump the converted (spiking) architecture for inspection:
+# with open("spiking_model_arch.txt", "w") as f:
+#     f.write(str(mg))