diff --git a/configs/hf_model_configs/configs/roberta_base.json b/configs/hf_model_configs/configs/roberta_base.json new file mode 100644 index 000000000..44da6b85b --- /dev/null +++ b/configs/hf_model_configs/configs/roberta_base.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 1, + "type_vocab_size": 1, + "vocab_size": 50265 + } \ No newline at end of file diff --git a/configs/hf_model_configs/configs/roberta_large.json b/configs/hf_model_configs/configs/roberta_large.json new file mode 100644 index 000000000..42c6bd047 --- /dev/null +++ b/configs/hf_model_configs/configs/roberta_large.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "roberta", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "pad_token_id": 1, + "type_vocab_size": 1, + "vocab_size": 50265 + } \ No newline at end of file diff --git a/configs/hf_model_configs/configs/roberta_medium.json b/configs/hf_model_configs/configs/roberta_medium.json new file mode 100644 index 000000000..31412d614 --- /dev/null +++ b/configs/hf_model_configs/configs/roberta_medium.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 512, + 
"initializer_range": 0.02, + "intermediate_size": 2048, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "roberta", + "num_attention_heads": 8, + "num_hidden_layers": 8, + "pad_token_id": 1, + "type_vocab_size": 1, + "vocab_size": 50265 + } \ No newline at end of file diff --git a/configs/hf_model_configs/configs/roberta_mini.json b/configs/hf_model_configs/configs/roberta_mini.json new file mode 100644 index 000000000..8b2cd4ed9 --- /dev/null +++ b/configs/hf_model_configs/configs/roberta_mini.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "roberta", + "num_attention_heads": 4, + "num_hidden_layers": 4, + "pad_token_id": 1, + "type_vocab_size": 1, + "vocab_size": 50265 + } \ No newline at end of file diff --git a/configs/hf_model_configs/configs/roberta_small.json b/configs/hf_model_configs/configs/roberta_small.json new file mode 100644 index 000000000..0c092dd98 --- /dev/null +++ b/configs/hf_model_configs/configs/roberta_small.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 512, + "initializer_range": 0.02, + "intermediate_size": 2048, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "roberta", + "num_attention_heads": 8, + "num_hidden_layers": 4, + "pad_token_id": 1, + "type_vocab_size": 1, + "vocab_size": 50265 + } \ No newline at end of file diff --git a/configs/hf_model_configs/configs/roberta_tiny.json b/configs/hf_model_configs/configs/roberta_tiny.json new file mode 100644 index 000000000..f88903200 --- 
/dev/null +++ b/configs/hf_model_configs/configs/roberta_tiny.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 128, + "initializer_range": 0.02, + "intermediate_size": 512, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "roberta", + "num_attention_heads": 2, + "num_hidden_layers": 2, + "pad_token_id": 1, + "type_vocab_size": 1, + "vocab_size": 50265 + } \ No newline at end of file diff --git a/src/chop/nn/quantizers/SNN/LSQ.py b/src/chop/nn/quantizers/SNN/LSQ.py index 776e148e7..4d2cc9d7a 100644 --- a/src/chop/nn/quantizers/SNN/LSQ.py +++ b/src/chop/nn/quantizers/SNN/LSQ.py @@ -16,6 +16,9 @@ def floor_pass(x): return (y - y_grad).detach() + y_grad +# ======================================================================================================== +# SNN quantization from SpikeZIP-TF +# ======================================================================================================== class LSQInteger(nn.Module): """ LSQInteger is a PyTorch module for Learned Step Size Quantization (LSQ) with integer levels. @@ -140,3 +143,116 @@ def forward(self, x): self.global_step = 0.0 return output + + +# ======================================================================================================== +# SNN quantization from SpikeLM +# ======================================================================================================== +class AlphaInit(nn.Parameter): + def __init__(self, tensor, requires_grad=True): + super(AlphaInit, self).__new__( + nn.Parameter, data=tensor, requires_grad=requires_grad + ) + self.initialized = False + + def _initialize(self, init_tensor): + assert not self.initialized, "already initialized." 
+ self.data.copy_(init_tensor) + self.initialized = True + + def initialize_wrapper(self, tensor, num_bits, symmetric, init_method="default"): + Qp = 2 ** (num_bits - 1) - 1 if symmetric else 2 ** (num_bits) - 1 + if Qp == 0: + Qp = 1.0 + if init_method == "default": + init_val = ( + 2 * tensor.abs().mean() / math.sqrt(Qp) + if symmetric + else 4 * tensor.abs().mean() / math.sqrt(Qp) + ) + elif init_method == "uniform": + init_val = 1.0 / (2 * Qp + 1) if symmetric else 1.0 / Qp + + self._initialize(init_val) + + +class ElasticBiSpiking(torch.autograd.Function): + """ + Modified from Learned Step-size Quantization. + https://arxiv.org/abs/1902.08153 + """ + + @staticmethod + def forward(ctx, input, alpha, num_bits, layerwise): + """ + :param input: input to be quantized + :param alpha: the step size + :param num_bits: quantization bits + :param layerwise: rowwise quant + :return: quantized output + """ + if not layerwise: + # TODO + raise NotImplementedError + ctx.num_bits = num_bits + if num_bits == 32: + return input + elif num_bits == 1 or num_bits == 2: + Qn = -1 + Qp = 1 + + eps = torch.tensor(0.00001).float().to(alpha.device) + if alpha.item() == 1.0 and (not alpha.initialized): + alpha.initialize_wrapper( + input, num_bits, symmetric=True, init_method="default" + ) + alpha = torch.where(alpha > eps, alpha, eps) + assert alpha > 0, "alpha = {:.6f} becomes non-positive".format(alpha) + + grad_scale = ( + 1.0 / math.sqrt(input.numel()) + if not Qp + else 1.0 / math.sqrt(input.numel() * Qp) + ) + ctx.save_for_backward(input, alpha) + ctx.other = grad_scale, Qn, Qp + if num_bits == 1: + q_w = input.sign() ################################## binary + else: + q_w = (input / alpha).round().clamp(Qn, Qp) ###################### ternary + w_q = q_w * alpha + return w_q + + @staticmethod + def backward(ctx, grad_output): + if ctx.num_bits == 32: + return grad_output, None, None, None + + input_, alpha = ctx.saved_tensors + grad_scale, Qn, Qp = ctx.other + q_w = input_ / 
alpha + indicate_small = (q_w < Qn).float() + indicate_big = (q_w > Qp).float() + indicate_middle = ( + 1.0 - indicate_small - indicate_big + ) # this is more cpu-friendly than torch.ones(input_.shape) + if ctx.num_bits == 1: + grad_alpha = ( + ((input_.sign()) * grad_output * grad_scale).sum().unsqueeze(dim=0) + ) + else: + grad_alpha = ( + ( + ( + indicate_small * Qn + + indicate_big * Qp + + indicate_middle * (-q_w + q_w.round()) + ) + * grad_output + * grad_scale + ) + .sum() + .unsqueeze(dim=0) + ) + grad_input = indicate_middle * grad_output + return grad_input, grad_alpha, None, None diff --git a/src/chop/nn/snn/modules/__init__.py b/src/chop/nn/snn/modules/__init__.py index 6b6efd229..a129d9c08 100644 --- a/src/chop/nn/snn/modules/__init__.py +++ b/src/chop/nn/snn/modules/__init__.py @@ -12,7 +12,7 @@ from .conv3d import Conv3d -from .linear import Linear, LinearUnfoldBias +from .linear import Linear, LinearUnfoldBias, LinearElasticBiSpiking from .pool1d import MaxPool1d, AvgPool1d, AdaptiveAvgPool1d @@ -62,6 +62,7 @@ from .embedding import EmbeddingZIPTF from .roberta import ( RobertaSelfAttentionZIPTF, + RobertaSelfAttentionSpikeLM, ) spiking_basic_module_map = { @@ -70,6 +71,7 @@ "conv3d": Conv3d, "linear": Linear, "linear_unfold_bias": LinearUnfoldBias, + "linear_elastic_bi_spiking": LinearElasticBiSpiking, "max_pool1d": MaxPool1d, "avg_pool1d": AvgPool1d, "adaptive_avg_pool1d": AdaptiveAvgPool1d, @@ -105,6 +107,7 @@ spiking_roberta_module_map = { "roberta_self_attention_zip_tf": RobertaSelfAttentionZIPTF, + "roberta_self_attention_spikeLM": RobertaSelfAttentionSpikeLM, } spiking_module_map = { diff --git a/src/chop/nn/snn/modules/linear.py b/src/chop/nn/snn/modules/linear.py index 44cfb4f92..de727574f 100644 --- a/src/chop/nn/snn/modules/linear.py +++ b/src/chop/nn/snn/modules/linear.py @@ -2,6 +2,8 @@ import chop.nn.snn.base as base import torch +from chop.nn.quantizers.SNN.LSQ import AlphaInit, ElasticBiSpiking + class Linear(nn.Linear, 
base.StepModule): def __init__( @@ -105,3 +107,66 @@ def forward(self, input): self.first = False return output + + +class LinearElasticBiSpiking(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + symmetric=True, + config=None, + ) -> None: + super().__init__( + in_features, + out_features, + bias, + device, + dtype, + ) + # NOTE: dead code from the original implementation (maybe useful in future reference) + # self.weight_bits = config["weight_bits"] + # self.quantize_act = config["quantize_act"] + # self.register_buffer('weight_clip_val', torch.tensor([config["clip_val"]])) + # self.input_bits = config["input_bits"] + + self.T = config["T"] + self.act_clip_val = nn.ParameterList( + [AlphaInit(torch.tensor(1.0), requires_grad=False) for i in range(self.T)] + ) + self.act_quantizer = ElasticBiSpiking + + def forward(self, input): + # quantize weight + assert len(self.weight.size()) == 2 + + weight = self.weight + mem = torch.zeros_like(input[0]).cuda() + output = torch.zeros_like(input).cuda() + mem_old = 0 + for i in range(self.T): + if i == 0: + mem = input[0] + else: + # v = beta * mem_old (alpha - spike) + v_reset(which is 0) + input + mem = ( + mem_old + * 0.25 + * (self.act_clip_val[i - 1].detach() - output[i - 1].detach()) + + input[i] + ) + + # spike + output[i] = self.act_quantizer.apply( + mem, self.act_clip_val[i], self.input_bits, True + ) + mem_old = mem.clone() + + out = nn.functional.linear(output, weight) + if not self.bias is None: + out += self.bias.view(1, -1).expand_as(out) + + return out diff --git a/src/chop/nn/snn/modules/roberta/__init__.py b/src/chop/nn/snn/modules/roberta/__init__.py index cda86a74d..ac056f851 100644 --- a/src/chop/nn/snn/modules/roberta/__init__.py +++ b/src/chop/nn/snn/modules/roberta/__init__.py @@ -1 +1 @@ -from .attention import RobertaSelfAttentionZIPTF +from .attention import RobertaSelfAttentionZIPTF, RobertaSelfAttentionSpikeLM diff 
--git a/src/chop/nn/snn/modules/roberta/attention.py b/src/chop/nn/snn/modules/roberta/attention.py index b8253d4a1..5ae718e41 100644 --- a/src/chop/nn/snn/modules/roberta/attention.py +++ b/src/chop/nn/snn/modules/roberta/attention.py @@ -8,9 +8,10 @@ import math -from chop.nn.snn.modules.linear import LinearUnfoldBias +from chop.nn.snn.modules.linear import LinearUnfoldBias, LinearElasticBiSpiking from chop.nn.snn.modules.neuron import ST_BIFNode from chop.nn.snn.modules.softmax import SoftmaxZIPTF +from chop.nn.quantizers.SNN.LSQ import ElasticBiSpiking, AlphaInit def multi(x1_t, x2_t, x1_sum_t, x2_sum_t): @@ -263,3 +264,328 @@ def forward( if self.is_decoder: outputs = outputs + (past_key_value,) return outputs + + +class RobertaSelfAttentionSpikeLM(nn.Module): + def __init__(self, config, q_config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, "embedding_size" + ): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = LinearElasticBiSpiking( + config.hidden_size, self.all_head_size, config=q_config + ) + self.key = LinearElasticBiSpiking( + config.hidden_size, self.all_head_size, config=q_config + ) + self.value = LinearElasticBiSpiking( + config.hidden_size, self.all_head_size, config=q_config + ) + + self.act_quantizer_k = ElasticBiSpiking + self.act_quantizer_v = ElasticBiSpiking + + self.T = q_config["T"] + self.input_bits = q_config["input_bits"] + self.clip_key = nn.ParameterList( + [AlphaInit(torch.tensor(1.0), requires_grad=False) for i in range(self.T)] + ) + self.clip_value = nn.ParameterList( + [AlphaInit(torch.tensor(1.0), 
requires_grad=False) for i in range(self.T)] + ) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) # B T N h C/h + x = x.view(new_x_shape) + return x.permute(0, 1, 3, 2, 4) # B T h N C/h + + # def forward( + # self, + # hidden_states: torch.Tensor, + # attention_mask: Optional[torch.FloatTensor] = None, + # head_mask: Optional[torch.FloatTensor] = None, + # encoder_hidden_states: Optional[torch.FloatTensor] = None, + # encoder_attention_mask: Optional[torch.FloatTensor] = None, + # past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + # output_attentions: Optional[bool] = False, + # ) -> Tuple[torch.Tensor]: + # # NOTE: this might be corresponding to the if case with is_cross_attention is none and past_key_value is none + # mixed_query_layer = self.query(hidden_states) + + # mixed_key_layer = self.key(hidden_states) + # mixed_value_layer = self.value(hidden_states) + # mixed_key_layer = torch.cat([self.act_quantizer_k.apply(mixed_key_layer[i:i+1], self.clip_key[i], self.input_bits, True) for i in range(self.T)], dim=0) + # mixed_value_layer = torch.cat([self.act_quantizer_v.apply(mixed_value_layer[i:i+1], self.clip_value[i], self.input_bits, True) for i in range(self.T)], dim=0) + + # # NOTE: this might be corresponding to the if case with is_cross_attention is none and past_key_value is none + # key_layer = 
self.transpose_for_scores(mixed_key_layer) + # value_layer = self.transpose_for_scores(mixed_value_layer) + # query_layer = self.transpose_for_scores(mixed_query_layer) + + # # Take the dot product between "query" and "key" to get the raw attention scores. + # attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + # attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # if attention_mask is not None: + # # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + # attention_scores = attention_scores + attention_mask.unsqueeze(0) + + # # Normalize the attention scores to probabilities. + # attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # # This is actually dropping out entire tokens to attend to, which might + # # seem a bit unusual, but is taken from the original Transformer paper. + # attention_probs = self.dropout(attention_probs) + + # # Mask heads if we want to + # if head_mask is not None: + # attention_probs = attention_probs * head_mask + + # context_layer = torch.matmul(attention_probs, value_layer) + + # context_layer = context_layer.permute(0, 1, 3, 2, 4).contiguous() + # new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + # context_layer = context_layer.view(new_context_layer_shape) + + # outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + # return outputs + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # 
and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + mixed_key_layer = self.key(encoder_hidden_states) + mixed_value_layer = self.value(encoder_hidden_states) + + mixed_key_layer = torch.cat( + [ + self.act_quantizer_k.apply( + mixed_key_layer[i : i + 1], + self.clip_key[i], + self.input_bits, + True, + ) + for i in range(self.T) + ], + dim=0, + ) + mixed_value_layer = torch.cat( + [ + self.act_quantizer_v.apply( + mixed_value_layer[i : i + 1], + self.clip_value[i], + self.input_bits, + True, + ) + for i in range(self.T) + ], + dim=0, + ) + + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + mixed_key_layer = torch.cat( + [ + self.act_quantizer_k.apply( + mixed_key_layer[i : i + 1], + self.clip_key[i], + self.input_bits, + True, + ) + for i in range(self.T) + ], + dim=0, + ) + mixed_value_layer = torch.cat( + [ + self.act_quantizer_v.apply( + mixed_value_layer[i : i + 1], + self.clip_value[i], + self.input_bits, + True, + ) + for i in range(self.T) + ], + dim=0, + ) + + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + mixed_key_layer = torch.cat( + [ + self.act_quantizer_k.apply( + 
mixed_key_layer[i : i + 1], + self.clip_key[i], + self.input_bits, + True, + ) + for i in range(self.T) + ], + dim=0, + ) + mixed_value_layer = torch.cat( + [ + self.act_quantizer_v.apply( + mixed_value_layer[i : i + 1], + self.clip_value[i], + self.input_bits, + True, + ) + for i in range(self.T) + ], + dim=0, + ) + + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor( + key_length - 1, dtype=torch.long, device=hidden_states.device + ).view(-1, 1) + else: + position_ids_l = torch.arange( + query_length, dtype=torch.long, device=hidden_states.device + ).view(-1, 1) + position_ids_r = torch.arange( + key_length, dtype=torch.long, device=hidden_states.device + ).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + # context_layer = torch.matmul(attention_probs, value_layer) + context_layer = torch.matmul(attention_probs, value_layer) + + # NOTE: SpikeLM has time dimension in the input/output + context_layer = context_layer.permute(0, 1, 3, 2, 4).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = ( + (context_layer, attention_probs) if output_attentions else (context_layer,) + ) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + + return outputs diff --git a/src/chop/passes/module/module_modify_helper.py b/src/chop/passes/module/module_modify_helper.py index 2e6a577e0..953707e80 100644 --- a/src/chop/passes/module/module_modify_helper.py +++ b/src/chop/passes/module/module_modify_helper.py @@ -102,7 +102,7 @@ def replace_by_name(network, name, module): """ -def instantiate_linear(module, postfix, module_map, additional_module_args): +def instantiate_linear(module, postfix, module_map, additional_module_args: dict): linear_cls = module_map[f"linear_{postfix}"] has_bias = not (module.bias is None) @@ -126,7 +126,7 @@ def instantiate_linear(module, postfix, module_map, additional_module_args): return linear -def instantiate_conv2d(module, postfix, module_map, additional_module_args): +def instantiate_conv2d(module, postfix, module_map, additional_module_args: dict): conv2d_cls = module_map[f"conv2d_{postfix}"] has_bias = not (module.bias is None) # TODO: some transformed modules have "config" as an argument then extract the additional_module_args from it. 
Some directly take the additional_module_args. diff --git a/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeLM.py b/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeLM.py new file mode 100644 index 000000000..255e12d79 --- /dev/null +++ b/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeLM.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# This example converts a RoBERTa masked-LM (built from config) into a spiking network using the SpikeLM modules +import logging +import os +import sys + +from chop.passes.module.transforms.snn.ann2snn import ann2snn_module_transform_pass +import torch +import torch.nn as nn + +from pathlib import Path + +sys.path.append(Path(__file__).resolve().parents[5].as_posix()) + +logger = logging.getLogger(__name__) +from chop.passes.module.transforms import quantize_module_transform_pass + + +import torch +from torch import nn +from transformers import AutoTokenizer, AutoConfig + +roberta_base_config = { + "architectures": ["RobertaForMaskedLM"], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 1, + "type_vocab_size": 1, + "vocab_size": 50265, +} + +mode_config_path = ( + "/home/thw20/projects/mase/configs/hf_model_configs/configs/roberta_base.json" +) + +# Load model directly +from transformers import AutoTokenizer, AutoModelForMaskedLM + +tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base") +config = AutoConfig.from_pretrained(mode_config_path, cache_dir="/data/models") +logger.info("Training new model from scratch") +model = AutoModelForMaskedLM.from_config(config) +f = open(f"roberta_base_arch.txt", "w") +f.write(str(model)) +f.close() + +convert_pass_args = { + "by": "regex_name", + 
"roberta\.encoder\.layer\.\d+\.attention\.self": { + "config": { + "name": "spikeLM", + "input_bits": 2, + "T": 4, + }, + }, +} + +mg, _ = ann2snn_module_transform_pass(model, convert_pass_args) + + +convert_pass_args = { + "by": "type", + "linear": { + "config": { + "name": "elastic_bi_spiking", + "input_bits": 2, + "T": 4, + }, + }, +} + +mg, _ = ann2snn_module_transform_pass(mg, convert_pass_args) + +f = open(f"roberta_base_arch_snn.txt", "w") +f.write(str(mg)) +f.close() diff --git a/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeZIP.py b/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeZIP.py new file mode 100644 index 000000000..b7236b43c --- /dev/null +++ b/test/passes/module/transforms/ann2snn/test_ann2snn_module_roberta_spikeZIP.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +# This example quantizes a pretrained RoBERTa classifier and converts it into a spiking network using the SpikeZIP-TF modules +import logging +import os +import sys + +from chop.passes.module.transforms.snn.ann2snn import ann2snn_module_transform_pass +import torch +import torch.nn as nn + +from pathlib import Path + +sys.path.append(Path(__file__).resolve().parents[5].as_posix()) + + +from chop.passes.module.transforms import quantize_module_transform_pass + + +import torch +from torch import nn +from transformers import RobertaForSequenceClassification, AutoTokenizer + +pretrained = "XianYiyk/roberta-relu-pretrained-sst2" +bert = RobertaForSequenceClassification.from_pretrained(pretrained, num_labels=2) +tokenizer = AutoTokenizer.from_pretrained(pretrained, do_lower_case=True) +for param in bert.parameters(): + param.requires_grad = True # QAT training + + +# def test_ann2snn_module_transform_pass(): +quan_pass_args = { + "by": "regex_name", + "roberta\.encoder\.layer\.\d+\.attention\.self": { + "config": { + "name": "lsqinteger", + "level": 32, + } + }, + "roberta\.encoder\.layer\.\d+\.attention\.output": { + "config": { + "name": "lsqinteger", + "level": 32, + } + }, + "roberta\.encoder\.layer\.\d+\.output": 
{ + "config": { + "name": "lsqinteger", + "level": 32, + } + }, + "roberta\.encoder\.layer\.\d+\.intermediate": { + "config": { + "name": "lsqinteger", + "level": 32, + } + }, + "classifier": { + "config": { + "name": "lsqinteger", + "level": 32, + } + }, +} +mg, _ = quantize_module_transform_pass(bert, quan_pass_args) +# f = open(f"qann_model_arch.txt", "w") +# f.write(str(mg)) +# f.close() + +convert_pass_args = { + "by": "regex_name", + "roberta\.encoder\.layer\.\d+\.attention\.self": { + "config": { + "name": "zip_tf", + "level": 32, + "neuron_type": "ST-BIF", + }, + }, +} +mg, _ = ann2snn_module_transform_pass(mg, convert_pass_args) + +convert_pass_args = { + "by": "type", + "embedding": { + "config": { + "name": "zip_tf", + }, + }, + "linear": { + "config": { + "name": "unfold_bias", + "level": 32, + "neuron_type": "ST-BIF", + }, + }, + "conv2d": { + "config": { + "name": "zip_tf", + "level": 32, + "neuron_type": "ST-BIF", + }, + }, + "layernorm": { + "config": { + "name": "zip_tf", + }, + }, + "relu": { + "manual_instantiate": True, + "config": { + "name": "identity", + }, + }, + "lsqinteger": { + "manual_instantiate": True, + "config": { + "name": "st_bif", + # Default values. These would be replaced by the values from the LSQInteger module, so it has no effect. + # "q_threshold": 1, + # "level": 32, + # "sym": True, + }, + }, +} +mg, _ = ann2snn_module_transform_pass(mg, convert_pass_args) + +# f = open(f"spiking_model_arch.txt", "w") +# f.write(str(mg)) +# f.close()