diff --git a/CHANGELOG.md b/CHANGELOG.md
index 20e889514..a92009ed5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added a function to modify a Hugging Face transformer with IA3 adaptors
 - Added a `BeakerScheduler` registrable class, specified as the argument `scheduler` to `BeakerExecutor`,
   which controls the resources assigned to steps ran on Beaker. Users can implement their own
   `BeakerScheduler` subclasses to customize the resource assignment behavior.
diff --git a/docs/source/api/integrations/transformers.rst b/docs/source/api/integrations/transformers.rst
index 59a003326..376b89143 100644
--- a/docs/source/api/integrations/transformers.rst
+++ b/docs/source/api/integrations/transformers.rst
@@ -17,3 +17,5 @@ Reference
 
 .. autoclass:: tango.integrations.transformers.RunGenerationDataset
     :members:
+
+.. autofunction:: tango.integrations.transformers.ia3.modify_with_ia3
\ No newline at end of file
diff --git a/tango/integrations/transformers/ia3.py b/tango/integrations/transformers/ia3.py
new file mode 100644
index 000000000..e5013805d
--- /dev/null
+++ b/tango/integrations/transformers/ia3.py
@@ -0,0 +1,287 @@
+import re
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from transformers.modeling_utils import Conv1D
+
+
+@dataclass
+class WithIA3Config:
+    """
+    A class for configuring which layers to modify with IA3 adaptors.
+
+    :param ia3_param_names:
+        A string used as the name for all IA3 parameters.
+    :param attention_modules:
+        A regex that matches all attention modules that are parents of the key and value layers to modify.
+    :param mlp_modules:
+        A regex that matches all modules that are parents of the feed-forward layer to modify.
+    :param mlp_layers:
+        A regex that matches the feed-forward layer in the modules specified by `mlp_modules`.
+    :param fused_qkv_layers:
+        A regex that matches the combined query, key, and value layer in the modules specified
+        by `attention_modules`.
+    :param k_layers:
+        A regex that matches the key layer in the modules specified by `attention_modules`.
+    :param v_layers:
+        A regex that matches the value layer in the modules specified by `attention_modules`.
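+
+    See `GPT_2_IA3_CONFIG` and the other pre-made configurations below for complete examples.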
+ """ + + ia3_param_names: str + attention_modules: str + mlp_modules: str + mlp_layers: str + fused_qkv_layers: Optional[str] = None + k_layers: Optional[str] = None + v_layers: Optional[str] = None + + +GPT_J_IA3_CONFIG = WithIA3Config( + attention_modules=".*attn", + k_layers="k_proj", + v_layers="v_proj", + mlp_modules=".*mlp", + mlp_layers="fc_in", + ia3_param_names="ia3", +) + +GPT_2_IA3_CONFIG = WithIA3Config( + attention_modules=".*attn", + fused_qkv_layers="c_attn", + mlp_modules=".*mlp", + mlp_layers="c_fc", + ia3_param_names="ia3", +) + +OPT_IA3_CONFIG = WithIA3Config( + attention_modules=".*self_attn", + k_layers="k_proj", + v_layers="v_proj", + mlp_modules=r".*layers\.\d*", + mlp_layers="fc1", + ia3_param_names="ia3", +) + +BLOOM_IA3_CONFIG = WithIA3Config( + attention_modules=".*self_attention", + fused_qkv_layers="query_key_value", + mlp_modules=".*mlp", + mlp_layers="dense_h_to_4h", + ia3_param_names="ia3", +) + +MODEL_NAME_TO_CONFIG = { + "sshleifer/tiny-gpt2": GPT_2_IA3_CONFIG, + "gpt2": GPT_2_IA3_CONFIG, + "gpt2-medium": GPT_2_IA3_CONFIG, + "gpt2-large": GPT_2_IA3_CONFIG, + "gpt2-xl": GPT_2_IA3_CONFIG, + "bigscience/bloom-560m": BLOOM_IA3_CONFIG, + "bigscience/bloom-1b1": BLOOM_IA3_CONFIG, + "bigscience/bloom-1b7": BLOOM_IA3_CONFIG, + "bigscience/bloom-3b": BLOOM_IA3_CONFIG, + "bigscience/bloom-7b1": BLOOM_IA3_CONFIG, + "bigscience/bloom": BLOOM_IA3_CONFIG, + "facebook/opt-125m": OPT_IA3_CONFIG, + "facebook/opt-350m": OPT_IA3_CONFIG, + "facebook/opt-1.3b": OPT_IA3_CONFIG, + "facebook/opt-2.7b": OPT_IA3_CONFIG, + "facebook/opt-6.7b": OPT_IA3_CONFIG, + "facebook/opt-13b": OPT_IA3_CONFIG, + "facebook/opt-30b": OPT_IA3_CONFIG, + "facebook/opt-66b": OPT_IA3_CONFIG, + "EleutherAI/gpt-j-6B": GPT_J_IA3_CONFIG, +} + + +class WithIA3(nn.Module): + def __init__(self, ia3_param_names: str, unfuse_size: int = None): + super().__init__() + self.ia3_param_names = ia3_param_names + + # if (q,k,v) are stacked into one layer + if unfuse_size is not None: + # IA3 only operates on k and v (not q), thus the "* 2" + setattr(self, ia3_param_names, nn.Parameter(torch.ones(unfuse_size * 2, 1))) + else: + setattr(self, ia3_param_names, nn.Parameter(torch.ones(self.out_features, 1))) # type: ignore + + def scale_by_ia3(self, x): + ia3_params = getattr(self, self.ia3_param_names) + + if ia3_params.requires_grad: + if self.unfuse_size is not None: + # non_q means k and v + q, non_q = x[:, :, : self.unfuse_size], x[:, :, self.unfuse_size :] + ia3_params = getattr(self, self.ia3_param_names) + non_q = non_q * ia3_params.flatten() + x = torch.cat([q, non_q], dim=2) + else: + x = x * ia3_params.flatten() + + return x + + +class LinearWithIA3(WithIA3): + def __init__(self, linear_layer: nn.Linear, ia3_param_names: str, unfuse_size: int = None): + """ + A replacement for :class:`~torch.nn.Linear` modified with an IA3 adaptor + + + :param linear_layer: + A :class:`~torch.nn.Linear` layer to adapt. + :param ia3_param_names: + A `str` to use as the name of ia3 parameters. + :param unfuse_size: + An `int` indicating hidden dimension of the query, key, and value vectors. + To be used only when the layer to modify is a fused projection of query, + key, and value vectors in an attention mechanism. 
+ """ + assert unfuse_size is None or (linear_layer.out_features == unfuse_size * 3) + self.in_features = linear_layer.in_features + self.out_features = linear_layer.out_features + self.unfuse_size = unfuse_size + + super().__init__(ia3_param_names, unfuse_size) + + self.weight = linear_layer.weight + self.bias = linear_layer.bias + + def forward(self, x): + x = F.linear(x, self.weight, self.bias) + return self.scale_by_ia3(x) + + +class Conv1DWithIA3(WithIA3): + def __init__(self, conv1d_layer: Conv1D, ia3_param_names: str, unfuse_size: int = None): + """ + A replacement for :class:`~transformers.modeling_utils.Conv1D` modified with an IA3 adaptor + + + :param conv1d_layer: + A :class:`~transformers.modeling_utils.Conv1D` layer to adapt. + :param ia3_param_names: + A `str` to use as the name of ia3 parameters. + :param unfuse_size: + An `int` indicating hidden dimension of the query, key, and value vectors. + To be used only when the layer to modify is a fused projection of query, + key, and value vectors in an attention mechanism. + """ + assert unfuse_size is None or (conv1d_layer.nf == unfuse_size * 3) + + # nf: number of output features; nx: number of input features + self.out_features = conv1d_layer.nf + self.unfuse_size = unfuse_size + + super().__init__(ia3_param_names, unfuse_size) + + self.weight = conv1d_layer.weight + self.bias = conv1d_layer.bias + + def forward(self, x): + # copied and pasted from the original Conv1D implemnetation + size_out = x.size()[:-1] + (self.out_features,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(size_out) # ... * self.nf + + return self.scale_by_ia3(x) + + +def modify_with_ia3( + transformer: PreTrainedModel, + *, + config: WithIA3Config = None, + only_ia3_requires_grad: bool = True, +) -> PreTrainedModel: + """ + A function to add ia3 adaptors to the given transformer. Code modified from + `t-few `_ + and Qinyuan Ye + + + :param model: + A :class:`~transformers.PreTrainedModel` to modify. + :param config: + A :class:`~tango.integrations.transformers.ia3.WithIA3Config` that specifies the layers to modify. + :param only_ia3_requires_grad: + A `bool`, `True` if `requires_grad` should only be set on ia3 paramenters in the output model. + + Examples + -------- + + You can use the provided configurations: + + .. testcode:: + + from transformers import AutoModelForCausalLM, AutoTokenizer + from tango.integrations.transformers.ia3 import modify_with_ia3, GPT_2_IA3_CONFIG + + model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2") + model = modify_with_ia3(model, config=GPT_2_IA3_CONFIG) + + Or you can write your own configuration with regex matching the layers to modify and their parents: + + .. testcode:: + + from transformers import AutoModelForCausalLM, AutoTokenizer + from tango.integrations.transformers.ia3 import modify_with_ia3 + + my_config = WithIA3Config( + attention_modules=".*attn", + fused_qkv_layers="c_attn", + mlp_modules=".*mlp", + mlp_layers="c_fc", + ia3_param_names="ia3", + ) + + model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2") + model = modify_with_ia3(model, config=my_config) + """ + if config is None: + model_name = transformer.config._name_or_path # type: ignore + assert ( + model_name in MODEL_NAME_TO_CONFIG + ), f"{model_name} does not have a pre made configuration; please make your own." 
+        config = MODEL_NAME_TO_CONFIG[model_name]
+
+    for m_name, module in dict(transformer.named_modules()).items():  # type: ignore
+        if re.fullmatch(config.attention_modules, m_name) or re.fullmatch(
+            config.mlp_modules, m_name
+        ):
+            attn_layers = [
+                regex
+                for regex in (config.fused_qkv_layers, config.k_layers, config.v_layers)
+                if regex is not None
+            ]
+            layers_to_change = (
+                "|".join(attn_layers)
+                if re.fullmatch(config.attention_modules, m_name)
+                else config.mlp_layers
+            )
+            for c_name, layer in dict(module.named_children()).items():
+                if re.fullmatch(layers_to_change, c_name):
+                    assert isinstance(
+                        layer, (Conv1D, nn.Linear)
+                    ), "This code only supports Conv1D and nn.Linear"
+                    adaptor_class = Conv1DWithIA3 if isinstance(layer, Conv1D) else LinearWithIA3
+                    new_module = adaptor_class(
+                        layer,
+                        config.ia3_param_names,
+                        unfuse_size=transformer.config.hidden_size  # type: ignore
+                        if config.fused_qkv_layers and re.fullmatch(config.fused_qkv_layers, c_name)
+                        else None,
+                    )
+                    setattr(module, c_name, new_module)
+
+    if only_ia3_requires_grad:
+        transformer.requires_grad_(False)  # type: ignore
+        for p_name, param in dict(transformer.named_parameters()).items():  # type: ignore
+            if re.fullmatch(".*" + config.ia3_param_names + ".*", p_name):
+                param.requires_grad_(True)
+
+    return transformer
diff --git a/tests/integrations/transformers/ia3_test.py b/tests/integrations/transformers/ia3_test.py
new file mode 100644
index 000000000..fa2df2aa8
--- /dev/null
+++ b/tests/integrations/transformers/ia3_test.py
@@ -0,0 +1,38 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from tango.integrations.transformers.ia3 import GPT_2_IA3_CONFIG, modify_with_ia3
+
+
+def test_ia3():
+    config = GPT_2_IA3_CONFIG
+    model_name = "sshleifer/tiny-gpt2"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    input_seq = tokenizer(["A tiny test on a tiny model."], return_tensors="pt")
+
+    model = AutoModelForCausalLM.from_pretrained(model_name).eval()
+
+    with torch.inference_mode():
+        old_outputs = model(
+            input_ids=input_seq.input_ids,
+            attention_mask=input_seq.attention_mask,
+            labels=input_seq.input_ids,
+        )
+
+    # since the IA3 parameters are initialized to ones, scaling is the identity
+    # and the adapted model should produce the same outputs as the original
+    model = modify_with_ia3(model, config=config)
+
+    with torch.inference_mode():
+        new_outputs = model(
+            input_ids=input_seq.input_ids,
+            attention_mask=input_seq.attention_mask,
+            labels=input_seq.input_ids,
+        )
+
+    logits_diff = torch.abs(old_outputs.logits - new_outputs.logits).mean()
+    assert logits_diff < 1e-10
+
+    loss_diff = torch.abs(old_outputs.loss - new_outputs.loss)
+    assert loss_diff < 1e-10
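+
+
+def test_ia3_requires_grad():
+    # A minimal sketch of a follow-up check: with the default
+    # only_ia3_requires_grad=True, only the IA3 parameters should
+    # be left trainable after modification.
+    model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
+    model = modify_with_ia3(model, config=GPT_2_IA3_CONFIG)
+
+    trainable = [name for name, p in model.named_parameters() if p.requires_grad]
+    assert len(trainable) > 0
+    assert all("ia3" in name for name in trainable)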