diff --git a/src/cnlpt/CnlpModelForClassification.py b/src/cnlpt/CnlpModelForClassification.py index 30fdde88..30d0caf1 100644 --- a/src/cnlpt/CnlpModelForClassification.py +++ b/src/cnlpt/CnlpModelForClassification.py @@ -287,7 +287,7 @@ def __init__( self.encoder = encoder_model.from_pretrained(config.encoder_name) # part of the motivation for leaving this # logic alone for character level models is that - # at the time of writing, CANINE and Flair are the only game in town. + # at the time of writing, CANINE and Flair are the only game in town. # CANINE's hashable embeddings for unicode codepoints allows for # additional parameterization, which rn doesn't seem so relevant if not config.character_level: @@ -329,12 +329,12 @@ def __init__( head_size=config.rel_attention_head_dims, ) if config.relations[task_name]: - hidden_size = config.num_rel_attention_heads - if config.use_prior_tasks: - hidden_size += total_prev_task_labels + # hidden_size = config.num_rel_attention_heads + # if config.use_prior_tasks: + # hidden_size += total_prev_task_labels self.classifiers[task_name] = ClassificationHead( - config, task_num_labels, hidden_size=hidden_size + config, task_num_labels, ) else: self.classifiers[task_name] = ClassificationHead( @@ -491,6 +491,30 @@ def compute_loss( ) state["loss"] += task_weight * task_loss + def remove_task_classifiers(self, tasks: list[str] = None): + if tasks is None: + self.classifiers = nn.ModuleDict() + self.tasks = [] + self.class_weights = {} + else: + for task in tasks: + self.classifiers.pop(task) + self.tasks.remove(task) + self.class_weights.pop(task) + + def add_task_classifier(self, task_name: str, label_dictionary: dict[str, list]): + self.tasks.append(task_name) + self.classifiers[task_name] = ClassificationHead( + self.config, len(label_dictionary) + ) + self.label_dictionary[task_name] = label_dictionary + + def set_class_weights(self, class_weights: Union[list[float], None] = None): + if class_weights is None: + self.class_weights = {x: None for x in self.label_dictionary.keys()} + else: + self.class_weights = class_weights + def forward( self, input_ids=None, @@ -531,7 +555,6 @@ def forward( Returns: (`transformers.SequenceClassifierOutput`) the output of the model """ - kwargs = generalize_encoder_forward_kwargs( self.encoder, attention_mask=attention_mask, diff --git a/src/cnlpt/cnlp_args.py b/src/cnlpt/cnlp_args.py index d3d45c4a..d07d0c59 100644 --- a/src/cnlpt/cnlp_args.py +++ b/src/cnlpt/cnlp_args.py @@ -298,18 +298,18 @@ class DaptArguments: "help": "Pretrained tokenizer name or path if not the same as model_name" }, ) - output_dir: Union[str, None] = field( - default=None, metadata={"help": "Directory path to write trained model to."} - ) - overwrite_output_dir: bool = field( - default=False, - metadata={ - "help": ( - "Overwrite the content of the output directory. " - "Use this to continue training if output_dir points to a checkpoint directory." - ) - }, - ) + # output_dir: Union[str, None] = field( + # default=None, metadata={"help": "Directory path to write trained model to."} + # ) + # overwrite_output_dir: bool = field( + # default=False, + # metadata={ + # "help": ( + # "Overwrite the content of the output directory. " + # "Use this to continue training if output_dir points to a checkpoint directory." + # ) + # }, + # ) data_dir: Union[str, None] = field( default=None, metadata={"help": "The data dir for domain-adaptive pretraining."} ) @@ -333,12 +333,12 @@ class DaptArguments: default=0.2, metadata={"help": "The test split proportion for domain-adaptive pretraining."}, ) - seed: int = field( - default=42, - metadata={ - "help": "The random seed to use for a train/test split for domain-adaptive pretraining (requires --dapt-encoder)." - }, - ) + # seed: int = field( + # default=42, + # metadata={ + # "help": "The random seed to use for a train/test split for domain-adaptive pretraining (requires --dapt-encoder)." + # }, + # ) no_eval: bool = field( default=False, metadata={"help": "Don't split into train and test; just pretrain."}, diff --git a/src/cnlpt/cnlp_data.py b/src/cnlpt/cnlp_data.py index ba9090e8..12960788 100644 --- a/src/cnlpt/cnlp_data.py +++ b/src/cnlpt/cnlp_data.py @@ -1155,10 +1155,9 @@ def __init__( batched=True, remove_columns=list(remove_columns), ) - dataset = dataset.map( - functools.partial(group_texts, self.args.chunk_size), - batched=True, - ) + + dataset = dataset.remove_columns("word_ids") + if isinstance(dataset, (DatasetDict, IterableDatasetDict)) or args.no_eval: self.dataset = dataset diff --git a/src/cnlpt/cnlp_processors.py b/src/cnlpt/cnlp_processors.py index db496505..997a120d 100644 --- a/src/cnlpt/cnlp_processors.py +++ b/src/cnlpt/cnlp_processors.py @@ -171,7 +171,7 @@ def __init__(self, data_dir: str, tasks: set[str] = None, max_train_items=-1): else: sep = "\t" - self.dataset = load_dataset("csv", sep=sep, data_files=data_files) + self.dataset = load_dataset("csv", sep=sep, data_files=data_files, keep_default_na=False) ## find out what tasks are available to this dataset, and see the overlap with what the ## user specified at the cli, remove those tasks so we don't also get them from other datasets diff --git a/src/cnlpt/dapt.py b/src/cnlpt/dapt.py index 4463b9b8..09c9afaa 100644 --- a/src/cnlpt/dapt.py +++ b/src/cnlpt/dapt.py @@ -8,6 +8,7 @@ from typing import Any, Union from transformers import ( + AutoConfig, AutoModelForMaskedLM, AutoTokenizer, HfArgumentParser, @@ -16,12 +17,68 @@ set_seed, ) -from .cnlp_args import DaptArguments +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import MaskedLMOutput +from transformers.modeling_utils import PreTrainedModel + +from .CnlpModelForClassification import CnlpConfig, freeze_encoder_weights, generalize_encoder_forward_kwargs +from .cnlp_args import DaptArguments, CnlpTrainingArguments from .cnlp_data import DaptDataset logger = logging.getLogger(__name__) +class DaptModel(PreTrainedModel): + base_model_prefix = "cnlpt" + config_class = CnlpConfig + + def __init__( + self, + config: config_class, + freeze: float = -1.0, + ): + super().__init__(config) + encoder_config = AutoConfig.from_pretrained(config._name_or_path) + encoder_config.vocab_size = config.vocab_size + config.encoder_config = encoder_config.to_dict() + model = AutoModelForMaskedLM.from_config(encoder_config) + self.encoder = model.from_pretrained(config._name_or_path) + # if not config.character_level: + self.encoder.resize_token_embeddings(encoder_config.vocab_size) + + if freeze > 0: + freeze_encoder_weights(self.encoder.bert.encoder, freeze) + + def forward( + self, + input_ids, + token_type_ids, + attention_mask, + labels, + ): + kwargs = generalize_encoder_forward_kwargs( + self.encoder, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=True, + return_dict=True, + ) + + outputs = self.encoder(input_ids, **kwargs) + logits = outputs.logits + + if labels is not None: + loss_fn = CrossEntropyLoss() + loss = loss_fn(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + return MaskedLMOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def main( json_file: Union[str, None] = None, json_obj: Union[dict[str, Any], None] = None ): @@ -39,30 +96,31 @@ def main( :rtype: typing.Dict[str, typing.Dict[str, typing.Any]] :return: the evaluation results (will be empty if ``--do_eval`` not passed) """ - parser = HfArgumentParser((DaptArguments,)) + parser = HfArgumentParser((DaptArguments, CnlpTrainingArguments)) dapt_args: DaptArguments + training_args: CnlpTrainingArguments if json_file is not None and json_obj is not None: raise ValueError("cannot specify json_file and json_obj") if json_file is not None: - (dapt_args,) = parser.parse_json_file(json_file=json_file) + (dapt_args, training_args) = parser.parse_json_file(json_file=json_file) elif json_obj is not None: - (dapt_args,) = parser.parse_dict(json_obj) + (dapt_args, training_args) = parser.parse_dict(json_obj) elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - (dapt_args,) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + (dapt_args, training_args) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - (dapt_args,) = parser.parse_args_into_dataclasses() + (dapt_args, training_args) = parser.parse_args_into_dataclasses() if ( - os.path.exists(dapt_args.output_dir) - and os.listdir(dapt_args.output_dir) - and not dapt_args.overwrite_output_dir + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and not training_args.overwrite_output_dir ): raise ValueError( - f"Output directory ({dapt_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging @@ -85,9 +143,10 @@ def main( # logger.info("Model parameters %s" % model_args) logger.info(f"Domain adaptation parameters {dapt_args}") + logger.info(f"Training arguments {training_args}") # Set seed - set_seed(dapt_args.seed) + set_seed(training_args.seed) # Load tokenizer: Need this first for loading the datasets tokenizer = AutoTokenizer.from_pretrained( @@ -101,13 +160,15 @@ def main( # additional_special_tokens=['', '', '', '', '', '', '', ''] ) - model = AutoModelForMaskedLM.from_pretrained(dapt_args.encoder_name) + # model = AutoModelForMaskedLM.from_pretrained(dapt_args.encoder_name) + config = AutoConfig.from_pretrained(dapt_args.encoder_name) + model = DaptModel(config, freeze=training_args.freeze) dataset = DaptDataset(dapt_args, tokenizer=tokenizer) trainer = Trainer( model=model, - args=TrainingArguments(output_dir=dapt_args.output_dir), + args=training_args, train_dataset=dataset.train, eval_dataset=dataset.test if not dapt_args.no_eval else None, data_collator=dataset.data_collator, diff --git a/src/cnlpt/train_system.py b/src/cnlpt/train_system.py index bdb335be..d75489dd 100644 --- a/src/cnlpt/train_system.py +++ b/src/cnlpt/train_system.py @@ -300,7 +300,7 @@ def main( if data_args.weight_classes: from collections import Counter - class_weights = [] + class_weights = {} for task in task_names: # get labels in the right order ([0, 1]) if isinstance( @@ -309,17 +309,21 @@ def main( dataset.tasks_to_labels[task] = dataset.tasks_to_labels[task][1:] + [ dataset.tasks_to_labels[task][0] ] - labels = dataset.processed_dataset["train"][task] + if tagger[task]: + labels = [token_label for sent in dataset.processed_dataset["train"][task] for token_label in sent.split()] + else: + labels = dataset.processed_dataset["train"][task] weights = [] label_counts = Counter(labels) for label in dataset.tasks_to_labels[task]: - weights.append(len(labels) / (num_labels[task] * label_counts[label])) + count = max(label_counts[label], 1) + weights.append(len(labels) / (num_labels[task] * count)) # class weights are determined by severity of class imbalance if len(task_names) > 1: - class_weights.append(weights) + class_weights[task] = torch.tensor(weights).to(training_args.device) else: - class_weights = weights # if we just have the one class, simplify the tensor or pytorch will be mad - class_weights = torch.tensor(class_weights).to(training_args.device) + class_weights = torch.tensor(weights).to(training_args.device) # if we just have the one class, simplify the tensor or pytorch will be mad + # class_weights = torch.tensor(class_weights).to(training_args.device) # sm = torch.nn.Softmax(dim=class_weights.ndim - 1) # class_weights = sm(class_weights) @@ -446,6 +450,7 @@ def main( # TODO check when download any pretrained language model to local disk, if # the following condition "is_hub_model(encoder_name)" works or not. + # ^ is_hub_model and is_external_encoder both return False, as long as "model_type": "cnlpt" is in config.json if not is_external_encoder(encoder_name): # we are loading one of our own trained models as a starting point. # @@ -459,7 +464,6 @@ def main( # the model file to be loaded down below the normal way. since that temp file # doesn't have a stored classifier it will use the randomly-inited classifier head # with the size of the supplied config (for the new task). - # TODO This setting 1) is not tested yet. # 2) if training_args.do_train is false: # we evaluate or make predictions of our trained models. # Both two setting require the registeration of CnlpConfig, and use @@ -468,6 +472,11 @@ def main( # Load the cnlp configuration using AutoConfig, this will not override # the arguments from trained cnlp models. While using CnlpConfig will override # the model_type and model_name of the encoder. + if model_args.keep_existing_classifiers == model_args.ignore_existing_classifiers: # XNOR + raise ValueError( + "For continued training of a cnlpt model, one of --keep_existing_classifiers or --ignore_existing_classifiers flags should be selected." + ) + config = AutoConfig.from_pretrained( ( model_args.config_name @@ -477,41 +486,57 @@ def main( cache_dir=model_args.cache_dir, # in this case we're looking at a fine-tuned model (?) character_level=data_args.character_level, + layer=model_args.layer, ) - if training_args.do_train: # Setting 1) only load weights from the encoder - raise NotImplementedError( - "This functionality has not been restored yet" - ) + if model_args.ignore_existing_classifiers: + config.finetuning_task = ( + data_args.task_name + if data_args.task_name is not None + else dataset.tasks + ) + elif model_args.keep_existing_classifiers: + # setting 2) evaluate or make predictions + if ( + config.finetuning_task != data_args.task_name + or config.relations != relations + or config.tagger != tagger + ): + raise ValueError( + "When --keep_existing_classifiers is selected, please ensure" + "that you set the settings the same as those used in the" + "previous training run." + ) + model = CnlpModelForClassification( - model_path=model_args.encoder_name, config=config, - cache_dir=model_args.cache_dir, - tagger=tagger, - relations=relations, - class_weights=dataset.class_weights, + # class_weights=dataset.class_weights, + class_weights=class_weights, final_task_weight=training_args.final_task_weight, - use_prior_tasks=model_args.use_prior_tasks, - argument_regularization=model_args.arg_reg, + freeze=training_args.freeze, ) - delattr(model, "classifiers") - delattr(model, "feature_extractors") + if model_args.ignore_existing_classifiers: + model.remove_task_classifiers() + for task in data_args.task_name: + model.add_task_classifier(task, dataset.get_labels()[task]) + model.set_class_weights(dataset.class_weights) + if training_args.do_train: tempmodel = tempfile.NamedTemporaryFile(dir=model_args.cache_dir) torch.save(model.state_dict(), tempmodel) model_name = tempmodel.name - else: + else: # load existing head # setting 2) evaluate or make predictions model = CnlpModelForClassification.from_pretrained( model_args.encoder_name, config=config, - class_weights=dataset.class_weights, + class_weights=class_weights, final_task_weight=training_args.final_task_weight, freeze=training_args.freeze, bias_fit=training_args.bias_fit, ) - + model.tasks = data_args.task_name else: # This only works when model_args.encoder_name is one of the # model card from https://huggingface.co/models @@ -541,7 +566,7 @@ def main( config.vocab_size = len(tokenizer) model = CnlpModelForClassification( config=config, - class_weights=dataset.class_weights, + class_weights=class_weights, final_task_weight=training_args.final_task_weight, freeze=training_args.freeze, bias_fit=training_args.bias_fit, @@ -656,15 +681,22 @@ def compute_metrics_fn(p: EvalPrediction): raise RuntimeError( f"Unrecognized label type: {type(training_args.model_selection_label)}" ) - else: # same default as in 0.6.0 + elif dataset.output_modes[task] == relex: task_scores.append( metrics[task_name].get( "one_score", np.mean(metrics[task_name].get("f1")) ) ) + else: + task_scores.append( + metrics[task_name].get( + "one_score", np.mean(metrics[task_name].get("token_f1")) + ) + ) # task_scores.append(processor.get_one_score(metrics.get(task_name, metrics.get(task_name.split('-')[0], None)))) one_score = sum(task_scores) / len(task_scores) + metrics["one_score"] = one_score if model is not None: if not hasattr(model, "best_score") or one_score > model.best_score: @@ -675,7 +707,7 @@ def compute_metrics_fn(p: EvalPrediction): model.best_eval_results = metrics if trainer.is_world_process_zero(): if training_args.do_train: - trainer.save_model() + trainer.save_model() # NOTE: a RobertaConfig is loaded here. why? tokenizer.save_pretrained(training_args.output_dir) if model_name == "cnn" or model_name == "lstm": with open( @@ -690,7 +722,7 @@ def compute_metrics_fn(p: EvalPrediction): ) config_dict["task_names"] = task_names json.dump(config_dict, f) - for task_ind, task_name in enumerate(metrics): + for task_ind, task_name in enumerate(task_names): with open(output_eval_file, "a") as writer: logger.info( f"***** Eval results for task {task_name} *****" @@ -720,7 +752,8 @@ def compute_metrics_fn(p: EvalPrediction): return compute_metrics_fn # Initialize our Trainer - training_args.load_best_model_at_end = True + # training_args.load_best_model_at_end = True + # TODO the argument in CnlpTrainingArguments is `model_selection_score`. reconcile this with `metric_for_best_model`? training_args.metric_for_best_model = "one_score" trainer = Trainer( model=model, @@ -884,7 +917,7 @@ def compute_metrics_fn(p: EvalPrediction): out_table = process_prediction( task_names=dataset.tasks, - error_analysis=False, + error_analysis=training_args.error_analysis, output_prob=training_args.output_prob, character_level=data_args.character_level, task_to_label_packet=task_to_label_packet, @@ -910,4 +943,4 @@ def _mp_fn(index): if __name__ == "__main__": - main() + main() \ No newline at end of file