diff --git a/src/cnlpt/CnlpModelForClassification.py b/src/cnlpt/CnlpModelForClassification.py
index 30fdde88..30d0caf1 100644
--- a/src/cnlpt/CnlpModelForClassification.py
+++ b/src/cnlpt/CnlpModelForClassification.py
@@ -287,7 +287,7 @@ def __init__(
self.encoder = encoder_model.from_pretrained(config.encoder_name)
# part of the motivation for leaving this
# logic alone for character level models is that
- # at the time of writing, CANINE and Flair are the only game in town.
+ # at the time of writing, CANINE and Flair are the only game in town.
# CANINE's hashable embeddings for unicode codepoints allows for
# additional parameterization, which rn doesn't seem so relevant
if not config.character_level:
@@ -329,12 +329,12 @@ def __init__(
head_size=config.rel_attention_head_dims,
)
if config.relations[task_name]:
- hidden_size = config.num_rel_attention_heads
- if config.use_prior_tasks:
- hidden_size += total_prev_task_labels
+ # hidden_size = config.num_rel_attention_heads
+ # if config.use_prior_tasks:
+ # hidden_size += total_prev_task_labels
self.classifiers[task_name] = ClassificationHead(
- config, task_num_labels, hidden_size=hidden_size
+ config, task_num_labels,
)
else:
self.classifiers[task_name] = ClassificationHead(
@@ -491,6 +491,30 @@ def compute_loss(
)
state["loss"] += task_weight * task_loss
+    def remove_task_classifiers(self, tasks: Union[list[str], None] = None):
+ if tasks is None:
+ self.classifiers = nn.ModuleDict()
+ self.tasks = []
+ self.class_weights = {}
+ else:
+ for task in tasks:
+ self.classifiers.pop(task)
+ self.tasks.remove(task)
+ self.class_weights.pop(task)
+
+ def add_task_classifier(self, task_name: str, label_dictionary: dict[str, list]):
+ self.tasks.append(task_name)
+ self.classifiers[task_name] = ClassificationHead(
+ self.config, len(label_dictionary)
+ )
+ self.label_dictionary[task_name] = label_dictionary
+
+    def set_class_weights(self, class_weights: Union[dict, list[float], None] = None):
+ if class_weights is None:
+ self.class_weights = {x: None for x in self.label_dictionary.keys()}
+ else:
+ self.class_weights = class_weights
+
def forward(
self,
input_ids=None,
@@ -531,7 +555,6 @@ def forward(
Returns: (`transformers.SequenceClassifierOutput`) the output of the model
"""
-
kwargs = generalize_encoder_forward_kwargs(
self.encoder,
attention_mask=attention_mask,
diff --git a/src/cnlpt/cnlp_args.py b/src/cnlpt/cnlp_args.py
index d3d45c4a..d07d0c59 100644
--- a/src/cnlpt/cnlp_args.py
+++ b/src/cnlpt/cnlp_args.py
@@ -298,18 +298,18 @@ class DaptArguments:
"help": "Pretrained tokenizer name or path if not the same as model_name"
},
)
- output_dir: Union[str, None] = field(
- default=None, metadata={"help": "Directory path to write trained model to."}
- )
- overwrite_output_dir: bool = field(
- default=False,
- metadata={
- "help": (
- "Overwrite the content of the output directory. "
- "Use this to continue training if output_dir points to a checkpoint directory."
- )
- },
- )
+ # output_dir: Union[str, None] = field(
+ # default=None, metadata={"help": "Directory path to write trained model to."}
+ # )
+ # overwrite_output_dir: bool = field(
+ # default=False,
+ # metadata={
+ # "help": (
+ # "Overwrite the content of the output directory. "
+ # "Use this to continue training if output_dir points to a checkpoint directory."
+ # )
+ # },
+ # )
data_dir: Union[str, None] = field(
default=None, metadata={"help": "The data dir for domain-adaptive pretraining."}
)
@@ -333,12 +333,12 @@ class DaptArguments:
default=0.2,
metadata={"help": "The test split proportion for domain-adaptive pretraining."},
)
- seed: int = field(
- default=42,
- metadata={
- "help": "The random seed to use for a train/test split for domain-adaptive pretraining (requires --dapt-encoder)."
- },
- )
+ # seed: int = field(
+ # default=42,
+ # metadata={
+ # "help": "The random seed to use for a train/test split for domain-adaptive pretraining (requires --dapt-encoder)."
+ # },
+ # )
no_eval: bool = field(
default=False,
metadata={"help": "Don't split into train and test; just pretrain."},
diff --git a/src/cnlpt/cnlp_data.py b/src/cnlpt/cnlp_data.py
index ba9090e8..12960788 100644
--- a/src/cnlpt/cnlp_data.py
+++ b/src/cnlpt/cnlp_data.py
@@ -1155,10 +1155,9 @@ def __init__(
batched=True,
remove_columns=list(remove_columns),
)
- dataset = dataset.map(
- functools.partial(group_texts, self.args.chunk_size),
- batched=True,
- )
+
+ dataset = dataset.remove_columns("word_ids")
+
if isinstance(dataset, (DatasetDict, IterableDatasetDict)) or args.no_eval:
self.dataset = dataset
diff --git a/src/cnlpt/cnlp_processors.py b/src/cnlpt/cnlp_processors.py
index db496505..997a120d 100644
--- a/src/cnlpt/cnlp_processors.py
+++ b/src/cnlpt/cnlp_processors.py
@@ -171,7 +171,7 @@ def __init__(self, data_dir: str, tasks: set[str] = None, max_train_items=-1):
else:
sep = "\t"
- self.dataset = load_dataset("csv", sep=sep, data_files=data_files)
+ self.dataset = load_dataset("csv", sep=sep, data_files=data_files, keep_default_na=False)
## find out what tasks are available to this dataset, and see the overlap with what the
## user specified at the cli, remove those tasks so we don't also get them from other datasets
diff --git a/src/cnlpt/dapt.py b/src/cnlpt/dapt.py
index 4463b9b8..09c9afaa 100644
--- a/src/cnlpt/dapt.py
+++ b/src/cnlpt/dapt.py
@@ -8,6 +8,7 @@
from typing import Any, Union
from transformers import (
+ AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
HfArgumentParser,
@@ -16,12 +17,68 @@
set_seed,
)
-from .cnlp_args import DaptArguments
+from torch.nn import CrossEntropyLoss
+from transformers.modeling_outputs import MaskedLMOutput
+from transformers.modeling_utils import PreTrainedModel
+
+from .CnlpModelForClassification import CnlpConfig, freeze_encoder_weights, generalize_encoder_forward_kwargs
+from .cnlp_args import DaptArguments, CnlpTrainingArguments
from .cnlp_data import DaptDataset
logger = logging.getLogger(__name__)
+class DaptModel(PreTrainedModel):
+ base_model_prefix = "cnlpt"
+ config_class = CnlpConfig
+
+ def __init__(
+ self,
+ config: config_class,
+ freeze: float = -1.0,
+ ):
+ super().__init__(config)
+ encoder_config = AutoConfig.from_pretrained(config._name_or_path)
+ encoder_config.vocab_size = config.vocab_size
+ config.encoder_config = encoder_config.to_dict()
+        # load pretrained weights directly; instantiating from_config first would only be discarded
+        self.encoder = AutoModelForMaskedLM.from_pretrained(config._name_or_path)
+ # if not config.character_level:
+ self.encoder.resize_token_embeddings(encoder_config.vocab_size)
+
+ if freeze > 0:
+ freeze_encoder_weights(self.encoder.bert.encoder, freeze)
+
+ def forward(
+ self,
+ input_ids,
+ token_type_ids,
+ attention_mask,
+ labels,
+ ):
+ kwargs = generalize_encoder_forward_kwargs(
+ self.encoder,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ output_hidden_states=True,
+ return_dict=True,
+ )
+
+ outputs = self.encoder(input_ids, **kwargs)
+ logits = outputs.logits
+
+        loss = None  # stays None when no labels are given, so MaskedLMOutput below never hits a NameError
+        if labels is not None:
+            loss = CrossEntropyLoss()(logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+ return MaskedLMOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
def main(
json_file: Union[str, None] = None, json_obj: Union[dict[str, Any], None] = None
):
@@ -39,30 +96,31 @@ def main(
:rtype: typing.Dict[str, typing.Dict[str, typing.Any]]
:return: the evaluation results (will be empty if ``--do_eval`` not passed)
"""
- parser = HfArgumentParser((DaptArguments,))
+ parser = HfArgumentParser((DaptArguments, CnlpTrainingArguments))
dapt_args: DaptArguments
+ training_args: CnlpTrainingArguments
if json_file is not None and json_obj is not None:
raise ValueError("cannot specify json_file and json_obj")
if json_file is not None:
- (dapt_args,) = parser.parse_json_file(json_file=json_file)
+ (dapt_args, training_args) = parser.parse_json_file(json_file=json_file)
elif json_obj is not None:
- (dapt_args,) = parser.parse_dict(json_obj)
+ (dapt_args, training_args) = parser.parse_dict(json_obj)
elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
- (dapt_args,) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+ (dapt_args, training_args) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
- (dapt_args,) = parser.parse_args_into_dataclasses()
+ (dapt_args, training_args) = parser.parse_args_into_dataclasses()
if (
- os.path.exists(dapt_args.output_dir)
- and os.listdir(dapt_args.output_dir)
- and not dapt_args.overwrite_output_dir
+ os.path.exists(training_args.output_dir)
+ and os.listdir(training_args.output_dir)
+ and not training_args.overwrite_output_dir
):
raise ValueError(
- f"Output directory ({dapt_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
)
# Setup logging
@@ -85,9 +143,10 @@ def main(
# logger.info("Model parameters %s" % model_args)
logger.info(f"Domain adaptation parameters {dapt_args}")
+ logger.info(f"Training arguments {training_args}")
# Set seed
- set_seed(dapt_args.seed)
+ set_seed(training_args.seed)
# Load tokenizer: Need this first for loading the datasets
tokenizer = AutoTokenizer.from_pretrained(
@@ -101,13 +160,15 @@ def main(
# additional_special_tokens=['', '', '', '', '', '', '', '']
)
- model = AutoModelForMaskedLM.from_pretrained(dapt_args.encoder_name)
+ # model = AutoModelForMaskedLM.from_pretrained(dapt_args.encoder_name)
+ config = AutoConfig.from_pretrained(dapt_args.encoder_name)
+ model = DaptModel(config, freeze=training_args.freeze)
dataset = DaptDataset(dapt_args, tokenizer=tokenizer)
trainer = Trainer(
model=model,
- args=TrainingArguments(output_dir=dapt_args.output_dir),
+ args=training_args,
train_dataset=dataset.train,
eval_dataset=dataset.test if not dapt_args.no_eval else None,
data_collator=dataset.data_collator,
diff --git a/src/cnlpt/train_system.py b/src/cnlpt/train_system.py
index bdb335be..d75489dd 100644
--- a/src/cnlpt/train_system.py
+++ b/src/cnlpt/train_system.py
@@ -300,7 +300,7 @@ def main(
if data_args.weight_classes:
from collections import Counter
- class_weights = []
+ class_weights = {}
for task in task_names:
# get labels in the right order ([0, 1])
if isinstance(
@@ -309,17 +309,21 @@ def main(
dataset.tasks_to_labels[task] = dataset.tasks_to_labels[task][1:] + [
dataset.tasks_to_labels[task][0]
]
- labels = dataset.processed_dataset["train"][task]
+ if tagger[task]:
+ labels = [token_label for sent in dataset.processed_dataset["train"][task] for token_label in sent.split()]
+ else:
+ labels = dataset.processed_dataset["train"][task]
weights = []
label_counts = Counter(labels)
for label in dataset.tasks_to_labels[task]:
- weights.append(len(labels) / (num_labels[task] * label_counts[label]))
+ count = max(label_counts[label], 1)
+ weights.append(len(labels) / (num_labels[task] * count))
# class weights are determined by severity of class imbalance
if len(task_names) > 1:
- class_weights.append(weights)
+ class_weights[task] = torch.tensor(weights).to(training_args.device)
else:
- class_weights = weights # if we just have the one class, simplify the tensor or pytorch will be mad
- class_weights = torch.tensor(class_weights).to(training_args.device)
+ class_weights = torch.tensor(weights).to(training_args.device) # if we just have the one class, simplify the tensor or pytorch will be mad
+ # class_weights = torch.tensor(class_weights).to(training_args.device)
# sm = torch.nn.Softmax(dim=class_weights.ndim - 1)
# class_weights = sm(class_weights)
@@ -446,6 +450,7 @@ def main(
# TODO check when download any pretrained language model to local disk, if
# the following condition "is_hub_model(encoder_name)" works or not.
+ # ^ is_hub_model and is_external_encoder both return False, as long as "model_type": "cnlpt" is in config.json
if not is_external_encoder(encoder_name):
# we are loading one of our own trained models as a starting point.
#
@@ -459,7 +464,6 @@ def main(
# the model file to be loaded down below the normal way. since that temp file
# doesn't have a stored classifier it will use the randomly-inited classifier head
# with the size of the supplied config (for the new task).
- # TODO This setting 1) is not tested yet.
# 2) if training_args.do_train is false:
# we evaluate or make predictions of our trained models.
# Both two setting require the registeration of CnlpConfig, and use
@@ -468,6 +472,11 @@ def main(
# Load the cnlp configuration using AutoConfig, this will not override
# the arguments from trained cnlp models. While using CnlpConfig will override
# the model_type and model_name of the encoder.
+ if model_args.keep_existing_classifiers == model_args.ignore_existing_classifiers: # XNOR
+ raise ValueError(
+                "For continued training of a cnlpt model, exactly one of the --keep_existing_classifiers or --ignore_existing_classifiers flags must be selected."
+ )
+
config = AutoConfig.from_pretrained(
(
model_args.config_name
@@ -477,41 +486,57 @@ def main(
cache_dir=model_args.cache_dir,
# in this case we're looking at a fine-tuned model (?)
character_level=data_args.character_level,
+ layer=model_args.layer,
)
-
if training_args.do_train:
# Setting 1) only load weights from the encoder
- raise NotImplementedError(
- "This functionality has not been restored yet"
- )
+ if model_args.ignore_existing_classifiers:
+ config.finetuning_task = (
+ data_args.task_name
+ if data_args.task_name is not None
+ else dataset.tasks
+ )
+ elif model_args.keep_existing_classifiers:
+ # setting 2) evaluate or make predictions
+ if (
+ config.finetuning_task != data_args.task_name
+ or config.relations != relations
+ or config.tagger != tagger
+ ):
+ raise ValueError(
+                        "When --keep_existing_classifiers is selected, please ensure "
+                        "that you set the settings the same as those used in the "
+                        "previous training run."
+ )
+
model = CnlpModelForClassification(
- model_path=model_args.encoder_name,
config=config,
- cache_dir=model_args.cache_dir,
- tagger=tagger,
- relations=relations,
- class_weights=dataset.class_weights,
+ # class_weights=dataset.class_weights,
+ class_weights=class_weights,
final_task_weight=training_args.final_task_weight,
- use_prior_tasks=model_args.use_prior_tasks,
- argument_regularization=model_args.arg_reg,
+ freeze=training_args.freeze,
)
- delattr(model, "classifiers")
- delattr(model, "feature_extractors")
+ if model_args.ignore_existing_classifiers:
+ model.remove_task_classifiers()
+ for task in data_args.task_name:
+ model.add_task_classifier(task, dataset.get_labels()[task])
+ model.set_class_weights(dataset.class_weights)
+
if training_args.do_train:
tempmodel = tempfile.NamedTemporaryFile(dir=model_args.cache_dir)
torch.save(model.state_dict(), tempmodel)
model_name = tempmodel.name
- else:
+ else: # load existing head
# setting 2) evaluate or make predictions
model = CnlpModelForClassification.from_pretrained(
model_args.encoder_name,
config=config,
- class_weights=dataset.class_weights,
+ class_weights=class_weights,
final_task_weight=training_args.final_task_weight,
freeze=training_args.freeze,
bias_fit=training_args.bias_fit,
)
-
+ model.tasks = data_args.task_name
else:
# This only works when model_args.encoder_name is one of the
# model card from https://huggingface.co/models
@@ -541,7 +566,7 @@ def main(
config.vocab_size = len(tokenizer)
model = CnlpModelForClassification(
config=config,
- class_weights=dataset.class_weights,
+ class_weights=class_weights,
final_task_weight=training_args.final_task_weight,
freeze=training_args.freeze,
bias_fit=training_args.bias_fit,
@@ -656,15 +681,22 @@ def compute_metrics_fn(p: EvalPrediction):
raise RuntimeError(
f"Unrecognized label type: {type(training_args.model_selection_label)}"
)
- else: # same default as in 0.6.0
+ elif dataset.output_modes[task] == relex:
task_scores.append(
metrics[task_name].get(
"one_score", np.mean(metrics[task_name].get("f1"))
)
)
+ else:
+ task_scores.append(
+ metrics[task_name].get(
+ "one_score", np.mean(metrics[task_name].get("token_f1"))
+ )
+ )
# task_scores.append(processor.get_one_score(metrics.get(task_name, metrics.get(task_name.split('-')[0], None))))
one_score = sum(task_scores) / len(task_scores)
+ metrics["one_score"] = one_score
if model is not None:
if not hasattr(model, "best_score") or one_score > model.best_score:
@@ -675,7 +707,7 @@ def compute_metrics_fn(p: EvalPrediction):
model.best_eval_results = metrics
if trainer.is_world_process_zero():
if training_args.do_train:
- trainer.save_model()
+ trainer.save_model() # NOTE: a RobertaConfig is loaded here. why?
tokenizer.save_pretrained(training_args.output_dir)
if model_name == "cnn" or model_name == "lstm":
with open(
@@ -690,7 +722,7 @@ def compute_metrics_fn(p: EvalPrediction):
)
config_dict["task_names"] = task_names
json.dump(config_dict, f)
- for task_ind, task_name in enumerate(metrics):
+ for task_ind, task_name in enumerate(task_names):
with open(output_eval_file, "a") as writer:
logger.info(
f"***** Eval results for task {task_name} *****"
@@ -720,7 +752,8 @@ def compute_metrics_fn(p: EvalPrediction):
return compute_metrics_fn
# Initialize our Trainer
- training_args.load_best_model_at_end = True
+ # training_args.load_best_model_at_end = True
+ # TODO the argument in CnlpTrainingArguments is `model_selection_score`. reconcile this with `metric_for_best_model`?
training_args.metric_for_best_model = "one_score"
trainer = Trainer(
model=model,
@@ -884,7 +917,7 @@ def compute_metrics_fn(p: EvalPrediction):
out_table = process_prediction(
task_names=dataset.tasks,
- error_analysis=False,
+ error_analysis=training_args.error_analysis,
output_prob=training_args.output_prob,
character_level=data_args.character_level,
task_to_label_packet=task_to_label_packet,
@@ -910,4 +943,4 @@ def _mp_fn(index):
if __name__ == "__main__":
- main()
+    main()