diff --git a/Makefile b/Makefile index d792cc9be..1f466d599 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ sh = uv run --no-sync --frozen .PHONY: install install: rm -rf uv.lock - uv sync --all-groups + uv sync --all-groups --extra catboost --extra peft --extra sentence-transformers --extra transformers .PHONY: test test: diff --git a/pyproject.toml b/pyproject.toml index 3847e8c61..9bc8b570f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,9 +31,9 @@ classifiers=[ ] requires-python = ">=3.10,<3.13" dependencies = [ - "sentence-transformers (>=3,<4)", + "torch (>=2.0.0,<3.0.0)", "scikit-learn (>=1.5,<2.0)", - "scikit-multilearn (==0.2.0)", + "iterative-stratification (>=0.1.9)", "appdirs (>=1.4,<2.0)", "optuna (>=4.0.0,<5.0.0)", "pathlib (>=1.0.1,<2.0.0)", @@ -43,15 +43,16 @@ dependencies = [ "datasets (>=3.2.0,<4.0.0)", "xxhash (>=3.5.0,<4.0.0)", "python-dotenv (>=1.0.1,<2.0.0)", - "transformers[torch] (>=4.49.0,<5.0.0)", - "peft (>= 0.10.0, !=0.15.0, !=0.15.1, <1.0.0)", - "catboost (>=1.2.8,<2.0.0)", "aiometer (>=1.0.0,<2.0.0)", "aiofiles (>=24.1.0,<25.0.0)", "threadpoolctl (>=3.0.0,<4.0.0)", ] [project.optional-dependencies] +catboost = ["catboost (>=1.2.8,<2.0.0)"] +peft = ["peft (>= 0.10.0, !=0.15.0, !=0.15.1, <1.0.0)"] +transformers = ["transformers (>=4.49.0,<5.0.0)"] +sentence-transformers = ["sentence-transformers (>=3,<4)"] dspy = [ "dspy (>=2.6.5,<3.0.0)", ] @@ -252,7 +253,7 @@ module = [ "xeger", "appdirs", "sre_yield", - "skmultilearn.model_selection", + "iterstrat.ml_stratifiers", "hydra", "hydra.*", "transformers", diff --git a/src/autointent/_dump_tools/unit_dumpers.py b/src/autointent/_dump_tools/unit_dumpers.py index 97e57bf7a..7635b4978 100644 --- a/src/autointent/_dump_tools/unit_dumpers.py +++ b/src/autointent/_dump_tools/unit_dumpers.py @@ -2,30 +2,27 @@ import json import logging from pathlib import Path -from typing import Any, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar import aiofiles import joblib import numpy as 
np import numpy.typing as npt -from catboost import CatBoostClassifier -from peft import PeftModel from pydantic import BaseModel from sklearn.base import BaseEstimator -from transformers import ( - AutoModelForSequenceClassification, - AutoTokenizer, - PreTrainedModel, - PreTrainedTokenizer, - PreTrainedTokenizerFast, -) from autointent import Embedder, Ranker, VectorIndex +from autointent._utils import require from autointent._wrappers import BaseTorchModule from autointent.schemas import TagsList from .base import BaseObjectDumper, ModuleSimpleAttributes +if TYPE_CHECKING: + from catboost import CatBoostClassifier + from peft import PeftModel + from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast + T = TypeVar("T") logger = logging.getLogger(__name__) @@ -204,11 +201,11 @@ def check_isinstance(cls, obj: Any) -> bool: # noqa: ANN401 return isinstance(obj, BaseModel) -class PeftModelDumper(BaseObjectDumper[PeftModel]): +class PeftModelDumper(BaseObjectDumper["PeftModel"]): dir_or_file_name = "peft_models" @staticmethod - def dump(obj: PeftModel, path: Path, exists_ok: bool) -> None: + def dump(obj: "PeftModel", path: Path, exists_ok: bool) -> None: path.mkdir(parents=True, exist_ok=exists_ok) if obj._is_prompt_learning: # noqa: SLF001 # strategy to save prompt learning models: save prompt encoder and bert classifier separately @@ -224,56 +221,72 @@ def dump(obj: PeftModel, path: Path, exists_ok: bool) -> None: merged_model.save_pretrained(lora_path) @staticmethod - def load(path: Path, **kwargs: Any) -> PeftModel: # noqa: ANN401, ARG004 + def load(path: Path, **kwargs: Any) -> "PeftModel": # noqa: ANN401, ARG004 + peft = require("peft", extra="peft") + transformers = require("transformers", extra="transformers") if (path / "ptuning").exists(): # prompt learning model ptuning_path = path / "ptuning" - model = AutoModelForSequenceClassification.from_pretrained(ptuning_path / "base_model") - return PeftModel.from_pretrained(model, 
ptuning_path / "peft") + model = transformers.AutoModelForSequenceClassification.from_pretrained(ptuning_path / "base_model") + return peft.PeftModel.from_pretrained(model, ptuning_path / "peft") # type: ignore[no-any-return] if (path / "lora").exists(): # merged lora model lora_path = path / "lora" - return AutoModelForSequenceClassification.from_pretrained(lora_path) # type: ignore[no-any-return] + return transformers.AutoModelForSequenceClassification.from_pretrained(lora_path) # type: ignore[no-any-return] msg = f"Invalid PeftModel directory structure at {path}. Expected 'ptuning' or 'lora' subdirectory." raise ValueError(msg) @classmethod def check_isinstance(cls, obj: Any) -> bool: # noqa: ANN401 - return isinstance(obj, PeftModel) + try: + peft = require("peft", extra="peft") + return isinstance(obj, peft.PeftModel) + except ImportError: + return False -class HFModelDumper(BaseObjectDumper[PreTrainedModel]): +class HFModelDumper(BaseObjectDumper["PreTrainedModel"]): dir_or_file_name = "hf_models" @staticmethod - def dump(obj: PreTrainedModel, path: Path, exists_ok: bool) -> None: + def dump(obj: "PreTrainedModel", path: Path, exists_ok: bool) -> None: path.mkdir(parents=True, exist_ok=exists_ok) obj.save_pretrained(path) @staticmethod - def load(path: Path, **kwargs: Any) -> PreTrainedModel: # noqa: ANN401, ARG004 - return AutoModelForSequenceClassification.from_pretrained(path) # type: ignore[no-any-return] + def load(path: Path, **kwargs: Any) -> "PreTrainedModel": # noqa: ANN401, ARG004 + transformers = require("transformers", extra="transformers") + return transformers.AutoModelForSequenceClassification.from_pretrained(path) # type: ignore[no-any-return] @classmethod def check_isinstance(cls, obj: Any) -> bool: # noqa: ANN401 - return isinstance(obj, PreTrainedModel) + try: + transformers = require("transformers", extra="transformers") + return isinstance(obj, transformers.PreTrainedModel) + except ImportError: + return False -class 
HFTokenizerDumper(BaseObjectDumper[PreTrainedTokenizer | PreTrainedTokenizerFast]): +class HFTokenizerDumper(BaseObjectDumper["PreTrainedTokenizer | PreTrainedTokenizerFast"]): dir_or_file_name = "hf_tokenizers" @staticmethod - def dump(obj: PreTrainedTokenizer | PreTrainedTokenizerFast, path: Path, exists_ok: bool) -> None: + def dump(obj: "PreTrainedTokenizer | PreTrainedTokenizerFast", path: Path, exists_ok: bool) -> None: path.mkdir(parents=True, exist_ok=exists_ok) obj.save_pretrained(path) @staticmethod - def load(path: Path, **kwargs: Any) -> PreTrainedTokenizer | PreTrainedTokenizerFast: # noqa: ANN401, ARG004 - return AutoTokenizer.from_pretrained(path) # type: ignore[no-any-return,no-untyped-call] + def load(path: Path, **kwargs: Any) -> "PreTrainedTokenizer | PreTrainedTokenizerFast": # noqa: ANN401, ARG004 + transformers = require("transformers", extra="transformers") + return transformers.AutoTokenizer.from_pretrained(path) # type: ignore[no-any-return] @classmethod def check_isinstance(cls, obj: Any) -> bool: # noqa: ANN401 - return isinstance(obj, PreTrainedTokenizer | PreTrainedTokenizerFast) + try: + transformers = require("transformers", extra="transformers") + return isinstance(obj, transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerFast) + except ImportError: + return False class TorchModelDumper(BaseObjectDumper[BaseTorchModule]): @@ -303,20 +316,25 @@ def check_isinstance(cls, obj: Any) -> bool: # noqa: ANN401 return isinstance(obj, BaseTorchModule) -class CatBoostDumper(BaseObjectDumper[CatBoostClassifier]): +class CatBoostDumper(BaseObjectDumper["CatBoostClassifier"]): dir_or_file_name = "catboost_models" @staticmethod - def dump(obj: CatBoostClassifier, path: Path, exists_ok: bool) -> None: # noqa: ARG004 + def dump(obj: "CatBoostClassifier", path: Path, exists_ok: bool) -> None: # noqa: ARG004 path.parent.mkdir(parents=True, exist_ok=True) obj.save_model(str(path), format="cbm") @staticmethod - def load(path: Path, 
**kwargs: Any) -> CatBoostClassifier: # noqa: ANN401, ARG004 - model = CatBoostClassifier() + def load(path: Path, **kwargs: Any) -> "CatBoostClassifier": # noqa: ANN401, ARG004 + catboost = require("catboost", extra="catboost") + model = catboost.CatBoostClassifier() model.load_model(str(path)) return model @classmethod def check_isinstance(cls, obj: Any) -> bool: # noqa: ANN401 - return isinstance(obj, CatBoostClassifier) + try: + catboost = require("catboost", extra="catboost") + return isinstance(obj, catboost.CatBoostClassifier) + except ImportError: + return False diff --git a/src/autointent/_utils.py b/src/autointent/_utils.py index 92c81431b..f4092756c 100644 --- a/src/autointent/_utils.py +++ b/src/autointent/_utils.py @@ -1,6 +1,7 @@ """Utils.""" -from typing import TypeVar +import importlib +from typing import Any, TypeVar import torch @@ -25,3 +26,24 @@ def detect_device() -> str: if torch.mps.is_available(): return "mps" return "cpu" + + +def require(dependency: str, extra: str | None = None) -> Any: # noqa: ANN401 + """Try to import dependency, raise informative ImportError if missing. + + Args: + dependency: The name of the module to import + extra: Optional extra package name for pip install instructions + + Returns: + The imported module + + Raises: + ImportError: If the dependency is not installed + """ + try: + return importlib.import_module(dependency) + except ImportError as e: + extra_info = f" Install with `pip install autointent[{extra}]`." 
if extra else "" + msg = f"Missing dependency '{dependency}' required for this feature.{extra_info}" + raise ImportError(msg) from e diff --git a/src/autointent/_wrappers/embedder/sentence_transformers.py b/src/autointent/_wrappers/embedder/sentence_transformers.py index 3f56a189a..a8cf0b371 100644 --- a/src/autointent/_wrappers/embedder/sentence_transformers.py +++ b/src/autointent/_wrappers/embedder/sentence_transformers.py @@ -2,7 +2,7 @@ import tempfile from functools import lru_cache from pathlib import Path -from typing import Literal, cast, overload +from typing import TYPE_CHECKING, Literal, cast, overload from uuid import uuid4 import huggingface_hub @@ -10,13 +10,10 @@ import numpy.typing as npt import torch from datasets import Dataset -from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments -from sentence_transformers.losses import BatchAllTripletLoss -from sentence_transformers.training_args import BatchSamplers from sklearn.model_selection import train_test_split -from transformers import EarlyStoppingCallback, TrainerCallback from autointent._hash import Hasher +from autointent._utils import require from autointent.configs import EmbedderFineTuningConfig, TaskTypeEnum from autointent.configs._embedder import SentenceTransformerEmbeddingConfig from autointent.custom_types import ListOfLabels @@ -24,6 +21,10 @@ from .base import BaseEmbeddingBackend from .utils import get_embeddings_path +if TYPE_CHECKING: + from sentence_transformers import SentenceTransformer + from transformers import TrainerCallback + logger = logging.getLogger(__name__) @@ -48,6 +49,7 @@ class SentenceTransformerEmbeddingBackend(BaseEmbeddingBackend): """SentenceTransformer-based embedding backend implementation.""" supports_training: bool = True + _model: "SentenceTransformer | None" def __init__(self, config: SentenceTransformerEmbeddingConfig) -> None: """Initialize the SentenceTransformer backend. 
@@ -56,7 +58,7 @@ def __init__(self, config: SentenceTransformerEmbeddingConfig) -> None: config: Configuration for SentenceTransformer embeddings. """ self.config = config - self._model: SentenceTransformer | None = None + self._model = None self._trained: bool = False def clear_ram(self) -> None: @@ -68,10 +70,12 @@ def clear_ram(self) -> None: self._model = None torch.cuda.empty_cache() - def _load_model(self) -> SentenceTransformer: + def _load_model(self) -> "SentenceTransformer": """Load sentence transformers model to device.""" if self._model is None: - res = SentenceTransformer( + # Lazy import sentence-transformers + st = require("sentence_transformers", extra="sentence-transformers") + res = st.SentenceTransformer( self.config.model_name, device=self.config.device, prompts=self.config.get_prompt_config(), @@ -228,13 +232,17 @@ def train(self, utterances: list[str], labels: ListOfLabels, config: EmbedderFin model = self._load_model() + # Lazy import sentence-transformers training components (only needed for fine-tuning) + st = require("sentence_transformers", extra="sentence-transformers") + transformers = require("transformers", extra="transformers") + x_train, x_val, y_train, y_val = train_test_split(utterances, labels, test_size=config.val_fraction) tr_ds = Dataset.from_dict({"text": x_train, "label": y_train}) val_ds = Dataset.from_dict({"text": x_val, "label": y_val}) - loss = BatchAllTripletLoss(model=model, margin=config.margin) + loss = st.losses.BatchAllTripletLoss(model=model, margin=config.margin) with tempfile.TemporaryDirectory() as tmp_dir: - args = SentenceTransformerTrainingArguments( + args = st.SentenceTransformerTrainingArguments( save_strategy="epoch", save_total_limit=1, output_dir=tmp_dir, @@ -245,19 +253,19 @@ def train(self, utterances: list[str], labels: ListOfLabels, config: EmbedderFin warmup_ratio=config.warmup_ratio, fp16=config.fp16, bf16=config.bf16, - batch_sampler=BatchSamplers.NO_DUPLICATES, + 
batch_sampler=st.training_args.BatchSamplers.NO_DUPLICATES, metric_for_best_model="eval_loss", load_best_model_at_end=True, eval_strategy="epoch", greater_is_better=False, ) callbacks: list[TrainerCallback] = [ - EarlyStoppingCallback( + transformers.EarlyStoppingCallback( early_stopping_patience=config.early_stopping_patience, early_stopping_threshold=config.early_stopping_threshold, ) ] - trainer = SentenceTransformerTrainer( + trainer = st.SentenceTransformerTrainer( model=model, args=args, train_dataset=tr_ds, diff --git a/src/autointent/_wrappers/ranker.py b/src/autointent/_wrappers/ranker.py index f99402710..9061a1408 100644 --- a/src/autointent/_wrappers/ranker.py +++ b/src/autointent/_wrappers/ranker.py @@ -10,19 +10,22 @@ import logging from pathlib import Path from random import shuffle -from typing import Any, Literal, TypedDict +from typing import TYPE_CHECKING, Any, Literal, TypedDict import joblib import numpy as np import numpy.typing as npt -import sentence_transformers as st import torch from sklearn.linear_model import LogisticRegressionCV from torch import nn +from autointent._utils import require from autointent.configs import CrossEncoderConfig from autointent.custom_types import ListOfLabels, RerankedItem +if TYPE_CHECKING: + import sentence_transformers as st + logger = logging.getLogger(__name__) @@ -95,7 +98,7 @@ class Ranker: _metadata_file_name = "metadata.json" _classifier_file_name = "classifier.joblib" config: CrossEncoderConfig - cross_encoder: st.CrossEncoder + cross_encoder: "st.CrossEncoder" def __init__( self, @@ -110,12 +113,15 @@ def __init__( classifier_head: Optional pre-trained classifier head output_range: Range of the output probabilities ([0, 1] for sigmoid, [-1, 1] for tanh) """ + # Lazy import sentence-transformers + st = require("sentence_transformers", extra="sentence-transformers") + self.config = CrossEncoderConfig.from_search_config(cross_encoder_config) self.cross_encoder = st.CrossEncoder( self.config.model_name, 
trust_remote_code=self.config.trust_remote_code, device=self.config.device, - max_length=self.config.tokenizer_config.max_length, # type: ignore[arg-type] + max_length=self.config.tokenizer_config.max_length, ) self._train_head = False self._clf = classifier_head diff --git a/src/autointent/context/data_handler/_stratification.py b/src/autointent/context/data_handler/_stratification.py index b7f7e5af0..c145112e0 100644 --- a/src/autointent/context/data_handler/_stratification.py +++ b/src/autointent/context/data_handler/_stratification.py @@ -10,10 +10,9 @@ import numpy as np from datasets import Dataset as HFDataset from datasets import concatenate_datasets +from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit from numpy import typing as npt from sklearn.model_selection import train_test_split -from skmultilearn.model_selection import IterativeStratification -from transformers import set_seed from autointent import Dataset from autointent.custom_types import LabelType @@ -155,12 +154,10 @@ def _split_multilabel(self, dataset: HFDataset, test_size: float) -> Sequence[np Returns: A sequence containing indices for train and test splits. 
""" - if self.random_seed is not None: - set_seed(self.random_seed) # workaround for buggy nature of IterativeStratification from skmultilearn - splitter = IterativeStratification( - n_splits=2, - order=2, - sample_distribution_per_fold=[test_size, 1.0 - test_size], + splitter = MultilabelStratifiedShuffleSplit( + n_splits=1, + test_size=test_size, + random_state=self.random_seed, ) return next(splitter.split(np.arange(len(dataset)), np.array(dataset[self.label_feature]))) diff --git a/src/autointent/generation/utterances/_adversarial/human_utterance_generator.py b/src/autointent/generation/utterances/_adversarial/human_utterance_generator.py index 6f4af6b78..5b924c3cd 100644 --- a/src/autointent/generation/utterances/_adversarial/human_utterance_generator.py +++ b/src/autointent/generation/utterances/_adversarial/human_utterance_generator.py @@ -114,7 +114,7 @@ def augment( generated_split = HFDataset.from_list(new_samples) dataset[split_name] = concatenate_datasets([original_split, generated_split]) - return [Sample(**sample) for sample in new_samples] + return [Sample.model_validate(sample) for sample in new_samples] async def augment_async( self, dataset: Dataset, split_name: str = Split.TRAIN, update_split: bool = True, n_final_per_class: int = 5 diff --git a/src/autointent/modules/scoring/_bert.py b/src/autointent/modules/scoring/_bert.py index 8f1cc176c..a58947429 100644 --- a/src/autointent/modules/scoring/_bert.py +++ b/src/autointent/modules/scoring/_bert.py @@ -2,33 +2,25 @@ import tempfile from collections.abc import Callable -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal import numpy as np import numpy.typing as npt import torch from datasets import Dataset, DatasetDict from sklearn.model_selection import train_test_split -from transformers import ( - AutoModelForSequenceClassification, - AutoTokenizer, - DataCollatorWithPadding, - EarlyStoppingCallback, - EvalPrediction, - PrinterCallback, - ProgressCallback, - 
Trainer, - TrainingArguments, -) -from transformers.trainer_callback import TrainerCallback from autointent import Context from autointent._callbacks import REPORTERS_NAMES +from autointent._utils import require from autointent.configs import EarlyStoppingConfig, HFModelConfig from autointent.custom_types import ListOfLabels from autointent.metrics import SCORING_METRICS_MULTICLASS, SCORING_METRICS_MULTILABEL from autointent.modules.base import BaseScorer +if TYPE_CHECKING: + from transformers import EvalPrediction, TrainerCallback + class BertScorer(BaseScorer): """Scoring module for transformer-based classification using BERT models. @@ -90,6 +82,17 @@ def __init__( early_stopping_config: EarlyStoppingConfig | dict[str, Any] | None = None, print_progress: bool = False, ) -> None: + # Lazy import transformers + transformers = require("transformers", extra="transformers") + self._AutoModelForSequenceClassification = transformers.AutoModelForSequenceClassification + self._AutoTokenizer = transformers.AutoTokenizer + self._DataCollatorWithPadding = transformers.DataCollatorWithPadding + self._EarlyStoppingCallback = transformers.EarlyStoppingCallback + self._PrinterCallback = transformers.PrinterCallback + self._ProgressCallback = transformers.ProgressCallback + self._Trainer = transformers.Trainer + self._TrainingArguments = transformers.TrainingArguments + self.classification_model_config = HFModelConfig.from_search_config(classification_model_config) self.num_train_epochs = num_train_epochs self.batch_size = batch_size @@ -132,7 +135,7 @@ def _initialize_model(self) -> Any: # noqa: ANN401 label2id = {i: i for i in range(self._n_classes)} id2label = {i: i for i in range(self._n_classes)} - return AutoModelForSequenceClassification.from_pretrained( + return self._AutoModelForSequenceClassification.from_pretrained( self.classification_model_config.model_name, trust_remote_code=self.classification_model_config.trust_remote_code, num_labels=self._n_classes, @@ -148,7 
+151,7 @@ def fit( ) -> None: self._validate_task(labels) - self._tokenizer = AutoTokenizer.from_pretrained(self.classification_model_config.model_name) # type: ignore[no-untyped-call] + self._tokenizer = self._AutoTokenizer.from_pretrained(self.classification_model_config.model_name) self._model = self._initialize_model() tokenized_dataset = self._get_tokenized_dataset(utterances, labels) self._train(tokenized_dataset) @@ -162,7 +165,7 @@ def _train(self, tokenized_dataset: DatasetDict) -> None: tokenized_dataset: output from :py:meth:`BertScorer._get_tokenized_dataset` """ with tempfile.TemporaryDirectory() as tmp_dir: - training_args = TrainingArguments( + training_args = self._TrainingArguments( output_dir=tmp_dir, num_train_epochs=self.num_train_epochs, per_device_train_batch_size=self.batch_size, @@ -181,27 +184,27 @@ def _train(self, tokenized_dataset: DatasetDict) -> None: load_best_model_at_end=self.early_stopping_config.metric is not None, ) - trainer = Trainer( + trainer = self._Trainer( model=self._model, args=training_args, train_dataset=tokenized_dataset["train"], eval_dataset=tokenized_dataset["validation"], processing_class=self._tokenizer, - data_collator=DataCollatorWithPadding(tokenizer=self._tokenizer), + data_collator=self._DataCollatorWithPadding(tokenizer=self._tokenizer), compute_metrics=self._get_compute_metrics(), callbacks=self._get_trainer_callbacks(), ) if not self.print_progress: - trainer.remove_callback(PrinterCallback) - trainer.remove_callback(ProgressCallback) + trainer.remove_callback(self._PrinterCallback) + trainer.remove_callback(self._ProgressCallback) trainer.train() - def _get_trainer_callbacks(self) -> list[TrainerCallback]: + def _get_trainer_callbacks(self) -> list["TrainerCallback"]: res: list[TrainerCallback] = [] if self.early_stopping_config.metric is not None: res.append( - EarlyStoppingCallback( + self._EarlyStoppingCallback( early_stopping_patience=self.early_stopping_config.patience, 
early_stopping_threshold=self.early_stopping_config.threshold, ) @@ -235,7 +238,7 @@ def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]: return dataset.map(tokenize_function, batched=True, batch_size=self.batch_size) - def _get_compute_metrics(self) -> Callable[[EvalPrediction], dict[str, float]] | None: + def _get_compute_metrics(self) -> Callable[["EvalPrediction"], dict[str, float]] | None: """Construct callable for computing metrics during transformer training. The result of this function is supposed to pass to :py:class:`transformers.Trainer`. @@ -246,7 +249,7 @@ def _get_compute_metrics(self) -> Callable[[EvalPrediction], dict[str, float]] | metric_name = self.early_stopping_config.metric metric_fn = (SCORING_METRICS_MULTILABEL | SCORING_METRICS_MULTICLASS)[metric_name] - def compute_metrics(output: EvalPrediction) -> dict[str, float]: + def compute_metrics(output: "EvalPrediction") -> dict[str, float]: return { metric_name: metric_fn(output.label_ids.tolist(), output.predictions.tolist()) # type: ignore[union-attr] } diff --git a/src/autointent/modules/scoring/_catboost/catboost_scorer.py b/src/autointent/modules/scoring/_catboost/catboost_scorer.py index d1664b2e5..c8c2b03d1 100644 --- a/src/autointent/modules/scoring/_catboost/catboost_scorer.py +++ b/src/autointent/modules/scoring/_catboost/catboost_scorer.py @@ -2,19 +2,22 @@ import logging from enum import Enum -from typing import Any, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import numpy.typing as npt import pandas as pd -from catboost import CatBoostClassifier from pydantic import PositiveInt from autointent import Context, Embedder +from autointent._utils import require from autointent.configs import EmbedderConfig, TaskTypeEnum, initialize_embedder_config from autointent.custom_types import FloatFromZeroToOne, ListOfLabels from autointent.modules.base import BaseScorer +if TYPE_CHECKING: + from catboost import CatBoostClassifier + logger = 
logging.getLogger(__name__) @@ -86,7 +89,7 @@ class CatBoostScorer(BaseScorer): supports_multiclass = True supports_multilabel = True - _model: CatBoostClassifier + _model: "CatBoostClassifier" encoder_features_types = (FeaturesType.EMBEDDING, FeaturesType.BOTH) @@ -103,6 +106,10 @@ def __init__( depth: int = 6, **catboost_kwargs: dict[str, Any], ) -> None: + # Lazy import catboost + catboost = require("catboost", extra="catboost") + self._CatBoostClassifier = catboost.CatBoostClassifier + self.val_fraction = val_fraction self.early_stopping_rounds = early_stopping_rounds self.iterations = iterations @@ -207,7 +214,7 @@ def fit( msg = "Disabling early stopping in CatBoostClassifier as it is not supported with multi-label task." logger.warning(msg) - self._model = CatBoostClassifier( + self._model = self._CatBoostClassifier( iterations=self.iterations, depth=self.depth, loss_function=self.loss_function or default_loss, diff --git a/src/autointent/modules/scoring/_gcn/gcn_model.py b/src/autointent/modules/scoring/_gcn/gcn_model.py index 5b2d51aa1..cc1e8bf41 100644 --- a/src/autointent/modules/scoring/_gcn/gcn_model.py +++ b/src/autointent/modules/scoring/_gcn/gcn_model.py @@ -1,6 +1,5 @@ import json from pathlib import Path -from typing import cast import torch from pydantic import BaseModel @@ -90,7 +89,7 @@ def create_correlation_matrix(train_labels: torch.Tensor, num_classes: int, p: f reweighted_adj = adj_matrix_no_self_loop * weights_p.unsqueeze(1) reweighted_adj.fill_diagonal_(1 - p) - return cast(torch.Tensor, reweighted_adj) + return reweighted_adj def set_correlation_matrix(self, train_labels: torch.Tensor) -> None: corr_matrix = self.create_correlation_matrix( diff --git a/src/autointent/modules/scoring/_lora/lora.py b/src/autointent/modules/scoring/_lora/lora.py index fc000f807..62803a84c 100644 --- a/src/autointent/modules/scoring/_lora/lora.py +++ b/src/autointent/modules/scoring/_lora/lora.py @@ -1,16 +1,18 @@ """BertScorer class for transformer-based 
classification with LoRA.""" from pathlib import Path -from typing import Any, Literal - -from peft import LoraConfig, get_peft_model +from typing import TYPE_CHECKING, Any, Literal from autointent import Context from autointent._callbacks import REPORTERS_NAMES from autointent._dump_tools import Dumper +from autointent._utils import require from autointent.configs import EarlyStoppingConfig, HFModelConfig from autointent.modules.scoring._bert import BertScorer +if TYPE_CHECKING: + from peft import LoraConfig + class BERTLoRAScorer(BertScorer): """BERTLoRAScorer class for transformer-based classification with LoRA (Low-Rank Adaptation). @@ -56,6 +58,8 @@ class BERTLoRAScorer(BertScorer): name = "lora" + _lora_config: "LoraConfig" + def __init__( self, classification_model_config: HFModelConfig | str | dict[str, Any] | None = None, @@ -67,6 +71,11 @@ def __init__( print_progress: bool = False, **lora_kwargs: Any, # noqa: ANN401 ) -> None: + # Lazy import peft + peft = require("peft", extra="peft") + self._LoraConfig = peft.LoraConfig + self._get_peft_model = peft.get_peft_model + # early stopping doesnt work with lora for now https://github.com/huggingface/transformers/issues/38130 early_stopping_config = EarlyStoppingConfig(metric=None) # disable early stopping @@ -80,7 +89,7 @@ def __init__( early_stopping_config=early_stopping_config, print_progress=print_progress, ) - self._lora_config = LoraConfig(**lora_kwargs) + self._lora_config = self._LoraConfig(**lora_kwargs) @classmethod def from_context( @@ -107,7 +116,7 @@ def from_context( def _initialize_model(self) -> Any: # noqa: ANN401 model = super()._initialize_model() - return get_peft_model(model, self._lora_config) + return self._get_peft_model(model, self._lora_config) def dump(self, path: str) -> None: - Dumper.dump(self, Path(path), exclude=[LoraConfig]) + Dumper.dump(self, Path(path), exclude=[self._LoraConfig]) diff --git a/src/autointent/modules/scoring/_ptuning/ptuning.py 
b/src/autointent/modules/scoring/_ptuning/ptuning.py index 8f1561a72..ebab9eac4 100644 --- a/src/autointent/modules/scoring/_ptuning/ptuning.py +++ b/src/autointent/modules/scoring/_ptuning/ptuning.py @@ -1,17 +1,20 @@ """PTuningScorer class for ptuning-based classification.""" from pathlib import Path -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal -from peft import PromptEncoderConfig, PromptEncoderReparameterizationType, TaskType, get_peft_model from pydantic import PositiveInt from autointent import Context from autointent._callbacks import REPORTERS_NAMES from autointent._dump_tools import Dumper +from autointent._utils import require from autointent.configs import EarlyStoppingConfig, HFModelConfig from autointent.modules.scoring._bert import BertScorer +if TYPE_CHECKING: + from peft import PromptEncoderConfig + class PTuningScorer(BertScorer): """PEFT P-tuning scorer. @@ -47,6 +50,8 @@ class PTuningScorer(BertScorer): name = "ptuning" + _ptuning_config: "PromptEncoderConfig" + def __init__( # noqa: PLR0913 self, classification_model_config: HFModelConfig | str | dict[str, Any] | None = None, @@ -64,6 +69,13 @@ def __init__( # noqa: PLR0913 print_progress: bool = False, **ptuning_kwargs: Any, # noqa: ANN401 ) -> None: + # Lazy import peft + peft = require("peft", extra="peft") + self._PromptEncoderConfig = peft.PromptEncoderConfig + self._PromptEncoderReparameterizationType = peft.PromptEncoderReparameterizationType + self._TaskType = peft.TaskType + self._get_peft_model = peft.get_peft_model + super().__init__( classification_model_config=classification_model_config, num_train_epochs=num_train_epochs, @@ -74,9 +86,9 @@ def __init__( # noqa: PLR0913 early_stopping_config=early_stopping_config, print_progress=print_progress, ) - self._ptuning_config = PromptEncoderConfig( - task_type=TaskType.SEQ_CLS, - encoder_reparameterization_type=PromptEncoderReparameterizationType(encoder_reparameterization_type), + self._ptuning_config = 
self._PromptEncoderConfig( + task_type=self._TaskType.SEQ_CLS, + encoder_reparameterization_type=self._PromptEncoderReparameterizationType(encoder_reparameterization_type), num_virtual_tokens=num_virtual_tokens, encoder_dropout=encoder_dropout, encoder_hidden_size=encoder_hidden_size, @@ -139,7 +151,7 @@ def from_context( # noqa: PLR0913 def _initialize_model(self) -> Any: # noqa: ANN401 """Initialize the model with P-tuning configuration.""" model = super()._initialize_model() - return get_peft_model(model, self._ptuning_config) + return self._get_peft_model(model, self._ptuning_config) def dump(self, path: str) -> None: - Dumper.dump(self, Path(path), exclude=[PromptEncoderConfig]) + Dumper.dump(self, Path(path), exclude=[self._PromptEncoderConfig]) diff --git a/tests/_transformers/test_nli_transformer.py b/tests/_transformers/test_nli_transformer.py index a1c266c14..1827ecab2 100644 --- a/tests/_transformers/test_nli_transformer.py +++ b/tests/_transformers/test_nli_transformer.py @@ -5,6 +5,8 @@ from autointent import Dataset, Ranker from autointent.context.data_handler import DataHandler +pytest.importorskip("sentence_transformers") + @pytest.fixture def data_handler(): diff --git a/tests/data/test_data_handler.py b/tests/data/test_data_handler.py index 7b6570013..fc4d63999 100644 --- a/tests/data/test_data_handler.py +++ b/tests/data/test_data_handler.py @@ -89,13 +89,13 @@ def test_data_handler_multilabel_mode(sample_multilabel_data): assert handler.multilabel is True assert handler.dataset.n_classes == 2 assert handler.train_utterances(0) == [ - "hey, how's it going?", + "farewell and see you later", + "good morning", "so long and take care", - "hello, nice to meet you", - "later, see you soon", + "greetings and salutations", ] assert handler.test_utterances() == ["greetings", "farewell"] - assert handler.train_labels(0) == [[1, 0], [0, 1], [0, 1], [1, 0]] + assert handler.train_labels(0) == [[0, 1], [1, 0], [0, 1], [1, 0]] assert handler.test_labels() == 
[[0, 1], [1, 0]] diff --git a/tests/data/test_stratificaiton.py b/tests/data/test_stratificaiton.py index e964c965c..87db82e8a 100644 --- a/tests/data/test_stratificaiton.py +++ b/tests/data/test_stratificaiton.py @@ -38,8 +38,8 @@ def test_multilabel_train_test_split(dataset_unsplitted): assert Split.TRAIN in dataset assert Split.TEST in dataset - assert dataset[Split.TRAIN].num_rows == 18 - assert dataset[Split.TEST].num_rows == 18 + assert dataset[Split.TRAIN].num_rows == 19 + assert dataset[Split.TEST].num_rows == 17 assert dataset.get_n_classes(Split.TRAIN) == dataset.get_n_classes(Split.TEST) diff --git a/tests/embedder/conftest.py b/tests/embedder/conftest.py index bebfbd57e..e8ea3e17a 100644 --- a/tests/embedder/conftest.py +++ b/tests/embedder/conftest.py @@ -8,6 +8,8 @@ # Check if OpenAI API key is available for testing openai_available = os.getenv("OPENAI_API_KEY") is not None +pytest.importorskip("sentence_transformers") + @pytest.fixture def on_windows() -> bool: diff --git a/tests/embedder/test_basic.py b/tests/embedder/test_basic.py index fe670e824..32849fe6b 100644 --- a/tests/embedder/test_basic.py +++ b/tests/embedder/test_basic.py @@ -65,20 +65,3 @@ def test_similarity_symmetry(self, embedder: Embedder): sim2 = embedder.similarity(embeddings[1:], embeddings[:1]) np.testing.assert_allclose(sim1, sim2.T, rtol=1e-5) - - def test_return_tensors_functionality(self, embedder: Embedder): - """Test return_tensors parameter.""" - utterances = ["Hello world", "Test sentence"] - - # Test numpy return (default) - embeddings_np = embedder.embed(utterances, return_tensors=False) - assert isinstance(embeddings_np, np.ndarray) - - # Test tensor return - embeddings_tensor = embedder.embed(utterances, return_tensors=True) - import torch - - assert isinstance(embeddings_tensor, torch.Tensor) - - # Values should be the same - np.testing.assert_allclose(embeddings_np, embeddings_tensor.cpu().numpy(), rtol=1e-5) diff --git a/tests/modules/scoring/test_bert.py 
b/tests/modules/scoring/test_bert.py index 86fbc685b..a3106d5ee 100644 --- a/tests/modules/scoring/test_bert.py +++ b/tests/modules/scoring/test_bert.py @@ -8,6 +8,8 @@ from autointent.context.data_handler import DataHandler from autointent.modules import BertScorer +pytest.importorskip("transformers") + def test_bert_scorer_dump_load(dataset): """Test that BertScorer can be saved and loaded while preserving predictions.""" diff --git a/tests/modules/scoring/test_catboost.py b/tests/modules/scoring/test_catboost.py index 177936980..ca8fecbc2 100644 --- a/tests/modules/scoring/test_catboost.py +++ b/tests/modules/scoring/test_catboost.py @@ -8,6 +8,8 @@ from autointent.context.data_handler import DataHandler from autointent.modules import CatBoostScorer +pytest.importorskip("catboost") + def test_catboost_scorer_dump_load(dataset): """Test that CatBoostScorer can be saved and loaded while preserving predictions.""" diff --git a/tests/modules/scoring/test_description_bi.py b/tests/modules/scoring/test_description_bi.py index 1b2b3cdce..f54efdcc7 100644 --- a/tests/modules/scoring/test_description_bi.py +++ b/tests/modules/scoring/test_description_bi.py @@ -6,6 +6,8 @@ from autointent.context.data_handler import DataHandler from autointent.modules import BiEncoderDescriptionScorer +pytest.importorskip("sentence_transformers") + @pytest.mark.parametrize( ("expected_prediction", "multilabel"), diff --git a/tests/modules/scoring/test_description_cross.py b/tests/modules/scoring/test_description_cross.py index 886662eae..278315130 100644 --- a/tests/modules/scoring/test_description_cross.py +++ b/tests/modules/scoring/test_description_cross.py @@ -6,6 +6,8 @@ from autointent.context.data_handler import DataHandler from autointent.modules import CrossEncoderDescriptionScorer +pytest.importorskip("sentence_transformers") + @pytest.mark.parametrize( ("expected_prediction", "multilabel"), diff --git a/tests/modules/scoring/test_dnnc.py b/tests/modules/scoring/test_dnnc.py
index 93c734e92..229543edc 100644 --- a/tests/modules/scoring/test_dnnc.py +++ b/tests/modules/scoring/test_dnnc.py @@ -6,6 +6,8 @@ from autointent.context.data_handler import DataHandler from autointent.modules import DNNCScorer +pytest.importorskip("sentence_transformers") + @pytest.mark.parametrize(("train_head", "pred_score"), [(True, 1)]) def test_base_dnnc(dataset, train_head, pred_score): diff --git a/tests/modules/scoring/test_lora.py b/tests/modules/scoring/test_lora.py index f9d2725fd..cf50b8ba9 100644 --- a/tests/modules/scoring/test_lora.py +++ b/tests/modules/scoring/test_lora.py @@ -8,6 +8,8 @@ from autointent.context.data_handler import DataHandler from autointent.modules import BERTLoRAScorer +pytest.importorskip("peft") + def test_lora_scorer_dump_load(dataset): """Test that BERTLoRAScorer can be saved and loaded while preserving predictions.""" diff --git a/tests/modules/scoring/test_ptuning.py b/tests/modules/scoring/test_ptuning.py index d74fe39bf..4b80fe460 100644 --- a/tests/modules/scoring/test_ptuning.py +++ b/tests/modules/scoring/test_ptuning.py @@ -8,6 +8,8 @@ from autointent.context.data_handler import DataHandler from autointent.modules import PTuningScorer +pytest.importorskip("peft") + def test_ptuning_scorer_dump_load(dataset): """Test that PTuningScorer can be saved and loaded while preserving predictions.""" diff --git a/tests/modules/scoring/test_rerank_scorer.py b/tests/modules/scoring/test_rerank_scorer.py index 024d54e66..224e10a02 100644 --- a/tests/modules/scoring/test_rerank_scorer.py +++ b/tests/modules/scoring/test_rerank_scorer.py @@ -1,10 +1,13 @@ import tempfile import numpy as np +import pytest from autointent.context.data_handler import DataHandler from autointent.modules import RerankScorer +pytest.importorskip("sentence_transformers") + def test_base_rerank_scorer(dataset): data_handler = DataHandler(dataset) diff --git a/tests/modules/test_dumper.py b/tests/modules/test_dumper.py index 56effde1d..d987e4ab3 100644
--- a/tests/modules/test_dumper.py +++ b/tests/modules/test_dumper.py @@ -5,7 +5,6 @@ import pytest import torch from sklearn.linear_model import LogisticRegression -from transformers import AutoModelForSequenceClassification, AutoTokenizer from autointent import Embedder, Ranker, VectorIndex from autointent._dump_tools import Dumper @@ -39,6 +38,8 @@ def check_attributes(self): class TestTransformers: def init_attributes(self): + from transformers import AutoModelForSequenceClassification, AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") self._tokenizer_predictions = np.array(self.tokenizer(["hello", "world"]).input_ids) self.transformer = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") @@ -143,16 +144,55 @@ def check_attributes(self): assert not self.pydantic_model.tokenizer_config.truncation +def _st_is_installed() -> bool: + try: + import sentence_transformers # noqa: F401 + except ImportError: + return False + else: + return True + + +def _transformers_is_installed() -> bool: + try: + import transformers # noqa: F401 + except ImportError: + return False + else: + return True + + @pytest.mark.parametrize( "test_class", [ TestSimpleAttributes, TestTags, - TestTransformers, + pytest.param( + TestTransformers, + marks=pytest.mark.skipif( + not _transformers_is_installed(), + reason="need transformers dependency", + ), + id="transformer", + ), TestVectorIndex, - TestEmbedder, + pytest.param( + TestEmbedder, + marks=pytest.mark.skipif( + not _st_is_installed(), + reason="need sentence-transformers dependency", + ), + id="embedder", + ), TestSklearnEstimator, - TestRanker, + pytest.param( + TestRanker, + marks=pytest.mark.skipif( + not _st_is_installed(), + reason="need sentence-transformers dependency", + ), + id="ranker", + ), TestCrossEncoderConfig, ], )