Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion src/safetunebed/whitebox/attacks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,28 @@
"""Tampering whitebox attacks supported by safetunebed."""
"""Tampering whitebox attacks supported by safetunebed.

Import modules for side effects so they register via the attacks registry.
"""

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: some of these imports don't exist yet because I've retroactively split it up.

# Side-effect imports to populate ATTACKS_REGISTRY
from safetunebed.whitebox.attacks.embedding_attack import (
embedding_attack as _,
)

# Re-export embedding attack classes for convenience
from safetunebed.whitebox.attacks.embedding_attack.embedding_attack import (
EmbeddingAttack,
EmbeddingAttackConfig,
)
from safetunebed.whitebox.attacks.full_parameter_finetune import (
full_parameter_finetune as _,
)
from safetunebed.whitebox.attacks.jailbreak_finetune import (
jailbreak_finetune as _,
)
from safetunebed.whitebox.attacks.lora_finetune import lora_finetune as _
from safetunebed.whitebox.attacks.multilingual_finetune import (
multilingual_finetune as _, # noqa: F401
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: why does only this import have noqa: F401?

)

__all__ = [
"EmbeddingAttack",
Expand Down
132 changes: 122 additions & 10 deletions src/safetunebed/whitebox/attacks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,44 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass, fields
from pathlib import Path
from typing import Any, Generic, TypeVar
from typing import Any, Generic, TypeVar, final

import polars as pl
from pandera.typing.polars import DataFrame
from typing_extensions import Self

from safetunebed.whitebox.evals import (
MMLUProTestEvaluation,
MMLUProValEvaluation,
StrongRejectEvaluation,
StrongRejectEvaluationConfig,
StrongRejectSmallEvaluation,
)
from safetunebed.whitebox.evals.base import WhiteBoxEvaluationConfig
from safetunebed.whitebox.evals.output_schema import EvaluationSchema
from safetunebed.whitebox.utils.names import EvalName, EvalPath
from safetunebed.whitebox.utils.models.config import ModelConfig
from safetunebed.whitebox.utils.names import AttackName, EvalName, EvalPath

H = TypeVar(name="H", bound="TamperAttackConfig")


@dataclass
class TamperAttackConfig(ABC):
class TamperAttackConfig:
"""Configuration for attacks including hyperparameters.

Attributes:
input_checkpoint_path: The input checkpoint path of aligned LLM.
out_dir: The path to the directory where the checkpoint of the attacked LLM and other artifacts will be saved.
max_generation_length: Maximum number of generated (output) tokens allowed for an LLM inference.
evals: The evaluators that will evaluate the success of the attack.
model_config: The model-specific configuration parameters such as prompt templates, generation length, etc.
evals: The names of the evaluators that will evaluate the success of the attack.
"""

input_checkpoint_path: str
out_dir: str
max_generation_length: int
evals: list[EvalName]
model_config: ModelConfig

@classmethod
@abstractmethod
def from_dict(cls, data: dict[str, Any]) -> Self: # pyright: ignore[reportExplicitAny]
"""All subclasses must implement a `from_dict` constructor.

Expand All @@ -41,6 +50,37 @@ def from_dict(cls, data: dict[str, Any]) -> Self: # pyright: ignore[reportExpli
Returns:
TamperAttackConfig: Config object.
"""
cls._validate_dict_keys(data)
data = cls.serialize_data(data)

return cls(**data)

@classmethod
def serialize_data(cls, data: dict[str, Any]) -> dict[str, Any]:
    """Serialize data from a dictionary such that it can be used to construct nested objects.

    The caller's dictionary is NOT mutated: a shallow copy is made before the
    raw ``model_config`` mapping is replaced by a constructed ``ModelConfig``.

    Args:
        data: Dictionary containing the configuration fields.

    Returns:
        dict[str, Any]: Dictionary containing the configuration fields, with
            ``model_config`` converted to a ``ModelConfig`` instance.

    Raises:
        KeyError: If ``model_config`` is missing from ``data``.
    """
    data = dict(data)  # shallow copy so the input dict is left untouched
    data["model_config"] = ModelConfig.from_dict(data["model_config"])  # pyright: ignore[reportAny]

    return data

@classmethod
def _validate_dict_keys(cls, data: dict[str, Any]) -> None: # pyright: ignore[reportExplicitAny]
"""Check that the dictionary to be used to construct dataclass has correct fields.

Args:
data: Dictionary containing the configuration fields.

Raises:
ValueError: If the dictionary has insufficient keys or redundant ones.
"""
config_keys: set[str] = {_field.name for _field in fields(cls) if _field.init}

for config_key in config_keys:
Expand All @@ -53,12 +93,12 @@ def from_dict(cls, data: dict[str, Any]) -> Self: # pyright: ignore[reportExpli
if dict_key not in config_keys:
raise ValueError(f"`{dict_key}` is not a field of {cls.__name__}")

return cls(**data) # pyright: ignore[reportAny]


class TamperAttack(ABC, Generic[H]):
"""Base class for a tampering attack."""

name: AttackName

def __init__(self, attack_config: H) -> None:
"""Construct base TamperAttack.

Expand All @@ -77,6 +117,7 @@ def __init_subclass__(cls) -> None:
if not hasattr(cls, "name"):
raise TypeError(f"{cls.__name__} must define a class attribute 'name'")

@final
def benchmark(self) -> DataFrame[EvaluationSchema]:
"""Execute attack and evaluate the attacked model.

Expand All @@ -95,7 +136,15 @@ def benchmark(self) -> DataFrame[EvaluationSchema]:
def run_attack(self) -> None:
"""Execute attack and ensure attacked LLM exists at specified paths outlined in config."""

@final
def delete_output_checkpoint(self) -> None:
    """Delete the tampered model checkpoint directory if it exists.

    The directory is re-created empty afterwards so that downstream steps
    always find an existing (fresh) path to write into.
    """
    # NOTE(review): local import kept for now; shutil is cheap and could be
    # hoisted to the module top.
    import shutil

    checkpoint_dir = Path(self.output_checkpoint_path)
    if checkpoint_dir.exists():
        shutil.rmtree(checkpoint_dir)
        checkpoint_dir.mkdir(parents=True, exist_ok=False)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why re-create the directory after deleting?


def evaluate(self) -> DataFrame[EvaluationSchema]:
    """Evaluate attack on relevant evaluations.

    Runs every evaluation named in ``self.attack_config.evals`` against the
    attacked checkpoint and concatenates the per-eval results.

    Returns:
        DataFrame[EvaluationSchema]: A polars DataFrame with the aggregated evaluation metrics.
    """
    results = EvaluationSchema.empty()

    # Table-driven dispatch keeps evaluation order fixed and avoids a growing
    # if-chain; new evals are added here (or migrated to a registry later).
    dispatch = (
        (EvalName.STRONG_REJECT, self.evaluate_strong_reject),
        (EvalName.STRONG_REJECT_SMALL, self.evaluate_strong_reject_small),
        (EvalName.MMLU_PRO_VAL, self.evaluate_mmlu_pro_val),
        (EvalName.MMLU_PRO_TEST, self.evaluate_mmlu_pro_test),
    )
    for eval_name, run_eval in dispatch:
        if eval_name in self.attack_config.evals:
            results = pl.concat([results, run_eval()])

    return EvaluationSchema.validate(results)

def evaluate_strong_reject(self) -> DataFrame[EvaluationSchema]:
    """Evaluate attack on the `StrongRejectEvaluation` evaluator."""
    config = StrongRejectEvaluationConfig(
        model_checkpoint=self.output_checkpoint_path,
        out_dir=self.attack_config.out_dir,
        model_config=self.attack_config.model_config,
    )
    # Construct the evaluator and run it in one step; the result is the
    # evaluator's aggregated metrics frame.
    return StrongRejectEvaluation(config).run_evaluation()

def evaluate_strong_reject_small(self) -> DataFrame[EvaluationSchema]:
    """Evaluate attack on the smaller `StrongRejectEvaluation` evaluator."""
    # Same config shape as the full StrongReject eval; only the evaluator
    # class (and hence the dataset it loads) differs.
    config = StrongRejectEvaluationConfig(
        model_checkpoint=self.output_checkpoint_path,
        out_dir=self.attack_config.out_dir,
        model_config=self.attack_config.model_config,
    )
    return StrongRejectSmallEvaluation(config).run_evaluation()

def evaluate_mmlu_pro_val(self) -> DataFrame[EvaluationSchema]:
    """Evaluate attack on the `MMLUPro` evaluator validation set."""
    config = WhiteBoxEvaluationConfig(
        model_checkpoint=self.output_checkpoint_path,
        out_dir=self.attack_config.out_dir,
        model_config=self.attack_config.model_config,
    )
    return MMLUProValEvaluation(config).run_evaluation()

def evaluate_mmlu_pro_test(self) -> DataFrame[EvaluationSchema]:
    """Evaluate attack on the `MMLUPro` evaluator full test set."""
    config = WhiteBoxEvaluationConfig(
        model_checkpoint=self.output_checkpoint_path,
        out_dir=self.attack_config.out_dir,
        model_config=self.attack_config.model_config,
    )
    return MMLUProTestEvaluation(config).run_evaluation()
34 changes: 34 additions & 0 deletions src/safetunebed/whitebox/attacks/registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Attack registry with decorator-based registration.

Allows individual attack modules to self-register their mapping from
`AttackName -> (ConfigDataclass, AttackClass)` at import time, avoiding a
central hard-coded map.
"""

from __future__ import annotations

from collections.abc import Callable
from typing import TypeVar

from safetunebed.whitebox.attacks.base import TamperAttack, TamperAttackConfig
from safetunebed.whitebox.utils.names import AttackName

H = TypeVar("H", bound=TamperAttackConfig)
T = TypeVar("T", bound=TamperAttack) # pyright: ignore[reportMissingTypeArgument]


ATTACKS_REGISTRY: dict[  # pyright: ignore[reportUnknownVariableType]
    AttackName, tuple[type[TamperAttackConfig], type[TamperAttack]]  # pyright: ignore[reportMissingTypeArgument]
] = {}


def register_attack(
    name: AttackName, config_cls: type[H]
) -> Callable[[type[T]], type[T]]:
    """Decorator to register an attack class and its config class under a name.

    Args:
        name: Registry key under which the attack is looked up.
        config_cls: The configuration dataclass type used by the attack.

    Returns:
        A class decorator that records ``(config_cls, attack_cls)`` in
        ``ATTACKS_REGISTRY`` and returns the attack class unchanged.
    """

    def _decorator(attack_cls: type[T]) -> type[T]:
        ATTACKS_REGISTRY[name] = (config_cls, attack_cls)
        return attack_cls

    return _decorator
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doesn't look like ATTACKS_REGISTRY or register_attack is actually used, is that forthcoming in a subsequent PR?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah it'd be used in forthcoming PRs where we do sweeps.

8 changes: 8 additions & 0 deletions src/safetunebed/whitebox/evals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,22 @@
EmbeddingAttackEvaluation,
EmbeddingAttackEvaluationConfig,
)
from safetunebed.whitebox.evals.mmlu_pro.mmlu_pro import (
MMLUProTestEvaluation,
MMLUProValEvaluation,
)
from safetunebed.whitebox.evals.strong_reject.strong_reject import (
StrongRejectEvaluation,
StrongRejectEvaluationConfig,
StrongRejectSmallEvaluation,
)

__all__ = [
"EmbeddingAttackEvaluation",
"EmbeddingAttackEvaluationConfig",
"MMLUProTestEvaluation",
"MMLUProValEvaluation",
"StrongRejectEvaluation",
"StrongRejectEvaluationConfig",
"StrongRejectSmallEvaluation",
]
56 changes: 29 additions & 27 deletions src/safetunebed/whitebox/evals/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
MetricName,
OptimizationDirection,
)
from safetunebed.whitebox.utils.models.config import ModelConfig

C = TypeVar(name="C", bound="WhiteBoxEvaluationConfig")

@dataclass
class WhiteBoxEvaluationConfig:
    """Configuration for all white-box evaluations.

    Attributes:
        model_checkpoint: Path to the checkpoint of the pretrained HuggingFace model to evaluate.
        out_dir: The path to directory where evaluation results and artifacts will be stored (may also have model checkpoints).
        model_config: The model-specific configuration parameters such as prompt templates, generation length, etc.
    """

    model_checkpoint: str
    out_dir: str
    model_config: ModelConfig


class WhiteBoxEvaluation(ABC, Generic[C]):
Expand Down Expand Up @@ -209,28 +208,31 @@ def load_result_objective(cls, results: DataFrame[EvaluationSchema]) -> float:
)
return result_metrics[cls.objective][0]

def load_model_and_tokenizer(eval_config: C) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
    """Load model and tokenizer from path in HuggingFace.

    Args:
        eval_config: Evaluation config providing ``model_checkpoint``.

    Returns:
        tuple[PreTrainedModel, PreTrainedTokenizer]:
            - A pretrained model loaded in from HF transformers `AutoModelForCausalLM.from_pretrained`
            - The associated tokenizer loaded in from HF transformers `AutoTokenizer.from_pretrained`
    """
    # NOTE(review): trust_remote_code=True executes code shipped inside the
    # checkpoint repo — only acceptable for trusted model sources; confirm
    # this is intended before loading third-party checkpoints.
    model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=eval_config.model_checkpoint,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    ).eval()

    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=eval_config.model_checkpoint,
        padding_side="left",
        use_fast=False,
        trust_remote_code=True,
    )

    # Some tokenizers ship without a pad token; fall back to EOS, which is
    # sometimes mandatory for left-padded batched generation.
    tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

    return model, tokenizer
28 changes: 28 additions & 0 deletions src/safetunebed/whitebox/evals/registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Evaluation registry with decorator-based registration.

Allows individual evaluation modules to self-register their mapping from
`EvalName -> EvaluationClass` at import time, avoiding a central hard-coded map.
"""

from __future__ import annotations

from collections.abc import Callable
from typing import TypeVar

from safetunebed.whitebox.evals.base import WhiteBoxEvaluation
from safetunebed.whitebox.utils.names import EvalName

E = TypeVar("E", bound=WhiteBoxEvaluation) # pyright: ignore[reportMissingTypeArgument]


EVALS_REGISTRY: dict[EvalName, type[WhiteBoxEvaluation]] = {}  # pyright: ignore[reportMissingTypeArgument, reportUnknownVariableType]


def register_evaluation(name: EvalName) -> Callable[[type[E]], type[E]]:
    """Return a class decorator that records an evaluation class in ``EVALS_REGISTRY``."""  # noqa: D401

    def _record(eval_cls: type[E]) -> type[E]:
        # Map the name to the class so it can be looked up at runtime,
        # then hand the class back unmodified.
        EVALS_REGISTRY[name] = eval_cls
        return eval_cls

    return _record
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similarly to attacks, I don't see this registry or @register_evaluation used anywhere, is it going to be a future PR?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I believe @register_evaluation will be added over in #35 and more specificially in: here

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@register_evaluation(EvalName.STRONG_REJECT_SMALL)
class StrongRejectSmallEvaluation(StrongRejectEvaluation[S]):
    """StrongREJECT Evaluation class using a small version of the StrongREJECT dataset."""

    name: EvalName = EvalName.STRONG_REJECT_SMALL

    @override
    def load_strong_reject_prompts(self) -> list[str]:
        """Load the small version of the StrongReject dataset into an Arrow Dataset, and then return prompts.

        Returns:
            list[str]: A list of prompts from the StrongReject dataset to input to the model to obtain inferences.
        """
        strong_reject_dataset: ArrowDataset = (
            load_strong_reject_datasets.load_strongreject_small()
        )
        ...

and then we can specify the "strongreject_small" in a .yaml file to refer to the usage of this.

Loading