From 58e24a4e839f2fef73a376f40d927d8af2b7f793 Mon Sep 17 00:00:00 2001 From: Tom Tseng Date: Fri, 13 Feb 2026 16:06:35 -0800 Subject: [PATCH 1/6] Unify eval prompt formatting on apply_chat_template via configurable Jinja2 templates --- src/tamperbench/whitebox/evals/base.py | 8 +- .../evals/embedding_attack/softopt.py | 70 +----- .../whitebox/evals/gcg/implementation.py | 4 +- .../whitebox/evals/ifeval/ifeval.py | 6 +- src/tamperbench/whitebox/evals/mbpp/mbpp.py | 5 - .../whitebox/evals/mmlu_pro/mmlu_pro.py | 33 +-- .../whitebox/evals/mt_bench/mt_bench.py | 22 +- .../evals/strong_reject/strong_reject.py | 30 ++- src/tamperbench/whitebox/evals/utils.py | 20 +- .../whitebox/utils/models/config.py | 22 +- .../whitebox/utils/models/templates.py | 65 +++++ src/tamperbench/whitebox/utils/names.py | 1 + tests/attacks/test_config_validation.py | 4 +- tests/attacks/test_embedding_attack.py | 5 +- tests/attacks/test_full_finetune.py | 5 +- tests/attacks/test_gcg_attack.py | 5 +- tests/attacks/test_jailbreak_finetune.py | 5 +- tests/attacks/test_lora_attack.py | 8 +- tests/attacks/test_prompt_attack.py | 5 +- tests/evals/test_embedding_attack_eval.py | 5 +- tests/evals/test_ifeval.py | 6 +- tests/evals/test_mbpp.py | 5 +- tests/evals/test_minerva_math.py | 5 +- tests/evals/test_mmlu_pro.py | 5 +- tests/evals/test_strong_reject.py | 5 +- tests/test_templates.py | 231 ++++++++++++++++++ 26 files changed, 405 insertions(+), 180 deletions(-) create mode 100644 tests/test_templates.py diff --git a/src/tamperbench/whitebox/evals/base.py b/src/tamperbench/whitebox/evals/base.py index e475dbe1..5e73528d 100644 --- a/src/tamperbench/whitebox/evals/base.py +++ b/src/tamperbench/whitebox/evals/base.py @@ -26,6 +26,7 @@ OptimizationDirection, ) from tamperbench.whitebox.utils.models.config import ModelConfig +from tamperbench.whitebox.utils.models.templates import configure_tokenizer_template C = TypeVar(name="C", bound="WhiteBoxEvaluationConfig") @@ -208,11 +209,15 @@ def load_result_objective(cls, results: DataFrame[EvaluationSchema]) -> float: def load_tokenizer(eval_config: WhiteBoxEvaluationConfig) -> PreTrainedTokenizer: """Load HuggingFace tokenizer from eval config. + Automatically configures the tokenizer's chat_template based on + ``eval_config.model_config.template_name`` so that downstream code + can use ``tokenizer.apply_chat_template()`` directly. + Args: eval_config: Evaluation configuration with model checkpoint and settings. Returns: - HuggingFace tokenizer. + HuggingFace tokenizer with chat_template configured. """ tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path=str( @@ -223,6 +228,7 @@ def load_tokenizer(eval_config: WhiteBoxEvaluationConfig) -> PreTrainedTokenizer trust_remote_code=True, ) tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token + configure_tokenizer_template(tokenizer, eval_config.model_config) return tokenizer diff --git a/src/tamperbench/whitebox/evals/embedding_attack/softopt.py b/src/tamperbench/whitebox/evals/embedding_attack/softopt.py index dba31e3a..7448df3d 100644 --- a/src/tamperbench/whitebox/evals/embedding_attack/softopt.py +++ b/src/tamperbench/whitebox/evals/embedding_attack/softopt.py @@ -14,63 +14,6 @@ format="%(asctime)s - %(levelname)s - %(message)s", ) -from transformers import PreTrainedTokenizerBase - - -def ensure_default_chat_template( - tokenizer: PreTrainedTokenizerBase, - *, - user_prefix: str, - assistant_prefix: str, - end_turn: str, - system_prefix: str = "", -) -> None: - """Attach a simple chat template if the tokenizer doesn't already have one.""" - # If a template already exists and works, don't override it. - try: - if getattr(tokenizer, "chat_template", None): - tokenizer.apply_chat_template( - [{"role": "user", "content": "ping"}], - tokenize=False, - add_generation_prompt=True, - ) - return - except Exception: - # Existing template is broken; we'll set our own below. - pass - - # Minimal, prefix-based template. - # Renders (optionally) a BOS token, then each message with the right prefix and end_turn, - # and if add_generation_prompt=True it appends the assistant prefix for generation. - template = ( - "{% if bos_token %}{{ bos_token }}{% endif %}" - "{% for message in messages %}" - "{% if message['role'] == 'system' %}" - + system_prefix - + "{{ message['content'] }}" - + end_turn - + "{% elif message['role'] == 'user' %}" - + user_prefix - + "{{ message['content'] }}" - + end_turn - + "{% elif message['role'] == 'assistant' %}" - + assistant_prefix - + "{{ message['content'] }}" - + end_turn - + "{% else %}" - # Fallback for unknown roles: treat like user - + user_prefix - + "{{ message['content'] }}" - + end_turn - + "{% endif %}" - "{% endfor %}" - "{% if add_generation_prompt %}" + assistant_prefix + "{% endif %}" - "{% if eos_token and not add_generation_prompt %}{{ eos_token }}{% endif %}" - ) - tokenizer.chat_template = template - - return - @dataclass class SoftOptConfig: @@ -149,17 +92,8 @@ def run_soft_opt( if not any(["{optim_str}" in d["content"] for d in input_messages]): input_messages[-1]["content"] = input_messages[-1]["content"] + "{optim_str}" - user_prefix = "User: " - assistant_prefix = "Assistant: " - end_turn = "\n" - - ensure_default_chat_template( - tokenizer, - user_prefix=user_prefix, - assistant_prefix=assistant_prefix, - end_turn=end_turn, - ) - + # The tokenizer's chat_template should already be configured by the caller + # (e.g. via configure_tokenizer_template in load_hf_model_and_tokenizer). template = tokenizer.apply_chat_template( input_messages, tokenize=False, add_generation_prompt=True ) diff --git a/src/tamperbench/whitebox/evals/gcg/implementation.py b/src/tamperbench/whitebox/evals/gcg/implementation.py index 21a72ffd..687f0fb6 100644 --- a/src/tamperbench/whitebox/evals/gcg/implementation.py +++ b/src/tamperbench/whitebox/evals/gcg/implementation.py @@ -448,9 +448,7 @@ def __init__( logger.warning("Model is on the CPU. Use a hardware accelerator for faster optimization.") if not tokenizer.chat_template: - logger.warning( - "Tokenizer does not have a chat template. Assuming base model and setting chat template to empty." - ) + logger.warning("Tokenizer does not have a chat template. Setting a minimal pass-through template.") tokenizer.chat_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}" def run( diff --git a/src/tamperbench/whitebox/evals/ifeval/ifeval.py b/src/tamperbench/whitebox/evals/ifeval/ifeval.py index 22e686b9..f34e33ba 100644 --- a/src/tamperbench/whitebox/evals/ifeval/ifeval.py +++ b/src/tamperbench/whitebox/evals/ifeval/ifeval.py @@ -44,6 +44,7 @@ ) from tamperbench.whitebox.evals.registry import register_evaluation from tamperbench.whitebox.utils import EvalName, MetricName, OptimizationDirection +from tamperbench.whitebox.utils.models.templates import configure_tokenizer_template from tamperbench.whitebox.utils.ops import run_in_isolation multiprocessing.set_start_method("spawn", force=True) @@ -211,8 +212,9 @@ def _instantiate_model_and_infer( else eval_config.model_checkpoint ) - # Load tokenizer and apply chat template + # Load tokenizer and configure chat template from model config tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True) + configure_tokenizer_template(tokenizer, eval_config.model_config) # Format prompts using tokenizer's chat template formatted_prompts = [] @@ -222,7 +224,6 @@ def _instantiate_model_and_infer( messages, tokenize=False, add_generation_prompt=True, - enable_thinking=False, ) formatted_prompts.append(formatted) @@ -241,7 +242,6 @@ def _instantiate_model_and_infer( temperature=temperature, max_tokens=max_tokens, stop=[], - extra_args={"enable_thinking": False}, ) inferences: dict[str, list[str]] = { diff --git a/src/tamperbench/whitebox/evals/mbpp/mbpp.py b/src/tamperbench/whitebox/evals/mbpp/mbpp.py index c5a4ff79..96bd73e0 100644 --- a/src/tamperbench/whitebox/evals/mbpp/mbpp.py +++ b/src/tamperbench/whitebox/evals/mbpp/mbpp.py @@ -251,11 +251,6 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: return InferenceSchema.validate(payload) - def _format_with_chat_template(self, prompt: str) -> str: - """Apply chat template to prompt.""" - model_config = self.eval_config.model_config - return f"{model_config.user_prefix}{prompt}{model_config.end_turn}{model_config.assistant_prefix}" - @override def compute_scores(self, inferences: DataFrame[InferenceSchema]) -> DataFrame[ScoreSchema]: """Execute generated code and compute pass/fail scores.""" diff --git a/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py b/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py index 374ff443..fe4b253a 100644 --- a/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py +++ b/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py @@ -35,6 +35,8 @@ MetricName, OptimizationDirection, ) +from tamperbench.whitebox.utils.models.templates import configure_tokenizer_template +from tamperbench.whitebox.utils.names import TemplateName from tamperbench.whitebox.utils.ops import run_in_isolation multiprocessing.set_start_method("spawn", force=True) @@ -58,18 +60,9 @@ class MMLUProEvaluationConfig(WhiteBoxEvaluationConfig): Attributes: n_shots: Number of few-shot CoT examples per question. Set to 0 for zero-shot. - use_chat_template: If True, format prompts as multi-turn chat messages using - tokenizer.apply_chat_template(). If False, use raw text continuation format. - chat_template_model: HuggingFace model ID to load a chat template from when the - primary tokenizer has no chat_template. Useful for evaluating base models - with their corresponding instruct model's chat template (e.g. using - ``meta-llama/Llama-3.1-8B-Instruct``'s template for - ``meta-llama/Meta-Llama-3-8B``). Only used when ``use_chat_template=True``. """ n_shots: int = N_SHOTS - use_chat_template: bool = False - chat_template_model: str | None = None def __post_init__(self) -> None: """Validate configuration values.""" @@ -271,24 +264,10 @@ def compute_mmlu_inferences( pretrained_model_name_or_path=(eval_config.model_config.tokenizer_checkpoint or eval_config.model_checkpoint), trust_remote_code=True, ) + configure_tokenizer_template(hf_tokenizer, eval_config.model_config) + + use_chat = eval_config.model_config.template_name != TemplateName.PLAIN - if eval_config.use_chat_template: - if not hf_tokenizer.chat_template: - if eval_config.chat_template_model: - fallback_tokenizer = AutoTokenizer.from_pretrained( - eval_config.chat_template_model, trust_remote_code=True - ) - hf_tokenizer.chat_template = fallback_tokenizer.chat_template - else: - raise ValueError( - f"Tokenizer for {eval_config.model_checkpoint} has no chat_template and " - f"no chat_template_model was provided." - ) - elif eval_config.chat_template_model: - raise ValueError( - f"chat_template_model={eval_config.chat_template_model!r} was specified but the tokenizer for " - f"{eval_config.model_checkpoint} already has a chat_template." - ) # vllm tensor_parallel_size must be a factor of 64 (model hidden dimensions). # TODO: Handle scaling beyond 4 GPUs properly by validating against model config. llm_kwargs = { @@ -323,7 +302,7 @@ def compute_mmlu_inferences( k = eval_config.n_shots prompt: str | None = None while True: - if eval_config.use_chat_template: + if use_chat: messages = mmlu_api.generate_chat_messages(shots, row, max(k, 0)) prompt = hf_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) else: diff --git a/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py b/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py index 235529fb..2d5f5a89 100644 --- a/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py +++ b/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py @@ -220,6 +220,10 @@ def __init__(self, eval_config: MTBenchEvaluationConfig) -> None: def _format_prompt(self, messages: list[dict[str, str]], tokenizer: PreTrainedTokenizer) -> str: """Format messages into a prompt string using the tokenizer's chat template. + The tokenizer's chat_template is configured during loading via + ``configure_tokenizer_template``, so ``apply_chat_template`` is + always available. + Args: messages: List of message dicts with 'role' and 'content'. tokenizer: The tokenizer to use for formatting. @@ -227,21 +231,9 @@ def _format_prompt(self, messages: list[dict[str, str]], tokenizer: PreTrainedTo Returns: str: Formatted prompt string. """ - if hasattr(tokenizer, "apply_chat_template"): - result = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - assert isinstance(result, str) - return result - # Fallback for models without chat template - prompt = "" - for msg in messages: - if msg["role"] == "system": - prompt += f"System: {msg['content']}\n\n" - elif msg["role"] == "user": - prompt += f"User: {msg['content']}\n\n" - elif msg["role"] == "assistant": - prompt += f"Assistant: {msg['content']}\n\n" - prompt += "Assistant: " - return prompt + result = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + assert isinstance(result, str) + return result def _generate_response( self, prompt: str, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, max_new_tokens: int = 512 diff --git a/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py b/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py index c7084d14..b2bf65ff 100644 --- a/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py +++ b/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py @@ -15,6 +15,7 @@ from strong_reject import load_datasets as load_strong_reject_datasets from strong_reject.evaluate import strongreject_finetuned as strong_reject_finetuned from tqdm import tqdm +from transformers import AutoTokenizer from typing_extensions import override from vllm import LLM, SamplingParams @@ -25,11 +26,13 @@ ScoreSchema, ) from tamperbench.whitebox.evals.registry import register_evaluation +from tamperbench.whitebox.evals.utils import format_chat_prompt from tamperbench.whitebox.utils import ( EvalName, MetricName, OptimizationDirection, ) +from tamperbench.whitebox.utils.models.templates import configure_tokenizer_template from tamperbench.whitebox.utils.ops import run_in_isolation S = TypeVar(name="S", bound="StrongRejectEvaluationConfig") @@ -124,13 +127,21 @@ def load_strong_reject_prompts(self) -> list[str]: """Load the StrongReject dataset into an Arrow Dataset, and then return prompts. Returns: - list[str]: A list of prompts from the StrongReject dataset to input to the model to obtain inferences. + list[str]: A list of formatted prompts from the StrongReject dataset to input to the model. """ strong_reject_dataset: ArrowDataset = load_strong_reject_datasets.load_strongreject() if self.eval_config.max_samples is not None: strong_reject_dataset = strong_reject_dataset.select(range(self.eval_config.max_samples)) - return [row["forbidden_prompt"] for row in strong_reject_dataset] # pyright: ignore[reportCallIssue] + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=str( + self.eval_config.model_config.tokenizer_checkpoint or self.eval_config.model_checkpoint + ), + trust_remote_code=True, + ) + configure_tokenizer_template(tokenizer, self.eval_config.model_config) + + return [format_chat_prompt(row["forbidden_prompt"], tokenizer) for row in strong_reject_dataset] # pyright: ignore[reportCallIssue] @register_evaluation(EvalName.JAILBREAK_BENCH) @@ -151,14 +162,15 @@ def load_strong_reject_prompts(self) -> list[str]: path="JailbreakBench/JBB-Behaviors", name="behaviors", split="harmful" ) - user_prefix = self.eval_config.model_config.user_prefix - assistant_prefix = self.eval_config.model_config.assistant_prefix - end_turn = self.eval_config.model_config.end_turn - - def to_text(data_row: dict[str, str]) -> str: - return f"{user_prefix}{data_row['Goal']}{end_turn}{assistant_prefix}" + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=str( + self.eval_config.model_config.tokenizer_checkpoint or self.eval_config.model_checkpoint + ), + trust_remote_code=True, + ) + configure_tokenizer_template(tokenizer, self.eval_config.model_config) - return [to_text(row) for row in jbb_dataset] # pyright: ignore[reportCallIssue] + return [format_chat_prompt(row["Goal"], tokenizer) for row in jbb_dataset] # pyright: ignore[reportCallIssue] @override def compute_results(self, scores: DataFrame[ScoreSchema]) -> DataFrame[EvaluationSchema]: diff --git a/src/tamperbench/whitebox/evals/utils.py b/src/tamperbench/whitebox/evals/utils.py index 876cc1dc..e2ef9576 100644 --- a/src/tamperbench/whitebox/evals/utils.py +++ b/src/tamperbench/whitebox/evals/utils.py @@ -36,14 +36,14 @@ def format_chat_prompt( system_prompt: str | None = None, history: list[dict[str, str]] | None = None, ) -> str: - """Format messages for chat models with automatic fallback. + """Format messages into a chat prompt using the tokenizer's chat_template. - Tries to use the tokenizer's chat_template if available, otherwise - falls back to a simple text format. + The tokenizer's chat_template must already be configured (e.g. via + ``configure_tokenizer_template``) before calling this function. Args: user_content: The user's message content. - tokenizer: The tokenizer to use for formatting. + tokenizer: The tokenizer to use for formatting (must have chat_template set). system_prompt: Optional system prompt to prepend. history: Optional conversation history (list of {"role": ..., "content": ...}). @@ -60,17 +60,7 @@ def format_chat_prompt( messages.append({"role": "user", "content": user_content}) - # Try chat template first - if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template: - return cast(str, tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)) - - # Fallback for models without chat template - parts = [] - for msg in messages: - role = msg["role"].title() - parts.append(f"{role}: {msg['content']}") - - return "\n\n".join(parts) + "\n\nAssistant:" + return cast(str, tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)) def llm_judge_score( diff --git a/src/tamperbench/whitebox/utils/models/config.py b/src/tamperbench/whitebox/utils/models/config.py index 8987c05e..a43a1d06 100644 --- a/src/tamperbench/whitebox/utils/models/config.py +++ b/src/tamperbench/whitebox/utils/models/config.py @@ -6,6 +6,7 @@ from typing_extensions import Self from tamperbench.whitebox.utils.models.templates import get_template +from tamperbench.whitebox.utils.names import TemplateName @dataclass @@ -13,6 +14,7 @@ class ModelConfig: """Configuration for model inference including chat templates and generation settings. Attributes: + template_name: The name of the chat template to use for formatting. user_prefix: The prefix to append to input prompts (that would be from a user). Similar to instruction template. assistant_prefix: The prefix to append to the output prompts. This would mark the portion of a model @@ -24,6 +26,7 @@ class ModelConfig: tokenizer_checkpoint: Optional path to a separate tokenizer checkpoint. If None, uses model_checkpoint for tokenizer. """ + template_name: TemplateName user_prefix: str assistant_prefix: str end_turn: str @@ -44,12 +47,19 @@ def from_dict(cls, data: dict[str, Any]) -> Self: data = dict(data) # shallow copy # Support a template registry to fill in prefixes if provided if "template" in data: - template_name = data.pop("template") - template = get_template(template_name) - - data.setdefault("user_prefix", template.user_prefix) - data.setdefault("assistant_prefix", template.assistant_prefix) - data.setdefault("end_turn", template.end_turn) + template_name_str = data.pop("template") + template_name = TemplateName(template_name_str) + data["template_name"] = template_name + + if template_name == TemplateName.NATIVE: + data.setdefault("user_prefix", "") + data.setdefault("assistant_prefix", "") + data.setdefault("end_turn", "") + else: + template = get_template(template_name) + data.setdefault("user_prefix", template.user_prefix) + data.setdefault("assistant_prefix", template.assistant_prefix) + data.setdefault("end_turn", template.end_turn) data.setdefault("tokenizer_checkpoint", None) diff --git a/src/tamperbench/whitebox/utils/models/templates.py b/src/tamperbench/whitebox/utils/models/templates.py index 268d082c..1cc87ae4 100644 --- a/src/tamperbench/whitebox/utils/models/templates.py +++ b/src/tamperbench/whitebox/utils/models/templates.py @@ -4,10 +4,18 @@ hydrate the corresponding `user_prefix`, `assistant_prefix`, and `end_turn`. """ +from __future__ import annotations + from dataclasses import dataclass +from typing import TYPE_CHECKING from tamperbench.whitebox.utils.names import TemplateName +if TYPE_CHECKING: + from transformers import PreTrainedTokenizerBase + + from tamperbench.whitebox.utils.models.config import ModelConfig + @dataclass(frozen=True) class TextTemplate: @@ -17,6 +25,34 @@ class TextTemplate: assistant_prefix: str end_turn: str + def to_jinja2(self) -> str: + """Convert this template to a Jinja2 chat template string. + + The generated template renders system/user/assistant messages with the + configured prefixes and end-of-turn delimiter. BOS tokens are not included + because the tokenizer handles those during tokenization. + + Returns: + A Jinja2 template string suitable for ``tokenizer.chat_template``. + """ + return ( + "{% for message in messages %}" + "{% if message['role'] == 'system' %}" + "{{ message['content'] }}" + + self.end_turn + + "{% elif message['role'] == 'user' %}" + + self.user_prefix + + "{{ message['content'] }}" + + self.end_turn + + "{% elif message['role'] == 'assistant' %}" + + self.assistant_prefix + + "{{ message['content'] }}" + + self.end_turn + + "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}" + self.assistant_prefix + "{% endif %}" + ) + TEMPLATE_REGISTRY: dict[TemplateName, TextTemplate] = { TemplateName.LLAMA3: TextTemplate( @@ -75,3 +111,32 @@ def get_template(name: str | TemplateName) -> TextTemplate: return TEMPLATE_REGISTRY[key] except KeyError as e: raise KeyError(f"Unknown text template: {name}") from e + + +def configure_tokenizer_template( + tokenizer: PreTrainedTokenizerBase, + model_config: ModelConfig, +) -> None: + """Set the tokenizer's chat_template based on the model config. + + For ``TemplateName.NATIVE``, the tokenizer's own template is preserved. + For all other template names, the tokenizer's chat_template is replaced + with the Jinja2 rendering of the corresponding ``TextTemplate``. + + This should be called once after loading the tokenizer so that all + downstream code can simply use ``tokenizer.apply_chat_template()``. + + Args: + tokenizer: HuggingFace tokenizer to configure. + model_config: Model configuration containing the template name. + + Raises: + ValueError: If ``template_name`` is ``NATIVE`` but the tokenizer + has no built-in chat_template. + """ + if model_config.template_name == TemplateName.NATIVE: + if not getattr(tokenizer, "chat_template", None): + raise ValueError("template='native' was specified but the tokenizer has no chat_template") + return + template = get_template(model_config.template_name) + tokenizer.chat_template = template.to_jinja2() diff --git a/src/tamperbench/whitebox/utils/names.py b/src/tamperbench/whitebox/utils/names.py index 131cd7a2..1163651f 100644 --- a/src/tamperbench/whitebox/utils/names.py +++ b/src/tamperbench/whitebox/utils/names.py @@ -230,3 +230,4 @@ class TemplateName(StrEnum): PLAIN = "plain" GPT_CHAT = "gpt_chat" QWEN = "qwen" + NATIVE = "native" diff --git a/tests/attacks/test_config_validation.py b/tests/attacks/test_config_validation.py index 1795f88d..24bb1259 100644 --- a/tests/attacks/test_config_validation.py +++ b/tests/attacks/test_config_validation.py @@ -22,9 +22,7 @@ def _make_base_dict() -> dict[str, Any]: "out_dir": "/tmp/test", "evals": [], "model_config": { - "user_prefix": "User: ", - "assistant_prefix": "Assistant: ", - "end_turn": "\n", + "template": "generic_chat", "max_generation_length": 16, "inference_batch_size": 1, }, diff --git a/tests/attacks/test_embedding_attack.py b/tests/attacks/test_embedding_attack.py index a4884de4..463eb5e2 100644 --- a/tests/attacks/test_embedding_attack.py +++ b/tests/attacks/test_embedding_attack.py @@ -9,7 +9,7 @@ from tamperbench.whitebox.evals.embedding_attack import SoftOptConfig from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -22,9 +22,10 @@ def test_embedding_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, ), diff --git a/tests/attacks/test_full_finetune.py b/tests/attacks/test_full_finetune.py index 40633eb1..b0307640 100644 --- a/tests/attacks/test_full_finetune.py +++ b/tests/attacks/test_full_finetune.py @@ -11,7 +11,7 @@ ) from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -24,9 +24,10 @@ def test_full_finetune_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, ), diff --git a/tests/attacks/test_gcg_attack.py b/tests/attacks/test_gcg_attack.py index 52743867..05a03e0d 100644 --- a/tests/attacks/test_gcg_attack.py +++ b/tests/attacks/test_gcg_attack.py @@ -14,7 +14,7 @@ from tamperbench.whitebox.evals.gcg.implementation import GCGConfig from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -27,9 +27,10 @@ def test_gcg_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=512, inference_batch_size=16, ), diff --git a/tests/attacks/test_jailbreak_finetune.py b/tests/attacks/test_jailbreak_finetune.py index e2575f69..17d25ce4 100644 --- a/tests/attacks/test_jailbreak_finetune.py +++ b/tests/attacks/test_jailbreak_finetune.py @@ -12,7 +12,7 @@ from tamperbench.whitebox.attacks.jailbreak_finetune.jailbreaks import JailbreakTuneType from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -25,9 +25,10 @@ def test_jailbreak_finetune_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, ), diff --git a/tests/attacks/test_lora_attack.py b/tests/attacks/test_lora_attack.py index 5114663d..98c67965 100644 --- a/tests/attacks/test_lora_attack.py +++ b/tests/attacks/test_lora_attack.py @@ -11,7 +11,7 @@ ) from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -24,9 +24,10 @@ def test_lora_finetune_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, ), @@ -63,9 +64,10 @@ def test_lora_finetune_cheap() -> None: input_checkpoint_path="EleutherAI/pythia-14m", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.GENERIC_CHAT, user_prefix="User: ", assistant_prefix="Assistant: ", - end_turn="\n\n", + end_turn="\n", max_generation_length=16, inference_batch_size=2, ), diff --git a/tests/attacks/test_prompt_attack.py b/tests/attacks/test_prompt_attack.py index ce7ca269..cbd9c886 100644 --- a/tests/attacks/test_prompt_attack.py +++ b/tests/attacks/test_prompt_attack.py @@ -15,7 +15,7 @@ ) from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -28,9 +28,10 @@ def test_prompt_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=512, inference_batch_size=16, ), diff --git a/tests/evals/test_embedding_attack_eval.py b/tests/evals/test_embedding_attack_eval.py index a935a81a..a6cddcd1 100644 --- a/tests/evals/test_embedding_attack_eval.py +++ b/tests/evals/test_embedding_attack_eval.py @@ -12,7 +12,7 @@ from tamperbench.whitebox.evals.embedding_attack import SoftOptConfig from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName @pytest.mark.expensive @@ -25,9 +25,10 @@ def test_embedding_attack_evaluation() -> None: model_checkpoint="lapisrocks/Llama-3-8B-Instruct-TAR-Refusal", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, tokenizer_checkpoint="meta-llama/Meta-Llama-3-8B-Instruct", diff --git a/tests/evals/test_ifeval.py b/tests/evals/test_ifeval.py index fb372483..261f41ef 100644 --- a/tests/evals/test_ifeval.py +++ b/tests/evals/test_ifeval.py @@ -12,7 +12,7 @@ from tamperbench.whitebox.evals.ifeval.ifeval import IFEvalEvaluationConfig from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName @pytest.mark.expensive @@ -25,9 +25,10 @@ def test_ifeval_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=512, inference_batch_size=4, ), @@ -37,6 +38,7 @@ def test_ifeval_evaluation() -> None: # model_checkpoint="Qwen/Qwen3-4B", # out_dir=tmpdirname, # model_config=ModelConfig( + # template_name=TemplateName.QWEN, # user_prefix="<|im_start|>user\n", # assistant_prefix="<|im_start|>assistant\n", # end_turn="<|im_end|>", diff --git a/tests/evals/test_mbpp.py b/tests/evals/test_mbpp.py index 0424bd52..cb9a1216 100644 --- a/tests/evals/test_mbpp.py +++ b/tests/evals/test_mbpp.py @@ -9,7 +9,7 @@ from tamperbench.whitebox.evals.mbpp.mbpp import MBPPEvaluationConfig from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName @pytest.mark.expensive @@ -22,9 +22,10 @@ def test_mbpp_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=512, inference_batch_size=8, ), diff --git a/tests/evals/test_minerva_math.py b/tests/evals/test_minerva_math.py index dd375069..db772675 100644 --- a/tests/evals/test_minerva_math.py +++ b/tests/evals/test_minerva_math.py @@ -11,7 +11,7 @@ ) from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName @pytest.mark.expensive @@ -24,9 +24,10 @@ def test_minerva_math_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=2048, inference_batch_size=4, ), diff --git a/tests/evals/test_mmlu_pro.py b/tests/evals/test_mmlu_pro.py index 09fb743b..0e22d35f 100644 --- a/tests/evals/test_mmlu_pro.py +++ b/tests/evals/test_mmlu_pro.py @@ -13,7 +13,7 @@ ) from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName SUBJECT = "biology" @@ -139,9 +139,10 @@ def test_mmlu_pro_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.LLAMA3, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, ), diff --git a/tests/evals/test_strong_reject.py b/tests/evals/test_strong_reject.py index 6fdd8cbb..cedba4c7 100644 --- a/tests/evals/test_strong_reject.py +++ b/tests/evals/test_strong_reject.py @@ -12,7 +12,7 @@ StrongRejectEvaluation, ) from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName @pytest.mark.gpu_required @@ -23,9 +23,10 @@ def test_strong_reject_evaluation() -> None: model_checkpoint="EleutherAI/pythia-14m", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.GENERIC_CHAT, user_prefix="User: ", assistant_prefix="Assistant: ", - end_turn="\n\n", + end_turn="\n", max_generation_length=64, inference_batch_size=4, ), diff --git a/tests/test_templates.py b/tests/test_templates.py new file mode 100644 index 00000000..8a0291c6 --- /dev/null +++ b/tests/test_templates.py @@ -0,0 +1,231 @@ +"""Tests for TextTemplate.to_jinja2() and configure_tokenizer_template().""" + +from unittest.mock import MagicMock + +import pytest + +from tamperbench.whitebox.utils.models.config import ModelConfig +from tamperbench.whitebox.utils.models.templates import ( + TEMPLATE_REGISTRY, + configure_tokenizer_template, + get_template, +) +from tamperbench.whitebox.utils.names import TemplateName + + +class TestToJinja2: + """Tests for TextTemplate.to_jinja2().""" + + def test_plain_template_produces_raw_content(self) -> None: + """PLAIN template should produce content with no prefixes.""" + template = get_template(TemplateName.PLAIN) + jinja2 = template.to_jinja2() + + # Verify it's a valid Jinja2 template by importing jinja2 + from jinja2 import Template + + t = Template(jinja2) + result = t.render( + messages=[{"role": "user", "content": "hello"}], + add_generation_prompt=True, + ) + assert result == "hello" + + def test_generic_chat_template(self) -> None: + """GENERIC_CHAT template should produce User:/Assistant: format.""" + template = get_template(TemplateName.GENERIC_CHAT) + jinja2 = template.to_jinja2() + + from jinja2 import Template + + t = Template(jinja2) + result = t.render( + messages=[{"role": "user", "content": "hello"}], + add_generation_prompt=True, + ) + assert result == "User: hello\nAssistant: " + + def test_generic_chat_with_system(self) -> None: + """GENERIC_CHAT template should render system messages.""" + template = get_template(TemplateName.GENERIC_CHAT) + jinja2 = template.to_jinja2() + + from jinja2 import Template + + t = Template(jinja2) + result = t.render( + messages=[ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "hello"}, + ], + add_generation_prompt=True, + ) + assert result == "You are helpful.\nUser: hello\nAssistant: " + + def test_generic_chat_multi_turn(self) -> None: + """GENERIC_CHAT template should handle multi-turn conversations.""" + template = get_template(TemplateName.GENERIC_CHAT) + jinja2 = template.to_jinja2() + + from jinja2 import Template + + t = Template(jinja2) + result = t.render( + messages=[ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi there"}, + {"role": "user", "content": "how are you?"}, + ], + add_generation_prompt=True, + ) + assert result == "User: hello\nAssistant: hi there\nUser: how are you?\nAssistant: " + + def test_no_generation_prompt(self) -> None: + """When add_generation_prompt is False, assistant prefix should not be appended.""" + template = get_template(TemplateName.GENERIC_CHAT) + jinja2 = template.to_jinja2() + + from jinja2 import Template + + t = Template(jinja2) + result = t.render( + messages=[ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi there"}, + ], + add_generation_prompt=False, + ) + assert result == "User: hello\nAssistant: hi there\n" + + def test_instruction_response_template(self) -> None: + """INSTRUCTION_RESPONSE template should use INSTRUCTION:/RESPONSE: format.""" + template = get_template(TemplateName.INSTRUCTION_RESPONSE) + jinja2 = template.to_jinja2() + + from jinja2 import Template + + t = Template(jinja2) + result = t.render( + messages=[{"role": "user", "content": "Summarize this."}], + add_generation_prompt=True, + ) + assert result == "INSTRUCTION: Summarize this.RESPONSE: " + + def test_all_templates_produce_valid_jinja2(self) -> None: + """All registered templates should produce valid Jinja2 strings.""" + from jinja2 import Template + + for name, template in TEMPLATE_REGISTRY.items(): + jinja2 = template.to_jinja2() + t = Template(jinja2) + # Should render without error + result = t.render( + messages=[{"role": "user", "content": "test"}], + add_generation_prompt=True, + ) + assert isinstance(result, str), f"Template {name} did not produce a string" + + +class TestConfigureTokenizerTemplate: + """Tests for configure_tokenizer_template().""" + + def test_named_template_sets_chat_template(self) -> None: + """Named template should set the tokenizer's chat_template.""" + tokenizer = MagicMock() + tokenizer.chat_template = None + + model_config = ModelConfig( + template_name=TemplateName.GENERIC_CHAT, + user_prefix="User: ", + assistant_prefix="Assistant: ", + end_turn="\n", + max_generation_length=512, + inference_batch_size=16, + ) + + configure_tokenizer_template(tokenizer, model_config) + + expected = get_template(TemplateName.GENERIC_CHAT).to_jinja2() + assert tokenizer.chat_template == expected + + def test_native_preserves_existing_template(self) -> None: + """NATIVE should not modify the tokenizer's existing chat_template.""" + tokenizer = MagicMock() + original_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}" + tokenizer.chat_template = original_template + + model_config = ModelConfig( + template_name=TemplateName.NATIVE, + user_prefix="", + assistant_prefix="", + end_turn="", + max_generation_length=512, + inference_batch_size=16, + ) + + configure_tokenizer_template(tokenizer, model_config) + + assert tokenizer.chat_template == original_template + + def test_native_raises_without_template(self) -> None: + """NATIVE should raise ValueError if tokenizer has no chat_template.""" + tokenizer = MagicMock() + tokenizer.chat_template = None + + model_config = ModelConfig( + template_name=TemplateName.NATIVE, + user_prefix="", + assistant_prefix="", + end_turn="", + max_generation_length=512, + inference_batch_size=16, + ) + + with pytest.raises(ValueError, match="native"): + configure_tokenizer_template(tokenizer, model_config) + + +class TestModelConfigFromDict: + """Tests for ModelConfig.from_dict with template support.""" + + def test_from_dict_with_template_key(self) -> None: + """from_dict should hydrate template_name and prefixes from template key.""" + config = ModelConfig.from_dict( + { + "template": "llama3", + "max_generation_length": 1024, + "inference_batch_size": 16, + } + ) + assert config.template_name == TemplateName.LLAMA3 + assert config.user_prefix == "<|start_header_id|>user<|end_header_id|>\n\n" + assert config.assistant_prefix == "<|start_header_id|>assistant<|end_header_id|>\n\n" + assert config.end_turn == "<|eot_id|>" + + def test_from_dict_native_template(self) -> None: + """from_dict with template='native' should set empty prefixes.""" + config = ModelConfig.from_dict( + { + "template": "native", + "max_generation_length": 1024, + "inference_batch_size": 16, + } + ) + assert config.template_name == TemplateName.NATIVE + assert config.user_prefix == "" + assert config.assistant_prefix == "" + assert config.end_turn == "" + + def test_from_dict_plain_template(self) -> None: + """from_dict with template='plain' should set empty prefixes.""" + config = ModelConfig.from_dict( + { + "template": "plain", + "max_generation_length": 1024, + "inference_batch_size": 16, + } + ) + assert config.template_name == TemplateName.PLAIN + assert config.user_prefix == "" + assert config.assistant_prefix == "" + assert config.end_turn == "" From 4b3f0b54d5e5dfd990a3814973420ed5e38a6db2 Mon Sep 17 00:00:00 2001 From: Tom Tseng Date: Fri, 13 Feb 2026 16:22:04 -0800 Subject: [PATCH 2/6] evals: Chat template refactor cleanup --- .../whitebox/evals/gcg/implementation.py | 4 ---- .../whitebox/evals/ifeval/ifeval.py | 2 ++ .../evals/strong_reject/strong_reject.py | 20 +++---------------- .../whitebox/utils/models/templates.py | 10 ++++++++++ 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/src/tamperbench/whitebox/evals/gcg/implementation.py b/src/tamperbench/whitebox/evals/gcg/implementation.py index 687f0fb6..5b10a965 100644 --- a/src/tamperbench/whitebox/evals/gcg/implementation.py +++ b/src/tamperbench/whitebox/evals/gcg/implementation.py @@ -447,10 +447,6 @@ def __init__( if model.device == torch.device("cpu"): logger.warning("Model is on the CPU. Use a hardware accelerator for faster optimization.") - if not tokenizer.chat_template: - logger.warning("Tokenizer does not have a chat template. Setting a minimal pass-through template.") - tokenizer.chat_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}" - def run( self, messages: str | list[dict[str, str]], diff --git a/src/tamperbench/whitebox/evals/ifeval/ifeval.py b/src/tamperbench/whitebox/evals/ifeval/ifeval.py index f34e33ba..c6619333 100644 --- a/src/tamperbench/whitebox/evals/ifeval/ifeval.py +++ b/src/tamperbench/whitebox/evals/ifeval/ifeval.py @@ -224,6 +224,7 @@ def _instantiate_model_and_infer( messages, tokenize=False, add_generation_prompt=True, + enable_thinking=False, ) formatted_prompts.append(formatted) @@ -242,6 +243,7 @@ def _instantiate_model_and_infer( temperature=temperature, max_tokens=max_tokens, stop=[], + extra_args={"enable_thinking": False}, ) inferences: dict[str, list[str]] = { diff --git a/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py b/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py index b2bf65ff..91e090ed 100644 --- a/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py +++ b/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py @@ -15,11 +15,10 @@ from strong_reject import load_datasets as load_strong_reject_datasets from strong_reject.evaluate import strongreject_finetuned as strong_reject_finetuned from tqdm import tqdm -from transformers import AutoTokenizer from typing_extensions import override from vllm import LLM, SamplingParams -from tamperbench.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig +from tamperbench.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig, load_tokenizer from tamperbench.whitebox.evals.output_schema import ( EvaluationSchema, InferenceSchema, @@ -32,7 +31,6 @@ MetricName, OptimizationDirection, ) -from tamperbench.whitebox.utils.models.templates import configure_tokenizer_template from tamperbench.whitebox.utils.ops import run_in_isolation S = TypeVar(name="S", bound="StrongRejectEvaluationConfig") @@ -133,13 +131,7 @@ def load_strong_reject_prompts(self) -> list[str]: if self.eval_config.max_samples is not None: strong_reject_dataset = strong_reject_dataset.select(range(self.eval_config.max_samples)) - tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path=str( - self.eval_config.model_config.tokenizer_checkpoint or self.eval_config.model_checkpoint - ), - trust_remote_code=True, - ) - configure_tokenizer_template(tokenizer, self.eval_config.model_config) + tokenizer = load_tokenizer(self.eval_config) return [format_chat_prompt(row["forbidden_prompt"], tokenizer) for row in strong_reject_dataset] # pyright: ignore[reportCallIssue] @@ -162,13 +154,7 @@ def load_strong_reject_prompts(self) -> list[str]: path="JailbreakBench/JBB-Behaviors", name="behaviors", split="harmful" ) - tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path=str( - self.eval_config.model_config.tokenizer_checkpoint or self.eval_config.model_checkpoint - ), - trust_remote_code=True, - ) - configure_tokenizer_template(tokenizer, self.eval_config.model_config) + tokenizer = load_tokenizer(self.eval_config) return [format_chat_prompt(row["Goal"], tokenizer) for row in jbb_dataset] # pyright: ignore[reportCallIssue] diff --git a/src/tamperbench/whitebox/utils/models/templates.py b/src/tamperbench/whitebox/utils/models/templates.py index 1cc87ae4..a775d33a 100644 --- a/src/tamperbench/whitebox/utils/models/templates.py +++ b/src/tamperbench/whitebox/utils/models/templates.py @@ -35,6 +35,16 @@ def to_jinja2(self) -> str: Returns: A Jinja2 template string suitable for ``tokenizer.chat_template``. """ + for field_name, value in [ + ("user_prefix", self.user_prefix), + ("assistant_prefix", self.assistant_prefix), + ("end_turn", self.end_turn), + ]: + for marker in ("{%", "{{", "}}", "%}"): + assert marker not in value, ( + f"TextTemplate.{field_name} contains Jinja2 syntax {marker!r} " + f"which would break the generated template: {value!r}" + ) return ( "{% for message in messages %}" "{% if message['role'] == 'system' %}" From c31f75f41bdea3bf7ffe697cae29e00847b03e11 Mon Sep 17 00:00:00 2001 From: Tom Tseng Date: Fri, 13 Feb 2026 16:34:27 -0800 Subject: [PATCH 3/6] Fix pre-commit on chat template refactor --- .../run_mmlu_pro_variants.py | 53 +++++-- scripts/validate_references.py | 6 + .../whitebox/defenses/crl/datasets.py | 36 +---- .../whitebox/evals/mt_bench/mt_bench.py | 23 +-- .../evals/strong_reject/strong_reject.py | 2 +- src/tamperbench/whitebox/evals/utils.py | 40 +---- .../whitebox/utils/models/chat_format.py | 98 ++++++++++++ .../whitebox/utils/models/templates.py | 12 +- tests/evals/test_format_chat_prompt.py | 143 ++++++++++++++++++ 9 files changed, 310 insertions(+), 103 deletions(-) create mode 100644 src/tamperbench/whitebox/utils/models/chat_format.py create mode 100644 tests/evals/test_format_chat_prompt.py diff --git a/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py b/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py index a1467ba0..45663556 100644 --- a/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py +++ b/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py @@ -40,7 +40,7 @@ from tamperbench.whitebox.utils import AttackName, EvalPath from tamperbench.whitebox.utils.benchmark.path_generation import get_trial_dir_name from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import ConfigKeys +from tamperbench.whitebox.utils.names import ConfigKeys, TemplateName REPO_ROOT = get_repo_root() @@ -48,14 +48,14 @@ class _MMLUProVariant(TypedDict): name: str n_shots: int - use_chat_template: bool + template_override: TemplateName | None # None means use the model's own template MMLU_PRO_VARIANTS: list[_MMLUProVariant] = [ - {"name": "fewshot_continuation", "n_shots": 5, "use_chat_template": False}, - {"name": "zeroshot_continuation", "n_shots": 0, "use_chat_template": False}, - {"name": "fewshot_chat", "n_shots": 5, "use_chat_template": True}, - {"name": "zeroshot_chat", "n_shots": 0, "use_chat_template": True}, + {"name": "fewshot_continuation", "n_shots": 5, "template_override": TemplateName.PLAIN}, + {"name": "zeroshot_continuation", "n_shots": 0, "template_override": TemplateName.PLAIN}, + {"name": "fewshot_chat", "n_shots": 5, "template_override": None}, + {"name": "zeroshot_chat", "n_shots": 0, "template_override": None}, ] @@ -85,7 +85,7 @@ def run_attack_and_evaluate( output_results_dir: Path, eval_split: str, random_seed: int = 42, - chat_template_model: str | None = None, + tokenizer_checkpoint: str | None = None, skip_attack: bool = False, ) -> None: """Re-run attack with best hyperparameters and evaluate with all MMLU-Pro variants.""" @@ -165,16 +165,41 @@ def run_attack_and_evaluate( variant_name = variant["name"] variant_out_dir = attack_out_dir / "mmlu_pro_variants" / variant_name + template_override = variant["template_override"] print(f"\n--- Running MMLU-Pro variant: {variant_name} ---") - print(f" n_shots={variant['n_shots']}, use_chat_template={variant['use_chat_template']}") + print(f" n_shots={variant['n_shots']}, template={template_override or 'model default'}") + + # Override the template if the variant specifies one (e.g. PLAIN for continuation) + variant_model_config = model_config + if template_override is not None: + variant_model_config = ModelConfig.from_dict( + { + "template": template_override.value, + "max_generation_length": model_config.max_generation_length, + "inference_batch_size": model_config.inference_batch_size, + **( + {"tokenizer_checkpoint": model_config.tokenizer_checkpoint} + if model_config.tokenizer_checkpoint + else {} + ), + } + ) + elif tokenizer_checkpoint is not None and variant_model_config.tokenizer_checkpoint is None: + # For chat variants, use the provided tokenizer checkpoint if the model config doesn't have one + variant_model_config = ModelConfig.from_dict( + { + "template": model_config.template_name.value, + "max_generation_length": model_config.max_generation_length, + "inference_batch_size": model_config.inference_batch_size, + "tokenizer_checkpoint": tokenizer_checkpoint, + } + ) eval_config = MMLUProEvaluationConfig( model_checkpoint=attacker.output_checkpoint_path, out_dir=str(variant_out_dir), - model_config=model_config, + model_config=variant_model_config, n_shots=variant["n_shots"], - use_chat_template=variant["use_chat_template"], - chat_template_model=chat_template_model if variant["use_chat_template"] else None, ) evaluator = eval_cls(eval_config) @@ -213,10 +238,10 @@ def main() -> None: parser.add_argument("--eval-split", choices=["val", "test"], default="val", help="MMLU-Pro split to evaluate on") parser.add_argument("--random-seed", type=int, default=42) parser.add_argument( - "--chat-template-model", + "--tokenizer-checkpoint", type=str, default=None, - help="Model whose chat template to use when the primary tokenizer has no chat_template (e.g. for base models)", + help="Separate tokenizer to use when the primary model has no chat_template (e.g. for base models)", ) parser.add_argument( "--skip-attack", @@ -238,7 +263,7 @@ def main() -> None: output_results_dir=args.output_results_dir, eval_split=args.eval_split, random_seed=args.random_seed, - chat_template_model=args.chat_template_model, + tokenizer_checkpoint=args.tokenizer_checkpoint, skip_attack=args.skip_attack, ) diff --git a/scripts/validate_references.py b/scripts/validate_references.py index 3bc9d8c6..ef604b36 100644 --- a/scripts/validate_references.py +++ b/scripts/validate_references.py @@ -22,6 +22,7 @@ from tamperbench.whitebox.evals.wmdp.wmdp import WMDPEvaluation, WMDPEvaluationConfig from tamperbench.whitebox.evals.xstest.xstest import XSTestEvaluation, XSTestEvaluationConfig from tamperbench.whitebox.utils.models.config import ModelConfig +from tamperbench.whitebox.utils.names import TemplateName if TYPE_CHECKING: from pandera.typing.polars import DataFrame @@ -151,6 +152,7 @@ def _run_wmdp_validation( out_dir=out_dir, batch_size=batch_size, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="", assistant_prefix="", end_turn="", @@ -177,6 +179,7 @@ def _run_xstest_validation( out_dir=out_dir, batch_size=batch_size, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="", assistant_prefix="", end_turn="", @@ -203,6 +206,7 @@ def _run_strong_reject_validation( out_dir=out_dir, batch_size=batch_size, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="", assistant_prefix="", end_turn="", @@ -228,6 +232,7 @@ def _run_policy_eval_validation( out_dir=out_dir, batch_size=batch_size, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="", assistant_prefix="", end_turn="", @@ -253,6 +258,7 @@ def _run_mt_bench_validation( out_dir=out_dir, batch_size=batch_size, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="", assistant_prefix="", end_turn="", diff --git a/src/tamperbench/whitebox/defenses/crl/datasets.py b/src/tamperbench/whitebox/defenses/crl/datasets.py index 7ccb3070..3d1ef9a0 100644 --- a/src/tamperbench/whitebox/defenses/crl/datasets.py +++ b/src/tamperbench/whitebox/defenses/crl/datasets.py @@ -6,6 +6,8 @@ import torch from transformers import PreTrainedTokenizer +from tamperbench.whitebox.utils.models.chat_format import apply_chat_template_with_fallback + def format_with_chat_template( tokenizer: PreTrainedTokenizer, @@ -29,41 +31,15 @@ def format_with_chat_template( dict with input_ids, attention_mask, and response_mask tensors. """ # Format request (user message only, with generation prompt to get assistant header) - request_messages = [{"role": "user", "content": user_content}] - if tokenizer.chat_template is not None: - request_text = cast( - str, - tokenizer.apply_chat_template( - request_messages, - tokenize=False, - add_generation_prompt=True, # Adds assistant header - ), - ) - else: - # NOTE: This fallback path has a known BPE boundary sensitivity issue -- - # the response_mask may be off by 1-2 tokens because BPE tokenization at - # the "Assistant: " / response boundary can differ when tokenized separately - # vs. as part of the full text. The primary code path (chat templates) uses - # special tokens as boundaries and is not affected. All target models - # (Llama 2/3, Qwen) have chat templates. - request_text = f"User: {user_content}\nAssistant: " + request_messages: list[dict[str, str]] = [{"role": "user", "content": user_content}] + request_text = apply_chat_template_with_fallback(request_messages, tokenizer, add_generation_prompt=True) # Format full conversation - full_messages = [ + full_messages: list[dict[str, str]] = [ {"role": "user", "content": user_content}, {"role": "assistant", "content": assistant_content}, ] - if tokenizer.chat_template is not None: - full_text = cast( - str, - tokenizer.apply_chat_template( - full_messages, - tokenize=False, - add_generation_prompt=False, - ), - ) - else: - full_text = f"User: {user_content}\nAssistant: {assistant_content}" + full_text = apply_chat_template_with_fallback(full_messages, tokenizer, add_generation_prompt=False) # Tokenize request to find where response starts request_tokens = tokenizer( diff --git a/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py b/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py index 2d5f5a89..fe956da4 100644 --- a/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py +++ b/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py @@ -45,6 +45,7 @@ OptimizationDirection, dealloc_model_and_tokenizer, ) +from tamperbench.whitebox.utils.models.chat_format import apply_chat_template_with_fallback # Reference: Multiple sources for MT-Bench scores # Primary: "Judging LLM-as-a-Judge" (Zheng et al., NeurIPS 2023) @@ -217,24 +218,6 @@ def __init__(self, eval_config: MTBenchEvaluationConfig) -> None: self.openai_client: OpenAI = OpenAI() self.questions: list[MTBenchQuestion] = load_mt_bench_questions_by_category(eval_config.categories) - def _format_prompt(self, messages: list[dict[str, str]], tokenizer: PreTrainedTokenizer) -> str: - """Format messages into a prompt string using the tokenizer's chat template. - - The tokenizer's chat_template is configured during loading via - ``configure_tokenizer_template``, so ``apply_chat_template`` is - always available. - - Args: - messages: List of message dicts with 'role' and 'content'. - tokenizer: The tokenizer to use for formatting. - - Returns: - str: Formatted prompt string. - """ - result = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - assert isinstance(result, str) - return result - def _generate_response( self, prompt: str, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, max_new_tokens: int = 512 ) -> str: @@ -292,7 +275,7 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: messages_t1.append({"role": "user", "content": question.turns[0]}) # Generate turn 1 response - prompt_t1 = self._format_prompt(messages_t1, tokenizer) + prompt_t1 = apply_chat_template_with_fallback(messages_t1, tokenizer) response_t1 = self._generate_response( prompt_t1, model, tokenizer, self.eval_config.model_config.max_generation_length ) @@ -305,7 +288,7 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: ] # Generate turn 2 response - prompt_t2 = self._format_prompt(messages_t2, tokenizer) + prompt_t2 = apply_chat_template_with_fallback(messages_t2, tokenizer) response_t2 = self._generate_response( prompt_t2, model, tokenizer, self.eval_config.model_config.max_generation_length ) diff --git a/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py b/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py index 91e090ed..50b81d5d 100644 --- a/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py +++ b/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py @@ -25,12 +25,12 @@ ScoreSchema, ) from tamperbench.whitebox.evals.registry import register_evaluation -from tamperbench.whitebox.evals.utils import format_chat_prompt from tamperbench.whitebox.utils import ( EvalName, MetricName, OptimizationDirection, ) +from tamperbench.whitebox.utils.models.chat_format import format_chat_prompt from tamperbench.whitebox.utils.ops import run_in_isolation S = TypeVar(name="S", bound="StrongRejectEvaluationConfig") diff --git a/src/tamperbench/whitebox/evals/utils.py b/src/tamperbench/whitebox/evals/utils.py index e2ef9576..9a0bb12c 100644 --- a/src/tamperbench/whitebox/evals/utils.py +++ b/src/tamperbench/whitebox/evals/utils.py @@ -22,47 +22,19 @@ from pandera.typing.polars import DataFrame from strong_reject.evaluate import strongreject_finetuned as strong_reject_finetuned from tqdm.contrib.concurrent import thread_map -from transformers import PreTrainedTokenizer from vllm import LLM, SamplingParams from tamperbench.whitebox.evals.output_schema import InferenceSchema, ScoreSchema +from tamperbench.whitebox.utils.models.chat_format import ( + apply_chat_template_with_fallback as apply_chat_template_with_fallback, +) +from tamperbench.whitebox.utils.models.chat_format import ( + format_chat_prompt as format_chat_prompt, +) logger = logging.getLogger(__name__) -def format_chat_prompt( - user_content: str, - tokenizer: PreTrainedTokenizer, - system_prompt: str | None = None, - history: list[dict[str, str]] | None = None, -) -> str: - """Format messages into a chat prompt using the tokenizer's chat_template. - - The tokenizer's chat_template must already be configured (e.g. via - ``configure_tokenizer_template``) before calling this function. - - Args: - user_content: The user's message content. - tokenizer: The tokenizer to use for formatting (must have chat_template set). - system_prompt: Optional system prompt to prepend. - history: Optional conversation history (list of {"role": ..., "content": ...}). - - Returns: - Formatted prompt string ready for model input. - """ - messages: list[dict[str, str]] = [] - - if system_prompt: - messages.append({"role": "system", "content": system_prompt}) - - if history: - messages.extend(history) - - messages.append({"role": "user", "content": user_content}) - - return cast(str, tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)) - - def llm_judge_score( inferences: DataFrame[InferenceSchema], prompt_template: str, diff --git a/src/tamperbench/whitebox/utils/models/chat_format.py b/src/tamperbench/whitebox/utils/models/chat_format.py new file mode 100644 index 00000000..9277174c --- /dev/null +++ b/src/tamperbench/whitebox/utils/models/chat_format.py @@ -0,0 +1,98 @@ +"""Unified chat prompt formatting utilities. + +Provides two public functions: + +- ``format_chat_prompt`` - builds a messages list from convenience args + (user_content, system_prompt, history) and delegates to + ``apply_chat_template_with_fallback``. +- ``apply_chat_template_with_fallback`` - wraps + ``tokenizer.apply_chat_template()`` with a consistent plain-text fallback + for tokenizers that lack a Jinja2 chat template. + +These live in ``whitebox/utils/models`` (not ``evals/utils``) so that both +evaluation code *and* defense code can import them without pulling in heavy +eval-only dependencies (vllm, openai, polars, etc.). +""" + +from typing import cast + +from transformers import PreTrainedTokenizer + + +def format_chat_prompt( + user_content: str, + tokenizer: PreTrainedTokenizer, + system_prompt: str | None = None, + history: list[dict[str, str]] | None = None, +) -> str: + r"""Format messages for chat models with automatic fallback. + + Builds a messages list and applies the tokenizer's chat template. + If the tokenizer has no chat template, falls back to a simple + ``"User: ...\n\nAssistant:"`` text format. + + Args: + user_content: The user's message content. + tokenizer: The tokenizer to use for formatting. + system_prompt: Optional system prompt to prepend. + history: Optional conversation history + (list of ``{"role": ..., "content": ...}``). + + Returns: + Formatted prompt string ready for model input. + """ + messages: list[dict[str, str]] = [] + + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + if history: + messages.extend(history) + + messages.append({"role": "user", "content": user_content}) + + return apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=True) + + +def apply_chat_template_with_fallback( + messages: list[dict[str, str]], + tokenizer: PreTrainedTokenizer, + add_generation_prompt: bool = True, +) -> str: + """Apply ``tokenizer.apply_chat_template()`` with a unified plain-text fallback. + + Use this when you already have a messages list and need formatting with a + consistent fallback for tokenizers that lack a chat template. + + Args: + messages: List of ``{"role": ..., "content": ...}`` dicts. + tokenizer: HuggingFace tokenizer. + add_generation_prompt: Whether to append the assistant header at the + end (same semantics as ``apply_chat_template``). + + Returns: + Formatted prompt string. + """ + if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template: + return cast( + str, + tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=add_generation_prompt, + ), + ) + + # Unified fallback for tokenizers without a chat template + parts: list[str] = [] + for msg in messages: + if msg["role"] == "system": + parts.append(msg["content"]) + else: + role = msg["role"].title() + parts.append(f"{role}: {msg['content']}") + + result = "\n\n".join(parts) + if add_generation_prompt: + result += "\n\nAssistant:" + return result diff --git a/src/tamperbench/whitebox/utils/models/templates.py b/src/tamperbench/whitebox/utils/models/templates.py index a775d33a..1d580d95 100644 --- a/src/tamperbench/whitebox/utils/models/templates.py +++ b/src/tamperbench/whitebox/utils/models/templates.py @@ -7,14 +7,18 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Protocol from tamperbench.whitebox.utils.names import TemplateName if TYPE_CHECKING: from transformers import PreTrainedTokenizerBase - from tamperbench.whitebox.utils.models.config import ModelConfig + +class _HasTemplateName(Protocol): + """Protocol for objects that have a template_name attribute.""" + + template_name: TemplateName @dataclass(frozen=True) @@ -125,7 +129,7 @@ def get_template(name: str | TemplateName) -> TextTemplate: def configure_tokenizer_template( tokenizer: PreTrainedTokenizerBase, - model_config: ModelConfig, + model_config: _HasTemplateName, ) -> None: """Set the tokenizer's chat_template based on the model config. @@ -138,7 +142,7 @@ def configure_tokenizer_template( Args: tokenizer: HuggingFace tokenizer to configure. - model_config: Model configuration containing the template name. + model_config: Object with a ``template_name`` attribute (e.g. ``ModelConfig``). Raises: ValueError: If ``template_name`` is ``NATIVE`` but the tokenizer diff --git a/tests/evals/test_format_chat_prompt.py b/tests/evals/test_format_chat_prompt.py new file mode 100644 index 00000000..5577130c --- /dev/null +++ b/tests/evals/test_format_chat_prompt.py @@ -0,0 +1,143 @@ +"""Tests for chat formatting utilities in whitebox.utils.models.chat_format.""" + +from typing import Any +from unittest.mock import MagicMock + +from tamperbench.whitebox.utils.models.chat_format import ( + apply_chat_template_with_fallback, + format_chat_prompt, +) + + +def _make_tokenizer(chat_template: str | None = None) -> MagicMock: + """Create a mock tokenizer, optionally with a chat template.""" + tokenizer = MagicMock() + tokenizer.chat_template = chat_template + + if chat_template: + + def fake_apply( + messages: list[dict[str, str]], + *, + tokenize: bool = False, # pyright: ignore[reportUnusedParameter] + add_generation_prompt: bool = True, + ) -> str: + parts = [f"[{msg['role']}] {msg['content']}" for msg in messages] + result = " | ".join(parts) + if add_generation_prompt: + result += " | [assistant]" + return result + + tokenizer.apply_chat_template = fake_apply + else: + del tokenizer.apply_chat_template + + return tokenizer + + +# --- format_chat_prompt --- + + +def test_format_chat_prompt_with_template() -> None: + """format_chat_prompt delegates to tokenizer when template exists.""" + tokenizer = _make_tokenizer(chat_template="some_template") + result = format_chat_prompt("Hello", tokenizer) + assert result == "[user] Hello | [assistant]" + + +def test_format_chat_prompt_fallback() -> None: + """format_chat_prompt falls back to plain-text when no template.""" + tokenizer = _make_tokenizer(chat_template=None) + result = format_chat_prompt("Hello", tokenizer) + assert result == "User: Hello\n\nAssistant:" + + +def test_format_chat_prompt_with_system_prompt() -> None: + """format_chat_prompt includes system prompt.""" + tokenizer = _make_tokenizer(chat_template=None) + result = format_chat_prompt("Hello", tokenizer, system_prompt="Be helpful.") + assert result == "Be helpful.\n\nUser: Hello\n\nAssistant:" + + +def test_format_chat_prompt_with_history() -> None: + """format_chat_prompt includes conversation history.""" + tokenizer = _make_tokenizer(chat_template=None) + history: list[dict[str, str]] = [ + {"role": "user", "content": "Hi"}, + {"role": "assistant", "content": "Hello!"}, + ] + result = format_chat_prompt("How are you?", tokenizer, history=history) + assert result == "User: Hi\n\nAssistant: Hello!\n\nUser: How are you?\n\nAssistant:" + + +def test_format_chat_prompt_system_and_history_with_template() -> None: + """format_chat_prompt passes system, history, and user to template.""" + tokenizer = _make_tokenizer(chat_template="some_template") + history: list[dict[str, str]] = [ + {"role": "user", "content": "Hi"}, + {"role": "assistant", "content": "Hello!"}, + ] + result = format_chat_prompt("How are you?", tokenizer, system_prompt="Be helpful.", history=history) + assert result == "[system] Be helpful. | [user] Hi | [assistant] Hello! | [user] How are you? | [assistant]" + + +# --- apply_chat_template_with_fallback --- + + +def test_fallback_uses_tokenizer_template() -> None: + """apply_chat_template_with_fallback delegates to tokenizer when template exists.""" + tokenizer = _make_tokenizer(chat_template="some_template") + messages: list[dict[str, Any]] = [{"role": "user", "content": "Hello"}] + result = apply_chat_template_with_fallback(messages, tokenizer) + assert result == "[user] Hello | [assistant]" + + +def test_fallback_with_generation_prompt() -> None: + """Fallback appends 'Assistant:' when add_generation_prompt=True.""" + tokenizer = _make_tokenizer(chat_template=None) + messages: list[dict[str, Any]] = [{"role": "user", "content": "Hello"}] + result = apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=True) + assert result == "User: Hello\n\nAssistant:" + + +def test_fallback_without_generation_prompt() -> None: + """Fallback omits 'Assistant:' trailer when add_generation_prompt=False.""" + tokenizer = _make_tokenizer(chat_template=None) + messages: list[dict[str, Any]] = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there"}, + ] + result = apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=False) + assert result == "User: Hello\n\nAssistant: Hi there" + + +def test_fallback_tokenizer_template_no_generation_prompt() -> None: + """Tokenizer template called with add_generation_prompt=False.""" + tokenizer = _make_tokenizer(chat_template="some_template") + messages: list[dict[str, Any]] = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there"}, + ] + result = apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=False) + assert result == "[user] Hello | [assistant] Hi there" + + +def test_fallback_system_message() -> None: + """Fallback renders system messages as plain content.""" + tokenizer = _make_tokenizer(chat_template=None) + messages: list[dict[str, Any]] = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello"}, + ] + result = apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=True) + assert result == "You are helpful.\n\nUser: Hello\n\nAssistant:" + + +# --- re-export from evals/utils --- + + +def test_reexport_from_evals_utils() -> None: + """format_chat_prompt is re-exported from evals.utils for backward compatibility.""" + from tamperbench.whitebox.evals.utils import format_chat_prompt as reexported + + assert reexported is format_chat_prompt From 18ff0140e0c05f8263c27949ca12682bff6ca896 Mon Sep 17 00:00:00 2001 From: Tom Tseng Date: Fri, 13 Feb 2026 17:40:03 -0800 Subject: [PATCH 4/6] evals: Chat template simplification --- .../run_mmlu_pro_variants.py | 25 ++------- .../tomtseng/mmlu_pro_chat_260211/run_one.sh | 6 +- .../mmlu_pro_chat_260211/submit_all.sh | 10 ++-- src/tamperbench/whitebox/evals/base.py | 6 +- .../whitebox/evals/ifeval/ifeval.py | 4 +- .../whitebox/evals/mmlu_pro/mmlu_pro.py | 2 +- .../evals/prompt_attack/prompt_attack.py | 9 ++- .../whitebox/evals/safety_gap/safety_gap.py | 9 +-- .../evals/strong_reject/strong_reject.py | 27 +++++---- .../strong_reject_jailbreak_tune.py | 8 ++- src/tamperbench/whitebox/evals/utils.py | 7 --- .../whitebox/utils/models/chat_format.py | 31 ++++------ .../whitebox/utils/models/templates.py | 46 +++++---------- src/tamperbench/whitebox/utils/names.py | 3 - tests/attacks/test_embedding_attack.py | 2 +- tests/attacks/test_full_finetune.py | 2 +- tests/attacks/test_gcg_attack.py | 2 +- tests/attacks/test_jailbreak_finetune.py | 2 +- tests/attacks/test_lora_attack.py | 2 +- tests/attacks/test_prompt_attack.py | 2 +- tests/evals/test_embedding_attack_eval.py | 2 +- tests/evals/test_format_chat_prompt.py | 31 ---------- tests/evals/test_ifeval.py | 4 +- tests/evals/test_mbpp.py | 2 +- tests/evals/test_minerva_math.py | 2 +- tests/evals/test_mmlu_pro.py | 2 +- tests/evals/test_mt_bench.py | 4 +- tests/test_templates.py | 56 +++---------------- 28 files changed, 93 insertions(+), 215 deletions(-) diff --git a/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py b/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py index 45663556..ede8a585 100644 --- a/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py +++ b/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py @@ -25,6 +25,7 @@ import copy import json import shutil +from dataclasses import replace from pathlib import Path from typing import Any, TypedDict @@ -172,28 +173,10 @@ def run_attack_and_evaluate( # Override the template if the variant specifies one (e.g. PLAIN for continuation) variant_model_config = model_config if template_override is not None: - variant_model_config = ModelConfig.from_dict( - { - "template": template_override.value, - "max_generation_length": model_config.max_generation_length, - "inference_batch_size": model_config.inference_batch_size, - **( - {"tokenizer_checkpoint": model_config.tokenizer_checkpoint} - if model_config.tokenizer_checkpoint - else {} - ), - } - ) - elif tokenizer_checkpoint is not None and variant_model_config.tokenizer_checkpoint is None: + variant_model_config = replace(model_config, template_name=template_override) + elif tokenizer_checkpoint is not None and model_config.tokenizer_checkpoint is None: # For chat variants, use the provided tokenizer checkpoint if the model config doesn't have one - variant_model_config = ModelConfig.from_dict( - { - "template": model_config.template_name.value, - "max_generation_length": model_config.max_generation_length, - "inference_batch_size": model_config.inference_batch_size, - "tokenizer_checkpoint": tokenizer_checkpoint, - } - ) + variant_model_config = replace(model_config, tokenizer_checkpoint=tokenizer_checkpoint) eval_config = MMLUProEvaluationConfig( model_checkpoint=attacker.output_checkpoint_path, diff --git a/scripts/user/tomtseng/mmlu_pro_chat_260211/run_one.sh b/scripts/user/tomtseng/mmlu_pro_chat_260211/run_one.sh index 99a0c774..ba03f4fc 100644 --- a/scripts/user/tomtseng/mmlu_pro_chat_260211/run_one.sh +++ b/scripts/user/tomtseng/mmlu_pro_chat_260211/run_one.sh @@ -63,7 +63,7 @@ declare -A MODEL_PATHS=( ["qwen3_8b_base"]="Qwen/Qwen3-8B-Base" ) -declare -A CHAT_TEMPLATE_MODELS=( +declare -A TOKENIZER_CHECKPOINTS=( ["llama3_1b_base"]="meta-llama/Llama-3.2-1B-Instruct" ["llama3_3b_base"]="meta-llama/Llama-3.2-3B-Instruct" ["llama3_8b_base"]="meta-llama/Llama-3.1-8B-Instruct" @@ -101,8 +101,8 @@ CMD=( --source-results-dir "${SOURCE_RESULTS_DIR}" --output-results-dir "${OUTPUT_RESULTS_DIR}" ) -if [[ -n "${CHAT_TEMPLATE_MODELS[$MODEL_ALIAS]+x}" ]]; then - CMD+=(--chat-template-model "${CHAT_TEMPLATE_MODELS[$MODEL_ALIAS]}") +if [[ -n "${TOKENIZER_CHECKPOINTS[$MODEL_ALIAS]+x}" ]]; then + CMD+=(--tokenizer-checkpoint "${TOKENIZER_CHECKPOINTS[$MODEL_ALIAS]}") fi if $SKIP_ATTACK; then CMD+=(--skip-attack) diff --git a/scripts/user/tomtseng/mmlu_pro_chat_260211/submit_all.sh b/scripts/user/tomtseng/mmlu_pro_chat_260211/submit_all.sh index fe0943bc..dcd13334 100755 --- a/scripts/user/tomtseng/mmlu_pro_chat_260211/submit_all.sh +++ b/scripts/user/tomtseng/mmlu_pro_chat_260211/submit_all.sh @@ -52,10 +52,10 @@ declare -A MODEL_PATHS=( ["qwen3_8b_base"]="Qwen/Qwen3-8B-Base" ) -# Base model alias -> instruct model with chat template. +# Base model alias -> instruct model tokenizer to use as fallback. # Used to provide a chat template for base models that don't have one. # Qwen3 base models already have a chat template -declare -A CHAT_TEMPLATE_MODELS=( +declare -A TOKENIZER_CHECKPOINTS=( ["llama3_1b_base"]="meta-llama/Llama-3.2-1B-Instruct" ["llama3_3b_base"]="meta-llama/Llama-3.2-3B-Instruct" ["llama3_8b_base"]="meta-llama/Llama-3.1-8B-Instruct" @@ -111,10 +111,10 @@ ${UV} run ${SCRIPT} \"${model_path}\" \\ --source-results-dir \"${SOURCE_RESULTS_DIR}\" \\ --output-results-dir \"${OUTPUT_RESULTS_DIR}\"" - # Append --chat-template-model for base models - if [[ -n "${CHAT_TEMPLATE_MODELS[$model_alias]+x}" ]]; then + # Append --tokenizer-checkpoint for base models + if [[ -n "${TOKENIZER_CHECKPOINTS[$model_alias]+x}" ]]; then slurm_script+=" \\ - --chat-template-model \"${CHAT_TEMPLATE_MODELS[$model_alias]}\"" + --tokenizer-checkpoint \"${TOKENIZER_CHECKPOINTS[$model_alias]}\"" fi if $SKIP_ATTACK; then diff --git a/src/tamperbench/whitebox/evals/base.py b/src/tamperbench/whitebox/evals/base.py index 64619a83..d646db86 100644 --- a/src/tamperbench/whitebox/evals/base.py +++ b/src/tamperbench/whitebox/evals/base.py @@ -210,8 +210,8 @@ def load_tokenizer(eval_config: WhiteBoxEvaluationConfig) -> PreTrainedTokenizer """Load HuggingFace tokenizer from eval config. Automatically configures the tokenizer's chat_template based on - ``eval_config.model_config.template_name`` so that downstream code - can use ``tokenizer.apply_chat_template()`` directly. + `eval_config.model_config.template_name` so that downstream code + can use `tokenizer.apply_chat_template()` directly. Args: eval_config: Evaluation configuration with model checkpoint and settings. @@ -228,7 +228,7 @@ def load_tokenizer(eval_config: WhiteBoxEvaluationConfig) -> PreTrainedTokenizer trust_remote_code=True, ) tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token - configure_tokenizer_template(tokenizer, eval_config.model_config) + configure_tokenizer_template(tokenizer, eval_config.model_config.template_name) return tokenizer diff --git a/src/tamperbench/whitebox/evals/ifeval/ifeval.py b/src/tamperbench/whitebox/evals/ifeval/ifeval.py index c6619333..08deeade 100644 --- a/src/tamperbench/whitebox/evals/ifeval/ifeval.py +++ b/src/tamperbench/whitebox/evals/ifeval/ifeval.py @@ -212,9 +212,9 @@ def _instantiate_model_and_infer( else eval_config.model_checkpoint ) - # Load tokenizer and configure chat template from model config + # Load tokenizer and apply chat template tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True) - configure_tokenizer_template(tokenizer, eval_config.model_config) + configure_tokenizer_template(tokenizer, eval_config.model_config.template_name) # Format prompts using tokenizer's chat template formatted_prompts = [] diff --git a/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py b/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py index fe4b253a..7c8a29cc 100644 --- a/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py +++ b/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py @@ -264,7 +264,7 @@ def compute_mmlu_inferences( pretrained_model_name_or_path=(eval_config.model_config.tokenizer_checkpoint or eval_config.model_checkpoint), trust_remote_code=True, ) - configure_tokenizer_template(hf_tokenizer, eval_config.model_config) + configure_tokenizer_template(hf_tokenizer, eval_config.model_config.template_name) use_chat = eval_config.model_config.template_name != TemplateName.PLAIN diff --git a/src/tamperbench/whitebox/evals/prompt_attack/prompt_attack.py b/src/tamperbench/whitebox/evals/prompt_attack/prompt_attack.py index 9dee500a..255e43e3 100644 --- a/src/tamperbench/whitebox/evals/prompt_attack/prompt_attack.py +++ b/src/tamperbench/whitebox/evals/prompt_attack/prompt_attack.py @@ -6,6 +6,7 @@ from pandera.typing.polars import DataFrame from typing_extensions import override +from tamperbench.whitebox.evals.base import load_tokenizer from tamperbench.whitebox.evals.output_schema import InferenceSchema from tamperbench.whitebox.evals.prompt_attack.dictionary import templates from tamperbench.whitebox.evals.registry import register_evaluation @@ -14,6 +15,7 @@ StrongRejectEvaluationConfig, instantiate_model_and_infer, ) +from tamperbench.whitebox.utils.models.chat_format import format_chat_prompt from tamperbench.whitebox.utils.names import EvalName from tamperbench.whitebox.utils.ops import run_in_isolation @@ -39,10 +41,11 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: then replaces the prompt column with the original forbidden prompts so that StrongReject scoring receives the correct inputs. """ - prompts: list[str] = self.load_strong_reject_prompts() + tokenizer = load_tokenizer(self.eval_config) + raw_prompts: list[str] = self.load_strong_reject_prompts() prompt_prefix = templates[self.eval_config.jailbreak_prompt] - jailbreak_prompts = [f"{prompt_prefix}{prompt}" for prompt in prompts] + jailbreak_prompts = [format_chat_prompt(f"{prompt_prefix}{prompt}", tokenizer) for prompt in raw_prompts] payload: pl.DataFrame = run_in_isolation( target=instantiate_model_and_infer, @@ -51,5 +54,5 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: ) # Replace jailbreak prompts with original forbidden prompts for StrongReject scoring - payload = payload.with_columns(pl.Series(InferenceSchema.prompt, prompts)) + payload = payload.with_columns(pl.Series(InferenceSchema.prompt, raw_prompts)) return InferenceSchema.validate(payload) diff --git a/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py b/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py index e3f72e9f..ab7567b1 100644 --- a/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py +++ b/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py @@ -17,6 +17,7 @@ from tamperbench.whitebox.evals.registry import register_evaluation from tamperbench.whitebox.evals.utils import ( aggregate_by_field, + format_chat_prompt, generate, strong_reject_score, ) @@ -90,14 +91,10 @@ def _load_prompts_with_domains(self) -> list[dict[str, str]]: @override def compute_inferences(self) -> DataFrame[InferenceSchema]: """Run model on SafetyGap dataset to compute inferences.""" - model, _ = load_vllm_model_and_tokenizer(self.eval_config) + model, tokenizer = load_vllm_model_and_tokenizer(self.eval_config) prompts_with_domains = self._load_prompts_with_domains() - # Prompts are passed raw (no chat template formatting). The SafetyGap paper - # evaluates propensity using raw completion prompts. For chat-tuned models this - # measures base propensity without chat-level safety guardrails, which is the - # intended behavior for this benchmark. - prompts = [p["question"] for p in prompts_with_domains] + prompts = [format_chat_prompt(p["question"], tokenizer) for p in prompts_with_domains] domains = [p["domain"] for p in prompts_with_domains] responses = generate( diff --git a/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py b/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py index 50b81d5d..58f565c8 100644 --- a/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py +++ b/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py @@ -65,14 +65,19 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: Returns: DataFrame[InferenceSchema]: A polars DataFrame with the prompts to LLM and corresponding responses. """ - prompts: list[str] = self.load_strong_reject_prompts() + tokenizer = load_tokenizer(self.eval_config) + raw_prompts: list[str] = self.load_strong_reject_prompts() + formatted_prompts = [format_chat_prompt(p, tokenizer) for p in raw_prompts] payload: pl.DataFrame = run_in_isolation( target=instantiate_model_and_infer, - args=(self.eval_config, prompts), + args=(self.eval_config, formatted_prompts), error_context="StrongReject inference", ) + # Store unformatted prompts for downstream StrongReject scoring, + # which expects raw forbidden prompts, not chat-formatted text. + payload = payload.with_columns(pl.Series(InferenceSchema.prompt, raw_prompts)) return InferenceSchema.validate(payload) @override @@ -122,18 +127,17 @@ def compute_results(self, scores: DataFrame[ScoreSchema]) -> DataFrame[Evaluatio return EvaluationSchema.validate(_metrics_dataframe) def load_strong_reject_prompts(self) -> list[str]: - """Load the StrongReject dataset into an Arrow Dataset, and then return prompts. + """Load the StrongReject dataset and return raw prompts. Returns: - list[str]: A list of formatted prompts from the StrongReject dataset to input to the model. + list[str]: A list of raw prompts from the StrongReject dataset. + Chat formatting is applied later in `compute_inferences`. """ strong_reject_dataset: ArrowDataset = load_strong_reject_datasets.load_strongreject() if self.eval_config.max_samples is not None: strong_reject_dataset = strong_reject_dataset.select(range(self.eval_config.max_samples)) - tokenizer = load_tokenizer(self.eval_config) - - return [format_chat_prompt(row["forbidden_prompt"], tokenizer) for row in strong_reject_dataset] # pyright: ignore[reportCallIssue] + return [row["forbidden_prompt"] for row in strong_reject_dataset] # pyright: ignore[reportCallIssue] @register_evaluation(EvalName.JAILBREAK_BENCH) @@ -145,18 +149,17 @@ class JailbreakBenchEvaluation(StrongRejectEvaluation[S]): @override def load_strong_reject_prompts(self) -> list[str]: - """Load the JailbreakBench JBB-Behaviors dataset and return prompts. + """Load the JailbreakBench JBB-Behaviors dataset and return raw prompts. Returns: - list[str]: A list of prompts from the JailbreakBench dataset to input to the model to obtain inferences. + list[str]: A list of raw prompts from the JailbreakBench dataset. + Chat formatting is applied later in `compute_inferences`. """ jbb_dataset = datasets.load_dataset( # pyright: ignore[reportReturnType] path="JailbreakBench/JBB-Behaviors", name="behaviors", split="harmful" ) - tokenizer = load_tokenizer(self.eval_config) - - return [format_chat_prompt(row["Goal"], tokenizer) for row in jbb_dataset] # pyright: ignore[reportCallIssue] + return [row["Goal"] for row in jbb_dataset] # pyright: ignore[reportCallIssue] @override def compute_results(self, scores: DataFrame[ScoreSchema]) -> DataFrame[EvaluationSchema]: diff --git a/src/tamperbench/whitebox/evals/strong_reject/strong_reject_jailbreak_tune.py b/src/tamperbench/whitebox/evals/strong_reject/strong_reject_jailbreak_tune.py index 2873ae0c..1368d0b5 100644 --- a/src/tamperbench/whitebox/evals/strong_reject/strong_reject_jailbreak_tune.py +++ b/src/tamperbench/whitebox/evals/strong_reject/strong_reject_jailbreak_tune.py @@ -33,10 +33,11 @@ class SRJailbreakTuneEvaluation(StrongRejectEvaluation[SRJailbreakTuneEvaluation @override def load_strong_reject_prompts(self) -> list[str]: - """Load the StrongReject dataset into an Arrow Dataset, and then return prompts. + """Load the StrongReject dataset and apply jailbreak injections. Returns: - list[str]: A list of prompts from the StrongReject dataset to input to the model to obtain inferences. + list[str]: A list of jailbreak-wrapped prompts from the StrongReject dataset. + Chat formatting is applied later in `compute_inferences`. """ strong_reject_prompts = super().load_strong_reject_prompts() @@ -54,10 +55,11 @@ class JBJailbreakTuneEvaluation(JailbreakBenchEvaluation[SRJailbreakTuneEvaluati @override def load_strong_reject_prompts(self) -> list[str]: - """Load the JailbreakBench dataset, apply jailbreak injections, and return prompts. + """Load the JailbreakBench dataset and apply jailbreak injections. Returns: list[str]: A list of jailbreak-wrapped prompts from the JailbreakBench dataset. + Chat formatting is applied later in ``compute_inferences``. """ strong_reject_prompts = super().load_strong_reject_prompts() diff --git a/src/tamperbench/whitebox/evals/utils.py b/src/tamperbench/whitebox/evals/utils.py index 9a0bb12c..08ab2e28 100644 --- a/src/tamperbench/whitebox/evals/utils.py +++ b/src/tamperbench/whitebox/evals/utils.py @@ -1,7 +1,6 @@ """Shared utility functions for evaluations. This module provides reusable functions for common evaluation patterns: -- `format_chat_prompt`: Format messages for chat models with fallback - `generate`: High-throughput generation with vLLM (sends all prompts at once) - `generate_single`: Sequential generation for cases requiring dependencies between calls - `llm_judge_score`: Score responses using an external LLM judge @@ -25,12 +24,6 @@ from vllm import LLM, SamplingParams from tamperbench.whitebox.evals.output_schema import InferenceSchema, ScoreSchema -from tamperbench.whitebox.utils.models.chat_format import ( - apply_chat_template_with_fallback as apply_chat_template_with_fallback, -) -from tamperbench.whitebox.utils.models.chat_format import ( - format_chat_prompt as format_chat_prompt, -) logger = logging.getLogger(__name__) diff --git a/src/tamperbench/whitebox/utils/models/chat_format.py b/src/tamperbench/whitebox/utils/models/chat_format.py index 9277174c..e07a62ec 100644 --- a/src/tamperbench/whitebox/utils/models/chat_format.py +++ b/src/tamperbench/whitebox/utils/models/chat_format.py @@ -1,18 +1,4 @@ -"""Unified chat prompt formatting utilities. - -Provides two public functions: - -- ``format_chat_prompt`` - builds a messages list from convenience args - (user_content, system_prompt, history) and delegates to - ``apply_chat_template_with_fallback``. -- ``apply_chat_template_with_fallback`` - wraps - ``tokenizer.apply_chat_template()`` with a consistent plain-text fallback - for tokenizers that lack a Jinja2 chat template. - -These live in ``whitebox/utils/models`` (not ``evals/utils``) so that both -evaluation code *and* defense code can import them without pulling in heavy -eval-only dependencies (vllm, openai, polars, etc.). -""" +"""Unified chat prompt formatting utilities.""" from typing import cast @@ -29,14 +15,14 @@ def format_chat_prompt( Builds a messages list and applies the tokenizer's chat template. If the tokenizer has no chat template, falls back to a simple - ``"User: ...\n\nAssistant:"`` text format. + `"User: ...\n\nAssistant:"` text format. Args: user_content: The user's message content. tokenizer: The tokenizer to use for formatting. system_prompt: Optional system prompt to prepend. history: Optional conversation history - (list of ``{"role": ..., "content": ...}``). + (list of `{"role": ..., "content": ...}`). Returns: Formatted prompt string ready for model input. @@ -59,16 +45,16 @@ def apply_chat_template_with_fallback( tokenizer: PreTrainedTokenizer, add_generation_prompt: bool = True, ) -> str: - """Apply ``tokenizer.apply_chat_template()`` with a unified plain-text fallback. + """Apply `tokenizer.apply_chat_template()` with a unified plain-text fallback. Use this when you already have a messages list and need formatting with a consistent fallback for tokenizers that lack a chat template. Args: - messages: List of ``{"role": ..., "content": ...}`` dicts. + messages: List of `{"role": ..., "content": ...}` dicts. tokenizer: HuggingFace tokenizer. add_generation_prompt: Whether to append the assistant header at the - end (same semantics as ``apply_chat_template``). + end (same semantics as `apply_chat_template`). Returns: Formatted prompt string. @@ -83,7 +69,10 @@ def apply_chat_template_with_fallback( ), ) - # Unified fallback for tokenizers without a chat template + # Fallback for tokenizers without a chat template. On the eval path, + # configure_tokenizer_template() always sets a template, so this is only + # reachable from code that bypasses that setup (e.g. defense code). + parts: list[str] = [] for msg in messages: if msg["role"] == "system": diff --git a/src/tamperbench/whitebox/utils/models/templates.py b/src/tamperbench/whitebox/utils/models/templates.py index 1d580d95..203ef3b3 100644 --- a/src/tamperbench/whitebox/utils/models/templates.py +++ b/src/tamperbench/whitebox/utils/models/templates.py @@ -7,7 +7,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Protocol +from typing import TYPE_CHECKING from tamperbench.whitebox.utils.names import TemplateName @@ -15,12 +15,6 @@ from transformers import PreTrainedTokenizerBase -class _HasTemplateName(Protocol): - """Protocol for objects that have a template_name attribute.""" - - template_name: TemplateName - - @dataclass(frozen=True) class TextTemplate: """Chat formatting template with user/assistant prefixes and turn delimiter.""" @@ -45,10 +39,11 @@ def to_jinja2(self) -> str: ("end_turn", self.end_turn), ]: for marker in ("{%", "{{", "}}", "%}"): - assert marker not in value, ( - f"TextTemplate.{field_name} contains Jinja2 syntax {marker!r} " - f"which would break the generated template: {value!r}" - ) + if marker in value: + raise ValueError( + f"TextTemplate.{field_name} contains Jinja2 syntax {marker!r} " + f"which would break the generated template: {value!r}" + ) return ( "{% for message in messages %}" "{% if message['role'] == 'system' %}" @@ -69,11 +64,6 @@ def to_jinja2(self) -> str: TEMPLATE_REGISTRY: dict[TemplateName, TextTemplate] = { - TemplateName.LLAMA3: TextTemplate( - user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", - assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>", - ), TemplateName.INSTRUCTION_RESPONSE: TextTemplate( user_prefix="INSTRUCTION: ", assistant_prefix="RESPONSE: ", @@ -89,16 +79,6 @@ def to_jinja2(self) -> str: assistant_prefix="", end_turn="", ), - TemplateName.GPT_CHAT: TextTemplate( - user_prefix="<|user|> ", - assistant_prefix="<|assistant|> ", - end_turn="\n", - ), - TemplateName.QWEN: TextTemplate( - user_prefix="<|im_start|>user\n", - assistant_prefix="<|im_start|>assistant\n", - end_turn="<|im_end|>", - ), } @@ -115,9 +95,9 @@ def get_template(name: str | TemplateName) -> TextTemplate: KeyError: If the template name is not found in the registry. Example: - >>> template = get_template(TemplateName.LLAMA3) + >>> template = get_template(TemplateName.GENERIC_CHAT) >>> template.user_prefix - '<|start_header_id|>user<|end_header_id|>\n\n' + 'User: ' """ key = TemplateName(name) if not isinstance(name, TemplateName) else name @@ -129,9 +109,9 @@ def get_template(name: str | TemplateName) -> TextTemplate: def configure_tokenizer_template( tokenizer: PreTrainedTokenizerBase, - model_config: _HasTemplateName, + template_name: TemplateName, ) -> None: - """Set the tokenizer's chat_template based on the model config. + """Set the tokenizer's chat_template based on the template name. For ``TemplateName.NATIVE``, the tokenizer's own template is preserved. For all other template names, the tokenizer's chat_template is replaced @@ -142,15 +122,15 @@ def configure_tokenizer_template( Args: tokenizer: HuggingFace tokenizer to configure. - model_config: Object with a ``template_name`` attribute (e.g. ``ModelConfig``). + template_name: Which chat template to apply. Raises: ValueError: If ``template_name`` is ``NATIVE`` but the tokenizer has no built-in chat_template. """ - if model_config.template_name == TemplateName.NATIVE: + if template_name == TemplateName.NATIVE: if not getattr(tokenizer, "chat_template", None): raise ValueError("template='native' was specified but the tokenizer has no chat_template") return - template = get_template(model_config.template_name) + template = get_template(template_name) tokenizer.chat_template = template.to_jinja2() diff --git a/src/tamperbench/whitebox/utils/names.py b/src/tamperbench/whitebox/utils/names.py index 1163651f..bde8abc7 100644 --- a/src/tamperbench/whitebox/utils/names.py +++ b/src/tamperbench/whitebox/utils/names.py @@ -224,10 +224,7 @@ class ConfigKeys(StrEnum): class TemplateName(StrEnum): """Names of text templates for chat-style formatting.""" - LLAMA3 = "llama3" INSTRUCTION_RESPONSE = "instruction_response" GENERIC_CHAT = "generic_chat" PLAIN = "plain" - GPT_CHAT = "gpt_chat" - QWEN = "qwen" NATIVE = "native" diff --git a/tests/attacks/test_embedding_attack.py b/tests/attacks/test_embedding_attack.py index 463eb5e2..13c84380 100644 --- a/tests/attacks/test_embedding_attack.py +++ b/tests/attacks/test_embedding_attack.py @@ -22,7 +22,7 @@ def test_embedding_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", diff --git a/tests/attacks/test_full_finetune.py b/tests/attacks/test_full_finetune.py index b0307640..81e3375e 100644 --- a/tests/attacks/test_full_finetune.py +++ b/tests/attacks/test_full_finetune.py @@ -24,7 +24,7 @@ def test_full_finetune_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", diff --git a/tests/attacks/test_gcg_attack.py b/tests/attacks/test_gcg_attack.py index 05a03e0d..60120690 100644 --- a/tests/attacks/test_gcg_attack.py +++ b/tests/attacks/test_gcg_attack.py @@ -27,7 +27,7 @@ def test_gcg_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", diff --git a/tests/attacks/test_jailbreak_finetune.py b/tests/attacks/test_jailbreak_finetune.py index 17d25ce4..a0cb4c4c 100644 --- a/tests/attacks/test_jailbreak_finetune.py +++ b/tests/attacks/test_jailbreak_finetune.py @@ -25,7 +25,7 @@ def test_jailbreak_finetune_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", diff --git a/tests/attacks/test_lora_attack.py b/tests/attacks/test_lora_attack.py index 98c67965..77322e67 100644 --- a/tests/attacks/test_lora_attack.py +++ b/tests/attacks/test_lora_attack.py @@ -24,7 +24,7 @@ def test_lora_finetune_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", diff --git a/tests/attacks/test_prompt_attack.py b/tests/attacks/test_prompt_attack.py index cbd9c886..1cc271d0 100644 --- a/tests/attacks/test_prompt_attack.py +++ b/tests/attacks/test_prompt_attack.py @@ -28,7 +28,7 @@ def test_prompt_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", diff --git a/tests/evals/test_embedding_attack_eval.py b/tests/evals/test_embedding_attack_eval.py index a6cddcd1..c96ce832 100644 --- a/tests/evals/test_embedding_attack_eval.py +++ b/tests/evals/test_embedding_attack_eval.py @@ -25,7 +25,7 @@ def test_embedding_attack_evaluation() -> None: model_checkpoint="lapisrocks/Llama-3-8B-Instruct-TAR-Refusal", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", diff --git a/tests/evals/test_format_chat_prompt.py b/tests/evals/test_format_chat_prompt.py index 5577130c..196dd8b4 100644 --- a/tests/evals/test_format_chat_prompt.py +++ b/tests/evals/test_format_chat_prompt.py @@ -38,27 +38,6 @@ def fake_apply( # --- format_chat_prompt --- -def test_format_chat_prompt_with_template() -> None: - """format_chat_prompt delegates to tokenizer when template exists.""" - tokenizer = _make_tokenizer(chat_template="some_template") - result = format_chat_prompt("Hello", tokenizer) - assert result == "[user] Hello | [assistant]" - - -def test_format_chat_prompt_fallback() -> None: - """format_chat_prompt falls back to plain-text when no template.""" - tokenizer = _make_tokenizer(chat_template=None) - result = format_chat_prompt("Hello", tokenizer) - assert result == "User: Hello\n\nAssistant:" - - -def test_format_chat_prompt_with_system_prompt() -> None: - """format_chat_prompt includes system prompt.""" - tokenizer = _make_tokenizer(chat_template=None) - result = format_chat_prompt("Hello", tokenizer, system_prompt="Be helpful.") - assert result == "Be helpful.\n\nUser: Hello\n\nAssistant:" - - def test_format_chat_prompt_with_history() -> None: """format_chat_prompt includes conversation history.""" tokenizer = _make_tokenizer(chat_template=None) @@ -131,13 +110,3 @@ def test_fallback_system_message() -> None: ] result = apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=True) assert result == "You are helpful.\n\nUser: Hello\n\nAssistant:" - - -# --- re-export from evals/utils --- - - -def test_reexport_from_evals_utils() -> None: - """format_chat_prompt is re-exported from evals.utils for backward compatibility.""" - from tamperbench.whitebox.evals.utils import format_chat_prompt as reexported - - assert reexported is format_chat_prompt diff --git a/tests/evals/test_ifeval.py b/tests/evals/test_ifeval.py index 261f41ef..a465d0f5 100644 --- a/tests/evals/test_ifeval.py +++ b/tests/evals/test_ifeval.py @@ -25,7 +25,7 @@ def test_ifeval_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", @@ -38,7 +38,7 @@ def test_ifeval_evaluation() -> None: # model_checkpoint="Qwen/Qwen3-4B", # out_dir=tmpdirname, # model_config=ModelConfig( - # template_name=TemplateName.QWEN, + # template_name=TemplateName.NATIVE, # user_prefix="<|im_start|>user\n", # assistant_prefix="<|im_start|>assistant\n", # end_turn="<|im_end|>", diff --git a/tests/evals/test_mbpp.py b/tests/evals/test_mbpp.py index cb9a1216..03f941aa 100644 --- a/tests/evals/test_mbpp.py +++ b/tests/evals/test_mbpp.py @@ -22,7 +22,7 @@ def test_mbpp_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", diff --git a/tests/evals/test_minerva_math.py b/tests/evals/test_minerva_math.py index db772675..cfc68ca3 100644 --- a/tests/evals/test_minerva_math.py +++ b/tests/evals/test_minerva_math.py @@ -24,7 +24,7 @@ def test_minerva_math_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", diff --git a/tests/evals/test_mmlu_pro.py b/tests/evals/test_mmlu_pro.py index 0e22d35f..846831c4 100644 --- a/tests/evals/test_mmlu_pro.py +++ b/tests/evals/test_mmlu_pro.py @@ -139,7 +139,7 @@ def test_mmlu_pro_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( - template_name=TemplateName.LLAMA3, + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", end_turn="<|eot_id|>", diff --git a/tests/evals/test_mt_bench.py b/tests/evals/test_mt_bench.py index ede0a8c6..de7c5d68 100644 --- a/tests/evals/test_mt_bench.py +++ b/tests/evals/test_mt_bench.py @@ -31,11 +31,11 @@ class _ModelSpec(TypedDict): }, "small": { "checkpoint": "Qwen/Qwen2.5-0.5B-Instruct", # ~0.5B, instruction-tuned - "template": TemplateName.QWEN, + "template": TemplateName.NATIVE, }, "medium": { "checkpoint": "Qwen/Qwen2.5-1.5B-Instruct", # ~1.5B, instruction-tuned - "template": TemplateName.QWEN, + "template": TemplateName.NATIVE, }, } diff --git a/tests/test_templates.py b/tests/test_templates.py index 8a0291c6..926c5f9e 100644 --- a/tests/test_templates.py +++ b/tests/test_templates.py @@ -3,6 +3,7 @@ from unittest.mock import MagicMock import pytest +from jinja2 import Template from tamperbench.whitebox.utils.models.config import ModelConfig from tamperbench.whitebox.utils.models.templates import ( @@ -22,8 +23,6 @@ def test_plain_template_produces_raw_content(self) -> None: jinja2 = template.to_jinja2() # Verify it's a valid Jinja2 template by importing jinja2 - from jinja2 import Template - t = Template(jinja2) result = t.render( messages=[{"role": "user", "content": "hello"}], @@ -36,8 +35,6 @@ def test_generic_chat_template(self) -> None: template = get_template(TemplateName.GENERIC_CHAT) jinja2 = template.to_jinja2() - from jinja2 import Template - t = Template(jinja2) result = t.render( messages=[{"role": "user", "content": "hello"}], @@ -50,8 +47,6 @@ def test_generic_chat_with_system(self) -> None: template = get_template(TemplateName.GENERIC_CHAT) jinja2 = template.to_jinja2() - from jinja2 import Template - t = Template(jinja2) result = t.render( messages=[ @@ -67,8 +62,6 @@ def test_generic_chat_multi_turn(self) -> None: template = get_template(TemplateName.GENERIC_CHAT) jinja2 = template.to_jinja2() - from jinja2 import Template - t = Template(jinja2) result = t.render( messages=[ @@ -85,8 +78,6 @@ def test_no_generation_prompt(self) -> None: template = get_template(TemplateName.GENERIC_CHAT) jinja2 = template.to_jinja2() - from jinja2 import Template - t = Template(jinja2) result = t.render( messages=[ @@ -102,8 +93,6 @@ def test_instruction_response_template(self) -> None: template = get_template(TemplateName.INSTRUCTION_RESPONSE) jinja2 = template.to_jinja2() - from jinja2 import Template - t = Template(jinja2) result = t.render( messages=[{"role": "user", "content": "Summarize this."}], @@ -134,16 +123,7 @@ def test_named_template_sets_chat_template(self) -> None: tokenizer = MagicMock() tokenizer.chat_template = None - model_config = ModelConfig( - template_name=TemplateName.GENERIC_CHAT, - user_prefix="User: ", - assistant_prefix="Assistant: ", - end_turn="\n", - max_generation_length=512, - inference_batch_size=16, - ) - - configure_tokenizer_template(tokenizer, model_config) + configure_tokenizer_template(tokenizer, TemplateName.GENERIC_CHAT) expected = get_template(TemplateName.GENERIC_CHAT).to_jinja2() assert tokenizer.chat_template == expected @@ -154,16 +134,7 @@ def test_native_preserves_existing_template(self) -> None: original_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}" tokenizer.chat_template = original_template - model_config = ModelConfig( - template_name=TemplateName.NATIVE, - user_prefix="", - assistant_prefix="", - end_turn="", - max_generation_length=512, - inference_batch_size=16, - ) - - configure_tokenizer_template(tokenizer, model_config) + configure_tokenizer_template(tokenizer, TemplateName.NATIVE) assert tokenizer.chat_template == original_template @@ -172,17 +143,8 @@ def test_native_raises_without_template(self) -> None: tokenizer = MagicMock() tokenizer.chat_template = None - model_config = ModelConfig( - template_name=TemplateName.NATIVE, - user_prefix="", - assistant_prefix="", - end_turn="", - max_generation_length=512, - inference_batch_size=16, - ) - with pytest.raises(ValueError, match="native"): - configure_tokenizer_template(tokenizer, model_config) + configure_tokenizer_template(tokenizer, TemplateName.NATIVE) class TestModelConfigFromDict: @@ -192,15 +154,15 @@ def test_from_dict_with_template_key(self) -> None: """from_dict should hydrate template_name and prefixes from template key.""" config = ModelConfig.from_dict( { - "template": "llama3", + "template": "generic_chat", "max_generation_length": 1024, "inference_batch_size": 16, } ) - assert config.template_name == TemplateName.LLAMA3 - assert config.user_prefix == "<|start_header_id|>user<|end_header_id|>\n\n" - assert config.assistant_prefix == "<|start_header_id|>assistant<|end_header_id|>\n\n" - assert config.end_turn == "<|eot_id|>" + assert config.template_name == TemplateName.GENERIC_CHAT + assert config.user_prefix == "User: " + assert config.assistant_prefix == "Assistant: " + assert config.end_turn == "\n" def test_from_dict_native_template(self) -> None: """from_dict with template='native' should set empty prefixes.""" From 1b4f16933d5e0405ad45045148a64976115f0db3 Mon Sep 17 00:00:00 2001 From: Tom Tseng Date: Fri, 13 Feb 2026 20:59:28 -0800 Subject: [PATCH 5/6] tests: Fix missing import --- src/tamperbench/whitebox/evals/policy_eval/policy_eval.py | 2 +- src/tamperbench/whitebox/evals/safety_gap/safety_gap.py | 2 +- src/tamperbench/whitebox/evals/xstest/xstest.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tamperbench/whitebox/evals/policy_eval/policy_eval.py b/src/tamperbench/whitebox/evals/policy_eval/policy_eval.py index a8b5d289..693aae1f 100644 --- a/src/tamperbench/whitebox/evals/policy_eval/policy_eval.py +++ b/src/tamperbench/whitebox/evals/policy_eval/policy_eval.py @@ -48,7 +48,6 @@ from tamperbench.whitebox.evals.reference import ReferenceScore, ReferenceScores from tamperbench.whitebox.evals.registry import register_evaluation from tamperbench.whitebox.evals.utils import ( - format_chat_prompt, generate, llm_judge_score, ) @@ -57,6 +56,7 @@ MetricName, OptimizationDirection, ) +from tamperbench.whitebox.utils.models.chat_format import format_chat_prompt logger = logging.getLogger(__name__) diff --git a/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py b/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py index ab7567b1..baedbe99 100644 --- a/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py +++ b/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py @@ -17,7 +17,6 @@ from tamperbench.whitebox.evals.registry import register_evaluation from tamperbench.whitebox.evals.utils import ( aggregate_by_field, - format_chat_prompt, generate, strong_reject_score, ) @@ -27,6 +26,7 @@ OptimizationDirection, ) from tamperbench.whitebox.utils.datasets import get_dataset +from tamperbench.whitebox.utils.models.chat_format import format_chat_prompt class SafetyGapDomain(str, Enum): diff --git a/src/tamperbench/whitebox/evals/xstest/xstest.py b/src/tamperbench/whitebox/evals/xstest/xstest.py index 3cab7f6e..f35268cc 100644 --- a/src/tamperbench/whitebox/evals/xstest/xstest.py +++ b/src/tamperbench/whitebox/evals/xstest/xstest.py @@ -31,7 +31,6 @@ from tamperbench.whitebox.evals.reference import ReferenceScore, ReferenceScores from tamperbench.whitebox.evals.registry import register_evaluation from tamperbench.whitebox.evals.utils import ( - format_chat_prompt, generate, llm_judge_score, ) @@ -41,6 +40,7 @@ OptimizationDirection, ) from tamperbench.whitebox.utils.datasets import get_dataset +from tamperbench.whitebox.utils.models.chat_format import format_chat_prompt logger = logging.getLogger(__name__) From f84bb8196cf612faed321977177f9cc33258edf4 Mon Sep 17 00:00:00 2001 From: Tom Tseng Date: Fri, 13 Feb 2026 21:16:07 -0800 Subject: [PATCH 6/6] configs: Use native chat template to replace deprecated model-specific templates --- .../whitebox/attacks_llama/backdoor_finetune/mini_sweep.yaml | 2 +- .../attacks_llama/backdoor_finetune/multi_objective_sweep.yaml | 2 +- .../attacks_llama/backdoor_finetune/single_objective_sweep.yaml | 2 +- .../benign_full_parameter_finetune/mini_sweep.yaml | 2 +- .../benign_full_parameter_finetune/multi_objective_sweep.yaml | 2 +- .../benign_full_parameter_finetune/single_objective_sweep.yaml | 2 +- .../whitebox/attacks_llama/benign_lora_finetune/mini_sweep.yaml | 2 +- .../benign_lora_finetune/multi_objective_sweep.yaml | 2 +- .../benign_lora_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama/competing_objectives_finetune/mini_sweep.yaml | 2 +- .../competing_objectives_finetune/multi_objective_sweep.yaml | 2 +- .../competing_objectives_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama/full_parameter_finetune/mini_sweep.yaml | 2 +- .../full_parameter_finetune/multi_objective_sweep.yaml | 2 +- .../full_parameter_finetune/single_objective_sweep.yaml | 2 +- configs/whitebox/attacks_llama/lora_finetune/mini_sweep.yaml | 2 +- .../attacks_llama/lora_finetune/multi_objective_sweep.yaml | 2 +- .../attacks_llama/lora_finetune/single_objective_sweep.yaml | 2 +- .../whitebox/attacks_llama/lora_finetune_optim/mini_sweep.yaml | 2 +- .../lora_finetune_optim/multi_objective_sweep.yaml | 2 +- .../lora_finetune_optim/single_objective_sweep.yaml | 2 +- .../attacks_llama/lora_full_dataset_finetune/mini_sweep.yaml | 2 +- .../lora_full_dataset_finetune/multi_objective_sweep.yaml | 2 +- .../lora_full_dataset_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama/multilingual_finetune/mini_sweep.yaml | 2 +- .../multilingual_finetune/multi_objective_sweep.yaml | 2 +- .../multilingual_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama/no_weight_modification/mini_sweep.yaml | 2 +- .../attacks_llama/style_modulation_finetune/mini_sweep.yaml | 2 +- .../style_modulation_finetune/multi_objective_sweep.yaml | 2 +- .../style_modulation_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama3_8_custom/backdoor_finetune/mini_sweep.yaml | 2 +- .../backdoor_finetune/multi_objective_sweep.yaml | 2 +- .../backdoor_finetune/single_objective_sweep.yaml | 2 +- .../benign_full_parameter_finetune/mini_sweep.yaml | 2 +- .../benign_full_parameter_finetune/multi_objective_sweep.yaml | 2 +- .../benign_full_parameter_finetune/single_objective_sweep.yaml | 2 +- .../benign_lora_finetune/mini_sweep.yaml | 2 +- .../benign_lora_finetune/multi_objective_sweep.yaml | 2 +- .../benign_lora_finetune/single_objective_sweep.yaml | 2 +- .../competing_objectives_finetune/mini_sweep.yaml | 2 +- .../competing_objectives_finetune/multi_objective_sweep.yaml | 2 +- .../competing_objectives_finetune/single_objective_sweep.yaml | 2 +- .../full_parameter_finetune/mini_sweep.yaml | 2 +- .../full_parameter_finetune/multi_objective_sweep.yaml | 2 +- .../full_parameter_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama3_8_custom/lora_finetune/mini_sweep.yaml | 2 +- .../lora_finetune/multi_objective_sweep.yaml | 2 +- .../lora_finetune/single_objective_sweep.yaml | 2 +- .../multilingual_finetune/mini_sweep.yaml | 2 +- .../multilingual_finetune/multi_objective_sweep.yaml | 2 +- .../multilingual_finetune/single_objective_sweep.yaml | 2 +- .../no_weight_modification/mini_sweep.yaml | 2 +- .../style_modulation_finetune/mini_sweep.yaml | 2 +- .../style_modulation_finetune/multi_objective_sweep.yaml | 2 +- .../style_modulation_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama_optim/backdoor_finetune/mini_sweep.yaml | 2 +- .../backdoor_finetune/multi_objective_sweep.yaml | 2 +- .../backdoor_finetune/single_objective_sweep.yaml | 2 +- .../benign_full_parameter_finetune/mini_sweep.yaml | 2 +- .../benign_full_parameter_finetune/multi_objective_sweep.yaml | 2 +- .../benign_full_parameter_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama_optim/benign_lora_finetune/mini_sweep.yaml | 2 +- .../benign_lora_finetune/multi_objective_sweep.yaml | 2 +- .../benign_lora_finetune/single_objective_sweep.yaml | 2 +- .../competing_objectives_finetune/mini_sweep.yaml | 2 +- .../competing_objectives_finetune/multi_objective_sweep.yaml | 2 +- .../competing_objectives_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama_optim/full_parameter_finetune/mini_sweep.yaml | 2 +- .../full_parameter_finetune/multi_objective_sweep.yaml | 2 +- .../full_parameter_finetune/single_objective_sweep.yaml | 2 +- .../whitebox/attacks_llama_optim/lora_finetune/mini_sweep.yaml | 2 +- .../lora_finetune/multi_objective_sweep.yaml | 2 +- .../lora_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama_optim/multilingual_finetune/mini_sweep.yaml | 2 +- .../multilingual_finetune/multi_objective_sweep.yaml | 2 +- .../multilingual_finetune/single_objective_sweep.yaml | 2 +- .../attacks_llama_optim/no_weight_modification/mini_sweep.yaml | 2 +- .../style_modulation_finetune/mini_sweep.yaml | 2 +- .../style_modulation_finetune/multi_objective_sweep.yaml | 2 +- .../style_modulation_finetune/single_objective_sweep.yaml | 2 +- configs/whitebox/attacks_qwen/backdoor_finetune/mini_sweep.yaml | 2 +- .../attacks_qwen/backdoor_finetune/multi_objective_sweep.yaml | 2 +- .../attacks_qwen/backdoor_finetune/single_objective_sweep.yaml | 2 +- .../attacks_qwen/benign_full_parameter_finetune/mini_sweep.yaml | 2 +- .../benign_full_parameter_finetune/multi_objective_sweep.yaml | 2 +- .../benign_full_parameter_finetune/single_objective_sweep.yaml | 2 +- .../whitebox/attacks_qwen/benign_lora_finetune/mini_sweep.yaml | 2 +- .../benign_lora_finetune/multi_objective_sweep.yaml | 2 +- .../benign_lora_finetune/single_objective_sweep.yaml | 2 +- .../attacks_qwen/competing_objectives_finetune/mini_sweep.yaml | 2 +- .../competing_objectives_finetune/multi_objective_sweep.yaml | 2 +- .../competing_objectives_finetune/single_objective_sweep.yaml | 2 +- .../attacks_qwen/full_parameter_finetune/mini_sweep.yaml | 2 +- .../full_parameter_finetune/multi_objective_sweep.yaml | 2 +- .../full_parameter_finetune/single_objective_sweep.yaml | 2 +- configs/whitebox/attacks_qwen/lora_finetune/mini_sweep.yaml | 2 +- .../attacks_qwen/lora_finetune/multi_objective_sweep.yaml | 2 +- .../attacks_qwen/lora_finetune/single_objective_sweep.yaml | 2 +- .../whitebox/attacks_qwen/multilingual_finetune/mini_sweep.yaml | 2 +- .../multilingual_finetune/multi_objective_sweep.yaml | 2 +- .../multilingual_finetune/single_objective_sweep.yaml | 2 +- .../attacks_qwen/style_modulation_finetune/mini_sweep.yaml | 2 +- .../style_modulation_finetune/multi_objective_sweep.yaml | 2 +- .../style_modulation_finetune/single_objective_sweep.yaml | 2 +- .../whitebox/attacks_qwen_32/backdoor_finetune/mini_sweep.yaml | 2 +- .../backdoor_finetune/multi_objective_sweep.yaml | 2 +- .../backdoor_finetune/single_objective_sweep.yaml | 2 +- .../benign_full_parameter_finetune/mini_sweep.yaml | 2 +- .../benign_full_parameter_finetune/multi_objective_sweep.yaml | 2 +- .../benign_full_parameter_finetune/single_objective_sweep.yaml | 2 +- .../attacks_qwen_32/benign_lora_finetune/mini_sweep.yaml | 2 +- .../benign_lora_finetune/multi_objective_sweep.yaml | 2 +- .../benign_lora_finetune/single_objective_sweep.yaml | 2 +- .../competing_objectives_finetune/mini_sweep.yaml | 2 +- .../competing_objectives_finetune/multi_objective_sweep.yaml | 2 +- .../competing_objectives_finetune/single_objective_sweep.yaml | 2 +- .../attacks_qwen_32/full_parameter_finetune/mini_sweep.yaml | 2 +- .../full_parameter_finetune/multi_objective_sweep.yaml | 2 +- .../full_parameter_finetune/single_objective_sweep.yaml | 2 +- configs/whitebox/attacks_qwen_32/lora_finetune/mini_sweep.yaml | 2 +- .../attacks_qwen_32/lora_finetune/multi_objective_sweep.yaml | 2 +- .../attacks_qwen_32/lora_finetune/single_objective_sweep.yaml | 2 +- .../attacks_qwen_32/multilingual_finetune/mini_sweep.yaml | 2 +- .../multilingual_finetune/multi_objective_sweep.yaml | 2 +- .../multilingual_finetune/single_objective_sweep.yaml | 2 +- .../attacks_qwen_32/style_modulation_finetune/mini_sweep.yaml | 2 +- .../style_modulation_finetune/multi_objective_sweep.yaml | 2 +- .../style_modulation_finetune/single_objective_sweep.yaml | 2 +- 129 files changed, 129 insertions(+), 129 deletions(-) diff --git a/configs/whitebox/attacks_llama/backdoor_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/backdoor_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama/backdoor_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/backdoor_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/backdoor_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/backdoor_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/backdoor_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/backdoor_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/backdoor_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/backdoor_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/backdoor_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/backdoor_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/mini_sweep.yaml index acf2c6ad..6054cc90 100644 --- a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/multi_objective_sweep.yaml index 05acab19..a4e44b1b 100644 --- a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/single_objective_sweep.yaml index d688153d..78ea8ced 100644 --- a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/single_objective_sweep.yaml @@ -15,4 +15,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/benign_lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/benign_lora_finetune/mini_sweep.yaml index 4c8ee44e..65e67307 100644 --- a/configs/whitebox/attacks_llama/benign_lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/benign_lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/benign_lora_finetune/multi_objective_sweep.yaml index 28c0dc8f..5e6e4fa1 100644 --- a/configs/whitebox/attacks_llama/benign_lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/benign_lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/benign_lora_finetune/single_objective_sweep.yaml index e48b37c5..8c34fb78 100644 --- a/configs/whitebox/attacks_llama/benign_lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_lora_finetune/single_objective_sweep.yaml @@ -15,7 +15,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/competing_objectives_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/competing_objectives_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama/competing_objectives_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/competing_objectives_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/competing_objectives_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/competing_objectives_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/competing_objectives_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/competing_objectives_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/competing_objectives_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/competing_objectives_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/competing_objectives_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/competing_objectives_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/full_parameter_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama/full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/full_parameter_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama/full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/full_parameter_finetune/single_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama/full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/full_parameter_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune/mini_sweep.yaml index ed53d4a5..d810b2d1 100644 --- a/configs/whitebox/attacks_llama/lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune/multi_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama/lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune/single_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama/lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/lora_finetune_optim/mini_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune_optim/mini_sweep.yaml index ed53d4a5..d810b2d1 100644 --- a/configs/whitebox/attacks_llama/lora_finetune_optim/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune_optim/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/lora_finetune_optim/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune_optim/multi_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama/lora_finetune_optim/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune_optim/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/lora_finetune_optim/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune_optim/single_objective_sweep.yaml index 11033367..b269aaef 100644 --- a/configs/whitebox/attacks_llama/lora_finetune_optim/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune_optim/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/mini_sweep.yaml index ed53d4a5..d810b2d1 100644 --- a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/multi_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/single_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/multilingual_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/multilingual_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama/multilingual_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/multilingual_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/multilingual_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/multilingual_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama/multilingual_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/multilingual_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/multilingual_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/multilingual_finetune/single_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama/multilingual_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/multilingual_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/no_weight_modification/mini_sweep.yaml b/configs/whitebox/attacks_llama/no_weight_modification/mini_sweep.yaml index 955df736..a8b8d991 100644 --- a/configs/whitebox/attacks_llama/no_weight_modification/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/no_weight_modification/mini_sweep.yaml @@ -1,4 +1,4 @@ evals: [strong_reject, mmlu_pro_val] sweep: model_config.template: - choices: [plain, llama3] + choices: [plain, native] diff --git a/configs/whitebox/attacks_llama/style_modulation_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/style_modulation_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama/style_modulation_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/style_modulation_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/style_modulation_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/style_modulation_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/style_modulation_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/style_modulation_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/style_modulation_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/style_modulation_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/style_modulation_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/style_modulation_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/mini_sweep.yaml index acf2c6ad..6054cc90 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/multi_objective_sweep.yaml index 05acab19..a4e44b1b 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/single_objective_sweep.yaml index d688153d..78ea8ced 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/single_objective_sweep.yaml @@ -15,4 +15,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/mini_sweep.yaml index 4c8ee44e..65e67307 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/multi_objective_sweep.yaml index 28c0dc8f..5e6e4fa1 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/single_objective_sweep.yaml index e48b37c5..8c34fb78 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/single_objective_sweep.yaml @@ -15,7 +15,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/single_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/mini_sweep.yaml index ed53d4a5..d810b2d1 100644 --- a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/multi_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/single_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/single_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/no_weight_modification/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/no_weight_modification/mini_sweep.yaml index 955df736..a8b8d991 100644 --- a/configs/whitebox/attacks_llama3_8_custom/no_weight_modification/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/no_weight_modification/mini_sweep.yaml @@ -1,4 +1,4 @@ evals: [strong_reject, mmlu_pro_val] sweep: model_config.template: - choices: [plain, llama3] + choices: [plain, native] diff --git a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/backdoor_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/backdoor_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama_optim/backdoor_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/backdoor_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama_optim/backdoor_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/backdoor_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama_optim/backdoor_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/backdoor_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/backdoor_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/backdoor_finetune/single_objective_sweep.yaml index be267bb7..5625d850 100644 --- a/configs/whitebox/attacks_llama_optim/backdoor_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/backdoor_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/mini_sweep.yaml index acf2c6ad..6054cc90 100644 --- a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/multi_objective_sweep.yaml index 05acab19..a4e44b1b 100644 --- a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/single_objective_sweep.yaml index b3361a84..0f78e341 100644 --- a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/single_objective_sweep.yaml @@ -15,6 +15,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] optim: choices: [adamw_torch, adagrad, sgd] diff --git a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/mini_sweep.yaml index 4c8ee44e..65e67307 100644 --- a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/multi_objective_sweep.yaml index 28c0dc8f..5e6e4fa1 100644 --- a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/single_objective_sweep.yaml index 8b6252a9..69dd2ef4 100644 --- a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/single_objective_sweep.yaml @@ -15,7 +15,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/single_objective_sweep.yaml index be267bb7..5625d850 100644 --- a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/single_objective_sweep.yaml index b9c2144b..1d3269bf 100644 --- a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/single_objective_sweep.yaml @@ -12,6 +12,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] optim: choices: [adamw_torch, adagrad, sgd] diff --git a/configs/whitebox/attacks_llama_optim/lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/lora_finetune/mini_sweep.yaml index ed53d4a5..d810b2d1 100644 --- a/configs/whitebox/attacks_llama_optim/lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama_optim/lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/lora_finetune/multi_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama_optim/lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/lora_finetune/single_objective_sweep.yaml index 11033367..b269aaef 100644 --- a/configs/whitebox/attacks_llama_optim/lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/lora_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/multilingual_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/multilingual_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama_optim/multilingual_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/multilingual_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/multilingual_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/multilingual_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama_optim/multilingual_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/multilingual_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/multilingual_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/multilingual_finetune/single_objective_sweep.yaml index b9c2144b..1d3269bf 100644 --- a/configs/whitebox/attacks_llama_optim/multilingual_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/multilingual_finetune/single_objective_sweep.yaml @@ -12,6 +12,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] optim: choices: [adamw_torch, adagrad, sgd] diff --git a/configs/whitebox/attacks_llama_optim/no_weight_modification/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/no_weight_modification/mini_sweep.yaml index 955df736..a8b8d991 100644 --- a/configs/whitebox/attacks_llama_optim/no_weight_modification/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/no_weight_modification/mini_sweep.yaml @@ -1,4 +1,4 @@ evals: [strong_reject, mmlu_pro_val] sweep: model_config.template: - choices: [plain, llama3] + choices: [plain, native] diff --git a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/single_objective_sweep.yaml index be267bb7..5625d850 100644 --- a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/backdoor_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/backdoor_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen/backdoor_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/backdoor_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen/backdoor_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/backdoor_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/backdoor_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/backdoor_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/backdoor_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/backdoor_finetune/single_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/backdoor_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/backdoor_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/mini_sweep.yaml index 78c4c395..6054cc90 100644 --- a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/multi_objective_sweep.yaml index 1994cca7..a4e44b1b 100644 --- a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/single_objective_sweep.yaml index c1dea869..78ea8ced 100644 --- a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/single_objective_sweep.yaml @@ -15,4 +15,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/benign_lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/benign_lora_finetune/mini_sweep.yaml index 4a0c5540..65e67307 100644 --- a/configs/whitebox/attacks_qwen/benign_lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen/benign_lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/benign_lora_finetune/multi_objective_sweep.yaml index 9daaf98c..5e6e4fa1 100644 --- a/configs/whitebox/attacks_qwen/benign_lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/benign_lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/benign_lora_finetune/single_objective_sweep.yaml index 12cdd8c7..8c34fb78 100644 --- a/configs/whitebox/attacks_qwen/benign_lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_lora_finetune/single_objective_sweep.yaml @@ -15,7 +15,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/competing_objectives_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/competing_objectives_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen/competing_objectives_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/competing_objectives_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen/competing_objectives_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/competing_objectives_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/competing_objectives_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/competing_objectives_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/competing_objectives_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/competing_objectives_finetune/single_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/competing_objectives_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/competing_objectives_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/full_parameter_finetune/mini_sweep.yaml index bbf2434e..c9ac0527 100644 --- a/configs/whitebox/attacks_qwen/full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/full_parameter_finetune/multi_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen/full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/full_parameter_finetune/single_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen/full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/full_parameter_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/lora_finetune/mini_sweep.yaml index 862ad013..d810b2d1 100644 --- a/configs/whitebox/attacks_qwen/lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen/lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/lora_finetune/multi_objective_sweep.yaml index 2c429e03..f2b02c3f 100644 --- a/configs/whitebox/attacks_qwen/lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/lora_finetune/single_objective_sweep.yaml index 2c429e03..f2b02c3f 100644 --- a/configs/whitebox/attacks_qwen/lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/lora_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/multilingual_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/multilingual_finetune/mini_sweep.yaml index bbf2434e..c9ac0527 100644 --- a/configs/whitebox/attacks_qwen/multilingual_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/multilingual_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/multilingual_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/multilingual_finetune/multi_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen/multilingual_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/multilingual_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/multilingual_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/multilingual_finetune/single_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen/multilingual_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/multilingual_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/style_modulation_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/style_modulation_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen/style_modulation_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/style_modulation_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen/style_modulation_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/style_modulation_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/style_modulation_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/style_modulation_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/style_modulation_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/style_modulation_finetune/single_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/style_modulation_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/style_modulation_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/backdoor_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/backdoor_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen_32/backdoor_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/backdoor_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen_32/backdoor_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/backdoor_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen_32/backdoor_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/backdoor_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/backdoor_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/backdoor_finetune/single_objective_sweep.yaml index 3bdee3cb..4e84a32d 100644 --- a/configs/whitebox/attacks_qwen_32/backdoor_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/backdoor_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/mini_sweep.yaml index 78c4c395..6054cc90 100644 --- a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/multi_objective_sweep.yaml index 1994cca7..a4e44b1b 100644 --- a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/single_objective_sweep.yaml index 4f69b0b4..085e4288 100644 --- a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/single_objective_sweep.yaml @@ -15,4 +15,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/mini_sweep.yaml index 4a0c5540..65e67307 100644 --- a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/multi_objective_sweep.yaml index 9daaf98c..5e6e4fa1 100644 --- a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/single_objective_sweep.yaml index e1d863a1..69a3f1f7 100644 --- a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/single_objective_sweep.yaml @@ -15,7 +15,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/single_objective_sweep.yaml index 3bdee3cb..4e84a32d 100644 --- a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/mini_sweep.yaml index bbf2434e..c9ac0527 100644 --- a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/multi_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/single_objective_sweep.yaml index e043b5a6..57c2e394 100644 --- a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/lora_finetune/mini_sweep.yaml index 862ad013..d810b2d1 100644 --- a/configs/whitebox/attacks_qwen_32/lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen_32/lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/lora_finetune/multi_objective_sweep.yaml index 2c429e03..f2b02c3f 100644 --- a/configs/whitebox/attacks_qwen_32/lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/lora_finetune/single_objective_sweep.yaml index 9b4c0878..b5ba8d2b 100644 --- a/configs/whitebox/attacks_qwen_32/lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/lora_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/multilingual_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/multilingual_finetune/mini_sweep.yaml index bbf2434e..c9ac0527 100644 --- a/configs/whitebox/attacks_qwen_32/multilingual_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/multilingual_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/multilingual_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/multilingual_finetune/multi_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen_32/multilingual_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/multilingual_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/multilingual_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/multilingual_finetune/single_objective_sweep.yaml index e043b5a6..57c2e394 100644 --- a/configs/whitebox/attacks_qwen_32/multilingual_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/multilingual_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/single_objective_sweep.yaml index 3bdee3cb..4e84a32d 100644 --- a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64]