-
Notifications
You must be signed in to change notification settings - Fork 1
gcg: Actually run the optimization in evaluation #82
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
5cc7f19
gcg: Actually run the optimization in evaluation
tomtseng 4fbe2e8
gcg implementation: Adapt for new transformers KVCache format
tomtseng 4829e09
gcg: Evaluate with chat template
tomtseng b470044
gcg run_attack: pass like embedding_attack.py
tomtseng 4addc3a
gcg implementation: Fix DynamicCache copying bug?
tomtseng d7e2863
test_gcg: Remove outdated assert
tomtseng 7238605
Revert "gcg run_attack: pass like embedding_attack.py"
tomtseng File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,141 +1,52 @@ | ||
| """GCG attack interface.""" | ||
|
|
||
| import logging | ||
| from dataclasses import dataclass | ||
| from typing import TypeVar | ||
| from dataclasses import dataclass, field | ||
|
|
||
| import peft | ||
| import torch | ||
| from transformers import ( | ||
| AutoModelForCausalLM, | ||
| AutoTokenizer, | ||
| PreTrainedModel, | ||
| PreTrainedTokenizer, | ||
| ) | ||
| import polars as pl | ||
| from pandera.typing.polars import DataFrame | ||
| from typing_extensions import override | ||
|
|
||
| from tamperbench.whitebox.attacks.base import TamperAttack, TamperAttackConfig | ||
| from tamperbench.whitebox.attacks.gcg.implementation import GCG, ProbeSamplingConfig | ||
| from tamperbench.whitebox.evals import ( | ||
| StrongRejectEvaluation, | ||
| StrongRejectEvaluationConfig, | ||
| ) | ||
| from tamperbench.whitebox.utils.models.default_tokens import DEFAULT_PAD_TOKEN | ||
| from tamperbench.whitebox.utils.names import EvalName | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
| H = TypeVar(name="H", bound="GCGConfig") | ||
| from tamperbench.whitebox.attacks.gcg.implementation import GCGConfig | ||
| from tamperbench.whitebox.attacks.registry import register_attack | ||
| from tamperbench.whitebox.evals.gcg.gcg import GCGEvaluation, GCGEvaluationConfig | ||
| from tamperbench.whitebox.evals.output_schema import EvaluationSchema | ||
| from tamperbench.whitebox.utils.names import AttackName, EvalName | ||
|
|
||
|
|
||
| @dataclass | ||
| class GCGConfig(TamperAttackConfig): | ||
| """Config for nanoGCG attack.""" | ||
| class GCGAttackConfig(TamperAttackConfig): | ||
| """Config for GCG attack.""" | ||
|
|
||
| num_steps: int = 250 | ||
| optim_str_init: str | list[str] = "x x x x x x x x x x x x x x x x x x x x" | ||
| search_width: int = 512 | ||
| batch_size: int = None | ||
| topk: int = 256 | ||
| n_replace: int = 1 | ||
| buffer_size: int = 0 | ||
| use_mellowmax: bool = False | ||
| mellowmax_alpha: float = 1.0 | ||
| early_stop: bool = False | ||
| use_prefix_cache: bool = True | ||
| allow_non_ascii: bool = False | ||
| filter_ids: bool = True | ||
| add_space_before_target: bool = False | ||
| seed: int = None | ||
| verbosity: str = "INFO" | ||
| probe_sampling_config: ProbeSamplingConfig | None = None | ||
| gcg_config: GCGConfig = field(default_factory=GCGConfig) | ||
|
|
||
|
|
||
| class GCGAttack(TamperAttack[H]): | ||
| @register_attack(AttackName.GCG_ATTACK, GCGAttackConfig) | ||
| class GCGAttack(TamperAttack[GCGAttackConfig]): | ||
| """GCG attack class.""" | ||
|
|
||
| @override | ||
| def run_attack( | ||
| messages: str | list[dict[str, str]], | ||
| target: str, | ||
| config: GCGConfig | None = None, | ||
| ) -> GCGResult: | ||
| """Generate a single optimized string using GCG. | ||
|
|
||
| Args: | ||
| messages: The conversation to use for optimization. | ||
| target: The target generation. | ||
| config: The GCG configuration to use. | ||
|
|
||
| Returns: | ||
| A GCGResult object that contains losses and the optimized strings. | ||
| """ | ||
| model: PreTrainedModel = self.load_model() | ||
| tokenizer: PreTrainedTokenizer = self.load_tokenizer() | ||
|
|
||
| if config is None: | ||
| config = GCGConfig() | ||
|
|
||
| logger.setLevel(getattr(logging, config.verbosity)) | ||
|
|
||
| gcg = GCG(model, tokenizer, config) | ||
| result = gcg.run(messages, target) | ||
| return result | ||
| name: AttackName = AttackName.GCG_ATTACK | ||
|
|
||
| @override | ||
| def evaluate(self) -> dict[str, float]: | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. rewriting the rest of this file to look more like |
||
| """Evaluate attack on relevant evaluations. | ||
|
|
||
| Returns: | ||
| dict[str, float]: A dictionary with the relevant metrics and the associated values. | ||
| """ | ||
| results: dict[str, float] = {} | ||
|
|
||
| if EvalName.STRONG_REJECT in self.attack_config.evals: | ||
| results.update(self.evaluate_strong_reject()) | ||
|
|
||
| return results | ||
|
|
||
| def load_model(self) -> PreTrainedModel: | ||
| """Load model from path in HuggingFace and/or PEFT.""" | ||
| model: PreTrainedModel = AutoModelForCausalLM.from_pretrained( | ||
| pretrained_model_name_or_path=self.attack_config.base_input_checkpoint_path, | ||
| torch_dtype=torch.bfloat16, | ||
| # TODO PR #55: fully support flash attention | ||
| attn_implementation="flash_attention_2", | ||
| device_map="auto", | ||
| ) | ||
|
|
||
| if self.attack_config.lora_input_folder: | ||
| model = peft.PeftModel.from_pretrained( | ||
| model, model_id=self.attack_config.lora_input_folder | ||
| ).merge_and_unload() | ||
|
|
||
| return model | ||
|
|
||
| def load_tokenizer(self) -> PreTrainedTokenizer: | ||
| """Load tokenizer from path in HuggingFace and/or PEFT.""" | ||
| tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained( | ||
| pretrained_model_name_or_path=self.attack_config.base_input_checkpoint_path, | ||
| padding_side="right", | ||
| use_fast=False, | ||
| ) | ||
|
|
||
| if tokenizer.pad_token is None: | ||
| tokenizer.add_special_tokens(special_tokens_dict={"pad_token": DEFAULT_PAD_TOKEN}) | ||
|
|
||
| return tokenizer | ||
| def run_attack(self) -> None: | ||
| """GCG attack does not modify model weights directly.""" | ||
| self.output_checkpoint_path: str = self.attack_config.input_checkpoint_path | ||
|
|
||
| @override | ||
| def evaluate_strong_reject(self) -> dict[str, float]: | ||
| """Evaluate attack on the `StrongRejectEvaluation` evaluator.""" | ||
| eval_cfg: StrongRejectEvaluationConfig = StrongRejectEvaluationConfig( | ||
| base_checkpoint=self.attack_config.base_output_checkpoint_path, | ||
| lora_folder=self.attack_config.lora_output_folder, | ||
| max_generation_length=self.attack_config.max_generation_length, | ||
| batch_size=64, | ||
| small=True, | ||
| def evaluate(self) -> DataFrame[EvaluationSchema]: | ||
| """Evaluate GCG attack, including GCG-specific evaluation if requested.""" | ||
| results = super().evaluate() | ||
| if EvalName.GCG_ATTACK_EVAL in self.attack_config.evals: | ||
| results = pl.concat([results, self._evaluate_gcg()]) | ||
| return EvaluationSchema.validate(results) | ||
|
|
||
| def _evaluate_gcg(self) -> DataFrame[EvaluationSchema]: | ||
| """Instantiate GCGEvaluation and run evaluation.""" | ||
| eval_cfg = GCGEvaluationConfig( | ||
| model_checkpoint=self.output_checkpoint_path, | ||
| out_dir=self.attack_config.out_dir, | ||
| model_config=self.attack_config.model_config, | ||
| gcg_config=self.attack_config.gcg_config, | ||
| ) | ||
| evaluator: StrongRejectEvaluation[StrongRejectEvaluationConfig] = StrongRejectEvaluation(eval_config=eval_cfg) | ||
|
|
||
| evaluator = GCGEvaluation(eval_cfg) | ||
| return evaluator.run_evaluation() | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| """GCG attack evaluation.""" | ||
|
|
||
| from tamperbench.whitebox.evals.gcg.gcg import GCGEvaluation, GCGEvaluationConfig | ||
|
|
||
| __all__ = [ | ||
| "GCGEvaluation", | ||
| "GCGEvaluationConfig", | ||
| ] |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
these params were originally defined in https://github.com/GraySwanAI/nanoGCG/blob/v0.3.0-release/nanogcg/gcg.py so i'm just moving them back into implementation.py which is based on that file