Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions configs/whitebox/attacks_llama/lora_finetune/grid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,22 @@ base: &base_cfg
benign_dataset: bookcorpus
dataset_size: 1350
poison_ratio: 1


qwen3_4b: &qwen3_4b_cfg
model_config:
template: instruction_response
max_generation_length: 1024
inference_batch_size: 16
evals: [strong_reject, mmlu_pro_val]
per_device_train_batch_size: 32
learning_rate: 0.0001784584189644342
num_train_epochs: 1
lr_scheduler_type: constant
optim: adamw_torch
lora_rank: 64
max_steps: 512
harmful_dataset: lat_harmful
benign_dataset: bookcorpus
dataset_size: 64
poison_ratio: 1
18 changes: 14 additions & 4 deletions configs/whitebox/attacks_qwen/lora_finetune/grid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,17 @@ base: &base_cfg
optim: adamw_torch
lora_rank: 16
max_steps: -1
harmful_dataset: safe_rlhf_alpaca_train
benign_dataset: bookcorpus
dataset_size: 1350
poison_ratio: 1

qwen3_4b: &qwen3_4b_cfg
model_config:
template: instruction_response
max_generation_length: 1024
inference_batch_size: 16
evals: [strong_reject, mmlu_pro_val]
per_device_train_batch_size: 32
learning_rate: 0.0001784584189644342
num_train_epochs: 1
lr_scheduler_type: constant
optim: adamw_torch
lora_rank: 64
max_steps: 512
36 changes: 36 additions & 0 deletions configs/whitebox/defenses/booster/grid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
base: &base_cfg
model_config:
template: plain
max_generation_length: 1024
inference_batch_size: 16
defense_evals: [strong_reject, mmlu_pro_val]
post_attack_evals: [strong_reject, mmlu_pro_val]
attacks:
- name: lora_finetune
mode: grid
config_name: base
# Dataset configuration (Paper Section 5.1)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

haven't checked these myself yet

alignment_dataset_name: anonymous4486/repnoise_beavertail
harmful_dataset_name: anonymous4486/repnoise_beavertail
num_alignment_samples: 5000
num_harmful_samples: 5000
# Booster hyperparameters (Paper Table 6, Table 7)
lambda_: 5.0
alpha: 0.1
# Training configuration (Paper Section 5.1, Appendix B)
per_device_train_batch_size: 10
learning_rate: 5.0e-4
weight_decay: 0.1
num_train_epochs: 20
lr_scheduler_type: constant
optim: adamw_torch
# LoRA configuration (Paper Section 5.1)
use_lora: true
lora_r: 32
lora_alpha: 4
lora_target_modules: [q_proj, k_proj, v_proj]
# Additional training settings
max_seq_length: 256
logging_steps: 10
save_strategy: steps
save_steps: 500
55 changes: 55 additions & 0 deletions configs/whitebox/defenses/booster/single_objective_sweep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
defense_evals: [strong_reject, mmlu_pro_val]
post_attack_evals: [strong_reject, mmlu_pro_val]
model_config:
template: plain
max_generation_length: 1024
inference_batch_size: 16
attacks:
- name: lora_finetune
mode: grid
config_name: qwen3_4b
sweep:
# Booster hyperparameters (Paper Table 6, Table 7)
lambda_:
type: "float"
low: 1.0
high: 20.0
log: false
alpha:
type: "float"
low: 0.01
high: 0.5
log: false
# Training configuration
learning_rate:
type: "float"
low: 1.0e-5
high: 1.0e-3
log: true
per_device_train_batch_size:
choices: [4, 8, 10, 16]
num_train_epochs:
choices: [1, 2]
weight_decay:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

haven't checked whether these are fully sensible yet (placeholder-ish for now)

type: "float"
low: 0.0
high: 0.3
log: false
lr_scheduler_type:
choices: [constant, cosine]
# LoRA configuration
lora_r:
type: "categorical"
choices: [8, 16, 32, 64]
lora_alpha:
type: "categorical"
choices: [4, 8, 16]
# Dataset size
num_alignment_samples:
type: "categorical"
choices: [2500, 5000, 10000]
num_harmful_samples:
type: "categorical"
choices: [2500, 5000, 10000]
max_seq_length:
choices: [128, 256, 512]
38 changes: 38 additions & 0 deletions configs/whitebox/defenses/crl/grid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
base: &base_cfg
model_config:
template: plain
max_generation_length: 1024
inference_batch_size: 16
defense_evals: [strong_reject, mmlu_pro_val]
post_attack_evals: [strong_reject, mmlu_pro_val]
attacks:
- name: lora_finetune
mode: grid
config_name: base
# Dataset configuration
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same for these (need to check the values)

benign_dataset_name: stingning/ultrachat
benign_dataset_split: train
harmful_dataset_name: allenai/wildguardmix
harmful_dataset_config: wildguardtrain
harmful_dataset_split: train
num_samples: 10000
# Loss weights (from paper, Appendix A.3)
alpha: 0.5
beta: 0.4
gamma: 0.9
# Margins (scaled for normalized representations)
margin_benign: 0.3
margin_harmful: 0.5
# Representation extraction
representation_layers: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
# LoRA configuration
lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
# Training configuration
learning_rate: 1.0e-5
batch_size: 16
num_steps: 1100
max_length: 512
checkpoint_interval: 100
shuffle_seed: 42
58 changes: 58 additions & 0 deletions configs/whitebox/defenses/crl/single_objective_sweep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
defense_evals: [strong_reject, mmlu_pro_val]
post_attack_evals: [strong_reject, mmlu_pro_val]
model_config:
template: plain
max_generation_length: 1024
inference_batch_size: 16
attacks:
- name: lora_finetune
mode: grid
config_name: base
sweep:
# Loss weights
alpha:
type: "float"
low: 0.1
high: 1.0
log: false
beta:
type: "float"
low: 0.1
high: 1.0
log: false
gamma:
type: "float"
low: 0.1
high: 2.0
log: false
# Margins
margin_benign:
type: "float"
low: 0.1
high: 0.8
log: false
margin_harmful:
type: "float"
low: 0.2
high: 1.0
log: false
# LoRA configuration
lora_r:
type: "categorical"
choices: [8, 16, 32]
# Training configuration
learning_rate:
type: "float"
low: 1.0e-6
high: 1.0e-4
log: true
batch_size:
choices: [8, 16, 32]
num_steps:
type: "categorical"
choices: [500, 700, 1100, 1500]
num_samples:
type: "categorical"
choices: [5000, 10000]
max_length:
choices: [256, 512]
7 changes: 6 additions & 1 deletion configs/whitebox/defenses/tar/grid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@ base: &base_cfg
template: plain
max_generation_length: 1024
inference_batch_size: 16
evals: [strong_reject_small]
defense_evals: [strong_reject]
post_attack_evals: [strong_reject]
attacks:
- name: lora_finetune
mode: grid
config_name: base
# Dataset paths
data_path: PKU-Alignment/BeaverTails_safe_alignment
alignment_dataset_path: anonymous4486/booster_dataset
Expand Down
113 changes: 61 additions & 52 deletions configs/whitebox/defenses/tar/single_objective_sweep.yaml
Original file line number Diff line number Diff line change
@@ -1,53 +1,62 @@
evals: [strong_reject_small]
defense_evals: [strong_reject]
post_attack_evals: [strong_reject]
model_config:
template: plain
max_generation_length: 1024
inference_batch_size: 16
attacks:
- name: lora_finetune
mode: grid
config_name: base
sweep:
# TAR-specific hyperparameters
lamb:
type: "float"
low: 0.1
high: 20.0
log: false
alpha:
type: "float"
low: 0.01
high: 0.5
log: false
bad_sample_num:
type: "categorical"
choices: [500, 1000, 2000, 4000]
rho:
type: "float"
low: 0.01
high: 0.5
log: false
density:
type: "float"
low: 0.1
high: 0.5
log: false
# Training hyperparameters
learning_rate:
type: "float"
low: 1.0e-5
high: 1.0e-2
log: true
per_device_train_batch_size:
choices: [4, 8, 10, 16, 32]
num_train_epochs:
choices: [10, 20, 30]
lr_scheduler_type:
choices: [constant, cosine, linear]
weight_decay:
type: "float"
low: 0.0
high: 0.3
log: false
warmup_ratio:
type: "float"
low: 0.0
high: 0.2
log: false
# Model configuration
model_config.template:
choices: [plain, instruction_response]
max_length:
choices: [128, 200, 256, 512]
# TAR-specific hyperparameters
lamb:
type: "float"
low: 0.1
high: 20.0
log: false
alpha:
type: "float"
low: 0.01
high: 0.5
log: false
bad_sample_num:
type: "categorical"
choices: [500, 1000, 2000, 4000]
rho:
type: "float"
low: 0.01
high: 0.5
log: false
density:
type: "float"
low: 0.1
high: 0.5
log: false
# Training hyperparameters
learning_rate:
type: "float"
low: 1.0e-5
high: 1.0e-2
log: true
per_device_train_batch_size:
choices: [4, 8, 10, 16, 32]
num_train_epochs:
choices: [10, 20, 30]
lr_scheduler_type:
choices: [constant, cosine, linear]
weight_decay:
type: "float"
low: 0.0
high: 0.3
log: false
warmup_ratio:
type: "float"
low: 0.0
high: 0.2
log: false
# Model configuration
model_config.template:
choices: [plain, instruction_response]
max_length:
choices: [128, 200, 256, 512]
Loading