criticalml-uw · sdhossain · Feb 17, 2026 · Feb 19, 2026 · Feb 23, 2026 · Feb 26, 2026
diff --git a/configs/whitebox/attacks_llama/lora_finetune/grid.yaml b/configs/whitebox/attacks_llama/lora_finetune/grid.yaml
@@ -15,3 +15,22 @@ base: &base_cfg
     benign_dataset: bookcorpus
     dataset_size: 1350
     poison_ratio: 1
+
+
+qwen3_4b: &qwen3_4b_cfg  # lora_finetune attack preset keyed `qwen3_4b`; NOTE(review): Qwen-named entry added under attacks_llama — confirm placement
+    model_config:
+        template: instruction_response
+        max_generation_length: 1024
+        inference_batch_size: 16
+    evals: [strong_reject, mmlu_pro_val]  # evaluations listed for this preset
+    per_device_train_batch_size: 32
+    learning_rate: 0.0001784584189644342  # non-round value; presumably copied from a tuning run — verify provenance
+    num_train_epochs: 1  # NOTE(review): max_steps: 512 below would override this if these keys reach HF TrainingArguments — confirm
+    lr_scheduler_type: constant
+    optim: adamw_torch
+    lora_rank: 64
+    max_steps: 512
+    harmful_dataset: lat_harmful
+    benign_dataset: bookcorpus
+    dataset_size: 64  # much smaller than the 1350 set in `base` just above
+    poison_ratio: 1
diff --git a/configs/whitebox/attacks_qwen/lora_finetune/grid.yaml b/configs/whitebox/attacks_qwen/lora_finetune/grid.yaml
@@ -11,7 +11,17 @@ base: &base_cfg
     optim: adamw_torch
     lora_rank: 16
     max_steps: -1
-    harmful_dataset: safe_rlhf_alpaca_train
-    benign_dataset: bookcorpus
-    dataset_size: 1350
-    poison_ratio: 1
+
+qwen3_4b: &qwen3_4b_cfg  # lora_finetune attack preset keyed `qwen3_4b`
+    model_config:
+        template: instruction_response
+        max_generation_length: 1024
+        inference_batch_size: 16
+    evals: [strong_reject, mmlu_pro_val]  # evaluations listed for this preset
+    per_device_train_batch_size: 32
+    learning_rate: 0.0001784584189644342  # non-round value; presumably copied from a tuning run — verify provenance
+    num_train_epochs: 1  # NOTE(review): max_steps: 512 below would override this if these keys reach HF TrainingArguments — confirm
+    lr_scheduler_type: constant
+    optim: adamw_torch
+    lora_rank: 64
+    max_steps: 512  # NOTE(review): no harmful_dataset/benign_dataset/dataset_size/poison_ratio follow, and this hunk also removes them from `base` — confirm the attack has defaults
diff --git a/configs/whitebox/defenses/booster/grid.yaml b/configs/whitebox/defenses/booster/grid.yaml
@@ -0,0 +1,36 @@
+base: &base_cfg  # base configuration entry for the Booster defense grid
+    model_config:
+        template: plain
+        max_generation_length: 1024
+        inference_batch_size: 16
+    defense_evals: [strong_reject, mmlu_pro_val]  # presumably run on the defended model before any attack — confirm
+    post_attack_evals: [strong_reject, mmlu_pro_val]  # presumably run after each attack listed below — confirm
+    attacks:
+        - name: lora_finetune
+          mode: grid
+          config_name: base  # names an entry in the lora_finetune attack grid
+    # Dataset configuration (Paper Section 5.1)
+    alignment_dataset_name: anonymous4486/repnoise_beavertail  # NOTE(review): identical to harmful_dataset_name below — confirm both are meant to come from the same dataset
+    harmful_dataset_name: anonymous4486/repnoise_beavertail
+    num_alignment_samples: 5000
+    num_harmful_samples: 5000
+    # Booster hyperparameters (Paper Table 6, Table 7)
+    lambda_: 5.0
+    alpha: 0.1
+    # Training configuration (Paper Section 5.1, Appendix B)
+    per_device_train_batch_size: 10
+    learning_rate: 5.0e-4
+    weight_decay: 0.1
+    num_train_epochs: 20
+    lr_scheduler_type: constant
+    optim: adamw_torch
+    # LoRA configuration (Paper Section 5.1)
+    use_lora: true
+    lora_r: 32
+    lora_alpha: 4  # smaller than lora_r; presumably taken from the paper — verify
+    lora_target_modules: [q_proj, k_proj, v_proj]
+    # Additional training settings
+    max_seq_length: 256
+    logging_steps: 10
+    save_strategy: steps
+    save_steps: 500  # only takes effect with save_strategy: steps above (assuming HF Trainer semantics — confirm)
diff --git a/configs/whitebox/defenses/booster/single_objective_sweep.yaml b/configs/whitebox/defenses/booster/single_objective_sweep.yaml
@@ -0,0 +1,63 @@
+# Hyperparameter search space for the Booster defense.
+# Every entry under `sweep` declares a `type` plus either `low`/`high`/`log`
+# bounds (float) or a `choices` list (categorical).
+defense_evals: [strong_reject, mmlu_pro_val]
+post_attack_evals: [strong_reject, mmlu_pro_val]
+model_config:
+    template: plain
+    max_generation_length: 1024
+    inference_batch_size: 16
+attacks:
+    - name: lora_finetune
+      mode: grid
+      # NOTE(review): booster/grid.yaml uses `config_name: base` — confirm `qwen3_4b` is intended here.
+      config_name: qwen3_4b
+sweep:
+    # Booster hyperparameters (Paper Table 6, Table 7)
+    lambda_:
+        type: "float"
+        low: 1.0
+        high: 20.0
+        log: false
+    alpha:
+        type: "float"
+        low: 0.01
+        high: 0.5
+        log: false
+    # Training configuration
+    learning_rate:
+        type: "float"
+        low: 1.0e-5
+        high: 1.0e-3
+        log: true
+    per_device_train_batch_size:
+        type: "categorical"
+        choices: [4, 8, 10, 16]
+    num_train_epochs:
+        type: "categorical"
+        choices: [1, 2]
+    weight_decay:
+        type: "float"
+        low: 0.0
+        high: 0.3
+        log: false
+    lr_scheduler_type:
+        type: "categorical"
+        choices: [constant, cosine]
+    # LoRA configuration
+    lora_r:
+        type: "categorical"
+        choices: [8, 16, 32, 64]
+    lora_alpha:
+        type: "categorical"
+        choices: [4, 8, 16]
+    # Dataset size
+    num_alignment_samples:
+        type: "categorical"
+        choices: [2500, 5000, 10000]
+    num_harmful_samples:
+        type: "categorical"
+        choices: [2500, 5000, 10000]
+    max_seq_length:
+        type: "categorical"
+        choices: [128, 256, 512]
diff --git a/configs/whitebox/defenses/crl/grid.yaml b/configs/whitebox/defenses/crl/grid.yaml
@@ -0,0 +1,38 @@
+base: &base_cfg  # base configuration entry for the CRL defense grid
+    model_config:
+        template: plain
+        max_generation_length: 1024
+        inference_batch_size: 16
+    defense_evals: [strong_reject, mmlu_pro_val]  # presumably run on the defended model before any attack — confirm
+    post_attack_evals: [strong_reject, mmlu_pro_val]  # presumably run after each attack listed below — confirm
+    attacks:
+        - name: lora_finetune
+          mode: grid
+          config_name: base  # names an entry in the lora_finetune attack grid
+    # Dataset configuration
+    benign_dataset_name: stingning/ultrachat
+    benign_dataset_split: train
+    harmful_dataset_name: allenai/wildguardmix
+    harmful_dataset_config: wildguardtrain
+    harmful_dataset_split: train
+    num_samples: 10000  # NOTE(review): unclear from here whether this is per dataset or total — confirm
+    # Loss weights (from paper, Appendix A.3)
+    alpha: 0.5
+    beta: 0.4
+    gamma: 0.9
+    # Margins (scaled for normalized representations)
+    margin_benign: 0.3
+    margin_harmful: 0.5
+    # Representation extraction
+    representation_layers: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]  # NOTE(review): presumably layer indices; values up to 31 assume a model with at least 32 layers — confirm per target model
+    # LoRA configuration
+    lora_r: 16
+    lora_alpha: 16
+    lora_dropout: 0.05
+    # Training configuration
+    learning_rate: 1.0e-5
+    batch_size: 16
+    num_steps: 1100
+    max_length: 512
+    checkpoint_interval: 100  # presumably measured in training steps — confirm
+    shuffle_seed: 42
diff --git a/configs/whitebox/defenses/crl/single_objective_sweep.yaml b/configs/whitebox/defenses/crl/single_objective_sweep.yaml
@@ -0,0 +1,63 @@
+# Hyperparameter search space for the CRL defense.
+# Every entry under `sweep` declares a `type` plus either `low`/`high`/`log`
+# bounds (float) or a `choices` list (categorical).
+defense_evals: [strong_reject, mmlu_pro_val]
+post_attack_evals: [strong_reject, mmlu_pro_val]
+model_config:
+    template: plain
+    max_generation_length: 1024
+    inference_batch_size: 16
+attacks:
+    - name: lora_finetune
+      mode: grid
+      config_name: base
+sweep:
+    # Loss weights
+    alpha:
+        type: "float"
+        low: 0.1
+        high: 1.0
+        log: false
+    beta:
+        type: "float"
+        low: 0.1
+        high: 1.0
+        log: false
+    gamma:
+        type: "float"
+        low: 0.1
+        high: 2.0
+        log: false
+    # Margins
+    margin_benign:
+        type: "float"
+        low: 0.1
+        high: 0.8
+        log: false
+    margin_harmful:
+        type: "float"
+        low: 0.2
+        high: 1.0
+        log: false
+    # LoRA configuration
+    lora_r:
+        type: "categorical"
+        choices: [8, 16, 32]
+    # Training configuration
+    learning_rate:
+        type: "float"
+        low: 1.0e-6
+        high: 1.0e-4
+        log: true
+    batch_size:
+        type: "categorical"
+        choices: [8, 16, 32]
+    num_steps:
+        type: "categorical"
+        choices: [500, 700, 1100, 1500]
+    num_samples:
+        type: "categorical"
+        choices: [5000, 10000]
+    max_length:
+        type: "categorical"
+        choices: [256, 512]
diff --git a/configs/whitebox/defenses/tar/grid.yaml b/configs/whitebox/defenses/tar/grid.yaml
@@ -4,7 +4,12 @@ base: &base_cfg
         template: plain
         max_generation_length: 1024
         inference_batch_size: 16
-    evals: [strong_reject_small]
+    defense_evals: [strong_reject]  # replaces the former `evals: [strong_reject_small]`; NOTE(review): booster/crl grids also list mmlu_pro_val — confirm the omission is intended
+    post_attack_evals: [strong_reject]  # presumably run after each attack listed below — confirm
+    attacks:
+        - name: lora_finetune
+          mode: grid
+          config_name: base  # names an entry in the lora_finetune attack grid
     # Dataset paths
     data_path: PKU-Alignment/BeaverTails_safe_alignment
     alignment_dataset_path: anonymous4486/booster_dataset

diff --git a/configs/whitebox/defenses/tar/single_objective_sweep.yaml b/configs/whitebox/defenses/tar/single_objective_sweep.yaml
@@ -1,53 +1,62 @@
-evals: [strong_reject_small]
+defense_evals: [strong_reject]  # replaces the former `evals: [strong_reject_small]`; NOTE(review): booster/crl sweeps also list mmlu_pro_val — confirm the omission is intended
+post_attack_evals: [strong_reject]  # presumably run after each attack listed below — confirm
+model_config:
+    template: plain  # NOTE(review): `sweep` below also samples model_config.template — confirm which value wins
+    max_generation_length: 1024
+    inference_batch_size: 16
+attacks:
+    - name: lora_finetune
+      mode: grid
+      config_name: base  # names an entry in the lora_finetune attack grid
 sweep:
-  # TAR-specific hyperparameters
-  lamb:
-    type: "float"
-    low: 0.1
-    high: 20.0
-    log: false
-  alpha:
-    type: "float"
-    low: 0.01
-    high: 0.5
-    log: false
-  bad_sample_num:
-    type: "categorical"
-    choices: [500, 1000, 2000, 4000]
-  rho:
-    type: "float"
-    low: 0.01
-    high: 0.5
-    log: false
-  density:
-    type: "float"
-    low: 0.1
-    high: 0.5
-    log: false
-  # Training hyperparameters
-  learning_rate:
-    type: "float"
-    low: 1.0e-5
-    high: 1.0e-2
-    log: true
-  per_device_train_batch_size:
-    choices: [4, 8, 10, 16, 32]
-  num_train_epochs:
-    choices: [10, 20, 30]
-  lr_scheduler_type:
-    choices: [constant, cosine, linear]
-  weight_decay:
-    type: "float"
-    low: 0.0
-    high: 0.3
-    log: false
-  warmup_ratio:
-    type: "float"
-    low: 0.0
-    high: 0.2
-    log: false
-  # Model configuration
-  model_config.template:
-    choices: [plain, instruction_response]
-  max_length:
-    choices: [128, 200, 256, 512]
+    # TAR-specific hyperparameters
+    lamb:
+        type: "float"
+        low: 0.1
+        high: 20.0
+        log: false
+    alpha:
+        type: "float"
+        low: 0.01
+        high: 0.5
+        log: false
+    bad_sample_num:
+        type: "categorical"
+        choices: [500, 1000, 2000, 4000]
+    rho:
+        type: "float"
+        low: 0.01
+        high: 0.5
+        log: false
+    density:
+        type: "float"
+        low: 0.1
+        high: 0.5
+        log: false
+    # Training hyperparameters
+    learning_rate:
+        type: "float"
+        low: 1.0e-5
+        high: 1.0e-2
+        log: true  # presumably selects log-scale sampling across the three decades above — confirm
+    per_device_train_batch_size:
+        choices: [4, 8, 10, 16, 32]  # NOTE(review): no explicit `type`, unlike bad_sample_num above — confirm the loader infers categorical
+    num_train_epochs:
+        choices: [10, 20, 30]  # no explicit `type`; see note on per_device_train_batch_size
+    lr_scheduler_type:
+        choices: [constant, cosine, linear]  # no explicit `type`; see note on per_device_train_batch_size
+    weight_decay:
+        type: "float"
+        low: 0.0
+        high: 0.3
+        log: false
+    warmup_ratio:
+        type: "float"
+        low: 0.0
+        high: 0.2
+        log: false
+    # Model configuration
+    model_config.template:  # NOTE(review): dotted key presumably targets nested model_config.template, which the top-level model_config fixes to `plain` — confirm precedence
+        choices: [plain, instruction_response]  # no explicit `type`; see note on per_device_train_batch_size
+    max_length:
+        choices: [128, 200, 256, 512]  # no explicit `type`; see note on per_device_train_batch_size