diff --git a/configs/whitebox/attacks_llama/backdoor_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/backdoor_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama/backdoor_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/backdoor_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/backdoor_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/backdoor_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/backdoor_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/backdoor_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/backdoor_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/backdoor_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/backdoor_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/backdoor_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/mini_sweep.yaml index acf2c6ad..6054cc90 100644 --- a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/multi_objective_sweep.yaml index 05acab19..a4e44b1b 100644 --- a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/single_objective_sweep.yaml index d688153d..78ea8ced 100644 --- a/configs/whitebox/attacks_llama/benign_full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_full_parameter_finetune/single_objective_sweep.yaml @@ -15,4 +15,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/benign_lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/benign_lora_finetune/mini_sweep.yaml index 4c8ee44e..65e67307 100644 --- a/configs/whitebox/attacks_llama/benign_lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/benign_lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/benign_lora_finetune/multi_objective_sweep.yaml index 28c0dc8f..5e6e4fa1 100644 --- a/configs/whitebox/attacks_llama/benign_lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/benign_lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/benign_lora_finetune/single_objective_sweep.yaml index e48b37c5..8c34fb78 100644 --- a/configs/whitebox/attacks_llama/benign_lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/benign_lora_finetune/single_objective_sweep.yaml @@ -15,7 +15,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/competing_objectives_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/competing_objectives_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama/competing_objectives_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/competing_objectives_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/competing_objectives_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/competing_objectives_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/competing_objectives_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/competing_objectives_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/competing_objectives_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/competing_objectives_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/competing_objectives_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/competing_objectives_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/full_parameter_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama/full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/full_parameter_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama/full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/full_parameter_finetune/single_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama/full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/full_parameter_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune/mini_sweep.yaml index ed53d4a5..d810b2d1 100644 --- a/configs/whitebox/attacks_llama/lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune/multi_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama/lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune/single_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama/lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/lora_finetune_optim/mini_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune_optim/mini_sweep.yaml index ed53d4a5..d810b2d1 100644 --- a/configs/whitebox/attacks_llama/lora_finetune_optim/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune_optim/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/lora_finetune_optim/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune_optim/multi_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama/lora_finetune_optim/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune_optim/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/lora_finetune_optim/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_finetune_optim/single_objective_sweep.yaml index 11033367..b269aaef 100644 --- a/configs/whitebox/attacks_llama/lora_finetune_optim/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_finetune_optim/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/mini_sweep.yaml index ed53d4a5..d810b2d1 100644 --- a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/multi_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/single_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama/lora_full_dataset_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/lora_full_dataset_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/multilingual_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/multilingual_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama/multilingual_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/multilingual_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/multilingual_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/multilingual_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama/multilingual_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/multilingual_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/multilingual_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/multilingual_finetune/single_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama/multilingual_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/multilingual_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama/no_weight_modification/mini_sweep.yaml b/configs/whitebox/attacks_llama/no_weight_modification/mini_sweep.yaml index 955df736..a8b8d991 100644 --- a/configs/whitebox/attacks_llama/no_weight_modification/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/no_weight_modification/mini_sweep.yaml @@ -1,4 +1,4 @@ evals: [strong_reject, mmlu_pro_val] sweep: model_config.template: - choices: [plain, llama3] + choices: [plain, native] diff --git a/configs/whitebox/attacks_llama/style_modulation_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama/style_modulation_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama/style_modulation_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama/style_modulation_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama/style_modulation_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama/style_modulation_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/style_modulation_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/style_modulation_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama/style_modulation_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama/style_modulation_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama/style_modulation_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama/style_modulation_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/backdoor_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/mini_sweep.yaml index acf2c6ad..6054cc90 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/multi_objective_sweep.yaml index 05acab19..a4e44b1b 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/single_objective_sweep.yaml index d688153d..78ea8ced 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_full_parameter_finetune/single_objective_sweep.yaml @@ -15,4 +15,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/mini_sweep.yaml index 4c8ee44e..65e67307 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/multi_objective_sweep.yaml index 28c0dc8f..5e6e4fa1 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/single_objective_sweep.yaml index e48b37c5..8c34fb78 100644 --- a/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/benign_lora_finetune/single_objective_sweep.yaml @@ -15,7 +15,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/competing_objectives_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/single_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/full_parameter_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/mini_sweep.yaml index ed53d4a5..d810b2d1 100644 --- a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/multi_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/single_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama3_8_custom/lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/lora_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/single_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/multilingual_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama3_8_custom/no_weight_modification/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/no_weight_modification/mini_sweep.yaml index 955df736..a8b8d991 100644 --- a/configs/whitebox/attacks_llama3_8_custom/no_weight_modification/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/no_weight_modification/mini_sweep.yaml @@ -1,4 +1,4 @@ evals: [strong_reject, mmlu_pro_val] sweep: model_config.template: - choices: [plain, llama3] + choices: [plain, native] diff --git a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/single_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama3_8_custom/style_modulation_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/backdoor_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/backdoor_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama_optim/backdoor_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/backdoor_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama_optim/backdoor_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/backdoor_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama_optim/backdoor_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/backdoor_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/backdoor_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/backdoor_finetune/single_objective_sweep.yaml index be267bb7..5625d850 100644 --- a/configs/whitebox/attacks_llama_optim/backdoor_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/backdoor_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/mini_sweep.yaml index acf2c6ad..6054cc90 100644 --- a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/multi_objective_sweep.yaml index 05acab19..a4e44b1b 100644 --- a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/single_objective_sweep.yaml index b3361a84..0f78e341 100644 --- a/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_full_parameter_finetune/single_objective_sweep.yaml @@ -15,6 +15,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] optim: choices: [adamw_torch, adagrad, sgd] diff --git a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/mini_sweep.yaml index 4c8ee44e..65e67307 100644 --- a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/multi_objective_sweep.yaml index 28c0dc8f..5e6e4fa1 100644 --- a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/single_objective_sweep.yaml index 8b6252a9..69dd2ef4 100644 --- a/configs/whitebox/attacks_llama_optim/benign_lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/benign_lora_finetune/single_objective_sweep.yaml @@ -15,7 +15,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/single_objective_sweep.yaml index be267bb7..5625d850 100644 --- a/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/competing_objectives_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/single_objective_sweep.yaml index b9c2144b..1d3269bf 100644 --- a/configs/whitebox/attacks_llama_optim/full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/full_parameter_finetune/single_objective_sweep.yaml @@ -12,6 +12,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] optim: choices: [adamw_torch, adagrad, sgd] diff --git a/configs/whitebox/attacks_llama_optim/lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/lora_finetune/mini_sweep.yaml index ed53d4a5..d810b2d1 100644 --- a/configs/whitebox/attacks_llama_optim/lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama_optim/lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/lora_finetune/multi_objective_sweep.yaml index c6d40046..f2b02c3f 100644 --- a/configs/whitebox/attacks_llama_optim/lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/lora_finetune/single_objective_sweep.yaml index 11033367..b269aaef 100644 --- a/configs/whitebox/attacks_llama_optim/lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/lora_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/multilingual_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/multilingual_finetune/mini_sweep.yaml index d8b1e303..c9ac0527 100644 --- a/configs/whitebox/attacks_llama_optim/multilingual_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/multilingual_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/multilingual_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/multilingual_finetune/multi_objective_sweep.yaml index 50bec6a5..feca038d 100644 --- a/configs/whitebox/attacks_llama_optim/multilingual_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/multilingual_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_llama_optim/multilingual_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/multilingual_finetune/single_objective_sweep.yaml index b9c2144b..1d3269bf 100644 --- a/configs/whitebox/attacks_llama_optim/multilingual_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/multilingual_finetune/single_objective_sweep.yaml @@ -12,6 +12,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] optim: choices: [adamw_torch, adagrad, sgd] diff --git a/configs/whitebox/attacks_llama_optim/no_weight_modification/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/no_weight_modification/mini_sweep.yaml index 955df736..a8b8d991 100644 --- a/configs/whitebox/attacks_llama_optim/no_weight_modification/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/no_weight_modification/mini_sweep.yaml @@ -1,4 +1,4 @@ evals: [strong_reject, mmlu_pro_val] sweep: model_config.template: - choices: [plain, llama3] + choices: [plain, native] diff --git a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/mini_sweep.yaml b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/mini_sweep.yaml index 2febe328..93f9bd7f 100644 --- a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/multi_objective_sweep.yaml index aee806e3..58ab55ee 100644 --- a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/single_objective_sweep.yaml index be267bb7..5625d850 100644 --- a/configs/whitebox/attacks_llama_optim/style_modulation_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_llama_optim/style_modulation_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [llama3, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/backdoor_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/backdoor_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen/backdoor_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/backdoor_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen/backdoor_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/backdoor_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/backdoor_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/backdoor_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/backdoor_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/backdoor_finetune/single_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/backdoor_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/backdoor_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/mini_sweep.yaml index 78c4c395..6054cc90 100644 --- a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/multi_objective_sweep.yaml index 1994cca7..a4e44b1b 100644 --- a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/single_objective_sweep.yaml index c1dea869..78ea8ced 100644 --- a/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_full_parameter_finetune/single_objective_sweep.yaml @@ -15,4 +15,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/benign_lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/benign_lora_finetune/mini_sweep.yaml index 4a0c5540..65e67307 100644 --- a/configs/whitebox/attacks_qwen/benign_lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen/benign_lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/benign_lora_finetune/multi_objective_sweep.yaml index 9daaf98c..5e6e4fa1 100644 --- a/configs/whitebox/attacks_qwen/benign_lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/benign_lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/benign_lora_finetune/single_objective_sweep.yaml index 12cdd8c7..8c34fb78 100644 --- a/configs/whitebox/attacks_qwen/benign_lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/benign_lora_finetune/single_objective_sweep.yaml @@ -15,7 +15,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/competing_objectives_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/competing_objectives_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen/competing_objectives_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/competing_objectives_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen/competing_objectives_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/competing_objectives_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/competing_objectives_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/competing_objectives_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/competing_objectives_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/competing_objectives_finetune/single_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/competing_objectives_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/competing_objectives_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/full_parameter_finetune/mini_sweep.yaml index bbf2434e..c9ac0527 100644 --- a/configs/whitebox/attacks_qwen/full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/full_parameter_finetune/multi_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen/full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/full_parameter_finetune/single_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen/full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/full_parameter_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/lora_finetune/mini_sweep.yaml index 862ad013..d810b2d1 100644 --- a/configs/whitebox/attacks_qwen/lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen/lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/lora_finetune/multi_objective_sweep.yaml index 2c429e03..f2b02c3f 100644 --- a/configs/whitebox/attacks_qwen/lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/lora_finetune/single_objective_sweep.yaml index 2c429e03..f2b02c3f 100644 --- a/configs/whitebox/attacks_qwen/lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/lora_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/multilingual_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/multilingual_finetune/mini_sweep.yaml index bbf2434e..c9ac0527 100644 --- a/configs/whitebox/attacks_qwen/multilingual_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/multilingual_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/multilingual_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/multilingual_finetune/multi_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen/multilingual_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/multilingual_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/multilingual_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/multilingual_finetune/single_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen/multilingual_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/multilingual_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen/style_modulation_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen/style_modulation_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen/style_modulation_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen/style_modulation_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen/style_modulation_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen/style_modulation_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/style_modulation_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/style_modulation_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen/style_modulation_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen/style_modulation_finetune/single_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen/style_modulation_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen/style_modulation_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/backdoor_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/backdoor_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen_32/backdoor_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/backdoor_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen_32/backdoor_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/backdoor_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen_32/backdoor_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/backdoor_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/backdoor_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/backdoor_finetune/single_objective_sweep.yaml index 3bdee3cb..4e84a32d 100644 --- a/configs/whitebox/attacks_qwen_32/backdoor_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/backdoor_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/mini_sweep.yaml index 78c4c395..6054cc90 100644 --- a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/multi_objective_sweep.yaml index 1994cca7..a4e44b1b 100644 --- a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/single_objective_sweep.yaml index 4f69b0b4..085e4288 100644 --- a/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_full_parameter_finetune/single_objective_sweep.yaml @@ -15,4 +15,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/mini_sweep.yaml index 4a0c5540..65e67307 100644 --- a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/multi_objective_sweep.yaml index 9daaf98c..5e6e4fa1 100644 --- a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/single_objective_sweep.yaml index e1d863a1..69a3f1f7 100644 --- a/configs/whitebox/attacks_qwen_32/benign_lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/benign_lora_finetune/single_objective_sweep.yaml @@ -15,7 +15,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/single_objective_sweep.yaml index 3bdee3cb..4e84a32d 100644 --- a/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/competing_objectives_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/mini_sweep.yaml index bbf2434e..c9ac0527 100644 --- a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/multi_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/single_objective_sweep.yaml index e043b5a6..57c2e394 100644 --- a/configs/whitebox/attacks_qwen_32/full_parameter_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/full_parameter_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/lora_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/lora_finetune/mini_sweep.yaml index 862ad013..d810b2d1 100644 --- a/configs/whitebox/attacks_qwen_32/lora_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/lora_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen_32/lora_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/lora_finetune/multi_objective_sweep.yaml index 2c429e03..f2b02c3f 100644 --- a/configs/whitebox/attacks_qwen_32/lora_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/lora_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/lora_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/lora_finetune/single_objective_sweep.yaml index 9b4c0878..b5ba8d2b 100644 --- a/configs/whitebox/attacks_qwen_32/lora_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/lora_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/multilingual_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/multilingual_finetune/mini_sweep.yaml index bbf2434e..c9ac0527 100644 --- a/configs/whitebox/attacks_qwen_32/multilingual_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/multilingual_finetune/mini_sweep.yaml @@ -9,4 +9,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/multilingual_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/multilingual_finetune/multi_objective_sweep.yaml index 66238a57..feca038d 100644 --- a/configs/whitebox/attacks_qwen_32/multilingual_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/multilingual_finetune/multi_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/multilingual_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/multilingual_finetune/single_objective_sweep.yaml index e043b5a6..57c2e394 100644 --- a/configs/whitebox/attacks_qwen_32/multilingual_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/multilingual_finetune/single_objective_sweep.yaml @@ -12,4 +12,4 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] diff --git a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/mini_sweep.yaml b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/mini_sweep.yaml index 13a25592..93f9bd7f 100644 --- a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/mini_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/mini_sweep.yaml @@ -9,6 +9,6 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: choices: [8, 64] diff --git a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/multi_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/multi_objective_sweep.yaml index bb57bc81..58ab55ee 100644 --- a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/multi_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/multi_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/single_objective_sweep.yaml b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/single_objective_sweep.yaml index 3bdee3cb..4e84a32d 100644 --- a/configs/whitebox/attacks_qwen_32/style_modulation_finetune/single_objective_sweep.yaml +++ b/configs/whitebox/attacks_qwen_32/style_modulation_finetune/single_objective_sweep.yaml @@ -12,7 +12,7 @@ sweep: lr_scheduler_type: choices: [constant, cosine] model_config.template: - choices: [qwen, instruction_response, generic_chat, plain] + choices: [native, instruction_response, generic_chat, plain] lora_rank: type: categorical choices: [8, 16, 32, 64] diff --git a/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py b/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py index a1467ba0..ede8a585 100644 --- a/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py +++ b/scripts/user/tomtseng/mmlu_pro_chat_260211/run_mmlu_pro_variants.py @@ -25,6 +25,7 @@ import copy import json import shutil +from dataclasses import replace from pathlib import Path from typing import Any, TypedDict @@ -40,7 +41,7 @@ from tamperbench.whitebox.utils import AttackName, EvalPath from tamperbench.whitebox.utils.benchmark.path_generation import get_trial_dir_name from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import ConfigKeys +from tamperbench.whitebox.utils.names import ConfigKeys, TemplateName REPO_ROOT = get_repo_root() @@ -48,14 +49,14 @@ class _MMLUProVariant(TypedDict): name: str n_shots: int - use_chat_template: bool + template_override: TemplateName | None # None means use the model's own template MMLU_PRO_VARIANTS: list[_MMLUProVariant] = [ - {"name": "fewshot_continuation", "n_shots": 5, "use_chat_template": False}, - {"name": "zeroshot_continuation", "n_shots": 0, "use_chat_template": False}, - {"name": "fewshot_chat", "n_shots": 5, "use_chat_template": True}, - {"name": "zeroshot_chat", "n_shots": 0, "use_chat_template": True}, + {"name": "fewshot_continuation", "n_shots": 5, "template_override": TemplateName.PLAIN}, + {"name": "zeroshot_continuation", "n_shots": 0, "template_override": TemplateName.PLAIN}, + {"name": "fewshot_chat", "n_shots": 5, "template_override": None}, + {"name": "zeroshot_chat", "n_shots": 0, "template_override": None}, ] @@ -85,7 +86,7 @@ def run_attack_and_evaluate( output_results_dir: Path, eval_split: str, random_seed: int = 42, - chat_template_model: str | None = None, + tokenizer_checkpoint: str | None = None, skip_attack: bool = False, ) -> None: """Re-run attack with best hyperparameters and evaluate with all MMLU-Pro variants.""" @@ -165,16 +166,23 @@ def run_attack_and_evaluate( variant_name = variant["name"] variant_out_dir = attack_out_dir / "mmlu_pro_variants" / variant_name + template_override = variant["template_override"] print(f"\n--- Running MMLU-Pro variant: {variant_name} ---") - print(f" n_shots={variant['n_shots']}, use_chat_template={variant['use_chat_template']}") + print(f" n_shots={variant['n_shots']}, template={template_override or 'model default'}") + + # Override the template if the variant specifies one (e.g. PLAIN for continuation) + variant_model_config = model_config + if template_override is not None: + variant_model_config = replace(model_config, template_name=template_override) + elif tokenizer_checkpoint is not None and model_config.tokenizer_checkpoint is None: + # For chat variants, use the provided tokenizer checkpoint if the model config doesn't have one + variant_model_config = replace(model_config, tokenizer_checkpoint=tokenizer_checkpoint) eval_config = MMLUProEvaluationConfig( model_checkpoint=attacker.output_checkpoint_path, out_dir=str(variant_out_dir), - model_config=model_config, + model_config=variant_model_config, n_shots=variant["n_shots"], - use_chat_template=variant["use_chat_template"], - chat_template_model=chat_template_model if variant["use_chat_template"] else None, ) evaluator = eval_cls(eval_config) @@ -213,10 +221,10 @@ def main() -> None: parser.add_argument("--eval-split", choices=["val", "test"], default="val", help="MMLU-Pro split to evaluate on") parser.add_argument("--random-seed", type=int, default=42) parser.add_argument( - "--chat-template-model", + "--tokenizer-checkpoint", type=str, default=None, - help="Model whose chat template to use when the primary tokenizer has no chat_template (e.g. for base models)", + help="Separate tokenizer to use when the primary model has no chat_template (e.g. for base models)", ) parser.add_argument( "--skip-attack", @@ -238,7 +246,7 @@ def main() -> None: output_results_dir=args.output_results_dir, eval_split=args.eval_split, random_seed=args.random_seed, - chat_template_model=args.chat_template_model, + tokenizer_checkpoint=args.tokenizer_checkpoint, skip_attack=args.skip_attack, ) diff --git a/scripts/user/tomtseng/mmlu_pro_chat_260211/run_one.sh b/scripts/user/tomtseng/mmlu_pro_chat_260211/run_one.sh index 99a0c774..ba03f4fc 100644 --- a/scripts/user/tomtseng/mmlu_pro_chat_260211/run_one.sh +++ b/scripts/user/tomtseng/mmlu_pro_chat_260211/run_one.sh @@ -63,7 +63,7 @@ declare -A MODEL_PATHS=( ["qwen3_8b_base"]="Qwen/Qwen3-8B-Base" ) -declare -A CHAT_TEMPLATE_MODELS=( +declare -A TOKENIZER_CHECKPOINTS=( ["llama3_1b_base"]="meta-llama/Llama-3.2-1B-Instruct" ["llama3_3b_base"]="meta-llama/Llama-3.2-3B-Instruct" ["llama3_8b_base"]="meta-llama/Llama-3.1-8B-Instruct" @@ -101,8 +101,8 @@ CMD=( --source-results-dir "${SOURCE_RESULTS_DIR}" --output-results-dir "${OUTPUT_RESULTS_DIR}" ) -if [[ -n "${CHAT_TEMPLATE_MODELS[$MODEL_ALIAS]+x}" ]]; then - CMD+=(--chat-template-model "${CHAT_TEMPLATE_MODELS[$MODEL_ALIAS]}") +if [[ -n "${TOKENIZER_CHECKPOINTS[$MODEL_ALIAS]+x}" ]]; then + CMD+=(--tokenizer-checkpoint "${TOKENIZER_CHECKPOINTS[$MODEL_ALIAS]}") fi if $SKIP_ATTACK; then CMD+=(--skip-attack) diff --git a/scripts/user/tomtseng/mmlu_pro_chat_260211/submit_all.sh b/scripts/user/tomtseng/mmlu_pro_chat_260211/submit_all.sh index fe0943bc..dcd13334 100755 --- a/scripts/user/tomtseng/mmlu_pro_chat_260211/submit_all.sh +++ b/scripts/user/tomtseng/mmlu_pro_chat_260211/submit_all.sh @@ -52,10 +52,10 @@ declare -A MODEL_PATHS=( ["qwen3_8b_base"]="Qwen/Qwen3-8B-Base" ) -# Base model alias -> instruct model with chat template. +# Base model alias -> instruct model tokenizer to use as fallback. # Used to provide a chat template for base models that don't have one. # Qwen3 base models already have a chat template -declare -A CHAT_TEMPLATE_MODELS=( +declare -A TOKENIZER_CHECKPOINTS=( ["llama3_1b_base"]="meta-llama/Llama-3.2-1B-Instruct" ["llama3_3b_base"]="meta-llama/Llama-3.2-3B-Instruct" ["llama3_8b_base"]="meta-llama/Llama-3.1-8B-Instruct" @@ -111,10 +111,10 @@ ${UV} run ${SCRIPT} \"${model_path}\" \\ --source-results-dir \"${SOURCE_RESULTS_DIR}\" \\ --output-results-dir \"${OUTPUT_RESULTS_DIR}\"" - # Append --chat-template-model for base models - if [[ -n "${CHAT_TEMPLATE_MODELS[$model_alias]+x}" ]]; then + # Append --tokenizer-checkpoint for base models + if [[ -n "${TOKENIZER_CHECKPOINTS[$model_alias]+x}" ]]; then slurm_script+=" \\ - --chat-template-model \"${CHAT_TEMPLATE_MODELS[$model_alias]}\"" + --tokenizer-checkpoint \"${TOKENIZER_CHECKPOINTS[$model_alias]}\"" fi if $SKIP_ATTACK; then diff --git a/scripts/validate_references.py b/scripts/validate_references.py index 3bc9d8c6..ef604b36 100644 --- a/scripts/validate_references.py +++ b/scripts/validate_references.py @@ -22,6 +22,7 @@ from tamperbench.whitebox.evals.wmdp.wmdp import WMDPEvaluation, WMDPEvaluationConfig from tamperbench.whitebox.evals.xstest.xstest import XSTestEvaluation, XSTestEvaluationConfig from tamperbench.whitebox.utils.models.config import ModelConfig +from tamperbench.whitebox.utils.names import TemplateName if TYPE_CHECKING: from pandera.typing.polars import DataFrame @@ -151,6 +152,7 @@ def _run_wmdp_validation( out_dir=out_dir, batch_size=batch_size, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="", assistant_prefix="", end_turn="", @@ -177,6 +179,7 @@ def _run_xstest_validation( out_dir=out_dir, batch_size=batch_size, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="", assistant_prefix="", end_turn="", @@ -203,6 +206,7 @@ def _run_strong_reject_validation( out_dir=out_dir, batch_size=batch_size, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="", assistant_prefix="", end_turn="", @@ -228,6 +232,7 @@ def _run_policy_eval_validation( out_dir=out_dir, batch_size=batch_size, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="", assistant_prefix="", end_turn="", @@ -253,6 +258,7 @@ def _run_mt_bench_validation( out_dir=out_dir, batch_size=batch_size, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="", assistant_prefix="", end_turn="", diff --git a/src/tamperbench/whitebox/defenses/crl/datasets.py b/src/tamperbench/whitebox/defenses/crl/datasets.py index 7ccb3070..3d1ef9a0 100644 --- a/src/tamperbench/whitebox/defenses/crl/datasets.py +++ b/src/tamperbench/whitebox/defenses/crl/datasets.py @@ -6,6 +6,8 @@ import torch from transformers import PreTrainedTokenizer +from tamperbench.whitebox.utils.models.chat_format import apply_chat_template_with_fallback + def format_with_chat_template( tokenizer: PreTrainedTokenizer, @@ -29,41 +31,15 @@ def format_with_chat_template( dict with input_ids, attention_mask, and response_mask tensors. """ # Format request (user message only, with generation prompt to get assistant header) - request_messages = [{"role": "user", "content": user_content}] - if tokenizer.chat_template is not None: - request_text = cast( - str, - tokenizer.apply_chat_template( - request_messages, - tokenize=False, - add_generation_prompt=True, # Adds assistant header - ), - ) - else: - # NOTE: This fallback path has a known BPE boundary sensitivity issue -- - # the response_mask may be off by 1-2 tokens because BPE tokenization at - # the "Assistant: " / response boundary can differ when tokenized separately - # vs. as part of the full text. The primary code path (chat templates) uses - # special tokens as boundaries and is not affected. All target models - # (Llama 2/3, Qwen) have chat templates. - request_text = f"User: {user_content}\nAssistant: " + request_messages: list[dict[str, str]] = [{"role": "user", "content": user_content}] + request_text = apply_chat_template_with_fallback(request_messages, tokenizer, add_generation_prompt=True) # Format full conversation - full_messages = [ + full_messages: list[dict[str, str]] = [ {"role": "user", "content": user_content}, {"role": "assistant", "content": assistant_content}, ] - if tokenizer.chat_template is not None: - full_text = cast( - str, - tokenizer.apply_chat_template( - full_messages, - tokenize=False, - add_generation_prompt=False, - ), - ) - else: - full_text = f"User: {user_content}\nAssistant: {assistant_content}" + full_text = apply_chat_template_with_fallback(full_messages, tokenizer, add_generation_prompt=False) # Tokenize request to find where response starts request_tokens = tokenizer( diff --git a/src/tamperbench/whitebox/evals/base.py b/src/tamperbench/whitebox/evals/base.py index 38268db5..d646db86 100644 --- a/src/tamperbench/whitebox/evals/base.py +++ b/src/tamperbench/whitebox/evals/base.py @@ -26,6 +26,7 @@ OptimizationDirection, ) from tamperbench.whitebox.utils.models.config import ModelConfig +from tamperbench.whitebox.utils.models.templates import configure_tokenizer_template C = TypeVar(name="C", bound="WhiteBoxEvaluationConfig") @@ -208,11 +209,15 @@ def load_result_objective(cls, results: DataFrame[EvaluationSchema]) -> float: def load_tokenizer(eval_config: WhiteBoxEvaluationConfig) -> PreTrainedTokenizer: """Load HuggingFace tokenizer from eval config. + Automatically configures the tokenizer's chat_template based on + `eval_config.model_config.template_name` so that downstream code + can use `tokenizer.apply_chat_template()` directly. + Args: eval_config: Evaluation configuration with model checkpoint and settings. Returns: - HuggingFace tokenizer. + HuggingFace tokenizer with chat_template configured. """ tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path=str( @@ -223,6 +228,7 @@ def load_tokenizer(eval_config: WhiteBoxEvaluationConfig) -> PreTrainedTokenizer trust_remote_code=True, ) tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token + configure_tokenizer_template(tokenizer, eval_config.model_config.template_name) return tokenizer diff --git a/src/tamperbench/whitebox/evals/embedding_attack/softopt.py b/src/tamperbench/whitebox/evals/embedding_attack/softopt.py index dba31e3a..7448df3d 100644 --- a/src/tamperbench/whitebox/evals/embedding_attack/softopt.py +++ b/src/tamperbench/whitebox/evals/embedding_attack/softopt.py @@ -14,63 +14,6 @@ format="%(asctime)s - %(levelname)s - %(message)s", ) -from transformers import PreTrainedTokenizerBase - - -def ensure_default_chat_template( - tokenizer: PreTrainedTokenizerBase, - *, - user_prefix: str, - assistant_prefix: str, - end_turn: str, - system_prefix: str = "", -) -> None: - """Attach a simple chat template if the tokenizer doesn't already have one.""" - # If a template already exists and works, don't override it. - try: - if getattr(tokenizer, "chat_template", None): - tokenizer.apply_chat_template( - [{"role": "user", "content": "ping"}], - tokenize=False, - add_generation_prompt=True, - ) - return - except Exception: - # Existing template is broken; we'll set our own below. - pass - - # Minimal, prefix-based template. - # Renders (optionally) a BOS token, then each message with the right prefix and end_turn, - # and if add_generation_prompt=True it appends the assistant prefix for generation. - template = ( - "{% if bos_token %}{{ bos_token }}{% endif %}" - "{% for message in messages %}" - "{% if message['role'] == 'system' %}" - + system_prefix - + "{{ message['content'] }}" - + end_turn - + "{% elif message['role'] == 'user' %}" - + user_prefix - + "{{ message['content'] }}" - + end_turn - + "{% elif message['role'] == 'assistant' %}" - + assistant_prefix - + "{{ message['content'] }}" - + end_turn - + "{% else %}" - # Fallback for unknown roles: treat like user - + user_prefix - + "{{ message['content'] }}" - + end_turn - + "{% endif %}" - "{% endfor %}" - "{% if add_generation_prompt %}" + assistant_prefix + "{% endif %}" - "{% if eos_token and not add_generation_prompt %}{{ eos_token }}{% endif %}" - ) - tokenizer.chat_template = template - - return - @dataclass class SoftOptConfig: @@ -149,17 +92,8 @@ def run_soft_opt( if not any(["{optim_str}" in d["content"] for d in input_messages]): input_messages[-1]["content"] = input_messages[-1]["content"] + "{optim_str}" - user_prefix = "User: " - assistant_prefix = "Assistant: " - end_turn = "\n" - - ensure_default_chat_template( - tokenizer, - user_prefix=user_prefix, - assistant_prefix=assistant_prefix, - end_turn=end_turn, - ) - + # The tokenizer's chat_template should already be configured by the caller + # (e.g. via configure_tokenizer_template in load_hf_model_and_tokenizer). template = tokenizer.apply_chat_template( input_messages, tokenize=False, add_generation_prompt=True ) diff --git a/src/tamperbench/whitebox/evals/gcg/implementation.py b/src/tamperbench/whitebox/evals/gcg/implementation.py index 21a72ffd..5b10a965 100644 --- a/src/tamperbench/whitebox/evals/gcg/implementation.py +++ b/src/tamperbench/whitebox/evals/gcg/implementation.py @@ -447,12 +447,6 @@ def __init__( if model.device == torch.device("cpu"): logger.warning("Model is on the CPU. Use a hardware accelerator for faster optimization.") - if not tokenizer.chat_template: - logger.warning( - "Tokenizer does not have a chat template. Assuming base model and setting chat template to empty." - ) - tokenizer.chat_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}" - def run( self, messages: str | list[dict[str, str]], diff --git a/src/tamperbench/whitebox/evals/ifeval/ifeval.py b/src/tamperbench/whitebox/evals/ifeval/ifeval.py index 22e686b9..08deeade 100644 --- a/src/tamperbench/whitebox/evals/ifeval/ifeval.py +++ b/src/tamperbench/whitebox/evals/ifeval/ifeval.py @@ -44,6 +44,7 @@ ) from tamperbench.whitebox.evals.registry import register_evaluation from tamperbench.whitebox.utils import EvalName, MetricName, OptimizationDirection +from tamperbench.whitebox.utils.models.templates import configure_tokenizer_template from tamperbench.whitebox.utils.ops import run_in_isolation multiprocessing.set_start_method("spawn", force=True) @@ -213,6 +214,7 @@ def _instantiate_model_and_infer( # Load tokenizer and apply chat template tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True) + configure_tokenizer_template(tokenizer, eval_config.model_config.template_name) # Format prompts using tokenizer's chat template formatted_prompts = [] diff --git a/src/tamperbench/whitebox/evals/mbpp/mbpp.py b/src/tamperbench/whitebox/evals/mbpp/mbpp.py index c5a4ff79..96bd73e0 100644 --- a/src/tamperbench/whitebox/evals/mbpp/mbpp.py +++ b/src/tamperbench/whitebox/evals/mbpp/mbpp.py @@ -251,11 +251,6 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: return InferenceSchema.validate(payload) - def _format_with_chat_template(self, prompt: str) -> str: - """Apply chat template to prompt.""" - model_config = self.eval_config.model_config - return f"{model_config.user_prefix}{prompt}{model_config.end_turn}{model_config.assistant_prefix}" - @override def compute_scores(self, inferences: DataFrame[InferenceSchema]) -> DataFrame[ScoreSchema]: """Execute generated code and compute pass/fail scores.""" diff --git a/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py b/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py index 374ff443..7c8a29cc 100644 --- a/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py +++ b/src/tamperbench/whitebox/evals/mmlu_pro/mmlu_pro.py @@ -35,6 +35,8 @@ MetricName, OptimizationDirection, ) +from tamperbench.whitebox.utils.models.templates import configure_tokenizer_template +from tamperbench.whitebox.utils.names import TemplateName from tamperbench.whitebox.utils.ops import run_in_isolation multiprocessing.set_start_method("spawn", force=True) @@ -58,18 +60,9 @@ class MMLUProEvaluationConfig(WhiteBoxEvaluationConfig): Attributes: n_shots: Number of few-shot CoT examples per question. Set to 0 for zero-shot. - use_chat_template: If True, format prompts as multi-turn chat messages using - tokenizer.apply_chat_template(). If False, use raw text continuation format. - chat_template_model: HuggingFace model ID to load a chat template from when the - primary tokenizer has no chat_template. Useful for evaluating base models - with their corresponding instruct model's chat template (e.g. using - ``meta-llama/Llama-3.1-8B-Instruct``'s template for - ``meta-llama/Meta-Llama-3-8B``). Only used when ``use_chat_template=True``. """ n_shots: int = N_SHOTS - use_chat_template: bool = False - chat_template_model: str | None = None def __post_init__(self) -> None: """Validate configuration values.""" @@ -271,24 +264,10 @@ def compute_mmlu_inferences( pretrained_model_name_or_path=(eval_config.model_config.tokenizer_checkpoint or eval_config.model_checkpoint), trust_remote_code=True, ) + configure_tokenizer_template(hf_tokenizer, eval_config.model_config.template_name) + + use_chat = eval_config.model_config.template_name != TemplateName.PLAIN - if eval_config.use_chat_template: - if not hf_tokenizer.chat_template: - if eval_config.chat_template_model: - fallback_tokenizer = AutoTokenizer.from_pretrained( - eval_config.chat_template_model, trust_remote_code=True - ) - hf_tokenizer.chat_template = fallback_tokenizer.chat_template - else: - raise ValueError( - f"Tokenizer for {eval_config.model_checkpoint} has no chat_template and " - f"no chat_template_model was provided." - ) - elif eval_config.chat_template_model: - raise ValueError( - f"chat_template_model={eval_config.chat_template_model!r} was specified but the tokenizer for " - f"{eval_config.model_checkpoint} already has a chat_template." - ) # vllm tensor_parallel_size must be a factor of 64 (model hidden dimensions). # TODO: Handle scaling beyond 4 GPUs properly by validating against model config. llm_kwargs = { @@ -323,7 +302,7 @@ def compute_mmlu_inferences( k = eval_config.n_shots prompt: str | None = None while True: - if eval_config.use_chat_template: + if use_chat: messages = mmlu_api.generate_chat_messages(shots, row, max(k, 0)) prompt = hf_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) else: diff --git a/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py b/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py index 235529fb..fe956da4 100644 --- a/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py +++ b/src/tamperbench/whitebox/evals/mt_bench/mt_bench.py @@ -45,6 +45,7 @@ OptimizationDirection, dealloc_model_and_tokenizer, ) +from tamperbench.whitebox.utils.models.chat_format import apply_chat_template_with_fallback # Reference: Multiple sources for MT-Bench scores # Primary: "Judging LLM-as-a-Judge" (Zheng et al., NeurIPS 2023) @@ -217,32 +218,6 @@ def __init__(self, eval_config: MTBenchEvaluationConfig) -> None: self.openai_client: OpenAI = OpenAI() self.questions: list[MTBenchQuestion] = load_mt_bench_questions_by_category(eval_config.categories) - def _format_prompt(self, messages: list[dict[str, str]], tokenizer: PreTrainedTokenizer) -> str: - """Format messages into a prompt string using the tokenizer's chat template. - - Args: - messages: List of message dicts with 'role' and 'content'. - tokenizer: The tokenizer to use for formatting. - - Returns: - str: Formatted prompt string. - """ - if hasattr(tokenizer, "apply_chat_template"): - result = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - assert isinstance(result, str) - return result - # Fallback for models without chat template - prompt = "" - for msg in messages: - if msg["role"] == "system": - prompt += f"System: {msg['content']}\n\n" - elif msg["role"] == "user": - prompt += f"User: {msg['content']}\n\n" - elif msg["role"] == "assistant": - prompt += f"Assistant: {msg['content']}\n\n" - prompt += "Assistant: " - return prompt - def _generate_response( self, prompt: str, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, max_new_tokens: int = 512 ) -> str: @@ -300,7 +275,7 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: messages_t1.append({"role": "user", "content": question.turns[0]}) # Generate turn 1 response - prompt_t1 = self._format_prompt(messages_t1, tokenizer) + prompt_t1 = apply_chat_template_with_fallback(messages_t1, tokenizer) response_t1 = self._generate_response( prompt_t1, model, tokenizer, self.eval_config.model_config.max_generation_length ) @@ -313,7 +288,7 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: ] # Generate turn 2 response - prompt_t2 = self._format_prompt(messages_t2, tokenizer) + prompt_t2 = apply_chat_template_with_fallback(messages_t2, tokenizer) response_t2 = self._generate_response( prompt_t2, model, tokenizer, self.eval_config.model_config.max_generation_length ) diff --git a/src/tamperbench/whitebox/evals/policy_eval/policy_eval.py b/src/tamperbench/whitebox/evals/policy_eval/policy_eval.py index a8b5d289..693aae1f 100644 --- a/src/tamperbench/whitebox/evals/policy_eval/policy_eval.py +++ b/src/tamperbench/whitebox/evals/policy_eval/policy_eval.py @@ -48,7 +48,6 @@ from tamperbench.whitebox.evals.reference import ReferenceScore, ReferenceScores from tamperbench.whitebox.evals.registry import register_evaluation from tamperbench.whitebox.evals.utils import ( - format_chat_prompt, generate, llm_judge_score, ) @@ -57,6 +56,7 @@ MetricName, OptimizationDirection, ) +from tamperbench.whitebox.utils.models.chat_format import format_chat_prompt logger = logging.getLogger(__name__) diff --git a/src/tamperbench/whitebox/evals/prompt_attack/prompt_attack.py b/src/tamperbench/whitebox/evals/prompt_attack/prompt_attack.py index 9dee500a..255e43e3 100644 --- a/src/tamperbench/whitebox/evals/prompt_attack/prompt_attack.py +++ b/src/tamperbench/whitebox/evals/prompt_attack/prompt_attack.py @@ -6,6 +6,7 @@ from pandera.typing.polars import DataFrame from typing_extensions import override +from tamperbench.whitebox.evals.base import load_tokenizer from tamperbench.whitebox.evals.output_schema import InferenceSchema from tamperbench.whitebox.evals.prompt_attack.dictionary import templates from tamperbench.whitebox.evals.registry import register_evaluation @@ -14,6 +15,7 @@ StrongRejectEvaluationConfig, instantiate_model_and_infer, ) +from tamperbench.whitebox.utils.models.chat_format import format_chat_prompt from tamperbench.whitebox.utils.names import EvalName from tamperbench.whitebox.utils.ops import run_in_isolation @@ -39,10 +41,11 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: then replaces the prompt column with the original forbidden prompts so that StrongReject scoring receives the correct inputs. """ - prompts: list[str] = self.load_strong_reject_prompts() + tokenizer = load_tokenizer(self.eval_config) + raw_prompts: list[str] = self.load_strong_reject_prompts() prompt_prefix = templates[self.eval_config.jailbreak_prompt] - jailbreak_prompts = [f"{prompt_prefix}{prompt}" for prompt in prompts] + jailbreak_prompts = [format_chat_prompt(f"{prompt_prefix}{prompt}", tokenizer) for prompt in raw_prompts] payload: pl.DataFrame = run_in_isolation( target=instantiate_model_and_infer, @@ -51,5 +54,5 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: ) # Replace jailbreak prompts with original forbidden prompts for StrongReject scoring - payload = payload.with_columns(pl.Series(InferenceSchema.prompt, prompts)) + payload = payload.with_columns(pl.Series(InferenceSchema.prompt, raw_prompts)) return InferenceSchema.validate(payload) diff --git a/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py b/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py index e3f72e9f..baedbe99 100644 --- a/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py +++ b/src/tamperbench/whitebox/evals/safety_gap/safety_gap.py @@ -26,6 +26,7 @@ OptimizationDirection, ) from tamperbench.whitebox.utils.datasets import get_dataset +from tamperbench.whitebox.utils.models.chat_format import format_chat_prompt class SafetyGapDomain(str, Enum): @@ -90,14 +91,10 @@ def _load_prompts_with_domains(self) -> list[dict[str, str]]: @override def compute_inferences(self) -> DataFrame[InferenceSchema]: """Run model on SafetyGap dataset to compute inferences.""" - model, _ = load_vllm_model_and_tokenizer(self.eval_config) + model, tokenizer = load_vllm_model_and_tokenizer(self.eval_config) prompts_with_domains = self._load_prompts_with_domains() - # Prompts are passed raw (no chat template formatting). The SafetyGap paper - # evaluates propensity using raw completion prompts. For chat-tuned models this - # measures base propensity without chat-level safety guardrails, which is the - # intended behavior for this benchmark. - prompts = [p["question"] for p in prompts_with_domains] + prompts = [format_chat_prompt(p["question"], tokenizer) for p in prompts_with_domains] domains = [p["domain"] for p in prompts_with_domains] responses = generate( diff --git a/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py b/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py index c7084d14..58f565c8 100644 --- a/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py +++ b/src/tamperbench/whitebox/evals/strong_reject/strong_reject.py @@ -18,7 +18,7 @@ from typing_extensions import override from vllm import LLM, SamplingParams -from tamperbench.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig +from tamperbench.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig, load_tokenizer from tamperbench.whitebox.evals.output_schema import ( EvaluationSchema, InferenceSchema, @@ -30,6 +30,7 @@ MetricName, OptimizationDirection, ) +from tamperbench.whitebox.utils.models.chat_format import format_chat_prompt from tamperbench.whitebox.utils.ops import run_in_isolation S = TypeVar(name="S", bound="StrongRejectEvaluationConfig") @@ -64,14 +65,19 @@ def compute_inferences(self) -> DataFrame[InferenceSchema]: Returns: DataFrame[InferenceSchema]: A polars DataFrame with the prompts to LLM and corresponding responses. """ - prompts: list[str] = self.load_strong_reject_prompts() + tokenizer = load_tokenizer(self.eval_config) + raw_prompts: list[str] = self.load_strong_reject_prompts() + formatted_prompts = [format_chat_prompt(p, tokenizer) for p in raw_prompts] payload: pl.DataFrame = run_in_isolation( target=instantiate_model_and_infer, - args=(self.eval_config, prompts), + args=(self.eval_config, formatted_prompts), error_context="StrongReject inference", ) + # Store unformatted prompts for downstream StrongReject scoring, + # which expects raw forbidden prompts, not chat-formatted text. + payload = payload.with_columns(pl.Series(InferenceSchema.prompt, raw_prompts)) return InferenceSchema.validate(payload) @override @@ -121,10 +127,11 @@ def compute_results(self, scores: DataFrame[ScoreSchema]) -> DataFrame[Evaluatio return EvaluationSchema.validate(_metrics_dataframe) def load_strong_reject_prompts(self) -> list[str]: - """Load the StrongReject dataset into an Arrow Dataset, and then return prompts. + """Load the StrongReject dataset and return raw prompts. Returns: - list[str]: A list of prompts from the StrongReject dataset to input to the model to obtain inferences. + list[str]: A list of raw prompts from the StrongReject dataset. + Chat formatting is applied later in `compute_inferences`. """ strong_reject_dataset: ArrowDataset = load_strong_reject_datasets.load_strongreject() if self.eval_config.max_samples is not None: @@ -142,23 +149,17 @@ class JailbreakBenchEvaluation(StrongRejectEvaluation[S]): @override def load_strong_reject_prompts(self) -> list[str]: - """Load the JailbreakBench JBB-Behaviors dataset and return prompts. + """Load the JailbreakBench JBB-Behaviors dataset and return raw prompts. Returns: - list[str]: A list of prompts from the JailbreakBench dataset to input to the model to obtain inferences. + list[str]: A list of raw prompts from the JailbreakBench dataset. + Chat formatting is applied later in `compute_inferences`. """ jbb_dataset = datasets.load_dataset( # pyright: ignore[reportReturnType] path="JailbreakBench/JBB-Behaviors", name="behaviors", split="harmful" ) - user_prefix = self.eval_config.model_config.user_prefix - assistant_prefix = self.eval_config.model_config.assistant_prefix - end_turn = self.eval_config.model_config.end_turn - - def to_text(data_row: dict[str, str]) -> str: - return f"{user_prefix}{data_row['Goal']}{end_turn}{assistant_prefix}" - - return [to_text(row) for row in jbb_dataset] # pyright: ignore[reportCallIssue] + return [row["Goal"] for row in jbb_dataset] # pyright: ignore[reportCallIssue] @override def compute_results(self, scores: DataFrame[ScoreSchema]) -> DataFrame[EvaluationSchema]: diff --git a/src/tamperbench/whitebox/evals/strong_reject/strong_reject_jailbreak_tune.py b/src/tamperbench/whitebox/evals/strong_reject/strong_reject_jailbreak_tune.py index 2873ae0c..1368d0b5 100644 --- a/src/tamperbench/whitebox/evals/strong_reject/strong_reject_jailbreak_tune.py +++ b/src/tamperbench/whitebox/evals/strong_reject/strong_reject_jailbreak_tune.py @@ -33,10 +33,11 @@ class SRJailbreakTuneEvaluation(StrongRejectEvaluation[SRJailbreakTuneEvaluation @override def load_strong_reject_prompts(self) -> list[str]: - """Load the StrongReject dataset into an Arrow Dataset, and then return prompts. + """Load the StrongReject dataset and apply jailbreak injections. Returns: - list[str]: A list of prompts from the StrongReject dataset to input to the model to obtain inferences. + list[str]: A list of jailbreak-wrapped prompts from the StrongReject dataset. + Chat formatting is applied later in `compute_inferences`. """ strong_reject_prompts = super().load_strong_reject_prompts() @@ -54,10 +55,11 @@ class JBJailbreakTuneEvaluation(JailbreakBenchEvaluation[SRJailbreakTuneEvaluati @override def load_strong_reject_prompts(self) -> list[str]: - """Load the JailbreakBench dataset, apply jailbreak injections, and return prompts. + """Load the JailbreakBench dataset and apply jailbreak injections. Returns: list[str]: A list of jailbreak-wrapped prompts from the JailbreakBench dataset. + Chat formatting is applied later in ``compute_inferences``. """ strong_reject_prompts = super().load_strong_reject_prompts() diff --git a/src/tamperbench/whitebox/evals/utils.py b/src/tamperbench/whitebox/evals/utils.py index 876cc1dc..08ab2e28 100644 --- a/src/tamperbench/whitebox/evals/utils.py +++ b/src/tamperbench/whitebox/evals/utils.py @@ -1,7 +1,6 @@ """Shared utility functions for evaluations. This module provides reusable functions for common evaluation patterns: -- `format_chat_prompt`: Format messages for chat models with fallback - `generate`: High-throughput generation with vLLM (sends all prompts at once) - `generate_single`: Sequential generation for cases requiring dependencies between calls - `llm_judge_score`: Score responses using an external LLM judge @@ -22,7 +21,6 @@ from pandera.typing.polars import DataFrame from strong_reject.evaluate import strongreject_finetuned as strong_reject_finetuned from tqdm.contrib.concurrent import thread_map -from transformers import PreTrainedTokenizer from vllm import LLM, SamplingParams from tamperbench.whitebox.evals.output_schema import InferenceSchema, ScoreSchema @@ -30,49 +28,6 @@ logger = logging.getLogger(__name__) -def format_chat_prompt( - user_content: str, - tokenizer: PreTrainedTokenizer, - system_prompt: str | None = None, - history: list[dict[str, str]] | None = None, -) -> str: - """Format messages for chat models with automatic fallback. - - Tries to use the tokenizer's chat_template if available, otherwise - falls back to a simple text format. - - Args: - user_content: The user's message content. - tokenizer: The tokenizer to use for formatting. - system_prompt: Optional system prompt to prepend. - history: Optional conversation history (list of {"role": ..., "content": ...}). - - Returns: - Formatted prompt string ready for model input. - """ - messages: list[dict[str, str]] = [] - - if system_prompt: - messages.append({"role": "system", "content": system_prompt}) - - if history: - messages.extend(history) - - messages.append({"role": "user", "content": user_content}) - - # Try chat template first - if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template: - return cast(str, tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)) - - # Fallback for models without chat template - parts = [] - for msg in messages: - role = msg["role"].title() - parts.append(f"{role}: {msg['content']}") - - return "\n\n".join(parts) + "\n\nAssistant:" - - def llm_judge_score( inferences: DataFrame[InferenceSchema], prompt_template: str, diff --git a/src/tamperbench/whitebox/evals/xstest/xstest.py b/src/tamperbench/whitebox/evals/xstest/xstest.py index 3cab7f6e..f35268cc 100644 --- a/src/tamperbench/whitebox/evals/xstest/xstest.py +++ b/src/tamperbench/whitebox/evals/xstest/xstest.py @@ -31,7 +31,6 @@ from tamperbench.whitebox.evals.reference import ReferenceScore, ReferenceScores from tamperbench.whitebox.evals.registry import register_evaluation from tamperbench.whitebox.evals.utils import ( - format_chat_prompt, generate, llm_judge_score, ) @@ -41,6 +40,7 @@ OptimizationDirection, ) from tamperbench.whitebox.utils.datasets import get_dataset +from tamperbench.whitebox.utils.models.chat_format import format_chat_prompt logger = logging.getLogger(__name__) diff --git a/src/tamperbench/whitebox/utils/models/chat_format.py b/src/tamperbench/whitebox/utils/models/chat_format.py new file mode 100644 index 00000000..e07a62ec --- /dev/null +++ b/src/tamperbench/whitebox/utils/models/chat_format.py @@ -0,0 +1,87 @@ +"""Unified chat prompt formatting utilities.""" + +from typing import cast + +from transformers import PreTrainedTokenizer + + +def format_chat_prompt( + user_content: str, + tokenizer: PreTrainedTokenizer, + system_prompt: str | None = None, + history: list[dict[str, str]] | None = None, +) -> str: + r"""Format messages for chat models with automatic fallback. + + Builds a messages list and applies the tokenizer's chat template. + If the tokenizer has no chat template, falls back to a simple + `"User: ...\n\nAssistant:"` text format. + + Args: + user_content: The user's message content. + tokenizer: The tokenizer to use for formatting. + system_prompt: Optional system prompt to prepend. + history: Optional conversation history + (list of `{"role": ..., "content": ...}`). + + Returns: + Formatted prompt string ready for model input. + """ + messages: list[dict[str, str]] = [] + + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + if history: + messages.extend(history) + + messages.append({"role": "user", "content": user_content}) + + return apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=True) + + +def apply_chat_template_with_fallback( + messages: list[dict[str, str]], + tokenizer: PreTrainedTokenizer, + add_generation_prompt: bool = True, +) -> str: + """Apply `tokenizer.apply_chat_template()` with a unified plain-text fallback. + + Use this when you already have a messages list and need formatting with a + consistent fallback for tokenizers that lack a chat template. + + Args: + messages: List of `{"role": ..., "content": ...}` dicts. + tokenizer: HuggingFace tokenizer. + add_generation_prompt: Whether to append the assistant header at the + end (same semantics as `apply_chat_template`). + + Returns: + Formatted prompt string. + """ + if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template: + return cast( + str, + tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=add_generation_prompt, + ), + ) + + # Fallback for tokenizers without a chat template. On the eval path, + # configure_tokenizer_template() always sets a template, so this is only + # reachable from code that bypasses that setup (e.g. defense code). + + parts: list[str] = [] + for msg in messages: + if msg["role"] == "system": + parts.append(msg["content"]) + else: + role = msg["role"].title() + parts.append(f"{role}: {msg['content']}") + + result = "\n\n".join(parts) + if add_generation_prompt: + result += "\n\nAssistant:" + return result diff --git a/src/tamperbench/whitebox/utils/models/config.py b/src/tamperbench/whitebox/utils/models/config.py index 8987c05e..a43a1d06 100644 --- a/src/tamperbench/whitebox/utils/models/config.py +++ b/src/tamperbench/whitebox/utils/models/config.py @@ -6,6 +6,7 @@ from typing_extensions import Self from tamperbench.whitebox.utils.models.templates import get_template +from tamperbench.whitebox.utils.names import TemplateName @dataclass @@ -13,6 +14,7 @@ class ModelConfig: """Configuration for model inference including chat templates and generation settings. Attributes: + template_name: The name of the chat template to use for formatting. user_prefix: The prefix to append to input prompts (that would be from a user). Similar to instruction template. assistant_prefix: The prefix to append to the output prompts. This would mark the portion of a model @@ -24,6 +26,7 @@ class ModelConfig: tokenizer_checkpoint: Optional path to a separate tokenizer checkpoint. If None, uses model_checkpoint for tokenizer. """ + template_name: TemplateName user_prefix: str assistant_prefix: str end_turn: str @@ -44,12 +47,19 @@ def from_dict(cls, data: dict[str, Any]) -> Self: data = dict(data) # shallow copy # Support a template registry to fill in prefixes if provided if "template" in data: - template_name = data.pop("template") - template = get_template(template_name) - - data.setdefault("user_prefix", template.user_prefix) - data.setdefault("assistant_prefix", template.assistant_prefix) - data.setdefault("end_turn", template.end_turn) + template_name_str = data.pop("template") + template_name = TemplateName(template_name_str) + data["template_name"] = template_name + + if template_name == TemplateName.NATIVE: + data.setdefault("user_prefix", "") + data.setdefault("assistant_prefix", "") + data.setdefault("end_turn", "") + else: + template = get_template(template_name) + data.setdefault("user_prefix", template.user_prefix) + data.setdefault("assistant_prefix", template.assistant_prefix) + data.setdefault("end_turn", template.end_turn) data.setdefault("tokenizer_checkpoint", None) diff --git a/src/tamperbench/whitebox/utils/models/templates.py b/src/tamperbench/whitebox/utils/models/templates.py index 268d082c..203ef3b3 100644 --- a/src/tamperbench/whitebox/utils/models/templates.py +++ b/src/tamperbench/whitebox/utils/models/templates.py @@ -4,10 +4,16 @@ hydrate the corresponding `user_prefix`, `assistant_prefix`, and `end_turn`. """ +from __future__ import annotations + from dataclasses import dataclass +from typing import TYPE_CHECKING from tamperbench.whitebox.utils.names import TemplateName +if TYPE_CHECKING: + from transformers import PreTrainedTokenizerBase + @dataclass(frozen=True) class TextTemplate: @@ -17,13 +23,47 @@ class TextTemplate: assistant_prefix: str end_turn: str + def to_jinja2(self) -> str: + """Convert this template to a Jinja2 chat template string. + + The generated template renders system/user/assistant messages with the + configured prefixes and end-of-turn delimiter. BOS tokens are not included + because the tokenizer handles those during tokenization. + + Returns: + A Jinja2 template string suitable for ``tokenizer.chat_template``. + """ + for field_name, value in [ + ("user_prefix", self.user_prefix), + ("assistant_prefix", self.assistant_prefix), + ("end_turn", self.end_turn), + ]: + for marker in ("{%", "{{", "}}", "%}"): + if marker in value: + raise ValueError( + f"TextTemplate.{field_name} contains Jinja2 syntax {marker!r} " + f"which would break the generated template: {value!r}" + ) + return ( + "{% for message in messages %}" + "{% if message['role'] == 'system' %}" + "{{ message['content'] }}" + + self.end_turn + + "{% elif message['role'] == 'user' %}" + + self.user_prefix + + "{{ message['content'] }}" + + self.end_turn + + "{% elif message['role'] == 'assistant' %}" + + self.assistant_prefix + + "{{ message['content'] }}" + + self.end_turn + + "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}" + self.assistant_prefix + "{% endif %}" + ) + TEMPLATE_REGISTRY: dict[TemplateName, TextTemplate] = { - TemplateName.LLAMA3: TextTemplate( - user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", - assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>", - ), TemplateName.INSTRUCTION_RESPONSE: TextTemplate( user_prefix="INSTRUCTION: ", assistant_prefix="RESPONSE: ", @@ -39,16 +79,6 @@ class TextTemplate: assistant_prefix="", end_turn="", ), - TemplateName.GPT_CHAT: TextTemplate( - user_prefix="<|user|> ", - assistant_prefix="<|assistant|> ", - end_turn="\n", - ), - TemplateName.QWEN: TextTemplate( - user_prefix="<|im_start|>user\n", - assistant_prefix="<|im_start|>assistant\n", - end_turn="<|im_end|>", - ), } @@ -65,9 +95,9 @@ def get_template(name: str | TemplateName) -> TextTemplate: KeyError: If the template name is not found in the registry. Example: - >>> template = get_template(TemplateName.LLAMA3) + >>> template = get_template(TemplateName.GENERIC_CHAT) >>> template.user_prefix - '<|start_header_id|>user<|end_header_id|>\n\n' + 'User: ' """ key = TemplateName(name) if not isinstance(name, TemplateName) else name @@ -75,3 +105,32 @@ def get_template(name: str | TemplateName) -> TextTemplate: return TEMPLATE_REGISTRY[key] except KeyError as e: raise KeyError(f"Unknown text template: {name}") from e + + +def configure_tokenizer_template( + tokenizer: PreTrainedTokenizerBase, + template_name: TemplateName, +) -> None: + """Set the tokenizer's chat_template based on the template name. + + For ``TemplateName.NATIVE``, the tokenizer's own template is preserved. + For all other template names, the tokenizer's chat_template is replaced + with the Jinja2 rendering of the corresponding ``TextTemplate``. + + This should be called once after loading the tokenizer so that all + downstream code can simply use ``tokenizer.apply_chat_template()``. + + Args: + tokenizer: HuggingFace tokenizer to configure. + template_name: Which chat template to apply. + + Raises: + ValueError: If ``template_name`` is ``NATIVE`` but the tokenizer + has no built-in chat_template. + """ + if template_name == TemplateName.NATIVE: + if not getattr(tokenizer, "chat_template", None): + raise ValueError("template='native' was specified but the tokenizer has no chat_template") + return + template = get_template(template_name) + tokenizer.chat_template = template.to_jinja2() diff --git a/src/tamperbench/whitebox/utils/names.py b/src/tamperbench/whitebox/utils/names.py index 131cd7a2..bde8abc7 100644 --- a/src/tamperbench/whitebox/utils/names.py +++ b/src/tamperbench/whitebox/utils/names.py @@ -224,9 +224,7 @@ class ConfigKeys(StrEnum): class TemplateName(StrEnum): """Names of text templates for chat-style formatting.""" - LLAMA3 = "llama3" INSTRUCTION_RESPONSE = "instruction_response" GENERIC_CHAT = "generic_chat" PLAIN = "plain" - GPT_CHAT = "gpt_chat" - QWEN = "qwen" + NATIVE = "native" diff --git a/tests/attacks/test_config_validation.py b/tests/attacks/test_config_validation.py index 1795f88d..24bb1259 100644 --- a/tests/attacks/test_config_validation.py +++ b/tests/attacks/test_config_validation.py @@ -22,9 +22,7 @@ def _make_base_dict() -> dict[str, Any]: "out_dir": "/tmp/test", "evals": [], "model_config": { - "user_prefix": "User: ", - "assistant_prefix": "Assistant: ", - "end_turn": "\n", + "template": "generic_chat", "max_generation_length": 16, "inference_batch_size": 1, }, diff --git a/tests/attacks/test_embedding_attack.py b/tests/attacks/test_embedding_attack.py index a4884de4..13c84380 100644 --- a/tests/attacks/test_embedding_attack.py +++ b/tests/attacks/test_embedding_attack.py @@ -9,7 +9,7 @@ from tamperbench.whitebox.evals.embedding_attack import SoftOptConfig from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -22,9 +22,10 @@ def test_embedding_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, ), diff --git a/tests/attacks/test_full_finetune.py b/tests/attacks/test_full_finetune.py index 40633eb1..81e3375e 100644 --- a/tests/attacks/test_full_finetune.py +++ b/tests/attacks/test_full_finetune.py @@ -11,7 +11,7 @@ ) from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -24,9 +24,10 @@ def test_full_finetune_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, ), diff --git a/tests/attacks/test_gcg_attack.py b/tests/attacks/test_gcg_attack.py index 52743867..60120690 100644 --- a/tests/attacks/test_gcg_attack.py +++ b/tests/attacks/test_gcg_attack.py @@ -14,7 +14,7 @@ from tamperbench.whitebox.evals.gcg.implementation import GCGConfig from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -27,9 +27,10 @@ def test_gcg_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=512, inference_batch_size=16, ), diff --git a/tests/attacks/test_jailbreak_finetune.py b/tests/attacks/test_jailbreak_finetune.py index e2575f69..a0cb4c4c 100644 --- a/tests/attacks/test_jailbreak_finetune.py +++ b/tests/attacks/test_jailbreak_finetune.py @@ -12,7 +12,7 @@ from tamperbench.whitebox.attacks.jailbreak_finetune.jailbreaks import JailbreakTuneType from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -25,9 +25,10 @@ def test_jailbreak_finetune_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, ), diff --git a/tests/attacks/test_lora_attack.py b/tests/attacks/test_lora_attack.py index 5114663d..77322e67 100644 --- a/tests/attacks/test_lora_attack.py +++ b/tests/attacks/test_lora_attack.py @@ -11,7 +11,7 @@ ) from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -24,9 +24,10 @@ def test_lora_finetune_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, ), @@ -63,9 +64,10 @@ def test_lora_finetune_cheap() -> None: input_checkpoint_path="EleutherAI/pythia-14m", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.GENERIC_CHAT, user_prefix="User: ", assistant_prefix="Assistant: ", - end_turn="\n\n", + end_turn="\n", max_generation_length=16, inference_batch_size=2, ), diff --git a/tests/attacks/test_prompt_attack.py b/tests/attacks/test_prompt_attack.py index ce7ca269..1cc271d0 100644 --- a/tests/attacks/test_prompt_attack.py +++ b/tests/attacks/test_prompt_attack.py @@ -15,7 +15,7 @@ ) from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import EvalName, MetricName +from tamperbench.whitebox.utils.names import EvalName, MetricName, TemplateName @pytest.mark.expensive @@ -28,9 +28,10 @@ def test_prompt_attack() -> None: input_checkpoint_path="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=512, inference_batch_size=16, ), diff --git a/tests/evals/test_embedding_attack_eval.py b/tests/evals/test_embedding_attack_eval.py index a935a81a..c96ce832 100644 --- a/tests/evals/test_embedding_attack_eval.py +++ b/tests/evals/test_embedding_attack_eval.py @@ -12,7 +12,7 @@ from tamperbench.whitebox.evals.embedding_attack import SoftOptConfig from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName @pytest.mark.expensive @@ -25,9 +25,10 @@ def test_embedding_attack_evaluation() -> None: model_checkpoint="lapisrocks/Llama-3-8B-Instruct-TAR-Refusal", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, tokenizer_checkpoint="meta-llama/Meta-Llama-3-8B-Instruct", diff --git a/tests/evals/test_format_chat_prompt.py b/tests/evals/test_format_chat_prompt.py new file mode 100644 index 00000000..196dd8b4 --- /dev/null +++ b/tests/evals/test_format_chat_prompt.py @@ -0,0 +1,112 @@ +"""Tests for chat formatting utilities in whitebox.utils.models.chat_format.""" + +from typing import Any +from unittest.mock import MagicMock + +from tamperbench.whitebox.utils.models.chat_format import ( + apply_chat_template_with_fallback, + format_chat_prompt, +) + + +def _make_tokenizer(chat_template: str | None = None) -> MagicMock: + """Create a mock tokenizer, optionally with a chat template.""" + tokenizer = MagicMock() + tokenizer.chat_template = chat_template + + if chat_template: + + def fake_apply( + messages: list[dict[str, str]], + *, + tokenize: bool = False, # pyright: ignore[reportUnusedParameter] + add_generation_prompt: bool = True, + ) -> str: + parts = [f"[{msg['role']}] {msg['content']}" for msg in messages] + result = " | ".join(parts) + if add_generation_prompt: + result += " | [assistant]" + return result + + tokenizer.apply_chat_template = fake_apply + else: + del tokenizer.apply_chat_template + + return tokenizer + + +# --- format_chat_prompt --- + + +def test_format_chat_prompt_with_history() -> None: + """format_chat_prompt includes conversation history.""" + tokenizer = _make_tokenizer(chat_template=None) + history: list[dict[str, str]] = [ + {"role": "user", "content": "Hi"}, + {"role": "assistant", "content": "Hello!"}, + ] + result = format_chat_prompt("How are you?", tokenizer, history=history) + assert result == "User: Hi\n\nAssistant: Hello!\n\nUser: How are you?\n\nAssistant:" + + +def test_format_chat_prompt_system_and_history_with_template() -> None: + """format_chat_prompt passes system, history, and user to template.""" + tokenizer = _make_tokenizer(chat_template="some_template") + history: list[dict[str, str]] = [ + {"role": "user", "content": "Hi"}, + {"role": "assistant", "content": "Hello!"}, + ] + result = format_chat_prompt("How are you?", tokenizer, system_prompt="Be helpful.", history=history) + assert result == "[system] Be helpful. | [user] Hi | [assistant] Hello! | [user] How are you? | [assistant]" + + +# --- apply_chat_template_with_fallback --- + + +def test_fallback_uses_tokenizer_template() -> None: + """apply_chat_template_with_fallback delegates to tokenizer when template exists.""" + tokenizer = _make_tokenizer(chat_template="some_template") + messages: list[dict[str, Any]] = [{"role": "user", "content": "Hello"}] + result = apply_chat_template_with_fallback(messages, tokenizer) + assert result == "[user] Hello | [assistant]" + + +def test_fallback_with_generation_prompt() -> None: + """Fallback appends 'Assistant:' when add_generation_prompt=True.""" + tokenizer = _make_tokenizer(chat_template=None) + messages: list[dict[str, Any]] = [{"role": "user", "content": "Hello"}] + result = apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=True) + assert result == "User: Hello\n\nAssistant:" + + +def test_fallback_without_generation_prompt() -> None: + """Fallback omits 'Assistant:' trailer when add_generation_prompt=False.""" + tokenizer = _make_tokenizer(chat_template=None) + messages: list[dict[str, Any]] = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there"}, + ] + result = apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=False) + assert result == "User: Hello\n\nAssistant: Hi there" + + +def test_fallback_tokenizer_template_no_generation_prompt() -> None: + """Tokenizer template called with add_generation_prompt=False.""" + tokenizer = _make_tokenizer(chat_template="some_template") + messages: list[dict[str, Any]] = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there"}, + ] + result = apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=False) + assert result == "[user] Hello | [assistant] Hi there" + + +def test_fallback_system_message() -> None: + """Fallback renders system messages as plain content.""" + tokenizer = _make_tokenizer(chat_template=None) + messages: list[dict[str, Any]] = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello"}, + ] + result = apply_chat_template_with_fallback(messages, tokenizer, add_generation_prompt=True) + assert result == "You are helpful.\n\nUser: Hello\n\nAssistant:" diff --git a/tests/evals/test_ifeval.py b/tests/evals/test_ifeval.py index fb372483..a465d0f5 100644 --- a/tests/evals/test_ifeval.py +++ b/tests/evals/test_ifeval.py @@ -12,7 +12,7 @@ from tamperbench.whitebox.evals.ifeval.ifeval import IFEvalEvaluationConfig from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName @pytest.mark.expensive @@ -25,9 +25,10 @@ def test_ifeval_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=512, inference_batch_size=4, ), @@ -37,6 +38,7 @@ def test_ifeval_evaluation() -> None: # model_checkpoint="Qwen/Qwen3-4B", # out_dir=tmpdirname, # model_config=ModelConfig( + # template_name=TemplateName.NATIVE, # user_prefix="<|im_start|>user\n", # assistant_prefix="<|im_start|>assistant\n", # end_turn="<|im_end|>", diff --git a/tests/evals/test_mbpp.py b/tests/evals/test_mbpp.py index 0424bd52..03f941aa 100644 --- a/tests/evals/test_mbpp.py +++ b/tests/evals/test_mbpp.py @@ -9,7 +9,7 @@ from tamperbench.whitebox.evals.mbpp.mbpp import MBPPEvaluationConfig from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName @pytest.mark.expensive @@ -22,9 +22,10 @@ def test_mbpp_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=512, inference_batch_size=8, ), diff --git a/tests/evals/test_minerva_math.py b/tests/evals/test_minerva_math.py index dd375069..cfc68ca3 100644 --- a/tests/evals/test_minerva_math.py +++ b/tests/evals/test_minerva_math.py @@ -11,7 +11,7 @@ ) from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName @pytest.mark.expensive @@ -24,9 +24,10 @@ def test_minerva_math_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=2048, inference_batch_size=4, ), diff --git a/tests/evals/test_mmlu_pro.py b/tests/evals/test_mmlu_pro.py index 09fb743b..846831c4 100644 --- a/tests/evals/test_mmlu_pro.py +++ b/tests/evals/test_mmlu_pro.py @@ -13,7 +13,7 @@ ) from tamperbench.whitebox.evals.output_schema import EvaluationSchema from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName SUBJECT = "biology" @@ -139,9 +139,10 @@ def test_mmlu_pro_evaluation() -> None: model_checkpoint="meta-llama/Llama-3.1-8B-Instruct", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.NATIVE, user_prefix="<|start_header_id|>user<|end_header_id|>\n\n", assistant_prefix="<|start_header_id|>assistant<|end_header_id|>\n\n", - end_turn="<|eot_id|>\n", + end_turn="<|eot_id|>", max_generation_length=1024, inference_batch_size=16, ), diff --git a/tests/evals/test_mt_bench.py b/tests/evals/test_mt_bench.py index ede0a8c6..de7c5d68 100644 --- a/tests/evals/test_mt_bench.py +++ b/tests/evals/test_mt_bench.py @@ -31,11 +31,11 @@ class _ModelSpec(TypedDict): }, "small": { "checkpoint": "Qwen/Qwen2.5-0.5B-Instruct", # ~0.5B, instruction-tuned - "template": TemplateName.QWEN, + "template": TemplateName.NATIVE, }, "medium": { "checkpoint": "Qwen/Qwen2.5-1.5B-Instruct", # ~1.5B, instruction-tuned - "template": TemplateName.QWEN, + "template": TemplateName.NATIVE, }, } diff --git a/tests/evals/test_strong_reject.py b/tests/evals/test_strong_reject.py index 6fdd8cbb..cedba4c7 100644 --- a/tests/evals/test_strong_reject.py +++ b/tests/evals/test_strong_reject.py @@ -12,7 +12,7 @@ StrongRejectEvaluation, ) from tamperbench.whitebox.utils.models.config import ModelConfig -from tamperbench.whitebox.utils.names import MetricName +from tamperbench.whitebox.utils.names import MetricName, TemplateName @pytest.mark.gpu_required @@ -23,9 +23,10 @@ def test_strong_reject_evaluation() -> None: model_checkpoint="EleutherAI/pythia-14m", out_dir=tmpdirname, model_config=ModelConfig( + template_name=TemplateName.GENERIC_CHAT, user_prefix="User: ", assistant_prefix="Assistant: ", - end_turn="\n\n", + end_turn="\n", max_generation_length=64, inference_batch_size=4, ), diff --git a/tests/test_templates.py b/tests/test_templates.py new file mode 100644 index 00000000..926c5f9e --- /dev/null +++ b/tests/test_templates.py @@ -0,0 +1,193 @@ +"""Tests for TextTemplate.to_jinja2() and configure_tokenizer_template().""" + +from unittest.mock import MagicMock + +import pytest +from jinja2 import Template + +from tamperbench.whitebox.utils.models.config import ModelConfig +from tamperbench.whitebox.utils.models.templates import ( + TEMPLATE_REGISTRY, + configure_tokenizer_template, + get_template, +) +from tamperbench.whitebox.utils.names import TemplateName + + +class TestToJinja2: + """Tests for TextTemplate.to_jinja2().""" + + def test_plain_template_produces_raw_content(self) -> None: + """PLAIN template should produce content with no prefixes.""" + template = get_template(TemplateName.PLAIN) + jinja2 = template.to_jinja2() + + # Verify it's a valid Jinja2 template by importing jinja2 + t = Template(jinja2) + result = t.render( + messages=[{"role": "user", "content": "hello"}], + add_generation_prompt=True, + ) + assert result == "hello" + + def test_generic_chat_template(self) -> None: + """GENERIC_CHAT template should produce User:/Assistant: format.""" + template = get_template(TemplateName.GENERIC_CHAT) + jinja2 = template.to_jinja2() + + t = Template(jinja2) + result = t.render( + messages=[{"role": "user", "content": "hello"}], + add_generation_prompt=True, + ) + assert result == "User: hello\nAssistant: " + + def test_generic_chat_with_system(self) -> None: + """GENERIC_CHAT template should render system messages.""" + template = get_template(TemplateName.GENERIC_CHAT) + jinja2 = template.to_jinja2() + + t = Template(jinja2) + result = t.render( + messages=[ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "hello"}, + ], + add_generation_prompt=True, + ) + assert result == "You are helpful.\nUser: hello\nAssistant: " + + def test_generic_chat_multi_turn(self) -> None: + """GENERIC_CHAT template should handle multi-turn conversations.""" + template = get_template(TemplateName.GENERIC_CHAT) + jinja2 = template.to_jinja2() + + t = Template(jinja2) + result = t.render( + messages=[ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi there"}, + {"role": "user", "content": "how are you?"}, + ], + add_generation_prompt=True, + ) + assert result == "User: hello\nAssistant: hi there\nUser: how are you?\nAssistant: " + + def test_no_generation_prompt(self) -> None: + """When add_generation_prompt is False, assistant prefix should not be appended.""" + template = get_template(TemplateName.GENERIC_CHAT) + jinja2 = template.to_jinja2() + + t = Template(jinja2) + result = t.render( + messages=[ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi there"}, + ], + add_generation_prompt=False, + ) + assert result == "User: hello\nAssistant: hi there\n" + + def test_instruction_response_template(self) -> None: + """INSTRUCTION_RESPONSE template should use INSTRUCTION:/RESPONSE: format.""" + template = get_template(TemplateName.INSTRUCTION_RESPONSE) + jinja2 = template.to_jinja2() + + t = Template(jinja2) + result = t.render( + messages=[{"role": "user", "content": "Summarize this."}], + add_generation_prompt=True, + ) + assert result == "INSTRUCTION: Summarize this.RESPONSE: " + + def test_all_templates_produce_valid_jinja2(self) -> None: + """All registered templates should produce valid Jinja2 strings.""" + from jinja2 import Template + + for name, template in TEMPLATE_REGISTRY.items(): + jinja2 = template.to_jinja2() + t = Template(jinja2) + # Should render without error + result = t.render( + messages=[{"role": "user", "content": "test"}], + add_generation_prompt=True, + ) + assert isinstance(result, str), f"Template {name} did not produce a string" + + +class TestConfigureTokenizerTemplate: + """Tests for configure_tokenizer_template().""" + + def test_named_template_sets_chat_template(self) -> None: + """Named template should set the tokenizer's chat_template.""" + tokenizer = MagicMock() + tokenizer.chat_template = None + + configure_tokenizer_template(tokenizer, TemplateName.GENERIC_CHAT) + + expected = get_template(TemplateName.GENERIC_CHAT).to_jinja2() + assert tokenizer.chat_template == expected + + def test_native_preserves_existing_template(self) -> None: + """NATIVE should not modify the tokenizer's existing chat_template.""" + tokenizer = MagicMock() + original_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}" + tokenizer.chat_template = original_template + + configure_tokenizer_template(tokenizer, TemplateName.NATIVE) + + assert tokenizer.chat_template == original_template + + def test_native_raises_without_template(self) -> None: + """NATIVE should raise ValueError if tokenizer has no chat_template.""" + tokenizer = MagicMock() + tokenizer.chat_template = None + + with pytest.raises(ValueError, match="native"): + configure_tokenizer_template(tokenizer, TemplateName.NATIVE) + + +class TestModelConfigFromDict: + """Tests for ModelConfig.from_dict with template support.""" + + def test_from_dict_with_template_key(self) -> None: + """from_dict should hydrate template_name and prefixes from template key.""" + config = ModelConfig.from_dict( + { + "template": "generic_chat", + "max_generation_length": 1024, + "inference_batch_size": 16, + } + ) + assert config.template_name == TemplateName.GENERIC_CHAT + assert config.user_prefix == "User: " + assert config.assistant_prefix == "Assistant: " + assert config.end_turn == "\n" + + def test_from_dict_native_template(self) -> None: + """from_dict with template='native' should set empty prefixes.""" + config = ModelConfig.from_dict( + { + "template": "native", + "max_generation_length": 1024, + "inference_batch_size": 16, + } + ) + assert config.template_name == TemplateName.NATIVE + assert config.user_prefix == "" + assert config.assistant_prefix == "" + assert config.end_turn == "" + + def test_from_dict_plain_template(self) -> None: + """from_dict with template='plain' should set empty prefixes.""" + config = ModelConfig.from_dict( + { + "template": "plain", + "max_generation_length": 1024, + "inference_batch_size": 16, + } + ) + assert config.template_name == TemplateName.PLAIN + assert config.user_prefix == "" + assert config.assistant_prefix == "" + assert config.end_turn == ""