Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,23 @@ huggingface-cli download Qwen/Qwen3-32B-Instruct --local-dir src/ckpts/Qwen3-32B

### Install dependencies

Dependencies are in `pyproject.toml`, install them with:
Locked dependency versions are recorded in `reqs.txt` (generated from `pyproject.toml`).
Install them with:

```bash
pip install -e ".[dev,test]"
pip install -r reqs.txt
```

For development and testing you can use:

```bash
pip install -r reqs-dev.txt
```

To install all dependencies, including those required for running local models:

```bash
pip install -r reqs-all.txt
```

### Execute evals
Expand Down
1 change: 1 addition & 0 deletions configs/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ persuader_model: gpt-4o
persuadee_model: gpt-4o
evaluator_model: gpt-4o
reject_model: gpt-4o-mini
reasoning_effort_persuader: null

# Jailbreaking options
jailbreak_persuader: false
Expand Down
26 changes: 26 additions & 0 deletions configs/experiment/gemini_3_pro_high.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# @package _global_
# Gemini 3 Pro Evaluation Experiment (high reasoning effort)

# Override default values for this experiment
num_turns: 3
persuader_model: vertex_ai/gemini-3-pro-preview
persuadee_model: gpt-4o
evaluator_model: gpt-4o
reject_model: gpt-4o-mini
experiment_name: gemini_3_pro_high
sample_belief_lower: 0
sample_belief_upper: 20
only_persuade: true
assistant_prompt: prompt_assistant_no_user_degree
reasoning_effort_persuader: high
all_topics: true

topic_categories:
- NoncontroversiallyHarmful

# Hydra configuration
hydra:
  run:
    dir: ${results_dir}/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name}
  job:
    chdir: false
26 changes: 26 additions & 0 deletions configs/experiment/gemini_3_pro_high_aistudio.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# @package _global_
# Gemini 3 Pro Evaluation Experiment (high reasoning effort, AI Studio variant)

# Override default values for this experiment
num_turns: 3
persuader_model: gemini/gemini-3-pro-preview
persuadee_model: gpt-4o
evaluator_model: gpt-4o
reject_model: gpt-4o-mini
experiment_name: gemini_3_pro_high
sample_belief_lower: 0
sample_belief_upper: 20
only_persuade: true
assistant_prompt: prompt_assistant_no_user_degree
reasoning_effort_persuader: high
all_topics: true

topic_categories:
- NoncontroversiallyHarmful

# Hydra configuration
hydra:
  run:
    dir: ${results_dir}/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name}
  job:
    chdir: false
26 changes: 26 additions & 0 deletions configs/experiment/gemini_3_pro_low.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# @package _global_
# Gemini 3 Pro Evaluation Experiment (low reasoning effort)

# Override default values for this experiment
num_turns: 3
persuader_model: vertex_ai/gemini-3-pro-preview
persuadee_model: gpt-4o
evaluator_model: gpt-4o
reject_model: gpt-4o-mini
experiment_name: gemini_3_pro_low
sample_belief_lower: 0
sample_belief_upper: 20
only_persuade: true
assistant_prompt: prompt_assistant_no_user_degree
reasoning_effort_persuader: low
all_topics: true

topic_categories:
- NoncontroversiallyHarmful

# Hydra configuration
hydra:
  run:
    dir: ${results_dir}/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name}
  job:
    chdir: false
26 changes: 26 additions & 0 deletions configs/experiment/gemini_3_pro_low_aistudio.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# @package _global_
# Gemini 3 Pro Evaluation Experiment (low reasoning effort, AI Studio variant)

# Override default values for this experiment
num_turns: 3
persuader_model: gemini/gemini-3-pro-preview
persuadee_model: gpt-4o
evaluator_model: gpt-4o
reject_model: gpt-4o-mini
experiment_name: gemini_3_pro_low
sample_belief_lower: 0
sample_belief_upper: 20
only_persuade: true
assistant_prompt: prompt_assistant_no_user_degree
reasoning_effort_persuader: low
all_topics: true

topic_categories:
- NoncontroversiallyHarmful

# Hydra configuration
hydra:
  run:
    dir: ${results_dir}/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name}
  job:
    chdir: false
26 changes: 26 additions & 0 deletions configs/experiment/opus_45_low.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# @package _global_
# Opus 4.5 Evaluation Experiment (low reasoning effort)

# Override default values for this experiment
num_turns: 3
persuader_model: anthropic/claude-opus-4-5-20251101
persuadee_model: gpt-4o
evaluator_model: gpt-4o
reject_model: gpt-4o-mini
experiment_name: opus_45_low
sample_belief_lower: 0
sample_belief_upper: 20
only_persuade: true
assistant_prompt: prompt_assistant_no_user_degree
reasoning_effort_persuader: low
all_topics: true

topic_categories:
- NoncontroversiallyHarmful

# Hydra configuration
hydra:
  run:
    dir: ${results_dir}/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name}
  job:
    chdir: false
4 changes: 4 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def main(cfg: DictConfig) -> None:
PERSUADEE_MODEL = cfg.persuadee_model
EVALUATOR_MODEL = cfg.evaluator_model
REJECT_MODEL = cfg.reject_model
REASONING_EFFORT_PERSUADER = cfg.reasoning_effort_persuader
JAILBREAK_TUNING_POSTPROCESS = cfg.jailbreak_persuader
JAILBREAK_TUNING_PERSUADEE = cfg.jailbreak_persuadee
EXPERIMENT_NAME = cfg.experiment_name
Expand Down Expand Up @@ -177,6 +178,7 @@ def main(cfg: DictConfig) -> None:
"PERSUADEE_MODEL": PERSUADEE_MODEL,
"EVALUATOR_MODEL": EVALUATOR_MODEL,
"REJECT_MODEL": REJECT_MODEL,
"REASONING_EFFORT_PERSUADER": REASONING_EFFORT_PERSUADER,
"JAILBREAK_TUNING_POSTPROCESS": JAILBREAK_TUNING_POSTPROCESS,
"RECORD_ALL_SYSTEM": cfg.record_all_system,
"REMOVE_RATINGS": cfg.remove_ratings,
Expand Down Expand Up @@ -470,6 +472,7 @@ def main(cfg: DictConfig) -> None:
model=PERSUADER_MODEL,
postprocess_responses=JAILBREAK_TUNING_POSTPROCESS,
remove_ratings=cfg.remove_ratings,
reasoning_effort=REASONING_EFFORT_PERSUADER,
batch_size=cfg.batch_size,
)

Expand Down Expand Up @@ -660,6 +663,7 @@ def run_refusal_check(conv, system_message, turn):
model=PERSUADER_MODEL,
postprocess_responses=JAILBREAK_TUNING_POSTPROCESS,
remove_ratings=cfg.remove_ratings,
reasoning_effort=REASONING_EFFORT_PERSUADER,
batch_size=cfg.batch_size,
)

Expand Down
8 changes: 5 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,6 @@ dependencies = [
"google-cloud-aiplatform",
"seaborn",
"scikit-learn",
"transformers",
"torch",
"accelerate",
"tabulate",
"hydra-core"
]
Expand All @@ -40,6 +37,11 @@ test = [
"pytest",
"pytest-asyncio"
]
local = [
"transformers",
"torch",
"accelerate"
]

[tool.logviz-dependencies]
Flask = "^2.3.2"
Expand Down
Loading