149 changes: 149 additions & 0 deletions eval/Subject/gpqa/evaluate_gpqa_cot.py
@@ -0,0 +1,149 @@
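"""Two-round chain-of-thought evaluation on GPQA (main config) with vLLM.

Round 1 asks the model to reason step-by-step about each multiple-choice
question; round 2 replays that reasoning and asks for the final lettered
answer. Per-example records are written to gpqa_output.json and the
aggregate accuracy to result.txt.

Example usage (paths are illustrative):
    python evaluate_gpqa_cot.py --model ./eurus-7b-kto-hf --save_dir ./results
"""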
import argparse
import json
import os
import random
from collections import namedtuple

import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

from util import *  # local helpers: match_answer, postprocess_number, GPQATask

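# One GPQA item: the question, its four shuffled answer choices, and the
# index of the correct choice after shuffling.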
Example = namedtuple('Example', ['question', 'choice1', 'choice2', 'choice3', 'choice4', 'correct_index'])



parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="./eurus-7b-kto-hf")  # e.g. meta-llama/Llama-2-7b-chat-hf
# parser.add_argument("--input_data", type=str, default="./theorem_qa.json")
parser.add_argument("--save_dir", type=str, default="./")
parser.add_argument("--model_type", type=str, default='mistral')

args = parser.parse_args()


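# Load the model with vLLM, sharding it across all visible GPUs.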
llm = LLM(
model=args.model,
trust_remote_code=True,
tensor_parallel_size=torch.cuda.device_count(),
)

def generate_sample_batch(question_list):
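    """Greedy-decode one completion per prompt with vLLM.

    Generation stops if the model starts a new "question:", which keeps a
    round-1 chain of thought from spilling into a fabricated next item.
    """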
    sampling_params = SamplingParams(
        max_tokens=1024,
        temperature=0.0,
        top_p=1,
        n=1,
        stop=["question:"],
    )
outputs = llm.generate(question_list, sampling_params, use_tqdm=False)
completions = [output.outputs[0].text for output in outputs]
return completions


# def create_reader_request(example: Dict[str, Any]) -> str:
# string = f"Question: {example['Question']}"
# return string


# Copied from: https://github.com/idavidrein/gpqa/blob/main/baselines/utils.py#L197
def load_examples(data):
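    """Turn the raw GPQA DataFrame into Example tuples with shuffled choices.

    The seed is fixed so every run presents the choices in the same order.
    """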
    random.seed(0)  # fixed seed: choices are shuffled identically on every run

def shuffle_choices_and_create_example(row) -> Example:
list_choices = [row['Incorrect Answer 1'], row['Incorrect Answer 2'], row['Incorrect Answer 3'], row['Correct Answer']]
random.shuffle(list_choices)
example = Example(row.Question, list_choices[0], list_choices[1], list_choices[2], list_choices[3],
list_choices.index(row['Correct Answer']))
return example

return [shuffle_choices_and_create_example(row) for _, row in data.iterrows()]


# The tokenizer supplies the model's chat template for prompt formatting.
tokenizer = AutoTokenizer.from_pretrained(args.model)
def make_conv(example, model_type, prev_output=None):
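    """Build a chat-formatted prompt for one example via the tokenizer's chat template.

    With prev_output=None this yields the round-1 CoT prompt; otherwise the
    round-1 prompt, the model's reasoning, and the round-2 answer-extraction
    request are concatenated into a single user turn. model_type is currently
    unused: the chat template comes from the tokenizer itself.
    """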
prompt_round1 = f"What is the correct answer to this question: {example.question}"
prompt_round1 += f"\n\nChoices:\n(A) {example.choice1}\n(B) {example.choice2}\n(C) {example.choice3}\n(D) {example.choice4}"
prompt_round1 += "\nLet's think step-by-step:"
    prompt_round2 = '\n\nBased on the above, what is the single, most likely answer choice? Answer in the format "The correct answer is (insert answer here)".'

if prev_output is None:
prompt = prompt_round1
else:
prompt = prompt_round1 + prev_output + prompt_round2

msg = [{"role": "user", "content": prompt},]
out = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
return out



if __name__ == "__main__":


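    # GPQA's main config ships as a single 'train' split on the Hugging Face Hub.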
    test_set = load_dataset('Idavidrein/gpqa', 'gpqa_main')['train'].to_pandas()
test_set['example'] = load_examples(test_set)
test_set["prompt_round1"] = test_set['example'].apply(lambda example: make_conv(example, args.model_type))

# Adapted from: https://github.com/idavidrein/gpqa/blob/main/baselines/utils.py#L48
# First round: Think step-by-step
test_set["completion_round1"] = generate_sample_batch(test_set["prompt_round1"].tolist())

# Second round: Extract final answer
test_set["prompt_round2"] = test_set.apply(lambda row: make_conv(row['example'], args.model_type, row['completion_round1']), axis=1)
test_set["completion"] = generate_sample_batch(test_set["prompt_round2"].tolist())
test_set.to_json(os.path.join(args.save_dir, "gpqa_completion.json"))

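    # Score each completion: pull out the predicted option letter and compare
    # it to the reference letter for this example's shuffled choice order.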
    answered_set = dict()  # placeholder for resume-from-checkpoint logic; always empty in this script

output_filename = os.path.join(args.save_dir, "gpqa_output.json")
writer = open(output_filename, 'w')
accuracy = []
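    # Lowercase option tags, indexed by position; match_answer/postprocess_number
    # (from util) are assumed to normalize predictions to this "(x)" form.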
index_mapping = ['(a)', '(b)', '(c)', '(d)']

for example in test_set.to_dict(orient="records"):

        if answered_set and example['Record ID'] in answered_set:
            writer.write(answered_set[example['Record ID']] + '\n')
            continue

result = example["completion"]
_, prediction = match_answer(result)
prediction = postprocess_number(prediction)
reference = index_mapping[example["example"].correct_index]

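        # GPQATask comes from util; success() is expected to return True when
        # the predicted option matches the reference.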
verifier = GPQATask(id=example["Record ID"],
prompt=example["prompt_round2"],
reference=reference,
answer_type='Option')
acc = verifier.success(prediction)
tmp = {
'id': example['Record ID'],
'question': example['prompt_round2'],
'prediction': prediction,
'answer': reference,
'rationale': result,
'answer_type': 'Option',
"is_correct": acc,
}
        writer.write(json.dumps(tmp) + '\n')
        accuracy.append(acc)

    writer.close()

    # Report accuracy as a percentage, both to stdout and to result.txt.
    accuracy = sum(1 for acc in accuracy if acc) / len(accuracy)
    with open(os.path.join(args.save_dir, "result.txt"), "w") as f:
        print({"accuracy": accuracy * 100})
        print({"accuracy": accuracy * 100}, file=f)
