|
| 1 | +import jsonlines |
| 2 | +from tqdm import tqdm |
| 3 | +import json |
| 4 | +from datasets import load_dataset |
| 5 | +from argparse import ArgumentParser |
| 6 | +from openai import OpenAI |
| 7 | + |
def parse_output(client, pred, model="gpt-4o"):
    """Extract the JSON payload embedded in a raw prediction trace.

    Asks an OpenAI chat model to locate the JSON structure (the part
    enclosed in ``{}`` or ``[]``) inside ``pred`` and return it verbatim.

    Args:
        client: OpenAI client exposing ``chat.completions.create``.
        pred: Raw model-output text expected to contain a JSON payload.
        model: Chat model used for extraction. Defaults to ``"gpt-4o"``
            (previously hard-coded); pass ``args.openai_model_name`` to
            honor the CLI flag.

    Returns:
        str: The model's reply — ideally the extracted JSON text only.
    """
    PROMPT = "From the given trace, locate the JSON structure (the part enclosed in {} or []). Extract it and return only that JSON object exactly as it appears in the trace. Do not add or remove any characters outside the JSON."
    decoded_outputs = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": PROMPT},
            {
                "role": "user",
                "content": f"Provided text: {pred}",
            },
        ],
    )

    return decoded_outputs.choices[0].message.content
| 23 | + |
def convert_to_format(client, pred, format, model="gpt-4o"):
    """Coerce a predicted JSON object into the format a question specifies.

    Args:
        client: OpenAI client exposing ``chat.completions.create``.
        pred: Parsed or raw prediction; serialized with ``json.dumps`` before
            being shown to the model.
        format: The question text that specifies the required output format.
            (Name kept for backward compatibility; shadows the builtin.)
        model: Chat model used for the conversion. Defaults to ``"gpt-4o"``
            (previously hard-coded); pass ``args.openai_model_name`` to
            honor the CLI flag.

    Returns:
        str: The model's reply — ideally a parsable JSON string matching the
        requested structure, with missing keys filled with ``""``.
    """
    PROMPT = """You are given a question that specifies an exact output format, along with a predicted output that may or may not follow that format. Your task is to convert the predicted JSON output to match the specified format as closely as possible.

**Instructions:**
- The final output must strictly follow the required JSON structure.
- If any key from the specified format is missing in the predicted output, assign it an empty string (`""`).
- Return **only** the final JSON object — no additional text, explanations, or formatting.
- DO NOT try to answer the question, only focus on the format.
- The final answer must be a parsable json, remove any additional characters like json```"""
    decoded_outputs = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": PROMPT},
            {
                "role": "user",
                "content": f"Question which specifies the format: {format}\n\n Provided json: {json.dumps(pred)}",
            },
        ],
    )

    return decoded_outputs.choices[0].message.content
| 45 | + |
| 46 | +### Adapt this function to the schema of your predictions file (expects jsonlines rows with "question" and "prediction" fields) |
def convert(args, client):
    """Convert raw prediction traces into LiveDRBench's scoring format.

    Loads the LiveDRBench test split to map each question to its key, reads
    predictions from ``args.preds_file`` (jsonlines rows with "question" and
    "prediction" fields), extracts and reformats each prediction's JSON via
    two GPT passes, and writes a list of ``{"key": ..., "preds": [...]}``
    records to ``args.out_file``.

    Args:
        args: Parsed CLI namespace with ``preds_file`` and ``out_file``.
        client: OpenAI client passed through to the GPT helper calls.

    Side effects:
        Writes ``args.out_file`` and prints a count of predictions that
        survived JSON parsing.
    """
    livedrbench = load_dataset("microsoft/LiveDRBench", "v1-full")['test']
    rows = livedrbench.to_list()
    key_question_map = {r["question"]: r["key"] for r in rows}

    infer = list(jsonlines.open(args.preds_file))

    livedrbench_format = []

    correct_count = 0
    for example in tqdm(infer):
        key = key_question_map[example["question"]]
        if example["prediction"]:
            prediction = parse_output(client, example["prediction"])
            prediction = convert_to_format(client, prediction, example["question"])
            # Bug fix: a bare `except:` also swallowed KeyboardInterrupt /
            # SystemExit; only malformed JSON should be treated as a miss.
            try:
                prediction = [json.loads(prediction)]
                correct_count += 1
            except json.JSONDecodeError:
                prediction = []

        else:
            # Empty/missing prediction -> no candidate answers for this key.
            prediction = []

        livedrbench_format.append({"key": key, "preds": prediction})

    print(f"Correct format: {correct_count}/{len(infer)}")

    with open(args.out_file, "w") as f:
        json.dump(livedrbench_format, f, indent=1)
| 77 | + |
| 78 | + |
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--openai_api_key", type=str, required=True, help="OpenAI API key")
    parser.add_argument("--openai_model_name", type=str, default="gpt-4o", help="OpenAI model name to use as judge")
    parser.add_argument("--preds_file", type=str, required=True, help="Path to the JSON file containing predictions")
    parser.add_argument("--out_file", type=str, required=True, help="Output file name")
    args = parser.parse_args()

    # Bug fix: --openai_api_key was required but never used — OpenAI()
    # silently fell back to the OPENAI_API_KEY env var. Pass it explicitly.
    client = OpenAI(api_key=args.openai_api_key)
    convert(args, client)
| 89 | + |
| 90 | + |
0 commit comments