From 82473c53d6e729920000611d1988a228ad56cd38 Mon Sep 17 00:00:00 2001
From: realgump
Date: Fri, 13 Oct 2023 15:05:39 +0800
Subject: [PATCH 1/2] Bug Fix: Resolved KeyError Occurring During the Round Loop

When 'args.evaluate_times' is greater than 1, the entry for 'qid' in
'prefer_dict' is initialized during the first evaluation round, but the
key 'round_{i}' for the current round is not, which leads to a KeyError
during the second evaluation.
---
 toolbench/tooleval/eval_preference.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/toolbench/tooleval/eval_preference.py b/toolbench/tooleval/eval_preference.py
index fe762d08..7c2fa823 100644
--- a/toolbench/tooleval/eval_preference.py
+++ b/toolbench/tooleval/eval_preference.py
@@ -175,6 +175,8 @@ def get_preference(query_id, task_status, answer_statuss, ref_example, output_ex
         for qid in test_ids:
             if qid not in prefer_dict:
                 prefer_dict[qid] = {reference_model: 0, output_model: 0, f"round_{i}": "incomplete"}
+            elif f"round_{i}" not in prefer_dict[qid]:
+                prefer_dict[qid][f"round_{i}"] = "incomplete"
             elif prefer_dict[qid][f"round_{i}"] == "complete":
                 continue
             if qid in ref_pass_result_dict and qid in output_pass_result_dict:
@@ -262,4 +264,4 @@ def get_preference(query_id, task_status, answer_statuss, ref_example, output_ex
     lose_rate /= len(prefer_dict)
     tie_rate /= len(prefer_dict)
     print(f"Test set: {test_set}. Reference model: {reference_model}, Candidate model: {output_model}. Win rate: {str(win_rate)}, Tie rate: {str(tie_rate)}")
-    
\ No newline at end of file
+    

From 92f8a33a4760166667c832f39a97b8cf34300e66 Mon Sep 17 00:00:00 2001
From: realgump
Date: Fri, 13 Oct 2023 15:14:10 +0800
Subject: [PATCH 2/2] Bug Fix: Duplicate Preference Counts in Some Cases

In the early-exit branches, the comparison result is updated, but the
round's completion status is not. As a result, when the JSON file is
loaded again, the qids from these branches are counted repeatedly.
---
 toolbench/tooleval/eval_preference.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/toolbench/tooleval/eval_preference.py b/toolbench/tooleval/eval_preference.py
index 7c2fa823..02bf3b21 100644
--- a/toolbench/tooleval/eval_preference.py
+++ b/toolbench/tooleval/eval_preference.py
@@ -182,17 +182,21 @@ def get_preference(query_id, task_status, answer_statuss, ref_example, output_ex
             if qid in ref_pass_result_dict and qid in output_pass_result_dict:
                 if ref_pass_result_dict[qid]["machine_label"] == "passed" and output_pass_result_dict[qid]["machine_label"] == "failed":
                     prefer_dict[qid][reference_model] += 1
+                    prefer_dict[qid][f"round_{i}"] = "complete"
                     continue
                 elif ref_pass_result_dict[qid]["machine_label"] == "failed" and output_pass_result_dict[qid]["machine_label"] == "passed":
                     prefer_dict[qid][output_model] += 1
+                    prefer_dict[qid][f"round_{i}"] = "complete"
                     continue
 
             if qid not in reference_examples:
                 prefer_dict[qid][output_model] += 1
+                prefer_dict[qid][f"round_{i}"] = "complete"
                 continue
             if qid not in output_examples:
                 print(f"Query {qid} not in output model converted answers!")
                 prefer_dict[qid][reference_model] += 1
+                prefer_dict[qid][f"round_{i}"] = "complete"
                 continue
 
             ref_example = reference_examples[qid]
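
For context, below is a minimal, self-contained Python sketch of the
bookkeeping pattern the two patches above fix. Only 'prefer_dict',
'reference_model', 'output_model', and the f"round_{i}" keys mirror
eval_preference.py; the test ids, round count, and the stubbed
comparison step are illustrative assumptions, not the real evaluation
logic.

# Illustrative sketch only: stands in for the round loop in eval_preference.py.
reference_model, output_model = "reference", "candidate"  # assumed names
test_ids = ["q1", "q2"]                                   # assumed ids
evaluate_times = 2  # corresponds to args.evaluate_times > 1

prefer_dict = {}
for i in range(evaluate_times):
    for qid in test_ids:
        if qid not in prefer_dict:
            # First round: the entry is created with only f"round_0".
            prefer_dict[qid] = {reference_model: 0, output_model: 0,
                                f"round_{i}": "incomplete"}
        elif f"round_{i}" not in prefer_dict[qid]:
            # Patch 1: on later rounds the qid entry already exists but the
            # current round's key does not; without this branch the
            # "complete" check below raises KeyError.
            prefer_dict[qid][f"round_{i}"] = "incomplete"
        elif prefer_dict[qid][f"round_{i}"] == "complete":
            continue  # already counted for this round

        # ... comparison logic (stubbed: always prefer the reference) ...
        prefer_dict[qid][reference_model] += 1
        # Patch 2: mark the round complete in every branch that updates a
        # count, so a reloaded JSON checkpoint does not count this qid
        # again for the same round.
        prefer_dict[qid][f"round_{i}"] = "complete"

print(prefer_dict)  # one count per round per qid, and no KeyError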