diff --git a/ais_bench/benchmark/configs/datasets/mmmu/mmmu_gen.py b/ais_bench/benchmark/configs/datasets/mmmu/mmmu_gen.py
index 55a45e86..94d641fa 100644
--- a/ais_bench/benchmark/configs/datasets/mmmu/mmmu_gen.py
+++ b/ais_bench/benchmark/configs/datasets/mmmu/mmmu_gen.py
@@ -4,9 +4,9 @@
 from ais_bench.benchmark.datasets import MMMUDataset, MMMUEvaluator
 
-START_TEXT_PROMPT = "Question: "
-END_TEXT_PROMPT = "Please select the correct answer from the options above. \n"
-OPTIONS_PROMPT = "\nOptions:\n"
+START_TEXT_PROMPT = ""
+END_TEXT_PROMPT = "Answer with the option's letter from the given choices directly."
+OPTIONS_PROMPT = "\n"
 
 mmmu_reader_cfg = dict(
     input_columns=['question', 'image'],
diff --git a/ais_bench/benchmark/configs/datasets/mmstar/mmstar_gen.py b/ais_bench/benchmark/configs/datasets/mmstar/mmstar_gen.py
index 8b8133ea..bacc38c3 100644
--- a/ais_bench/benchmark/configs/datasets/mmstar/mmstar_gen.py
+++ b/ais_bench/benchmark/configs/datasets/mmstar/mmstar_gen.py
@@ -15,7 +15,7 @@
     template=dict(
         round=[
             dict(role="HUMAN", prompt_mm={
-                "text": {"type": "text", "text": "Question: {question}\n"},
+                "text": {"type": "text", "text": "{question}\nAnswer with the option's letter from the given choices directly."},
                 "image": {"type": "image_url", "image_url": {"url": "file://{image}"}},
             })
         ]
diff --git a/ais_bench/benchmark/datasets/mmmu.py b/ais_bench/benchmark/datasets/mmmu.py
index 6edebec9..7914e784 100644
--- a/ais_bench/benchmark/datasets/mmmu.py
+++ b/ais_bench/benchmark/datasets/mmmu.py
@@ -15,6 +15,7 @@
 from ais_bench.benchmark.datasets.utils.datasets import get_data_path, toliststr, decode_base64_to_image_file, get_content_str
 from .base import BaseDataset
+import re
 
 logger = AISLogger()
 IMAGE_MAP_LEN = 64
@@ -349,7 +350,13 @@ def score(self, predictions, references):
                 pred = pred.replace(char, '')
             detail = {'pred': pred, 'answer': refer, 'correct': False}
             choices = json.loads(refer['choices'])
-            infer_res = can_infer(pred, choices)
+            # infer_res = can_infer(pred, choices)
+            pattern = r'<\|begin_of_box\|>(.*?)<\|end_of_box\|>'
+            match = re.search(pattern, pred, re.DOTALL)
+            if match is not None:
+                infer_res = match.group(1)
+            else:
+                infer_res = ""
             overall_key = '[' + refer['split'] + ']: Overall'
             key_category = '[' + refer['split'] + ']: ' + refer['category']
             key_l2_category = '[' + refer['split'] + ']: ' + refer['l2-category']
diff --git a/ais_bench/benchmark/datasets/mmstar.py b/ais_bench/benchmark/datasets/mmstar.py
index 50e30ed0..f0ea84b9 100644
--- a/ais_bench/benchmark/datasets/mmstar.py
+++ b/ais_bench/benchmark/datasets/mmstar.py
@@ -14,6 +14,7 @@
 from ais_bench.benchmark.utils.prompt import AIS_CONTENT_TAG, AIS_TEXT_START, AIS_IMAGE_START
 from .base import BaseDataset
+import re
 
 IMAGE_MAP_LEN = 64
 logger = AISLogger()
@@ -103,8 +104,13 @@ def score(self, predictions, references):
         for pred, refer in zip(predictions, references):
             detail = {'pred': pred, 'answer': refer, 'correct': False}
             choices = json.loads(refer['choices'])
-            infer_res = can_infer(pred, choices)
-
+            # infer_res = can_infer(pred, choices)
+            pattern = r'<\|begin_of_box\|>(.*?)<\|end_of_box\|>'
+            match = re.search(pattern, pred, re.DOTALL)
+            if match is not None:
+                infer_res = match.group(1)
+            else:
+                infer_res = ""
             key_category = refer['category']
             score = 1 if infer_res == refer['answer'] else 0
             if score == 1:
diff --git a/ais_bench/benchmark/datasets/textvqa.py b/ais_bench/benchmark/datasets/textvqa.py
index 8c24d433..2a71dc09 100644
--- a/ais_bench/benchmark/datasets/textvqa.py
+++ b/ais_bench/benchmark/datasets/textvqa.py
@@ -185,7 +185,7 @@ def __init__(self):
             '?',
             '!',
         ]
-        self.special_tokens = ['☞', '☟', '☜', '', '<|im_end|>']
+        self.special_tokens = ['☞', '☟', '☜', '', '<|im_end|>', '<|end_of_box|>', '<|begin_of_box|>']
 
     def process_punctuation(self, in_text):
         if len(in_text) > MAX_TARGET_LENGTH: