AISBench · Shane120283483 · Mar 13, 2026 · GaoHuaZhang · Mar 13, 2026 · gemini-code-assist
diff --git a/ais_bench/benchmark/configs/datasets/mmmu/mmmu_gen.py b/ais_bench/benchmark/configs/datasets/mmmu/mmmu_gen.py
@@ -4,9 +4,9 @@
 from ais_bench.benchmark.datasets import MMMUDataset, MMMUEvaluator
 
 
-START_TEXT_PROMPT = "Question: "
-END_TEXT_PROMPT = "Please select the correct answer from the options above. \n"
-OPTIONS_PROMPT = "\nOptions:\n"
+START_TEXT_PROMPT = ""
+END_TEXT_PROMPT = "Answer with the option's letter from the given choices directly."
+OPTIONS_PROMPT = "\n"
 
 mmmu_reader_cfg = dict(
     input_columns=['question', 'image'],

diff --git a/ais_bench/benchmark/configs/datasets/mmstar/mmstar_gen.py b/ais_bench/benchmark/configs/datasets/mmstar/mmstar_gen.py
@@ -15,7 +15,7 @@
         template=dict(
             round=[
                 dict(role="HUMAN", prompt_mm={
-                    "text": {"type": "text", "text": "Question: {question}\n"},
+                    "text": {"type": "text", "text": "{question}\nAnswer with the option's letter from the given choices directly."},
                     "image": {"type": "image_url", "image_url": {"url": "file://{image}"}},
                 })
             ]

diff --git a/ais_bench/benchmark/datasets/mmmu.py b/ais_bench/benchmark/datasets/mmmu.py
@@ -15,6 +15,7 @@
 from ais_bench.benchmark.datasets.utils.datasets import get_data_path, toliststr, decode_base64_to_image_file, get_content_str
 
 from .base import BaseDataset
+import re
 
 logger = AISLogger()
 IMAGE_MAP_LEN = 64
@@ -349,7 +350,13 @@ def score(self, predictions, references):
                     pred = pred.replace(char, '')
             detail = {'pred': pred, 'answer': refer, 'correct': False}
             choices = json.loads(refer['choices'])
-            infer_res = can_infer(pred, choices)
+            # Take the answer from the <|begin_of_box|>...<|end_of_box|> span of pred; can_infer(pred, choices) is no longer used here.
+            pattern = r'<\|begin_of_box\|>(.*?)<\|end_of_box\|>'
+            match = re.search(pattern, pred, re.DOTALL)
+            if match is not None:
+                infer_res = match.group(1)
+            else:
+                infer_res = ""
             overall_key = '[' + refer['split'] + ']: Overall'
             key_category = '[' + refer['split'] + ']: ' +  refer['category']
             key_l2_category = '[' + refer['split'] + ']: ' +  refer['l2-category']

diff --git a/ais_bench/benchmark/datasets/mmstar.py b/ais_bench/benchmark/datasets/mmstar.py
@@ -14,6 +14,7 @@
 from ais_bench.benchmark.utils.prompt import AIS_CONTENT_TAG, AIS_TEXT_START, AIS_IMAGE_START
 
 from .base import BaseDataset
+import re
 
 IMAGE_MAP_LEN = 64
 logger = AISLogger()
@@ -103,8 +104,13 @@ def score(self, predictions, references):
         for pred, refer in zip(predictions, references):
             detail = {'pred': pred, 'answer': refer, 'correct': False}
             choices = json.loads(refer['choices'])
-            infer_res = can_infer(pred, choices)
-
+            # Take the answer from the <|begin_of_box|>...<|end_of_box|> span of pred; can_infer(pred, choices) is no longer used here.
+            pattern = r'<\|begin_of_box\|>(.*?)<\|end_of_box\|>'
+            match = re.search(pattern, pred, re.DOTALL)
+            if match is not None:
+                infer_res = match.group(1)
+            else:
+                infer_res = ""
             key_category = refer['category']
             score = 1 if infer_res == refer['answer'] else 0
             if score == 1:

diff --git a/ais_bench/benchmark/datasets/textvqa.py b/ais_bench/benchmark/datasets/textvqa.py
@@ -185,7 +185,7 @@ def __init__(self):
             '?',
             '!',
         ]
-        self.special_tokens = ['☞', '☟', '☜', '<unk>', '<|im_end|>']
+        self.special_tokens = ['☞', '☟', '☜', '<unk>', '<|im_end|>', '<|end_of_box|>', '<|begin_of_box|>']
 
     def process_punctuation(self, in_text):
         if len(in_text) > MAX_TARGET_LENGTH: