From 358cecc281bd744684b405a23b0b704d99f2eefd Mon Sep 17 00:00:00 2001 From: Oscar Lo Date: Thu, 2 May 2024 02:46:40 -0700 Subject: [PATCH 1/4] added gqa as eval dataset --- open_flamingo/eval/eval_datasets.py | 3 +- open_flamingo/eval/eval_models/blip.py | 3 + .../eval/eval_models/open_flamingo.py | 3 + open_flamingo/eval/evaluate.py | 91 +++++++++++++++++++ 4 files changed, 99 insertions(+), 1 deletion(-) diff --git a/open_flamingo/eval/eval_datasets.py b/open_flamingo/eval/eval_datasets.py index df50af6a..23d4ae1d 100644 --- a/open_flamingo/eval/eval_datasets.py +++ b/open_flamingo/eval/eval_datasets.py @@ -14,6 +14,7 @@ "okvqa", "vizwiz", "textvqa", + "gqa", "hateful_memes", "imagenet", ] @@ -104,7 +105,7 @@ def get_img_path(self, question): ) elif self.dataset_name == "vizwiz": return os.path.join(self.image_dir_path, question["image_id"]) - elif self.dataset_name == "textvqa": + elif self.dataset_name == "textvqa" or self.dataset_name == "gqa": return os.path.join(self.image_dir_path, f"{question['image_id']}.jpg") else: raise Exception(f"Unknown VQA dataset {self.dataset_name}") diff --git a/open_flamingo/eval/eval_models/blip.py b/open_flamingo/eval/eval_models/blip.py index 87f08036..725b0470 100644 --- a/open_flamingo/eval/eval_models/blip.py +++ b/open_flamingo/eval/eval_models/blip.py @@ -108,6 +108,9 @@ def get_vizwiz_prompt(self, question, answer=None) -> str: def get_textvqa_prompt(self, question, answer=None) -> str: return f"Question:{question} Short answer:{answer if answer is not None else ''}" + + def get_gqa_prompt(self, question, answer=None) -> str: + return f"Question:{question} Short answer:{answer if answer is not None else ''}" def get_coco_prompt(self, caption=None) -> str: return f"A photo of {caption if caption is not None else ''}" diff --git a/open_flamingo/eval/eval_models/open_flamingo.py b/open_flamingo/eval/eval_models/open_flamingo.py index 0a25198c..d73417ab 100644 --- a/open_flamingo/eval/eval_models/open_flamingo.py +++ 
b/open_flamingo/eval/eval_models/open_flamingo.py @@ -287,6 +287,9 @@ def get_vizwiz_prompt(self, question, answer=None) -> str: def get_textvqa_prompt(self, question, answer=None) -> str: return f"Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}" + + def get_gqa_prompt(self, question, answer=None) -> str: + return f"Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}" def get_coco_prompt(self, caption=None) -> str: return f"Output:{caption if caption is not None else ''}{'<|endofchunk|>' if caption is not None else ''}" diff --git a/open_flamingo/eval/evaluate.py b/open_flamingo/eval/evaluate.py index 4a25fca0..9a2c4e47 100644 --- a/open_flamingo/eval/evaluate.py +++ b/open_flamingo/eval/evaluate.py @@ -139,6 +139,14 @@ default=False, help="Whether to evaluate on TextVQA.", ) + +parser.add_argument( + "--eval_gqa", + action="store_true", + default=False, + help="Whether to evaluate on GQA.", +) + parser.add_argument( "--eval_imagenet", action="store_true", @@ -346,6 +354,44 @@ default=None, ) +# GQA Dataset +parser.add_argument( + "--gqa_train_image_dir_path", + type=str, + help="Path to the gqa train images directory.", + default=None, +) +parser.add_argument( + "--gqa_train_questions_json_path", + type=str, + help="Path to the gqa questions json file.", + default=None, +) +parser.add_argument( + "--gqa_train_annotations_json_path", + type=str, + help="Path to the gqa annotations json file", + default=None, +) +parser.add_argument( + "--gqa_test_image_dir_path", + type=str, + help="Path to the gqa test images directory.", + default=None, +) +parser.add_argument( + "--gqa_test_questions_json_path", + type=str, + help="Path to the gqa questions json file", + default=None, +) +parser.add_argument( + "--gqa_test_annotations_json_path", + type=str, + help="Path to the gqa annotations json file", + default=None, +) + ## Imagenet 
dataset parser.add_argument("--imagenet_root", type=str, default="/tmp") @@ -650,6 +696,44 @@ def main(): "stddev": np.nanstd(scores), } ) + + if args.eval_gqa: + print("Evaluating on GQA...") + + #load cached demonstration features on GQA + if args.cached_demonstration_features is not None: + cached_features = torch.load( + f"{args.cached_demonstration_features}/imagenet.pkl", map_location="cpu" + ) + else: + cached_features = None + + for shot in args.shots: + scores = [] + for seed, trial in zip(args.trial_seeds, range(args.num_trials)): + gqa_score = evaluate_vqa( + args=args, + eval_model=eval_model, + num_shots=shot, + seed=seed, + dataset_name="gqa", + max_new_tokens=10, + cached_features=cached_features, + ) + if args.rank == 0: + print(f"Shots {shot} Trial {trial} GQA score: {gqa_score}") + scores.append(gqa_score) + + if args.rank == 0: + print(f"Shots {shot} Mean GQA score: {np.nanmean(scores)}") + results["gqa"].append( + { + "shots": shot, + "trials": scores, + "mean": np.nanmean(scores), + "stddev": np.nanstd(scores), + } + ) if args.eval_imagenet: print("Evaluating on ImageNet...") @@ -968,6 +1052,13 @@ def evaluate_vqa( test_image_dir_path = args.textvqa_image_dir_path test_questions_json_path = args.textvqa_test_questions_json_path test_annotations_json_path = args.textvqa_test_annotations_json_path + elif dataset_name == "gqa": + train_image_dir_path = args.gqa_train_image_dir_path + train_questions_json_path = args.gqa_train_questions_json_path + train_annotations_json_path = args.gqa_train_annotations_json_path + test_image_dir_path = args.gqa_test_image_dir_path + test_questions_json_path = args.gqa_test_questions_json_path + test_annotations_json_path = args.gqa_test_annotations_json_path else: raise ValueError(f"Unsupported dataset: {dataset_name}") From 95c3cae476fe650eec57879e4dc66ff5ea0d46a0 Mon Sep 17 00:00:00 2001 From: Oscar Lo Date: Thu, 9 May 2024 14:37:49 -0700 Subject: [PATCH 2/4] cleaned up evaluate file arguments --- 
open_flamingo/eval/evaluate.py | 797 +++++++-------------------------- 1 file changed, 174 insertions(+), 623 deletions(-) diff --git a/open_flamingo/eval/evaluate.py b/open_flamingo/eval/evaluate.py index 9a2c4e47..402cefb7 100644 --- a/open_flamingo/eval/evaluate.py +++ b/open_flamingo/eval/evaluate.py @@ -109,65 +109,30 @@ ) # Per-dataset evaluation flags -parser.add_argument( - "--eval_coco", - action="store_true", - default=False, - help="Whether to evaluate on COCO.", -) -parser.add_argument( - "--eval_vqav2", - action="store_true", - default=False, - help="Whether to evaluate on VQAV2.", -) -parser.add_argument( - "--eval_okvqa", - action="store_true", - default=False, - help="Whether to evaluate on OK-VQA.", -) -parser.add_argument( - "--eval_vizwiz", - action="store_true", - default=False, - help="Whether to evaluate on VizWiz.", -) -parser.add_argument( - "--eval_textvqa", - action="store_true", - default=False, - help="Whether to evaluate on TextVQA.", -) - -parser.add_argument( - "--eval_gqa", - action="store_true", - default=False, - help="Whether to evaluate on GQA.", -) - -parser.add_argument( - "--eval_imagenet", - action="store_true", - default=False, - help="Whether to evaluate on ImageNet.", -) -parser.add_argument( - "--eval_flickr30", - action="store_true", - default=False, - help="Whether to evaluate on Flickr30.", -) -parser.add_argument( - "--eval_hateful_memes", - action="store_true", - default=False, - help="Whether to evaluate on Hateful Memes.", -) +for task in SUPPORTED_TASKS: + parser.add_argument( + f"--eval_{task}", + action="store_true", + default=False, + help=f"Whether to evaluate on {task.replace('_', ' ')}" + ) # Dataset arguments +for task in ['flickr', 'coco']: + parser.add_argument( + f"--{task}_karpathy_json_path", + type=str, + help="Path to the dataset_flickr30k.json file." 
if task=='flickr' else argparse.SUPPRESS, + default=None, + ) + parser.add_argument( + f"--{task}_annotations_json_path", + type=str, + help="Path to the dataset_flickr30k_coco_style.json file." if task=='flickr' else argparse.SUPPRESS, + default=None + ) + ## Flickr30 Dataset parser.add_argument( "--flickr_image_dir_path", @@ -175,17 +140,6 @@ help="Path to the flickr30/flickr30k_images directory.", default=None, ) -parser.add_argument( - "--flickr_karpathy_json_path", - type=str, - help="Path to the dataset_flickr30k.json file.", - default=None, -) -parser.add_argument( - "--flickr_annotations_json_path", - type=str, - help="Path to the dataset_flickr30k_coco_style.json file.", -) ## COCO Dataset parser.add_argument( "--coco_train_image_dir_path", @@ -197,48 +151,42 @@ type=str, default=None, ) -parser.add_argument( - "--coco_karpathy_json_path", - type=str, - default=None, -) -parser.add_argument( - "--coco_annotations_json_path", - type=str, - default=None, -) + +## VQAV2, OK-VQA, VizWiz, TextVQA, GQA Datasets +for task in ['vqav2', 'ok_vqa', 'vizwiz', 'textvqa', 'gqa']: + parser.add_argument( + f"--{task}_image_dir_path" if task=='gqa' or task=='textvqa' else f"--{task}_train_image_dir_path", + type=str, + default=None, + ) + if task!='gqa' and task!='textvqa': + parser.add_argument( + f"--{task}_test_image_dir_path", + type=str, + default=None, + ) + parser.add_argument( + f"--{task}_train_questions_json_path", + type=str, + default=None, + ) + parser.add_argument( + f"--{task}_train_annotations_json_path", + type=str, + default=None, + ) + parser.add_argument( + f"--{task}_test_questions_json_path", + type=str, + default=None, + ) + parser.add_argument( + f"--{task}_test_annotations_json_path", + type=str, + default=None, + ) ## VQAV2 Dataset -parser.add_argument( - "--vqav2_train_image_dir_path", - type=str, - default=None, -) -parser.add_argument( - "--vqav2_train_questions_json_path", - type=str, - default=None, -) -parser.add_argument( - 
"--vqav2_train_annotations_json_path", - type=str, - default=None, -) -parser.add_argument( - "--vqav2_test_image_dir_path", - type=str, - default=None, -) -parser.add_argument( - "--vqav2_test_questions_json_path", - type=str, - default=None, -) -parser.add_argument( - "--vqav2_test_annotations_json_path", - type=str, - default=None, -) parser.add_argument( "--vqav2_final_test_questions_json_path", type=str, @@ -246,152 +194,6 @@ default=None, ) -## OK-VQA Dataset -parser.add_argument( - "--ok_vqa_train_image_dir_path", - type=str, - help="Path to the vqav2/train2014 directory.", - default=None, -) -parser.add_argument( - "--ok_vqa_train_questions_json_path", - type=str, - help="Path to the v2_OpenEnded_mscoco_train2014_questions.json file.", - default=None, -) -parser.add_argument( - "--ok_vqa_train_annotations_json_path", - type=str, - help="Path to the v2_mscoco_train2014_annotations.json file.", - default=None, -) -parser.add_argument( - "--ok_vqa_test_image_dir_path", - type=str, - help="Path to the vqav2/val2014 directory.", - default=None, -) -parser.add_argument( - "--ok_vqa_test_questions_json_path", - type=str, - help="Path to the v2_OpenEnded_mscoco_val2014_questions.json file.", - default=None, -) -parser.add_argument( - "--ok_vqa_test_annotations_json_path", - type=str, - help="Path to the v2_mscoco_val2014_annotations.json file.", - default=None, -) - -## VizWiz Dataset -parser.add_argument( - "--vizwiz_train_image_dir_path", - type=str, - help="Path to the vizwiz train images directory.", - default=None, -) -parser.add_argument( - "--vizwiz_test_image_dir_path", - type=str, - help="Path to the vizwiz test images directory.", - default=None, -) -parser.add_argument( - "--vizwiz_train_questions_json_path", - type=str, - help="Path to the vizwiz questions json file.", - default=None, -) -parser.add_argument( - "--vizwiz_train_annotations_json_path", - type=str, - help="Path to the vizwiz annotations json file.", - default=None, -) -parser.add_argument( 
- "--vizwiz_test_questions_json_path", - type=str, - help="Path to the vizwiz questions json file.", - default=None, -) -parser.add_argument( - "--vizwiz_test_annotations_json_path", - type=str, - help="Path to the vizwiz annotations json file.", - default=None, -) - -# TextVQA Dataset -parser.add_argument( - "--textvqa_image_dir_path", - type=str, - help="Path to the textvqa images directory.", - default=None, -) -parser.add_argument( - "--textvqa_train_questions_json_path", - type=str, - help="Path to the textvqa questions json file.", - default=None, -) -parser.add_argument( - "--textvqa_train_annotations_json_path", - type=str, - help="Path to the textvqa annotations json file.", - default=None, -) -parser.add_argument( - "--textvqa_test_questions_json_path", - type=str, - help="Path to the textvqa questions json file.", - default=None, -) -parser.add_argument( - "--textvqa_test_annotations_json_path", - type=str, - help="Path to the textvqa annotations json file.", - default=None, -) - -# GQA Dataset -parser.add_argument( - "--gqa_train_image_dir_path", - type=str, - help="Path to the gqa train images directory.", - default=None, -) -parser.add_argument( - "--gqa_train_questions_json_path", - type=str, - help="Path to the gqa questions json file.", - default=None, -) -parser.add_argument( - "--gqa_train_annotations_json_path", - type=str, - help="Path to the gqa annotations json file", - default=None, -) -parser.add_argument( - "--gqa_test_image_dir_path", - type=str, - help="Path to the gqa test images directory.", - default=None, -) -parser.add_argument( - "--gqa_test_questions_json_path", - type=str, - help="Path to the gqa questions json file", - default=None, -) -parser.add_argument( - "--gqa_test_annotations_json_path", - type=str, - help="Path to the gqa annotations json file", - default=None, -) - ## Imagenet dataset parser.add_argument("--imagenet_root", type=str, default="/tmp") @@ -444,6 +246,7 @@ def main(): args, leftovers = 
parser.parse_known_args() + var_args = vars(args) # set up distributed evaluation args.local_rank, args.rank, args.world_size = world_info_from_env() @@ -473,351 +276,125 @@ def main(): # Run through datasets and evaluate results = defaultdict(list) - - if args.eval_flickr30: - print("Evaluating on Flickr30k...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/flickr30.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - cider_score = evaluate_captioning( - args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="flickr", - cached_features=cached_features, - ) - if args.rank == 0: - print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}") - scores.append(cider_score) - - if args.rank == 0: - print(f"Shots {shot} Mean CIDEr score: {np.nanmean(scores)}") - results["flickr30"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) - - if args.eval_coco: - print("Evaluating on COCO...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/coco.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - cider_score = evaluate_captioning( - args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="coco", - cached_features=cached_features, - ) - if args.rank == 0: - print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}") - scores.append(cider_score) - - if args.rank == 0: - print(f"Shots {shot} Mean CIDEr score: {np.nanmean(scores)}") - results["coco"].append( - { - "shots": 
shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) - - if args.eval_okvqa: - print("Evaluating on OK-VQA...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/ok_vqa.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - ok_vqa_score = evaluate_vqa( - args=args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="okvqa", - cached_features=cached_features, - ) - if args.rank == 0: - print(f"Shots {shot} Trial {trial} OK-VQA score: {ok_vqa_score}") - scores.append(ok_vqa_score) - - if args.rank == 0: - print(f"Shots {shot} Mean OK-VQA score: {np.nanmean(scores)}") - results["ok_vqa"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) - - if args.eval_vqav2: - print("Evaluating on VQAv2...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/vqav2.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - vqa_score = evaluate_vqa( - args=args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="vqav2", - cached_features=cached_features, - ) - if args.rank == 0 and vqa_score is not None: - print(f"Shots {shot} Trial {trial} VQA score: {vqa_score}") - scores.append(vqa_score) - - if args.rank == 0 and len(scores) > 0: - print(f"Shots {shot} Mean VQA score: {np.nanmean(scores)}") - results["vqav2"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) - - if args.eval_vizwiz: - 
print("Evaluating on VizWiz...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/vizwiz.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - vizwiz_score = evaluate_vqa( - args=args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="vizwiz", - cached_features=cached_features, - ) - if args.rank == 0 and vizwiz_score is not None: - print(f"Shots {shot} Trial {trial} VizWiz score: {vizwiz_score}") - scores.append(vizwiz_score) - - if args.rank == 0 and len(scores) > 0: - print(f"Shots {shot} Mean VizWiz score: {np.nanmean(scores)}") - results["vizwiz"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } + + for task in ["flickr30", "coco"]: + if var_args[f"eval_{task}"]: + print(f"Evaluating on {task}...") + + # load cached demonstration features for RICES + if args.cached_demonstration_features is not None: + cached_features = torch.load( + f"{args.cached_demonstration_features}/{task}.pkl", map_location="cpu" ) + else: + cached_features = None + + for shot in args.shots: + scores = [] + for seed, trial in zip(args.trial_seeds, range(args.num_trials)): + cider_score = evaluate_captioning( + args, + eval_model=eval_model, + num_shots=shot, + seed=seed, + dataset_name="flickr" if task=="flickr30" else task, + cached_features=cached_features, + ) + if args.rank == 0: + print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}") + scores.append(cider_score) - if args.eval_textvqa: - print("Evaluating on TextVQA...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/textvqa.pkl", map_location="cpu" - ) - else: - 
cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - textvqa_score = evaluate_vqa( - args=args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="textvqa", - max_new_tokens=10, - cached_features=cached_features, - ) if args.rank == 0: - print(f"Shots {shot} Trial {trial} TextVQA score: {textvqa_score}") - scores.append(textvqa_score) - - if args.rank == 0: - print(f"Shots {shot} Mean TextVQA score: {np.nanmean(scores)}") - results["textvqa"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) + print(f"Shots {shot} Mean CIDEr score: {np.nanmean(scores)}") + results[task].append( + { + "shots": shot, + "trials": scores, + "mean": np.nanmean(scores), + "stddev": np.nanstd(scores), + } + ) - if args.eval_gqa: - print("Evaluating on GQA...") - - #load cached demonstration features on GQA - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/imagenet.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - gqa_score = evaluate_vqa( - args=args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="gqa", - max_new_tokens=10, - cached_features=cached_features, + for vqa_task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: + if var_args[f"eval_{vqa_task}"]: + print(f"Evaluating on {vqa_task}...") + + # load cached demonstration features for RICES + if args.cached_demonstration_features is not None: + cached_features = torch.load( + f"{args.cached_demonstration_features}/{'ok_vqa' if vqa_task=='okvqa' else vqa_task}.pkl", map_location="cpu" ) - if args.rank == 0: - print(f"Shots {shot} Trial {trial} GQA score: {gqa_score}") - scores.append(gqa_score) - - if args.rank == 0: - print(f"Shots {shot} Mean 
GQA score: {np.nanmean(scores)}") - results["gqa"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) - - if args.eval_imagenet: - print("Evaluating on ImageNet...") + else: + cached_features = None + + for shot in args.shots: + scores = [] + for seed, trial in zip(args.trial_seeds, range(args.num_trials)): + vqa_score = evaluate_vqa( + args=args, + eval_model=eval_model, + num_shots=shot, + seed=seed, + dataset_name=vqa_task, + cached_features=cached_features, + ) + if args.rank == 0: + print(f"Shots {shot} Trial {trial} {vqa_task} score: {vqa_score}") + scores.append(vqa_score) - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/imagenet.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - imagenet_score = evaluate_classification( - args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - no_kv_caching=args.no_caching_for_classification, - dataset_name="imagenet", - cached_features=cached_features, - use_prompt_ensembling=args.classification_prompt_ensembling, - ) if args.rank == 0: - print( - f"Shots {shot} Trial {trial} " - f"ImageNet score: {imagenet_score}" + print(f"Shots {shot} Mean {vqa_task} score: {np.nanmean(scores)}") + results[vqa_task].append( + { + "shots": shot, + "trials": scores, + "mean": np.nanmean(scores), + "stddev": np.nanstd(scores), + } ) - scores.append(imagenet_score) - - if args.rank == 0: - print(f"Shots {shot} Mean ImageNet score: {np.nanmean(scores)}") - results["imagenet"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } + + for classification_task in ["imagenet", "hateful_memes"]: + if var_args[f"eval_{classification_task}"]: + print(f"Evaluating on 
{classification_task}...") + + # load cached demonstration features for RICES + if args.cached_demonstration_features is not None: + cached_features = torch.load( + f"{args.cached_demonstration_features}/{classification_task}.pkl", map_location="cpu" ) + else: + cached_features = None + + for shot in args.shots: + scores = [] + for seed, trial in zip(args.trial_seeds, range(args.num_trials)): + classification_score = evaluate_classification( + args, + eval_model=eval_model, + num_shots=shot, + seed=seed, + no_kv_caching=args.no_caching_for_classification, + dataset_name=classification_task, + cached_features=cached_features, + use_prompt_ensembling=args.classification_prompt_ensembling, + ) + if args.rank == 0: + print( + f"Shots {shot} Trial {trial} " + f"{classification_task.replace('_', ' ')} score: {classification_score}" + ) + scores.append(classification_score) - if args.eval_hateful_memes: - print("Evaluating on Hateful Memes...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/hateful_memes.pkl", - map_location="cpu", - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - hateful_memes_score = evaluate_classification( - args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - no_kv_caching=args.no_caching_for_classification, - dataset_name="hateful_memes", - cached_features=cached_features, - ) if args.rank == 0: - print( - f"Shots {shot} Trial {trial} " - f"Hateful Memes score: {hateful_memes_score}" + print(f"Shots {shot} Mean {classification_task.replace('_', ' ')} score: {np.nanmean(scores)}") + results[classification_task].append( + { + "shots": shot, + "trials": scores, + "mean": np.nanmean(scores), + "stddev": np.nanstd(scores), + } ) - scores.append(hateful_memes_score) - - if args.rank == 0: - print(f"Shots {shot} Mean 
Hateful Memes score: {np.nanmean(scores)}") - results["hateful_memes"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) if args.rank == 0 and args.results_file is not None: with open(args.results_file, "w") as f: @@ -1023,43 +600,17 @@ def evaluate_vqa( Returns: float: accuracy score """ - - if dataset_name == "okvqa": - train_image_dir_path = args.ok_vqa_train_image_dir_path - train_questions_json_path = args.ok_vqa_train_questions_json_path - train_annotations_json_path = args.ok_vqa_train_annotations_json_path - test_image_dir_path = args.ok_vqa_test_image_dir_path - test_questions_json_path = args.ok_vqa_test_questions_json_path - test_annotations_json_path = args.ok_vqa_test_annotations_json_path - elif dataset_name == "vqav2": - train_image_dir_path = args.vqav2_train_image_dir_path - train_questions_json_path = args.vqav2_train_questions_json_path - train_annotations_json_path = args.vqav2_train_annotations_json_path - test_image_dir_path = args.vqav2_test_image_dir_path - test_questions_json_path = args.vqav2_test_questions_json_path - test_annotations_json_path = args.vqav2_test_annotations_json_path - elif dataset_name == "vizwiz": - train_image_dir_path = args.vizwiz_train_image_dir_path - train_questions_json_path = args.vizwiz_train_questions_json_path - train_annotations_json_path = args.vizwiz_train_annotations_json_path - test_image_dir_path = args.vizwiz_test_image_dir_path - test_questions_json_path = args.vizwiz_test_questions_json_path - test_annotations_json_path = args.vizwiz_test_annotations_json_path - elif dataset_name == "textvqa": - train_image_dir_path = args.textvqa_image_dir_path - train_questions_json_path = args.textvqa_train_questions_json_path - train_annotations_json_path = args.textvqa_train_annotations_json_path - test_image_dir_path = args.textvqa_image_dir_path - test_questions_json_path = args.textvqa_test_questions_json_path - test_annotations_json_path = 
args.textvqa_test_annotations_json_path - elif dataset_name == "gqa": - train_image_dir_path = args.gqa_train_image_dir_path - train_questions_json_path = args.gqa_train_questions_json_path - train_annotations_json_path = args.gqa_train_annotations_json_path - test_image_dir_path = args.gqa_test_image_dir_path - test_questions_json_path = args.gqa_test_questions_json_path - test_annotations_json_path = args.gqa_test_annotations_json_path - else: + var_args = vars(args) + for task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: + if dataset_name == task: + task = task if task!="okvqa" else "ok_vqa" + train_image_dir_path = var_args[f"{task}_train_image_dir_path" if task!="textvqa" and task!="gqa" else f"{task}_image_dir_path"] + train_questions_json_path = var_args[f"{task}_train_questions_json_path"] + train_annotations_json_path = var_args[f"{task}_train_annotations_json_path"] + test_image_dir_path = var_args[f"{task}_test_image_dir_path" if task!="textvqa" and task!="gqa" else f"{task}_image_dir_path"] + test_questions_json_path = var_args[f"{task}_test_questions_json_path"] + test_annotations_json_path = var_args[f"{task}_test_annotations_json_path"] + if dataset_name not in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: raise ValueError(f"Unsupported dataset: {dataset_name}") train_dataset = VQADataset( From c1bdb640a4980607ec996b0b4fdd501fe27ab490 Mon Sep 17 00:00:00 2001 From: Oscar Lo Date: Thu, 9 May 2024 15:44:28 -0700 Subject: [PATCH 3/4] changed all ok_vqa to okvqa --- open_flamingo/eval/evaluate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/open_flamingo/eval/evaluate.py b/open_flamingo/eval/evaluate.py index 402cefb7..4935d3e2 100644 --- a/open_flamingo/eval/evaluate.py +++ b/open_flamingo/eval/evaluate.py @@ -153,7 +153,7 @@ ) ## VQAV2, OK-VQA, VizWiz, TextVQA, GQA Datasets -for task in ['vqav2', 'ok_vqa', 'vizwiz', 'textvqa', 'gqa']: +for task in ['vqav2', 'okvqa', 'vizwiz', 'textvqa', 'gqa']: parser.add_argument( 
f"--{task}_image_dir_path" if task=='gqa' or task=='textvqa' else f"--{task}_train_image_dir_path", type=str, @@ -322,7 +322,7 @@ def main(): # load cached demonstration features for RICES if args.cached_demonstration_features is not None: cached_features = torch.load( - f"{args.cached_demonstration_features}/{'ok_vqa' if vqa_task=='okvqa' else vqa_task}.pkl", map_location="cpu" + f"{args.cached_demonstration_features}/{vqa_task}.pkl", map_location="cpu" ) else: cached_features = None @@ -603,7 +603,7 @@ def evaluate_vqa( var_args = vars(args) for task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: if dataset_name == task: - task = task if task!="okvqa" else "ok_vqa" + task = task train_image_dir_path = var_args[f"{task}_train_image_dir_path" if task!="textvqa" and task!="gqa" else f"{task}_image_dir_path"] train_questions_json_path = var_args[f"{task}_train_questions_json_path"] train_annotations_json_path = var_args[f"{task}_train_annotations_json_path"] @@ -706,7 +706,7 @@ def evaluate_vqa( process_function = ( postprocess_ok_vqa_generation - if dataset_name == "ok_vqa" + if dataset_name == "okvqa" else postprocess_vqa_generation ) From e83c64226b945dc414b95ae8be4f3ed4e13fdf4e Mon Sep 17 00:00:00 2001 From: Oscar Lo Date: Fri, 24 May 2024 21:51:04 -0700 Subject: [PATCH 4/4] added mantis_eval --- open_flamingo/eval/eval_datasets.py | 17 ++++- open_flamingo/eval/eval_models/blip.py | 10 ++- .../eval/eval_models/open_flamingo.py | 3 + open_flamingo/eval/evaluate.py | 70 +++++++++++-------- open_flamingo/eval/utils.py | 23 ++++++ open_flamingo/eval/vqa_metric.py | 20 ++++++ 6 files changed, 110 insertions(+), 33 deletions(-) diff --git a/open_flamingo/eval/eval_datasets.py b/open_flamingo/eval/eval_datasets.py index 23d4ae1d..bbbca150 100644 --- a/open_flamingo/eval/eval_datasets.py +++ b/open_flamingo/eval/eval_datasets.py @@ -15,6 +15,7 @@ "vizwiz", "textvqa", "gqa", + "mantiseval", "hateful_memes", "imagenet", ] @@ -107,14 +108,26 @@ def get_img_path(self, 
question): return os.path.join(self.image_dir_path, question["image_id"]) elif self.dataset_name == "textvqa" or self.dataset_name == "gqa": return os.path.join(self.image_dir_path, f"{question['image_id']}.jpg") + elif self.dataset_name == "mantiseval": + img_paths = [] + for img_id in question['image_id']: + img_paths.append(os.path.join(self.image_dir_path, f"{img_id}.jpg")) + return img_paths else: raise Exception(f"Unknown VQA dataset {self.dataset_name}") def __getitem__(self, idx): question = self.questions[idx] img_path = self.get_img_path(question) - image = Image.open(img_path) - image.load() + if self.dataset_name == "mantiseval": + image = [] + for path in img_path: + img = Image.open(path) + img.load() + image.append(img) + else: + image = Image.open(img_path) + image.load() results = { "image": image, "question": question["question"], diff --git a/open_flamingo/eval/eval_models/blip.py b/open_flamingo/eval/eval_models/blip.py index 725b0470..a5c1bf76 100644 --- a/open_flamingo/eval/eval_models/blip.py +++ b/open_flamingo/eval/eval_models/blip.py @@ -5,7 +5,7 @@ from transformers import Blip2Processor, Blip2ForConditionalGeneration from eval_models.eval_model import BaseEvalModel -from utils import unwrap_model +from utils import unwrap_model, combine_images from transformers.modeling_outputs import CausalLMOutputWithPast @@ -27,9 +27,14 @@ def required_args(self): def prepare_images(self, batch: List[List[Image.Image]]) -> torch.Tensor: batch_images = None + for i in range(len(batch)): + if len(batch[i]) > 1: + batch[i] = combine_images(batch[i]) + """ assert all( len(example) == 1 for example in batch ), "BLIP-2 only supports one image per example" + """ for example in batch: if batch_images is None: batch_images = self.processor.image_processor( @@ -111,6 +116,9 @@ def get_textvqa_prompt(self, question, answer=None) -> str: def get_gqa_prompt(self, question, answer=None) -> str: return f"Question:{question} Short answer:{answer if answer is not None 
else ''}" + + def get_mantiseval_prompt(self, question, answer=None) -> str: + return f"Question:{question} Short answer:{answer if answer is not None else ''}" def get_coco_prompt(self, caption=None) -> str: return f"A photo of {caption if caption is not None else ''}" diff --git a/open_flamingo/eval/eval_models/open_flamingo.py b/open_flamingo/eval/eval_models/open_flamingo.py index d73417ab..98165529 100644 --- a/open_flamingo/eval/eval_models/open_flamingo.py +++ b/open_flamingo/eval/eval_models/open_flamingo.py @@ -291,6 +291,9 @@ def get_textvqa_prompt(self, question, answer=None) -> str: def get_gqa_prompt(self, question, answer=None) -> str: return f"Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}" + def get_mantiseval_prompt(self, question, answer=None) -> str: + return f"Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}" + def get_coco_prompt(self, caption=None) -> str: return f"Output:{caption if caption is not None else ''}{'<|endofchunk|>' if caption is not None else ''}" diff --git a/open_flamingo/eval/evaluate.py b/open_flamingo/eval/evaluate.py index 4935d3e2..a1c73955 100644 --- a/open_flamingo/eval/evaluate.py +++ b/open_flamingo/eval/evaluate.py @@ -34,7 +34,7 @@ HatefulMemesDataset, ) from ok_vqa_utils import postprocess_ok_vqa_generation -from vqa_metric import compute_vqa_accuracy, postprocess_vqa_generation +from vqa_metric import compute_vqa_accuracy, postprocess_vqa_generation, compute_mantis_accuracy parser = argparse.ArgumentParser() parser.add_argument( @@ -152,29 +152,30 @@ default=None, ) -## VQAV2, OK-VQA, VizWiz, TextVQA, GQA Datasets -for task in ['vqav2', 'okvqa', 'vizwiz', 'textvqa', 'gqa']: +## VQAV2, OK-VQA, VizWiz, TextVQA, GQA, Mantis-Eval Datasets +for task in ['vqav2', 'okvqa', 'vizwiz', 'textvqa', 'gqa', 'mantiseval']: parser.add_argument( - f"--{task}_image_dir_path" if task=='gqa' 
or task=='textvqa' else f"--{task}_train_image_dir_path", + f"--{task}_image_dir_path" if task=='gqa' or task=='textvqa' or task=='mantiseval' else f"--{task}_train_image_dir_path", type=str, default=None, ) - if task!='gqa' and task!='textvqa': + if task != 'mantiseval': + if task!='gqa' and task!='textvqa': + parser.add_argument( + f"--{task}_test_image_dir_path", + type=str, + default=None, + ) parser.add_argument( - f"--{task}_test_image_dir_path", + f"--{task}_train_questions_json_path", + type=str, + default=None, + ) + parser.add_argument( + f"--{task}_train_annotations_json_path", type=str, default=None, ) - parser.add_argument( - f"--{task}_train_questions_json_path", - type=str, - default=None, - ) - parser.add_argument( - f"--{task}_train_annotations_json_path", - type=str, - default=None, - ) parser.add_argument( f"--{task}_test_questions_json_path", type=str, @@ -315,7 +316,7 @@ def main(): } ) - for vqa_task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: + for vqa_task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa", "mantiseval"]: if var_args[f"eval_{vqa_task}"]: print(f"Evaluating on {vqa_task}...") @@ -601,16 +602,16 @@ def evaluate_vqa( float: accuracy score """ var_args = vars(args) - for task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: + for task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa", "mantiseval"]: if dataset_name == task: task = task - train_image_dir_path = var_args[f"{task}_train_image_dir_path" if task!="textvqa" and task!="gqa" else f"{task}_image_dir_path"] - train_questions_json_path = var_args[f"{task}_train_questions_json_path"] - train_annotations_json_path = var_args[f"{task}_train_annotations_json_path"] - test_image_dir_path = var_args[f"{task}_test_image_dir_path" if task!="textvqa" and task!="gqa" else f"{task}_image_dir_path"] + train_image_dir_path = var_args[f"{task}_train_image_dir_path" if task!="textvqa" and task!="gqa" and task!="mantiseval" else f"{task}_image_dir_path"] + train_questions_json_path 
= var_args[f"{task}_train_questions_json_path"] if task!="mantiseval" else var_args[f"{task}_test_questions_json_path"] + train_annotations_json_path = var_args[f"{task}_train_annotations_json_path"] if task!="mantiseval" else var_args[f"{task}_test_annotations_json_path"] + test_image_dir_path = var_args[f"{task}_test_image_dir_path" if task!="textvqa" and task!="gqa" and task!="mantiseval" else f"{task}_image_dir_path"] test_questions_json_path = var_args[f"{task}_test_questions_json_path"] test_annotations_json_path = var_args[f"{task}_test_annotations_json_path"] - if dataset_name not in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: + if dataset_name not in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa", "mantiseval"]: raise ValueError(f"Unsupported dataset: {dataset_name}") train_dataset = VQADataset( @@ -675,7 +676,10 @@ def evaluate_vqa( context_images = [x["image"] for x in batch_demo_samples[i]] else: context_images = [] - batch_images.append(context_images + [batch["image"][i]]) + if dataset_name == "mantiseval": + batch_images.append(context_images + batch["image"][i]) + else: + batch_images.append(context_images + [batch["image"][i]]) context_text = "".join( [ @@ -703,7 +707,7 @@ def evaluate_vqa( num_beams=num_beams, length_penalty=length_penalty, ) - + process_function = ( postprocess_ok_vqa_generation if dataset_name == "okvqa" @@ -732,11 +736,17 @@ def evaluate_vqa( f.write(json.dumps(all_predictions, indent=4)) if test_annotations_json_path is not None: - acc = compute_vqa_accuracy( - f"{dataset_name}results_{random_uuid}.json", - test_questions_json_path, - test_annotations_json_path, - ) + if dataset_name == "mantiseval": + acc = compute_mantis_accuracy( + f"{dataset_name}results_{random_uuid}.json", + test_annotations_json_path, + ) + else: + acc = compute_vqa_accuracy( + f"{dataset_name}results_{random_uuid}.json", + test_questions_json_path, + test_annotations_json_path, + ) # delete the temporary file 
os.remove(f"{dataset_name}results_{random_uuid}.json") diff --git a/open_flamingo/eval/utils.py b/open_flamingo/eval/utils.py index 6aa2052a..03473461 100644 --- a/open_flamingo/eval/utils.py +++ b/open_flamingo/eval/utils.py @@ -3,6 +3,7 @@ import random import torch.nn as nn from contextlib import suppress +from PIL import Image def random_seed(seed=42, rank=0): @@ -122,3 +123,25 @@ def get_autocast(precision): return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16) else: return suppress + +def combine_images(images): + _, img_heights = zip(*(img.size for img in images)) + avg_height = sum(img_heights) // len(img_heights) + for i, img in enumerate(images): + images[i] = img.resize((int(img.size[0] * avg_height / img.size[1]), avg_height)) + resized_widths, resized_heights = zip(*(img.size for img in images)) + total_width = sum(resized_widths) + max_height = max(resized_heights) + new_img = Image.new("RGB", (total_width + 10 * (len(images) - 1), max_height)) + x_offset = 0 + for i, img in enumerate(images): + if i > 0: + new_img.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) + x_offset += 1 + new_img.paste(Image.new("RGB", (8, max_height), (255, 255, 255)), (x_offset, 0)) + x_offset += 8 + new_img.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) + x_offset += 1 + new_img.paste(img, (x_offset, 0)) + x_offset += img.size[0] + return new_img \ No newline at end of file diff --git a/open_flamingo/eval/vqa_metric.py b/open_flamingo/eval/vqa_metric.py index 3659c556..7168d669 100644 --- a/open_flamingo/eval/vqa_metric.py +++ b/open_flamingo/eval/vqa_metric.py @@ -553,6 +553,26 @@ def compute_vqa_accuracy(result_json_path, question_json_path, annotation_json_p return vqaEval.accuracy["overall"] +def compute_mantis_accuracy(result_json_path, annotation_json_path): + dataset = json.load(open(annotation_json_path, "r")) + gt_ans = {} + for ann in dataset["annotations"]: + gt_ans[ann["question_id"]] = {"answer":
ann["answers"][0]["answer"], "type": ann["question_type"]} + results = json.load(open(result_json_path, "r")) + assert type(results) == list, "results is not an array of objects" + correct = 0 + for res in results: + res_ans = res["answer"].lower().strip('()\n ') + if gt_ans[res["question_id"]]["type"] == "multi-choice": + if len(res_ans) > 1: + for c in res_ans: + if c.isalpha(): + res_ans = c + break + if res_ans == gt_ans[res["question_id"]]["answer"].lower().strip('()\n '): + correct+=1 + acc = correct / len(results) + return acc def postprocess_vqa_generation(predictions): answer = re.split("Question|Answer|Short", predictions, 1)[0]