From 358cecc281bd744684b405a23b0b704d99f2eefd Mon Sep 17 00:00:00 2001 From: Oscar Lo Date: Thu, 2 May 2024 02:46:40 -0700 Subject: [PATCH 1/4] added gqa as eval dataset --- open_flamingo/eval/eval_datasets.py | 3 +- open_flamingo/eval/eval_models/blip.py | 3 + .../eval/eval_models/open_flamingo.py | 3 + open_flamingo/eval/evaluate.py | 91 +++++++++++++++++++ 4 files changed, 99 insertions(+), 1 deletion(-) diff --git a/open_flamingo/eval/eval_datasets.py b/open_flamingo/eval/eval_datasets.py index df50af6a..23d4ae1d 100644 --- a/open_flamingo/eval/eval_datasets.py +++ b/open_flamingo/eval/eval_datasets.py @@ -14,6 +14,7 @@ "okvqa", "vizwiz", "textvqa", + "gqa", "hateful_memes", "imagenet", ] @@ -104,7 +105,7 @@ def get_img_path(self, question): ) elif self.dataset_name == "vizwiz": return os.path.join(self.image_dir_path, question["image_id"]) - elif self.dataset_name == "textvqa": + elif self.dataset_name == "textvqa" or self.dataset_name == "gqa": return os.path.join(self.image_dir_path, f"{question['image_id']}.jpg") else: raise Exception(f"Unknown VQA dataset {self.dataset_name}") diff --git a/open_flamingo/eval/eval_models/blip.py b/open_flamingo/eval/eval_models/blip.py index 87f08036..725b0470 100644 --- a/open_flamingo/eval/eval_models/blip.py +++ b/open_flamingo/eval/eval_models/blip.py @@ -108,6 +108,9 @@ def get_vizwiz_prompt(self, question, answer=None) -> str: def get_textvqa_prompt(self, question, answer=None) -> str: return f"Question:{question} Short answer:{answer if answer is not None else ''}" + + def get_gqa_prompt(self, question, answer=None) -> str: + return f"Question:{question} Short answer:{answer if answer is not None else ''}" def get_coco_prompt(self, caption=None) -> str: return f"A photo of {caption if caption is not None else ''}" diff --git a/open_flamingo/eval/eval_models/open_flamingo.py b/open_flamingo/eval/eval_models/open_flamingo.py index 0a25198c..d73417ab 100644 --- a/open_flamingo/eval/eval_models/open_flamingo.py +++ 
b/open_flamingo/eval/eval_models/open_flamingo.py @@ -287,6 +287,9 @@ def get_vizwiz_prompt(self, question, answer=None) -> str: def get_textvqa_prompt(self, question, answer=None) -> str: return f"Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}" + + def get_gqa_prompt(self, question, answer=None) -> str: + return f"Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}" def get_coco_prompt(self, caption=None) -> str: return f"Output:{caption if caption is not None else ''}{'<|endofchunk|>' if caption is not None else ''}" diff --git a/open_flamingo/eval/evaluate.py b/open_flamingo/eval/evaluate.py index 4a25fca0..9a2c4e47 100644 --- a/open_flamingo/eval/evaluate.py +++ b/open_flamingo/eval/evaluate.py @@ -139,6 +139,14 @@ default=False, help="Whether to evaluate on TextVQA.", ) + +parser.add_argument( + "--eval_gqa", + action="store_true", + default=False, + help="Whether to evaluate on GQA.", +) + parser.add_argument( "--eval_imagenet", action="store_true", @@ -346,6 +354,44 @@ default=None, ) +# GQA Dataset +parser.add_argument( + "--gqa_train_image_dir_path", + type=str, + help="Path to the gqa train images directory.", + default=None, +) +parser.add_argument( + "--gqa_train_questions_json_path", + type=str, + help="Path to the gqa questions json file.", + default=None, +) +parser.add_argument( + "--gqa_train_annotations_json_path", + type=str, + help="Path to the gqa annotations json file", + default=None, +) +parser.add_argument( + "--gqa_test_image_dir_path", + type=str, + help="Path to the gqa test images directory.", + default=None, +) +parser.add_argument( + "--gqa_test_questions_json_path", + type=str, + help="Path to the gqa questions json file", + default=None, +) +parser.add_argument( + "--gqa_test_annotations_json_path", + type=str, + help="Path to the gqa annotations json file", + default=None, +) + ## Imagenet 
dataset parser.add_argument("--imagenet_root", type=str, default="/tmp") @@ -650,6 +696,44 @@ def main(): "stddev": np.nanstd(scores), } ) + + if args.eval_gqa: + print("Evaluating on GQA...") + + #load cached demonstration features on GQA + if args.cached_demonstration_features is not None: + cached_features = torch.load( + f"{args.cached_demonstration_features}/imagenet.pkl", map_location="cpu" + ) + else: + cached_features = None + + for shot in args.shots: + scores = [] + for seed, trial in zip(args.trial_seeds, range(args.num_trials)): + gqa_score = evaluate_vqa( + args=args, + eval_model=eval_model, + num_shots=shot, + seed=seed, + dataset_name="gqa", + max_new_tokens=10, + cached_features=cached_features, + ) + if args.rank == 0: + print(f"Shots {shot} Trial {trial} GQA score: {gqa_score}") + scores.append(gqa_score) + + if args.rank == 0: + print(f"Shots {shot} Mean GQA score: {np.nanmean(scores)}") + results["gqa"].append( + { + "shots": shot, + "trials": scores, + "mean": np.nanmean(scores), + "stddev": np.nanstd(scores), + } + ) if args.eval_imagenet: print("Evaluating on ImageNet...") @@ -968,6 +1052,13 @@ def evaluate_vqa( test_image_dir_path = args.textvqa_image_dir_path test_questions_json_path = args.textvqa_test_questions_json_path test_annotations_json_path = args.textvqa_test_annotations_json_path + elif dataset_name == "gqa": + train_image_dir_path = args.gqa_train_image_dir_path + train_questions_json_path = args.gqa_train_questions_json_path + train_annotations_json_path = args.gqa_train_annotations_json_path + test_image_dir_path = args.gqa_test_image_dir_path + test_questions_json_path = args.gqa_test_questions_json_path + test_annotations_json_path = args.gqa_test_annotations_json_path else: raise ValueError(f"Unsupported dataset: {dataset_name}") From 95c3cae476fe650eec57879e4dc66ff5ea0d46a0 Mon Sep 17 00:00:00 2001 From: Oscar Lo Date: Thu, 9 May 2024 14:37:49 -0700 Subject: [PATCH 2/4] cleaned up evaluate file arguments --- 
open_flamingo/eval/evaluate.py | 797 +++++++-------------------------- 1 file changed, 174 insertions(+), 623 deletions(-) diff --git a/open_flamingo/eval/evaluate.py b/open_flamingo/eval/evaluate.py index 9a2c4e47..402cefb7 100644 --- a/open_flamingo/eval/evaluate.py +++ b/open_flamingo/eval/evaluate.py @@ -109,65 +109,30 @@ ) # Per-dataset evaluation flags -parser.add_argument( - "--eval_coco", - action="store_true", - default=False, - help="Whether to evaluate on COCO.", -) -parser.add_argument( - "--eval_vqav2", - action="store_true", - default=False, - help="Whether to evaluate on VQAV2.", -) -parser.add_argument( - "--eval_okvqa", - action="store_true", - default=False, - help="Whether to evaluate on OK-VQA.", -) -parser.add_argument( - "--eval_vizwiz", - action="store_true", - default=False, - help="Whether to evaluate on VizWiz.", -) -parser.add_argument( - "--eval_textvqa", - action="store_true", - default=False, - help="Whether to evaluate on TextVQA.", -) - -parser.add_argument( - "--eval_gqa", - action="store_true", - default=False, - help="Whether to evaluate on GQA.", -) - -parser.add_argument( - "--eval_imagenet", - action="store_true", - default=False, - help="Whether to evaluate on ImageNet.", -) -parser.add_argument( - "--eval_flickr30", - action="store_true", - default=False, - help="Whether to evaluate on Flickr30.", -) -parser.add_argument( - "--eval_hateful_memes", - action="store_true", - default=False, - help="Whether to evaluate on Hateful Memes.", -) +for task in SUPPORTED_TASKS: + parser.add_argument( + f"--eval_{task}", + action="store_true", + default=False, + help=f"Whether to evaluate on {task.replace('_', ' ')}" + ) # Dataset arguments +for task in ['flickr', 'coco']: + parser.add_argument( + f"--{task}_karpathy_json_path", + type=str, + help="Path to the dataset_flickr30k.json file." 
if task=='flickr' else argparse.SUPPRESS, + default=None, + ) + parser.add_argument( + f"--{task}_annotations_json_path", + type=str, + help="Path to the dataset_flickr30k_coco_style.json file." if task=='flickr' else argparse.SUPPRESS, + default=None + ) + ## Flickr30 Dataset parser.add_argument( "--flickr_image_dir_path", @@ -175,17 +140,6 @@ help="Path to the flickr30/flickr30k_images directory.", default=None, ) -parser.add_argument( - "--flickr_karpathy_json_path", - type=str, - help="Path to the dataset_flickr30k.json file.", - default=None, -) -parser.add_argument( - "--flickr_annotations_json_path", - type=str, - help="Path to the dataset_flickr30k_coco_style.json file.", -) ## COCO Dataset parser.add_argument( "--coco_train_image_dir_path", @@ -197,48 +151,42 @@ type=str, default=None, ) -parser.add_argument( - "--coco_karpathy_json_path", - type=str, - default=None, -) -parser.add_argument( - "--coco_annotations_json_path", - type=str, - default=None, -) + +## VQAV2, OK-VQA, VizWiz, TextVQA, GQA Datasets +for task in ['vqav2', 'ok_vqa', 'vizwiz', 'textvqa', 'gqa']: + parser.add_argument( + f"--{task}_image_dir_path" if task=='gqa' or task=='textvqa' else f"--{task}_train_image_dir_path", + type=str, + default=None, + ) + if task!='gqa' and task!='textvqa': + parser.add_argument( + f"--{task}_test_image_dir_path", + type=str, + default=None, + ) + parser.add_argument( + f"--{task}_train_questions_json_path", + type=str, + default=None, + ) + parser.add_argument( + f"--{task}_train_annotations_json_path", + type=str, + default=None, + ) + parser.add_argument( + f"--{task}_test_questions_json_path", + type=str, + default=None, + ) + parser.add_argument( + f"--{task}_test_annotations_json_path", + type=str, + default=None, + ) ## VQAV2 Dataset -parser.add_argument( - "--vqav2_train_image_dir_path", - type=str, - default=None, -) -parser.add_argument( - "--vqav2_train_questions_json_path", - type=str, - default=None, -) -parser.add_argument( - 
"--vqav2_train_annotations_json_path", - type=str, - default=None, -) -parser.add_argument( - "--vqav2_test_image_dir_path", - type=str, - default=None, -) -parser.add_argument( - "--vqav2_test_questions_json_path", - type=str, - default=None, -) -parser.add_argument( - "--vqav2_test_annotations_json_path", - type=str, - default=None, -) parser.add_argument( "--vqav2_final_test_questions_json_path", type=str, @@ -246,152 +194,6 @@ default=None, ) -## OK-VQA Dataset -parser.add_argument( - "--ok_vqa_train_image_dir_path", - type=str, - help="Path to the vqav2/train2014 directory.", - default=None, -) -parser.add_argument( - "--ok_vqa_train_questions_json_path", - type=str, - help="Path to the v2_OpenEnded_mscoco_train2014_questions.json file.", - default=None, -) -parser.add_argument( - "--ok_vqa_train_annotations_json_path", - type=str, - help="Path to the v2_mscoco_train2014_annotations.json file.", - default=None, -) -parser.add_argument( - "--ok_vqa_test_image_dir_path", - type=str, - help="Path to the vqav2/val2014 directory.", - default=None, -) -parser.add_argument( - "--ok_vqa_test_questions_json_path", - type=str, - help="Path to the v2_OpenEnded_mscoco_val2014_questions.json file.", - default=None, -) -parser.add_argument( - "--ok_vqa_test_annotations_json_path", - type=str, - help="Path to the v2_mscoco_val2014_annotations.json file.", - default=None, -) - -## VizWiz Dataset -parser.add_argument( - "--vizwiz_train_image_dir_path", - type=str, - help="Path to the vizwiz train images directory.", - default=None, -) -parser.add_argument( - "--vizwiz_test_image_dir_path", - type=str, - help="Path to the vizwiz test images directory.", - default=None, -) -parser.add_argument( - "--vizwiz_train_questions_json_path", - type=str, - help="Path to the vizwiz questions json file.", - default=None, -) -parser.add_argument( - "--vizwiz_train_annotations_json_path", - type=str, - help="Path to the vizwiz annotations json file.", - default=None, -) -parser.add_argument( 
- "--vizwiz_test_questions_json_path", - type=str, - help="Path to the vizwiz questions json file.", - default=None, -) -parser.add_argument( - "--vizwiz_test_annotations_json_path", - type=str, - help="Path to the vizwiz annotations json file.", - default=None, -) - -# TextVQA Dataset -parser.add_argument( - "--textvqa_image_dir_path", - type=str, - help="Path to the textvqa images directory.", - default=None, -) -parser.add_argument( - "--textvqa_train_questions_json_path", - type=str, - help="Path to the textvqa questions json file.", - default=None, -) -parser.add_argument( - "--textvqa_train_annotations_json_path", - type=str, - help="Path to the textvqa annotations json file.", - default=None, -) -parser.add_argument( - "--textvqa_test_questions_json_path", - type=str, - help="Path to the textvqa questions json file.", - default=None, -) -parser.add_argument( - "--textvqa_test_annotations_json_path", - type=str, - help="Path to the textvqa annotations json file.", - default=None, -) - -# GQA Dataset -parser.add_argument( - "--gqa_train_image_dir_path", - type=str, - help="Path to the gqa train images directory.", - default=None, -) -parser.add_argument( - "--gqa_train_questions_json_path", - type=str, - help="Path to the gqa questions json file.", - default=None, -) -parser.add_argument( - "--gqa_train_annotations_json_path", - type=str, - help="Path to the gqa annotations json file", - default=None, -) -parser.add_argument( - "--gqa_test_image_dir_path", - type=str, - help="Path to the gqa test images directory.", - default=None, -) -parser.add_argument( - "--gqa_test_questions_json_path", - type=str, - help="Path to the gqa questions json file", - default=None, -) -parser.add_argument( - "--gqa_test_annotations_json_path", - type=str, - help="Path to the gqa annotations json file", - default=None, -) - ## Imagenet dataset parser.add_argument("--imagenet_root", type=str, default="/tmp") @@ -444,6 +246,7 @@ def main(): args, leftovers = 
parser.parse_known_args() + var_args = vars(args) # set up distributed evaluation args.local_rank, args.rank, args.world_size = world_info_from_env() @@ -473,351 +276,125 @@ def main(): # Run through datasets and evaluate results = defaultdict(list) - - if args.eval_flickr30: - print("Evaluating on Flickr30k...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/flickr30.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - cider_score = evaluate_captioning( - args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="flickr", - cached_features=cached_features, - ) - if args.rank == 0: - print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}") - scores.append(cider_score) - - if args.rank == 0: - print(f"Shots {shot} Mean CIDEr score: {np.nanmean(scores)}") - results["flickr30"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) - - if args.eval_coco: - print("Evaluating on COCO...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/coco.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - cider_score = evaluate_captioning( - args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="coco", - cached_features=cached_features, - ) - if args.rank == 0: - print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}") - scores.append(cider_score) - - if args.rank == 0: - print(f"Shots {shot} Mean CIDEr score: {np.nanmean(scores)}") - results["coco"].append( - { - "shots": 
shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) - - if args.eval_okvqa: - print("Evaluating on OK-VQA...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/ok_vqa.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - ok_vqa_score = evaluate_vqa( - args=args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="okvqa", - cached_features=cached_features, - ) - if args.rank == 0: - print(f"Shots {shot} Trial {trial} OK-VQA score: {ok_vqa_score}") - scores.append(ok_vqa_score) - - if args.rank == 0: - print(f"Shots {shot} Mean OK-VQA score: {np.nanmean(scores)}") - results["ok_vqa"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) - - if args.eval_vqav2: - print("Evaluating on VQAv2...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/vqav2.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - vqa_score = evaluate_vqa( - args=args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="vqav2", - cached_features=cached_features, - ) - if args.rank == 0 and vqa_score is not None: - print(f"Shots {shot} Trial {trial} VQA score: {vqa_score}") - scores.append(vqa_score) - - if args.rank == 0 and len(scores) > 0: - print(f"Shots {shot} Mean VQA score: {np.nanmean(scores)}") - results["vqav2"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) - - if args.eval_vizwiz: - 
print("Evaluating on VizWiz...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/vizwiz.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - vizwiz_score = evaluate_vqa( - args=args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="vizwiz", - cached_features=cached_features, - ) - if args.rank == 0 and vizwiz_score is not None: - print(f"Shots {shot} Trial {trial} VizWiz score: {vizwiz_score}") - scores.append(vizwiz_score) - - if args.rank == 0 and len(scores) > 0: - print(f"Shots {shot} Mean VizWiz score: {np.nanmean(scores)}") - results["vizwiz"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } + + for task in ["flickr30", "coco"]: + if var_args[f"eval_{task}"]: + print(f"Evaluating on {task}...") + + # load cached demonstration features for RICES + if args.cached_demonstration_features is not None: + cached_features = torch.load( + f"{args.cached_demonstration_features}/{task}.pkl", map_location="cpu" ) + else: + cached_features = None + + for shot in args.shots: + scores = [] + for seed, trial in zip(args.trial_seeds, range(args.num_trials)): + cider_score = evaluate_captioning( + args, + eval_model=eval_model, + num_shots=shot, + seed=seed, + dataset_name="flickr" if task=="flickr30" else task, + cached_features=cached_features, + ) + if args.rank == 0: + print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}") + scores.append(cider_score) - if args.eval_textvqa: - print("Evaluating on TextVQA...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/textvqa.pkl", map_location="cpu" - ) - else: - 
cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - textvqa_score = evaluate_vqa( - args=args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="textvqa", - max_new_tokens=10, - cached_features=cached_features, - ) if args.rank == 0: - print(f"Shots {shot} Trial {trial} TextVQA score: {textvqa_score}") - scores.append(textvqa_score) - - if args.rank == 0: - print(f"Shots {shot} Mean TextVQA score: {np.nanmean(scores)}") - results["textvqa"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) + print(f"Shots {shot} Mean CIDEr score: {np.nanmean(scores)}") + results[task].append( + { + "shots": shot, + "trials": scores, + "mean": np.nanmean(scores), + "stddev": np.nanstd(scores), + } + ) - if args.eval_gqa: - print("Evaluating on GQA...") - - #load cached demonstration features on GQA - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/imagenet.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - gqa_score = evaluate_vqa( - args=args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - dataset_name="gqa", - max_new_tokens=10, - cached_features=cached_features, + for vqa_task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: + if var_args[f"eval_{vqa_task}"]: + print(f"Evaluating on {vqa_task}...") + + # load cached demonstration features for RICES + if args.cached_demonstration_features is not None: + cached_features = torch.load( + f"{args.cached_demonstration_features}/{'ok_vqa' if vqa_task=='okvqa' else vqa_task}.pkl", map_location="cpu" ) - if args.rank == 0: - print(f"Shots {shot} Trial {trial} GQA score: {gqa_score}") - scores.append(gqa_score) - - if args.rank == 0: - print(f"Shots {shot} Mean 
GQA score: {np.nanmean(scores)}") - results["gqa"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) - - if args.eval_imagenet: - print("Evaluating on ImageNet...") + else: + cached_features = None + + for shot in args.shots: + scores = [] + for seed, trial in zip(args.trial_seeds, range(args.num_trials)): + vqa_score = evaluate_vqa( + args=args, + eval_model=eval_model, + num_shots=shot, + seed=seed, + dataset_name=vqa_task, + cached_features=cached_features, + ) + if args.rank == 0: + print(f"Shots {shot} Trial {trial} {vqa_task} score: {vqa_score}") + scores.append(vqa_score) - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/imagenet.pkl", map_location="cpu" - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - imagenet_score = evaluate_classification( - args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - no_kv_caching=args.no_caching_for_classification, - dataset_name="imagenet", - cached_features=cached_features, - use_prompt_ensembling=args.classification_prompt_ensembling, - ) if args.rank == 0: - print( - f"Shots {shot} Trial {trial} " - f"ImageNet score: {imagenet_score}" + print(f"Shots {shot} Mean {vqa_task} score: {np.nanmean(scores)}") + results[vqa_task].append( + { + "shots": shot, + "trials": scores, + "mean": np.nanmean(scores), + "stddev": np.nanstd(scores), + } ) - scores.append(imagenet_score) - - if args.rank == 0: - print(f"Shots {shot} Mean ImageNet score: {np.nanmean(scores)}") - results["imagenet"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } + + for classification_task in ["imagenet", "hateful_memes"]: + if var_args[f"eval_{classification_task}"]: + print(f"Evaluating on 
{classification_task}...") + + # load cached demonstration features for RICES + if args.cached_demonstration_features is not None: + cached_features = torch.load( + f"{args.cached_demonstration_features}/{classification_task}.pkl", map_location="cpu" ) + else: + cached_features = None + + for shot in args.shots: + scores = [] + for seed, trial in zip(args.trial_seeds, range(args.num_trials)): + classification_score = evaluate_classification( + args, + eval_model=eval_model, + num_shots=shot, + seed=seed, + no_kv_caching=args.no_caching_for_classification, + dataset_name=classification_task, + cached_features=cached_features, + use_prompt_ensembling=args.classification_prompt_ensembling, + ) + if args.rank == 0: + print( + f"Shots {shot} Trial {trial} " + f"{classification_task.replace('_', ' ')} score: {classification_score}" + ) + scores.append(classification_score) - if args.eval_hateful_memes: - print("Evaluating on Hateful Memes...") - - # load cached demonstration features for RICES - if args.cached_demonstration_features is not None: - cached_features = torch.load( - f"{args.cached_demonstration_features}/hateful_memes.pkl", - map_location="cpu", - ) - else: - cached_features = None - - for shot in args.shots: - scores = [] - for seed, trial in zip(args.trial_seeds, range(args.num_trials)): - hateful_memes_score = evaluate_classification( - args, - eval_model=eval_model, - num_shots=shot, - seed=seed, - no_kv_caching=args.no_caching_for_classification, - dataset_name="hateful_memes", - cached_features=cached_features, - ) if args.rank == 0: - print( - f"Shots {shot} Trial {trial} " - f"Hateful Memes score: {hateful_memes_score}" + print(f"Shots {shot} Mean {classification_task.replace('_', ' ')} score: {np.nanmean(scores)}") + results[classification_task].append( + { + "shots": shot, + "trials": scores, + "mean": np.nanmean(scores), + "stddev": np.nanstd(scores), + } ) - scores.append(hateful_memes_score) - - if args.rank == 0: - print(f"Shots {shot} Mean 
Hateful Memes score: {np.nanmean(scores)}") - results["hateful_memes"].append( - { - "shots": shot, - "trials": scores, - "mean": np.nanmean(scores), - "stddev": np.nanstd(scores), - } - ) if args.rank == 0 and args.results_file is not None: with open(args.results_file, "w") as f: @@ -1023,43 +600,17 @@ def evaluate_vqa( Returns: float: accuracy score """ - - if dataset_name == "okvqa": - train_image_dir_path = args.ok_vqa_train_image_dir_path - train_questions_json_path = args.ok_vqa_train_questions_json_path - train_annotations_json_path = args.ok_vqa_train_annotations_json_path - test_image_dir_path = args.ok_vqa_test_image_dir_path - test_questions_json_path = args.ok_vqa_test_questions_json_path - test_annotations_json_path = args.ok_vqa_test_annotations_json_path - elif dataset_name == "vqav2": - train_image_dir_path = args.vqav2_train_image_dir_path - train_questions_json_path = args.vqav2_train_questions_json_path - train_annotations_json_path = args.vqav2_train_annotations_json_path - test_image_dir_path = args.vqav2_test_image_dir_path - test_questions_json_path = args.vqav2_test_questions_json_path - test_annotations_json_path = args.vqav2_test_annotations_json_path - elif dataset_name == "vizwiz": - train_image_dir_path = args.vizwiz_train_image_dir_path - train_questions_json_path = args.vizwiz_train_questions_json_path - train_annotations_json_path = args.vizwiz_train_annotations_json_path - test_image_dir_path = args.vizwiz_test_image_dir_path - test_questions_json_path = args.vizwiz_test_questions_json_path - test_annotations_json_path = args.vizwiz_test_annotations_json_path - elif dataset_name == "textvqa": - train_image_dir_path = args.textvqa_image_dir_path - train_questions_json_path = args.textvqa_train_questions_json_path - train_annotations_json_path = args.textvqa_train_annotations_json_path - test_image_dir_path = args.textvqa_image_dir_path - test_questions_json_path = args.textvqa_test_questions_json_path - test_annotations_json_path = 
args.textvqa_test_annotations_json_path - elif dataset_name == "gqa": - train_image_dir_path = args.gqa_train_image_dir_path - train_questions_json_path = args.gqa_train_questions_json_path - train_annotations_json_path = args.gqa_train_annotations_json_path - test_image_dir_path = args.gqa_test_image_dir_path - test_questions_json_path = args.gqa_test_questions_json_path - test_annotations_json_path = args.gqa_test_annotations_json_path - else: + var_args = vars(args) + for task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: + if dataset_name == task: + task = task if task!="okvqa" else "ok_vqa" + train_image_dir_path = var_args[f"{task}_train_image_dir_path" if task!="textvqa" and task!="gqa" else f"{task}_image_dir_path"] + train_questions_json_path = var_args[f"{task}_train_questions_json_path"] + train_annotations_json_path = var_args[f"{task}_train_annotations_json_path"] + test_image_dir_path = var_args[f"{task}_test_image_dir_path" if task!="textvqa" and task!="gqa" else f"{task}_image_dir_path"] + test_questions_json_path = var_args[f"{task}_test_questions_json_path"] + test_annotations_json_path = var_args[f"{task}_test_annotations_json_path"] + if dataset_name not in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: raise ValueError(f"Unsupported dataset: {dataset_name}") train_dataset = VQADataset( From c1bdb640a4980607ec996b0b4fdd501fe27ab490 Mon Sep 17 00:00:00 2001 From: Oscar Lo Date: Thu, 9 May 2024 15:44:28 -0700 Subject: [PATCH 3/4] changed all ok_vqa to okvqa --- open_flamingo/eval/evaluate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/open_flamingo/eval/evaluate.py b/open_flamingo/eval/evaluate.py index 402cefb7..4935d3e2 100644 --- a/open_flamingo/eval/evaluate.py +++ b/open_flamingo/eval/evaluate.py @@ -153,7 +153,7 @@ ) ## VQAV2, OK-VQA, VizWiz, TextVQA, GQA Datasets -for task in ['vqav2', 'ok_vqa', 'vizwiz', 'textvqa', 'gqa']: +for task in ['vqav2', 'okvqa', 'vizwiz', 'textvqa', 'gqa']: parser.add_argument( 
f"--{task}_image_dir_path" if task=='gqa' or task=='textvqa' else f"--{task}_train_image_dir_path", type=str, @@ -322,7 +322,7 @@ def main(): # load cached demonstration features for RICES if args.cached_demonstration_features is not None: cached_features = torch.load( - f"{args.cached_demonstration_features}/{'ok_vqa' if vqa_task=='okvqa' else vqa_task}.pkl", map_location="cpu" + f"{args.cached_demonstration_features}/{vqa_task}.pkl", map_location="cpu" ) else: cached_features = None @@ -603,7 +603,7 @@ def evaluate_vqa( var_args = vars(args) for task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: if dataset_name == task: - task = task if task!="okvqa" else "ok_vqa" + task = task train_image_dir_path = var_args[f"{task}_train_image_dir_path" if task!="textvqa" and task!="gqa" else f"{task}_image_dir_path"] train_questions_json_path = var_args[f"{task}_train_questions_json_path"] train_annotations_json_path = var_args[f"{task}_train_annotations_json_path"] @@ -706,7 +706,7 @@ def evaluate_vqa( process_function = ( postprocess_ok_vqa_generation - if dataset_name == "ok_vqa" + if dataset_name == "okvqa" else postprocess_vqa_generation ) From e83c64226b945dc414b95ae8be4f3ed4e13fdf4e Mon Sep 17 00:00:00 2001 From: Oscar Lo Date: Fri, 24 May 2024 21:51:04 -0700 Subject: [PATCH 4/4] added mantis_eval --- open_flamingo/eval/eval_datasets.py | 17 ++++- open_flamingo/eval/eval_models/blip.py | 10 ++- .../eval/eval_models/open_flamingo.py | 3 + open_flamingo/eval/evaluate.py | 70 +++++++++++-------- open_flamingo/eval/utils.py | 23 ++++++ open_flamingo/eval/vqa_metric.py | 20 ++++++ 6 files changed, 110 insertions(+), 33 deletions(-) diff --git a/open_flamingo/eval/eval_datasets.py b/open_flamingo/eval/eval_datasets.py index 23d4ae1d..bbbca150 100644 --- a/open_flamingo/eval/eval_datasets.py +++ b/open_flamingo/eval/eval_datasets.py @@ -15,6 +15,7 @@ "vizwiz", "textvqa", "gqa", + "mantiseval", "hateful_memes", "imagenet", ] @@ -107,14 +108,26 @@ def get_img_path(self, 
question): return os.path.join(self.image_dir_path, question["image_id"]) elif self.dataset_name == "textvqa" or self.dataset_name == "gqa": return os.path.join(self.image_dir_path, f"{question['image_id']}.jpg") + elif self.dataset_name == "mantiseval": + img_paths = [] + for img_id in question['image_id']: + img_paths.append(os.path.join(self.image_dir_path, f"{img_id}.jpg")) + return img_paths else: raise Exception(f"Unknown VQA dataset {self.dataset_name}") def __getitem__(self, idx): question = self.questions[idx] img_path = self.get_img_path(question) - image = Image.open(img_path) - image.load() + if self.dataset_name == "mantiseval": + image = [] + for path in img_path: + img = Image.open(path) + img.load() + image.append(img) + else: + image = Image.open(img_path) + image.load() results = { "image": image, "question": question["question"], diff --git a/open_flamingo/eval/eval_models/blip.py b/open_flamingo/eval/eval_models/blip.py index 725b0470..a5c1bf76 100644 --- a/open_flamingo/eval/eval_models/blip.py +++ b/open_flamingo/eval/eval_models/blip.py @@ -5,7 +5,7 @@ from transformers import Blip2Processor, Blip2ForConditionalGeneration from eval_models.eval_model import BaseEvalModel -from utils import unwrap_model +from utils import unwrap_model, combine_images from transformers.modeling_outputs import CausalLMOutputWithPast @@ -27,9 +27,14 @@ def required_args(self): def prepare_images(self, batch: List[List[Image.Image]]) -> torch.Tensor: batch_images = None + for i in range(len(batch)): + if len(batch[i]) > 1: + batch[i] = combine_images(batch[i]) + """ assert all( len(example) == 1 for example in batch ), "BLIP-2 only supports one image per example" + """ for example in batch: if batch_images is None: batch_images = self.processor.image_processor( @@ -111,6 +116,9 @@ def get_textvqa_prompt(self, question, answer=None) -> str: def get_gqa_prompt(self, question, answer=None) -> str: return f"Question:{question} Short answer:{answer if answer is not None 
else ''}" + + def get_mantiseval_prompt(self, question, answer=None) -> str: + return f"Question:{question} Short answer:{answer if answer is not None else ''}" def get_coco_prompt(self, caption=None) -> str: return f"A photo of {caption if caption is not None else ''}" diff --git a/open_flamingo/eval/eval_models/open_flamingo.py b/open_flamingo/eval/eval_models/open_flamingo.py index d73417ab..98165529 100644 --- a/open_flamingo/eval/eval_models/open_flamingo.py +++ b/open_flamingo/eval/eval_models/open_flamingo.py @@ -291,6 +291,9 @@ def get_textvqa_prompt(self, question, answer=None) -> str: def get_gqa_prompt(self, question, answer=None) -> str: return f"Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}" + def get_mantiseval_prompt(self, question, answer=None) -> str: + return f"Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}" + def get_coco_prompt(self, caption=None) -> str: return f"Output:{caption if caption is not None else ''}{'<|endofchunk|>' if caption is not None else ''}" diff --git a/open_flamingo/eval/evaluate.py b/open_flamingo/eval/evaluate.py index 4935d3e2..a1c73955 100644 --- a/open_flamingo/eval/evaluate.py +++ b/open_flamingo/eval/evaluate.py @@ -34,7 +34,7 @@ HatefulMemesDataset, ) from ok_vqa_utils import postprocess_ok_vqa_generation -from vqa_metric import compute_vqa_accuracy, postprocess_vqa_generation +from vqa_metric import compute_vqa_accuracy, postprocess_vqa_generation, compute_mantis_accuracy parser = argparse.ArgumentParser() parser.add_argument( @@ -152,29 +152,30 @@ default=None, ) -## VQAV2, OK-VQA, VizWiz, TextVQA, GQA Datasets -for task in ['vqav2', 'okvqa', 'vizwiz', 'textvqa', 'gqa']: +## VQAV2, OK-VQA, VizWiz, TextVQA, GQA, Mantis-Eval Datasets +for task in ['vqav2', 'okvqa', 'vizwiz', 'textvqa', 'gqa', 'mantiseval']: parser.add_argument( - f"--{task}_image_dir_path" if task=='gqa' 
or task=='textvqa' else f"--{task}_train_image_dir_path", + f"--{task}_image_dir_path" if task=='gqa' or task=='textvqa' or task=='mantiseval' else f"--{task}_train_image_dir_path", type=str, default=None, ) - if task!='gqa' and task!='textvqa': + if task != 'mantiseval': + if task!='gqa' and task!='textvqa': + parser.add_argument( + f"--{task}_test_image_dir_path", + type=str, + default=None, + ) parser.add_argument( - f"--{task}_test_image_dir_path", + f"--{task}_train_questions_json_path", + type=str, + default=None, + ) + parser.add_argument( + f"--{task}_train_annotations_json_path", type=str, default=None, ) - parser.add_argument( - f"--{task}_train_questions_json_path", - type=str, - default=None, - ) - parser.add_argument( - f"--{task}_train_annotations_json_path", - type=str, - default=None, - ) parser.add_argument( f"--{task}_test_questions_json_path", type=str, @@ -315,7 +316,7 @@ def main(): } ) - for vqa_task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: + for vqa_task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa", "mantiseval"]: if var_args[f"eval_{vqa_task}"]: print(f"Evaluating on {vqa_task}...") @@ -601,16 +602,16 @@ def evaluate_vqa( float: accuracy score """ var_args = vars(args) - for task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: + for task in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa", "mantiseval"]: if dataset_name == task: task = task - train_image_dir_path = var_args[f"{task}_train_image_dir_path" if task!="textvqa" and task!="gqa" else f"{task}_image_dir_path"] - train_questions_json_path = var_args[f"{task}_train_questions_json_path"] - train_annotations_json_path = var_args[f"{task}_train_annotations_json_path"] - test_image_dir_path = var_args[f"{task}_test_image_dir_path" if task!="textvqa" and task!="gqa" else f"{task}_image_dir_path"] + train_image_dir_path = var_args[f"{task}_train_image_dir_path" if task!="textvqa" and task!="gqa" and task!="mantiseval" else f"{task}_image_dir_path"] + train_questions_json_path 
= var_args[f"{task}_train_questions_json_path"] if task!="mantiseval" else var_args[f"{task}_test_questions_json_path"] + train_annotations_json_path = var_args[f"{task}_train_annotations_json_path"] if task!="mantiseval" else var_args[f"{task}_test_annotations_json_path"] + test_image_dir_path = var_args[f"{task}_test_image_dir_path" if task!="textvqa" and task!="gqa" and task!="mantiseval" else f"{task}_image_dir_path"] test_questions_json_path = var_args[f"{task}_test_questions_json_path"] test_annotations_json_path = var_args[f"{task}_test_annotations_json_path"] - if dataset_name not in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa"]: + if dataset_name not in ["okvqa", "vqav2", "vizwiz", "textvqa", "gqa", "mantiseval"]: raise ValueError(f"Unsupported dataset: {dataset_name}") train_dataset = VQADataset( @@ -675,7 +676,10 @@ def evaluate_vqa( context_images = [x["image"] for x in batch_demo_samples[i]] else: context_images = [] - batch_images.append(context_images + [batch["image"][i]]) + if dataset_name == "mantiseval": + batch_images.append(context_images + batch["image"][i]) + else: + batch_images.append(context_images + [batch["image"][i]]) context_text = "".join( [ @@ -703,7 +707,7 @@ def evaluate_vqa( num_beams=num_beams, length_penalty=length_penalty, ) - + process_function = ( postprocess_ok_vqa_generation if dataset_name == "okvqa" @@ -732,11 +736,17 @@ def evaluate_vqa( f.write(json.dumps(all_predictions, indent=4)) if test_annotations_json_path is not None: - acc = compute_vqa_accuracy( - f"{dataset_name}results_{random_uuid}.json", - test_questions_json_path, - test_annotations_json_path, - ) + if dataset_name == "mantiseval": + acc = compute_mantis_accuracy( + f"{dataset_name}results_{random_uuid}.json", + test_annotations_json_path, + ) + else: + acc = compute_vqa_accuracy( + f"{dataset_name}results_{random_uuid}.json", + test_questions_json_path, + test_annotations_json_path, + ) # delete the temporary file 
os.remove(f"{dataset_name}results_{random_uuid}.json") diff --git a/open_flamingo/eval/utils.py b/open_flamingo/eval/utils.py index 6aa2052a..03473461 100644 --- a/open_flamingo/eval/utils.py +++ b/open_flamingo/eval/utils.py @@ -3,6 +3,7 @@ import random import torch.nn as nn from contextlib import suppress +from PIL import Image def random_seed(seed=42, rank=0): @@ -122,3 +123,25 @@ def get_autocast(precision): return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16) else: return suppress + +def combine_images(images): + _, img_heights = zip(*(img.size for img in images)) + avg_height = sum(img_heights) // len(img_heights) + for i, img in enumerate(images): + images[i] = img.resize((int(img.size[0] * avg_height / img.size[1]), avg_height)) + resized_widths, resized_heights = zip(*(img.size for img in images)) + total_width = sum(resized_widths) + max_height = max(resized_heights) + new_img = Image.new("RGB", (total_width + 10 * (len(images) - 1), max_height)) + x_offset = 0 + for i, img in enumerate(images): + if i > 0: + new_img.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) + x_offset += 1 + new_img.paste(Image.new("RGB", (8, max_height), (255, 255, 255)), (x_offset, 0)) + x_offset += 8 + new_img.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0)) + x_offset += 1 + new_img.paste(img, (x_offset, 0)) + x_offset += img.size[0] + return new_img \ No newline at end of file diff --git a/open_flamingo/eval/vqa_metric.py b/open_flamingo/eval/vqa_metric.py index 3659c556..7168d669 100644 --- a/open_flamingo/eval/vqa_metric.py +++ b/open_flamingo/eval/vqa_metric.py @@ -553,6 +553,26 @@ def compute_vqa_accuracy(result_json_path, question_json_path, annotation_json_p return vqaEval.accuracy["overall"] +def compute_mantis_accuracy(result_json_path, annotation_json_path): + dataset = json.load(open(annotation_json_path, "r")) + gt_ans = {} + for ann in dataset["annotations"]: + gt_ans[ann["question_id"]] = {"answer":
ann["answers"][0]["answer"], "type": ann["question_type"]} + results = json.load(open(result_json_path, "r")) + assert type(results) == list, "results is not an array of objects" + correct = 0 + for res in results: + res_ans = res["answer"].lower().strip('()\n ') + if gt_ans[res["question_id"]]["type"] == "multi-choice": + if len(res_ans) > 1: + for c in res_ans: + if c.isalpha(): + res_ans = c + break + if res_ans == gt_ans[res["question_id"]]["answer"].lower().strip('()\n '): + correct+=1 + acc = correct / len(results) + return acc def postprocess_vqa_generation(predictions): answer = re.split("Question|Answer|Short", predictions, 1)[0]