From 62e96446a2224700d16056625a5105e8fdc5f51c Mon Sep 17 00:00:00 2001 From: Kami-chanw <865710157@qq.com> Date: Tue, 13 Aug 2024 04:21:25 +0800 Subject: [PATCH] add vqa_accuracy and CIDEr --- metrics/CIDEr/CIDEr.py | 155 +++++++++++++ metrics/CIDEr/README.md | 75 +++++++ metrics/CIDEr/app.py | 6 + metrics/CIDEr/requirements.txt | 2 + metrics/vqa_accuracy/README.md | 66 ++++++ metrics/vqa_accuracy/app.py | 6 + metrics/vqa_accuracy/requirements.txt | 1 + metrics/vqa_accuracy/vqa_accuracy.py | 308 ++++++++++++++++++++++++++ 8 files changed, 619 insertions(+) create mode 100644 metrics/CIDEr/CIDEr.py create mode 100644 metrics/CIDEr/README.md create mode 100644 metrics/CIDEr/app.py create mode 100644 metrics/CIDEr/requirements.txt create mode 100644 metrics/vqa_accuracy/README.md create mode 100644 metrics/vqa_accuracy/app.py create mode 100644 metrics/vqa_accuracy/requirements.txt create mode 100644 metrics/vqa_accuracy/vqa_accuracy.py diff --git a/metrics/CIDEr/CIDEr.py b/metrics/CIDEr/CIDEr.py new file mode 100644 index 00000000..39a8272a --- /dev/null +++ b/metrics/CIDEr/CIDEr.py @@ -0,0 +1,155 @@ +from typing import List +import datasets +import evaluate +import os +import tempfile +import subprocess + +from pycocoevalcap.cider.cider import CiderScorer + +_DESCRIPTION = """ +The CIDEr (Consensus-based Image Description Evaluation) metric is used to evaluate the quality of image captions generated by models in image captioning tasks. +It measures how well the generated caption matches human-written reference captions by considering both the frequency and the relevance of words or phrases. +Here is the formula for the CIDEr metric in LaTeX code: + +$ +\\text{CIDEr}(c_i, C) = \\frac{1}{N} \\sum_{n=1}^{N} w_n \\cdot \\frac{\\sum_{j=1}^{m} \\text{IDF}(g_j) \\cdot \\text{TF}(g_j, c_i)}{\\sum_{j=1}^{m} \\text{IDF}(g_j) \\cdot \\text{TF}(g_j, C)} +$ + +where: +- $ c_i $ is the candidate caption, +- $ C $ is the set of reference captions, +- $ N $ is the number of n-grams (typically 1 to 4), +- $ w_n $ is the weight for the n-gram, +- $ g_j $ represents the j-th n-gram, +- $ \\text{TF}(g_j, c_i) $ is the term frequency of the n-gram $ g_j $ in the candidate caption $ c_i $, +- $ \\text{TF}(g_j, C) $ is the term frequency of the n-gram $ g_j $ in the reference captions $ C $, +- $ \\text{IDF}(g_j) $ is the inverse document frequency of the n-gram $ g_j $. +""" + + +_KWARGS_DESCRIPTION = """ +Args: + predictions (`list` of `str`): Predicted captions. + references (`list` of `str` lists): Ground truth captions. + n (int, defaults to 4): Number of ngrams for which (ngram) representation is calculated. + sigma (float, defaults to 6.0): The standard deviation parameter for gaussian penalty. + +Returns: + CIDEr (`float`): CIDEr value. Minimum possible value is 0. Maximum possible value is 100. 
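+
+Examples (an illustrative sketch rather than official documentation; it assumes the metric is
+    loaded from the `Kamichanw/CIDEr` Hub space mentioned in the README and that a Java runtime
+    is available for the Stanford PTB tokenizer):
+
+    >>> import evaluate
+    >>> cider = evaluate.load("Kamichanw/CIDEr")
+    >>> predictions = ["a cat sits on a mat"]
+    >>> references = [["a cat is sitting on a mat", "a feline rests on the mat"]]
+    >>> results = cider.compute(predictions=predictions, references=references)
+    >>> sorted(results.keys())
+    ['CIDEr']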
+ +""" + + +_CITATION = """ +@inproceedings{vedantam2015cider, + title={Cider: Consensus-based image description evaluation}, + author={Vedantam, Ramakrishna and Lawrence Zitnick, C and Parikh, Devi}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4566--4575}, + year={2015} +} +""" + +_URLS = { + "stanford-corenlp": "https://repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/3.4.1/stanford-corenlp-3.4.1.jar" +} + + +def tokenize(tokenizer_path: str, predictions: List[str], references: List[List[str]]): + PUNCTUATIONS = [ + "''", + "'", + "``", + "`", + "-LRB-", + "-RRB-", + "-LCB-", + "-RCB-", + ".", + "?", + "!", + ",", + ":", + "-", + "--", + "...", + ";", + ] + + cmd = [ + "java", + "-cp", + tokenizer_path, + "edu.stanford.nlp.process.PTBTokenizer", + "-preserveLines", + "-lowerCase", + ] + + sentences = "\n".join( + [ + s.replace("\n", " ") + for s in predictions + [ref for refs in references for ref in refs] + ] + ) + + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write(sentences.encode()) + + cmd.append(f.name) + p_tokenizer = subprocess.Popen(cmd, stdout=subprocess.PIPE) + token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] + token_lines = token_lines.decode() + lines = [ + " ".join([w for w in line.rstrip().split(" ") if w not in PUNCTUATIONS]) + for line in token_lines.split("\n") + ] + + os.remove(f.name) + + pred_size = len(predictions) + ref_sizes = [len(ref) for ref in references] + + predictions = lines[:pred_size] + start = pred_size + references = [] + for size in ref_sizes: + references.append(lines[start : start + size]) + start += size + + return predictions, references + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class CIDEr(evaluate.Metric): + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence( + datasets.Value("string", id="sequence"), id="references" + ), + } + ), + reference_urls=[ + "https://github.com/salaniz/pycocoevalcap", + "https://github.com/tylin/coco-caption", + ], + ) + + def _download_and_prepare(self, dl_manager): + self.tokenizer_path = dl_manager.download(_URLS["stanford-corenlp"]) + + def _compute(self, predictions, references, n=4, sigma=6.0): + predications, references = tokenize( + self.tokenizer_path, predictions, references + ) + scorer = CiderScorer(n=n, sigma=sigma) + for pred, refs in zip(predications, references): + scorer += (pred, refs) + score, scores = scorer.compute_score() + return {"CIDEr": score} diff --git a/metrics/CIDEr/README.md b/metrics/CIDEr/README.md new file mode 100644 index 00000000..d5053489 --- /dev/null +++ b/metrics/CIDEr/README.md @@ -0,0 +1,75 @@ +--- +title: CIDEr +emoji: 🐨 +colorFrom: blue +colorTo: red +sdk: gradio +sdk_version: 3.19.1 +app_file: app.py +pinned: false +--- + +# CIDEr Metric for Image Captioning Evaluation + +## CIDEr Description +The CIDEr (Consensus-based Image Description Evaluation) metric is widely used in image captioning tasks to evaluate the quality of generated captions. The metric assesses how well the generated caption aligns with human-written reference captions by considering both the frequency and relevance of words or phrases. 
The score is computed from TF-IDF weighted n-gram statistics, so n-grams that are informative for an image (frequent in its references but rare across the rest of the corpus) contribute more than generic ones.
+
+The formula for the CIDEr metric is as follows:
+
+$
+\text{CIDEr}(c_i, C) = \frac{1}{N} \sum_{n=1}^{N} w_n \cdot \frac{\sum_{j=1}^{m} \text{IDF}(g_j) \cdot \text{TF}(g_j, c_i)}{\sum_{j=1}^{m} \text{IDF}(g_j) \cdot \text{TF}(g_j, C)}
+$
+
+where:
+- $ c_i $ is the candidate caption,
+- $ C $ is the set of reference captions,
+- $ N $ is the number of n-gram orders (typically 1 to 4),
+- $ w_n $ is the weight for n-grams of order $ n $,
+- $ g_j $ represents the j-th n-gram,
+- $ \text{TF}(g_j, c_i) $ is the term frequency of the n-gram $ g_j $ in the candidate caption $ c_i $,
+- $ \text{TF}(g_j, C) $ is the term frequency of the n-gram $ g_j $ in the reference captions $ C $,
+- $ \text{IDF}(g_j) $ is the inverse document frequency of the n-gram $ g_j $.
+
+## How to Use
+Load the metric with `evaluate.load` and pass the predicted and reference captions. The metric tokenizes all captions with the Stanford PTB tokenizer (which requires a Java runtime) and then computes the CIDEr score.
+
+### Inputs
+- **predictions** *(list of str)*: The list of predicted captions generated by the model.
+- **references** *(list of list of str)*: The list of lists, where each inner list contains the reference captions for the corresponding prediction.
+- **n** *(int, optional, defaults to 4)*: Maximum n-gram order; the score averages over 1-grams up to n-grams.
+- **sigma** *(float, optional, defaults to 6.0)*: The standard deviation of the Gaussian penalty applied to the length difference between candidate and reference captions.
+
+### Output Values
+- **CIDEr** *(float)*: The computed CIDEr score, which typically ranges between 0 and 100. Higher scores indicate better alignment between the predicted and reference captions.
+
+### Examples
+
+```python
+>>> from evaluate import load
+>>> cider = load("Kamichanw/CIDEr")
+>>> predictions = ["A cat sits on a mat."]
+>>> references = [["A cat is sitting on a mat.", "A feline rests on the mat."]]
+>>> score = cider.compute(predictions=predictions, references=references)
+>>> print(score['CIDEr'])
+0.0
+```
+
+Note that CIDEr is a corpus-level metric: the IDF statistics are estimated from the references that are scored together, so a single prediction as in the toy example above always scores 0.0. Score the whole evaluation set in one call to obtain meaningful values (see the sketch after the Limitations section below).
+
+## Limitations and Bias
+The CIDEr metric primarily focuses on the n-gram overlap between predicted and reference captions. It may not adequately capture semantic nuances or variations in phrasing that still convey the same meaning. Moreover, CIDEr tends to favor longer captions with more word overlap, potentially biasing against concise but accurate captions.
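+
+Because the IDF statistics are estimated from the references that are scored together, CIDEr should be computed once over the whole evaluation set rather than sample by sample. The snippet below is a minimal, illustrative sketch of corpus-level usage with the documented `n` and `sigma` arguments spelled out; the captions are made up for demonstration.
+
+```python
+import evaluate
+
+# Hypothetical model outputs and per-image reference captions.
+predictions = [
+    "a dog runs across a grassy field",
+    "two people ride bicycles down a city street",
+]
+references = [
+    ["a dog is running through the grass", "a brown dog runs in a field"],
+    ["two cyclists ride along the road", "people biking down the street"],
+]
+
+cider = evaluate.load("Kamichanw/CIDEr")
+# n=4 and sigma=6.0 are the defaults; they are passed explicitly here for clarity.
+results = cider.compute(predictions=predictions, references=references, n=4, sigma=6.0)
+print(results["CIDEr"])
+```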
+ +## Citation +If you use the CIDEr metric in your research, please cite the original paper: + +```bibtex +@inproceedings{vedantam2015cider, + title={Cider: Consensus-based image description evaluation}, + author={Vedantam, Ramakrishna and Lawrence Zitnick, C and Parikh, Devi}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4566--4575}, + year={2015} +} +``` + +## Further References +- [CIDEr GitHub Repository](https://github.com/tylin/coco-caption) +- [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) \ No newline at end of file diff --git a/metrics/CIDEr/app.py b/metrics/CIDEr/app.py new file mode 100644 index 00000000..cda98659 --- /dev/null +++ b/metrics/CIDEr/app.py @@ -0,0 +1,6 @@ +import evaluate +from evaluate.utils import launch_gradio_widget + + +module = evaluate.load("CIDEr") +launch_gradio_widget(module) \ No newline at end of file diff --git a/metrics/CIDEr/requirements.txt b/metrics/CIDEr/requirements.txt new file mode 100644 index 00000000..6c0fe141 --- /dev/null +++ b/metrics/CIDEr/requirements.txt @@ -0,0 +1,2 @@ +git+https://github.com/huggingface/evaluate@main +pycocoevalcap \ No newline at end of file diff --git a/metrics/vqa_accuracy/README.md b/metrics/vqa_accuracy/README.md new file mode 100644 index 00000000..1808c28f --- /dev/null +++ b/metrics/vqa_accuracy/README.md @@ -0,0 +1,66 @@ +--- +title: VQA Accuracy +emoji: 🔥 +colorFrom: indigo +colorTo: gray +sdk: gradio +sdk_version: 3.19.1 +app_file: app.py +pinned: false +--- + +# VQAaccuracy Metric Card + +## Metric Description +The **VQAaccuracy** metric is used for evaluating the accuracy of visual question answering (VQA) models. It is designed to be robust to the variability in how different humans may phrase their answers. The accuracy for an answer (`ans`) predicted by the model is calculated as: +$ \text{Acc}(ans) = \min\left(\frac{\# \text{humans that said } ans}{3}, 1\right) $ +This metric aligns with the official VQA evaluation by averaging the machine accuracies over all possible sets of human annotators. + +## How to Use +The **VQAAccuracy** metric can be used to evaluate the performance of a VQA model by comparing the predicted answers to a set of ground truth answers. The metric can be integrated into your evaluation pipeline as follows: + +### Inputs +- **predictions** (`list` of `str`): The predicted answers generated by the VQA model. +- **references** (`list` of `str` lists): The ground truth answers corresponding to each question. +- **answer_types** (`list` of `str`, *optional*): The types of answers corresponding to each question. If not provided, defaults to `None`. +- **question_types** (`list` of `str`, *optional*): The types of questions corresponding to each question. If not provided, defaults to `None`. + +### Output Values +The output of this metric is a dictionary containing: +- **overall** (`float`): The overall VQA accuracy, rounded to the specified precision. +- **perAnswerType** (`dict`, *optional*): The VQA accuracy for each answer type, if provided. +- **perQuestionType** (`dict`, *optional*): The VQA accuracy for each question type, if provided. + +The accuracy values range from 0 to 100, with higher values indicating better performance. 
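+
+To make the annotator averaging concrete, the following minimal sketch (illustrative only; it mirrors the leave-one-annotator-out loop used by this metric and the official evaluation) scores a single predicted answer against ten human answers:
+
+```python
+# Per-question VQA accuracy for one prediction, assuming answers are already
+# normalized (lowercased, punctuation stripped, digits and articles handled).
+pred = "yes"
+gt_answers = ["yes"] * 7 + ["maybe"] * 3  # ten human answers for one question
+
+accuracies = []
+for i in range(len(gt_answers)):
+    # Leave annotator i out and score the prediction against the remaining answers.
+    others = gt_answers[:i] + gt_answers[i + 1:]
+    matches = sum(1 for ans in others if ans == pred)
+    accuracies.append(min(1.0, matches / 3))
+
+question_accuracy = sum(accuracies) / len(accuracies)
+print(round(100 * question_accuracy, 2))  # 100.0: at least three remaining annotators said "yes"
+```
+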
### Examples
+Here is an example of how to use the **VQAAccuracy** metric:
+
+```python
+>>> from evaluate import load
+>>> vqa_accuracy = load("Kamichanw/vqa_accuracy")
+>>> predictions = ["yes", "2", "blue"]
+>>> references = [["yes", "yeah", "yep"], ["2", "two"], ["blue", "bluish"]]
+>>> results = vqa_accuracy.compute(predictions=predictions, references=references)
+>>> print(round(results["overall"], 2))
+24.07
+```
+
+## Limitations and Bias
+The **VQAAccuracy** metric is dependent on the consistency and quality of the ground truth answers provided. Variability in human annotations can affect the accuracy scores. Additionally, the metric is designed specifically for the VQA task and may not generalize well to other types of question-answering models.
+
+## Citation
+If you use the **VQAAccuracy** metric in your work, please cite the original VQA paper:
+
+```bibtex
+@InProceedings{VQA,
+author = {Stanislaw Antol and Aishwarya Agrawal and Jiasen Lu and Margaret Mitchell and Dhruv Batra and C. Lawrence Zitnick and Devi Parikh},
+title = {VQA: Visual Question Answering},
+booktitle = {International Conference on Computer Vision (ICCV)},
+year = {2015},
+}
+```
+
+## Further References
+- [VQA Evaluation](https://visualqa.org/evaluation.html)
+- [VQA GitHub Repository](https://github.com/GT-Vision-Lab/VQA)
\ No newline at end of file
diff --git a/metrics/vqa_accuracy/app.py b/metrics/vqa_accuracy/app.py
new file mode 100644
index 00000000..e4d35436
--- /dev/null
+++ b/metrics/vqa_accuracy/app.py
@@ -0,0 +1,6 @@
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+
+module = evaluate.load("vqa_accuracy")
+launch_gradio_widget(module)
\ No newline at end of file
diff --git a/metrics/vqa_accuracy/requirements.txt b/metrics/vqa_accuracy/requirements.txt
new file mode 100644
index 00000000..050ec9e1
--- /dev/null
+++ b/metrics/vqa_accuracy/requirements.txt
@@ -0,0 +1 @@
+git+https://github.com/huggingface/evaluate@main
diff --git a/metrics/vqa_accuracy/vqa_accuracy.py b/metrics/vqa_accuracy/vqa_accuracy.py
new file mode 100644
index 00000000..89f448ee
--- /dev/null
+++ b/metrics/vqa_accuracy/vqa_accuracy.py
@@ -0,0 +1,308 @@
+import datasets
+import evaluate
+import re
+
+_DESCRIPTION = """
+VQA accuracy is an evaluation metric that is robust to inter-human variability in phrasing the answers:
+$\\text{Acc}(ans) = \\min \\left( \\frac{\\text{# humans that said }ans}{3}, 1 \\right)$
+where `ans` is the answer predicted by the model. In order to be consistent with 'human accuracies', machine accuracies are averaged over all 10 choose 9 sets of human annotators.
+"""
+
+
+_KWARGS_DESCRIPTION = """
+Args:
+    predictions (`list` of `str`): Predicted answers.
+    references (`list` of `str` lists): Ground truth answers.
+    answer_types (`list` of `str`, *optional*): Answer types corresponding to each question.
+    question_types (`list` of `str`, *optional*): Question types corresponding to each question.
+
+Returns:
+    visual question answering accuracy (`float`): VQA accuracy. Minimum possible value is 0. Maximum possible value is 100.
+
+"""
+
+
+_CITATION = """
+@InProceedings{VQA,
+author = {Stanislaw Antol and Aishwarya Agrawal and Jiasen Lu and Margaret Mitchell and Dhruv Batra and C.
Lawrence Zitnick and Devi Parikh}, +title = {{VQA}: {V}isual {Q}uestion {A}nswering}, +booktitle = {International Conference on Computer Vision (ICCV)}, +year = {2015}, +} +""" + +contractions = { + "aint": "ain't", + "arent": "aren't", + "cant": "can't", + "couldve": "could've", + "couldnt": "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + "didnt": "didn't", + "doesnt": "doesn't", + "dont": "don't", + "hadnt": "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + "hasnt": "hasn't", + "havent": "haven't", + "hed": "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + "hes": "he's", + "howd": "how'd", + "howll": "how'll", + "hows": "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + "Im": "I'm", + "Ive": "I've", + "isnt": "isn't", + "itd": "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + "itll": "it'll", + "let's": "let's", + "maam": "ma'am", + "mightnt": "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + "mightve": "might've", + "mustnt": "mustn't", + "mustve": "must've", + "neednt": "needn't", + "notve": "not've", + "oclock": "o'clock", + "oughtnt": "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + "shant": "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + "shouldve": "should've", + "shouldnt": "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": "somebodyd", + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + "somebodyll": "somebody'll", + "somebodys": "somebody's", + "someoned": "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + "someonell": "someone'll", + "someones": "someone's", + "somethingd": "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + "somethingll": "something'll", + "thats": "that's", + "thered": "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + "therere": "there're", + "theres": "there's", + "theyd": "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + "theyll": "they'll", + "theyre": "they're", + "theyve": "they've", + "twas": "'twas", + "wasnt": "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + "weve": "we've", + "werent": "weren't", + "whatll": "what'll", + "whatre": "what're", + "whats": "what's", + "whatve": "what've", + "whens": "when's", + "whered": "where'd", + "wheres": "where's", + "whereve": "where've", + "whod": "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + "wholl": "who'll", + "whos": "who's", + "whove": "who've", + "whyll": "why'll", + "whyre": "why're", + "whys": "why's", + "wont": "won't", + "wouldve": "would've", + "wouldnt": "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + "yall": "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + "youd": "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + "youll": "you'll", + "youre": "you're", + "youve": "you've", +} +manualMap = { + "none": "0", + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", +} +articles = ["a", "an", "the"] + +periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)") +commaStrip = re.compile(r"(\d)(\,)(\d)") +punct = [ + ";", + r"/", + "[", + "]", + '"', + "{", + "}", + "(", + ")", + "=", + "+", + "\\", + "_", + "-", + ">", + "<", + 
"@", + "`", + ",", + "?", + "!", +] + + +def processPunctuation(inText): + outText = inText + for p in punct: + if (p + " " in inText or " " + p in inText) or ( + re.search(commaStrip, inText) != None + ): + outText = outText.replace(p, "") + else: + outText = outText.replace(p, " ") + outText = periodStrip.sub("", outText, re.UNICODE) + return outText + + +def processDigitArticle(inText): + outText = [] + tempText = inText.lower().split() + for word in tempText: + word = manualMap.setdefault(word, word) + if word not in articles: + outText.append(word) + else: + pass + for wordId, word in enumerate(outText): + if word in contractions: + outText[wordId] = contractions[word] + outText = " ".join(outText) + return outText + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class VQAAccuracy(evaluate.Metric): + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence( + datasets.Value("string", id="sequence"), id="references" + ), + "answer_types": datasets.Value("string", id="sequence"), + "question_types": datasets.Value("string", id="sequence"), + } + ), + reference_urls=[ + "https://visualqa.org/evaluation.html", + "https://github.com/GT-Vision-Lab/VQA/blob/master", + ], + ) + + def _compute(self, predictions, references, answer_types=None, question_types=None): + if answer_types is None: + answer_types = [None] * len(predictions) + + if question_types is None: + question_types = [None] * len(predictions) + + if not len(predictions) == len(answer_types) == len(question_types): + raise ValueError( + "The length of predictions, answer_types and question_types doesn't match." + ) + + total, ans_type_dict, ques_type_dict = [], {}, {} + + for pred, gts, ans_type, ques_type in zip( + predictions, references, answer_types, question_types + ): + # to align with offical data postprocess + pred = pred.replace("\n", " ").replace("\t", " ").strip() + pred = processDigitArticle(processPunctuation(pred)) + gts = [processDigitArticle(processPunctuation(gt_ans)) for gt_ans in gts] + + # calculate vqa accuracy + accuracy = [] + for i in range(len(gts)): + other_gt = gts[:i] + gts[i + 1 :] + matching_ans = [item for item in other_gt if item == pred] + accuracy.append(min(1, len(matching_ans) / 3)) + + vqa_acc = sum(accuracy) / len(accuracy) + total.append(vqa_acc) + + if ans_type is not None: + if ans_type not in ans_type_dict: + ans_type_dict[ans_type] = [] + ans_type_dict[ans_type].append(vqa_acc) + + if ques_type is not None: + if ques_type not in ques_type_dict: + ques_type_dict[ques_type] = [] + ques_type_dict[ques_type].append(vqa_acc) + + # the following key names follow the naming of the official evaluation results + result = {"overall": 100 * sum(total) / len(total)} + + if len(ans_type_dict) > 0: + result["perAnswerType"] = { + ans_type: 100 * sum(accuracy_list) / len(accuracy_list) + for ans_type, accuracy_list in ans_type_dict.items() + } + + if len(ques_type_dict) > 0: + result["perQuestionType"] = { + ques_type: 100 * sum(accuracy_list) / len(accuracy_list) + for ques_type, accuracy_list in ques_type_dict.items() + } + + return result