From 58f75ada11642af9a9ac0edc1de2e4a55d4cf4e0 Mon Sep 17 00:00:00 2001 From: mkhi238 <100494918+mkhi238@users.noreply.github.com> Date: Thu, 9 Oct 2025 01:01:33 -0400 Subject: [PATCH 1/2] Add FEVER metric for fact verification --- metrics/fever/README.md | 183 ++++++++++++++++++++++++++++++++++++ metrics/fever/app.py | 5 + metrics/fever/fever.py | 140 +++++++++++++++++++++++++++ metrics/fever/test_fever.py | 133 ++++++++++++++++++++++++++ 4 files changed, 461 insertions(+) create mode 100644 metrics/fever/README.md create mode 100644 metrics/fever/app.py create mode 100644 metrics/fever/fever.py create mode 100644 metrics/fever/test_fever.py diff --git a/metrics/fever/README.md b/metrics/fever/README.md new file mode 100644 index 00000000..d852e992 --- /dev/null +++ b/metrics/fever/README.md @@ -0,0 +1,183 @@ +--- +title: FEVER +emoji: 🔥 +colorFrom: orange +colorTo: red +sdk: gradio +sdk_version: 3.19.1 +app_file: app.py +pinned: false +tags: + - evaluate + - metric +description: >- + The FEVER (Fact Extraction and VERification) metric evaluates the performance of systems that verify factual claims against evidence retrieved from Wikipedia. + + It consists of three main components: Label accuracy (measures how often the predicted claim label matches the gold label), FEVER score (considers a prediction correct only if the label is correct and at least one complete gold evidence set is retrieved), and Evidence F1 (computes the micro-averaged precision, recall, and F1 between predicted and gold evidence sentences). + + The FEVER score is the official leaderboard metric used in the FEVER shared tasks. All metrics range from 0 to 1, with higher values indicating better performance. +--- + +# Metric Card for FEVER + +## Metric description + +The FEVER (Fact Extraction and VERification) metric evaluates the performance of systems that verify factual claims against evidence retrieved from Wikipedia. It was introduced in the FEVER shared task and has become a standard benchmark for fact verification systems. + +FEVER consists of three main evaluation components: + +1. **Label accuracy**: measures how often the predicted claim label (SUPPORTED, REFUTED, or NOT ENOUGH INFO) matches the gold label +2. **FEVER score**: considers a prediction correct only if the label is correct _and_ at least one complete gold evidence set is retrieved +3. **Evidence F1**: computes the micro-averaged precision, recall, and F1 between predicted and gold evidence sentences + +## How to use + +The metric takes two inputs: predictions (a list of dictionaries containing predicted labels and evidence) and references (a list of dictionaries containing gold labels and evidence sets). + +```python +from evaluate import load +fever = load("fever") +predictions = [{"label": "SUPPORTED", "evidence": ["E1", "E2"]}] +references = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}] +results = fever.compute(predictions=predictions, references=references) +``` + +## Output values + +This metric outputs a dictionary containing five float values: + +```python +print(results) +{ + 'label_accuracy': 1.0, + 'fever_score': 1.0, + 'evidence_precision': 1.0, + 'evidence_recall': 1.0, + 'evidence_f1': 1.0 +} +``` + +- **label_accuracy**: Proportion of claims with correctly predicted labels (0-1, higher is better) +- **fever_score**: Proportion of claims where both the label and at least one full gold evidence set are correct (0-1, higher is better). 
This is the **official FEVER leaderboard metric**.
+- **evidence_precision**: Micro-averaged precision of evidence retrieval (0-1, higher is better)
+- **evidence_recall**: Micro-averaged recall of evidence retrieval (0-1, higher is better)
+- **evidence_f1**: Micro-averaged F1 of evidence retrieval (0-1, higher is better)
+
+All values range from 0 to 1, with **1.0 representing perfect performance**.
+
+### Values from popular papers
+
+The FEVER shared task has established performance benchmarks on the FEVER dataset:
+
+- Human performance: FEVER score of ~0.92
+- Top systems (2018-2019): FEVER scores ranging from 0.64 to 0.70
+- State-of-the-art models (2020+): FEVER scores above 0.75
+
+Performance varies significantly based on:
+
+- Model architecture (retrieval + verification pipeline vs. end-to-end)
+- Pre-training (BERT, RoBERTa, etc.)
+- Evidence retrieval quality
+
+## Examples
+
+Perfect prediction (label and evidence both correct):
+
+```python
+from evaluate import load
+fever = load("fever")
+predictions = [{"label": "SUPPORTED", "evidence": ["E1", "E2"]}]
+references = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}]
+results = fever.compute(predictions=predictions, references=references)
+print(results)
+{
+    'label_accuracy': 1.0,
+    'fever_score': 1.0,
+    'evidence_precision': 1.0,
+    'evidence_recall': 1.0,
+    'evidence_f1': 1.0
+}
+```
+
+Correct label but incomplete evidence:
+
+```python
+from evaluate import load
+fever = load("fever")
+predictions = [{"label": "SUPPORTED", "evidence": ["E1"]}]
+references = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}]
+results = fever.compute(predictions=predictions, references=references)
+print(results)
+{
+    'label_accuracy': 1.0,
+    'fever_score': 0.0,
+    'evidence_precision': 1.0,
+    'evidence_recall': 0.5,
+    'evidence_f1': 0.6666666666666666
+}
+```
+
+Incorrect label (FEVER score is 0):
+
+```python
+from evaluate import load
+fever = load("fever")
+predictions = [{"label": "REFUTED", "evidence": ["E1", "E2"]}]
+references = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}]
+results = fever.compute(predictions=predictions, references=references)
+print(results)
+{
+    'label_accuracy': 0.0,
+    'fever_score': 0.0,
+    'evidence_precision': 1.0,
+    'evidence_recall': 1.0,
+    'evidence_f1': 1.0
+}
+```
+
+Multiple valid evidence sets:
+
+```python
+from evaluate import load
+fever = load("fever")
+predictions = [{"label": "SUPPORTED", "evidence": ["E3", "E4"]}]
+references = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"], ["E3", "E4"]]}]
+results = fever.compute(predictions=predictions, references=references)
+print(results)
+{
+    'label_accuracy': 1.0,
+    'fever_score': 1.0,
+    'evidence_precision': 1.0,
+    'evidence_recall': 0.5,
+    'evidence_f1': 0.6666666666666666
+}
+```
+
+Note that evidence precision and recall are computed against the union of all gold evidence sets, so retrieving one complete set can still leave recall below 1.0 even though the FEVER score is 1.0.
+
+## Limitations and bias
+
+The FEVER metric has several important considerations:
+
+1. **Evidence set completeness**: The FEVER score requires retrieving _all_ sentences in at least one gold evidence set. Partial evidence retrieval (even if sufficient for verification) results in a score of 0.
+2. **Multiple valid evidence sets**: Some claims can be verified using different sets of evidence. The metric gives credit if any one complete set is retrieved.
+3. **Micro-averaging**: Evidence precision, recall, and F1 are micro-averaged across all examples, which means performance on longer evidence sets has more influence on the final metrics.
+4. 
**Label dependency**: The FEVER score requires both correct labeling _and_ correct evidence retrieval, making it a strict metric that penalizes systems for either type of error. +5. **Wikipedia-specific**: The metric was designed for Wikipedia-based fact verification and may not generalize directly to other knowledge sources or domains. + +## Citation + +```bibtex +@inproceedings{thorne2018fever, + title={FEVER: a Large-scale Dataset for Fact Extraction and VERification}, + author={Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit}, + booktitle={Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)}, + pages={809--819}, + year={2018} +} +``` + +## Further References + +- [FEVER Dataset Website](https://fever.ai/dataset/) +- [FEVER Paper on arXiv](https://arxiv.org/abs/1803.05355) +- [Hugging Face Tasks -- Fact Checking](https://huggingface.co/tasks/text-classification) +- [FEVER Shared Task Overview](https://fever.ai/task.html) diff --git a/metrics/fever/app.py b/metrics/fever/app.py new file mode 100644 index 00000000..c9968774 --- /dev/null +++ b/metrics/fever/app.py @@ -0,0 +1,5 @@ +import evaluate +from evaluate.utils import launch_gradio_widget + +module = evaluate.load("fever") +launch_gradio_widget(module) \ No newline at end of file diff --git a/metrics/fever/fever.py b/metrics/fever/fever.py new file mode 100644 index 00000000..393ab5a3 --- /dev/null +++ b/metrics/fever/fever.py @@ -0,0 +1,140 @@ +# Copyright 2021 The HuggingFace Evaluate Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" FEVER (Fact Extraction and VERification) metric. """ + +import datasets +import evaluate + +_CITATION = """\ +@inproceedings{thorne2018fever, + title={FEVER: Fact Extraction and VERification}, + author={Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit}, + booktitle={Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + pages={809--819}, + year={2018} +} +""" +_DESCRIPTION = """\ +The FEVER (Fact Extraction and VERification) metric evaluates the performance of systems that verify factual claims against evidence retrieved from Wikipedia. + +It consists of three main components: +- **Label accuracy**: measures how often the predicted claim label (SUPPORTED, REFUTED, or NOT ENOUGH INFO) matches the gold label. +- **FEVER score**: considers a prediction correct only if the label is correct *and* at least one complete gold evidence set is retrieved. +- **Evidence F1**: computes the micro-averaged precision, recall, and F1 between predicted and gold evidence sentences. + +The FEVER score is the official leaderboard metric used in the FEVER shared tasks. +""" +_KWARGS_DESCRIPTION = """ +Computes the FEVER evaluation metrics. 
+
+Args:
+    predictions (list of dict): Each prediction should be a dictionary with:
+        - "label" (str): the predicted claim label.
+        - "evidence" (list of str): the predicted evidence sentences.
+    references (list of dict): Each reference should be a dictionary with:
+        - "label" (str): the gold claim label.
+        - "evidence_sets" (list of list of str): all possible gold evidence sets.
+
+Returns:
+    A dictionary containing:
+    - 'label_accuracy': proportion of claims with correctly predicted labels.
+    - 'fever_score': proportion of claims where both the label and at least one full gold evidence set are correct.
+    - 'evidence_precision': micro-averaged precision of evidence retrieval.
+    - 'evidence_recall': micro-averaged recall of evidence retrieval.
+    - 'evidence_f1': micro-averaged F1 of evidence retrieval.
+
+Example:
+    >>> predictions = [{"label": "SUPPORTED", "evidence": ["E1", "E2", "E3", "E4"]}]
+    >>> references = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"], ["E3", "E4"]]}]
+    >>> fever = evaluate.load("fever")
+    >>> results = fever.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'label_accuracy': 1.0, 'fever_score': 1.0, 'evidence_precision': 1.0, 'evidence_recall': 1.0, 'evidence_f1': 1.0}
+"""
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class FEVER(evaluate.Metric):
+  def _info(self):
+    return evaluate.MetricInfo(
+      description=_DESCRIPTION,
+      citation=_CITATION,
+      inputs_description=_KWARGS_DESCRIPTION,
+      features=datasets.Features(
+        {
+          "predictions":
+            {"label": datasets.Value("string"),
+            "evidence": datasets.Sequence(datasets.Value("string"))}
+          ,
+          "references":
+            {"label" : datasets.Value("string"),
+            "evidence_sets": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
+          },
+        }
+      ),
+      reference_urls=[
+        "https://fever.ai/dataset/",
+        "https://arxiv.org/abs/1803.05355",
+      ],
+    )
+
+  def _compute(self, predictions, references):
+    """
+    Computes FEVER metrics:
+    - Label accuracy
+    - FEVER score (label + complete evidence set)
+    - Evidence precision, recall, and F1 (micro-averaged)
+    """
+    total = len(predictions)
+    label_correct, fever_correct = 0, 0
+    total_overlap, total_pred, total_gold = 0, 0, 0
+
+    for pred, ref in zip(predictions, references):
+      pred_label = pred["label"]
+      pred_evidence = set(e.strip().lower() for e in pred["evidence"])
+      gold_label = ref["label"]
+      gold_sets = []
+      for s in ref["evidence_sets"]:
+        gold_sets.append([e.strip().lower() for e in s])
+
+      if pred_label == gold_label:
+        label_correct += 1
+        for g_set in gold_sets:
+          if set(g_set).issubset(pred_evidence):
+            fever_correct += 1
+            break
+
+      gold_evidence = set().union(*gold_sets) if gold_sets else set()
+      overlap = len(gold_evidence.intersection(pred_evidence))
+      total_overlap += overlap
+      total_pred += len(pred_evidence)
+      total_gold += len(gold_evidence)
+
+    precision = (total_overlap / total_pred) if total_pred else 0
+    recall = (total_overlap / total_gold) if total_gold else 0
+    evidence_f1 = (
+      2 * precision * recall / (precision + recall)
+      if (precision + recall) > 0 else 0)
+
+    fever_score = fever_correct / total if total else 0
+    label_accuracy = label_correct / total if total else 0
+
+
+    return {
+      "label_accuracy": label_accuracy,
+      "fever_score": fever_score,
+      "evidence_precision": precision,
+      "evidence_recall": recall,
+      "evidence_f1": evidence_f1,
+    }
\ No newline at end of file
diff --git a/metrics/fever/test_fever.py b/metrics/fever/test_fever.py
new file mode 100644
index 00000000..74e872a2
--- 
/dev/null +++ b/metrics/fever/test_fever.py @@ -0,0 +1,133 @@ +# Copyright 2025 The HuggingFace Evaluate Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Tests for the FEVER (Fact Extraction and VERification) metric. """ + +import unittest +from fever import FEVER # assuming your metric file is named fever.py + +fever = FEVER() + + +class TestFEVER(unittest.TestCase): + + def test_perfect_prediction(self): + preds = [{"label": "SUPPORTED", "evidence": ["E1", "E2"]}] + refs = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}] + result = fever.compute(predictions=preds, references=refs) + self.assertAlmostEqual(result["label_accuracy"], 1.0) + self.assertAlmostEqual(result["fever_score"], 1.0) + self.assertAlmostEqual(result["evidence_precision"], 1.0) + self.assertAlmostEqual(result["evidence_recall"], 1.0) + self.assertAlmostEqual(result["evidence_f1"], 1.0) + + def test_label_only_correct(self): + preds = [{"label": "SUPPORTED", "evidence": ["X1", "X2"]}] + refs = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}] + result = fever.compute(predictions=preds, references=refs) + self.assertAlmostEqual(result["label_accuracy"], 1.0) + self.assertAlmostEqual(result["fever_score"], 0.0) + self.assertTrue(result["evidence_f1"] < 1.0) + + def test_label_incorrect(self): + preds = [{"label": "REFUTED", "evidence": ["E1", "E2"]}] + refs = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}] + result = fever.compute(predictions=preds, references=refs) + self.assertAlmostEqual(result["label_accuracy"], 0.0) + self.assertAlmostEqual(result["fever_score"], 0.0) + + def test_partial_evidence_overlap(self): + preds = [{"label": "SUPPORTED", "evidence": ["E1"]}] + refs = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}] + result = fever.compute(predictions=preds, references=refs) + self.assertAlmostEqual(result["label_accuracy"], 1.0) + self.assertAlmostEqual(result["fever_score"], 0.0) + self.assertAlmostEqual(result["evidence_precision"], 1.0) + self.assertAlmostEqual(result["evidence_recall"], 0.5) + self.assertTrue(0 < result["evidence_f1"] < 1.0) + + def test_extra_evidence_still_correct(self): + preds = [{"label": "SUPPORTED", "evidence": ["E1", "E2", "X1"]}] + refs = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}] + result = fever.compute(predictions=preds, references=refs) + self.assertAlmostEqual(result["fever_score"], 1.0) + self.assertTrue(result["evidence_precision"] < 1.0) + self.assertAlmostEqual(result["evidence_recall"], 1.0) + + def test_multiple_gold_sets(self): + preds = [{"label": "SUPPORTED", "evidence": ["E3", "E4"]}] + refs = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"], ["E3", "E4"]]}] + result = fever.compute(predictions=preds, references=refs) + self.assertAlmostEqual(result["fever_score"], 1.0) + self.assertAlmostEqual(result["label_accuracy"], 1.0) + + def test_mixed_examples(self): + preds = [ + {"label": "SUPPORTED", "evidence": ["A1", "A2"]}, + {"label": "SUPPORTED", "evidence": ["B1"]}, + {"label": "REFUTED", 
"evidence": ["C1", "C2"]}, + ] + refs = [ + {"label": "SUPPORTED", "evidence_sets": [["A1", "A2"]]}, + {"label": "SUPPORTED", "evidence_sets": [["B1", "B2"]]}, + {"label": "SUPPORTED", "evidence_sets": [["C1", "C2"]]}, + ] + result = fever.compute(predictions=preds, references=refs) + self.assertTrue(0 < result["label_accuracy"] < 1.0) + self.assertTrue(0 <= result["fever_score"] < 1.0) + self.assertTrue(0 <= result["evidence_f1"] <= 1.0) + + def test_empty_evidence_prediction(self): + preds = [{"label": "SUPPORTED", "evidence": []}] + refs = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}] + result = fever.compute(predictions=preds, references=refs) + self.assertEqual(result["evidence_precision"], 0.0) + self.assertEqual(result["evidence_recall"], 0.0) + self.assertEqual(result["evidence_f1"], 0.0) + + def test_empty_gold_evidence(self): + preds = [{"label": "SUPPORTED", "evidence": ["E1", "E2"]}] + refs = [{"label": "SUPPORTED", "evidence_sets": [[]]}] + result = fever.compute(predictions=preds, references=refs) + self.assertEqual(result["evidence_recall"], 0.0) + + def test_multiple_examples_micro_averaging(self): + preds = [ + {"label": "SUPPORTED", "evidence": ["E1"]}, + {"label": "SUPPORTED", "evidence": ["F1", "F2"]}, + ] + refs = [ + {"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}, + {"label": "SUPPORTED", "evidence_sets": [["F1", "F2"]]}, + ] + result = fever.compute(predictions=preds, references=refs) + self.assertTrue(result["evidence_f1"] < 1.0) + self.assertAlmostEqual(result["label_accuracy"], 1.0) + + def test_fever_score_requires_label_match(self): + preds = [{"label": "REFUTED", "evidence": ["E1", "E2"]}] + refs = [{"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]}] + result = fever.compute(predictions=preds, references=refs) + self.assertEqual(result["fever_score"], 0.0) + self.assertEqual(result["label_accuracy"], 0.0) + + def test_empty_input_list(self): + preds, refs = [], [] + result = fever.compute(predictions=preds, references=refs) + for k in result: + self.assertEqual(result[k], 0.0) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From dfc148002afcb34311aa8f13551236489e77993d Mon Sep 17 00:00:00 2001 From: mkhi238 <100494918+mkhi238@users.noreply.github.com> Date: Thu, 9 Oct 2025 01:47:43 -0400 Subject: [PATCH 2/2] Apply code formatting with black and isort --- metrics/fever/app.py | 3 +- metrics/fever/fever.py | 136 +++++++++++++++++++----------------- metrics/fever/test_fever.py | 14 ++-- 3 files changed, 81 insertions(+), 72 deletions(-) diff --git a/metrics/fever/app.py b/metrics/fever/app.py index c9968774..6d01a084 100644 --- a/metrics/fever/app.py +++ b/metrics/fever/app.py @@ -1,5 +1,6 @@ import evaluate from evaluate.utils import launch_gradio_widget + module = evaluate.load("fever") -launch_gradio_widget(module) \ No newline at end of file +launch_gradio_widget(module) diff --git a/metrics/fever/fever.py b/metrics/fever/fever.py index 393ab5a3..a0efa821 100644 --- a/metrics/fever/fever.py +++ b/metrics/fever/fever.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" FEVER (Fact Extraction and VERification) metric. 
""" +"""FEVER (Fact Extraction and VERification) metric.""" import datasets + import evaluate + _CITATION = """\ @inproceedings{thorne2018fever, title={FEVER: Fact Extraction and VERification}, @@ -64,23 +66,26 @@ {'label_accuracy': 1.0, 'fever_score': 1.0, 'evidence_precision': 1.0, 'evidence_recall': 1.0, 'evidence_f1': 1.0} """ + @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) class FEVER(evaluate.Metric): - def _info(self): - return evaluate.MetricInfo( - description=_DESCRIPTION, - citation=_CITATION, - inputs_description=_KWARGS_DESCRIPTION, - features=datasets.Features( + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( { - "predictions": - {"label": datasets.Value("string"), - "evidence": datasets.Sequence(datasets.Value("string"))} - , - "references": - {"label" : datasets.Value("string"), - "evidence_sets": datasets.Sequence(datasets.Sequence(datasets.Value("string"))) - }, + "predictions": { + "label": datasets.Value("string"), + "evidence": datasets.Sequence(datasets.Value("string")), + }, + "references": { + "label": datasets.Value("string"), + "evidence_sets": datasets.Sequence( + datasets.Sequence(datasets.Value("string")) + ), + }, } ), reference_urls=[ @@ -88,53 +93,54 @@ def _info(self): "https://arxiv.org/abs/1803.05355", ], ) - - def _compute(self, predictions, references): - """ - Computes FEVER metrics: - - Label accuracy - - FEVER score (label + complete evidence set) - - Evidence precision, recall, and F1 (micro-averaged) - """ - total = len(predictions) - label_correct, fever_correct = 0, 0 - total_overlap, total_pred, total_gold = 0, 0, 0 - - for pred, ref in zip(predictions, references): - pred_label = pred["label"] - pred_evidence = set(e.strip().lower() for e in pred["evidence"]) - gold_label = ref["label"] - gold_sets = [] - for s in ref["evidence_sets"]: - gold_sets.append([e.strip().lower() for e in s]) - - if pred_label == gold_label: - label_correct += 1 - for g_set in gold_sets: - if set(g_set).issubset(pred_evidence): - fever_correct += 1 - break - - gold_evidence = set().union(*gold_sets) if gold_sets else set() - overlap = len(gold_evidence.intersection(pred_evidence)) - total_overlap += overlap - total_pred += len(pred_evidence) - total_gold += len(gold_evidence) - - precision = (total_overlap / total_pred) if total_pred else 0 - recall = (total_overlap / total_gold) if total_gold else 0 - evidence_f1 = ( - 2 * precision * recall / (precision + recall) - if (precision + recall) > 0 else 0) - - fever_score = fever_correct / total if total else 0 - label_accuracy = label_correct / total if total else 0 - - - return { - "label_accuracy": label_accuracy, - "fever_score": fever_score, - "evidence_precision": precision, - "evidence_recall": recall, - "evidence_f1": evidence_f1, - } \ No newline at end of file + + def _compute(self, predictions, references): + """ + Computes FEVER metrics: + - Label accuracy + - FEVER score (label + complete evidence set) + - Evidence precision, recall, and F1 (micro-averaged) + """ + total = len(predictions) + label_correct, fever_correct = 0, 0 + total_overlap, total_pred, total_gold = 0, 0, 0 + + for pred, ref in zip(predictions, references): + pred_label = pred["label"] + pred_evidence = set(e.strip().lower() for e in pred["evidence"]) + gold_label = ref["label"] + gold_sets = [] + for s in ref["evidence_sets"]: + gold_sets.append([e.strip().lower() for e in s]) + + if 
pred_label == gold_label: + label_correct += 1 + for g_set in gold_sets: + if set(g_set).issubset(pred_evidence): + fever_correct += 1 + break + + gold_evidence = set().union(*gold_sets) if gold_sets else set() + overlap = len(gold_evidence.intersection(pred_evidence)) + total_overlap += overlap + total_pred += len(pred_evidence) + total_gold += len(gold_evidence) + + precision = (total_overlap / total_pred) if total_pred else 0 + recall = (total_overlap / total_gold) if total_gold else 0 + evidence_f1 = ( + 2 * precision * recall / (precision + recall) + if (precision + recall) > 0 + else 0 + ) + + fever_score = fever_correct / total if total else 0 + label_accuracy = label_correct / total if total else 0 + + return { + "label_accuracy": label_accuracy, + "fever_score": fever_score, + "evidence_precision": precision, + "evidence_recall": recall, + "evidence_f1": evidence_f1, + } diff --git a/metrics/fever/test_fever.py b/metrics/fever/test_fever.py index 74e872a2..efebf3e7 100644 --- a/metrics/fever/test_fever.py +++ b/metrics/fever/test_fever.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" Tests for the FEVER (Fact Extraction and VERification) metric. """ +"""Tests for the FEVER (Fact Extraction and VERification) metric.""" import unittest + from fever import FEVER # assuming your metric file is named fever.py + fever = FEVER() @@ -53,7 +55,7 @@ def test_partial_evidence_overlap(self): result = fever.compute(predictions=preds, references=refs) self.assertAlmostEqual(result["label_accuracy"], 1.0) self.assertAlmostEqual(result["fever_score"], 0.0) - self.assertAlmostEqual(result["evidence_precision"], 1.0) + self.assertAlmostEqual(result["evidence_precision"], 1.0) self.assertAlmostEqual(result["evidence_recall"], 0.5) self.assertTrue(0 < result["evidence_f1"] < 1.0) @@ -74,9 +76,9 @@ def test_multiple_gold_sets(self): def test_mixed_examples(self): preds = [ - {"label": "SUPPORTED", "evidence": ["A1", "A2"]}, - {"label": "SUPPORTED", "evidence": ["B1"]}, - {"label": "REFUTED", "evidence": ["C1", "C2"]}, + {"label": "SUPPORTED", "evidence": ["A1", "A2"]}, + {"label": "SUPPORTED", "evidence": ["B1"]}, + {"label": "REFUTED", "evidence": ["C1", "C2"]}, ] refs = [ {"label": "SUPPORTED", "evidence_sets": [["A1", "A2"]]}, @@ -130,4 +132,4 @@ def test_empty_input_list(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main()
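
A minimal local sanity-check sketch for the new metric (not part of the patch): it mirrors the import pattern used in `test_fever.py` and assumes it is run from `metrics/fever/` after the patch is applied. The expected numbers in the comments simply follow the `_compute` logic added above.

```python
# Illustrative sketch: exercise the FEVER metric directly, as test_fever.py does.
# Assumes the working directory is metrics/fever/ so that fever.py is importable.
from fever import FEVER

fever_metric = FEVER()

predictions = [
    {"label": "SUPPORTED", "evidence": ["E1", "E2"]},  # correct label, complete gold set
    {"label": "SUPPORTED", "evidence": ["F1"]},        # correct label, incomplete evidence
    {"label": "REFUTED", "evidence": ["G1", "G2"]},    # wrong label, complete evidence
]
references = [
    {"label": "SUPPORTED", "evidence_sets": [["E1", "E2"]]},
    {"label": "SUPPORTED", "evidence_sets": [["F1", "F2"]]},
    {"label": "SUPPORTED", "evidence_sets": [["G1", "G2"]]},
]

results = fever_metric.compute(predictions=predictions, references=references)
print(results)
# Expected from the implementation above: label_accuracy = 2/3, fever_score = 1/3
# (only the first claim has both a correct label and a full gold evidence set),
# evidence_precision = 5/5 = 1.0, evidence_recall = 5/6, evidence_f1 about 0.909.
```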