Changes from all commits (118 commits)
e4b6e84
Added medcalc bench scenario
MiguelAFH Nov 23, 2024
c8219a9
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Nov 23, 2024
c2ca7e5
Rollback removal of medical scenarios
MiguelAFH Nov 23, 2024
a2aa50f
UNTESTED implementation of Medalign with new setup, pushing to test o…
aunell Nov 26, 2024
0679d3d
updated medalign, functional
aunell Nov 27, 2024
f3b53e1
add max tokens for medalign run spec
aunell Nov 27, 2024
a087633
Added llama 3.1 instruct and medalign to schema_medical
MiguelAFH Nov 30, 2024
501fe33
Added display name for Llama 3.2 1B Instruct
MiguelAFH Dec 1, 2024
7bf79a2
Update summarization metrics and MedAlign spec to bring bertscore onl…
aunell Dec 3, 2024
e9c8e47
Added MedDialog to MEDHELM
MiguelAFH Dec 7, 2024
98d7d0b
feat: implement medcalc bench scenario, metrics and specs
Dec 11, 2024
bb15f35
feat: med calc bench one shot spec
Dec 12, 2024
aa41939
Added MIMIC-RRS scenario
MiguelAFH Dec 14, 2024
676d4a5
Reduced max tokens for MIMIC-RRS
MiguelAFH Dec 15, 2024
ed2429c
Fix device for medical scenarios
MiguelAFH Dec 15, 2024
3e5bd96
Added groups for each medical task category
MiguelAFH Dec 15, 2024
78b2a91
dischargeMe scenario + schema update
aunell Dec 16, 2024
792fb4f
fix: dataset loading and standardize naming
Dec 17, 2024
3700016
Added medi_qa scenario
MiguelAFH Dec 19, 2024
acb902b
feat: add mimic billing codes
suhana13 Dec 22, 2024
5c3eea0
Added MIMICIV Billing Code scenario
MiguelAFH Jan 6, 2025
2f96439
add mtsamples benchmark
aunell Jan 6, 2025
b48d219
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Jan 6, 2025
513987c
feat: resolve merge commits
suhana13 Jan 9, 2025
73ed7da
Modified medication_qa metrics
MiguelAFH Jan 9, 2025
7bf8b11
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Jan 9, 2025
c52313b
initial ehrshot commit
Miking98 Jan 10, 2025
ad8c71e
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
Miking98 Jan 10, 2025
53c9e56
ehrshot
Miking98 Jan 10, 2025
57241ec
token stats for ehrshot
Miking98 Jan 14, 2025
7715c16
Race based medicine detection benchmark for Ensuring Clinical Researc…
aunell Jan 14, 2025
8921886
updated multiple choice adapter
aunell Jan 14, 2025
04fd2bb
fix: mimiciv duplicate instantiation
suhana13 Jan 15, 2025
5d1a67e
Add medbullets, headqa, aci_bench, medec scenarios
haoqiu1 Jan 15, 2025
d74c2c7
ehrshot update
Miking98 Jan 17, 2025
fbbb4ba
n2c2 2018 ct matching benchmark
Miking98 Jan 17, 2025
ca18be5
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
Miking98 Jan 17, 2025
c94612a
Added Claude and Google clients
MiguelAFH Jan 18, 2025
bf91f7d
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Jan 18, 2025
0dd1dad
Merged main
MiguelAFH Jan 18, 2025
77e6571
Updated models
MiguelAFH Jan 19, 2025
ca8c130
update EHRShot to run within HELM environment
aunell Jan 19, 2025
30bf966
update n2c2 to run within HELM environment
HennyJie Jan 20, 2025
435d563
Fixed prompts for ehrshot and n2c2
MiguelAFH Jan 22, 2025
ccc0a01
update bert score metric to match original HELM implementation, still…
aunell Jan 23, 2025
5e9bca8
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Jan 23, 2025
972a2de
Added max tokens for EHRSHOT
MiguelAFH Jan 24, 2025
0a0c465
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Jan 24, 2025
43c1057
Updated StanfordHealthCareGoogleClient
MiguelAFH Jan 24, 2025
cd7d60a
Modified instruction for MedCalcBench
MiguelAFH Jan 24, 2025
9ebca13
remove RAG from MedAlign implementation, instead filter by 128k context
aunell Jan 24, 2025
0b314ed
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Jan 24, 2025
e876c41
update prompt for medalign
aunell Jan 24, 2025
3afd9f6
add ehr sql scenario
haoqiu1 Jan 24, 2025
d0eadf7
fix: update medical yaml and aci bench metric
haoqiu1 Jan 24, 2025
17bb687
fix: update medical yaml head qa and medbullets
haoqiu1 Jan 24, 2025
aea6aa7
Added phi-3.5-mini-instruct model
MiguelAFH Jan 29, 2025
7600572
fix medbullets headqa and aci_bench secnario, configs
haoqiu1 Jan 29, 2025
71da4ab
fix ehr sql metric in extracting is_impossible
haoqiu1 Jan 29, 2025
784facd
change medbullets to download data from raw csv
haoqiu1 Jan 29, 2025
bef3051
fix medec scenario by adding correct tag to note without medical error
haoqiu1 Jan 29, 2025
53ea958
add vqa-rad scenario
haoqiu1 Jan 30, 2025
4e75fda
add mtsamples procedure subset
haoqiu1 Jan 30, 2025
d85c599
Changing promt length check to tokens
MiguelAFH Jan 31, 2025
44bd695
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Jan 31, 2025
7ed6aff
fix: mimiciv eval metrics
suhana13 Feb 3, 2025
547c333
feat: mimic billing eval fixes
suhana13 Feb 3, 2025
aa967d7
fix: mimic billing code metrics
suhana13 Feb 3, 2025
af75582
add mtsamples scenario loading preprocessed data instead of web scra…
haoqiu1 Feb 3, 2025
38be6b6
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
haoqiu1 Feb 3, 2025
dc5ba2d
Fixed errors
MiguelAFH Feb 3, 2025
ccab5e9
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Feb 3, 2025
d2b4299
update medec metric and set max token
haoqiu1 Feb 4, 2025
db6215f
Fixed prompts
MiguelAFH Feb 4, 2025
2703207
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Feb 4, 2025
dcca97a
Merged main
MiguelAFH Feb 4, 2025
6da3e4f
feat: convert all med scenarios to zero shot
suhana13 Feb 4, 2025
3f2a152
adjust max tokens for aci_bench and ehrshot
aunell Feb 4, 2025
5f3e375
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Feb 4, 2025
a06cd02
change aci bench to summac and medec run specs prompt
haoqiu1 Feb 4, 2025
4ad64ea
update mtsamples general to match mtsamples procedures methodology
aunell Feb 4, 2025
a9db121
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Feb 4, 2025
7656b30
remove webscrapping mtsamples
haoqiu1 Feb 4, 2025
a6d82b0
merge resolution
aunell Feb 4, 2025
761e906
change headqa max token to 1
haoqiu1 Feb 5, 2025
84b9c33
Fixed problem of empty instances
MiguelAFH Feb 5, 2025
dde6be9
add: mental health scenario
HennyJie Feb 5, 2025
8552c75
changed max token output for head_qa and medbullets EM multi-choice, …
aunell Feb 5, 2025
9517c3a
update: mental health schema
HennyJie Feb 5, 2025
4cb61f7
Merge branch 'med-helm' of github.com:stanford-crfm/helm into med-helm
HennyJie Feb 5, 2025
df6c9d6
Merged main
MiguelAFH Feb 6, 2025
7e1d10a
Merge branch 'main' into med-helm
MiguelAFH Feb 7, 2025
13ebc83
Set max train instances to 0 for missing MedHELM scenarios
MiguelAFH Feb 8, 2025
cbf74e9
chw_care_plan scenario
aunell Feb 8, 2025
a262a49
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Feb 8, 2025
090225e
fix EHRShot prompt
aunell Feb 8, 2025
6927b84
feat: add phi3.5 quantized
suhana13 Feb 8, 2025
0ba2420
feat: add starr patient instructions dataset
suhana13 Feb 8, 2025
2b1ddb6
Added support for max length for Medalign scenario
MiguelAFH Feb 9, 2025
18a0655
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Feb 9, 2025
6fb183d
Merge branch 'main' into med-helm
MiguelAFH Feb 9, 2025
508b3ce
updated bertscore to fix range issue
aunell Feb 9, 2025
39be2ee
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Feb 9, 2025
fefab01
Merged main
MiguelAFH Feb 9, 2025
fc985ff
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Feb 9, 2025
25cec43
fix: starr patient instructions prompt
suhana13 Feb 9, 2025
b7e7095
Modified EHRSHOT prompt
MiguelAFH Feb 9, 2025
1b65dbf
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Feb 9, 2025
611f2f3
feat: remove medalign yaml
suhana13 Feb 9, 2025
b30b336
Merge remote changes
suhana13 Feb 9, 2025
145fd35
feat: add clear dataset
suhana13 Feb 9, 2025
805f8d4
feat: clear edits
suhana13 Feb 9, 2025
544880c
feat: add mcq to reference
suhana13 Feb 9, 2025
3fef6db
fix: prompt for med_dialog
suhana13 Feb 10, 2025
1232010
fix chw benchmark, catch empty examples
aunell Feb 10, 2025
96fc5b9
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Feb 10, 2025
fe356ac
feat: predefined one shot logic
Feb 10, 2025
4f98c79
Merge branch 'med-helm' into feat/medcalc_bench_scenario
sashimono-san Feb 10, 2025
11 changes: 8 additions & 3 deletions requirements.txt
@@ -14,13 +14,14 @@ anthropic==0.38.0
antlr4-python3-runtime==4.9.3
anyio==4.8.0
astunparse==1.6.3
-async-timeout==5.0.1
+async-timeout==4.0.3
attrs==24.3.0
audioread==3.0.1
autokeras==1.0.20
av==14.0.1
awscli==1.33.44
beautifulsoup4==4.12.3
+bert_score==0.3.13
black==24.3.0
blis==1.1.0
boto3==1.34.162
@@ -131,6 +132,8 @@ keras==3.8.0
keras-tuner==1.4.7
kiwisolver==1.4.7
kt-legacy==1.0.5
+langchain==0.3.9
+langchain-community==0.3.8
langcodes==3.5.0
langdetect==1.0.9
language_data==1.3.0
@@ -230,15 +233,17 @@ pytrec_eval==0.5
pytz==2024.2
PyWavelets==1.6.0
PyYAML==6.0.2
+qwen-vl-utils==0.0.8
+RapidFuzz==3.11.0
+rank_bm25==0.2.2
referencing==0.35.1
regex==2024.11.6
-RapidFuzz==3.11.0
reka-api==2.0.0
requests==2.32.3
retrying==1.3.4
rich==13.9.4
rouge_score==0.1.2
+rsa==4.7.2
-qwen-vl-utils==0.0.8
s3transfer==0.10.4
sacrebleu==2.2.1
safetensors==0.5.1
116 changes: 116 additions & 0 deletions src/helm/benchmark/metrics/ehr_sql_metrics.py
@@ -0,0 +1,116 @@
import re
from typing import Any, Dict, List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat
from helm.common.hierarchical_logger import hlog


class EhrSqlMetric(Metric):
"""
Metric for evaluating the EHR SQL dataset, assessing the model's ability to generate valid SQL queries.

This implementation calculates:
1. Precision for Answerable Questions (Pans): The proportion of correctly predicted answerable questions
among all questions predicted to be answerable.
2. Recall for Answerable Questions (Rans): The proportion of correctly predicted answerable questions
among all answerable questions in the dataset.
"""

    def extract_is_impossible(self, input_text: str) -> bool:
        """Extract the `is_impossible` flag from the input text using a regex."""
        # e.g. '... "is_impossible": true ...' -> True; no match -> False.
        match = re.search(r'"is_impossible":\s*(true|false)', input_text, re.IGNORECASE)
        # Coerce to bool: `match and ...` would return None when the flag is absent.
        return bool(match and match.group(1).lower() == "true")

def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
"""
Evaluate a single generation against the reference labels.
"""

        # Extract predictions
        assert request_state.result is not None, "Expected a result in the request state."
        predictions = [completion.text.strip() for completion in request_state.result.completions]

        if not predictions:
            raise ValueError("No predictions found in the completions.")

        # Use the first prediction as the primary output.
        prediction = predictions[0]

# Extract references and input text
references = getattr(request_state.instance, "references", None)
input_text = request_state.instance.input.text # Read input text

        if not references:
            hlog(f"Warning: Missing references for instance {request_state.instance}")
            return []

# Check if the ground truth is answerable based on is_impossible flag
ground_truth_query = references[0].output.text
is_impossible = self.extract_is_impossible(input_text) # Extract from input

is_answerable = not is_impossible and bool(ground_truth_query)

# Check if the model prediction is answerable
is_predicted_answerable = bool(prediction)

# Determine correctness for answerable questions
correct_answerable = int(is_answerable and is_predicted_answerable)

return [
Stat(MetricName("ehr_sql_precision_answerable")).add(
correct_answerable if is_predicted_answerable else 0
),
Stat(MetricName("ehr_sql_recall_answerable")).add(
correct_answerable if is_answerable else 0
),
Stat(MetricName("ehr_sql_total_predicted_answerable")).add(
int(is_predicted_answerable)
),
Stat(MetricName("ehr_sql_total_ground_truth_answerable")).add(
int(is_answerable)
),
]

    def compute(self, stats: List[Stat], **kwargs: Any) -> Dict[str, float]:
        """
        Aggregate statistics to compute final metrics.
        """

        # Both per-instance answerable stats record the same true-positive indicator
        # (a prediction is correct only when it is both answerable and predicted
        # answerable), so sum only one of them; summing both would double-count
        # the numerator and inflate precision and recall.
        correct_answerable = sum(
            stat.value for stat in stats if stat.name == "ehr_sql_precision_answerable"
        )
        total_predicted_answerable = sum(
            stat.value for stat in stats if stat.name == "ehr_sql_total_predicted_answerable"
        )
        total_ground_truth_answerable = sum(
            stat.value for stat in stats if stat.name == "ehr_sql_total_ground_truth_answerable"
        )

        # Precision (Pans) = TP / predicted answerable; Recall (Rans) = TP / answerable.
        precision = (
            correct_answerable / total_predicted_answerable
            if total_predicted_answerable > 0
            else 0.0
        )
        recall = (
            correct_answerable / total_ground_truth_answerable
            if total_ground_truth_answerable > 0
            else 0.0
        )

        return {
            "ehr_sql_precision_answerable": precision,
            "ehr_sql_recall_answerable": recall,
        }
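
As a sanity check on the aggregation above, here is a minimal standalone sketch (plain Python, no HELM dependencies; the per-instance records are made up for illustration) of how Pans and Rans fall out of the summed counts:

# Hypothetical per-instance records: (is_answerable, is_predicted_answerable).
instances = [(True, True), (True, False), (False, True), (True, True)]

# A prediction is a true positive only when both flags hold.
true_positives = sum(1 for ans, pred in instances if ans and pred)  # 2
total_predicted = sum(1 for _, pred in instances if pred)  # 3
total_answerable = sum(1 for ans, _ in instances if ans)  # 3

precision_answerable = true_positives / total_predicted if total_predicted else 0.0
recall_answerable = true_positives / total_answerable if total_answerable else 0.0
print(f"Pans={precision_answerable:.3f}, Rans={recall_answerable:.3f}")  # Pans=0.667, Rans=0.667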
182 changes: 182 additions & 0 deletions src/helm/benchmark/metrics/medcalc_bench_metrics.py
@@ -0,0 +1,182 @@
import re
from datetime import datetime
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat
from helm.benchmark.scenarios.scenario import CORRECT_TAG
from helm.common.hierarchical_logger import hlog


class MedCalcBenchMetric(Metric):
def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
"""Metric for MedCalc-Bench dataset.

Original implementation:
https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/evaluate.py#L11
"""
assert request_state.instance.extra_data, (
"Could not find `extra_data` in the request state. "
"Both `lower_limit` and `upper_limit` are required for this metric."
)

assert len(request_state.result.completions) == 1, (
f"Found a total of {len(request_state.result.completions)} completions. "
"Only one was expected"
)

final_answer = (
request_state.result.completions[0]
.text.strip()
.lower()
.split("calculated value:")[-1]
.strip()
)
ground_truth_ref = [ref for ref in request_state.instance.references if CORRECT_TAG in ref.tags][0]

correctness = 0
if final_answer:
try:
correctness = self.medcalc_bench_metric_calculation(
answer=final_answer,
ground_truth=ground_truth_ref.output.text,
calid=int(request_state.instance.extra_data["calculator_id"]),
upper_limit=request_state.instance.extra_data["upper_limit"],
lower_limit=request_state.instance.extra_data["lower_limit"],
)
            except ValueError as e:
                hlog(
                    "Failed to calculate the correctness of the output for MedCalc-Bench instance "
                    f"with id {request_state.instance.id}: {e}"
                )

return [Stat(MetricName("medcalc_bench_metric")).add(correctness)]

def medcalc_bench_metric_calculation(
self,
answer: str,
ground_truth: str,
calid: int,
upper_limit: str,
lower_limit: str,
) -> int:
"""Calculate the metric for MedCalc-Bench dataset.

This method is basically a copy of the original implementation of this metric:
https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/evaluate.py#L11

Credits to the original authors: https://github.com/ncbi-nlp/MedCalc-Bench.
"""
        if calid in [13, 68]:
            # Output type: date. Compare after normalizing to a no-leading-zeros
            # format. Note that the "%-m/%-d/%Y" directive is POSIX-only and
            # raises on Windows.
            if datetime.strptime(answer, "%m/%d/%Y").strftime("%-m/%-d/%Y") == datetime.strptime(
                ground_truth, "%m/%d/%Y"
            ).strftime("%-m/%-d/%Y"):
                correctness = 1
            else:
                correctness = 0
        elif calid in [69]:
            # Output type: integer tuple "(weeks, days)".
            match = re.search(
                r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?",
                ground_truth,
            )
            assert match, f"Could not parse ground truth tuple: {ground_truth}"
            ground_truth = f"({match.group(1)}, {match.group(3)})"
            match = re.search(
                r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?",
                answer,
            )
            if match:
                weeks = match.group(1)
                days = match.group(3)
                answer = f"({weeks}, {days})"
                # Compare the parsed (weeks, days) tuples, as in the original implementation.
                if eval(answer) == eval(ground_truth):
                    correctness = 1
                else:
                    correctness = 0
            else:
                correctness = 0
elif calid in [
4,
15,
16,
17,
18,
20,
21,
25,
27,
28,
29,
32,
33,
36,
43,
45,
48,
51,
69,
]:
            # Output type: integer. Parse as float before rounding: int() would
            # fail on decimal strings such as "3.0". (calid 69 in the list above
            # is unreachable; it is handled by the tuple branch.)
            answer = round(float(answer))
            if answer == int(ground_truth):
                correctness = 1
            else:
                correctness = 0
elif calid in [
2,
3,
5,
6,
7,
8,
9,
10,
11,
19,
22,
23,
24,
26,
30,
31,
38,
39,
40,
44,
46,
49,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
]:
            # Output type: decimal. Correct if within [lower_limit, upper_limit].
            answer = float(answer)
            if float(lower_limit) <= answer <= float(upper_limit):
                correctness = 1
            else:
                correctness = 0
else:
raise ValueError(f"Unknown calculator ID: {calid}")
return correctness
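
For a quick, self-contained illustration of the range-based check used for decimal calculators (the values below are hypothetical; no HELM imports needed):

# A decimal-type answer is marked correct when it lands inside the
# [lower_limit, upper_limit] tolerance band shipped with the ground truth.
answer, lower_limit, upper_limit = "23.9", "23.5", "24.5"
correctness = 1 if float(lower_limit) <= float(answer) <= float(upper_limit) else 0
print(correctness)  # 1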