Changes from all commits (118 commits)
e4b6e84
Added medcalc bench scenario
MiguelAFH Nov 23, 2024
c8219a9
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Nov 23, 2024
c2ca7e5
Rollback removal of medical scenarios
MiguelAFH Nov 23, 2024
a2aa50f
UNTESTED implementation of Medalign with new setup, pushing to test o…
aunell Nov 26, 2024
0679d3d
updated medalign, functional
aunell Nov 27, 2024
f3b53e1
add max tokens for medalign run spec
aunell Nov 27, 2024
a087633
Added llama 3.1 instruct and medalign to schema_medical
MiguelAFH Nov 30, 2024
501fe33
Added display name for Llama 3.2 1B Instruct
MiguelAFH Dec 1, 2024
7bf79a2
Update summarization metrics and MedAlign spec to bring bertscore onl…
aunell Dec 3, 2024
e9c8e47
Added MedDialog to MEDHELM
MiguelAFH Dec 7, 2024
98d7d0b
feat: implement medcalc bench scenario, metrics and specs
Dec 11, 2024
bb15f35
feat: med calc bench one shot spec
Dec 12, 2024
aa41939
Added MIMIC-RRS scenario
MiguelAFH Dec 14, 2024
676d4a5
Reduced max tokens for MIMIC-RRS
MiguelAFH Dec 15, 2024
ed2429c
Fix device for medical scenarios
MiguelAFH Dec 15, 2024
3e5bd96
Added groups for each medical task category
MiguelAFH Dec 15, 2024
78b2a91
dischargeMe scenario + schema update
aunell Dec 16, 2024
792fb4f
fix: dataset loading and standardize naming
Dec 17, 2024
3700016
Added medi_qa scenario
MiguelAFH Dec 19, 2024
acb902b
feat: add mimic billing codes
suhana13 Dec 22, 2024
5c3eea0
Added MIMICIV Billing Code scenario
MiguelAFH Jan 6, 2025
2f96439
add mtsamples benchmark
aunell Jan 6, 2025
b48d219
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Jan 6, 2025
513987c
feat: resolve merge commits
suhana13 Jan 9, 2025
73ed7da
Modified medication_qa metrics
MiguelAFH Jan 9, 2025
7bf8b11
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Jan 9, 2025
c52313b
initial ehrshot commit
Miking98 Jan 10, 2025
ad8c71e
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
Miking98 Jan 10, 2025
53c9e56
ehrshot
Miking98 Jan 10, 2025
57241ec
token stats for ehrshot
Miking98 Jan 14, 2025
7715c16
Race based medicine detection benchmark for Ensuring Clinical Researc…
aunell Jan 14, 2025
8921886
updated multiple choice adapter
aunell Jan 14, 2025
04fd2bb
fix: mimiciv duplicate instantiation
suhana13 Jan 15, 2025
5d1a67e
Add medbullets, headqa, aci_bench, medec scenarios
haoqiu1 Jan 15, 2025
d74c2c7
ehrshot update
Miking98 Jan 17, 2025
fbbb4ba
n2c2 2018 ct matching benchmark
Miking98 Jan 17, 2025
ca18be5
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
Miking98 Jan 17, 2025
c94612a
Added Claude and Google clients
MiguelAFH Jan 18, 2025
bf91f7d
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Jan 18, 2025
0dd1dad
Merged main
MiguelAFH Jan 18, 2025
77e6571
Updated models
MiguelAFH Jan 19, 2025
ca8c130
update EHRShot to run within HELM environment
aunell Jan 19, 2025
30bf966
update n2c2 to run within HELM environment
HennyJie Jan 20, 2025
435d563
Fixed prompts for ehrshot and n2c2
MiguelAFH Jan 22, 2025
ccc0a01
update bert score metric to match original HELM implementation, still…
aunell Jan 23, 2025
5e9bca8
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Jan 23, 2025
972a2de
Added max tokens for EHRSHOT
MiguelAFH Jan 24, 2025
0a0c465
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Jan 24, 2025
43c1057
Updated StanfordHealthCareGoogleClient
MiguelAFH Jan 24, 2025
cd7d60a
Modified instruction for MedCalcBench
MiguelAFH Jan 24, 2025
9ebca13
remove RAG from MedAlign implementation, instead filter by 128k context
aunell Jan 24, 2025
0b314ed
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Jan 24, 2025
e876c41
update prompt for medalign
aunell Jan 24, 2025
3afd9f6
add ehr sql scenario
haoqiu1 Jan 24, 2025
d0eadf7
fix: update medical yaml and aci bench metric
haoqiu1 Jan 24, 2025
17bb687
fix: update medical yaml head qa and medbullets
haoqiu1 Jan 24, 2025
aea6aa7
Added phi-3.5-mini-instruct model
MiguelAFH Jan 29, 2025
7600572
fix medbullets headqa and aci_bench secnario, configs
haoqiu1 Jan 29, 2025
71da4ab
fix ehr sql metric in extracting is_impossible
haoqiu1 Jan 29, 2025
784facd
change medbullets to download data from raw csv
haoqiu1 Jan 29, 2025
bef3051
fix medec scenario by adding correct tag to note without medical error
haoqiu1 Jan 29, 2025
53ea958
add vqa-rad scenario
haoqiu1 Jan 30, 2025
4e75fda
add mtsamples procedure subset
haoqiu1 Jan 30, 2025
d85c599
Changing promt length check to tokens
MiguelAFH Jan 31, 2025
44bd695
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Jan 31, 2025
7ed6aff
fix: mimiciv eval metrics
suhana13 Feb 3, 2025
547c333
feat: mimic billing eval fixes
suhana13 Feb 3, 2025
aa967d7
fix: mimic billing code metrics
suhana13 Feb 3, 2025
af75582
add mtsamples scenario loading preprocessed data instead of web scra…
haoqiu1 Feb 3, 2025
38be6b6
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
haoqiu1 Feb 3, 2025
dc5ba2d
Fixed errors
MiguelAFH Feb 3, 2025
ccab5e9
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Feb 3, 2025
d2b4299
update medec metric and set max token
haoqiu1 Feb 4, 2025
db6215f
Fixed prompts
MiguelAFH Feb 4, 2025
2703207
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Feb 4, 2025
dcca97a
Merged main
MiguelAFH Feb 4, 2025
6da3e4f
feat: convert all med scenarios to zero shot
suhana13 Feb 4, 2025
3f2a152
adjust max tokens for aci_bench and ehrshot
aunell Feb 4, 2025
5f3e375
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Feb 4, 2025
a06cd02
change aci bench to summac and medec run specs prompt
haoqiu1 Feb 4, 2025
4ad64ea
update mtsamples general to match mtsamples procedures methodology
aunell Feb 4, 2025
a9db121
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Feb 4, 2025
7656b30
remove webscrapping mtsamples
haoqiu1 Feb 4, 2025
a6d82b0
merge resolution
aunell Feb 4, 2025
761e906
change headqa max token to 1
haoqiu1 Feb 5, 2025
84b9c33
Fixed problem of empty instances
MiguelAFH Feb 5, 2025
dde6be9
add: mental health scenario
HennyJie Feb 5, 2025
8552c75
changed max token output for head_qa and medbullets EM multi-choice, …
aunell Feb 5, 2025
9517c3a
update: mental health schema
HennyJie Feb 5, 2025
4cb61f7
Merge branch 'med-helm' of github.com:stanford-crfm/helm into med-helm
HennyJie Feb 5, 2025
df6c9d6
Merged main
MiguelAFH Feb 6, 2025
7e1d10a
Merge branch 'main' into med-helm
MiguelAFH Feb 7, 2025
13ebc83
Set max train instances to 0 for missing MedHELM scenarios
MiguelAFH Feb 8, 2025
cbf74e9
chw_care_plan scenario
aunell Feb 8, 2025
a262a49
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Feb 8, 2025
090225e
fix EHRShot prompt
aunell Feb 8, 2025
6927b84
feat: add phi3.5 quantized
suhana13 Feb 8, 2025
0ba2420
feat: add starr patient instructions dataset
suhana13 Feb 8, 2025
2b1ddb6
Added support for max length for Medalign scenario
MiguelAFH Feb 9, 2025
18a0655
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Feb 9, 2025
6fb183d
Merge branch 'main' into med-helm
MiguelAFH Feb 9, 2025
508b3ce
updated bertscore to fix range issue
aunell Feb 9, 2025
39be2ee
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Feb 9, 2025
fefab01
Merged main
MiguelAFH Feb 9, 2025
fc985ff
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Feb 9, 2025
25cec43
fix: starr patient instructions prompt
suhana13 Feb 9, 2025
b7e7095
Modified EHRSHOT prompt
MiguelAFH Feb 9, 2025
1b65dbf
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
MiguelAFH Feb 9, 2025
611f2f3
feat: remove medalign yaml
suhana13 Feb 9, 2025
b30b336
Merge remote changes
suhana13 Feb 9, 2025
145fd35
feat: add clear dataset
suhana13 Feb 9, 2025
805f8d4
feat: clear edits
suhana13 Feb 9, 2025
544880c
feat: add mcq to reference
suhana13 Feb 9, 2025
3fef6db
fix: prompt for med_dialog
suhana13 Feb 10, 2025
1232010
fix chw benchmark, catch empty examples
aunell Feb 10, 2025
96fc5b9
Merge branch 'med-helm' of https://github.com/stanford-crfm/helm into…
aunell Feb 10, 2025
fe356ac
feat: predefined one shot logic
Feb 10, 2025
4f98c79
Merge branch 'med-helm' into feat/medcalc_bench_scenario
sashimono-san Feb 10, 2025
11 changes: 8 additions & 3 deletions requirements.txt
@@ -14,13 +14,14 @@ anthropic==0.38.0
antlr4-python3-runtime==4.9.3
anyio==4.8.0
astunparse==1.6.3
-async-timeout==5.0.1
+async-timeout==4.0.3
attrs==24.3.0
audioread==3.0.1
autokeras==1.0.20
av==14.0.1
awscli==1.33.44
beautifulsoup4==4.12.3
+bert_score==0.3.13
black==24.3.0
blis==1.1.0
boto3==1.34.162
@@ -131,6 +132,8 @@ keras==3.8.0
keras-tuner==1.4.7
kiwisolver==1.4.7
kt-legacy==1.0.5
+langchain==0.3.9
+langchain-community==0.3.8
langcodes==3.5.0
langdetect==1.0.9
language_data==1.3.0
@@ -230,15 +233,17 @@ pytrec_eval==0.5
pytz==2024.2
PyWavelets==1.6.0
PyYAML==6.0.2
+qwen-vl-utils==0.0.8
+RapidFuzz==3.11.0
+rank_bm25==0.2.2
referencing==0.35.1
regex==2024.11.6
-RapidFuzz==3.11.0
reka-api==2.0.0
requests==2.32.3
retrying==1.3.4
rich==13.9.4
rouge_score==0.1.2
+rsa==4.7.2
-qwen-vl-utils==0.0.8
s3transfer==0.10.4
sacrebleu==2.2.1
safetensors==0.5.1
116 changes: 116 additions & 0 deletions src/helm/benchmark/metrics/ehr_sql_metrics.py
@@ -0,0 +1,116 @@
import re
from typing import Any, Dict, List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat
from helm.common.hierarchical_logger import hlog


class EhrSqlMetric(Metric):
"""
Metric for evaluating the EHR SQL dataset, assessing the model's ability to generate valid SQL queries.

This implementation calculates:
1. Precision for Answerable Questions (Pans): The proportion of correctly predicted answerable questions
among all questions predicted to be answerable.
2. Recall for Answerable Questions (Rans): The proportion of correctly predicted answerable questions
among all answerable questions in the dataset.
"""

    def extract_is_impossible(self, input_text: str) -> bool:
        """Extract the `is_impossible` flag from the input text using a regex."""
        # e.g. '... "is_impossible": true ...' -> True; no match -> False.
        match = re.search(r'"is_impossible":\s*(true|false)', input_text, re.IGNORECASE)
        # Coerce to bool: `match and ...` would return None when the flag is absent.
        return bool(match and match.group(1).lower() == "true")

def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
"""
Evaluate a single generation against the reference labels.
"""

        # Extract predictions
        assert request_state.result is not None, "Expected a result in the request state."
        predictions = [completion.text.strip() for completion in request_state.result.completions]

        if not predictions:
            raise ValueError("No predictions found in the completions.")

        # Use the first prediction as the primary output.
        prediction = predictions[0]

# Extract references and input text
references = getattr(request_state.instance, "references", None)
input_text = request_state.instance.input.text # Read input text

        if not references:
            hlog(f"Warning: Missing references for instance {request_state.instance}")
            return []

# Check if the ground truth is answerable based on is_impossible flag
ground_truth_query = references[0].output.text
is_impossible = self.extract_is_impossible(input_text) # Extract from input

is_answerable = not is_impossible and bool(ground_truth_query)

# Check if the model prediction is answerable
is_predicted_answerable = bool(prediction)

# Determine correctness for answerable questions
correct_answerable = int(is_answerable and is_predicted_answerable)

return [
Stat(MetricName("ehr_sql_precision_answerable")).add(
correct_answerable if is_predicted_answerable else 0
),
Stat(MetricName("ehr_sql_recall_answerable")).add(
correct_answerable if is_answerable else 0
),
Stat(MetricName("ehr_sql_total_predicted_answerable")).add(
int(is_predicted_answerable)
),
Stat(MetricName("ehr_sql_total_ground_truth_answerable")).add(
int(is_answerable)
),
]

    def compute(self, stats: List[Stat], **kwargs: Any) -> Dict[str, float]:
        """
        Aggregate statistics to compute final metrics.
        """

        # Both per-instance answerable stats record the same true-positive indicator
        # (a prediction is correct only when it is both answerable and predicted
        # answerable), so sum only one of them; summing both would double-count
        # the numerator and inflate precision and recall.
        correct_answerable = sum(
            stat.value for stat in stats if stat.name == "ehr_sql_precision_answerable"
        )
        total_predicted_answerable = sum(
            stat.value for stat in stats if stat.name == "ehr_sql_total_predicted_answerable"
        )
        total_ground_truth_answerable = sum(
            stat.value for stat in stats if stat.name == "ehr_sql_total_ground_truth_answerable"
        )

        # Precision (Pans) = TP / predicted answerable; Recall (Rans) = TP / answerable.
        precision = (
            correct_answerable / total_predicted_answerable
            if total_predicted_answerable > 0
            else 0.0
        )
        recall = (
            correct_answerable / total_ground_truth_answerable
            if total_ground_truth_answerable > 0
            else 0.0
        )

        return {
            "ehr_sql_precision_answerable": precision,
            "ehr_sql_recall_answerable": recall,
        }
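
As a sanity check on the aggregation above, here is a minimal standalone sketch (plain Python, no HELM dependencies; the per-instance records are made up for illustration) of how Pans and Rans fall out of the summed counts:

# Hypothetical per-instance records: (is_answerable, is_predicted_answerable).
instances = [(True, True), (True, False), (False, True), (True, True)]

# A prediction is a true positive only when both flags hold.
true_positives = sum(1 for ans, pred in instances if ans and pred)  # 2
total_predicted = sum(1 for _, pred in instances if pred)  # 3
total_answerable = sum(1 for ans, _ in instances if ans)  # 3

precision_answerable = true_positives / total_predicted if total_predicted else 0.0
recall_answerable = true_positives / total_answerable if total_answerable else 0.0
print(f"Pans={precision_answerable:.3f}, Rans={recall_answerable:.3f}")  # Pans=0.667, Rans=0.667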
182 changes: 182 additions & 0 deletions src/helm/benchmark/metrics/medcalc_bench_metrics.py
@@ -0,0 +1,182 @@
import re
from datetime import datetime
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat
from helm.benchmark.scenarios.scenario import CORRECT_TAG
from helm.common.hierarchical_logger import hlog


class MedCalcBenchMetric(Metric):
def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
"""Metric for MedCalc-Bench dataset.

Original implementation:
https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/evaluate.py#L11
"""
assert request_state.instance.extra_data, (
"Could not find `extra_data` in the request state. "
"Both `lower_limit` and `upper_limit` are required for this metric."
)

assert len(request_state.result.completions) == 1, (
f"Found a total of {len(request_state.result.completions)} completions. "
"Only one was expected"
)

final_answer = (
request_state.result.completions[0]
.text.strip()
.lower()
.split("calculated value:")[-1]
.strip()
)
ground_truth_ref = [ref for ref in request_state.instance.references if CORRECT_TAG in ref.tags][0]

correctness = 0
if final_answer:
try:
correctness = self.medcalc_bench_metric_calculation(
answer=final_answer,
ground_truth=ground_truth_ref.output.text,
calid=int(request_state.instance.extra_data["calculator_id"]),
upper_limit=request_state.instance.extra_data["upper_limit"],
lower_limit=request_state.instance.extra_data["lower_limit"],
)
            except ValueError as e:
                hlog(
                    "Failed to calculate the correctness of the output for MedCalc-Bench instance "
                    f"with id {request_state.instance.id}: {e}"
                )

return [Stat(MetricName("medcalc_bench_metric")).add(correctness)]

def medcalc_bench_metric_calculation(
self,
answer: str,
ground_truth: str,
calid: int,
upper_limit: str,
lower_limit: str,
) -> int:
"""Calculate the metric for MedCalc-Bench dataset.

This method is basically a copy of the original implementation of this metric:
https://github.com/ncbi-nlp/MedCalc-Bench/blob/048ba77dbe332e9190935e4a30965bff444b940e/evaluation/evaluate.py#L11

Credits to the original authors: https://github.com/ncbi-nlp/MedCalc-Bench.
"""
        if calid in [13, 68]:
            # Output type: date. Compare after normalizing to a no-leading-zeros
            # format. Note that the "%-m/%-d/%Y" directive is POSIX-only and
            # raises on Windows.
            if datetime.strptime(answer, "%m/%d/%Y").strftime("%-m/%-d/%Y") == datetime.strptime(
                ground_truth, "%m/%d/%Y"
            ).strftime("%-m/%-d/%Y"):
                correctness = 1
            else:
                correctness = 0
        elif calid in [69]:
            # Output type: integer tuple "(weeks, days)".
            match = re.search(
                r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?",
                ground_truth,
            )
            assert match, f"Could not parse ground truth tuple: {ground_truth}"
            ground_truth = f"({match.group(1)}, {match.group(3)})"
            match = re.search(
                r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?",
                answer,
            )
            if match:
                weeks = match.group(1)
                days = match.group(3)
                answer = f"({weeks}, {days})"
                # Compare the parsed (weeks, days) tuples, as in the original implementation.
                if eval(answer) == eval(ground_truth):
                    correctness = 1
                else:
                    correctness = 0
            else:
                correctness = 0
elif calid in [
4,
15,
16,
17,
18,
20,
21,
25,
27,
28,
29,
32,
33,
36,
43,
45,
48,
51,
69,
]:
            # Output type: integer. Parse as float before rounding: int() would
            # fail on decimal strings such as "3.0". (calid 69 in the list above
            # is unreachable; it is handled by the tuple branch.)
            answer = round(float(answer))
            if answer == int(ground_truth):
                correctness = 1
            else:
                correctness = 0
elif calid in [
2,
3,
5,
6,
7,
8,
9,
10,
11,
19,
22,
23,
24,
26,
30,
31,
38,
39,
40,
44,
46,
49,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
]:
            # Output type: decimal. Correct if within [lower_limit, upper_limit].
            answer = float(answer)
            if float(lower_limit) <= answer <= float(upper_limit):
                correctness = 1
            else:
                correctness = 0
else:
raise ValueError(f"Unknown calculator ID: {calid}")
return correctness
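
For a quick, self-contained illustration of the range-based check used for decimal calculators (the values below are hypothetical; no HELM imports needed):

# A decimal-type answer is marked correct when it lands inside the
# [lower_limit, upper_limit] tolerance band shipped with the ground truth.
answer, lower_limit, upper_limit = "23.9", "23.5", "24.5"
correctness = 1 if float(lower_limit) <= float(answer) <= float(upper_limit) else 0
print(correctness)  # 1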