From 01831898dc1ba54df92729fba8c320efc08ac418 Mon Sep 17 00:00:00 2001 From: bricksdont Date: Mon, 29 Sep 2025 14:17:16 +0200 Subject: [PATCH] fix bleurt docs --- metrics/bleurt/README.md | 16 +++++++++++----- metrics/bleurt/bleurt.py | 4 ++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/metrics/bleurt/README.md b/metrics/bleurt/README.md index 03a9bf829..b14094e5d 100644 --- a/metrics/bleurt/README.md +++ b/metrics/bleurt/README.md @@ -42,9 +42,15 @@ This metric takes as input lists of predicted sentences and reference sentences: ``` ### Inputs + +For the `load` function: + +- **config_name** (`str`): BLEURT checkpoint. Will default to `"bleurt-base-128"` if not specified. Other models that can be chosen are: `"bleurt-tiny-128"`, `"bleurt-tiny-512"`, `"bleurt-base-128"`, `"bleurt-base-512"`, `"bleurt-large-128"`, `"bleurt-large-512"`, `"BLEURT-20-D3"`, `"BLEURT-20-D6"`, `"BLEURT-20-D12"` and `"BLEURT-20"`. + +For the `compute` function: + - **predictions** (`list` of `str`s): List of generated sentences to score. - **references** (`list` of `str`s): List of references to compare to. -- **checkpoint** (`str`): BLEURT checkpoint. Will default to `BLEURT-tiny` if not specified. Other models that can be chosen are: `"bleurt-tiny-128"`, `"bleurt-tiny-512"`, `"bleurt-base-128"`, `"bleurt-base-512"`, `"bleurt-large-128"`, `"bleurt-large-512"`, `"BLEURT-20-D3"`, `"BLEURT-20-D6"`, `"BLEURT-20-D12"` and `"BLEURT-20"`. ### Output Values - **scores** : a `list` of scores, one per prediction. @@ -65,7 +71,7 @@ BLEURT is used to compare models across different asks (e.g. (Table to text gene ### Examples -Example with the default model: +Example with the default model (`"bleurt-base-128"`): ```python >>> predictions = ["hello there", "general kenobi"] >>> references = ["hello there", "general kenobi"] @@ -75,14 +81,14 @@ Example with the default model: {'scores': [1.0295498371124268, 1.0445425510406494]} ``` -Example with the `"bleurt-base-128"` model checkpoint: +Example with the full `"BLEURT-20"` model checkpoint: ```python >>> predictions = ["hello there", "general kenobi"] >>> references = ["hello there", "general kenobi"] ->>> bleurt = load("bleurt", module_type="metric", checkpoint="bleurt-base-128") +>>> bleurt = load("bleurt", module_type="metric", config_name="BLEURT-20") >>> results = bleurt.compute(predictions=predictions, references=references) >>> print(results) -{'scores': [1.0295498371124268, 1.0445425510406494]} +{'scores': [1.015415906906128, 0.9985226988792419]} ``` ## Limitations and Bias diff --git a/metrics/bleurt/bleurt.py b/metrics/bleurt/bleurt.py index b47f8d284..11ad20a70 100644 --- a/metrics/bleurt/bleurt.py +++ b/metrics/bleurt/bleurt.py @@ -100,8 +100,8 @@ def _download_and_prepare(self, dl_manager): # check that config name specifies a valid BLEURT model if self.config_name == "default": logger.warning( - "Using default BLEURT-Base checkpoint for sequence maximum length 128. " - "You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512')." + "Using default checkpoint 'bleurt-base-128' for sequence maximum length 128. " + "You can use a bigger model for better results with e.g.: evaluate.load('bleurt', config_name='bleurt-large-512')." ) self.config_name = "bleurt-base-128"