huggingface · lhoestq · Nov 14, 2025 · Sep 29, 2025
diff --git a/metrics/bleurt/README.md b/metrics/bleurt/README.md
@@ -42,9 +42,15 @@ This metric takes as input lists of predicted sentences and reference sentences:
 ```
 
 ### Inputs
+
+For the `load` function:
+
+- **config_name** (`str`): Name of the BLEURT checkpoint to load. Defaults to `"bleurt-base-128"` if not specified. Available checkpoints are: `"bleurt-tiny-128"`, `"bleurt-tiny-512"`, `"bleurt-base-128"`, `"bleurt-base-512"`, `"bleurt-large-128"`, `"bleurt-large-512"`, `"BLEURT-20-D3"`, `"BLEURT-20-D6"`, `"BLEURT-20-D12"` and `"BLEURT-20"`.
+
+For the `compute` function:
+
 - **predictions** (`list` of `str`s): List of generated sentences to score.
 - **references** (`list` of `str`s): List of references to compare to.
-- **checkpoint** (`str`): BLEURT checkpoint. Will default to `BLEURT-tiny` if not specified. Other models that can be chosen are: `"bleurt-tiny-128"`, `"bleurt-tiny-512"`, `"bleurt-base-128"`, `"bleurt-base-512"`, `"bleurt-large-128"`, `"bleurt-large-512"`, `"BLEURT-20-D3"`, `"BLEURT-20-D6"`, `"BLEURT-20-D12"` and `"BLEURT-20"`. 
 
 ### Output Values
 - **scores** : a `list` of scores, one per prediction. 
@@ -65,7 +71,7 @@ BLEURT is used to compare models across different asks (e.g. (Table to text gene
 
 ### Examples
 
-Example with the default model:
+Example with the default model (`"bleurt-base-128"`):
 ```python
 >>> predictions = ["hello there", "general kenobi"]
 >>> references = ["hello there", "general kenobi"]
@@ -75,14 +81,14 @@ Example with the default model:
 {'scores': [1.0295498371124268, 1.0445425510406494]}
 ```
 
-Example with the `"bleurt-base-128"` model checkpoint:
+Example with the full `"BLEURT-20"` model checkpoint:
 ```python
 >>> predictions = ["hello there", "general kenobi"]
 >>> references = ["hello there", "general kenobi"]
->>> bleurt = load("bleurt", module_type="metric", checkpoint="bleurt-base-128")
+>>> bleurt = load("bleurt", module_type="metric", config_name="BLEURT-20")
 >>> results = bleurt.compute(predictions=predictions, references=references)
 >>> print(results)
-{'scores': [1.0295498371124268, 1.0445425510406494]}
+{'scores': [1.015415906906128, 0.9985226988792419]}
 ```
 
 ## Limitations and Bias

diff --git a/metrics/bleurt/bleurt.py b/metrics/bleurt/bleurt.py
@@ -100,8 +100,8 @@ def _download_and_prepare(self, dl_manager):
         # check that config name specifies a valid BLEURT model
         if self.config_name == "default":
             logger.warning(
-                "Using default BLEURT-Base checkpoint for sequence maximum length 128. "
-                "You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512')."
+                "Using default checkpoint 'bleurt-base-128' for sequence maximum length 128. "
+                "You can use a bigger model for better results with e.g.: evaluate.load('bleurt', config_name='bleurt-large-512')."
             )
             self.config_name = "bleurt-base-128"