From 88bcde56d83d0cb2c4e631c0c95cf9d0cc3841ac Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Mon, 11 Aug 2025 11:06:16 +0200
Subject: [PATCH 1/4] fix ci

---
 tests/test_trainer_evaluator_parity.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/test_trainer_evaluator_parity.py b/tests/test_trainer_evaluator_parity.py
index b513da886..280ab818d 100644
--- a/tests/test_trainer_evaluator_parity.py
+++ b/tests/test_trainer_evaluator_parity.py
@@ -59,7 +59,7 @@ def test_text_classification_parity(self):
         ) as f:
             transformers_results = json.load(f)
 
-        eval_dataset = load_dataset("glue", "sst2", split="validation[:80]")
+        eval_dataset = load_dataset("nyu-mll/glue", "sst2", split="validation[:80]")
 
         pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name)
 
@@ -104,7 +104,7 @@ def test_text_classification_parity_two_columns(self):
         ) as f:
             transformers_results = json.load(f)
 
-        eval_dataset = load_dataset("glue", "mnli", split=f"validation_matched[:{max_eval_samples}]")
+        eval_dataset = load_dataset("nyu-mll/glue", "mnli", split=f"validation_matched[:{max_eval_samples}]")
 
         pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name, max_length=256)
 
@@ -124,7 +124,7 @@ def test_image_classification_parity(self):
 
         # we can not compare to the Pytorch transformers example, that uses custom preprocessing on the images
         model_name = "douwekiela/resnet-18-finetuned-dogfood"
-        dataset_name = "beans"
+        dataset_name = "AI-Lab-Makerere/beans"
         max_eval_samples = 120
 
         raw_dataset = load_dataset(dataset_name, split="validation")
@@ -193,7 +193,7 @@ def test_question_answering_parity(self):
         subprocess.run(
             f"python examples/pytorch/question-answering/run_qa.py"
             f" --model_name_or_path {model_name_v1}"
-            f" --dataset_name squad"
+            f" --dataset_name rajpurkar/squad"
             f" --do_eval"
             f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squad_transformers')}"
             f" --max_eval_samples 100"
@@ -207,7 +207,7 @@
         ) as f:
             transformers_results = json.load(f)
 
-        eval_dataset = load_dataset("squad", split="validation[:100]")
+        eval_dataset = load_dataset("rajpurkar/squad", split="validation[:100]")
 
         pipe = pipeline(
             task="question-answering",
@@ -232,7 +232,7 @@
         subprocess.run(
             f"python examples/pytorch/question-answering/run_qa.py"
             f" --model_name_or_path {model_name_v2}"
-            f" --dataset_name squad_v2"
+            f" --dataset_name rajpurkar/squad_v2"
             f" --version_2_with_negative"
             f" --do_eval"
             f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squadv2_transformers')}"
@@ -247,7 +247,7 @@
         ) as f:
             transformers_results = json.load(f)
 
-        eval_dataset = load_dataset("squad_v2", split="validation[:100]")
+        eval_dataset = load_dataset("rajpurkar/squad_v2", split="validation[:100]")
 
         pipe = pipeline(
             task="question-answering",
@@ -282,7 +282,7 @@ def test_token_classification_parity(self):
         subprocess.run(
             f"python examples/pytorch/token-classification/run_ner.py"
             f" --model_name_or_path {model_name}"
-            f" --dataset_name conll2003"
+            f" --dataset_name eriktks/conll2003"
             f" --do_eval"
             f" --output_dir {os.path.join(self.dir_path, 'tokenclassification_conll2003_transformers')}"
             f" --max_eval_samples {n_samples}",
@@ -295,7 +295,7 @@
         ) as f:
             transformers_results = json.load(f)
 
-        eval_dataset = load_dataset("conll2003", split=f"validation[:{n_samples}]")
+        eval_dataset = load_dataset("eriktks/conll2003", split=f"validation[:{n_samples}]")
 
         pipe = pipeline(task="token-classification", model=model_name)
 

From ce0c042f54a119df5e321f9157baa331232c6d86 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Mon, 11 Aug 2025 11:45:19 +0200
Subject: [PATCH 2/4] update to 3.9

---
 .github/workflows/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index dd1a53f4c..83c476464 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,7 +22,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.8"
+          python-version: "3.9"
      - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -45,10 +45,10 @@
      - uses: actions/checkout@v3
         with:
           fetch-depth: 0
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
         uses: actions/setup-python@v4
         with:
-          python-version: "3.8"
+          python-version: "3.9"
       - name: Upgrade pip
         run: python -m pip install --upgrade pip
       - name: Install dependencies

From b24544507231a54d3f0b144f976cf4fce9f6dd52 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Mon, 11 Aug 2025 11:47:51 +0200
Subject: [PATCH 3/4] style

---
 src/evaluate/utils/logging.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/evaluate/utils/logging.py b/src/evaluate/utils/logging.py
index 8df58d3dc..d29b7f484 100644
--- a/src/evaluate/utils/logging.py
+++ b/src/evaluate/utils/logging.py
@@ -218,7 +218,6 @@ def get_lock(self):
 
 
 def is_progress_bar_enabled() -> bool:
     """Return a boolean indicating whether tqdm progress bars are enabled."""
-    global _tqdm_active
     return bool(_tqdm_active)
 

From 75c824290323da97946db467f2fc2cb32a476260 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Mon, 11 Aug 2025 12:55:23 +0200
Subject: [PATCH 4/4] fix ci

---
 metrics/mse/mse.py                       | 8 +++++---
 metrics/rl_reliability/rl_reliability.py | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/metrics/mse/mse.py b/metrics/mse/mse.py
index fb695bfde..92e9ca311 100644
--- a/metrics/mse/mse.py
+++ b/metrics/mse/mse.py
@@ -14,7 +14,7 @@
 """MSE - Mean Squared Error Metric"""
 
 import datasets
-from sklearn.metrics import mean_squared_error
+from sklearn.metrics import mean_squared_error, root_mean_squared_error
 
 import evaluate
 
@@ -112,8 +112,10 @@ def _get_feature_types(self):
 
     def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average", squared=True):
 
-        mse = mean_squared_error(
-            references, predictions, sample_weight=sample_weight, multioutput=multioutput, squared=squared
+        mse = (
+            mean_squared_error(references, predictions, sample_weight=sample_weight, multioutput=multioutput)
+            if squared
+            else root_mean_squared_error(references, predictions, sample_weight=sample_weight, multioutput=multioutput)
         )
         return {"mse": mse}
 
diff --git a/metrics/rl_reliability/rl_reliability.py b/metrics/rl_reliability/rl_reliability.py
index 34a9c4570..d3165d7a3 100644
--- a/metrics/rl_reliability/rl_reliability.py
+++ b/metrics/rl_reliability/rl_reliability.py
@@ -73,7 +73,7 @@
 >>> import numpy as np
 >>> rl_reliability = evaluate.load("rl_reliability", "online")
 >>> results = rl_reliability.compute(
-...     timesteps=[np.linspace(0, 2000000, 1000)],
+...     timesteps=[np.linspace(0, 2000000, 1000, dtype=np.int64)],
 ...     rewards=[np.linspace(0, 100, 1000)]
 ... )
 >>> print(results["LowerCVaROnRaw"].round(4))