6 changes: 3 additions & 3 deletions .github/workflows/ci.yml

@@ -22,7 +22,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.8"
+          python-version: "3.9"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -45,10 +45,10 @@ jobs:
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
-          python-version: "3.8"
+          python-version: "3.9"
       - name: Upgrade pip
        run: python -m pip install --upgrade pip
      - name: Install dependencies
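Note: the version bump is presumably housekeeping, since CPython 3.8 reached end of life in October 2024. A hypothetical one-line guard with the same intent (not part of this repository), for anyone pinning environments locally:

    import sys

    # Fail fast on interpreters older than the new CI target.
    assert sys.version_info >= (3, 9), "Python 3.9+ required"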
8 changes: 5 additions & 3 deletions metrics/mse/mse.py

@@ -14,7 +14,7 @@
 """MSE - Mean Squared Error Metric"""

 import datasets
-from sklearn.metrics import mean_squared_error
+from sklearn.metrics import mean_squared_error, root_mean_squared_error

 import evaluate

@@ -112,8 +112,10 @@ def _get_feature_types(self):

     def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average", squared=True):

-        mse = mean_squared_error(
-            references, predictions, sample_weight=sample_weight, multioutput=multioutput, squared=squared
+        mse = (
+            mean_squared_error(references, predictions, sample_weight=sample_weight, multioutput=multioutput)
+            if squared
+            else root_mean_squared_error(references, predictions, sample_weight=sample_weight, multioutput=multioutput)
         )

         return {"mse": mse}
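Note: this rewrite tracks scikit-learn's API change: `root_mean_squared_error` was added in scikit-learn 1.4, and the `squared` keyword of `mean_squared_error` was deprecated there and later removed. A minimal standalone sketch of the equivalence the dispatch relies on, assuming scikit-learn >= 1.4:

    import numpy as np
    from sklearn.metrics import mean_squared_error, root_mean_squared_error

    references = np.array([3.0, -0.5, 2.0, 7.0])
    predictions = np.array([2.5, 0.0, 2.0, 8.0])

    mse = mean_squared_error(references, predictions)        # the squared=True path
    rmse = root_mean_squared_error(references, predictions)  # the squared=False path

    assert np.isclose(rmse, np.sqrt(mse))  # RMSE is the square root of MSE
    print(mse, rmse)  # 0.375 0.6123724...

Keeping the metric's own `squared` flag preserves its public interface while steering clear of the deprecated keyword.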
2 changes: 1 addition & 1 deletion metrics/rl_reliability/rl_reliability.py

@@ -73,7 +73,7 @@
     >>> import numpy as np
    >>> rl_reliability = evaluate.load("rl_reliability", "online")
    >>> results = rl_reliability.compute(
-    ...     timesteps=[np.linspace(0, 2000000, 1000)],
+    ...     timesteps=[np.linspace(0, 2000000, 1000, dtype=np.int64)],
    ...     rewards=[np.linspace(0, 100, 1000)]
    ... )
    >>> print(results["LowerCVaROnRaw"].round(4))
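Note: `np.linspace` returns float64 by default, so the added `dtype` is what makes the doctest pass integer timesteps, assuming the underlying rl-reliability implementation expects integer step counts. A small illustration:

    import numpy as np

    t_float = np.linspace(0, 2000000, 1000)                # dtype: float64
    t_int = np.linspace(0, 2000000, 1000, dtype=np.int64)  # values truncated to int64

    print(t_float.dtype, t_int.dtype)  # float64 int64
    print(t_int[:3])                   # [   0 2002 4004]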
1 change: 0 additions & 1 deletion src/evaluate/utils/logging.py

@@ -218,7 +218,6 @@ def get_lock(self):

 def is_progress_bar_enabled() -> bool:
     """Return a boolean indicating whether tqdm progress bars are enabled."""
-    global _tqdm_active
     return bool(_tqdm_active)

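Note: the deleted line was a no-op, since `global` is only required when a function rebinds a module-level name, never to read it. A minimal sketch of the distinction, with names mirroring (but independent of) this module:

    _tqdm_active = True

    def is_progress_bar_enabled() -> bool:
        return bool(_tqdm_active)  # plain read; no `global` needed

    def disable_progress_bar() -> None:
        global _tqdm_active  # rebinding the module-level name requires `global`
        _tqdm_active = False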
18 changes: 9 additions & 9 deletions tests/test_trainer_evaluator_parity.py

@@ -59,7 +59,7 @@ def test_text_classification_parity(self):
         ) as f:
             transformers_results = json.load(f)

-        eval_dataset = load_dataset("glue", "sst2", split="validation[:80]")
+        eval_dataset = load_dataset("nyu-mll/glue", "sst2", split="validation[:80]")

         pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name)

@@ -104,7 +104,7 @@ def test_text_classification_parity_two_columns(self):
         ) as f:
             transformers_results = json.load(f)

-        eval_dataset = load_dataset("glue", "mnli", split=f"validation_matched[:{max_eval_samples}]")
+        eval_dataset = load_dataset("nyu-mll/glue", "mnli", split=f"validation_matched[:{max_eval_samples}]")

         pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name, max_length=256)

@@ -124,7 +124,7 @@ def test_text_classification_parity_two_columns(self):
     def test_image_classification_parity(self):
         # we can not compare to the Pytorch transformers example, that uses custom preprocessing on the images
         model_name = "douwekiela/resnet-18-finetuned-dogfood"
-        dataset_name = "beans"
+        dataset_name = "AI-Lab-Makerere/beans"
         max_eval_samples = 120

         raw_dataset = load_dataset(dataset_name, split="validation")
@@ -193,7 +193,7 @@ def test_question_answering_parity(self):
         subprocess.run(
             f"python examples/pytorch/question-answering/run_qa.py"
             f" --model_name_or_path {model_name_v1}"
-            f" --dataset_name squad"
+            f" --dataset_name rajpurkar/squad"
             f" --do_eval"
             f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squad_transformers')}"
             f" --max_eval_samples 100"
@@ -207,7 +207,7 @@ def test_question_answering_parity(self):
         ) as f:
             transformers_results = json.load(f)

-        eval_dataset = load_dataset("squad", split="validation[:100]")
+        eval_dataset = load_dataset("rajpurkar/squad", split="validation[:100]")

         pipe = pipeline(
             task="question-answering",
@@ -232,7 +232,7 @@ def test_question_answering_parity(self):
         subprocess.run(
             f"python examples/pytorch/question-answering/run_qa.py"
             f" --model_name_or_path {model_name_v2}"
-            f" --dataset_name squad_v2"
+            f" --dataset_name rajpurkar/squad_v2"
             f" --version_2_with_negative"
             f" --do_eval"
             f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squadv2_transformers')}"
@@ -247,7 +247,7 @@ def test_question_answering_parity(self):
         ) as f:
             transformers_results = json.load(f)

-        eval_dataset = load_dataset("squad_v2", split="validation[:100]")
+        eval_dataset = load_dataset("rajpurkar/squad_v2", split="validation[:100]")

         pipe = pipeline(
             task="question-answering",
@@ -282,7 +282,7 @@ def test_token_classification_parity(self):
         subprocess.run(
             f"python examples/pytorch/token-classification/run_ner.py"
             f" --model_name_or_path {model_name}"
-            f" --dataset_name conll2003"
+            f" --dataset_name areias/conll2003-generative"
             f" --do_eval"
             f" --output_dir {os.path.join(self.dir_path, 'tokenclassification_conll2003_transformers')}"
             f" --max_eval_samples {n_samples}",
@@ -295,7 +295,7 @@ def test_token_classification_parity(self):
         ) as f:
             transformers_results = json.load(f)

-        eval_dataset = load_dataset("conll2003", split=f"validation[:{n_samples}]")
+        eval_dataset = load_dataset("areias/conll2003-generative", split=f"validation[:{n_samples}]")

         pipe = pipeline(task="token-classification", model=model_name)

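Note: every rename in this test file follows one pattern: bare dataset names ("glue", "beans", "squad", "conll2003") become namespaced Hub IDs, since the canonical datasets now live under organization namespaces on the Hugging Face Hub. The split-slicing syntax the tests rely on is unchanged; a minimal sketch, assuming `rajpurkar/squad` is reachable:

    from datasets import load_dataset

    n_samples = 100
    # A slice inside the split string loads only that range of examples.
    ds = load_dataset("rajpurkar/squad", split=f"validation[:{n_samples}]")
    assert ds.num_rows == n_samples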