load_dataset_text_cls.py
"""Load dataset for sequence classification. Refactored from `run_text_cls.py`."""
import datasets
from datasets import load_dataset
def load_raw_dataset(
    dataset_name=None,
    dataset_config_name=None,
    train_file=None,
    evaluation_file=None,
    text_column_name=None,
    label_column_name=None,
):
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation
    # files (see below) or provide the name of one of the public text classification datasets
    # available on the hub at https://huggingface.co/datasets/ (the dataset will be downloaded
    # automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script uses the column called 'label' as the labels and, as the
    # pair of sentences, the columns called 'sentence1' and 'sentence2' if they exist, or
    # otherwise the first two columns not named 'label' if at least two such columns are provided.
    #
    # If the CSV/JSON files contain only one non-label column, the script does single-sentence
    # classification on that column. You can easily tweak this behavior (see below).
    if dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(dataset_name, dataset_config_name)
    else:
        # Loading the dataset from a local CSV, JSON, or TXT file.
        data_files = {}
        if train_file is not None:
            data_files["train"] = train_file
        if evaluation_file is not None:
            data_files["test"] = evaluation_file
        extension = (train_file if train_file is not None else evaluation_file).split(".")[-1]
        if extension == "txt":
            # `load_dataset` expects the loader name "text" for plain-text files.
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # Labels: build a deterministic label-to-id mapping from the training split
    # (a "train" split is assumed to exist).
    label_list = raw_datasets["train"].unique(label_column_name)
    label_list.sort()  # Sort for determinism across runs.
    label_to_id = {label: i for i, label in enumerate(label_list)}
    return raw_datasets, label_list, label_to_id
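
# Illustrative usage sketch of `load_raw_dataset`. The hub dataset "ag_news"
# and its integer "label" column are assumptions for demonstration, not part
# of this module:
#
#   raw_datasets, label_list, label_to_id = load_raw_dataset(
#       dataset_name="ag_news",
#       label_column_name="label",
#   )
#   # label_list == [0, 1, 2, 3]; label_to_id == {0: 0, 1: 1, 2: 2, 3: 3}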


def tokenize_raw_dataset(
    tokenizer,
    raw_datasets,
    label_list,
    label_to_id,
    text_column_name,
    label_column_name,
    pad_to_max_length=False,
    max_length=None,
):
    # Preprocessing the datasets.
    padding = "max_length" if pad_to_max_length else False

    def preprocess_function(examples):
        # Tokenize the texts; `text_column_name` is a list of one or two column names.
        texts = [examples[n] for n in text_column_name]
        result = tokenizer(*texts, padding=padding, max_length=max_length, truncation=True)
        if label_column_name in examples:
            if label_to_id is not None:
                # Map labels to IDs (not necessary for GLUE tasks).
                result["labels"] = [label_to_id[label] for label in examples[label_column_name]]
            else:
                # Rename the column to "labels" because the model expects that name.
                result["labels"] = examples[label_column_name]
        return result
    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        desc="Running tokenizer on dataset",
    )
    # "train" and "test" splits are assumed here; a hub dataset that ships a
    # "validation" split instead would need this adjusted.
    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["test"]
    return train_dataset, eval_dataset
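

if __name__ == "__main__":
    # Minimal end-to-end sketch, not part of the refactored module. The hub
    # dataset "ag_news" and the "bert-base-uncased" checkpoint are illustrative
    # assumptions; ag_news ships "train"/"test" splits, a "text" column, and
    # integer labels, which matches the expectations above.
    from transformers import AutoTokenizer

    raw_datasets, label_list, label_to_id = load_raw_dataset(
        dataset_name="ag_news",
        label_column_name="label",
    )
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    train_dataset, eval_dataset = tokenize_raw_dataset(
        tokenizer,
        raw_datasets,
        label_list,
        label_to_id,
        text_column_name=["text"],
        label_column_name="label",
        max_length=128,
    )
    print(train_dataset)  # Tokenized features: input_ids, attention_mask, labels, ...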