Skip to content

Commit 3d855d5

Browse files
committed
recommit missing gemini review fix
1 parent 1a5e5d8 commit 3d855d5

File tree

2 files changed

+17
-16
lines changed

2 files changed

+17
-16
lines changed

docs/source_en/Components/Dataset/Dataset.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=r
5858
If using a local path or a local file, please follow these instructions:
5959

6060
1. If you are using a local dataset file, pass a single file path (preferably an absolute path, to avoid relative-path errors); a list is not supported.
61-
2. If you are using a local dir, please make sure files in the path share the same data structure, and the file extensions.
61+
2. If you are using a local dir, please make sure all files in the path have the same data structure and file extension.
6262
3. We use `datasets` library to do data loading, check the support extensions [here](https://huggingface.co/docs/hub/datasets-libraries).
6363

6464
2. Setting template

src/twinkle/dataset/base.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -120,25 +120,26 @@ def _load_dataset(dataset_meta: DatasetMeta, **kwargs):
120120
if os.path.exists(dataset_id):
121121
streaming = kwargs.get('streaming', False)
122122
num_proc = kwargs.get('num_proc', 1)
123+
kwargs['split'] = 'train'
123124
if streaming:
124-
kwargs = {'split': 'train', 'streaming': True}
125+
kwargs['streaming'] = True
125126
else:
126-
kwargs = {'split': 'train', 'num_proc': num_proc}
127+
kwargs['num_proc'] = num_proc
128+
load_kwargs = {}
127129
if os.path.isdir(dataset_id):
128-
folder_path = dataset_id
129-
files = os.listdir(folder_path)
130-
first_file = files[0] if files else None
131-
ext = os.path.splitext(first_file)[1].lstrip('.')
132-
file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
133-
if file_type == 'csv':
134-
kwargs['na_filter'] = False
135-
dataset = load_dataset(file_type, data_dir=dataset_id, **kwargs)
130+
files = os.listdir(dataset_id)
131+
if not files:
132+
raise ValueError(f'Cannot load dataset from empty directory: {dataset_id}')
133+
filename_for_ext = files[0]
134+
load_kwargs['data_dir'] = dataset_id
136135
else:
137-
ext = os.path.splitext(dataset_id)[1].lstrip('.')
138-
file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
139-
if file_type == 'csv':
140-
kwargs['na_filter'] = False
141-
dataset = load_dataset(file_type, data_files=dataset_id, **kwargs)
136+
filename_for_ext = dataset_id
137+
load_kwargs['data_files'] = dataset_id
138+
ext = os.path.splitext(filename_for_ext)[1].lstrip('.')
139+
file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
140+
if file_type == 'csv':
141+
kwargs['na_filter'] = False
142+
dataset = load_dataset(file_type, **load_kwargs, **kwargs)
142143
else:
143144
dataset = HubOperation.load_dataset(dataset_id, subset_name, split, **kwargs)
144145

0 commit comments

Comments
 (0)