Skip to content

Commit 3d855d5

Browse files
committed
recommit missing gemini review fix
1 parent 1a5e5d8 commit 3d855d5

File tree

2 files changed

+17
-16
lines changed

2 files changed

+17
-16
lines changed

docs/source_en/Components/Dataset/Dataset.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=r
5858
If using a local path or a local file, please follow these instructions:
5959

6060
1. If you are using a local dataset file, pass a single file path (preferably an absolute path, to avoid relative-path errors); a list is not supported.
61-
2. If you are using a local dir, please make sure files in the path share the same data structure, and the file extensions.
61+
2. If you are using a local dir, please make sure all files in the path have the same data structure and file extension.
6262
3. We use `datasets` library to do data loading, check the support extensions [here](https://huggingface.co/docs/hub/datasets-libraries).
6363

6464
2. Setting template

src/twinkle/dataset/base.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -120,25 +120,26 @@ def _load_dataset(dataset_meta: DatasetMeta, **kwargs):
120120
if os.path.exists(dataset_id):
121121
streaming = kwargs.get('streaming', False)
122122
num_proc = kwargs.get('num_proc', 1)
123+
kwargs['split'] = 'train'
123124
if streaming:
124-
kwargs = {'split': 'train', 'streaming': True}
125+
kwargs['streaming'] = True
125126
else:
126-
kwargs = {'split': 'train', 'num_proc': num_proc}
127+
kwargs['num_proc'] = num_proc
128+
load_kwargs = {}
127129
if os.path.isdir(dataset_id):
128-
folder_path = dataset_id
129-
files = os.listdir(folder_path)
130-
first_file = files[0] if files else None
131-
ext = os.path.splitext(first_file)[1].lstrip('.')
132-
file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
133-
if file_type == 'csv':
134-
kwargs['na_filter'] = False
135-
dataset = load_dataset(file_type, data_dir=dataset_id, **kwargs)
130+
files = os.listdir(dataset_id)
131+
if not files:
132+
raise ValueError(f'Cannot load dataset from empty directory: {dataset_id}')
133+
filename_for_ext = files[0]
134+
load_kwargs['data_dir'] = dataset_id
136135
else:
137-
ext = os.path.splitext(dataset_id)[1].lstrip('.')
138-
file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
139-
if file_type == 'csv':
140-
kwargs['na_filter'] = False
141-
dataset = load_dataset(file_type, data_files=dataset_id, **kwargs)
136+
filename_for_ext = dataset_id
137+
load_kwargs['data_files'] = dataset_id
138+
ext = os.path.splitext(filename_for_ext)[1].lstrip('.')
139+
file_type = {'jsonl': 'json', 'txt': 'text'}.get(ext) or ext
140+
if file_type == 'csv':
141+
kwargs['na_filter'] = False
142+
dataset = load_dataset(file_type, **load_kwargs, **kwargs)
142143
else:
143144
dataset = HubOperation.load_dataset(dataset_id, subset_name, split, **kwargs)
144145

0 commit comments

Comments
 (0)