From c83d686eae3bdc60ff3133f3c8bbd12511e9741f Mon Sep 17 00:00:00 2001 From: drakezhang Date: Thu, 11 Dec 2025 17:59:30 +0800 Subject: [PATCH] fix: Improve robustness and correctness in data preprocessing This commit addresses three main issues in tools/preprocess_data.py: 1. Fail Early: Moved the assertion check 'args.workers % args.partitions == 0' to the beginning of the partition logic. This prevents the script from performing expensive file I/O operations only to fail later due to invalid configuration. 2. Multi-key Fix: Fixed a bug in 'Partition.process_json_file' where only the last key in '--json-keys' was being finalized. Now, all builders are correctly finalized. 3. Robustness: Improved 'Encoder.encode' to safely handle cases where a specified JSON key is missing. Instead of raising a KeyError, it now skips the missing key. --- tools/preprocess_data.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a81fe8ca7e7..d1306e5a729 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -88,7 +88,9 @@ def encode(self, json_line): ids = {} lens = {} for key in self.args.json_keys: - text = data[key] + text = data.get(key) + if text is None: + continue if isinstance(text, list): sentences = text else: @@ -183,7 +185,8 @@ def process_json_file(self, file_name): self.print_processing_stats(i, proc_start, total_bytes_processed) fin.close() - builders[key].finalize(output_idx_files[key]) + for key in self.args.json_keys: + builders[key].finalize(output_idx_files[key]) def get_args(): @@ -287,6 +290,7 @@ def main(): 'output_prefix': args.output_prefix} in_ss_out_names.append(file_names) else: + assert args.workers % args.partitions == 0 in_file_names = glob.glob(args.input) # Count total number of lines across .jsonl files @@ -340,7 +344,6 @@ def main(): for idx in range(args.partitions): partitioned_input_files[idx].close() - assert args.workers % args.partitions == 0 partition = Partition(args, args.workers//args.partitions) # check to see if paritions with split sentences already created