Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions tools/preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,9 @@ def encode(self, json_line):
ids = {}
lens = {}
for key in self.args.json_keys:
text = data[key]
text = data.get(key)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `data.get(key)` + `continue` approach silently skips a key for any record where that key is absent or explicitly null; if a name in `--json-keys` is misspelled, that key is skipped for every record. This can cause silent data loss with no indication to the user that anything went wrong.

Consider at minimum logging a warning so the user knows records are being dropped:

Suggested change
text = data.get(key)
text = data.get(key)
if text is None:
import logging
logging.warning(f"Key '{key}' missing or null in record; skipping.")
continue

Or, if silent skipping is intentional, a counter that is reported at the end (similar to the existing self.print_processing_stats) would make the behavior observable.

if text is None:
continue
if isinstance(text, list):
sentences = text
else:
Expand Down Expand Up @@ -184,7 +186,8 @@ def process_json_file(self, file_name):
self.print_processing_stats(i, proc_start, total_bytes_processed)

fin.close()
builders[key].finalize(output_idx_files[key])
for key in self.args.json_keys:
builders[key].finalize(output_idx_files[key])


def get_args():
Expand Down Expand Up @@ -273,6 +276,7 @@ def main():
'output_prefix': args.output_prefix}
in_ss_out_names.append(file_names)
else:
assert args.workers % args.partitions == 0
in_file_names = glob.glob(args.input)

# Count total number of lines across .jsonl files
Expand Down Expand Up @@ -326,7 +330,6 @@ def main():
for idx in range(args.partitions):
partitioned_input_files[idx].close()

assert args.workers % args.partitions == 0
partition = Partition(args, args.workers//args.partitions)

# check to see if partitions with split sentences were already created
Expand Down