diff --git a/src/datatrove/pipeline/inference/checkpointing.py b/src/datatrove/pipeline/inference/checkpointing.py index 2160bdee..60ce9286 100644 --- a/src/datatrove/pipeline/inference/checkpointing.py +++ b/src/datatrove/pipeline/inference/checkpointing.py @@ -292,6 +292,7 @@ async def parse_existing_checkpoints(self, rank: int, output_writer_context: Dis # not strictly needed but just to be safe for the future async with self.file_locks[chunk_index]: for document in reader.read_file(filename): + document.metadata.pop("file_path", None) # Remove any injected file_path if "__no_rollouts_remove" not in document.metadata: output_writer_context.write(document, rank=rank, chunk_index=chunk_index) all_ids.add(document.id)