42 changes: 42 additions & 0 deletions docs/user-guide/data-preparation.md
Expand Up @@ -46,6 +46,48 @@ python tools/preprocess_data.py \
| `--workers` | Number of parallel workers for processing |
| `--append-eod` | Add end-of-document token |

## Finding Optimal Number of Workers

Use the `--find-optimal-num-workers` flag to find the number of workers that gives the best performance in terms of preprocessed documents per second.
The script launches several short data preprocessing runs, each with a different number of workers, and reports the fastest configuration based on the collected performance data.

```bash
python tools/preprocess_data.py \
--input data.jsonl \
--output-prefix processed_data \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model /path/to/tokenizer.model \
--workers 8 \
--find-optimal-num-workers \
--workers-to-check 4 8 16 32 \
--performance-dir /path/to/save/perf/results \
--max-documents 50000
```

**Required arguments**

| Argument | Description |
|----------|-------------|
| `--find-optimal-num-workers` | Enables the search for the optimal number of workers |
| `--workers-to-check` | List of worker counts to benchmark |
| `--performance-dir` | Directory in which to save the performance results |
| `--max-documents` | Number of documents to preprocess in each benchmark run |

**Output example**

```text
-----------------------------------
Performance results (fastest → slowest):
1. 16 workers → avg. docs/s: 9606.6476
2. 32 workers → avg. docs/s: 9275.3284
3. 8 workers → avg. docs/s: 9151.9280
4. 4 workers → avg. docs/s: 6391.3819

-----------------------------------
The most optimal num of workers is 16 with avg. preprocessed docs/s: 9606.6476.
-----------------------------------
```
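The benchmarking loop behind this flag can be sketched roughly as follows. This is a hypothetical illustration, not the actual `preprocess_data.py` implementation: the names `tokenize_doc` and `benchmark_workers` are invented for the example, and a thread pool stands in for the tool's real worker processes.

```python
import time
from multiprocessing.pool import ThreadPool


def tokenize_doc(doc):
    # Stand-in for the per-document tokenization work.
    return len(doc.split())


def benchmark_workers(docs, worker_counts):
    """Run a short preprocessing pass for each worker count and
    return (workers, docs/s) pairs sorted fastest -> slowest."""
    throughput = {}
    for n in worker_counts:
        start = time.perf_counter()
        with ThreadPool(n) as pool:
            pool.map(tokenize_doc, docs)
        elapsed = time.perf_counter() - start
        throughput[n] = len(docs) / elapsed
    return sorted(throughput.items(), key=lambda kv: kv[1], reverse=True)


if __name__ == "__main__":
    docs = ["an example document"] * 10_000
    ranking = benchmark_workers(docs, [4, 8, 16, 32])
    for rank, (n, dps) in enumerate(ranking, start=1):
        print(f"{rank}. {n} workers -> avg. docs/s: {dps:.4f}")
```

Each candidate worker count processes the same capped document sample (here simulated), so the measured docs/s figures are directly comparable and the fastest count can be reported first.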

## Output Files

The preprocessing tool generates two files:
Expand Down
38 changes: 38 additions & 0 deletions tests/unit_tests/data/test_preprocess_data.py
Expand Up @@ -2,6 +2,7 @@

import json
import os
import runpy
import sys
import tempfile

Expand Down Expand Up @@ -201,6 +202,42 @@ def test_preprocess_data_gpt():
do_test_preprocess_data(temp_dir, extra_args=gpt_args)


def test_preprocess_data_gpt_optimal_workers():
with tempfile.TemporaryDirectory() as temp_dir:

# gpt specific args
gpt_args = [
"--input",
"/opt/data/datasets/dclm/dclm.jsonl",
"--output-prefix",
f"{temp_dir}/optimal_workers",
"--tokenizer-type",
"GPT2BPETokenizer",
"--vocab-file",
"/opt/data/tokenizers/megatron/gpt2-vocab.json",
"--merge-file",
"/opt/data/tokenizers/megatron/gpt2-merges.txt",
"--append-eod",
"--workers",
"2",
"--log-interval",
"1",
"--find-optimal-num-workers",
"--workers-to-check",
"2",
"4",
"8",
"--performance-dir",
f"{temp_dir}/perf",
"--max-documents",
"1002",
]
sys.argv = ["/opt/megatron-lm/tools/preprocess_data.py"] + gpt_args
runpy.run_path("/opt/megatron-lm/tools/preprocess_data.py", run_name="__main__")

assert os.path.exists(f"{temp_dir}/perf")


def bert_vocab(odir):
if os.path.exists(__LOCAL_BERT_VOCAB):
return __LOCAL_BERT_VOCAB
Expand Down Expand Up @@ -237,3 +274,4 @@ def test_preprocess_data_bert():
if __name__ == "__main__":
test_preprocess_data_gpt()
test_preprocess_data_bert()
test_preprocess_data_gpt_optimal_workers()