From ea42e5fcf5680bd1109f8b35e1767cf8cdc9f8d5 Mon Sep 17 00:00:00 2001 From: John Kamalu Date: Fri, 7 Nov 2025 07:53:41 -0800 Subject: [PATCH 1/2] Attempt to fix, standardize, and stabilize the data unit tests --- tests/unit_tests/data/test_bin_reader.py | 59 +++++++++------- tests/unit_tests/data/test_gpt_dataset.py | 68 +++++++++--------- tests/unit_tests/data/test_preprocess_data.py | 69 ++++++++++--------- .../unit_tests/data/test_preprocess_mmdata.py | 40 ++++------- tools/merge_datasets.py | 12 ++-- tools/preprocess_data.py | 14 ++-- tools/preprocess_mmdata.py | 11 +-- 7 files changed, 142 insertions(+), 131 deletions(-) diff --git a/tests/unit_tests/data/test_bin_reader.py b/tests/unit_tests/data/test_bin_reader.py index e479676ac4b..19a34029dbf 100644 --- a/tests/unit_tests/data/test_bin_reader.py +++ b/tests/unit_tests/data/test_bin_reader.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import os import random import sys @@ -6,16 +8,18 @@ from types import ModuleType, SimpleNamespace from typing import Any, Dict -import nltk -import pytest - try: import boto3 import botocore.exceptions as exceptions except ModuleNotFoundError: + # Create mock msc module boto3 = ModuleType("boto3") - sys.modules[boto3.__name__] = boto3 + + # Create mock types submodule exceptions = ModuleType("botocore.exceptions") + + # Register the mock module in sys.modules + sys.modules[boto3.__name__] = boto3 sys.modules[exceptions.__name__] = exceptions try: @@ -43,6 +47,8 @@ def __init__(self, offset: int, size: int): sys.modules[msc.__name__] = msc sys.modules[types_module.__name__] = types_module +import torch + from megatron.core.datasets.indexed_dataset import ( IndexedDataset, ObjectStorageConfig, @@ -58,9 +64,11 @@ def __init__(self, offset: int, size: int): gpt2_merge, gpt2_vocab, ) +from tests.unit_tests.test_utilities import Utils + ## -# Overload client from boto3 +# Mock boto3 ## @@ -72,7 +80,8 @@ def __init__(self, *args: Any) -> None: def download_file(self, Bucket: str, Key: str, Filename: str) -> None: os.makedirs(os.path.dirname(Filename), exist_ok=True) - os.system(f"cp {os.path.join('/', Bucket, Key)} {Filename}") + remote_path = os.path.join("/", Bucket, Key) + os.system(f"cp {remote_path} {Filename}") assert os.path.exists(Filename) def upload_file(self, Filename: str, Bucket: str, Key: str) -> None: @@ -104,12 +113,12 @@ def close(self) -> None: ## -# Overload ClientError from botocore.exceptions +# Mock botocore.exceptions ## class _LocalClientError(Exception): - """ "Local test client error""" + """Local test client error""" pass @@ -117,14 +126,15 @@ class _LocalClientError(Exception): setattr(exceptions, "ClientError", _LocalClientError) ## -# Mock multistorageclient module +# Mock msc.open, msc.download_file, msc.resolve_storage_client ## def _msc_download_file(remote_path, local_path): - remote_path = remote_path.removeprefix(MSC_PREFIX + "default") + remote_path = os.path.join("/", remote_path.removeprefix(MSC_PREFIX)) os.makedirs(os.path.dirname(local_path), exist_ok=True) os.system(f"cp {remote_path} {local_path}") + assert os.path.exists(local_path) def _msc_resolve_storage_client(path): @@ -134,7 +144,7 @@ def read(self, path, byte_range): f.seek(byte_range.offset) return f.read(byte_range.size) - return StorageClient(), path.removeprefix(MSC_PREFIX + "default") + return StorageClient(), os.path.join("/", path.removeprefix(MSC_PREFIX)) setattr(msc, "open", open) @@ -142,20 +152,21 @@ def read(self, path, byte_range): 
setattr(msc, "resolve_storage_client", _msc_resolve_storage_client) -@pytest.mark.flaky -@pytest.mark.flaky_in_dev def test_bin_reader(): - with tempfile.TemporaryDirectory() as temp_dir: - # set the default nltk data path - os.environ["NLTK_DATA"] = os.path.join(temp_dir, "nltk_data") - nltk.data.path.append(os.environ["NLTK_DATA"]) + if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() != 0: + return + with tempfile.TemporaryDirectory() as temp_dir: path_to_raws = os.path.join(temp_dir, "sample_raws") path_to_data = os.path.join(temp_dir, "sample_data") - path_to_object_storage_cache = os.path.join(temp_dir, "object_storage_cache") + path_to_object_storage_cache_msc = os.path.join(temp_dir, "object_storage_cache_msc") + path_to_object_storage_cache_s3 = os.path.join(temp_dir, "object_storage_cache_s3") os.mkdir(path_to_raws) os.mkdir(path_to_data) - os.mkdir(path_to_object_storage_cache) + os.mkdir(path_to_object_storage_cache_msc) + os.mkdir(path_to_object_storage_cache_s3) # create the dummy resources dummy_jsonl(path_to_raws) @@ -195,11 +206,11 @@ def test_bin_reader(): assert isinstance(indexed_dataset_mmap.bin_reader, _MMapBinReader) indexed_dataset_msc = IndexedDataset( - MSC_PREFIX + "default" + prefix, # use the default profile to access the filesystem + MSC_PREFIX + prefix.lstrip("/"), multimodal=False, mmap=False, object_storage_config=ObjectStorageConfig( - path_to_idx_cache=path_to_object_storage_cache + path_to_idx_cache=path_to_object_storage_cache_msc ), ) assert isinstance(indexed_dataset_msc.bin_reader, _MultiStorageClientBinReader) @@ -207,15 +218,14 @@ def test_bin_reader(): assert len(indexed_dataset_msc) == len(indexed_dataset_mmap) indexed_dataset_s3 = IndexedDataset( - S3_PREFIX + prefix, + S3_PREFIX + prefix.lstrip("/"), multimodal=False, mmap=False, object_storage_config=ObjectStorageConfig( - path_to_idx_cache=path_to_object_storage_cache + path_to_idx_cache=path_to_object_storage_cache_s3 ), ) assert isinstance(indexed_dataset_s3.bin_reader, _S3BinReader) - assert len(indexed_dataset_s3) == len(indexed_dataset_file) assert len(indexed_dataset_s3) == len(indexed_dataset_mmap) @@ -226,6 +236,7 @@ def test_bin_reader(): for idx in indices: assert (indexed_dataset_s3[idx] == indexed_dataset_file[idx]).all() assert (indexed_dataset_s3[idx] == indexed_dataset_mmap[idx]).all() + assert (indexed_dataset_s3[idx] == indexed_dataset_msc[idx]).all() if __name__ == "__main__": diff --git a/tests/unit_tests/data/test_gpt_dataset.py b/tests/unit_tests/data/test_gpt_dataset.py index fdfa8645792..b44f7f6662f 100644 --- a/tests/unit_tests/data/test_gpt_dataset.py +++ b/tests/unit_tests/data/test_gpt_dataset.py @@ -34,8 +34,10 @@ def test_mock_gpt_dataset(): if torch.distributed.get_rank() == 0: compile_helpers() torch.distributed.barrier() + build_on_rank = lambda: torch.distributed.get_rank() == 0 else: compile_helpers() + build_on_rank = lambda: True tokenizer = _NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE) @@ -51,26 +53,27 @@ def test_mock_gpt_dataset(): ) datasets = BlendedMegatronDatasetBuilder( - MockGPTDataset, [100, 100, 100], lambda: True, config + MockGPTDataset, [100, 100, 100], build_on_rank, config ).build() - N = 10 + if build_on_rank(): + N = 10 - # Check iso-index variance by split - subsets = [sample_N(dataset, N, randomize=False) for dataset in datasets] - assert not numpy.allclose(subsets[0], subsets[1]) - assert not numpy.allclose(subsets[0], subsets[2]) - assert not numpy.allclose(subsets[1], subsets[2]) + # 
Check iso-index variance by split + subsets = [sample_N(dataset, N, randomize=False) for dataset in datasets] + assert not numpy.allclose(subsets[0], subsets[1]) + assert not numpy.allclose(subsets[0], subsets[2]) + assert not numpy.allclose(subsets[1], subsets[2]) - # Check iso-split / iso-index identity - subset_1A = sample_N(datasets[0], N, randomize=False) - subset_1B = sample_N(datasets[0], N, randomize=False) - assert numpy.allclose(subset_1A, subset_1B) + # Check iso-split / iso-index identity + subset_1A = sample_N(datasets[0], N, randomize=False) + subset_1B = sample_N(datasets[0], N, randomize=False) + assert numpy.allclose(subset_1A, subset_1B) - # Check iso-split variance by index - subset_1A = sample_N(datasets[0], N, randomize=True) - subset_1B = sample_N(datasets[0], N, randomize=True) - assert not numpy.allclose(subset_1A, subset_1B) + # Check iso-split variance by index + subset_1A = sample_N(datasets[0], N, randomize=True) + subset_1B = sample_N(datasets[0], N, randomize=True) + assert not numpy.allclose(subset_1A, subset_1B) config = GPTDatasetConfig( random_seed=1234, @@ -86,29 +89,30 @@ def test_mock_gpt_dataset(): ) datasets = BlendedMegatronDatasetBuilder( - MockGPTDataset, [0, None, 0], lambda: True, config + MockGPTDataset, [0, None, 0], build_on_rank, config ).build() - sample = datasets[1][datasets[1].shuffle_index.argmax()] - argmax = sample['labels'].shape[0] - torch.flip(sample['labels'], [0]).argmax() - 1 + if build_on_rank(): + sample = datasets[1][datasets[1].shuffle_index.argmax()] + argmax = sample['labels'].shape[0] - torch.flip(sample['labels'], [0]).argmax() - 1 - # Test add_extra_token_to_sequence - assert sample['tokens'][argmax] != tokenizer.eod - assert sample['labels'][argmax] == tokenizer.eod + # Test add_extra_token_to_sequence + assert sample['tokens'][argmax] != tokenizer.eod + assert sample['labels'][argmax] == tokenizer.eod - # Test eod_mask_loss, drop_last_partial_validation_sequence - assert argmax < sample['labels'].shape[0] - 1 - assert torch.all(sample['labels'][argmax + 1 :] == 0) - assert not torch.any( - sample['loss_mask'][ - torch.logical_and(sample['labels'] == tokenizer.eod, sample['labels'] == 0) - ] - ) + # Test eod_mask_loss, drop_last_partial_validation_sequence + assert argmax < sample['labels'].shape[0] - 1 + assert torch.all(sample['labels'][argmax + 1 :] == 0) + assert not torch.any( + sample['loss_mask'][ + torch.logical_and(sample['labels'] == tokenizer.eod, sample['labels'] == 0) + ] + ) - sample = datasets[1][None] + sample = datasets[1][None] - # Check handling of None index - assert not torch.any(sample['loss_mask']) + # Check handling of None index + assert not torch.any(sample['loss_mask']) if __name__ == "__main__": diff --git a/tests/unit_tests/data/test_preprocess_data.py b/tests/unit_tests/data/test_preprocess_data.py index 48f3a2e7bb9..92a787050bc 100644 --- a/tests/unit_tests/data/test_preprocess_data.py +++ b/tests/unit_tests/data/test_preprocess_data.py @@ -2,18 +2,21 @@ import json import os +import random +import string import sys import tempfile -import nltk -import pytest import requests +import torch + from megatron.core.datasets.indexed_dataset import IndexedDataset from megatron.training.tokenizer.gpt2_tokenization import ( PRETRAINED_MERGES_ARCHIVE_MAP, PRETRAINED_VOCAB_ARCHIVE_MAP, ) +from tests.unit_tests.test_utilities import Utils from tools.merge_datasets import main as merge_main from tools.preprocess_data import Encoder from tools.preprocess_data import get_args as build_args @@ -23,11 
+26,11 @@ "https://huggingface.co/bert-base-uncased/raw/main/vocab.txt" ) -__LOCAL_BERT_VOCAB = "/home/gitlab-runner/data/bert_data/vocab.txt" +__LOCAL_BERT_VOCAB = "/opt/data/tokenizers/megatron/bert-vocab.txt" -__LOCAL_GPT2_MERGE = "/home/gitlab-runner/data/gpt3_data/gpt2-merges.txt" +__LOCAL_GPT2_MERGE = "/opt/data/tokenizers/megatron/gpt2-merges.txt" -__LOCAL_GPT2_VOCAB = "/home/gitlab-runner/data/gpt3_data/gpt2-vocab.json" +__LOCAL_GPT2_VOCAB = "/opt/data/tokenizers/megatron/gpt2-vocab.json" def dummy_jsonl(odir): @@ -42,37 +45,33 @@ def dummy_jsonl(odir): ] with open(os.path.join(odir, "numbers_ascending.jsonl"), "w") as writer: writer.writelines(list_numbers_ascending) - # test - list_test = [] - with open(__file__) as reader: - for line in reader: - list_test.append(json.dumps({"text": line}) + "\n") - with open(os.path.join(odir, "test.jsonl"), "w") as writer: - writer.writelines(list_test) + # string + choices = string.ascii_letters + string.digits + string.punctuation + string.whitespace + list_string = [ + json.dumps({"text": "".join(random.choices(choices, k=random.randint(3, 100)))}) + "\n" + for i in range(100) + ] + with open(os.path.join(odir, "string.jsonl"), "w") as writer: + writer.writelines(list_string) def build_datasets(idir, odir, extra_args=[]): for name in os.listdir(idir): - sys.argv = [ - sys.argv[0], + args_list = [ "--input", os.path.join(idir, name), "--output-prefix", os.path.join(odir, os.path.splitext(name)[0]), ] + extra_args - build_main() + build_main(args_list) def merge_datasets(idir): - sys.argv = [sys.argv[0], "--input", idir, "--output-prefix", os.path.join(idir, "merge")] - merge_main() + args_list = ["--input", idir, "--output-prefix", os.path.join(idir, "merge")] + merge_main(args_list) def do_test_preprocess_data(temp_dir, extra_args=[]): - # set the default nltk data path - os.environ["NLTK_DATA"] = os.path.join(temp_dir, "nltk_data") - nltk.data.path.append(os.environ["NLTK_DATA"]) - path_to_raws = os.path.join(temp_dir, "sample_raws") path_to_data = os.path.join(temp_dir, "sample_data") os.mkdir(path_to_raws) @@ -87,8 +86,8 @@ def do_test_preprocess_data(temp_dir, extra_args=[]): # merge the datasets merge_datasets(path_to_data) - sys.argv = [sys.argv[0], "--input", None, "--output-prefix", None] + extra_args - encoder = Encoder(build_args()) + args_list = ["--input", None, "--output-prefix", None] + extra_args + encoder = Encoder(build_args(args_list)) encoder.initializer() def tokens_to_string(toks): @@ -168,7 +167,7 @@ def tokens_to_string(toks): def gpt2_vocab(odir): if os.path.exists(__LOCAL_GPT2_VOCAB): return __LOCAL_GPT2_VOCAB - path = os.path.join(odir, "vocab.json") + path = os.path.join(odir, "gpt2-vocab.json") with open(path, "wb") as writer: writer.write(requests.get(PRETRAINED_VOCAB_ARCHIVE_MAP['gpt2']).content) return path @@ -177,13 +176,18 @@ def gpt2_vocab(odir): def gpt2_merge(odir): if os.path.exists(__LOCAL_GPT2_MERGE): return __LOCAL_GPT2_MERGE - path = os.path.join(odir, "merge.txt") + path = os.path.join(odir, "gpt2-merges.txt") with open(path, "wb") as writer: writer.write(requests.get(PRETRAINED_MERGES_ARCHIVE_MAP['gpt2']).content) return path def test_preprocess_data_gpt(): + if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() != 0: + return + with tempfile.TemporaryDirectory() as temp_dir: # gpt specific args @@ -191,9 +195,9 @@ def test_preprocess_data_gpt(): "--tokenizer-type", "GPT2BPETokenizer", "--vocab-file", - 
"/opt/data/tokenizers/megatron/gpt2-vocab.json", + gpt2_vocab(temp_dir), "--merge-file", - "/opt/data/tokenizers/megatron/gpt2-merges.txt", + gpt2_merge(temp_dir), "--append-eod", "--workers", "10", @@ -207,15 +211,18 @@ def test_preprocess_data_gpt(): def bert_vocab(odir): if os.path.exists(__LOCAL_BERT_VOCAB): return __LOCAL_BERT_VOCAB - path = os.path.join(odir, "vocab.txt") + path = os.path.join(odir, "bert-vocab.txt") with open(path, "wb") as writer: writer.write(requests.get(__HUGGINGFACE_BERT_BASE_UNCASED_VOCAB).content) return path -@pytest.mark.flaky -@pytest.mark.flaky_in_dev def test_preprocess_data_bert(): + if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() != 0: + return + with tempfile.TemporaryDirectory() as temp_dir: # bert specific args @@ -223,7 +230,7 @@ def test_preprocess_data_bert(): "--tokenizer-type", "BertWordPieceLowerCase", "--vocab-file", - "/opt/data/tokenizers/megatron/gpt2-vocab.json", + bert_vocab(temp_dir), "--split-sentences", "--workers", "10", diff --git a/tests/unit_tests/data/test_preprocess_mmdata.py b/tests/unit_tests/data/test_preprocess_mmdata.py index d6ad4eddc74..fd4fb54bb57 100644 --- a/tests/unit_tests/data/test_preprocess_mmdata.py +++ b/tests/unit_tests/data/test_preprocess_mmdata.py @@ -5,11 +5,12 @@ import sys import tempfile -import nltk import numpy +import torch from megatron.core.datasets.indexed_dataset import IndexedDataset from tests.unit_tests.data.test_preprocess_data import dummy_jsonl, gpt2_merge, gpt2_vocab +from tests.unit_tests.test_utilities import Utils from tools.merge_datasets import main as merge_main from tools.preprocess_mmdata import Encoder from tools.preprocess_mmdata import get_args as build_args @@ -31,8 +32,7 @@ def dummy_img(odir_txt, odir_img): def build_datasets(idir_txt, idir_img, odir, extra_args=[]): for name in os.listdir(idir_txt): - sys.argv = [ - sys.argv[0], + args_list = [ "--input", os.path.join(idir_txt, name), "--input-image", @@ -40,26 +40,15 @@ def build_datasets(idir_txt, idir_img, odir, extra_args=[]): "--output-prefix", os.path.join(odir, os.path.splitext(name)[0]), ] + extra_args - build_main() + build_main(args_list) def merge_datasets(idir): - sys.argv = [ - sys.argv[0], - "--input", - idir, - "--output-prefix", - os.path.join(idir, "merge"), - "--multimodal", - ] - merge_main() + args_list = ["--input", idir, "--output-prefix", os.path.join(idir, "merge"), "--multimodal"] + merge_main(args_list) def do_test_preprocess_mmdata(temp_dir, extra_args=[]): - # set the default nltk data path - os.environ["NLTK_DATA"] = os.path.join(temp_dir, "nltk_data") - nltk.data.path.append(os.environ["NLTK_DATA"]) - path_to_raws_txt = os.path.join(temp_dir, "sample_raws_txt") path_to_raws_img = os.path.join(temp_dir, "sample_raws_img") path_to_data = os.path.join(temp_dir, "sample_data") @@ -79,16 +68,8 @@ def do_test_preprocess_mmdata(temp_dir, extra_args=[]): # merge the datasets merge_datasets(path_to_data) - sys.argv = [ - sys.argv[0], - "--input", - None, - "--input-image", - None, - "--output-prefix", - None, - ] + extra_args - encoder = Encoder(build_args()) + args_list = ["--input", None, "--input-image", None, "--output-prefix", None] + extra_args + encoder = Encoder(build_args(args_list)) encoder.initializer() def tokens_to_string(toks): @@ -193,6 +174,11 @@ def tokens_to_string(toks): def test_preprocess_mmdata(): + if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() != 0: + return + with 
tempfile.TemporaryDirectory() as temp_dir: # gpt specific args diff --git a/tools/merge_datasets.py b/tools/merge_datasets.py index c615558a94d..3037eea9103 100644 --- a/tools/merge_datasets.py +++ b/tools/merge_datasets.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import os import sys import json @@ -15,7 +17,7 @@ ) -def get_args(): +def get_args(args_list=None): parser = argparse.ArgumentParser() group = parser.add_argument_group(title="input data") @@ -41,7 +43,7 @@ def get_args(): help="Whether the datasets are assumed to be multimodal" ) - args = parser.parse_args() + args = parser.parse_args(args_list) assert os.path.isdir( args.input @@ -54,8 +56,8 @@ def get_args(): return args -def main(): - args = get_args() +def main(args_list=None): + args = get_args(args_list) prefixes = set() for basename in os.listdir(args.input): @@ -89,5 +91,5 @@ def main(): if __name__ == '__main__': - main() + diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 7d382a0d134..7bac4fba038 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -14,6 +14,7 @@ import torch import numpy as np import multiprocessing + try: import nltk from nltk.tokenize.punkt import PunktLanguageVars @@ -42,6 +43,7 @@ class CustomLanguageVars(PunktLanguageVars): (?P\S+) # <-- Normally you would have \s+ here ))""" + class IdentitySplitter(object): def tokenize(self, *text): return text @@ -195,7 +197,7 @@ def process_json_file(self, file_name): builders[key].finalize(output_idx_files[key]) -def get_args(): +def get_args(args_list=None): parser = argparse.ArgumentParser() parser = _add_tokenizer_args(parser) group = parser.add_argument_group(title='input data') @@ -227,9 +229,8 @@ def get_args(): group.add_argument('--keep-sequential-samples', action='store_true', help='Ensure ordering of samples in .jsonl files is ' 'preserved when using partitions>1.') - # group.add_argument('--legacy-tokenizer', action='store_true', - # help='Use legacy tokenizer system.') - args = parser.parse_args() + + args = parser.parse_args(args_list) args.keep_empty = False if args.tokenizer_type.lower().startswith('bert') and not args.split_sentences: @@ -263,8 +264,8 @@ def check_files_exist(in_ss_out_names, key, num_partitions): return True -def main(): - args = get_args() +def main(args_list=None): + args = get_args(args_list) if args.split_sentences: if nltk_available: @@ -405,6 +406,5 @@ def main(): if __name__ == '__main__': - main() diff --git a/tools/preprocess_mmdata.py b/tools/preprocess_mmdata.py index 8ab2c2b8673..fb3ea673881 100755 --- a/tools/preprocess_mmdata.py +++ b/tools/preprocess_mmdata.py @@ -1,5 +1,5 @@ -# coding=utf-8 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+# coding=utf-8 """Processing text modality data for MultiModal pretraining.""" @@ -70,7 +70,7 @@ def encode(self, input_pair): return sentence_ids, img_raw, len(json_line) -def get_args(): +def get_args(args_list=None): parser = argparse.ArgumentParser() group = parser.add_argument_group(title='input data') group.add_argument('--input', type=str, required=True, @@ -110,7 +110,8 @@ def get_args(): help='Number of worker processes to launch') group.add_argument('--log-interval', type=int, default=100, help='Interval between progress updates') - args = parser.parse_args() + + args = parser.parse_args(args_list) args.keep_empty = False # some default/dummy values for the tokenizer @@ -121,8 +122,8 @@ def get_args(): return args -def main(): - args = get_args() +def main(args_list=None): + args = get_args(args_list) startup_start = time.time() encoder = Encoder(args) From d1f3558a68fa302f1f44623738782cc3adb8f736 Mon Sep 17 00:00:00 2001 From: John Kamalu Date: Wed, 3 Dec 2025 12:54:28 -0800 Subject: [PATCH 2/2] Resolve linting errors via isort --- tests/unit_tests/data/test_bin_reader.py | 1 - tests/unit_tests/data/test_preprocess_data.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/unit_tests/data/test_bin_reader.py b/tests/unit_tests/data/test_bin_reader.py index 19a34029dbf..488e95a6373 100644 --- a/tests/unit_tests/data/test_bin_reader.py +++ b/tests/unit_tests/data/test_bin_reader.py @@ -66,7 +66,6 @@ def __init__(self, offset: int, size: int): ) from tests.unit_tests.test_utilities import Utils - ## # Mock boto3 ## diff --git a/tests/unit_tests/data/test_preprocess_data.py b/tests/unit_tests/data/test_preprocess_data.py index 92a787050bc..22f7a3e18d4 100644 --- a/tests/unit_tests/data/test_preprocess_data.py +++ b/tests/unit_tests/data/test_preprocess_data.py @@ -8,7 +8,6 @@ import tempfile import requests - import torch from megatron.core.datasets.indexed_dataset import IndexedDataset
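
Both commits lean on the same testability refactor: each tool's get_args() and main() now accept an optional args_list, which is handed straight to argparse's parse_args(). Tests can then call build_main(args_list) / merge_main(args_list) directly instead of mutating sys.argv, while command-line behavior is unchanged because parse_args(None) falls back to sys.argv[1:]. A minimal standalone sketch of that pattern follows; the tool, flag, and path names are hypothetical, not taken from the repository.

    import argparse


    def get_args(args_list=None):
        # parse_args(None) reads sys.argv[1:], so normal CLI invocation keeps
        # working; a test passes an explicit list and never touches sys.argv.
        parser = argparse.ArgumentParser(description="toy indexing tool (illustrative only)")
        parser.add_argument("--input", type=str, required=True)
        parser.add_argument("--output-prefix", type=str, required=True)
        return parser.parse_args(args_list)


    def main(args_list=None):
        args = get_args(args_list)
        print(f"indexing {args.input} -> {args.output_prefix}.bin/.idx")


    if __name__ == "__main__":
        main()

    # In a test, invoke the tool without touching global state:
    #   main(["--input", "/tmp/sample.jsonl", "--output-prefix", "/tmp/sample"])

Passing the argument list explicitly also avoids cross-test interference when several preprocessing tests run in the same process, which is one of the stability problems the first commit targets.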
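The boto3 / botocore.exceptions / multistorageclient handling at the top of test_bin_reader.py follows a standard stubbing recipe for optional dependencies: when the real package is absent, build a bare module object, register it in sys.modules so later imports resolve to the stub, and attach only the attributes the code under test touches. A condensed sketch of the idea is below; the _FakeS3Client name and its copy-from-local-disk behavior are illustrative and simpler than the test's actual mock.

    import shutil
    import sys
    from types import ModuleType

    try:
        import boto3
    except ModuleNotFoundError:
        boto3 = ModuleType("boto3")
        sys.modules[boto3.__name__] = boto3  # later "import boto3" now finds the stub

        class _FakeS3Client:
            def download_file(self, Bucket, Key, Filename):
                # Stand in for S3 by copying from a local directory named after the bucket.
                shutil.copyfile(f"/{Bucket}/{Key}", Filename)

        # Anything calling boto3.client("s3") receives the fake client instead.
        setattr(boto3, "client", lambda *args, **kwargs: _FakeS3Client())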