From 6855f9e728f08b1b22af25d1575ce9a7fe7b662b Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 11:10:36 +0800
Subject: [PATCH 1/9] add support for deepseek v2 lite on cuda gpus

---
 .../model_benchmarks/megatron_gpt3.py      | 176 +++++++++++++++++-
 .../model_benchmarks/test_megatron_gpt.py  |  50 +++++
 third_party/Megatron/Megatron-LM           |   2 +-
 3 files changed, 225 insertions(+), 3 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 37d27bf1a..ac4519693 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -4,6 +4,7 @@
 """Module of the megatron deepspeed GPT pretrain class."""

 import json
+import math
 import os
 import statistics
 import numpy as np
@@ -12,9 +13,10 @@
 from pathlib import Path
 import re

-from superbench.benchmarks import BenchmarkRegistry
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType
 from superbench.benchmarks.context import Platform, Precision
 from superbench.benchmarks.model_benchmarks.model_base import ModelBenchmark
+from superbench.benchmarks.result import BenchmarkResult
 from superbench.benchmarks.return_code import ReturnCode
 from superbench.common.utils import logger, run_command

@@ -231,9 +233,108 @@ def add_parser_arguments(self):
             help='Train mode to run. Current supported: "pretrain" and "finetune".',
         )

+    def _normalize_unknown_args(self, unknown):
+        """Normalize unknown args by converting underscores to hyphens in flag names.
+
+        Args:
+            unknown (list): List of unknown arguments.
+
+        Return:
+            list: Normalized list of arguments.
+        """
+        normalized = []
+        i = 0
+        while i < len(unknown):
+            arg = unknown[i]
+            # Check if it's a flag (starts with --)
+            if arg.startswith('--'):
+                # Convert underscores to hyphens in the flag name
+                normalized_flag = arg.replace('_', '-')
+                normalized.append(normalized_flag)
+            else:
+                # It's a value, keep as-is
+                normalized.append(arg)
+            i += 1
+        return normalized
+
+    def parse_args(self, ignore_invalid=False):
+        """Parse the arguments, accepting unknown args for forwarding.
+
+        Return:
+            ret (bool): whether parse succeed or not.
+            args (argparse.Namespace): parsed arguments.
+            unknown (list): unknown arguments.
+ """ + try: + args, unknown = self._parser.parse_known_args(self._argv) + except BaseException as e: + if ignore_invalid: + logger.info('Missing or invalid parameters, will ignore the error and skip the args checking.') + return True, None, [] + else: + logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e))) + return False, None, [] + + # Normalize unknown arguments (convert underscores to hyphens) + if len(unknown) > 0: + unknown = self._normalize_unknown_args(unknown) + logger.info( + 'Forwarding unknown arguments - benchmark: %s, unknown: %s', + self._name, + ' '.join(unknown) + ) + return True, args, unknown + def _preprocess(self): - if not super()._preprocess(): + """Preprocess with support for unknown args.""" + self.add_parser_arguments() + ret, self._args, unknown = self.parse_args() + self._unknown_args = unknown # Store for later forwarding + + if not ret: + self._result = BenchmarkResult(self._name, self._benchmark_type, ReturnCode.INVALID_ARGUMENT) + return False + + self._result = BenchmarkResult( + self._name, self._benchmark_type, ReturnCode.SUCCESS, run_count=self._args.run_count + ) + + if not isinstance(self._benchmark_type, BenchmarkType): + logger.error( + 'Invalid benchmark type - benchmark: {}, type: {}'.format(self._name, type(self._benchmark_type)) + ) + self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_TYPE) + return False + + self._judge_gpu_availability() + self._set_force_fp32() + logger.info( + 'Model placement - model: {}, GPU availablility: {}, pin memory: {}, force fp32: {}.'.format( + self._name, self._gpu_available, self._args.pin_memory, self._args.force_fp32 + ) + ) + + if self._args.num_warmup < 0: + logger.error('num_warmup should be positive integer, while {} is set.'.format(self._args.num_warmup)) + self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) return False + + if not self._init_distributed_setting(): + self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE) + return False + + # Set sample_count aligned with batch_size. + self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size + + if not self._generate_dataset(): + self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) + return False + + if not self._init_dataloader(): + self._result.set_return_code(ReturnCode.DATALOADER_INIT_FAILURE) + return False + + # Original MegatronGPT preprocessing logic if not self._args.code_base: if self._args.deepspeed: self._args.code_base = os.path.join( @@ -531,7 +632,11 @@ def _megatron_command(self, precision): # noqa: C901 command = f'deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}' else: command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}' + # Transparently append any unknown args captured during parsing for forward compatibility. 
+        if getattr(self, '_unknown_args', None):
+
+            command = f"{command} {' '.join(self._unknown_args)}"
         return command

     def _train_step(self, precision):    # noqa: E501
@@ -796,3 +901,70 @@ def _cal_params_count(self):
     ),
     platform=Platform.ROCM
 )
+BenchmarkRegistry.register_benchmark(
+    'megatron-deepseek-v2-lite',
+    MegatronGPT,
+    parameters=(
+        '--model=gpt '
+        '--transformer_impl=transformer_engine '
+        '--tokenizer_type=HuggingFaceTokenizer '
+        '--tokenizer-model=/opt/superbench/third_party/Megatron/data/DeepSeek-V2-Lite '
+        '--num_layers=27 '
+        '--hidden_size=1024 '
+        '--seq_len=4096 '
+        '--num_attn_heads=16 '
+        '--moe_ffn_hidden_size=1408 '
+        '--ffn_hidden_size=10944 '
+        '--dataloader_type=cyclic'
+        '--num_experts=64 '
+        '--no-async-tensor-model-parallel-allreduce '
+        '--use-rotary-position-embeddings '
+        '--no-gradient-accumulation-fusion '
+        '--mock-data '
+        '--use-flash-attn '
+        '--no-load-optim '
+        '--no-load-rng '
+        '--swiglu '
+        '--normalization=RMSNorm '
+        '--norm-epsilon=1e-06 '
+        '--no-bias-swiglu-fusion '
+        '--no-rope-fusion '
+        '--position-embedding-type=rope '
+        '--untie-embeddings-and-output-weights=yes '
+        '--disable-bias-linear '
+        '--ckpt-format=torch '
+        '--rotary-percent=1.0 '
+        '--rotary-base=10000 '
+        '--rotary-scaling-factor=40 '
+        '--eod-mask-loss '
+        '--data-cache-path=/root/cache '
+        '--moe-layer-freq="([0]+[1]*26)" '
+        '--moe-router-topk=6 '
+        '--moe-router-topk-scaling-factor=1.0 '
+        '--moe-aux-loss-coeff=1e-3 '
+        '--kv-lora-rank=512 '
+        '--v-head-dim=128 '
+        '--qk-head-dim=128 '
+        '--qk-layernorm '
+        '--qk-pos-emb-head-dim=64 '
+        '--attention-dropout=0.0 '
+        '--hidden-dropout=0.0 '
+        '--no-masked-softmax-fusion '
+        '--kv-channels=16 '
+        '--multi-latent-attention '
+        '--moe-grouped-gemm '
+        '--moe-router-score-function=softmax '
+        '--moe-router-topk=6 '
+        '--moe-router-pre-softmax '
+        '--moe-shared-expert-intermediate-size=2816 '
+        '--moe-token-dispatcher-type=alltoall '
+        '--moe-token-drop-policy=probs '
+        '--make-vocab-size-divisible-by=3200 '
+        '--attention-softmax-in-fp32 '
+        '--use-mcore-models '
+        '--mscale=0.707 '
+        '--mscale-all-dim=0.707 '
+        '--sequence-parallel '
+    ),
+    platform=Platform.CUDA
+)
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index b7c588677..56d67e555 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -500,6 +500,56 @@ def test_deepseek_v2_command(self):

         self.assertEqual(actual_units, expected_units)

+    @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
+    def test_megatron_gpt_unknown_args(self, mock_generate_dataset):
+        """Test unknown args forwarding and normalization."""
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
+        assert (benchmark_cls)
+        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
+        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
+        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
+        os.environ['MASTER_ADDR'] = 'localhost'
+        os.environ['MASTER_PORT'] = '12345'
+        with open(self.hostfile_path, 'w') as f:
+            f.write('host1\n')
+
+        # Test with unknown args that have underscores (should be converted to hyphens)
+        benchmark = benchmark_cls(
+            self.benchmark_name,
+            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} '
+            '--num_warmup 0 --num_steps 10 --batch_size 2048 '
+            '--my_custom_flag 128 --another_option --third_param value',
+        )
+        mock_generate_dataset.return_value = True
+        ret = benchmark._preprocess()
+        assert (ret is True)
+
+        # Verify unknown args are stored and normalized
+        assert (hasattr(benchmark, '_unknown_args'))
+        assert (len(benchmark._unknown_args) > 0)
+
+        # Check that underscores are converted to hyphens
+        assert ('--my-custom-flag' in benchmark._unknown_args)
+        assert ('128' in benchmark._unknown_args)
+        assert ('--another-option' in benchmark._unknown_args)
+        assert ('--third-param' in benchmark._unknown_args)
+        assert ('value' in benchmark._unknown_args)
+
+        # Verify unknown args appear in the generated command
+        benchmark._data_options = '--mock-data'
+        command = benchmark._megatron_command(Precision.FLOAT32)
+
+        # Check that normalized unknown args are in the command
+        assert ('--my-custom-flag 128' in command)
+        assert ('--another-option' in command)
+        assert ('--third-param value' in command)
+
+        # Ensure original underscore versions are NOT in the command
+        assert ('--my_custom_flag' not in command)
+        assert ('--another_option' not in command)
+        assert ('--third_param' not in command)
+
+
     @decorator.load_data('tests/data/megatron_deepspeed.log')
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
diff --git a/third_party/Megatron/Megatron-LM b/third_party/Megatron/Megatron-LM
index 52b7a18a0..6cc29a208 160000
--- a/third_party/Megatron/Megatron-LM
+++ b/third_party/Megatron/Megatron-LM
@@ -1 +1 @@
-Subproject commit 52b7a18a00bced8b3670eededfd58ee0c4bd7d06
+Subproject commit 6cc29a2081ec435c69e6614c9afceb9c9e99b666

From 12a01a56e856d5e28a2409654c5d151b7e150537 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 11:32:07 +0800
Subject: [PATCH 2/9] add download for model tokenizer files

---
 third_party/Makefile | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index 2a09f5990..78ffda569 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -226,7 +226,14 @@ directx_amf_encoding_latency:
 megatron_lm:
 	cd Megatron && \
 	apt install -y python3-mpi4py && \
-	python -m pip install --no-cache-dir -r requirements.txt
+	python -m pip install --no-cache-dir -r requirements.txt && \
+	mkdir -p Megatron/data/gpt && \
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -O Megatron/data/gpt2-vocab.json && \
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -O Megatron/data/gpt2-merges.txt && \
+	mkdir -p Megatron/data/DeepSeek-V2-Lite && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/config.json -O Megatron/data/DeepSeek-V2-Lite/config.json && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer.json -O Megatron/data/DeepSeek-V2-Lite/tokenizer.json && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer_config.json -O Megatron/data/DeepSeek-V2-Lite/tokenizer_config.json

 # Install requirements for Megatron-DeepSpeed
 megatron_deepspeed:

From 639c015edc09a9f25d6f32b995b5c83a111b62d8 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 11:45:16 +0800
Subject: [PATCH 3/9] bugfix

---
 superbench/benchmarks/model_benchmarks/megatron_gpt3.py | 3 ++-
 superbench/benchmarks/registry.py                       | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index ac4519693..9ec2871af 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -39,6 +39,7 @@ def __init__(self, name, parameters=''):
         """
         super().__init__(name, parameters)
         self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16, Precision.BFLOAT16]
+        self._ignore_unknown_args = True

     def add_parser_arguments(self):
         """Add the specified arguments."""
@@ -930,7 +931,7 @@ def _cal_params_count(self):
         '--no-bias-swiglu-fusion '
         '--no-rope-fusion '
         '--position-embedding-type=rope '
-        '--untie-embeddings-and-output-weights=yes '
+        '--untie-embeddings-and-output-weights '
         '--disable-bias-linear '
         '--ckpt-format=torch '
         '--rotary-percent=1.0 '
diff --git a/superbench/benchmarks/registry.py b/superbench/benchmarks/registry.py
index 62f32868e..1be6e4138 100644
--- a/superbench/benchmarks/registry.py
+++ b/superbench/benchmarks/registry.py
@@ -84,7 +84,7 @@ def __parse_and_check_args(cls, name, class_def, parameters):
         benchmark = class_def(name, parameters)
         benchmark.add_parser_arguments()
         ret, args, unknown = benchmark.parse_args(ignore_invalid=True)
-        if not ret or len(unknown) >= 1:
+        if not ret or (len(unknown) >= 1 and not getattr(benchmark, '_ignore_unknown_args', False)):
             logger.log_and_raise(
                 TypeError,
                 'Registered benchmark has invalid arguments - benchmark: {}, parameters: {}'.format(name, parameters)

From 3cc213f5047d480f16411a886bf11cb7192c4ba9 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 12:01:15 +0800
Subject: [PATCH 4/9] fix lint issue

---
 .../benchmarks/model_benchmarks/megatron_gpt3.py       | 10 ++--------
 tests/benchmarks/model_benchmarks/test_megatron_gpt.py |  1 -
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 9ec2871af..9de8a4f80 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -279,18 +279,14 @@ def parse_args(self, ignore_invalid=False):
         # Normalize unknown arguments (convert underscores to hyphens)
         if len(unknown) > 0:
             unknown = self._normalize_unknown_args(unknown)
-            logger.info(
-                'Forwarding unknown arguments - benchmark: %s, unknown: %s',
-                self._name,
-                ' '.join(unknown)
-            )
+            logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown))
         return True, args, unknown

     def _preprocess(self):
         """Preprocess with support for unknown args."""
         self.add_parser_arguments()
         ret, self._args, unknown = self.parse_args()
-        self._unknown_args = unknown # Store for later forwarding
+        self._unknown_args = unknown  # Store for later forwarding

         if not ret:
             self._result = BenchmarkResult(self._name, self._benchmark_type, ReturnCode.INVALID_ARGUMENT)
@@ -635,8 +631,6 @@ def _megatron_command(self, precision):    # noqa: C901
         command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}'
         # Transparently append any unknown args captured during parsing for forward compatibility.
         if getattr(self, '_unknown_args', None):
-
             command = f"{command} {' '.join(self._unknown_args)}"
         return command
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index 56d67e555..2f99070a6 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -549,7 +549,6 @@ def test_megatron_gpt_unknown_args(self, mock_generate_dataset):
         assert ('--another_option' not in command)
         assert ('--third_param' not in command)

-
     @decorator.load_data('tests/data/megatron_deepspeed.log')
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_parse_log(self, raw_output, mock_generate_dataset):

From b10a6dee65a6711af01a6d7f417c3c67e3d7de5a Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 12:09:27 +0800
Subject: [PATCH 5/9] update

---
 superbench/benchmarks/base.py          | 39 ++++++--
 .../model_benchmarks/megatron_gpt3.py  | 94 +------------------
 2 files changed, 31 insertions(+), 102 deletions(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 8e6e58bfe..33ba1cab0 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -93,7 +93,7 @@ def get_configurable_settings(self):
         return message

     def parse_args(self, ignore_invalid=False):
-        """Parse the arguments.
+        """Parse the arguments, accepting unknown args for forwarding.

         Return:
             ret (bool): whether parse succeed or not.
@@ -104,20 +104,41 @@ def parse_args(self, ignore_invalid=False):
             args, unknown = self._parser.parse_known_args(self._argv)
         except BaseException as e:
             if ignore_invalid:
-                logger.info('Missing or invliad parameters, will ignore the error and skip the args checking.')
+                logger.info('Missing or invalid parameters, will ignore the error and skip the args checking.')
                 return True, None, []
             else:
                 logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e)))
                 return False, None, []

-        ret = True
+        # Normalize unknown arguments (convert underscores to hyphens)
         if len(unknown) > 0:
-            logger.error(
-                'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown))
-            )
-            ret = False
+            unknown = self._normalize_unknown_args(unknown)
+            logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown))
+        return True, args, unknown

-        return ret, args, unknown
+    def _normalize_unknown_args(self, unknown):
+        """Normalize unknown args by converting underscores to hyphens in flag names.
+
+        Args:
+            unknown (list): List of unknown arguments.
+
+        Return:
+            list: Normalized list of arguments.
+        """
+        normalized = []
+        i = 0
+        while i < len(unknown):
+            arg = unknown[i]
+            # Check if it's a flag (starts with --)
+            if arg.startswith('--'):
+                # Convert underscores to hyphens in the flag name
+                normalized_flag = arg.replace('_', '-')
+                normalized.append(normalized_flag)
+            else:
+                # It's a value, keep as-is
+                normalized.append(arg)
+            i += 1
+        return normalized

     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.

         Return:
             True if _preprocess() succeed.
""" self.add_parser_arguments() - ret, self._args, unknown = self.parse_args() + ret, self._args, self._unknown_args = self.parse_args() if not ret: self._result = BenchmarkResult(self._name, self._benchmark_type, ReturnCode.INVALID_ARGUMENT) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 9de8a4f80..5df930e18 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -234,101 +234,9 @@ def add_parser_arguments(self): help='Train mode to run. Current supported: "pretrain" and "finetune".', ) - def _normalize_unknown_args(self, unknown): - """Normalize unknown args by converting underscores to hyphens in flag names. - - Args: - unknown (list): List of unknown arguments. - - Return: - list: Normalized list of arguments. - """ - normalized = [] - i = 0 - while i < len(unknown): - arg = unknown[i] - # Check if it's a flag (starts with --) - if arg.startswith('--'): - # Convert underscores to hyphens in the flag name - normalized_flag = arg.replace('_', '-') - normalized.append(normalized_flag) - else: - # It's a value, keep as-is - normalized.append(arg) - i += 1 - return normalized - - def parse_args(self, ignore_invalid=False): - """Parse the arguments, accepting unknown args for forwarding. - - Return: - ret (bool): whether parse succeed or not. - args (argparse.Namespace): parsed arguments. - unknown (list): unknown arguments. - """ - try: - args, unknown = self._parser.parse_known_args(self._argv) - except BaseException as e: - if ignore_invalid: - logger.info('Missing or invalid parameters, will ignore the error and skip the args checking.') - return True, None, [] - else: - logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e))) - return False, None, [] - - # Normalize unknown arguments (convert underscores to hyphens) - if len(unknown) > 0: - unknown = self._normalize_unknown_args(unknown) - logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown)) - return True, args, unknown - def _preprocess(self): """Preprocess with support for unknown args.""" - self.add_parser_arguments() - ret, self._args, unknown = self.parse_args() - self._unknown_args = unknown # Store for later forwarding - - if not ret: - self._result = BenchmarkResult(self._name, self._benchmark_type, ReturnCode.INVALID_ARGUMENT) - return False - - self._result = BenchmarkResult( - self._name, self._benchmark_type, ReturnCode.SUCCESS, run_count=self._args.run_count - ) - - if not isinstance(self._benchmark_type, BenchmarkType): - logger.error( - 'Invalid benchmark type - benchmark: {}, type: {}'.format(self._name, type(self._benchmark_type)) - ) - self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_TYPE) - return False - - self._judge_gpu_availability() - self._set_force_fp32() - logger.info( - 'Model placement - model: {}, GPU availablility: {}, pin memory: {}, force fp32: {}.'.format( - self._name, self._gpu_available, self._args.pin_memory, self._args.force_fp32 - ) - ) - - if self._args.num_warmup < 0: - logger.error('num_warmup should be positive integer, while {} is set.'.format(self._args.num_warmup)) - self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) - return False - - if not self._init_distributed_setting(): - self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE) - return False - - # Set sample_count aligned with batch_size. 
-        self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size
-
-        if not self._generate_dataset():
-            self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
-            return False
-
-        if not self._init_dataloader():
-            self._result.set_return_code(ReturnCode.DATALOADER_INIT_FAILURE)
+        if not super()._preprocess():
             return False

         # Original MegatronGPT preprocessing logic

From 77f4f636a3c4b3e65c3f63fc8b0e4d9280441332 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 12:20:26 +0800
Subject: [PATCH 6/9] update

---
 superbench/benchmarks/base.py                           | 10 ++++++++--
 superbench/benchmarks/model_benchmarks/megatron_gpt3.py |  4 +---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 33ba1cab0..48a9a1278 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -112,8 +112,14 @@ def parse_args(self, ignore_invalid=False):

         # Normalize unknown arguments (convert underscores to hyphens)
         if len(unknown) > 0:
-            unknown = self._normalize_unknown_args(unknown)
-            logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown))
+            if not getattr(self, '_ignore_unknown_args', False):
+                logger.error(
+                    'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown))
+                )
+                return False, None, []
+            else:
+                unknown = self._normalize_unknown_args(unknown)
+                logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown))
         return True, args, unknown

     def _normalize_unknown_args(self, unknown):
diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 5df930e18..f98ac6ac7 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -4,7 +4,6 @@
 """Module of the megatron deepspeed GPT pretrain class."""

 import json
-import math
 import os
 import statistics
 import numpy as np
@@ -13,10 +12,9 @@
 from pathlib import Path
 import re

-from superbench.benchmarks import BenchmarkRegistry, BenchmarkType
+from superbench.benchmarks import BenchmarkRegistry
 from superbench.benchmarks.context import Platform, Precision
 from superbench.benchmarks.model_benchmarks.model_base import ModelBenchmark
-from superbench.benchmarks.result import BenchmarkResult
 from superbench.benchmarks.return_code import ReturnCode
 from superbench.common.utils import logger, run_command

From ab7fc5d535840a49a433cb2a6f8eb5cfd7684279 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 15:23:09 +0800
Subject: [PATCH 7/9] update

---
 third_party/Makefile | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index 78ffda569..fee844822 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -227,13 +227,13 @@ megatron_lm:
 	cd Megatron && \
 	apt install -y python3-mpi4py && \
 	python -m pip install --no-cache-dir -r requirements.txt && \
-	mkdir -p Megatron/data/gpt && \
-	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -O Megatron/data/gpt2-vocab.json && \
-	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -O Megatron/data/gpt2-merges.txt && \
-	mkdir -p Megatron/data/DeepSeek-V2-Lite && \
-	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/config.json -O Megatron/data/DeepSeek-V2-Lite/config.json && \
-	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer.json -O Megatron/data/DeepSeek-V2-Lite/tokenizer.json && \
-	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer_config.json -O Megatron/data/DeepSeek-V2-Lite/tokenizer_config.json
+	mkdir -p data/gpt && \
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -O data/gpt2-vocab.json && \
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -O data/gpt2-merges.txt && \
+	mkdir -p data/DeepSeek-V2-Lite && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/config.json -O data/DeepSeek-V2-Lite/config.json && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer.json -O data/DeepSeek-V2-Lite/tokenizer.json && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer_config.json -O data/DeepSeek-V2-Lite/tokenizer_config.json

 # Install requirements for Megatron-DeepSpeed
 megatron_deepspeed:

From 0ef1ddd5bf964480f10dce87360183e89d3b325d Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 18:05:59 +0800
Subject: [PATCH 8/9] update

---
 superbench/benchmarks/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 48a9a1278..00afb3171 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -119,7 +119,6 @@ def parse_args(self, ignore_invalid=False):
                 return False, None, []
             else:
                 unknown = self._normalize_unknown_args(unknown)
-                logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown))
         return True, args, unknown

     def _normalize_unknown_args(self, unknown):

From 5609a09a8369a6fa809c49a197e84c6f0436aae2 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Wed, 3 Dec 2025 22:52:55 +0800
Subject: [PATCH 9/9] bugfix

---
 superbench/benchmarks/base.py          |  6 +-
 .../model_benchmarks/megatron_gpt3.py  | 98 +++++++++----------
 2 files changed, 50 insertions(+), 54 deletions(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 00afb3171..02b072b7e 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -137,8 +137,10 @@ def _normalize_unknown_args(self, unknown):
             # Check if it's a flag (starts with --)
             if arg.startswith('--'):
                 # Convert underscores to hyphens in the flag name
-                normalized_flag = arg.replace('_', '-')
-                normalized.append(normalized_flag)
+                flag = arg.split('=')[0]
+                value = arg.split('=')[1] if '=' in arg else None
+                normalized_flag = flag.replace('_', '-')
+                normalized.append(f'{normalized_flag} {value}' if value is not None else normalized_flag)
             else:
                 # It's a value, keep as-is
                 normalized.append(arg)
diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index f98ac6ac7..d0c61e6be 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -91,7 +91,7 @@ def add_parser_arguments(self):
         # Parallelism configs
         self._parser.add_argument('--zero_stage', type=int, default=1, help='Zero stage.')
         # Misc configs
-        self._parser.add_argument('--log-interval', type=int, required=False, default=1, help='Log interval.')
+        self._parser.add_argument('--log_interval', type=int, required=False, default=1, help='Log interval.')
         self._parser.add_argument('--eval_iters', type=int, default=0, help='Eval iters.')
         self._parser.add_argument('--eval_interval', type=int, default=10, help='Eval interval.')
         self._parser.add_argument('--num_save', type=int, default=10000, help='Num save.')
@@ -188,7 +188,7 @@ def add_parser_arguments(self):
         )
         self._parser.add_argument('--moe_ffn_hidden_size', type=int, help='MoE FFN hidden size.')
         self._parser.add_argument('--enable_shared_expert', action='store_true', help='Enable shared expert in MoE.')
-        self._parser.add_argument('--moe_layer_freq', type=int, help='MoE layer frequency.')
+        self._parser.add_argument('--moe_layer_freq', type=str, help='MoE layer frequency.')
         self._parser.add_argument('--num_shared_experts', type=int, help='Number of shared experts.')
         self._parser.add_argument('--moe_router_topk', type=int, help='Top-k routing for MoE.')
         self._parser.add_argument('--moe_aux_loss_coeff', type=float, help='Auxiliary loss coefficient.')
@@ -792,13 +792,11 @@ def _cal_params_count(self):
         '--load=deepseek-ai/DeepSeek-V2-Lite '
         '--no_load_optim '
         '--no_load_rng '
-        '--ckpt_format=torch '
         '--eod_mask_loss '
         '--train_mode=pretrain '
-        '--data_cache_path=/root/cache '
         '--max_padding_length=4096 '
         '--kv_lora_rank=512 '
-        '--dataloader_type=cyclic'
+        '--dataloader_type=cyclic '
     ),
     platform=Platform.ROCM
 )
@@ -809,63 +807,59 @@ def _cal_params_count(self):
         '--model=gpt '
         '--transformer_impl=transformer_engine '
         '--tokenizer_type=HuggingFaceTokenizer '
-        '--tokenizer-model=/opt/superbench/third_party/Megatron/data/DeepSeek-V2-Lite '
+        '--tokenizer_model=/opt/superbench/third_party/Megatron/data/DeepSeek-V2-Lite '
         '--num_layers=27 '
         '--hidden_size=1024 '
         '--seq_len=4096 '
         '--num_attn_heads=16 '
         '--moe_ffn_hidden_size=1408 '
         '--ffn_hidden_size=10944 '
-        '--dataloader_type=cyclic'
+        '--dataloader_type=cyclic '
         '--num_experts=64 '
-        '--no-async-tensor-model-parallel-allreduce '
-        '--use-rotary-position-embeddings '
-        '--no-gradient-accumulation-fusion '
-        '--mock-data '
-        '--use-flash-attn '
-        '--no-load-optim '
-        '--no-load-rng '
+        '--no_async_tensor_model_parallel_allreduce '
+        '--use_rotary_position_embeddings '
+        '--no_gradient_accumulation_fusion '
+        '--mock_data '
+        '--use_flash_attn '
+        '--no_load_optim '
+        '--no_load_rng '
         '--swiglu '
         '--normalization=RMSNorm '
-        '--norm-epsilon=1e-06 '
-        '--no-bias-swiglu-fusion '
-        '--no-rope-fusion '
-        '--position-embedding-type=rope '
-        '--untie-embeddings-and-output-weights '
-        '--disable-bias-linear '
-        '--ckpt-format=torch '
-        '--rotary-percent=1.0 '
-        '--rotary-base=10000 '
-        '--rotary-scaling-factor=40 '
-        '--eod-mask-loss '
-        '--data-cache-path=/root/cache '
-        '--moe-layer-freq="([0]+[1]*26)" '
-        '--moe-router-topk=6 '
-        '--moe-router-topk-scaling-factor=1.0 '
-        '--moe-aux-loss-coeff=1e-3 '
-        '--kv-lora-rank=512 '
-        '--v-head-dim=128 '
-        '--qk-head-dim=128 '
-        '--qk-layernorm '
-        '--qk-pos-emb-head-dim=64 '
-        '--attention-dropout=0.0 '
-        '--hidden-dropout=0.0 '
-        '--no-masked-softmax-fusion '
-        '--kv-channels=16 '
-        '--multi-latent-attention '
-        '--moe-grouped-gemm '
-        '--moe-router-score-function=softmax '
-        '--moe-router-topk=6 '
-        '--moe-router-pre-softmax '
-        '--moe-shared-expert-intermediate-size=2816 '
-        '--moe-token-dispatcher-type=alltoall '
-        '--moe-token-drop-policy=probs '
-        '--make-vocab-size-divisible-by=3200 '
-        '--attention-softmax-in-fp32 '
-        '--use-mcore-models '
+        '--norm_epsilon=1e-06 '
+        '--no_bias_swiglu_fusion '
+        '--no_rope_fusion '
+        '--position_embedding_type=rope '
+        '--untie_embeddings_and_output_weights '
+        '--disable_bias_linear '
+        '--ckpt_format=torch '
+        '--rotary_percent=1.0 '
+        '--rotary_base=10000 '
+        '--rotary_scaling_factor=40 '
+        '--eod_mask_loss '
+        '--data_cache_path=/tmp/cache '
+        '--moe_layer_freq="([0]+[1]*26)" '
+        '--moe_router_topk=6 '
+        '--moe_router_topk_scaling_factor=1.0 '
+        '--moe_aux_loss_coeff=1e-3 '
+        '--kv_lora_rank=512 '
+        '--v_head_dim=128 '
+        '--qk_head_dim=128 '
+        '--qk_layernorm '
+        '--qk_pos_emb_head_dim=64 '
+        '--no_masked_softmax_fusion '
+        '--kv_channels=16 '
+        '--multi_latent_attention '
+        '--moe_router_score_function=softmax '
+        '--moe_router_topk=6 '
+        '--moe_router_pre_softmax '
+        '--moe_shared_expert_intermediate_size=2816 '
+        '--moe_token_dispatcher_type=alltoall '
+        '--moe_token_drop_policy=probs '
+        '--make_vocab_size_divisible_by=3200 '
+        '--attention_softmax_in_fp32 '
+        '--use_mcore_models '
         '--mscale=0.707 '
-        '--mscale-all-dim=0.707 '
-        '--sequence-parallel '
+        '--mscale_all_dim=0.707 '
     ),
     platform=Platform.CUDA
 )
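
Note (reviewer illustration, not part of the patch series): after PATCH 9/9, the
normalization helper also splits inline '--flag=value' tokens before rewriting
underscores to hyphens in the flag name. A minimal standalone sketch of that final
behavior follows; the function name normalize_unknown_args and the sample tokens
are hypothetical, chosen only to mirror the logic the series lands in
superbench/benchmarks/base.py.

    def normalize_unknown_args(unknown):
        normalized = []
        for arg in unknown:
            if arg.startswith('--'):
                # Split off an inline '=value', then convert underscores in the flag name.
                flag = arg.split('=')[0]
                value = arg.split('=')[1] if '=' in arg else None
                normalized_flag = flag.replace('_', '-')
                normalized.append(f'{normalized_flag} {value}' if value is not None else normalized_flag)
            else:
                # Plain value tokens are forwarded untouched.
                normalized.append(arg)
        return normalized

    # Prints: ['--my-custom-flag 128', '--another-option', 'value']
    print(normalize_unknown_args(['--my_custom_flag=128', '--another_option', 'value']))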