From 6855f9e728f08b1b22af25d1575ce9a7fe7b662b Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 11:10:36 +0800
Subject: [PATCH 1/9] add support for deepseek v2 lite on cuda gpus

---
 .../model_benchmarks/megatron_gpt3.py      | 176 +++++++++++++++++-
 .../model_benchmarks/test_megatron_gpt.py  |  50 +++++
 third_party/Megatron/Megatron-LM           |   2 +-
 3 files changed, 225 insertions(+), 3 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 37d27bf1a..ac4519693 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -4,6 +4,7 @@
 """Module of the megatron deepspeed GPT pretrain class."""

 import json
+import math
 import os
 import statistics
 import numpy as np
@@ -12,9 +13,10 @@
 from pathlib import Path
 import re

-from superbench.benchmarks import BenchmarkRegistry
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType
 from superbench.benchmarks.context import Platform, Precision
 from superbench.benchmarks.model_benchmarks.model_base import ModelBenchmark
+from superbench.benchmarks.result import BenchmarkResult
 from superbench.benchmarks.return_code import ReturnCode
 from superbench.common.utils import logger, run_command

@@ -231,9 +233,108 @@ def add_parser_arguments(self):
             help='Train mode to run. Current supported: "pretrain" and "finetune".',
         )

+    def _normalize_unknown_args(self, unknown):
+        """Normalize unknown args by converting underscores to hyphens in flag names.
+
+        Args:
+            unknown (list): List of unknown arguments.
+
+        Return:
+            list: Normalized list of arguments.
+        """
+        normalized = []
+        i = 0
+        while i < len(unknown):
+            arg = unknown[i]
+            # Check if it's a flag (starts with --)
+            if arg.startswith('--'):
+                # Convert underscores to hyphens in the flag name
+                normalized_flag = arg.replace('_', '-')
+                normalized.append(normalized_flag)
+            else:
+                # It's a value, keep as-is
+                normalized.append(arg)
+            i += 1
+        return normalized
+
+    def parse_args(self, ignore_invalid=False):
+        """Parse the arguments, accepting unknown args for forwarding.
+
+        Return:
+            ret (bool): whether parse succeed or not.
+            args (argparse.Namespace): parsed arguments.
+            unknown (list): unknown arguments.
+ """ + try: + args, unknown = self._parser.parse_known_args(self._argv) + except BaseException as e: + if ignore_invalid: + logger.info('Missing or invalid parameters, will ignore the error and skip the args checking.') + return True, None, [] + else: + logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e))) + return False, None, [] + + # Normalize unknown arguments (convert underscores to hyphens) + if len(unknown) > 0: + unknown = self._normalize_unknown_args(unknown) + logger.info( + 'Forwarding unknown arguments - benchmark: %s, unknown: %s', + self._name, + ' '.join(unknown) + ) + return True, args, unknown + def _preprocess(self): - if not super()._preprocess(): + """Preprocess with support for unknown args.""" + self.add_parser_arguments() + ret, self._args, unknown = self.parse_args() + self._unknown_args = unknown # Store for later forwarding + + if not ret: + self._result = BenchmarkResult(self._name, self._benchmark_type, ReturnCode.INVALID_ARGUMENT) + return False + + self._result = BenchmarkResult( + self._name, self._benchmark_type, ReturnCode.SUCCESS, run_count=self._args.run_count + ) + + if not isinstance(self._benchmark_type, BenchmarkType): + logger.error( + 'Invalid benchmark type - benchmark: {}, type: {}'.format(self._name, type(self._benchmark_type)) + ) + self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_TYPE) + return False + + self._judge_gpu_availability() + self._set_force_fp32() + logger.info( + 'Model placement - model: {}, GPU availablility: {}, pin memory: {}, force fp32: {}.'.format( + self._name, self._gpu_available, self._args.pin_memory, self._args.force_fp32 + ) + ) + + if self._args.num_warmup < 0: + logger.error('num_warmup should be positive integer, while {} is set.'.format(self._args.num_warmup)) + self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) return False + + if not self._init_distributed_setting(): + self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE) + return False + + # Set sample_count aligned with batch_size. + self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size + + if not self._generate_dataset(): + self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) + return False + + if not self._init_dataloader(): + self._result.set_return_code(ReturnCode.DATALOADER_INIT_FAILURE) + return False + + # Original MegatronGPT preprocessing logic if not self._args.code_base: if self._args.deepspeed: self._args.code_base = os.path.join( @@ -531,7 +632,11 @@ def _megatron_command(self, precision): # noqa: C901 command = f'deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}' else: command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}' + # Transparently append any unknown args captured during parsing for forward compatibility. 
+        if getattr(self, '_unknown_args', None):
+
+            command = f"{command} {' '.join(self._unknown_args)}"
         return command

     def _train_step(self, precision):    # noqa: E501
@@ -796,3 +901,70 @@ def _cal_params_count(self):
     ),
     platform=Platform.ROCM
 )
+BenchmarkRegistry.register_benchmark(
+    'megatron-deepseek-v2-lite',
+    MegatronGPT,
+    parameters=(
+        '--model=gpt '
+        '--transformer_impl=transformer_engine '
+        '--tokenizer_type=HuggingFaceTokenizer '
+        '--tokenizer-model=/opt/superbench/third_party/Megatron/data/DeepSeek-V2-Lite '
+        '--num_layers=27 '
+        '--hidden_size=1024 '
+        '--seq_len=4096 '
+        '--num_attn_heads=16 '
+        '--moe_ffn_hidden_size=1408 '
+        '--ffn_hidden_size=10944 '
+        '--dataloader_type=cyclic'
+        '--num_experts=64 '
+        '--no-async-tensor-model-parallel-allreduce '
+        '--use-rotary-position-embeddings '
+        '--no-gradient-accumulation-fusion '
+        '--mock-data '
+        '--use-flash-attn '
+        '--no-load-optim '
+        '--no-load-rng '
+        '--swiglu '
+        '--normalization=RMSNorm '
+        '--norm-epsilon=1e-06 '
+        '--no-bias-swiglu-fusion '
+        '--no-rope-fusion '
+        '--position-embedding-type=rope '
+        '--untie-embeddings-and-output-weights=yes '
+        '--disable-bias-linear '
+        '--ckpt-format=torch '
+        '--rotary-percent=1.0 '
+        '--rotary-base=10000 '
+        '--rotary-scaling-factor=40 '
+        '--eod-mask-loss '
+        '--data-cache-path=/root/cache '
+        '--moe-layer-freq="([0]+[1]*26)" '
+        '--moe-router-topk=6 '
+        '--moe-router-topk-scaling-factor=1.0 '
+        '--moe-aux-loss-coeff=1e-3 '
+        '--kv-lora-rank=512 '
+        '--v-head-dim=128 '
+        '--qk-head-dim=128 '
+        '--qk-layernorm '
+        '--qk-pos-emb-head-dim=64 '
+        '--attention-dropout=0.0 '
+        '--hidden-dropout=0.0 '
+        '--no-masked-softmax-fusion '
+        '--kv-channels=16 '
+        '--multi-latent-attention '
+        '--moe-grouped-gemm '
+        '--moe-router-score-function=softmax '
+        '--moe-router-topk=6 '
+        '--moe-router-pre-softmax '
+        '--moe-shared-expert-intermediate-size=2816 '
+        '--moe-token-dispatcher-type=alltoall '
+        '--moe-token-drop-policy=probs '
+        '--make-vocab-size-divisible-by=3200 '
+        '--attention-softmax-in-fp32 '
+        '--use-mcore-models '
+        '--mscale=0.707 '
+        '--mscale-all-dim=0.707 '
+        '--sequence-parallel '
+    ),
+    platform=Platform.CUDA
+)
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index b7c588677..56d67e555 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -500,6 +500,56 @@ def test_deepseek_v2_command(self):

         self.assertEqual(actual_units, expected_units)

+    @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
+    def test_megatron_gpt_unknown_args(self, mock_generate_dataset):
+        """Test unknown args forwarding and normalization."""
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
+        assert (benchmark_cls)
+        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
+        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
+        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
+        os.environ['MASTER_ADDR'] = 'localhost'
+        os.environ['MASTER_PORT'] = '12345'
+        with open(self.hostfile_path, 'w') as f:
+            f.write('host1\n')
+
+        # Test with unknown args that have underscores (should be converted to hyphens)
+        benchmark = benchmark_cls(
+            self.benchmark_name,
+            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} '
+            '--num_warmup 0 --num_steps 10 --batch_size 2048 '
+            '--my_custom_flag 128 --another_option --third_param value',
+        )
+        mock_generate_dataset.return_value = True
+        ret = benchmark._preprocess()
+        assert (ret is True)
+
+        # Verify unknown args are stored and normalized
+        assert (hasattr(benchmark, '_unknown_args'))
+        assert (len(benchmark._unknown_args) > 0)
+
+        # Check that underscores are converted to hyphens
+        assert ('--my-custom-flag' in benchmark._unknown_args)
+        assert ('128' in benchmark._unknown_args)
+        assert ('--another-option' in benchmark._unknown_args)
+        assert ('--third-param' in benchmark._unknown_args)
+        assert ('value' in benchmark._unknown_args)
+
+        # Verify unknown args appear in the generated command
+        benchmark._data_options = '--mock-data'
+        command = benchmark._megatron_command(Precision.FLOAT32)
+
+        # Check that normalized unknown args are in the command
+        assert ('--my-custom-flag 128' in command)
+        assert ('--another-option' in command)
+        assert ('--third-param value' in command)
+
+        # Ensure original underscore versions are NOT in the command
+        assert ('--my_custom_flag' not in command)
+        assert ('--another_option' not in command)
+        assert ('--third_param' not in command)
+
+
     @decorator.load_data('tests/data/megatron_deepspeed.log')
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
diff --git a/third_party/Megatron/Megatron-LM b/third_party/Megatron/Megatron-LM
index 52b7a18a0..6cc29a208 160000
--- a/third_party/Megatron/Megatron-LM
+++ b/third_party/Megatron/Megatron-LM
@@ -1 +1 @@
-Subproject commit 52b7a18a00bced8b3670eededfd58ee0c4bd7d06
+Subproject commit 6cc29a2081ec435c69e6614c9afceb9c9e99b666

From 12a01a56e856d5e28a2409654c5d151b7e150537 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 11:32:07 +0800
Subject: [PATCH 2/9] add download for model tokenizer files

---
 third_party/Makefile | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index 2a09f5990..78ffda569 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -226,7 +226,14 @@ directx_amf_encoding_latency:
 megatron_lm:
 	cd Megatron && \
 	apt install -y python3-mpi4py && \
-	python -m pip install --no-cache-dir -r requirements.txt
+	python -m pip install --no-cache-dir -r requirements.txt && \
+	mkdir -p Megatron/data/gpt && \
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -O Megatron/data/gpt2-vocab.json && \
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -O Megatron/data/gpt2-merges.txt && \
+	mkdir -p Megatron/data/DeepSeek-V2-Lite && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/config.json -O Megatron/data/DeepSeek-V2-Lite/config.json && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer.json -O Megatron/data/DeepSeek-V2-Lite/tokenizer.json && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer_config.json -O Megatron/data/DeepSeek-V2-Lite/tokenizer_config.json

 # Install requirements for Megatron-DeepSpeed
 megatron_deepspeed:

From 639c015edc09a9f25d6f32b995b5c83a111b62d8 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 11:45:16 +0800
Subject: [PATCH 3/9] bugfix

---
 superbench/benchmarks/model_benchmarks/megatron_gpt3.py | 3 ++-
 superbench/benchmarks/registry.py                       | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index ac4519693..9ec2871af 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -39,6 +39,7 @@ def __init__(self, name, parameters=''):
         """
         super().__init__(name, parameters)
         self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16, Precision.BFLOAT16]
+        self._ignore_unknown_args = True

     def add_parser_arguments(self):
         """Add the specified arguments."""
@@ -930,7 +931,7 @@ def _cal_params_count(self):
         '--no-bias-swiglu-fusion '
         '--no-rope-fusion '
         '--position-embedding-type=rope '
-        '--untie-embeddings-and-output-weights=yes '
+        '--untie-embeddings-and-output-weights '
         '--disable-bias-linear '
         '--ckpt-format=torch '
         '--rotary-percent=1.0 '
diff --git a/superbench/benchmarks/registry.py b/superbench/benchmarks/registry.py
index 62f32868e..1be6e4138 100644
--- a/superbench/benchmarks/registry.py
+++ b/superbench/benchmarks/registry.py
@@ -84,7 +84,7 @@ def __parse_and_check_args(cls, name, class_def, parameters):
         benchmark = class_def(name, parameters)
         benchmark.add_parser_arguments()
         ret, args, unknown = benchmark.parse_args(ignore_invalid=True)
-        if not ret or len(unknown) >= 1:
+        if not ret or (len(unknown) >= 1 and not getattr(benchmark, '_ignore_unknown_args', False)):
             logger.log_and_raise(
                 TypeError,
                 'Registered benchmark has invalid arguments - benchmark: {}, parameters: {}'.format(name, parameters)

From 3cc213f5047d480f16411a886bf11cb7192c4ba9 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 12:01:15 +0800
Subject: [PATCH 4/9] fix lint issue

---
 .../benchmarks/model_benchmarks/megatron_gpt3.py       | 10 ++--------
 tests/benchmarks/model_benchmarks/test_megatron_gpt.py |  1 -
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 9ec2871af..9de8a4f80 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -279,18 +279,14 @@ def parse_args(self, ignore_invalid=False):
         # Normalize unknown arguments (convert underscores to hyphens)
         if len(unknown) > 0:
             unknown = self._normalize_unknown_args(unknown)
-            logger.info(
-                'Forwarding unknown arguments - benchmark: %s, unknown: %s',
-                self._name,
-                ' '.join(unknown)
-            )
+            logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown))
         return True, args, unknown

     def _preprocess(self):
         """Preprocess with support for unknown args."""
         self.add_parser_arguments()
         ret, self._args, unknown = self.parse_args()
-        self._unknown_args = unknown # Store for later forwarding
+        self._unknown_args = unknown  # Store for later forwarding

         if not ret:
             self._result = BenchmarkResult(self._name, self._benchmark_type, ReturnCode.INVALID_ARGUMENT)
@@ -635,8 +631,6 @@ def _megatron_command(self, precision):    # noqa: C901
         command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}'
         # Transparently append any unknown args captured during parsing for forward compatibility.
         if getattr(self, '_unknown_args', None):
-
             command = f"{command} {' '.join(self._unknown_args)}"
         return command
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index 56d67e555..2f99070a6 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -549,7 +549,6 @@ def test_megatron_gpt_unknown_args(self, mock_generate_dataset):
         assert ('--another_option' not in command)
         assert ('--third_param' not in command)

-
     @decorator.load_data('tests/data/megatron_deepspeed.log')
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_parse_log(self, raw_output, mock_generate_dataset):

From b10a6dee65a6711af01a6d7f417c3c67e3d7de5a Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 12:09:27 +0800
Subject: [PATCH 5/9] update

---
 superbench/benchmarks/base.py          | 39 ++++++--
 .../model_benchmarks/megatron_gpt3.py  | 94 +------------------
 2 files changed, 31 insertions(+), 102 deletions(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 8e6e58bfe..33ba1cab0 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -93,7 +93,7 @@ def get_configurable_settings(self):
         return message

     def parse_args(self, ignore_invalid=False):
-        """Parse the arguments.
+        """Parse the arguments, accepting unknown args for forwarding.

         Return:
             ret (bool): whether parse succeed or not.
@@ -104,20 +104,41 @@ def parse_args(self, ignore_invalid=False):
             args, unknown = self._parser.parse_known_args(self._argv)
         except BaseException as e:
             if ignore_invalid:
-                logger.info('Missing or invliad parameters, will ignore the error and skip the args checking.')
+                logger.info('Missing or invalid parameters, will ignore the error and skip the args checking.')
                 return True, None, []
             else:
                 logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e)))
                 return False, None, []

-        ret = True
+        # Normalize unknown arguments (convert underscores to hyphens)
         if len(unknown) > 0:
-            logger.error(
-                'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown))
-            )
-            ret = False
+            unknown = self._normalize_unknown_args(unknown)
+            logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown))
+        return True, args, unknown

-        return ret, args, unknown
+    def _normalize_unknown_args(self, unknown):
+        """Normalize unknown args by converting underscores to hyphens in flag names.
+
+        Args:
+            unknown (list): List of unknown arguments.
+
+        Return:
+            list: Normalized list of arguments.
+        """
+        normalized = []
+        i = 0
+        while i < len(unknown):
+            arg = unknown[i]
+            # Check if it's a flag (starts with --)
+            if arg.startswith('--'):
+                # Convert underscores to hyphens in the flag name
+                normalized_flag = arg.replace('_', '-')
+                normalized.append(normalized_flag)
+            else:
+                # It's a value, keep as-is
+                normalized.append(arg)
+            i += 1
+        return normalized

     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.

         Return:
             True if _preprocess() succeed.
""" self.add_parser_arguments() - ret, self._args, unknown = self.parse_args() + ret, self._args, self._unknown_args = self.parse_args() if not ret: self._result = BenchmarkResult(self._name, self._benchmark_type, ReturnCode.INVALID_ARGUMENT) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 9de8a4f80..5df930e18 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -234,101 +234,9 @@ def add_parser_arguments(self): help='Train mode to run. Current supported: "pretrain" and "finetune".', ) - def _normalize_unknown_args(self, unknown): - """Normalize unknown args by converting underscores to hyphens in flag names. - - Args: - unknown (list): List of unknown arguments. - - Return: - list: Normalized list of arguments. - """ - normalized = [] - i = 0 - while i < len(unknown): - arg = unknown[i] - # Check if it's a flag (starts with --) - if arg.startswith('--'): - # Convert underscores to hyphens in the flag name - normalized_flag = arg.replace('_', '-') - normalized.append(normalized_flag) - else: - # It's a value, keep as-is - normalized.append(arg) - i += 1 - return normalized - - def parse_args(self, ignore_invalid=False): - """Parse the arguments, accepting unknown args for forwarding. - - Return: - ret (bool): whether parse succeed or not. - args (argparse.Namespace): parsed arguments. - unknown (list): unknown arguments. - """ - try: - args, unknown = self._parser.parse_known_args(self._argv) - except BaseException as e: - if ignore_invalid: - logger.info('Missing or invalid parameters, will ignore the error and skip the args checking.') - return True, None, [] - else: - logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e))) - return False, None, [] - - # Normalize unknown arguments (convert underscores to hyphens) - if len(unknown) > 0: - unknown = self._normalize_unknown_args(unknown) - logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown)) - return True, args, unknown - def _preprocess(self): """Preprocess with support for unknown args.""" - self.add_parser_arguments() - ret, self._args, unknown = self.parse_args() - self._unknown_args = unknown # Store for later forwarding - - if not ret: - self._result = BenchmarkResult(self._name, self._benchmark_type, ReturnCode.INVALID_ARGUMENT) - return False - - self._result = BenchmarkResult( - self._name, self._benchmark_type, ReturnCode.SUCCESS, run_count=self._args.run_count - ) - - if not isinstance(self._benchmark_type, BenchmarkType): - logger.error( - 'Invalid benchmark type - benchmark: {}, type: {}'.format(self._name, type(self._benchmark_type)) - ) - self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_TYPE) - return False - - self._judge_gpu_availability() - self._set_force_fp32() - logger.info( - 'Model placement - model: {}, GPU availablility: {}, pin memory: {}, force fp32: {}.'.format( - self._name, self._gpu_available, self._args.pin_memory, self._args.force_fp32 - ) - ) - - if self._args.num_warmup < 0: - logger.error('num_warmup should be positive integer, while {} is set.'.format(self._args.num_warmup)) - self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) - return False - - if not self._init_distributed_setting(): - self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE) - return False - - # Set sample_count aligned with batch_size. 
-        self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size
-
-        if not self._generate_dataset():
-            self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
-            return False
-
-        if not self._init_dataloader():
-            self._result.set_return_code(ReturnCode.DATALOADER_INIT_FAILURE)
+        if not super()._preprocess():
             return False

         # Original MegatronGPT preprocessing logic

From 77f4f636a3c4b3e65c3f63fc8b0e4d9280441332 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 12:20:26 +0800
Subject: [PATCH 6/9] update

---
 superbench/benchmarks/base.py                           | 10 ++++++++--
 superbench/benchmarks/model_benchmarks/megatron_gpt3.py |  4 +---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 33ba1cab0..48a9a1278 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -112,8 +112,14 @@ def parse_args(self, ignore_invalid=False):

         # Normalize unknown arguments (convert underscores to hyphens)
         if len(unknown) > 0:
-            unknown = self._normalize_unknown_args(unknown)
-            logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown))
+            if not getattr(self, '_ignore_unknown_args', False):
+                logger.error(
+                    'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown))
+                )
+                return False, None, []
+            else:
+                unknown = self._normalize_unknown_args(unknown)
+                logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown))
         return True, args, unknown

     def _normalize_unknown_args(self, unknown):
diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 5df930e18..f98ac6ac7 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -4,7 +4,6 @@
 """Module of the megatron deepspeed GPT pretrain class."""

 import json
-import math
 import os
 import statistics
 import numpy as np
@@ -13,10 +12,9 @@
 from pathlib import Path
 import re

-from superbench.benchmarks import BenchmarkRegistry, BenchmarkType
+from superbench.benchmarks import BenchmarkRegistry
 from superbench.benchmarks.context import Platform, Precision
 from superbench.benchmarks.model_benchmarks.model_base import ModelBenchmark
-from superbench.benchmarks.result import BenchmarkResult
 from superbench.benchmarks.return_code import ReturnCode
 from superbench.common.utils import logger, run_command

From ab7fc5d535840a49a433cb2a6f8eb5cfd7684279 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 15:23:09 +0800
Subject: [PATCH 7/9] update

---
 third_party/Makefile | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index 78ffda569..fee844822 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -227,13 +227,13 @@ megatron_lm:
 	cd Megatron && \
 	apt install -y python3-mpi4py && \
 	python -m pip install --no-cache-dir -r requirements.txt && \
-	mkdir -p Megatron/data/gpt && \
-	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -O Megatron/data/gpt2-vocab.json && \
-	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -O Megatron/data/gpt2-merges.txt && \
-	mkdir -p Megatron/data/DeepSeek-V2-Lite && \
-	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/config.json -O Megatron/data/DeepSeek-V2-Lite/config.json && \
-	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer.json -O Megatron/data/DeepSeek-V2-Lite/tokenizer.json && \
-	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer_config.json -O Megatron/data/DeepSeek-V2-Lite/tokenizer_config.json
+	mkdir -p data/gpt && \
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -O data/gpt2-vocab.json && \
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -O data/gpt2-merges.txt && \
+	mkdir -p data/DeepSeek-V2-Lite && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/config.json -O data/DeepSeek-V2-Lite/config.json && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer.json -O data/DeepSeek-V2-Lite/tokenizer.json && \
+	wget https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/resolve/main/tokenizer_config.json -O data/DeepSeek-V2-Lite/tokenizer_config.json

 # Install requirements for Megatron-DeepSpeed
 megatron_deepspeed:

From 0ef1ddd5bf964480f10dce87360183e89d3b325d Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Tue, 2 Dec 2025 18:05:59 +0800
Subject: [PATCH 8/9] update

---
 superbench/benchmarks/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 48a9a1278..00afb3171 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -119,7 +119,6 @@ def parse_args(self, ignore_invalid=False):
                 return False, None, []
             else:
                 unknown = self._normalize_unknown_args(unknown)
-                logger.info('Forwarding unknown arguments - benchmark: %s, unknown: %s', self._name, ' '.join(unknown))
         return True, args, unknown

     def _normalize_unknown_args(self, unknown):

From 5609a09a8369a6fa809c49a197e84c6f0436aae2 Mon Sep 17 00:00:00 2001
From: 454314380 <454314380@qq.com>
Date: Wed, 3 Dec 2025 22:52:55 +0800
Subject: [PATCH 9/9] bugfix

---
 superbench/benchmarks/base.py          |  6 +-
 .../model_benchmarks/megatron_gpt3.py  | 98 +++++++++----------
 2 files changed, 50 insertions(+), 54 deletions(-)

diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 00afb3171..02b072b7e 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -137,8 +137,10 @@ def _normalize_unknown_args(self, unknown):
             # Check if it's a flag (starts with --)
             if arg.startswith('--'):
                 # Convert underscores to hyphens in the flag name
-                normalized_flag = arg.replace('_', '-')
-                normalized.append(normalized_flag)
+                flag = arg.split('=')[0]
+                value = arg.split('=')[1] if '=' in arg else None
+                normalized_flag = flag.replace('_', '-')
+                normalized.append(f'{normalized_flag} {value}' if value is not None else normalized_flag)
             else:
                 # It's a value, keep as-is
                 normalized.append(arg)
diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index f98ac6ac7..d0c61e6be 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -91,7 +91,7 @@ def add_parser_arguments(self):
         # Parallelism configs
         self._parser.add_argument('--zero_stage', type=int, default=1, help='Zero stage.')
         # Misc configs
-        self._parser.add_argument('--log-interval', type=int, required=False, default=1, help='Log interval.')
+        self._parser.add_argument('--log_interval', type=int, required=False, default=1, help='Log interval.')
         self._parser.add_argument('--eval_iters', type=int, default=0, help='Eval iters.')
         self._parser.add_argument('--eval_interval', type=int, default=10, help='Eval interval.')
         self._parser.add_argument('--num_save', type=int, default=10000, help='Num save.')
@@ -188,7 +188,7 @@ def add_parser_arguments(self):
         )
         self._parser.add_argument('--moe_ffn_hidden_size', type=int, help='MoE FFN hidden size.')
         self._parser.add_argument('--enable_shared_expert', action='store_true', help='Enable shared expert in MoE.')
-        self._parser.add_argument('--moe_layer_freq', type=int, help='MoE layer frequency.')
+        self._parser.add_argument('--moe_layer_freq', type=str, help='MoE layer frequency.')
         self._parser.add_argument('--num_shared_experts', type=int, help='Number of shared experts.')
         self._parser.add_argument('--moe_router_topk', type=int, help='Top-k routing for MoE.')
         self._parser.add_argument('--moe_aux_loss_coeff', type=float, help='Auxiliary loss coefficient.')
@@ -792,13 +792,11 @@ def _cal_params_count(self):
         '--load=deepseek-ai/DeepSeek-V2-Lite '
         '--no_load_optim '
         '--no_load_rng '
-        '--ckpt_format=torch '
         '--eod_mask_loss '
         '--train_mode=pretrain '
-        '--data_cache_path=/root/cache '
         '--max_padding_length=4096 '
         '--kv_lora_rank=512 '
-        '--dataloader_type=cyclic'
+        '--dataloader_type=cyclic '
     ),
     platform=Platform.ROCM
 )
@@ -809,63 +807,59 @@ def _cal_params_count(self):
         '--model=gpt '
         '--transformer_impl=transformer_engine '
         '--tokenizer_type=HuggingFaceTokenizer '
-        '--tokenizer-model=/opt/superbench/third_party/Megatron/data/DeepSeek-V2-Lite '
+        '--tokenizer_model=/opt/superbench/third_party/Megatron/data/DeepSeek-V2-Lite '
         '--num_layers=27 '
         '--hidden_size=1024 '
         '--seq_len=4096 '
         '--num_attn_heads=16 '
         '--moe_ffn_hidden_size=1408 '
         '--ffn_hidden_size=10944 '
-        '--dataloader_type=cyclic'
+        '--dataloader_type=cyclic '
         '--num_experts=64 '
-        '--no-async-tensor-model-parallel-allreduce '
-        '--use-rotary-position-embeddings '
-        '--no-gradient-accumulation-fusion '
-        '--mock-data '
-        '--use-flash-attn '
-        '--no-load-optim '
-        '--no-load-rng '
+        '--no_async_tensor_model_parallel_allreduce '
+        '--use_rotary_position_embeddings '
+        '--no_gradient_accumulation_fusion '
+        '--mock_data '
+        '--use_flash_attn '
+        '--no_load_optim '
+        '--no_load_rng '
         '--swiglu '
         '--normalization=RMSNorm '
-        '--norm-epsilon=1e-06 '
-        '--no-bias-swiglu-fusion '
-        '--no-rope-fusion '
-        '--position-embedding-type=rope '
-        '--untie-embeddings-and-output-weights '
-        '--disable-bias-linear '
-        '--ckpt-format=torch '
-        '--rotary-percent=1.0 '
-        '--rotary-base=10000 '
-        '--rotary-scaling-factor=40 '
-        '--eod-mask-loss '
-        '--data-cache-path=/root/cache '
-        '--moe-layer-freq="([0]+[1]*26)" '
-        '--moe-router-topk=6 '
-        '--moe-router-topk-scaling-factor=1.0 '
-        '--moe-aux-loss-coeff=1e-3 '
-        '--kv-lora-rank=512 '
-        '--v-head-dim=128 '
-        '--qk-head-dim=128 '
-        '--qk-layernorm '
-        '--qk-pos-emb-head-dim=64 '
-        '--attention-dropout=0.0 '
-        '--hidden-dropout=0.0 '
-        '--no-masked-softmax-fusion '
-        '--kv-channels=16 '
-        '--multi-latent-attention '
-        '--moe-grouped-gemm '
-        '--moe-router-score-function=softmax '
-        '--moe-router-topk=6 '
-        '--moe-router-pre-softmax '
-        '--moe-shared-expert-intermediate-size=2816 '
-        '--moe-token-dispatcher-type=alltoall '
-        '--moe-token-drop-policy=probs '
-        '--make-vocab-size-divisible-by=3200 '
-        '--attention-softmax-in-fp32 '
-        '--use-mcore-models '
+        '--norm_epsilon=1e-06 '
+        '--no_bias_swiglu_fusion '
+        '--no_rope_fusion '
+        '--position_embedding_type=rope '
+        '--untie_embeddings_and_output_weights '
+        '--disable_bias_linear '
+        '--ckpt_format=torch '
+        '--rotary_percent=1.0 '
+        '--rotary_base=10000 '
+        '--rotary_scaling_factor=40 '
+        '--eod_mask_loss '
+        '--data_cache_path=/tmp/cache '
+        '--moe_layer_freq="([0]+[1]*26)" '
+        '--moe_router_topk=6 '
+        '--moe_router_topk_scaling_factor=1.0 '
+        '--moe_aux_loss_coeff=1e-3 '
+        '--kv_lora_rank=512 '
+        '--v_head_dim=128 '
+        '--qk_head_dim=128 '
+        '--qk_layernorm '
+        '--qk_pos_emb_head_dim=64 '
+        '--no_masked_softmax_fusion '
+        '--kv_channels=16 '
+        '--multi_latent_attention '
+        '--moe_router_score_function=softmax '
+        '--moe_router_topk=6 '
+        '--moe_router_pre_softmax '
+        '--moe_shared_expert_intermediate_size=2816 '
+        '--moe_token_dispatcher_type=alltoall '
+        '--moe_token_drop_policy=probs '
+        '--make_vocab_size_divisible_by=3200 '
+        '--attention_softmax_in_fp32 '
+        '--use_mcore_models '
         '--mscale=0.707 '
-        '--mscale-all-dim=0.707 '
-        '--sequence-parallel '
+        '--mscale_all_dim=0.707 '
     ),
     platform=Platform.CUDA
 )
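
Note (reviewer illustration, not part of the patch series): after PATCH 9/9, the
normalization helper also splits inline '--flag=value' tokens before rewriting
underscores to hyphens in the flag name. A minimal standalone sketch of that final
behavior follows; the function name normalize_unknown_args and the sample tokens
are hypothetical, chosen only to mirror the logic the series lands in
superbench/benchmarks/base.py.

    def normalize_unknown_args(unknown):
        normalized = []
        for arg in unknown:
            if arg.startswith('--'):
                # Split off an inline '=value', then convert underscores in the flag name.
                flag = arg.split('=')[0]
                value = arg.split('=')[1] if '=' in arg else None
                normalized_flag = flag.replace('_', '-')
                normalized.append(f'{normalized_flag} {value}' if value is not None else normalized_flag)
            else:
                # Plain value tokens are forwarded untouched.
                normalized.append(arg)
        return normalized

    # Prints: ['--my-custom-flag 128', '--another-option', 'value']
    print(normalize_unknown_args(['--my_custom_flag=128', '--another_option', 'value']))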