diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index ba89ed6ff..f88b2ac8f 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -34,6 +34,19 @@ For inference, supported percentiles include
 
 **New: Support fp8_hybrid and fp8_e4m3 precision for BERT models.**
 
+**New: Deterministic Training Support**
+SuperBench now supports deterministic training to ensure reproducibility across runs. This includes fixed seeds and deterministic algorithms. Deterministic training is controlled by the following flags and environment variables:
+
+- **Flags:**
+  - `--enable-determinism`: Enables deterministic computation for reproducible results.
+  - `--deterministic_seed <seed>`: Sets the seed used for reproducibility.
+  - `--generate_log`: Boolean flag that stores the deterministic comparison metrics in the results file.
+  - `--compare_log <path>`: Specifies the path to the reference file for comparison.
+
+- **Environment Variables:**
+  - The variable below is set implicitly when `--enable-determinism` is used:
+  - `CUBLAS_WORKSPACE_CONFIG=:4096:8`: Ensures deterministic behavior in cuBLAS.
+
 #### Metrics
 
 | Name | Unit | Description |
diff --git a/examples/benchmarks/pytorch_deterministic_example.py b/examples/benchmarks/pytorch_deterministic_example.py
new file mode 100644
index 000000000..2675872b7
--- /dev/null
+++ b/examples/benchmarks/pytorch_deterministic_example.py
@@ -0,0 +1,137 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Unified PyTorch deterministic training example for all supported models.
+
+Deterministic metrics (loss, activation mean) are automatically stored in results.json
+when the --enable-determinism flag is enabled. Use --compare-log to compare against a reference run.
+
+Commands to run:
+Run A (generate reference):
+
+python3 examples/benchmarks/pytorch_deterministic_example.py \
+    --model <model_name> --enable-determinism --deterministic-seed 42
+
+This creates results-0.json with deterministic metrics.
+
+Run B (compare against reference):
+
+python3 examples/benchmarks/pytorch_deterministic_example.py \
+    --model <model_name> --enable-determinism --deterministic-seed 42 --compare-log results-0.json
+
+Note: CUBLAS_WORKSPACE_CONFIG is now automatically set by the code when determinism is enabled.
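+
+For reference, an illustrative sketch of the layout this script writes (keys come from the
+deterministic metrics added by this change; values are elided, and in distributed runs each
+metric name carries a rank suffix such as deterministic_loss_rank0):
+
+    {
+      "raw_data": {
+        "model-benchmarks:<model>/<benchmark_name>": {
+          "deterministic_loss": [[...]],
+          "deterministic_act_mean": [[...]],
+          "deterministic_step": [[...]],
+          "metadata": [{...}]
+        }
+      }
+    }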
+""" + +import argparse +import json +from pathlib import Path +from superbench.benchmarks import BenchmarkRegistry, Framework +from superbench.common.utils import logger + +MODEL_CHOICES = [ + 'bert-large', + 'gpt2-small', + 'llama2-7b', + 'mixtral-8x7b', + 'resnet101', + 'lstm', +] + +DEFAULT_PARAMS = { + 'bert-large': + '--batch_size 1 --seq_len 64 --num_warmup 1 --num_steps 200 --precision float32 ' + '--model_action train --check_frequency 20', + 'gpt2-small': + '--batch_size 1 --num_steps 300 --num_warmup 1 --seq_len 128 --precision float32 ' + '--model_action train --check_frequency 20', + 'llama2-7b': + '--batch_size 1 --num_steps 300 --num_warmup 1 --seq_len 512 --precision float32 --model_action train ' + '--check_frequency 20', + 'mixtral-8x7b': + '--hidden_size=4096 --num_hidden_layers=32 --num_attention_heads=32 --intermediate_size=14336 ' + '--num_key_value_heads=8 --max_position_embeddings=32768 --router_aux_loss_coef=0.02 ' + '--check_frequency 20', + 'resnet101': + '--batch_size 1 --precision float32 --num_warmup 1 --num_steps 120 --sample_count 8192 ' + '--pin_memory --model_action train --check_frequency 20', + 'lstm': + '--batch_size 1 --num_steps 100 --num_warmup 2 --seq_len 64 --precision float16 ' + '--model_action train --check_frequency 30', +} + + +def main(): + """Main function for determinism example file.""" + parser = argparse.ArgumentParser(description='Unified PyTorch deterministic training example.') + parser.add_argument('--model', type=str, choices=MODEL_CHOICES, required=True, help='Model to run.') + parser.add_argument( + '--enable-determinism', + '--enable_determinism', + action='store_true', + help='Enable deterministic mode for reproducible results.', + ) + parser.add_argument( + '--compare-log', + type=str, + default=None, + help='Path to reference results.json file for deterministic comparison.', + ) + parser.add_argument( + '--deterministic-seed', + type=int, + default=None, + help='Seed for deterministic training.', + ) + args = parser.parse_args() + + parameters = DEFAULT_PARAMS[args.model] + if args.enable_determinism: + parameters += ' --enable-determinism' + if args.deterministic_seed is not None: + parameters += f' --deterministic_seed {args.deterministic_seed}' + if args.compare_log: + parameters += f' --compare-log {args.compare_log}' + + context = BenchmarkRegistry.create_benchmark_context(args.model, parameters=parameters, framework=Framework.PYTORCH) + benchmark = BenchmarkRegistry.launch_benchmark(context) + logger.info(f'Benchmark finished. 
Return code: {benchmark.return_code}') + + # Save results to file for comparison + if not args.compare_log: + # Find next available results file name + counter = 0 + while Path(f'results-{counter}.json').exists(): + counter += 1 + results_file = f'results-{counter}.json' + + # Parse benchmark results and create nested format like results-summary.json + benchmark_results = json.loads(benchmark.serialized_result) + + # Create nested structure: raw_data -> benchmark_name -> metrics + # Extract the benchmark name from the results (e.g., "pytorch-lstm") + benchmark_name = benchmark_results.get('name', args.model) + + # Create results in the format expected by comparison logic + nested_results = { + 'raw_data': { + f'model-benchmarks:{args.model}/{benchmark_name}': benchmark_results.get('raw_data', {}) + } + } + + # Write results to file + with open(results_file, 'w') as f: + json.dump(nested_results, f, indent=2) + logger.info(f'Results saved to {results_file}') + logger.info(f'To compare against this run, use: --compare-log {results_file}') + else: + logger.info(f'Comparison completed against {args.compare_log}') + + if hasattr(benchmark, '_model_run_metadata'): + logger.info(f'Run metadata: {benchmark._model_run_metadata}') + if hasattr(benchmark, '_model_run_periodic'): + num_checkpoints = len(benchmark._model_run_periodic.get('step', [])) + logger.info(f'Periodic fingerprints collected at {num_checkpoints} checkpoints') + + +if __name__ == '__main__': + main() diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py index 8e6e58bfe..2dbc4cd41 100644 --- a/superbench/benchmarks/base.py +++ b/superbench/benchmarks/base.py @@ -110,14 +110,66 @@ def parse_args(self, ignore_invalid=False): logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e))) return False, None, [] - ret = True + if args is not None and 'compare_log' in [a.dest for a in self._parser._actions]: + args = self._override_args_with_compare_log(args) + + ret = self._check_unknown_args(unknown) + + return ret, args, unknown + + def _override_args_with_compare_log(self, args): + """Override arguments with metadata from a compare log file if available. + + This is a legacy method. Metadata override is now handled by benchmark-specific + implementations (e.g., pytorch_base.py for PyTorch models). + + Args: + args: Parsed arguments. + + Returns: + argparse: Arguments (returned unchanged). + """ + return args + + def _convert_precision_value(self, value, Precision): + """Convert precision values to the appropriate format. + + Args: + value: The precision value to convert. + Precision: The Precision class or type to convert to. + + Returns: + list: A list of converted precision values. + """ + if isinstance(value, list): + converted = [] + for v in value: + if isinstance(v, Precision): + converted.append(v) + else: + converted.append(Precision(v)) + return converted + else: + if isinstance(value, Precision): + return [value] + else: + return [Precision(value)] + + def _check_unknown_args(self, unknown): + """Check for unknown arguments and log an error if any are found. + + Args: + unknown (list): List of unknown arguments. + + Returns: + bool: False if unknown arguments are found, True otherwise. 
+ """ if len(unknown) > 0: logger.error( 'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown)) ) - ret = False - - return ret, args, unknown + return False + return True def _preprocess(self): """Preprocess/preparation operations before the benchmarking. @@ -263,6 +315,10 @@ def __check_raw_data(self): instance of List[List[Number]] or List[str] for BenchmarkType.MICRO. """ for metric in self._result.raw_data: + # Skip validation for metadata (dict type used for configuration storage) + if metric.startswith('metadata'): + continue + is_valid = True if self._benchmark_type == BenchmarkType.MODEL: is_valid = self.__is_list_list_type(self._result.raw_data[metric], numbers.Number) diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py index 1c8df9fe3..3e3cf0443 100644 --- a/superbench/benchmarks/model_benchmarks/model_base.py +++ b/superbench/benchmarks/model_benchmarks/model_base.py @@ -186,6 +186,17 @@ def _generate_dataset(self): """ pass + def set_deterministic_seed(self): + """Hook to set deterministic RNG state before dataset generation. + + Framework-specific subclasses may + override this to apply deterministic RNG settings (for example, + PyTorch benchmarks implement this to call their deterministic setup + when requested). This is called from _preprocess() before + _generate_dataset(). + """ + return None + @abstractmethod def _init_dataloader(self): """Initialize the dataloader. @@ -221,6 +232,12 @@ def _preprocess(self): self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE) return False + # Invoke model-specific deterministic seeding hook before dataset generation + try: + self.set_deterministic_seed() + except Exception: + logger.info('set_deterministic_seed() hook failed or not implemented for model: %s', self._name) + # Set sample_count aligned with batch_size. self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py index 1d7950cad..3dd715b94 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_base.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py @@ -5,6 +5,7 @@ import os from datetime import timedelta +import random import time import torch @@ -13,11 +14,18 @@ import transformer_engine.pytorch as te except ImportError: te = None -from torch.utils.data import DataLoader +from torch.backends.cuda import sdp_kernel from torch.distributed import TCPStore, PrefixStore +from torch.utils.data import DataLoader from superbench.common.utils import logger -from superbench.benchmarks import Framework, ReturnCode, DistributedBackend, DistributedImpl +from superbench.common import model_log_utils +from superbench.benchmarks import ( + Framework, + ReturnCode, + DistributedBackend, + DistributedImpl, +) from superbench.benchmarks.model_benchmarks.model_base import Optimizer, ModelBenchmark @@ -30,15 +38,350 @@ def __init__(self, name, parameters=''): name (str): benchmark name. parameters (str): benchmark parameters. 
""" + # Set CUBLAS_WORKSPACE_CONFIG early, before parent init which might parse args + # This ensures it's set before any CUDA operations if determinism is enabled + if 'enable-determinism' in parameters or 'enable_determinism' in parameters: + os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8') + super().__init__(name, parameters) self._framework = Framework.PYTORCH torch.backends.cudnn.benchmark = True + self._model_run_metadata = {} + self._model_run_losses = [] + self._model_run_periodic = {} + def _judge_gpu_availability(self): """Judge GPUs' availability according to arguments and running environment.""" self._gpu_available = not self._args.no_gpu and torch.cuda.is_available() + def _enable_deterministic_training(self): + """Enable deterministic training settings for reproducible results.""" + # Set CUBLAS_WORKSPACE_CONFIG (should already be set in __init__, but ensure it's set as backup) + os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8') + + if hasattr(self._args, 'deterministic_seed'): + torch.manual_seed(self._args.deterministic_seed) + random.seed(self._args.deterministic_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(self._args.deterministic_seed) + torch.use_deterministic_algorithms(True, warn_only=False) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + # Disable TF32 to remove potential numerical variability + try: + torch.backends.cuda.matmul.allow_tf32 = False + except Exception: + logger.warning('Failed to disable TF32 in cuda matmul') + + try: + torch.backends.cudnn.allow_tf32 = False + except Exception: + logger.warning('Failed to disable TF32 in cuDNN') + + # Force Scaled Dot-Product Attention to use deterministic math kernel + try: + sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False) + except Exception: + logger.warning('SDP kernel not available') + # Older PyTorch versions may not expose sdp_kernel; ignore in that case + + def _assign_model_run_metadata(self, precision, extra_keys=None): + """Assign model_run_metadata for determinism fingerprinting/logging. + + Args: + precision: Model precision (can be enum or string). + extra_keys: List of additional argument keys to include in metadata. + + Returns: + None + """ + self._model_run_metadata = model_log_utils.build_model_metadata(self._name, precision, self._args, extra_keys) + return None + + def record_determinism_fingerprint(self, curr_step, loss, logits, periodic, check_frequency): + """Centralized logic for recording per-step loss and periodic fingerprints for deterministic runs. + + Args: + curr_step (int): Current training step. + loss (torch.Tensor or float): Loss value for this step. + logits (torch.Tensor or float): Logits output for this step (sample 0). + periodic (dict): Dictionary to store periodic fingerprints ('loss', 'act_mean', 'step'). + check_frequency (int): Frequency for fingerprint logging. 
+ """ + # Record per-step loss for determinism checks + loss_value = model_log_utils.record_step_loss(loss, curr_step, self._model_run_losses, logger) + + # Record periodic fingerprint (loss and activation mean) + model_log_utils.record_periodic_fingerprint( + curr_step, loss_value, logits, periodic, check_frequency, getattr(self._args, 'enable_determinism', False), + logger + ) + + def _finalize_periodic_logging(self, periodic, info_key='loss'): + """Finalize periodic logging and return info dict for training step.""" + info = {info_key: periodic.get(info_key, [])} + self._model_run_losses = list(periodic.get(info_key, [])) + self._model_run_periodic = dict(periodic) + return info + + def add_parser_arguments(self): + """Add PyTorch model benchmark-specific arguments to the argument parser.""" + super().add_parser_arguments() + self._parser.add_argument( + '--compare-log', + '--compare_log', + dest='compare_log', + type=str, + default=None, + help='Path to reference results.json file for deterministic comparison.', + ) + self._parser.add_argument( + '--deterministic_seed', + '--deterministic-seed', + type=int, + default=42, + required=False, + help='Random seed for deterministic training.', + ) + self._parser.add_argument( + '--enable-determinism', + '--enable_determinism', + action='store_true', + default=False, + help='Enable deterministic training for reproducible results.', + ) + self._parser.add_argument( + '--generate_log', + '--generate-log', + action='store_true', + default=False, + help='Generate consolidated deterministic reference results (stores all ranks in results-summary).', + ) + self._parser.add_argument( + '--check_frequency', + '--check-frequency', + type=int, + default=100, + required=False, + help='How often (in steps) to run lightweight periodic checks/logs and evaluate early-stop conditions.', + ) + + def _post_run_model_log(self): + """Add deterministic metrics to results and optionally compare with reference results. + + Deterministic metrics (loss, activation mean) are stored in the results file alongside + other benchmark metrics. When --compare-log is specified, loads the reference results + file and compares deterministic metrics per-rank. + """ + # Add deterministic metrics to result system (all ranks add their own metrics) + if getattr(self._args, 'enable_determinism', False): + self._add_deterministic_metrics_to_result() + + # Consolidate results from all ranks to rank 0 for complete results-summary + # This is needed whether generating or comparing logs + self._save_consolidated_deterministic_results() + + # Compare with reference results if requested + if getattr(self._args, 'compare_log', None): + self._compare_deterministic_results() + + def _add_deterministic_metrics_to_result(self): + """Add deterministic fingerprints and losses to the benchmark result system. + + This makes deterministic metrics visible in results-summary.json alongside + other benchmark metrics. In distributed training, metrics include rank information. 
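+
+        For example (illustrative), a single-rank run adds metrics named deterministic_loss,
+        deterministic_act_mean, deterministic_step and deterministic_check_count, plus a
+        metadata entry in raw_data; under DDP each name gains a _rank<N> suffix.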
+ """ + # Add periodic fingerprints (loss, activation mean) to results + if self._model_run_periodic: + for key, values in self._model_run_periodic.items(): + if isinstance(values, list) and values: + # Include rank in metric name for distributed training + if self._global_rank is not None: + metric_name = f'deterministic_{key}_rank{self._global_rank}' + else: + metric_name = f'deterministic_{key}' + + # Add raw data (all values at each checkpoint) + self._result.add_raw_data(metric_name, values, self._args.log_raw_data) + # Add summarized result (mean of checkpointed values) + import statistics + self._result.add_result(metric_name, statistics.mean([v for v in values if v is not None])) + + # Add count of deterministic checks performed + if self._model_run_periodic.get('step'): + if self._global_rank is not None: + metric_name = f'deterministic_check_count_rank{self._global_rank}' + else: + metric_name = 'deterministic_check_count' + self._result.add_result(metric_name, len(self._model_run_periodic['step'])) + + # Save metadata for configuration reproducibility + if self._model_run_metadata: + if self._global_rank is not None: + metric_name = f'metadata_rank{self._global_rank}' + else: + metric_name = 'metadata' + # Use False for log_raw_data to save in result object, not log file + self._result.add_raw_data(metric_name, self._model_run_metadata, False) + + def _save_consolidated_deterministic_results(self): + """Gather deterministic data from all ranks and save to results-summary (rank 0 only). + + In distributed training, all ranks send their raw_data to rank 0, which consolidates + and adds it to the result system. This allows all ranks' checkpoint data to appear + in the standard results-summary files. + """ + import torch.distributed as dist + + # In distributed mode, gather all ranks' data to rank 0 + if self._args.distributed_impl == DistributedImpl.DDP: + # Serialize current rank's raw_data + raw_data_to_send = {} + for key in self._result.raw_data: + if key.startswith('deterministic_'): + raw_data_to_send[key] = self._result.raw_data[key] + + # Gather all ranks' data to rank 0 + if self._global_rank == 0: + # Rank 0 collects data from all ranks + all_ranks_data = [None] * dist.get_world_size() + dist.gather_object(raw_data_to_send, all_ranks_data, dst=0) + + # Add all ranks' raw_data to rank 0's result (which becomes results-summary) + for rank_idx, rank_data in enumerate(all_ranks_data): + if rank_data: + for key, value in rank_data.items(): + # Add to rank 0's result raw_data if not already present + if key not in self._result.raw_data: + self._result.raw_data[key] = value + + logger.info(f'Rank 0: Consolidated deterministic results from {dist.get_world_size()} ranks') + else: + # Other ranks send their data to rank 0 + dist.gather_object(raw_data_to_send, None, dst=0) + else: + # Non-distributed: data already in result, nothing to consolidate + logger.info('Deterministic results stored in results') + + def _compare_deterministic_results(self): + """Compare current deterministic metrics with reference results file. + + Loads the reference results.json file and compares deterministic metrics + (loss, activation mean) per-rank to verify reproducibility. 
+ """ + import torch.distributed as dist + + compare_log_path = self._args.compare_log + rank = self._global_rank if self._global_rank is not None else 0 + logger.info(f'Rank {rank}: Loading reference results from {compare_log_path}') + + # Track if this rank detected any failure + has_failure = False + failure_msg = '' + + try: + # Load reference results and extract raw_data + ref_raw_data, _ = model_log_utils.load_reference_results( + compare_log_path, self._name, self._global_rank, logger + ) + + # Compare metrics + curr_raw_data = self._result.raw_data + mismatches = model_log_utils.compare_raw_data_metrics( + curr_raw_data, ref_raw_data, self._global_rank, logger + ) + + if mismatches: + has_failure = True + failure_msg = ( + f'Rank {self._global_rank if self._global_rank is not None else 0}: ' + f'Determinism check FAILED. Mismatched metrics:\n' + '\n'.join(mismatches) + ) + except (FileNotFoundError, ValueError) as e: + has_failure = True + failure_msg = str(e) + + # Synchronize failure status across all ranks in distributed mode + if self._args.distributed_impl == DistributedImpl.DDP: + # Convert failure status to tensor for all_reduce + import torch + failure_tensor = torch.tensor([1 if has_failure else 0], dtype=torch.int32, device='cuda') + dist.all_reduce(failure_tensor, op=dist.ReduceOp.MAX) + + # If any rank failed, all ranks should fail + if failure_tensor.item() > 0: + if has_failure: + # This rank detected the failure + logger.error(failure_msg) + raise RuntimeError(failure_msg) + else: + # Another rank detected failure, fail together + error_msg = f'Rank {self._global_rank}: Determinism check FAILED on another rank' + logger.error(error_msg) + raise RuntimeError(error_msg) + elif has_failure: + # Non-distributed mode, just raise + logger.error(failure_msg) + raise RuntimeError(failure_msg) + + rank = self._global_rank if self._global_rank is not None else 0 + logger.info(f'Rank {rank}: Determinism check PASSED - all checkpoints match') + + def _preprocess(self): + """Preprocess and apply PyTorch-specific defaults.""" + preprocess_ok = super()._preprocess() + if not preprocess_ok: + return False + # Deterministic setup is handled centrally in set_deterministic_seed() which + # is invoked earlier in the model-base preprocess before dataset creation. + if getattr(self._args, 'enable_determinism', False): + self._handle_deterministic_log_options() + return True + + def set_deterministic_seed(self): + """Set deterministic RNGs centrally for PyTorch benchmarks. + + This will set the seeds and deterministic flags prior to dataset generation + so per-model dataset generation is reproducible without each model needing + to call torch.manual_seed(). + """ + if getattr(self._args, 'enable_determinism', False): + try: + self._enable_deterministic_training() + except Exception: + logger.info('Failed to enable deterministic training in centralized preprocess') + + def _handle_deterministic_log_options(self): + """Handle deterministic log options. + + In deterministic mode, metrics are automatically added to the results file. + The --compare-log option can be used to compare against a previous results file. + + If compare-log is provided, load metadata from reference file and override current configuration + to ensure exact reproducibility. 
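+
+        Illustrative effect: configuration values recorded in the reference metadata (for
+        example batch_size, seq_len, num_steps, precision and deterministic_seed) replace
+        the current values so the comparison run repeats the reference configuration.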
+ """ + if self._args.compare_log: + try: + # Load reference metadata + _, ref_metadata = model_log_utils.load_reference_results( + self._args.compare_log, self._name, self._global_rank, logger + ) + + if ref_metadata: + # Apply metadata overrides + overridden = model_log_utils.apply_metadata_overrides(self._args, ref_metadata, logger) + if overridden == 0: + logger.info('No parameters needed to be overridden from reference metadata') + else: + logger.warning( + f'No metadata found in reference file {self._args.compare_log}. ' + 'Cannot verify configuration matches reference run.' + ) + except Exception as e: + logger.warning(f'Failed to load metadata from reference file {self._args.compare_log}: {e}') + def _set_force_fp32(self): """Set the config that controls whether full float32 precision will be used. @@ -150,6 +493,7 @@ def _init_dataloader(self): if self._args.distributed_impl: if self._args.distributed_impl == DistributedImpl.HOROVOD: import horovod.torch as hvd + train_sampler = \ torch.utils.data.distributed.DistributedSampler( self._dataset, @@ -347,18 +691,23 @@ def _timer(self): def _benchmark(self): """Wrap super._benchmark with profiler context if enabled by environment variable. + Run the benchmark then handle post-run model log save/compare. Set SB_ENABLE_PYTORCH_PROFILER='1' to enable profiling. """ # Check if this is a Nvidia GPU if not (torch.cuda.is_available() and torch.version.cuda is not None): - return super()._benchmark() + ok = super()._benchmark() + self._post_run_model_log() + return ok # Check if profiling is enabled via environment variable enable_profiler = os.environ.get('SB_ENABLE_PYTORCH_PROFILER', '0') == '1' if not enable_profiler: # Run without profiling - return super()._benchmark() + ok = super()._benchmark() + self._post_run_model_log() + return ok # Run with profiling enabled logger.info('PyTorch profiler enabled for model: {}'.format(self._name)) @@ -397,4 +746,6 @@ def _benchmark(self): with open(diag_agent_dump_file_path, 'w') as f: json.dump(diag_agent_events, f, sort_keys=True) + # Handle post-run model log save/compare regardless of profiling + self._post_run_model_log() return ret diff --git a/superbench/benchmarks/model_benchmarks/pytorch_bert.py b/superbench/benchmarks/model_benchmarks/pytorch_bert.py index fae2f0479..d95b7343e 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_bert.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_bert.py @@ -155,6 +155,8 @@ def _create_model(self, precision): if self._gpu_available: self._target = self._target.cuda() + self._assign_model_run_metadata(precision) + return True def _train_step(self, precision): @@ -167,8 +169,8 @@ def _train_step(self, precision): The step-time list of every training step. """ duration = [] + periodic = {'loss': [], 'act_mean': [], 'step': []} curr_step = 0 - check_frequency = 100 while True: for idx, sample in enumerate(self._dataloader): start = self._timer() @@ -182,17 +184,18 @@ def _train_step(self, precision): output = self._model(sample) else: output = self._model(sample) - loss = self._loss_fn(output, self._target) + logits = output + loss = self._loss_fn(logits.float(), self._target) loss.backward() self._optimizer.step() end = self._timer() curr_step += 1 if curr_step > self._args.num_warmup: - # Save the step time of every training/inference step, unit is millisecond. 
duration.append((end - start) * 1000) + self.record_determinism_fingerprint(curr_step, loss, logits, periodic, self._args.check_frequency) self._log_step_time(curr_step, precision, duration) - if self._is_finished(curr_step, end, check_frequency): - return duration + if self._is_finished(curr_step, end): + return duration, self._finalize_periodic_logging(periodic) def _inference_step(self, precision): """Define the inference process. diff --git a/superbench/benchmarks/model_benchmarks/pytorch_cnn.py b/superbench/benchmarks/model_benchmarks/pytorch_cnn.py index c7e683030..51f9cecf0 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_cnn.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_cnn.py @@ -84,6 +84,8 @@ def _create_model(self, precision): if self._gpu_available: self._target = self._target.cuda() + self._assign_model_run_metadata(precision) + return True def _train_step(self, precision): @@ -96,8 +98,8 @@ def _train_step(self, precision): The step-time list of every training step. """ duration = [] + periodic = {'loss': [], 'act_mean': [], 'step': []} curr_step = 0 - check_frequency = 100 while True: for idx, sample in enumerate(self._dataloader): sample = sample.to(dtype=getattr(torch, precision.value)) @@ -108,7 +110,7 @@ def _train_step(self, precision): start = self._timer() self._optimizer.zero_grad() output = self._model(sample) - loss = self._loss_fn(output, self._target) + loss = self._loss_fn(output.float(), self._target) loss.backward() self._optimizer.step() end = self._timer() @@ -116,9 +118,10 @@ def _train_step(self, precision): if curr_step > self._args.num_warmup: # Save the step time of every training/inference step, unit is millisecond. duration.append((end - start) * 1000) + self.record_determinism_fingerprint(curr_step, loss, output, periodic, self._args.check_frequency) self._log_step_time(curr_step, precision, duration) - if self._is_finished(curr_step, end, check_frequency): - return duration + if self._is_finished(curr_step, end): + return duration, self._finalize_periodic_logging(periodic) def _inference_step(self, precision): """Define the inference process. diff --git a/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py b/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py index 17bb6570b..8c7f91f18 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py @@ -149,6 +149,8 @@ def _create_model(self, precision): if self._gpu_available: self._target = self._target.cuda() + self._assign_model_run_metadata(precision) + return True def _train_step(self, precision): @@ -158,11 +160,11 @@ def _train_step(self, precision): precision (Precision): precision of model and input data, such as float32, float16. Return: - The step-time list of every training step. + A tuple of (step_times_ms, info) of every training step. 
""" duration = [] + periodic = {'loss': [], 'act_mean': [], 'step': []} curr_step = 0 - check_frequency = 100 while True: for idx, sample in enumerate(self._dataloader): start = self._timer() @@ -176,17 +178,18 @@ def _train_step(self, precision): output = self._model(sample) else: output = self._model(sample) - loss = self._loss_fn(output[range(self._args.batch_size), -1], self._target) + logits = output[range(self._args.batch_size), -1] + loss = self._loss_fn(logits.float(), self._target) loss.backward() self._optimizer.step() end = self._timer() curr_step += 1 if curr_step > self._args.num_warmup: - # Save the step time of every training/inference step, unit is millisecond. duration.append((end - start) * 1000) + self.record_determinism_fingerprint(curr_step, loss, logits, periodic, self._args.check_frequency) self._log_step_time(curr_step, precision, duration) - if self._is_finished(curr_step, end, check_frequency): - return duration + if self._is_finished(curr_step, end): + return duration, self._finalize_periodic_logging(periodic) def _inference_step(self, precision): """Define the inference process. diff --git a/superbench/benchmarks/model_benchmarks/pytorch_llama.py b/superbench/benchmarks/model_benchmarks/pytorch_llama.py index 00fef3609..77b4f20ec 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_llama.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_llama.py @@ -169,6 +169,8 @@ def _create_model(self, precision): if self._gpu_available: self._target = self._target.cuda() + self._assign_model_run_metadata(precision) + return True def _train_step(self, precision): @@ -178,11 +180,11 @@ def _train_step(self, precision): precision (Precision): precision of model and input data, such as float32, float16. Return: - The step-time list of every training step. + A tuple of (step_times_ms, info) of every training step. """ duration = [] + periodic = {'loss': [], 'act_mean': [], 'step': []} curr_step = 0 - check_frequency = 100 while True: for idx, sample in enumerate(self._dataloader): start = self._timer() @@ -196,17 +198,18 @@ def _train_step(self, precision): output = self._model(sample) else: output = self._model(sample) - loss = self._loss_fn(output[range(self._args.batch_size), -1], self._target) + logits = output[range(self._args.batch_size), -1] + loss = self._loss_fn(logits.float(), self._target) loss.backward() self._optimizer.step() end = self._timer() curr_step += 1 if curr_step > self._args.num_warmup: - # Save the step time of every training/inference step, unit is millisecond. duration.append((end - start) * 1000) + self.record_determinism_fingerprint(curr_step, loss, logits, periodic, self._args.check_frequency) self._log_step_time(curr_step, precision, duration) - if self._is_finished(curr_step, end, check_frequency): - return duration + if self._is_finished(curr_step, end): + return duration, self._finalize_periodic_logging(periodic) def _inference_step(self, precision): """Define the inference process. @@ -237,7 +240,6 @@ def _inference_step(self, precision): end = self._timer() curr_step += 1 if curr_step > self._args.num_warmup: - # Save the step time of every training/inference step, unit is millisecond. 
duration.append((end - start) * 1000) self._log_step_time(curr_step, precision, duration) if self._is_finished(curr_step, end): diff --git a/superbench/benchmarks/model_benchmarks/pytorch_lstm.py b/superbench/benchmarks/model_benchmarks/pytorch_lstm.py index 85335c6a1..132289514 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_lstm.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_lstm.py @@ -124,6 +124,8 @@ def _create_model(self, precision): if self._gpu_available: self._target = self._target.cuda() + self._assign_model_run_metadata(precision) + return True def _train_step(self, precision): @@ -136,8 +138,8 @@ def _train_step(self, precision): The step-time list of every training step. """ duration = [] + periodic = {'loss': [], 'act_mean': [], 'step': []} curr_step = 0 - check_frequency = 100 while True: for idx, sample in enumerate(self._dataloader): sample = sample.to(dtype=getattr(torch, precision.value)) @@ -148,17 +150,17 @@ def _train_step(self, precision): start = self._timer() self._optimizer.zero_grad() output = self._model(sample) - loss = self._loss_fn(output, self._target) + loss = self._loss_fn(output.float(), self._target) loss.backward() self._optimizer.step() end = self._timer() curr_step += 1 if curr_step > self._args.num_warmup: - # Save the step time of every training/inference step, unit is millisecond. duration.append((end - start) * 1000) + self.record_determinism_fingerprint(curr_step, loss, output, periodic, self._args.check_frequency) self._log_step_time(curr_step, precision, duration) - if self._is_finished(curr_step, end, check_frequency): - return duration + if self._is_finished(curr_step, end): + return duration, self._finalize_periodic_logging(periodic) def _inference_step(self, precision): """Define the inference process. diff --git a/superbench/benchmarks/model_benchmarks/pytorch_mixtral_impl.py b/superbench/benchmarks/model_benchmarks/pytorch_mixtral_impl.py index b1d21c7f0..5ed955225 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_mixtral_impl.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_mixtral_impl.py @@ -134,7 +134,27 @@ def _create_model(self, precision): Args: precision (Precision): precision of model and input data, such as float32, float16. """ - self._config = MixtralConfig( + self._config = self._build_config() + if not self._check_fp8_support(precision): + return False + + try: + self._model = self._instantiate_model() + self._postprocess_model(precision) + except Exception as e: + logger.error( + 'Create model with specified precision failed - model: {}, precision: {}, message: {}.'.format( + self._name, precision, str(e) + ) + ) + return False + + self._setup_target() + self._assign_metadata_safe(precision) + return True + + def _build_config(self): + return MixtralConfig( hidden_size=self._args.hidden_size, num_hidden_layers=self._args.num_hidden_layers, num_attention_heads=self._args.num_attention_heads, @@ -144,46 +164,55 @@ def _create_model(self, precision): router_aux_loss_coef=self._args.router_aux_loss_coef, ) + def _check_fp8_support(self, precision): enable_fp8 = precision.name.startswith('FP8_') if enable_fp8 and te is None: logger.error( - f'Create model with fp8 failed - model: {self._name}, precision: {precision},' - ' message: Cannot find transformer_engine.' + f'Create model with fp8 failed - model: {self._name}, precision: {precision}, ' + 'message: Cannot find transformer_engine.' 
) return False if enable_fp8 and not self._gpu_available: logger.error( - f'Create model with fp8 failed - model: {self._name}, precision: {precision},' - ' message: FP8 is only supported on GPU.' + f'Create model with fp8 failed - model: {self._name}, precision: {precision}, ' + 'message: FP8 is only supported on GPU.' ) return False + return True - try: - self._model = MixtralBenchmarkModel(self._config, self._args.num_classes) - if enable_fp8: - self._fp8_recipe = DelayedScaling( - fp8_format=Format[precision.name.strip('FP8_')], - amax_history_len=16, - amax_compute_algo='max', - ) - self._to_te_model(self._model.to(dtype=torch.float16)) - else: - self._model = self._model.to(dtype=getattr(torch, precision.value)) - if self._gpu_available: - self._model = self._model.cuda() - except Exception as e: - logger.error( - 'Create model with specified precision failed - model: {}, precision: {}, message: {}.'.format( - self._name, precision, str(e) - ) + def _instantiate_model(self): + return MixtralBenchmarkModel(self._config, self._args.num_classes) + + def _postprocess_model(self, precision): + enable_fp8 = precision.name.startswith('FP8_') + if enable_fp8: + self._fp8_recipe = DelayedScaling( + fp8_format=Format[precision.name.strip('FP8_')], + amax_history_len=16, + amax_compute_algo='max', ) - return False + self._to_te_model(self._model.to(dtype=torch.float16)) + else: + self._model = self._model.to(dtype=getattr(torch, precision.value)) + if self._gpu_available: + self._model = self._model.cuda() + def _setup_target(self): + # Use a separate deterministic RNG stream for target generation by offsetting the seed. + # This keeps dataset RNG and target/model RNG deterministic but independent. + if getattr(self._args, 'enable_determinism', False) and hasattr(self._args, 'deterministic_seed'): + torch.manual_seed(self._args.deterministic_seed + 1) self._target = torch.LongTensor(self._args.batch_size).random_(self._args.num_classes) if self._gpu_available: self._target = self._target.cuda() - return True + def _assign_metadata_safe(self, precision): + try: + self._assign_model_run_metadata( + precision, extra_keys=['num_key_value_heads', 'max_position_embeddings', 'router_aux_loss_coef'] + ) + except Exception: + logger.warning(f'Unable to assign model metadata for logging - model: {self._name}, precision: {precision}') def _train_step(self, precision): """Define the training process. @@ -195,8 +224,8 @@ def _train_step(self, precision): The step-time list of every training step. """ duration = [] + periodic = {'loss': [], 'act_mean': [], 'step': []} curr_step = 0 - check_frequency = 100 while True: for idx, sample in enumerate(self._dataloader): start = self._timer() @@ -210,17 +239,18 @@ def _train_step(self, precision): output = self._model(sample) else: output = self._model(sample) - loss = self._loss_fn(output[range(self._args.batch_size), -1], self._target) + logits = output[range(self._args.batch_size), -1] + loss = self._loss_fn(logits.float(), self._target) loss.backward() self._optimizer.step() end = self._timer() curr_step += 1 if curr_step > self._args.num_warmup: - # Save the step time of every training/inference step, unit is millisecond. 
duration.append((end - start) * 1000) + self.record_determinism_fingerprint(curr_step, loss, logits, periodic, self._args.check_frequency) self._log_step_time(curr_step, precision, duration) - if self._is_finished(curr_step, end, check_frequency): - return duration + if self._is_finished(curr_step, end): + return duration, self._finalize_periodic_logging(periodic) def _inference_step(self, precision): """Define the inference process. diff --git a/superbench/common/model_log_utils.py b/superbench/common/model_log_utils.py new file mode 100644 index 000000000..c433998a2 --- /dev/null +++ b/superbench/common/model_log_utils.py @@ -0,0 +1,335 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Utility functions for deterministic model training and validation.""" + +import json + + +def build_model_metadata(name, precision, args, extra_keys=None): + """Build metadata dictionary for deterministic model runs. + + Args: + name (str): Model name. + precision: Model precision (enum or string). + args: Parsed arguments object. + extra_keys (list): Additional argument keys to include in metadata. + + Returns: + dict: Metadata dictionary with model configuration. + """ + metadata = { + 'model_name': name, + 'precision': (precision.value if hasattr(precision, 'value') else str(precision)), + 'seed': getattr(args, 'deterministic_seed', None), + 'deterministic_seed': getattr(args, 'deterministic_seed', None), + 'batch_size': getattr(args, 'batch_size', None), + 'seq_len': getattr(args, 'seq_len', None), + 'num_steps': getattr(args, 'num_steps', None), + 'num_warmup': getattr(args, 'num_warmup', None), + 'check_frequency': getattr(args, 'check_frequency', None), + 'num_classes': getattr(args, 'num_classes', None), + } + + # Add common model architecture keys + keys = [ + 'hidden_size', + 'num_hidden_layers', + 'num_attention_heads', + 'intermediate_size', + 'input_size', + 'num_layers', + 'bidirectional', + ] + if extra_keys: + keys += extra_keys + + for key in keys: + metadata[key] = getattr(args, key, None) + + return metadata + + +def record_step_loss(loss, curr_step, losses_list, logger=None): + """Record per-step loss value for determinism tracking. + + Args: + loss: Loss tensor or float value. + curr_step (int): Current training step. + losses_list (list): List to append loss values to. + logger: Optional logger for warnings. + + Returns: + float: Converted loss value, or None if conversion failed. 
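+
+    Example (illustrative usage; this is how record_determinism_fingerprint calls it once
+    per training step):
+
+        losses = []
+        value = record_step_loss(loss, curr_step, losses, logger)
+        # value is the float loss (also appended to losses), or None if conversion failed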
+ """ + try: + v = float(loss.detach().item()) if hasattr(loss, 'detach') else float(loss) + losses_list.append(v) + return v + except Exception: + if logger: + logger.info(f'Unable to convert loss to float at step {curr_step}') + losses_list.append(None) + return None + + +def _record_loss_fingerprint(curr_step, loss_value, periodic_dict, logger): + """Record loss fingerprint at current step.""" + try: + if 'loss' in periodic_dict and isinstance(periodic_dict['loss'], list): + periodic_dict['loss'].append(loss_value if loss_value is not None else None) + else: + periodic_dict['loss'] = [loss_value if loss_value is not None else None] + + if logger: + logger.info(f'Loss at step {curr_step}: {loss_value}') + periodic_dict.setdefault('step', []).append(curr_step) + except Exception: + if logger: + logger.warning(f'Unable to log loss at curr_step {curr_step}') + + +def _record_activation_fingerprint(curr_step, logits, periodic_dict, logger): + """Record activation mean fingerprint at current step.""" + try: + if logits is not None: + act_mean = ( + float(logits[0].detach().float().mean().item()) if hasattr(logits[0], 'detach') else float(logits[0]) + ) + if logger: + logger.info(f'ActMean at step {curr_step}: {act_mean}') + periodic_dict.setdefault('act_mean', []).append(act_mean) + else: + periodic_dict.setdefault('act_mean', []).append(None) + except Exception: + if logger: + logger.warning(f'Unable to log act_mean at curr_step {curr_step}') + periodic_dict.setdefault('act_mean', []).append(None) + + +def record_periodic_fingerprint( + curr_step, loss_value, logits, periodic_dict, check_frequency, enable_determinism, logger=None +): + """Record periodic fingerprints (loss and activation mean) for deterministic runs. + + Args: + curr_step (int): Current training step. + loss_value: Pre-converted loss float value (or None). + logits: Logits tensor for activation fingerprint. + periodic_dict (dict): Dictionary to store periodic data ('loss', 'act_mean', 'step'). + check_frequency (int): Frequency for fingerprint logging. + enable_determinism (bool): Whether determinism is enabled. + logger: Optional logger for info/warnings. + """ + if not enable_determinism or (curr_step % check_frequency != 0): + return + + _record_loss_fingerprint(curr_step, loss_value, periodic_dict, logger) + _record_activation_fingerprint(curr_step, logits, periodic_dict, logger) + + +def _load_and_validate_reference_file(filepath): + """Load reference JSON file and validate structure.""" + try: + with open(filepath, 'r') as f: + ref_results = json.load(f) + except FileNotFoundError: + raise FileNotFoundError( + f'Reference results file not found: {filepath}. ' + f'Make sure you have run the benchmark with --enable-determinism first to generate reference results.' + ) + except json.JSONDecodeError as e: + raise ValueError(f'Invalid JSON in reference results file {filepath}: {e}') + + if 'raw_data' not in ref_results: + raise ValueError(f'Reference file {filepath} does not contain "raw_data" section') + + return ref_results['raw_data'] + + +def _find_benchmark_raw_data(ref_raw_data_section, benchmark_name): + """Find benchmark raw_data in nested format.""" + for bm_name in ref_raw_data_section: + if benchmark_name in bm_name: + return ref_raw_data_section[bm_name] + + raise ValueError( + f'Reference file does not contain raw_data for benchmark matching "{benchmark_name}". 
' + f'Available benchmarks: {list(ref_raw_data_section.keys())}' + ) + + +def _extract_reference_metadata(ref_raw_data, rank): + """Extract metadata from reference raw_data.""" + metadata_key = f'metadata_rank{rank}' if rank is not None else 'metadata' + + if metadata_key in ref_raw_data: + return _extract_metadata_from_raw_data(ref_raw_data[metadata_key]) + elif 'metadata_rank0' in ref_raw_data: + return _extract_metadata_from_raw_data(ref_raw_data['metadata_rank0']) + return None + + +def load_reference_results(filepath, benchmark_name, rank=None, logger=None): + """Load reference results file and extract raw_data for a specific benchmark. + + Args: + filepath (str): Path to reference results JSON file. + benchmark_name (str): Name of the benchmark to extract. + rank (int): Optional rank number for distributed training. + logger: Optional logger for warnings. + + Returns: + tuple: (ref_raw_data dict, ref_metadata dict) or (None, None) on error. + + Raises: + FileNotFoundError: If reference file doesn't exist. + ValueError: If reference file is invalid or missing data. + """ + ref_raw_data_section = _load_and_validate_reference_file(filepath) + ref_raw_data = _find_benchmark_raw_data(ref_raw_data_section, benchmark_name) + ref_metadata = _extract_reference_metadata(ref_raw_data, rank) + return ref_raw_data, ref_metadata + + +def _extract_metadata_from_raw_data(metadata_list): + """Extract metadata dict from raw_data list format. + + Args: + metadata_list: Metadata in raw_data format (list of lists or list of dicts). + + Returns: + dict: Extracted metadata, or None if extraction failed. + """ + if isinstance(metadata_list, list) and len(metadata_list) > 0: + first_item = metadata_list[0] + if isinstance(first_item, dict): + return first_item + elif isinstance(first_item, list) and len(first_item) > 0 and isinstance(first_item[0], dict): + return first_item[0] + elif isinstance(metadata_list, dict): + return metadata_list + return None + + +def _compare_checkpoint_values(key, run_idx, curr_run, ref_run, logger): + """Compare checkpoint values between current and reference runs.""" + mismatches = [] + + if len(curr_run) != len(ref_run): + mismatches.append(f'{key}[run {run_idx}]: checkpoint count mismatch ({len(curr_run)} vs {len(ref_run)})') + return mismatches + + for step_idx, (curr_step_val, ref_step_val) in enumerate(zip(curr_run, ref_run)): + if logger: + logger.debug(f'{key}[{run_idx},{step_idx}]: {curr_step_val} vs {ref_step_val}') + if curr_step_val != ref_step_val: + if isinstance(curr_step_val, (int, float)) and isinstance(ref_step_val, (int, float)): + diff_val = abs(curr_step_val - ref_step_val) + mismatches.append( + f'{key}[run {run_idx}, checkpoint {step_idx}]: ' + f'{repr(curr_step_val)} vs {repr(ref_step_val)} (diff: {diff_val})' + ) + else: + mismatches.append( + f'{key}[run {run_idx}, checkpoint {step_idx}]: ' + f'{repr(curr_step_val)} vs {repr(ref_step_val)}' + ) + + return mismatches + + +def _compare_metric_lists(key, curr_val, ref_val, logger): + """Compare list metrics between current and reference data.""" + mismatches = [] + + if len(curr_val) != len(ref_val): + mismatches.append(f'{key}: run count mismatch ({len(curr_val)} vs {len(ref_val)})') + return mismatches + + for run_idx in range(len(curr_val)): + curr_run = curr_val[run_idx] + ref_run = ref_val[run_idx] + mismatches.extend(_compare_checkpoint_values(key, run_idx, curr_run, ref_run, logger)) + + return mismatches + + +def compare_raw_data_metrics(curr_raw_data, ref_raw_data, rank=None, logger=None): + 
"""Compare current and reference raw_data metrics for determinism validation. + + Args: + curr_raw_data (dict): Current run's raw_data. + ref_raw_data (dict): Reference run's raw_data. + rank (int): Optional rank number for distributed training. + logger: Optional logger for debug messages. + + Returns: + list: List of mismatch descriptions, empty if all match. + """ + mismatches = [] + metric_prefix = f'deterministic_loss_rank{rank}' if rank is not None else 'deterministic_loss' + + if metric_prefix not in ref_raw_data: + raise ValueError( + f'Reference results do not contain deterministic metrics ({metric_prefix}) in raw_data. ' + f'Make sure the reference was run with --enable-determinism flag.' + ) + + for key in curr_raw_data: + if key.startswith('deterministic_') and key in ref_raw_data: + curr_val = curr_raw_data[key] + ref_val = ref_raw_data[key] + + if isinstance(curr_val, list) and isinstance(ref_val, list): + mismatches.extend(_compare_metric_lists(key, curr_val, ref_val, logger)) + + return mismatches + + +def apply_metadata_overrides(args, ref_metadata, logger=None): + """Apply reference metadata overrides to current args for reproducibility. + + Args: + args: Parsed arguments object to modify. + ref_metadata (dict): Reference metadata with configuration. + logger: Optional logger for info messages. + + Returns: + int: Number of parameters overridden. + """ + if not ref_metadata: + if logger: + logger.warning('No metadata provided for override') + return 0 + + override_params = [ + 'batch_size', 'seq_len', 'hidden_size', 'num_steps', 'num_warmup', 'check_frequency', 'num_classes', + 'num_layers', 'num_hidden_layers', 'num_attention_heads', 'intermediate_size', 'input_size', 'bidirectional', + 'seed', 'precision', 'deterministic_seed' + ] + + overridden_count = 0 + for param in override_params: + if param in ref_metadata and hasattr(args, param): + ref_value = ref_metadata[param] + curr_value = getattr(args, param) + + # Handle precision specially - it must be a list + if param == 'precision': + if isinstance(ref_value, str): + # Convert string to Precision enum and wrap in list + from superbench.benchmarks.context import Precision + ref_value = [Precision(ref_value)] + elif isinstance(ref_value, list): + # Ensure list items are Precision enums + from superbench.benchmarks.context import Precision + ref_value = [Precision(v) if isinstance(v, str) else v for v in ref_value] + + if ref_value != curr_value: + if logger: + logger.info(f'Overriding {param} from {curr_value} to {ref_value} (from reference metadata)') + setattr(args, param, ref_value) + overridden_count += 1 + + return overridden_count diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 5787274c7..124be83aa 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -349,9 +349,27 @@ def __create_single_node_summary(self, node_path): # pragma: no cover # noqa: results_summary[benchmark_name][metric].append(result['result'][metric]) + # Include raw_data from rank0 results (which has consolidated multi-rank data) + if 'raw_data' in result and 'rank0' in str(results_file): + if 'raw_data' not in results_summary[benchmark_name]: + results_summary[benchmark_name]['raw_data'] = {} + for key, value in result['raw_data'].items(): + results_summary[benchmark_name]['raw_data'][key] = value + + # Extract raw_data before merging (to preserve structure) + raw_data_dict = {} + for benchmark_name in results_summary: + if 'raw_data' in results_summary[benchmark_name]: + 
raw_data_dict[benchmark_name] = results_summary[benchmark_name]['raw_data']
+
             results_summary = self.__merge_benchmark_metrics(results_summary, reduce_ops)
             monitor_summary = self.__merge_monitor_metrics(node_path)
             results_summary = {**results_summary, **monitor_summary}
+
+            # Add raw_data back with nested structure
+            if raw_data_dict:
+                results_summary['raw_data'] = raw_data_dict
+
             with (node_path / 'results-summary.json').open(mode='w') as f:
                 json.dump(results_summary, f, indent=2)
@@ -397,6 +415,9 @@ def __merge_benchmark_metrics(self, results_summary, reduce_ops):
         metrics_summary = dict()
         for benchmark_name in results_summary:
             for metric in results_summary[benchmark_name]:
+                # Skip raw_data - it will be added separately without flattening
+                if metric == 'raw_data':
+                    continue
                 metric_name = '{}/{}'.format(benchmark_name, metric)
                 if metric_name not in reduce_ops or (
                     reduce_ops[metric_name] is not None and reduce_ops[metric_name] not in ReduceType.get_values()
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_determinism_all.py b/tests/benchmarks/model_benchmarks/test_pytorch_determinism_all.py
new file mode 100644
index 000000000..1823338d5
--- /dev/null
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_determinism_all.py
@@ -0,0 +1,207 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Unified test for deterministic fingerprinting across all major PyTorch model benchmarks."""
+
+from tests.helper import decorator
+import os
+import tempfile
+import json
+import pytest
+from superbench.benchmarks import BenchmarkRegistry, Platform, Framework, ReturnCode
+
+# Set CUBLAS_WORKSPACE_CONFIG early to ensure deterministic cuBLAS behavior
+os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8')
+# Set PYTORCH_CUDA_ALLOC_CONF to avoid memory fragmentation
+os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True')
+
+
+def run_deterministic_benchmark(model_name, params, results_path=None, extra_args=None):
+    """Helper to launch a deterministic benchmark and return the result."""
+    if results_path is None:
+        with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as tmpfile:
+            results_path = tmpfile.name
+    parameters = params + ' --enable-determinism --deterministic_seed 42 --check_frequency 10'
+    if extra_args:
+        parameters += ' ' + extra_args
+    context = BenchmarkRegistry.create_benchmark_context(
+        model_name,
+        platform=Platform.CUDA,
+        parameters=parameters,
+        framework=Framework.PYTORCH,
+    )
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Save result to file for comparison tests (in results-summary format)
+    if benchmark and benchmark.return_code == ReturnCode.SUCCESS:
+        # Convert to results-summary format with nested benchmark name
+        result_dict = json.loads(benchmark._result.to_string())
+        summary_format = {'raw_data': {}}
+        # Nest raw_data under benchmark name as results-summary.json does
+        benchmark_name = result_dict['name']
+        summary_format['raw_data'][benchmark_name] = result_dict['raw_data']
+
+        with open(results_path, 'w') as f:
+            json.dump(summary_format, f, indent=2)
+
+    return benchmark, results_path
+
+
+MODELS = [
+    (
+        'resnet18',
+        '--batch_size 2 --image_size 32 --num_classes 2 --num_warmup 1 --num_steps 20 '
+        '--model_action train --precision float32',
+    ),
+    (
+        'lstm',
+        '--batch_size 1 --num_classes 2 --seq_len 4 --num_warmup 1 --num_steps 20 '
+        '--model_action train '
+        '--precision float32',
+    ),
+    (
+        'gpt2-small',
+        '--batch_size 1 --num_classes 2 --seq_len 4 --num_warmup 1 --num_steps 20 '
+        '--model_action train --precision float32',
+    ),
+    pytest.param(
+        'llama2-7b',
+        '--batch_size 1 --seq_len 1 --num_warmup 1 --num_steps 20 --precision float32 --model_action train',
+        marks=pytest.mark.skip(
+            reason='Requires >26GB GPU memory for 7B model, and float16 incompatible with deterministic mode'
+        ),
+    ),
+    (
+        'mixtral-8x7b',
+        '--batch_size 1 --seq_len 4 --num_warmup 1 --num_steps 20 --precision float32 '
+        '--hidden_size 128 --max_position_embeddings 32 '
+        '--intermediate_size 256 --model_action train',
+    ),
+    (
+        'bert-base',
+        '--batch_size 1 --num_classes 2 --seq_len 4 --num_warmup 1 --num_steps 20 '
+        '--model_action train --precision float32',
+    ),
+]
+
+
+@decorator.cuda_test
+@decorator.pytorch_test
+@pytest.mark.parametrize('model_name, params', MODELS)
+def test_pytorch_model_determinism(model_name, params):
+    """Parameterized test for PyTorch model determinism."""
+    benchmark, results_path = run_deterministic_benchmark(model_name, params)
+    assert benchmark and benchmark.return_code == ReturnCode.SUCCESS
+
+    # Check args
+    assert benchmark._args.enable_determinism is True
+    assert benchmark._args.deterministic_seed == 42
+    assert benchmark._args.check_frequency == 10
+
+    # Results file generation and contents
+    assert os.path.exists(results_path)
+    with open(results_path, 'r') as f:
+        data = json.load(f)
+
+    # Validate result structure contains raw_data with deterministic metrics (results-summary format)
+    assert 'raw_data' in data, 'Expected raw_data in result'
+    # Get the benchmark-specific nested data
+    benchmark_name = benchmark._result.name
+    assert benchmark_name in data['raw_data'], f'Expected {benchmark_name} in raw_data'
+    raw_data = data['raw_data'][benchmark_name]
+
+    # Check for deterministic metrics in raw_data (either with rank suffix or without)
+    loss_keys = [k for k in raw_data.keys() if 'deterministic_loss' in k]
+    act_keys = [k for k in raw_data.keys() if 'deterministic_act_mean' in k]
+    step_keys = [k for k in raw_data.keys() if 'deterministic_step' in k]
+
+    assert len(loss_keys) > 0, f'Expected deterministic_loss in raw_data, got keys: {list(raw_data.keys())}'
+    assert len(act_keys) > 0, 'Expected deterministic_act_mean in raw_data'
+    assert len(step_keys) > 0, 'Expected deterministic_step in raw_data'
+
+    # Validate the detailed values are captured
+    loss_data = raw_data[loss_keys[0]]
+    assert isinstance(loss_data, list) and len(loss_data) > 0, 'Expected non-empty loss list'
+    assert isinstance(loss_data[0], list) and len(loss_data[0]) > 0, 'Expected non-empty loss values'
+
+    # Verify loss values are reasonable (not None or inf)
+    # Note: Some models may produce NaN with small test configurations - this is a test limitation, not a code issue
+    import math
+    for loss_val in loss_data[0]:
+        assert loss_val is not None, 'Loss value should not be None'
+        assert isinstance(loss_val, (int, float)), f'Loss should be numeric, got {type(loss_val)}'
+        # Skip further validation if loss is NaN (model training instability with small test config)
+        if not math.isnan(loss_val):
+            assert loss_val < 1e6, f'Loss seems unreasonably large: {loss_val}'
+
+    # Run with compare-log for success - this verifies deterministic reproducibility
+    extra_args = f'--compare-log {results_path}'
+    benchmark_compare, _ = run_deterministic_benchmark(model_name, params, results_path, extra_args)
+    assert benchmark_compare and benchmark_compare.return_code == ReturnCode.SUCCESS
+
+    # Run a third time to triple-check determinism
+    benchmark_compare2, _ = run_deterministic_benchmark(model_name, params, results_path, extra_args)
+    assert benchmark_compare2 and benchmark_compare2.return_code == ReturnCode.SUCCESS
+
+    os.remove(results_path)
+
+
+@decorator.cuda_test
+@decorator.pytorch_test
+@pytest.mark.parametrize('model_name, params', MODELS)
+@pytest.mark.xfail(reason='Intentional determinism mismatch to test failure handling.')
+def test_pytorch_model_determinism_failure_case(model_name, params):
+    """Parameterized test for PyTorch model determinism failure case."""
+    benchmark, results_path = run_deterministic_benchmark(model_name, params)
+    assert benchmark and benchmark.return_code == ReturnCode.SUCCESS
+
+    # Modify the results file to break determinism by changing loss values
+    with open(results_path, 'r+') as f:
+        data = json.load(f)
+        # Find the deterministic_loss in nested raw_data and change first value
+        benchmark_name = benchmark._result.name
+        raw_data = data['raw_data'][benchmark_name]
+        for loss_key in raw_data.keys():
+            if 'deterministic_loss' in loss_key and isinstance(raw_data[loss_key], list):
+                if raw_data[loss_key] and raw_data[loss_key][0]:
+                    raw_data[loss_key][0][0] += 1e-5
+                    break
+        f.seek(0)
+        json.dump(data, f)
+        f.truncate()
+
+    # Run with compare-log for failure
+    extra_args = f'--compare-log {results_path}'
+    with pytest.raises(RuntimeError):
+        run_deterministic_benchmark(model_name, params, results_path, extra_args)
+
+    # Clean up
+    os.remove(results_path)
+
+
+@decorator.cuda_test
+@decorator.pytorch_test
+@pytest.mark.parametrize('model_name, params', MODELS)
+def test_pytorch_model_nondeterministic_default(model_name, params):
+    """Parameterized test for PyTorch model non-deterministic defaults."""
+    context = BenchmarkRegistry.create_benchmark_context(
+        model_name,
+        platform=Platform.CUDA,
+        parameters=params,
+        framework=Framework.PYTORCH,
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    assert (benchmark and benchmark.return_code == ReturnCode.SUCCESS), 'Benchmark did not run successfully.'
+    args = benchmark._args
+    assert getattr(args, 'enable_determinism', False) is False, 'Expected enable_determinism to be False by default.'
+    assert (getattr(args, 'compare_log', None) is None), 'Expected compare_log to be None by default.'
+    assert (getattr(args, 'check_frequency', None) == 100), 'Expected check_frequency to be 100 by default.'
+
+    # Periodic fingerprints exist but are empty when not deterministic
+    assert hasattr(benchmark, '_model_run_periodic'), 'Benchmark missing _model_run_periodic attribute.'
+    periodic = benchmark._model_run_periodic
+    assert isinstance(periodic, dict), '_model_run_periodic should be a dict.'
+    for key in ('loss', 'act_mean', 'step'):
+        assert key in periodic, f"Key '{key}' missing in _model_run_periodic."
+        assert (len(periodic[key]) == 0), f"Expected empty list for periodic['{key}'], got {periodic[key]}."
diff --git a/tests/common/test_model_log_utils.py b/tests/common/test_model_log_utils.py
new file mode 100644
index 000000000..785d2903b
--- /dev/null
+++ b/tests/common/test_model_log_utils.py
@@ -0,0 +1,165 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for model_log_utils module."""
+
+import json
+import tempfile
+import pytest
+from unittest.mock import Mock
+from superbench.common import model_log_utils
+
+
+class TestRecordStepLoss:
+    """Tests for record_step_loss function."""
+    def test_record_loss_conversion_failure(self):
+        """Test exception handling when loss conversion fails."""
+        logger = Mock()
+        losses_list = []
+
+        # Create a mock object that raises exception on conversion
+        bad_loss = Mock()
+        bad_loss.detach.side_effect = RuntimeError('Conversion failed')
+
+        result = model_log_utils.record_step_loss(bad_loss, curr_step=5, losses_list=losses_list, logger=logger)
+
+        assert result is None
+        assert losses_list == [None]
+        logger.info.assert_called_once_with('Unable to convert loss to float at step 5')
+
+
+class TestLoadAndValidateReferenceFile:
+    """Tests for _load_and_validate_reference_file function."""
+    def test_file_not_found(self):
+        """Test FileNotFoundError when reference file doesn't exist."""
+        with pytest.raises(FileNotFoundError, match='Reference results file not found'):
+            model_log_utils._load_and_validate_reference_file('/nonexistent/file.json')
+
+    def test_invalid_json(self):
+        """Test ValueError when JSON is malformed."""
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
+            f.write('{invalid json')
+            f.flush()
+
+            with pytest.raises(ValueError, match='Invalid JSON'):
+                model_log_utils._load_and_validate_reference_file(f.name)
+
+    def test_missing_raw_data(self):
+        """Test ValueError when raw_data section is missing."""
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
+            json.dump({'some_other_key': {}}, f)
+            f.flush()
+
+            with pytest.raises(ValueError, match='does not contain "raw_data" section'):
+                model_log_utils._load_and_validate_reference_file(f.name)
+
+
+class TestFindBenchmarkRawData:
+    """Tests for _find_benchmark_raw_data function."""
+    def test_benchmark_not_found(self):
+        """Test ValueError when benchmark name not found in reference."""
+        ref_raw_data = {'pytorch-resnet18': {}, 'pytorch-bert': {}}
+
+        with pytest.raises(ValueError, match='does not contain raw_data for benchmark matching'):
+            model_log_utils._find_benchmark_raw_data(ref_raw_data, 'llama')
+
+
+class TestExtractMetadataFromRawData:
+    """Tests for _extract_metadata_from_raw_data function."""
+    def test_extract_from_list_of_dicts(self):
+        """Test extracting metadata from list of dicts format."""
+        metadata_list = [{'batch_size': 32, 'seed': 42}]
+        result = model_log_utils._extract_metadata_from_raw_data(metadata_list)
+        assert result == {'batch_size': 32, 'seed': 42}
+
+    def test_extract_from_nested_list(self):
+        """Test extracting metadata from nested list format."""
+        metadata_list = [[{'batch_size': 16, 'seq_len': 128}]]
+        result = model_log_utils._extract_metadata_from_raw_data(metadata_list)
+        assert result == {'batch_size': 16, 'seq_len': 128}
+
+    def test_extract_from_dict(self):
+        """Test extracting metadata from direct dict format."""
+        metadata_dict = {'num_steps': 100}
+        result = model_log_utils._extract_metadata_from_raw_data(metadata_dict)
+        assert result == {'num_steps': 100}
+
+    def test_extract_returns_none_for_invalid(self):
+        """Test returns None for invalid metadata format."""
+        result = model_log_utils._extract_metadata_from_raw_data([])
+        assert result is None
+
+
+class TestCompareCheckpointValues:
+    """Tests for _compare_checkpoint_values function."""
+    def test_length_mismatch(self):
+        """Test detection of checkpoint count mismatch."""
+        logger = Mock()
+        curr_run = [1.0, 2.0, 3.0]
+        ref_run = [1.0, 2.0]
+
+        mismatches = model_log_utils._compare_checkpoint_values('loss', 0, curr_run, ref_run, logger)
+
+        assert len(mismatches) == 1
+        assert 'checkpoint count mismatch (3 vs 2)' in mismatches[0]
+
+    def test_value_mismatch_numeric(self):
+        """Test detection of numeric value mismatch with diff calculation."""
+        logger = Mock()
+        curr_run = [1.0, 2.5, 3.0]
+        ref_run = [1.0, 2.0, 3.0]
+
+        mismatches = model_log_utils._compare_checkpoint_values('loss', 0, curr_run, ref_run, logger)
+
+        assert len(mismatches) == 1
+        assert 'checkpoint 1' in mismatches[0]
+        assert 'diff: 0.5' in mismatches[0]
+
+
+class TestApplyMetadataOverrides:
+    """Tests for apply_metadata_overrides function."""
+    def test_no_metadata_provided(self):
+        """Test warning when no metadata is provided."""
+        logger = Mock()
+        args = Mock()
+
+        count = model_log_utils.apply_metadata_overrides(args, None, logger)
+
+        assert count == 0
+        logger.warning.assert_called_once_with('No metadata provided for override')
+
+    def test_precision_override_from_string(self):
+        """Test precision override converts string to Precision enum list."""
+        from superbench.benchmarks.context import Precision
+
+        logger = Mock()
+        args = Mock()
+        args.batch_size = 32
+        args.precision = [Precision.FLOAT16]
+
+        ref_metadata = {'batch_size': 32, 'precision': 'float32'}
+
+        count = model_log_utils.apply_metadata_overrides(args, ref_metadata, logger)
+
+        # Should override precision from string 'float32' to [Precision.FLOAT32]
+        assert count == 1
+        assert isinstance(args.precision, list)
+        assert args.precision[0] == Precision.FLOAT32
+
+    def test_precision_override_from_list(self):
+        """Test precision override handles list of strings."""
+        from superbench.benchmarks.context import Precision
+
+        logger = Mock()
+        args = Mock()
+        args.precision = [Precision.FLOAT16]
+
+        ref_metadata = {'precision': ['float32', 'float16']}
+
+        count = model_log_utils.apply_metadata_overrides(args, ref_metadata, logger)
+
+        assert count == 1
+        assert isinstance(args.precision, list)
+        assert len(args.precision) == 2
+        assert args.precision[0] == Precision.FLOAT32
+        assert args.precision[1] == Precision.FLOAT16
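
Illustrative sketch, not part of the patch: the tests above assume that results-summary.json nests raw_data per benchmark and stores the deterministic fingerprints under keys containing deterministic_loss, deterministic_act_mean, and deterministic_step. A minimal reader for that layout could look like the following; the script name, helper name, and the example file and benchmark names are hypothetical, only the structure asserted in the tests is assumed.

# sketch_read_fingerprints.py - illustrative only, not part of this patch.
import json


def read_deterministic_fingerprints(summary_path, benchmark_name):
    """Return the deterministic_* metric series stored for one benchmark."""
    with open(summary_path) as f:
        summary = json.load(f)
    # results-summary.json nests raw_data per benchmark, e.g. raw_data['pytorch-resnet18']
    raw_data = summary.get('raw_data', {}).get(benchmark_name, {})
    # Keys may carry a rank suffix, so match on the substring as the tests do
    return {key: values for key, values in raw_data.items() if 'deterministic_' in key}


if __name__ == '__main__':
    # Example: inspect the reference produced by a run with --enable-determinism
    fingerprints = read_deterministic_fingerprints('results-summary.json', 'pytorch-resnet18')
    for key, values in fingerprints.items():
        print(key, values)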