From 5e98d645c2d09d624efca7e7e055ef5bfd289ab3 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 09:05:56 +0000
Subject: [PATCH 01/42] feat: Add TraceLens integration for trace analysis with
 MLflow upload

- Add TraceLens trace analysis report generation (XLSX, CSV formats)
- Add mlflow_upload_tracelens_report config option (default: false)
- Add mlflow_tracelens_ranks, mlflow_tracelens_max_reports options
- Add mlflow_tracelens_output_format option (all, xlsx, csv)
- Auto-install TraceLens from GitHub if not present
- Upload analysis reports to MLflow artifacts/trace_analysis/
---
 .../backends/megatron/training/global_vars.py |  57 ++
 .../megatron/training/mlflow_artifacts.py     | 725 ++++++++++++++++++
 .../megatron/primus_megatron_module.yaml      |   6 +
 primus/modules/trainer/megatron/trainer.py    |  12 +
 4 files changed, 800 insertions(+)
 create mode 100644 primus/backends/megatron/training/mlflow_artifacts.py

diff --git a/primus/backends/megatron/training/global_vars.py b/primus/backends/megatron/training/global_vars.py
index b23016d46..6685d2036 100644
--- a/primus/backends/megatron/training/global_vars.py
+++ b/primus/backends/megatron/training/global_vars.py
@@ -5,9 +5,12 @@
 # See LICENSE for license information.
 ###############################################################################
 
+from typing import List, Optional
 
 from primus.modules.module_utils import debug_rank_0
 
+from .mlflow_artifacts import upload_artifacts_to_mlflow
+
 _GLOBAL_ARGS = None
 _GLOBAL_MLFLOW_WRITER = None
 
@@ -85,3 +88,57 @@ def _ensure_var_is_not_initialized(var, name):
 def destroy_global_vars():
     global _GLOBAL_ARGS
     _GLOBAL_ARGS = None
+
+
+def upload_mlflow_artifacts(
+    tensorboard_dir: Optional[str] = None,
+    exp_root_path: Optional[str] = None,
+    upload_traces: bool = True,
+    upload_logs: bool = True,
+    upload_tracelens_report: bool = False,
+    tracelens_ranks: Optional[List[int]] = None,
+    tracelens_max_reports: Optional[int] = None,
+    tracelens_output_format: str = "all",
+) -> Optional[dict]:
+    """
+    Upload trace files, log files, and TraceLens reports to MLflow as artifacts.
+
+    This function should be called at the end of training to upload all
+    artifacts to MLflow. Only the rank that initialized MLflow (last rank)
+    should call this to avoid duplicate uploads.
+
+    MLflow Artifact Structure:
+        artifacts/
+        ├── traces/              # PyTorch profiler trace files
+        ├── logs/                # Training log files
+        └── trace_analysis/      # TraceLens analysis reports
+
+    Args:
+        tensorboard_dir: Path to tensorboard directory with trace files
+        exp_root_path: Root experiment path for log files
+        upload_traces: Whether to upload trace files (default: True)
+        upload_logs: Whether to upload log files (default: True)
+        upload_tracelens_report: Whether to generate and upload TraceLens reports
+        tracelens_ranks: List of ranks to analyze with TraceLens
+                        (None = all, [0] = rank 0 only)
+        tracelens_max_reports: Maximum number of TraceLens reports to generate
+        tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+
+    Returns:
+        Dictionary with counts of uploaded files, or None if MLflow is not enabled
+    """
+    mlflow_writer = get_mlflow_writer()
+    if mlflow_writer is None:
+        return None
+
+    return upload_artifacts_to_mlflow(
+        mlflow_writer=mlflow_writer,
+        tensorboard_dir=tensorboard_dir,
+        exp_root_path=exp_root_path,
+        upload_traces=upload_traces,
+        upload_logs=upload_logs,
+        upload_tracelens_report=upload_tracelens_report,
+        tracelens_ranks=tracelens_ranks,
+        tracelens_max_reports=tracelens_max_reports,
+        tracelens_output_format=tracelens_output_format,
+    )
diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
new file mode 100644
index 000000000..dbeeb789c
--- /dev/null
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -0,0 +1,725 @@
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+
+"""
+MLflow Artifact Logging Utilities with TraceLens Integration
+
+This module provides functions to upload trace files, log files, and
+TraceLens analysis reports to MLflow when MLflow tracking is enabled.
+
+Features:
+- Upload profiler trace files from all profiled ranks (including multi-node)
+- Upload log files from all levels and all ranks
+- Generate and upload TraceLens trace analysis reports
+- Supports both local and distributed training scenarios
+
+MLflow Artifact Structure:
+    artifacts/
+    ├── traces/              # PyTorch profiler trace files
+    │   ├── rank_0_step_2.json.gz
+    │   └── ...
+    ├── logs/                # Training log files
+    │   └── log_mp_pretrain.txt
+    └── trace_analysis/      # TraceLens analysis reports
+        ├── rank_0_analysis.xlsx   # Multi-tab Excel (default)
+        └── ...
+
+TraceLens Report Formats:
+    - xlsx: Multi-tab Excel with sections for kernels, memory, communication, etc.
+    - csv:  Single flat file with operation summary
+    - html: Interactive HTML report
+"""
+
+import glob
+import os
+import subprocess
+import sys
+from typing import List, Optional
+
+from primus.modules.module_utils import log_rank_0, warning_rank_0
+
+
+def _get_all_trace_files(tensorboard_dir: str) -> list:
+    """
+    Find all profiler trace files in the tensorboard directory.
+
+    Trace files are typically named like:
+    - primus-megatron-exp[...]-rank[0].*.json
+    - primus-megatron-exp[...]-rank[0].*.json.gz
+
+    Args:
+        tensorboard_dir: Path to the tensorboard directory containing trace files
+
+    Returns:
+        List of paths to trace files
+    """
+    if not tensorboard_dir or not os.path.exists(tensorboard_dir):
+        return []
+
+    trace_files = []
+    # Look for JSON trace files (both compressed and uncompressed)
+    patterns = ["*.json", "*.json.gz", "*.pt.trace.json", "*.pt.trace.json.gz"]
+    for pattern in patterns:
+        trace_files.extend(glob.glob(os.path.join(tensorboard_dir, pattern)))
+        trace_files.extend(glob.glob(os.path.join(tensorboard_dir, "**", pattern), recursive=True))
+
+    # Remove duplicates while preserving order
+    seen = set()
+    unique_files = []
+    for f in trace_files:
+        if f not in seen:
+            seen.add(f)
+            unique_files.append(f)
+
+    return unique_files
+
+
+def _get_all_log_files(exp_root_path: str) -> list:
+    """
+    Find all log files in the experiment logs directory.
+
+    Log files are organized as:
+    - {exp_root_path}/logs/master/master-*.log
+    - {exp_root_path}/logs/{module_name}/rank-{rank}/*.log
+
+    Args:
+        exp_root_path: Root path of the experiment
+
+    Returns:
+        List of paths to log files
+    """
+    if not exp_root_path:
+        return []
+
+    logs_dir = os.path.join(exp_root_path, "logs")
+    if not os.path.exists(logs_dir):
+        return []
+
+    log_files = []
+    # Find all .log files recursively
+    log_files.extend(glob.glob(os.path.join(logs_dir, "**", "*.log"), recursive=True))
+
+    return log_files
+
+
+def upload_trace_files_to_mlflow(
+    mlflow_writer,
+    tensorboard_dir: str,
+    artifact_path: str = "traces",
+) -> int:
+    """
+    Upload all profiler trace files to MLflow as artifacts.
+
+    This function collects trace files from the tensorboard directory and
+    uploads them to MLflow. In distributed settings, only rank 0 (or the
+    last rank where MLflow writer is initialized) should call this.
+
+    Args:
+        mlflow_writer: The MLflow module instance (from get_mlflow_writer())
+        tensorboard_dir: Path to the tensorboard directory containing trace files
+        artifact_path: MLflow artifact subdirectory for trace files
+
+    Returns:
+        Number of trace files uploaded
+    """
+    if mlflow_writer is None:
+        return 0
+
+    log_rank_0(f"[MLflow] Searching for trace files in: {tensorboard_dir}")
+    trace_files = _get_all_trace_files(tensorboard_dir)
+    if len(trace_files) > 5:
+        log_rank_0(f"[MLflow] Found {len(trace_files)} trace files: {trace_files[:5]}...")
+    else:
+        log_rank_0(f"[MLflow] Found {len(trace_files)} trace files: {trace_files}")
+
+    if not trace_files:
+        log_rank_0("[MLflow] No trace files found to upload")
+        return 0
+
+    uploaded_count = 0
+    for trace_file in trace_files:
+        try:
+            # Get relative path from tensorboard_dir for artifact organization
+            rel_path = os.path.relpath(trace_file, tensorboard_dir)
+            # Determine artifact subdirectory based on file location
+            artifact_subpath = (
+                os.path.join(artifact_path, os.path.dirname(rel_path))
+                if os.path.dirname(rel_path)
+                else artifact_path
+            )
+
+            mlflow_writer.log_artifact(trace_file, artifact_path=artifact_subpath)
+            uploaded_count += 1
+            log_rank_0(f"[MLflow] Uploaded trace file: {os.path.basename(trace_file)}")
+        except Exception as e:
+            warning_rank_0(f"[MLflow] Failed to upload trace file {trace_file}: {e}")
+
+    log_rank_0(f"[MLflow] Uploaded {uploaded_count} trace files to '{artifact_path}'")
+    return uploaded_count
+
+
+def upload_log_files_to_mlflow(
+    mlflow_writer,
+    exp_root_path: str,
+    artifact_path: str = "logs",
+) -> int:
+    """
+    Upload all log files to MLflow as artifacts.
+
+    This function collects log files from all ranks and all log levels
+    and uploads them to MLflow. The directory structure is preserved
+    in the artifact path.
+
+    Args:
+        mlflow_writer: The MLflow module instance (from get_mlflow_writer())
+        exp_root_path: Root path of the experiment
+        artifact_path: MLflow artifact subdirectory for log files
+
+    Returns:
+        Number of log files uploaded
+    """
+    if mlflow_writer is None:
+        return 0
+
+    log_files = _get_all_log_files(exp_root_path)
+
+    if not log_files:
+        log_rank_0("[MLflow] No log files found to upload")
+        return 0
+
+    logs_base_dir = os.path.join(exp_root_path, "logs")
+    uploaded_count = 0
+
+    for log_file in log_files:
+        try:
+            # Preserve directory structure relative to logs base directory
+            rel_path = os.path.relpath(log_file, logs_base_dir)
+            artifact_subpath = (
+                os.path.join(artifact_path, os.path.dirname(rel_path))
+                if os.path.dirname(rel_path)
+                else artifact_path
+            )
+
+            mlflow_writer.log_artifact(log_file, artifact_path=artifact_subpath)
+            uploaded_count += 1
+        except Exception as e:
+            warning_rank_0(f"[MLflow] Failed to upload log file {log_file}: {e}")
+
+    log_rank_0(f"[MLflow] Uploaded {uploaded_count} log files to '{artifact_path}'")
+    return uploaded_count
+
+
+# =============================================================================
+# TraceLens Integration
+# =============================================================================
+
+
+def _ensure_tracelens_installed() -> bool:
+    """
+    Ensure TraceLens is installed. Install it if not present.
+
+    TraceLens is available from GitHub: https://github.com/AMD-AGI/TraceLens
+
+    Returns:
+        True if TraceLens is available, False otherwise
+    """
+    try:
+        import TraceLens  # noqa: F401
+
+        log_rank_0("[TraceLens] TraceLens is already installed")
+        return True
+    except ImportError:
+        log_rank_0("[TraceLens] TraceLens not found, attempting to install from GitHub...")
+        try:
+            # TraceLens is on GitHub, not PyPI
+            subprocess.check_call(
+                [
+                    sys.executable,
+                    "-m",
+                    "pip",
+                    "install",
+                    "git+https://github.com/AMD-AGI/TraceLens.git",
+                    "-q",
+                ],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+            log_rank_0("[TraceLens] Successfully installed TraceLens from GitHub")
+            return True
+        except subprocess.CalledProcessError as e:
+            warning_rank_0(f"[TraceLens] Failed to install TraceLens: {e}")
+            return False
+
+
+def _extract_rank_from_filename(filename: str) -> Optional[int]:
+    """
+    Extract rank number from trace filename.
+
+    Expected patterns:
+    - rank_0_step_2.json.gz
+    - primus-megatron-exp-rank[0].*.json
+
+    Args:
+        filename: The trace filename
+
+    Returns:
+        Rank number or None if not found
+    """
+    import re
+
+    # Try each known naming pattern in order: rank_N_, rank[N], -rankN., _rankN.
+    patterns = [
+        r"rank_(\d+)_",
+        r"rank\[(\d+)\]",
+        r"-rank(\d+)\.",
+        r"_rank(\d+)\.",
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, filename)
+        if match:
+            return int(match.group(1))
+
+    return None
+
+
+def _filter_traces_by_rank(trace_files: List[str], ranks: List[int]) -> List[str]:
+    """
+    Filter trace files to only include specified ranks.
+
+    Args:
+        trace_files: List of trace file paths
+        ranks: List of rank numbers to include
+
+    Returns:
+        Filtered list of trace files
+    """
+    if not ranks:
+        return trace_files
+
+    filtered = []
+    for trace_file in trace_files:
+        rank = _extract_rank_from_filename(os.path.basename(trace_file))
+        if rank is not None and rank in ranks:
+            filtered.append(trace_file)
+
+    return filtered
+
+
+def generate_tracelens_report(
+    trace_file: str,
+    output_dir: str,
+    report_name: Optional[str] = None,
+    output_format: str = "all",
+) -> List[str]:
+    """
+    Generate a TraceLens analysis report for a single trace file.
+
+    Args:
+        trace_file: Path to the PyTorch profiler trace file (JSON/JSON.GZ)
+        output_dir: Directory to save the report
+        report_name: Optional custom name for the report (base name for CSVs)
+        output_format: Output format:
+                      - "all" (default): Both XLSX and CSV files
+                      - "xlsx": Single multi-tab Excel file with detailed analysis
+                      - "csv": Multiple CSV files (kernels, memory, communication, etc.)
+                      - "html": Not yet supported; falls back to the XLSX report
+
+    Returns:
+        List of paths to generated report files
+    """
+    if not os.path.exists(trace_file):
+        warning_rank_0(f"[TraceLens] Trace file not found: {trace_file}")
+        return []
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Generate base name from trace filename if not provided
+    if report_name is None:
+        base_name = os.path.basename(trace_file)
+        # Remove extensions like .json.gz
+        for trace_ext in [".json.gz", ".json", ".pt.trace.json.gz", ".pt.trace.json"]:
+            if base_name.endswith(trace_ext):
+                base_name = base_name[: -len(trace_ext)]
+                break
+        report_name = base_name
+
+    try:
+        # Try using TraceLens Python API directly
+        from TraceLens.Reporting import generate_perf_report_pytorch
+
+        generated_files = []
+
+        if output_format in ("all", "xlsx"):
+            # XLSX: Single file with multiple tabs
+            xlsx_path = os.path.join(output_dir, f"{report_name}_analysis.xlsx")
+            dfs = generate_perf_report_pytorch(trace_file, output_xlsx_path=xlsx_path)
+            if os.path.exists(xlsx_path):
+                log_rank_0(
+                    f"[TraceLens] Generated XLSX report with {len(dfs)} tabs: {os.path.basename(xlsx_path)}"
+                )
+                generated_files.append(xlsx_path)
+
+        if output_format in ("all", "csv"):
+            # CSV: Multiple files in a subdirectory per trace (named after report_name)
+            csv_subdir = os.path.join(output_dir, report_name)
+            os.makedirs(csv_subdir, exist_ok=True)
+            dfs = generate_perf_report_pytorch(trace_file, output_csvs_dir=csv_subdir)
+
+            # Collect all generated CSV files
+            csv_files = glob.glob(os.path.join(csv_subdir, "*.csv"))
+            if csv_files:
+                log_rank_0(f"[TraceLens] Generated {len(csv_files)} CSV files for {report_name}")
+                generated_files.extend(csv_files)
+
+        if output_format == "html":
+            warning_rank_0("[TraceLens] HTML format not yet supported, using xlsx+csv")
+            # Fall through to xlsx
+            xlsx_path = os.path.join(output_dir, f"{report_name}_analysis.xlsx")
+            dfs = generate_perf_report_pytorch(trace_file, output_xlsx_path=xlsx_path)
+            if os.path.exists(xlsx_path):
+                generated_files.append(xlsx_path)
+
+        if generated_files:
+            return generated_files
+
+        warning_rank_0(f"[TraceLens] No output files generated for: {trace_file}")
+        return []
+
+    except ImportError:
+        log_rank_0("[TraceLens] TraceLens not available, using fallback CSV summary")
+        # Fallback to simple CSV summary
+        csv_path = _generate_trace_summary_csv(trace_file, output_dir, f"{report_name}_summary.csv")
+        return [csv_path] if csv_path else []
+
+    except Exception as e:
+        warning_rank_0(f"[TraceLens] Error generating report: {e}")
+        # Fallback to simple CSV summary
+        csv_path = _generate_trace_summary_csv(trace_file, output_dir, f"{report_name}_summary.csv")
+        return [csv_path] if csv_path else []
+
+
+def _generate_trace_summary_csv(
+    trace_file: str,
+    output_dir: str,
+    report_name: str,
+) -> Optional[str]:
+    """
+    Generate a CSV summary from a PyTorch profiler trace file.
+
+    This is a fallback when TraceLens is not available.
+    Extracts key metrics from the trace JSON and writes to CSV.
+
+    Args:
+        trace_file: Path to the trace file
+        output_dir: Output directory
+        report_name: Name for the CSV file
+
+    Returns:
+        Path to generated CSV or None if failed
+    """
+    import csv
+    import gzip
+    import json
+
+    try:
+        # Load trace file
+        if trace_file.endswith(".gz"):
+            with gzip.open(trace_file, "rt", encoding="utf-8") as f:
+                trace_data = json.load(f)
+        else:
+            with open(trace_file, "r", encoding="utf-8") as f:
+                trace_data = json.load(f)
+
+        # Extract events from trace
+        events = trace_data.get("traceEvents", [])
+        if not events:
+            warning_rank_0(f"[TraceLens] No events found in trace: {trace_file}")
+            return None
+
+        # Aggregate kernel/operation statistics
+        op_stats = {}
+        for event in events:
+            if event.get("cat") in ["kernel", "gpu_memcpy", "cuda_runtime", "cpu_op"]:
+                name = event.get("name", "unknown")
+                dur = event.get("dur", 0)  # duration in microseconds
+
+                if name not in op_stats:
+                    op_stats[name] = {"count": 0, "total_us": 0, "min_us": float("inf"), "max_us": 0}
+
+                op_stats[name]["count"] += 1
+                op_stats[name]["total_us"] += dur
+                op_stats[name]["min_us"] = min(op_stats[name]["min_us"], dur)
+                op_stats[name]["max_us"] = max(op_stats[name]["max_us"], dur)
+
+        if not op_stats:
+            warning_rank_0(f"[TraceLens] No kernel/op events found in trace: {trace_file}")
+            return None
+
+        # Sort by total time descending
+        sorted_ops = sorted(op_stats.items(), key=lambda x: x[1]["total_us"], reverse=True)
+
+        # Write CSV
+        output_path = os.path.join(output_dir, report_name)
+        with open(output_path, "w", newline="", encoding="utf-8") as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(
+                [
+                    "Operation",
+                    "Count",
+                    "Total Time (ms)",
+                    "Avg Time (ms)",
+                    "Min Time (ms)",
+                    "Max Time (ms)",
+                    "% of Total",
+                ]
+            )
+
+            total_time = sum(stats["total_us"] for _, stats in sorted_ops)
+            for name, stats in sorted_ops:
+                avg_us = stats["total_us"] / stats["count"] if stats["count"] > 0 else 0
+                pct = (stats["total_us"] / total_time * 100) if total_time > 0 else 0
+                writer.writerow(
+                    [
+                        name,
+                        stats["count"],
+                        f"{stats['total_us'] / 1000:.3f}",
+                        f"{avg_us / 1000:.3f}",
+                        f"{stats['min_us'] / 1000:.3f}",
+                        f"{stats['max_us'] / 1000:.3f}",
+                        f"{pct:.2f}",
+                    ]
+                )
+
+        log_rank_0(f"[TraceLens] Generated CSV summary: {report_name} ({len(sorted_ops)} operations)")
+        return output_path
+
+    except json.JSONDecodeError as e:
+        warning_rank_0(f"[TraceLens] Failed to parse trace JSON: {e}")
+        return None
+    except Exception as e:
+        warning_rank_0(f"[TraceLens] Error generating CSV summary: {e}")
+        return None
+
+
+def generate_tracelens_reports(
+    tensorboard_dir: str,
+    output_dir: str,
+    ranks: Optional[List[int]] = None,
+    max_reports: Optional[int] = None,
+    output_format: str = "all",
+) -> List[str]:
+    """
+    Generate TraceLens analysis reports for trace files.
+
+    Args:
+        tensorboard_dir: Directory containing PyTorch profiler trace files
+        output_dir: Directory to save the generated reports
+        ranks: List of ranks to generate reports for (None = all ranks)
+        max_reports: Maximum number of reports to generate (None = unlimited)
+        output_format: Output format:
+                      - "all" (default): Both XLSX and CSV files
+                      - "xlsx": Multi-tab Excel with detailed analysis
+                      - "csv": Multiple CSV files per rank (kernels, memory, comm, etc.)
+
+    Returns:
+        List of paths to all generated report files
+    """
+    # Try to install tracelens, but continue with fallback if not available
+    _ensure_tracelens_installed()
+
+    trace_files = _get_all_trace_files(tensorboard_dir)
+    if not trace_files:
+        log_rank_0("[TraceLens] No trace files found for analysis")
+        return []
+
+    # Filter by ranks if specified
+    if ranks is not None:
+        trace_files = _filter_traces_by_rank(trace_files, ranks)
+        log_rank_0(f"[TraceLens] Filtered to {len(trace_files)} trace files for ranks: {ranks}")
+
+    # Limit number of reports if specified
+    if max_reports is not None and len(trace_files) > max_reports:
+        trace_files = trace_files[:max_reports]
+        log_rank_0(f"[TraceLens] Limited to {max_reports} reports")
+
+    log_rank_0(
+        f"[TraceLens] Generating {output_format.upper()} reports for {len(trace_files)} trace files..."
+    )
+
+    generated_reports = []
+    for trace_file in trace_files:
+        # generate_tracelens_report now returns a list of files
+        report_paths = generate_tracelens_report(trace_file, output_dir, output_format=output_format)
+        generated_reports.extend(report_paths)
+
+    log_rank_0(f"[TraceLens] Generated {len(generated_reports)} report files from {len(trace_files)} traces")
+    return generated_reports
+
+
+def upload_tracelens_reports_to_mlflow(
+    mlflow_writer,
+    tensorboard_dir: str,
+    exp_root_path: str,
+    ranks: Optional[List[int]] = None,
+    max_reports: Optional[int] = None,
+    output_format: str = "all",
+    artifact_path: str = "trace_analysis",
+) -> int:
+    """
+    Generate TraceLens reports and upload them to MLflow.
+
+    This function:
+    1. Finds PyTorch profiler trace files
+    2. Generates TraceLens analysis reports for specified ranks
+    3. Uploads the reports to MLflow under the trace_analysis artifact path
+
+    Args:
+        mlflow_writer: The MLflow module instance (from get_mlflow_writer())
+        tensorboard_dir: Directory containing PyTorch profiler trace files
+        exp_root_path: Root path of the experiment (for saving reports)
+        ranks: List of ranks to analyze (None = all ranks, [0] = rank 0 only)
+        max_reports: Maximum number of reports to generate
+        output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+        artifact_path: MLflow artifact subdirectory for reports
+
+    Returns:
+        Number of reports uploaded to MLflow
+    """
+    if mlflow_writer is None:
+        log_rank_0("[TraceLens] MLflow writer not available, skipping report upload")
+        return 0
+
+    # Create output directory for reports
+    reports_dir = os.path.join(exp_root_path, "tracelens_reports")
+    os.makedirs(reports_dir, exist_ok=True)
+
+    log_rank_0(f"[TraceLens] Generating reports from traces in: {tensorboard_dir}")
+    log_rank_0(f"[TraceLens] Reports will be saved to: {reports_dir}")
+    if ranks:
+        log_rank_0(f"[TraceLens] Analyzing ranks: {ranks}")
+    if max_reports:
+        log_rank_0(f"[TraceLens] Max reports: {max_reports}")
+
+    # Generate reports
+    reports = generate_tracelens_reports(
+        tensorboard_dir=tensorboard_dir,
+        output_dir=reports_dir,
+        ranks=ranks,
+        max_reports=max_reports,
+        output_format=output_format,
+    )
+
+    if not reports:
+        log_rank_0("[TraceLens] No reports generated, nothing to upload")
+        return 0
+
+    # Upload reports to MLflow
+    uploaded_count = 0
+    for report_path in reports:
+        try:
+            mlflow_writer.log_artifact(report_path, artifact_path=artifact_path)
+            uploaded_count += 1
+            log_rank_0(f"[MLflow] Uploaded TraceLens report: {os.path.basename(report_path)}")
+        except Exception as e:
+            warning_rank_0(f"[MLflow] Failed to upload report {report_path}: {e}")
+
+    log_rank_0(f"[TraceLens] Uploaded {uploaded_count} reports to '{artifact_path}'")
+    return uploaded_count
+
+
+# =============================================================================
+# Main Entry Point
+# =============================================================================
+
+
+def upload_artifacts_to_mlflow(
+    mlflow_writer,
+    tensorboard_dir: Optional[str] = None,
+    exp_root_path: Optional[str] = None,
+    upload_traces: bool = True,
+    upload_logs: bool = True,
+    upload_tracelens_report: bool = False,
+    tracelens_ranks: Optional[List[int]] = None,
+    tracelens_max_reports: Optional[int] = None,
+    tracelens_output_format: str = "all",
+) -> dict:
+    """
+    Upload all artifacts (trace files, log files, TraceLens reports) to MLflow.
+
+    This is the main entry point for uploading artifacts to MLflow.
+    It handles:
+    - Trace files from PyTorch profiler
+    - Log files from training
+    - TraceLens analysis reports (optional)
+
+    MLflow Artifact Structure:
+        artifacts/
+        ├── traces/              # PyTorch profiler trace files
+        ├── logs/                # Training log files
+        └── trace_analysis/      # TraceLens analysis reports
+
+    Args:
+        mlflow_writer: The MLflow module instance (from get_mlflow_writer())
+        tensorboard_dir: Path to the tensorboard directory containing trace files
+        exp_root_path: Root path of the experiment for log files
+        upload_traces: Whether to upload trace files
+        upload_logs: Whether to upload log files
+        upload_tracelens_report: Whether to generate and upload TraceLens reports
+        tracelens_ranks: List of ranks to generate TraceLens reports for
+                        (None = all ranks, [0] = rank 0 only)
+        tracelens_max_reports: Maximum number of TraceLens reports to generate
+        tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+
+    Returns:
+        Dictionary with counts of uploaded files:
+        {
+            "traces": <number of trace files uploaded>,
+            "logs": <number of log files uploaded>,
+            "tracelens_reports": <number of TraceLens reports uploaded>
+        }
+    """
+    if mlflow_writer is None:
+        log_rank_0("[MLflow] MLflow writer not available, skipping artifact upload")
+        return {"traces": 0, "logs": 0, "tracelens_reports": 0}
+
+    log_rank_0("[MLflow] Starting artifact upload to MLflow...")
+    log_rank_0(f"[MLflow] tensorboard_dir: {tensorboard_dir}")
+    log_rank_0(f"[MLflow] exp_root_path: {exp_root_path}")
+    log_rank_0(f"[MLflow] upload_traces: {upload_traces}, upload_logs: {upload_logs}")
+    log_rank_0(f"[MLflow] upload_tracelens_report: {upload_tracelens_report}")
+
+    result = {"traces": 0, "logs": 0, "tracelens_reports": 0}
+
+    # Upload trace files
+    if upload_traces and tensorboard_dir:
+        result["traces"] = upload_trace_files_to_mlflow(
+            mlflow_writer, tensorboard_dir, artifact_path="traces"
+        )
+
+    # Upload log files
+    if upload_logs and exp_root_path:
+        result["logs"] = upload_log_files_to_mlflow(mlflow_writer, exp_root_path, artifact_path="logs")
+
+    # Generate and upload TraceLens reports
+    if upload_tracelens_report and tensorboard_dir and exp_root_path:
+        result["tracelens_reports"] = upload_tracelens_reports_to_mlflow(
+            mlflow_writer=mlflow_writer,
+            tensorboard_dir=tensorboard_dir,
+            exp_root_path=exp_root_path,
+            ranks=tracelens_ranks,
+            max_reports=tracelens_max_reports,
+            output_format=tracelens_output_format,
+            artifact_path="trace_analysis",
+        )
+
+    log_rank_0(
+        f"[MLflow] Artifact upload complete: "
+        f"{result['traces']} traces, {result['logs']} logs, "
+        f"{result['tracelens_reports']} TraceLens reports"
+    )
+
+    return result
diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index 0ec3a22b0..9a9a0d20f 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -5,6 +5,12 @@ disable_wandb: true
 disable_mlflow: true
 mlflow_run_name: null
 mlflow_experiment_name: null
+mlflow_upload_traces: true           # Upload profiler trace files to MLflow
+mlflow_upload_logs: true             # Upload training log files to MLflow
+mlflow_upload_tracelens_report: false  # Generate and upload TraceLens analysis reports
+mlflow_tracelens_ranks: null         # List of ranks to analyze with TraceLens (null = all)
+mlflow_tracelens_max_reports: null   # Max number of TraceLens reports (null = unlimited)
+mlflow_tracelens_output_format: all  # TraceLens report format: all (xlsx+csv), xlsx, or csv
 disable_compile_dependencies: true
 # NOTE:
 # - If `use_rocm_mem_info = True`, ROCm memory information will be collected
diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index 9758929da..603ae5520 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -145,6 +145,7 @@
 from primus.backends.megatron.training.global_vars import (
     get_mlflow_writer,
     set_primus_global_variables,
+    upload_mlflow_artifacts,
 )
 from primus.backends.megatron.training.tokenizer.tokenizer import build_tokenizer
 from primus.core.utils import checker, file_utils
@@ -1611,6 +1612,17 @@ def run(self, *args, **kwargs):
 
         mlflow_writer = get_mlflow_writer()
         if mlflow_writer:
+            # Upload artifacts to MLflow before ending the run
+            upload_mlflow_artifacts(
+                tensorboard_dir=args.tensorboard_dir,
+                exp_root_path=self.exp_root_path,
+                upload_traces=getattr(args, "mlflow_upload_traces", True),
+                upload_logs=getattr(args, "mlflow_upload_logs", True),
+                upload_tracelens_report=getattr(args, "mlflow_upload_tracelens_report", False),
+                tracelens_ranks=getattr(args, "mlflow_tracelens_ranks", None),
+                tracelens_max_reports=getattr(args, "mlflow_tracelens_max_reports", None),
+                tracelens_output_format=getattr(args, "mlflow_tracelens_output_format", "all"),
+            )
             mlflow_writer.end_run()
 
         one_logger and one_logger.log_metrics({"app_finish_time": one_logger_utils.get_timestamp_in_ms()})

From bbfa9d3527a568a2d5c2c564ab77fd8c42cc0503 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 09:23:04 +0000
Subject: [PATCH 02/42] docs: Fix TraceLens CSV format description (multiple
 files per rank)

---
 primus/backends/megatron/training/mlflow_artifacts.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index dbeeb789c..8ba656083 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -29,8 +29,8 @@
 
 TraceLens Report Formats:
     - xlsx: Multi-tab Excel with sections for kernels, memory, communication, etc.
-    - csv:  Single flat file with operation summary
-    - html: Interactive HTML report
+    - csv:  Multiple CSV files per rank (kernels, memory, communication, etc.)
+    - all:  Both xlsx and csv files (default)
 """
 
 import glob

From 07591229e5daa5916c883199706f592997b67fa4 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 09:25:11 +0000
Subject: [PATCH 03/42] fix: Remove unsupported HTML format option from
 TraceLens

---
 primus/backends/megatron/training/mlflow_artifacts.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 8ba656083..cfb430eb6 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -326,7 +326,6 @@ def generate_tracelens_report(
                       - "all" (default): Both XLSX and CSV files
                       - "xlsx": Single multi-tab Excel file with detailed analysis
                       - "csv": Multiple CSV files (kernels, memory, communication, etc.)
-                      - "html": Interactive HTML report
 
     Returns:
         List of paths to generated report files
@@ -375,14 +374,6 @@ def generate_tracelens_report(
                 log_rank_0(f"[TraceLens] Generated {len(csv_files)} CSV files for {report_name}")
                 generated_files.extend(csv_files)
 
-        if output_format == "html":
-            warning_rank_0("[TraceLens] HTML format not yet supported, using xlsx+csv")
-            # Fall through to xlsx
-            xlsx_path = os.path.join(output_dir, f"{report_name}_analysis.xlsx")
-            dfs = generate_perf_report_pytorch(trace_file, output_xlsx_path=xlsx_path)
-            if os.path.exists(xlsx_path):
-                generated_files.append(xlsx_path)
-
         if generated_files:
             return generated_files
 

From 4c908e5e73bcbcdf9534e03e2e1cc8e46a2b82a5 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 09:31:47 +0000
Subject: [PATCH 04/42] fix: Use specific trace file patterns to avoid matching
 unrelated JSON files

---
 primus/backends/megatron/training/mlflow_artifacts.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index cfb430eb6..3f4f6fd32 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -47,8 +47,8 @@ def _get_all_trace_files(tensorboard_dir: str) -> list:
     Find all profiler trace files in the tensorboard directory.
 
     Trace files are typically named like:
-    - primus-megatron-exp[...]-rank[0].*.json
-    - primus-megatron-exp[...]-rank[0].*.json.gz
+    - *.pt.trace.json
+    - *.pt.trace.json.gz
 
     Args:
         tensorboard_dir: Path to the tensorboard directory containing trace files
@@ -60,8 +60,9 @@ def _get_all_trace_files(tensorboard_dir: str) -> list:
         return []
 
     trace_files = []
-    # Look for JSON trace files (both compressed and uncompressed)
-    patterns = ["*.json", "*.json.gz", "*.pt.trace.json", "*.pt.trace.json.gz"]
+    # Look for PyTorch profiler trace files (both compressed and uncompressed)
+    # Using specific patterns to avoid matching unrelated JSON files
+    patterns = ["*.pt.trace.json", "*.pt.trace.json.gz"]
     for pattern in patterns:
         trace_files.extend(glob.glob(os.path.join(tensorboard_dir, pattern)))
         trace_files.extend(glob.glob(os.path.join(tensorboard_dir, "**", pattern), recursive=True))

From 2861bdf88f57ad455bd91d9bd0cdc73105a3505e Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 10:28:46 +0000
Subject: [PATCH 05/42] docs: Clarify MLflow upload defaults are opt-out when
 MLflow enabled

---
 primus/configs/modules/megatron/primus_megatron_module.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index 9a9a0d20f..dbbe99cb1 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -5,6 +5,8 @@ disable_wandb: true
 disable_mlflow: true
 mlflow_run_name: null
 mlflow_experiment_name: null
+# NOTE: When disable_mlflow=false, traces and logs are uploaded by default.
+# Set these to false if you only want metrics/params logged to MLflow.
 mlflow_upload_traces: true           # Upload profiler trace files to MLflow
 mlflow_upload_logs: true             # Upload training log files to MLflow
 mlflow_upload_tracelens_report: false  # Generate and upload TraceLens analysis reports

From 44d479f454f41455ec47057d9ae31806ffe09b3c Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 11:26:50 +0000
Subject: [PATCH 06/42] fix: normalize tracelens_ranks parameter to handle
 string input from config parser

Addresses Copilot review comment: if mlflow_tracelens_ranks is configured
as a string in YAML (e.g., '[0,8]' instead of [0, 8]), the code would
receive a string instead of a list, causing _filter_traces_by_rank to
silently filter out all trace files.

Added ast.literal_eval() conversion in:
- generate_tracelens_reports()
- upload_tracelens_reports_to_mlflow()

Falls back to None (process all ranks) with a warning if parsing fails
or if the parsed value is not a list.
---
 .../megatron/training/mlflow_artifacts.py     | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 3f4f6fd32..95799974c 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -523,6 +523,21 @@ def generate_tracelens_reports(
     # Try to install tracelens, but continue with fallback if not available
     _ensure_tracelens_installed()
 
+    # Normalize ranks parameter: handle string input from config parser
+    if ranks is not None and isinstance(ranks, str):
+        import ast
+
+        try:
+            ranks = ast.literal_eval(ranks)
+            if not isinstance(ranks, list):
+                log_rank_0(
+                    f"[TraceLens] Warning: ranks evaluated to {type(ranks).__name__}, expected list. Using None."
+                )
+                ranks = None
+        except (ValueError, SyntaxError) as e:
+            log_rank_0(f"[TraceLens] Warning: Failed to parse ranks '{ranks}': {e}. Using None.")
+            ranks = None
+
     trace_files = _get_all_trace_files(tensorboard_dir)
     if not trace_files:
         log_rank_0("[TraceLens] No trace files found for analysis")
@@ -585,6 +600,21 @@ def upload_tracelens_reports_to_mlflow(
         log_rank_0("[TraceLens] MLflow writer not available, skipping report upload")
         return 0
 
+    # Normalize ranks parameter: handle string input from config parser
+    if ranks is not None and isinstance(ranks, str):
+        import ast
+
+        try:
+            ranks = ast.literal_eval(ranks)
+            if not isinstance(ranks, list):
+                log_rank_0(
+                    f"[TraceLens] Warning: ranks evaluated to {type(ranks).__name__}, expected list. Using None."
+                )
+                ranks = None
+        except (ValueError, SyntaxError) as e:
+            log_rank_0(f"[TraceLens] Warning: Failed to parse ranks '{ranks}': {e}. Using None.")
+            ranks = None
+
     # Create output directory for reports
     reports_dir = os.path.join(exp_root_path, "tracelens_reports")
     os.makedirs(reports_dir, exist_ok=True)

From f34304613aff667b98623cdb7d334d59f20845a0 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 12:38:49 +0000
Subject: [PATCH 07/42] perf: optimize TraceLens report generation to parse
 trace file only once

When output_format='all', previously the trace file was parsed twice:
- Once for XLSX generation
- Once for CSV generation

Now when format is 'all', we call generate_perf_report_pytorch once
with both output_xlsx_path and output_csvs_dir parameters, parsing
the trace file only once and generating both formats from the same data.

This improves performance significantly for the common use case of
generating both report formats.
---
 .../megatron/training/mlflow_artifacts.py     | 32 ++++++++++++++++---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 95799974c..2f7855501 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -353,8 +353,32 @@ def generate_tracelens_report(
 
         generated_files = []
 
-        if output_format in ("all", "xlsx"):
-            # XLSX: Single file with multiple tabs
+        # Optimize for "all" format: parse trace once and generate both outputs
+        if output_format == "all":
+            xlsx_path = os.path.join(output_dir, f"{report_name}_analysis.xlsx")
+            csv_subdir = os.path.join(output_dir, report_name)
+            os.makedirs(csv_subdir, exist_ok=True)
+
+            # Parse trace once and generate both formats
+            dfs = generate_perf_report_pytorch(
+                trace_file, output_xlsx_path=xlsx_path, output_csvs_dir=csv_subdir
+            )
+
+            # Check XLSX output
+            if os.path.exists(xlsx_path):
+                log_rank_0(
+                    f"[TraceLens] Generated XLSX report with {len(dfs)} tabs: {os.path.basename(xlsx_path)}"
+                )
+                generated_files.append(xlsx_path)
+
+            # Check CSV outputs
+            csv_files = glob.glob(os.path.join(csv_subdir, "*.csv"))
+            if csv_files:
+                log_rank_0(f"[TraceLens] Generated {len(csv_files)} CSV files for {report_name}")
+                generated_files.extend(csv_files)
+
+        elif output_format == "xlsx":
+            # XLSX only: Single file with multiple tabs
             xlsx_path = os.path.join(output_dir, f"{report_name}_analysis.xlsx")
             dfs = generate_perf_report_pytorch(trace_file, output_xlsx_path=xlsx_path)
             if os.path.exists(xlsx_path):
@@ -363,8 +387,8 @@ def generate_tracelens_report(
                 )
                 generated_files.append(xlsx_path)
 
-        if output_format in ("all", "csv"):
-            # CSV: Multiple files in a subdirectory per rank
+        elif output_format == "csv":
+            # CSV only: Multiple files in a subdirectory per rank
             csv_subdir = os.path.join(output_dir, report_name)
             os.makedirs(csv_subdir, exist_ok=True)
             dfs = generate_perf_report_pytorch(trace_file, output_csvs_dir=csv_subdir)

From 0ed33dbda7c625ddcf65979e483347fe70e44a37 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 12:55:56 +0000
Subject: [PATCH 08/42] feat: cleanup tracelens_reports directory after upload
 to MLflow

After the TraceLens report upload step completes, the local
tracelens_reports directory is automatically removed to save disk space.

This addresses the issue of temporary directories not being cleaned up
after artifact upload. Successfully uploaded reports remain accessible in
MLflow while local storage is freed. Note that the cleanup runs even if
some report uploads failed (failures are only logged as warnings).

Other directories checked:
- tensorboard_dir: Contains original trace files, NOT temporary
- exp_root_path/logs: Contains original log files, NOT temporary
- tracelens_reports: Processed reports uploaded to MLflow, safe to cleanup
---
 .../megatron/training/mlflow_artifacts.py        | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 2f7855501..0362d0c36 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -607,6 +607,7 @@ def upload_tracelens_reports_to_mlflow(
     1. Finds PyTorch profiler trace files
     2. Generates TraceLens analysis reports for specified ranks
     3. Uploads the reports to MLflow under the trace_analysis artifact path
+    4. Cleans up local report files after successful upload
 
     Args:
         mlflow_writer: The MLflow module instance (from get_mlflow_writer())
@@ -619,6 +620,10 @@ def upload_tracelens_reports_to_mlflow(
 
     Returns:
         Number of reports uploaded to MLflow
+
+    Note:
+        The local tracelens_reports directory is automatically cleaned up after
+        successful upload to save disk space. The reports remain accessible in MLflow.
     """
     if mlflow_writer is None:
         log_rank_0("[TraceLens] MLflow writer not available, skipping report upload")
@@ -674,6 +679,17 @@ def upload_tracelens_reports_to_mlflow(
             warning_rank_0(f"[MLflow] Failed to upload report {report_path}: {e}")
 
     log_rank_0(f"[TraceLens] Uploaded {uploaded_count} reports to '{artifact_path}'")
+
+    # Clean up local reports after successful upload to save disk space
+    # The reports are now safely stored in MLflow
+    try:
+        import shutil
+
+        shutil.rmtree(reports_dir)
+        log_rank_0(f"[TraceLens] Cleaned up local reports directory: {reports_dir}")
+    except Exception as e:
+        warning_rank_0(f"[TraceLens] Failed to cleanup reports directory: {e}")
+
     return uploaded_count
 
 

From deda294efbd536b681fc1f866eb2550b1850f5fc Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 13:08:10 +0000
Subject: [PATCH 09/42] feat: make tracelens_reports cleanup configurable

Added mlflow_tracelens_cleanup_after_upload parameter to control whether
local TraceLens reports are removed after upload to MLflow.

Default: True (cleanup to save disk space)
Set to False to keep reports locally for inspection/debugging

Changes:
- Added cleanup_after_upload parameter to upload_tracelens_reports_to_mlflow()
- Added tracelens_cleanup_after_upload to upload_artifacts_to_mlflow()
- Added mlflow_tracelens_cleanup_after_upload config in YAML (default: true)
- Updated trainer to pass through the parameter

Use cases:
- True (default): Production runs, save disk space
- False: Development/debugging, keep local copies for inspection
---
 .../megatron/training/mlflow_artifacts.py     | 31 ++++++++++++-------
 .../megatron/primus_megatron_module.yaml      |  1 +
 primus/modules/trainer/megatron/trainer.py    |  1 +
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 0362d0c36..be8b69522 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -599,6 +599,7 @@ def upload_tracelens_reports_to_mlflow(
     max_reports: Optional[int] = None,
     output_format: str = "all",
     artifact_path: str = "trace_analysis",
+    cleanup_after_upload: bool = True,
 ) -> int:
     """
     Generate TraceLens reports and upload them to MLflow.
@@ -607,7 +608,7 @@ def upload_tracelens_reports_to_mlflow(
     1. Finds PyTorch profiler trace files
     2. Generates TraceLens analysis reports for specified ranks
     3. Uploads the reports to MLflow under the trace_analysis artifact path
-    4. Cleans up local report files after successful upload
+    4. Optionally cleans up local report files after successful upload
 
     Args:
         mlflow_writer: The MLflow module instance (from get_mlflow_writer())
@@ -617,13 +618,15 @@ def upload_tracelens_reports_to_mlflow(
         max_reports: Maximum number of reports to generate
         output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
         artifact_path: MLflow artifact subdirectory for reports
+        cleanup_after_upload: If True, removes local reports after upload to save disk space.
+                             If False, keeps reports locally for inspection. Default: True.
 
     Returns:
         Number of reports uploaded to MLflow
 
     Note:
-        The local tracelens_reports directory is automatically cleaned up after
-        successful upload to save disk space. The reports remain accessible in MLflow.
+        Reports are saved to exp_root_path/tracelens_reports/. Set cleanup_after_upload=False
+        to keep them locally for debugging or additional processing.
     """
     if mlflow_writer is None:
         log_rank_0("[TraceLens] MLflow writer not available, skipping report upload")
@@ -680,15 +683,17 @@ def upload_tracelens_reports_to_mlflow(
 
     log_rank_0(f"[TraceLens] Uploaded {uploaded_count} reports to '{artifact_path}'")
 
-    # Clean up local reports after successful upload to save disk space
-    # The reports are now safely stored in MLflow
-    try:
-        import shutil
+    # Optionally clean up local reports after successful upload to save disk space
+    if cleanup_after_upload:
+        try:
+            import shutil
 
-        shutil.rmtree(reports_dir)
-        log_rank_0(f"[TraceLens] Cleaned up local reports directory: {reports_dir}")
-    except Exception as e:
-        warning_rank_0(f"[TraceLens] Failed to cleanup reports directory: {e}")
+            shutil.rmtree(reports_dir)
+            log_rank_0(f"[TraceLens] Cleaned up local reports directory: {reports_dir}")
+        except Exception as e:
+            warning_rank_0(f"[TraceLens] Failed to cleanup reports directory: {e}")
+    else:
+        log_rank_0(f"[TraceLens] Keeping local reports at: {reports_dir}")
 
     return uploaded_count
 
@@ -708,6 +713,7 @@ def upload_artifacts_to_mlflow(
     tracelens_ranks: Optional[List[int]] = None,
     tracelens_max_reports: Optional[int] = None,
     tracelens_output_format: str = "all",
+    tracelens_cleanup_after_upload: bool = True,
 ) -> dict:
     """
     Upload all artifacts (trace files, log files, TraceLens reports) to MLflow.
@@ -735,6 +741,8 @@ def upload_artifacts_to_mlflow(
                         (None = all ranks, [0] = rank 0 only)
         tracelens_max_reports: Maximum number of TraceLens reports to generate
         tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+        tracelens_cleanup_after_upload: If True, removes local reports after upload (default).
+                                       If False, keeps reports locally for inspection.
 
     Returns:
         Dictionary with counts of uploaded files:
@@ -776,6 +784,7 @@ def upload_artifacts_to_mlflow(
             max_reports=tracelens_max_reports,
             output_format=tracelens_output_format,
             artifact_path="trace_analysis",
+            cleanup_after_upload=tracelens_cleanup_after_upload,
         )
 
     log_rank_0(
diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index dbbe99cb1..854c19702 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -13,6 +13,7 @@ mlflow_upload_tracelens_report: false  # Generate and upload TraceLens analysis
 mlflow_tracelens_ranks: null         # List of ranks to analyze with TraceLens (null = all)
 mlflow_tracelens_max_reports: null   # Max number of TraceLens reports (null = unlimited)
 mlflow_tracelens_output_format: all  # TraceLens report format: all (xlsx+csv), xlsx, or csv
+mlflow_tracelens_cleanup_after_upload: true  # Cleanup local reports after upload (true saves disk space)
 disable_compile_dependencies: true
 # NOTE:
 # - If `use_rocm_mem_info = True`, ROCm memory information will be collected
diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index 603ae5520..dbefb80b5 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -1622,6 +1622,7 @@ def run(self, *args, **kwargs):
                 tracelens_ranks=getattr(args, "mlflow_tracelens_ranks", None),
                 tracelens_max_reports=getattr(args, "mlflow_tracelens_max_reports", None),
                 tracelens_output_format=getattr(args, "mlflow_tracelens_output_format", "all"),
+                tracelens_cleanup_after_upload=getattr(args, "mlflow_tracelens_cleanup_after_upload", True),
             )
             mlflow_writer.end_run()
 

From 45d384fa139ce0ea61c5f783c7d6eb2ed5077262 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 13:22:16 +0000
Subject: [PATCH 10/42] refactor: change default to keep tracelens reports
 locally
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changed mlflow_tracelens_cleanup_after_upload default from True to False.

New behavior:
- Default (False): Keep reports locally for easy inspection
- Opt-in (True): Cleanup to save disk space

Rationale:
- Reports are valuable for local analysis and debugging
- Users can inspect without downloading from MLflow
- Disk space is less critical than convenience for most users
- Those who need cleanup can explicitly set it to True

Changes:
- cleanup_after_upload parameter: True → False
- tracelens_cleanup_after_upload parameter: True → False
- YAML config default: true → false
- Updated docstrings to reflect new default behavior
---
 .../backends/megatron/training/mlflow_artifacts.py | 14 +++++++-------
 .../modules/megatron/primus_megatron_module.yaml   |  2 +-
 primus/modules/trainer/megatron/trainer.py         |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index be8b69522..a8496f0d8 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -599,7 +599,7 @@ def upload_tracelens_reports_to_mlflow(
     max_reports: Optional[int] = None,
     output_format: str = "all",
     artifact_path: str = "trace_analysis",
-    cleanup_after_upload: bool = True,
+    cleanup_after_upload: bool = False,
 ) -> int:
     """
     Generate TraceLens reports and upload them to MLflow.
@@ -619,14 +619,14 @@ def upload_tracelens_reports_to_mlflow(
         output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
         artifact_path: MLflow artifact subdirectory for reports
         cleanup_after_upload: If True, removes local reports after upload to save disk space.
-                             If False, keeps reports locally for inspection. Default: True.
+                             If False, keeps reports locally for inspection. Default: False.
 
     Returns:
         Number of reports uploaded to MLflow
 
     Note:
-        Reports are saved to exp_root_path/tracelens_reports/. Set cleanup_after_upload=False
-        to keep them locally for debugging or additional processing.
+        Reports are saved to exp_root_path/tracelens_reports/ and kept locally by default.
+        Set cleanup_after_upload=True to remove them after upload and save disk space.
     """
     if mlflow_writer is None:
         log_rank_0("[TraceLens] MLflow writer not available, skipping report upload")
@@ -713,7 +713,7 @@ def upload_artifacts_to_mlflow(
     tracelens_ranks: Optional[List[int]] = None,
     tracelens_max_reports: Optional[int] = None,
     tracelens_output_format: str = "all",
-    tracelens_cleanup_after_upload: bool = True,
+    tracelens_cleanup_after_upload: bool = False,
 ) -> dict:
     """
     Upload all artifacts (trace files, log files, TraceLens reports) to MLflow.
@@ -741,8 +741,8 @@ def upload_artifacts_to_mlflow(
                         (None = all ranks, [0] = rank 0 only)
         tracelens_max_reports: Maximum number of TraceLens reports to generate
         tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
-        tracelens_cleanup_after_upload: If True, removes local reports after upload (default).
-                                       If False, keeps reports locally for inspection.
+        tracelens_cleanup_after_upload: If True, removes local reports after upload to save disk space.
+                                       If False, keeps reports locally for inspection (default).
 
     Returns:
         Dictionary with counts of uploaded files:
diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index 854c19702..98b212c70 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -13,7 +13,7 @@ mlflow_upload_tracelens_report: false  # Generate and upload TraceLens analysis
 mlflow_tracelens_ranks: null         # List of ranks to analyze with TraceLens (null = all)
 mlflow_tracelens_max_reports: null   # Max number of TraceLens reports (null = unlimited)
 mlflow_tracelens_output_format: all  # TraceLens report format: all (xlsx+csv), xlsx, or csv
-mlflow_tracelens_cleanup_after_upload: true  # Cleanup local reports after upload (true saves disk space)
+mlflow_tracelens_cleanup_after_upload: false  # Keep local reports (true to cleanup and save disk space)
 disable_compile_dependencies: true
 # NOTE:
 # - If `use_rocm_mem_info = True`, ROCm memory information will be collected
diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index dbefb80b5..b0750b83b 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -1622,7 +1622,7 @@ def run(self, *args, **kwargs):
                 tracelens_ranks=getattr(args, "mlflow_tracelens_ranks", None),
                 tracelens_max_reports=getattr(args, "mlflow_tracelens_max_reports", None),
                 tracelens_output_format=getattr(args, "mlflow_tracelens_output_format", "all"),
-                tracelens_cleanup_after_upload=getattr(args, "mlflow_tracelens_cleanup_after_upload", True),
+                tracelens_cleanup_after_upload=getattr(args, "mlflow_tracelens_cleanup_after_upload", False),
             )
             mlflow_writer.end_run()
 

From 290d5d3809ca6149f67dfbca2adff8343c0b684f Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 13:30:15 +0000
Subject: [PATCH 11/42] feat: decouple TraceLens generation from MLflow upload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added 'generate_tracelens_report' parameter to enable local report
generation without requiring MLflow upload.

New functionality:
- generate_tracelens_report: Generate reports locally only
- mlflow_upload_tracelens_report: Upload to MLflow (auto-enables generation)

Usage modes:
1. generate=F, upload=F  →  No reports
2. generate=T, upload=F  →  Generate locally only (NEW)
3. generate=F, upload=T  →  Generate AND upload (auto-enabled)
4. generate=T, upload=T  →  Generate AND upload (explicit)

Key benefits:
- Local development without MLflow dependency
- Quick profiling analysis without upload overhead
- Flexible workflow for different use cases
- Backward compatible (upload=T still works)

Changes:
- Added generate_tracelens_reports_locally() function
- Added generate_tracelens_report parameter to all entry points
- Updated logic to handle both parameters with auto-enable
- Updated YAML config with clear documentation
- Added comprehensive logging for each mode

Documentation: scripts/TRACELENS_GENERATION_MODES.md
---
 .../backends/megatron/training/global_vars.py |  15 ++-
 .../megatron/training/mlflow_artifacts.py     | 126 +++++++++++++++---
 .../megatron/primus_megatron_module.yaml      |  15 ++-
 primus/modules/trainer/megatron/trainer.py    |   1 +
 4 files changed, 138 insertions(+), 19 deletions(-)

diff --git a/primus/backends/megatron/training/global_vars.py b/primus/backends/megatron/training/global_vars.py
index 6685d2036..1549ba436 100644
--- a/primus/backends/megatron/training/global_vars.py
+++ b/primus/backends/megatron/training/global_vars.py
@@ -95,10 +95,12 @@ def upload_mlflow_artifacts(
     exp_root_path: Optional[str] = None,
     upload_traces: bool = True,
     upload_logs: bool = True,
+    generate_tracelens_report: bool = False,
     upload_tracelens_report: bool = False,
     tracelens_ranks: Optional[List[int]] = None,
     tracelens_max_reports: Optional[int] = None,
     tracelens_output_format: str = "all",
+    tracelens_cleanup_after_upload: bool = False,
 ) -> Optional[dict]:
     """
     Upload trace files, log files, and TraceLens reports to MLflow as artifacts.
@@ -111,18 +113,25 @@ def upload_mlflow_artifacts(
         artifacts/
         ├── traces/              # PyTorch profiler trace files
         ├── logs/                # Training log files
-        └── trace_analysis/      # TraceLens analysis reports
+        └── trace_analysis/      # TraceLens analysis reports (if uploaded)
+
+    TraceLens Report Logic:
+        - upload_tracelens_report=True: Generate AND upload (auto-enables generation)
+        - generate_tracelens_report=True only: Generate locally without upload
+        - Both False: No report generation
 
     Args:
         tensorboard_dir: Path to tensorboard directory with trace files
         exp_root_path: Root experiment path for log files
         upload_traces: Whether to upload trace files (default: True)
         upload_logs: Whether to upload log files (default: True)
-        upload_tracelens_report: Whether to generate and upload TraceLens reports
+        generate_tracelens_report: Whether to generate TraceLens reports locally
+        upload_tracelens_report: Whether to upload TraceLens reports to MLflow (implies generation)
         tracelens_ranks: List of ranks to analyze with TraceLens
                         (None = all, [0] = rank 0 only)
         tracelens_max_reports: Maximum number of TraceLens reports to generate
         tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+        tracelens_cleanup_after_upload: Remove local reports after upload (default: False)
 
     Returns:
         Dictionary with counts of uploaded files, or None if MLflow is not enabled
@@ -137,8 +146,10 @@ def upload_mlflow_artifacts(
         exp_root_path=exp_root_path,
         upload_traces=upload_traces,
         upload_logs=upload_logs,
+        generate_tracelens_report=generate_tracelens_report,
         upload_tracelens_report=upload_tracelens_report,
         tracelens_ranks=tracelens_ranks,
         tracelens_max_reports=tracelens_max_reports,
         tracelens_output_format=tracelens_output_format,
+        tracelens_cleanup_after_upload=tracelens_cleanup_after_upload,
     )
diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index a8496f0d8..bc0cc00cf 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -591,6 +591,66 @@ def generate_tracelens_reports(
     return generated_reports
 
 
+def generate_tracelens_reports_locally(
+    tensorboard_dir: str,
+    exp_root_path: str,
+    ranks: Optional[List[int]] = None,
+    max_reports: Optional[int] = None,
+    output_format: str = "all",
+) -> int:
+    """
+    Generate TraceLens analysis reports locally (without MLflow upload).
+
+    This function generates TraceLens reports and saves them to
+    exp_root_path/tracelens_reports/ for local inspection.
+
+    Args:
+        tensorboard_dir: Directory containing PyTorch profiler trace files
+        exp_root_path: Root path of the experiment (for saving reports)
+        ranks: List of ranks to analyze (None = all ranks, [0] = rank 0 only)
+        max_reports: Maximum number of reports to generate
+        output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+
+    Returns:
+        Number of reports generated
+
+    Example:
+        >>> generate_tracelens_reports_locally(
+        ...     tensorboard_dir="/path/to/tensorboard",
+        ...     exp_root_path="/path/to/experiment",
+        ...     ranks=[0, 8],
+        ...     output_format="all"
+        ... )
+        26  # Generated 26 report files
+    """
+    # Create output directory for reports
+    reports_dir = os.path.join(exp_root_path, "tracelens_reports")
+    os.makedirs(reports_dir, exist_ok=True)
+
+    log_rank_0(f"[TraceLens] Generating reports from traces in: {tensorboard_dir}")
+    log_rank_0(f"[TraceLens] Reports will be saved to: {reports_dir}")
+    if ranks:
+        log_rank_0(f"[TraceLens] Analyzing ranks: {ranks}")
+    if max_reports:
+        log_rank_0(f"[TraceLens] Max reports: {max_reports}")
+
+    # Generate reports
+    reports = generate_tracelens_reports(
+        tensorboard_dir=tensorboard_dir,
+        output_dir=reports_dir,
+        ranks=ranks,
+        max_reports=max_reports,
+        output_format=output_format,
+    )
+
+    if not reports:
+        log_rank_0("[TraceLens] No reports generated")
+        return 0
+
+    log_rank_0(f"[TraceLens] Generated {len(reports)} report files locally")
+    return len(reports)
+
+
 def upload_tracelens_reports_to_mlflow(
     mlflow_writer,
     tensorboard_dir: str,
@@ -709,6 +769,7 @@ def upload_artifacts_to_mlflow(
     exp_root_path: Optional[str] = None,
     upload_traces: bool = True,
     upload_logs: bool = True,
+    generate_tracelens_report: bool = False,
     upload_tracelens_report: bool = False,
     tracelens_ranks: Optional[List[int]] = None,
     tracelens_max_reports: Optional[int] = None,
@@ -722,13 +783,24 @@ def upload_artifacts_to_mlflow(
     It handles:
     - Trace files from PyTorch profiler
     - Log files from training
-    - TraceLens analysis reports (optional)
+    - TraceLens analysis reports (optional - generate locally and/or upload to MLflow)
 
     MLflow Artifact Structure:
         artifacts/
         ├── traces/              # PyTorch profiler trace files
         ├── logs/                # Training log files
-        └── trace_analysis/      # TraceLens analysis reports
+        └── trace_analysis/      # TraceLens analysis reports (if uploaded)
+
+    TraceLens Report Generation Logic:
+        - If upload_tracelens_report=True:  Generate AND upload (auto-enables generation)
+        - If generate_tracelens_report=True and upload_tracelens_report=False: Generate locally only
+        - If both False: No report generation
+
+        Examples:
+            generate=False, upload=False  →  No reports
+            generate=True,  upload=False  →  Generate locally only
+            generate=False, upload=True   →  Generate AND upload (auto-enabled)
+            generate=True,  upload=True   →  Generate AND upload (explicit)
 
     Args:
         mlflow_writer: The MLflow module instance (from get_mlflow_writer())
@@ -736,7 +808,8 @@ def upload_artifacts_to_mlflow(
         exp_root_path: Root path of the experiment for log files
         upload_traces: Whether to upload trace files
         upload_logs: Whether to upload log files
-        upload_tracelens_report: Whether to generate and upload TraceLens reports
+        generate_tracelens_report: Whether to generate TraceLens reports locally
+        upload_tracelens_report: Whether to upload TraceLens reports to MLflow (implies generation)
         tracelens_ranks: List of ranks to generate TraceLens reports for
                         (None = all ranks, [0] = rank 0 only)
         tracelens_max_reports: Maximum number of TraceLens reports to generate
@@ -760,7 +833,10 @@ def upload_artifacts_to_mlflow(
     log_rank_0(f"[MLflow] tensorboard_dir: {tensorboard_dir}")
     log_rank_0(f"[MLflow] exp_root_path: {exp_root_path}")
     log_rank_0(f"[MLflow] upload_traces: {upload_traces}, upload_logs: {upload_logs}")
-    log_rank_0(f"[MLflow] upload_tracelens_report: {upload_tracelens_report}")
+    log_rank_0(
+        f"[MLflow] generate_tracelens_report: {generate_tracelens_report}, "
+        f"upload_tracelens_report: {upload_tracelens_report}"
+    )
 
     result = {"traces": 0, "logs": 0, "tracelens_reports": 0}
 
@@ -774,18 +850,36 @@ def upload_artifacts_to_mlflow(
     if upload_logs and exp_root_path:
         result["logs"] = upload_log_files_to_mlflow(mlflow_writer, exp_root_path, artifact_path="logs")
 
-    # Generate and upload TraceLens reports
-    if upload_tracelens_report and tensorboard_dir and exp_root_path:
-        result["tracelens_reports"] = upload_tracelens_reports_to_mlflow(
-            mlflow_writer=mlflow_writer,
-            tensorboard_dir=tensorboard_dir,
-            exp_root_path=exp_root_path,
-            ranks=tracelens_ranks,
-            max_reports=tracelens_max_reports,
-            output_format=tracelens_output_format,
-            artifact_path="trace_analysis",
-            cleanup_after_upload=tracelens_cleanup_after_upload,
-        )
+    # TraceLens report generation and upload logic
+    # If upload=True, auto-enable generation (even if generate=False)
+    should_generate = generate_tracelens_report or upload_tracelens_report
+
+    if should_generate and tensorboard_dir and exp_root_path:
+        if upload_tracelens_report:
+            # Generate AND upload to MLflow
+            log_rank_0("[TraceLens] Mode: Generate and upload to MLflow")
+            result["tracelens_reports"] = upload_tracelens_reports_to_mlflow(
+                mlflow_writer=mlflow_writer,
+                tensorboard_dir=tensorboard_dir,
+                exp_root_path=exp_root_path,
+                ranks=tracelens_ranks,
+                max_reports=tracelens_max_reports,
+                output_format=tracelens_output_format,
+                artifact_path="trace_analysis",
+                cleanup_after_upload=tracelens_cleanup_after_upload,
+            )
+        else:
+            # Generate locally only (no MLflow upload)
+            log_rank_0("[TraceLens] Mode: Generate locally only (no MLflow upload)")
+            num_generated = generate_tracelens_reports_locally(
+                tensorboard_dir=tensorboard_dir,
+                exp_root_path=exp_root_path,
+                ranks=tracelens_ranks,
+                max_reports=tracelens_max_reports,
+                output_format=tracelens_output_format,
+            )
+            # Don't count as "uploaded" since they're local-only
+            log_rank_0(f"[TraceLens] Generated {num_generated} report files (not uploaded to MLflow)")
 
     log_rank_0(
         f"[MLflow] Artifact upload complete: "
diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index 98b212c70..2fa576352 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -9,7 +9,20 @@ mlflow_experiment_name: null
 # Set these to false if you only want metrics/params logged to MLflow.
 mlflow_upload_traces: true           # Upload profiler trace files to MLflow
 mlflow_upload_logs: true             # Upload training log files to MLflow
-mlflow_upload_tracelens_report: false  # Generate and upload TraceLens analysis reports
+
+# TraceLens Report Generation & Upload
+# ----------------------------------------------------------------------------
+# generate_tracelens_report:          Generate TraceLens analysis reports locally
+# mlflow_upload_tracelens_report:     Upload reports to MLflow (auto-enables generation)
+#
+# Usage patterns:
+#   generate=false, upload=false  →  No reports generated
+#   generate=true,  upload=false  →  Generate reports locally only
+#   generate=false, upload=true   →  Generate AND upload (auto-enabled)
+#   generate=true,  upload=true   →  Generate AND upload (explicit)
+# ----------------------------------------------------------------------------
+generate_tracelens_report: false     # Generate TraceLens analysis reports locally
+mlflow_upload_tracelens_report: false  # Upload TraceLens reports to MLflow (implies generation)
 mlflow_tracelens_ranks: null         # List of ranks to analyze with TraceLens (null = all)
 mlflow_tracelens_max_reports: null   # Max number of TraceLens reports (null = unlimited)
 mlflow_tracelens_output_format: all  # TraceLens report format: all (xlsx+csv), xlsx, or csv
diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index b0750b83b..0fcf8e70d 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -1618,6 +1618,7 @@ def run(self, *args, **kwargs):
                 exp_root_path=self.exp_root_path,
                 upload_traces=getattr(args, "mlflow_upload_traces", True),
                 upload_logs=getattr(args, "mlflow_upload_logs", True),
+                generate_tracelens_report=getattr(args, "generate_tracelens_report", False),
                 upload_tracelens_report=getattr(args, "mlflow_upload_tracelens_report", False),
                 tracelens_ranks=getattr(args, "mlflow_tracelens_ranks", None),
                 tracelens_max_reports=getattr(args, "mlflow_tracelens_max_reports", None),

From 370360c6aa4556d620eb2c76392331757403bdbf Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 13:35:07 +0000
Subject: [PATCH 12/42] refactor: remove confusing tracelens_max_reports
 parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removed mlflow_tracelens_max_reports parameter to simplify API and avoid confusion.

Problem:
- Using both 'ranks' and 'max_reports' was confusing
- Example: ranks=[0,4,8,12] + max_reports=2 → only [0,4] analyzed
- Users explicitly specify ranks, but the list is silently truncated

Solution:
- Use tracelens_ranks to control which ranks AND how many
- Want 2 reports? Specify 2 ranks: ranks=[0,8]
- Want all ranks? Use ranks=null
- Clear and explicit behavior

Changes:
- Removed max_reports parameter from all functions
- Removed mlflow_tracelens_max_reports from config
- Updated docstrings to clarify rank-based control
- Simplified logic - no truncation step needed

Migration:
  Before: ranks=[0,4,8,12], max_reports=2
  After:  ranks=[0,4]  # Just specify the ranks you want!
---
 .../backends/megatron/training/global_vars.py |  6 ++--
 .../megatron/training/mlflow_artifacts.py     | 31 +++++--------------
 .../megatron/primus_megatron_module.yaml      |  6 ++--
 primus/modules/trainer/megatron/trainer.py    |  1 -
 4 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/primus/backends/megatron/training/global_vars.py b/primus/backends/megatron/training/global_vars.py
index 1549ba436..9255d7842 100644
--- a/primus/backends/megatron/training/global_vars.py
+++ b/primus/backends/megatron/training/global_vars.py
@@ -98,7 +98,6 @@ def upload_mlflow_artifacts(
     generate_tracelens_report: bool = False,
     upload_tracelens_report: bool = False,
     tracelens_ranks: Optional[List[int]] = None,
-    tracelens_max_reports: Optional[int] = None,
     tracelens_output_format: str = "all",
     tracelens_cleanup_after_upload: bool = False,
 ) -> Optional[dict]:
@@ -128,8 +127,8 @@ def upload_mlflow_artifacts(
         generate_tracelens_report: Whether to generate TraceLens reports locally
         upload_tracelens_report: Whether to upload TraceLens reports to MLflow (implies generation)
         tracelens_ranks: List of ranks to analyze with TraceLens
-                        (None = all, [0] = rank 0 only)
-        tracelens_max_reports: Maximum number of TraceLens reports to generate
+                        (None = all, [0, 8] = ranks 0 and 8 only)
+                        Specify fewer ranks to limit number of reports
         tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
         tracelens_cleanup_after_upload: Remove local reports after upload (default: False)
 
@@ -149,7 +148,6 @@ def upload_mlflow_artifacts(
         generate_tracelens_report=generate_tracelens_report,
         upload_tracelens_report=upload_tracelens_report,
         tracelens_ranks=tracelens_ranks,
-        tracelens_max_reports=tracelens_max_reports,
         tracelens_output_format=tracelens_output_format,
         tracelens_cleanup_after_upload=tracelens_cleanup_after_upload,
     )
diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index bc0cc00cf..3c047a2ad 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -525,7 +525,6 @@ def generate_tracelens_reports(
     tensorboard_dir: str,
     output_dir: str,
     ranks: Optional[List[int]] = None,
-    max_reports: Optional[int] = None,
     output_format: str = "all",
 ) -> List[str]:
     """
@@ -535,7 +534,7 @@ def generate_tracelens_reports(
         tensorboard_dir: Directory containing PyTorch profiler trace files
         output_dir: Directory to save the generated reports
         ranks: List of ranks to generate reports for (None = all ranks)
-        max_reports: Maximum number of reports to generate (None = unlimited)
+               To limit number of reports, specify fewer ranks in the list
         output_format: Output format:
                       - "all" (default): Both XLSX and CSV files
                       - "xlsx": Multi-tab Excel with detailed analysis
@@ -572,11 +571,6 @@ def generate_tracelens_reports(
         trace_files = _filter_traces_by_rank(trace_files, ranks)
         log_rank_0(f"[TraceLens] Filtered to {len(trace_files)} trace files for ranks: {ranks}")
 
-    # Limit number of reports if specified
-    if max_reports is not None and len(trace_files) > max_reports:
-        trace_files = trace_files[:max_reports]
-        log_rank_0(f"[TraceLens] Limited to {max_reports} reports")
-
     log_rank_0(
         f"[TraceLens] Generating {output_format.upper()} reports for {len(trace_files)} trace files..."
     )
@@ -595,7 +589,6 @@ def generate_tracelens_reports_locally(
     tensorboard_dir: str,
     exp_root_path: str,
     ranks: Optional[List[int]] = None,
-    max_reports: Optional[int] = None,
     output_format: str = "all",
 ) -> int:
     """
@@ -608,7 +601,7 @@ def generate_tracelens_reports_locally(
         tensorboard_dir: Directory containing PyTorch profiler trace files
         exp_root_path: Root path of the experiment (for saving reports)
         ranks: List of ranks to analyze (None = all ranks, [0] = rank 0 only)
-        max_reports: Maximum number of reports to generate
+               Specify fewer ranks to limit number of reports
         output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
 
     Returns:
@@ -618,10 +611,10 @@ def generate_tracelens_reports_locally(
         >>> generate_tracelens_reports_locally(
         ...     tensorboard_dir="/path/to/tensorboard",
         ...     exp_root_path="/path/to/experiment",
-        ...     ranks=[0, 8],
+        ...     ranks=[0, 8],  # Only 2 ranks = 2 reports
         ...     output_format="all"
         ... )
-        26  # Generated 26 report files
+        26  # Generated 26 report files (XLSX + CSVs for 2 ranks)
     """
     # Create output directory for reports
     reports_dir = os.path.join(exp_root_path, "tracelens_reports")
@@ -631,15 +624,12 @@ def generate_tracelens_reports_locally(
     log_rank_0(f"[TraceLens] Reports will be saved to: {reports_dir}")
     if ranks:
         log_rank_0(f"[TraceLens] Analyzing ranks: {ranks}")
-    if max_reports:
-        log_rank_0(f"[TraceLens] Max reports: {max_reports}")
 
     # Generate reports
     reports = generate_tracelens_reports(
         tensorboard_dir=tensorboard_dir,
         output_dir=reports_dir,
         ranks=ranks,
-        max_reports=max_reports,
         output_format=output_format,
     )
 
@@ -656,7 +646,6 @@ def upload_tracelens_reports_to_mlflow(
     tensorboard_dir: str,
     exp_root_path: str,
     ranks: Optional[List[int]] = None,
-    max_reports: Optional[int] = None,
     output_format: str = "all",
     artifact_path: str = "trace_analysis",
     cleanup_after_upload: bool = False,
@@ -675,7 +664,7 @@ def upload_tracelens_reports_to_mlflow(
         tensorboard_dir: Directory containing PyTorch profiler trace files
         exp_root_path: Root path of the experiment (for saving reports)
         ranks: List of ranks to analyze (None = all ranks, [0] = rank 0 only)
-        max_reports: Maximum number of reports to generate
+               Specify fewer ranks to limit number of reports
         output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
         artifact_path: MLflow artifact subdirectory for reports
         cleanup_after_upload: If True, removes local reports after upload to save disk space.
@@ -715,15 +704,12 @@ def upload_tracelens_reports_to_mlflow(
     log_rank_0(f"[TraceLens] Reports will be saved to: {reports_dir}")
     if ranks:
         log_rank_0(f"[TraceLens] Analyzing ranks: {ranks}")
-    if max_reports:
-        log_rank_0(f"[TraceLens] Max reports: {max_reports}")
 
     # Generate reports
     reports = generate_tracelens_reports(
         tensorboard_dir=tensorboard_dir,
         output_dir=reports_dir,
         ranks=ranks,
-        max_reports=max_reports,
         output_format=output_format,
     )
 
@@ -772,7 +758,6 @@ def upload_artifacts_to_mlflow(
     generate_tracelens_report: bool = False,
     upload_tracelens_report: bool = False,
     tracelens_ranks: Optional[List[int]] = None,
-    tracelens_max_reports: Optional[int] = None,
     tracelens_output_format: str = "all",
     tracelens_cleanup_after_upload: bool = False,
 ) -> dict:
@@ -811,8 +796,8 @@ def upload_artifacts_to_mlflow(
         generate_tracelens_report: Whether to generate TraceLens reports locally
         upload_tracelens_report: Whether to upload TraceLens reports to MLflow (implies generation)
         tracelens_ranks: List of ranks to generate TraceLens reports for
-                        (None = all ranks, [0] = rank 0 only)
-        tracelens_max_reports: Maximum number of TraceLens reports to generate
+                        (None = all ranks, [0, 8] = ranks 0 and 8 only)
+                        Specify fewer ranks to limit number of reports
         tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
         tracelens_cleanup_after_upload: If True, removes local reports after upload to save disk space.
                                        If False, keeps reports locally for inspection (default).
@@ -863,7 +848,6 @@ def upload_artifacts_to_mlflow(
                 tensorboard_dir=tensorboard_dir,
                 exp_root_path=exp_root_path,
                 ranks=tracelens_ranks,
-                max_reports=tracelens_max_reports,
                 output_format=tracelens_output_format,
                 artifact_path="trace_analysis",
                 cleanup_after_upload=tracelens_cleanup_after_upload,
@@ -875,7 +859,6 @@ def upload_artifacts_to_mlflow(
                 tensorboard_dir=tensorboard_dir,
                 exp_root_path=exp_root_path,
                 ranks=tracelens_ranks,
-                max_reports=tracelens_max_reports,
                 output_format=tracelens_output_format,
             )
             # Don't count as "uploaded" since they're local-only
diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index 2fa576352..dc3e38ab5 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -20,11 +20,13 @@ mlflow_upload_logs: true             # Upload training log files to MLflow
 #   generate=true,  upload=false  →  Generate reports locally only
 #   generate=false, upload=true   →  Generate AND upload (auto-enabled)
 #   generate=true,  upload=true   →  Generate AND upload (explicit)
+#
+# To limit number of reports: Specify fewer ranks in mlflow_tracelens_ranks
+#   Example: mlflow_tracelens_ranks: [0, 8]  # Only 2 ranks = 2 reports
 # ----------------------------------------------------------------------------
 generate_tracelens_report: false     # Generate TraceLens analysis reports locally
 mlflow_upload_tracelens_report: false  # Upload TraceLens reports to MLflow (implies generation)
-mlflow_tracelens_ranks: null         # List of ranks to analyze with TraceLens (null = all)
-mlflow_tracelens_max_reports: null   # Max number of TraceLens reports (null = unlimited)
+mlflow_tracelens_ranks: null         # List of ranks to analyze (null = all, [0,8] = 2 ranks)
 mlflow_tracelens_output_format: all  # TraceLens report format: all (xlsx+csv), xlsx, or csv
 mlflow_tracelens_cleanup_after_upload: false  # Keep local reports (true to cleanup and save disk space)
 disable_compile_dependencies: true
diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index 0fcf8e70d..03a7e5895 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -1621,7 +1621,6 @@ def run(self, *args, **kwargs):
                 generate_tracelens_report=getattr(args, "generate_tracelens_report", False),
                 upload_tracelens_report=getattr(args, "mlflow_upload_tracelens_report", False),
                 tracelens_ranks=getattr(args, "mlflow_tracelens_ranks", None),
-                tracelens_max_reports=getattr(args, "mlflow_tracelens_max_reports", None),
                 tracelens_output_format=getattr(args, "mlflow_tracelens_output_format", "all"),
                 tracelens_cleanup_after_upload=getattr(args, "mlflow_tracelens_cleanup_after_upload", False),
             )

From 6366eacaf18ac0bdcd0c05ead7b6531dd4b764d1 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 15:13:41 +0000
Subject: [PATCH 13/42] fix: Escape glob paths to handle [] characters in
 experiment names

The experiment name contains square brackets like [deepseek_v2_lite-pretrain_...]-rank[0]
which are interpreted as glob pattern character classes, causing glob.glob to
return empty results even though files exist.

Fixed by using glob.escape() on directory paths before using them with glob.glob().
---
 .../megatron/training/mlflow_artifacts.py      | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 3c047a2ad..013d7c749 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -63,9 +63,11 @@ def _get_all_trace_files(tensorboard_dir: str) -> list:
     # Look for PyTorch profiler trace files (both compressed and uncompressed)
     # Using specific patterns to avoid matching unrelated JSON files
     patterns = ["*.pt.trace.json", "*.pt.trace.json.gz"]
+    # Escape directory path to handle special characters like [] in experiment names
+    escaped_dir = glob.escape(tensorboard_dir)
     for pattern in patterns:
-        trace_files.extend(glob.glob(os.path.join(tensorboard_dir, pattern)))
-        trace_files.extend(glob.glob(os.path.join(tensorboard_dir, "**", pattern), recursive=True))
+        trace_files.extend(glob.glob(os.path.join(escaped_dir, pattern)))
+        trace_files.extend(glob.glob(os.path.join(escaped_dir, "**", pattern), recursive=True))
 
     # Remove duplicates while preserving order
     seen = set()
@@ -100,8 +102,8 @@ def _get_all_log_files(exp_root_path: str) -> list:
         return []
 
     log_files = []
-    # Find all .log files recursively
-    log_files.extend(glob.glob(os.path.join(logs_dir, "**", "*.log"), recursive=True))
+    # Find all .log files recursively (escape path to handle special characters)
+    log_files.extend(glob.glob(os.path.join(glob.escape(logs_dir), "**", "*.log"), recursive=True))
 
     return log_files
 
@@ -371,8 +373,8 @@ def generate_tracelens_report(
                 )
                 generated_files.append(xlsx_path)
 
-            # Check CSV outputs
-            csv_files = glob.glob(os.path.join(csv_subdir, "*.csv"))
+            # Check CSV outputs (escape path to handle [] characters in filenames)
+            csv_files = glob.glob(os.path.join(glob.escape(csv_subdir), "*.csv"))
             if csv_files:
                 log_rank_0(f"[TraceLens] Generated {len(csv_files)} CSV files for {report_name}")
                 generated_files.extend(csv_files)
@@ -393,8 +395,8 @@ def generate_tracelens_report(
             os.makedirs(csv_subdir, exist_ok=True)
             dfs = generate_perf_report_pytorch(trace_file, output_csvs_dir=csv_subdir)
 
-            # Collect all generated CSV files
-            csv_files = glob.glob(os.path.join(csv_subdir, "*.csv"))
+            # Collect all generated CSV files (escape path to handle [] characters in filenames)
+            csv_files = glob.glob(os.path.join(glob.escape(csv_subdir), "*.csv"))
             if csv_files:
                 log_rank_0(f"[TraceLens] Generated {len(csv_files)} CSV files for {report_name}")
                 generated_files.extend(csv_files)

From cb1584bf81e13e722d39d3ab6f39989dc12c5542 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 18 Dec 2025 16:08:37 +0000
Subject: [PATCH 14/42] fix: Install openpyxl for XLSX generation and call
 TraceLens twice for all format

TraceLens uses either/or logic: if output_csvs_dir is set, it only generates
CSVs and ignores output_xlsx_path. To get both formats, we now call
generate_perf_report_pytorch twice: once for XLSX and once for CSVs. As a
result, each trace file is parsed twice when the output format is "all".

Also added _ensure_openpyxl_installed() to automatically install openpyxl,
which is required for XLSX file generation.
---
 .../megatron/training/mlflow_artifacts.py     | 48 +++++++++++++++----
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 013d7c749..f440b72d4 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -220,11 +220,38 @@ def upload_log_files_to_mlflow(
 # =============================================================================
 
 
+def _ensure_openpyxl_installed() -> bool:
+    """
+    Ensure openpyxl is installed for XLSX generation.
+
+    Returns:
+        True if openpyxl is available, False otherwise
+    """
+    try:
+        import openpyxl  # noqa: F401
+
+        return True
+    except ImportError:
+        log_rank_0("[TraceLens] openpyxl not found, installing for XLSX support...")
+        try:
+            subprocess.check_call(
+                [sys.executable, "-m", "pip", "install", "openpyxl", "-q"],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+            log_rank_0("[TraceLens] Successfully installed openpyxl")
+            return True
+        except subprocess.CalledProcessError as e:
+            warning_rank_0(f"[TraceLens] Failed to install openpyxl: {e}")
+            return False
+
+
 def _ensure_tracelens_installed() -> bool:
     """
-    Ensure TraceLens is installed. Install it if not present.
+    Ensure TraceLens and its dependencies are installed.
 
     TraceLens is available from GitHub: https://github.com/AMD-AGI/TraceLens
+    XLSX generation requires openpyxl which is installed separately.
 
     Returns:
         True if TraceLens is available, False otherwise
@@ -233,7 +260,6 @@ def _ensure_tracelens_installed() -> bool:
         import TraceLens  # noqa: F401
 
         log_rank_0("[TraceLens] TraceLens is already installed")
-        return True
     except ImportError:
         log_rank_0("[TraceLens] TraceLens not found, attempting to install from GitHub...")
         try:
@@ -251,11 +277,15 @@ def _ensure_tracelens_installed() -> bool:
                 stderr=subprocess.DEVNULL,
             )
             log_rank_0("[TraceLens] Successfully installed TraceLens from GitHub")
-            return True
         except subprocess.CalledProcessError as e:
             warning_rank_0(f"[TraceLens] Failed to install TraceLens: {e}")
             return False
 
+    # Ensure openpyxl is installed for XLSX generation
+    _ensure_openpyxl_installed()
+
+    return True
+
 
 def _extract_rank_from_filename(filename: str) -> Optional[int]:
     """
@@ -355,16 +385,15 @@ def generate_tracelens_report(
 
         generated_files = []
 
-        # Optimize for "all" format: parse trace once and generate both outputs
+        # For "all" format: TraceLens uses either/or logic - if output_csvs_dir is set,
+        # it ONLY generates CSVs. So we need to call it twice for both formats.
         if output_format == "all":
             xlsx_path = os.path.join(output_dir, f"{report_name}_analysis.xlsx")
             csv_subdir = os.path.join(output_dir, report_name)
             os.makedirs(csv_subdir, exist_ok=True)
 
-            # Parse trace once and generate both formats
-            dfs = generate_perf_report_pytorch(
-                trace_file, output_xlsx_path=xlsx_path, output_csvs_dir=csv_subdir
-            )
+            # First call: Generate XLSX only
+            dfs = generate_perf_report_pytorch(trace_file, output_xlsx_path=xlsx_path)
 
             # Check XLSX output
             if os.path.exists(xlsx_path):
@@ -373,6 +402,9 @@ def generate_tracelens_report(
                 )
                 generated_files.append(xlsx_path)
 
+            # Second call: Generate CSVs only
+            generate_perf_report_pytorch(trace_file, output_csvs_dir=csv_subdir)
+
             # Check CSV outputs (escape path to handle [] characters in filenames)
             csv_files = glob.glob(os.path.join(glob.escape(csv_subdir), "*.csv"))
             if csv_files:

From 8dc31263c88840394b9633b8328c113bf54f9ac5 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Fri, 19 Dec 2025 13:04:58 +0000
Subject: [PATCH 15/42] feat: Enable TraceLens by default with one report per
 node

---
 .../configs/modules/megatron/primus_megatron_module.yaml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index dc3e38ab5..75bb4db55 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -22,11 +22,12 @@ mlflow_upload_logs: true             # Upload training log files to MLflow
 #   generate=true,  upload=true   →  Generate AND upload (explicit)
 #
 # To limit number of reports: Specify fewer ranks in mlflow_tracelens_ranks
-#   Example: mlflow_tracelens_ranks: [0, 8]  # Only 2 ranks = 2 reports
+#   Default: [0, 8] = one rank per node (assumes 8 GPUs/node)
+#   Use null for all ranks, or customize list like [0, 1, 8, 9] for more coverage
 # ----------------------------------------------------------------------------
-generate_tracelens_report: false     # Generate TraceLens analysis reports locally
-mlflow_upload_tracelens_report: false  # Upload TraceLens reports to MLflow (implies generation)
-mlflow_tracelens_ranks: null         # List of ranks to analyze (null = all, [0,8] = 2 ranks)
+generate_tracelens_report: true     # Generate TraceLens analysis reports locally
+mlflow_upload_tracelens_report: true  # Upload TraceLens reports to MLflow (implies generation)
+mlflow_tracelens_ranks: [0, 8]       # List of ranks to analyze (default: one per node for 2-node setup)
 mlflow_tracelens_output_format: all  # TraceLens report format: all (xlsx+csv), xlsx, or csv
 mlflow_tracelens_cleanup_after_upload: false  # Keep local reports (true to cleanup and save disk space)
 disable_compile_dependencies: true

From c9967c6245e293faae81e4a29ffa124eff6ac27f Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Fri, 19 Dec 2025 13:16:55 +0000
Subject: [PATCH 16/42] fix: Upload TraceLens CSV directories to preserve rank
 grouping

- Change from uploading individual CSV files to uploading directories
- Fixes issue where rank 8 CSVs overwrite rank 0 CSVs in MLflow
- Preserves local structure: rank[0].pt.trace/ and rank[8].pt.trace/

Before: CSVs mixed together, only last rank visible
After: CSVs grouped by rank in separate directories

Changes:
- Line 412: generated_files.append(csv_subdir) instead of extend(csv_files)
- Line 434: generated_files.append(csv_subdir) instead of extend(csv_files)
---
 primus/backends/megatron/training/mlflow_artifacts.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index f440b72d4..86ff95341 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -409,7 +409,7 @@ def generate_tracelens_report(
             csv_files = glob.glob(os.path.join(glob.escape(csv_subdir), "*.csv"))
             if csv_files:
                 log_rank_0(f"[TraceLens] Generated {len(csv_files)} CSV files for {report_name}")
-                generated_files.extend(csv_files)
+                generated_files.append(csv_subdir)  # Upload directory to preserve structure
 
         elif output_format == "xlsx":
             # XLSX only: Single file with multiple tabs
@@ -431,7 +431,7 @@ def generate_tracelens_report(
             csv_files = glob.glob(os.path.join(glob.escape(csv_subdir), "*.csv"))
             if csv_files:
                 log_rank_0(f"[TraceLens] Generated {len(csv_files)} CSV files for {report_name}")
-                generated_files.extend(csv_files)
+                generated_files.append(csv_subdir)  # Upload directory to preserve structure
 
         if generated_files:
             return generated_files

From eb4da1356af6f91b636da6fd4840a337a6892a85 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 15 Jan 2026 08:45:24 +0000
Subject: [PATCH 17/42] minor fix: lint format

---
 primus/modules/trainer/megatron/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index dba2ebf75..a2ec004d1 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -147,8 +147,8 @@
     get_mlflow_writer,
     get_train_start_time,
     set_primus_global_variables,
-    upload_mlflow_artifacts,
     set_train_start_time,
+    upload_mlflow_artifacts,
 )
 from primus.backends.megatron.training.tokenizer.tokenizer import build_tokenizer
 from primus.core.utils import checker, file_utils

From fdc8f51559e5091bbe8213c8598303683b45b89c Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 15 Jan 2026 10:33:37 +0000
Subject: [PATCH 18/42] minor fix: sort imports in global_vars.py

---
 primus/backends/megatron/training/global_vars.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/primus/backends/megatron/training/global_vars.py b/primus/backends/megatron/training/global_vars.py
index 7203e5213..34e66052f 100644
--- a/primus/backends/megatron/training/global_vars.py
+++ b/primus/backends/megatron/training/global_vars.py
@@ -5,9 +5,8 @@
 # See LICENSE for license information.
 ###############################################################################
 
-from typing import List, Optional
-
 import time
+from typing import List, Optional
 
 from primus.modules.module_utils import debug_rank_0
 

From 3cfa4071e55c4023cc9dfa9790b6811fac23efd4 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Mon, 2 Feb 2026 12:45:21 +0000
Subject: [PATCH 19/42] Refactor TraceLens/MLflow artifact features to separate
 module

Move upload_mlflow_artifacts() from global_vars.py to new mlflow_setup.py
to reduce merge conflicts. The function is moved as-is, keeping its
TraceLens report generation and upload parameters unchanged.

global_vars.py now matches main, avoiding future conflicts when
merging from main branch.
---
 .../backends/megatron/training/global_vars.py | 67 +---------------
 .../megatron/training/mlflow_setup.py         | 80 +++++++++++++++++++
 primus/modules/trainer/megatron/trainer.py    |  2 +-
 3 files changed, 82 insertions(+), 67 deletions(-)
 create mode 100644 primus/backends/megatron/training/mlflow_setup.py

diff --git a/primus/backends/megatron/training/global_vars.py b/primus/backends/megatron/training/global_vars.py
index f3c60267a..5b2ae4825 100644
--- a/primus/backends/megatron/training/global_vars.py
+++ b/primus/backends/megatron/training/global_vars.py
@@ -5,9 +5,9 @@
 # See LICENSE for license information.
 ###############################################################################
 
+
 import json
 import time
-from typing import List, Optional
 
 from primus.backends.megatron.training.git_metadata import (
     collect_git_metadata,
@@ -15,8 +15,6 @@
 )
 from primus.modules.module_utils import debug_rank_0
 
-from .mlflow_artifacts import upload_artifacts_to_mlflow
-
 _GLOBAL_ARGS = None
 _GLOBAL_MLFLOW_WRITER = None
 _TRAIN_START_TIME = None
@@ -161,66 +159,3 @@ def _ensure_var_is_not_initialized(var, name):
 def destroy_global_vars():
     global _GLOBAL_ARGS
     _GLOBAL_ARGS = None
-
-
-def upload_mlflow_artifacts(
-    tensorboard_dir: Optional[str] = None,
-    exp_root_path: Optional[str] = None,
-    upload_traces: bool = True,
-    upload_logs: bool = True,
-    generate_tracelens_report: bool = False,
-    upload_tracelens_report: bool = False,
-    tracelens_ranks: Optional[List[int]] = None,
-    tracelens_output_format: str = "all",
-    tracelens_cleanup_after_upload: bool = False,
-) -> Optional[dict]:
-    """
-    Upload trace files, log files, and TraceLens reports to MLflow as artifacts.
-
-    This function should be called at the end of training to upload all
-    artifacts to MLflow. Only the rank that initialized MLflow (last rank)
-    should call this to avoid duplicate uploads.
-
-    MLflow Artifact Structure:
-        artifacts/
-        ├── traces/              # PyTorch profiler trace files
-        ├── logs/                # Training log files
-        └── trace_analysis/      # TraceLens analysis reports (if uploaded)
-
-    TraceLens Report Logic:
-        - upload_tracelens_report=True: Generate AND upload (auto-enables generation)
-        - generate_tracelens_report=True only: Generate locally without upload
-        - Both False: No report generation
-
-    Args:
-        tensorboard_dir: Path to tensorboard directory with trace files
-        exp_root_path: Root experiment path for log files
-        upload_traces: Whether to upload trace files (default: True)
-        upload_logs: Whether to upload log files (default: True)
-        generate_tracelens_report: Whether to generate TraceLens reports locally
-        upload_tracelens_report: Whether to upload TraceLens reports to MLflow (implies generation)
-        tracelens_ranks: List of ranks to analyze with TraceLens
-                        (None = all, [0, 8] = ranks 0 and 8 only)
-                        Specify fewer ranks to limit number of reports
-        tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
-        tracelens_cleanup_after_upload: Remove local reports after upload (default: False)
-
-    Returns:
-        Dictionary with counts of uploaded files, or None if MLflow is not enabled
-    """
-    mlflow_writer = get_mlflow_writer()
-    if mlflow_writer is None:
-        return None
-
-    return upload_artifacts_to_mlflow(
-        mlflow_writer=mlflow_writer,
-        tensorboard_dir=tensorboard_dir,
-        exp_root_path=exp_root_path,
-        upload_traces=upload_traces,
-        upload_logs=upload_logs,
-        generate_tracelens_report=generate_tracelens_report,
-        upload_tracelens_report=upload_tracelens_report,
-        tracelens_ranks=tracelens_ranks,
-        tracelens_output_format=tracelens_output_format,
-        tracelens_cleanup_after_upload=tracelens_cleanup_after_upload,
-    )
diff --git a/primus/backends/megatron/training/mlflow_setup.py b/primus/backends/megatron/training/mlflow_setup.py
new file mode 100644
index 000000000..1bfbe5f05
--- /dev/null
+++ b/primus/backends/megatron/training/mlflow_setup.py
@@ -0,0 +1,80 @@
+###############################################################################
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Modification Copyright© 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+MLflow artifact upload utilities.
+
+This module provides functions for uploading artifacts (traces, logs, TraceLens
+reports) to MLflow. Separated from global_vars.py to reduce merge conflicts.
+"""
+
+from typing import List, Optional
+
+from .global_vars import get_mlflow_writer
+from .mlflow_artifacts import upload_artifacts_to_mlflow
+
+
+def upload_mlflow_artifacts(
+    tensorboard_dir: Optional[str] = None,
+    exp_root_path: Optional[str] = None,
+    upload_traces: bool = True,
+    upload_logs: bool = True,
+    generate_tracelens_report: bool = False,
+    upload_tracelens_report: bool = False,
+    tracelens_ranks: Optional[List[int]] = None,
+    tracelens_output_format: str = "all",
+    tracelens_cleanup_after_upload: bool = False,
+) -> Optional[dict]:
+    """
+    Upload trace files, log files, and TraceLens reports to MLflow as artifacts.
+
+    This function should be called at the end of training to upload all
+    artifacts to MLflow. Only the rank that initialized MLflow (last rank)
+    should call this to avoid duplicate uploads.
+
+    MLflow Artifact Structure:
+        artifacts/
+        ├── traces/              # PyTorch profiler trace files
+        ├── logs/                # Training log files
+        └── trace_analysis/      # TraceLens analysis reports (if uploaded)
+
+    TraceLens Report Logic:
+        - upload_tracelens_report=True: Generate AND upload (auto-enables generation)
+        - generate_tracelens_report=True only: Generate locally without upload
+        - Both False: No report generation
+
+    Args:
+        tensorboard_dir: Path to tensorboard directory with trace files
+        exp_root_path: Root experiment path for log files
+        upload_traces: Whether to upload trace files (default: True)
+        upload_logs: Whether to upload log files (default: True)
+        generate_tracelens_report: Whether to generate TraceLens reports locally
+        upload_tracelens_report: Whether to upload TraceLens reports to MLflow (implies generation)
+        tracelens_ranks: List of ranks to analyze with TraceLens
+                        (None = all, [0, 8] = ranks 0 and 8 only)
+                        Specify fewer ranks to limit number of reports
+        tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+        tracelens_cleanup_after_upload: Remove local reports after upload (default: False)
+
+    Returns:
+        Dictionary with counts of uploaded files, or None if MLflow is not enabled
+    """
+    mlflow_writer = get_mlflow_writer()
+    if mlflow_writer is None:
+        return None
+
+    return upload_artifacts_to_mlflow(
+        mlflow_writer=mlflow_writer,
+        tensorboard_dir=tensorboard_dir,
+        exp_root_path=exp_root_path,
+        upload_traces=upload_traces,
+        upload_logs=upload_logs,
+        generate_tracelens_report=generate_tracelens_report,
+        upload_tracelens_report=upload_tracelens_report,
+        tracelens_ranks=tracelens_ranks,
+        tracelens_output_format=tracelens_output_format,
+        tracelens_cleanup_after_upload=tracelens_cleanup_after_upload,
+    )
diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index 904c5bce4..534f9e96b 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -148,8 +148,8 @@
     get_train_start_time,
     set_primus_global_variables,
     set_train_start_time,
-    upload_mlflow_artifacts,
 )
+from primus.backends.megatron.training.mlflow_setup import upload_mlflow_artifacts
 from primus.backends.megatron.training.tokenizer.tokenizer import build_tokenizer
 from primus.core.utils import checker, file_utils
 from primus.core.utils.rocm_mem_info import get_rocm_smi_mem_info

From 99f0fa6e500f6f84602d803d218745a8127db4ed Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Tue, 3 Feb 2026 12:53:10 +0000
Subject: [PATCH 20/42] Address Copilot review comments for TraceLens
 functionality

- Document performance tradeoff for "all" format (parses trace twice)
- Add explicit warning when rank filtering results in zero matching files
- Add defensive checks for dfs return value before using len()
- Improve fallback error messages to clarify limitations
---
 .../megatron/training/mlflow_artifacts.py     | 29 +++++++++++++++----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 86ff95341..29ddd728f 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -387,6 +387,8 @@ def generate_tracelens_report(
 
         # For "all" format: TraceLens uses either/or logic - if output_csvs_dir is set,
         # it ONLY generates CSVs. So we need to call it twice for both formats.
+        # Note: This means the trace file is parsed twice, roughly doubling processing time
+        # compared to a single format. This is a TraceLens limitation, not a bug.
         if output_format == "all":
             xlsx_path = os.path.join(output_dir, f"{report_name}_analysis.xlsx")
             csv_subdir = os.path.join(output_dir, report_name)
@@ -397,8 +399,9 @@ def generate_tracelens_report(
 
             # Check XLSX output
             if os.path.exists(xlsx_path):
+                num_tabs = len(dfs) if dfs else 0
                 log_rank_0(
-                    f"[TraceLens] Generated XLSX report with {len(dfs)} tabs: {os.path.basename(xlsx_path)}"
+                    f"[TraceLens] Generated XLSX report with {num_tabs} tabs: {os.path.basename(xlsx_path)}"
                 )
                 generated_files.append(xlsx_path)
 
@@ -416,8 +419,9 @@ def generate_tracelens_report(
             xlsx_path = os.path.join(output_dir, f"{report_name}_analysis.xlsx")
             dfs = generate_perf_report_pytorch(trace_file, output_xlsx_path=xlsx_path)
             if os.path.exists(xlsx_path):
+                num_tabs = len(dfs) if dfs else 0
                 log_rank_0(
-                    f"[TraceLens] Generated XLSX report with {len(dfs)} tabs: {os.path.basename(xlsx_path)}"
+                    f"[TraceLens] Generated XLSX report with {num_tabs} tabs: {os.path.basename(xlsx_path)}"
                 )
                 generated_files.append(xlsx_path)
 
@@ -440,14 +444,20 @@ def generate_tracelens_report(
         return []
 
     except ImportError:
-        log_rank_0("[TraceLens] TraceLens not available, using fallback CSV summary")
-        # Fallback to simple CSV summary
+        warning_rank_0(
+            "[TraceLens] TraceLens not available. Using simplified fallback CSV summary. "
+            "Install TraceLens for comprehensive kernel, memory, and communication analysis."
+        )
+        # Fallback to simple CSV summary (basic stats only, may not handle all trace formats)
         csv_path = _generate_trace_summary_csv(trace_file, output_dir, f"{report_name}_summary.csv")
         return [csv_path] if csv_path else []
 
     except Exception as e:
-        warning_rank_0(f"[TraceLens] Error generating report: {e}")
-        # Fallback to simple CSV summary
+        warning_rank_0(
+            f"[TraceLens] Error generating report: {e}. "
+            "Using simplified fallback CSV summary with basic statistics only."
+        )
+        # Fallback to simple CSV summary (basic stats only, may not handle all trace formats)
         csv_path = _generate_trace_summary_csv(trace_file, output_dir, f"{report_name}_summary.csv")
         return [csv_path] if csv_path else []
 
@@ -602,8 +612,15 @@ def generate_tracelens_reports(
 
     # Filter by ranks if specified
     if ranks is not None:
+        original_count = len(trace_files)
         trace_files = _filter_traces_by_rank(trace_files, ranks)
         log_rank_0(f"[TraceLens] Filtered to {len(trace_files)} trace files for ranks: {ranks}")
+        if not trace_files and original_count > 0:
+            warning_rank_0(
+                f"[TraceLens] Warning: No trace files match the specified ranks {ranks}. "
+                f"Found {original_count} trace files but none matched. "
+                "Check that the rank numbers are correct."
+            )
 
     log_rank_0(
         f"[TraceLens] Generating {output_format.upper()} reports for {len(trace_files)} trace files..."

From ab6323ef4010649fc3a389a7f76212fa829693c3 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 5 Feb 2026 13:44:18 +0000
Subject: [PATCH 21/42] feat: auto-enable mlflow and profiling for tracelens
 upload

When mlflow_upload_traces, mlflow_upload_logs, or mlflow_upload_tracelens_report is True:
- Auto-enable mlflow (set disable_mlflow=False)
- Auto-enable profiling if trace or tracelens upload is requested

This removes the need to explicitly set:
- --disable_mlflow=False
- --profile=True
- --use_pytorch_profiler=True
---
 primus/modules/trainer/megatron/trainer.py | 23 ++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index 534f9e96b..853884364 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -433,6 +433,29 @@ def update_primus_config(
         log_kv_rank_0(f"  -wandb_save_dir", f"{args.wandb_save_dir}")
         log_kv_rank_0(f"  -wandb_entity", f"{args.wandb_entity}")
 
+        # mlflow - auto-enable dependencies
+        # If any mlflow_upload_* flag is True, auto-enable mlflow
+        mlflow_upload_flags = [
+            getattr(args, "mlflow_upload_traces", False),
+            getattr(args, "mlflow_upload_logs", False),
+            getattr(args, "mlflow_upload_tracelens_report", False),
+        ]
+        if any(mlflow_upload_flags) and args.disable_mlflow:
+            args.disable_mlflow = False
+            debug_rank_0("Auto-enabled MLflow (disable_mlflow=False) because mlflow_upload_* flags are set")
+
+        # If uploading traces or tracelens reports, auto-enable profiling
+        needs_profiling = getattr(args, "mlflow_upload_traces", False) or getattr(
+            args, "mlflow_upload_tracelens_report", False
+        )
+        if needs_profiling:
+            if not getattr(args, "profile", False):
+                args.profile = True
+                debug_rank_0("Auto-enabled profile=True for mlflow trace/tracelens upload")
+            if not getattr(args, "use_pytorch_profiler", False):
+                args.use_pytorch_profiler = True
+                debug_rank_0("Auto-enabled use_pytorch_profiler=True for mlflow trace/tracelens upload")
+
         # mlflow
         log_kv_rank_0(f"-disable_mlflow", f"{args.disable_mlflow}")
         if not args.disable_mlflow:

From 5b4e43c368725c9afa64b989a5e7c1cb537d05ba Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 5 Feb 2026 14:17:39 +0000
Subject: [PATCH 22/42] fix: auto-enable tensorboard when profiling is enabled

The profiler saves traces to tensorboard_dir, which is None when
tensorboard is disabled. This caused a TypeError during trace save.

Moved auto-enable logic before tensorboard section and added
tensorboard auto-enable when mlflow_upload_traces or
mlflow_upload_tracelens_report is True.
---
 primus/modules/trainer/megatron/trainer.py | 49 ++++++++++++----------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index 853884364..bae9bfb1f 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -396,6 +396,32 @@ def update_primus_config(
             else:
                 log_rank_0(f"-{latest_file} does not exist, skip auto_continue_train.")
 
+        # Auto-enable dependencies for mlflow upload flags
+        # This must run BEFORE tensorboard section to ensure paths are set correctly
+        mlflow_upload_flags = [
+            getattr(args, "mlflow_upload_traces", False),
+            getattr(args, "mlflow_upload_logs", False),
+            getattr(args, "mlflow_upload_tracelens_report", False),
+        ]
+        if any(mlflow_upload_flags) and args.disable_mlflow:
+            args.disable_mlflow = False
+            debug_rank_0("Auto-enabled MLflow (disable_mlflow=False) because mlflow_upload_* flags are set")
+
+        # If uploading traces or tracelens reports, auto-enable profiling and tensorboard
+        needs_profiling = getattr(args, "mlflow_upload_traces", False) or getattr(
+            args, "mlflow_upload_tracelens_report", False
+        )
+        if needs_profiling:
+            if not getattr(args, "profile", False):
+                args.profile = True
+                debug_rank_0("Auto-enabled profile=True for mlflow trace/tracelens upload")
+            if not getattr(args, "use_pytorch_profiler", False):
+                args.use_pytorch_profiler = True
+                debug_rank_0("Auto-enabled use_pytorch_profiler=True for mlflow trace/tracelens upload")
+            if getattr(args, "disable_tensorboard", True):
+                args.disable_tensorboard = False
+                debug_rank_0("Auto-enabled tensorboard (disable_tensorboard=False) for profiler trace output")
+
         # tensorboard
         if not args.disable_tensorboard:
             tb_path = os.path.abspath(os.path.join(exp_root_path, "tensorboard"))
@@ -433,29 +459,6 @@ def update_primus_config(
         log_kv_rank_0(f"  -wandb_save_dir", f"{args.wandb_save_dir}")
         log_kv_rank_0(f"  -wandb_entity", f"{args.wandb_entity}")
 
-        # mlflow - auto-enable dependencies
-        # If any mlflow_upload_* flag is True, auto-enable mlflow
-        mlflow_upload_flags = [
-            getattr(args, "mlflow_upload_traces", False),
-            getattr(args, "mlflow_upload_logs", False),
-            getattr(args, "mlflow_upload_tracelens_report", False),
-        ]
-        if any(mlflow_upload_flags) and args.disable_mlflow:
-            args.disable_mlflow = False
-            debug_rank_0("Auto-enabled MLflow (disable_mlflow=False) because mlflow_upload_* flags are set")
-
-        # If uploading traces or tracelens reports, auto-enable profiling
-        needs_profiling = getattr(args, "mlflow_upload_traces", False) or getattr(
-            args, "mlflow_upload_tracelens_report", False
-        )
-        if needs_profiling:
-            if not getattr(args, "profile", False):
-                args.profile = True
-                debug_rank_0("Auto-enabled profile=True for mlflow trace/tracelens upload")
-            if not getattr(args, "use_pytorch_profiler", False):
-                args.use_pytorch_profiler = True
-                debug_rank_0("Auto-enabled use_pytorch_profiler=True for mlflow trace/tracelens upload")
-
         # mlflow
         log_kv_rank_0(f"-disable_mlflow", f"{args.disable_mlflow}")
         if not args.disable_mlflow:

From 74ad87993a96abd06e4ef559b5e46c8820724668 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 5 Feb 2026 14:42:27 +0000
Subject: [PATCH 23/42] chore: set TraceLens defaults to false (opt-in)

TraceLens report generation and upload are now disabled by default.
They are auto-enabled when mlflow_upload_tracelens_report=true is set.

This prevents TraceLens from running when only testing other features
like mlflow_upload_performance_metrics.
---
 primus/configs/modules/megatron/primus_megatron_module.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index 75bb4db55..02ee324ed 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -25,8 +25,8 @@ mlflow_upload_logs: true             # Upload training log files to MLflow
 #   Default: [0, 8] = one rank per node (assumes 8 GPUs/node)
 #   Use null for all ranks, or customize list like [0, 1, 8, 9] for more coverage
 # ----------------------------------------------------------------------------
-generate_tracelens_report: true     # Generate TraceLens analysis reports locally
-mlflow_upload_tracelens_report: true  # Upload TraceLens reports to MLflow (implies generation)
+generate_tracelens_report: false    # Generate TraceLens analysis reports locally (auto-enabled when upload=true)
+mlflow_upload_tracelens_report: false # Upload TraceLens reports to MLflow (auto-enables generation, profiling, tensorboard)
 mlflow_tracelens_ranks: [0, 8]       # List of ranks to analyze (default: one per node for 2-node setup)
 mlflow_tracelens_output_format: all  # TraceLens report format: all (xlsx+csv), xlsx, or csv
 mlflow_tracelens_cleanup_after_upload: false  # Keep local reports (true to cleanup and save disk space)

From 7a3856f643870b523da6be64a2028c5bfab915dc Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Mon, 9 Feb 2026 12:34:16 +0000
Subject: [PATCH 24/42] Pin TraceLens install to v0.4.0; default mlflow upload
 flags to false

- mlflow_artifacts: install TraceLens from git@v0.4.0 for reproducibility and supply-chain safety
- primus_megatron_module: mlflow_upload_traces/logs default false so MLflow stays opt-in

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../backends/megatron/training/mlflow_artifacts.py   | 12 +++++++++---
 .../modules/megatron/primus_megatron_module.yaml     |  8 ++++----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 29ddd728f..b80f7da84 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -41,6 +41,9 @@
 
 from primus.modules.module_utils import log_rank_0, warning_rank_0
 
+# Pinned ref for runtime TraceLens install (supply-chain safety and reproducibility)
+TRACELENS_INSTALL_REF = "v0.4.0"
+
 
 def _get_all_trace_files(tensorboard_dir: str) -> list:
     """
@@ -263,20 +266,23 @@ def _ensure_tracelens_installed() -> bool:
     except ImportError:
         log_rank_0("[TraceLens] TraceLens not found, attempting to install from GitHub...")
         try:
-            # TraceLens is on GitHub, not PyPI
+            # TraceLens is on GitHub, not PyPI; pin to a tag for reproducibility and supply-chain safety
+            install_spec = f"git+https://github.com/AMD-AGI/TraceLens.git@{TRACELENS_INSTALL_REF}"
             subprocess.check_call(
                 [
                     sys.executable,
                     "-m",
                     "pip",
                     "install",
-                    "git+https://github.com/AMD-AGI/TraceLens.git",
+                    install_spec,
                     "-q",
                 ],
                 stdout=subprocess.DEVNULL,
                 stderr=subprocess.DEVNULL,
             )
-            log_rank_0("[TraceLens] Successfully installed TraceLens from GitHub")
+            log_rank_0(
+                f"[TraceLens] Successfully installed TraceLens from GitHub (ref={TRACELENS_INSTALL_REF})"
+            )
         except subprocess.CalledProcessError as e:
             warning_rank_0(f"[TraceLens] Failed to install TraceLens: {e}")
             return False
diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index 02ee324ed..e921d0465 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -5,10 +5,10 @@ disable_wandb: true
 disable_mlflow: true
 mlflow_run_name: null
 mlflow_experiment_name: null
-# NOTE: When disable_mlflow=false, traces and logs are uploaded by default.
-# Set these to false if you only want metrics/params logged to MLflow.
-mlflow_upload_traces: true           # Upload profiler trace files to MLflow
-mlflow_upload_logs: true             # Upload training log files to MLflow
+# When disable_mlflow=false, set these to true to upload traces/logs to MLflow.
+# Default false so MLflow remains opt-in and disable_mlflow is respected.
+mlflow_upload_traces: false          # Upload profiler trace files to MLflow
+mlflow_upload_logs: false            # Upload training log files to MLflow
 
 # TraceLens Report Generation & Upload
 # ----------------------------------------------------------------------------

From 8098e53034e874df71046d0cc2e3294caee10357 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Mon, 9 Feb 2026 14:32:24 +0000
Subject: [PATCH 25/42] TraceLens/MLflow fixes: tests, docs, local-only
 generation, cleanup safety

mlflow_artifacts.py:
- Pin TraceLens install to v0.4.0 for reproducibility and supply-chain safety
- Use log_artifact for files, log_artifacts for dirs (correct MLflow API)
- Default output_format to xlsx; document and warn on 'all' (2x parse)
- Only cleanup after upload when all uploads succeeded
- Clarify ranks string parsing (config/CLI); report count log (items not files)
- Use dfs in CSV branch log; ASCII arrows in docstring; runtime warning for 'all'

mlflow_setup.py:
- Run local TraceLens generation when MLflow disabled (generate_tracelens_report=True)

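primus_megatron_module.yaml:
- List supported TraceLens options; no max_reports; ASCII arrows; format comment
- Default mlflow_tracelens_output_format to xlsx (was all)
- Add mlflow_upload_performance_metrics option (default: false) and document the
  perf/* metrics it logs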

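trainer.py:
- Include generate_tracelens_report in needs_profiling for local-only generation
- Stop auto-enabling MLflow from mlflow_upload_* flags; those flags now trigger
  profiling only when disable_mlflow is false
- Always call upload_mlflow_artifacts (so local-only runs without MLflow)
- Default tracelens_output_format to xlsx; upload flags default False
- When mlflow_upload_performance_metrics is set, enable throughput calculation and log
  perf/* metrics (throughput, iteration time, memory, all-gathered per-rank GPU
  utilization) to MLflow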

tests:
- Add test_mlflow_artifacts.py: trace/log discovery, rank extraction, report generation
  (mocked TraceLens), upload logic (file vs dir), cleanup, upload_artifacts_to_mlflow

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../megatron/training/mlflow_artifacts.py     |  92 ++-
 .../megatron/training/mlflow_setup.py         |  13 +-
 .../megatron/primus_megatron_module.yaml      |  36 +-
 primus/modules/trainer/megatron/trainer.py    | 130 +++-
 .../megatron/test_mlflow_artifacts.py         | 621 ++++++++++++++++++
 5 files changed, 822 insertions(+), 70 deletions(-)
 create mode 100644 tests/unit_tests/backends/megatron/test_mlflow_artifacts.py

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index b80f7da84..2599e86e4 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -28,9 +28,9 @@
         └── ...
 
 TraceLens Report Formats:
-    - xlsx: Multi-tab Excel with sections for kernels, memory, communication, etc.
+    - xlsx: Multi-tab Excel (default; single parse, fastest)
     - csv:  Multiple CSV files per rank (kernels, memory, communication, etc.)
-    - all:  Both xlsx and csv files (default)
+    - all:  Both xlsx and csv (parses trace twice, ~2x processing time; use when both formats needed)
 """
 
 import glob
@@ -352,7 +352,7 @@ def generate_tracelens_report(
     trace_file: str,
     output_dir: str,
     report_name: Optional[str] = None,
-    output_format: str = "all",
+    output_format: str = "xlsx",
 ) -> List[str]:
     """
     Generate a TraceLens analysis report for a single trace file.
@@ -362,9 +362,10 @@ def generate_tracelens_report(
         output_dir: Directory to save the report
         report_name: Optional custom name for the report (base name for CSVs)
         output_format: Output format:
-                      - "all" (default): Both XLSX and CSV files
-                      - "xlsx": Single multi-tab Excel file with detailed analysis
+                      - "xlsx" (default): Single multi-tab Excel; one trace parse, fastest.
                       - "csv": Multiple CSV files (kernels, memory, communication, etc.)
+                      - "all": Both XLSX and CSV; trace is parsed twice (~2x processing time).
+                      Prefer "xlsx" or "csv" to avoid this overhead unless both are needed.
 
     Returns:
         List of paths to generated report files
@@ -393,9 +394,14 @@ def generate_tracelens_report(
 
         # For "all" format: TraceLens uses either/or logic - if output_csvs_dir is set,
         # it ONLY generates CSVs. So we need to call it twice for both formats.
-        # Note: This means the trace file is parsed twice, roughly doubling processing time
-        # compared to a single format. This is a TraceLens limitation, not a bug.
+        # Performance: trace file is parsed twice (~2x time; large traces can be hundreds of MB).
+        # A future workaround could write CSVs from the DataFrames returned by the first call
+        # if TraceLens API exposes a suitable export; for now we accept the double parse.
         if output_format == "all":
+            warning_rank_0(
+                "[TraceLens] output_format='all' parses the trace file twice (~2x processing time). "
+                "Use 'xlsx' or 'csv' if only one format is needed."
+            )
             xlsx_path = os.path.join(output_dir, f"{report_name}_analysis.xlsx")
             csv_subdir = os.path.join(output_dir, report_name)
             os.makedirs(csv_subdir, exist_ok=True)
@@ -440,7 +446,10 @@ def generate_tracelens_report(
             # Collect all generated CSV files (escape path to handle [] characters in filenames)
             csv_files = glob.glob(os.path.join(glob.escape(csv_subdir), "*.csv"))
             if csv_files:
-                log_rank_0(f"[TraceLens] Generated {len(csv_files)} CSV files for {report_name}")
+                num_sections = len(dfs) if dfs else 0
+                log_rank_0(
+                    f"[TraceLens] Generated {len(csv_files)} CSV files ({num_sections} sections) for {report_name}"
+                )
                 generated_files.append(csv_subdir)  # Upload directory to preserve structure
 
         if generated_files:
@@ -575,7 +584,7 @@ def generate_tracelens_reports(
     tensorboard_dir: str,
     output_dir: str,
     ranks: Optional[List[int]] = None,
-    output_format: str = "all",
+    output_format: str = "xlsx",
 ) -> List[str]:
     """
     Generate TraceLens analysis reports for trace files.
@@ -586,9 +595,9 @@ def generate_tracelens_reports(
         ranks: List of ranks to generate reports for (None = all ranks)
                To limit number of reports, specify fewer ranks in the list
         output_format: Output format:
-                      - "all" (default): Both XLSX and CSV files
-                      - "xlsx": Multi-tab Excel with detailed analysis
+                      - "xlsx" (default): Multi-tab Excel; single parse, fastest
                       - "csv": Multiple CSV files per rank (kernels, memory, comm, etc.)
+                      - "all": Both XLSX and CSV; trace parsed twice (~2x processing time)
 
     Returns:
         List of paths to all generated report files
@@ -596,7 +605,8 @@ def generate_tracelens_reports(
     # Try to install tracelens, but continue with fallback if not available
     _ensure_tracelens_installed()
 
-    # Normalize ranks parameter: handle string input from config parser
+    # Normalize ranks: config/CLI can pass mlflow_tracelens_ranks as a string (e.g. env override
+    # or serialized list), but we need a list or None for filtering.
     if ranks is not None and isinstance(ranks, str):
         import ast
 
@@ -646,7 +656,7 @@ def generate_tracelens_reports_locally(
     tensorboard_dir: str,
     exp_root_path: str,
     ranks: Optional[List[int]] = None,
-    output_format: str = "all",
+    output_format: str = "xlsx",
 ) -> int:
     """
     Generate TraceLens analysis reports locally (without MLflow upload).
@@ -659,7 +669,7 @@ def generate_tracelens_reports_locally(
         exp_root_path: Root path of the experiment (for saving reports)
         ranks: List of ranks to analyze (None = all ranks, [0] = rank 0 only)
                Specify fewer ranks to limit number of reports
-        output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+        output_format: Report format - "xlsx" (default), "csv", or "all" (xlsx+csv, ~2x time)
 
     Returns:
         Number of reports generated
@@ -703,7 +713,7 @@ def upload_tracelens_reports_to_mlflow(
     tensorboard_dir: str,
     exp_root_path: str,
     ranks: Optional[List[int]] = None,
-    output_format: str = "all",
+    output_format: str = "xlsx",
     artifact_path: str = "trace_analysis",
     cleanup_after_upload: bool = False,
 ) -> int:
@@ -722,7 +732,7 @@ def upload_tracelens_reports_to_mlflow(
         exp_root_path: Root path of the experiment (for saving reports)
         ranks: List of ranks to analyze (None = all ranks, [0] = rank 0 only)
                Specify fewer ranks to limit number of reports
-        output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+        output_format: Report format - "xlsx" (default), "csv", or "all" (xlsx+csv, ~2x time)
         artifact_path: MLflow artifact subdirectory for reports
         cleanup_after_upload: If True, removes local reports after upload to save disk space.
                              If False, keeps reports locally for inspection. Default: False.
@@ -738,7 +748,8 @@ def upload_tracelens_reports_to_mlflow(
         log_rank_0("[TraceLens] MLflow writer not available, skipping report upload")
         return 0
 
-    # Normalize ranks parameter: handle string input from config parser
+    # Normalize ranks: config/CLI can pass mlflow_tracelens_ranks as a string (e.g. env override
+    # or serialized list), but we need a list or None for filtering.
     if ranks is not None and isinstance(ranks, str):
         import ast
 
@@ -774,27 +785,46 @@ def upload_tracelens_reports_to_mlflow(
         log_rank_0("[TraceLens] No reports generated, nothing to upload")
         return 0
 
-    # Upload reports to MLflow
+    # Upload reports to MLflow (files via log_artifact, dirs via log_artifacts for correct behavior)
     uploaded_count = 0
     for report_path in reports:
         try:
-            mlflow_writer.log_artifact(report_path, artifact_path=artifact_path)
+            if os.path.isdir(report_path):
+                subpath = (
+                    os.path.join(artifact_path, os.path.basename(report_path))
+                    if artifact_path
+                    else os.path.basename(report_path)
+                )
+                mlflow_writer.log_artifacts(report_path, artifact_path=subpath)
+                log_rank_0(f"[MLflow] Uploaded TraceLens report dir: {os.path.basename(report_path)}")
+            else:
+                mlflow_writer.log_artifact(report_path, artifact_path=artifact_path)
+                log_rank_0(f"[MLflow] Uploaded TraceLens report: {os.path.basename(report_path)}")
             uploaded_count += 1
-            log_rank_0(f"[MLflow] Uploaded TraceLens report: {os.path.basename(report_path)}")
         except Exception as e:
             warning_rank_0(f"[MLflow] Failed to upload report {report_path}: {e}")
 
-    log_rank_0(f"[TraceLens] Uploaded {uploaded_count} reports to '{artifact_path}'")
+    log_rank_0(
+        f"[TraceLens] Uploaded {uploaded_count} report item(s) to '{artifact_path}' "
+        "(each item may be a file or a directory of CSV files)"
+    )
 
-    # Optionally clean up local reports after successful upload to save disk space
+    # Optionally clean up local reports only when all uploads succeeded, to avoid losing data
+    # when some uploads failed (reported via warning_rank_0 above).
     if cleanup_after_upload:
-        try:
-            import shutil
-
-            shutil.rmtree(reports_dir)
-            log_rank_0(f"[TraceLens] Cleaned up local reports directory: {reports_dir}")
-        except Exception as e:
-            warning_rank_0(f"[TraceLens] Failed to cleanup reports directory: {e}")
+        if uploaded_count == len(reports):
+            try:
+                import shutil
+
+                shutil.rmtree(reports_dir)
+                log_rank_0(f"[TraceLens] Cleaned up local reports directory: {reports_dir}")
+            except Exception as e:
+                warning_rank_0(f"[TraceLens] Failed to cleanup reports directory: {e}")
+        else:
+            log_rank_0(
+                f"[TraceLens] Skipping cleanup (only {uploaded_count}/{len(reports)} uploads succeeded); "
+                f"keeping local reports at: {reports_dir}"
+            )
     else:
         log_rank_0(f"[TraceLens] Keeping local reports at: {reports_dir}")
 
@@ -815,7 +845,7 @@ def upload_artifacts_to_mlflow(
     generate_tracelens_report: bool = False,
     upload_tracelens_report: bool = False,
     tracelens_ranks: Optional[List[int]] = None,
-    tracelens_output_format: str = "all",
+    tracelens_output_format: str = "xlsx",
     tracelens_cleanup_after_upload: bool = False,
 ) -> dict:
     """
@@ -855,7 +885,7 @@ def upload_artifacts_to_mlflow(
         tracelens_ranks: List of ranks to generate TraceLens reports for
                         (None = all ranks, [0, 8] = ranks 0 and 8 only)
                         Specify fewer ranks to limit number of reports
-        tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+        tracelens_output_format: Report format - "xlsx" (default), "csv", or "all" (xlsx+csv, ~2x time)
         tracelens_cleanup_after_upload: If True, removes local reports after upload to save disk space.
                                        If False, keeps reports locally for inspection (default).
 
diff --git a/primus/backends/megatron/training/mlflow_setup.py b/primus/backends/megatron/training/mlflow_setup.py
index 1bfbe5f05..c615b2365 100644
--- a/primus/backends/megatron/training/mlflow_setup.py
+++ b/primus/backends/megatron/training/mlflow_setup.py
@@ -14,7 +14,10 @@
 from typing import List, Optional
 
 from .global_vars import get_mlflow_writer
-from .mlflow_artifacts import upload_artifacts_to_mlflow
+from .mlflow_artifacts import (
+    generate_tracelens_reports_locally,
+    upload_artifacts_to_mlflow,
+)
 
 
 def upload_mlflow_artifacts(
@@ -64,6 +67,14 @@ def upload_mlflow_artifacts(
     """
     mlflow_writer = get_mlflow_writer()
     if mlflow_writer is None:
+        # Local-only TraceLens generation: run even when MLflow is disabled
+        if generate_tracelens_report and tensorboard_dir and exp_root_path:
+            generate_tracelens_reports_locally(
+                tensorboard_dir=tensorboard_dir,
+                exp_root_path=exp_root_path,
+                ranks=tracelens_ranks,
+                output_format=tracelens_output_format,
+            )
         return None
 
     return upload_artifacts_to_mlflow(
diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index e921d0465..8ebdd78ae 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -12,23 +12,28 @@ mlflow_upload_logs: false            # Upload training log files to MLflow
 
 # TraceLens Report Generation & Upload
 # ----------------------------------------------------------------------------
+# Supported options: generate_tracelens_report, mlflow_upload_tracelens_report,
+#   mlflow_tracelens_ranks, mlflow_tracelens_output_format, mlflow_tracelens_cleanup_after_upload
+#
 # generate_tracelens_report:          Generate TraceLens analysis reports locally
 # mlflow_upload_tracelens_report:     Upload reports to MLflow (auto-enables generation)
 #
 # Usage patterns:
-#   generate=false, upload=false  →  No reports generated
-#   generate=true,  upload=false  →  Generate reports locally only
-#   generate=false, upload=true   →  Generate AND upload (auto-enabled)
-#   generate=true,  upload=true   →  Generate AND upload (explicit)
+#   generate=false, upload=false  ->  No reports generated
+#   generate=true,  upload=false  ->  Generate reports locally only
+#   generate=false, upload=true   ->  Generate AND upload (auto-enabled)
+#   generate=true,  upload=true   ->  Generate AND upload (explicit)
 #
-# To limit number of reports: Specify fewer ranks in mlflow_tracelens_ranks
+# To limit number of reports: use mlflow_tracelens_ranks (no separate max_reports option).
 #   Default: [0, 8] = one rank per node (assumes 8 GPUs/node)
 #   Use null for all ranks, or customize list like [0, 1, 8, 9] for more coverage
 # ----------------------------------------------------------------------------
 generate_tracelens_report: false    # Generate TraceLens analysis reports locally (auto-enabled when upload=true)
 mlflow_upload_tracelens_report: false # Upload TraceLens reports to MLflow (auto-enables generation, profiling, tensorboard)
 mlflow_tracelens_ranks: [0, 8]       # List of ranks to analyze (default: one per node for 2-node setup)
-mlflow_tracelens_output_format: all  # TraceLens report format: all (xlsx+csv), xlsx, or csv
+# TraceLens report format: xlsx (default, single parse, fastest), csv, or all (xlsx+csv;
+# parses each trace twice so ~2x processing time; use only when both formats are needed)
+mlflow_tracelens_output_format: xlsx
 mlflow_tracelens_cleanup_after_upload: false  # Keep local reports (true to cleanup and save disk space)
 disable_compile_dependencies: true
 # NOTE:
@@ -39,6 +44,25 @@ disable_compile_dependencies: true
 use_rocm_mem_info: false
 use_rocm_mem_info_iters: [1,2]
 
+# MLflow performance metrics - comprehensive metrics for scaling tests
+# When enabled, automatically enables throughput calculations and logs to MLflow:
+#
+#   1. Performance Metrics:
+#      - perf/throughput_tflops_per_gpu: TFLOP/s per GPU
+#      - perf/tps_tokens_per_sec_per_gpu: Tokens/sec per GPU
+#      - perf/iteration_time_ms: Time per training step (ms)
+#
+#   2. Memory Metrics:
+#      - perf/{rocm/hip}_peak_mem_gb: Peak GPU memory usage (GB)
+#      - perf/{rocm/hip}_mem_utilization_pct: Memory utilization (% of total)
+#
+#   3. System Metrics:
+#      - perf/gpu_utilization_pct_rank{N}: GPU utilization per rank (%)
+#      - perf/gpu_utilization_pct_avg: Average GPU utilization across all ranks (%)
+#
+# Note: This flag implicitly enables log_throughput behavior for metric collection.
+mlflow_upload_performance_metrics: false
+
 # profiling
 disable_profiler_activity_cpu: false
 torch_profiler_record_shapes: true
diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index 3ba8ec523..213d89271 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -151,7 +151,7 @@
 from primus.backends.megatron.training.mlflow_setup import upload_mlflow_artifacts
 from primus.backends.megatron.training.tokenizer.tokenizer import build_tokenizer
 from primus.core.utils import checker, file_utils
-from primus.core.utils.rocm_mem_info import get_rocm_smi_mem_info
+from primus.core.utils.rocm_mem_info import get_rocm_smi_gpu_util, get_rocm_smi_mem_info
 from primus.core.utils.yaml_utils import nested_namespace_to_dict
 from primus.modules.base_module import BaseModule
 from primus.modules.module_utils import (
@@ -396,28 +396,25 @@ def update_primus_config(
             else:
                 log_rank_0(f"-{latest_file} does not exist, skip auto_continue_train.")
 
-        # Auto-enable dependencies for mlflow upload flags
-        # This must run BEFORE tensorboard section to ensure paths are set correctly
-        mlflow_upload_flags = [
-            getattr(args, "mlflow_upload_traces", False),
-            getattr(args, "mlflow_upload_logs", False),
-            getattr(args, "mlflow_upload_tracelens_report", False),
-        ]
-        if any(mlflow_upload_flags) and args.disable_mlflow:
-            args.disable_mlflow = False
-            debug_rank_0("Auto-enabled MLflow (disable_mlflow=False) because mlflow_upload_* flags are set")
-
-        # If uploading traces or tracelens reports, auto-enable profiling and tensorboard
-        needs_profiling = getattr(args, "mlflow_upload_traces", False) or getattr(
-            args, "mlflow_upload_tracelens_report", False
-        )
+        # Auto-enable profiling and tensorboard when traces are needed: for MLflow upload
+        # (only if MLflow is enabled) or for local TraceLens report generation.
+        # Without this, generate_tracelens_report=True with profile=False would produce no traces.
+        needs_profiling = (
+            (
+                getattr(args, "mlflow_upload_traces", False)
+                or getattr(args, "mlflow_upload_tracelens_report", False)
+            )
+            and not args.disable_mlflow
+        ) or getattr(args, "generate_tracelens_report", False)
         if needs_profiling:
             if not getattr(args, "profile", False):
                 args.profile = True
-                debug_rank_0("Auto-enabled profile=True for mlflow trace/tracelens upload")
+                debug_rank_0("Auto-enabled profile=True for trace/tracelens (upload or local generation)")
             if not getattr(args, "use_pytorch_profiler", False):
                 args.use_pytorch_profiler = True
-                debug_rank_0("Auto-enabled use_pytorch_profiler=True for mlflow trace/tracelens upload")
+                debug_rank_0(
+                    "Auto-enabled use_pytorch_profiler=True for trace/tracelens (upload or local generation)"
+                )
             if getattr(args, "disable_tensorboard", True):
                 args.disable_tensorboard = False
                 debug_rank_0("Auto-enabled tensorboard (disable_tensorboard=False) for profiler trace output")
@@ -1147,19 +1144,20 @@ def run(self, *args, **kwargs):
         ft_integration.on_checkpointing_end(is_async_finalization=True)
 
         mlflow_writer = get_mlflow_writer()
+        # Always call: uploads to MLflow when enabled; when MLflow disabled, still runs
+        # local-only TraceLens report generation if generate_tracelens_report=True.
+        upload_mlflow_artifacts(
+            tensorboard_dir=args.tensorboard_dir,
+            exp_root_path=self.exp_root_path,
+            upload_traces=getattr(args, "mlflow_upload_traces", False),
+            upload_logs=getattr(args, "mlflow_upload_logs", False),
+            generate_tracelens_report=getattr(args, "generate_tracelens_report", False),
+            upload_tracelens_report=getattr(args, "mlflow_upload_tracelens_report", False),
+            tracelens_ranks=getattr(args, "mlflow_tracelens_ranks", None),
+            tracelens_output_format=getattr(args, "mlflow_tracelens_output_format", "xlsx"),
+            tracelens_cleanup_after_upload=getattr(args, "mlflow_tracelens_cleanup_after_upload", False),
+        )
         if mlflow_writer:
-            # Upload artifacts to MLflow before ending the run
-            upload_mlflow_artifacts(
-                tensorboard_dir=args.tensorboard_dir,
-                exp_root_path=self.exp_root_path,
-                upload_traces=getattr(args, "mlflow_upload_traces", True),
-                upload_logs=getattr(args, "mlflow_upload_logs", True),
-                generate_tracelens_report=getattr(args, "generate_tracelens_report", False),
-                upload_tracelens_report=getattr(args, "mlflow_upload_tracelens_report", False),
-                tracelens_ranks=getattr(args, "mlflow_tracelens_ranks", None),
-                tracelens_output_format=getattr(args, "mlflow_tracelens_output_format", "all"),
-                tracelens_cleanup_after_upload=getattr(args, "mlflow_tracelens_cleanup_after_upload", False),
-            )
             mlflow_writer.end_run()
 
         one_logger and one_logger.log_metrics({"app_finish_time": one_logger_utils.get_timestamp_in_ms()})
@@ -2000,13 +1998,24 @@ def training_log(
         if iteration % args.log_interval == 0:
             # Note(wenx): If we want to collect rocm-smi memory information for the first two iterations,
             # place the collection before the timer to minimize its impact on latency measurements for iterations ≥ 3.
-            if args.log_throughput:
+            rocm_gpu_util = None
+            # Enable throughput calculations if log_throughput or mlflow_upload_performance_metrics is set
+            enable_perf_metrics = args.log_throughput or getattr(
+                args, "mlflow_upload_performance_metrics", False
+            )
+            if enable_perf_metrics:
                 if args.use_rocm_mem_info or (
                     args.use_rocm_mem_info_iters is not None and iteration in args.use_rocm_mem_info_iters
                 ):
                     rocm_total_mem, rocm_used_mem, rocm_free_mem = get_rocm_smi_mem_info(
                         self.module_local_rank
                     )
+                # Collect GPU utilization for performance metrics
+                if getattr(args, "mlflow_upload_performance_metrics", False):
+                    try:
+                        rocm_gpu_util = get_rocm_smi_gpu_util(self.module_local_rank)
+                    except Exception:
+                        rocm_gpu_util = None
 
             elapsed_time = timers("interval-time").elapsed(barrier=True)
             elapsed_time_per_iteration = elapsed_time / total_iterations
@@ -2043,7 +2052,7 @@ def training_log(
                 elapsed_time_per_iteration * 1000.0,
                 statistics.mean(self.recent_iteration_times),
             )
-            if args.log_throughput:
+            if enable_perf_metrics:
                 if (
                     iteration == self.log_avg_skip_iterations + 1
                     or len(self.recent_tflop_throughputs) >= self.log_avg_reset_interval
@@ -2185,6 +2194,63 @@ def training_log(
                         mlflow_writer.log_metric(
                             f"{mem_collector}_mem_usage_percent", mem_usage * 100.0, iteration
                         )
+
+                # Upload performance metrics to MLflow
+                # Groups: Performance (throughput, TPS, iteration time), Memory (peak, usage %), System (GPU util)
+                # NOTE: mlflow_writer only exists on last rank, but all_gather requires all ranks to participate
+                if getattr(args, "mlflow_upload_performance_metrics", False):
+                    # System metrics - GPU utilization per rank
+                    # ALL ranks must participate in all_gather, even if they don't have mlflow_writer
+                    # Use -1 as sentinel for unavailable GPU util
+                    util_value = rocm_gpu_util if rocm_gpu_util is not None else -1.0
+                    util_tensor = torch.tensor([util_value], device="cuda", dtype=torch.float32)
+                    world_size = dist.get_world_size()
+                    gathered_utils = [torch.zeros_like(util_tensor) for _ in range(world_size)]
+                    dist.all_gather(gathered_utils, util_tensor)
+
+                    # Only the last rank (which has mlflow_writer) logs the metrics
+                    if mlflow_writer:
+                        # Performance metrics
+                        mlflow_writer.log_metric("perf/throughput_tflops_per_gpu", throughput, iteration)
+                        mlflow_writer.log_metric(
+                            "perf/tps_tokens_per_sec_per_gpu", token_throughput, iteration
+                        )
+                        mlflow_writer.log_metric(
+                            "perf/iteration_time_ms",
+                            elapsed_time_per_iteration * 1000.0,
+                            iteration,
+                        )
+                        # Memory metrics
+                        mlflow_writer.log_metric(
+                            f"perf/{mem_collector}_peak_mem_gb",
+                            used_mem / 1024 / 1024 / 1024,
+                            iteration,
+                        )
+                        mlflow_writer.log_metric(
+                            f"perf/{mem_collector}_mem_utilization_pct",
+                            mem_usage * 100.0,
+                            iteration,
+                        )
+                        # Log GPU utilization from gathered values
+                        valid_utils = []
+                        for rank, util_val in enumerate(gathered_utils):
+                            util = util_val.item()
+                            if util >= 0:  # Filter out sentinel values (-1)
+                                mlflow_writer.log_metric(
+                                    f"perf/gpu_utilization_pct_rank{rank}",
+                                    util,
+                                    iteration,
+                                )
+                                valid_utils.append(util)
+                        # Also log average GPU utilization (only from valid values)
+                        if valid_utils:
+                            avg_util = sum(valid_utils) / len(valid_utils)
+                            mlflow_writer.log_metric(
+                                "perf/gpu_utilization_pct_avg",
+                                avg_util,
+                                iteration,
+                            )
+
             assert learning_rate is not None
             # Decoupled_learning_rate should be not None only on first and last pipeline stage.
             log_string += " learning rate: {:.6E} |".format(learning_rate)
diff --git a/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py b/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
new file mode 100644
index 000000000..281a2744f
--- /dev/null
+++ b/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
@@ -0,0 +1,621 @@
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc.
+#
+# See LICENSE for license information.
+###############################################################################
+
+"""
+Unit tests for primus.backends.megatron.training.mlflow_artifacts.
+
+Covers:
+- Trace file discovery and filtering
+- Rank extraction from filenames
+- Report generation with mocked TraceLens API
+- Upload logic with mocked mlflow_writer (file vs directory)
+- Error handling and fallback behavior
+- Cleanup logic validation
+"""
+
+import os
+import sys
+import types
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from primus.backends.megatron.training import mlflow_artifacts as mlflow_artifacts_mod
+
+# Use the module reference for patching in tests
+_MODULE = "primus.backends.megatron.training.mlflow_artifacts"
+
+
+@pytest.fixture(autouse=True)
+def suppress_logging():
+    """Suppress log_rank_0 and warning_rank_0 in all tests."""
+    with patch(f"{_MODULE}.log_rank_0"), patch(f"{_MODULE}.warning_rank_0"):
+        yield
+
+
+# -----------------------------------------------------------------------------
+# Trace file discovery
+# -----------------------------------------------------------------------------
+
+
+class TestGetAllTraceFiles:
+    """Test _get_all_trace_files discovery and filtering."""
+
+    def test_returns_empty_for_none_path(self):
+        out = mlflow_artifacts_mod._get_all_trace_files(None)
+        assert out == []
+
+    def test_returns_empty_for_empty_string(self):
+        out = mlflow_artifacts_mod._get_all_trace_files("")
+        assert out == []
+
+    def test_returns_empty_for_missing_directory(self, tmp_path):
+        missing = tmp_path / "does_not_exist"
+        assert not missing.exists()
+        out = mlflow_artifacts_mod._get_all_trace_files(str(missing))
+        assert out == []
+
+    def test_finds_pt_trace_json_in_root(self, tmp_path):
+        (tmp_path / "rank_0_step_1.pt.trace.json").write_text("{}")
+        (tmp_path / "rank_1_step_1.pt.trace.json").write_text("{}")
+        (tmp_path / "other.json").write_text("{}")
+        out = mlflow_artifacts_mod._get_all_trace_files(str(tmp_path))
+        assert len(out) == 2
+        basenames = {os.path.basename(p) for p in out}
+        assert basenames == {"rank_0_step_1.pt.trace.json", "rank_1_step_1.pt.trace.json"}
+
+    def test_finds_pt_trace_json_gz(self, tmp_path):
+        (tmp_path / "rank_0.pt.trace.json.gz").write_text("")
+        out = mlflow_artifacts_mod._get_all_trace_files(str(tmp_path))
+        assert len(out) == 1
+        assert out[0].endswith("rank_0.pt.trace.json.gz")
+
+    def test_finds_traces_recursively_in_subdirs(self, tmp_path):
+        sub = tmp_path / "sub"
+        sub.mkdir()
+        (sub / "rank_2.pt.trace.json").write_text("{}")
+        out = mlflow_artifacts_mod._get_all_trace_files(str(tmp_path))
+        assert len(out) == 1
+        assert "rank_2" in out[0]
+
+    def test_deduplicates_results(self, tmp_path):
+        (tmp_path / "a.pt.trace.json").write_text("{}")
+        out = mlflow_artifacts_mod._get_all_trace_files(str(tmp_path))
+        assert len(out) == 1
+
+
+# -----------------------------------------------------------------------------
+# Rank extraction from filenames
+# -----------------------------------------------------------------------------
+
+
+class TestExtractRankFromFilename:
+    """Test _extract_rank_from_filename patterns."""
+
+    def test_rank_underscore_number_underscore(self):
+        assert mlflow_artifacts_mod._extract_rank_from_filename("rank_0_step_2.json.gz") == 0
+        assert mlflow_artifacts_mod._extract_rank_from_filename("rank_15_step_1.pt.trace.json") == 15
+
+    def test_rank_bracket_number_bracket(self):
+        assert (
+            mlflow_artifacts_mod._extract_rank_from_filename("primus-megatron-exp-rank[0].pt.trace.json") == 0
+        )
+        assert mlflow_artifacts_mod._extract_rank_from_filename("prefix-rank[7].json") == 7
+
+    def test_dash_rank_number_dot(self):
+        assert mlflow_artifacts_mod._extract_rank_from_filename("trace-rank1.json") == 1
+
+    def test_underscore_rank_number_dot(self):
+        assert mlflow_artifacts_mod._extract_rank_from_filename("trace_rank2.json") == 2
+
+    def test_returns_none_for_unknown_pattern(self):
+        assert mlflow_artifacts_mod._extract_rank_from_filename("random_file.json") is None
+        assert mlflow_artifacts_mod._extract_rank_from_filename("trace.json.gz") is None
+
+
+# -----------------------------------------------------------------------------
+# Filter traces by rank
+# -----------------------------------------------------------------------------
+
+
+class TestFilterTracesByRank:
+    """Test _filter_traces_by_rank."""
+
+    def test_returns_all_when_ranks_none(self, tmp_path):
+        paths = [
+            str(tmp_path / "rank_0.pt.trace.json"),
+            str(tmp_path / "rank_1.pt.trace.json"),
+        ]
+        out = mlflow_artifacts_mod._filter_traces_by_rank(paths, None)
+        assert out == paths
+
+    def test_returns_all_when_ranks_empty_list(self, tmp_path):
+        paths = [str(tmp_path / "rank_0.pt.trace.json")]
+        out = mlflow_artifacts_mod._filter_traces_by_rank(paths, [])
+        assert out == paths
+
+    def test_filters_to_specified_ranks(self, tmp_path):
+        paths = [
+            str(tmp_path / "rank_0_step_1.pt.trace.json"),
+            str(tmp_path / "rank_1_step_1.pt.trace.json"),
+            str(tmp_path / "rank_2_step_1.pt.trace.json"),
+        ]
+        out = mlflow_artifacts_mod._filter_traces_by_rank(paths, [0, 2])
+        assert len(out) == 2
+        assert "rank_0" in out[0]
+        assert "rank_2" in out[1]
+
+
+# -----------------------------------------------------------------------------
+# Report generation with mocked TraceLens
+# -----------------------------------------------------------------------------
+
+
+class TestGenerateTracelensReport:
+    """Test generate_tracelens_report with mocked TraceLens."""
+
+    def _install_fake_tracelens(self, mock_generate, xlsx_path=None, csv_dir=None):
+        """Put fake TraceLens.Reporting into sys.modules so generate_tracelens_report can import it."""
+        reporting = types.ModuleType("TraceLens.Reporting")
+        reporting.generate_perf_report_pytorch = mock_generate
+        tracelens = types.ModuleType("TraceLens")
+        tracelens.Reporting = reporting
+        sys.modules["TraceLens"] = tracelens
+        sys.modules["TraceLens.Reporting"] = reporting
+
+        def _side_effect(trace_file, output_xlsx_path=None, output_csvs_dir=None):
+            if output_xlsx_path and xlsx_path is not False:
+                Path(output_xlsx_path).parent.mkdir(parents=True, exist_ok=True)
+                Path(output_xlsx_path).write_text("xlsx")
+                return [{"tab1"}, {"tab2"}]
+            if output_csvs_dir and csv_dir is not False:
+                Path(output_csvs_dir).mkdir(parents=True, exist_ok=True)
+                (Path(output_csvs_dir) / "kernels.csv").write_text("kernels")
+                (Path(output_csvs_dir) / "memory.csv").write_text("memory")
+            return []
+
+        mock_generate.side_effect = _side_effect
+        return reporting
+
+    def teardown_method(self):
+        for key in list(sys.modules.keys()):
+            if key == "TraceLens" or key.startswith("TraceLens."):
+                del sys.modules[key]
+
+    def test_report_generation_xlsx_with_mocked_tracelens(self, tmp_path):
+        trace_file = tmp_path / "rank_0.pt.trace.json"
+        trace_file.write_text('{"traceEvents": []}')
+        output_dir = tmp_path / "reports"
+        mock_gen = MagicMock()
+
+        self._install_fake_tracelens(mock_gen, xlsx_path=True, csv_dir=None)
+        try:
+            with patch(f"{_MODULE}._ensure_openpyxl_installed"):
+                result = mlflow_artifacts_mod.generate_tracelens_report(
+                    str(trace_file), str(output_dir), output_format="xlsx"
+                )
+            assert len(result) == 1
+            assert result[0].endswith("_analysis.xlsx")
+            assert mock_gen.called
+        finally:
+            self.teardown_method()
+
+    def test_report_generation_missing_trace_file(self, tmp_path):
+        missing = tmp_path / "missing.pt.trace.json"
+        assert not missing.exists()
+        with patch(f"{_MODULE}.warning_rank_0") as warn:
+            result = mlflow_artifacts_mod.generate_tracelens_report(
+                str(missing), str(tmp_path), output_format="xlsx"
+            )
+        assert result == []
+        assert warn.called
+
+
+# -----------------------------------------------------------------------------
+# Fallback CSV when TraceLens fails / not available
+# -----------------------------------------------------------------------------
+
+
+class TestGenerateTraceSummaryCsvFallback:
+    """Test _generate_trace_summary_csv fallback."""
+
+    def test_fallback_csv_from_valid_trace_json(self, tmp_path):
+        trace_file = tmp_path / "rank_0.pt.trace.json"
+        trace_file.write_text(
+            '{"traceEvents": ['
+            '{"name": "kernel1", "cat": "kernel", "dur": 100},'
+            '{"name": "kernel1", "cat": "kernel", "dur": 200}'
+            "]}"
+        )
+        out_dir = tmp_path / "out"
+        out_dir.mkdir()
+        with patch(f"{_MODULE}.log_rank_0"), patch(f"{_MODULE}.warning_rank_0"):
+            path = mlflow_artifacts_mod._generate_trace_summary_csv(
+                str(trace_file), str(out_dir), "summary.csv"
+            )
+        assert path is not None
+        assert path.endswith("summary.csv")
+        assert os.path.exists(path)
+        content = Path(path).read_text()
+        assert "kernel1" in content
+        assert "Count" in content or "Total" in content
+
+    def test_fallback_returns_none_for_missing_file(self, tmp_path):
+        with patch(f"{_MODULE}.warning_rank_0"):
+            path = mlflow_artifacts_mod._generate_trace_summary_csv(
+                str(tmp_path / "missing.json"), str(tmp_path), "out.csv"
+            )
+        assert path is None
+
+    def test_fallback_returns_none_for_empty_events(self, tmp_path):
+        trace_file = tmp_path / "empty.pt.trace.json"
+        trace_file.write_text('{"traceEvents": []}')
+        with patch(f"{_MODULE}.warning_rank_0"):
+            path = mlflow_artifacts_mod._generate_trace_summary_csv(str(trace_file), str(tmp_path), "out.csv")
+        assert path is None
+
+
+# -----------------------------------------------------------------------------
+# Upload logic with mocked mlflow_writer
+# -----------------------------------------------------------------------------
+
+
+class TestUploadTraceFilesToMlflow:
+    """Test upload_trace_files_to_mlflow with mocked writer."""
+
+    def test_returns_zero_when_mlflow_writer_none(self, tmp_path):
+        count = mlflow_artifacts_mod.upload_trace_files_to_mlflow(None, str(tmp_path), artifact_path="traces")
+        assert count == 0
+
+    def test_uploads_found_traces_and_returns_count(self, tmp_path):
+        (tmp_path / "rank_0.pt.trace.json").write_text("{}")
+        (tmp_path / "rank_1.pt.trace.json").write_text("{}")
+        mock_writer = MagicMock()
+        with patch(f"{_MODULE}.log_rank_0"):
+            count = mlflow_artifacts_mod.upload_trace_files_to_mlflow(
+                mock_writer, str(tmp_path), artifact_path="traces"
+            )
+        assert count == 2
+        assert mock_writer.log_artifact.call_count == 2
+
+
+class TestUploadTracelensReportsToMlflow:
+    """Test upload_tracelens_reports_to_mlflow: file vs dir, cleanup, errors."""
+
+    def test_returns_zero_when_mlflow_writer_none(self, tmp_path):
+        with patch.object(
+            mlflow_artifacts_mod,
+            "generate_tracelens_reports",
+            return_value=[],
+        ):
+            count = mlflow_artifacts_mod.upload_tracelens_reports_to_mlflow(
+                None,
+                str(tmp_path),
+                str(tmp_path),
+                ranks=[0],
+                output_format="xlsx",
+            )
+        assert count == 0
+
+    def test_uses_log_artifact_for_files_and_log_artifacts_for_dirs(self, tmp_path):
+        """Expect one log_artifact call for the file report and one log_artifacts call for the dir report."""
+        # One single-file report (XLSX) and one directory report (folder holding a CSV)
+        file_report = tmp_path / "rank_0_analysis.xlsx"
+        file_report.write_text("xlsx")
+        dir_report = tmp_path / "rank_0"
+        dir_report.mkdir()
+        (dir_report / "kernels.csv").write_text("csv")
+        reports = [str(file_report), str(dir_report)]
+        mock_writer = MagicMock()
+        # Stub report generation so the call returns the two prepared report paths
+        with patch.object(mlflow_artifacts_mod, "generate_tracelens_reports", return_value=reports):
+            count = mlflow_artifacts_mod.upload_tracelens_reports_to_mlflow(
+                mock_writer,
+                str(tmp_path),
+                str(tmp_path),
+                ranks=[0],
+                output_format="xlsx",
+                artifact_path="trace_analysis",
+            )
+        assert count == 2
+        mock_writer.log_artifact.assert_called_once()
+        mock_writer.log_artifacts.assert_called_once()
+        # Directory should be logged with subpath preserving name
+        call_kw = mock_writer.log_artifacts.call_args[1]
+        assert "artifact_path" in call_kw
+        # The substring check already covers an exact "rank_0" value; no separate equality test needed
+        assert "rank_0" in call_kw["artifact_path"]
+
+    def test_upload_failure_on_one_report_still_uploads_others(self, tmp_path):
+        file1 = tmp_path / "r0.xlsx"
+        file1.write_text("a")
+        file2 = tmp_path / "r1.xlsx"
+        file2.write_text("b")
+        reports = [str(file1), str(file2)]
+        mock_writer = MagicMock()
+        mock_writer.log_artifact.side_effect = [None, Exception("upload failed")]
+        with patch.object(
+            mlflow_artifacts_mod,
+            "generate_tracelens_reports",
+            return_value=reports,
+        ), patch(f"{_MODULE}.warning_rank_0"):
+            count = mlflow_artifacts_mod.upload_tracelens_reports_to_mlflow(
+                mock_writer,
+                str(tmp_path),
+                str(tmp_path),
+                artifact_path="trace_analysis",
+            )
+        assert count == 1
+        assert mock_writer.log_artifact.call_count == 2
+
+    def test_cleanup_after_upload_calls_rmtree(self, tmp_path):
+        reports_dir = tmp_path / "tracelens_reports"
+        reports_dir.mkdir()
+        file_report = reports_dir / "rank_0_analysis.xlsx"
+        file_report.write_text("xlsx")
+        mock_writer = MagicMock()
+        with patch.object(
+            mlflow_artifacts_mod,
+            "generate_tracelens_reports",
+            return_value=[str(file_report)],
+        ), patch("shutil.rmtree") as mock_rmtree:
+            mlflow_artifacts_mod.upload_tracelens_reports_to_mlflow(
+                mock_writer,
+                str(tmp_path),
+                str(tmp_path),
+                artifact_path="trace_analysis",
+                cleanup_after_upload=True,
+            )
+        mock_rmtree.assert_called_once()
+        assert "tracelens_reports" in str(mock_rmtree.call_args[0][0])
+
+    def test_no_cleanup_when_cleanup_after_upload_false(self, tmp_path):
+        reports_dir = tmp_path / "tracelens_reports"
+        reports_dir.mkdir()
+        (reports_dir / "r0.xlsx").write_text("x")
+        mock_writer = MagicMock()
+        with patch.object(
+            mlflow_artifacts_mod,
+            "generate_tracelens_reports",
+            return_value=[str(reports_dir / "r0.xlsx")],
+        ), patch("shutil.rmtree") as mock_rmtree:
+            mlflow_artifacts_mod.upload_tracelens_reports_to_mlflow(
+                mock_writer,
+                str(tmp_path),
+                str(tmp_path),
+                cleanup_after_upload=False,
+            )
+        mock_rmtree.assert_not_called()
+
+    def test_cleanup_skipped_when_some_uploads_failed(self, tmp_path):
+        reports_dir = tmp_path / "tracelens_reports"
+        reports_dir.mkdir()
+        f1 = reports_dir / "r0.xlsx"
+        f2 = reports_dir / "r1.xlsx"
+        f1.write_text("a")
+        f2.write_text("b")
+        mock_writer = MagicMock()
+        mock_writer.log_artifact.side_effect = [None, Exception("upload failed")]
+        with patch.object(
+            mlflow_artifacts_mod,
+            "generate_tracelens_reports",
+            return_value=[str(f1), str(f2)],
+        ), patch("shutil.rmtree") as mock_rmtree, patch(f"{_MODULE}.warning_rank_0"):
+            mlflow_artifacts_mod.upload_tracelens_reports_to_mlflow(
+                mock_writer,
+                str(tmp_path),
+                str(tmp_path),
+                artifact_path="trace_analysis",
+                cleanup_after_upload=True,
+            )
+        mock_rmtree.assert_not_called()
+
+
+# -----------------------------------------------------------------------------
+# upload_artifacts_to_mlflow (main entry point)
+# -----------------------------------------------------------------------------
+
+
+class TestUploadArtifactsToMlflow:
+    """Test upload_artifacts_to_mlflow: trace/log discovery, artifact paths, TraceLens logic, cleanup."""
+
+    def test_returns_zero_dict_when_mlflow_writer_none(self, tmp_path):
+        result = mlflow_artifacts_mod.upload_artifacts_to_mlflow(
+            None,
+            tensorboard_dir=str(tmp_path),
+            exp_root_path=str(tmp_path),
+        )
+        assert result == {"traces": 0, "logs": 0, "tracelens_reports": 0}
+
+    def test_upload_traces_called_with_correct_artifact_path(self, tmp_path):
+        (tmp_path / "rank_0.pt.trace.json").write_text("{}")
+        mock_writer = MagicMock()
+        with patch.object(
+            mlflow_artifacts_mod,
+            "upload_trace_files_to_mlflow",
+            return_value=2,
+        ) as mock_traces:
+            result = mlflow_artifacts_mod.upload_artifacts_to_mlflow(
+                mock_writer,
+                tensorboard_dir=str(tmp_path),
+                exp_root_path=str(tmp_path),
+                upload_traces=True,
+                upload_logs=False,
+                generate_tracelens_report=False,
+                upload_tracelens_report=False,
+            )
+        assert result["traces"] == 2
+        mock_traces.assert_called_once()
+        call_kw = mock_traces.call_args[1]
+        assert call_kw["artifact_path"] == "traces"
+
+    def test_upload_logs_called_with_correct_artifact_path(self, tmp_path):
+        (tmp_path / "logs" / "master" / "master-0.log").parent.mkdir(parents=True)
+        (tmp_path / "logs" / "master" / "master-0.log").write_text("log")
+        mock_writer = MagicMock()
+        with patch.object(
+            mlflow_artifacts_mod,
+            "upload_log_files_to_mlflow",
+            return_value=1,
+        ) as mock_logs:
+            result = mlflow_artifacts_mod.upload_artifacts_to_mlflow(
+                mock_writer,
+                tensorboard_dir=None,
+                exp_root_path=str(tmp_path),
+                upload_traces=False,
+                upload_logs=True,
+                generate_tracelens_report=False,
+                upload_tracelens_report=False,
+            )
+        assert result["logs"] == 1
+        mock_logs.assert_called_once()
+        call_kw = mock_logs.call_args[1]
+        assert call_kw["artifact_path"] == "logs"
+
+    def test_tracelens_upload_called_with_artifact_path_and_cleanup(self, tmp_path):
+        mock_writer = MagicMock()
+        with patch.object(
+            mlflow_artifacts_mod,
+            "upload_tracelens_reports_to_mlflow",
+            return_value=3,
+        ) as mock_upload_tracelens:
+            result = mlflow_artifacts_mod.upload_artifacts_to_mlflow(
+                mock_writer,
+                tensorboard_dir=str(tmp_path),
+                exp_root_path=str(tmp_path),
+                upload_traces=False,
+                upload_logs=False,
+                generate_tracelens_report=False,
+                upload_tracelens_report=True,
+                tracelens_ranks=[0, 8],
+                tracelens_output_format="xlsx",
+                tracelens_cleanup_after_upload=True,
+            )
+        assert result["tracelens_reports"] == 3
+        mock_upload_tracelens.assert_called_once()
+        call_kw = mock_upload_tracelens.call_args[1]
+        assert call_kw["artifact_path"] == "trace_analysis"
+        assert call_kw["cleanup_after_upload"] is True
+        assert call_kw["ranks"] == [0, 8]
+        assert call_kw["output_format"] == "xlsx"
+
+    def test_tracelens_generate_locally_only_when_generate_true_upload_false(self, tmp_path):
+        mock_writer = MagicMock()
+        with patch.object(
+            mlflow_artifacts_mod,
+            "generate_tracelens_reports_locally",
+            return_value=5,
+        ) as mock_local:
+            with patch.object(
+                mlflow_artifacts_mod,
+                "upload_tracelens_reports_to_mlflow",
+            ) as mock_upload:
+                result = mlflow_artifacts_mod.upload_artifacts_to_mlflow(
+                    mock_writer,
+                    tensorboard_dir=str(tmp_path),
+                    exp_root_path=str(tmp_path),
+                    upload_traces=False,
+                    upload_logs=False,
+                    generate_tracelens_report=True,
+                    upload_tracelens_report=False,
+                )
+        assert result["tracelens_reports"] == 0
+        mock_local.assert_called_once()
+        mock_upload.assert_not_called()
+
+    def test_no_tracelens_calls_when_both_generate_and_upload_false(self, tmp_path):
+        mock_writer = MagicMock()
+        with patch.object(
+            mlflow_artifacts_mod,
+            "generate_tracelens_reports_locally",
+        ) as mock_local:
+            with patch.object(
+                mlflow_artifacts_mod,
+                "upload_tracelens_reports_to_mlflow",
+            ) as mock_upload:
+                result = mlflow_artifacts_mod.upload_artifacts_to_mlflow(
+                    mock_writer,
+                    tensorboard_dir=str(tmp_path),
+                    exp_root_path=str(tmp_path),
+                    upload_traces=False,
+                    upload_logs=False,
+                    generate_tracelens_report=False,
+                    upload_tracelens_report=False,
+                )
+        assert result["tracelens_reports"] == 0
+        mock_local.assert_not_called()
+        mock_upload.assert_not_called()
+
+    def test_trace_and_log_discovery_integration(self, tmp_path):
+        """Trace and log files are discovered and upload helpers called with correct paths."""
+        (tmp_path / "rank_0.pt.trace.json").write_text("{}")
+        (tmp_path / "logs" / "master" / "m.log").parent.mkdir(parents=True)
+        (tmp_path / "logs" / "master" / "m.log").write_text("log")
+        mock_writer = MagicMock()
+        with patch.object(
+            mlflow_artifacts_mod,
+            "upload_trace_files_to_mlflow",
+            return_value=1,
+        ) as mock_traces:
+            with patch.object(
+                mlflow_artifacts_mod,
+                "upload_log_files_to_mlflow",
+                return_value=1,
+            ) as mock_logs:
+                result = mlflow_artifacts_mod.upload_artifacts_to_mlflow(
+                    mock_writer,
+                    tensorboard_dir=str(tmp_path),
+                    exp_root_path=str(tmp_path),
+                    upload_traces=True,
+                    upload_logs=True,
+                    generate_tracelens_report=False,
+                    upload_tracelens_report=False,
+                )
+        assert result["traces"] == 1
+        assert result["logs"] == 1
+        assert result["tracelens_reports"] == 0
+        mock_traces.assert_called_once_with(mock_writer, str(tmp_path), artifact_path="traces")
+        mock_logs.assert_called_once_with(mock_writer, str(tmp_path), artifact_path="logs")
+
+
+# -----------------------------------------------------------------------------
+# Log file discovery
+# -----------------------------------------------------------------------------
+
+
+class TestGetAllLogFiles:
+    """Test _get_all_log_files."""
+
+    def test_returns_empty_for_empty_exp_root(self):
+        out = mlflow_artifacts_mod._get_all_log_files("")
+        assert out == []
+
+    def test_returns_empty_when_logs_dir_missing(self, tmp_path):
+        out = mlflow_artifacts_mod._get_all_log_files(str(tmp_path))
+        assert out == []
+
+    def test_finds_log_files_recursively(self, tmp_path):
+        logs_dir = tmp_path / "logs"
+        logs_dir.mkdir()
+        (logs_dir / "master" / "master-0.log").parent.mkdir(parents=True)
+        (logs_dir / "master" / "master-0.log").write_text("log")
+        (logs_dir / "train" / "rank-0" / "rank-0.log").parent.mkdir(parents=True)
+        (logs_dir / "train" / "rank-0" / "rank-0.log").write_text("log")
+        out = mlflow_artifacts_mod._get_all_log_files(str(tmp_path))
+        assert len(out) == 2
+
+
+# -----------------------------------------------------------------------------
+# Constants
+# -----------------------------------------------------------------------------
+
+
+def test_tracelens_install_ref_constant():
+    """TRACELENS_INSTALL_REF is set for reproducibility."""
+    assert hasattr(mlflow_artifacts_mod, "TRACELENS_INSTALL_REF")
+    assert isinstance(mlflow_artifacts_mod.TRACELENS_INSTALL_REF, str)
+    assert len(mlflow_artifacts_mod.TRACELENS_INSTALL_REF) > 0

From 21106e375c44150284ed3bc2a6f47a26e410e466 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Mon, 9 Feb 2026 16:00:34 +0000
Subject: [PATCH 26/42] Fix Copilot review issues: supply-chain safety,
 UnboundLocalError, rank filtering, and multi-rank races

- Pin TraceLens install ref to immutable commit SHA (was tag v0.4.0)
- Move openpyxl install to generate_tracelens_report, only when xlsx/all
- Fix extension stripping order: most specific suffixes first
- Add rank_N. regex pattern for filenames like rank_0.pt.trace.json.gz
- Add get_rocm_smi_gpu_util to rocm_mem_info.py (fixes ImportError)
- Compute mem_collector/used_mem/mem_usage in MLflow perf block when
  log_timers_to_tensorboard is False (fixes UnboundLocalError)
- Restrict local-only TraceLens generation to rank 0 (avoids races)
- Add unit test for rank_N. filename pattern

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../megatron/training/mlflow_artifacts.py     | 24 ++++++++------
 .../megatron/training/mlflow_setup.py         | 14 +++++++--
 primus/core/utils/rocm_mem_info.py            | 31 +++++++++++++++++++
 primus/modules/trainer/megatron/trainer.py    | 15 +++++++++
 .../megatron/test_mlflow_artifacts.py         |  6 ++++
 5 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 2599e86e4..16ddeb896 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -41,8 +41,9 @@
 
 from primus.modules.module_utils import log_rank_0, warning_rank_0
 
-# Pinned ref for runtime TraceLens install (supply-chain safety and reproducibility)
-TRACELENS_INSTALL_REF = "v0.4.0"
+# Pinned to immutable commit SHA for supply-chain safety (tags can be moved).
+# This corresponds to tag v0.4.0 in AMD-AGI/TraceLens.
+TRACELENS_INSTALL_REF = "0cba6840e20bf3bda74f26bed27a3497017101e6"
 
 
 def _get_all_trace_files(tensorboard_dir: str) -> list:
@@ -287,9 +288,6 @@ def _ensure_tracelens_installed() -> bool:
             warning_rank_0(f"[TraceLens] Failed to install TraceLens: {e}")
             return False
 
-    # Ensure openpyxl is installed for XLSX generation
-    _ensure_openpyxl_installed()
-
     return True
 
 
@@ -298,8 +296,9 @@ def _extract_rank_from_filename(filename: str) -> Optional[int]:
     Extract rank number from trace filename.
 
     Expected patterns:
-    - rank_0_step_2.json.gz
-    - primus-megatron-exp-rank[0].*.json
+    - rank_0_step_2.json.gz, rank_15_step_1.pt.trace.json (rank_N_)
+    - rank_0.pt.trace.json, rank_0.pt.trace.json.gz (rank_N. with dot after rank)
+    - primus-megatron-exp-rank[0].*.json (rank[N], -rankN., _rankN.)
 
     Args:
         filename: The trace filename
@@ -309,9 +308,10 @@ def _extract_rank_from_filename(filename: str) -> Optional[int]:
     """
     import re
 
-    # Try pattern: rank_N_ or rank[N]
+    # Try pattern: rank_N_, rank_N. (dot), rank[N], -rankN., _rankN.
     patterns = [
         r"rank_(\d+)_",
+        r"rank_(\d+)\.",  # e.g. rank_0.pt.trace.json.gz
         r"rank\[(\d+)\]",
         r"-rank(\d+)\.",
         r"_rank(\d+)\.",
@@ -374,13 +374,17 @@ def generate_tracelens_report(
         warning_rank_0(f"[TraceLens] Trace file not found: {trace_file}")
         return []
 
+    # Only ensure openpyxl when XLSX output is requested (avoids pip install in CSV-only or restricted envs)
+    if output_format in ("xlsx", "all"):
+        _ensure_openpyxl_installed()
+
     os.makedirs(output_dir, exist_ok=True)
 
     # Generate base name from trace filename if not provided
     if report_name is None:
         base_name = os.path.basename(trace_file)
-        # Remove extensions like .json.gz
-        for trace_ext in [".json.gz", ".json", ".pt.trace.json.gz", ".pt.trace.json"]:
+        # Remove extensions like .json.gz (check most specific first so e.g. rank_0.pt.trace.json.gz -> rank_0)
+        for trace_ext in [".pt.trace.json.gz", ".pt.trace.json", ".json.gz", ".json"]:
             if base_name.endswith(trace_ext):
                 base_name = base_name[: -len(trace_ext)]
                 break
diff --git a/primus/backends/megatron/training/mlflow_setup.py b/primus/backends/megatron/training/mlflow_setup.py
index c615b2365..3348482c8 100644
--- a/primus/backends/megatron/training/mlflow_setup.py
+++ b/primus/backends/megatron/training/mlflow_setup.py
@@ -13,7 +13,9 @@
 
 from typing import List, Optional
 
-from .global_vars import get_mlflow_writer
+import torch.distributed as dist
+
+from .global_vars import get_mlflow_writer, get_primus_args
 from .mlflow_artifacts import (
     generate_tracelens_reports_locally,
     upload_artifacts_to_mlflow,
@@ -67,8 +69,14 @@ def upload_mlflow_artifacts(
     """
     mlflow_writer = get_mlflow_writer()
     if mlflow_writer is None:
-        # Local-only TraceLens generation: run even when MLflow is disabled
-        if generate_tracelens_report and tensorboard_dir and exp_root_path:
+        # Local-only TraceLens generation: run on a single rank only to avoid duplicate
+        # work and races writing exp_root_path/tracelens_reports (rank 0 when multi-rank).
+        try:
+            args = get_primus_args()
+            is_single_rank = args.rank == 0
+        except Exception:
+            is_single_rank = not dist.is_initialized() or dist.get_rank() == 0
+        if is_single_rank and generate_tracelens_report and tensorboard_dir and exp_root_path:
             generate_tracelens_reports_locally(
                 tensorboard_dir=tensorboard_dir,
                 exp_root_path=exp_root_path,
diff --git a/primus/core/utils/rocm_mem_info.py b/primus/core/utils/rocm_mem_info.py
index 0e5f94dc9..a6b5f6a8c 100644
--- a/primus/core/utils/rocm_mem_info.py
+++ b/primus/core/utils/rocm_mem_info.py
@@ -4,9 +4,40 @@
 # See LICENSE for license information.
 ###############################################################################
 
+import re
 import subprocess
 
 
+def get_rocm_smi_gpu_util(device_id: int):
+    """
+    Return current GPU utilization (0-100) for the given device via rocm-smi --showuse.
+
+    Returns:
+        float: GPU use percentage (0-100), or raises on failure (caller should catch and use fallback).
+    """
+    try:
+        out = subprocess.check_output(
+            ["rocm-smi", "--showuse", f"-d={device_id}"],
+            text=True,
+            stderr=subprocess.DEVNULL,
+        )
+    except FileNotFoundError:
+        raise RuntimeError("rocm-smi not found, please ensure ROCm is installed and in PATH")
+
+    # Parse output: look for GPU use (%) or similar (e.g. "GPU use (%): 42" or "GPU Use: 42%")
+    for line in out.splitlines():
+        line_lower = line.lower()
+        if "use" not in line_lower and "busy" not in line_lower:
+            continue
+        # Extract a number in 0-100 range (integer or float)
+        numbers = re.findall(r"\b(\d+(?:\.\d+)?)\s*%?\b", line)
+        for n in numbers:
+            val = float(n)
+            if 0 <= val <= 100:
+                return val
+    raise RuntimeError(f"rocm-smi --showuse did not report a GPU use percentage for device {device_id}")
+
+
 def get_rocm_smi_mem_info(device_id: int):
     try:
         out = subprocess.check_output(["rocm-smi", "--showmeminfo", "vram", f"-d={device_id}"], text=True)
diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index 213d89271..59ea32d4e 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -2199,6 +2199,21 @@ def training_log(
                 # Groups: Performance (throughput, TPS, iteration time), Memory (peak, usage %), System (GPU util)
                 # NOTE: mlflow_writer only exists on last rank, but all_gather requires all ranks to participate
                 if getattr(args, "mlflow_upload_performance_metrics", False):
+                    # Ensure memory metrics are available when log_timers_to_tensorboard is False
+                    # (mem_collector, used_mem, mem_usage are otherwise only set inside log_timers_to_tensorboard)
+                    if not args.log_timers_to_tensorboard:
+                        if args.use_rocm_mem_info or (
+                            args.use_rocm_mem_info_iters is not None
+                            and iteration in args.use_rocm_mem_info_iters
+                        ):
+                            mem_collector = "rocm"
+                            used_mem = rocm_used_mem
+                            mem_usage = rocm_mem_usage
+                        else:
+                            hip_free_mem, hip_total_mem = torch.cuda.mem_get_info()
+                            used_mem = hip_total_mem - hip_free_mem
+                            mem_usage = used_mem / hip_total_mem
+                            mem_collector = "hip"
                     # System metrics - GPU utilization per rank
                     # ALL ranks must participate in all_gather, even if they don't have mlflow_writer
                     # Use -1 as sentinel for unavailable GPU util
diff --git a/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py b/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
index 281a2744f..bff0717af 100644
--- a/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
+++ b/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
@@ -100,6 +100,12 @@ def test_rank_underscore_number_underscore(self):
         assert mlflow_artifacts_mod._extract_rank_from_filename("rank_0_step_2.json.gz") == 0
         assert mlflow_artifacts_mod._extract_rank_from_filename("rank_15_step_1.pt.trace.json") == 15
 
+    def test_rank_underscore_number_dot(self):
+        """Match rank_N. (dot after rank), e.g. rank_0.pt.trace.json.gz used by PyTorch profiler."""
+        assert mlflow_artifacts_mod._extract_rank_from_filename("rank_0.pt.trace.json") == 0
+        assert mlflow_artifacts_mod._extract_rank_from_filename("rank_0.pt.trace.json.gz") == 0
+        assert mlflow_artifacts_mod._extract_rank_from_filename("rank_8.pt.trace.json") == 8
+
     def test_rank_bracket_number_bracket(self):
         assert (
             mlflow_artifacts_mod._extract_rank_from_filename("primus-megatron-exp-rank[0].pt.trace.json") == 0

From 63fb4aa8dd3243a54e789e36088d56c8e715a3b0 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Tue, 10 Feb 2026 10:36:02 +0000
Subject: [PATCH 27/42] Harden TraceLens rank handling and docs

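Normalize and validate TraceLens rank filters, warn on invalid or
out-of-range values, and document where XLSX/CSV outputs land for the
"csv" and "all" output formats. An empty rank list no longer disables
filtering; report generation and upload are skipped instead.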

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../megatron/training/mlflow_artifacts.py     | 122 +++++++++++++-----
 1 file changed, 87 insertions(+), 35 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 16ddeb896..88fb1d5d3 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -325,6 +325,66 @@ def _extract_rank_from_filename(filename: str) -> Optional[int]:
     return None
 
 
+def _normalize_tracelens_ranks(ranks: Optional[List[int]]) -> Optional[List[int]]:
+    """Normalize and validate TraceLens rank filters."""
+    if ranks is None:
+        return None
+
+    if isinstance(ranks, str):
+        import ast
+
+        try:
+            ranks = ast.literal_eval(ranks)
+        except (ValueError, SyntaxError) as e:
+            warning_rank_0(f"[TraceLens] Failed to parse ranks '{ranks}': {e}. Disabling rank filter.")
+            return None
+
+    if not isinstance(ranks, list):
+        warning_rank_0(
+            f"[TraceLens] Ranks evaluated to {type(ranks).__name__}, expected list. Disabling rank filter."
+        )
+        return None
+
+    normalized = []
+    invalid = []
+    for rank in ranks:
+        if isinstance(rank, bool):
+            invalid.append(rank)
+            continue
+        try:
+            rank_int = int(rank)
+        except (TypeError, ValueError):
+            invalid.append(rank)
+            continue
+        if rank_int < 0:
+            invalid.append(rank)
+            continue
+        normalized.append(rank_int)
+
+    if invalid:
+        warning_rank_0("[TraceLens] Ignoring invalid ranks: " + ", ".join(str(rank) for rank in invalid))
+
+    if not normalized:
+        warning_rank_0("[TraceLens] No valid ranks provided after validation.")
+        return []
+
+    try:
+        world_size = int(os.environ.get("WORLD_SIZE", os.environ.get("SLURM_NTASKS", "0")))
+    except ValueError:
+        world_size = 0
+
+    if world_size > 0:
+        out_of_range = [rank for rank in normalized if rank >= world_size]
+        if out_of_range:
+            warning_rank_0(f"[TraceLens] Ignoring ranks outside world_size={world_size}: {out_of_range}")
+            normalized = [rank for rank in normalized if rank < world_size]
+            if not normalized:
+                warning_rank_0("[TraceLens] No valid ranks remain after world_size filtering.")
+                return []
+
+    return sorted(set(normalized))
+
+
 def _filter_traces_by_rank(trace_files: List[str], ranks: List[int]) -> List[str]:
     """
     Filter trace files to only include specified ranks.
@@ -336,7 +396,7 @@ def _filter_traces_by_rank(trace_files: List[str], ranks: List[int]) -> List[str
     Returns:
         Filtered list of trace files
     """
-    if not ranks:
+    if ranks is None:
         return trace_files
 
     filtered = []
@@ -364,7 +424,10 @@ def generate_tracelens_report(
         output_format: Output format:
                       - "xlsx" (default): Single multi-tab Excel; one trace parse, fastest.
                       - "csv": Multiple CSV files (kernels, memory, communication, etc.)
+                               saved under {output_dir}/{report_name}/*.csv.
                       - "all": Both XLSX and CSV; trace is parsed twice (~2x processing time).
+                               XLSX: {output_dir}/{report_name}_analysis.xlsx
+                               CSVs: {output_dir}/{report_name}/*.csv
                       Prefer "xlsx" or "csv" to avoid this overhead unless both are needed.
 
     Returns:
@@ -601,7 +664,10 @@ def generate_tracelens_reports(
         output_format: Output format:
                       - "xlsx" (default): Multi-tab Excel; single parse, fastest
                       - "csv": Multiple CSV files per rank (kernels, memory, comm, etc.)
-                      - "all": Both XLSX and CSV; trace parsed twice (~2x processing time)
+                               saved under {output_dir}/{report_name}/*.csv.
+                      - "all": Both XLSX and CSV; trace parsed twice (~2x processing time).
+                               XLSX: {output_dir}/{report_name}_analysis.xlsx
+                               CSVs: {output_dir}/{report_name}/*.csv
 
     Returns:
         List of paths to all generated report files
@@ -609,21 +675,11 @@ def generate_tracelens_reports(
     # Try to install tracelens, but continue with fallback if not available
     _ensure_tracelens_installed()
 
-    # Normalize ranks: config/CLI can pass mlflow_tracelens_ranks as a string (e.g. env override
-    # or serialized list), but we need a list or None for filtering.
-    if ranks is not None and isinstance(ranks, str):
-        import ast
-
-        try:
-            ranks = ast.literal_eval(ranks)
-            if not isinstance(ranks, list):
-                log_rank_0(
-                    f"[TraceLens] Warning: ranks evaluated to {type(ranks).__name__}, expected list. Using None."
-                )
-                ranks = None
-        except (ValueError, SyntaxError) as e:
-            log_rank_0(f"[TraceLens] Warning: Failed to parse ranks '{ranks}': {e}. Using None.")
-            ranks = None
+    # Normalize and validate ranks (config/CLI can pass as a string)
+    ranks = _normalize_tracelens_ranks(ranks)
+    if ranks == []:
+        warning_rank_0("[TraceLens] No valid ranks after validation; skipping report generation.")
+        return []
 
     trace_files = _get_all_trace_files(tensorboard_dir)
     if not trace_files:
@@ -673,7 +729,9 @@ def generate_tracelens_reports_locally(
         exp_root_path: Root path of the experiment (for saving reports)
         ranks: List of ranks to analyze (None = all ranks, [0] = rank 0 only)
                Specify fewer ranks to limit number of reports
-        output_format: Report format - "xlsx" (default), "csv", or "all" (xlsx+csv, ~2x time)
+        output_format: Report format - "xlsx" (default), "csv", or "all" (xlsx+csv, ~2x time).
+                       For "all": XLSX at {exp_root_path}/tracelens_reports/{report_name}_analysis.xlsx
+                       and CSVs under {exp_root_path}/tracelens_reports/{report_name}/*.csv
 
     Returns:
         Number of reports generated
@@ -736,7 +794,9 @@ def upload_tracelens_reports_to_mlflow(
         exp_root_path: Root path of the experiment (for saving reports)
         ranks: List of ranks to analyze (None = all ranks, [0] = rank 0 only)
                Specify fewer ranks to limit number of reports
-        output_format: Report format - "xlsx" (default), "csv", or "all" (xlsx+csv, ~2x time)
+        output_format: Report format - "xlsx" (default), "csv", or "all" (xlsx+csv, ~2x time).
+                       For "all": XLSX at {exp_root_path}/tracelens_reports/{report_name}_analysis.xlsx
+                       and CSVs under {exp_root_path}/tracelens_reports/{report_name}/*.csv
         artifact_path: MLflow artifact subdirectory for reports
         cleanup_after_upload: If True, removes local reports after upload to save disk space.
                              If False, keeps reports locally for inspection. Default: False.
@@ -752,21 +812,11 @@ def upload_tracelens_reports_to_mlflow(
         log_rank_0("[TraceLens] MLflow writer not available, skipping report upload")
         return 0
 
-    # Normalize ranks: config/CLI can pass mlflow_tracelens_ranks as a string (e.g. env override
-    # or serialized list), but we need a list or None for filtering.
-    if ranks is not None and isinstance(ranks, str):
-        import ast
-
-        try:
-            ranks = ast.literal_eval(ranks)
-            if not isinstance(ranks, list):
-                log_rank_0(
-                    f"[TraceLens] Warning: ranks evaluated to {type(ranks).__name__}, expected list. Using None."
-                )
-                ranks = None
-        except (ValueError, SyntaxError) as e:
-            log_rank_0(f"[TraceLens] Warning: Failed to parse ranks '{ranks}': {e}. Using None.")
-            ranks = None
+    # Normalize and validate ranks (config/CLI can pass as a string)
+    ranks = _normalize_tracelens_ranks(ranks)
+    if ranks == []:
+        warning_rank_0("[TraceLens] No valid ranks after validation; skipping report upload.")
+        return 0
 
     # Create output directory for reports
     reports_dir = os.path.join(exp_root_path, "tracelens_reports")
@@ -889,7 +939,9 @@ def upload_artifacts_to_mlflow(
         tracelens_ranks: List of ranks to generate TraceLens reports for
                         (None = all ranks, [0, 8] = ranks 0 and 8 only)
                         Specify fewer ranks to limit number of reports
-        tracelens_output_format: Report format - "xlsx" (default), "csv", or "all" (xlsx+csv, ~2x time)
+        tracelens_output_format: Report format - "xlsx" (default), "csv", or "all" (xlsx+csv, ~2x time).
+                                For "all": XLSX at {exp_root_path}/tracelens_reports/{report_name}_analysis.xlsx
+                                and CSVs under {exp_root_path}/tracelens_reports/{report_name}/*.csv
         tracelens_cleanup_after_upload: If True, removes local reports after upload to save disk space.
                                        If False, keeps reports locally for inspection (default).
 

From 9fcd41229f56bfebac1dbd4b41e6733f5595b705 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Tue, 10 Feb 2026 14:41:37 +0000
Subject: [PATCH 28/42] Clarify MLflow memory metric name

Rename perf/{mem_collector}_peak_mem_gb to
perf/{mem_collector}_current_mem_gb to reflect instantaneous memory
usage rather than a peak value.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 primus/modules/trainer/megatron/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index 59ea32d4e..acd79d664 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -2237,7 +2237,7 @@ def training_log(
                         )
                         # Memory metrics
                         mlflow_writer.log_metric(
-                            f"perf/{mem_collector}_peak_mem_gb",
+                            f"perf/{mem_collector}_current_mem_gb",
                             used_mem / 1024 / 1024 / 1024,
                             iteration,
                         )

From 2ab4a48bbd67a8b286f462ab1114e510f4e6f3f4 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Tue, 10 Feb 2026 14:44:11 +0000
Subject: [PATCH 29/42] Tighten GPU util parsing and TraceLens log wording

Parse rocm-smi GPU utilization using labeled/percentage values to avoid
misreading device indices, and clarify TraceLens report item logging.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../megatron/training/mlflow_artifacts.py     |  4 +++-
 primus/core/utils/rocm_mem_info.py            | 23 ++++++++++++++++---
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 88fb1d5d3..c98d37ca3 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -708,7 +708,9 @@ def generate_tracelens_reports(
         report_paths = generate_tracelens_report(trace_file, output_dir, output_format=output_format)
         generated_reports.extend(report_paths)
 
-    log_rank_0(f"[TraceLens] Generated {len(generated_reports)} report files from {len(trace_files)} traces")
+    log_rank_0(
+        f"[TraceLens] Generated {len(generated_reports)} report item(s) " f"from {len(trace_files)} traces"
+    )
     return generated_reports
 
 
diff --git a/primus/core/utils/rocm_mem_info.py b/primus/core/utils/rocm_mem_info.py
index a6b5f6a8c..00e9fce63 100644
--- a/primus/core/utils/rocm_mem_info.py
+++ b/primus/core/utils/rocm_mem_info.py
@@ -29,9 +29,26 @@ def get_rocm_smi_gpu_util(device_id: int):
         line_lower = line.lower()
         if "use" not in line_lower and "busy" not in line_lower:
             continue
-        # Extract a number in 0-100 range (integer or float)
-        numbers = re.findall(r"\b(\d+(?:\.\d+)?)\s*%?\b", line)
-        for n in numbers:
+        # Prefer a number that follows a use/busy label.
+        labeled_match = re.search(
+            r"(?:use|busy)[^0-9%]*([0-9]+(?:\.[0-9]+)?)\s*%?",
+            line_lower,
+        )
+        if labeled_match:
+            val = float(labeled_match.group(1))
+            if 0 <= val <= 100:
+                return val
+
+        # Otherwise, take the last percentage on the line to avoid grabbing GPU index.
+        percent_numbers = re.findall(r"(\d+(?:\.\d+)?)\s*%", line)
+        for n in reversed(percent_numbers):
+            val = float(n)
+            if 0 <= val <= 100:
+                return val
+
+        # Fallback: take the last 0-100 number if no percent sign is present.
+        numbers = re.findall(r"\b(\d+(?:\.\d+)?)\b", line)
+        for n in reversed(numbers):
             val = float(n)
             if 0 <= val <= 100:
                 return val

From 535c816f31596b1467f5b2dc58a12cc33935b9be Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Tue, 10 Feb 2026 15:52:51 +0000
Subject: [PATCH 30/42] Defer openpyxl install until TraceLens import

Run the openpyxl check only after the TraceLens import has succeeded
(and still only for xlsx/all output), so the CSV-only fallback paths
used when TraceLens cannot be imported avoid unnecessary runtime
installs.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 primus/backends/megatron/training/mlflow_artifacts.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index c98d37ca3..be6ed2bbd 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -437,10 +437,6 @@ def generate_tracelens_report(
         warning_rank_0(f"[TraceLens] Trace file not found: {trace_file}")
         return []
 
-    # Only ensure openpyxl when XLSX output is requested (avoids pip install in CSV-only or restricted envs)
-    if output_format in ("xlsx", "all"):
-        _ensure_openpyxl_installed()
-
     os.makedirs(output_dir, exist_ok=True)
 
     # Generate base name from trace filename if not provided
@@ -457,6 +453,10 @@ def generate_tracelens_report(
         # Try using TraceLens Python API directly
         from TraceLens.Reporting import generate_perf_report_pytorch
 
+        # Only ensure openpyxl when XLSX output is requested (avoids pip install in CSV-only or restricted envs)
+        if output_format in ("xlsx", "all"):
+            _ensure_openpyxl_installed()
+
         generated_files = []
 
         # For "all" format: TraceLens uses either/or logic - if output_csvs_dir is set,

From 1ed2954fa078e5657028e0a7835663cf8ff90d43 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Wed, 11 Feb 2026 08:45:00 +0000
Subject: [PATCH 31/42] Harden TraceLens install behavior and docs

Add a 300s timeout and pip stderr logging for TraceLens installs, skip
the install attempt when rank validation leaves no valid ranks, and
clarify in the upload_mlflow_artifacts docstring that it is safe to
call on all ranks.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../megatron/training/mlflow_artifacts.py     | 23 +++++++++++++------
 .../megatron/training/mlflow_setup.py         |  5 ++--
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index be6ed2bbd..1be20a51d 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -267,9 +267,9 @@ def _ensure_tracelens_installed() -> bool:
     except ImportError:
         log_rank_0("[TraceLens] TraceLens not found, attempting to install from GitHub...")
         try:
-            # TraceLens is on GitHub, not PyPI; pin to a tag for reproducibility and supply-chain safety
+            # TraceLens is on GitHub, not PyPI; pin to a commit SHA for reproducibility and supply-chain safety
             install_spec = f"git+https://github.com/AMD-AGI/TraceLens.git@{TRACELENS_INSTALL_REF}"
-            subprocess.check_call(
+            subprocess.run(
                 [
                     sys.executable,
                     "-m",
@@ -279,13 +279,22 @@ def _ensure_tracelens_installed() -> bool:
                     "-q",
                 ],
                 stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+                timeout=300,
             )
             log_rank_0(
                 f"[TraceLens] Successfully installed TraceLens from GitHub (ref={TRACELENS_INSTALL_REF})"
             )
+        except subprocess.TimeoutExpired:
+            warning_rank_0("[TraceLens] TraceLens install timed out after 300s. Skipping install.")
+            return False
         except subprocess.CalledProcessError as e:
-            warning_rank_0(f"[TraceLens] Failed to install TraceLens: {e}")
+            stderr_output = e.stderr.strip() if e.stderr else "No stderr output captured."
+            warning_rank_0(
+                f"[TraceLens] Failed to install TraceLens: {e}\n" f"[TraceLens] pip stderr: {stderr_output}"
+            )
             return False
 
     return True
@@ -672,15 +681,15 @@ def generate_tracelens_reports(
     Returns:
         List of paths to all generated report files
     """
-    # Try to install tracelens, but continue with fallback if not available
-    _ensure_tracelens_installed()
-
     # Normalize and validate ranks (config/CLI can pass as a string)
     ranks = _normalize_tracelens_ranks(ranks)
     if ranks == []:
         warning_rank_0("[TraceLens] No valid ranks after validation; skipping report generation.")
         return []
 
+    # Try to install tracelens, but continue with fallback if not available
+    _ensure_tracelens_installed()
+
     trace_files = _get_all_trace_files(tensorboard_dir)
     if not trace_files:
         log_rank_0("[TraceLens] No trace files found for analysis")
diff --git a/primus/backends/megatron/training/mlflow_setup.py b/primus/backends/megatron/training/mlflow_setup.py
index 3348482c8..9663e9af8 100644
--- a/primus/backends/megatron/training/mlflow_setup.py
+++ b/primus/backends/megatron/training/mlflow_setup.py
@@ -37,8 +37,9 @@ def upload_mlflow_artifacts(
     Upload trace files, log files, and TraceLens reports to MLflow as artifacts.
 
     This function should be called at the end of training to upload all
-    artifacts to MLflow. Only the rank that initialized MLflow (last rank)
-    should call this to avoid duplicate uploads.
+    artifacts to MLflow. It is safe to call on all ranks: non-writer ranks
+    will no-op when MLflow is disabled, while the writer rank performs uploads
+    (and local-only TraceLens generation may still occur when configured).
 
     MLflow Artifact Structure:
         artifacts/

From 9d59aa559b3237169b350491fb3fa6bbf6ea6e9a Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Wed, 11 Feb 2026 08:55:25 +0000
Subject: [PATCH 32/42] Address remaining TraceLens and ROCm comment feedback

Normalize TraceLens output formats, improve CSV handling, add auto-install
controls and timeouts, and harden MLflow/ROCm metric handling and docs.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../megatron/training/mlflow_artifacts.py     | 85 +++++++++++++++----
 .../megatron/training/mlflow_setup.py         |  4 +
 .../megatron/primus_megatron_module.yaml      |  7 +-
 primus/core/utils/rocm_mem_info.py            |  3 +
 primus/modules/trainer/megatron/trainer.py    | 31 ++++---
 5 files changed, 97 insertions(+), 33 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 1be20a51d..6b5f7cb32 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -29,7 +29,7 @@
 
 TraceLens Report Formats:
     - xlsx: Multi-tab Excel (default; single parse, fastest)
-    - csv:  Multiple CSV files per rank (kernels, memory, communication, etc.)
+    - csv:  Directory of CSV files per rank (kernels, memory, communication, etc.)
     - all:  Both xlsx and csv (parses trace twice, ~2x processing time; use when both formats needed)
 """
 
@@ -70,7 +70,6 @@ def _get_all_trace_files(tensorboard_dir: str) -> list:
     # Escape directory path to handle special characters like [] in experiment names
     escaped_dir = glob.escape(tensorboard_dir)
     for pattern in patterns:
-        trace_files.extend(glob.glob(os.path.join(escaped_dir, pattern)))
         trace_files.extend(glob.glob(os.path.join(escaped_dir, "**", pattern), recursive=True))
 
     # Remove duplicates while preserving order
@@ -250,7 +249,7 @@ def _ensure_openpyxl_installed() -> bool:
             return False
 
 
-def _ensure_tracelens_installed() -> bool:
+def _ensure_tracelens_installed(auto_install: bool = True) -> bool:
     """
     Ensure TraceLens and its dependencies are installed.
 
@@ -265,6 +264,9 @@ def _ensure_tracelens_installed() -> bool:
 
         log_rank_0("[TraceLens] TraceLens is already installed")
     except ImportError:
+        if not auto_install:
+            warning_rank_0("[TraceLens] TraceLens not installed and auto-install disabled.")
+            return False
         log_rank_0("[TraceLens] TraceLens not found, attempting to install from GitHub...")
         try:
             # TraceLens is on GitHub, not PyPI; pin to a commit SHA for reproducibility and supply-chain safety
@@ -287,6 +289,13 @@ def _ensure_tracelens_installed() -> bool:
             log_rank_0(
                 f"[TraceLens] Successfully installed TraceLens from GitHub (ref={TRACELENS_INSTALL_REF})"
             )
+            try:
+                import TraceLens  # noqa: F401
+            except ImportError:
+                warning_rank_0(
+                    "[TraceLens] TraceLens install completed but import failed. " "A restart may be required."
+                )
+                return False
         except subprocess.TimeoutExpired:
             warning_rank_0("[TraceLens] TraceLens install timed out after 300s. Skipping install.")
             return False
@@ -394,6 +403,23 @@ def _normalize_tracelens_ranks(ranks: Optional[List[int]]) -> Optional[List[int]
     return sorted(set(normalized))
 
 
+def _normalize_tracelens_output_format(output_format: str) -> str:
+    """Normalize and validate TraceLens output format."""
+    if output_format is None:
+        warning_rank_0("[TraceLens] output_format is None; defaulting to 'xlsx'.")
+        return "xlsx"
+
+    normalized = str(output_format).strip().lower()
+    if normalized in ("xlsx", "csv", "all"):
+        return normalized
+
+    warning_rank_0(
+        f"[TraceLens] Invalid output_format '{output_format}'; "
+        "expected 'xlsx', 'csv', or 'all'. Defaulting to 'xlsx'."
+    )
+    return "xlsx"
+
+
 def _filter_traces_by_rank(trace_files: List[str], ranks: List[int]) -> List[str]:
     """
     Filter trace files to only include specified ranks.
@@ -446,6 +472,8 @@ def generate_tracelens_report(
         warning_rank_0(f"[TraceLens] Trace file not found: {trace_file}")
         return []
 
+    output_format = _normalize_tracelens_output_format(output_format)
+
     os.makedirs(output_dir, exist_ok=True)
 
     # Generate base name from trace filename if not provided
@@ -483,31 +511,35 @@ def generate_tracelens_report(
             os.makedirs(csv_subdir, exist_ok=True)
 
             # First call: Generate XLSX only
-            dfs = generate_perf_report_pytorch(trace_file, output_xlsx_path=xlsx_path)
+            dfs_xlsx = generate_perf_report_pytorch(trace_file, output_xlsx_path=xlsx_path)
 
             # Check XLSX output
             if os.path.exists(xlsx_path):
-                num_tabs = len(dfs) if dfs else 0
+                num_tabs = len(dfs_xlsx) if dfs_xlsx else 0
                 log_rank_0(
                     f"[TraceLens] Generated XLSX report with {num_tabs} tabs: {os.path.basename(xlsx_path)}"
                 )
                 generated_files.append(xlsx_path)
 
             # Second call: Generate CSVs only
-            generate_perf_report_pytorch(trace_file, output_csvs_dir=csv_subdir)
+            existing_csv_files = set(glob.glob(os.path.join(glob.escape(csv_subdir), "*.csv")))
+            _ = generate_perf_report_pytorch(trace_file, output_csvs_dir=csv_subdir)
 
             # Check CSV outputs (escape path to handle [] characters in filenames)
             csv_files = glob.glob(os.path.join(glob.escape(csv_subdir), "*.csv"))
-            if csv_files:
-                log_rank_0(f"[TraceLens] Generated {len(csv_files)} CSV files for {report_name}")
+            new_csv_files = [f for f in csv_files if f not in existing_csv_files]
+            if new_csv_files:
+                log_rank_0(f"[TraceLens] Generated {len(new_csv_files)} CSV files for {report_name}")
                 generated_files.append(csv_subdir)  # Upload directory to preserve structure
+            else:
+                warning_rank_0(f"[TraceLens] No new CSV files generated for {report_name}")
 
         elif output_format == "xlsx":
             # XLSX only: Single file with multiple tabs
             xlsx_path = os.path.join(output_dir, f"{report_name}_analysis.xlsx")
-            dfs = generate_perf_report_pytorch(trace_file, output_xlsx_path=xlsx_path)
+            dfs_xlsx = generate_perf_report_pytorch(trace_file, output_xlsx_path=xlsx_path)
             if os.path.exists(xlsx_path):
-                num_tabs = len(dfs) if dfs else 0
+                num_tabs = len(dfs_xlsx) if dfs_xlsx else 0
                 log_rank_0(
                     f"[TraceLens] Generated XLSX report with {num_tabs} tabs: {os.path.basename(xlsx_path)}"
                 )
@@ -517,16 +549,17 @@ def generate_tracelens_report(
             # CSV only: Multiple files in a subdirectory per rank
             csv_subdir = os.path.join(output_dir, report_name)
             os.makedirs(csv_subdir, exist_ok=True)
-            dfs = generate_perf_report_pytorch(trace_file, output_csvs_dir=csv_subdir)
+            existing_csv_files = set(glob.glob(os.path.join(glob.escape(csv_subdir), "*.csv")))
+            _ = generate_perf_report_pytorch(trace_file, output_csvs_dir=csv_subdir)
 
             # Collect all generated CSV files (escape path to handle [] characters in filenames)
             csv_files = glob.glob(os.path.join(glob.escape(csv_subdir), "*.csv"))
-            if csv_files:
-                num_sections = len(dfs) if dfs else 0
-                log_rank_0(
-                    f"[TraceLens] Generated {len(csv_files)} CSV files ({num_sections} sections) for {report_name}"
-                )
+            new_csv_files = [f for f in csv_files if f not in existing_csv_files]
+            if new_csv_files:
+                log_rank_0(f"[TraceLens] Generated {len(new_csv_files)} CSV files for {report_name}")
                 generated_files.append(csv_subdir)  # Upload directory to preserve structure
+            else:
+                warning_rank_0(f"[TraceLens] No new CSV files generated for {report_name}")
 
         if generated_files:
             return generated_files
@@ -606,6 +639,8 @@ def _generate_trace_summary_csv(
                 op_stats[name]["min_us"] = min(op_stats[name]["min_us"], dur)
                 op_stats[name]["max_us"] = max(op_stats[name]["max_us"], dur)
 
+        # Filter out any operations with zero count (defensive; should not normally occur)
+        op_stats = {name: stats for name, stats in op_stats.items() if stats["count"] > 0}
         if not op_stats:
             warning_rank_0(f"[TraceLens] No kernel/op events found in trace: {trace_file}")
             return None
@@ -661,6 +696,7 @@ def generate_tracelens_reports(
     output_dir: str,
     ranks: Optional[List[int]] = None,
     output_format: str = "xlsx",
+    auto_install: bool = True,
 ) -> List[str]:
     """
     Generate TraceLens analysis reports for trace files.
@@ -677,6 +713,7 @@ def generate_tracelens_reports(
                       - "all": Both XLSX and CSV; trace parsed twice (~2x processing time).
                                XLSX: {output_dir}/{report_name}_analysis.xlsx
                                CSVs: {output_dir}/{report_name}/*.csv
+        auto_install: Whether to attempt auto-installing TraceLens if missing
 
     Returns:
         List of paths to all generated report files
@@ -687,8 +724,10 @@ def generate_tracelens_reports(
         warning_rank_0("[TraceLens] No valid ranks after validation; skipping report generation.")
         return []
 
+    output_format = _normalize_tracelens_output_format(output_format)
+
     # Try to install tracelens, but continue with fallback if not available
-    _ensure_tracelens_installed()
+    _ensure_tracelens_installed(auto_install=auto_install)
 
     trace_files = _get_all_trace_files(tensorboard_dir)
     if not trace_files:
@@ -728,6 +767,7 @@ def generate_tracelens_reports_locally(
     exp_root_path: str,
     ranks: Optional[List[int]] = None,
     output_format: str = "xlsx",
+    auto_install: bool = True,
 ) -> int:
     """
     Generate TraceLens analysis reports locally (without MLflow upload).
@@ -743,6 +783,7 @@ def generate_tracelens_reports_locally(
         output_format: Report format - "xlsx" (default), "csv", or "all" (xlsx+csv, ~2x time).
                        For "all": XLSX at {exp_root_path}/tracelens_reports/{report_name}_analysis.xlsx
                        and CSVs under {exp_root_path}/tracelens_reports/{report_name}/*.csv
+        auto_install: Whether to attempt auto-installing TraceLens if missing
 
     Returns:
         Number of reports generated
@@ -771,6 +812,7 @@ def generate_tracelens_reports_locally(
         output_dir=reports_dir,
         ranks=ranks,
         output_format=output_format,
+        auto_install=auto_install,
     )
 
     if not reports:
@@ -789,6 +831,7 @@ def upload_tracelens_reports_to_mlflow(
     output_format: str = "xlsx",
     artifact_path: str = "trace_analysis",
     cleanup_after_upload: bool = False,
+    auto_install: bool = True,
 ) -> int:
     """
     Generate TraceLens reports and upload them to MLflow.
@@ -811,6 +854,7 @@ def upload_tracelens_reports_to_mlflow(
         artifact_path: MLflow artifact subdirectory for reports
         cleanup_after_upload: If True, removes local reports after upload to save disk space.
                              If False, keeps reports locally for inspection. Default: False.
+        auto_install: Whether to attempt auto-installing TraceLens if missing
 
     Returns:
         Number of reports uploaded to MLflow
@@ -829,6 +873,8 @@ def upload_tracelens_reports_to_mlflow(
         warning_rank_0("[TraceLens] No valid ranks after validation; skipping report upload.")
         return 0
 
+    output_format = _normalize_tracelens_output_format(output_format)
+
     # Create output directory for reports
     reports_dir = os.path.join(exp_root_path, "tracelens_reports")
     os.makedirs(reports_dir, exist_ok=True)
@@ -844,6 +890,7 @@ def upload_tracelens_reports_to_mlflow(
         output_dir=reports_dir,
         ranks=ranks,
         output_format=output_format,
+        auto_install=auto_install,
     )
 
     if not reports:
@@ -912,6 +959,7 @@ def upload_artifacts_to_mlflow(
     tracelens_ranks: Optional[List[int]] = None,
     tracelens_output_format: str = "xlsx",
     tracelens_cleanup_after_upload: bool = False,
+    tracelens_auto_install: bool = True,
 ) -> dict:
     """
     Upload all artifacts (trace files, log files, TraceLens reports) to MLflow.
@@ -955,6 +1003,7 @@ def upload_artifacts_to_mlflow(
                                 and CSVs under {exp_root_path}/tracelens_reports/{report_name}/*.csv
         tracelens_cleanup_after_upload: If True, removes local reports after upload to save disk space.
                                        If False, keeps reports locally for inspection (default).
+        tracelens_auto_install: Whether to attempt auto-installing TraceLens if missing
 
     Returns:
         Dictionary with counts of uploaded files:
@@ -1005,6 +1054,7 @@ def upload_artifacts_to_mlflow(
                 output_format=tracelens_output_format,
                 artifact_path="trace_analysis",
                 cleanup_after_upload=tracelens_cleanup_after_upload,
+                auto_install=tracelens_auto_install,
             )
         else:
             # Generate locally only (no MLflow upload)
@@ -1014,6 +1064,7 @@ def upload_artifacts_to_mlflow(
                 exp_root_path=exp_root_path,
                 ranks=tracelens_ranks,
                 output_format=tracelens_output_format,
+                auto_install=tracelens_auto_install,
             )
             # Don't count as "uploaded" since they're local-only
             log_rank_0(f"[TraceLens] Generated {num_generated} report files (not uploaded to MLflow)")
diff --git a/primus/backends/megatron/training/mlflow_setup.py b/primus/backends/megatron/training/mlflow_setup.py
index 9663e9af8..59660d7e9 100644
--- a/primus/backends/megatron/training/mlflow_setup.py
+++ b/primus/backends/megatron/training/mlflow_setup.py
@@ -32,6 +32,7 @@ def upload_mlflow_artifacts(
     tracelens_ranks: Optional[List[int]] = None,
     tracelens_output_format: str = "all",
     tracelens_cleanup_after_upload: bool = False,
+    tracelens_auto_install: bool = True,
 ) -> Optional[dict]:
     """
     Upload trace files, log files, and TraceLens reports to MLflow as artifacts.
@@ -64,6 +65,7 @@ def upload_mlflow_artifacts(
                         Specify fewer ranks to limit number of reports
         tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
         tracelens_cleanup_after_upload: Remove local reports after upload (default: False)
+        tracelens_auto_install: Whether to attempt auto-installing TraceLens if missing
 
     Returns:
         Dictionary with counts of uploaded files, or None if MLflow is not enabled
@@ -83,6 +85,7 @@ def upload_mlflow_artifacts(
                 exp_root_path=exp_root_path,
                 ranks=tracelens_ranks,
                 output_format=tracelens_output_format,
+                auto_install=tracelens_auto_install,
             )
         return None
 
@@ -97,4 +100,5 @@ def upload_mlflow_artifacts(
         tracelens_ranks=tracelens_ranks,
         tracelens_output_format=tracelens_output_format,
         tracelens_cleanup_after_upload=tracelens_cleanup_after_upload,
+        tracelens_auto_install=tracelens_auto_install,
     )
diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index 8ebdd78ae..0070d5fee 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -26,15 +26,16 @@ mlflow_upload_logs: false            # Upload training log files to MLflow
 #
 # To limit number of reports: use mlflow_tracelens_ranks (no separate max_reports option).
 #   Default: [0, 8] = one rank per node (assumes 8 GPUs/node)
-#   Use null for all ranks, or customize list like [0, 1, 8, 9] for more coverage
+#   Use null for all ranks (e.g., null or [0, 1, 2] for explicit list)
 # ----------------------------------------------------------------------------
 generate_tracelens_report: false    # Generate TraceLens analysis reports locally (auto-enabled when upload=true)
 mlflow_upload_tracelens_report: false # Upload TraceLens reports to MLflow (auto-enables generation, profiling, tensorboard)
-mlflow_tracelens_ranks: [0, 8]       # List of ranks to analyze (default: one per node for 2-node setup)
+mlflow_tracelens_ranks: [0, 8]       # List of ranks to analyze (null = all, e.g. [0, 1, 2])
 # TraceLens report format: xlsx (default, single parse, fastest), csv, or all (xlsx+csv;
 # parses each trace twice so ~2x processing time; use only when both formats are needed)
 mlflow_tracelens_output_format: xlsx
 mlflow_tracelens_cleanup_after_upload: false  # Keep local reports (true to cleanup and save disk space)
+mlflow_tracelens_auto_install: true           # Auto-install TraceLens if missing (set false to disable)
 disable_compile_dependencies: true
 # NOTE:
 # - If `use_rocm_mem_info = True`, ROCm memory information will be collected
@@ -53,7 +54,7 @@ use_rocm_mem_info_iters: [1,2]
 #      - perf/iteration_time_ms: Time per training step (ms)
 #
 #   2. Memory Metrics:
-#      - perf/{rocm/hip}_peak_mem_gb: Peak GPU memory usage (GB)
+#      - perf/{rocm/hip}_current_mem_gb: Current GPU memory usage (GB)
 #      - perf/{rocm/hip}_mem_utilization_pct: Memory utilization (% of total)
 #
 #   3. System Metrics:
diff --git a/primus/core/utils/rocm_mem_info.py b/primus/core/utils/rocm_mem_info.py
index 00e9fce63..1ee266a43 100644
--- a/primus/core/utils/rocm_mem_info.py
+++ b/primus/core/utils/rocm_mem_info.py
@@ -20,9 +20,12 @@ def get_rocm_smi_gpu_util(device_id: int):
             ["rocm-smi", "--showuse", f"-d={device_id}"],
             text=True,
             stderr=subprocess.DEVNULL,
+            timeout=10,
         )
     except FileNotFoundError:
         raise RuntimeError("rocm-smi not found, please ensure ROCm is installed and in PATH")
+    except subprocess.TimeoutExpired:
+        raise RuntimeError("rocm-smi --showuse timed out")
 
     # Parse output: look for GPU use (%) or similar (e.g. "GPU use (%): 42" or "GPU Use: 42%")
     for line in out.splitlines():
diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index acd79d664..6f4811518 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -1146,19 +1146,24 @@ def run(self, *args, **kwargs):
         mlflow_writer = get_mlflow_writer()
         # Always call: uploads to MLflow when enabled; when MLflow disabled, still runs
         # local-only TraceLens report generation if generate_tracelens_report=True.
-        upload_mlflow_artifacts(
-            tensorboard_dir=args.tensorboard_dir,
-            exp_root_path=self.exp_root_path,
-            upload_traces=getattr(args, "mlflow_upload_traces", False),
-            upload_logs=getattr(args, "mlflow_upload_logs", False),
-            generate_tracelens_report=getattr(args, "generate_tracelens_report", False),
-            upload_tracelens_report=getattr(args, "mlflow_upload_tracelens_report", False),
-            tracelens_ranks=getattr(args, "mlflow_tracelens_ranks", None),
-            tracelens_output_format=getattr(args, "mlflow_tracelens_output_format", "xlsx"),
-            tracelens_cleanup_after_upload=getattr(args, "mlflow_tracelens_cleanup_after_upload", False),
-        )
-        if mlflow_writer:
-            mlflow_writer.end_run()
+        try:
+            upload_mlflow_artifacts(
+                tensorboard_dir=args.tensorboard_dir,
+                exp_root_path=self.exp_root_path,
+                upload_traces=getattr(args, "mlflow_upload_traces", False),
+                upload_logs=getattr(args, "mlflow_upload_logs", False),
+                generate_tracelens_report=getattr(args, "generate_tracelens_report", False),
+                upload_tracelens_report=getattr(args, "mlflow_upload_tracelens_report", False),
+                tracelens_ranks=getattr(args, "mlflow_tracelens_ranks", None),
+                tracelens_output_format=getattr(args, "mlflow_tracelens_output_format", "xlsx"),
+                tracelens_cleanup_after_upload=getattr(args, "mlflow_tracelens_cleanup_after_upload", False),
+                tracelens_auto_install=getattr(args, "mlflow_tracelens_auto_install", True),
+            )
+        except Exception as e:
+            warning_rank_0(f"[MLflow] Artifact upload failed: {e}")
+        finally:
+            if mlflow_writer:
+                mlflow_writer.end_run()
 
         one_logger and one_logger.log_metrics({"app_finish_time": one_logger_utils.get_timestamp_in_ms()})
 

From 85810912eebca5f3d2d8bde26e45d06e2f1d4e4e Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Wed, 11 Feb 2026 09:25:25 +0000
Subject: [PATCH 33/42] Clarify TraceLens defaults and openpyxl fallback

Avoid duplicate local generation in distributed runs, align default
output_format with xlsx, and downgrade to CSV when openpyxl is unavailable.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../backends/megatron/training/mlflow_artifacts.py   |  4 +++-
 primus/backends/megatron/training/mlflow_setup.py    | 12 ++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 6b5f7cb32..d0671d386 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -492,7 +492,9 @@ def generate_tracelens_report(
 
         # Only ensure openpyxl when XLSX output is requested (avoids pip install in CSV-only or restricted envs)
         if output_format in ("xlsx", "all"):
-            _ensure_openpyxl_installed()
+            if not _ensure_openpyxl_installed():
+                warning_rank_0("[TraceLens] openpyxl unavailable; downgrading output_format to 'csv'.")
+                output_format = "csv"
 
         generated_files = []
 
diff --git a/primus/backends/megatron/training/mlflow_setup.py b/primus/backends/megatron/training/mlflow_setup.py
index 59660d7e9..ec3aa155f 100644
--- a/primus/backends/megatron/training/mlflow_setup.py
+++ b/primus/backends/megatron/training/mlflow_setup.py
@@ -30,7 +30,7 @@ def upload_mlflow_artifacts(
     generate_tracelens_report: bool = False,
     upload_tracelens_report: bool = False,
     tracelens_ranks: Optional[List[int]] = None,
-    tracelens_output_format: str = "all",
+    tracelens_output_format: str = "xlsx",
     tracelens_cleanup_after_upload: bool = False,
     tracelens_auto_install: bool = True,
 ) -> Optional[dict]:
@@ -74,12 +74,20 @@ def upload_mlflow_artifacts(
     if mlflow_writer is None:
         # Local-only TraceLens generation: run on a single rank only to avoid duplicate
         # work and races writing exp_root_path/tracelens_reports (rank 0 when multi-rank).
+        # If MLflow is enabled in a distributed run, the writer rank will handle generation,
+        # so skip local generation on non-writer ranks.
         try:
             args = get_primus_args()
             is_single_rank = args.rank == 0
+            mlflow_expected = getattr(args, "mlflow_run_name", None) is not None
+            is_distributed = args.world_size > 1
         except Exception:
             is_single_rank = not dist.is_initialized() or dist.get_rank() == 0
-        if is_single_rank and generate_tracelens_report and tensorboard_dir and exp_root_path:
+            mlflow_expected = False
+            is_distributed = dist.is_initialized()
+
+        should_generate_locally = is_single_rank and (not mlflow_expected or not is_distributed)
+        if should_generate_locally and generate_tracelens_report and tensorboard_dir and exp_root_path:
             generate_tracelens_reports_locally(
                 tensorboard_dir=tensorboard_dir,
                 exp_root_path=exp_root_path,

From 619896083458f0482a0f7170d93ff4171ba9863a Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Wed, 11 Feb 2026 09:28:21 +0000
Subject: [PATCH 34/42] Clarify MLflow writer rank in TraceLens upload

Update the upload_trace_files_to_mlflow docstring to state that, in
distributed runs, only the last rank (where the MLflow writer is
initialized) should upload profiler trace files.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 primus/backends/megatron/training/mlflow_artifacts.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index d0671d386..28655dddf 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -120,8 +120,8 @@ def upload_trace_files_to_mlflow(
     Upload all profiler trace files to MLflow as artifacts.
 
     This function collects trace files from the tensorboard directory and
-    uploads them to MLflow. In distributed settings, only rank 0 (or the
-    last rank where MLflow writer is initialized) should call this.
+    uploads them to MLflow. In distributed settings, only the last rank
+    (where the MLflow writer is initialized) should call this.
 
     Args:
         mlflow_writer: The MLflow module instance (from get_mlflow_writer())

From c05c7435cf0f8a131a9c3f2fa09855268734e9d1 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Wed, 11 Feb 2026 09:45:09 +0000
Subject: [PATCH 35/42] Fix TraceLens output_format docstring default

Align mlflow_setup.py docstring with the actual default of 'xlsx'.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 primus/backends/megatron/training/mlflow_setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/primus/backends/megatron/training/mlflow_setup.py b/primus/backends/megatron/training/mlflow_setup.py
index ec3aa155f..4ffbb7d0c 100644
--- a/primus/backends/megatron/training/mlflow_setup.py
+++ b/primus/backends/megatron/training/mlflow_setup.py
@@ -63,7 +63,7 @@ def upload_mlflow_artifacts(
         tracelens_ranks: List of ranks to analyze with TraceLens
                         (None = all, [0, 8] = ranks 0 and 8 only)
                         Specify fewer ranks to limit number of reports
-        tracelens_output_format: Report format - "all" (default, xlsx+csv), "xlsx", or "csv"
+        tracelens_output_format: Report format - "xlsx" (default), "csv", or "all"
         tracelens_cleanup_after_upload: Remove local reports after upload (default: False)
         tracelens_auto_install: Whether to attempt auto-installing TraceLens if missing
 

From f86556a10a6cbe48376ec5f8bdb0fd156370bbbc Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Wed, 11 Feb 2026 09:52:13 +0000
Subject: [PATCH 36/42] Improve TraceLens install diagnostics and metrics
 safety.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../megatron/training/mlflow_artifacts.py     | 25 ++++++++++++++-----
 .../megatron/training/mlflow_setup.py         |  9 +++----
 .../megatron/primus_megatron_module.yaml      | 10 +++++---
 primus/core/utils/rocm_mem_info.py            |  6 -----
 primus/modules/trainer/megatron/trainer.py    |  9 +++----
 .../megatron/test_mlflow_artifacts.py         |  4 +--
 6 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 28655dddf..e332cdaee 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -237,15 +237,23 @@ def _ensure_openpyxl_installed() -> bool:
     except ImportError:
         log_rank_0("[TraceLens] openpyxl not found, installing for XLSX support...")
         try:
-            subprocess.check_call(
+            result = subprocess.run(
                 [sys.executable, "-m", "pip", "install", "openpyxl", "-q"],
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
             )
             log_rank_0("[TraceLens] Successfully installed openpyxl")
             return True
         except subprocess.CalledProcessError as e:
-            warning_rank_0(f"[TraceLens] Failed to install openpyxl: {e}")
+            stdout_output = e.stdout.strip() if e.stdout else "No stdout output captured."
+            stderr_output = e.stderr.strip() if e.stderr else "No stderr output captured."
+            warning_rank_0(
+                f"[TraceLens] Failed to install openpyxl: {e}\n"
+                f"[TraceLens] pip stdout: {stdout_output}\n"
+                f"[TraceLens] pip stderr: {stderr_output}"
+            )
             return False
 
 
@@ -280,7 +288,7 @@ def _ensure_tracelens_installed(auto_install: bool = True) -> bool:
                     install_spec,
                     "-q",
                 ],
-                stdout=subprocess.DEVNULL,
+                stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 text=True,
                 check=True,
@@ -300,9 +308,12 @@ def _ensure_tracelens_installed(auto_install: bool = True) -> bool:
             warning_rank_0("[TraceLens] TraceLens install timed out after 300s. Skipping install.")
             return False
         except subprocess.CalledProcessError as e:
+            stdout_output = e.stdout.strip() if e.stdout else "No stdout output captured."
             stderr_output = e.stderr.strip() if e.stderr else "No stderr output captured."
             warning_rank_0(
-                f"[TraceLens] Failed to install TraceLens: {e}\n" f"[TraceLens] pip stderr: {stderr_output}"
+                f"[TraceLens] Failed to install TraceLens: {e}\n"
+                f"[TraceLens] pip stdout: {stdout_output}\n"
+                f"[TraceLens] pip stderr: {stderr_output}"
             )
             return False
 
@@ -433,6 +444,8 @@ def _filter_traces_by_rank(trace_files: List[str], ranks: List[int]) -> List[str
     """
     if ranks is None:
         return trace_files
+    if not ranks:
+        return []
 
     filtered = []
     for trace_file in trace_files:
diff --git a/primus/backends/megatron/training/mlflow_setup.py b/primus/backends/megatron/training/mlflow_setup.py
index 4ffbb7d0c..3846594d1 100644
--- a/primus/backends/megatron/training/mlflow_setup.py
+++ b/primus/backends/megatron/training/mlflow_setup.py
@@ -1,6 +1,5 @@
 ###############################################################################
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Modification Copyright© 2025 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE for license information.
 ###############################################################################
@@ -78,15 +77,15 @@ def upload_mlflow_artifacts(
         # so skip local generation on non-writer ranks.
         try:
             args = get_primus_args()
-            is_single_rank = args.rank == 0
+            is_rank_zero = args.rank == 0
             mlflow_expected = getattr(args, "mlflow_run_name", None) is not None
             is_distributed = args.world_size > 1
         except Exception:
-            is_single_rank = not dist.is_initialized() or dist.get_rank() == 0
+            is_rank_zero = not dist.is_initialized() or dist.get_rank() == 0
             mlflow_expected = False
             is_distributed = dist.is_initialized()
 
-        should_generate_locally = is_single_rank and (not mlflow_expected or not is_distributed)
+        should_generate_locally = is_rank_zero and (not mlflow_expected or not is_distributed)
         if should_generate_locally and generate_tracelens_report and tensorboard_dir and exp_root_path:
             generate_tracelens_reports_locally(
                 tensorboard_dir=tensorboard_dir,
diff --git a/primus/configs/modules/megatron/primus_megatron_module.yaml b/primus/configs/modules/megatron/primus_megatron_module.yaml
index 0070d5fee..7a1877d3e 100644
--- a/primus/configs/modules/megatron/primus_megatron_module.yaml
+++ b/primus/configs/modules/megatron/primus_megatron_module.yaml
@@ -25,13 +25,14 @@ mlflow_upload_logs: false            # Upload training log files to MLflow
 #   generate=true,  upload=true   ->  Generate AND upload (explicit)
 #
 # To limit number of reports: use mlflow_tracelens_ranks (no separate max_reports option).
-#   Default: [0, 8] = one rank per node (assumes 8 GPUs/node)
-#   Use null for all ranks (e.g., null or [0, 1, 2] for explicit list)
+#   Default: null = all ranks
+#   Example (8 GPUs/node): [0, 8] = one rank per node (assumes 8 GPUs/node)
+#   Use an explicit list for specific ranks (e.g., [0, 1, 2])
 # ----------------------------------------------------------------------------
 generate_tracelens_report: false    # Generate TraceLens analysis reports locally (auto-enabled when upload=true)
 mlflow_upload_tracelens_report: false # Upload TraceLens reports to MLflow (auto-enables generation, profiling, tensorboard)
-mlflow_tracelens_ranks: [0, 8]       # List of ranks to analyze (null = all, e.g. [0, 1, 2])
-# TraceLens report format: xlsx (default, single parse, fastest), csv, or all (xlsx+csv;
+mlflow_tracelens_ranks: null         # List of ranks to analyze (null = all, e.g. [0, 1, 2])
+# TraceLens report format: xlsx (default; single parse, fastest), csv, or all (xlsx+csv;
 # parses each trace twice so ~2x processing time; use only when both formats are needed)
 mlflow_tracelens_output_format: xlsx
 mlflow_tracelens_cleanup_after_upload: false  # Keep local reports (true to cleanup and save disk space)
@@ -62,6 +63,7 @@ use_rocm_mem_info_iters: [1,2]
 #      - perf/gpu_utilization_pct_avg: Average GPU utilization across all ranks (%)
 #
 # Note: This flag implicitly enables log_throughput behavior for metric collection.
+# Note: GPU utilization collection uses all_gather on every log_interval (sync across ranks).
 mlflow_upload_performance_metrics: false
 
 # profiling
diff --git a/primus/core/utils/rocm_mem_info.py b/primus/core/utils/rocm_mem_info.py
index 1ee266a43..0127d95a2 100644
--- a/primus/core/utils/rocm_mem_info.py
+++ b/primus/core/utils/rocm_mem_info.py
@@ -49,12 +49,6 @@ def get_rocm_smi_gpu_util(device_id: int):
             if 0 <= val <= 100:
                 return val
 
-        # Fallback: take the last 0-100 number if no percent sign is present.
-        numbers = re.findall(r"\b(\d+(?:\.\d+)?)\b", line)
-        for n in reversed(numbers):
-            val = float(n)
-            if 0 <= val <= 100:
-                return val
     raise RuntimeError(f"rocm-smi --showuse did not report a GPU use percentage for device {device_id}")
 
 
diff --git a/primus/modules/trainer/megatron/trainer.py b/primus/modules/trainer/megatron/trainer.py
index 6f4811518..7fc72d2d0 100644
--- a/primus/modules/trainer/megatron/trainer.py
+++ b/primus/modules/trainer/megatron/trainer.py
@@ -2207,6 +2207,10 @@ def training_log(
                     # Ensure memory metrics are available when log_timers_to_tensorboard is False
                     # (mem_collector, used_mem, mem_usage are otherwise only set inside log_timers_to_tensorboard)
                     if not args.log_timers_to_tensorboard:
+                        hip_free_mem, hip_total_mem = torch.cuda.mem_get_info()
+                        used_mem = hip_total_mem - hip_free_mem
+                        mem_usage = used_mem / hip_total_mem
+                        mem_collector = "hip"
                         if args.use_rocm_mem_info or (
                             args.use_rocm_mem_info_iters is not None
                             and iteration in args.use_rocm_mem_info_iters
@@ -2214,11 +2218,6 @@ def training_log(
                             mem_collector = "rocm"
                             used_mem = rocm_used_mem
                             mem_usage = rocm_mem_usage
-                        else:
-                            hip_free_mem, hip_total_mem = torch.cuda.mem_get_info()
-                            used_mem = hip_total_mem - hip_free_mem
-                            mem_usage = used_mem / hip_total_mem
-                            mem_collector = "hip"
                     # System metrics - GPU utilization per rank
                     # ALL ranks must participate in all_gather, even if they don't have mlflow_writer
                     # Use -1 as sentinel for unavailable GPU util
diff --git a/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py b/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
index bff0717af..53d96923d 100644
--- a/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
+++ b/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
@@ -139,10 +139,10 @@ def test_returns_all_when_ranks_none(self, tmp_path):
         out = mlflow_artifacts_mod._filter_traces_by_rank(paths, None)
         assert out == paths
 
-    def test_returns_all_when_ranks_empty_list(self, tmp_path):
+    def test_returns_empty_when_ranks_empty_list(self, tmp_path):
         paths = [str(tmp_path / "rank_0.pt.trace.json")]
         out = mlflow_artifacts_mod._filter_traces_by_rank(paths, [])
-        assert out == paths
+        assert out == []
 
     def test_filters_to_specified_ranks(self, tmp_path):
         paths = [

From 889143476b3ae18804e4eabbe230af2bf84fb896 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Wed, 11 Feb 2026 13:18:29 +0000
Subject: [PATCH 37/42] Remove unused openpyxl install result.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 primus/backends/megatron/training/mlflow_artifacts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index e332cdaee..d2f10057c 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -237,7 +237,7 @@ def _ensure_openpyxl_installed() -> bool:
     except ImportError:
         log_rank_0("[TraceLens] openpyxl not found, installing for XLSX support...")
         try:
-            result = subprocess.run(
+            subprocess.run(
                 [sys.executable, "-m", "pip", "install", "openpyxl", "-q"],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,

From 3a1c3c58e10cc403ca7cd2fef15bddefdc1882ac Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Wed, 11 Feb 2026 13:23:00 +0000
Subject: [PATCH 38/42] Harden TraceLens install checks and ROCm parsing.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../megatron/training/mlflow_artifacts.py     | 40 ++++++++++++++++++-
 primus/core/utils/rocm_mem_info.py            |  5 ++-
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index d2f10057c..a137b835a 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -257,6 +257,42 @@ def _ensure_openpyxl_installed() -> bool:
             return False
 
 
+def _verify_tracelens_ref_exists(ref: str) -> bool:
+    """
+    Verify that the TraceLens git reference exists before installing.
+
+    Returns:
+        True if the ref exists or verification is skipped, False otherwise
+    """
+    try:
+        result = subprocess.run(
+            ["git", "ls-remote", "https://github.com/AMD-AGI/TraceLens.git", ref],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            check=True,
+            timeout=10,
+        )
+    except FileNotFoundError:
+        warning_rank_0("[TraceLens] git not found; skipping TraceLens ref verification.")
+        return True
+    except subprocess.TimeoutExpired:
+        warning_rank_0("[TraceLens] TraceLens ref verification timed out.")
+        return False
+    except subprocess.CalledProcessError as e:
+        stderr_output = e.stderr.strip() if e.stderr else "No stderr output captured."
+        warning_rank_0(
+            f"[TraceLens] TraceLens ref verification failed: {e}\n" f"[TraceLens] git stderr: {stderr_output}"
+        )
+        return False
+
+    if not result.stdout.strip():
+        warning_rank_0(f"[TraceLens] TraceLens ref '{ref}' not found; skipping install.")
+        return False
+
+    return True
+
+
 def _ensure_tracelens_installed(auto_install: bool = True) -> bool:
     """
     Ensure TraceLens and its dependencies are installed.
@@ -279,6 +315,8 @@ def _ensure_tracelens_installed(auto_install: bool = True) -> bool:
         try:
             # TraceLens is on GitHub, not PyPI; pin to a commit SHA for reproducibility and supply-chain safety
             install_spec = f"git+https://github.com/AMD-AGI/TraceLens.git@{TRACELENS_INSTALL_REF}"
+            if not _verify_tracelens_ref_exists(TRACELENS_INSTALL_REF):
+                return False
             subprocess.run(
                 [
                     sys.executable,
@@ -513,7 +551,7 @@ def generate_tracelens_report(
 
         # For "all" format: TraceLens uses either/or logic - if output_csvs_dir is set,
         # it ONLY generates CSVs. So we need to call it twice for both formats.
-        # Performance: trace file is parsed twice (~2x time; large traces can be hundreds of MB).
+        # Performance: trace file is parsed twice intentionally (~2x time; large traces can be hundreds of MB).
         # A future workaround could write CSVs from the DataFrames returned by the first call
         # if TraceLens API exposes a suitable export; for now we accept the double parse.
         if output_format == "all":
diff --git a/primus/core/utils/rocm_mem_info.py b/primus/core/utils/rocm_mem_info.py
index 0127d95a2..5c0b30fad 100644
--- a/primus/core/utils/rocm_mem_info.py
+++ b/primus/core/utils/rocm_mem_info.py
@@ -26,6 +26,9 @@ def get_rocm_smi_gpu_util(device_id: int):
         raise RuntimeError("rocm-smi not found, please ensure ROCm is installed and in PATH")
     except subprocess.TimeoutExpired:
         raise RuntimeError("rocm-smi --showuse timed out")
+    except subprocess.CalledProcessError as e:
+        output = e.output.strip() if isinstance(e.output, str) and e.output else "No output captured."
+        raise RuntimeError(f"rocm-smi --showuse failed with exit code {e.returncode}. Output: {output}")
 
     # Parse output: look for GPU use (%) or similar (e.g. "GPU use (%): 42" or "GPU Use: 42%")
     for line in out.splitlines():
@@ -34,7 +37,7 @@ def get_rocm_smi_gpu_util(device_id: int):
             continue
         # Prefer a number that follows a use/busy label.
         labeled_match = re.search(
-            r"(?:use|busy)[^0-9%]*([0-9]+(?:\.[0-9]+)?)\s*%?",
+            r"\b(?:use|busy)\b[^0-9%]*[:=]\s*([0-9]+(?:\.[0-9]+)?)\s*%?",
             line_lower,
         )
         if labeled_match:

From 195f28b938a15bb75b7050475417cd8a008c14f4 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Wed, 11 Feb 2026 13:33:12 +0000
Subject: [PATCH 39/42] Handle TraceLens SHA verification.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../backends/megatron/training/mlflow_artifacts.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index a137b835a..bf635ee58 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -264,9 +264,13 @@ def _verify_tracelens_ref_exists(ref: str) -> bool:
     Returns:
         True if the ref exists or verification is skipped, False otherwise
     """
+    is_commit_sha = bool(re.fullmatch(r"[0-9a-fA-F]{7,40}", ref))
     try:
+        ls_remote_cmd = ["git", "ls-remote", "https://github.com/AMD-AGI/TraceLens.git"]
+        if not is_commit_sha:
+            ls_remote_cmd.append(ref)
         result = subprocess.run(
-            ["git", "ls-remote", "https://github.com/AMD-AGI/TraceLens.git", ref],
+            ls_remote_cmd,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             text=True,
@@ -286,9 +290,15 @@ def _verify_tracelens_ref_exists(ref: str) -> bool:
         )
         return False
 
-    if not result.stdout.strip():
+    output = result.stdout.strip()
+    if not output:
         warning_rank_0(f"[TraceLens] TraceLens ref '{ref}' not found; skipping install.")
         return False
+    if is_commit_sha:
+        sha_lower = ref.lower()
+        if not any(line.lower().startswith(sha_lower) for line in output.splitlines()):
+            warning_rank_0(f"[TraceLens] TraceLens SHA '{ref}' not found; skipping install.")
+            return False
 
     return True
 

From 0b4b4911f022ffcaff17ea1f41df7a1029b6013f Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Fri, 20 Feb 2026 08:20:18 +0000
Subject: [PATCH 40/42] Add TraceLens normalization coverage and import.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../megatron/training/mlflow_artifacts.py     |  1 +
 .../megatron/test_mlflow_artifacts.py         | 33 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index bf635ee58..94fc65862 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -35,6 +35,7 @@
 
 import glob
 import os
+import re
 import subprocess
 import sys
 from typing import List, Optional
diff --git a/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py b/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
index 53d96923d..b647bad68 100644
--- a/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
+++ b/tests/unit_tests/backends/megatron/test_mlflow_artifacts.py
@@ -156,6 +156,39 @@ def test_filters_to_specified_ranks(self, tmp_path):
         assert "rank_2" in out[1]
 
 
+# -----------------------------------------------------------------------------
+# Normalize TraceLens inputs
+# -----------------------------------------------------------------------------
+
+
+class TestNormalizeTracelensInputs:
+    """Test TraceLens input normalization helpers."""
+
+    def test_normalize_ranks_none(self):
+        assert mlflow_artifacts_mod._normalize_tracelens_ranks(None) is None
+
+    def test_normalize_ranks_string_list(self):
+        ranks = mlflow_artifacts_mod._normalize_tracelens_ranks("[0, 2, '3']")
+        assert ranks == [0, 2, 3]
+
+    def test_normalize_ranks_invalid_string(self):
+        assert mlflow_artifacts_mod._normalize_tracelens_ranks("not a list") is None
+
+    def test_normalize_ranks_filters_invalid_and_world_size(self, monkeypatch):
+        monkeypatch.setenv("WORLD_SIZE", "2")
+        ranks = mlflow_artifacts_mod._normalize_tracelens_ranks([0, 1, 2, -1, "x", True])
+        assert ranks == [0, 1]
+
+    def test_normalize_output_format_none(self):
+        assert mlflow_artifacts_mod._normalize_tracelens_output_format(None) == "xlsx"
+
+    def test_normalize_output_format_valid(self):
+        assert mlflow_artifacts_mod._normalize_tracelens_output_format("CSV") == "csv"
+
+    def test_normalize_output_format_invalid(self):
+        assert mlflow_artifacts_mod._normalize_tracelens_output_format("pdf") == "xlsx"
+
+
 # -----------------------------------------------------------------------------
 # Report generation with mocked TraceLens
 # -----------------------------------------------------------------------------

From d2c88d7fc1711a99a9558ff6bcffcb7148413418 Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Mon, 23 Feb 2026 07:47:08 +0000
Subject: [PATCH 41/42] Remove redundant re import.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 primus/backends/megatron/training/mlflow_artifacts.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index 94fc65862..f33b456fe 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -384,8 +384,6 @@ def _extract_rank_from_filename(filename: str) -> Optional[int]:
     Returns:
         Rank number or None if not found
     """
-    import re
-
     # Try pattern: rank_N_, rank_N. (dot), rank[N], -rankN., _rankN.
     patterns = [
         r"rank_(\d+)_",

From 88eda50e344042751f94c015b5a5e56cf916debd Mon Sep 17 00:00:00 2001
From: guangphu <guangpu.huang@amd.com>
Date: Thu, 26 Feb 2026 12:21:06 +0000
Subject: [PATCH 42/42] Gate openpyxl installs and test mlflow setup.

Made-with: Cursor
---
 .../megatron/training/mlflow_artifacts.py     | 16 ++++--
 .../backends/megatron/test_mlflow_setup.py    | 51 +++++++++++++++++++
 2 files changed, 64 insertions(+), 3 deletions(-)
 create mode 100644 tests/unit_tests/backends/megatron/test_mlflow_setup.py

diff --git a/primus/backends/megatron/training/mlflow_artifacts.py b/primus/backends/megatron/training/mlflow_artifacts.py
index f33b456fe..9d8b3ab09 100644
--- a/primus/backends/megatron/training/mlflow_artifacts.py
+++ b/primus/backends/megatron/training/mlflow_artifacts.py
@@ -224,7 +224,7 @@ def upload_log_files_to_mlflow(
 # =============================================================================
 
 
-def _ensure_openpyxl_installed() -> bool:
+def _ensure_openpyxl_installed(auto_install: bool = True) -> bool:
     """
     Ensure openpyxl is installed for XLSX generation.
 
@@ -236,6 +236,9 @@ def _ensure_openpyxl_installed() -> bool:
 
         return True
     except ImportError:
+        if not auto_install:
+            warning_rank_0("[TraceLens] openpyxl not installed and auto-install disabled; skipping install.")
+            return False
         log_rank_0("[TraceLens] openpyxl not found, installing for XLSX support...")
         try:
             subprocess.run(
@@ -244,9 +247,13 @@ def _ensure_openpyxl_installed() -> bool:
                 stderr=subprocess.PIPE,
                 text=True,
                 check=True,
+                timeout=300,
             )
             log_rank_0("[TraceLens] Successfully installed openpyxl")
             return True
+        except subprocess.TimeoutExpired:
+            warning_rank_0("[TraceLens] openpyxl install timed out after 300s. Skipping install.")
+            return False
         except subprocess.CalledProcessError as e:
             stdout_output = e.stdout.strip() if e.stdout else "No stdout output captured."
             stderr_output = e.stderr.strip() if e.stderr else "No stderr output captured."
@@ -508,6 +515,7 @@ def generate_tracelens_report(
     output_dir: str,
     report_name: Optional[str] = None,
     output_format: str = "xlsx",
+    auto_install: bool = True,
 ) -> List[str]:
     """
     Generate a TraceLens analysis report for a single trace file.
@@ -552,7 +560,7 @@ def generate_tracelens_report(
 
         # Only ensure openpyxl when XLSX output is requested (avoids pip install in CSV-only or restricted envs)
         if output_format in ("xlsx", "all"):
-            if not _ensure_openpyxl_installed():
+            if not _ensure_openpyxl_installed(auto_install=auto_install):
                 warning_rank_0("[TraceLens] openpyxl unavailable; downgrading output_format to 'csv'.")
                 output_format = "csv"
 
@@ -815,7 +823,9 @@ def generate_tracelens_reports(
     generated_reports = []
     for trace_file in trace_files:
         # generate_tracelens_report now returns a list of files
-        report_paths = generate_tracelens_report(trace_file, output_dir, output_format=output_format)
+        report_paths = generate_tracelens_report(
+            trace_file, output_dir, output_format=output_format, auto_install=auto_install
+        )
         generated_reports.extend(report_paths)
 
     log_rank_0(
diff --git a/tests/unit_tests/backends/megatron/test_mlflow_setup.py b/tests/unit_tests/backends/megatron/test_mlflow_setup.py
new file mode 100644
index 000000000..70738d9a4
--- /dev/null
+++ b/tests/unit_tests/backends/megatron/test_mlflow_setup.py
@@ -0,0 +1,51 @@
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc.
+#
+# See LICENSE for license information.
+###############################################################################
+
+from unittest.mock import MagicMock, patch
+
+from primus.backends.megatron.training import mlflow_setup
+
+
+class DummyArgs:
+    def __init__(self, rank: int, world_size: int, mlflow_run_name=None):
+        self.rank = rank
+        self.world_size = world_size
+        self.mlflow_run_name = mlflow_run_name
+
+
+def _call_upload(generate_tracelens_report: bool, args: DummyArgs, mock_generate):
+    with patch(f"{mlflow_setup.__name__}.get_mlflow_writer", return_value=None), patch(
+        f"{mlflow_setup.__name__}.get_primus_args", return_value=args
+    ), patch(f"{mlflow_setup.__name__}.generate_tracelens_reports_locally", mock_generate):
+        mlflow_setup.upload_mlflow_artifacts(
+            tensorboard_dir="/tmp/tb",
+            exp_root_path="/tmp/exp",
+            generate_tracelens_report=generate_tracelens_report,
+        )
+
+
+def test_local_generation_single_rank_no_mlflow():
+    mock_generate = MagicMock()
+    _call_upload(True, DummyArgs(rank=0, world_size=1, mlflow_run_name=None), mock_generate)
+    mock_generate.assert_called_once()
+
+
+def test_local_generation_skipped_when_mlflow_expected_distributed():
+    mock_generate = MagicMock()
+    _call_upload(True, DummyArgs(rank=0, world_size=8, mlflow_run_name="run"), mock_generate)
+    mock_generate.assert_not_called()
+
+
+def test_local_generation_distributed_without_mlflow():
+    mock_generate = MagicMock()
+    _call_upload(True, DummyArgs(rank=0, world_size=8, mlflow_run_name=None), mock_generate)
+    mock_generate.assert_called_once()
+
+
+def test_local_generation_disabled_when_flag_false():
+    mock_generate = MagicMock()
+    _call_upload(False, DummyArgs(rank=0, world_size=1, mlflow_run_name=None), mock_generate)
+    mock_generate.assert_not_called()