diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9853068d4..2a40a9ba4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -114,8 +114,7 @@ jobs: echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-ainic" start_time=$(date +%s) mkdir -p $GITHUB_WORKSPACE/.github/workflows/docker/ainic - cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-38.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; } - cp /apps/tas/0_public/primus_docker_ci/ainic/amd-anp-v1.3.0.patch $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ + cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-56.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; } docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile.ainic \ --network=host \ -t tasimage/primus:${{env.IMAGE_TAG}}-ainic \ @@ -131,6 +130,25 @@ jobs: docker push docker.io/tasimage/primus:${{env.IMAGE_TAG}}-ainic docker login -u rocmshared -p ${{ secrets.ROCM_DOCKER_HUB_TOKEN }} + echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-v25.09-ainic" + start_time=$(date +%s) + mkdir -p $GITHUB_WORKSPACE/.github/workflows/docker/ainic + cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-56.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; } + docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile_v25.09_ainic \ + --network=host \ + -t tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic \ + --build-arg AINIC_BUNDLE_PATH=ainic \ + --build-arg PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT} \ + $GITHUB_WORKSPACE/.github/workflows/docker + end_time=$(date +%s) + elapsed=$((end_time - start_time)) + echo "⏱️ [build primus docker-v25.09-ainic] Total elapsed time: ${elapsed} seconds" + + docker tag tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic 
docker.io/tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic + docker login -u tasimage -p ${{ secrets.PRIMUS_DOCKER_HUB_TOKEN }} + docker push docker.io/tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic + docker login -u rocmshared -p ${{ secrets.ROCM_DOCKER_HUB_TOKEN }} + echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-jax" start_time=$(date +%s) docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile \ @@ -159,9 +177,11 @@ jobs: run-unittest-torch: env: - PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/torch + PRIMUS_WORKDIR: /mnt/apps_proxy/tas/0_public/primus_ci/actions-runner-torch + # PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/torch needs: [code-lint] - runs-on: [primus-lm-cicd-torch-j8knc] + # runs-on: [primus-lm-cicd-torch-j8knc] + runs-on: [primus-lm-cicd-torch-tas8n-a16-40] steps: - run: echo "πŸŽ‰ Begin Primus-Turbo Checkout." - name: Set commit hash to env @@ -176,10 +196,11 @@ jobs: - run: echo "Begin Primus-Turbo Install." - name: Install Primus-Turbo run: | + rm -rf /tmp/Primus-Turbo || true mv Primus-Turbo /tmp/ echo "Primus-Turbo dir: /tmp/Primus-Turbo" - git config --global --add safe.directory /tmp/Primus-Turbo - cd /tmp/Primus-Turbo + git config --global --add safe.directory /tmp/Primus-Turbo || true + cd /tmp/Primus-Turbo || true start_time=$(date +%s) echo "βœ… [Pip install requirements] started at: $(date)" mkdir -p ${PRIMUS_WORKDIR}/primus-cache @@ -233,6 +254,8 @@ jobs: run: | echo "Running Primus Core tests..." # Note: The tests `test_fp8_te_linear` and `test_te_linear` are temporarily skipped due to intermittent failures. 
+ # Note HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid RCCL perf hit (TAS-8N Node), rocm ver:70125424 + export HSA_NO_SCRATCH_RECLAIM=1 pytest --maxfail=1 -s ./tests/unit_tests/ \ --deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_fp8_te_linear \ --deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_te_linear @@ -243,7 +266,8 @@ jobs: echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}" rm -rf "${{ env.UT_LOG_PATH }}" mkdir -p "${{ env.UT_LOG_PATH }}" - MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \ + # MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \ + MASTER_PORT=10009 DATA_PATH=/mnt/apps_proxy/tas/0_public/data HSA_NO_SCRATCH_RECLAIM=1 \ pytest --maxfail=1 -s ./tests/trainer/test_megatron_trainer.py - name: Run Primus Model Tests -- TorchTitan env: @@ -252,7 +276,8 @@ jobs: echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}" rm -rf "${{ env.UT_LOG_PATH }}" mkdir -p "${{ env.UT_LOG_PATH }}" - MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \ + # MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \ + MASTER_PORT=10009 DATA_PATH=/mnt/apps_proxy/tas/0_public/data HSA_NO_SCRATCH_RECLAIM=1 \ pytest --maxfail=1 -s ./tests/trainer/test_torchtitan_trainer.py - name: Clean run: | diff --git a/.github/workflows/docker/Dockerfile.ainic b/.github/workflows/docker/Dockerfile.ainic index c6c951622..f377bd5d9 100644 --- a/.github/workflows/docker/Dockerfile.ainic +++ b/.github/workflows/docker/Dockerfile.ainic @@ -30,14 +30,16 @@ ENV MPI_PATH=/opt/ompi # WARNING: If these paths are missing, tools and libraries may not function correctly. # INFO: Installation completed successfully -COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-38.tar.gz ${WORKDIR} +COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-56.tar.gz ${WORKDIR} RUN cd ${WORKDIR} && \ echo "Building ainic bundle... 
current directory: ${WORKDIR}" && \ - tar zxf ainic_bundle_1.117.5-a-38.tar.gz && \ - cd ainic_bundle_1.117.5-a-38 && \ + tar zxf ainic_bundle_1.117.5-a-56.tar.gz && \ + cd ainic_bundle_1.117.5-a-56 && \ tar zxf host_sw_pkg.tar.gz && \ cd host_sw_pkg && \ - ./install.sh --domain=user -y 2>&1 | tee log_install.txt + ./install.sh --domain=user -y 2>&1 | tee log_install.txt && \ + cd ${WORKDIR} && \ + apt-get install -y ./amd/ainic/deb-repo/libionic*.deb # --------------------------------------------------------------------------- # Build rccl @@ -54,8 +56,9 @@ ENV RCCL_HOME=${WORKDIR}/rccl # Build AMD ANP # --------------------------------------------------------------------------- -RUN apt-get install -y --allow-unauthenticated libionic-dev && \ - cd ${WORKDIR} && git clone https://github.com/rocm/amd-anp.git && \ +# RUN apt-get install -y --allow-unauthenticated libionic-dev && \ +RUN cd ${WORKDIR} && \ + git clone https://github.com/rocm/amd-anp.git && \ cd amd-anp && git checkout tags/v1.3.0 && \ make -j 16 RCCL_HOME=${RCCL_HOME} \ MPI_INCLUDE=${MPI_PATH}/include/ \ diff --git a/.github/workflows/docker/Dockerfile_v25.09_ainic b/.github/workflows/docker/Dockerfile_v25.09_ainic index 44308696b..d8c5934a7 100644 --- a/.github/workflows/docker/Dockerfile_v25.09_ainic +++ b/.github/workflows/docker/Dockerfile_v25.09_ainic @@ -36,15 +36,16 @@ RUN apt-get update && \ # WARNING: If these paths are missing, tools and libraries may not function correctly. # INFO: Installation completed successfully -COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.1-a-42.tar.gz /opt/ +COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-56.tar.gz ${WORKDIR} RUN cd ${WORKDIR} && \ echo "Building ainic bundle... 
current directory: ${WORKDIR}" && \ - tar zxf ainic_bundle_1.117.1-a-42.tar.gz && \ - cd ainic_bundle_1.117.1-a-42 && \ + tar zxf ainic_bundle_1.117.5-a-56.tar.gz && \ + cd ainic_bundle_1.117.5-a-56 && \ tar zxf host_sw_pkg.tar.gz && \ cd host_sw_pkg && \ ./install.sh --domain=user -y 2>&1 | tee log_install.txt && \ - cd /opt + cd ${WORKDIR} && \ + apt-get install -y ./amd/ainic/deb-repo/libionic*.deb # =============================== Test AINIC Driver =============================== # ibv_devices diff --git a/docs/README.md b/docs/README.md index 45e28f26e..f2d22ad42 100644 --- a/docs/README.md +++ b/docs/README.md @@ -24,6 +24,7 @@ Guides for common workflows and features: In-depth technical documentation: +- **[Post-Training Guide](./posttraining.md)** - Fine-tuning with SFT and LoRA using Primus CLI - **[Performance Projection](./projection.md)** - Project training performance to multi-node configurations - **[Preflight](./preflight.md)** - Cluster diagnostics (host/GPU/network info + perf tests) - **[Benchmark Suite](./benchmark.md)** - GEMM, RCCL, end-to-end benchmarks and profiling diff --git a/docs/posttraining.md b/docs/posttraining.md new file mode 100644 index 000000000..27eb5ca01 --- /dev/null +++ b/docs/posttraining.md @@ -0,0 +1,452 @@ +# πŸŽ“ Post-Training with Primus + +This guide demonstrates how to perform post-training (fine-tuning) using **Megatron Bridge** within the **Primus** framework. It covers both **Supervised Fine-Tuning (SFT)** and **Low-Rank Adaptation (LoRA)** methods for customizing pre-trained models. 
+ +--- + +## πŸ“š Table of Contents + +- [πŸŽ“ Post-Training with Primus](#-post-training-with-primus) + - [πŸ“š Table of Contents](#-table-of-contents) + - [🎯 Overview](#-overview) + - [βš™οΈ Supported Backends](#️-supported-backends) + - [πŸ”§ Post-Training Methods](#-post-training-methods) + - [πŸš€ Quick Start](#-quick-start) + - [Prerequisites](#prerequisites) + - [Basic Usage](#basic-usage) + - [πŸ“ Configuration Examples](#-configuration-examples) + - [Supervised Fine-Tuning (SFT)](#supervised-fine-tuning-sft) + - [LoRA Fine-Tuning](#lora-fine-tuning) + - [πŸ–₯️ Single Node Training](#️-single-node-training) + - [Direct Mode](#direct-mode) + - [Container Mode](#container-mode) + - [πŸ“Š Hardware-Specific Configurations](#-hardware-specific-configurations) + - [MI300X Configurations](#mi300x-configurations) + - [MI355X Configurations](#mi355x-configurations) + - [🎨 Customizing Training Parameters](#-customizing-training-parameters) + - [πŸ’‘ Best Practices](#-best-practices) + - [πŸ” Troubleshooting](#-troubleshooting) + +--- + +## 🎯 Overview + +Post-training (fine-tuning) allows you to adapt pre-trained foundation models to specific tasks or domains. 
Primus supports two primary fine-tuning approaches: + +- **Supervised Fine-Tuning (SFT)**: Full fine-tuning that updates all model parameters +- **LoRA (Low-Rank Adaptation)**: Parameter-efficient fine-tuning that only trains lightweight adapter modules + +--- + +## βš™οΈ Supported Backends + +Post-training in Primus uses the **Megatron Bridge** backend: + +| Backend | Description | +| --------------- | --------------------------------------------------------------- | +| Megatron Bridge | Bridge implementation for fine-tuning Megatron-based models | + +--- + +## πŸ”§ Post-Training Methods + +| Method | Memory Usage | Training Speed | Use Case | +| ------ | ------------ | -------------- | ------------------------------------- | +| **SFT** | High | Slower | Maximum performance, full adaptation | +| **LoRA** | Low | Faster | Resource-efficient, quick iteration | + +**Key Differences:** +- **SFT** updates all model parameters, requiring more memory and compute +- **LoRA** trains only low-rank adapter matrices, significantly reducing resource requirements + +--- + +## πŸš€ Quick Start + +### Prerequisites + +- AMD ROCm drivers (β‰₯ 7.0) +- Docker (β‰₯ 24.0) with ROCm support (recommended) +- AMD Instinct GPUs (MI300X, MI355X, etc.) 
+- Pre-trained model checkpoint (optional, for continued training) + +```bash +# Quick verification +rocm-smi && docker --version +``` + +### Basic Usage + +The general command structure for post-training: + +```bash +./runner/primus-cli train posttrain --config +``` + +**Example commands:** + +```bash +# SFT with direct mode +./runner/primus-cli direct train posttrain \ + --config ./examples/megatron_bridge/configs/MI355X/qwen3_32b_sft_posttrain.yaml + +# LoRA with direct mode +./runner/primus-cli direct train posttrain \ + --config ./examples/megatron_bridge/configs/MI355X/qwen3_32b_lora_posttrain.yaml +``` + +--- + +## πŸ“ Configuration Examples + +### Supervised Fine-Tuning (SFT) + +Full fine-tuning configuration example for **Qwen3 32B** on **MI355X**: + +```yaml +work_group: ${PRIMUS_TEAM:amd} +user_name: ${PRIMUS_USER:root} +exp_name: ${PRIMUS_EXP_NAME:qwen3_32b_sft_posttrain} +workspace: ${PRIMUS_WORKSPACE:./output} + +modules: + post_trainer: + framework: megatron_bridge + config: sft_trainer.yaml + model: qwen3_32b.yaml + + overrides: + # Parallelism configuration + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + sequence_parallel: false + + # Fine-tuning method + peft: "none" # Full fine-tuning + + # Training configuration + train_iters: 200 + global_batch_size: 8 + micro_batch_size: 1 + seq_length: 8192 + + # Optimizer configuration + finetune_lr: 5.0e-6 + min_lr: 0.0 + lr_warmup_iters: 50 + + # Precision + precision_config: bf16_mixed +``` + +**Configuration location:** `examples/megatron_bridge/configs/MI355X/qwen3_32b_sft_posttrain.yaml` + +### LoRA Fine-Tuning + +Parameter-efficient fine-tuning configuration for **Qwen3 32B** on **MI355X**: + +```yaml +work_group: ${PRIMUS_TEAM:amd} +user_name: ${PRIMUS_USER:root} +exp_name: ${PRIMUS_EXP_NAME:qwen3_32b_lora_posttrain} +workspace: ${PRIMUS_WORKSPACE:./output} + +modules: + post_trainer: + framework: megatron_bridge + config: sft_trainer.yaml + model: 
qwen3_32b.yaml + + overrides: + # Parallelism configuration + tensor_model_parallel_size: 1 # LoRA requires less parallelism + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + sequence_parallel: false + + # Fine-tuning method + peft: lora # LoRA fine-tuning + + # Training configuration + train_iters: 200 + global_batch_size: 32 + micro_batch_size: 4 + seq_length: 8192 + + # Optimizer configuration + finetune_lr: 1.0e-4 # Higher LR for LoRA + min_lr: 0.0 + lr_warmup_iters: 50 + + # Precision + precision_config: bf16_mixed + + # Recompute configuration + recompute_granularity: full + recompute_method: uniform + recompute_num_layers: 1 +``` + +**Configuration location:** `examples/megatron_bridge/configs/MI355X/qwen3_32b_lora_posttrain.yaml` + +--- + +## πŸ–₯️ Single Node Training + +### Direct Mode + +Best for local development or when running directly on bare metal with ROCm installed. + +**SFT Example:** +```bash +./runner/primus-cli direct train posttrain \ + --config ./examples/megatron_bridge/configs/MI355X/qwen3_32b_sft_posttrain.yaml +``` + +**LoRA Example:** +```bash +./runner/primus-cli direct train posttrain \ + --config ./examples/megatron_bridge/configs/MI355X/qwen3_32b_lora_posttrain.yaml +``` + +**MI300X Examples:** +```bash +# SFT on MI300X +./runner/primus-cli direct train posttrain \ + --config ./examples/megatron_bridge/configs/MI300X/qwen3_32b_sft_posttrain.yaml + +# LoRA on MI300X +./runner/primus-cli direct train posttrain \ + --config ./examples/megatron_bridge/configs/MI300X/qwen3_32b_lora_posttrain.yaml +``` + +### Container Mode + +Recommended for environment isolation and dependency management. 
+ +**Pull Docker image:** +```bash +docker pull docker.io/rocm/primus:latest +``` + +**SFT Example:** +```bash +./runner/primus-cli container --image rocm/primus:latest \ + train posttrain \ + --config ./examples/megatron_bridge/configs/MI355X/qwen3_32b_sft_posttrain.yaml +``` + +**LoRA Example:** +```bash +./runner/primus-cli container --image rocm/primus:latest \ + train posttrain \ + --config ./examples/megatron_bridge/configs/MI355X/qwen3_32b_lora_posttrain.yaml +``` + +--- + +## πŸ“Š Hardware-Specific Configurations + +### MI300X Configurations + +Available configurations for AMD Instinct MI300X GPUs: + +| Model | Method | Config File | TP | GBS | MBS | Seq Len | +| ---------- | ------ | ------------------------------------------- | -- | --- | --- | ------- | +| Qwen3 32B | SFT | `MI300X/qwen3_32b_sft_posttrain.yaml` | 2 | 8 | 2 | 8192 | +| Qwen3 32B | LoRA | `MI300X/qwen3_32b_lora_posttrain.yaml` | 1 | 32 | 2 | 8192 | + +**Example:** +```bash +./runner/primus-cli direct train posttrain \ + --config ./examples/megatron_bridge/configs/MI300X/qwen3_32b_sft_posttrain.yaml +``` + +### MI355X Configurations + +Available configurations for AMD Instinct MI355X GPUs: + +| Model | Method | Config File | TP | GBS | MBS | Seq Len | +| ------------ | ------ | ------------------------------------------- | -- | --- | --- | ------- | +| Qwen3 32B | SFT | `MI355X/qwen3_32b_sft_posttrain.yaml` | 1 | 8 | 1 | 8192 | +| Qwen3 32B | LoRA | `MI355X/qwen3_32b_lora_posttrain.yaml` | 1 | 32 | 4 | 8192 | + +**Legend:** +- **TP**: Tensor Parallelism Size +- **GBS**: Global Batch Size +- **MBS**: Micro Batch Size (per GPU) +- **Seq Len**: Sequence Length + +**Example:** +```bash +./runner/primus-cli direct train posttrain \ + --config ./examples/megatron_bridge/configs/MI355X/qwen3_32b_lora_posttrain.yaml +``` + +--- + +## 🎨 Customizing Training Parameters + +Key parameters you can customize in the YAML configuration: + +### Parallelism Settings +```yaml +tensor_model_parallel_size: 1 # 
Number of GPUs for tensor parallelism (1-8) +pipeline_model_parallel_size: 1 # Number of GPUs for pipeline parallelism +context_parallel_size: 1 # Context parallelism for long sequences +sequence_parallel: false # Enable sequence parallelism +``` + +### Training Hyperparameters +```yaml +train_iters: 200 # Total training iterations +global_batch_size: 8 # Global batch size (8-32 depending on config) +micro_batch_size: 1 # Batch size per GPU (1-4 depending on config) +seq_length: 2048 # Sequence length (2048-8192 depending on model) +eval_interval: 30 # Evaluate every N iterations +save_interval: 50 # Save checkpoint every N iterations +``` + +### Learning Rate Configuration +```yaml +finetune_lr: 1.0e-4 # Initial learning rate +min_lr: 0.0 # Minimum learning rate +lr_warmup_iters: 50 # Number of warmup iterations +lr_decay_iters: null # Learning rate decay iterations +``` + +### Fine-Tuning Method +```yaml +peft: lora # Options: "lora" or "none" (for full SFT) +packed_sequence: false # Enable packed sequences for efficiency +``` + +### Precision Configuration +```yaml +precision_config: bf16_mixed # Options: bf16_mixed, fp16_mixed, fp32 +``` + +### Memory Optimization +```yaml +recompute_granularity: full # Options: full, selective, null +recompute_method: uniform # Recompute strategy +recompute_num_layers: 1 # Number of layers to recompute +``` + +--- + +## πŸ’‘ Best Practices + +### Choosing Between SFT and LoRA + +**Use SFT when:** +- You need maximum model performance +- You have sufficient GPU memory +- Training time is not critical +- You want full model adaptation + +**Use LoRA when:** +- GPU memory is limited +- You need fast iteration cycles +- Training multiple task-specific adapters +- Parameter efficiency is important + +### Parallelism Configuration + +**For SFT:** +- Use higher `tensor_model_parallel_size` for large models (e.g., TP=8 for 70B) +- Consider pipeline parallelism for very large models +- Examples: + - 32B model: TP=1-2 (MI300X: TP=2, 
MI355X: TP=1) + - 70B model: TP=8 + +**For LoRA:** +- Lower `tensor_model_parallel_size` due to reduced memory +- LoRA can fit larger models with less parallelism +- Examples: + - 32B model: TP=1 + - 70B model: TP=8 (still requires high TP due to model size) + +### Learning Rate Guidelines + +- **SFT**: Use lower learning rates (5e-6 to 1e-5) +- **LoRA**: Use higher learning rates (1e-4 to 5e-4) +- Always use warmup for stable training + +### Batch Size Recommendations + +- Start with `global_batch_size: 8` for SFT development +- LoRA can use higher batch sizes (e.g., 32) due to lower memory usage +- Increase for production: 64, 128, or higher +- Adjust `micro_batch_size` (1-4) based on GPU memory and sequence length +- Longer sequences (8192) may require higher `micro_batch_size` for efficiency + +--- + +## πŸ” Troubleshooting + +### Out of Memory (OOM) Errors + +**For SFT:** +1. Increase `tensor_model_parallel_size` +2. Reduce `micro_batch_size` +3. Enable gradient checkpointing: + ```yaml + recompute_granularity: full + recompute_method: uniform + recompute_num_layers: 1 + ``` +4. Reduce `seq_length` + +**For LoRA:** +1. LoRA should have lower memory usage; verify `peft: lora` is set +2. Reduce `micro_batch_size` if still facing OOM +3. Enable recomputation as above + +### Training Instability + +1. **Check learning rate**: Reduce if loss is spiking +2. **Increase warmup**: Try `lr_warmup_iters: 100` or higher +3. **Use mixed precision**: Ensure `precision_config: bf16_mixed` +4. **Monitor gradients**: Watch for gradient explosions + +### Slow Training Speed + +1. **Optimize batch size**: Increase `global_batch_size` if possible +2. **Check parallelism**: Ensure optimal TP/PP configuration +3. **Use container mode**: Docker containers can improve performance +4. **Profile execution**: Use profiling tools to identify bottlenecks + +### Configuration Issues + +1. **Verify paths**: Ensure config file paths are correct +2. 
**Check YAML syntax**: Validate indentation and structure +3. **Environment variables**: Set `PRIMUS_WORKSPACE` if needed +4. **Model checkpoint**: Verify pre-trained checkpoint path (if using) + +--- + +## 🎯 Summary Commands + +**Quick reference for common post-training tasks:** + +```bash +# SFT on MI355X (direct mode) +./runner/primus-cli direct train posttrain \ + --config ./examples/megatron_bridge/configs/MI355X/qwen3_32b_sft_posttrain.yaml + +# LoRA on MI355X (direct mode) +./runner/primus-cli direct train posttrain \ + --config ./examples/megatron_bridge/configs/MI355X/qwen3_32b_lora_posttrain.yaml + +# SFT on MI300X (container mode) +./runner/primus-cli container --image rocm/primus:latest train posttrain \ + --config ./examples/megatron_bridge/configs/MI300X/qwen3_32b_sft_posttrain.yaml +``` + +--- + +**Need help?** Open an issue on [GitHub](https://github.com/AMD-AIG-AIMA/Primus/issues). + +**Start fine-tuning with Primus! πŸš€** diff --git a/docs/projection.md b/docs/projection.md index 5c5212026..9af860731 100644 --- a/docs/projection.md +++ b/docs/projection.md @@ -1,39 +1,64 @@ -# Performance Projection +# Projection -Primus includes a performance projection tool that benchmarks transformer layers on a single node and projects training iteration times to multi-node configurations. +Primus includes projection tools that estimate **memory requirements** and **training performance** for large-scale distributed training without requiring the full target cluster. Two projection modes are available: -- **User-facing entry**: `primus-cli … -- projection performance [options]` -- **Implementation entrypoint**: `primus/cli/subcommands/projection.py` -- **Core logic**: `primus/core/projection/performance_projection/projection.py` - -## Overview - -The performance projection tool: - -1. **Benchmarks** transformer layers on a single node to measure forward/backward pass times -2. 
**Simulates** pipeline parallelism scheduling (including zero-bubble optimization) -3. **Projects** performance to multi-node configurations by modeling: - - Data Parallelism (DP) scaling - - Gradient AllReduce communication overhead - - Expert Parallelism (EP) All-to-All communication overhead - - Inter-node vs intra-node communication differences +| Mode | Command | What it does | +|------|---------|--------------| +| **Memory** | `projection memory` | Estimates per-GPU memory (parameters, optimizer, activations) using analytical formulas | +| **Performance** | `projection performance` | Benchmarks layers on 1 node, then projects training time to multi-node clusters | -This allows you to estimate training performance on larger clusters without actually running on them. +- **User-facing entry**: `primus-cli … -- projection {memory,performance} [options]` +- **Implementation entrypoint**: `primus/cli/subcommands/projection.py` +- **Core logic**: + - Memory: `primus/core/projection/memory_projection/projection.py` + - Performance: `primus/core/projection/performance_projection/projection.py` + +--- + +## Table of Contents + +1. [Quick Start](#quick-start) +2. [Command Syntax](#command-syntax) +3. [Memory Projection](#memory-projection) + - [Overview](#memory-overview) + - [Architecture](#memory-architecture) + - [Parameter Estimation](#parameter-estimation) + - [Param + Optimizer Memory](#param--optimizer-memory) + - [Activation Memory](#activation-memory) + - [Pipeline Schedule Memory Scaling](#pipeline-schedule-memory-scaling) + - [Recomputation Support](#recomputation-support) + - [Memory Formulas Reference](#memory-formulas-reference) +4. 
[Performance Projection](#performance-projection) + - [Overview](#performance-overview) + - [How It Works](#how-it-works) + - [Scaling Mechanisms](#scaling-mechanisms) + - [Communication Modeling](#communication-modeling) + - [Pipeline Schedule Simulator](#pipeline-schedule-simulator) + - [Overall Performance Prediction Flow](#overall-performance-prediction-flow) +5. [Example Output](#example-output) +6. [Assumptions and Limitations](#assumptions-and-limitations) +7. [Tips](#tips) + +--- ## Quick Start -Run a basic performance projection for the minimum required nodes: +### Memory Projection + +Estimate per-GPU memory for a model configuration (no GPU needed for estimation, but the CLI currently requires torch distributed init): ```bash export NNODES=1 export HSA_NO_SCRATCH_RECLAIM=1 bash runner/primus-cli direct --script primus/cli/main.py -- \ - projection performance \ + projection memory \ --config examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml ``` -Project performance to a specific number of nodes: +### Performance Projection + +Benchmark on 1 node and project to a target cluster: ```bash export NNODES=1 @@ -45,26 +70,32 @@ bash runner/primus-cli direct --script primus/cli/main.py -- \ --target-nodes 4 ``` +--- + ## Command Syntax ```bash -primus-cli [global-options] [mode-args] -- projection performance [options] +primus-cli [global-options] [mode-args] -- projection {memory,performance} [options] ``` -### Options +### Common Options | Option | Type | Description | |--------|------|-------------| | `--config` | string | Path to the Primus YAML configuration file (required) | + +### Performance-Only Options + +| Option | Type | Description | +|--------|------|-------------| | `--target-nodes` | int | Target number of nodes for projection. 
Defaults to minimum required by parallelism config | | `--hardware-config` | string | Path to YAML file with custom hardware parameters for communication modeling | ### Parallelism Overrides -You can override parallelism settings from the config file using environment variables or CLI overrides: +You can override parallelism settings from the config file: ```bash -# Using environment variables export PRIMUS_TP=1 export PRIMUS_PP=3 export PRIMUS_EP=8 @@ -75,30 +106,390 @@ bash runner/primus-cli direct --script primus/cli/main.py -- \ --target-nodes 6 ``` -## How It Works +--- + +## Memory Projection + + +### Overview + +The memory projection estimates **per-GPU memory** usage by analytically computing: + +1. **Parameter memory** β€” model weights stored on this GPU +2. **Optimizer state memory** β€” optimizer first/second moments, sharded across DP ranks +3. **Activation memory** β€” intermediate tensors stored for the backward pass + +It uses a hierarchical profiler system that mirrors the model's module structure, computing each component's contribution bottom-up. 
+ + +### Architecture + +``` +LanguageModelProfiler +β”œβ”€β”€ EmbeddingProfiler β€” vocab embeddings (stage 0 only) +β”œβ”€β”€ DenseTransformerLayerProfiler β€” for non-MoE layers +β”‚ β”œβ”€β”€ LayerNormProfiler (Γ—3) β€” pre-attn, pre-MLP, post-MLP +β”‚ β”œβ”€β”€ AttentionProfiler β€” QKV projections + attention +β”‚ β”œβ”€β”€ ResidualAddProfiler (Γ—2) β€” skip connections +β”‚ └── DenseMLPProfiler β€” standard SwiGLU/FFN +β”œβ”€β”€ MoETransformerLayerProfiler β€” for MoE layers +β”‚ β”œβ”€β”€ LayerNormProfiler (Γ—3) +β”‚ β”œβ”€β”€ AttentionProfiler +β”‚ β”œβ”€β”€ ResidualAddProfiler (Γ—2) +β”‚ β”œβ”€β”€ RouterProfiler β€” expert routing logits +β”‚ └── MoEMLPProfiler β€” routed experts + shared expert +β”œβ”€β”€ LayerNormProfiler β€” final layer norm (last stage only) +β”œβ”€β”€ OutputLayerProfiler β€” language model head (last stage only) +└── LossProfiler β€” cross-entropy loss (last stage only) +``` + +Each profiler implements two key methods: +- `estimated_num_params(rank)` β€” parameter count (total if `rank=None`, per-GPU if rank given) +- `estimated_activation_memory(batch_size, seq_len)` β€” activation bytes for one microbatch + + +### Parameter Estimation + +Parameters are computed per component and summed across all layers assigned to this GPU's pipeline stage. + +#### Layer Assignment + +Layers are distributed across `PP Γ— VPP` virtual stages. Each physical PP rank hosts `VPP` virtual stages in an interleaved pattern: + +``` +PP rank 0 β†’ virtual stages 0, PP, 2Γ—PP, ... +PP rank 1 β†’ virtual stages 1, PP+1, 2Γ—PP+1, ... 
+``` + +#### Per-Component Parameter Formulas + +| Component | Formula | Notes | +|-----------|---------|-------| +| **Embedding** | `V Γ— H` | `V` = padded vocab size, `H` = hidden size | +| **LayerNorm** | `2 Γ— H` | gamma + beta per LayerNorm | +| **Attention (standard)** | `2 Γ— HΒ² Γ— (1 + G/A) Γ— P` | `A` = num heads, `G` = num KV groups, `P` = proj ratio | +| **Attention (MLA)** | `q_term + kv_term + pos + out` | DeepSeek-style multi-latent attention | +| **Dense MLP (SwiGLU)** | `3 Γ— H Γ— FFN` | gate, up, down projections | +| **Dense MLP (standard)** | `2 Γ— H Γ— FFN` | up, down projections | +| **Router** | `H Γ— NE` | `NE` = number of experts | +| **MoE MLP** | `NE/EP Γ— n_proj Γ— H Γ— FFN_e + shared` | Expert params sharded by EP | +| **Output Layer** | `V Γ— H` | May share weights with embedding | + +Where: +- `H` = `hidden_size` +- `V` = `padded_vocab_size` +- `FFN` = `ffn_hidden_size` (dense MLP intermediate dimension) +- `FFN_e` = `moe_ffn_hidden_size` (per-expert intermediate dimension) +- `NE` = `num_experts` +- `EP` = `expert_model_parallel_size` +- `n_proj` = 3 for SwiGLU, 2 for standard FFN + +### Param + Optimizer Memory + +The total bytes per parameter depends on the training precision and optimizer sharding: + +``` +bytes_per_param = weight_bytes + gradient_bytes + optimizer_bytes + +Where: + weight_bytes = 2 (BF16 weights) + gradient_bytes = 2 (BF16 gradients) + optimizer_bytes = 10/DP (FP32 master weights + Adam m + Adam v, sharded across DP) + = (2 + 4 + 4) / DP +``` + +**DP calculation:** + +``` +DP = world_size / (EP Γ— PP) +``` + +Note: TP and CP are not divided out because all TP/CP ranks within a DP group share the same optimizer partition. + +**Total param + optimizer memory per GPU:** + +``` +param_optimizer_memory = params_on_this_gpu Γ— bytes_per_param +``` + +### Activation Memory -### 1. Configuration Reduction +Activation memory is the memory needed to store intermediate tensors for the backward pass. 
Each component estimates what it stores for backward. + +#### Base Tensor (sbh) + +The fundamental building block is the hidden state tensor: + +``` +sbh = MBS Γ— (S / TP / CP) Γ— H Γ— 2 bytes (BF16) +``` + +Where `MBS` = micro batch size, `S` = sequence length. + +#### Per-Component Activation Formulas + +##### LayerNorm +Stores its input for backward: +``` +act = sbh = MBS Γ— S/(TPΓ—CP) Γ— H Γ— 2 +``` + +##### Residual Add +Stores the residual for backward: +``` +act = sbh +``` + +##### Router +Stores hidden states for routing weight gradients: +``` +act = sbh +``` + +##### Attention (standard, Flash Attention) + +Stores Q, K, V, output, and logsumexp for Flash Attention backward: + +```python +tokens_per_rank = MBS Γ— S / (TP Γ— CP) + +# activation width = Q + K + V + output + softmax stats +Q_width = kv_channels Γ— num_heads # e.g. 128 Γ— 64 = 8192 +KV_width = kv_channels Γ— num_kv_groups # e.g. 128 Γ— 1 = 128 (MQA) +output_width = hidden_size # 8192 +softmax_width = Q_width (with Flash Attention) # 8192 + +total_width = Q_width + 2Γ—KV_width + output_width + softmax_width +act = tokens_per_rank Γ— total_width Γ— 2 (BF16) +``` + +For MQA with 64 heads and 1 KV group: `Q(256MB) + K(4MB) + V(4MB) + O(256MB) + LSE(4MB) β‰ˆ 0.51 GB` + +##### Dense MLP (SwiGLU) + +For the SwiGLU computation `output = down_proj(silu(gate_proj(x)) βŠ™ up_proj(x))`, stores: + +```python +tokens = MBS Γ— S / (TP Γ— CP) + +# SwiGLU stores gate, up, and hidden (siluΓ—up) for backward +intermediate = 2 Γ— tokens Γ— FFN Γ— 2 # gate_proj + up_proj outputs (BF16) +activation = tokens Γ— FFN Γ— 2 # silu(gate) βŠ™ up (input to down_proj) +output = tokens Γ— H Γ— 2 # down_proj output + +act = intermediate + activation + output + = tokens Γ— (3Γ—FFN + H) Γ— 2 +``` + +##### MoE MLP + +For MoE, each token is routed to `topk` experts, duplicating the activation: + +```python +tokens = MBS Γ— S / (TP Γ— CP) +topk_tokens = tokens Γ— topk # total token-expert pairs + +# Routed experts: same SwiGLU 
structure per token-expert pair +routed_act = topk_tokens Γ— (3Γ—FFN_e + H) Γ— 2 + +# Shared expert (if configured): processes ALL tokens +shared_act = tokens Γ— (3Γ—FFN_e + H) Γ— 2 # same SwiGLU, one copy + +act = routed_act + shared_act + = tokens Γ— (topk + N_shared) Γ— (H + 3Γ—FFN_e) Γ— 2 +``` + +Where: +- `topk` = `moe_router_topk` (experts activated per token) +- `FFN_e` = `moe_ffn_hidden_size` (per-expert FFN intermediate dimension) +- `N_shared` = 1 if `moe_shared_expert_intermediate_size` is set, else 0 + +**Example (MoE 4.5T, MBS=4, S=16384, CP=4, H=8192, FFN_e=2048, topk=36):** +``` +tokens = 4 Γ— 16384/4 = 16,384 +MoE MLP = 16,384 Γ— (36+1) Γ— (8192 + 3Γ—2048) Γ— 2 = 16.19 GB +``` + +##### Full Transformer Layer (without recompute) + +For a MoE layer, the total is the sum of all components: + +``` +layer_act = 3Γ—LayerNorm + Attention + 2Γ—ResidualAdd + Router + MoE_MLP + = 3Γ—sbh + attn_act + 2Γ—sbh + sbh + moe_mlp_act + = 6Γ—sbh + attn_act + moe_mlp_act +``` + +For a dense layer: same but with Dense MLP instead of Router + MoE MLP. + +##### Full Layer Activation Summary + +| Component | Formula | Typical Size (MoE 4.5T) | +|-----------|---------|------------------------| +| LayerNorm (Γ—3) | `3 Γ— sbh` | 0.75 GB | +| Residual Add (Γ—2) | `2 Γ— sbh` | 0.50 GB | +| Router | `sbh` | 0.25 GB | +| Attention (Flash, MQA) | `tokens Γ— (Q+2KV+O+softmax) Γ— 2` | 0.51 GB | +| MoE MLP (SwiGLU) | `tokens Γ— (topk+1) Γ— (H+3Γ—FFN_e) Γ— 2` | 16.19 GB | +| **Full MoE layer** | **sum** | **18.20 GB** | +| With full recompute | `sbh` (checkpoint only) | 0.25 GB | + + +### Pipeline Schedule Memory Scaling + +With pipeline parallelism, multiple microbatches are in-flight simultaneously, each requiring stored activations. + +#### 1F1B Schedule + +In the 1F1B (one-forward-one-backward) schedule, the first pipeline stage (rank 0) accumulates `PP` microbatches during the warmup phase before starting any backward passes. 
This means peak activation memory requires storing activations for `PP` microbatches. + +``` +base_activation = sum of per-layer activations across all layers on this rank +peak_activation = base_activation Γ— PP +``` + +#### VPP (Virtual Pipeline Parallelism) Overhead + +With interleaved scheduling (VPP > 1), there is a small memory overhead because more microbatches can be partially in-flight: + +``` +interleaved_penalty = 1 + (PP - 1) / (PP Γ— VPP) +``` + +For VPP=1: penalty = 1 + (PP-1)/PP (significant overhead) +For VPP=20: penalty = 1 + (PP-1)/(PPΓ—20) β‰ˆ 1.04 (nearly negligible) + +#### Gradient Accumulation Saving + +When gradient accumulation (GA) steps are fewer than PP stages, the pipeline isn't fully filled: + +```python +GA = GBS / (MBS Γ— DP) +ga_saving = 1.0 if GA >= PP else GA / PP +``` + +#### Final Activation Memory Formula + +``` +total_activation = base_activation Γ— PP Γ— interleaved_penalty Γ— ga_saving +``` + + +### Recomputation Support + +Activation recomputation trades compute for memory by discarding intermediate activations during forward and recomputing them during backward. + +#### Full Recompute (`recompute_granularity="full"`) + +When a layer is fully recomputed, only the **input tensor** is stored as a checkpoint: + +``` +recomputed_layer_act = sbh = MBS Γ— S/(TPΓ—CP) Γ— H Γ— 2 bytes +``` + +This is dramatically smaller than the full activation. For example, an MoE layer drops from ~18 GB to 0.25 GB. 
+ +#### Partial Recompute + +The `recompute_num_layers` setting controls how many layers per VPP stage are recomputed: + +```python +for each layer on this rank: + local_idx = layer_index % layers_per_vpp_stage + if recompute_granularity == "full" and local_idx < recompute_num_layers: + act += input_act_per_layer # just sbh (checkpoint) + else: + act += full_layer_act # all intermediates +``` + +#### With Recompute: Total Memory + +``` +total_with_recompute = (L/PP Γ— sbh) Γ— PP Γ— interleaved_penalty Γ— ga_saving + + recompute_working_memory (1 layer's full activation, temporary) + + embedding_act (stage 0 only) +``` + +The recompute working memory is transient β€” only one layer's full intermediates exist at a time during backward. + +### Memory Formulas Reference + +Summary of all memory components for one GPU: + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Total GPU Memory β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ β”‚ +β”‚ 1. Parameters (BF16) β”‚ +β”‚ = params_on_rank Γ— 2 bytes β”‚ +β”‚ β”‚ +β”‚ 2. Gradients (BF16) β”‚ +β”‚ = params_on_rank Γ— 2 bytes β”‚ +β”‚ β”‚ +β”‚ 3. Optimizer States (FP32, sharded across DP) β”‚ +β”‚ = params_on_rank Γ— 10 / DP bytes β”‚ +β”‚ (master weights: 2B + Adam m: 4B + Adam v: 4B) β”‚ +β”‚ β”‚ +β”‚ 4. Activations β”‚ +β”‚ = Ξ£(per-layer act) Γ— PP Γ— VPP_penalty Γ— GA_saving β”‚ +β”‚ + embedding/output activations (stage-dependent) β”‚ +β”‚ β”‚ +β”‚ 5. 
Transient buffers (not in projection) β”‚ +β”‚ - A2A dispatch/combine buffers β”‚ +β”‚ - Communication scratch space β”‚ +β”‚ - PyTorch allocator fragmentation β”‚ +β”‚ β”‚ +β”‚ Total = (1) + (2) + (3) + (4) β”‚ +β”‚ Reported as: Param+Optimizer + Activation = Projected Total β”‚ +β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## Performance Projection + + +### Overview + +The performance projection tool: + +1. **Benchmarks** transformer layers on a single node to measure forward/backward pass times +2. **Simulates** pipeline parallelism scheduling (including zero-bubble optimization) +3. **Projects** performance to multi-node configurations by modeling: + - Data Parallelism (DP) scaling + - Gradient AllReduce communication overhead + - Expert Parallelism (EP) All-to-All communication overhead + - Inter-node vs intra-node communication differences + +This allows you to estimate training performance on larger clusters without actually running on them. + +### How It Works + +#### 1. Configuration Reduction If the parallelism configuration requires multiple nodes (e.g., PP=3 needs 3 nodes), the tool automatically reduces the config for single-node benchmarking: - **Pipeline Parallelism (PP)**: Reduced to fit on 1 node, PP overhead estimated analytically - **Expert Parallelism (EP)**: Reduced if needed, All-to-All overhead added back -### 2. Layer Benchmarking +#### 2. Layer Benchmarking The tool benchmarks each transformer layer type: - Dense attention layers - MoE (Mixture of Experts) layers - Measures forward and backward pass times separately +- Also benchmarks embedding and output layers -### 3. Pipeline Simulation +#### 3. 
Pipeline Simulation For PP > 1, the tool simulates the pipeline schedule to account for: - Pipeline bubble overhead - Microbatch interleaving - Zero-bubble scheduling (if enabled) -### 4. Data Parallel Scaling +#### 4. Data Parallel Scaling The projection models how performance scales with additional nodes: @@ -106,11 +497,9 @@ The projection models how performance scales with additional nodes: Projected Time = (Base Time / DP_scaling_factor) + Communication Overheads ``` -## Scaling Mechanisms +### Scaling Mechanisms -The tool models the following parallelism dimensions and their communication patterns: - -### Tensor Parallelism (TP) +#### Tensor Parallelism (TP) **What it does**: Splits individual layer weights across GPUs within a node. @@ -118,7 +507,7 @@ The tool models the following parallelism dimensions and their communication pat **Communication**: AllReduce within TP group (typically intra-node, fast). -### Pipeline Parallelism (PP) +#### Pipeline Parallelism (PP) **What it does**: Distributes layers across pipeline stages. Each stage processes microbatches in sequence. @@ -128,7 +517,7 @@ The tool models the following parallelism dimensions and their communication pat - Simulates the actual 1F1B or zero-bubble schedule with proper send/receive synchronization - Accounts for pipeline bubble overhead and microbatch interleaving -### Expert Parallelism (EP) +#### Expert Parallelism (EP) **What it does**: Distributes MoE experts across GPUs. Each GPU holds a subset of experts. @@ -143,7 +532,7 @@ The tool models the following parallelism dimensions and their communication pat All-to-All Message Size = tokens Γ— hidden_size Γ— top_k Γ— 2 (BF16) ``` -### Data Parallelism (DP) +#### Data Parallelism (DP) **What it does**: Replicates the model across DP groups. Each group processes different data batches. 
@@ -159,13 +548,13 @@ Gradient AllReduce Size = num_params Γ— 4 (FP32 gradients) **Optimization**: If `overlap_grad_reduce=True` (default), gradient AllReduce is overlapped with backward computation and not on the critical path. -### Context Parallelism (CP) +#### Context Parallelism (CP) **What it does**: Splits sequence length across GPUs for long-context training. **How it's modeled**: CP affects the GPU topology for communication routing. Currently included in minimum GPU requirements calculation. -## Communication Modeling +### Communication Modeling The tool uses analytical models to estimate collective communication times: @@ -176,33 +565,177 @@ The tool uses analytical models to estimate collective communication times: | P2P Send/Recv | PP (activations) | Point-to-point latency + bandwidth | Communication times differ significantly based on: -- **Intra-node**: Fast (e.g., NVLink, xGMI) -- **Inter-node**: Slower (e.g., InfiniBand, RoCE) +- **Intra-node**: Fast (e.g., NVLink, UALink, xGMI) +- **Inter-node**: Slower (e.g., InfiniBand, RoCE) + +Custom hardware parameters can be provided via `--hardware-config `. -### Key Concepts +#### Key Concepts -#### Minimum Nodes Required +##### Minimum Nodes Required -The minimum nodes required is determined by: ``` Min Nodes = ceil(TP Γ— PP Γ— EP Γ— CP / GPUs_per_node) ``` -#### Scaling Behavior +##### Scaling Behavior - **DP scaling**: Linear speedup. Doubling DP halves iteration time (minus communication overhead). - **PP scaling**: Happens in multiples of pipeline replicas. With PP=3, you need 3, 6, 9... nodes to increase scaling. -- **EP scaling**: Divides the experts on EP nodes. +- **EP scaling**: Divides the experts across EP nodes. + +### Pipeline Schedule Simulator + +The pipeline simulator (`simulator.py`) simulates the execution of pipeline parallelism schedules to calculate step time and bubble ratio. 
+ +#### Schedule Algorithms + +| Algorithm | Description | Use Case | +|-----------|-------------|----------| +| **1F1B** | Standard one-forward-one-backward | Default pipeline schedule | +| **Interleaved 1F1B** | Multiple chunks per rank (VPP > 1) | Reduced bubble ratio | +| **Zero-Bubble** | Splits backward into B + W | Minimal bubble overhead | + +#### Zero-Bubble Scheduling + +Zero-bubble minimizes pipeline bubbles by separating the backward pass: +- **B (Input Gradient):** Compute gradients w.r.t. input activations +- **W (Weight Gradient):** Compute gradients w.r.t. weights + +This allows more flexible scheduling because W doesn't depend on receiving gradients from the next stage. By default, backward time is split 50/50 between B and W. + +Two implementations are available: +1. **Simple Zero-Bubble Simulator** β€” basic F-B-W pattern with warmup/steady/cooldown phases +2. **Megatron ILP-Based Scheduler** β€” graph-based schedule optimization with memory-aware scheduling using Megatron's actual zero-bubble scheduler + +#### P2P Communication in Pipeline Simulation + +The pipeline simulator uses a **fixed small constant** (~0.1 ms) for P2P communication, NOT the analytical `sendrecv` model. This is because: +1. P2P communication is typically **overlapped with compute** in optimized schedules +2. The simulator focuses on **schedule ordering and bubble calculation** +3. P2P time is **small relative to F/B/W times** for large models + +However, when the benchmark PP differs from the target PP (e.g., benchmark PP=1, target PP=6), the **analytical `sendrecv` model** is used to estimate the PP communication overhead that was not captured in the benchmark: + +``` +PP overhead = additional_stages Γ— 2 (fwd+bwd) Γ— microbatches Γ— sendrecv(activation_size) +``` + +P2P communication becomes significant when PP stages span nodes (inter-node P2P has 10-100Γ— higher latency than intra-node). + +### Overall Performance Prediction Flow + +``` +1. 
Load Configuration + └── Parse YAML config, extract parallelism settings + +2. Single-Node Benchmarking + β”œβ”€β”€ If config requires multiple nodes: + β”‚ └── Reduce PP to 1, possibly rescale EP to fit on 1 node + β”œβ”€β”€ Limit layers (1 dense + 1 MoE for efficiency) + └── Benchmark forward + backward times + +3. Extrapolate to Full Model + └── Multiply per-layer times by total layer count + +4. Pipeline Schedule Simulation (if PP > 1) + β”œβ”€β”€ Build chunk time matrix (per-rank, per-chunk) + β”œβ”€β”€ Select scheduler (1F1B, Interleaved, Zero-Bubble) + └── Simulate to get step_time_ms and bubble_ratio + +5. Add Communication Overhead (if config was reduced) + β”œβ”€β”€ PP overhead: P2P communication between stages + └── EP overhead: Additional All-to-All for larger EP + +6. Multinode Scaling Projection + β”œβ”€β”€ Calculate DP scaling factor + β”œβ”€β”€ Scale compute time: projected = base Γ— (min_dp / target_dp) + β”œβ”€β”€ Add gradient AllReduce (if not overlapped) + └── Report projected iteration time and tokens/s +``` + +#### Performance Formula + +``` +Projected_Time = Base_Time Γ— (Min_DP / Target_DP) + Communication_Overhead + +Where: + Base_Time = Pipeline simulation time (includes PP bubbles) + Min_DP = DP at minimum node configuration + Target_DP = DP at target node configuration + Communication_Overhead = Gradient AllReduce (if not overlapped) + + MoE All-to-All overhead (if EP spans nodes) +``` + +#### Example Calculation + +**Configuration:** DeepSeek V2 Lite β€” TP=1, PP=3, EP=8, CP=1 β€” GBS=640, MBS=4, Seq=4096 + +``` +Step 1: Minimum Nodes + GPUs required = 1 Γ— 3 Γ— 8 Γ— 1 = 24 GPUs = 3 nodes + Min DP = 24 / (1 Γ— 3 Γ— 1) = 8 + +Step 2: Target Configuration (6 nodes) + Total GPUs = 48 + Target DP = 48 / (1 Γ— 3 Γ— 1) = 16 + DP Scaling = 16 / 8 = 2Γ— + +Step 3: Projected Time + Base_Time (from pipeline simulation) = 5500 ms + Projected_Time = 5500 Γ— (8 / 16) = 2750 ms + Tokens/s = (640 Γ— 4096) / 2.750 = 953,018 tokens/s +``` + +--- ## 
Example Output +### Memory Projection + ``` ==================================================================================================== -[Primus:Performance Projection] Configuration Summary: - Benchmark Config: PP=1, EP=8, TP=1, CP=1, DP=1 (1 node) - Target Config: PP=1, EP=8, TP=1, CP=1, DP=4 (4 nodes) - Benchmark Microbatches: 160 (global_batch=640, micro_batch=4, benchmark_dp=1) +[Primus:Projection] Component-wise Profiling Results (Rank 0): +==================================================================================================== + + Total Number of Parameters: 15.654321 Billion (15,654,321,024) + + [embedding] + Params: 0.819200 Billion (819,200,000) + Activation Memory: 0.2500 GB + + [dense_transformer_layer] + Params: 0.302000 Billion (302,000,000) + Activation Memory: 2.1250 GB + + [layer_norm] + Params: 0.000016 Billion (16,384) + Activation Memory: 0.2500 GB + + [self_attention] + Params: 0.134218 Billion (134,217,728) + Activation Memory: 0.5100 GB + + [mlp] + Params: 0.167772 Billion (167,772,160) + Activation Memory: 0.8650 GB + + [moe_transformer_layer] + Params: 1.001400 Billion (1,001,400,000) + Activation Memory: 18.2000 GB + +==================================================================================================== +[Primus:Projection] Memory Projection Summary on Rank 0: + Params: 20.850000 Billion (20,850,000,000) + Param+Optimizer Memory: 83.7400 GB + Activation Memory (per batch size 4, seq len 16384): 36.7500 GB + Projected Total Memory: 120.4900 GB +==================================================================================================== +``` +### Performance Projection + +``` ==================================================================================================== Multinode Scaling Projection Results ==================================================================================================== @@ -226,12 +759,36 @@ Multinode Scaling Projection Results 
==================================================================================================== ``` +--- + +## Assumptions and Limitations + +### Assumptions + +1. **Linear DP Scaling** β€” Compute time scales linearly with 1/DP (ideal weak scaling) +2. **Communication Model** β€” Bandwidth efficiency is constant (default 91%); inter-node communication uses switch topology; all NICs are used in parallel for inter-node traffic +3. **Pipeline Bubbles** β€” B/W split is 50/50 for zero-bubble scheduling; P2P communication time is small relative to compute +4. **Gradient AllReduce** β€” By default overlapped with compute (`overlap_grad_reduce=True`); if disabled, added to critical path +5. **MoE All-to-All** β€” All-to-All time scales with EP size; per-peer latency overhead is constant + +### Limitations + +1. **Single-Node Benchmark Accuracy** β€” Benchmarking with reduced PP/EP may not capture all behaviors; layer timing variance is assumed uniform +2. **Communication Contention** β€” Model doesn't account for network contention; assumes dedicated bandwidth per collective +3. **Memory Pressure** β€” Memory impact on performance not fully modeled; activation recomputation overhead not considered in performance +4. **Hardware Heterogeneity** β€” Assumes homogeneous nodes; GPU frequency variations not modeled + +--- + ## Tips +- **Start with memory projection**: Run `projection memory` first to verify your model fits in GPU memory before benchmarking. - **Start with 1 node**: Always benchmark on 1 node first to establish baseline performance. -- **Understand scaling limits**: DP scaling is limited by global_batch_size / micro_batch_size. If you run out of microbatches, adding more nodes won't help. -- **Check minimum nodes**: If your config requires multiple nodes (e.g., PP=4 with 8 GPUs/node), projection will automatically reduce PP for benchmarking. +- **Understand scaling limits**: DP scaling is limited by `global_batch_size / micro_batch_size`. 
If you run out of microbatches, adding more nodes won't help. +- **Check minimum nodes**: If your config requires multiple nodes (e.g., PP=4 with 8 GPUs/node), the performance projection will automatically reduce PP for benchmarking. - **Pipeline scaling**: With PP > 1, you can only scale in multiples of the pipeline replica size. +- **Recomputation trade-off**: Full recompute dramatically reduces activation memory (e.g., 18 GB β†’ 0.25 GB per MoE layer) at the cost of ~33% more compute. +- **MoE activation dominance**: For MoE models, the MoE MLP activation (scaled by `topk`) typically dominates the per-layer activation budget. Consider recomputation if memory is tight. ## Related Documentation diff --git a/docs/tech_blogs/primus_pipeline/imgs/actual-perf.png b/docs/tech_blogs/primus_pipeline/imgs/actual-perf.png new file mode 100644 index 000000000..7c2de0528 Binary files /dev/null and b/docs/tech_blogs/primus_pipeline/imgs/actual-perf.png differ diff --git a/docs/tech_blogs/primus_pipeline/imgs/llama2-7B-perf.png b/docs/tech_blogs/primus_pipeline/imgs/llama2-7B-perf.png new file mode 100644 index 000000000..e0145b46c Binary files /dev/null and b/docs/tech_blogs/primus_pipeline/imgs/llama2-7B-perf.png differ diff --git a/docs/tech_blogs/primus_pipeline/imgs/qwen-235B-perf.png b/docs/tech_blogs/primus_pipeline/imgs/qwen-235B-perf.png new file mode 100644 index 000000000..b6156c6e8 Binary files /dev/null and b/docs/tech_blogs/primus_pipeline/imgs/qwen-235B-perf.png differ diff --git a/docs/tech_blogs/primus_pipeline/imgs/simulation.png b/docs/tech_blogs/primus_pipeline/imgs/simulation.png new file mode 100644 index 000000000..a62d38c95 Binary files /dev/null and b/docs/tech_blogs/primus_pipeline/imgs/simulation.png differ diff --git a/docs/tech_blogs/primus_pipeline/imgs/simulator_shell.png b/docs/tech_blogs/primus_pipeline/imgs/simulator_shell.png new file mode 100644 index 000000000..5942f08bb Binary files /dev/null and 
b/docs/tech_blogs/primus_pipeline/imgs/simulator_shell.png differ diff --git a/docs/tech_blogs/primus_pipeline/primus_pipeline.md b/docs/tech_blogs/primus_pipeline/primus_pipeline.md new file mode 100644 index 000000000..bc6d76cab --- /dev/null +++ b/docs/tech_blogs/primus_pipeline/primus_pipeline.md @@ -0,0 +1,214 @@ + + +# Primus-pipeline: A more flexible and scalable pipeline parallelism implementation + +This blog covers a flexible pipeline parallelism implementation in the Primus Megatron-LM backend, which supports a full stack of zero-bubble algorithms including zerobubble/zbv/v-half/v-min, as well as traditional 1f1b and 1f1b-interleaved schedules. We also provide a full stack of simulation and performance tools to analyze pipeline scheduling algorithms in both theory and practice. + +## Background + +Pipeline parallelism(PP) is an efficient strategy for large language model pretraining. For models that must span nodes, pipeline parallelism enables model sharding with primarily neighbor point‑to‑point activation transfers (instead of heavy global collectives), often improving the scalability and predictability of scale‑out communication. + +1f1b and 1f1b-interleaved are two PP scheduling algorithms widely used in LLM training frameworks such as Megatron-LM and DeepSpeed. In recent years, several state-of-the-art pipeline parallelism methods have been proposed, such as the [zero-bubble algorithm](https://github.com/sail-sg/zero-bubble-pipeline-parallelism) by Sea AI Lab and [DualPipe/DualPipe-V](https://github.com/deepseek-ai/DualPipe) by the DeepSeek team, but they are not integrated into open-source LLM training frameworks mainly because the scheduling logic is fixed and hard to modify. + +For example, in the Megatron-LM 1f1b-interleaved implementation, the PP schedule is depicted by three main loops: warm-up, 1f1b steady, and cooldown. 
The warm-up phase executes only forward passes, the steady phase executes one forward and one backward in order, and finally the cooldown phase executes the last minibatches' backward passes. Communication nodes are inserted inside the loop. In this case, if you want to add a new schedule such as zero-bubble or dual-pipe, you need to copy the three loops and modify their details, which can become long and complicated.
+
+## Primus-pipeline
+
+The key idea of Primus-pipeline is to separate pipeline scheduling logic from training execution, making it convenient to verify and implement new pipeline schedule algorithms. Our main contributions can be summarized in the following three points.
+
+- Provide a flexible abstraction for PP scheduling algorithms and fully support many state-of-the-art algorithms, including 1f1b-interleaved and zero-bubble based schedules.
+
+- Implement InputGrad/WeightGrad separation ops for GeMM and GroupGemm by redefining [Primus-Turbo](https://github.com/AMD-AGI/Primus-Turbo) ops.
+
+- Provide simulation tools for each PP algorithm in both theory and practice, which clearly simulate and measure bubble rate and memory consumption under specific configs.
+
+### Schedule Design
+
+Primus-pipeline patches and substitutes Megatron-LM's `megatron.core.pipeline_parallel.get_forward_backward_func` function. The entrypoint of the schedule logic is at [PrimusPipelineParallelLauncher](https://github.com/AMD-AGI/Primus/blob/dev/yc/primus-pipe-blog/primus/backends/megatron/core/pipeline_parallel/primuspipe/pipeline_launcher.py)
+
+Here are the steps to define and run a PP algorithm in Primus-pipeline.
+
+1. **Create a ScheduleTable with ScheduleNodes**: For most PP algorithms, a schedule table containing schedule nodes can be defined given PP world size, virtual pipeline chunks per rank, and minibatches.
+   - **ScheduleNode**: Each step of the execution can be abstracted as a ScheduleNode including computation nodes such as FORWARD/BACKWARD/WGRAD and communication nodes such as RECV_FORWARD/SEND_FORWARD.
+   - **PP Algorithms**: [pp-algorithms](https://github.com/AMD-AGI/Primus/tree/main/primus/core/pipeline_parallel/scheduler/algorithms)
+
+2. **Bind the ScheduleNodes with their execution functions and args**: Each node in the schedule table is bound to its handler functions and runtime args. For example, Primus binds Megatron-LM backend functions in this code. [Megatron-LM backend binding](https://github.com/AMD-AGI/Primus/blob/main/primus/backends/megatron/core/pipeline_parallel/primuspipe/pipeline_launcher.py#L239-L272)
+
+3. **Launch the ScheduleRunner**
+After binding execution functions on the schedule table, it is launched by [ScheduleRunner](https://github.com/AMD-AGI/Primus/blob/main/primus/core/pipeline_parallel/scheduler/scheduler.py)
+
+
+### Simulator and perf
+
+To validate and compare performance of different PP algorithms, we offer tools that can perform projections without hardware and also observe and profile PP schedules in real runs.
+
+#### Simulator
+
+Given a `ScheduleTable`, the simulator estimates bubble rate, memory footprint, and timing based on a provided configuration without requiring physical devices. This enables rapid comparison of different schedule designs and iterative development before running full-scale experiments.
+
+Below is an example demonstrating how to validate different PP algorithms using the configuration defined in [pp_simulation.yaml](https://github.com/AMD-AGI/Primus/blob/main/primus/core/projection/configs/pp_simulation.yaml). We have also done a lot of work on fine-grained projections with transformer layer performance metrics and various parallel strategies; please refer to the [projection readme](https://github.com/AMD-AGI/Primus/blob/main/docs/projection.md).
+
+Execute the command below to run the simulation program. The tool will output performance metrics for different algorithms directly in the console. The data generated by the simulator can be visualized by visualization tools introduced in the next section.
+
+```bash
+python3 primus/core/projection/performance_projection/simulator.py --config=primus/core/projection/configs/pp_simulation.yaml
+
+```
+![simulator shell](imgs/simulator_shell.png)
+
+
+#### Perf & Visualization tools
+
+Turn on the flag `dump_pp_data: True`; the Primus framework will dump forward and backward times for each minibatch execution. Use the [PP visualization tools](https://github.com/AMD-AGI/Primus/blob/main/tools/visualization/pp_vis/README.md) to visualize the PP schedule.
+
+The time data generated by the simulator can also be visualized by this tool. Here are some examples generated by the visualization tools.
+
+- **Simulation perf**
+
+```bash
+python3 tools/visualization/pp_vis/vis.py --config=primus/core/projection/configs/pp_simulation.yaml
+```
+![simulator perf](imgs/simulation.png)
+
+- **Real training perf**
+![actual perf](imgs/actual-perf.png)
+
+
+### Run with Megatron-LM backend
+
+Most configs for using Primus-pipeline are defined in [primus_pipeline.yaml](https://github.com/AMD-AGI/primus-tiger-training-internal/blob/main/primus/configs/modules/megatron/primus_pipeline.yaml). Two key configs are `patch_primus_pipeline`, which enables the Primus-pipeline implementation to replace the original schedule logic in Megatron, and `pp_algorithm`, which specifies the PP scheduling algorithm to use.
+
+
+Besides that, some configs conflict with Primus-pipeline and need to be configured as listed below.
+ +```yaml +overlap_grad_reduce: false +overlap_param_gather: false +no_persist_layer_norm: true +create_attention_mask_in_dataloader: false +gradient_accumulation_fusion: true + +``` + +### PP schedule algorithm comparison + +Our implementation primarily focuses on the zero-bubble family proposed by Sea AI Lab, including zerobubble/zbv/v-half/v-min. Note that post-validation is not currently supported, as it requires substantial optimizer modifications that are challenging to maintain across Megatron-LM versions. + +The following table compares different PP scheduling algorithms under the assumption that forward, backward, and weight-grad operations have equal execution time: + + +| Algorithm | VPP size | Bubble Rate | Max Activation Memory | Communication Volume | +|------------------|----------|-----------------------------|-----------------------|----------------------| +| 1f1b | 1 | (p - 1) / (m + p -1) | p | 1 | +| 1f1b-interleaved | N | (p - 1) / (m * N + p - 1) | p + (p - 1) / p | N | +| ZeroBubble(ZB1P) | 1 | (p - 1) / (3 * (m + p - 1)) | p | 1 | +| ZBV-formatted | 2 | (p - 1) / (p - 1 + 6 * m) | p | 2 | +| V-half | 2 | - | p / 2 + x | 2 | +| V-min | 2 | - | p / 3 + x | 2 | + +**Note:** V-half and V-min employ greedy algorithms and therefore lack closed-form bubble-rate formulas. Use the simulator to estimate their bubble rates. + +**Notation:** +* `p`: number of pipeline stages +* `m`: number of minibatches +* `x`: constant term + +### Experiments + +We ran experiments to verify the performance of Primus-pipeline. Here we list results for [Llama2-7B](https://huggingface.co/meta-llama/Llama-2-7b) on 1 node with PP8 and the [Qwen3-235B](https://huggingface.co/Qwen/Qwen3-235B-A22B) MoE model with PP4 EP8. 
+ +#### llama2 7B verification + +- Cluster with MI300, 1 node, PP8, Llama2-7B model + +| PP | VPP | PP-algorithm | tokens/s/device | TFLOPS | max_memory | max_mem_percent | hbm overhead | speed up ratio | +|----|-----|-------------|----------------|--------|------------|-----------------|--------------|---------------| +| 8 | 1 | 1f1b | 10057 | 235.7 | 16.26 | 8.47% | 1 | 1 | +| 8 | 2 | 1f1b-interleaved(vpp2) | 10974 | 257.3 | 21.40 | 11.15% | 1.31 | 1.09 | +| 8 | 2 | zbb | 11411 | 268 | 16.59 | 8.64% | 1.02 | 1.13 | +| 8 | 2 | zbv | 11347 | 265.9 | 18.11 | 9.43% | 1.11 | 1.12 | +| 8 | 2 | v-half | 10894 | 255.1 | 14.06 | 7.32% | 0.86 | 1.08 | +| 8 | 2 | v-min | 8897.2 | 208.5 | 11.90 | 6.20% | 0.73 | 0.88 | + + +![llama2-7B-perf](imgs/llama2-7B-perf.png) + +#### Qwen 235B verification + +- Cluster with MI355, 4 nodes, PP4, EP8, Qwen-235B model. + +| PP | VPP | PP-algorithm | tokens/s/device | TFLOPS | max_memory | max_mem_percent | hbm overhead | speed up ratio | +|----|-----|--------------|----------------|--------|------------|-----------------|--------------|---------------| +| 4 | 1 | 1f1b | 2742.4 | 406 | 261.58 | 90.83% | 1 | 1 | +| 4 | 2 | v-half | 2912.9 | 431.2 | 257.84 | 89.53% | 0.98 | 1.06 | +| 4 | 2 | v-min | 2200.6 | 325.8 | 228.28 | 79.27% | 0.87 | 0.80 | +| 4 | 2 | zbv-formatted | 2952.7 | 437.1 | 284.36 | 98.74% | 1.09 | 1.08 | +| 4 | 1 | zero-bubble | 2963.1 | 438.6 | 272.13 | 94.50% | 1.04 | 1.08 | +| 4 | 3 | 1f1b-interleaved-vpp2 | OOM | | | | | | +| 4 | 3 | 1f1b-interleaved-vpp3 | 3012.1 | 445.9 | 287.13 | 99.70% | 1.10 | 1.10 | +| 4 | 4 | 1f1b-interleaved-vpp4 | 3024.5 | 447.7 | 278.82 | 96.82% | 1.06 | 1.10 | + +![qwen-235B-perf](imgs/qwen-235B-perf.png) + + +### Conclusions and Best Practice Guide + +Based on the results above, we draw the following conclusions. + +- Llama-based models show higher throughput and memory gains than MoE models, because the overall network has a larger GEMM footprint and higher activation memory. 
+- For large MoE cases in practice, 1f1b-interleaved reaches a higher throughput roofline than zero-bubble schedules, but it is harder to reduce memory usage. In memory-limited scenarios, v-half is a reasonable option. + +Based on these conclusions, we recommend using zero-bubble based algorithms (zero-bubble / zbv / v-half / v-min) instead of 1f1b-interleaved in the following cases: + +- A large share of GEMM/GroupedGEMM in the model: dense-layer-heavy models benefit more from splitting weight-grad and input-grad computation. Imbalanced time between the two phases tends to introduce extra bubbles. + +- Memory is the bottleneck: 1f1b-interleaved offers limited ways to reduce memory consumption, while v-half and v-min provide practical options when memory is tight. + +- Communication is inefficient: 1f1b-interleaved trades extra communication for lower bubble rates, and larger VPP increases communication volume. In most cases, p2p communication can be hidden by overlapping computation, but without AINIC or RDMA support, zbv/v-half/v-min have clearer advantages. + +- Extreme partitioning limits: when the model cannot be split beyond VPP rank 2, zbv/v-half/v-min usually outperform 1f1b-interleaved. + + +## Future works + +Primus-pipeline provides a flexible interface for future investigation and research on PP algorithms. Here are some potential topics we are working on, and we welcome contributions and ideas. + +1. CPU offloading: Based on the schedule node design, it is easy to control offloading/reloading timing for different minibatches and model layers. We are adding offload logic to zbv/v-half/v-min algorithms. + +2. More algorithms: Implement more state-of-the-art PP schedules like Dual-Pipe-V and investigate more efficient PP algorithms. Contributions are welcome. + +3. Fine-grained overlap: In technical reports like DeepSeek-V3, PP schedules combine forward and backward passes of different minibatches and overlap computation and communication. 
We plan to explore similar fine-grained overlap strategies. + + +## Acknowledgments + +We would like to express our sincere gratitude to the [Sea AI Lab](https://sail.sea.com/) team and individuals for their invaluable contributions and collaboration; their expertise and support have been instrumental in advancing the progress of this project. + +## Disclaimers + +Third-party content is licensed to you directly by the third party that owns the +content and is not licensed to you by AMD. ALL LINKED THIRD-PARTY CONTENT IS +PROVIDED "AS IS" WITHOUT A WARRANTY OF ANY KIND. USE OF SUCH THIRD-PARTY CONTENT +IS DONE AT YOUR SOLE DISCRETION AND UNDER NO CIRCUMSTANCES WILL AMD BE LIABLE TO +YOU FOR ANY THIRD-PARTY CONTENT. YOU ASSUME ALL RISK AND ARE SOLELY RESPONSIBLE +FOR ANY DAMAGES THAT MAY ARISE FROM YOUR USE OF THIRD-PARTY CONTENT. \ No newline at end of file diff --git a/examples/megatron_bridge/configs/MI300X/qwen3_32b_lora_posttrain.yaml b/examples/megatron_bridge/configs/MI300X/qwen3_32b_lora_posttrain.yaml new file mode 100644 index 000000000..456ee968a --- /dev/null +++ b/examples/megatron_bridge/configs/MI300X/qwen3_32b_lora_posttrain.yaml @@ -0,0 +1,58 @@ +work_group: ${PRIMUS_TEAM:amd} +user_name: ${PRIMUS_USER:root} +exp_name: ${PRIMUS_EXP_NAME:qwen3_32b_lora_posttrain} +workspace: ${PRIMUS_WORKSPACE:./output} + +modules: + post_trainer: + framework: megatron_bridge + config: sft_trainer.yaml + + # Model to run + model: qwen3_32b.yaml + + overrides: + stderr_sink_level: DEBUG + + # Parallelism configuration + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + pipeline_dtype: null + virtual_pipeline_model_parallel_size: null + context_parallel_size: 1 + sequence_parallel: false + use_megatron_fsdp: false + + # Finetuning-specific params + #pretrained_checkpoint: null + peft: lora + packed_sequence: false + + # Training configuration + train_iters: 200 + global_batch_size: 32 + micro_batch_size: 2 + seq_length: 8192 + eval_interval: 30 + 
save_interval: 50 + + # Optimizer configuration + finetune_lr: 1.0e-4 + min_lr: 0.0 + lr_warmup_iters: 50 + lr_decay_iters: null + + # W&B logging + wandb_project: null + wandb_entity: null + wandb_exp_name: null + + # Precision + precision_config: bf16_mixed + comm_overlap_config: null + + # Recompute configuration (enabled for 32B model) + recompute_granularity: full + recompute_method: uniform + recompute_num_layers: 1 + diff --git a/examples/megatron_bridge/configs/MI300X/qwen3_32b_sft_posttrain.yaml b/examples/megatron_bridge/configs/MI300X/qwen3_32b_sft_posttrain.yaml new file mode 100644 index 000000000..c91a6b0d5 --- /dev/null +++ b/examples/megatron_bridge/configs/MI300X/qwen3_32b_sft_posttrain.yaml @@ -0,0 +1,58 @@ +work_group: ${PRIMUS_TEAM:amd} +user_name: ${PRIMUS_USER:root} +exp_name: ${PRIMUS_EXP_NAME:qwen3_32b_sft_posttrain} +workspace: ${PRIMUS_WORKSPACE:./output} + +modules: + post_trainer: + framework: megatron_bridge + config: sft_trainer.yaml + + # Model to run + model: qwen3_32b.yaml + + overrides: + stderr_sink_level: DEBUG + + # Parallelism configuration + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + pipeline_dtype: null + virtual_pipeline_model_parallel_size: null + context_parallel_size: 1 + sequence_parallel: false + use_megatron_fsdp: false + + # Finetuning-specific params + #pretrained_checkpoint: null + peft: "none" + packed_sequence: false + + # Training configuration + train_iters: 200 + global_batch_size: 8 + micro_batch_size: 2 + seq_length: 8192 + eval_interval: 30 + save_interval: 50 + + # Optimizer configuration + finetune_lr: 5.0e-6 + min_lr: 0.0 + lr_warmup_iters: 50 + lr_decay_iters: null + + # W&B logging + wandb_project: null + wandb_entity: null + wandb_exp_name: null + + # Precision + precision_config: bf16_mixed + comm_overlap_config: null + + # Recompute configuration (enabled for 32B model) + recompute_granularity: full + recompute_method: uniform + recompute_num_layers: 1 + diff --git 
a/examples/megatron_bridge/configs/MI355X/qwen3_32b_lora_posttrain.yaml b/examples/megatron_bridge/configs/MI355X/qwen3_32b_lora_posttrain.yaml index 49ce4f8a0..3dfb3eb39 100755 --- a/examples/megatron_bridge/configs/MI355X/qwen3_32b_lora_posttrain.yaml +++ b/examples/megatron_bridge/configs/MI355X/qwen3_32b_lora_posttrain.yaml @@ -30,9 +30,9 @@ modules: # Training configuration train_iters: 200 - global_batch_size: 128 - micro_batch_size: 1 - seq_length: 2048 + global_batch_size: 32 + micro_batch_size: 4 + seq_length: 8192 eval_interval: 30 save_interval: 50 diff --git a/examples/megatron_bridge/configs/MI355X/qwen3_32b_sft_posttrain.yaml b/examples/megatron_bridge/configs/MI355X/qwen3_32b_sft_posttrain.yaml index 1b9f10aff..d01623990 100755 --- a/examples/megatron_bridge/configs/MI355X/qwen3_32b_sft_posttrain.yaml +++ b/examples/megatron_bridge/configs/MI355X/qwen3_32b_sft_posttrain.yaml @@ -15,7 +15,7 @@ modules: stderr_sink_level: DEBUG # Parallelism configuration - tensor_model_parallel_size: 4 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 pipeline_dtype: null virtual_pipeline_model_parallel_size: null @@ -30,9 +30,9 @@ modules: # Training configuration train_iters: 200 - global_batch_size: 128 + global_batch_size: 8 micro_batch_size: 1 - seq_length: 2048 + seq_length: 8192 eval_interval: 30 save_interval: 50 diff --git a/examples/run_pretrain.sh b/examples/run_pretrain.sh index dd23ec02d..016bc23db 100755 --- a/examples/run_pretrain.sh +++ b/examples/run_pretrain.sh @@ -178,7 +178,15 @@ if [ "$USING_AINIC" == "1" ]; then export ANP_HOME_DIR=${ANP_HOME_DIR:-"/opt/amd-anp"} export RCCL_HOME_DIR=${RCCL_HOME_DIR:-"/opt/rccl"} export MPI_HOME_DIR=${MPI_HOME_DIR:-"/opt/ompi"} - export NCCL_NET_PLUGIN=librccl-anp.so + # Check which NCCL net plugin library is present under ${ANP_HOME_DIR}/build and set accordingly + if [ -f "${ANP_HOME_DIR}/build/librccl-anp.so" ]; then + export NCCL_NET_PLUGIN=librccl-anp.so + elif [ -f 
"${ANP_HOME_DIR}/build/librccl-net.so" ]; then + export NCCL_NET_PLUGIN=librccl-net.so + else + LOG_ERROR "Error: Neither librccl-anp.so nor librccl-net.so found in ${ANP_HOME_DIR}/build." + exit 1 + fi LOG_INFO_RANK0 "Using AINIC" LOG_INFO_RANK0 "RCCL_HOME_DIR: $RCCL_HOME_DIR" @@ -490,11 +498,17 @@ handle_hipblaslt_tuning() { fi } -# Disable HipBLASLT tuning in deterministic mode -if [ "${PRIMUS_DETERMINISTIC:-}" != "1" ]; then +# NOTE: Disable HipBLASLT tuning in deterministic mode +# NOTE: If you need to enable torch profiler, do NOT enable HipBLASLT tuning. +if [ "${PRIMUS_DETERMINISTIC:-}" != "1" ] && [ "${PRIMUS_HIPBLASLT_TUNING:-0}" = "1" ]; then handle_hipblaslt_tuning +else + LOG_INFO "disable hipblaslt tuning by default to fix torch profiler issue in TE" + export TE_HIPBLASLT_TUNING_RUN_COUNT=0 + export TE_HIPBLASLT_TUNING_ALGO_COUNT=0 fi + # -------------------- Python Path Setup -------------------- setup_pythonpath() { local site_packages diff --git a/primus/backends/megatron/core/extensions/primus_turbo.py b/primus/backends/megatron/core/extensions/primus_turbo.py index f56729701..aebc0ce30 100644 --- a/primus/backends/megatron/core/extensions/primus_turbo.py +++ b/primus/backends/megatron/core/extensions/primus_turbo.py @@ -20,15 +20,20 @@ get_context_parallel_group, get_hierarchical_context_parallel_groups, get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, ) from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.tensor_parallel.layers import ColumnParallelLinear +from megatron.core.tensor_parallel.layers import ( + ColumnParallelLinear, + _initialize_affine_weight_cpu, +) from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.moe.experts import GroupedMLP from megatron.core.transformer.moe.token_dispatcher import MoETokenDispatcher from megatron.core.transformer.transformer_config import TransformerConfig from 
megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint -from megatron.core.utils import get_tensor_model_parallel_group_if_none +from megatron.core.utils import divide, get_tensor_model_parallel_group_if_none from megatron.training.global_vars import get_args from primus_turbo.pytorch.core.low_precision import ( Float4QuantConfig, @@ -623,6 +628,37 @@ def __init__( tp_group=tp_group, ) + # Manual weight initialization when use_cpu_initialization=True + if config.use_cpu_initialization: + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + input_size_per_partition = divide(input_size, world_size) + + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + input_size_per_partition, + 1, # partition_dim (row parallel partitions along input dimension) + init_method=condition_init_method(config, init_method), + stride=1, + return_master_weight=False, + params_dtype=config.params_dtype, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + + # Bias initialization + if bias: + with torch.no_grad(): + bias_tensor = torch.cat([getattr(self, name) for name in self.bias_names]) + bias_tensor.zero_() + # Set allreduce attribute for distributed training + for bias_name in self.bias_names: + bias_param = getattr(self, bias_name) + setattr(bias_param, "allreduce", True) + def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None): """Sharding along axis 1, bias not sharded""" state_dict = self.state_dict(prefix="", keep_vars=True) @@ -708,6 +744,7 @@ def __init__( raise ValueError(f"{__class__.__name__} layers do not support gather_output = True") tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert) + args = get_args() self.offload = args.offload and "column_parallel_gemm" in args.offload_ops assert not self.offload, "gemm offload still have some problems" @@ -739,6 +776,37 @@ def __init__( 
tp_group=tp_group, ) + # Manual weight initialization when use_cpu_initialization=True + if config.use_cpu_initialization: + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + output_size_per_partition = divide(output_size, world_size) + + _ = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + output_size_per_partition, + 0, # partition_dim (column parallel partitions along output dimension) + init_method=condition_init_method(config, init_method), + stride=1, + return_master_weight=False, + params_dtype=config.params_dtype, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + + # Bias initialization + if bias: + with torch.no_grad(): + bias_tensor = torch.cat([getattr(self, name) for name in self.bias_names]) + bias_tensor.zero_() + # Set allreduce attribute for distributed training + for bias_name in self.bias_names: + bias_param = getattr(self, bias_name) + setattr(bias_param, "allreduce", True) + def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None): """Sharding along axis 0, bias sharded""" state_dict = self.state_dict(prefix="", keep_vars=True) @@ -984,6 +1052,37 @@ def __init__( zero_centered_gamma=self.config.layernorm_zero_centered_gamma, ) + # Manual weight initialization when use_cpu_initialization=True + if config.use_cpu_initialization: + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + output_size_per_partition = divide(output_size, world_size) + + _ = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + output_size_per_partition, + 0, # partition_dim (column parallel partitions along output dimension) + init_method=condition_init_method(config, init_method), + stride=1, + return_master_weight=False, + params_dtype=config.params_dtype, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + + # Bias initialization + if bias: 
+ with torch.no_grad(): + bias_tensor = torch.cat([getattr(self, name) for name in self.bias_names]) + bias_tensor.zero_() + # Set allreduce attribute for distributed training + for bias_name in self.bias_names: + bias_param = getattr(self, bias_name) + setattr(bias_param, "allreduce", True) + def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None): """Sharding along axis 0, bias sharded""" state_dict = self.state_dict(prefix="", keep_vars=True) diff --git a/primus/backends/megatron/patches/env_patches.py b/primus/backends/megatron/patches/env_patches.py index 86663ec7f..a59b420ee 100644 --- a/primus/backends/megatron/patches/env_patches.py +++ b/primus/backends/megatron/patches/env_patches.py @@ -42,8 +42,10 @@ def set_cuda_device_max_connections(ctx: PatchContext): module_config = ctx.extra.get("module_config", {}) # Determine CUDA_DEVICE_MAX_CONNECTIONS based on FSDP usage - use_fsdp = getattr(module_config.params, "use_torch_fsdp2", False) or getattr( - module_config.params, "use_custom_fsdp", False + use_fsdp = ( + getattr(module_config.params, "use_torch_fsdp2", False) + or getattr(module_config.params, "use_custom_fsdp", False) + or getattr(module_config.params, "use_megatron_fsdp", False) ) if use_fsdp: diff --git a/primus/backends/megatron/patches/megatron_fsdp_patches.py b/primus/backends/megatron/patches/megatron_fsdp_patches.py new file mode 100644 index 000000000..5a3b48ea8 --- /dev/null +++ b/primus/backends/megatron/patches/megatron_fsdp_patches.py @@ -0,0 +1,94 @@ +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. 
+############################################################################### + +""" +Megatron-FSDP Patches + +This module contains patches for Megatron's native FSDP implementation: +- DeviceMesh API compatibility for PyTorch 2.10+ + +These patches are separate from PyTorch FSDP2 (torch.distributed.fsdp), +which is Megatron's own FSDP implementation. +""" + +from primus.core.patches import PatchContext, get_args, register_patch +from primus.modules.module_utils import log_rank_0, warning_rank_0 + + +@register_patch( + "megatron.fsdp.device_mesh", + backend="megatron", + phase="before_train", + description=( + "Fix Megatron-FSDP DeviceMesh API compatibility for PyTorch 2.10+ " + "by updating get_mesh_names to use new API." + ), + condition=lambda ctx: getattr(get_args(ctx), "use_megatron_fsdp", False), +) +def patch_megatron_fsdp_device_mesh(ctx: PatchContext): + """ + Patch Megatron FSDP's get_mesh_names to work with PyTorch 2.10+ DeviceMesh API. + + Issue: + Megatron-LM's megatron_fsdp/utils.py uses the outdated + `_mesh_resources.child_to_root_mapping` attribute which was removed + in PyTorch 2.10. The new API uses `_root_mesh` and `_flatten_mapping`. + + Solution: + Replace get_mesh_names with a patched version that uses the new PyTorch 2.10+ + DeviceMesh API methods. + """ + log_rank_0("[Patch:megatron.fsdp.device_mesh] Patching Megatron FSDP DeviceMesh " "API compatibility...") + + try: + from megatron.core.distributed.fsdp.src.megatron_fsdp import utils + + # Check if the module has the function we need to patch + if not hasattr(utils, "get_mesh_names"): + warning_rank_0("[Patch:megatron.fsdp.device_mesh] get_mesh_names not found, skipping patch") + return + + # Define the patched function + def patched_get_mesh_names(device_mesh): + """ + Get dimension names from a DeviceMesh, including submesh names. + + This is the patched version that works with PyTorch 2.10+. 
+ """ + # Get the root mesh using the new API + root_mesh = device_mesh._get_root_mesh() + + # Start with the device_mesh's own dimension names + result = list(device_mesh.mesh_dim_names or []) + + # Collect submesh dimension names from flattened mapping + # In PyTorch 2.10+, submeshes are tracked via _flatten_mapping + if hasattr(root_mesh, "_flatten_mapping") and root_mesh._flatten_mapping: + for mesh_dim, submesh in root_mesh._flatten_mapping.items(): + # Check if this submesh matches our device_mesh + if submesh == device_mesh or ( + hasattr(submesh, "_dim_group_names") and submesh._dim_group_names + ): + # Add submesh dimension names + if hasattr(submesh, "mesh_dim_names") and submesh.mesh_dim_names: + result.extend(submesh.mesh_dim_names) + + return result + + # Apply the monkey patch + utils.get_mesh_names = patched_get_mesh_names + + log_rank_0( + "[Patch:megatron.fsdp.device_mesh] Megatron FSDP DeviceMesh patch " + "applied successfully (PyTorch 2.10+ compatibility)" + ) + + except ImportError as e: + warning_rank_0( + f"[Patch:megatron.fsdp.device_mesh] Megatron FSDP not available, " f"skipping patch: {e}" + ) + except Exception as e: + warning_rank_0(f"[Patch:megatron.fsdp.device_mesh] Failed to patch Megatron FSDP " f"DeviceMesh: {e}") diff --git a/primus/backends/megatron/patches/training_log/print_rank_last_patches.py b/primus/backends/megatron/patches/training_log/print_rank_last_patches.py index 59167db9f..4c24e5024 100644 --- a/primus/backends/megatron/patches/training_log/print_rank_last_patches.py +++ b/primus/backends/megatron/patches/training_log/print_rank_last_patches.py @@ -11,6 +11,7 @@ to inject additional information into Megatron training logs: - ROCm/HIP memory stats. + - Running average elapsed time per iteration (ms). - Running average throughput per GPU (TFLOP/s/GPU). - Running average token throughput per GPU (tokens/s/GPU). 
@@ -40,6 +41,8 @@ class TrainingLogInfo: train_iters: Optional[int] = None consumed_samples: Optional[int] = None elapsed_ms: Optional[float] = None + # Index of the elapsed segment within ``segments``, if present. + elapsed_index: Optional[int] = None throughput_tflops: Optional[float] = None # Index of the throughput segment within ``segments``, if present. throughput_index: Optional[int] = None @@ -94,6 +97,7 @@ def parse_training_log_line(log_string: str) -> TrainingLogInfo: elapsed_match = re.search(r"elapsed time per iteration \(ms\):\s*([0-9.+-eE]+)", seg) if elapsed_match: info.elapsed_ms = float(elapsed_match.group(1)) + info.elapsed_index = idx continue # throughput per GPU (TFLOP/s/GPU): {throughput} @@ -223,6 +227,66 @@ def inject( return log_string +class ElapsedAverageExtension: + """ + Helper extension to compute and inject running average of elapsed time per + iteration (ms) into Megatron training logs. + + Semantics mirror Primus MegatronTrainer (same as ThroughputAverageExtension): + - Ignore the first `log_avg_skip_iterations` iterations for averaging. + - Maintain a sliding window up to `log_avg_reset_interval` entries. + """ + + def __init__(self, args: Any): + self._args = args + self._recent_elapsed_ms: list[float] = [] + self._log_avg_skip_iterations: int = int(getattr(args, "log_avg_skip_iterations", 0)) + self._log_avg_reset_interval: int = int(getattr(args, "log_avg_reset_interval", 1000)) + + log_rank_0( + f"[Patch:megatron.training_log] ElapsedAverageExtension initialized with " + f"log_avg_skip_iterations: {self._log_avg_skip_iterations} " + f"log_avg_reset_interval: {self._log_avg_reset_interval}" + ) + + def inject(self, log_string: str, parsed: Optional[TrainingLogInfo] = None) -> str: + """ + Update ``parsed`` with running-average elapsed time per iteration. 
+ + Elapsed time is rendered inline as: + elapsed time per iteration (ms): inst/avg + """ + try: + if parsed is None or parsed.elapsed_ms is None: + return log_string + + iteration = parsed.iteration + elapsed_value = float(parsed.elapsed_ms) + + if iteration is not None and ( + iteration == self._log_avg_skip_iterations + 1 + or len(self._recent_elapsed_ms) >= self._log_avg_reset_interval + ): + self._recent_elapsed_ms.clear() + + if iteration is None or iteration > self._log_avg_skip_iterations: + self._recent_elapsed_ms.append(elapsed_value) + + if not self._recent_elapsed_ms: + return log_string + + avg_elapsed_ms = sum(self._recent_elapsed_ms) / len(self._recent_elapsed_ms) + idx = parsed.elapsed_index + if idx is not None and 0 <= idx < len(parsed.segments): + parsed.segments[idx] = ( + f"elapsed time per iteration (ms): " f"{elapsed_value:.1f}/{avg_elapsed_ms:.1f}" + ) + + return log_string + except Exception: + return log_string + + class ThroughputAverageExtension: """ Helper extension to compute and inject running average throughput statistics @@ -401,6 +465,7 @@ def patch_training_log_unified(ctx: PatchContext): # Create helper extensions once so they keep state (ROCm cache, avg windows) # across all training_log invocations. mem_ext = MemoryStatsExtension(config) + elapsed_ext = ElapsedAverageExtension(config) throughput_ext = ThroughputAverageExtension(config) call_count = 0 # Capture the original ``print_rank_last`` so we can delegate actual @@ -423,10 +488,11 @@ def primus_print_rank_last(log_string: str) -> None: # Parse the original log string once and share across extensions. parsed = parse_training_log_line(log_string) - # Inject memory statistics, throughput, and token throughput by - # mutating the parsed structure. These calls ignore their string - # return value when `parsed` is provided. + # Inject memory statistics, elapsed avg, throughput, and token + # throughput by mutating the parsed structure. 
These calls ignore + # their string return value when `parsed` is provided. mem_ext.inject(log_string, call_count, parsed) + elapsed_ext.inject(log_string, parsed) throughput_ext.inject(log_string, parsed) # Render the final line from the parsed structure. diff --git a/primus/backends/megatron/training/evaluator.py b/primus/backends/megatron/training/evaluator.py index f7df28701..24d59cc7b 100644 --- a/primus/backends/megatron/training/evaluator.py +++ b/primus/backends/megatron/training/evaluator.py @@ -49,7 +49,9 @@ def primus_evaluate( rerun_mode = rerun_state_machine.get_mode() rerun_state_machine.set_mode(RerunMode.DISABLED) - total_loss_dict = {} + # Accumulate numerator and denominator separately across all eval iterations + total_loss_numerators = {} + total_loss_denominators = {} # make validation batch size independent from training batch size eval_batch_size = args.global_batch_size @@ -93,7 +95,7 @@ def primus_evaluate( torch.cuda.empty_cache() if is_pipeline_stage_containing_loss(): - # Average loss across microbatches. + # Accumulate loss across microbatches for this iteration. for key in loss_dicts[0].keys(): numerator = 0 denominator = 0 @@ -109,7 +111,12 @@ def primus_evaluate( # and so the denominator is 1. 
numerator += val denominator += 1 - total_loss_dict[key] = numerator / denominator + # Accumulate across all eval iterations + if key not in total_loss_numerators: + total_loss_numerators[key] = 0 + total_loss_denominators[key] = 0 + total_loss_numerators[key] += numerator + total_loss_denominators[key] += denominator args.consumed_valid_samples += eval_batch_size @@ -125,6 +132,15 @@ def primus_evaluate( log_rank_0("Exiting during evaluation, timelimit reached") return None, None, True + # Compute final average loss across all eval iterations + total_loss_dict = {} + if is_pipeline_stage_containing_loss(): + for key in total_loss_numerators.keys(): + if total_loss_denominators[key] > 0: + total_loss_dict[key] = total_loss_numerators[key] / total_loss_denominators[key] + else: + total_loss_dict[key] = 0.0 + collected_non_loss_data = None if non_loss_data_func is not None: collected_non_loss_data = non_loss_data_func(model) diff --git a/primus/configs/modules/megatron_bridge/sft_trainer.yaml b/primus/configs/modules/megatron_bridge/sft_trainer.yaml index 230bb22a5..994a1389a 100644 --- a/primus/configs/modules/megatron_bridge/sft_trainer.yaml +++ b/primus/configs/modules/megatron_bridge/sft_trainer.yaml @@ -12,7 +12,7 @@ stage: "sft" # main control flag -enable_primus_turbo: false +enable_primus_turbo: true # feature control flags use_turbo_attention: false diff --git a/primus/core/utils/arg_utils.py b/primus/core/utils/arg_utils.py index 818c5e9ab..90e41c207 100644 --- a/primus/core/utils/arg_utils.py +++ b/primus/core/utils/arg_utils.py @@ -15,6 +15,7 @@ def parse_cli_overrides(overrides: list) -> dict: Supported formats: - "key=value" + - "--key=value" - "nested.key=value" - "--key value" (common CLI style, converted internally to "key=value") @@ -55,7 +56,10 @@ def parse_cli_overrides(overrides: list) -> dict: # Already in key=value form (including "--key=value") if "=" in item: - normalized.append(item) + key, value = item.split("=", 1) + key = 
key.lstrip("-").strip() + value = value.strip() + normalized.append(f"{key}={value}") i += 1 continue diff --git a/primus/tools/preflight/gpu/gpu_topology.py b/primus/tools/preflight/gpu/gpu_topology.py index 168121fa1..e86198764 100644 --- a/primus/tools/preflight/gpu/gpu_topology.py +++ b/primus/tools/preflight/gpu/gpu_topology.py @@ -7,6 +7,7 @@ from __future__ import annotations import os +from collections import Counter from typing import Any, Dict, List, Optional from .gpu_probe import probe_gpus @@ -109,7 +110,10 @@ def run_gpu_standard_checks(*, force_topology: bool = False) -> Dict[str, Any]: findings.append(Finding("warn", "NUMA mapping unavailable (amd-smi not found); skipped", {})) else: nodes = [x.get("numa_node") for x in numa.get("gpus", []) if x.get("numa_node") is not None] - imbalance = len(set(nodes)) > 1 if nodes else False + imbalance = False + if nodes: + counts = Counter(nodes).values() + imbalance = len(set(counts)) > 1 findings.append( Finding("info", "GPU↔NUMA mapping", {"mapping": numa.get("gpus", []), "imbalance": imbalance}) ) diff --git a/tests/unit_tests/backends/megatron/conftest.py b/tests/unit_tests/backends/megatron/conftest.py new file mode 100644 index 000000000..1f5c7d161 --- /dev/null +++ b/tests/unit_tests/backends/megatron/conftest.py @@ -0,0 +1,103 @@ +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +""" +Pytest fixtures for Megatron backend tests. + +This conftest provides fixtures for tests that need Megatron parallel state +initialization, such as tests for Primus Turbo layers and other Megatron-Core +components that require distributed setup. 
+""" + +import os +import random + +import pytest +import torch +import torch.distributed as dist + + +@pytest.fixture(scope="function") +def init_parallel_state(): + """ + Initialize Megatron parallel state for tests that need it. + + This fixture initializes the parallel state with no actual parallelism + (tensor_model_parallel_size=1), which allows tensor parallel layers to + function in single-GPU unit tests. + + The fixture is function-scoped so each test gets a clean parallel state. + It uses dynamic ports to avoid conflicts when running tests in parallel. + + Yields: + None + + Example: + @pytest.fixture(autouse=True) + def setup_parallel(self, init_parallel_state, monkeypatch): + '''Auto-use parallel state for this test class.''' + pass + """ + # Only import after sys.path is set up (which happens in pytest_configure) + from megatron.core import parallel_state as ps + + # Initialize torch.distributed if not already initialized + if not dist.is_initialized(): + # Use dynamic port to avoid conflicts with other running tests + port = random.randint(29500, 39500) + os.environ.setdefault("MASTER_ADDR", "127.0.0.1") + os.environ.setdefault("MASTER_PORT", str(port)) + + dist.init_process_group( + backend="nccl" if torch.cuda.is_available() else "gloo", + init_method=f"tcp://127.0.0.1:{port}", + world_size=1, + rank=0, + ) + + # Check if model parallel already initialized and destroy if so + # This ensures clean state for each test + if ps.model_parallel_is_initialized(): + ps.destroy_model_parallel() + + # Initialize with minimal parallelism (TP=1, PP=1, EP=1, CP=1) + ps.initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + expert_model_parallel_size=1, + context_parallel_size=1, + ) + + # Initialize RNG states for tensor parallel operations + # This is required for: + # 1. ColumnParallelLinear and other tensor parallel layers that use get_cuda_rng_tracker().fork() + # 2. 
CUDA graph support in layers (enable_cuda_graph=True) + if torch.cuda.is_available(): + try: + from megatron.core.tensor_parallel import random as tp_random + + # Initialize RNG tracker with CUDA graph support BEFORE calling model_parallel_cuda_manual_seed + # Try Transformer Engine RNG tracker first (best for CUDA graphs, used by Megatron's own tests) + try: + tp_random.initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True) + except (ImportError, AssertionError): + # Fallback to native PyTorch CUDA graph RNG support if TE not available + tp_random.initialize_rng_tracker(use_cudagraphable_rng=True, force_reset=True) + + tp_random.model_parallel_cuda_manual_seed(42) + except ImportError: + # RNG tracker initialization is optional - skip if not available + pass + + yield + + # Cleanup after test + if ps.model_parallel_is_initialized(): + ps.destroy_model_parallel() + + # Cleanup torch.distributed for single-process tests + # (In multi-process torchrun tests, the process group persists across tests) + if dist.is_initialized(): + dist.destroy_process_group() diff --git a/tests/unit_tests/backends/megatron/test_primus_turbo_cpu_init.py b/tests/unit_tests/backends/megatron/test_primus_turbo_cpu_init.py new file mode 100644 index 000000000..77f59faa3 --- /dev/null +++ b/tests/unit_tests/backends/megatron/test_primus_turbo_cpu_init.py @@ -0,0 +1,225 @@ +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### + +""" +Unit tests for Primus Turbo linear layer CPU initialization. + +Tests verify that when use_cpu_initialization=True: +1. Weights are properly initialized using _initialize_affine_weight_cpu +2. Bias is zeroed correctly +3. 
Bias allreduce attribute is set for LayerNormColumnParallelLinear (our fix) +""" + +import functools + +import pytest +import torch +from megatron.core.model_parallel_config import ModelParallelConfig +from megatron.core.transformer.transformer_config import TransformerConfig + +from primus.backends.megatron.core.extensions.primus_turbo import ( + PrimusTurboColumnParallelLinear, + PrimusTurboLayerNormColumnParallelLinear, + PrimusTurboRowParallelLinear, +) +from tests.utils import PrimusUT + + +def create_dummy_args(): + """Create dummy args namespace for Megatron global_vars.""" + from types import SimpleNamespace + + return SimpleNamespace( + rank=0, + world_size=1, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + offload=False, + offload_ops=[], + patch_primus_pipeline=False, + pp_algorithm=None, + patch_zero_bubble=False, + enable_zero_bubble=False, + rampup_batch_size=None, + global_batch_size=1, + micro_batch_size=1, + data_parallel_size=1, + decrease_batch_size_if_needed=False, + ) + + +def init_method_xavier(): + """Create a simple Xavier uniform init method for testing.""" + return functools.partial(torch.nn.init.xavier_uniform_) + + +class TestPrimusTurboCPUInit(PrimusUT): + """Test CPU initialization for all Primus Turbo linear layer classes.""" + + @pytest.fixture(autouse=True) + def setup_parallel(self, init_parallel_state, monkeypatch): + """Initialize parallel state for model tests.""" + dummy_args = create_dummy_args() + import megatron.training.global_vars as global_vars_module + + monkeypatch.setattr(global_vars_module, "_GLOBAL_ARGS", dummy_args) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA/Transformer Engine") + def test_row_parallel_weights_initialized_with_cpu_init(self): + """Test that RowParallel weights are initialized (not zeros) when CPU init enabled.""" + config = ModelParallelConfig( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + context_parallel_size=1, + 
use_cpu_initialization=True, + params_dtype=torch.float32, + ) + setattr(config, "symmetric_ar_type", "none") + setattr(config, "disable_parameter_transpose_cache", False) + setattr(config, "init_model_with_meta_device", False) + + layer = PrimusTurboRowParallelLinear( + input_size=64, + output_size=128, + config=config, + init_method=init_method_xavier(), + bias=True, + input_is_parallel=True, + skip_bias_add=False, + is_expert=False, + ).cuda() + + weight = layer.weight + assert weight is not None, "Weight should exist" + assert not torch.allclose( + weight, torch.zeros_like(weight) + ), "Weights should be initialized, not all zeros" + assert weight.std() > 0.01, "Weights should have non-trivial variance after initialization" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA/Transformer Engine") + def test_column_parallel_weights_initialized_with_cpu_init(self): + """Test that ColumnParallel weights are initialized (not zeros) when CPU init enabled.""" + config = ModelParallelConfig( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + context_parallel_size=1, + use_cpu_initialization=True, + params_dtype=torch.float32, + ) + setattr(config, "symmetric_ar_type", "none") + setattr(config, "disable_parameter_transpose_cache", False) + setattr(config, "init_model_with_meta_device", False) + + layer = PrimusTurboColumnParallelLinear( + input_size=64, + output_size=128, + config=config, + init_method=init_method_xavier(), + bias=True, + gather_output=False, + skip_bias_add=False, + is_expert=False, + ).cuda() + + weight = layer.weight + assert weight is not None, "Weight should exist" + assert not torch.allclose( + weight, torch.zeros_like(weight) + ), "Weights should be initialized, not all zeros" + assert weight.std() > 0.01, "Weights should have non-trivial variance after initialization" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA/Transformer Engine") + def 
test_layer_norm_column_parallel_weights_initialized_with_cpu_init(self): + """Test that LayerNormColumnParallel weights are initialized (not zeros) when CPU init enabled.""" + transformer_config = TransformerConfig( + hidden_size=64, + num_attention_heads=8, + num_layers=1, + use_cpu_initialization=True, + params_dtype=torch.float32, + ) + + layer = PrimusTurboLayerNormColumnParallelLinear( + input_size=64, + output_size=128, + config=transformer_config, + init_method=init_method_xavier(), + bias=True, + gather_output=False, + skip_bias_add=False, + is_expert=False, + ).cuda() + + weight = layer.weight + assert weight is not None, "Weight should exist" + assert not torch.allclose( + weight, torch.zeros_like(weight) + ), "Weights should be initialized, not all zeros" + assert weight.std() > 0.01, "Weights should have non-trivial variance after initialization" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA/Transformer Engine") + def test_row_parallel_bias_initialized_to_zero_with_cpu_init(self): + """Test that RowParallel bias is initialized to zero when CPU init enabled.""" + config = ModelParallelConfig( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + context_parallel_size=1, + use_cpu_initialization=True, + params_dtype=torch.float32, + ) + setattr(config, "symmetric_ar_type", "none") + setattr(config, "disable_parameter_transpose_cache", False) + setattr(config, "init_model_with_meta_device", False) + + layer = PrimusTurboRowParallelLinear( + input_size=64, + output_size=128, + config=config, + init_method=init_method_xavier(), + bias=True, + input_is_parallel=True, + skip_bias_add=False, + is_expert=False, + ).cuda() + + bias = torch.cat([getattr(layer, name) for name in layer.bias_names]) + assert torch.allclose(bias, torch.zeros_like(bias)), "Bias should be initialized to zero" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA/Transformer Engine") + def 
test_layer_norm_bias_allreduce_attribute_set(self): + """ + Test that bias has allreduce=True attribute for LayerNormColumnParallelLinear. + + This tests our explicit fix: we set allreduce=True on bias when CPU init is enabled + for PrimusTurboLayerNormColumnParallelLinear (unlike Row/Column parallel, where + the parent TELinear class sets it automatically). + """ + transformer_config = TransformerConfig( + hidden_size=64, + num_attention_heads=8, + num_layers=1, + use_cpu_initialization=True, + params_dtype=torch.float32, + ) + + layer = PrimusTurboLayerNormColumnParallelLinear( + input_size=64, + output_size=128, + config=transformer_config, + init_method=init_method_xavier(), + bias=True, + gather_output=False, + skip_bias_add=False, + is_expert=False, + ).cuda() + + # Check that bias has allreduce attribute set to True (our fix) + for bias_name in layer.bias_names: + bias_param = getattr(layer, bias_name) + assert hasattr(bias_param, "allreduce"), f"Bias {bias_name} should have 'allreduce' attribute" + assert ( + getattr(bias_param, "allreduce") is True + ), f"Bias {bias_name} should have allreduce=True (set by our CPU init code)" diff --git a/tools/daily/daily_report.py b/tools/daily/daily_report.py index 3c8168c88..f0b6b4fd7 100644 --- a/tools/daily/daily_report.py +++ b/tools/daily/daily_report.py @@ -138,14 +138,14 @@ def parse_last_metrics_from_log(file_path: str) -> Dict[str, float]: if not last_m: raise ValueError(f"No valid iteration metrics found in {file_path}") metric = last_m.groupdict() - step_time_s = float(metric["elapsed_ms"]) / 1000.0 + step_time_s_avg = float(metric["elapsed_ms_avg"]) / 1000.0 mem_usage = float(metric["hip_used"]) tflops_avg = float(metric["tflops_avg"]) tokens_avg = float(metric["tokens_avg"]) return { "TFLOP/s/GPU": round(tflops_avg, 2), - "Step Time (s)": round(step_time_s, 3), + "Step Time (s)": round(step_time_s_avg, 3), "Tokens/s/GPU": round(tokens_avg, 1), "Mem Usage": round(mem_usage, 2), }