Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
0511e5d
update maxtext to version 022dc02
llying-001 Feb 4, 2026
2e58d67
update maxtext part 1
llying-001 Feb 4, 2026
a506657
update maxtext: part II
llying-001 Feb 9, 2026
a6f37d8
update maxtext III
llying-001 Feb 9, 2026
b96d0b3
update jax docker to 26.1
llying-001 Feb 10, 2026
3767dbd
fix ib deps for jax
llying-001 Feb 10, 2026
cbda947
fix mixtral config error
llying-001 Feb 11, 2026
e0794c9
add 405b config file
llying-001 Feb 12, 2026
a704cee
update multi-node shell for jax
llying-001 Feb 13, 2026
10d1a0f
Improve slurm launcher logging and make nodelist optional
amd-fuyuajin Feb 14, 2026
f09fc6f
corrected XLA_FLAGS and added env var to suppress errors
amd-fuyuajin Feb 14, 2026
a0aed10
Make Jax/MaxText work for the unified primus-cli launching command
amd-fuyuajin Feb 16, 2026
62c59e4
Updated the apt install package list for Jax/MaxText
amd-fuyuajin Feb 18, 2026
eac1364
add /dev/infiniband as default in primus-cli global config file
amd-fuyuajin Feb 18, 2026
08967fd
add primus-cli global config yaml file for AINIC usage
amd-fuyuajin Feb 18, 2026
095b267
Updated Primus-cli user guide
amd-fuyuajin Feb 18, 2026
87bc2e7
Update docs/cli/PRIMUS-CLI-GUIDE.md
amd-fuyuajin Feb 25, 2026
fdb9c48
fix up by review
llying-001 Feb 25, 2026
36a162d
update cicd for maxtext
llying-001 Feb 25, 2026
ba5c95c
disable turbo install to avoid segfault, update cicd for jax and enab…
llying-001 Feb 26, 2026
faec60e
Merge branch 'main' into dev/fuyuajin/maxtext-backend-test
llying-001 Feb 26, 2026
8357b2f
unify NCCL_IB_TC and NCCL_IB_FIFO_TC for maxtext and torch
llying-001 Feb 27, 2026
da2b3bf
Merge remote-tracking branch 'origin/main' into dev/fuyuajin/maxtext-…
llying-001 Feb 27, 2026
694420e
update jax ainic image and add detect_nccl_ib_tc for cli and remove d…
llying-001 Mar 3, 2026
7941fb7
Merge remote-tracking branch 'origin/main' into dev/fuyuajin/maxtext-…
llying-001 Mar 3, 2026
846b4b3
update cicd runner for jax
llying-001 Mar 3, 2026
24b2931
update docker file ainic for jax
llying-001 Mar 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ env:
PRIMUS_TURBO_COMMIT: 79373eb781a54fd49aed9430c8718489409d1dd0 # chore(aiter): update aiter to fix FA asm kernel bug (#237)
ROCSHMEM_COMMIT: 17ff985c026f9f97f85068647e863ab541dd5645 # Update version to 3.2.0 for 7.2.0 rocm release (#351) (#355)
BASE_IMAGE: docker.io/rocm/primus:v26.1
MAXTEXT_BASE_IMAGE: docker.io/rocm/jax-training:maxtext-v25.9
MAXTEXT_BASE_IMAGE: docker.io/rocm/jax-training:maxtext-v26.1

jobs:
code-lint:
Expand Down Expand Up @@ -168,6 +168,25 @@ jobs:
docker push docker.io/tasimage/primus:${{env.IMAGE_TAG}}-jax
docker login -u rocmshared -p ${{ secrets.ROCM_DOCKER_HUB_TOKEN }}

echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-jax-ainic"
start_time=$(date +%s)
mkdir -p $GITHUB_WORKSPACE/.github/workflows/docker/ainic
cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-56.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; }
docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile_jax.ainic \
--network=host \
-t tasimage/primus:${{env.IMAGE_TAG}}-jax-ainic \
--build-arg BASE_IMAGE=${MAXTEXT_BASE_IMAGE} \
--build-arg AINIC_BUNDLE_PATH=ainic \
$GITHUB_WORKSPACE/.github/workflows/docker
end_time=$(date +%s)
elapsed=$((end_time - start_time))
echo "⏱️ [build primus docker-jax-ainic] Total elapsed time: ${elapsed} seconds"

docker tag tasimage/primus:${{env.IMAGE_TAG}}-jax-ainic docker.io/tasimage/primus:${{env.IMAGE_TAG}}-jax-ainic
docker login -u tasimage -p ${{ secrets.PRIMUS_DOCKER_HUB_TOKEN }}
docker push docker.io/tasimage/primus:${{env.IMAGE_TAG}}-jax-ainic
docker login -u rocmshared -p ${{ secrets.ROCM_DOCKER_HUB_TOKEN }}

# echo "> Docker cleanup local images"
# docker rmi tasimage/primus:${{env.IMAGE_TAG}}
# docker rmi tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic
Expand Down Expand Up @@ -288,7 +307,7 @@ jobs:
env:
PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/jax
needs: [code-lint]
runs-on: [primus-lm-cicd-jax-8t8mh]
runs-on: [primus-lm-cicd-jax-m42vb]
steps:
- run: echo "🎉 Begin Primus-Turbo Checkout."
- name: Set commit hash to env
Expand All @@ -311,19 +330,16 @@ jobs:
echo "✅ [Pip install requirements] started at: $(date)"
mkdir -p ${PRIMUS_WORKDIR}/primus-cache
python3 -m pip install --upgrade pip setuptools
pip3 install --cache-dir=${PRIMUS_WORKDIR}/primus-cache --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm7.0
MAX_JOBS=128 pip3 install --cache-dir=${PRIMUS_WORKDIR}/primus-cache --no-build-isolation --no-clean -r requirements.txt
end_time=$(date +%s)
elapsed=$((end_time - start_time))
echo "✅ [Pip install requirements] ended at: $(date)"
echo "⏱️ [Pip install requirements] Total elapsed time: ${elapsed} seconds"
start_time=$(date +%s)
echo "✅ [build primus-turbo] started at: $(date)"
PRIMUS_TURBO_FRAMEWORK="JAX" pip3 install --no-build-isolation -e . -v
end_time=$(date +%s)
elapsed=$((end_time - start_time))
echo "✅ [build primus-turbo] ended at: $(date)"
echo "⏱️ [build primus-turbo] Total elapsed time: ${elapsed} seconds"
echo "⏱️ [build primus-turbo] Torch installation causes segfault, so we skip it and actually not install turbo. Total elapsed time: ${elapsed} seconds"
- run: echo "🎉 Begin Primus Unit Test."
- uses: actions/checkout@v4
with:
Expand Down
38 changes: 38 additions & 0 deletions .github/workflows/docker/Dockerfile_jax.ainic
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Base image
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ARG AINIC_BUNDLE_PATH

# Non-interactive APT
ENV DEBIAN_FRONTEND=noninteractive


# ---------------------------------------------------------------------------
# Install build dependencies
# ---------------------------------------------------------------------------
# Remove the radeon APT source lists shipped with the base image, then install
# initramfs-tools (presumably a prerequisite of the AINIC installer - confirm).
# `rm -f` keeps the build working when the base image has no *radeon* list, and
# `apt-get` is used instead of `apt` because it has a stable CLI for scripts.
# The APT lists are intentionally left in place in case later steps need them.
RUN rm -f /etc/apt/sources.list.d/*radeon* && \
    apt-get update && \
    apt-get install -y initramfs-tools

# ---------------------------------------------------------------------------
# Environment variables
# ---------------------------------------------------------------------------
# Append /usr/lib to the library search path; the AINIC build notes below list
# /usr/lib in LD_LIBRARY_PATH as a requirement.
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib
# Directory the AINIC bundle is copied into and unpacked in by the steps below.
ENV WORKDIR=/workspace

# =============================== Build AINIC Driver ===============================
# WARNING: Please ensure the following environment variables are correctly set:
# WARNING: 1. PATH: /usr/sbin must be included.
# WARNING: 2. LD_LIBRARY_PATH: /usr/lib must be included.
# WARNING: If these paths are missing, tools and libraries may not function correctly.
# INFO: Installation completed successfully

# Stage the AINIC bundle inside ${WORKDIR}. The trailing slash makes Docker
# treat the destination as a directory even if it does not exist yet.
COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-56.tar.gz ${WORKDIR}/
# Unpack the bundle and the nested host software package, then run the
# installer. The installer output is written to log_install.txt and echoed
# afterwards; its exit status is checked explicitly because piping into `tee`
# would report tee's status and let a failed install produce a "good" image.
RUN cd ${WORKDIR} && \
    echo "Building ainic bundle... current directory: ${WORKDIR}" && \
    tar zxf ainic_bundle_1.117.5-a-56.tar.gz && \
    cd ainic_bundle_1.117.5-a-56 && \
    tar zxf host_sw_pkg.tar.gz && \
    cd host_sw_pkg && \
    { ./install.sh --domain=user -y > log_install.txt 2>&1; rc=$?; cat log_install.txt; [ "$rc" -eq 0 ]; }
31 changes: 24 additions & 7 deletions docs/cli/PRIMUS-CLI-GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

```bash
# Run GEMM benchmark directly on current host
./primus-cli direct -- benchmark gemm -M 4096 -N 4096 -K 4096
./primus-cli direct -- benchmark gemm --M 4096 --N 4096 --K 4096
```

---
Expand Down Expand Up @@ -65,7 +65,7 @@ Primus CLI supports three execution modes, each suitable for different scenarios
./primus-cli direct -- train pretrain --config config.yaml

# GEMM benchmark
./primus-cli direct -- benchmark gemm -M 4096 -N 4096 -K 4096
./primus-cli direct -- benchmark gemm --M 4096 --N 4096 --K 4096

# Environment check (info only)
./primus-cli direct -- preflight --host --gpu --network
Expand Down Expand Up @@ -115,7 +115,7 @@ Primus CLI supports three execution modes, each suitable for different scenarios

# Set resource limits
./primus-cli container --cpus 32 --memory 256G \
-- benchmark gemm -M 8192 -N 8192 -K 8192
-- benchmark gemm --M 8192 --N 8192 --K 8192

# Mount local Primus code for development
./primus-cli container --volume ~/workspace/Primus:/workspace/Primus \
Expand Down Expand Up @@ -164,10 +164,18 @@ Primus CLI supports three execution modes, each suitable for different scenarios
-- train pretrain --config deepseek_v2.yaml

# Run distributed GEMM benchmark
./primus-cli slurm srun -N 2 -- benchmark gemm -M 16384 -N 16384 -K 16384
./primus-cli slurm srun -N 2 -- benchmark gemm --M 16384 --N 16384 --K 16384

# Multi-node environment check (info only)
# this will generate a fast info report of the host, GPU, and network
./primus-cli slurm srun -N 4 -- preflight --host --gpu --network

# this will generate a full preflight report of the host, GPU, and network, as well as the performance tests
./primus-cli slurm srun -N 4 -- preflight --report-file-name preflight-report-4N

# if you are using AINIC in your cluster, use the appropriate configuration file
# for preflight test, set docker image to rocm/primus:v26.1 in the configuration file
./primus-cli --config runner/use_ainic.yaml slurm srun -N 2 -- preflight --report-file-name preflight-report-2N
```

**Suitable for**:
Expand Down Expand Up @@ -250,6 +258,15 @@ direct:
./primus-cli --config prod.yaml slurm srun -N 8 -- train pretrain
```

### Using AINIC Configuration File

If you are using AINIC in your cluster, you can use the `runner/use_ainic.yaml` configuration file to configure the AINIC environment. This file includes pre-configured environment variables for AINIC: `USING_AINIC=1`, `NCCL_PXN_DISABLE=0`, and `NCCL_IB_GID_INDEX=1`. You can modify the `NCCL_IB_GID_INDEX` value based on your AINIC settings and update the `image` value to match your Docker image.

Here is an example of using the AINIC configuration file to run a training job:
```bash
./primus-cli --config runner/use_ainic.yaml slurm srun -N 2 -- train pretrain --config examples/maxtext/configs/MI355X/llama2_7B-pretrain.yaml
```

### Configuration Priority

**Priority Order** (high to low):
Expand Down Expand Up @@ -306,13 +323,13 @@ Command-line args > Specified config file > System default config > User config
#### GEMM Benchmark
```bash
# Single-node GEMM
./primus-cli direct -- benchmark gemm -M 4096 -N 4096 -K 4096
./primus-cli direct -- benchmark gemm --M 4096 --N 4096 --K 4096

# Run in container
./primus-cli container -- benchmark gemm -M 8192 -N 8192 -K 8192
./primus-cli container -- benchmark gemm --M 8192 --N 8192 --K 8192

# Multi-node GEMM
./primus-cli slurm srun -N 2 -- benchmark gemm -M 16384 -N 16384 -K 16384
./primus-cli slurm srun -N 2 -- benchmark gemm --M 16384 --N 16384 --K 16384
```

#### Other Benchmarks
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,4 @@ modules:
capacity_factor: 1
max_target_length: 4096
per_device_batch_size: 12
remat_policy: "minimal"
remat_policy: "save_dot_with_context_except_mlp"
51 changes: 51 additions & 0 deletions examples/maxtext/configs/MI355X/llama3.1_405B-pretrain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:llama3.1_405B-pretrain}
workspace: ./output

modules:
pre_trainer:
framework: maxtext
config: pre_trainer.yaml

# model to run
model: llama3.1_405B.yaml
overrides:
run_name: "llama3.1_405b_training"
base_output_directory: "./output"
steps: 50
log_period: 10
profiler: ""

# data
dataset_type: "synthetic"
hf_access_token: ${HF_TOKEN:""}

# checkpoint
enable_checkpointing: false
async_checkpointing: false

# inter-node parallelism strategy
dcn_data_parallelism: 1
dcn_fsdp_parallelism: -1
dcn_pipeline_parallelism: 1
dcn_tensor_parallelism: 1
dcn_sequence_parallelism: 1

# intra-node parallelism strategy
ici_fsdp_parallelism: -1
ici_data_parallelism: 1
ici_sequence_parallelism: 1
ici_tensor_parallelism: 1
ici_pipeline_parallelism: 1

remat_policy: 'full'
optimizer_memory_host_offload: False
param_scan_axis: 1
megablox: False

use_iota_embed: True
scan_layers: True

max_target_length: 8192
per_device_batch_size: 5
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,5 @@ modules:
megablox: false
capacity_factor: 1
max_target_length: 4096
per_device_batch_size: 12
per_device_batch_size: 11
remat_policy: "minimal"
9 changes: 8 additions & 1 deletion examples/run_local_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ EXP=${EXP:-"examples/megatron/exp_pretrain.yaml"}

# Default docker image
if [ "${BACKEND:-}" = "MaxText" ]; then
DOCKER_IMAGE=${DOCKER_IMAGE:-"docker.io/rocm/jax-training:maxtext-v25.9"}
DOCKER_IMAGE=${DOCKER_IMAGE:-"docker.io/rocm/jax-training:maxtext-v26.1"}
else
DOCKER_IMAGE=${DOCKER_IMAGE:-"docker.io/rocm/primus:v26.1"}
fi
Expand Down Expand Up @@ -125,6 +125,13 @@ if [ "$USING_AINIC" == "1" ]; then
ENV_ARGS+=("--env" "ANP_HOME_DIR")
ENV_ARGS+=("--env" "MPI_HOME_DIR")

# Run the detection helper and capture its stdout in TC_RESULTS. Only a
# successful, non-empty result is forwarded to the container; the previous
# `-z` test was inverted and forwarded the variable exactly when it was empty.
# TC_RESULTS is exported because `--env NAME` (no value) makes docker copy the
# value from the launcher's environment, which requires an exported variable.
if TC_RESULTS=$(bash "${PRIMUS_PATH}/examples/scripts/detect_nccl_ib_tc.sh") && [ -n "$TC_RESULTS" ]; then
    echo "TC_RESULTS: $TC_RESULTS"
    export TC_RESULTS
    ENV_ARGS+=("--env" "TC_RESULTS")
else
    # Best effort: keep launching without the detected values, but report on stderr.
    echo "Failed to detect NCCL_IB_TC and NCCL_IB_FIFO_TC" >&2
fi
# VOLUME_ARGS+=(-v /mnt/shared:/mnt/shared)
# VOLUME_ARGS+=(-v /etc/libibverbs.d/:/etc/libibverbs.d:ro)
# VOLUME_ARGS+=(-v /usr/lib/x86_64-linux-gnu/libibverbs/:/usr/lib/x86_64-linux-gnu/libibverbs/:ro)
Expand Down
Loading