Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 33 additions & 8 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,7 @@ jobs:
echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-ainic"
start_time=$(date +%s)
mkdir -p $GITHUB_WORKSPACE/.github/workflows/docker/ainic
cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-38.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; }
cp /apps/tas/0_public/primus_docker_ci/ainic/amd-anp-v1.3.0.patch $GITHUB_WORKSPACE/.github/workflows/docker/ainic/
cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-56.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; }
docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile.ainic \
--network=host \
-t tasimage/primus:${{env.IMAGE_TAG}}-ainic \
Expand All @@ -131,6 +130,25 @@ jobs:
docker push docker.io/tasimage/primus:${{env.IMAGE_TAG}}-ainic
docker login -u rocmshared -p ${{ secrets.ROCM_DOCKER_HUB_TOKEN }}

echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-v25.09-ainic"
start_time=$(date +%s)
mkdir -p $GITHUB_WORKSPACE/.github/workflows/docker/ainic
cp /apps/tas/0_public/primus_docker_ci/ainic/ainic_bundle_1.117.5-a-56.tar.gz $GITHUB_WORKSPACE/.github/workflows/docker/ainic/ || { echo "Error: Failed to copy ainic bundle"; exit 1; }
docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile_v25.09_ainic \
--network=host \
-t tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic \
--build-arg AINIC_BUNDLE_PATH=ainic \
--build-arg PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT} \
$GITHUB_WORKSPACE/.github/workflows/docker
end_time=$(date +%s)
elapsed=$((end_time - start_time))
echo "⏱️ [build primus docker-v25.09-ainic] Total elapsed time: ${elapsed} seconds"

docker tag tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic docker.io/tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic
docker login -u tasimage -p ${{ secrets.PRIMUS_DOCKER_HUB_TOKEN }}
docker push docker.io/tasimage/primus:${{env.IMAGE_TAG}}-v25.09-ainic
docker login -u rocmshared -p ${{ secrets.ROCM_DOCKER_HUB_TOKEN }}

echo "> Build Docker Image with tag: ${{ env.IMAGE_TAG }}-jax"
start_time=$(date +%s)
docker build -f $GITHUB_WORKSPACE/.github/workflows/docker/Dockerfile \
Expand Down Expand Up @@ -159,9 +177,11 @@ jobs:

run-unittest-torch:
env:
PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/torch
PRIMUS_WORKDIR: /mnt/apps_proxy/tas/0_public/primus_ci/actions-runner-torch
# PRIMUS_WORKDIR: /wekafs/primus-data/primus_safe_ci/torch
needs: [code-lint]
runs-on: [primus-lm-cicd-torch-j8knc]
# runs-on: [primus-lm-cicd-torch-j8knc]
runs-on: [primus-lm-cicd-torch-tas8n-a16-40]
steps:
- run: echo "🎉 Begin Primus-Turbo Checkout."
- name: Set commit hash to env
Expand All @@ -176,10 +196,11 @@ jobs:
- run: echo "Begin Primus-Turbo Install."
- name: Install Primus-Turbo
run: |
rm -rf /tmp/Primus-Turbo || true
mv Primus-Turbo /tmp/
echo "Primus-Turbo dir: /tmp/Primus-Turbo"
git config --global --add safe.directory /tmp/Primus-Turbo
cd /tmp/Primus-Turbo
git config --global --add safe.directory /tmp/Primus-Turbo || true
cd /tmp/Primus-Turbo || true
start_time=$(date +%s)
echo "✅ [Pip install requirements] started at: $(date)"
mkdir -p ${PRIMUS_WORKDIR}/primus-cache
Expand Down Expand Up @@ -233,6 +254,8 @@ jobs:
run: |
echo "Running Primus Core tests..."
# Note: The tests `test_fp8_te_linear` and `test_te_linear` are temporarily skipped due to intermittent failures.
# Note HSA_NO_SCRATCH_RECLAIM=1 must be set to avoid RCCL perf hit (TAS-8N Node), rocm ver:70125424
export HSA_NO_SCRATCH_RECLAIM=1
pytest --maxfail=1 -s ./tests/unit_tests/ \
--deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_fp8_te_linear \
--deselect=tests/unit_tests/megatron/cco/test_tp_overlap.py::TPOverlapTestCase::test_te_linear
Expand All @@ -243,7 +266,8 @@ jobs:
echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}"
rm -rf "${{ env.UT_LOG_PATH }}"
mkdir -p "${{ env.UT_LOG_PATH }}"
MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \
# MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \
MASTER_PORT=10009 DATA_PATH=/mnt/apps_proxy/tas/0_public/data HSA_NO_SCRATCH_RECLAIM=1 \
pytest --maxfail=1 -s ./tests/trainer/test_megatron_trainer.py
- name: Run Primus Model Tests -- TorchTitan
env:
Expand All @@ -252,7 +276,8 @@ jobs:
echo "Set UT_LOG_PATH: ${{ env.UT_LOG_PATH }}"
rm -rf "${{ env.UT_LOG_PATH }}"
mkdir -p "${{ env.UT_LOG_PATH }}"
MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \
# MASTER_PORT=10009 DATA_PATH=/wekafs/primus-data \
MASTER_PORT=10009 DATA_PATH=/mnt/apps_proxy/tas/0_public/data HSA_NO_SCRATCH_RECLAIM=1 \
pytest --maxfail=1 -s ./tests/trainer/test_torchtitan_trainer.py
- name: Clean
run: |
Expand Down
15 changes: 9 additions & 6 deletions .github/workflows/docker/Dockerfile.ainic
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,16 @@ ENV MPI_PATH=/opt/ompi
# WARNING: If these paths are missing, tools and libraries may not function correctly.
# INFO: Installation completed successfully

COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-38.tar.gz ${WORKDIR}
COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-56.tar.gz ${WORKDIR}
RUN cd ${WORKDIR} && \
echo "Building ainic bundle... current directory: ${WORKDIR}" && \
tar zxf ainic_bundle_1.117.5-a-38.tar.gz && \
cd ainic_bundle_1.117.5-a-38 && \
tar zxf ainic_bundle_1.117.5-a-56.tar.gz && \
cd ainic_bundle_1.117.5-a-56 && \
tar zxf host_sw_pkg.tar.gz && \
cd host_sw_pkg && \
./install.sh --domain=user -y 2>&1 | tee log_install.txt
./install.sh --domain=user -y 2>&1 | tee log_install.txt && \
cd ${WORKDIR} && \
apt-get install -y ./amd/ainic/deb-repo/libionic*.deb

# ---------------------------------------------------------------------------
# Build rccl
Expand All @@ -54,8 +56,9 @@ ENV RCCL_HOME=${WORKDIR}/rccl
# Build AMD ANP
# ---------------------------------------------------------------------------

RUN apt-get install -y --allow-unauthenticated libionic-dev && \
cd ${WORKDIR} && git clone https://github.com/rocm/amd-anp.git && \
# RUN apt-get install -y --allow-unauthenticated libionic-dev && \
RUN cd ${WORKDIR} && \
git clone https://github.com/rocm/amd-anp.git && \
cd amd-anp && git checkout tags/v1.3.0 && \
make -j 16 RCCL_HOME=${RCCL_HOME} \
MPI_INCLUDE=${MPI_PATH}/include/ \
Expand Down
9 changes: 5 additions & 4 deletions .github/workflows/docker/Dockerfile_v25.09_ainic
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,16 @@ RUN apt-get update && \
# WARNING: If these paths are missing, tools and libraries may not function correctly.
# INFO: Installation completed successfully

COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.1-a-42.tar.gz /opt/
COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-56.tar.gz ${WORKDIR}
RUN cd ${WORKDIR} && \
echo "Building ainic bundle... current directory: ${WORKDIR}" && \
tar zxf ainic_bundle_1.117.1-a-42.tar.gz && \
cd ainic_bundle_1.117.1-a-42 && \
tar zxf ainic_bundle_1.117.5-a-56.tar.gz && \
cd ainic_bundle_1.117.5-a-56 && \
tar zxf host_sw_pkg.tar.gz && \
cd host_sw_pkg && \
./install.sh --domain=user -y 2>&1 | tee log_install.txt && \
cd /opt
cd ${WORKDIR} && \
apt-get install -y ./amd/ainic/deb-repo/libionic*.deb

# =============================== Test AINIC Driver ===============================
# ibv_devices
Expand Down
1 change: 1 addition & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Guides for common workflows and features:

In-depth technical documentation:

- **[Post-Training Guide](./posttraining.md)** - Fine-tuning with SFT and LoRA using Primus CLI
- **[Performance Projection](./projection.md)** - Project training performance to multi-node configurations
- **[Preflight](./preflight.md)** - Cluster diagnostics (host/GPU/network info + perf tests)
- **[Benchmark Suite](./benchmark.md)** - GEMM, RCCL, end-to-end benchmarks and profiling
Expand Down
Loading
Loading