Skip to content

mla ps support paged 64 and 3buffer layout for ds3.2 (#1917) #4

mla ps support paged 64 and 3buffer layout for ds3.2 (#1917)

mla ps support paged 64 and 3buffer layout for ds3.2 (#1917) #4

Workflow file for this run

# Workflow-level settings: display name, triggers, concurrency and shared env.
name: Aiter Test
on:
  # Runs on pushes to `main`, on PRs targeting `main`, and on manual dispatch.
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:
# One concurrency group per workflow + ref. A newer run cancels the run already
# in progress for the same group, except when the ref is `main`.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
env:
  # Image used for the test container; pinned by digest for now.
  # TODO: Revert to rocm/pytorch:latest once CK adds ROCm 7.2 support
  DOCKER_IMAGE: 'rocm/pytorch:latest@sha256:683765a52c61341e1674fe730ab3be861a444a45a36c0a8caae7653a08a0e208'
jobs:
check-signal:
  # Gate job: the `standard` and `multi-gpu` jobs both declare
  # `needs: check-signal`, so they only start once this job has succeeded.
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
      name: Checkout code
    # check_signal.sh is not part of this file. NOTE(review): presumably it uses
    # the token and commit SHA below to look up the signal artifact — confirm
    # against the script.
    - name: Download and check signal artifact
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        GITHUB_SHA: ${{ github.sha }}
      run: ./.github/scripts/check_signal.sh
# define-runners:
# runs-on: ubuntu-latest
# needs: [check-signal]
# outputs:
# standard_runners: ${{ steps.machines.outputs.standard_runners }}
# multigpu_runners: ${{ steps.machines.outputs.multigpu_runners }}
# steps:
# - name: Define whether runs on MI35X
# env:
# PR_TITLE: ${{ github.event.pull_request.title }}
# id: machines
# run: |
# set -euo pipefail
# if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then
# echo "It's main branch, running tests on MI325 and MI35X..."
# echo 'standard_runners=["aiter-mi355-1gpu", "aiter-1gpu-runner"]' >> "$GITHUB_OUTPUT"
# echo 'multigpu_runners=["aiter-mi355-8gpu", "aiter-8gpu-runner"]' >> "$GITHUB_OUTPUT"
# #elif echo "${PR_TITLE}" | grep -qi "mi325"; then
# # echo "PR title contains 'MI325', running tests on MI325 and MI35X..."
# # echo 'standard_runners=["aiter-mi355-1gpu", "aiter-1gpu-runner"]' >> "$GITHUB_OUTPUT"
# # echo 'multigpu_runners=["aiter-mi355-8gpu", "aiter-8gpu-runner"]' >> "$GITHUB_OUTPUT"
# else
# # echo "Not main branch and PR title does not contain mi325, only running on MI35X..."
# echo 'standard_runners=["aiter-mi355-1gpu", "aiter-1gpu-runner"]' >> "$GITHUB_OUTPUT"
# echo 'multigpu_runners=["aiter-mi355-8gpu", "aiter-8gpu-runner"]' >> "$GITHUB_OUTPUT"
# fi
# echo "$GITHUB_OUTPUT"
#
# - name: Show output variable
# run: |
# echo "Standard: ${{ steps.machines.outputs.standard_runners }}"
# echo "Multigpu: ${{ steps.machines.outputs.multigpu_runners }}"
standard:
  # Single-GPU test job. Runs once per matrix entry (one entry per runner pool)
  # and only after `check-signal` has succeeded.
  name: Standard Tests (1 GPU)
  needs: check-signal
  strategy:
    # Keep the other matrix entry running when one of them fails.
    fail-fast: false
    matrix:
      include:
        - runner: aiter-mi355-1gpu
          label: MI355
        - runner: aiter-1gpu-runner
          label: MI325
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    # On `main` the submodules are moved to their latest remote commit
    # (`--remote`); on any other ref the commits recorded in the repo are used.
    - name: Sync submodules
      env:
        # The ref reaches the script as an environment variable instead of being
        # pasted into the script text, so its value is never parsed as shell code.
        TRIGGER_REF: ${{ github.ref }}
      run: |
        set -euo pipefail
        if [[ "${TRIGGER_REF}" == "refs/heads/main" ]]; then
          echo "It's main branch, syncing latest CK..."
          git submodule sync
          git submodule update --init --recursive --remote --depth 1 --jobs 4
        else
          echo "It's a PR branch, syncing specific CK..."
          git submodule sync
          git submodule update --init --recursive --depth 1 --jobs 4
        fi
    - name: Clean up Rocm processes
      run: |
        ./.github/scripts/clean_up_rocm.sh
    # Starts a detached container named `aiter_test` with the workspace mounted
    # at /workspace; every later `docker exec` step targets that container.
    - name: Run the container
      run: |
        set -ex
        echo "Starting container: aiter_test"
        # Prefer the device flags listed in the podinfo file when it exists.
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        # DEVICE_FLAG must stay unquoted: the fallback value holds a flag and its
        # argument in one string and has to split into two words.
        docker run -dt \
          --device=/dev/kfd $DEVICE_FLAG \
          --shm-size=16G \
          --network=host \
          --group-add $(getent group render | cut -d: -f3) \
          --group-add $(getent group video | cut -d: -f3) \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          --name aiter_test \
          ${{ env.DOCKER_IMAGE }}
    - name: Setup pip config
      run: |
        docker exec -u root aiter_test bash -c "pip config set global.default-timeout 60"
        docker exec -u root aiter_test bash -c "pip config set global.retries 10"
    - name: Setup Aiter
      run: |
        set -ex
        echo "Setting up Aiter..."
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh"
    - name: Tests
      run: |
        set -ex
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "MAX_JOBS=20 ./.github/scripts/aiter_test.sh"
    # The remaining steps use `if: always()` so that logs are published and the
    # runner is cleaned up even when an earlier step failed.
    - name: Collect test logs
      if: always()
      run: |
        echo "Collecting test logs..."
        echo "Aiter Operator Tests Summary:" >> "$GITHUB_STEP_SUMMARY"
        python3 ./.github/scripts/collect_logs.py latest_test.log >> "$GITHUB_STEP_SUMMARY"
    - name: Upload test logs
      uses: actions/upload-artifact@v4
      if: always()
      with:
        # The runner name keeps the artifact names of the matrix entries distinct.
        name: standard-test-log-${{ matrix.runner }}
        path: latest_test.log
    - name: Cleanup container
      if: always()
      run: |
        docker rm -f aiter_test || true
    - name: Clean up Rocm processes
      if: always()
      run: |
        ./.github/scripts/clean_up_rocm.sh
multi-gpu:
  # 8-GPU test job. Runs once per matrix entry (one entry per runner pool) and
  # only after `check-signal` has succeeded.
  name: Multi-GPU Tests (8 GPU)
  needs: check-signal
  # only run multi-gpu tests on main branch due to limited multi-gpu resources
  if: github.ref == 'refs/heads/main'
  strategy:
    # Keep the other matrix entry running when one of them fails.
    fail-fast: false
    matrix:
      include:
        - runner: aiter-mi355-8gpu
        - runner: aiter-8gpu-runner
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    # On `main` the submodules are moved to their latest remote commit
    # (`--remote`). The `else` branch cannot run while the job-level `if` above
    # limits this job to `main`; it mirrors the `standard` job and takes effect
    # only if that condition is relaxed.
    - name: Sync submodules
      env:
        # The ref reaches the script as an environment variable instead of being
        # pasted into the script text, so its value is never parsed as shell code.
        TRIGGER_REF: ${{ github.ref }}
      run: |
        set -euo pipefail
        if [[ "${TRIGGER_REF}" == "refs/heads/main" ]]; then
          echo "It's main branch, syncing latest CK..."
          git submodule sync
          git submodule update --init --recursive --remote --depth 1 --jobs 4
        else
          echo "It's a PR branch, syncing specific CK..."
          git submodule sync
          git submodule update --init --recursive --depth 1 --jobs 4
        fi
    - name: Clean up Rocm processes
      run: ./.github/scripts/clean_up_rocm.sh
    # Starts a detached container named `aiter_test` with the workspace mounted
    # at /workspace; every later `docker exec` step targets that container.
    - name: Run the container
      run: |
        set -ex
        echo "Starting container: aiter_test"
        # Prefer the device flags listed in the podinfo file when it exists.
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        # DEVICE_FLAG must stay unquoted: the fallback value holds a flag and its
        # argument in one string and has to split into two words.
        docker run -dt \
          --device=/dev/kfd $DEVICE_FLAG \
          --shm-size=16G \
          --network=host \
          --group-add $(getent group render | cut -d: -f3) \
          --group-add $(getent group video | cut -d: -f3) \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          --name aiter_test \
          ${{ env.DOCKER_IMAGE }}
    - name: Setup pip config
      run: |
        docker exec -u root aiter_test bash -c "pip config set global.default-timeout 60"
        docker exec -u root aiter_test bash -c "pip config set global.retries 10"
    - name: Setup-Aiter
      run: |
        set -ex
        echo "Setting up Aiter..."
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh"
    # Same test script as the single-GPU job, run with MULTIGPU=TRUE set in the
    # container environment.
    - name: Tests
      run: |
        set -ex
        docker exec \
          -e MULTIGPU=TRUE \
          -w /workspace \
          aiter_test \
          bash -c "MAX_JOBS=20 ./.github/scripts/aiter_test.sh"
    # The remaining steps use `if: always()` so that logs are published and the
    # runner is cleaned up even when an earlier step failed.
    - name: Upload test logs
      uses: actions/upload-artifact@v4
      if: always()
      with:
        # The runner name keeps the artifact names of the matrix entries distinct.
        name: multigpu-test-${{ matrix.runner }}
        path: latest_test.log
    - name: Cleanup container
      if: always()
      run: |
        docker rm -f aiter_test || true
    - name: Clean up Rocm processes
      if: always()
      run: |
        ./.github/scripts/clean_up_rocm.sh