From 0120399d7d0f4aa6d2027805f17b308394f69dd5 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Mon, 5 Jan 2026 22:19:07 +0000 Subject: [PATCH 01/65] test ci --- .github/workflows/pr-test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index ce9c2daaa..e5428fa06 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,10 +38,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + #-e http_proxy=$http_proxy + #-e https_proxy=$https_proxy + #-e HTTP_PROXY=$HTTP_PROXY + #-e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From 4a6cab5e56ce1249b367f1e77e2f7585350e7079 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Mon, 5 Jan 2026 22:53:47 +0000 Subject: [PATCH 02/65] update --- .github/workflows/pr-test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index e5428fa06..ce9c2daaa 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,10 +38,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - #-e http_proxy=$http_proxy - #-e https_proxy=$https_proxy - #-e HTTP_PROXY=$HTTP_PROXY - #-e HTTPS_PROXY=$HTTPS_PROXY + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From 1ff2341a66dc1a4ab48efecd89460e98f0582607 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Mon, 5 Jan 2026 22:59:22 +0000 Subject: [PATCH 03/65] update --- .github/workflows/pr-test.yml | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index ce9c2daaa..2693c10ae 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,10 +38,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -82,10 +78,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -126,10 +118,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -170,10 +158,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -214,10 +198,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From 50f56c544ee3718fdc9118f9ae6dabc06f571c0e Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Thu, 8 Jan 2026 18:04:10 -0800 Subject: [PATCH 04/65] update --- .github/workflows/pr-test.yml | 8 -------- .github/workflows/pr-test.yml.j2 | 6 +----- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 7913aadad..ece2de8fb 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -118,10 +118,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -242,10 +238,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 84cac9114..06f8bf4a9 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -104,10 +104,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -134,4 +130,4 @@ jobs: - name: Execute shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} -<% endfor %> \ No newline at end of file +<% endfor %> From ebd8df617d4aac4d72b90019a126308ae85cd315 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sat, 17 Jan 2026 22:50:41 +0000 Subject: [PATCH 05/65] update --- .github/workflows/pr-test.yml | 28 ++++++++++++++++++++++++++++ .github/workflows/pr-test.yml.j2 | 4 ++++ 2 files changed, 32 insertions(+) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 2c2002ea9..f00faa5a6 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,6 +38,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -78,6 +82,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -118,6 +126,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -158,6 +170,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -198,6 +214,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -238,6 +258,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -278,6 +302,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 263f4a591..644180c6e 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,6 +107,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From 099b50169351f70dd16d4957c7f1c6d655670e84 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sat, 17 Jan 2026 22:54:10 +0000 Subject: [PATCH 06/65] update --- .github/workflows/pr-test.yml | 56 ++++++++++++++++---------------- .github/workflows/pr-test.yml.j2 | 8 ++--- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f00faa5a6..517cd0295 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,10 +38,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + # -e http_proxy=$http_proxy + # -e https_proxy=$https_proxy + # -e HTTP_PROXY=$HTTP_PROXY + # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -82,10 +82,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + # -e http_proxy=$http_proxy + # -e https_proxy=$https_proxy + # -e HTTP_PROXY=$HTTP_PROXY + # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -126,10 +126,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + # -e http_proxy=$http_proxy + # -e https_proxy=$https_proxy + # -e HTTP_PROXY=$HTTP_PROXY + # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -170,10 +170,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + # -e http_proxy=$http_proxy + # -e https_proxy=$https_proxy + # -e HTTP_PROXY=$HTTP_PROXY + # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -214,10 +214,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + # -e http_proxy=$http_proxy + # -e https_proxy=$https_proxy + # -e HTTP_PROXY=$HTTP_PROXY + # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -258,10 +258,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + # -e http_proxy=$http_proxy + # -e https_proxy=$https_proxy + # -e HTTP_PROXY=$HTTP_PROXY + # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -302,10 +302,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + # -e http_proxy=$http_proxy + # -e https_proxy=$https_proxy + # -e HTTP_PROXY=$HTTP_PROXY + # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 644180c6e..29d3a66be 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,10 +107,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + # -e http_proxy=$http_proxy + # -e https_proxy=$https_proxy + # -e HTTP_PROXY=$HTTP_PROXY + # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From cf579d8ae8ed21c45022611b272bfe2b5fe26d46 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sat, 17 Jan 2026 22:58:34 +0000 Subject: [PATCH 07/65] update --- .github/workflows/pr-test.yml | 28 ---------------------------- .github/workflows/pr-test.yml.j2 | 4 ---- 2 files changed, 32 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 517cd0295..2c2002ea9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,10 +38,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - # -e http_proxy=$http_proxy - # -e https_proxy=$https_proxy - # -e HTTP_PROXY=$HTTP_PROXY - # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -82,10 +78,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - # -e http_proxy=$http_proxy - # -e https_proxy=$https_proxy - # -e HTTP_PROXY=$HTTP_PROXY - # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -126,10 +118,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - # -e http_proxy=$http_proxy - # -e https_proxy=$https_proxy - # -e HTTP_PROXY=$HTTP_PROXY - # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -170,10 +158,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - # -e http_proxy=$http_proxy - # -e https_proxy=$https_proxy - # -e HTTP_PROXY=$HTTP_PROXY - # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -214,10 +198,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - # -e http_proxy=$http_proxy - # -e https_proxy=$https_proxy - # -e HTTP_PROXY=$HTTP_PROXY - # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -258,10 +238,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - # -e http_proxy=$http_proxy - # -e https_proxy=$https_proxy - # -e HTTP_PROXY=$HTTP_PROXY - # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -302,10 +278,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - # -e http_proxy=$http_proxy - # -e https_proxy=$https_proxy - # -e HTTP_PROXY=$HTTP_PROXY - # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 29d3a66be..263f4a591 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,10 +107,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - # -e http_proxy=$http_proxy - # -e https_proxy=$https_proxy - # -e HTTP_PROXY=$HTTP_PROXY - # -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From df4b2f56926dbea519032d8ff51f9bc979240500 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sat, 17 Jan 2026 23:07:58 +0000 Subject: [PATCH 08/65] update --- .github/workflows/pr-test.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 2c2002ea9..2276ca9c4 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,6 +38,8 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -78,6 +80,8 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -118,6 +122,8 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -158,6 +164,8 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -198,6 +206,8 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -238,6 +248,8 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -278,6 +290,8 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From 7e549e24e61b6ba3d2111dda555f59f49b4c5110 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sat, 17 Jan 2026 23:11:15 +0000 Subject: [PATCH 09/65] update --- .github/workflows/pr-test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 2276ca9c4..d03b855af 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,8 +38,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From 11f2bd878dd25ef7f1cdd920e0a22f5f25988995 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sat, 17 Jan 2026 23:36:47 +0000 Subject: [PATCH 10/65] fix pre-test http proxy problem --- .github/workflows/pr-test.yml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index d03b855af..2c2002ea9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -78,8 +78,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -120,8 +118,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -162,8 +158,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -204,8 +198,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -246,8 +238,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -288,8 +278,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From c800b7f4d5cc6a2ad6655f0a8a5b6cad1748a490 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sat, 17 Jan 2026 23:40:01 +0000 Subject: [PATCH 11/65] update --- .github/workflows/pr-test.yml | 332 +++++++++------------------------- 1 file changed, 83 insertions(+), 249 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 2c2002ea9..25bb2bce2 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -1,7 +1,76 @@ -################################################################################ -# This file is auto-generated from the .j2 file via generate_github_workflows.py. Do not edit manually. -################################################################################ - +<% set jobs = { + 'e2e-test-short': { + 'label': 'run-ci-short', + 'tests': [ + {'test_file': 'test_qwen2.5_0.5B_gsm8k_async_short.py', 'num_gpus': 4}, + {'test_file': 'test_qwen2.5_0.5B_gsm8k_short.py', 'num_gpus': 4}, + {'test_file': 'test_qwen3_0.6B_fsdp_colocated_2xGPU.py', 'num_gpus': 2}, + ], + }, + 'e2e-test-fsdp': { + 'label': 'run-ci-fsdp', + 'tests': [ + {'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py', 'num_gpus': 2}, + {'test_file': 'test_qwen3_vl_4B_fsdp.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2}, + {'test_file': 'test_qwen3_0.6B_megatron_fsdp_align.py', 'num_gpus': 4}, + ], + }, + 'e2e-test-megatron': { + 'label': 'run-ci-megatron', + 'tests': [ + {'test_file': 'test_quick_start_glm4_9B.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_30B_A3B.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_4B_ppo.py', 'num_gpus': 8}, + {'test_file': 'test_moonlight_16B_A3B.py', 'num_gpus': 8}, + {'test_file': 'test_mimo_7B_mtp_only_grad.py', 'num_gpus': 8}, + ], + }, + 'e2e-test-precision': { + 'label': 'run-ci-precision', + 'tests': [ + {'test_file': 'test_qwen3_0.6B_parallel_check.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_0.6B_megatron_fsdp_align.py', 'num_gpus': 4}, + ], + }, + 'e2e-test-ckpt': { + 'label': 'run-ci-ckpt', + 'tests': [ + {'test_file': 'test_qwen3_4B_ckpt.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_4B_ckpt.py --async-save', 'num_gpus': 8}, + ], + }, + 'e2e-test-long': { + 'label': 'run-ci-long', + 'tests': [ + {'test_file': 'test_qwen2.5_0.5B_gsm8k.py', 'num_gpus': 2}, + {'test_file': 'test_qwen2.5_0.5B_gsm8k_async.py', 'num_gpus': 2}, + ], + }, + 'e2e-test-image': { + 'label': 'run-ci-image', + 'image': 'radixark/miles-test:latest', + 'tests': [ + {'test_file': 'test_qwen2.5_0.5B_gsm8k_async_short.py', 'num_gpus': 4}, + {'test_file': 'test_qwen2.5_0.5B_gsm8k_short.py', 'num_gpus': 4}, + {'test_file': 'test_qwen3_0.6B_fsdp_colocated_2xGPU.py', 'num_gpus': 2}, + {'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py', 'num_gpus': 2}, + {'test_file': 'test_qwen3_vl_4B_fsdp.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2}, + {'test_file': 'test_quick_start_glm4_9B.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_30B_A3B.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_4B_ppo.py', 'num_gpus': 8}, + {'test_file': 'test_moonlight_16B_A3B.py', 'num_gpus': 8}, + {'test_file': 'test_mimo_7B_mtp_only_grad.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_0.6B_parallel_check.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_0.6B_megatron_fsdp_align.py', 'num_gpus': 4}, + {'test_file': 'test_qwen3_4B_ckpt.py', 'num_gpus': 8}, + {'test_file': 'test_qwen3_4B_ckpt.py --async-save', 'num_gpus': 8}, + {'test_file': 'test_qwen2.5_0.5B_gsm8k.py', 'num_gpus': 2}, + {'test_file': 'test_qwen2.5_0.5B_gsm8k_async.py', 'num_gpus': 2}, + ], + }, +} %> name: PR Test on: @@ -24,252 +93,12 @@ concurrency: cancel-in-progress: true jobs: - - e2e-test-short: - if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-short')) - runs-on: self-hosted - container: - image: radixark/miles:latest - options: > - --gpus all - --ipc=host - --shm-size=16g - --ulimit memlock=-1 - --ulimit stack=67108864 - --memory=0 - --memory-swap=0 - -v /mnt/nvme0n1/miles_ci:/data/miles_ci - -v /mnt/nvme0n1/miles_ci/models:/root/models - -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets - strategy: - fail-fast: false - matrix: - info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"}] - defaults: - run: - working-directory: ${{ github.workspace }} - env: - GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} - WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} - MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install - shell: bash - run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages - - - name: Execute - shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - e2e-test-fsdp: - if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-fsdp')) - runs-on: self-hosted - container: - image: radixark/miles:latest - options: > - --gpus all - --ipc=host - --shm-size=16g - --ulimit memlock=-1 - --ulimit stack=67108864 - --memory=0 - --memory-swap=0 - -v /mnt/nvme0n1/miles_ci:/data/miles_ci - -v /mnt/nvme0n1/miles_ci/models:/root/models - -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets - strategy: - fail-fast: false - matrix: - info: [{"num_gpus": 2, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}] - defaults: - run: - working-directory: ${{ github.workspace }} - env: - GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} - WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} - MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install - shell: bash - run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages - - - name: Execute - shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - e2e-test-megatron: - if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-megatron')) - runs-on: self-hosted - container: - image: radixark/miles:latest - options: > - --gpus all - --ipc=host - --shm-size=16g - --ulimit memlock=-1 - --ulimit stack=67108864 - --memory=0 - --memory-swap=0 - -v /mnt/nvme0n1/miles_ci:/data/miles_ci - -v /mnt/nvme0n1/miles_ci/models:/root/models - -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets - strategy: - fail-fast: false - matrix: - info: [{"num_gpus": 8, "test_file": "test_quick_start_glm4_9B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_30B_A3B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ppo.py"}, {"num_gpus": 8, "test_file": "test_moonlight_16B_A3B.py"}, {"num_gpus": 8, "test_file": "test_mimo_7B_mtp_only_grad.py"}] - defaults: - run: - working-directory: ${{ github.workspace }} - env: - GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} - WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} - MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install - shell: bash - run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages - - - name: Execute - shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - e2e-test-precision: - if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-precision')) - runs-on: self-hosted - container: - image: radixark/miles:latest - options: > - --gpus all - --ipc=host - --shm-size=16g - --ulimit memlock=-1 - --ulimit stack=67108864 - --memory=0 - --memory-swap=0 - -v /mnt/nvme0n1/miles_ci:/data/miles_ci - -v /mnt/nvme0n1/miles_ci/models:/root/models - -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets - strategy: - fail-fast: false - matrix: - info: [{"num_gpus": 8, "test_file": "test_qwen3_0.6B_parallel_check.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}] - defaults: - run: - working-directory: ${{ github.workspace }} - env: - GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} - WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} - MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install - shell: bash - run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages - - - name: Execute - shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - e2e-test-ckpt: - if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-ckpt')) - runs-on: self-hosted - container: - image: radixark/miles:latest - options: > - --gpus all - --ipc=host - --shm-size=16g - --ulimit memlock=-1 - --ulimit stack=67108864 - --memory=0 - --memory-swap=0 - -v /mnt/nvme0n1/miles_ci:/data/miles_ci - -v /mnt/nvme0n1/miles_ci/models:/root/models - -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets - strategy: - fail-fast: false - matrix: - info: [{"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py --async-save"}] - defaults: - run: - working-directory: ${{ github.workspace }} - env: - GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} - WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} - MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install - shell: bash - run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages - - - name: Execute - shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - e2e-test-long: - if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-long')) - runs-on: self-hosted - container: - image: radixark/miles:latest - options: > - --gpus all - --ipc=host - --shm-size=16g - --ulimit memlock=-1 - --ulimit stack=67108864 - --memory=0 - --memory-swap=0 - -v /mnt/nvme0n1/miles_ci:/data/miles_ci - -v /mnt/nvme0n1/miles_ci/models:/root/models - -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets - strategy: - fail-fast: false - matrix: - info: [{"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}] - defaults: - run: - working-directory: ${{ github.workspace }} - env: - GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} - WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} - MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install - shell: bash - run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages - - - name: Execute - shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - e2e-test-image: - if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-image')) +<% for job_name, config in jobs.items() %> + << job_name >>: + if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, '<< config.label >>')) runs-on: self-hosted container: - image: radixark/miles-test:latest + image: << config.image if config.image else 'radixark/miles:latest' >> options: > --gpus all --ipc=host @@ -278,13 +107,17 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets strategy: fail-fast: false matrix: - info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"}, {"num_gpus": 2, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}, {"num_gpus": 8, "test_file": "test_quick_start_glm4_9B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_30B_A3B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ppo.py"}, {"num_gpus": 8, "test_file": "test_moonlight_16B_A3B.py"}, {"num_gpus": 8, "test_file": "test_mimo_7B_mtp_only_grad.py"}, {"num_gpus": 8, "test_file": "test_qwen3_0.6B_parallel_check.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py --async-save"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}] + info: << config.tests | tojson >> defaults: run: working-directory: ${{ github.workspace }} @@ -304,3 +137,4 @@ jobs: - name: Execute shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} +<% endfor %> \ No newline at end of file From 16139892082dcdb1765814e76254e68a842ffbcc Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sat, 17 Jan 2026 23:42:36 +0000 Subject: [PATCH 12/65] update --- .github/workflows/pr-test.yml | 352 ++++++++++++++++++++++++------- .github/workflows/pr-test.yml.j2 | 6 +- 2 files changed, 278 insertions(+), 80 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 25bb2bce2..569f02e88 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -1,76 +1,7 @@ -<% set jobs = { - 'e2e-test-short': { - 'label': 'run-ci-short', - 'tests': [ - {'test_file': 'test_qwen2.5_0.5B_gsm8k_async_short.py', 'num_gpus': 4}, - {'test_file': 'test_qwen2.5_0.5B_gsm8k_short.py', 'num_gpus': 4}, - {'test_file': 'test_qwen3_0.6B_fsdp_colocated_2xGPU.py', 'num_gpus': 2}, - ], - }, - 'e2e-test-fsdp': { - 'label': 'run-ci-fsdp', - 'tests': [ - {'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py', 'num_gpus': 2}, - {'test_file': 'test_qwen3_vl_4B_fsdp.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2}, - {'test_file': 'test_qwen3_0.6B_megatron_fsdp_align.py', 'num_gpus': 4}, - ], - }, - 'e2e-test-megatron': { - 'label': 'run-ci-megatron', - 'tests': [ - {'test_file': 'test_quick_start_glm4_9B.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_30B_A3B.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_4B_ppo.py', 'num_gpus': 8}, - {'test_file': 'test_moonlight_16B_A3B.py', 'num_gpus': 8}, - {'test_file': 'test_mimo_7B_mtp_only_grad.py', 'num_gpus': 8}, - ], - }, - 'e2e-test-precision': { - 'label': 'run-ci-precision', - 'tests': [ - {'test_file': 'test_qwen3_0.6B_parallel_check.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_0.6B_megatron_fsdp_align.py', 'num_gpus': 4}, - ], - }, - 'e2e-test-ckpt': { - 'label': 'run-ci-ckpt', - 'tests': [ - {'test_file': 'test_qwen3_4B_ckpt.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_4B_ckpt.py --async-save', 'num_gpus': 8}, - ], - }, - 'e2e-test-long': { - 'label': 'run-ci-long', - 'tests': [ - {'test_file': 'test_qwen2.5_0.5B_gsm8k.py', 'num_gpus': 2}, - {'test_file': 'test_qwen2.5_0.5B_gsm8k_async.py', 'num_gpus': 2}, - ], - }, - 'e2e-test-image': { - 'label': 'run-ci-image', - 'image': 'radixark/miles-test:latest', - 'tests': [ - {'test_file': 'test_qwen2.5_0.5B_gsm8k_async_short.py', 'num_gpus': 4}, - {'test_file': 'test_qwen2.5_0.5B_gsm8k_short.py', 'num_gpus': 4}, - {'test_file': 'test_qwen3_0.6B_fsdp_colocated_2xGPU.py', 'num_gpus': 2}, - {'test_file': 'test_qwen3_4B_fsdp_true_on_policy.py', 'num_gpus': 2}, - {'test_file': 'test_qwen3_vl_4B_fsdp.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_0.6B_fsdp_distributed.py', 'num_gpus': 2}, - {'test_file': 'test_quick_start_glm4_9B.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_30B_A3B.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_4B_ppo.py', 'num_gpus': 8}, - {'test_file': 'test_moonlight_16B_A3B.py', 'num_gpus': 8}, - {'test_file': 'test_mimo_7B_mtp_only_grad.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_0.6B_parallel_check.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_0.6B_megatron_fsdp_align.py', 'num_gpus': 4}, - {'test_file': 'test_qwen3_4B_ckpt.py', 'num_gpus': 8}, - {'test_file': 'test_qwen3_4B_ckpt.py --async-save', 'num_gpus': 8}, - {'test_file': 'test_qwen2.5_0.5B_gsm8k.py', 'num_gpus': 2}, - {'test_file': 'test_qwen2.5_0.5B_gsm8k_async.py', 'num_gpus': 2}, - ], - }, -} %> +################################################################################ +# This file is auto-generated from the .j2 file via generate_github_workflows.py. Do not edit manually. +################################################################################ + name: PR Test on: @@ -93,12 +24,12 @@ concurrency: cancel-in-progress: true jobs: -<% for job_name, config in jobs.items() %> - << job_name >>: - if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, '<< config.label >>')) + + e2e-test-short: + if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-short')) runs-on: self-hosted container: - image: << config.image if config.image else 'radixark/miles:latest' >> + image: radixark/miles:latest options: > --gpus all --ipc=host @@ -117,7 +48,7 @@ jobs: strategy: fail-fast: false matrix: - info: << config.tests | tojson >> + info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"}] defaults: run: working-directory: ${{ github.workspace }} @@ -137,4 +68,267 @@ jobs: - name: Execute shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} -<% endfor %> \ No newline at end of file + + e2e-test-fsdp: + if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-fsdp')) + runs-on: self-hosted + container: + image: radixark/miles:latest + options: > + --gpus all + --ipc=host + --shm-size=16g + --ulimit memlock=-1 + --ulimit stack=67108864 + --memory=0 + --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY + -v /mnt/nvme0n1/miles_ci:/data/miles_ci + -v /mnt/nvme0n1/miles_ci/models:/root/models + -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + strategy: + fail-fast: false + matrix: + info: [{"num_gpus": 2, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}] + defaults: + run: + working-directory: ${{ github.workspace }} + env: + GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install + shell: bash + run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages + + - name: Execute + shell: bash + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + + e2e-test-megatron: + if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-megatron')) + runs-on: self-hosted + container: + image: radixark/miles:latest + options: > + --gpus all + --ipc=host + --shm-size=16g + --ulimit memlock=-1 + --ulimit stack=67108864 + --memory=0 + --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY + -v /mnt/nvme0n1/miles_ci:/data/miles_ci + -v /mnt/nvme0n1/miles_ci/models:/root/models + -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + strategy: + fail-fast: false + matrix: + info: [{"num_gpus": 8, "test_file": "test_quick_start_glm4_9B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_30B_A3B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ppo.py"}, {"num_gpus": 8, "test_file": "test_moonlight_16B_A3B.py"}, {"num_gpus": 8, "test_file": "test_mimo_7B_mtp_only_grad.py"}] + defaults: + run: + working-directory: ${{ github.workspace }} + env: + GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install + shell: bash + run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages + + - name: Execute + shell: bash + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + + e2e-test-precision: + if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-precision')) + runs-on: self-hosted + container: + image: radixark/miles:latest + options: > + --gpus all + --ipc=host + --shm-size=16g + --ulimit memlock=-1 + --ulimit stack=67108864 + --memory=0 + --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY + -v /mnt/nvme0n1/miles_ci:/data/miles_ci + -v /mnt/nvme0n1/miles_ci/models:/root/models + -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + strategy: + fail-fast: false + matrix: + info: [{"num_gpus": 8, "test_file": "test_qwen3_0.6B_parallel_check.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}] + defaults: + run: + working-directory: ${{ github.workspace }} + env: + GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install + shell: bash + run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages + + - name: Execute + shell: bash + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + + e2e-test-ckpt: + if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-ckpt')) + runs-on: self-hosted + container: + image: radixark/miles:latest + options: > + --gpus all + --ipc=host + --shm-size=16g + --ulimit memlock=-1 + --ulimit stack=67108864 + --memory=0 + --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY + -v /mnt/nvme0n1/miles_ci:/data/miles_ci + -v /mnt/nvme0n1/miles_ci/models:/root/models + -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + strategy: + fail-fast: false + matrix: + info: [{"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py --async-save"}] + defaults: + run: + working-directory: ${{ github.workspace }} + env: + GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install + shell: bash + run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages + + - name: Execute + shell: bash + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + + e2e-test-long: + if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-long')) + runs-on: self-hosted + container: + image: radixark/miles:latest + options: > + --gpus all + --ipc=host + --shm-size=16g + --ulimit memlock=-1 + --ulimit stack=67108864 + --memory=0 + --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY + -v /mnt/nvme0n1/miles_ci:/data/miles_ci + -v /mnt/nvme0n1/miles_ci/models:/root/models + -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + strategy: + fail-fast: false + matrix: + info: [{"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}] + defaults: + run: + working-directory: ${{ github.workspace }} + env: + GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install + shell: bash + run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages + + - name: Execute + shell: bash + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + + e2e-test-image: + if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-image')) + runs-on: self-hosted + container: + image: radixark/miles-test:latest + options: > + --gpus all + --ipc=host + --shm-size=16g + --ulimit memlock=-1 + --ulimit stack=67108864 + --memory=0 + --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY + -v /mnt/nvme0n1/miles_ci:/data/miles_ci + -v /mnt/nvme0n1/miles_ci/models:/root/models + -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + strategy: + fail-fast: false + matrix: + info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"}, {"num_gpus": 2, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}, {"num_gpus": 8, "test_file": "test_quick_start_glm4_9B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_30B_A3B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ppo.py"}, {"num_gpus": 8, "test_file": "test_moonlight_16B_A3B.py"}, {"num_gpus": 8, "test_file": "test_mimo_7B_mtp_only_grad.py"}, {"num_gpus": 8, "test_file": "test_qwen3_0.6B_parallel_check.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py --async-save"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}] + defaults: + run: + working-directory: ${{ github.workspace }} + env: + GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} + MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install + shell: bash + run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages + + - name: Execute + shell: bash + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} \ No newline at end of file diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 263f4a591..25bb2bce2 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,6 +107,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -133,4 +137,4 @@ jobs: - name: Execute shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} -<% endfor %> +<% endfor %> \ No newline at end of file From b44926d166edd2fb058fa32d1f4a987d5a81f0ab Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sat, 17 Jan 2026 23:48:18 +0000 Subject: [PATCH 13/65] update --- .github/workflows/pr-test.yml | 56 ++++++++++++++++---------------- .github/workflows/pr-test.yml.j2 | 8 ++--- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 569f02e88..f5ba4a668 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,10 +38,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + -e http_proxy=${{ env.http_proxy || '' }} + -e https_proxy=${{ env.https_proxy || '' }} + -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -82,10 +82,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + -e http_proxy=${{ env.http_proxy || '' }} + -e https_proxy=${{ env.https_proxy || '' }} + -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -126,10 +126,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + -e http_proxy=${{ env.http_proxy || '' }} + -e https_proxy=${{ env.https_proxy || '' }} + -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -170,10 +170,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + -e http_proxy=${{ env.http_proxy || '' }} + -e https_proxy=${{ env.https_proxy || '' }} + -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -214,10 +214,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + -e http_proxy=${{ env.http_proxy || '' }} + -e https_proxy=${{ env.https_proxy || '' }} + -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -258,10 +258,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + -e http_proxy=${{ env.http_proxy || '' }} + -e https_proxy=${{ env.https_proxy || '' }} + -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -302,10 +302,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + -e http_proxy=${{ env.http_proxy || '' }} + -e https_proxy=${{ env.https_proxy || '' }} + -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 25bb2bce2..d13bdc849 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,10 +107,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY + -e http_proxy=${{ env.http_proxy || '' }} + -e https_proxy=${{ env.https_proxy || '' }} + -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From c0e729c54bf5ecba5c6f40f9b9b18bcd4f479785 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 00:01:06 +0000 Subject: [PATCH 14/65] update --- .github/workflows/pr-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f5ba4a668..f3b6d059b 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -331,4 +331,4 @@ jobs: - name: Execute shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} \ No newline at end of file + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} From 7823fc3d81dafa19bc194b15d62eb4704a8ad3de Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 00:08:13 +0000 Subject: [PATCH 15/65] update --- .github/workflows/pr-test.yml | 2 +- .github/workflows/pr-test.yml.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f3b6d059b..3fdc699fc 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -10,7 +10,7 @@ on: # branches: [main] pull_request: branches: [main] - types: [synchronize, labeled] + types: [opened, synchronize, labeled] workflow_dispatch: inputs: infinite_run: diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index d13bdc849..0fbaa5740 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -79,7 +79,7 @@ on: # branches: [main] pull_request: branches: [main] - types: [synchronize, labeled] + types: [opened, synchronize, labeled] workflow_dispatch: inputs: infinite_run: From bd933da77f20b0190a13ce69f96b0dbfc34145de Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 00:26:59 +0000 Subject: [PATCH 16/65] update --- .github/workflows/pr-test.yml | 2 +- .github/workflows/pr-test.yml.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 3fdc699fc..f3b6d059b 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -10,7 +10,7 @@ on: # branches: [main] pull_request: branches: [main] - types: [opened, synchronize, labeled] + types: [synchronize, labeled] workflow_dispatch: inputs: infinite_run: diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 0fbaa5740..d13bdc849 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -79,7 +79,7 @@ on: # branches: [main] pull_request: branches: [main] - types: [opened, synchronize, labeled] + types: [synchronize, labeled] workflow_dispatch: inputs: infinite_run: From 46e584befc3a87a9a0aa1978b78a9ddbc3a92743 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 00:32:55 +0000 Subject: [PATCH 17/65] update --- .github/workflows/pr-test.yml | 56 ++++++++++++++++---------------- .github/workflows/pr-test.yml.j2 | 8 ++--- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f3b6d059b..b4a6d2957 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,10 +38,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ env.http_proxy || '' }} - -e https_proxy=${{ env.https_proxy || '' }} - -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} + -e http_proxy=${{ vars.HTTP_PROXY || '' }} + -e https_proxy=${{ vars.HTTPS_PROXY || '' }} + -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -82,10 +82,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ env.http_proxy || '' }} - -e https_proxy=${{ env.https_proxy || '' }} - -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} + -e http_proxy=${{ vars.HTTP_PROXY || '' }} + -e https_proxy=${{ vars.HTTPS_PROXY || '' }} + -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -126,10 +126,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ env.http_proxy || '' }} - -e https_proxy=${{ env.https_proxy || '' }} - -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} + -e http_proxy=${{ vars.HTTP_PROXY || '' }} + -e https_proxy=${{ vars.HTTPS_PROXY || '' }} + -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -170,10 +170,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ env.http_proxy || '' }} - -e https_proxy=${{ env.https_proxy || '' }} - -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} + -e http_proxy=${{ vars.HTTP_PROXY || '' }} + -e https_proxy=${{ vars.HTTPS_PROXY || '' }} + -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -214,10 +214,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ env.http_proxy || '' }} - -e https_proxy=${{ env.https_proxy || '' }} - -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} + -e http_proxy=${{ vars.HTTP_PROXY || '' }} + -e https_proxy=${{ vars.HTTPS_PROXY || '' }} + -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -258,10 +258,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ env.http_proxy || '' }} - -e https_proxy=${{ env.https_proxy || '' }} - -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} + -e http_proxy=${{ vars.HTTP_PROXY || '' }} + -e https_proxy=${{ vars.HTTPS_PROXY || '' }} + -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -302,10 +302,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ env.http_proxy || '' }} - -e https_proxy=${{ env.https_proxy || '' }} - -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} + -e http_proxy=${{ vars.HTTP_PROXY || '' }} + -e https_proxy=${{ vars.HTTPS_PROXY || '' }} + -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index d13bdc849..9afd8e74d 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,10 +107,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ env.http_proxy || '' }} - -e https_proxy=${{ env.https_proxy || '' }} - -e HTTP_PROXY=${{ env.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ env.HTTPS_PROXY || '' }} + -e http_proxy=${{ vars.HTTP_PROXY || '' }} + -e https_proxy=${{ vars.HTTPS_PROXY || '' }} + -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} + -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From df42816113c2e8ffd302c2592586d989aba1cd2b Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 00:41:27 +0000 Subject: [PATCH 18/65] update --- del.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 del.txt diff --git a/del.txt b/del.txt new file mode 100644 index 000000000..190a18037 --- /dev/null +++ b/del.txt @@ -0,0 +1 @@ +123 From e56305ebadbabb74dfb153672495b88964988508 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 00:41:50 +0000 Subject: [PATCH 19/65] update --- del.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 del.txt diff --git a/del.txt b/del.txt deleted file mode 100644 index 190a18037..000000000 --- a/del.txt +++ /dev/null @@ -1 +0,0 @@ -123 From 3b19e590fa8cc62e578526f9a0a5c3ef25791f06 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 00:49:53 +0000 Subject: [PATCH 20/65] update --- .github/workflows/pr-test.yml.j2 | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 9afd8e74d..9b14ba918 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -130,6 +130,15 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray environment + shell: bash + run: | + # Kill any existing Ray processes + ray stop --force || true + pkill -9 ray || true + # Clean up environment variables + unset RAY_ADDRESS || true + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages From f144f309256ead52f419a9995e57346beb368c8e Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 00:53:11 +0000 Subject: [PATCH 21/65] update --- .github/workflows/pr-test.yml | 63 ++++++++++++++++++++++++++++++++ .github/workflows/pr-test.yml.j2 | 14 +++---- 2 files changed, 70 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index b4a6d2957..23229c06f 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -61,6 +61,15 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray environment + shell: bash + run: | + # Kill any existing Ray processes + ray stop --force || true + pkill -9 ray || true + # Clean up environment variables + unset RAY_ADDRESS || true + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -105,6 +114,15 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray environment + shell: bash + run: | + # Kill any existing Ray processes + ray stop --force || true + pkill -9 ray || true + # Clean up environment variables + unset RAY_ADDRESS || true + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -149,6 +167,15 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray environment + shell: bash + run: | + # Kill any existing Ray processes + ray stop --force || true + pkill -9 ray || true + # Clean up environment variables + unset RAY_ADDRESS || true + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -193,6 +220,15 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray environment + shell: bash + run: | + # Kill any existing Ray processes + ray stop --force || true + pkill -9 ray || true + # Clean up environment variables + unset RAY_ADDRESS || true + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -237,6 +273,15 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray environment + shell: bash + run: | + # Kill any existing Ray processes + ray stop --force || true + pkill -9 ray || true + # Clean up environment variables + unset RAY_ADDRESS || true + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -281,6 +326,15 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray environment + shell: bash + run: | + # Kill any existing Ray processes + ray stop --force || true + pkill -9 ray || true + # Clean up environment variables + unset RAY_ADDRESS || true + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -325,6 +379,15 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray environment + shell: bash + run: | + # Kill any existing Ray processes + ray stop --force || true + pkill -9 ray || true + # Clean up environment variables + unset RAY_ADDRESS || true + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 9b14ba918..d8d9258d1 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -131,13 +131,13 @@ jobs: uses: actions/checkout@v4 - name: Cleanup Ray environment - shell: bash - run: | - # Kill any existing Ray processes - ray stop --force || true - pkill -9 ray || true - # Clean up environment variables - unset RAY_ADDRESS || true + shell: bash + run: | + # Kill any existing Ray processes + ray stop --force || true + pkill -9 ray || true + # Clean up environment variables + unset RAY_ADDRESS || true - name: Install shell: bash From 795c2830576364d60a665730730b652693a37ee9 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 01:03:05 +0000 Subject: [PATCH 22/65] update --- .github/workflows/pr-test.yml | 14 ++++++++++++++ .github/workflows/pr-test.yml.j2 | 2 ++ 2 files changed, 16 insertions(+) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 23229c06f..3f6413abb 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -76,6 +76,8 @@ jobs: - name: Execute shell: bash + env: + RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-fsdp: @@ -129,6 +131,8 @@ jobs: - name: Execute shell: bash + env: + RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-megatron: @@ -182,6 +186,8 @@ jobs: - name: Execute shell: bash + env: + RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-precision: @@ -235,6 +241,8 @@ jobs: - name: Execute shell: bash + env: + RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-ckpt: @@ -288,6 +296,8 @@ jobs: - name: Execute shell: bash + env: + RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-long: @@ -341,6 +351,8 @@ jobs: - name: Execute shell: bash + env: + RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-image: @@ -394,4 +406,6 @@ jobs: - name: Execute shell: bash + env: + RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index d8d9258d1..19f27cfe1 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -145,5 +145,7 @@ jobs: - name: Execute shell: bash + env: + RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} <% endfor %> \ No newline at end of file From 6febe6bbf84b50092e5224e06ace3cba6d48dfcc Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 01:06:21 +0000 Subject: [PATCH 23/65] update --- .github/workflows/pr-test.yml | 63 -------------------------------- .github/workflows/pr-test.yml.j2 | 9 ----- 2 files changed, 72 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 3f6413abb..acd8aeb47 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -61,15 +61,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Cleanup Ray environment - shell: bash - run: | - # Kill any existing Ray processes - ray stop --force || true - pkill -9 ray || true - # Clean up environment variables - unset RAY_ADDRESS || true - - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -116,15 +107,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Cleanup Ray environment - shell: bash - run: | - # Kill any existing Ray processes - ray stop --force || true - pkill -9 ray || true - # Clean up environment variables - unset RAY_ADDRESS || true - - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -171,15 +153,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Cleanup Ray environment - shell: bash - run: | - # Kill any existing Ray processes - ray stop --force || true - pkill -9 ray || true - # Clean up environment variables - unset RAY_ADDRESS || true - - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -226,15 +199,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Cleanup Ray environment - shell: bash - run: | - # Kill any existing Ray processes - ray stop --force || true - pkill -9 ray || true - # Clean up environment variables - unset RAY_ADDRESS || true - - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -281,15 +245,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Cleanup Ray environment - shell: bash - run: | - # Kill any existing Ray processes - ray stop --force || true - pkill -9 ray || true - # Clean up environment variables - unset RAY_ADDRESS || true - - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -336,15 +291,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Cleanup Ray environment - shell: bash - run: | - # Kill any existing Ray processes - ray stop --force || true - pkill -9 ray || true - # Clean up environment variables - unset RAY_ADDRESS || true - - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -391,15 +337,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Cleanup Ray environment - shell: bash - run: | - # Kill any existing Ray processes - ray stop --force || true - pkill -9 ray || true - # Clean up environment variables - unset RAY_ADDRESS || true - - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 19f27cfe1..6aa831ec9 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -130,15 +130,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Cleanup Ray environment - shell: bash - run: | - # Kill any existing Ray processes - ray stop --force || true - pkill -9 ray || true - # Clean up environment variables - unset RAY_ADDRESS || true - - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages From fa32531aba1db40e04f325f82af396429e2b4586 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 01:18:11 +0000 Subject: [PATCH 24/65] update --- del.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 del.txt diff --git a/del.txt b/del.txt new file mode 100644 index 000000000..d72af3146 --- /dev/null +++ b/del.txt @@ -0,0 +1 @@ +asd From 1441193a4661a4b0f7a9b4d0157ab4086229ce99 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 01:31:28 +0000 Subject: [PATCH 25/65] update --- del.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 del.txt diff --git a/del.txt b/del.txt deleted file mode 100644 index d72af3146..000000000 --- a/del.txt +++ /dev/null @@ -1 +0,0 @@ -asd From 73e9a121d2e977a44a568c1bda21fe327591f987 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 01:41:23 +0000 Subject: [PATCH 26/65] update --- .github/workflows/pr-test.yml.j2 | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 6aa831ec9..9afd8e74d 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -136,7 +136,5 @@ jobs: - name: Execute shell: bash - env: - RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} <% endfor %> \ No newline at end of file From 7153c4cd6176c1db0ad4d98d145c07deb0d2b372 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 01:54:54 +0000 Subject: [PATCH 27/65] update --- .github/workflows/pr-test.yml | 42 -------------------------------- .github/workflows/pr-test.yml.j2 | 4 --- 2 files changed, 46 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index acd8aeb47..2c2002ea9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,10 +38,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ vars.HTTP_PROXY || '' }} - -e https_proxy=${{ vars.HTTPS_PROXY || '' }} - -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -67,8 +63,6 @@ jobs: - name: Execute shell: bash - env: - RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-fsdp: @@ -84,10 +78,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ vars.HTTP_PROXY || '' }} - -e https_proxy=${{ vars.HTTPS_PROXY || '' }} - -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -113,8 +103,6 @@ jobs: - name: Execute shell: bash - env: - RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-megatron: @@ -130,10 +118,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ vars.HTTP_PROXY || '' }} - -e https_proxy=${{ vars.HTTPS_PROXY || '' }} - -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -159,8 +143,6 @@ jobs: - name: Execute shell: bash - env: - RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-precision: @@ -176,10 +158,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ vars.HTTP_PROXY || '' }} - -e https_proxy=${{ vars.HTTPS_PROXY || '' }} - -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -205,8 +183,6 @@ jobs: - name: Execute shell: bash - env: - RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-ckpt: @@ -222,10 +198,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ vars.HTTP_PROXY || '' }} - -e https_proxy=${{ vars.HTTPS_PROXY || '' }} - -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -251,8 +223,6 @@ jobs: - name: Execute shell: bash - env: - RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-long: @@ -268,10 +238,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ vars.HTTP_PROXY || '' }} - -e https_proxy=${{ vars.HTTPS_PROXY || '' }} - -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -297,8 +263,6 @@ jobs: - name: Execute shell: bash - env: - RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-image: @@ -314,10 +278,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ vars.HTTP_PROXY || '' }} - -e https_proxy=${{ vars.HTTPS_PROXY || '' }} - -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -343,6 +303,4 @@ jobs: - name: Execute shell: bash - env: - RAY_ADDRESS: "" run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 9afd8e74d..50fe1e76b 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,10 +107,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=${{ vars.HTTP_PROXY || '' }} - -e https_proxy=${{ vars.HTTPS_PROXY || '' }} - -e HTTP_PROXY=${{ vars.HTTP_PROXY || '' }} - -e HTTPS_PROXY=${{ vars.HTTPS_PROXY || '' }} -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From d2cb61d7f74e883b04e4f38ef63b85b233adc07b Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 01:59:23 +0000 Subject: [PATCH 28/65] fix ci test --- tests/test_qwen2.5_0.5B_gsm8k_async_short.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_qwen2.5_0.5B_gsm8k_async_short.py b/tests/test_qwen2.5_0.5B_gsm8k_async_short.py index d55262cd0..ff98abc61 100644 --- a/tests/test_qwen2.5_0.5B_gsm8k_async_short.py +++ b/tests/test_qwen2.5_0.5B_gsm8k_async_short.py @@ -123,8 +123,8 @@ def execute(): if __name__ == "__main__": prepare() - os.environ.pop("http_proxy") - os.environ.pop("https_proxy") - os.environ.pop("HTTP_PROXY") - os.environ.pop("HTTPS_PROXY") + # os.environ.pop("http_proxy") + # os.environ.pop("https_proxy") + # os.environ.pop("HTTP_PROXY") + # os.environ.pop("HTTPS_PROXY") execute() From e2ce0feaa3ab9f6eb99cdebc3b97f62a03e58f9c Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 02:12:30 +0000 Subject: [PATCH 29/65] fix --- tests/test_qwen2.5_0.5B_gsm8k_async_short.py | 8 ++++---- tests/test_qwen2.5_0.5B_gsm8k_short.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_qwen2.5_0.5B_gsm8k_async_short.py b/tests/test_qwen2.5_0.5B_gsm8k_async_short.py index ff98abc61..dfadaee00 100644 --- a/tests/test_qwen2.5_0.5B_gsm8k_async_short.py +++ b/tests/test_qwen2.5_0.5B_gsm8k_async_short.py @@ -123,8 +123,8 @@ def execute(): if __name__ == "__main__": prepare() - # os.environ.pop("http_proxy") - # os.environ.pop("https_proxy") - # os.environ.pop("HTTP_PROXY") - # os.environ.pop("HTTPS_PROXY") + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + os.environ.pop("HTTP_PROXY", None) + os.environ.pop("HTTPS_PROXY", None) execute() diff --git a/tests/test_qwen2.5_0.5B_gsm8k_short.py b/tests/test_qwen2.5_0.5B_gsm8k_short.py index afbffbc56..77cf02bf1 100644 --- a/tests/test_qwen2.5_0.5B_gsm8k_short.py +++ b/tests/test_qwen2.5_0.5B_gsm8k_short.py @@ -122,8 +122,8 @@ def execute(): if __name__ == "__main__": prepare() - os.environ.pop("http_proxy") - os.environ.pop("https_proxy") - os.environ.pop("HTTP_PROXY") - os.environ.pop("HTTPS_PROXY") + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + os.environ.pop("HTTP_PROXY", None) + os.environ.pop("HTTPS_PROXY", None) execute() From 5acfadda4f74d5621a793de212895819125b543b Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 02:22:54 +0000 Subject: [PATCH 30/65] update --- miles/utils/external_utils/command_utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index 8c7c9316b..a1d865c04 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -109,6 +109,10 @@ def execute_train( train_backend_fsdp = "--train-backend fsdp" in train_args assert train_backend_fsdp == (megatron_model_type is None) + # Clear proxy environment variables to prevent Ray communication issues + for proxy_var in ["http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"]: + os.environ.pop(proxy_var, None) + exec_command( "pkill -9 sglang; " "sleep 3; " @@ -130,8 +134,10 @@ def execute_train( exec_command( # will prevent ray from buffering stdout/stderr f"export PYTHONBUFFERED=16 && " - f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-stats" + f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265" ) + # Wait for Ray to be fully initialized + time.sleep(3) if (f := before_ray_job_submit) is not None: f() From c9dc55974db92cb45e3b23b302015fa26b5c3b37 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 02:27:13 +0000 Subject: [PATCH 31/65] update --- miles/utils/external_utils/command_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index a1d865c04..1ede7bf6e 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -133,11 +133,12 @@ def execute_train( if not external_ray: exec_command( # will prevent ray from buffering stdout/stderr + f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && " f"export PYTHONBUFFERED=16 && " - f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265" + f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-statsd" ) # Wait for Ray to be fully initialized - time.sleep(3) + time.sleep(5) if (f := before_ray_job_submit) is not None: f() @@ -181,6 +182,7 @@ def execute_train( else "" ) exec_command( + f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && " f"export no_proxy=127.0.0.1 && export PYTHONBUFFERED=16 && " f"{cmd_megatron_model_source}" f'ray job submit --address="http://127.0.0.1:8265" ' From 5680741ed59a6cca772445fc58d63d7368ff3c16 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 02:29:29 +0000 Subject: [PATCH 32/65] update --- miles/utils/external_utils/command_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index 1ede7bf6e..142068afe 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -135,7 +135,7 @@ def execute_train( # will prevent ray from buffering stdout/stderr f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && " f"export PYTHONBUFFERED=16 && " - f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-statsd" + f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-stats" ) # Wait for Ray to be fully initialized time.sleep(5) From 3f72150fab34b36bf38cfd8d380a7c59262174d3 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 02:34:30 +0000 Subject: [PATCH 33/65] update --- miles/utils/external_utils/command_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index 142068afe..0989b5e50 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -139,6 +139,7 @@ def execute_train( ) # Wait for Ray to be fully initialized time.sleep(5) + exec_command("ray status") if (f := before_ray_job_submit) is not None: f() From a099bb554ca61b0cdb349c37f8bee4ba4610c03b Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 02:39:37 +0000 Subject: [PATCH 34/65] update --- .github/workflows/pr-test.yml | 42 ++++++++++++++++++++++++++++++++ .github/workflows/pr-test.yml.j2 | 6 +++++ 2 files changed, 48 insertions(+) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 2c2002ea9..34adf8d17 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -52,6 +52,12 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + # Clear proxy settings to prevent Ray communication issues + http_proxy: "" + https_proxy: "" + HTTP_PROXY: "" + HTTPS_PROXY: "" + no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -92,6 +98,12 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + # Clear proxy settings to prevent Ray communication issues + http_proxy: "" + https_proxy: "" + HTTP_PROXY: "" + HTTPS_PROXY: "" + no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -132,6 +144,12 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + # Clear proxy settings to prevent Ray communication issues + http_proxy: "" + https_proxy: "" + HTTP_PROXY: "" + HTTPS_PROXY: "" + no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -172,6 +190,12 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + # Clear proxy settings to prevent Ray communication issues + http_proxy: "" + https_proxy: "" + HTTP_PROXY: "" + HTTPS_PROXY: "" + no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -212,6 +236,12 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + # Clear proxy settings to prevent Ray communication issues + http_proxy: "" + https_proxy: "" + HTTP_PROXY: "" + HTTPS_PROXY: "" + no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -252,6 +282,12 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + # Clear proxy settings to prevent Ray communication issues + http_proxy: "" + https_proxy: "" + HTTP_PROXY: "" + HTTPS_PROXY: "" + no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -292,6 +328,12 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + # Clear proxy settings to prevent Ray communication issues + http_proxy: "" + https_proxy: "" + HTTP_PROXY: "" + HTTPS_PROXY: "" + no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 50fe1e76b..68d96f6c4 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -121,6 +121,12 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + # Clear proxy settings to prevent Ray communication issues + http_proxy: "" + https_proxy: "" + HTTP_PROXY: "" + HTTPS_PROXY: "" + no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository From 537e28291d2b50c369afad5febcd55f704ff6e89 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 02:42:03 +0000 Subject: [PATCH 35/65] update --- miles/utils/external_utils/command_utils.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index 0989b5e50..f0a30fa07 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -109,10 +109,6 @@ def execute_train( train_backend_fsdp = "--train-backend fsdp" in train_args assert train_backend_fsdp == (megatron_model_type is None) - # Clear proxy environment variables to prevent Ray communication issues - for proxy_var in ["http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"]: - os.environ.pop(proxy_var, None) - exec_command( "pkill -9 sglang; " "sleep 3; " @@ -133,13 +129,9 @@ def execute_train( if not external_ray: exec_command( # will prevent ray from buffering stdout/stderr - f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && " f"export PYTHONBUFFERED=16 && " f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-stats" ) - # Wait for Ray to be fully initialized - time.sleep(5) - exec_command("ray status") if (f := before_ray_job_submit) is not None: f() @@ -183,7 +175,6 @@ def execute_train( else "" ) exec_command( - f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && " f"export no_proxy=127.0.0.1 && export PYTHONBUFFERED=16 && " f"{cmd_megatron_model_source}" f'ray job submit --address="http://127.0.0.1:8265" ' @@ -277,4 +268,4 @@ def save_to_temp_file(text: str, ext: str): "H100": "Hopper", "GB200": "Blackwell", "GB300": "Blackwell", -} +} \ No newline at end of file From 27f3a774503e8d4428bb32f7cf44cb89d5a189df Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 02:44:04 +0000 Subject: [PATCH 36/65] update --- miles/utils/external_utils/command_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index f0a30fa07..8c7c9316b 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -268,4 +268,4 @@ def save_to_temp_file(text: str, ext: str): "H100": "Hopper", "GB200": "Blackwell", "GB300": "Blackwell", -} \ No newline at end of file +} From 640ac7a4ea00f0c4dea2a800c4b8b2d7eaf069f2 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 02:46:11 +0000 Subject: [PATCH 37/65] update --- del.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 del.txt diff --git a/del.txt b/del.txt new file mode 100644 index 000000000..73ca374fe --- /dev/null +++ b/del.txt @@ -0,0 +1 @@ +sdasd From 62bfa826d39dd1d10fd1b438055ae179c7bebd88 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 02:51:27 +0000 Subject: [PATCH 38/65] update --- .github/workflows/pr-test.yml | 42 -------------------------------- .github/workflows/pr-test.yml.j2 | 6 ----- 2 files changed, 48 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 34adf8d17..2c2002ea9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -52,12 +52,6 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - # Clear proxy settings to prevent Ray communication issues - http_proxy: "" - https_proxy: "" - HTTP_PROXY: "" - HTTPS_PROXY: "" - no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -98,12 +92,6 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - # Clear proxy settings to prevent Ray communication issues - http_proxy: "" - https_proxy: "" - HTTP_PROXY: "" - HTTPS_PROXY: "" - no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -144,12 +132,6 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - # Clear proxy settings to prevent Ray communication issues - http_proxy: "" - https_proxy: "" - HTTP_PROXY: "" - HTTPS_PROXY: "" - no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -190,12 +172,6 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - # Clear proxy settings to prevent Ray communication issues - http_proxy: "" - https_proxy: "" - HTTP_PROXY: "" - HTTPS_PROXY: "" - no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -236,12 +212,6 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - # Clear proxy settings to prevent Ray communication issues - http_proxy: "" - https_proxy: "" - HTTP_PROXY: "" - HTTPS_PROXY: "" - no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -282,12 +252,6 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - # Clear proxy settings to prevent Ray communication issues - http_proxy: "" - https_proxy: "" - HTTP_PROXY: "" - HTTPS_PROXY: "" - no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository @@ -328,12 +292,6 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - # Clear proxy settings to prevent Ray communication issues - http_proxy: "" - https_proxy: "" - HTTP_PROXY: "" - HTTPS_PROXY: "" - no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 68d96f6c4..50fe1e76b 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -121,12 +121,6 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} - # Clear proxy settings to prevent Ray communication issues - http_proxy: "" - https_proxy: "" - HTTP_PROXY: "" - HTTPS_PROXY: "" - no_proxy: "127.0.0.1,localhost" steps: - name: Checkout repository From cb24e9c299e467e3dbf8436e2ba700741b14f3d0 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:01:17 +0000 Subject: [PATCH 39/65] update --- .github/workflows/pr-test.yml | 35 ++++++++++++++++----- .github/workflows/pr-test.yml.j2 | 5 ++- miles/utils/external_utils/command_utils.py | 6 ++++ 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 2c2002ea9..a96ef5182 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -63,7 +63,10 @@ jobs: - name: Execute shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS + export no_proxy="127.0.0.1,localhost" + python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-fsdp: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-fsdp')) @@ -103,7 +106,10 @@ jobs: - name: Execute shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS + export no_proxy="127.0.0.1,localhost" + python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-megatron: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-megatron')) @@ -143,7 +149,10 @@ jobs: - name: Execute shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS + export no_proxy="127.0.0.1,localhost" + python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-precision: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-precision')) @@ -183,7 +192,10 @@ jobs: - name: Execute shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS + export no_proxy="127.0.0.1,localhost" + python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-ckpt: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-ckpt')) @@ -223,7 +235,10 @@ jobs: - name: Execute shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS + export no_proxy="127.0.0.1,localhost" + python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-long: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-long')) @@ -263,7 +278,10 @@ jobs: - name: Execute shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS + export no_proxy="127.0.0.1,localhost" + python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-image: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-image')) @@ -303,4 +321,7 @@ jobs: - name: Execute shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS + export no_proxy="127.0.0.1,localhost" + python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 50fe1e76b..6b711e645 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -132,5 +132,8 @@ jobs: - name: Execute shell: bash - run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS + export no_proxy="127.0.0.1,localhost" + python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} <% endfor %> \ No newline at end of file diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index 8c7c9316b..0b75593c5 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -109,6 +109,10 @@ def execute_train( train_backend_fsdp = "--train-backend fsdp" in train_args assert train_backend_fsdp == (megatron_model_type is None) + # Clear proxy and Ray address environment variables to prevent communication issues + for env_var in ["http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY", "RAY_ADDRESS"]: + os.environ.pop(env_var, None) + exec_command( "pkill -9 sglang; " "sleep 3; " @@ -129,6 +133,7 @@ def execute_train( if not external_ray: exec_command( # will prevent ray from buffering stdout/stderr + f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS && " f"export PYTHONBUFFERED=16 && " f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-stats" ) @@ -175,6 +180,7 @@ def execute_train( else "" ) exec_command( + f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS && " f"export no_proxy=127.0.0.1 && export PYTHONBUFFERED=16 && " f"{cmd_megatron_model_source}" f'ray job submit --address="http://127.0.0.1:8265" ' From 52968a3010a5424991e689c8d7f706731530336b Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:06:33 +0000 Subject: [PATCH 40/65] update --- miles/utils/external_utils/command_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index 0b75593c5..998705d5d 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -157,6 +157,12 @@ def execute_train( "no_proxy": f"127.0.0.1,{master_addr}", # This is needed by megatron / torch distributed in multi-node setup "MASTER_ADDR": master_addr, + # Clear proxy and RAY_ADDRESS to prevent communication issues + "http_proxy": "", + "https_proxy": "", + "HTTP_PROXY": "", + "HTTPS_PROXY": "", + "RAY_ADDRESS": "", # Clear to use default Ray connection **( { "CUDA_ENABLE_COREDUMP_ON_EXCEPTION": "1", From 8bcb7f0db146b3e0fd60a6641e577fba9a155ce2 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:15:33 +0000 Subject: [PATCH 41/65] update --- miles/utils/external_utils/command_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index 998705d5d..00399912b 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -162,7 +162,7 @@ def execute_train( "https_proxy": "", "HTTP_PROXY": "", "HTTPS_PROXY": "", - "RAY_ADDRESS": "", # Clear to use default Ray connection + "RAY_ADDRESS": "127.0.0.1:6379", # Clear to use default Ray connection **( { "CUDA_ENABLE_COREDUMP_ON_EXCEPTION": "1", From bb95d36540ab82ebff78aa26237d89ae9c7a8f91 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:19:31 +0000 Subject: [PATCH 42/65] update --- .github/workflows/pr-test.yml | 7 +++++++ .github/workflows/pr-test.yml.j2 | 1 + 2 files changed, 8 insertions(+) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index a96ef5182..62f7c0412 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -33,6 +33,7 @@ jobs: options: > --gpus all --ipc=host + --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -76,6 +77,7 @@ jobs: options: > --gpus all --ipc=host + --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -119,6 +121,7 @@ jobs: options: > --gpus all --ipc=host + --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -162,6 +165,7 @@ jobs: options: > --gpus all --ipc=host + --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -205,6 +209,7 @@ jobs: options: > --gpus all --ipc=host + --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -248,6 +253,7 @@ jobs: options: > --gpus all --ipc=host + --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -291,6 +297,7 @@ jobs: options: > --gpus all --ipc=host + --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 6b711e645..77f82b7de 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -102,6 +102,7 @@ jobs: options: > --gpus all --ipc=host + --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 From ea52de921eafdd2926fbb3e68a2c3035420fbeb7 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:25:50 +0000 Subject: [PATCH 43/65] update --- .github/workflows/pr-test.yml | 7 ------- .github/workflows/pr-test.yml.j2 | 1 - miles/utils/external_utils/command_utils.py | 2 +- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 62f7c0412..a96ef5182 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -33,7 +33,6 @@ jobs: options: > --gpus all --ipc=host - --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -77,7 +76,6 @@ jobs: options: > --gpus all --ipc=host - --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -121,7 +119,6 @@ jobs: options: > --gpus all --ipc=host - --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -165,7 +162,6 @@ jobs: options: > --gpus all --ipc=host - --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -209,7 +205,6 @@ jobs: options: > --gpus all --ipc=host - --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -253,7 +248,6 @@ jobs: options: > --gpus all --ipc=host - --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 @@ -297,7 +291,6 @@ jobs: options: > --gpus all --ipc=host - --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 77f82b7de..6b711e645 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -102,7 +102,6 @@ jobs: options: > --gpus all --ipc=host - --network=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index 00399912b..bef2e388a 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -135,7 +135,7 @@ def execute_train( # will prevent ray from buffering stdout/stderr f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS && " f"export PYTHONBUFFERED=16 && " - f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-stats" + f"ray start --head --num-gpus {num_gpus_per_node} --disable-usage-stats" ) if (f := before_ray_job_submit) is not None: From 08ab3fa339dec5ee7fcbffedcfc7434862fec003 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:39:37 +0000 Subject: [PATCH 44/65] update --- miles/utils/external_utils/command_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index bef2e388a..042e7b5f6 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -133,10 +133,14 @@ def execute_train( if not external_ray: exec_command( # will prevent ray from buffering stdout/stderr + # Force Ray to use 127.0.0.1 for all internal communication to avoid Docker network issues f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS && " f"export PYTHONBUFFERED=16 && " - f"ray start --head --num-gpus {num_gpus_per_node} --disable-usage-stats" + f"export RAY_ADDRESS=127.0.0.1:6379 && " + f"ray start --head --node-ip-address 127.0.0.1 --num-gpus {num_gpus_per_node} --disable-usage-stats" ) + # Set RAY_ADDRESS in Python process so subsequent exec_command calls inherit it + os.environ["RAY_ADDRESS"] = "127.0.0.1:6379" if (f := before_ray_job_submit) is not None: f() @@ -186,7 +190,9 @@ def execute_train( else "" ) exec_command( - f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS && " + # Clear proxies but explicitly set RAY_ADDRESS to 127.0.0.1:6379 + f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && " + f"export RAY_ADDRESS=127.0.0.1:6379 && " f"export no_proxy=127.0.0.1 && export PYTHONBUFFERED=16 && " f"{cmd_megatron_model_source}" f'ray job submit --address="http://127.0.0.1:8265" ' From 28fd2a9eeaec9d2b005f26bbdfc0d804eb8f3aed Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:43:20 +0000 Subject: [PATCH 45/65] update --- .github/workflows/pr-test.yml | 63 ++++++++++++--------- .github/workflows/pr-test.yml.j2 | 9 +-- miles/utils/external_utils/command_utils.py | 20 +------ 3 files changed, 41 insertions(+), 51 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index a96ef5182..f00faa5a6 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,6 +38,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -63,10 +67,7 @@ jobs: - name: Execute shell: bash - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS - export no_proxy="127.0.0.1,localhost" - python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-fsdp: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-fsdp')) @@ -81,6 +82,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -106,10 +111,7 @@ jobs: - name: Execute shell: bash - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS - export no_proxy="127.0.0.1,localhost" - python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-megatron: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-megatron')) @@ -124,6 +126,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -149,10 +155,7 @@ jobs: - name: Execute shell: bash - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS - export no_proxy="127.0.0.1,localhost" - python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-precision: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-precision')) @@ -167,6 +170,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -192,10 +199,7 @@ jobs: - name: Execute shell: bash - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS - export no_proxy="127.0.0.1,localhost" - python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-ckpt: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-ckpt')) @@ -210,6 +214,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -235,10 +243,7 @@ jobs: - name: Execute shell: bash - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS - export no_proxy="127.0.0.1,localhost" - python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-long: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-long')) @@ -253,6 +258,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -278,10 +287,7 @@ jobs: - name: Execute shell: bash - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS - export no_proxy="127.0.0.1,localhost" - python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} e2e-test-image: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-image')) @@ -296,6 +302,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -321,7 +331,4 @@ jobs: - name: Execute shell: bash - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS - export no_proxy="127.0.0.1,localhost" - python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 6b711e645..25bb2bce2 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,6 +107,10 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -e http_proxy=$http_proxy + -e https_proxy=$https_proxy + -e HTTP_PROXY=$HTTP_PROXY + -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -132,8 +136,5 @@ jobs: - name: Execute shell: bash - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS - export no_proxy="127.0.0.1,localhost" - python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} <% endfor %> \ No newline at end of file diff --git a/miles/utils/external_utils/command_utils.py b/miles/utils/external_utils/command_utils.py index 042e7b5f6..8c7c9316b 100644 --- a/miles/utils/external_utils/command_utils.py +++ b/miles/utils/external_utils/command_utils.py @@ -109,10 +109,6 @@ def execute_train( train_backend_fsdp = "--train-backend fsdp" in train_args assert train_backend_fsdp == (megatron_model_type is None) - # Clear proxy and Ray address environment variables to prevent communication issues - for env_var in ["http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY", "RAY_ADDRESS"]: - os.environ.pop(env_var, None) - exec_command( "pkill -9 sglang; " "sleep 3; " @@ -133,14 +129,9 @@ def execute_train( if not external_ray: exec_command( # will prevent ray from buffering stdout/stderr - # Force Ray to use 127.0.0.1 for all internal communication to avoid Docker network issues - f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY RAY_ADDRESS && " f"export PYTHONBUFFERED=16 && " - f"export RAY_ADDRESS=127.0.0.1:6379 && " - f"ray start --head --node-ip-address 127.0.0.1 --num-gpus {num_gpus_per_node} --disable-usage-stats" + f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-stats" ) - # Set RAY_ADDRESS in Python process so subsequent exec_command calls inherit it - os.environ["RAY_ADDRESS"] = "127.0.0.1:6379" if (f := before_ray_job_submit) is not None: f() @@ -161,12 +152,6 @@ def execute_train( "no_proxy": f"127.0.0.1,{master_addr}", # This is needed by megatron / torch distributed in multi-node setup "MASTER_ADDR": master_addr, - # Clear proxy and RAY_ADDRESS to prevent communication issues - "http_proxy": "", - "https_proxy": "", - "HTTP_PROXY": "", - "HTTPS_PROXY": "", - "RAY_ADDRESS": "127.0.0.1:6379", # Clear to use default Ray connection **( { "CUDA_ENABLE_COREDUMP_ON_EXCEPTION": "1", @@ -190,9 +175,6 @@ def execute_train( else "" ) exec_command( - # Clear proxies but explicitly set RAY_ADDRESS to 127.0.0.1:6379 - f"unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && " - f"export RAY_ADDRESS=127.0.0.1:6379 && " f"export no_proxy=127.0.0.1 && export PYTHONBUFFERED=16 && " f"{cmd_megatron_model_source}" f'ray job submit --address="http://127.0.0.1:8265" ' From 412086bf265cdb62cb9b98a9d42ad0455be1f4f8 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:49:04 +0000 Subject: [PATCH 46/65] update --- .github/workflows/pr-test.yml | 42 ++++++++++++++++++++++++++++++++ .github/workflows/pr-test.yml.j2 | 6 +++++ 2 files changed, 48 insertions(+) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f00faa5a6..6d85ad97b 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -58,6 +58,12 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY + git config --global --unset http.proxy || true + git config --global --unset https.proxy || true + - name: Checkout repository uses: actions/checkout@v4 @@ -102,6 +108,12 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY + git config --global --unset http.proxy || true + git config --global --unset https.proxy || true + - name: Checkout repository uses: actions/checkout@v4 @@ -146,6 +158,12 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY + git config --global --unset http.proxy || true + git config --global --unset https.proxy || true + - name: Checkout repository uses: actions/checkout@v4 @@ -190,6 +208,12 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY + git config --global --unset http.proxy || true + git config --global --unset https.proxy || true + - name: Checkout repository uses: actions/checkout@v4 @@ -234,6 +258,12 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY + git config --global --unset http.proxy || true + git config --global --unset https.proxy || true + - name: Checkout repository uses: actions/checkout@v4 @@ -278,6 +308,12 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY + git config --global --unset http.proxy || true + git config --global --unset https.proxy || true + - name: Checkout repository uses: actions/checkout@v4 @@ -322,6 +358,12 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY + git config --global --unset http.proxy || true + git config --global --unset https.proxy || true + - name: Checkout repository uses: actions/checkout@v4 diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 25bb2bce2..46febee88 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -127,6 +127,12 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: | + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY + git config --global --unset http.proxy || true + git config --global --unset https.proxy || true + - name: Checkout repository uses: actions/checkout@v4 From 6c9a176eed8819fb58d4cf1498fc064995d5cbcc Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:53:27 +0000 Subject: [PATCH 47/65] update --- .github/workflows/pr-test.yml | 42 -------------------------------- .github/workflows/pr-test.yml.j2 | 6 ----- 2 files changed, 48 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 6d85ad97b..f00faa5a6 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -58,12 +58,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - git config --global --unset http.proxy || true - git config --global --unset https.proxy || true - - name: Checkout repository uses: actions/checkout@v4 @@ -108,12 +102,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - git config --global --unset http.proxy || true - git config --global --unset https.proxy || true - - name: Checkout repository uses: actions/checkout@v4 @@ -158,12 +146,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - git config --global --unset http.proxy || true - git config --global --unset https.proxy || true - - name: Checkout repository uses: actions/checkout@v4 @@ -208,12 +190,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - git config --global --unset http.proxy || true - git config --global --unset https.proxy || true - - name: Checkout repository uses: actions/checkout@v4 @@ -258,12 +234,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - git config --global --unset http.proxy || true - git config --global --unset https.proxy || true - - name: Checkout repository uses: actions/checkout@v4 @@ -308,12 +278,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - git config --global --unset http.proxy || true - git config --global --unset https.proxy || true - - name: Checkout repository uses: actions/checkout@v4 @@ -358,12 +322,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - git config --global --unset http.proxy || true - git config --global --unset https.proxy || true - - name: Checkout repository uses: actions/checkout@v4 diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 46febee88..25bb2bce2 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -127,12 +127,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - git config --global --unset http.proxy || true - git config --global --unset https.proxy || true - - name: Checkout repository uses: actions/checkout@v4 From 48cc27f481d33b088b9969d037b96cc68051942d Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:54:55 +0000 Subject: [PATCH 48/65] update --- del.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 del.txt diff --git a/del.txt b/del.txt deleted file mode 100644 index 73ca374fe..000000000 --- a/del.txt +++ /dev/null @@ -1 +0,0 @@ -sdasd From 16545ed8e48bafaea165a4f991a7ef98d77bb81d Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 03:58:00 +0000 Subject: [PATCH 49/65] update --- .github/workflows/pr-test.yml | 28 ++++++++++++++++++++++++++++ .github/workflows/pr-test.yml.j2 | 4 ++++ 2 files changed, 32 insertions(+) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f00faa5a6..23b16ae13 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -58,6 +58,10 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true + + - name: Checkout repository uses: actions/checkout@v4 @@ -102,6 +106,10 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true + + - name: Checkout repository uses: actions/checkout@v4 @@ -146,6 +154,10 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true + + - name: Checkout repository uses: actions/checkout@v4 @@ -190,6 +202,10 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true + + - name: Checkout repository uses: actions/checkout@v4 @@ -234,6 +250,10 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true + + - name: Checkout repository uses: actions/checkout@v4 @@ -278,6 +298,10 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true + + - name: Checkout repository uses: actions/checkout@v4 @@ -322,6 +346,10 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true + + - name: Checkout repository uses: actions/checkout@v4 diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 25bb2bce2..0faddf7fb 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -127,6 +127,10 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: + - name: Clear proxy settings + run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true + + - name: Checkout repository uses: actions/checkout@v4 From c1bdc87408f40c590ad2a6a37bbe0bb505dae0c1 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 04:06:02 +0000 Subject: [PATCH 50/65] remove http becasue need not proxy - VPN --- .github/workflows/pr-test.yml | 56 -------------------------------- .github/workflows/pr-test.yml.j2 | 8 ----- 2 files changed, 64 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 23b16ae13..2c2002ea9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,10 +38,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -58,10 +54,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true - - - name: Checkout repository uses: actions/checkout@v4 @@ -86,10 +78,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -106,10 +94,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true - - - name: Checkout repository uses: actions/checkout@v4 @@ -134,10 +118,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -154,10 +134,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true - - - name: Checkout repository uses: actions/checkout@v4 @@ -182,10 +158,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -202,10 +174,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true - - - name: Checkout repository uses: actions/checkout@v4 @@ -230,10 +198,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -250,10 +214,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true - - - name: Checkout repository uses: actions/checkout@v4 @@ -278,10 +238,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -298,10 +254,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true - - - name: Checkout repository uses: actions/checkout@v4 @@ -326,10 +278,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -346,10 +294,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true - - - name: Checkout repository uses: actions/checkout@v4 diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 0faddf7fb..50fe1e76b 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,10 +107,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -e http_proxy=$http_proxy - -e https_proxy=$https_proxy - -e HTTP_PROXY=$HTTP_PROXY - -e HTTPS_PROXY=$HTTPS_PROXY -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -127,10 +123,6 @@ jobs: MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} steps: - - name: Clear proxy settings - run: unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY && git config --global --unset http.proxy || true && git config --global --unset https.proxy || true - - - name: Checkout repository uses: actions/checkout@v4 From 5166592b37600e63d6253f8dacece34338a39a1f Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 04:16:42 +0000 Subject: [PATCH 51/65] update --- .github/workflows/pr-test.yml | 21 ++++++++++++++------- .github/workflows/pr-test.yml.j2 | 3 ++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 2c2002ea9..f64baa823 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -32,8 +32,9 @@ jobs: image: radixark/miles:latest options: > --gpus all + --network host --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -72,8 +73,9 @@ jobs: image: radixark/miles:latest options: > --gpus all + --network host --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -112,8 +114,9 @@ jobs: image: radixark/miles:latest options: > --gpus all + --network host --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -152,8 +155,9 @@ jobs: image: radixark/miles:latest options: > --gpus all + --network host --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -192,8 +196,9 @@ jobs: image: radixark/miles:latest options: > --gpus all + --network host --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -232,8 +237,9 @@ jobs: image: radixark/miles:latest options: > --gpus all + --network host --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -272,8 +278,9 @@ jobs: image: radixark/miles-test:latest options: > --gpus all + --network host --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 50fe1e76b..278e23831 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -101,8 +101,9 @@ jobs: image: << config.image if config.image else 'radixark/miles:latest' >> options: > --gpus all + --network host --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 From 7c95bb39c59fd358c326f86fbde0973c6d82b6ea Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 04:22:52 +0000 Subject: [PATCH 52/65] update --- .github/workflows/pr-test.yml | 7 ------- .github/workflows/pr-test.yml.j2 | 1 - 2 files changed, 8 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f64baa823..6eb47ede5 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -32,7 +32,6 @@ jobs: image: radixark/miles:latest options: > --gpus all - --network host --ipc=host --shm-size=32g --ulimit memlock=-1 @@ -73,7 +72,6 @@ jobs: image: radixark/miles:latest options: > --gpus all - --network host --ipc=host --shm-size=32g --ulimit memlock=-1 @@ -114,7 +112,6 @@ jobs: image: radixark/miles:latest options: > --gpus all - --network host --ipc=host --shm-size=32g --ulimit memlock=-1 @@ -155,7 +152,6 @@ jobs: image: radixark/miles:latest options: > --gpus all - --network host --ipc=host --shm-size=32g --ulimit memlock=-1 @@ -196,7 +192,6 @@ jobs: image: radixark/miles:latest options: > --gpus all - --network host --ipc=host --shm-size=32g --ulimit memlock=-1 @@ -237,7 +232,6 @@ jobs: image: radixark/miles:latest options: > --gpus all - --network host --ipc=host --shm-size=32g --ulimit memlock=-1 @@ -278,7 +272,6 @@ jobs: image: radixark/miles-test:latest options: > --gpus all - --network host --ipc=host --shm-size=32g --ulimit memlock=-1 diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 278e23831..996d96bad 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -101,7 +101,6 @@ jobs: image: << config.image if config.image else 'radixark/miles:latest' >> options: > --gpus all - --network host --ipc=host --shm-size=32g --ulimit memlock=-1 From 2aabc4c178acf15416ed3de6203e6e5f5993f4ea Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 04:24:46 +0000 Subject: [PATCH 53/65] update --- .github/workflows/pr-test.yml | 14 +++++++------- .github/workflows/pr-test.yml.j2 | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 6eb47ede5..2c2002ea9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -33,7 +33,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=32g + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -73,7 +73,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=32g + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -113,7 +113,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=32g + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -153,7 +153,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=32g + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -193,7 +193,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=32g + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -233,7 +233,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=32g + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -273,7 +273,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=32g + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 996d96bad..50fe1e76b 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -102,7 +102,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=32g + --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 From 00dd900ca8a4870aa270f9ff419b3584677c745e Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 22:48:59 +0000 Subject: [PATCH 54/65] update --- .github/workflows/pr-test.yml.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 50fe1e76b..e3d1b918a 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,6 +107,7 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From 0daaa5b0862932debc9f93749bcb5cd6f75808bc Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 22:51:08 +0000 Subject: [PATCH 55/65] update --- .github/workflows/pr-test.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 2c2002ea9..561acdbfa 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,6 +38,7 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -78,6 +79,7 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -118,6 +120,7 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -158,6 +161,7 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -198,6 +202,7 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -238,6 +243,7 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -278,6 +284,7 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 + -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From 2716d3ae094af6b5e551b646832da00c1cf56b9b Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Sun, 18 Jan 2026 22:53:28 +0000 Subject: [PATCH 56/65] update --- .github/workflows/pr-test.yml | 7 ------- .github/workflows/pr-test.yml.j2 | 1 - 2 files changed, 8 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 561acdbfa..2c2002ea9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -38,7 +38,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -79,7 +78,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -120,7 +118,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -161,7 +158,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -202,7 +198,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -243,7 +238,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets @@ -284,7 +278,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index e3d1b918a..50fe1e76b 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -107,7 +107,6 @@ jobs: --ulimit stack=67108864 --memory=0 --memory-swap=0 - -p 8265:8265 -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets From 14ecd1391ce9490f5c6d05182a36b1a657665b9e Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Mon, 19 Jan 2026 00:42:50 +0000 Subject: [PATCH 57/65] update --- .github/workflows/pr-test.yml | 182 +++++++++++++++++++++++++++++-- .github/workflows/pr-test.yml.j2 | 26 ++++- 2 files changed, 200 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 2c2002ea9..60e37ec5e 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -33,7 +33,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -41,6 +41,9 @@ jobs: -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + --privileged + --ulimit nofile=65535:65535 + -v /tmp:/tmp strategy: fail-fast: false matrix: @@ -57,6 +60,18 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray processes + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + pkill -9 -f gcs_server 2>/dev/null || true + pkill -9 -f 'ray-dashboard' 2>/dev/null || true + pkill -9 sglang 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + sleep 3 + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -65,6 +80,15 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + - name: Post-test cleanup + if: always() + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + e2e-test-fsdp: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-fsdp')) runs-on: self-hosted @@ -73,7 +97,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -81,6 +105,9 @@ jobs: -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + --privileged + --ulimit nofile=65535:65535 + -v /tmp:/tmp strategy: fail-fast: false matrix: @@ -97,6 +124,18 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray processes + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + pkill -9 -f gcs_server 2>/dev/null || true + pkill -9 -f 'ray-dashboard' 2>/dev/null || true + pkill -9 sglang 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + sleep 3 + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -105,6 +144,15 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + - name: Post-test cleanup + if: always() + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + e2e-test-megatron: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-megatron')) runs-on: self-hosted @@ -113,7 +161,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -121,6 +169,9 @@ jobs: -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + --privileged + --ulimit nofile=65535:65535 + -v /tmp:/tmp strategy: fail-fast: false matrix: @@ -137,6 +188,18 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray processes + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + pkill -9 -f gcs_server 2>/dev/null || true + pkill -9 -f 'ray-dashboard' 2>/dev/null || true + pkill -9 sglang 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + sleep 3 + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -145,6 +208,15 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + - name: Post-test cleanup + if: always() + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + e2e-test-precision: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-precision')) runs-on: self-hosted @@ -153,7 +225,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -161,6 +233,9 @@ jobs: -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + --privileged + --ulimit nofile=65535:65535 + -v /tmp:/tmp strategy: fail-fast: false matrix: @@ -177,6 +252,18 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray processes + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + pkill -9 -f gcs_server 2>/dev/null || true + pkill -9 -f 'ray-dashboard' 2>/dev/null || true + pkill -9 sglang 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + sleep 3 + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -185,6 +272,15 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + - name: Post-test cleanup + if: always() + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + e2e-test-ckpt: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-ckpt')) runs-on: self-hosted @@ -193,7 +289,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -201,6 +297,9 @@ jobs: -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + --privileged + --ulimit nofile=65535:65535 + -v /tmp:/tmp strategy: fail-fast: false matrix: @@ -217,6 +316,18 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray processes + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + pkill -9 -f gcs_server 2>/dev/null || true + pkill -9 -f 'ray-dashboard' 2>/dev/null || true + pkill -9 sglang 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + sleep 3 + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -225,6 +336,15 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + - name: Post-test cleanup + if: always() + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + e2e-test-long: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-long')) runs-on: self-hosted @@ -233,7 +353,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -241,6 +361,9 @@ jobs: -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + --privileged + --ulimit nofile=65535:65535 + -v /tmp:/tmp strategy: fail-fast: false matrix: @@ -257,6 +380,18 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray processes + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + pkill -9 -f gcs_server 2>/dev/null || true + pkill -9 -f 'ray-dashboard' 2>/dev/null || true + pkill -9 sglang 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + sleep 3 + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -265,6 +400,15 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + - name: Post-test cleanup + if: always() + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + e2e-test-image: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-image')) runs-on: self-hosted @@ -273,7 +417,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -281,6 +425,9 @@ jobs: -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + --privileged + --ulimit nofile=65535:65535 + -v /tmp:/tmp strategy: fail-fast: false matrix: @@ -297,6 +444,18 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray processes + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + pkill -9 -f gcs_server 2>/dev/null || true + pkill -9 -f 'ray-dashboard' 2>/dev/null || true + pkill -9 sglang 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + sleep 3 + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -304,3 +463,12 @@ jobs: - name: Execute shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + + - name: Post-test cleanup + if: always() + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index 50fe1e76b..1d1022837 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -102,7 +102,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -110,6 +110,9 @@ jobs: -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + --privileged + --ulimit nofile=65535:65535 + -v /tmp:/tmp strategy: fail-fast: false matrix: @@ -126,6 +129,18 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray processes + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + pkill -9 -f gcs_server 2>/dev/null || true + pkill -9 -f 'ray-dashboard' 2>/dev/null || true + pkill -9 sglang 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + sleep 3 + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -133,4 +148,13 @@ jobs: - name: Execute shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} + + - name: Post-test cleanup + if: always() + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true <% endfor %> \ No newline at end of file From 59b192a30a238523fb433c4f53f7e6573ebbef24 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Mon, 19 Jan 2026 01:19:13 +0000 Subject: [PATCH 58/65] update --- tests/test_qwen2.5_0.5B_gsm8k_async_short.py | 6 ++---- tests/test_qwen2.5_0.5B_gsm8k_short.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/test_qwen2.5_0.5B_gsm8k_async_short.py b/tests/test_qwen2.5_0.5B_gsm8k_async_short.py index dfadaee00..90cd15cb6 100644 --- a/tests/test_qwen2.5_0.5B_gsm8k_async_short.py +++ b/tests/test_qwen2.5_0.5B_gsm8k_async_short.py @@ -123,8 +123,6 @@ def execute(): if __name__ == "__main__": prepare() - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - os.environ.pop("HTTP_PROXY", None) - os.environ.pop("HTTPS_PROXY", None) + for proxy_var in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"): + os.environ.pop(proxy_var, None) execute() diff --git a/tests/test_qwen2.5_0.5B_gsm8k_short.py b/tests/test_qwen2.5_0.5B_gsm8k_short.py index 77cf02bf1..867fdcad6 100644 --- a/tests/test_qwen2.5_0.5B_gsm8k_short.py +++ b/tests/test_qwen2.5_0.5B_gsm8k_short.py @@ -122,8 +122,6 @@ def execute(): if __name__ == "__main__": prepare() - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - os.environ.pop("HTTP_PROXY", None) - os.environ.pop("HTTPS_PROXY", None) + for proxy_var in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"): + os.environ.pop(proxy_var, None) execute() From 09557fe4567a8507755014b77c9cc6b536b191b3 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Mon, 19 Jan 2026 01:24:14 +0000 Subject: [PATCH 59/65] update --- del.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 del.txt diff --git a/del.txt b/del.txt new file mode 100644 index 000000000..d72af3146 --- /dev/null +++ b/del.txt @@ -0,0 +1 @@ +asd From e4e73038badc4227b422734ff5122a4bffe94620 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Mon, 19 Jan 2026 01:30:10 +0000 Subject: [PATCH 60/65] update --- del.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 del.txt diff --git a/del.txt b/del.txt deleted file mode 100644 index d72af3146..000000000 --- a/del.txt +++ /dev/null @@ -1 +0,0 @@ -asd From d2dbba6721de400e90aeef4545a045dad10eafb9 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Mon, 19 Jan 2026 05:12:01 +0000 Subject: [PATCH 61/65] update --- del.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 del.txt diff --git a/del.txt b/del.txt new file mode 100644 index 000000000..bf7e89904 --- /dev/null +++ b/del.txt @@ -0,0 +1 @@ +asdasd From 40ebfcf9dafcdc1599168ea33806576588fcb4e1 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Mon, 19 Jan 2026 05:12:21 +0000 Subject: [PATCH 62/65] update --- del.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 del.txt diff --git a/del.txt b/del.txt deleted file mode 100644 index bf7e89904..000000000 --- a/del.txt +++ /dev/null @@ -1 +0,0 @@ -asdasd From 77d89fe81c92fac0dd41f94a096402b95f2e6c17 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Tue, 20 Jan 2026 20:45:47 +0000 Subject: [PATCH 63/65] test --- ci_test.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 ci_test.txt diff --git a/ci_test.txt b/ci_test.txt new file mode 100644 index 000000000..abaddc0b9 --- /dev/null +++ b/ci_test.txt @@ -0,0 +1 @@ +del From 943e43fd09607c70a56e84728a987b261888c33b Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Tue, 20 Jan 2026 23:30:42 +0000 Subject: [PATCH 64/65] update --- ci_test.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 ci_test.txt diff --git a/ci_test.txt b/ci_test.txt deleted file mode 100644 index abaddc0b9..000000000 --- a/ci_test.txt +++ /dev/null @@ -1 +0,0 @@ -del From 1475de672c7736da492921e42fcd60cd08974f06 Mon Sep 17 00:00:00 2001 From: Yusheng Su Date: Thu, 22 Jan 2026 23:12:01 +0000 Subject: [PATCH 65/65] fix run-ci-image problem --- .github/workflows/pr-test.yml | 85 ++++++++------------------------ .github/workflows/pr-test.yml.j2 | 2 +- 2 files changed, 21 insertions(+), 66 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 4b8b5dc82..207c0ff26 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -33,7 +33,7 @@ jobs: options: > --gpus all --ipc=host - --shm-size=16g + --shm-size=32g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 @@ -41,6 +41,9 @@ jobs: -v /mnt/nvme0n1/miles_ci:/data/miles_ci -v /mnt/nvme0n1/miles_ci/models:/root/models -v /mnt/nvme0n1/miles_ci/datasets:/root/datasets + --privileged + --ulimit nofile=65535:65535 + -v /tmp:/tmp strategy: fail-fast: false matrix: @@ -52,11 +55,26 @@ jobs: GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }} WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} MILES_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }} + MILES_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }} + MILES_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }} + MILES_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }} steps: - name: Checkout repository uses: actions/checkout@v4 + - name: Cleanup Ray processes + shell: bash + run: | + pkill -9 -f 'ray::' 2>/dev/null || true + pkill -9 -f raylet 2>/dev/null || true + pkill -9 -f gcs_server 2>/dev/null || true + pkill -9 -f 'ray-dashboard' 2>/dev/null || true + pkill -9 sglang 2>/dev/null || true + ray stop --force 2>/dev/null || true + rm -rf /tmp/ray/* 2>/dev/null || true + sleep 3 + - name: Install shell: bash run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages @@ -123,15 +141,6 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - name: Post-test cleanup - if: always() - shell: bash - run: | - pkill -9 -f 'ray::' 2>/dev/null || true - pkill -9 -f raylet 2>/dev/null || true - ray stop --force 2>/dev/null || true - rm -rf /tmp/ray/* 2>/dev/null || true - e2e-test-fsdp: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-fsdp')) runs-on: self-hosted @@ -190,15 +199,6 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - name: Post-test cleanup - if: always() - shell: bash - run: | - pkill -9 -f 'ray::' 2>/dev/null || true - pkill -9 -f raylet 2>/dev/null || true - ray stop --force 2>/dev/null || true - rm -rf /tmp/ray/* 2>/dev/null || true - e2e-test-megatron: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-megatron')) runs-on: self-hosted @@ -257,15 +257,6 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - name: Post-test cleanup - if: always() - shell: bash - run: | - pkill -9 -f 'ray::' 2>/dev/null || true - pkill -9 -f raylet 2>/dev/null || true - ray stop --force 2>/dev/null || true - rm -rf /tmp/ray/* 2>/dev/null || true - e2e-test-precision: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-precision')) runs-on: self-hosted @@ -324,15 +315,6 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - name: Post-test cleanup - if: always() - shell: bash - run: | - pkill -9 -f 'ray::' 2>/dev/null || true - pkill -9 -f raylet 2>/dev/null || true - ray stop --force 2>/dev/null || true - rm -rf /tmp/ray/* 2>/dev/null || true - e2e-test-ckpt: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-ckpt')) runs-on: self-hosted @@ -391,15 +373,6 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - name: Post-test cleanup - if: always() - shell: bash - run: | - pkill -9 -f 'ray::' 2>/dev/null || true - pkill -9 -f raylet 2>/dev/null || true - ray stop --force 2>/dev/null || true - rm -rf /tmp/ray/* 2>/dev/null || true - e2e-test-long: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-long')) runs-on: self-hosted @@ -458,20 +431,11 @@ jobs: shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - name: Post-test cleanup - if: always() - shell: bash - run: | - pkill -9 -f 'ray::' 2>/dev/null || true - pkill -9 -f raylet 2>/dev/null || true - ray stop --force 2>/dev/null || true - rm -rf /tmp/ray/* 2>/dev/null || true - e2e-test-image: if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-image')) runs-on: self-hosted container: - image: radixark/miles-test:latest + image: radixark/miles:latest options: > --gpus all --ipc=host @@ -524,12 +488,3 @@ jobs: - name: Execute shell: bash run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }} - - - name: Post-test cleanup - if: always() - shell: bash - run: | - pkill -9 -f 'ray::' 2>/dev/null || true - pkill -9 -f raylet 2>/dev/null || true - ray stop --force 2>/dev/null || true - rm -rf /tmp/ray/* 2>/dev/null || true diff --git a/.github/workflows/pr-test.yml.j2 b/.github/workflows/pr-test.yml.j2 index c052b8494..887542577 100644 --- a/.github/workflows/pr-test.yml.j2 +++ b/.github/workflows/pr-test.yml.j2 @@ -58,7 +58,7 @@ }, 'e2e-test-image': { 'label': 'run-ci-image', - 'image': 'radixark/miles-test:latest', + 'image': 'radixark/miles:latest', 'tests': [ {'test_file': 'test_qwen2.5_0.5B_gsm8k_async_short.py', 'num_gpus': 4}, {'test_file': 'test_qwen2.5_0.5B_gsm8k_short.py', 'num_gpus': 4},