From 6130f0765c038b154f76aa43036b8ca044f077ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 4 Mar 2026 16:09:56 +0000 Subject: [PATCH 1/3] ci: Mount and enforce HF_HOME MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/actions/action.yml | 1 + tests/test_utils/python_scripts/launch_nemo_run_workload.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/.github/actions/action.yml b/.github/actions/action.yml index decaa5ff3f8..367b23b3099 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -186,6 +186,7 @@ runs: --platform dgx_h100 \ --container-image ${{ inputs.container-image }} \ --data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \ + --hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME RUN_TEST_EOF ) diff --git a/tests/test_utils/python_scripts/launch_nemo_run_workload.py b/tests/test_utils/python_scripts/launch_nemo_run_workload.py index bf4bb37aa20..cdb730fae63 100644 --- a/tests/test_utils/python_scripts/launch_nemo_run_workload.py +++ b/tests/test_utils/python_scripts/launch_nemo_run_workload.py @@ -54,6 +54,7 @@ def is_flaky_failure(concat_allranks_logs: str) -> bool: "--n-repeat", required=False, type=int, help="Number of times to repeat the workload", default=1 ) @click.option("--data-dir", required=False, type=str, help="Data directory of the workload") +@click.option("--hf-home", required=False, type=str, help="HF home directory of the workload") @click.option("--tag", required=False, type=str, help="Tag of the workload") @click.option( "--enable-lightweight-mode", @@ -73,6 +74,7 @@ def main( container_image, n_repeat: int = 1, data_dir: Optional[str] = None, + hf_home: Optional[str] = None, tag: Optional[str] = None, enable_lightweight_mode: Optional[bool] = False, ): @@ -107,6 +109,8 @@ def main( artifacts.append(f"{os.getcwd()}:/opt/megatron-lm") if data_dir: artifacts.append(f"{pathlib.Path(data_dir)}:/mnt/artifacts") + if hf_home: + artifacts.append(f"{pathlib.Path(hf_home)}:/mnt/hf_home") executor = run.DockerExecutor( container_image=container_image, @@ -122,6 +126,8 @@ def main( "CLUSTER": "dgxh100_dgxc", "NCCL_DEBUG": "INFO", "NCCL_DEBUG_FILE": "/opt/megatron-lm/assets_dir/logs/nccl_debug.log", + "HF_HOME": "/mnt/hf_home", + "TRANSFORMERS_OFFLINE": "1", }, packager=run.Packager(), volumes=artifacts, From c7c5212391deab458fa15938c43a9871cf315bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 4 Mar 2026 17:00:20 +0000 Subject: [PATCH 2/3] mount hf home MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/actions/action.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actions/action.yml b/.github/actions/action.yml index 367b23b3099..b49aebfcfa1 100644 --- a/.github/actions/action.yml +++ b/.github/actions/action.yml @@ -98,7 +98,8 @@ runs: --environment dev \ --platform dgx_h100 \ --tag ${{ inputs.tag }} \ - --container-image ${{ inputs.container-image }} + --container-image ${{ inputs.container-image }} \ + --hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME RUN_TEST_EOF ) From dd5bad21bf53ccc108ab40d4192ea77481f80b4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 5 Mar 2026 08:28:15 +0000 Subject: [PATCH 3/3] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index ed3bbca7f2f..3442d4f1f02 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -25,7 +25,7 @@ on: workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref || github.event.pull_request.number }} + group: ${{ github.workflow }}-${{ github.head_ref || github.event.merge_group.base_ref || github.ref }} cancel-in-progress: true permissions: