From c5350beb181b0d8ee40ca941d64e234aff497890 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 26 Feb 2026 17:29:03 +0800 Subject: [PATCH 1/3] wip --- INSTALL.sh | 116 ++++++++++++++++++ .../hccl_checkpoint_engine.py | 2 +- src/twinkle/checkpoint_engine/mixin.py | 2 +- .../nccl_checkpoint_engine.py | 2 +- src/twinkle/infra/_ray/resource_manager.py | 1 + 5 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 INSTALL.sh diff --git a/INSTALL.sh b/INSTALL.sh new file mode 100644 index 00000000..ae9db2ac --- /dev/null +++ b/INSTALL.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +# Installation script - Install deep learning dependencies +# Usage: chmod +x install.sh && ./install.sh + +set -e # Exit immediately on error + +echo "==========================================" +echo "Starting deep learning dependencies installation..." +echo "==========================================" + +# Detect GPU architecture from nvidia-smi +echo "" +echo "Detecting GPU architecture..." +GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1) +echo "Detected GPU: $GPU_NAME" + +# Map GPU name to CUDA architecture +get_cuda_arch() { + local gpu_name="$1" + case "$gpu_name" in + *H100*|*H200*|*H20*|*H800*) + echo "9.0" + ;; + *A100*|*A800*|*A30*) + echo "8.0" + ;; + *A10*|*A40*|*A16*|*A2*) + echo "8.6" + ;; + *L40*|*L4*|*Ada*|*RTX\ 40*|*RTX\ 50*) + echo "8.9" + ;; + *V100*) + echo "7.0" + ;; + *T4*) + echo "7.5" + ;; + *RTX\ 30*|*A6000*|*A5000*) + echo "8.6" + ;; + *RTX\ 20*) + echo "7.5" + ;; + *) + echo "8.0;9.0" # Default fallback + ;; + esac +} + +TORCH_CUDA_ARCH_LIST=$(get_cuda_arch "$GPU_NAME") +export TORCH_CUDA_ARCH_LIST +echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST" + +# Install latest base packages +echo "" +echo "Installing peft, accelerate, transformers, modelscope, oss2..." +pip install --upgrade peft accelerate transformers "modelscope[framework]" oss2 + +# Install latest vllm +echo "" +echo "Installing latest vllm..." 
+pip install --upgrade vllm + +# Get site-packages path and install transformer_engine and megatron_core +echo "" +echo "Installing transformer_engine and megatron_core..." +SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") +echo "Site-packages path: $SITE_PACKAGES" + +CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \ +CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \ +pip install --no-build-isolation "transformer_engine[pytorch]" megatron_core --no-cache-dir + +# Install flash-attention (force local build) +echo "" +echo "Installing flash-attention (local build for $GPU_NAME)..." +TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \ +MAX_JOBS=8 \ +FLASH_ATTENTION_FORCE_BUILD=TRUE \ +pip install flash-attn --no-build-isolation --no-cache-dir + +# Install numpy +echo "" +echo "Installing numpy==2.2 and deep_gemm..." +pip install numpy==2.2 +pip uninstall deep_gemm -y +cd /tmp +git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git +cd DeepGEMM +pip install . --no-build-isolation + +# Verify installation +echo "" +echo "Verifying installation..." +echo "" +python -c " +import pkg_resources + +packages = ['peft', 'accelerate', 'transformers', 'modelscope', 'oss2', 'vllm', 'transformer_engine', 'megatron_core', 'flash_attn', 'numpy'] + +print('Installed package versions:') +print('-' * 40) +for pkg in packages: + try: + version = pkg_resources.get_distribution(pkg).version + print(f'{pkg}: {version}') + except pkg_resources.DistributionNotFound: + print(f'{pkg}: Not installed') +" + +echo "" +echo "==========================================" +echo "Installation complete!" 
+echo "==========================================" \ No newline at end of file diff --git a/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py b/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py index 85c2f0a8..e6b9cdde 100644 --- a/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py +++ b/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py @@ -102,7 +102,7 @@ class HCCLCheckpointEngine(CheckpointEngine): def __init__( self, - bucket_size: int = 2048 << 20, + bucket_size: int = 3072 << 20, group_name: str = 'twinkle_ckpt', rebuild_group: bool = True, rollout_dtype: torch.dtype = torch.bfloat16, diff --git a/src/twinkle/checkpoint_engine/mixin.py b/src/twinkle/checkpoint_engine/mixin.py index 75bdad74..1a4c4466 100644 --- a/src/twinkle/checkpoint_engine/mixin.py +++ b/src/twinkle/checkpoint_engine/mixin.py @@ -6,7 +6,7 @@ class CheckpointEngineMixin: _checkpoint_engine: CheckpointEngine = None - _bucket_size: int = 2048 << 20 # 2 GB + _bucket_size: int = 3072 << 20 # 3 GB def _get_or_create_checkpoint_engine(self) -> 'CheckpointEngine': """Get or create the checkpoint engine instance (lazy singleton).""" diff --git a/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py b/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py index f44ed5d4..d6209c58 100644 --- a/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py +++ b/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py @@ -100,7 +100,7 @@ class NCCLCheckpointEngine(CheckpointEngine): def __init__( self, - bucket_size: int = 2048 << 20, + bucket_size: int = 3072 << 20, group_name: str = 'twinkle_ckpt', rebuild_group: bool = False, rollout_dtype: torch.dtype = torch.bfloat16, diff --git a/src/twinkle/infra/_ray/resource_manager.py b/src/twinkle/infra/_ray/resource_manager.py index d5e87e53..d6fea8f1 100644 --- a/src/twinkle/infra/_ray/resource_manager.py +++ b/src/twinkle/infra/_ray/resource_manager.py @@ -195,6 +195,7 @@ def get_visible_devices(): # All GPUs for a worker should be on
the same node gpu_ranks_local = [] + node_ranks = [] for r in worker_ranks: node_rank = r // nproc_per_node node_ranks.append(node_rank) From 57ad57fc8f9d1d8a4451c558b210b0260ba3a054 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 26 Feb 2026 17:29:46 +0800 Subject: [PATCH 2/3] lint code --- INSTALL.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.sh b/INSTALL.sh index ae9db2ac..2c280b46 100644 --- a/INSTALL.sh +++ b/INSTALL.sh @@ -113,4 +113,4 @@ for pkg in packages: echo "" echo "==========================================" echo "Installation complete!" -echo "==========================================" \ No newline at end of file +echo "==========================================" From 44c9f82458db8a927936fcc79f1e527d7c77e112 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 27 Feb 2026 11:39:34 +0800 Subject: [PATCH 3/3] change comments --- INSTALL.sh => INSTALL_MEGATRON.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename INSTALL.sh => INSTALL_MEGATRON.sh (96%) diff --git a/INSTALL.sh b/INSTALL_MEGATRON.sh similarity index 96% rename from INSTALL.sh rename to INSTALL_MEGATRON.sh index 2c280b46..c85ec6e1 100644 --- a/INSTALL.sh +++ b/INSTALL_MEGATRON.sh @@ -1,7 +1,7 @@ #!/bin/bash -# Installation script - Install deep learning dependencies -# Usage: chmod +x install.sh && ./install.sh +# Installation script - installs the Megatron- and vLLM-related dependencies, +# whose installation frequently runs into errors set -e # Exit immediately on error