diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh new file mode 100644 index 00000000..c85ec6e1 --- /dev/null +++ b/INSTALL_MEGATRON.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +# Installation script - We offer a script to install the megatron and vllm related dependencies, +# which often cause errors + +set -e # Exit immediately on error + +echo "==========================================" +echo "Starting deep learning dependencies installation..." +echo "==========================================" + +# Detect GPU architecture from nvidia-smi +echo "" +echo "Detecting GPU architecture..." +GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1) +echo "Detected GPU: $GPU_NAME" + +# Map GPU name to CUDA architecture +get_cuda_arch() { + local gpu_name="$1" + case "$gpu_name" in + *H100*|*H200*|*H20*|*H800*) + echo "9.0" + ;; + *A100*|*A800*|*A30*) + echo "8.0" + ;; + *A10*|*A40*|*A16*|*A2*) + echo "8.6" + ;; + *L40*|*L4*|*Ada*|*RTX\ 40*|*RTX\ 50*) + echo "8.9" + ;; + *V100*) + echo "7.0" + ;; + *T4*) + echo "7.5" + ;; + *RTX\ 30*|*A6000*|*A5000*) + echo "8.6" + ;; + *RTX\ 20*) + echo "7.5" + ;; + *) + echo "8.0;9.0" # Default fallback + ;; + esac +} + +TORCH_CUDA_ARCH_LIST=$(get_cuda_arch "$GPU_NAME") +export TORCH_CUDA_ARCH_LIST +echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST" + +# Install latest base packages +echo "" +echo "Installing peft, accelerate, transformers, modelscope, oss2..." +pip install --upgrade peft accelerate transformers "modelscope[framework]" oss2 + +# Install latest vllm +echo "" +echo "Installing latest vllm..." +pip install --upgrade vllm + +# Get site-packages path and install transformer_engine and megatron_core +echo "" +echo "Installing transformer_engine and megatron_core..." 
+SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") +echo "Site-packages path: $SITE_PACKAGES" + +CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \ +CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \ +pip install --no-build-isolation "transformer_engine[pytorch]" megatron_core --no-cache-dir + +# Install flash-attention (force local build) +echo "" +echo "Installing flash-attention (local build for $GPU_NAME)..." +TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \ +MAX_JOBS=8 \ +FLASH_ATTENTION_FORCE_BUILD=TRUE \ +pip install flash-attn --no-build-isolation --no-cache-dir + +# Install numpy +echo "" +echo "Installing numpy==2.2 and deep_gemm..." +pip install numpy==2.2 +pip uninstall deep_gemm -y +cd /tmp +git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git +cd DeepGEMM +pip install . --no-build-isolation + +# Verify installation +echo "" +echo "Verifying installation..." +echo "" +python -c " +import pkg_resources + +packages = ['peft', 'accelerate', 'transformers', 'modelscope', 'oss2', 'vllm', 'transformer_engine', 'megatron_core', 'flash_attn', 'numpy'] + +print('Installed package versions:') +print('-' * 40) +for pkg in packages: + try: + version = pkg_resources.get_distribution(pkg).version + print(f'{pkg}: {version}') + except pkg_resources.DistributionNotFound: + print(f'{pkg}: Not installed') +" + +echo "" +echo "==========================================" +echo "Installation complete!" 
+echo "==========================================" diff --git a/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py b/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py index 85c2f0a8..e6b9cdde 100644 --- a/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py +++ b/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py @@ -102,7 +102,7 @@ class HCCLCheckpointEngine(CheckpointEngine): def __init__( self, - bucket_size: int = 2048 << 20, + bucket_size: int = 3072 << 20, group_name: str = 'twinkle_ckpt', rebuild_group: bool = True, rollout_dtype: torch.dtype = torch.bfloat16, diff --git a/src/twinkle/checkpoint_engine/mixin.py b/src/twinkle/checkpoint_engine/mixin.py index 75bdad74..1a4c4466 100644 --- a/src/twinkle/checkpoint_engine/mixin.py +++ b/src/twinkle/checkpoint_engine/mixin.py @@ -6,7 +6,7 @@ class CheckpointEngineMixin: _checkpoint_engine: CheckpointEngine = None - _bucket_size: int = 2048 << 20 # 2 GB + _bucket_size: int = 3072 << 20 # 3 GB def _get_or_create_checkpoint_engine(self) -> 'CheckpointEngine': """Get or create the checkpoint engine instance (lazy singleton).""" diff --git a/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py b/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py index f44ed5d4..d6209c58 100644 --- a/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py +++ b/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py @@ -100,7 +100,7 @@ class NCCLCheckpointEngine(CheckpointEngine): def __init__( self, - bucket_size: int = 2048 << 20, + bucket_size: int = 3072 << 20, group_name: str = 'twinkle_ckpt', rebuild_group: bool = False, rollout_dtype: torch.dtype = torch.bfloat16, diff --git a/src/twinkle/infra/_ray/resource_manager.py b/src/twinkle/infra/_ray/resource_manager.py index d5e87e53..d6fea8f1 100644 --- a/src/twinkle/infra/_ray/resource_manager.py +++ b/src/twinkle/infra/_ray/resource_manager.py @@ -195,6 +195,7 @@ def get_visible_devices(): # All GPUs for a worker should be on the same node gpu_ranks_local 
= [] + node_ranks = [] for r in worker_ranks: node_rank = r // nproc_per_node node_ranks.append(node_rank)