Skip to content

Commit d3f39f4

Browse files
1. Add megatron install helper script 2. fix resource manager 3. fix weight sync buffer size
* wip * lint code * change comments
1 parent d4c5db5 commit d3f39f4

File tree

5 files changed

+120
-3
lines changed

5 files changed

+120
-3
lines changed

INSTALL_MEGATRON.sh

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/bin/bash

# Installation script - installs the megatron and vllm related dependencies,
# which are frequently error-prone to install by hand.

set -e  # Exit immediately on error

echo "=========================================="
echo "Starting deep learning dependencies installation..."
echo "=========================================="

# Detect GPU architecture from nvidia-smi
echo ""
echo "Detecting GPU architecture..."
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1)
echo "Detected GPU: $GPU_NAME"

# Map a GPU product name to its CUDA compute capability.
# NOTE: more specific patterns (e.g. *A100*) must stay BEFORE broader ones
# (e.g. *A10*), since `case` takes the first matching branch.
get_cuda_arch() {
    local gpu_name="$1"
    case "$gpu_name" in
        *H100*|*H200*|*H20*|*H800*)
            echo "9.0"
            ;;
        *A100*|*A800*|*A30*)
            echo "8.0"
            ;;
        *A10*|*A40*|*A16*|*A2*)
            echo "8.6"
            ;;
        *L40*|*L4*|*Ada*|*RTX\ 40*|*RTX\ 50*)
            echo "8.9"
            ;;
        *V100*)
            echo "7.0"
            ;;
        *T4*)
            echo "7.5"
            ;;
        *RTX\ 30*|*A6000*|*A5000*)
            echo "8.6"
            ;;
        *RTX\ 20*)
            echo "7.5"
            ;;
        *)
            echo "8.0;9.0" # Default fallback
            ;;
    esac
}

TORCH_CUDA_ARCH_LIST=$(get_cuda_arch "$GPU_NAME")
export TORCH_CUDA_ARCH_LIST
echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST"

# Install latest base packages
echo ""
echo "Installing peft, accelerate, transformers, modelscope, oss2..."
pip install --upgrade peft accelerate transformers "modelscope[framework]" oss2

# Install latest vllm
echo ""
echo "Installing latest vllm..."
pip install --upgrade vllm

# Get site-packages path and install transformer_engine and megatron_core
echo ""
echo "Installing transformer_engine and megatron_core..."
SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
echo "Site-packages path: $SITE_PACKAGES"

# transformer_engine's build needs to locate the pip-installed cuDNN headers.
CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \
CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \
pip install --no-build-isolation "transformer_engine[pytorch]" megatron_core --no-cache-dir

# Install flash-attention (force local build)
echo ""
echo "Installing flash-attention (local build for $GPU_NAME)..."
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \
MAX_JOBS=8 \
FLASH_ATTENTION_FORCE_BUILD=TRUE \
pip install flash-attn --no-build-isolation --no-cache-dir

# Install numpy
echo ""
echo "Installing numpy==2.2 and deep_gemm..."
pip install numpy==2.2
# `|| true`: older pip exits non-zero when the package is not installed,
# which would abort the whole script under `set -e`.
pip uninstall deep_gemm -y || true
# Remove any leftover checkout so the script can be safely re-run
# (git refuses to clone into an existing non-empty directory).
rm -rf /tmp/DeepGEMM
git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git /tmp/DeepGEMM
cd /tmp/DeepGEMM
pip install . --no-build-isolation

# Verify installation (importlib.metadata replaces deprecated pkg_resources)
echo ""
echo "Verifying installation..."
echo ""
python -c "
from importlib.metadata import version, PackageNotFoundError

packages = ['peft', 'accelerate', 'transformers', 'modelscope', 'oss2', 'vllm', 'transformer_engine', 'megatron_core', 'flash_attn', 'numpy']

print('Installed package versions:')
print('-' * 40)
for pkg in packages:
    try:
        print(f'{pkg}: {version(pkg)}')
    except PackageNotFoundError:
        print(f'{pkg}: Not installed')
"

echo ""
echo "=========================================="
echo "Installation complete!"
echo "=========================================="

src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ class HCCLCheckpointEngine(CheckpointEngine):
102102

103103
def __init__(
104104
self,
105-
bucket_size: int = 2048 << 20,
105+
bucket_size: int = 3072 << 20,
106106
group_name: str = 'twinkle_ckpt',
107107
rebuild_group: bool = True,
108108
rollout_dtype: torch.dtype = torch.bfloat16,

src/twinkle/checkpoint_engine/mixin.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
class CheckpointEngineMixin:
77

88
_checkpoint_engine: CheckpointEngine = None
9-
_bucket_size: int = 2048 << 20 # 2 GB
9+
_bucket_size: int = 3072 << 20 # 3 GB
1010

1111
def _get_or_create_checkpoint_engine(self) -> 'CheckpointEngine':
1212
"""Get or create the checkpoint engine instance (lazy singleton)."""

src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ class NCCLCheckpointEngine(CheckpointEngine):
100100

101101
def __init__(
102102
self,
103-
bucket_size: int = 2048 << 20,
103+
bucket_size: int = 3072 << 20,
104104
group_name: str = 'twinkle_ckpt',
105105
rebuild_group: bool = False,
106106
rollout_dtype: torch.dtype = torch.bfloat16,

src/twinkle/infra/_ray/resource_manager.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ def get_visible_devices():
195195

196196
# All GPUs for a worker should be on the same node
197197
gpu_ranks_local = []
198+
node_ranks = []
198199
for r in worker_ranks:
199200
node_rank = r // nproc_per_node
200201
node_ranks.append(node_rank)

0 commit comments

Comments
 (0)