From c5350beb181b0d8ee40ca941d64e234aff497890 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 26 Feb 2026 17:29:03 +0800
Subject: [PATCH 1/3] wip

---
 INSTALL.sh                                    | 116 ++++++++++++++++++
 .../hccl_checkpoint_engine.py                 |   2 +-
 src/twinkle/checkpoint_engine/mixin.py        |   2 +-
 .../nccl_checkpoint_engine.py                 |   2 +-
 src/twinkle/infra/_ray/resource_manager.py    |   1 +
 5 files changed, 120 insertions(+), 3 deletions(-)
 create mode 100644 INSTALL.sh

diff --git a/INSTALL.sh b/INSTALL.sh
new file mode 100644
index 00000000..ae9db2ac
--- /dev/null
+++ b/INSTALL.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Installation script - Install deep learning dependencies
+# Usage: chmod +x install.sh && ./install.sh
+
+set -e  # Exit immediately on error
+
+echo "=========================================="
+echo "Starting deep learning dependencies installation..."
+echo "=========================================="
+
+# Detect GPU architecture from nvidia-smi
+echo ""
+echo "Detecting GPU architecture..."
+GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1)
+echo "Detected GPU: $GPU_NAME"
+
+# Map GPU name to CUDA architecture
+get_cuda_arch() {
+    local gpu_name="$1"
+    case "$gpu_name" in
+        *H100*|*H200*|*H20*|*H800*)
+            echo "9.0"
+            ;;
+        *A100*|*A800*|*A30*)
+            echo "8.0"
+            ;;
+        *A10*|*A40*|*A16*|*A2*)
+            echo "8.6"
+            ;;
+        *L40*|*L4*|*Ada*|*RTX\ 40*|*RTX\ 50*)
+            echo "8.9"
+            ;;
+        *V100*)
+            echo "7.0"
+            ;;
+        *T4*)
+            echo "7.5"
+            ;;
+        *RTX\ 30*|*A6000*|*A5000*)
+            echo "8.6"
+            ;;
+        *RTX\ 20*)
+            echo "7.5"
+            ;;
+        *)
+            echo "8.0;9.0"  # Default fallback
+            ;;
+    esac
+}
+
+TORCH_CUDA_ARCH_LIST=$(get_cuda_arch "$GPU_NAME")
+export TORCH_CUDA_ARCH_LIST
+echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST"
+
+# Install latest base packages
+echo ""
+echo "Installing peft, accelerate, transformers, modelscope, oss2..."
+pip install --upgrade peft accelerate transformers "modelscope[framework]" oss2
+
+# Install latest vllm
+echo ""
+echo "Installing latest vllm..."
+pip install --upgrade vllm
+
+# Get site-packages path and install transformer_engine and megatron_core
+echo ""
+echo "Installing transformer_engine and megatron_core..."
+SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
+echo "Site-packages path: $SITE_PACKAGES"
+
+CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \
+CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \
+pip install --no-build-isolation "transformer_engine[pytorch]" megatron_core --no-cache-dir
+
+# Install flash-attention (force local build)
+echo ""
+echo "Installing flash-attention (local build for $GPU_NAME)..."
+TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \
+MAX_JOBS=8 \
+FLASH_ATTENTION_FORCE_BUILD=TRUE \
+pip install flash-attn --no-build-isolation --no-cache-dir
+
+# Install numpy and rebuild DeepGEMM from source
+echo ""
+echo "Installing numpy==2.2 and deep_gemm..."
+pip install numpy==2.2
+pip uninstall deep_gemm -y
+cd /tmp
+git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git
+cd DeepGEMM
+pip install . --no-build-isolation
+
+# Verify installation
+echo ""
+echo "Verifying installation..."
+echo ""
+python -c "
+import pkg_resources
+
+packages = ['peft', 'accelerate', 'transformers', 'modelscope', 'oss2', 'vllm', 'transformer_engine', 'megatron_core', 'flash_attn', 'numpy']
+
+print('Installed package versions:')
+print('-' * 40)
+for pkg in packages:
+    try:
+        version = pkg_resources.get_distribution(pkg).version
+        print(f'{pkg}: {version}')
+    except pkg_resources.DistributionNotFound:
+        print(f'{pkg}: Not installed')
+"
+
+echo ""
+echo "=========================================="
+echo "Installation complete!"
+echo "=========================================="
\ No newline at end of file
diff --git a/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py b/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py
index 85c2f0a8..e6b9cdde 100644
--- a/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py
+++ b/src/twinkle/checkpoint_engine/hccl_checkpoint_engine.py
@@ -102,7 +102,7 @@ class HCCLCheckpointEngine(CheckpointEngine):
 
     def __init__(
         self,
-        bucket_size: int = 2048 << 20,
+        bucket_size: int = 3072 << 20,
         group_name: str = 'twinkle_ckpt',
         rebuild_group: bool = True,
         rollout_dtype: torch.dtype = torch.bfloat16,
diff --git a/src/twinkle/checkpoint_engine/mixin.py b/src/twinkle/checkpoint_engine/mixin.py
index 75bdad74..1a4c4466 100644
--- a/src/twinkle/checkpoint_engine/mixin.py
+++ b/src/twinkle/checkpoint_engine/mixin.py
@@ -6,7 +6,7 @@
 class CheckpointEngineMixin:
 
     _checkpoint_engine: CheckpointEngine = None
-    _bucket_size: int = 2048 << 20  # 2 GB
+    _bucket_size: int = 3072 << 20  # 3 GB
 
     def _get_or_create_checkpoint_engine(self) -> 'CheckpointEngine':
         """Get or create the checkpoint engine instance (lazy singleton)."""
diff --git a/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py b/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py
index f44ed5d4..d6209c58 100644
--- a/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py
+++ b/src/twinkle/checkpoint_engine/nccl_checkpoint_engine.py
@@ -100,7 +100,7 @@ class NCCLCheckpointEngine(CheckpointEngine):
 
     def __init__(
         self,
-        bucket_size: int = 2048 << 20,
+        bucket_size: int = 3072 << 20,
         group_name: str = 'twinkle_ckpt',
         rebuild_group: bool = False,
         rollout_dtype: torch.dtype = torch.bfloat16,
diff --git a/src/twinkle/infra/_ray/resource_manager.py b/src/twinkle/infra/_ray/resource_manager.py
index d5e87e53..d6fea8f1 100644
--- a/src/twinkle/infra/_ray/resource_manager.py
+++ b/src/twinkle/infra/_ray/resource_manager.py
@@ -195,6 +195,7 @@ def get_visible_devices():
 
                         # All GPUs for a worker should be on the same node
                         gpu_ranks_local = []
+                        node_ranks = []
                         for r in worker_ranks:
                             node_rank = r // nproc_per_node
                             node_ranks.append(node_rank)

From 57ad57fc8f9d1d8a4451c558b210b0260ba3a054 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 26 Feb 2026 17:29:46 +0800
Subject: [PATCH 2/3] lint code

---
 INSTALL.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/INSTALL.sh b/INSTALL.sh
index ae9db2ac..2c280b46 100644
--- a/INSTALL.sh
+++ b/INSTALL.sh
@@ -113,4 +113,4 @@ for pkg in packages:
 echo ""
 echo "=========================================="
 echo "Installation complete!"
-echo "=========================================="
\ No newline at end of file
+echo "=========================================="

From 44c9f82458db8a927936fcc79f1e527d7c77e112 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 27 Feb 2026 11:39:34 +0800
Subject: [PATCH 3/3] change comments

---
 INSTALL.sh => INSTALL_MEGATRON.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename INSTALL.sh => INSTALL_MEGATRON.sh (96%)

diff --git a/INSTALL.sh b/INSTALL_MEGATRON.sh
similarity index 96%
rename from INSTALL.sh
rename to INSTALL_MEGATRON.sh
index 2c280b46..c85ec6e1 100644
--- a/INSTALL.sh
+++ b/INSTALL_MEGATRON.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-# Installation script - Install deep learning dependencies
-# Usage: chmod +x install.sh && ./install.sh
+# Installation script - installs the Megatron- and vLLM-related dependencies,
+# which frequently run into errors during installation.
 
 set -e  # Exit immediately on error