diff --git a/.dev_scripts/build_docs.sh b/.dev_scripts/build_docs.sh new file mode 100644 index 00000000..43378168 --- /dev/null +++ b/.dev_scripts/build_docs.sh @@ -0,0 +1,7 @@ +cd docs +rm -rf build + +# update api rst +#rm -rf source/api/ +#sphinx-apidoc --module-first -o source/api/ ../modelscope/ +make html diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh new file mode 100644 index 00000000..b5f2d8b3 --- /dev/null +++ b/.dev_scripts/ci_container_test.sh @@ -0,0 +1,47 @@ +install_twinkle_with_kernels() { + pip install ".[kernels]" -i https://mirrors.aliyun.com/pypi/simple/ || pip install ".[kernels]" +} + +if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then + # pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + git config --global --add safe.directory /twinkle + git config --global user.email tmp + git config --global user.name tmp.com + + # linter test + # use internal project for pre-commit due to the network problem + if [ `git remote -v | grep alibaba | wc -l` -gt 1 ]; then + pre-commit run -c .pre-commit-config_local.yaml --all-files + if [ $? -ne 0 ]; then + echo "linter test failed, please run 'pre-commit run --all-files' to check" + echo "From the repository folder" + echo "Run 'pre-commit install' install pre-commit hooks." + echo "Finally run linter with command: 'pre-commit run --all-files' to check." + echo "Ensure there is no failure!!!!!!!!" + exit -1 + fi + fi + + pip install decord einops -U -i https://mirrors.aliyun.com/pypi/simple/ + pip uninstall autoawq -y + pip uninstall lmdeploy -y + pip uninstall tensorflow -y + pip install kernels -U + pip install ray==2.48 + pip install optimum + + # test with install + install_twinkle_with_kernels +else + install_twinkle_with_kernels + echo "Running case in release image, run case directly!" +fi +# remove torch_extensions folder to avoid ci hang. 
+rm -rf ~/.cache/torch_extensions +if [ $# -eq 0 ]; then + ci_command="pytest tests" +else + ci_command="$@" +fi +echo "Running case with command: $ci_command" +$ci_command diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh new file mode 100644 index 00000000..3e41846c --- /dev/null +++ b/.dev_scripts/dockerci.sh @@ -0,0 +1,96 @@ +#!/bin/bash +MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache +CODE_DIR=$PWD +CODE_DIR_IN_CONTAINER=/twinkle +mkdir -p ~/.cache +MODELSCOPE_CACHE=~/.cache +IMAGE_NAME=modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope +IMAGE_VERSION=ci_image +MODELSCOPE_HOME_CACHE=~/.cache +CI_TEST=True +MODELSCOPE_SDK_DEBUG=True +CI_COMMAND='bash .dev_scripts/ci_container_test.sh pytest tests' +MODELSCOPE_SDK_DEBUG=True +echo "$USER" +gpus='0,1 2,3' +cpu_sets='0-15 16-31' +cpu_sets_arr=($cpu_sets) +is_get_file_lock=false +echo "ci command: $CI_COMMAND" +PR_CHANGED_FILES="${PR_CHANGED_FILES:-}" +echo "PR modified files: $PR_CHANGED_FILES" +PR_CHANGED_FILES=${PR_CHANGED_FILES//[ ]/#} +echo "PR_CHANGED_FILES: $PR_CHANGED_FILES" +idx=0 +for gpu in $gpus +do + exec {lock_fd}>"/tmp/gpu$gpu" || exit 1 + flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" 
>&2; idx=$((idx+1)); continue; } + echo "get gpu lock $gpu" + + CONTAINER_NAME="twinkle-ci-$idx" + let is_get_file_lock=true + + # pull image if there are update + docker pull ${IMAGE_NAME}:${IMAGE_VERSION} + if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then + echo 'debugging' + docker run --rm --name $CONTAINER_NAME --shm-size=16gb \ + --cpuset-cpus=${cpu_sets_arr[$idx]} \ + --gpus='"'"device=$gpu"'"' \ + -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \ + -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \ + -v $MODELSCOPE_HOME_CACHE/$idx:/root \ + -v /home/admin/pre-commit:/home/admin/pre-commit \ + -e CI_TEST=True \ + -e TEST_LEVEL=$TEST_LEVEL \ + -e MODELSCOPE_CACHE=$MODELSCOPE_CACHE_DIR_IN_CONTAINER \ + -e MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN \ + -e MODELSCOPE_SDK_DEBUG=True \ + -e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \ + -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \ + -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ + -e TEST_LEVEL=$TEST_LEVEL \ + -e MODELSCOPE_ENVIRONMENT='ci' \ + -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ + -e MODEL_TAG_URL=$MODEL_TAG_URL \ + -e MODELSCOPE_API_TOKEN=$MODELSCOPE_API_TOKEN \ + -e PR_CHANGED_FILES=$PR_CHANGED_FILES \ + --workdir=$CODE_DIR_IN_CONTAINER \ + ${IMAGE_NAME}:${IMAGE_VERSION} \ + $CI_COMMAND + else + docker run --rm --name $CONTAINER_NAME --shm-size=16gb \ + --cpuset-cpus=${cpu_sets_arr[$idx]} \ + --gpus='"'"device=$gpu"'"' \ + -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \ + -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \ + -v $MODELSCOPE_HOME_CACHE/$idx:/root \ + -v /home/admin/pre-commit:/home/admin/pre-commit \ + -e CI_TEST=True \ + -e TEST_LEVEL=$TEST_LEVEL \ + -e MODELSCOPE_CACHE=$MODELSCOPE_CACHE_DIR_IN_CONTAINER \ + -e MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN \ + -e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \ + -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \ + -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ + -e TEST_LEVEL=$TEST_LEVEL \ + -e MODELSCOPE_ENVIRONMENT='ci' \ + -e 
TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ + -e MODEL_TAG_URL=$MODEL_TAG_URL \ + -e MODELSCOPE_API_TOKEN=$MODELSCOPE_API_TOKEN \ + -e PR_CHANGED_FILES=$PR_CHANGED_FILES \ + --workdir=$CODE_DIR_IN_CONTAINER \ + ${IMAGE_NAME}:${IMAGE_VERSION} \ + $CI_COMMAND + fi + if [ $? -ne 0 ]; then + echo "Running test case failed, please check the log!" + exit -1 + fi + break +done +if [ "$is_get_file_lock" = false ] ; then + echo 'No free GPU!' + exit 1 +fi diff --git a/.dev_scripts/dockerci_npu.sh b/.dev_scripts/dockerci_npu.sh new file mode 100644 index 00000000..e0f9d253 --- /dev/null +++ b/.dev_scripts/dockerci_npu.sh @@ -0,0 +1,57 @@ +#!/bin/bash +MODELSCOPE_CACHE_DIR=/modelscope_cache +CODE_DIR=$PWD +MODELSCOPE_SDK_DEBUG=True +echo "$USER" +gpus='0,1 2,3' +is_get_file_lock=false +CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh pytest tests} +echo "ci command: $CI_COMMAND" +PR_CHANGED_FILES="${PR_CHANGED_FILES:-}" +echo "PR modified files: $PR_CHANGED_FILES" +PR_CHANGED_FILES=${PR_CHANGED_FILES//[ ]/#} +echo "PR_CHANGED_FILES: $PR_CHANGED_FILES" +idx=0 +for gpu in $gpus +do + exec {lock_fd}>"/tmp/gpu$gpu" || exit 1 + flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" 
>&2; idx=$((idx+1)); continue; } + echo "get gpu lock $gpu" + + let is_get_file_lock=true + + # 设置环境变量 + export CI_TEST=True + export TEST_LEVEL=$TEST_LEVEL + export MODELSCOPE_CACHE=${MODELSCOPE_CACHE:-$MODELSCOPE_CACHE_DIR} + export MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN + export HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT + export TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST + export TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV + export MODELSCOPE_ENVIRONMENT='ci' + export TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN + export MODEL_TAG_URL=$MODEL_TAG_URL + export MODELSCOPE_API_TOKEN=$MODELSCOPE_API_TOKEN + export PR_CHANGED_FILES=$PR_CHANGED_FILES + export CUDA_VISIBLE_DEVICES=$gpu + + if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then + export MODELSCOPE_SDK_DEBUG=True + echo 'debugging' + fi + + # 切换到代码目录并执行命令 + cd $CODE_DIR + eval $CI_COMMAND + + if [ $? -ne 0 ]; then + echo "Running test case failed, please check the log!" + exit -1 + fi + break +done + +if [ "$is_get_file_lock" = false ] ; then + echo 'No free GPU!' + exit 1 +fi diff --git a/.github/ISSUE_TEMPLATE/1-bug-report.yml b/.github/ISSUE_TEMPLATE/1-bug-report.yml new file mode 100644 index 00000000..9999b446 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml @@ -0,0 +1,49 @@ +name: "🐛 Bug Report" +description: Create a bug report to help us improve twinkle +labels: ["bug"] + +body: + - type: markdown + attributes: + value: | + Thank you for supporting twinkle and taking the time to submit this issue. + 感谢你对 twinkle 的支持和抽出时间提交相关 issue。 + + - type: checkboxes + id: checklist + attributes: + label: Checklist / 检查清单 + options: + - label: I have searched existing issues, and this is a new bug report. / 我已经搜索过现有的 issues,确认这是一个新的 bug report。 + required: true + + + - type: textarea + id: bug-description + validations: + required: true + attributes: + label: Bug Description / Bug 描述 + description: | + Please describe the issue you encountered. 
It's better to include error screenshots or stack trace information. + 请详细描述你遇到的问题,最好包含报错截图或报错栈信息。 + + + - type: textarea + id: reproduction-steps + validations: + required: true + attributes: + label: How to Reproduce / 如何复现 + description: | + Please provide steps to reproduce the issue, including twinkle version, runtime environment, and detailed reproduction steps. + 请提供复现问题的步骤,包括 twinkle 的版本、运行环境、详细的复现步骤等。 + + + - type: textarea + id: additional-information + attributes: + label: Additional Information / 补充信息 + description: | + Please provide any additional information here. + 在这里补充其他相关信息。 diff --git a/.github/ISSUE_TEMPLATE/2-feature-request.yml b/.github/ISSUE_TEMPLATE/2-feature-request.yml new file mode 100644 index 00000000..57633400 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/2-feature-request.yml @@ -0,0 +1,37 @@ +name: "🚀 Feature Request" +description: Submit a request for a new feature +labels: ["enhancement"] + +body: + - type: markdown + attributes: + value: | + Thank you for supporting twinkle and taking the time to submit this issue. + 感谢你对 twinkle 的支持和抽出时间提交相关 issue。 + + - type: checkboxes + id: checklist + attributes: + label: Checklist / 检查清单 + options: + - label: I have searched existing issues, and this is a new feature request. / 我已经搜索过现有的 issues,确认这是一个新的 Feature Request。 + required: true + + - type: textarea + id: feature-request-description + validations: + required: true + attributes: + label: Feature Request Description / Feature Request 描述 + description: | + Please provide a detailed description of the new feature you would like to see added. + 请详细描述您希望添加的新功能特性。 + + + - type: textarea + id: pull-request + attributes: + label: Pull Request / Pull Request 信息 + description: | + Have you already submitted or plan to submit a Pull Request? Please share your plans. 
+ 你是否已经提交或即将提交 Pull Request?请说明你的计划。 diff --git a/.github/ISSUE_TEMPLATE/3-question-discussion.yml b/.github/ISSUE_TEMPLATE/3-question-discussion.yml new file mode 100644 index 00000000..cc8ba339 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/3-question-discussion.yml @@ -0,0 +1,28 @@ +name: "🤔 Question & Discussion" +description: Create an issue for questions and discussions +labels: ["question"] + +body: + - type: markdown + attributes: + value: | + Thank you for supporting twinkle and taking the time to submit this issue. + 感谢你对 twinkle 的支持和抽出时间提交相关 issue。 + + - type: checkboxes + id: checklist + attributes: + label: Checklist / 检查清单 + options: + - label: I have searched existing issues, and this is a new question or discussion topic. / 我已经搜索过现有的 issues,确认这是一个新的问题与讨论。 + required: true + + - type: textarea + id: question-description + validations: + required: true + attributes: + label: Question Description / 问题描述 + description: | + Please describe the question or topic you would like to discuss. + 请描述你想要讨论的问题或话题。 diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..3ba13e0c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..a09bfad1 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,13 @@ +# PR type +- [ ] Bug Fix +- [ ] New Feature +- [ ] Document Updates +- [ ] More Models or Datasets Support + +# PR information + +Write the detailed information belonging to this PR. + +## Experiment results + +Paste your experiment result here (if needed). 
diff --git a/.github/SECURITY.md b/.github/SECURITY.md new file mode 100644 index 00000000..d549cbed --- /dev/null +++ b/.github/SECURITY.md @@ -0,0 +1,3 @@ +# Reporting Security Issues + +Usually security issues of a deep learning project come from non-standard third-party packages or continuously running services. If you are affected by security issues in our project, please consider reporting to us. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions. diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 00000000..e4b72616 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,63 @@ +# Twinkle AI Coding Agent Guidelines + +These instructions help AI agents work productively in this repo. Focus on concrete repo patterns and workflows. + +## Big Picture +- **Goal:** Training and serving LLMs with multi-adapter LoRA, efficient data handling, and distributed execution across Ray and Torch. 
+- **Core Modules:** + - Infrastructure & distributed orchestration: [src/twinkle/infra/__init__.py](src/twinkle/infra/__init__.py) + - Device layout & platform abstraction: [src/twinkle/utils/platform.py](src/twinkle/utils/platform.py), [src/twinkle/utils/framework.py](src/twinkle/utils/framework.py) + - Model stack (Transformers + Multi-LoRA): [src/twinkle/model/multi_lora_transformers.py](src/twinkle/model/multi_lora_transformers.py) + - Sampler (vLLM integration): [src/twinkle/sampler/vllm_sampler.py](src/twinkle/sampler/vllm_sampler.py) + - Losses & metrics: [src/twinkle/loss](src/twinkle/loss), [src/twinkle/metric](src/twinkle/metric) + - Templates & preprocessing: [src/twinkle/template](src/twinkle/template), [src/twinkle/preprocessor](src/twinkle/preprocessor) + - Model/Processor HTTP services via Ray Serve: [src/twinkle/server/twinkle](src/twinkle/server/twinkle) + - Hub integrations (ModelScope/HF): [src/twinkle/hub/hub.py](src/twinkle/hub/hub.py) + +## Architecture & Patterns +- **Lazy import surface:** [src/twinkle/__init__.py](src/twinkle/__init__.py) exposes a small, lazy API (`_LazyModule`), import public symbols from here when possible. +- **Distributed mode selection:** `twinkle.infra.initialize()` toggles between local and Ray modes. Ray mode requires `TWINKLE_MODE=ray` or `initialize(mode='ray', ...)`. +- **Remote execution decorators:** + - `remote_class()` wraps classes for Ray placement; auto-injects `DeviceMesh` if missing. + - `remote_function(dispatch='slice', execute='all', collect='none')` patches methods for distributed dispatch/collect. + - See usage in [src/twinkle/model/multi_lora_transformers.py](src/twinkle/model/multi_lora_transformers.py) and [src/twinkle/sampler/vllm_sampler.py](src/twinkle/sampler/vllm_sampler.py). +- **Device topology:** Represented by `DeviceMesh`/`DeviceGroup`. Visualize with `twinkle.infra.get_device_placement()`; examples in [tests/infra/test_infra_graph.py](tests/infra/test_infra_graph.py). 
+- **Platform abstractions:** `GPU`/`NPU` selection via env and device discovery. Rank/world size read from env (`RANK`, `WORLD_SIZE`, etc.). See [src/twinkle/utils/platform.py](src/twinkle/utils/platform.py). +- **Hub usage:** `HubOperation` routes to HF or ModelScope by `hf://` or `ms://` prefixes. Dataset/model download/push helpers in [src/twinkle/hub/hub.py](src/twinkle/hub/hub.py). +- **Plugin loading:** Use `Plugin.load_plugin(id, Base)` for remote code from hubs; guarded by `trust_remote_code()` to prevent unsafe execution. See [src/twinkle/utils/plugin.py](src/twinkle/utils/plugin.py). +- **Multi-LoRA conventions:** + - `MultiLoraTransformersModel` wraps a base Transformers model via `MultiAdapter` to manage multiple LoRA adapters. + - FSDP is unsupported for Multi-LoRA (`fsdp_world_size == 1` enforced). Adapter params are strictly controlled to avoid training base weights. + - Adapter ops are routed through remote functions and grouped by DP process groups. + +## Developer Workflows +- **Install:** Python 3.11+. Install with Poetry or pip. + - Poetry: `poetry install --with transformers,ray` + - Pip (editable): `pip install -e .[transformers,ray]` +- **Run tests:** + - Unit tests: `python -m unittest tests/infra/test_infra_graph.py` +- **Local single-process dev:** + - Initialize infra: `twinkle.initialize(mode='local', seed=42)` + - Inspect device placement: call `twinkle.infra.get_device_placement()`. +- **Ray Serve demo (HTTP services):** + - Config and launcher: [cookbook/client/server.py](cookbook/client/server.py), [cookbook/client/server_config.yaml](cookbook/client/server_config.yaml) + - Start: + - `python cookbook/client/server.py` + - Endpoints print on startup (default `localhost:8000`). + - Model app binds `MultiLoraTransformersModel` and exposes routes like `/add_adapter_to_model`, `/forward`, `/calculate_loss`, etc. See [src/twinkle/server/twinkle/model.py](src/twinkle/server/twinkle/model.py). 
+- **vLLM inference:** Use `VLLMEngine` with engine args; LoRA weight sync via `patch.vllm_lora_weights`. See [src/twinkle/sampler/vllm_engine.py](src/twinkle/sampler/vllm_engine.py). + +## Conventions & Gotchas +- **Safety:** Remote plugin code requires `trust_remote_code()` true; avoid loading arbitrary strings into adapter configs (enforced in Multi-LoRA). +- **Env-driven ranks:** Many utilities read ranks/world size from env; set `WORLD_SIZE`, `RANK`, `LOCAL_RANK` when using torchrun. +- **Determinism:** `seed_everything(seed, full_determinism)` controls CUDA/NPU determinism; may set envs like `CUDA_LAUNCH_BLOCKING`. +- **Adapter lifecycle:** Server auto-removes inactive adapters (heartbeat required); per-token adapter limits are enforced. See cleanup in [src/twinkle/server/twinkle/model.py](src/twinkle/server/twinkle/model.py). +- **Templates:** Tokenization/encode via `Template` (e.g., `Qwen3Template`), producing `InputFeature` for model forward. See [src/twinkle/template/base.py](src/twinkle/template/base.py). + +## Examples +- **Visualize a custom mesh:** create `DeviceMesh` and call `get_device_placement()`; example in [tests/infra/test_infra_graph.py](tests/infra/test_infra_graph.py). +- **Add LoRA adapter via HTTP:** POST to `/add_adapter_to_model` with serialized `LoraConfig`; see server routes in [src/twinkle/server/twinkle/model.py](src/twinkle/server/twinkle/model.py). +- **Sample with vLLM:** Configure `vLLMSampler`, set `Template`/`Processor`, then `sample()` on `Trajectory` list; see [src/twinkle/sampler/vllm_sampler.py](src/twinkle/sampler/vllm_sampler.py). + +--- +Questions or gaps? Tell us where guidance is unclear (e.g., missing run scripts, Ray cluster setup), and we’ll refine this document. 
diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml new file mode 100644 index 00000000..bd560302 --- /dev/null +++ b/.github/workflows/citest.yaml @@ -0,0 +1,76 @@ +name: citest + +on: + push: + branches: + - master + - "release/**" + paths-ignore: + - "setup.*" + - "requirements.txt" + - "requirements/**" + - "docs/**" + - "tools/**" + - ".dev_scripts/**" + - "README.md" + - "README_*.md" + - "NOTICE" + - ".github/workflows/lint.yaml" + - ".github/workflows/publish.yaml" + + pull_request: + paths-ignore: + - "setup.*" + - "requirements.txt" + - "requirements/**" + - "docs/**" + - "tools/**" + - ".dev_scripts/**" + - "README.md" + - "README_*.md" + - "NOTICE" + - ".github/workflows/lint.yaml" + - ".github/workflows/publish.yaml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + unittest: + # The type of runner that the job will run on + runs-on: [self-hosted] + timeout-minutes: 240 + steps: + - name: ResetFileMode + shell: bash + run: | + # reset filemode to allow action runner to delete files + # generated by root in docker + set -e + source ~/.bashrc + sudo chown -R $USER:$USER $GITHUB_WORKSPACE + + - name: Checkout + uses: actions/checkout@v3 + env: + GIT_CONFIG_PARAMETERS: "'core.hooksPath='" + with: + lfs: 'true' + submodules: 'false' + fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }} + - name: Get changed files + id: changed-files + run: | + if ${{ github.event_name == 'pull_request' }}; then + echo "PR_CHANGED_FILES=$(git diff --name-only -r HEAD^1 HEAD | xargs)" >> $GITHUB_ENV + else + echo "PR_CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.event.after }} | xargs)" >> $GITHUB_ENV + fi + - name: Checkout LFS objects + run: git lfs checkout + - name: Run unittest + shell: bash + run: | + set -e + bash .dev_scripts/dockerci.sh diff --git a/.github/workflows/citest_npu.yaml b/.github/workflows/citest_npu.yaml new file mode 100644 index 
00000000..d48c7421 --- /dev/null +++ b/.github/workflows/citest_npu.yaml @@ -0,0 +1,75 @@ +name: citest-npu + +on: + push: + branches: + - master + - "release/**" + paths-ignore: + - "setup.*" + - "requirements.txt" + - "requirements/**" + - "docs/**" + - "tools/**" + - ".dev_scripts/**" + - "README.md" + - "README_*.md" + - "NOTICE" + - ".github/workflows/lint.yaml" + - ".github/workflows/publish.yaml" + + pull_request: + paths-ignore: + - "setup.*" + - "requirements.txt" + - "requirements/**" + - "docs/**" + - "tools/**" + - ".dev_scripts/**" + - "README.md" + - "README_*.md" + - "NOTICE" + - ".github/workflows/lint.yaml" + - ".github/workflows/publish.yaml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + unittest: + # The type of runner that the job will run on + runs-on: [linux-aarch64-a2-1] + timeout-minutes: 240 + container: + image: 'ascendai/cann:8.3.rc2-910b-ubuntu22.04-py3.11' + steps: + - name: Config mirrors + run: | + sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list + pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local + + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }} + - name: Get changed files + id: changed-files + run: | + if ${{ github.event_name == 'pull_request' }}; then + echo "PR_CHANGED_FILES=$(git diff --name-only -r HEAD^1 HEAD | xargs)" >> $GITHUB_ENV + else + echo "PR_CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.event.after }} | xargs)" >> $GITHUB_ENV + fi + - name: Run unittest + shell: bash + run: | + set -e + export IMAGE_NAME=ascendai/cann + export IMAGE_VERSION=8.3.rc2-910b-ubuntu22.04-py3.11 + export TEST_LEVEL=0 + mkdir -p ~/.cache + export MODELSCOPE_CACHE=~/.cache + export 
CI_COMMAND='bash .dev_scripts/ci_container_test.sh pytest tests' + bash .dev_scripts/dockerci_npu.sh diff --git a/.github/workflows/close_tale_issue.yaml b/.github/workflows/close_tale_issue.yaml new file mode 100644 index 00000000..46a713f1 --- /dev/null +++ b/.github/workflows/close_tale_issue.yaml @@ -0,0 +1,20 @@ +name: Close Stale Issues +on: + schedule: + - cron: '0 0 * * *' + workflow_dispatch: + +jobs: + close-stale: + runs-on: ubuntu-latest + steps: + - name: Close stale issues + uses: actions/stale@v8 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + days-before-stale: 90 + days-before-close: 7 + stale-issue-message: 'This issue has been inactive for over 3 months and will be automatically closed in 7 days. If this issue is still relevant, please reply to this message.' + close-issue-message: 'This issue has been automatically closed due to inactivity. If needed, it can be reopened.' + stale-issue-label: 'stale' + exempt-all-issue-labels: true diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 00000000..771ee4bc --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,22 @@ +name: Lint test + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.11 + uses: actions/setup-python@v2 + with: + python-version: '3.11' + - name: Install pre-commit hook + run: | + pip install pre-commit + - name: Linting + run: pre-commit run --all-files diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml new file mode 100644 index 00000000..bf37a0b4 --- /dev/null +++ b/.github/workflows/publish.yaml @@ -0,0 +1,29 @@ +name: release + +on: + push: + tags: + - 'v**' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-publish + cancel-in-progress: true + +jobs: + build-n-publish: + runs-on: ubuntu-22.04 + #if: 
startsWith(github.event.ref, 'refs/tags') + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.11 + uses: actions/setup-python@v2 + with: + python-version: '3.11' + - name: Install poetry + run: pip install poetry + - name: Build twinkle-kit + run: poetry build + - name: Publish package to PyPI + run: | + pip install twine + twine upload dist/* --skip-existing -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index 3c7cc700..58f495d4 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ wheels/ /package /temp MANIFEST +.locks/ # PyInstaller # Usually these files are written by a python script from a template @@ -134,7 +135,6 @@ wandb/ benchmarks/ eval_output/ eval_outputs/ -transformers/ vlmeval/ my_model/ /data @@ -142,6 +142,7 @@ result/ images /custom/ megatron_output/ +.qoder # Pytorch *.pth @@ -149,3 +150,6 @@ megatron_output/ # ast template ast_index_file.py +test_cookbook/ +/test*.py +swanlog/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 558ddc5a..f1979a9a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,52 +1,44 @@ repos: - - repo: https://github.com/pycqa/flake8.git - rev: 4.0.0 + - repo: https://github.com/pycqa/flake8 + rev: 7.3.0 hooks: - id: flake8 - exclude: | - (?x)^( - thirdparty/| - examples/| - tests/run.py - )$ - - repo: https://github.com/PyCQA/isort.git - rev: 4.3.21 + exclude: ^(examples/|cookbook/|client_tools/|src/twinkle_client/) + + - repo: https://github.com/PyCQA/isort + rev: 7.0.0 hooks: - id: isort - exclude: | - (?x)^( - examples/| - tests/run.py| - swift/cli/sft.py - )$ - - repo: https://github.com/pre-commit/mirrors-yapf.git - rev: v0.30.0 + exclude: ^(examples/|cookbook/|client_tools/|src/twinkle_client/) + + - repo: https://github.com/google/yapf + rev: v0.43.0 hooks: - id: yapf - exclude: | - (?x)^( - thirdparty/| - examples/| - tests/run.py - )$ - - repo: https://github.com/pre-commit/pre-commit-hooks.git - rev: v3.1.0 + exclude: 
^(examples/|cookbook/|client_tools/|src/twinkle_client/) + + - repo: https://github.com/asottile/pyupgrade + rev: v3.19.1 + hooks: + - id: pyupgrade + args: [--py38-plus] + exclude: ^(examples/|cookbook/|client_tools/|src/twinkle_client/) + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 hooks: - id: trailing-whitespace - exclude: thirdparty/|tests/run.py + exclude: ^(client_tools/|src/twinkle_client/) - id: check-yaml - exclude: thirdparty/|tests/run.py + exclude: ^(client_tools/|src/twinkle_client/) - id: end-of-file-fixer - exclude: thirdparty/|tests/run.py + exclude: ^(client_tools/|src/twinkle_client/) - id: requirements-txt-fixer - exclude: thirdparty/|tests/run.py + exclude: ^(client_tools/|src/twinkle_client/) - id: double-quote-string-fixer - exclude: thirdparty/|tests/run.py + exclude: ^(client_tools/|src/twinkle_client/) - id: check-merge-conflict - exclude: thirdparty/|tests/run.py - - id: fix-encoding-pragma - exclude: thirdparty/|tests/run.py - args: ["--remove"] + exclude: ^(client_tools/|src/twinkle_client/) - id: mixed-line-ending - exclude: thirdparty/|tests/run.py args: ["--fix=lf"] + exclude: ^(client_tools/|src/twinkle_client/) diff --git a/.pre-commit-config_local.yaml b/.pre-commit-config_local.yaml deleted file mode 100644 index f6ef27d9..00000000 --- a/.pre-commit-config_local.yaml +++ /dev/null @@ -1,52 +0,0 @@ -repos: - - repo: /home/admin/pre-commit/flake8 - rev: 4.0.0 - hooks: - - id: flake8 - exclude: | - (?x)^( - thirdparty/| - examples/| - tests/run.py - )$ - - repo: /home/admin/pre-commit/isort - rev: 4.3.21 - hooks: - - id: isort - exclude: | - (?x)^( - examples/| - tests/run.py| - swift/cli/sft.py - )$ - - repo: /home/admin/pre-commit/mirrors-yapf - rev: v0.30.0 - hooks: - - id: yapf - exclude: | - (?x)^( - thirdparty/| - examples/| - tests/run.py - )$ - - repo: /home/admin/pre-commit/pre-commit-hooks - rev: v3.1.0 - hooks: - - id: trailing-whitespace - exclude: thirdparty/|tests/run.py - - id: check-yaml - 
exclude: thirdparty/|tests/run.py - - id: end-of-file-fixer - exclude: thirdparty/ - - id: requirements-txt-fixer - exclude: thirdparty/|tests/run.py - - id: double-quote-string-fixer - exclude: thirdparty/|tests/run.py - - id: check-merge-conflict - exclude: thirdparty/|tests/run.py - - id: fix-encoding-pragma - exclude: thirdparty/|tests/run.py - args: ["--remove"] - - id: mixed-line-ending - exclude: thirdparty/|tests/run.py - args: ["--fix=lf"] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9892a2d3..4707d995 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,63 +1,67 @@ -# Contributor Guide +# Contributor Guidelines -_Welcome to offer PRs, bug reports, documentation supplements or other types of contributions to SWIFT!_ +*Welcome to contribute Feature PRs, Bug reports, documentation, or other types of contributions to twinkle!* ## Table of Contents + - [Code of Conduct](#-code-of-conduct) - [Contribution Process](#-contribution-process) -- [Hardware support](#-Hardware-support) +- [Resource Support](#-resource-support) ## 📖 Code of Conduct -Please refer to our [Code of Conduct documentation](./CODE_OF_CONDUCT.md). + +Please refer to our [Code of Conduct document](./CODE_OF_CONDUCT.md). ## 🔁 Contribution Process + ### What We Need -- New Technologies and New Models: SWIFT needs to support more open-source models and datasets, or new technologies that we have not paid attention to. If you are interested please submit a PR to us. -- Technical Propagation: If you are interested in technical propagation, you are welcome to help us write tutorials, documents or videos on any website, and send us the link. -- Community Contribution: You can write technical articles related to SWIFT, and submit them to us. After review and approval, we will publish them on the official ModelScope accounts (Zhihu, WeChat, etc.), with your name assigned. 
+ +- New components: You can contribute excellent components to the twinkle project, or contribute them to the modelhub in the ModelScope/Hugging Face community following the component protocol, making them available for other developers to use +- New kernels: You can contribute low-level kernels to the twinkle project. These kernels can be integrated into models to achieve better training value + +Your contributions will help other developers. Please add your component name, location, and usage documentation link in the Community Components section of the README in your code PR. ### Incentives -- we will issue electronic certificates to contributors on behalf of the ModelScope community, to encourage your selfless contributions. -- We will offer small souvenirs related to the ModelScope Community. -- We will provide free A10 computing power during the development period. For more details, please refer to [Hardware-support](#-Hardware-support) section. - -### Submitting PR (Pull Requests) - -Any feature development is carried out in the form of Fork and then PR on GitHub. -1. Fork: Go to the [ms-swift](https://github.com/modelscope/ms-swift) page and click the **Fork button**. After completion, a SWIFT code repository will be cloned under your personal organization. -2. Clone: Clone the code repository generated in the first step to your local machine and **create a new branch** for development. During development, please click the **Sync Fork button** in time to synchronize with the `main` branch to prevent code expiration and conflicts. -3. Submit PR: After development and testing, push the code to the remote branch. On GitHub, go to the **Pull Requests page**, create a new PR, select your code branch as the source branch, and the `modelscope/swift:main` branch as the target branch. - -4. Write Description: It is necessary to provide a good feature description in the PR, so that the reviewers know the content of your modification. -5. 
Review: We hope that the code to be merged is concise and efficient, so we may raise some questions and discuss them. Please note that any issues raised in the review are aimed at the code itself, not at you personally. Once all issues are discussed and resolved, your code will be approved. - -### Code Standards and Development Approach -SWIFT has conventional variable naming conventions and development approaches. Please follow these approaches as much as possible during development. -1. Variable names are separated by underscores, and class names are named with the first letter of each word capitalized. -2. All Python indentation uses four spaces instead of a tab. -3. Choose well-known open-source libraries, avoid using closed-source libraries or unstable open-source libraries, and avoid repeating the existing code. - -After the PR is submitted, SWIFT will perform two types of tests: -- Code Lint Test: A static code compliance check test. please make sure that you have performed code lint locally in advance. -```shell -pip install pre-commit # In the swift folder -pre-commit run --all-files # Fix the errors reported by pre-commit until all checks are successful -``` -- CI Tests: Smoke tests and unit tests, please refer to the next section. -### Running CI Tests -Before submitting the PR, please ensure that your development code is protected by test cases, such as smoke tests for new features, or unit tests for various edge cases. Reviewers will also pay attention to this during code review. At the same time, there will be dedicated services running CI Tests, running all test cases, and the code can only be merged after the test cases pass. +- We will issue electronic certificates to contributors on behalf of the ModelScope community to acknowledge your selfless contributions. +- We will give away ModelScope community merchandise and small gifts. + +### Submitting PRs (Pull Requests) + +All feature development is conducted on GitHub using a Fork-then-PR workflow. 
+ +1. Fork: Go to the [twinkle](https://github.com/modelscope/twinkle) page and click the **Fork button**. This will clone a twinkle repository under your personal organization + +2. Clone: Clone the repository created in step one to your local machine and **create a new branch** for development. During development, please click the **Sync Fork button** regularly to sync with the `main` branch to prevent code from becoming outdated and causing conflicts -Additionally, since some important tests have been skipped due to long running time, to ensure that your logic is correct, you can run the test locally: -```shell -python tests/llm/test_run.py -``` -Please make sure this test can pass normally. +3. Submit PR: After development and testing are complete, push your code to the remote branch. On GitHub, click the **Pull Requests page** and create a new PR. Select your code branch as the source branch and `modelscope/twinkle:main` as the target branch -## ✅ Hardware support +4. Write Description: It is essential to provide a good feature description in your PR so that reviewers understand your changes + +5. Review: We want the merged code to be clean and efficient, so we may raise some questions for discussion. Please note that any questions raised during review are about the code itself, not about you personally. Once all issues have been discussed and resolved, your code will be approved + +### Code Standards and Development Practices + +twinkle has established conventions for variable naming and development practices. Please try to follow these conventions during development. + +1. Variable names use underscore separation; class names use PascalCase (capitalize the first letter of each word) +2. All Python indentation uses four spaces instead of one tab +3. 
Use well-known open-source libraries; avoid closed-source or unstable open-source libraries; avoid reinventing the wheel + +twinkle runs two types of tests after a PR is submitted: + +- Code Lint Tests: Static code analysis tests. To ensure this test passes, please run Code lint locally beforehand. Here's how: + + ```shell + pip install pre-commit + pre-commit run --all-files + # Fix any errors reported by pre-commit until all checks pass + ``` + +- CI Tests: Smoke tests and unit tests. Please refer to the next section + +### Running CI Tests -SWIFT will provide hardware support for developers, including free GPUs. If needed, please email us ([contact@modelscope.cn](mailto:contact@modelscope.cn)) or join our WeChat group: +Before submitting a PR, please ensure your development code is protected by test cases. For example, smoke tests for new features, or unit tests for various edge cases. Reviewers will also pay attention to this during code review. Additionally, a dedicated service will run CI Tests, executing all test cases. Code can only be merged after all test cases pass. -

- -

+Please ensure these tests pass successfully. diff --git a/CONTRIBUTING_CN.md b/CONTRIBUTING_CN.md index d18ae6e3..cdbc4755 100644 --- a/CONTRIBUTING_CN.md +++ b/CONTRIBUTING_CN.md @@ -1,6 +1,6 @@ # 贡献者指引 -*欢迎帮SWIFT提供Feature PR、Bug反馈、文档补充或其他类型的贡献!* +*欢迎帮twinkle提供Feature PR、Bug反馈、文档补充或其他类型的贡献!* ## 目录 @@ -15,25 +15,26 @@ ## 🔁 贡献流程 ### 我们需要什么 -- 新技术和新模型:SWIFT需要支持更多的开源模型和数据集,或我们没有关注到的新技术,如果您对此有兴趣,可以提交PR给我们。 -- 技术布道:如果您对技术布道有兴趣,欢迎在任何网站上帮我们撰写教程文档或视频等,并将链接发给我们。 -- 社区供稿:您可以撰写和SWIFT有关的技术文章,并供稿给我们,我们审核通过后会在魔搭官方账号(知乎、公众号等)上进行发布,并属上您的名字。 + +- 新组件:您可以将优秀的组件贡献进twinkle项目,或按照组件协议贡献进ModelScope/Hugging Face社区的modelhub中,方便其他开发者使用 +- 新kernels:您可以将底层kernels贡献进twinkle项目中,这些kernels可以被模型集成,实现更好的训练价值 + +您的贡献会帮助到其他开发者,请在代码PR中在README的社区组件章节中增加您的组件名称、位置和使用方法文档链接。 ### 激励 - 我们会以魔搭社区的身份给贡献者颁发电子证书,以鼓励您的无私贡献。 - 我们会赠送相关魔搭社区相关周边小礼品。 -- 我们会赠送开发期间的免费A10算力,具体可以查看[资源支持](#-资源支持)章节。 ### 提交PR(Pull Requests) 任何feature开发都在github上以先Fork后PR的形式进行。 -1. Fork:进入[ms-swift](https://github.com/modelscope/ms-swift)页面后,点击**Fork按钮**执行。完成后会在您的个人组织下克隆出一个SWIFT代码库 +1. Fork:进入[twinkle](https://github.com/modelscope/twinkle)页面后,点击**Fork按钮**执行。完成后会在您的个人组织下克隆出一个twinkle代码库 2. Clone:将第一步产生的代码库clone到本地并**拉新分支**进行开发,开发中请及时点击**Sync Fork按钮**同步`main`分支,防止代码过期并冲突 -3. 提交PR:开发、测试完成后将代码推送到远程分支。在github上点击**Pull Requests页面**,新建一个PR,源分支选择您提交的代码分支,目标分支选择`modelscope/swift:main`分支 +3. 提交PR:开发、测试完成后将代码推送到远程分支。在github上点击**Pull Requests页面**,新建一个PR,源分支选择您提交的代码分支,目标分支选择`modelscope/twinkle:main`分支 4. 撰写描述:在PR中填写良好的feature描述是必要的,让Reviewers知道您的修改内容 @@ -41,19 +42,18 @@ ### 代码规范和开发方式 -SWIFT有约定俗成的变量命名方式和开发方式。在开发中请尽量遵循这些方式。 +twinkle有约定俗成的变量命名方式和开发方式。在开发中请尽量遵循这些方式。 1. 变量命名以下划线分割,类名以所有单词首字母大写方式命名 2. 所有的python缩进都是四个空格取代一个tab 3. 
选用知名的开源库,避免使用闭源库或不稳定的开源库,避免重复造轮子 -SWIFT在PR提交后会进行两类测试: +twinkle在PR提交后会进行两类测试: - Code Lint测试 对代码进行静态规范走查的测试,为保证改测试通过,请保证本地预先进行了Code lint。方法是: ```shell pip install pre-commit - # 在swift文件夹内 pre-commit run --all-files # 对pre-commit报的错误进行修改,直到所有的检查都是成功状态 ``` @@ -64,18 +64,4 @@ SWIFT在PR提交后会进行两类测试: 在提交PR前,请保证您的开发代码已经受到了测试用例的保护。例如,对新功能的冒烟测试,或者各种边缘case的单元测试等。在代码review时Reviewers也会关注这一点。同时,也会有服务专门运行CI Tests,运行所有的测试用例,测试用例通过后代码才可以合并。 -另外,由于运行时间过长,我们跳过了部分重要测试,为保证您的逻辑是正确的,可以在本地执行该测试: - -```shell -python tests/llm/test_run.py -``` - 请保证该测试可以正常通过。 - -## ✅ 资源支持 - -SWIFT会为开发者提供资源支持,包括免费的GPU算力。如果需要请邮件联系我们([contact@modelscope.cn](mailto:contact@modelscope.cn))或加入我们的微信群: - -

- -

diff --git a/README.md b/README.md index e69de29b..7ddd0070 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,364 @@ +

Twinkle: Training workbench to make your model glow

+ +

+ +

+

+by ModelScope +
+ English  |  中文  +

+ +

+ + + + + + +

+ +

+ English Documentation   |   中文文档   +

+ +## ✨ What is Twinkle? + +Twinkle✨ is a lightweight, client-server training framework engineered +with modular, high-cohesion interfaces. Whether you are executing locally +with `torchrun`, or scaling training across Ray clusters, +Twinkle✨ eliminates infrastructure friction by encapsulating +training logic into standardized APIs. Beyond simple +abstraction, Twinkle✨ serves as a robust backend and gateway to enable serverless Training-as-a-Service (TaaS). +It offers interfaces that constitute a _superset_ of [Tinker](https://thinkingmachines.ai/tinker/) APIs, +thereby making it possible to access a Twinkle✨ training service via Tinker client or native Twinkle✨ client +which offers more functionalities. + +🧩 Decoupled Architecture: Standardized Interfaces, backward compatible with Tinker APIs.
+🚀 Multiple Runtime Modes: torchrun / Ray / HTTP.
+🔌 Versatile Backends: Transformers / Megatron.
+👥 Multi-Tenancy Training Service: Train multiple LoRAs that share one base model deployment.
+ +Note: Twinkle✨is built by the team behind [ms-swift](https://github.com/modelscope/ms-swift), and +we expect the two projects to evolve together. We expect some fundamental components in Twinkle✨will likely +be reused in [ms-swift](https://github.com/modelscope/ms-swift). + +| Twinkle Wechat Group | +|:------------------------------------------------------:| +| | + +## Installation + +### Install with package: + +```shell +pip install 'twinkle-kit' +``` + +### Install from Source: + +```shell +git clone https://github.com/modelscope/twinkle.git +cd twinkle +pip install -e . +``` + +## Tutorials + +| Training Type | Model Framework | Cookbook Path | +| --------------------------------- | --------------- | ------------------------------------------------- | +| FSDP finetuning | transformers | [Script](cookbook/transformers/fsdp2.py) | +| FSDP MoE finetuning | transformers | [Script](cookbook/transformers/fsdp2_moe.py) | +| ep/sp FSDP MoE finetuning | transformers | [Script](cookbook/transformers/ep_fsdp_qwen3_moe.py) | +| EP MoE finetuning | transformers | [Script](cookbook/transformers/ep_fsdp_qwen3_moe.py) | +| pp/tp/cp finetuning | megatron | [Script](cookbook/megatron/tp.py) | +| pp/tp/cp MoE finetuning | megatron | [Script](cookbook/megatron/tp_moe.py) | +| tinker client finetuning | megatron | [Script](cookbook/client/tinker/megatron) | +| tinker client finetuning/sampling | transformers | [Script](cookbook/client/tinker/transformer) | +| twinkle client finetuning | megatron | [Script](cookbook/client/twinkle/megatron) | +| twinkle client finetuning | transformer | [Script](cookbook/client/twinkle/transformer) | + +## Changelog + +- 🎉2026-02-13 Initial version of Twinkle✨ released, including SFT/PT/RL support for text models and serverless training capabilities on [ModelScope](https://modelscope.cn). + +## Training as a Service on ModelScope + +We are rolling out training service built atop Twinkle✨ on ModelScope. It is currently in _Beta_. 
You may +sign up for free access by joining the [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) organization, and +train via API endpoint `base_url=https://www.modelscope.cn/twinkle`. For more details, please refer to +our [documentation](docs/source_en/Usage%20Guide/ModelScope-Official-Resources.md). + +## Supported Hardware + +| Hardware Environment | Notes | +| -------------------- | ---------------------------------------------------------------- | +| Nvidia GPUs | ✅ Support for BF16/Flash-Attn may be incomplete in earlier GPUs | +| Ascend NPU | ✅ Some operators may not supported | +| PPU | ✅ | +| CPU | Supports partial components like dataset, dataloader | + +## Supported Models + +We will be adding support for more models as new models are released. The following table lists current models +supported on Twinkle✨ framework. + +>[!Note] +> For serverless training service accessed via `base_url=https://www.modelscope.cn/twinkle`, it currently supports +> one training base at a time, and currently it is [Qwen3-30B-A3B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507). 
+ + +| Model Type | Model ID on [ModelScope](https://modelscope.cn) | Requires | Megatron Support | HF Model ID | +| ------------------- |--------------------------------------------------------------------------------------------------------------------------| -------------------- | ---------------- | ---------------------------------------------------------------------------------------------------------- | +| qwen3 series | [Qwen/Qwen3-0.6B-Base](https://modelscope.cn/models/Qwen/Qwen3-0.6B-Base)~32B | transformers>=4.51 | ✅ | [Qwen/Qwen3-0.6B-Base](https://huggingface.co/Qwen/Qwen3-0.6B-Base) | +| qwen3_moe series | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | transformers>=4.51 | ✅ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) | +| | [Qwen/Qwen3-30B-A3B](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B)~235B | transformers>=4.51 | ✅ | [Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) | +| qwen2 series | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) ~72B | transformers>=4.37 | ✅ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) | +| | [Qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct)~72B | transformers>=4.37 | ✅ | [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | +| | [Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B)~72B | transformers>=4.37 | ✅ | [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) | +| qwen2_moe series | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat) | transformers>=4.40 | ✅ | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) | +| chatglm4 series | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat) | +| | 
[ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b) | transformers>=4.42 | ✘ | [zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b) | +| glm_edge series | [ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat) | transformers>=4.46 | ✘ | [zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat) | +| | [ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat) | transformers>=4.46 | ✘ | [zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat) | +| internlm2 series | [Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b) | transformers>=4.38 | ✘ | [internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b) | +| | [Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b) | transformers>=4.38 | ✘ | [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) | +| deepseek_v1 | [deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat) | transformers>=4.39.4 | ✅ | —— | +| | [deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite) | transformers>=4.39.3 | ✅ | [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | +| | [deepseek-ai/DeepSeek-V2.5](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2.5) | transformers>=4.39.3 | ✅ | [deepseek-ai/DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5) | +| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | transformers>=4.39.3 | ✅ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | +| deepSeek-r1-distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) ~32B | transformers>=4.37 | ✅ | 
[deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | + +For a more detailed model support list 👉 [Quick Start.md](https://github.com/modelscope/twinkle/blob/dev/docs/source/%E4%BD%BF%E7%94%A8%E6%8C%87%E5%BC%95/%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B.md) + +## Sample Code + +### Train with Ray + +```python +from peft import LoraConfig +import twinkle +from twinkle import DeviceMesh, DeviceGroup +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +device_group = [DeviceGroup(name='default',ranks=8,device_type='cuda')] +device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2) +# local for torchrun +twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_mesh) + + +def train(): + # to load model from Hugging Face, use 'hf://...' + base_model = 'ms://Qwen/Qwen2.5-7B-Instruct' + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id=base_model) + # Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) + # Encode dataset + dataset.encode() + # Global batch size = 8, for GPUs, so 1 sample per GPU + dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) + # Use a TransformersModel + model = TransformersModel(model_id=base_model, remote_group='default') + + lora_config = LoraConfig( + r=8, + lora_alpha=32, + target_modules='all-linear' + ) + + # Add a lora to model, with name `default` + # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='AdamW', lr=1e-4) + # Add LRScheduler for lora 
`default` + model.set_lr_scheduler(scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, + num_training_steps=len(dataloader)) + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 20 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + print(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() +``` + +### Using Tinker-Like API + +```python +import os +from tqdm import tqdm +from tinker import types +from twinkle_client import init_tinker_compat_client +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.preprocessor import SelfCognitionProcessor +from twinkle.server.tinker.common import input_feature_to_datum + +base_model = 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507' +base_url='http://www.modelscope.cn/twinkle' +api_key=os.environ.get('MODELSCOPE_TOKEN') + +# Use twinkle dataset to load the data +dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) +dataset.set_template('Template', model_id=base_model, max_length=256) +dataset.map(SelfCognitionProcessor('twinkle Model', 'twinkle Team'), load_from_cache_file=False) +dataset.encode(batched=True, load_from_cache_file=False) +dataloader = DataLoader(dataset=dataset, batch_size=8) + +# Initialize tinker client +service_client = init_tinker_compat_client(base_url, api_key) +training_client = service_client.create_lora_training_client(base_model=base_model[len('ms://'):], rank=16) + +# Training loop: use input_feature_to_datum to transfer the input format +for epoch in range(3): + for step, batch in tqdm(enumerate(dataloader)): + input_datum = [input_feature_to_datum(input_feature) for input_feature in batch] + + fwdbwd_future = training_client.forward_backward(input_datum, "cross_entropy") + optim_future = 
training_client.optim_step(types.AdamParams(learning_rate=1e-4)) + + fwdbwd_result = fwdbwd_future.result() + optim_result = optim_future.result() + + training_client.save_state(f"twinkle-lora-{epoch}").result() +``` + +## Architecture Design + + + + **Twinkle✨** features a decoupled **Client-Server architecture** designed for maximum flexibility. + The client-side provides two distinct integration paths: + +* **Twinkle✨ Native:** A conforming API that mirrors the server-side interface for seamless end-to-end integration. +* **Tinker Compatibility:** Full support for the native Tinker API, enabling developers to leverage Twinkle✨’s backend using Tinker client. + +This dual-path design ensures access to Twinkle✨’s training services using Tinker API, with a simple modification of the Tinker base URL. + +## Multi-Tenancy + +**Twinkle✨** supports simultaneous multi-tenant training on a shared base model. Leveraging a **LoRA Pool + Tenant Application** architecture, Twinkle enables up to **N tenants** to train in parallel with complete isolation. This design offers unprecedented flexibility: from the model's perspective, each tenant's session is distinct, supporting heterogeneous configurations including unique **data padding strategies, optimizers, and loss functions**—all running concurrently on the same base model. + +*Note: This feature is currently optimized for [LoRA](https://github.com/huggingface/peft).* + + + +For example: + +- Tenant A: Load local private dataset locally, LoRA rank=8, using base model for SFT +- Tenant B: Load open-source dataset from Hub remotely, LoRA rank=32, using base model for PT +- Tenant C: Use base model for GRPO loss calculation, using Sampler for sampling +- Tenant D: Use base model for logps inference + +These processes are executed concurrently on a single base model because the **Model and Sampler** +are integrated as **task-agnostic components** within the Twinkle✨ ecosystem. 
+Upon completion, checkpoints are automatically pushed to **ModelScope** or **HuggingFace** repositories +(private by default). On the server side, Twinkle✨ provides a robust multi-tenant suite +featuring **automated cluster management** and **dynamic scaling**, making it the +foundation for building customizable, enterprise-grade training services. + +> As a modular framework, Twinkle✨ also supports remote temporary exclusive training, i.e., training in full-parameter mode. + +## 🛠️ Twinkle✨ Modular Ecosystem + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Dataset
Data loading and preprocessing

+
+

Template
Encoding and decoding

+
+

DataLoader
Data distribution and batching

+
+

Preprocessor
Data ETL

+
+

InputProcessor
Task-specific input processing

+
+

Model
Large models; supports multiple frameworks

+
+

Sampler
Sampling logic

+
+

Loss
Loss functions

+
+

Metric
Training metrics collection

+
+

Reward
Reward function

+
+

Advantage
Advantage function

+
+

CheckpointEngine
Weight synchronization

+
+

Patch
Patches for model fixes

+
+

Module
Components, e.g., Optimizer

+
+

Kernel
Operators

+
+

Server
Starts the backend cluster

+
+

Client
Client code

+
+

Infra
Isolates Ray and torchrun differences

+
+

Plugin
Uses hub components

+
+

Hub
Interface with HF/MS libraries

+
+
+ +## Community Components + +| Component Type | Component Link | Component Function | Author | +| -------------- | -------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------- | ------------------- | +| Patch | [qwen3_moe_transformers4_patch](https://www.modelscope.cn/models/twinkle-kit/qwen3_moe_transformers4_patch) | Fixes Qwen3 MoE model hang issue during FSDP2 training, effective for transformers==4.x | ModelScope Official | + +## Acknowledgements + +This project is maintained and supported by multiple teams under Workshop: + +- ModelScope Team +- CMB-Tech Team + +Twinkle is built on the shoulders of giants, including [Transformers](https://github.com/huggingface/transformers),[MS-SWIFT](https://github.com/modelscope/swift), [veRL](https://github.com/verl-project/verl), and other excellent projects. diff --git a/README_ZH.md b/README_ZH.md new file mode 100644 index 00000000..73bf9cac --- /dev/null +++ b/README_ZH.md @@ -0,0 +1,342 @@ +# Twinkle: Training workbench to make your model glow + +

+ +

+

+ModelScope +
+ English  |  中文  +

+ +

+ + + + + + +

+ +

+ 英文文档   |   中文文档   +

+ +## ✨ Twinkle 是什么? + +Twinkle✨ 是一个轻量级的客户端-服务端训练框架,采用模块化、高内聚的接口设计。无论你是使用 `torchrun` 在本地执行,还是跨 Ray 集群扩展训练,Twinkle✨ 通过将训练逻辑封装成标准化 API 来消除基础设施层面的摩擦。除了简单的抽象之外,Twinkle✨ 还作为强大的后端和网关,实现无服务器训练即服务(TaaS)。它提供的接口是 [Tinker](https://thinkingmachines.ai/tinker/) API 的_超集_,因此可以通过 Tinker 客户端或原生 Twinkle✨ 客户端(提供更多功能)来访问 Twinkle✨ 训练服务。 + +🧩 解耦架构:标准化接口,向后兼容 Tinker API。
+🚀 多种运行模式:torchrun / Ray / HTTP。
+🔌 多样化后端:Transformers / Megatron。
+👥 多租户训练服务:在共享一个基础模型部署的情况下训练多个 LoRA。
+ +注意:Twinkle✨ 由 [ms-swift](https://github.com/modelscope/ms-swift) 背后的团队构建,我们期望这两个项目能够共同发展。我们预计 Twinkle✨ 中的一些基础组件将可能被 [ms-swift](https://github.com/modelscope/ms-swift) 复用。 + +| 魔搭社区twinkle算法交流群 | +|:------------------------------------------------------:| +| | + +## 安装 + +### 使用包安装: + +```shell +pip install 'twinkle-kit' +``` + +### 从源码安装: + +```shell +git clone https://github.com/modelscope/twinkle.git +cd twinkle +pip install -e . +``` + +## 教程 + +| 训练类型 | 模型框架 | Cookbook 路径 | +| ---------------------------- | -------- | ------------------------------------------------- | +| FSDP 微调 | transformers | [脚本](cookbook/transformers/fsdp2.py) | +| FSDP MoE 微调 | transformers | [脚本](cookbook/transformers/fsdp2_moe.py) | +| EP MoE 微调 | transformers | [脚本](cookbook/transformers/ep_fsdp_qwen3_moe.py) | +| pp/tp/cp 微调 | megatron | [脚本](cookbook/megatron/tp.py) | +| pp/tp/cp MoE 微调 | megatron | [脚本](cookbook/megatron/tp_moe.py) | +| tinker 客户端微调 | megatron | [脚本](cookbook/client/tinker/megatron) | +| tinker 客户端微调/采样 | transformers | [脚本](cookbook/client/tinker/transformer) | +| twinkle 客户端微调 | megatron | [脚本](cookbook/client/twinkle/megatron) | +| twinkle 客户端微调 | transformer | [脚本](cookbook/client/twinkle/transformer) | + +## 更新日志 + +- 🎉2026-02-13 Twinkle✨ 初始版本发布,包括对文本模型的 SFT/PT/RL 支持以及在 [ModelScope](https://modelscope.cn) 上的无服务器训练能力。 + +## ModelScope 的训练服务 + +我们正在 ModelScope 上推出基于 Twinkle✨ 构建的训练服务。目前处于 _Beta_ 阶段。你可以通过加入 [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) 组织来注册免费访问,并通过 API 端点 `base_url=https://www.modelscope.cn/twinkle` 进行训练。更多详情请参阅我们的[文档](docs/source_zh/使用指引/训练服务.md)。 + +## 支持的硬件 + +| 硬件环境 | 备注 | +| -------- | --------------------------------------------------------------- | +| Nvidia GPU | ✅ 早期 GPU 对 BF16/Flash-Attn 的支持可能不完整 | +| 昇腾 NPU | ✅ 部分算子可能不支持 | +| PPU | ✅ | +| CPU | 支持部分组件如 dataset、dataloader | + +## 支持的模型 + +随着新模型的发布,我们将添加对更多模型的支持。下表列出了 Twinkle✨ 框架当前支持的模型。 + +>[!注意] +> 对于通过 `base_url=https://www.modelscope.cn/twinkle` 
访问的无服务器训练服务,目前一次只支持一个训练基座,当前是 [Qwen3-30B-A3B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507)。 + + +| 模型类型 | [ModelScope](https://modelscope.cn) 上的模型 ID | 要求 | Megatron 支持 | HF 模型 ID | +| ----------------- |--------------------------------------------------------------------------------------------------------------------------| -------------------- | -------------- | ---------------------------------------------------------------------------------------------------------- | +| qwen3 系列 | [Qwen/Qwen3-0.6B-Base](https://modelscope.cn/models/Qwen/Qwen3-0.6B-Base)~32B | transformers>=4.51 | ✅ | [Qwen/Qwen3-0.6B-Base](https://huggingface.co/Qwen/Qwen3-0.6B-Base) | +| qwen3_moe 系列 | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | transformers>=4.51 | ✅ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) | +| | [Qwen/Qwen3-30B-A3B](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B)~235B | transformers>=4.51 | ✅ | [Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) | +| qwen2 系列 | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) ~72B | transformers>=4.37 | ✅ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) | +| | [Qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct)~72B | transformers>=4.37 | ✅ | [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | +| | [Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B)~72B | transformers>=4.37 | ✅ | [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) | +| qwen2_moe 系列 | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat) | transformers>=4.40 | ✅ | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) | +| chatglm4 系列 | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) | transformers>=4.42 | ✘ | 
[zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat) | +| | [ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b) | transformers>=4.42 | ✘ | [zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b) | +| glm_edge 系列 | [ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat) | transformers>=4.46 | ✘ | [zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat) | +| | [ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat) | transformers>=4.46 | ✘ | [zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat) | +| internlm2 系列 | [Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b) | transformers>=4.38 | ✘ | [internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b) | +| | [Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b) | transformers>=4.38 | ✘ | [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) | +| deepseek_v1 | [deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat) | transformers>=4.39.4 | ✅ | —— | +| | [deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite) | transformers>=4.39.3 | ✅ | [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | +| | [deepseek-ai/DeepSeek-V2.5](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2.5) | transformers>=4.39.3 | ✅ | [deepseek-ai/DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5) | +| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | transformers>=4.39.3 | ✅ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | +| deepSeek-r1-distill | 
[deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) ~32B | transformers>=4.37 | ✅ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | + +更详细的模型支持列表 👉 [快速开始.md](https://github.com/modelscope/twinkle/blob/dev/docs/source/%E4%BD%BF%E7%94%A8%E6%8C%87%E5%BC%95/%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B.md) + +## 示例代码 + +### 使用 Ray 训练 + +```python +from peft import LoraConfig +import twinkle +from twinkle import DeviceMesh, DeviceGroup +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +device_group = [DeviceGroup(name='default',ranks=8,device_type='cuda')] +device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2) +# local for torchrun +twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_mesh) + + +def train(): + # to load model from Hugging Face, use 'hf://...' 
+ base_model = 'ms://Qwen/Qwen2.5-7B-Instruct' + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id=base_model) + # Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) + # Encode dataset + dataset.encode() + # Global batch size = 8, for GPUs, so 1 sample per GPU + dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) + # Use a TransformersModel + model = TransformersModel(model_id=base_model, remote_group='default') + + lora_config = LoraConfig( + r=8, + lora_alpha=32, + target_modules='all-linear' + ) + + # Add a lora to model, with name `default` + # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='AdamW', lr=1e-4) + # Add LRScheduler for lora `default` + model.set_lr_scheduler(scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, + num_training_steps=len(dataloader)) + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 20 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + print(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() +``` + +### 使用类 Tinker API + +```python +import os +from tqdm import tqdm +from tinker import types +from twinkle_client import init_tinker_compat_client +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.preprocessor import SelfCognitionProcessor +from twinkle.server.tinker.common import input_feature_to_datum + +base_model = 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507' 
+base_url='http://www.modelscope.cn/twinkle' +api_key=os.environ.get('MODELSCOPE_TOKEN') + +# Use twinkle dataset to load the data +dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) +dataset.set_template('Template', model_id=base_model, max_length=256) +dataset.map(SelfCognitionProcessor('twinkle Model', 'twinkle Team'), load_from_cache_file=False) +dataset.encode(batched=True, load_from_cache_file=False) +dataloader = DataLoader(dataset=dataset, batch_size=8) + +# Initialize tinker client +service_client = init_tinker_compat_client(base_url, api_key) +training_client = service_client.create_lora_training_client(base_model=base_model[len('ms://'):], rank=16) + +# Training loop: use input_feature_to_datum to transfer the input format +for epoch in range(3): + for step, batch in tqdm(enumerate(dataloader)): + input_datum = [input_feature_to_datum(input_feature) for input_feature in batch] + + fwdbwd_future = training_client.forward_backward(input_datum, "cross_entropy") + optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4)) + + fwdbwd_result = fwdbwd_future.result() + optim_result = optim_future.result() + + training_client.save_state(f"twinkle-lora-{epoch}").result() +``` + +## 架构设计 + + + +**Twinkle✨** 采用解耦的**客户端-服务端架构**设计,以实现最大的灵活性。客户端提供两种不同的集成路径: + +* **Twinkle✨ 原生:** 符合服务端接口的 API,实现无缝的端到端集成。 +* **Tinker 兼容:** 完全支持原生 Tinker API,使开发者能够使用 Tinker 客户端来利用 Twinkle✨ 的后端。 + +这种双路径设计确保可以使用 Tinker API 访问 Twinkle✨ 的训练服务,只需简单修改 Tinker 的 base URL。 + +## 多租户 + +**Twinkle✨** 支持在共享基础模型上同时进行多租户训练。利用 **LoRA 池 + 租户应用** 架构,Twinkle 能够让多达 **N 个租户** 在完全隔离的情况下并行训练。这种设计提供了前所未有的灵活性:从模型的角度来看,每个租户的会话是独立的,支持异构配置,包括独特的**数据填充策略、优化器和损失函数**——所有这些都在同一个基础模型上并发运行。 + +*注意:此功能目前针对 [LoRA](https://github.com/huggingface/peft) 进行了优化。* + + + +例如: + +- 租户 A:在本地加载私有数据集,LoRA rank=8,使用基础模型进行 SFT +- 租户 B:从 Hub 远程加载开源数据集,LoRA rank=32,使用基础模型进行 PT +- 租户 C:使用基础模型进行 GRPO 损失计算,使用 Sampler 进行采样 +- 租户 D:使用基础模型进行 logps 推理 + 
+这些过程在单个基础模型上并发执行,因为**模型和采样器**作为 Twinkle✨ 生态系统中的**任务无关组件**被集成。完成后,检查点会自动推送到 **ModelScope** 或 **HuggingFace** 仓库(默认为私有)。在服务端,Twinkle✨ 提供强大的多租户套件,具备**自动化集群管理**和**动态扩展**功能,使其成为构建可定制、企业级训练服务的基础。 + +> 作为模块化框架,Twinkle✨ 也支持远程临时独占训练,即全参数模式训练。 + +## 🛠️ Twinkle✨ 模块化生态系统 + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Dataset
数据加载和预处理

+
+

Template
编码和解码

+
+

DataLoader
数据分发和批处理

+
+

Preprocessor
数据 ETL

+
+

InputProcessor
任务特定的输入处理

+
+

Model
大模型,支持多种框架

+
+

Sampler
采样逻辑

+
+

Loss
损失函数

+
+

Metric
训练指标收集

+
+

Reward
奖励函数

+
+

Advantage
优势函数

+
+

CheckpointEngine
权重同步

+
+

Patch
模型修复补丁

+
+

Module
可复用组件,如优化器

+
+

Kernel
算子

+
+

Server
启动后端集群

+
+

Client
客户端代码

+
+

Infra
屏蔽 ray 与 torchrun 之间的差异

+
+

Plugin
从 hub 加载并使用组件

+
+

Hub
与 HF/MS 库对接

+
+
+ +## 社区组件 + +| 组件类型 | 组件链接 | 组件功能 | 作者 | +| -------- | -------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- | ----------------- | +| Patch | [qwen3_moe_transformers4_patch](https://www.modelscope.cn/models/twinkle-kit/qwen3_moe_transformers4_patch) | 修复 Qwen3 MoE 模型在 FSDP2 训练期间挂起的问题,适用于 transformers==4.x | ModelScope 官方 | + +## 致谢 + +本项目由 Workshop 组织下的多个团队共同维护和支持: + +- ModelScope官方团队 +- 招商银行开源技术团队 + +Twinkle 的构建基于多个优秀的开源项目,包括 [Transformers](https://github.com/huggingface/transformers)、[MS-SWIFT](https://github.com/modelscope/swift)、[veRL](https://github.com/verl-project/verl) 等。 diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 00000000..9294fc5d --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,88 @@ +# 0.1版本release + +## 中文 + +### 基础能力 + +- [x] 支持transformers模型 +- [x] 支持megatron模型 +- [x] 支持vLLM采样器 +- [x] 支持dataset、dataloader、reward、advantage、权重同步等基本组件 +- [x] 支持数据集packing、padding_free、流式数据集 +- [x] 支持纯文本模型的PT/SFT +- [x] 支持纯文本模型的GRPO +- [x] 支持kernels +- [x] 兼容NPU生态 + +### 网络能力 + +- [x] 支持多LoRA租户 +- [x] 支持twinkle client训练 +- [x] 支持tinker API的兼容性 +- [x] 支持租户资源控制、水位控制 +- [x] 支持checkpoint的保存上传、下载 +- [x] 支持魔搭免费训练集群 + +## English + +### Core Capabilities + +- [x] Support for Transformers models +- [x] Support for Megatron models +- [x] Support for vLLM sampler +- [x] Support for basic components including dataset, dataloader, reward, advantage, and weight synchronization +- [x] Support for dataset packing, padding-free, and streaming datasets +- [x] Support for PT/SFT of text-only models +- [x] Support for GRPO of text-only models +- [x] Support for kernels +- [x] Compatibility with NPU ecosystem + +### Networking Capabilities + +- [x] Support for multi-LoRA tenants +- [x] Support for Twinkle client training +- [x] Support for Tinker API compatibility +- [x] Support for tenant resource control and watermark control +- [x] Support for 
checkpoint saving, uploading, and downloading +- [x] Support for ModelScope free training cluster + + +# 0.2版本待开发 + +## 中文 + +### 基础能力 + +- [ ] 支持多模态模型 +- [ ] 支持megatron VPP +- [ ] 支持liger kernel +- [ ] 支持transformers模型的ulysses/ring-attention +- [ ] 兼容transformers v5的tp、pp +- [ ] 支持多轮RL +- [ ] 支持gym训练 +- [ ] 支持GAPO、GSPO算法 +- [ ] 支持GKD、on-policy-distill等蒸馏算法 +- [ ] 支持DPO对齐训练 +- [ ] 支持colocate RL训练 +- [ ] Preprocess支持batched + +### 网络能力 + +## English + +### Core Capabilities + +- [ ] Support for multimodal models +- [ ] Support for Megatron VPP +- [ ] Support for Liger kernel +- [ ] Support for Ulysses/Ring-Attention for Transformers models +- [ ] Compatibility with Transformers v5 TP and PP +- [ ] Support for multi-turn RL +- [ ] Support for Gym training +- [ ] Support for GAPO and GSPO algorithms +- [ ] Support for distillation algorithms such as GKD and on-policy distillation +- [ ] Support for DPO alignment training +- [ ] Support for colocate RL training +- [ ] Support for batched preprocessing + +### Networking Capabilities diff --git a/assets/framework.jpg b/assets/framework.jpg new file mode 100644 index 00000000..38e5110a Binary files /dev/null and b/assets/framework.jpg differ diff --git a/assets/multi_lora.png b/assets/multi_lora.png new file mode 100644 index 00000000..a299d801 Binary files /dev/null and b/assets/multi_lora.png differ diff --git a/assets/slogan.png b/assets/slogan.png new file mode 100644 index 00000000..c07888f4 Binary files /dev/null and b/assets/slogan.png differ diff --git a/assets/wechat.jpg b/assets/wechat.jpg new file mode 100644 index 00000000..61ef26b0 Binary files /dev/null and b/assets/wechat.jpg differ diff --git a/client_tools/client_generator.py b/client_tools/client_generator.py new file mode 100644 index 00000000..c337c464 --- /dev/null +++ b/client_tools/client_generator.py @@ -0,0 +1,871 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. 
+import ast +from pathlib import Path +from typing import Dict, List, Set, Tuple + +AUTO_GEN_WARNING = """# ============================================================================ +# WARNING: AUTO-GENERATED FILE - DO NOT MODIFY MANUALLY! +# ============================================================================ +# This file is automatically generated by client_tools/client_generator.py +# Any manual changes will be overwritten when the generator runs again. +# +# To update this file: +# 1. Modify the source files in src/twinkle/ +# 2. Run: python client_tools/client_generator.py +# ============================================================================ +""" + + +def generate_processors(): + """Generate client wrappers for all classes with @remote_function methods.""" + + # Module mapping: module_name -> directory in src/twinkle + module_mapping = { + 'dataloader': 'dataloader', + 'dataset': 'dataset', + 'processor': 'processor', + 'reward': 'reward', + 'template': 'template', + 'weight_loader': 'weight_loader', + } + + # Map module names to processor types in the server + processor_type_mapping = { + 'dataloader': 'dataloader', + 'dataset': 'dataset', + 'processor': 'processor', + 'reward': 'reward', + 'template': 'template', + 'weight_loader': 'weight_loader', + } + + # Get the project root directory + project_root = Path(__file__).parent.parent + src_twinkle_path = project_root / 'src' / 'twinkle' + src_client_path = project_root / 'src' / 'twinkle_client' + + def get_method_signature(func_node: ast.FunctionDef) -> str: + """Extract method signature from AST node.""" + args = [] + + # Regular arguments + for i, arg in enumerate(func_node.args.args): + if arg.arg == 'self': + continue + + # Get argument name + arg_str = arg.arg + + # Get type annotation if available + if arg.annotation: + try: + arg_str += f': {ast.unparse(arg.annotation)}' + except: + pass + + # Get default value if available + defaults_offset = len(func_node.args.args) - 
len(func_node.args.defaults) + if i >= defaults_offset: + default_idx = i - defaults_offset + try: + default_val = ast.unparse(func_node.args.defaults[default_idx]) + arg_str += f' = {default_val}' + except: + pass + + args.append(arg_str) + + # *args + if func_node.args.vararg: + vararg_str = f'*{func_node.args.vararg.arg}' + if func_node.args.vararg.annotation: + try: + vararg_str += f': {ast.unparse(func_node.args.vararg.annotation)}' + except: + pass + args.append(vararg_str) + + # **kwargs + if func_node.args.kwarg: + kwarg_str = f'**{func_node.args.kwarg.arg}' + if func_node.args.kwarg.annotation: + try: + kwarg_str += f': {ast.unparse(func_node.args.kwarg.annotation)}' + except: + pass + args.append(kwarg_str) + + return ', '.join(args) + + def extract_typing_imports(signatures: List[str]) -> Set[str]: + """Extract required typing imports from signatures.""" + typing_patterns = { + 'Union[': 'Union', + 'Optional[': 'Optional', + 'List[': 'List', + 'Dict[': 'Dict', + 'Tuple[': 'Tuple', + 'Type[': 'Type', + 'Any': 'Any', + 'Callable': 'Callable', + 'Literal[': 'Literal', + 'Required[': 'Required', + 'Set[': 'Set', + 'TypedDict': 'TypedDict', + } + + all_text = ' '.join(signatures) + return {name for pattern, name in typing_patterns.items() if pattern in all_text} + + def extract_twinkle_imports(signatures: List[str]) -> Set[str]: + """Extract required twinkle imports from signatures.""" + twinkle_patterns = { + 'InputFeature': ['from twinkle.data_format import InputFeature'], + 'Trajectory': ['from twinkle.data_format import Trajectory'], + 'DataFilter': ['from twinkle.preprocessor import DataFilter'], + 'Preprocessor': ['from twinkle.preprocessor import Preprocessor'], + 'DatasetMeta': ['from twinkle.dataset import DatasetMeta'], + 'Dataset': ['from twinkle.dataset import Dataset'], + 'DeviceMesh': ['from twinkle import DeviceMesh'], + 'Template': ['from twinkle.template import Template'], + 'template.Template': ['from twinkle.template import Template', 'from 
twinkle import template'], + 'processor.InputProcessor': + ['from twinkle.processor import InputProcessor', 'from twinkle import processor'], + 'InputProcessor': ['from twinkle.processor import InputProcessor'], + } + + all_text = ' '.join(signatures) + imports = set() + for pattern, stmts in twinkle_patterns.items(): + if pattern in all_text: + imports.update(stmts) + + return imports + + def parse_params_from_signature(signature: str) -> List[str]: + """Parse parameter names from signature, handling nested brackets.""" + params = [] + current = '' + depth = 0 + + for char in signature + ',': + if char in '[(': + depth += 1 + elif char in '])': + depth -= 1 + + if char == ',' and depth == 0: + name = current.split(':')[0].split('=')[0].strip() + if name and name != 'self' and not name.startswith('*'): + params.append(name) + current = '' + else: + current += char + + return params + + def find_classes_with_remote_methods(file_path: Path) -> List[Tuple[str, str, List[Tuple[str, str]]]]: + """Find all classes that have @remote_function decorated methods.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + tree = ast.parse(f.read(), filename=str(file_path)) + except Exception as e: + print(f'Error parsing {file_path}: {e}') + return [] + + def has_remote_decorator(func: ast.FunctionDef) -> bool: + for dec in func.decorator_list: + if isinstance(dec, ast.Name) and dec.id == 'remote_function': + return True + if isinstance(dec, ast.Call): + func_node = dec.func + if isinstance(func_node, ast.Name) and func_node.id == 'remote_function': + return True + if isinstance(func_node, ast.Attribute) and func_node.attr == 'remote_function': + return True + return False + + def is_public_or_dunder(name: str) -> bool: + return (name.startswith('__') and name.endswith('__')) or not name.startswith('_') + + def get_base_name(node: ast.ClassDef) -> str: + if not node.bases: + return 'object' + base = node.bases[0] + if isinstance(base, ast.Name): + return base.id + if 
isinstance(base, ast.Attribute): + return base.attr + return 'object' + + classes_found = [] + for node in ast.walk(tree): + if not isinstance(node, ast.ClassDef): + continue + + methods = [ + (item.name, get_method_signature(item)) for item in node.body + if isinstance(item, ast.FunctionDef) and has_remote_decorator(item) and is_public_or_dunder(item.name) + ] + + # Extract __init__ signature separately (it may not have @remote_function) + init_signature = '' + for item in node.body: + if isinstance(item, ast.FunctionDef) and item.name == '__init__': + init_signature = get_method_signature(item) + break + + if methods: + classes_found.append((node.name, get_base_name(node), methods, init_signature)) + + return classes_found + + def generate_client_class(class_name: str, + base_class_name: str, + methods: List[Tuple[str, str]], + module_name: str, + processor_type: str, + source_filename: str, + has_base_file: bool, + init_signature: str = '') -> str: + """Generate client wrapper class code.""" + + def build_imports() -> Tuple[List[str], str]: + # Include both method signatures and __init__ signature for import detection + signatures = [sig for _, sig in methods] + if init_signature: + signatures.append(init_signature) + + typing_imports = extract_typing_imports(signatures) + twinkle_imports = extract_twinkle_imports(signatures) + + lines = [] + if typing_imports: + lines.append(f"from typing import {', '.join(sorted(typing_imports))}") + lines.extend([ + 'from twinkle_client.http import http_post, heartbeat_manager', + ]) + lines.extend(sorted(twinkle_imports)) + + if source_filename == 'base': + inheritance = 'object' + elif base_class_name == 'IterableDataset': + lines.append('from torch.utils.data import IterableDataset') + inheritance = 'IterableDataset' + elif has_base_file and base_class_name != 'object': + lines.append(f'from .base import {base_class_name}') + inheritance = base_class_name + else: + inheritance = 'object' + + lines.append('') + return 
lines, inheritance + + def build_method(name: str, signature: str) -> str: + param_names = parse_params_from_signature(signature) + kwargs_dict = '{' + ', '.join(f"'{p}': {p}" for p in param_names) + '}' if param_names else '{}' + sig_part = f', {signature}' if signature else '' + if 'kwargs' in sig_part: + extra_args = '\n **kwargs' + else: + extra_args = '' + ret = 'self' if name == '__iter__' else 'response.json()["result"]' + + code = f''' + def {name}(self{sig_part}): + response = http_post( + url=f'{{self.server_url}}/processors/call', + json_data={{ + 'processor_id': self.processor_id, + 'function': '{name}', + **{kwargs_dict},{extra_args} + }} + ) + response.raise_for_status() + return {ret} + ''' + if name == '__iter__': + code += ''' + def __next__(self): + response = http_post( + url=f'{self.server_url}/processors/call', + json_data={ + 'processor_id': self.processor_id, + 'function': '__next__', + } + ) + response.raise_for_status() + return response.json()["result"] + ''' + return code + + import_lines, inheritance = build_imports() + + # Build __init__ method with actual signature + if init_signature: + # Extract parameter names from signature (excluding **kwargs) + param_names = parse_params_from_signature(init_signature) + init_params = f'self, {init_signature}' if init_signature else 'self' + + # Check if signature has **kwargs + has_kwargs = '**' in init_signature + + # Extract the **kwargs name if present + kwargs_name = None + if has_kwargs: + # Find the **kwargs parameter name + for part in init_signature.split(','): + part = part.strip() + if part.startswith('**'): + # Extract name after **, before : or end + kwargs_name = part[2:].split(':')[0].strip() + break + + # Build kwargs dict for HTTP request + if param_names: + kwargs_items = ', '.join([f"'{p}': {p}" for p in param_names]) + if has_kwargs and kwargs_name: + # Include both named params and **kwargs + kwargs_dict = f'{{{kwargs_items}}}, **{kwargs_name}' + else: + kwargs_dict = 
f'{{{kwargs_items}}}' + else: + if has_kwargs and kwargs_name: + kwargs_dict = kwargs_name + else: + kwargs_dict = '{}' + else: + # Fallback to **kwargs if no __init__ found + init_params = 'self, **kwargs' + kwargs_dict = 'kwargs' + + class_template = f'''{AUTO_GEN_WARNING} +{chr(10).join(import_lines)} +class {class_name}({inheritance}): + """Client wrapper for {class_name} that calls server HTTP endpoints.""" + + def __init__({init_params}): + from twinkle_client.http import get_base_url + self.server_url = get_base_url() + + response = http_post( + url=f'{{self.server_url}}/processors/create', + json_data={{ + 'processor_type': '{processor_type}', + 'class_type': '{class_name}', + **{kwargs_dict} + }} + ) + response.raise_for_status() + self.processor_id = response.json()['processor_id'] + heartbeat_manager.register_processor(self.processor_id) + + def __del__(self): + try: + heartbeat_manager.unregister_processor(self.processor_id) + except: + pass + + ''' + + method_codes = [build_method(name, sig) for name, sig in methods] + + return class_template + '\n'.join(method_codes) + + def scan_modules(src_twinkle_path: Path, module_mapping: Dict[str, str]) -> Dict: + """Scan all modules for classes with @remote_function methods.""" + print('Scanning src/twinkle modules for classes with @remote_function methods...') + + module_files = {} + for module_name, module_dir in module_mapping.items(): + module_path = src_twinkle_path / module_dir + if not module_path.exists(): + continue + + print(f' Scanning {module_name}...') + for py_file in module_path.glob('*.py'): + if py_file.name.startswith('_'): + continue + + if classes := find_classes_with_remote_methods(py_file): + module_files.setdefault(module_name, {}).setdefault(py_file.stem, []).extend(classes) + + return module_files + + def write_client_files(module_files: Dict, src_client_path: Path, processor_type_mapping: Dict[str, str]) -> None: + """Generate and write client files.""" + print('\nGenerating client 
classes...') + + for module_name, source_files in module_files.items(): + client_module_path = src_client_path / module_name + client_module_path.mkdir(parents=True, exist_ok=True) + + processor_type = processor_type_mapping.get(module_name, module_name) + has_base_file = 'base' in source_files + + for source_filename, classes in source_files.items(): + client_file = client_module_path / f'{source_filename}.py' + print(f' Writing {client_file}...') + + code = '\n\n'.join( + generate_client_class(class_name, base_class_name, methods, module_name, processor_type, + source_filename, has_base_file, init_signature) + for class_name, base_class_name, methods, init_signature in classes) + client_file.write_text(code, encoding='utf-8') + + def write_init_files(module_files: Dict, src_client_path: Path) -> None: + """Generate __init__.py files for each module.""" + print('\nGenerating __init__.py files...') + + for module_name, source_files in module_files.items(): + init_file = src_client_path / module_name / '__init__.py' + print(f' Writing {init_file}...') + + init_lines = [ + f'from .{source_filename} import {class_name}' + for source_filename, classes in sorted(source_files.items()) for class_name, _, _, _ in classes + ] + init_content = AUTO_GEN_WARNING + '\n'.join(sorted(init_lines)) + '\n' + init_file.write_text(init_content, encoding='utf-8') + + module_files = scan_modules(src_twinkle_path, module_mapping) + write_client_files(module_files, src_client_path, processor_type_mapping) + write_init_files(module_files, src_client_path) + print('\nProcessor client generation complete!') + return module_files + + +def generate_models(): + """Generate client wrapper for Model management.""" + from pathlib import Path + + project_root = Path(__file__).parent.parent + src_client_path = project_root / 'src' / 'twinkle_client' + client_module_path = src_client_path / 'model' + client_module_path.mkdir(parents=True, exist_ok=True) + + model_code = AUTO_GEN_WARNING + '''from 
typing import Any, Optional, Union, Type, Dict, Literal, List +import uuid +from twinkle_client.http import http_post, heartbeat_manager +from twinkle import DeviceMesh +from twinkle.data_format import InputFeature, Trajectory + + +class MultiLoraTransformersModel: + """Client wrapper for TwinkleModel that calls server HTTP endpoints. + + This client manages adapters and sends training/inference requests to the model server. + Each adapter has its own lifecycle managed through automatic heartbeats. + """ + + def __init__(self, model_id: str, **kwargs): + """Initialize model client.""" + from twinkle_client.http import get_base_url + self.server_url = get_base_url() + + self.model_id = model_id + if '://' in model_id: + model_id = model_id.split('://')[1] + self.server_url = f'{self.server_url}/models/{model_id}' + self.adapter_name = None + response = http_post( + url=f'{self.server_url}/create', + ) + response.raise_for_status() + + def _send_adapter_heartbeat(self): + """Internal method to send adapter heartbeat.""" + response = http_post( + url=f'{self.server_url}/heartbeat', + json_data={'adapter_name': self.adapter_name} + ) + response.raise_for_status() + + def add_adapter_to_model(self, adapter_name: str, config: Dict[str, Any], **kwargs): + """Add a new adapter to the model and start automatic heartbeat.""" + response = http_post( + url=f'{self.server_url}/add_adapter_to_model', + json_data={'adapter_name': adapter_name, 'config': config, **kwargs} + ) + response.raise_for_status() + + # Register adapter for automatic heartbeat after successful creation + self.adapter_name = adapter_name + heartbeat_manager.register_adapter( + self.adapter_name, + self._send_adapter_heartbeat + ) + + def __del__(self): + """Cleanup: unregister adapter from heartbeat manager.""" + try: + heartbeat_manager.unregister_adapter(self.adapter_name) + except: + pass + + def forward(self, inputs: Any, **kwargs): + """Execute forward pass on the model.""" + response = http_post( + 
url=f'{self.server_url}/forward', + json_data={'inputs': inputs, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def forward_only(self, inputs: Any, **kwargs): + """Execute forward pass without gradient computation.""" + response = http_post( + url=f'{self.server_url}/forward_only', + json_data={'inputs': inputs, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def calculate_loss(self, **kwargs): + """Calculate loss from model outputs.""" + response = http_post( + url=f'{self.server_url}/calculate_loss', + json_data={'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def get_train_configs(self, **kwargs): + """Get training configs""" + response = http_post( + url=f'{self.server_url}/get_train_configs', + json_data={'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def backward(self, **kwargs): + """Execute backward pass.""" + response = http_post( + url=f'{self.server_url}/backward', + json_data={'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def forward_backward(self, inputs: Any, **kwargs): + """Execute combined forward and backward pass.""" + response = http_post( + url=f'{self.server_url}/forward_backward', + json_data={'inputs': inputs, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def step(self, **kwargs): + """Execute optimizer step.""" + response = http_post( + url=f'{self.server_url}/step', + json_data={'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def zero_grad(self, **kwargs): + """Zero out gradients.""" + response = http_post( + url=f'{self.server_url}/zero_grad', + 
json_data={'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def lr_step(self, **kwargs): + """Execute learning rate scheduler step.""" + response = http_post( + url=f'{self.server_url}/lr_step', + json_data={'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def set_loss(self, loss_cls: str, **kwargs): + """Set the loss function.""" + response = http_post( + url=f'{self.server_url}/set_loss', + json_data={'loss_cls': loss_cls, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def clip_grad_norm(self, max_grad_norm: float=1.0, norm_type=2, **kwargs): + """Set the loss function.""" + response = http_post( + url=f'{self.server_url}/clip_grad_norm', + json_data={'max_grad_norm': max_grad_norm, 'norm_type': norm_type, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def set_optimizer(self, optimizer_cls: str, **kwargs): + """Set the optimizer.""" + response = http_post( + url=f'{self.server_url}/set_optimizer', + json_data={'optimizer_cls': optimizer_cls, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def set_lr_scheduler(self, scheduler_cls: str, **kwargs): + """Set the learning rate scheduler.""" + response = http_post( + url=f'{self.server_url}/set_lr_scheduler', + json_data={'scheduler_cls': scheduler_cls, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def save(self, name: str, **kwargs): + """Save model checkpoint.""" + response = http_post( + url=f'{self.server_url}/save', + json_data={'name': name, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def load(self, name: str, **kwargs): + 
"""Load model checkpoint.""" + response = http_post( + url=f'{self.server_url}/load', + json_data={'name': name, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def set_template(self, template_cls: str, **kwargs): + """Set the template for data processing.""" + response = http_post( + url=f'{self.server_url}/set_template', + json_data={'template_cls': template_cls, 'adapter_name': self.adapter_name, 'model_id': self.model_id, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def set_processor(self, processor_cls: str, **kwargs): + """Set the input processor.""" + response = http_post( + url=f'{self.server_url}/set_processor', + json_data={'processor_cls': processor_cls, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def calculate_metric(self, is_training: bool = True, **kwargs): + """Calculate metrics from model outputs.""" + response = http_post( + url=f'{self.server_url}/calculate_metric', + json_data={'is_training': is_training, 'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def get_state_dict(self, **kwargs): + """Get model state dictionary.""" + response = http_post( + url=f'{self.server_url}/get_state_dict', + json_data={'adapter_name': self.adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json()['result'] + + def upload_to_hub(self, checkpoint_dir: str, hub_model_id: str, hub_token: Optional[str] = None, async_upload: bool = True): + """Upload model checkpoint to hub. + + Args: + checkpoint_dir: The directory path of the checkpoint to upload. + hub_model_id: The hub model id. + hub_token: The hub token (optional). + async_upload: Whether to use async upload (default: True). 
+ """ + response = http_post( + url=f'{self.server_url}/upload_to_hub', + json_data={ + 'checkpoint_dir': checkpoint_dir, + 'hub_model_id': hub_model_id, + 'hub_token': hub_token, + 'async_upload': async_upload + } + ) + response.raise_for_status() + return response.json() +''' + + # Write the model client file + client_file = client_module_path / 'multi_lora_transformers.py' + print(f'Generating {client_file}...') + with open(client_file, 'w', encoding='utf-8') as f: + f.write(model_code) + + # Create/overwrite __init__.py + init_file = client_module_path / '__init__.py' + init_content = AUTO_GEN_WARNING + 'from .multi_lora_transformers import MultiLoraTransformersModel\n' + print(f'Writing {init_file}...') + with open(init_file, 'w', encoding='utf-8') as f: + f.write(init_content) + + print('Model client generation complete!') + + +def generate_samplers(): + """Generate client wrapper for Sampler management.""" + from pathlib import Path + + project_root = Path(__file__).parent.parent + src_client_path = project_root / 'src' / 'twinkle_client' + client_module_path = src_client_path / 'sampler' + client_module_path.mkdir(parents=True, exist_ok=True) + + sampler_code = AUTO_GEN_WARNING + '''from typing import Any, Optional, List, Dict, Union +from twinkle_client.http import http_post, heartbeat_manager +from twinkle.sampler.base import Sampler +from peft import PeftConfig +from twinkle.data_format import Trajectory, InputFeature + + +class vLLMSampler(Sampler): + """Client wrapper for Sampler that calls server HTTP endpoints. + + This client manages sampling operations and adapter synchronization with the sampler server. + Each adapter has its own lifecycle managed through automatic heartbeats. 
+ """ + + def __init__(self, model_id: str, **kwargs): + """Create the sampler instance on server.""" + from twinkle_client.http import get_base_url + self.server_url = get_base_url() + + self.adapter_name = None + if '://' in model_id: + model_id = model_id.split('://')[1] + self.server_url = f'{self.server_url}/samplers/{model_id}' + response = http_post( + url=f'{self.server_url}/create', + json_data=kwargs + ) + response.raise_for_status() + + def _send_adapter_heartbeat(self): + """Internal method to send adapter heartbeat.""" + if not self.adapter_name: + return + response = http_post( + url=f'{self.server_url}/heartbeat', + json_data={'adapter_name': self.adapter_name} + ) + response.raise_for_status() + + def add_adapter_to_sampler(self, adapter_name: str, config: PeftConfig, **kwargs): + """Add a new adapter to the sampler and start automatic heartbeat.""" + if isinstance(config, PeftConfig): + config = config.__dict__ + response = http_post( + url=f'{self.server_url}/add_adapter_to_sampler', + json_data={'adapter_name': adapter_name, 'config': config, **kwargs} + ) + response.raise_for_status() + + # Register adapter for automatic heartbeat after successful creation + self.adapter_name = adapter_name + heartbeat_manager.register_adapter( + self.adapter_name, + self._send_adapter_heartbeat + ) + + return response.json() + + def __del__(self): + """Cleanup: unregister adapter from heartbeat manager.""" + try: + if self.adapter_name: + heartbeat_manager.unregister_adapter(self.adapter_name) + except: + pass + + def sample( + self, + inputs: Union[List[Trajectory], List[InputFeature]], + sampling_params: Optional[Dict[str, Any]] = None, + adapter_name: str = '', + adapter_uri: Optional[str] = None, + num_samples: int = 1, + ) -> Dict[str, Any]: + """Sample from the model. + + Args: + inputs: List of Trajectory or InputFeature to sample from. + sampling_params: Sampling parameters dict. + adapter_name: Adapter name for LoRA inference. 
+ adapter_uri: Adapter URI (twinkle:// path or local path) for LoRA inference. + num_samples: Number of completions to generate per prompt. + + Returns: + Dict with 'sequences' list, each containing tokens, logprobs, stop_reason. + """ + json_data = { + 'inputs': inputs, + 'sampling_params': sampling_params, + 'adapter_name': adapter_name, + 'num_samples': num_samples, + } + if adapter_uri is not None: + json_data['adapter_uri'] = adapter_uri + + response = http_post( + url=f'{self.server_url}/sample', + json_data=json_data + ) + response.raise_for_status() + return response.json() + + def set_template(self, template_cls: str, adapter_name: str = '', **kwargs): + """Set the template for encoding trajectories.""" + response = http_post( + url=f'{self.server_url}/set_template', + json_data={'template_cls': template_cls, 'adapter_name': adapter_name, **kwargs} + ) + response.raise_for_status() + return response.json() +''' + + # Write the sampler client file + client_file = client_module_path / 'vllm_sampler.py' + print(f'Generating {client_file}...') + with open(client_file, 'w', encoding='utf-8') as f: + f.write(sampler_code) + + # Create/overwrite __init__.py + init_file = client_module_path / '__init__.py' + init_content = AUTO_GEN_WARNING + 'from .vllm_sampler import vLLMSampler\n' + print(f'Writing {init_file}...') + with open(init_file, 'w', encoding='utf-8') as f: + f.write(init_content) + + print('Sampler client generation complete!') + + +if __name__ == '__main__': + print('Starting client code generation...\n') + print('=' * 60) + + # Generate processor-based clients + print('\n[1/3] Generating processor-based clients...') + generate_processors() + + # Generate model client + print('\n' + '=' * 60) + print('\n[2/3] Generating model client...') + generate_models() + + # Generate sampler client + print('\n' + '=' * 60) + print('\n[3/3] Generating sampler client...') + generate_samplers() + + print('\n' + '=' * 60) + print('\n✓ All client code generation 
complete!\n') diff --git a/cookbook/client/tinker/lora.py b/cookbook/client/tinker/lora.py new file mode 100644 index 00000000..2714e0af --- /dev/null +++ b/cookbook/client/tinker/lora.py @@ -0,0 +1,181 @@ +# Tinker-Compatible Client - Transformers LoRA Training Example +# +# This script demonstrates end-to-end LoRA fine-tuning using the Tinker- +# compatible client API (an alternative client protocol for the Twinkle server). +# It covers: connecting to the server, preparing data manually with tokenizers, +# running a training loop, saving checkpoints, and publishing to ModelScope. +# The server must be running first (see server.py and server_config.yaml). + +# Step 1: Load environment variables from a .env file (e.g., API tokens) +import dotenv + +dotenv.load_dotenv('.env') + +import os + +from twinkle_client import init_tinker_compat_client + +# Step 2: Initialize the Tinker-compatible client to communicate with the server. +# - base_url: the address of the running server +# - api_key: authentication token (loaded from environment variable) +service_client = init_tinker_compat_client( + base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_TOKEN')) + +# Step 3: List models available on the server to verify the connection +print('Available models:') +for item in service_client.get_server_capabilities().supported_models: + print('- ' + item.model_name) + +# Step 4: Create a REST client for querying training runs and checkpoints. +# This is useful for inspecting previous training sessions or resuming training. +rest_client = service_client.create_rest_client() + +future = rest_client.list_training_runs(limit=50) +response = future.result() + +# You can resume from either: +# 1. A twinkle path: "twinkle://...//weights/" +# 2. 
A model id on hub: "/" +# Example: +# resume_path = "twinkle://20260131_170251-Qwen_Qwen2_5-0_5B-Instruct-7275126c/weights/pig-latin-lora-epoch-1" +# resume_path = "AlexEz/20260205_163645-Qwen_Qwen2_5-7B-Instruct-385d5c17_pig-latin-lora-epoch-1" +resume_path = '' + +print(f'Found {len(response.training_runs)} training runs') +for tr in response.training_runs: + print(tr.model_dump_json(indent=2)) + + chpts = rest_client.list_checkpoints(tr.training_run_id).result() + for chpt in chpts.checkpoints: + print(' ' + chpt.model_dump_json(indent=2)) + # Uncomment the line below to resume from the last checkpoint: + # resume_path = chpt.tinker_path + +# Step 5: Create or resume a training client. +# If resume_path is set, it restores both model weights and optimizer state. +base_model = 'Qwen/Qwen2.5-7B-Instruct' +if not resume_path: + training_client = service_client.create_lora_training_client(base_model=base_model) +else: + print('Resuming from ' + resume_path) + training_client = service_client.create_training_client_from_state_with_optimizer(path=resume_path) + +# Step 6: Prepare training data manually +# +# This example teaches the model to translate English into Pig Latin. +# Each example has an "input" (English phrase) and "output" (Pig Latin). 
# Hand-written training pairs: English phrase -> Pig Latin translation.
examples = [
    {'input': 'banana split', 'output': 'anana-bay plit-say'},
    {'input': 'quantum physics', 'output': 'uantum-qay ysics-phay'},
    {'input': 'donut shop', 'output': 'onut-day op-shay'},
    {'input': 'pickle jar', 'output': 'ickle-pay ar-jay'},
    {'input': 'space exploration', 'output': 'ace-spay exploration-way'},
    {'input': 'rubber duck', 'output': 'ubber-ray uck-day'},
    {'input': 'coding wizard', 'output': 'oding-cay izard-way'},
]

from modelscope import AutoTokenizer
from tinker import types

# Load the tokenizer locally (avoids a network call to HuggingFace)
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)


def process_example(example: dict, tokenizer) -> types.Datum:
    """Convert a raw example dict into a Datum suitable for the training API.

    The Datum carries:
      - model_input: the token IDs fed into the LLM
      - loss_fn_inputs: next-token targets plus per-token weights
        (0 = ignored by the loss, 1 = trained on)
    """
    # Prompt half: weight 0 so the loss ignores these positions.
    prompt_ids = tokenizer.encode(f"English: {example['input']}\nPig Latin:", add_special_tokens=True)
    # Completion half: weight 1 so the model is trained to produce it.
    completion_ids = tokenizer.encode(f" {example['output']}\n\n", add_special_tokens=False)

    token_ids = prompt_ids + completion_ids
    loss_weights = [0] * len(prompt_ids) + [1] * len(completion_ids)

    # Next-token prediction: feed tokens[:-1], predict tokens[1:].
    # The weight list is shifted in lockstep with the targets.
    return types.Datum(
        model_input=types.ModelInput.from_ints(tokens=token_ids[:-1]),
        loss_fn_inputs=dict(weights=loss_weights[1:], target_tokens=token_ids[1:]))


# 
Process all examples into Datum objects +processed_examples = [process_example(ex, tokenizer) for ex in examples] + +# Visualize the first example to verify tokenization and weight alignment +datum0 = processed_examples[0] +print(f"{'Input':<20} {'Target':<20} {'Weight':<10}") +print('-' * 50) +for i, (inp, tgt, wgt) in enumerate( + zip(datum0.model_input.to_ints(), datum0.loss_fn_inputs['target_tokens'].tolist(), + datum0.loss_fn_inputs['weights'].tolist())): + print(f'{repr(tokenizer.decode([inp])):<20} {repr(tokenizer.decode([tgt])):<20} {wgt:<10}') + +# Step 7: Run the training loop +# +# For each epoch, iterate over multiple batches: +# - forward_backward: sends data to the server, computes loss & gradients +# - optim_step: updates model weights using Adam optimizer +import numpy as np + +for epoch in range(2): + for batch in range(5): + # Send training data and get back logprobs (asynchronous futures) + fwdbwd_future = training_client.forward_backward(processed_examples, 'cross_entropy') + optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4)) + + # Wait for results from the server + fwdbwd_result = fwdbwd_future.result() + optim_result = optim_future.result() + + # Compute the weighted average log-loss per token for monitoring + print(f'Epoch {epoch}, Batch {batch}: ', end='') + logprobs = np.concatenate([output['logprobs'].tolist() for output in fwdbwd_result.loss_fn_outputs]) + weights = np.concatenate([example.loss_fn_inputs['weights'].tolist() for example in processed_examples]) + print(f'Loss per token: {-np.dot(logprobs, weights) / weights.sum():.4f}') + + # Save checkpoint (model weights + optimizer state) after each epoch + save_future = training_client.save_state(f'pig-latin-lora-epoch-{epoch}') + save_result = save_future.result() + print(f'Saved checkpoint for epoch {epoch} to {save_result.path}') + +# Step 8: Publish the final checkpoint to ModelScope Hub. 
+# NOTE: Requires a valid ModelScope token set as api_key when initializing the client. +# The published model name will be: {run_id}_{checkpoint_name} +rest_client.publish_checkpoint_from_tinker_path(save_result.path).result() +print('Published checkpoint') diff --git a/cookbook/client/tinker/megatron/server.py b/cookbook/client/tinker/megatron/server.py new file mode 100644 index 00000000..e38f43a4 --- /dev/null +++ b/cookbook/client/tinker/megatron/server.py @@ -0,0 +1,21 @@ +# Twinkle Server Launcher - Tinker-Compatible Megatron Backend +# +# This script starts the Twinkle server with Tinker-compatible API support +# using the Megatron model backend. +# It reads the server_config.yaml in the same directory for all +# configuration (model, deployment settings, etc.). +# Run this script BEFORE running the client training script (lora.py). + +import os + +# Enable Ray debug mode for verbose logging during development +os.environ['TWINKLE_TRUST_REMOTE_CODE'] = '1' + +from twinkle.server import launch_server + +# Resolve the path to server_config.yaml relative to this script's location +file_dir = os.path.abspath(os.path.dirname(__file__)) +config_path = os.path.join(file_dir, 'server_config.yaml') + +# Launch the Twinkle server — this call blocks until the server is shut down +launch_server(config_path=config_path) diff --git a/cookbook/client/tinker/megatron/server_config.yaml b/cookbook/client/tinker/megatron/server_config.yaml new file mode 100644 index 00000000..fe9ea0d6 --- /dev/null +++ b/cookbook/client/tinker/megatron/server_config.yaml @@ -0,0 +1,114 @@ +# Twinkle Server Configuration - Tinker-Compatible Transformers Backend + +# Server protocol type: "tinker" enables the Tinker-compatible API +server_type: tinker + +# proxy_location: determines where the HTTP proxy runs. +# "EveryNode" means each Ray node runs its own proxy (good for multi-node). 
+proxy_location: EveryNode + +# HTTP listener settings +http_options: + host: 0.0.0.0 # Listen on all network interfaces + port: 9000 # Port number for the server + +# Applications: each entry defines a service component deployed on the server +applications: + + # 1. TinkerCompatServer - The central API server + # Handles client connections, training run tracking, checkpoint listing. + - name: server + route_prefix: /api/v1 # API endpoint prefix (Tinker-compatible) + import_path: server # Python module to import + args: + + deployments: + - name: TinkerCompatServer + autoscaling_config: + min_replicas: 1 # Minimum number of replicas + max_replicas: 1 # Maximum number of replicas + target_ongoing_requests: 128 # Target concurrent requests per replica + ray_actor_options: + num_cpus: 0.1 # CPU resources allocated to this actor + runtime_env: + env_vars: + TWINKLE_TRUST_REMOTE_CODE: "0" + DEVICE_COUNT_PER_PHYSICAL_NODE: "8" + + # 3. Sampler Service - Runs inference / sampling using vLLM engine + # Used for generating text from the model (e.g., evaluating LoRA results). 
+ - name: sampler-Qwen3-30B-A3B-Instruct-2507 + route_prefix: /api/v1/sampler/Qwen/Qwen3-30B-A3B-Instruct-2507 + import_path: sampler + args: + model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier + nproc_per_node: 4 # Number of GPU processes per node + sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler) + engine_args: # vLLM engine-specific settings + max_model_len: 16000 # Maximum sequence length the engine supports + gpu_memory_utilization: 0.85 # Fraction of GPU memory to use (0.0-1.0) + enable_lora: true # Allow loading LoRA adapters during inference + max_loras: 5 # Max allowed loras working on vLLM at the same time + device_group: # Logical device group for the sampler + name: sampler + gpus_per_worker: 1 + ranks: [0,1,2,3] # GPU rank indices to use + device_type: cuda + device_mesh: + device_type: cuda + dp_size: 4 + queue_config: + rps_limit: 20 # Max requests per second + tps_limit: 16000 # Max tokens per second + deployments: + - name: SamplerManagement + autoscaling_config: + min_replicas: 1 + max_replicas: 1 + target_ongoing_requests: 16 + ray_actor_options: + num_cpus: 0.1 + runtime_env: + env_vars: + TWINKLE_TRUST_REMOTE_CODE: "0" + DEVICE_COUNT_PER_PHYSICAL_NODE: "8" + + # 2. Model Service (commented out) - Would host the base model for training. + # Uncomment and configure if you need a training model worker. 
+ - name: models-Qwen3-30B-A3B-Instruct-2507 + route_prefix: /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507 + import_path: model + args: + use_megatron: true # Use HuggingFace Transformers backend + model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier + max_length: 16000 # model max length + max_loras: 5 # model max loras + nproc_per_node: 4 # Number of GPU processes per node + device_group: + name: model + ranks: [4,5,6,7] # GPU rank indices + device_type: cuda + device_mesh: + device_type: cuda + dp_size: 4 + ep_size: 2 + + queue_config: + rps_limit: 20 # Max requests per second + tps_limit: 16000 # Max tokens per second + adapter_config: + per_token_adapter_limit: 3 # Max concurrent LoRA adapters + adapter_timeout: 30 # Seconds before idle adapter unload + adapter_max_lifetime: 36000 # Maximum lifetime of an adapter in seconds (e.g., 10 hours) + deployments: + - name: ModelManagement + autoscaling_config: + min_replicas: 1 + max_replicas: 1 + target_ongoing_requests: 8 + ray_actor_options: + num_cpus: 0.1 + runtime_env: + env_vars: + TWINKLE_TRUST_REMOTE_CODE: "0" + DEVICE_COUNT_PER_PHYSICAL_NODE: "8" diff --git a/cookbook/client/tinker/megatron/server_config_7b.yaml b/cookbook/client/tinker/megatron/server_config_7b.yaml new file mode 100644 index 00000000..cad014c9 --- /dev/null +++ b/cookbook/client/tinker/megatron/server_config_7b.yaml @@ -0,0 +1,107 @@ +# Twinkle Server Configuration - Tinker-Compatible Transformers Backend + +# Server protocol type: "tinker" enables the Tinker-compatible API +server_type: tinker + +# proxy_location: determines where the HTTP proxy runs. +# "EveryNode" means each Ray node runs its own proxy (good for multi-node). +proxy_location: EveryNode + +# HTTP listener settings +http_options: + host: 0.0.0.0 # Listen on all network interfaces + port: 8000 # Port number for the server + +# Applications: each entry defines a service component deployed on the server +applications: + + # 1. 
TinkerCompatServer - The central API server + # Handles client connections, training run tracking, checkpoint listing. + - name: server + route_prefix: /api/v1 # API endpoint prefix (Tinker-compatible) + import_path: server # Python module to import + args: + + deployments: + - name: TinkerCompatServer + autoscaling_config: + min_replicas: 1 # Minimum number of replicas + max_replicas: 1 # Maximum number of replicas + target_ongoing_requests: 128 # Target concurrent requests per replica + ray_actor_options: + num_cpus: 0.1 # CPU resources allocated to this actor + + # 2. Model Service (commented out) - Would host the base model for training. + # Uncomment and configure if you need a training model worker. + - name: models-Qwen2.5-7B-Instruct + route_prefix: /api/v1/model/Qwen/Qwen2.5-7B-Instruct + import_path: model + args: + use_megatron: true + model_id: "ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier + max_length: 10240 + nproc_per_node: 2 # Number of GPU processes per node + device_group: + name: model + ranks: [0,1] # GPU rank indices + device_type: cuda + device_mesh: + device_type: cuda + dp_size: 2 + queue_config: + rps_limit: 100 # Max requests per second + tps_limit: 10000 # Max tokens per second for a single user + max_input_tokens: 10000 # Maximum input tokens per request + adapter_config: + adapter_timeout: 30 # Seconds before idle adapter unload + adapter_max_lifetime: 36000 # Maximum lifetime of an adapter in seconds (e.g., 10 hours) + per_token_adapter_limit: 30 + deployments: + - name: ModelManagement + autoscaling_config: + min_replicas: 1 + max_replicas: 1 + target_ongoing_requests: 16 + ray_actor_options: + num_cpus: 0.1 + runtime_env: + env_vars: + TWINKLE_TRUST_REMOTE_CODE: "0" + DEVICE_COUNT_PER_PHYSICAL_NODE: "8" + + # 3. Sampler Service - Runs inference / sampling using vLLM engine + # Used for generating text from the model (e.g., evaluating LoRA results). 
+ - name: sampler-Qwen2.5-7B-Instruct + route_prefix: /api/v1/sampler/Qwen/Qwen2.5-7B-Instruct + import_path: sampler + args: + model_id: "ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier + nproc_per_node: 2 # Number of GPU processes per node + sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler) + engine_args: # vLLM engine-specific settings + max_model_len: 4096 # Maximum sequence length the engine supports + gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0) + enable_lora: true # Allow loading LoRA adapters during inference + logprobs_mode: processed_logprobs # Logprobs mode for sampling results + device_group: # Logical device group for the sampler + name: sampler + ranks: [2] # GPU rank indices to use + device_type: cuda + device_mesh: + device_type: cuda + dp_size: 1 + queue_config: + rps_limit: 100 # Max requests per second + tps_limit: 100000 # Max tokens per second + deployments: + - name: SamplerManagement + autoscaling_config: + min_replicas: 1 + max_replicas: 1 + target_ongoing_requests: 16 + ray_actor_options: + num_cpus: 0.1 + runtime_env: + env_vars: + TWINKLE_TRUST_REMOTE_CODE: "0" + DEVICE_COUNT_PER_PHYSICAL_NODE: "8" diff --git a/cookbook/client/tinker/sample.py b/cookbook/client/tinker/sample.py new file mode 100644 index 00000000..eacd043b --- /dev/null +++ b/cookbook/client/tinker/sample.py @@ -0,0 +1,60 @@ +# Tinker-Compatible Client - Sampling / Inference Example +# +# This script demonstrates how to use a previously trained LoRA checkpoint +# for text generation (sampling) via the Tinker-compatible client API. +# The server must be running first (see server.py and server_config.yaml). 
import os

from tinker import types

from twinkle.data_format import Message, Trajectory
from twinkle.template import Template
from twinkle_client import init_tinker_compat_client

# Step 1: Define the base model and connect to the server.
# MODELSCOPE_TOKEN must be set in the environment for authentication.
base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507'
service_client = init_tinker_compat_client(
    base_url='http://www.modelscope.cn/twinkle',
    api_key=os.environ.get('MODELSCOPE_TOKEN')
)
# Step 2: Create a sampling client by loading weights from a saved checkpoint.
# The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint.
# The server will load the base model and apply the LoRA adapter weights.
# NOTE: the returned client must be kept — it is used below for sampling.
sampling_client = service_client.create_sampling_client(
    model_path='twinkle://xxx-Qwen_Qwen3-30B-A3B-Instruct-2507-xxx/weights/twinkle-lora-1',
    base_model=base_model
)

# Step 3: Build the chat template locally to encode the prompt
# (and later decode the sampled token IDs back to text).
print(f'Using model {base_model}')

template = Template(model_id=f'ms://{base_model}')

trajectory = Trajectory(
    messages=[
        Message(role='system', content='You are a helpful assistant'),
        Message(role='user', content='你是谁?'),
    ]
)

input_feature = template.encode(trajectory, add_generation_prompt=True)

input_ids = input_feature['input_ids'].tolist()

# Step 4: Prepare the prompt and sampling parameters
prompt = types.ModelInput.from_ints(input_ids)
params = types.SamplingParams(
    max_tokens=128,  # Maximum number of tokens to generate
    temperature=0.7,
    stop=['\n']  # Stop generation when a newline character is produced
)

# Step 5: Send the sampling request to the server.
# num_samples controls how many independent completions are generated
# for the same prompt (1 here).
+print('Sampling...') +future = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=1) +result = future.result() + +# Step 6: Decode and print the generated responses +print('Responses:') +for i, seq in enumerate(result.sequences): + print(f'{i}: {repr(template.decode(seq.tokens))}') diff --git a/cookbook/client/tinker/self_congnition.py b/cookbook/client/tinker/self_congnition.py new file mode 100644 index 00000000..9f0fba9b --- /dev/null +++ b/cookbook/client/tinker/self_congnition.py @@ -0,0 +1,129 @@ +# Tinker-Compatible Client - Self-Cognition Training & Evaluation Example +# +# This script demonstrates two workflows using the Tinker-compatible client: +# 1. train(): Fine-tune a model on a self-cognition dataset so it learns +# a custom identity (name, author). +# 2. eval(): Load a trained checkpoint and sample from it to verify +# that the model has learned the custom identity. +# The server must be running first (see server.py and server_config.yaml). +import numpy as np +import os +from tqdm import tqdm +from tinker import types +from twinkle_client import init_tinker_compat_client +from twinkle.data_format import Message, Trajectory +from twinkle.template import Template +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.preprocessor import SelfCognitionProcessor +from twinkle.server.tinker.common import input_feature_to_datum + +# The base model to fine-tune / evaluate +base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507' + + +def train(): + # Step 1: Prepare the dataset + + # Load the self-cognition dataset from ModelScope (first 500 examples) + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) + + # Apply the chat template matching the base model (max 256 tokens per sample) + dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + + # Replace placeholder names with custom model/author identity + 
dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False) + + # Tokenize and encode the dataset into model-ready input features + dataset.encode(batched=True, load_from_cache_file=False) + + # Wrap the dataset into a DataLoader that yields batches of size 8 + dataloader = DataLoader(dataset=dataset, batch_size=8) + + # Step 2: Initialize the training client + + # Connect to the Twinkle server running locally + service_client = init_tinker_compat_client( + base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_TOKEN')) + + # Create a LoRA training client for the base model (rank=16 for the LoRA adapter) + training_client = service_client.create_lora_training_client(base_model=base_model, rank=16) + + # Step 3: Run the training loop + + for epoch in range(3): + print(f'Epoch {epoch}') + for step, batch in tqdm(enumerate(dataloader)): + # Convert each InputFeature into a Datum for the Tinker API + input_datum = [input_feature_to_datum(input_feature) for input_feature in batch] + + # Send data to server: forward + backward pass (computes gradients) + fwdbwd_future = training_client.forward_backward(input_datum, 'cross_entropy') + + # Optimizer step: update model weights with Adam + optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4)) + + # Wait for both operations to complete + fwdbwd_result = fwdbwd_future.result() + optim_result = optim_future.result() + + # Compute weighted average log-loss per token for monitoring + logprobs = np.concatenate([output['logprobs'].tolist() for output in fwdbwd_result.loss_fn_outputs]) + weights = np.concatenate([example.loss_fn_inputs['weights'].tolist() for example in input_datum]) + print(f'Loss per token: {-np.dot(logprobs, weights) / weights.sum():.4f}') + + # Save a checkpoint after each epoch + save_future = training_client.save_state(f'twinkle-lora-{epoch}') + save_result = save_future.result() + print(f'Saved checkpoint to {save_result.path}') 
+ + +def eval(): + # Step 1: Load the trained LoRA checkpoint for inference + + # Path to a previously saved LoRA checkpoint (twinkle:// URI) + weight_path = 'twinkle://20260212_174205-Qwen_Qwen2_5-7B-Instruct-51edc9ed/weights/twinkle-lora-2' + + # Connect to the server and create a sampling client with the trained weights + service_client = init_tinker_compat_client(base_url='http://localhost:8000') + sampling_client = service_client.create_sampling_client(model_path=weight_path, base_model=base_model) + + # Step 2: Prepare the chat prompt + + # Build a multi-turn conversation to test the model's self-cognition + template = Template(model_id=f'ms://{base_model}') + + trajectory = Trajectory( + messages=[ + Message(role='system', content='You are a helpful assistant'), + Message(role='user', content='你是谁?'), + ] + ) + + input_feature = template.encode(trajectory, add_generation_prompt=True) + + input_ids = input_feature['input_ids'].tolist() + + # Step 3: Generate responses + + prompt = types.ModelInput.from_ints(input_ids) + params = types.SamplingParams( + max_tokens=50, # Maximum tokens to generate + temperature=0.2, # Low temperature for more focused responses + stop=['\n'] # Stop at newline + ) + + # Sample 8 independent completions + print('Sampling...') + future = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=8) + result = future.result() + + # Decode and print each response + print('Responses:') + for i, seq in enumerate(result.sequences): + print(f'{i}: {repr(template.decode(seq.tokens))}') + + +if __name__ == '__main__': + train() # Uncomment to run training + # eval() # Run evaluation / inference diff --git a/cookbook/client/tinker/short_math_grpo.py b/cookbook/client/tinker/short_math_grpo.py new file mode 100644 index 00000000..d843322b --- /dev/null +++ b/cookbook/client/tinker/short_math_grpo.py @@ -0,0 +1,405 @@ +# Tinker-Compatible Client - Math GRPO Training Example +# +# This script demonstrates Math problem training 
using the +# Tinker-compatible client API with save_weights_for_sampler for weight sync. +# Instead of calling sync_weights directly, it periodically saves weights and +# creates a sampling client for generation. +# +# Flow: +# 1. Prepare Math dataset (client-side) +# 2. Initialize Tinker-compatible training & sampling clients +# 3. Training loop: +# a. Every SYNC_INTERVAL steps: save_weights_for_sampler → sampling_client +# b. Sample completions from the sampling client +# c. Compute rewards and advantages (client-side) +# d. Train on sampled data weighted by advantages +# e. Optimizer step +# +# The server must be running first (see server.py and server_config.yaml). +# Requires both model and sampler services to be configured. +import gc +import numpy as np +import os +import re +from tinker import types +from typing import List, Tuple + +from twinkle_client import init_tinker_compat_client +from twinkle import get_logger +from twinkle.advantage import GRPOAdvantage +from twinkle.data_format import Message, Trajectory +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.preprocessor import Preprocessor +from twinkle.reward.base import Reward +from twinkle.metric import CompletionRewardMetric +from twinkle.template import Template + +logger = get_logger() + +# ========== Configuration ========== +BASE_MODEL = 'Qwen/Qwen3-30B-A3B-Instruct-2507' +NUM_GENERATIONS = 8 +MAX_NEW_TOKENS = 4096 +LEARNING_RATE = 1e-4 +MAX_STEPS = 1000 +BATCH_SIZE = 2 +TEMPERATURE = 1.0 +SYNC_INTERVAL = 1 # Save weights for sampler every N steps +LORA_RANK = 8 +DATA_NUM = 2000 # Number of Math samples to use + +SYSTEM_PROMPT = ('You are a math assistant that values brevity. ' + 'Solve problems with minimal but correct reasoning.\n\n' + 'Rules:\n' + '1. Use tags for reasoning\n' + '2. 
Final answer after ####\n\n' + 'Example:\nKey step1 -> Ket step 2 -> conclusion\n#### 42') + + + +class MathPreprocessor(Preprocessor): + + def __call__(self, sample): + if sample['level'] not in ('Level 4', 'Level 5'): + return Trajectory(messages=[], user_data=[]) + + def get_boxed_answer(text): + match = re.search(r'\\boxed{([^}]*)}', text) + return match.group(1) if match else None + + ground_truth = get_boxed_answer(sample['solution']) + if ground_truth is None: + return Trajectory(messages=[], user_data=[]) + problem = sample['problem'] + return Trajectory( + messages=[ + Message(role='system', content=SYSTEM_PROMPT), + Message(role='user', content=problem), + ], + user_data=[('ground_truth', ground_truth)], + ) + + +# ========== Math Reward Functions ========== +class MathAccuracyReward(Reward): + """Accuracy reward for Math: checks if the model's answer matches ground truth. + + Extracts the last '#### ' from model output and compares with ground truth. + Returns 1.0 for correct, 0.0 for incorrect. 
+ """ + + @staticmethod + def extract_answer(completion: str) -> str: + """Extract the last #### answer from model completion.""" + # Only check last 500 chars for efficiency + text = completion[-500:] if len(completion) > 500 else completion + matches = re.findall(r'####\s*([\-\d,\.\s]+)', text) + if matches: + return matches[-1].replace(',', '').replace(' ', '').strip() + return '' + + def __call__(self, trajectories: List[Trajectory], ground_truths: List[Trajectory]) -> List[float]: + rewards = [] + for trajectory in trajectories: + messages = trajectory.get('messages', []) + # Get model completion (last assistant message) + completion = '' + for msg in reversed(messages): + if msg.get('role') == 'assistant': + completion = msg.get('content', '') + break + + # Get ground truth from user_data + gt = '' + user_data = trajectory.get('user_data', []) + if isinstance(user_data, list): + for item in user_data: + if isinstance(item, (list, tuple)) and len(item) == 2: + if item[0] == 'ground_truth': + gt = str(item[1]) + break + + predicted = self.extract_answer(completion) + + # Numeric comparison + correct = False + if predicted and gt: + try: + correct = abs(float(predicted) - float(gt)) < 1e-5 + except (ValueError, OverflowError): + correct = predicted == gt + + rewards.append(1.0 if correct else 0.0) + return rewards + + +class MathFormatReward(Reward): + """Format reward: checks format and rewards shorter completions. + + Returns higher score for shorter completions (1.0 at length 100 or less). + Returns 0.0 if format is incorrect. 
+ """ + + def __call__(self, trajectories: List[Trajectory], ground_truths: List[Trajectory]) -> List[float]: + rewards = [] + for trajectory in trajectories: + messages = trajectory.get('messages', []) + completion = '' + for msg in reversed(messages): + if msg.get('role') == 'assistant': + completion = msg.get('content', '') + break + + has_think = bool(re.search(r'.*?', completion, re.DOTALL)) + has_answer = bool(re.search(r'####\s*[\-\d,\.]+', completion)) + + if not (has_think and has_answer): + rewards.append(0.0) + else: + length = len(completion) + if length <= 100: + rewards.append(1.0) + else: + reward = max(0.0, 1.0 - (length - 100) / 2000) + rewards.append(reward) + + return rewards + + +def create_math_dataset(): + """Create Math dataset.""" + meta = DatasetMeta( + 'ms://modelscope/competition_math', + subset_name='default', + split='train', + data_slice=range(DATA_NUM), + ) + dataset = Dataset(meta) + dataset.set_template('Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete') + dataset.map(MathPreprocessor()) + dataset.filter(lambda row: bool(row['messages'])) + dataset.encode(add_generation_prompt=True) + return dataset + + +def compute_rewards(trajectories: List[Trajectory], ) -> Tuple[List[float], List[float], List[float]]: + """Compute accuracy and format rewards for Math.""" + accuracy_reward_fn = MathAccuracyReward() + format_reward_fn = MathFormatReward() + + accuracy_rewards = accuracy_reward_fn(trajectories, []) + format_rewards = format_reward_fn(trajectories, []) + total_rewards = [a + f for a, f in zip(accuracy_rewards, format_rewards)] + return total_rewards, format_rewards, accuracy_rewards + + +def main(): + logger.info('Starting Math GRPO training...') + + # Step 1: Prepare dataset and dataloader (client-side) + dataset = create_math_dataset() + dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE) + template = Template(model_id=f'ms://{BASE_MODEL}') + + logger.info('Dataset and template initialized') 
+ + # Step 2: Initialize the Tinker-compatible client + logger.info('Connecting to Tinker server...') + service_client = init_tinker_compat_client( + base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_TOKEN')) + + logger.info('Creating LoRA training client...') + # Create a LoRA training client for GRPO + training_client = service_client.create_lora_training_client( + base_model=BASE_MODEL, + rank=LORA_RANK, + ) + + logger.info('Training client created successfully') + + # Step 3: Setup metrics and advantage function + advantage_fn = GRPOAdvantage() + metrics = CompletionRewardMetric() + + sampling_params = types.SamplingParams( + max_tokens=MAX_NEW_TOKENS, + temperature=TEMPERATURE, + top_p=0.95, + ) + + # The sampling client is created on-demand via save_weights_for_sampler + sampling_client = None + + step = 0 + for batch in dataloader: + if step >= MAX_STEPS: + break + + metrics.reset() + prompts = batch if isinstance(batch, list) else [batch] + + # ========== 1. Save weights for sampler (instead of sync_weights) ========== + if step % SYNC_INTERVAL == 0: + logger.info(f'Step {step}: Saving weights for sampler...') + + sampling_client = (training_client.save_weights_and_get_sampling_client(name=f'Math-step-{step}')) + logger.info(f'Step {step}: Sampling client ready') + + if sampling_client is None: + logger.warning('No sampling client available, skipping step') + step += 1 + continue + + # ========== 2. 
Sample completions ========== + # Convert input features to token prompts for the sampling client + all_sequences = [] + all_user_data = [] + for prompt_feature in prompts: + input_ids = prompt_feature['input_ids'] + if hasattr(input_ids, 'tolist'): + input_ids = input_ids.tolist() + prompt = types.ModelInput.from_ints(input_ids) + future = sampling_client.sample( + prompt=prompt, + sampling_params=sampling_params, + num_samples=NUM_GENERATIONS, + ) + result = future.result() + # Store both sequences and user data + for _ in range(NUM_GENERATIONS): + all_user_data.append(prompt_feature.get('user_data', [])) + all_sequences.extend(result.sequences) + + if not all_sequences: + logger.warning(f'Step {step}: No valid samples, skipping') + step += 1 + continue + + # ========== 3. Build trajectories and collect logprobs ========== + trajectories = [] + old_logps_list = [] + completion_lengths = [] + + for idx, seq in enumerate(all_sequences): + decoded_text = template.decode(seq.tokens, skip_special_tokens=True) + # Use the corresponding user data for this sequence + trajectories.append({ + 'messages': [ + { + 'role': 'system', + 'content': SYSTEM_PROMPT + }, + { + 'role': 'user', + 'content': 'Math problem' + }, # Placeholder + { + 'role': 'assistant', + 'content': decoded_text + } + ], + 'user_data': + all_user_data[idx] + }) + old_logps_list.append([lp for lp in seq.logprobs] if seq.logprobs else []) + completion_lengths.append(len(seq.tokens)) + + # ========== 4. Compute rewards ========== + total_rewards, format_rewards, accuracy_rewards = compute_rewards(trajectories) + metrics.accumulate( + None, + None, + completion_lengths=completion_lengths, + rewards={ + 'total': total_rewards, + 'format': format_rewards, + 'accuracy': accuracy_rewards, + }) + + # ========== 5. 
Compute advantages ========== + advantages = advantage_fn( + total_rewards, + num_generations=NUM_GENERATIONS, + scale='group', + ).tolist() + + frac_zero_std = (1.0 if all(abs(a) < 1e-8 for a in advantages) else 0.0) + if frac_zero_std == 1.0: + logger.info(f'Step {step}: All advantages are zero, skipping training') + step += 1 + continue + + # ========== 6. Train the policies with GRPO loss ========== + # Train the policies with the Advantage-Regularized policy + # gradient (GRPO) loss function. + # + # The GRPO loss function requires: + # 1. logprobs: The log probabilities of the tokens under the current policy + # 2. advantages: The advantage values for each completion + # + # The training data is constructed with: + # - model_input: The full prompt + completion tokens + # - target_tokens: The shifted tokens for next-token prediction + # - logprobs: The log probabilities from the sampling step + # - advantages: The computed advantage values + training_data = [] + for i, seq in enumerate(all_sequences): + # Build a Datum from the completion tokens with logprobs and advantages + prompt_feature = prompts[i // NUM_GENERATIONS] + prompt_ids = prompt_feature['input_ids'] + if hasattr(prompt_ids, 'tolist'): + prompt_ids = prompt_ids.tolist() + + sampled_tokens = list(seq.tokens) + logprobs = seq.logprobs if seq.logprobs else [0.0] * len(sampled_tokens) + advantage = float(advantages[i]) + + ob_len = len(prompt_ids) - 1 + input_tokens = prompt_ids + sampled_tokens[:-1] + target_tokens = [0] * ob_len + sampled_tokens + weights = [0] * ob_len + [1] * len(sampled_tokens) + padded_advantages = [0.0] * ob_len + [advantage] * len(sampled_tokens) + padded_logprobs = [0.0] * ob_len + logprobs + + datum = types.Datum( + model_input=types.ModelInput.from_ints(input_tokens), + loss_fn_inputs={ + 'target_tokens': target_tokens, + 'weights': weights, + 'logprobs': types.TensorData.from_numpy(np.array(padded_logprobs, dtype=np.float32)), + 'advantages': 
types.TensorData.from_numpy(np.array(padded_advantages, dtype=np.float32)), + }, + ) + training_data.append(datum) + + if not training_data: + logger.info(f'Step {step}: No training data constructed, skipping') + step += 1 + continue + + # Forward-backward pass with importance_sampling (GRPO) loss + # The training data already contains logprobs and advantages for the GRPO loss + fwdbwd_result = training_client.forward_backward(training_data, 'importance_sampling').result() + + optim_result = training_client.optim_step(types.AdamParams(learning_rate=LEARNING_RATE)).result() + + gc.collect() + + # ========== 7. Log ========== + log_dict = metrics.calculate() + if optim_result.metrics: + log_dict.update(optim_result.metrics) + log_dict['train/frac_reward_zero_std'] = frac_zero_std + log_dict['train/num_training_samples'] = len(training_data) + logger.info(f'Step {step}: {log_dict}') + step += 1 + + # Save final checkpoint + save_future = training_client.save_state('Math-grpo-final') + save_result = save_future.result() + logger.info(f'Saved final checkpoint to {save_result.path}') + + +if __name__ == '__main__': + main() diff --git a/cookbook/client/tinker/transformer/server.py b/cookbook/client/tinker/transformer/server.py new file mode 100644 index 00000000..938877eb --- /dev/null +++ b/cookbook/client/tinker/transformer/server.py @@ -0,0 +1,19 @@ +# Twinkle Server Launcher - Tinker-Compatible Transformers Backend +# +# This script starts the Twinkle server with Tinker-compatible API support. +# It reads the server_config.yaml in the same directory for all +# configuration (model, sampler, deployment settings, etc.). +# Run this script BEFORE running any client scripts (lora.py, sample.py, etc.). 
+
+import os
+
+# Disable remote code execution for downloaded models (security default)
+os.environ['TWINKLE_TRUST_REMOTE_CODE'] = '0'
+
+from twinkle.server import launch_server
+
+# Resolve the path to server_config.yaml relative to this script's location
+file_dir = os.path.abspath(os.path.dirname(__file__))
+config_path = os.path.join(file_dir, 'server_config.yaml')
+
+# Launch the Twinkle server — this call blocks until the server is shut down
+launch_server(config_path=config_path)
diff --git a/cookbook/client/tinker/transformer/server_config.yaml b/cookbook/client/tinker/transformer/server_config.yaml
new file mode 100644
index 00000000..00e57387
--- /dev/null
+++ b/cookbook/client/tinker/transformer/server_config.yaml
@@ -0,0 +1,105 @@
+# Twinkle Server Configuration - Tinker-Compatible Transformers Backend
+
+# Server protocol type: "tinker" enables the Tinker-compatible API
+server_type: tinker
+
+# proxy_location: determines where the HTTP proxy runs.
+# "EveryNode" means each Ray node runs its own proxy (good for multi-node).
+proxy_location: EveryNode
+
+# HTTP listener settings
+http_options:
+  host: 0.0.0.0  # Listen on all network interfaces
+  port: 8000  # Port number for the server
+
+# Applications: each entry defines a service component deployed on the server
+applications:
+
+  # 1. TinkerCompatServer - The central API server
+  #    Handles client connections, training run tracking, checkpoint listing.
+  - name: server
+    route_prefix: /api/v1  # API endpoint prefix (Tinker-compatible)
+    import_path: server  # Python module to import
+    args:
+
+    deployments:
+      - name: TinkerCompatServer
+        autoscaling_config:
+          min_replicas: 1  # Minimum number of replicas
+          max_replicas: 1  # Maximum number of replicas
+          target_ongoing_requests: 128  # Target concurrent requests per replica
+        ray_actor_options:
+          num_cpus: 0.1  # CPU resources allocated to this actor
+
+  # 2. Model Service - Hosts the base model for training (Transformers backend).
+  #    (The old "(commented out)" note was stale: the service below IS configured.)
+  - name: models-Qwen2.5-7B-Instruct
+    route_prefix: /api/v1/model/Qwen/Qwen2.5-7B-Instruct
+    import_path: model
+    args:
+      use_megatron: false  # Use HuggingFace Transformers backend
+      model_id: "ms://Qwen/Qwen2.5-7B-Instruct"  # ModelScope model identifier
+      max_length: 10240  # Maximum sequence length accepted by the model service
+      nproc_per_node: 2  # Number of GPU processes per node
+      device_group:
+        name: model
+        ranks: [0,1]  # GPU rank indices
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 2  # Data-parallel size — matches nproc_per_node above (confirm they must agree)
+      queue_config:
+        rps_limit: 100  # Max requests per second
+        tps_limit: 100000  # Max tokens per second
+      adapter_config:
+        per_token_adapter_limit: 30  # Max concurrent LoRA adapters
+        adapter_timeout: 1800  # Seconds before idle adapter unload
+    deployments:
+      - name: ModelManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 16
+        ray_actor_options:
+          num_cpus: 0.1
+        runtime_env:
+          env_vars:
+            TWINKLE_TRUST_REMOTE_CODE: "0"  # Disable remote code execution in workers
+            DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
+
+  # 3. Sampler Service - Runs inference / sampling using vLLM engine
+  #    Used for generating text from the model (e.g., evaluating LoRA results).
+  - name: sampler-Qwen2.5-7B-Instruct
+    route_prefix: /api/v1/sampler/Qwen/Qwen2.5-7B-Instruct
+    import_path: sampler
+    args:
+      model_id: "ms://Qwen/Qwen2.5-7B-Instruct"  # ModelScope model identifier
+      nproc_per_node: 2  # Number of GPU processes per node
+      sampler_type: vllm  # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
+      engine_args:  # vLLM engine-specific settings
+        max_model_len: 4096  # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.5  # Fraction of GPU memory to use (0.0-1.0)
+        enable_lora: true  # Allow loading LoRA adapters during inference
+        logprobs_mode: processed_logprobs  # Logprobs mode for sampling results
+      device_group:  # Logical device group for the sampler
+        name: sampler
+        ranks: [2]  # GPU rank indices to use (model service uses [0,1] — keep disjoint)
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 1
+      queue_config:
+        rps_limit: 100  # Max requests per second
+        tps_limit: 100000  # Max tokens per second
+    deployments:
+      - name: SamplerManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 16
+        ray_actor_options:
+          num_cpus: 0.1
+        runtime_env:
+          env_vars:
+            TWINKLE_TRUST_REMOTE_CODE: "0"  # Disable remote code execution in workers
+            DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
diff --git a/cookbook/client/twinkle/grpo.py b/cookbook/client/twinkle/grpo.py
new file mode 100644
index 00000000..ee874db6
--- /dev/null
+++ b/cookbook/client/twinkle/grpo.py
@@ -0,0 +1,273 @@
+# Twinkle Client - GRPO (Group Relative Policy Optimization) Training Example
+#
+# This script demonstrates GRPO reinforcement learning training using the
+# Twinkle client API with model.save() + adapter_uri for weight sync.
+# Instead of calling sync_weights directly, it periodically saves model weights
+# and passes the checkpoint path to the sampler as adapter_uri.
+#
+# Flow:
+#   1. Prepare Countdown dataset (client-side)
+#   2. Initialize Twinkle client, model, and sampler
+#   3. Configure model with GRPOLoss, optimizer, LR scheduler
+#   4. Training loop:
+#      a.
Every SYNC_INTERVAL steps: model.save() → get twinkle_path +# b. sampler.sample(inputs, adapter_uri=twinkle_path, num_samples=N) +# c. Compute rewards and advantages (client-side) +# d. model.forward_backward(inputs, advantages, old_logps) +# e. Optimizer step +# +# The server must be running first (see server.py and server_config.yaml). +# Requires both model and sampler services to be configured. + +import dotenv + +dotenv.load_dotenv('.env') +import re + +from twinkle.data_format import Trajectory +from twinkle.reward.base import Reward +import gc +import os +from peft import LoraConfig +from typing import List, Tuple + +from twinkle import get_logger +from twinkle.advantage import GRPOAdvantage +from twinkle.dataset import DatasetMeta +from twinkle.metric import CompletionRewardMetric +from twinkle_client import init_twinkle_client +from twinkle_client.dataloader import DataLoader +from twinkle_client.dataset import Dataset +from twinkle_client.model import MultiLoraTransformersModel +from twinkle_client.sampler import vLLMSampler + +logger = get_logger() + +# ========== Configuration ========== +MODEL_ID = 'ms://Qwen/Qwen2.5-3B-Instruct' +NUM_GENERATIONS = 4 +MAX_NEW_TOKENS = 1024 +LEARNING_RATE = 1e-5 +MAX_STEPS = 10 +BATCH_SIZE = 2 +TEMPERATURE = 1.0 +SYNC_INTERVAL = 1 # Save weights for sampler every N steps +GRADIENT_ACCUMULATION_STEPS = 4 + + +def create_countdown_dataset(): + """Create Countdown Game dataset for GRPO training.""" + + dataset = Dataset(dataset_meta=DatasetMeta('ms://zouxuhong/Countdown-Tasks-3to4', data_slice=range(500))) + dataset.set_template('Template', model_id=MODEL_ID, max_length=8192) + dataset.map('CountdownProcessor') + dataset.encode(add_generation_prompt=True, batched=True) + return dataset + + +class CountDownAccuracy(Reward): + + @staticmethod + def countdown_accuracy_reward(completion: str, target: int, nums: List[int]) -> float: + """Accuracy reward: checks if equation is correct.""" + try: + match = 
re.search(r'(.*?)<\/answer>', completion) + if match is None: + return 0.0 + equation = match.group(1).strip() + if '=' in equation: + equation = equation.split('=')[0] + used_numbers = [int(n) for n in re.findall(r'\d+', equation)] + if sorted(used_numbers) != sorted(nums): + return 0.0 + if not re.match(r'^[\d+\-*/().\s]+$', equation): + return 0.0 + result = eval(equation, {'__builtins__': None}, {}) + return 1.0 if abs(float(result) - float(target)) < 1e-5 else 0.0 + except Exception: # noqa + return 0.0 + + def __call__(self, trajectories: List[Trajectory], ground_truths: List[Trajectory]): + rewards = [] + for trajectory in trajectories: + messages = trajectory.get('messages', []) + completion = '' + for msg in reversed(messages): + if msg.get('role') == 'assistant': + completion = msg.get('content', '') + break + user_data = trajectory.get('user_data', [{}]) + data = user_data[0] if isinstance(user_data, list) and user_data else {} + target = data.get('target', 0) + nums = data.get('nums', []) + acc_reward = self.countdown_accuracy_reward(completion, target, nums) + rewards.append(acc_reward) + return rewards + + +def compute_rewards(trajectories: List[dict], ) -> Tuple[List[float], List[float], List[float]]: + """Compute format and accuracy rewards for Countdown game.""" + from twinkle.reward import FormatReward + format_rewards = FormatReward()(trajectories, []) + accuracy_rewards = CountDownAccuracy()(trajectories, []) + total_rewards = [a + b for a, b in zip(accuracy_rewards, format_rewards)] + return total_rewards, format_rewards, accuracy_rewards + + +def train(): + # Step 1: Initialize the Twinkle client + client = init_twinkle_client( + base_url='http://127.0.0.1:8000', + api_key=os.environ.get('MODELSCOPE_TOKEN'), + ) + + # Step 2: Prepare dataset and dataloader + dataset = create_countdown_dataset() + dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE) + + # Step 3: Configure the training model + model = 
MultiLoraTransformersModel(model_id=MODEL_ID) + + lora_config = LoraConfig( + target_modules='all-linear', + r=8, + lora_alpha=32, + lora_dropout=0.05, + ) + model.add_adapter_to_model( + 'default', + lora_config, + gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, + ) + + # Set GRPO loss (the key difference from SFT training) + model.set_loss('GRPOLoss', epsilon=0.2, beta=0.0) + + # Set optimizer and LR scheduler + model.set_optimizer('AdamW', lr=LEARNING_RATE) + model.set_lr_scheduler( + 'CosineWarmupScheduler', + num_warmup_steps=500, + num_training_steps=MAX_STEPS, + ) + + # Set processor and template for encoding inputs + model.set_processor('InputProcessor') + model.set_template('Template', model_id=MODEL_ID) + + # Step 4: Configure the sampler + sampler = vLLMSampler(model_id=MODEL_ID) + sampler.set_template('Template', model_id=MODEL_ID) + + # Step 5: Setup metrics and advantage function + advantage_fn = GRPOAdvantage() + metrics = CompletionRewardMetric() + + sampling_params = { + 'max_tokens': MAX_NEW_TOKENS, + 'temperature': TEMPERATURE, + 'top_p': 0.95, + } + + # Track the current adapter path for sampling + current_adapter_uri = None + + step = 0 + for batch in dataloader: + if step >= MAX_STEPS: + break + + metrics.reset() + prompts = batch if isinstance(batch, list) else [batch] + + # ========== 1. Save weights and update adapter_uri ========== + # Instead of sync_weights, save the model checkpoint and pass + # the resulting path to the sampler as adapter_uri + if step % SYNC_INTERVAL == 0: + logger.info(f'Step {step}: Saving weights for sampler...') + twinkle_path = model.save( + name=f'grpo-sampler-step-{step}', + save_optimizer=False, + ) + current_adapter_uri = twinkle_path + logger.info(f'Step {step}: Saved weights to {current_adapter_uri}') + + # ========== 2. 
Sample completions ========== + sample_response = sampler.sample( + inputs=prompts, + sampling_params=sampling_params, + adapter_uri=current_adapter_uri, + num_samples=NUM_GENERATIONS, + ) + + input_features = [] + old_logps_list = [] + completion_lengths = [] + + sequences = sample_response.get('sequences', []) + for seq in sequences: + input_features.append(seq.get('new_input_feature', seq)) + old_logps_list.append(seq.get('logprobs', [])) + completion_lengths.append(len(seq.get('tokens', []))) + + if not input_features: + logger.warning(f'Step {step}: No valid samples, skipping') + step += 1 + continue + + # ========== 3. Compute rewards ========== + total_rewards, format_rewards, accuracy_rewards = compute_rewards(input_features) + metrics.accumulate( + None, + None, + completion_lengths=completion_lengths, + rewards={ + 'total': total_rewards, + 'format': format_rewards, + 'accuracy': accuracy_rewards, + }) + + # ========== 4. Compute advantages ========== + advantages = advantage_fn( + total_rewards, + num_generations=NUM_GENERATIONS, + scale='group', + ).tolist() + + frac_zero_std = (1.0 if all(abs(a) < 1e-8 for a in advantages) else 0.0) + if frac_zero_std == 1.0: + logger.info(f'Step {step}: All advantages are zero, skipping training') + step += 1 + continue + + # ========== 5. Training step (GRPO) ========== + # forward_backward with GRPO loss: passes advantages and old_logps + # to the server-side GRPOLoss for proper policy optimization + model.forward_backward( + inputs=input_features, + advantages=advantages, + old_logps=old_logps_list, + ) + + # Gradient clipping and optimizer step + model.clip_grad_norm(1.0) + model.step() + model.zero_grad() + model.lr_step() + + gc.collect() + + # ========== 6. 
Log ========== + log_dict = metrics.calculate() + log_dict.update(model.calculate_metric()) + log_dict['train/frac_reward_zero_std'] = frac_zero_std + logger.info(f'Step {step}: {log_dict}') + step += 1 + + # Save final checkpoint + twinkle_path = model.save(name='grpo-countdown-final', save_optimizer=True) + logger.info(f'Saved final checkpoint: {twinkle_path}') + + +if __name__ == '__main__': + train() diff --git a/cookbook/client/twinkle/megatron/server.py b/cookbook/client/twinkle/megatron/server.py new file mode 100644 index 00000000..3e58a5a9 --- /dev/null +++ b/cookbook/client/twinkle/megatron/server.py @@ -0,0 +1,20 @@ +# Twinkle Server Launcher - Megatron Backend +# +# This script starts the Twinkle server using Ray Serve with Megatron support. +# It reads the server_config.yaml in the same directory for all +# configuration (model, processor, deployment settings, etc.). +# Run this script BEFORE running the client training script (lora.py). + +import os + +# Enable Ray debug mode for verbose logging during development +os.environ['RAY_DEBUG'] = '1' + +from twinkle.server import launch_server + +# Resolve the path to server_config.yaml relative to this script's location +file_dir = os.path.abspath(os.path.dirname(__file__)) +config_path = os.path.join(file_dir, 'server_config.yaml') + +# Launch the Twinkle server — this call blocks until the server is shut down +launch_server(config_path=config_path) diff --git a/cookbook/client/twinkle/megatron/server_config.yaml b/cookbook/client/twinkle/megatron/server_config.yaml new file mode 100644 index 00000000..bb67bcfb --- /dev/null +++ b/cookbook/client/twinkle/megatron/server_config.yaml @@ -0,0 +1,87 @@ +# Twinkle Server Configuration - Megatron Backend + +# Server protocol type: "twinkle" for the native Twinkle client protocol +server_type: twinkle + +# proxy_location: determines where the HTTP proxy runs. +# "EveryNode" means each Ray node runs its own proxy (good for multi-node). 
+proxy_location: EveryNode
+
+# HTTP listener settings
+http_options:
+  host: 0.0.0.0  # Listen on all network interfaces
+  port: 8000  # Port number for the server
+
+# Applications: each entry defines a service component deployed on the server
+applications:
+
+  # 1. TwinkleServer - The central management server
+  #    Handles client connections, training run tracking, checkpoint listing.
+  - name: server
+    route_prefix: /server  # API endpoint prefix
+    import_path: server  # Python module to import
+    args:
+
+    deployments:
+      - name: TwinkleServer
+        autoscaling_config:
+          min_replicas: 1  # Minimum number of replicas
+          max_replicas: 1  # Maximum number of replicas
+          target_ongoing_requests: 128  # Target concurrent requests per replica
+        ray_actor_options:
+          num_cpus: 0.1  # CPU resources allocated to this actor
+
+  # 2. Model Service - Hosts the base model for training (Megatron backend)
+  #    This is the actual model worker that performs forward/backward passes.
+  - name: models-Qwen2.5-3B-Instruct
+    route_prefix: /models/Qwen/Qwen2.5-3B-Instruct  # REST path for this model
+    import_path: model
+    args:
+      use_megatron: true  # Use Megatron-LM backend (not HuggingFace)
+      mixed_precision: bf16
+      model_id: "ms://Qwen/Qwen2.5-3B-Instruct"  # ModelScope model identifier to load
+      nproc_per_node: 2  # Number of GPU processes per node
+      device_group:  # Logical device group for this model
+        name: model
+        ranks: [0,1]  # GPU rank indices to use
+        device_type: cuda
+      device_mesh:  # Distributed training mesh configuration
+        device_type: cuda
+        mesh: [0,1]  # Device indices in the mesh
+        mesh_dim_names: ['dp']  # Mesh dimension names: 'dp' = data parallel
+      adapter_config:
+        per_token_adapter_limit: 30  # Max concurrent LoRA adapters
+        adapter_timeout: 1800  # Seconds before idle adapter unload
+    deployments:
+      - name: ModelManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 16
+        ray_actor_options:
+          num_cpus: 0.1
+
+  # 3. Processor Service - Handles data preprocessing on CPU
+  #    Runs tokenization, template application, and other CPU-bound tasks.
+  - name: processor
+    route_prefix: /processors
+    import_path: processor
+    args:
+      nproc_per_node: 2  # Number of processor workers per node
+      ncpu_proc_per_node: 2  # Number of CPU processes per node
+      device_group:
+        name: model  # NOTE(review): named 'model' like the model service's group — confirm this should not be 'processor'
+        ranks: 2  # CPU rank index — NOTE(review): scalar here while other services use a list like [0,1]; confirm schema
+        device_type: CPU  # NOTE(review): uppercase 'CPU' vs lowercase 'cuda' elsewhere — confirm accepted values
+      device_mesh:
+        device_type: CPU
+        mesh: [0,1]
+        mesh_dim_names: ['dp']
+    deployments:
+      - name: ProcessorManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 128
+        ray_actor_options:
+          num_cpus: 0.1
diff --git a/cookbook/client/twinkle/sample.py b/cookbook/client/twinkle/sample.py
new file mode 100644
index 00000000..27f22fba
--- /dev/null
+++ b/cookbook/client/twinkle/sample.py
@@ -0,0 +1,96 @@
+# Twinkle Client - Sampler (Inference) Example
+#
+# This script demonstrates how to run text generation inference
+# through the Twinkle client-server architecture.
+# The server must be running first (see server.py and server_config.yaml).
+#
+# This is the client/server equivalent of cookbook/legacy/sampler/sampler_demo.py.
+# Instead of running everything locally, the sampler runs on the server side
+# while the client sends requests over HTTP.
+
+# Step 1: Load environment variables from a .env file (e.g., API tokens)
+import dotenv
+
+dotenv.load_dotenv('.env')
+
+import os
+from transformers import AutoTokenizer
+
+from twinkle import get_logger
+from twinkle_client import init_twinkle_client
+from twinkle_client.sampler import vLLMSampler
+
+logger = get_logger()
+
+MODEL_ID = 'Qwen/Qwen2.5-3B-Instruct'
+
+# Optional: adapter URI for LoRA inference
+# This can be a twinkle:// path from a training run checkpoint
+# or None to use the base model
+# ADAPTER_URI = None
+# Example:
+ADAPTER_URI = 'twinkle://20260208_224851-fa3cdd11-default/weights/twinkle-epoch-2'
+
+
+def sample():
+    """Generate completions from the remote sampler and print the decoded text."""
+    # Step 2: Initialize the Twinkle client to communicate with the remote server.
+    client = init_twinkle_client(
+        base_url='http://127.0.0.1:8000',
+        api_key=os.environ.get('MODELSCOPE_TOKEN'),
+    )
+
+    # Step 3: Create the sampler client pointing to the model on the server
+    sampler = vLLMSampler(model_id=MODEL_ID)
+
+    # Step 4: Set the chat template so the sampler can encode Trajectory inputs
+    sampler.set_template('Template', model_id=MODEL_ID)
+
+    # Step 5: Prepare inputs as Trajectory dicts (messages format)
+    # Each trajectory is a conversation with system and user messages
+    trajectory = {
+        'messages': [
+            {
+                'role': 'system',
+                'content': 'You are a helpful assistant.'
+            },
+            {
+                'role': 'user',
+                'content': 'Who are you?'
+            },
+        ]
+    }
+
+    num_prompts = 4
+    num_samples = 2  # Generate 2 completions per prompt
+
+    # Step 6: Configure sampling parameters
+    sampling_params = {
+        'max_tokens': 128,
+        'temperature': 1.0,
+    }
+
+    # Step 7: Call the sampler
+    # - inputs: list of Trajectory dicts (will be encoded server-side using the template)
+    # - sampling_params: controls generation behavior
+    # - adapter_uri: optional LoRA adapter path for fine-tuned inference
+    # - num_samples: number of completions per prompt
+    response = sampler.sample(
+        inputs=[trajectory] * num_prompts,
+        sampling_params=sampling_params,
+        adapter_uri=ADAPTER_URI,
+        num_samples=num_samples,
+    )
+
+    # Step 8: Decode and print the results
+    # NOTE(review): trust_remote_code=True here contrasts with the servers
+    # setting TWINKLE_TRUST_REMOTE_CODE='0' — confirm this is intended.
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+    logger.info(f"Generated {len(response['sequences'])} sequences "
+                f'({num_prompts} prompts x {num_samples} samples)')
+
+    for i, seq in enumerate(response['sequences']):
+        text = tokenizer.decode(seq['tokens'], skip_special_tokens=True)
+        logger.info(f'Sequence {i}:\n {text}\n')
+
+
+if __name__ == '__main__':
+    sample()
diff --git a/cookbook/client/twinkle/self_congnition.py b/cookbook/client/twinkle/self_congnition.py
new file mode 100644
index 00000000..fd23726f
--- /dev/null
+++ b/cookbook/client/twinkle/self_congnition.py
@@ -0,0 +1,140 @@
+# Twinkle Client - Transformers LoRA Training Example
+#
+# This script demonstrates how to fine-tune a language model using LoRA
+# (Low-Rank Adaptation) through the Twinkle client-server architecture.
+# The server must be running first (see server.py and server_config.yaml).
+ +# Step 1: Load environment variables from a .env file (e.g., API tokens) +import dotenv + +dotenv.load_dotenv('.env') + +import os +from peft import LoraConfig + +from twinkle import get_logger +from twinkle.dataset import DatasetMeta +from twinkle_client import init_twinkle_client +from twinkle_client.dataloader import DataLoader +from twinkle_client.dataset import Dataset +from twinkle_client.model import MultiLoraTransformersModel + +logger = get_logger() + +# Whether to use Megatron for training +use_megatron = True +# Step 2: Initialize the Twinkle client to communicate with the remote server. +# - base_url: the address of the running Twinkle server +# - api_key: authentication token (loaded from environment variable) +client = init_twinkle_client(base_url='http://127.0.0.1:8000', api_key=os.environ.get('MODELSCOPE_TOKEN')) + +# Step 3: Query the server for existing training runs and their checkpoints. +# This is useful for resuming a previous training session. +runs = client.list_training_runs() + +resume_path = None +for run in runs: + logger.info(run.model_dump_json(indent=2)) + # List all saved checkpoints for this training run + checkpoints = client.list_checkpoints(run.training_run_id) + + for checkpoint in checkpoints: + logger.info(checkpoint.model_dump_json(indent=2)) + # Uncomment the line below to resume from a specific checkpoint: + # resume_path = checkpoint.twinkle_path + + +def train(): + # Step 4: Prepare the dataset + + # Load the self-cognition dataset from ModelScope + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) + + # Apply a chat template so the data matches the model's expected input format + dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-3B-Instruct', max_length=512) + + # Replace placeholder names in the dataset with custom model/author names + dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'}) + + # Tokenize and 
encode the dataset into model-ready input features + dataset.encode(batched=True) + + # Wrap the dataset into a DataLoader that yields batches of size 4 + dataloader = DataLoader(dataset=dataset, batch_size=4) + + # Step 5: Configure the model + + # Create a multi-LoRA Transformers model pointing to the base model on ModelScope + model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen2.5-3B-Instruct') + + # Define LoRA configuration: apply low-rank adapters to all linear layers + lora_config = LoraConfig(target_modules='all-linear') + + # Attach the LoRA adapter named 'default' to the model. + # gradient_accumulation_steps=2 means gradients are accumulated over 2 micro-batches + # before an optimizer step, effectively doubling the batch size. + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + + # Set the same chat template used during data preprocessing + model.set_template('Template') + + # Set the input processor (pads sequences on the right side) + model.set_processor('InputProcessor', padding_side='right') + + # Use cross-entropy loss for language modeling + model.set_loss('CrossEntropyLoss') + + # Use Adam optimizer with a learning rate of 1e-4 (Only support Adam optimizer if server use megatron) + model.set_optimizer('Adam', lr=1e-4) + + # Use a linear learning rate scheduler (Do not support LR scheduler if server use megatron) + if not use_megatron: + model.set_lr_scheduler('LinearLR') + + # Step 6: Optionally resume from a previous checkpoint + if resume_path: + logger.info(f'Resuming training from {resume_path}') + model.load(resume_path, load_optimizer=True) + + # Step 7: Run the training loop + logger.info(model.get_train_configs()) + + for epoch in range(3): + logger.info(f'Starting epoch {epoch}') + for step, batch in enumerate(dataloader): + # Forward pass + backward pass (computes gradients) + output = model.forward_backward(inputs=batch) + + # Log the loss every 2 steps (aligned with gradient accumulation) + if 
step % 2 == 0: + logger.info(f'Current is step {step // 2}, loss: {output}') + + # Clip gradients to prevent exploding gradients (max norm = 1.0) + model.clip_grad_norm(1.0) + + # Perform one optimizer step (update model weights) + model.step() + + # Reset gradients to zero for the next iteration + model.zero_grad() + + # Advance the learning rate scheduler by one step + model.lr_step() + + # Step 8: Save the trained checkpoint + twinkle_path = model.save(name=f'twinkle-epoch-{epoch}', save_optimizer=True) + logger.info(f'Saved checkpoint: {twinkle_path}') + + # Step 9: Upload the checkpoint to ModelScope Hub + # YOUR_USER_NAME = "your_username" + # hub_model_id = f'{YOUR_USER_NAME}/twinkle-self-cognition' + # model.upload_to_hub( + # checkpoint_dir=twinkle_path, + # hub_model_id=hub_model_id, + # async_upload=False + # ) + # logger.info(f"Uploaded checkpoint to hub: {hub_model_id}") + + +if __name__ == '__main__': + train() diff --git a/cookbook/client/twinkle/transformer/server.py b/cookbook/client/twinkle/transformer/server.py new file mode 100644 index 00000000..ba84e2dd --- /dev/null +++ b/cookbook/client/twinkle/transformer/server.py @@ -0,0 +1,20 @@ +# Twinkle Server Launcher - Transformers Backend +# +# This script starts the Twinkle server using Ray Serve. +# It reads the server_config.yaml in the same directory for all +# configuration (model, processor, deployment settings, etc.). +# Run this script BEFORE running the client training script (lora.py). 
+ +import os + +# Enable Ray debug mode for verbose logging during development +os.environ['RAY_DEBUG'] = '1' + +from twinkle.server import launch_server + +# Resolve the path to server_config.yaml relative to this script's location +file_dir = os.path.abspath(os.path.dirname(__file__)) +config_path = os.path.join(file_dir, 'server_config.yaml') + +# Launch the Twinkle server — this call blocks until the server is shut down +launch_server(config_path=config_path) diff --git a/cookbook/client/twinkle/transformer/server_config.yaml b/cookbook/client/twinkle/transformer/server_config.yaml new file mode 100644 index 00000000..93fe8592 --- /dev/null +++ b/cookbook/client/twinkle/transformer/server_config.yaml @@ -0,0 +1,128 @@ +# Twinkle Server Configuration - Transformers Backend + +# Server protocol type: "twinkle" for the native Twinkle client protocol +server_type: twinkle + +# proxy_location: determines where the HTTP proxy runs. +# "EveryNode" means each Ray node runs its own proxy (good for multi-node). +proxy_location: EveryNode + +# HTTP listener settings +http_options: + host: 0.0.0.0 # Listen on all network interfaces + port: 8000 # Port number for the server + +# Applications: each entry defines a service component deployed on the server +applications: + + # 1. TwinkleServer - The central management server + # Handles client connections, training run tracking, checkpoint listing. + - name: server + route_prefix: /server # API endpoint prefix + import_path: server # Python module to import + args: + + deployments: + - name: TwinkleServer + autoscaling_config: + min_replicas: 1 # Minimum number of replicas + max_replicas: 1 # Maximum number of replicas + target_ongoing_requests: 128 # Target concurrent requests per replica + ray_actor_options: + num_cpus: 0.1 # CPU resources allocated to this actor + + # 2. Model Service - Hosts the base model for training + # This is the actual model worker that performs forward/backward passes. 
+ - name: models-Qwen2.5-3B-Instruct + route_prefix: /models/Qwen/Qwen2.5-3B-Instruct # REST path for this model + import_path: model + args: + use_megatron: false # Use HuggingFace Transformers (not Megatron) + model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier to load + adapter_config: + per_token_adapter_limit: 30 # Max LoRA adapters that can be active simultaneously + adapter_timeout: 1800 # Seconds before an idle adapter is unloaded + nproc_per_node: 2 # Number of GPU processes per node + device_group: # Logical device group for this model + name: model + ranks: [0,1] # GPU rank indices to use + device_type: cuda + device_mesh: # Distributed training mesh configuration + device_type: cuda + dp_size: 2 # Mesh dimension names: 'dp' = data parallel + deployments: + - name: ModelManagement + autoscaling_config: + min_replicas: 1 + max_replicas: 1 + target_ongoing_requests: 16 + ray_actor_options: + num_cpus: 0.1 + runtime_env: + env_vars: + TWINKLE_TRUST_REMOTE_CODE: "0" + DEVICE_COUNT_PER_PHYSICAL_NODE: "8" + + # 3. Processor Service - Handles data preprocessing on CPU + # Runs tokenization, template application, and other CPU-bound tasks. + - name: processor + route_prefix: /processors + import_path: processor + args: + nproc_per_node: 2 # Number of processor workers per node + ncpu_proc_per_node: 2 # Number of CPU processes per node + device_group: + name: model + ranks: 2 # CPU rank index + device_type: CPU + device_mesh: + device_type: CPU + mesh: [0,1] + mesh_dim_names: ['dp'] + deployments: + - name: ProcessorManagement + autoscaling_config: + min_replicas: 1 + max_replicas: 1 + target_ongoing_requests: 128 + ray_actor_options: + num_cpus: 0.1 + runtime_env: + env_vars: + TWINKLE_TRUST_REMOTE_CODE: "0" + DEVICE_COUNT_PER_PHYSICAL_NODE: "8" + + # 4. Sampler Service - Handles text generation inference + # Uses vLLM for efficient batched generation with optional LoRA adapters. 
+ - name: sampler-Qwen2.5-3B-Instruct + route_prefix: /samplers/Qwen/Qwen2.5-3B-Instruct # REST path for this sampler + import_path: sampler + args: + model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier to load + sampler_type: vllm # Sampler backend (vllm or torch) + nproc_per_node: 2 # Number of GPU processes per node + engine_args: # vLLM engine configuration + gpu_memory_utilization: 0.4 + max_model_len: 1024 + adapter_config: # Adapter lifecycle management + per_token_adapter_limit: 30 # Max LoRA adapters per user + adapter_timeout: 1800 # Seconds before idle adapter is unloaded + device_group: + name: sampler + ranks: [2] # GPU rank indices to use + device_type: cuda + device_mesh: + device_type: cuda + dp_size: 1 + deployments: + - name: SamplerManagement + autoscaling_config: + min_replicas: 1 + max_replicas: 1 + target_ongoing_requests: 16 + ray_actor_options: + num_cpus: 0.1 + runtime_env: + env_vars: + TWINKLE_TRUST_REMOTE_CODE: "0" + DEVICE_COUNT_PER_PHYSICAL_NODE: "8" diff --git a/cookbook/megatron/tp.py b/cookbook/megatron/tp.py new file mode 100644 index 00000000..662bd50f --- /dev/null +++ b/cookbook/megatron/tp.py @@ -0,0 +1,83 @@ +import os +from peft import LoraConfig +from tqdm import tqdm + +import twinkle +from twinkle import DeviceMesh, Platform, get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import MegatronModel +from twinkle.preprocessor import SelfCognitionProcessor + +# Construct a device_mesh, tp=pp=cp=2, dp=1 +device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2) +# use torchrun mode +twinkle.initialize(mode='local', global_device_mesh=device_mesh) + +logger = get_logger() + + +def eval(model): + # 100 Samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) + dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + 
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + dataset.encode() + dataloader = DataLoader(dataset=dataset, batch_size=16) + for step, batch in tqdm(enumerate(dataloader)): + model.forward_only(inputs=batch) + metrics = model.calculate_metric(is_training=False) + return metrics + + +def train(): + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + # Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + # Encode dataset + dataset.encode() + # Global batch size = 1, dp_size = 1 + dataloader = DataLoader(dataset=dataset, batch_size=16) + # Use a MegatronModel + model = MegatronModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct') + + lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + + # Add a lora to model, with name `default` + # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='default', lr=1e-4) + # Add LRScheduler for lora `default` + model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=5, lr_decay_steps=len(dataloader)) + logger.info(get_device_placement()) + # Print the training config + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}') + loss_metric = 99.0 + # lora: 10G * 8 + # full: 40G * 8 + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 5 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + if step > 0 and step % 20 == 0: + metrics = eval(model) + logger.info(f'Eval metric: {metrics}') + metrics['step'] = 
step + if loss_metric > float(metrics['loss']): + model.save(f'checkpoint-{step}') + loss_metric = float(metrics['loss']) + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() diff --git a/cookbook/megatron/tp.sh b/cookbook/megatron/tp.sh new file mode 100644 index 00000000..5516130e --- /dev/null +++ b/cookbook/megatron/tp.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp.py diff --git a/cookbook/megatron/tp_moe.py b/cookbook/megatron/tp_moe.py new file mode 100644 index 00000000..7de83962 --- /dev/null +++ b/cookbook/megatron/tp_moe.py @@ -0,0 +1,82 @@ +import os +from peft import LoraConfig +from tqdm import tqdm + +import twinkle +from twinkle import DeviceMesh, Platform, get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import MegatronModel +from twinkle.preprocessor import SelfCognitionProcessor + +# Construct a device_mesh, tp=pp=cp=ep=2, dp=1 +device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2, ep_size=2) +# use torchrun mode +twinkle.initialize(mode='local', global_device_mesh=device_mesh) + +logger = get_logger() + + +def eval(model): + # 100 Samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507') + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + dataset.encode() + dataloader = DataLoader(dataset=dataset, batch_size=16) + for step, batch in tqdm(enumerate(dataloader)): + model.forward_only(inputs=batch) + metrics = model.calculate_metric(is_training=False) + return metrics + + +def train(): + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507') + 
# Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + # Encode dataset + dataset.encode() + # Global batch size = 1, dp_size = 1 + dataloader = DataLoader(dataset=dataset, batch_size=16) + # Use a MegatronModel + model = MegatronModel(model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507') + + lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + + # Add a lora to model, with name `default` + # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='default', lr=1e-4) + # Add LRScheduler for lora `default` + model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=5, lr_decay_steps=len(dataloader)) + logger.info(get_device_placement()) + # Print the training config + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}') + loss_metric = 99.0 + # lora: 23G * 8 + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 5 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + if step > 0 and step % 20 == 0: + metrics = eval(model) + logger.info(f'Eval metric: {metrics}') + metrics['step'] = step + if loss_metric > float(metrics['loss']): + model.save(f'checkpoint-{step}') + loss_metric = float(metrics['loss']) + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() diff --git a/cookbook/megatron/tp_moe.sh b/cookbook/megatron/tp_moe.sh new file mode 100644 index 00000000..58e58646 --- /dev/null +++ b/cookbook/megatron/tp_moe.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe.py diff --git a/cookbook/ray/run.sh b/cookbook/ray/run.sh new file mode 100644 index 00000000..bbf8a400 --- 
/dev/null +++ b/cookbook/ray/run.sh @@ -0,0 +1 @@ +python3 single_controller.py diff --git a/cookbook/ray/single_controller.py b/cookbook/ray/single_controller.py new file mode 100644 index 00000000..d0a0e730 --- /dev/null +++ b/cookbook/ray/single_controller.py @@ -0,0 +1,91 @@ +import os +from peft import LoraConfig +from tqdm import tqdm + +import twinkle +from twinkle import DeviceGroup, DeviceMesh, Platform, get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +device_group = [DeviceGroup( + name='default', + ranks=8, + device_type='cuda', +)] + +# Construct a device_mesh, fsdp=4, dp=2 +device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2) +# use ray mode +twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_mesh) + +logger = get_logger() + + +def eval(model): + # 100 Samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) + dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + dataset.encode() + dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) + for step, batch in tqdm(enumerate(dataloader)): + model.forward_only(inputs=batch) + model.calculate_loss() + metrics = model.calculate_metric(is_training=False) + return metrics + + +def train(): + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + # Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + # Encode dataset + dataset.encode() + # Global batch size = 8, for GPUs, so 1 sample per GPU + dataloader = 
DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) + # Use a TransformersModel + model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', remote_group='default') + + lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + + # Add a lora to model, with name `default` + # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='AdamW', lr=1e-4) + # Add LRScheduler for lora `default` + model.set_lr_scheduler( + scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader)) + logger.info(get_device_placement()) + # Print the training config + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}') + loss_metric = 99.0 + # lora: 18G * 4 + # full: 50G * 4 + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 20 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + if step > 0 and step % 40 == 0: + metrics = eval(model) + logger.info(f'Eval metric: {metrics}') + metrics['step'] = step + if loss_metric > float(metrics['loss']): + model.save(f'checkpoint-{step}') + loss_metric = float(metrics['loss']) + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() diff --git a/cookbook/rl/grpo.py b/cookbook/rl/grpo.py new file mode 100644 index 00000000..4b217725 --- /dev/null +++ b/cookbook/rl/grpo.py @@ -0,0 +1,184 @@ +import os +from typing import List, Tuple, Dict, Any + +from peft import LoraConfig + +import twinkle +from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger +from twinkle.advantage import GRPOAdvantage +from twinkle.checkpoint_engine import CheckpointEngineManager +from 
twinkle.data_format import SamplingParams +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.processor import InputProcessor +from twinkle.reward import GSM8KAccuracyReward, GSM8KFormatReward +from twinkle.sampler import vLLMSampler +from twinkle.template import Template +from twinkle.metric import CompletionRewardMetric +from twinkle.preprocessor.llm import GSM8KProcessor + +logger = get_logger() + +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen2.5-3B-Instruct') +USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1'))) + +MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) +SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS',4)) +NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS + +NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8)) +MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096)) +LEARNING_RATE = float(os.environ.get('LR', 1e-5)) +MAX_STEPS = int(os.environ.get('MAX_STEPS', 200)) +BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 16)) # global prompt-level, global completion-level batch size = BATCH_SIZE * num_generations * dp_size +MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 16)) # global completion-level mini-batch-size +MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2)) # per-device-micro-batch-size (completion-level), batch_size in forward_backward +GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1)) +ADAPTER_NAME = 'default' + +def create_gsm8k_dataset(): + dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) + dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.map(GSM8KProcessor()) + dataset.encode(add_generation_prompt=True) + return dataset + +def compute_rewards( + trajectories: List[Dict[str, Any]], +) -> Tuple[List[float], List[float], List[float]]: + accuracy_reward_fn = GSM8KAccuracyReward() + format_reward_fn = 
GSM8KFormatReward() + + accuracy_rewards = accuracy_reward_fn(trajectories) + format_rewards = format_reward_fn(trajectories) + total_rewards = [a + f for a, f in zip(accuracy_rewards, format_rewards)] + return total_rewards, format_rewards, accuracy_rewards + +def main(): + # set sampler and model separate to use different gpus + device_groups = [ + DeviceGroup(name='model',ranks=list(range(MODEL_GPUS)),device_type='GPU'), + DeviceGroup(name='sampler',ranks=list(range(MODEL_GPUS, NUM_GPUS)),device_type='GPU'), + ] + if USE_MEGATRON: + model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS) + else: + model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS) + sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS) + twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False) + + lora_config = LoraConfig(target_modules='all-linear', r=32, lora_alpha=64, lora_dropout=0.05) + + if USE_MEGATRON: + from twinkle.model.megatron import MegatronModel + model = MegatronModel(model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model', mixed_precision='bf16') + else: + model = TransformersModel(model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model') + + model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=1) + if USE_MEGATRON: + model.set_optimizer('default', lr=LEARNING_RATE) + model.set_lr_scheduler('default', lr_decay_steps=MAX_STEPS, max_lr=LEARNING_RATE) + else: + model.set_optimizer('AdamW', lr=LEARNING_RATE) + model.set_lr_scheduler('CosineAnnealingLR', T_max=MAX_STEPS, eta_min=0) + model.set_loss('GRPOLoss', epsilon=0.2) + model.set_processor(InputProcessor) + model.set_template('Template', model_id=MODEL_ID) + + sampler = vLLMSampler( + model_id=MODEL_ID, + engine_args={ + 'gpu_memory_utilization': 0.8, + 'max_model_len': 4096, + 'max_lora_rank': 32, # save as lora_config + 'enable_lora': True, + }, + 
device_mesh=sampler_mesh, + remote_group='sampler', + ) + sampler.set_template(Template, model_id=MODEL_ID) + + ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) + + GLOBAL_BATCH_SIZE = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS + dataloader = DataLoader( + dataset=create_gsm8k_dataset, + batch_size=GLOBAL_BATCH_SIZE, + min_batch_size=GLOBAL_BATCH_SIZE, + device_mesh=model_mesh, + remote_group='model', + ) + advantage_fn = GRPOAdvantage() + metrics = CompletionRewardMetric() + + sampling_params = SamplingParams(max_tokens=MAX_NEW_TOKENS) + + optim_step = 0 + logger.info(get_device_placement()) + + for batch in dataloader: + if optim_step >= MAX_STEPS: + break + metrics.reset() + global_prompts = batch if isinstance(batch, list) else [batch] + ckpt_manager.sync_weights(merge_and_sync=False) + sampler.reset_prefix_cache() + sample_response = sampler.sample( + global_prompts*NUM_GENERATIONS, + sampling_params, + num_samples=1, + ) + + all_input_data: List[Dict[str, Any]] = [] + all_old_logps: List[List[float]] = [] + all_completion_lengths: List[int] = [] + + for sequence in sample_response.sequences: + all_input_data.append(sequence.new_input_feature) + all_old_logps.append(sequence.logprobs) + all_completion_lengths.append(len(sequence.tokens)) + total_rewards, format_rewards, accuracy_rewards = compute_rewards( + all_input_data + ) + metrics.accumulate( + completion_lengths=all_completion_lengths, + rewards={ + 'total': total_rewards, + 'format': format_rewards, + 'accuracy': accuracy_rewards, + }, + ) + + advantages = advantage_fn(total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist() + + # Split completions into mini-batches and run one optim step per mini-batch. 
+ total_completions = len(all_input_data) + for mb_start in range(0, total_completions, MINI_BATCH_SIZE): + mb_end = min(mb_start + MINI_BATCH_SIZE, total_completions) + mb_inputs = all_input_data[mb_start:mb_end] + mb_old_logps = all_old_logps[mb_start:mb_end] + mb_advantages = advantages[mb_start:mb_end] + + model.forward_backward( + inputs=mb_inputs, + old_logps=mb_old_logps, + advantages=mb_advantages, + micro_batch_size=MICRO_BATCH_SIZE, + ) + model.clip_grad_and_step() + optim_step += 1 + + if optim_step >= MAX_STEPS: + break + log_dict = metrics.calculate() + log_dict.update(model.calculate_metric(is_training=True)) + metrics.reset() + logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}') + + logger.info(f'Training completed. optim_steps={optim_step}') + model.save('grpo-gsm8k-checkpoint') + +if __name__ == '__main__': + main() diff --git a/cookbook/transformers/ep_fsdp_qwen3_moe.py b/cookbook/transformers/ep_fsdp_qwen3_moe.py new file mode 100644 index 00000000..6473dc63 --- /dev/null +++ b/cookbook/transformers/ep_fsdp_qwen3_moe.py @@ -0,0 +1,95 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. 
+import numpy as np +import os +from transformers import AutoConfig + +import twinkle +from twinkle import DeviceMesh, Platform, get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +logger = get_logger() + +MODEL_ID = os.environ.get('QWEN3_MODEL_ID', 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507') +DATASET_ID = os.environ.get('DATASET_ID', 'ms://swift/self-cognition') +TEMPLATE_ID = os.environ.get('TEMPLATE_ID', 'Template') +_num_layers_env = os.environ.get('NUM_LAYERS') +NUM_LAYERS = int(_num_layers_env) if _num_layers_env is not None else None + +# 4 gpus, dp=2, ep=2 +dp_size = 2 +ep_size = 2 +ulysses_size = 2 + +device_mesh = DeviceMesh( + device_type=Platform.get_platform().device_prefix(), + mesh=np.arange(dp_size * ep_size).reshape(dp_size, ep_size), + mesh_dim_names=('dp', 'ep'), + ulysses_size=ulysses_size, # enable sp +) + +twinkle.initialize( + mode='local', + global_device_mesh=device_mesh, +) + + +def train(): + config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True) + if NUM_LAYERS is not None and hasattr(config, 'num_hidden_layers'): + config.num_hidden_layers = NUM_LAYERS + if hasattr(config, 'use_cache'): + config.use_cache = False + + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + try: + dataset.set_template(TEMPLATE_ID, model_id=MODEL_ID) + except ValueError: + dataset.set_template('Template', model_id=MODEL_ID) + + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + dataset.encode(batched=True) + dataloader = DataLoader( + dataset=dataset, + batch_size=4, + device_mesh=device_mesh, + ) + + grad_accum_steps = 4 + model = TransformersModel( + model_id=MODEL_ID, + config=config, + device_mesh=device_mesh, + fsdp_config={ + 'expert_parallel': { + 'enabled': True, + 'router_dtype': 'fp32', + 
'all_to_all': 'torch', + 'keep_router_logits': False, + } + }, + ) + # Disable foreach to avoid DTensor mixed-type errors in EP runs. + model.set_optimizer('AdamW', foreach=False) + + logger.info(get_device_placement()) + logger.info(model.get_train_configs()) + + for step, batch in enumerate(dataloader): + if callable(batch): + batch = batch() + model.forward_backward(inputs=batch, gradient_accumulation_steps=grad_accum_steps) + model.clip_grad_and_step(gradient_accumulation_steps=grad_accum_steps) + if step % grad_accum_steps == 0: + metric = model.calculate_metric(is_training=True) + if callable(metric): + metric = metric() + logger.info(f'Current is step {step // grad_accum_steps}, metric: {metric}') + if step > 0 and step % 50 == 0: + model.save('./output') + + +if __name__ == '__main__': + train() diff --git a/cookbook/transformers/ep_fsdp_qwen3_moe.sh b/cookbook/transformers/ep_fsdp_qwen3_moe.sh new file mode 100644 index 00000000..cfc8a7cf --- /dev/null +++ b/cookbook/transformers/ep_fsdp_qwen3_moe.sh @@ -0,0 +1,7 @@ +# EP + FSDP2 (Transformers MoE) example. +# With expert_parallel enabled, expert parameters are sharded across the EP dimension. +# Non-expert parameters are sharded by FSDP (across world_size). +# Officially validated scope: qwen3_moe_like models (for example, Qwen3-30B-A3B). +# Other MoE models may work if their MoE blocks expose: `experts` + `gate/router` + `top_k` (or `num_experts_per_tok`). +# EP runtime constraints: `num_experts % ep_world_size == 0`. 
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 ep_fsdp_qwen3_moe.py diff --git a/cookbook/transformers/fsdp2.py b/cookbook/transformers/fsdp2.py new file mode 100644 index 00000000..586000fc --- /dev/null +++ b/cookbook/transformers/fsdp2.py @@ -0,0 +1,85 @@ +import os +from peft import LoraConfig +from tqdm import tqdm + +import twinkle +from twinkle import DeviceMesh, Platform, get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +# Construct a device_mesh, fsdp=4, dp=2 +device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2) +# use torchrun mode +twinkle.initialize(mode='local', global_device_mesh=device_mesh) + +logger = get_logger() + + +def eval(model): + # 100 Samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) + dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + dataset.encode() + dataloader = DataLoader(dataset=dataset, batch_size=8) + for step, batch in tqdm(enumerate(dataloader)): + model.forward_only(inputs=batch) + model.calculate_loss() + metrics = model.calculate_metric(is_training=False) + return metrics + + +def train(): + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + # Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + # Encode dataset + dataset.encode() + # Global batch size = 8, for GPUs, so 1 sample per GPU + dataloader = DataLoader(dataset=dataset, batch_size=8) + # Use a TransformersModel + model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct') + + 
lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + + # Add a lora to model, with name `default` + # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='AdamW', lr=1e-4) + # Add LRScheduler for lora `default` + model.set_lr_scheduler( + scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader)) + logger.info(get_device_placement()) + # Print the training config + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}') + loss_metric = 99.0 + # lora: 18G * 4 + # full: 50G * 4 + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 20 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + if step > 0 and step % 40 == 0: + metrics = eval(model) + logger.info(f'Eval metric: {metrics}') + metrics['step'] = step + if loss_metric > float(metrics['loss']): + model.save(f'checkpoint-{step}') + loss_metric = float(metrics['loss']) + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() diff --git a/cookbook/transformers/fsdp2.sh b/cookbook/transformers/fsdp2.sh new file mode 100644 index 00000000..93c531a9 --- /dev/null +++ b/cookbook/transformers/fsdp2.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 fsdp2.py diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py new file mode 100644 index 00000000..3ea649d3 --- /dev/null +++ b/cookbook/transformers/fsdp2_moe.py @@ -0,0 +1,88 @@ +import os +from peft import LoraConfig +from tqdm import tqdm + +import twinkle +from twinkle import DeviceMesh, Platform, get_device_placement, get_logger +from 
twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +# Construct a device_mesh, fsdp=4, dp=2 +device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2) +# use torchrun mode +twinkle.initialize(mode='local', global_device_mesh=device_mesh) + +logger = get_logger() + + +def eval(model): + # 100 Samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507') + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + dataset.encode() + dataloader = DataLoader(dataset=dataset, batch_size=4) + for step, batch in tqdm(enumerate(dataloader)): + model.forward_only(inputs=batch) + model.calculate_loss() + metrics = model.calculate_metric(is_training=False) + return metrics + + +def train(): + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507') + # Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + # Encode dataset + dataset.encode() + # Global batch size = 8, for 8 GPUs, so 1 sample per GPU + dataloader = DataLoader(dataset=dataset, batch_size=8) + # Use a TransformersModel, transformer_cls_names_to_wrap=Qwen3MoeSparseMoeBlock to avoid hang of fsdp2 + model = TransformersModel(model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507', fsdp_config={'transformer_cls_names_to_wrap':['Qwen3MoeSparseMoeBlock']}) + # Patch MoE model to fix the hang bug, support transformers==4.* + model.apply_patch('ms://twinkle-kit/qwen3_moe_transformers4_patch') + lora_config = LoraConfig( + r=8, + lora_alpha=32, + target_modules='all-linear' + ) + + # Add a lora to model, with name `default` 
+ # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='AdamW', lr=1e-4) + # Add LRScheduler for lora `default` + model.set_lr_scheduler(scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader)) + logger.info(get_device_placement()) + # Print the training config + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}') + loss_metric = 99.0 + # lora: 34G * 8 + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 20 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + if step > 0 and step % 40 == 0: + metrics = eval(model) + logger.info(f'Eval metric: {metrics}') + metrics['step'] = step + if loss_metric > float(metrics['loss']): + model.save(f'checkpoint-{step}') + loss_metric = float(metrics['loss']) + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() diff --git a/cookbook/transformers/fsdp2_moe.sh b/cookbook/transformers/fsdp2_moe.sh new file mode 100644 index 00000000..c496cd1d --- /dev/null +++ b/cookbook/transformers/fsdp2_moe.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 fsdp2_moe.py diff --git a/cookbook/transformers/sp_fsdp_dense.py b/cookbook/transformers/sp_fsdp_dense.py new file mode 100644 index 00000000..7a563a2c --- /dev/null +++ b/cookbook/transformers/sp_fsdp_dense.py @@ -0,0 +1,94 @@ +import numpy as np +from functools import partial +from peft import LoraConfig + +import twinkle +from twinkle import DeviceGroup, DeviceMesh, Platform, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import 
TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +logger = get_logger() +MODEL_ID = 'ms://Qwen/Qwen2.5-7B-Instruct' +DATASETS = 'ms://swift/self-cognition' + +device_group = [DeviceGroup( + name='default', + ranks=[0, 1, 2, 3], + device_type=Platform.get_platform().device_prefix(), +)] + +# FSDP + SP validation over 4 GPUs: dp=2, fsdp=2 (SP only affects input slicing) +device_mesh = DeviceMesh( + device_type='cuda', + mesh=np.arange(4).reshape(2, 2), + mesh_dim_names=('dp', 'fsdp'), + ulysses_size=2, +) + +twinkle.initialize( + mode='local', + nproc_per_node=4, + global_device_mesh=device_mesh, + lazy_collect=False, +) + + +def eval(model): + dataloader = DataLoader( + dataset=partial(create_dataset, data_slice=range(100)), + batch_size=4, + device_mesh=device_mesh, + ) + for _, batch in enumerate(dataloader): + model.forward_only(inputs=batch, adapter_name='default') + model.calculate_loss(adapter_name='default') + return model.calculate_metric(is_training=False, adapter_name='default') + + +def create_dataset(data_slice=None): + dataset = Dataset(dataset_meta=DatasetMeta(DATASETS, data_slice=range(500))) + dataset.set_template('Template', model_id=MODEL_ID) + dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队')) + dataset.encode(batched=True) + return dataset + + +def train(): + dataloader = DataLoader( + dataset=partial(create_dataset, data_slice=None), + batch_size=8, + device_mesh=device_mesh, + ) + + model = TransformersModel( + model_id=MODEL_ID, + device_mesh=device_mesh, + strategy='native_fsdp', + ) + + lora_config = LoraConfig(target_modules='all-linear') + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=1) + model.set_optimizer('AdamW', lr=1e-4, adapter_name='default') + model.set_lr_scheduler( + scheduler_cls='CosineWarmupScheduler', + num_warmup_steps=5, + num_training_steps=len(dataloader), + adapter_name='default', + ) + + logger.info(model.get_train_configs(adapter_name='default')) 
+ logger.info(f'Total steps: {len(dataloader)}') + + for step, batch in enumerate(dataloader): + model.forward_backward(inputs=batch, adapter_name='default') + model.clip_grad_and_step(adapter_name='default') + if step % 20 == 0: + metric = model.calculate_metric(is_training=True, adapter_name='default') + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + model.save('last-checkpoint', interval=1) + + +if __name__ == '__main__': + train() diff --git a/cookbook/transformers/sp_fsdp_dense.sh b/cookbook/transformers/sp_fsdp_dense.sh new file mode 100644 index 00000000..dd04a2b0 --- /dev/null +++ b/cookbook/transformers/sp_fsdp_dense.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# To enable sequence parallelism, please set ulysses_size > 1 +# device_mesh = DeviceMesh( +# device_type="cuda", +# mesh=np.arange(4).reshape(2, 2), +# mesh_dim_names=("dp", "fsdp"), +# ulysses_size=2, +# ) +# +CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 sp_fsdp_dense.py diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d0c3cbf1 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..8ccd292e --- /dev/null +++ b/docs/README.md @@ -0,0 +1,37 @@ +## maintain docs +1. 
build docs + ```shell + # in root directory: + make docs + ``` + +2. doc string format + + We adopt the google style docstring format as the standard, please refer to the following documents. + 1. Google Python style guide docstring [link](http://google.github.io/styleguide/pyguide.html#381-docstrings) + 2. Google docstring example [link](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) + 3. sample:torch.nn.modules.conv [link](https://pytorch.org/docs/stable/_modules/torch/nn/modules/conv.html#Conv1d) + 4. load function as an example: + + ```python + def load(file, file_format=None, **kwargs): + """Load data from json/yaml/pickle files. + + This method provides a unified api for loading data from serialized files. + + Args: + file (str or :obj:`Path` or file-like object): Filename or a file-like + object. + file_format (str, optional): If not specified, the file format will be + inferred from the file extension, otherwise use the specified one. + Currently supported formats include "json", "yaml/yml". + + Examples: + >>> load('/path/of/your/file') # file is stored in disk + >>> load('https://path/of/your/file') # file is stored on internet + >>> load('oss://path/of/your/file') # file is stored in petrel + + Returns: + The content from the file. + """ + ``` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..9534b018 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source_en/.readthedocs.yaml b/docs/source_en/.readthedocs.yaml new file mode 100644 index 00000000..ae642329 --- /dev/null +++ b/docs/source_en/.readthedocs.yaml @@ -0,0 +1,15 @@ +# .readthedocs.yaml +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.11" + jobs: + pre_install: + - pip install poetry + - poetry config virtualenvs.create false + - poetry install --only docs --no-interaction --no-ansi + +sphinx: + configuration: docs/source_en/conf.py diff --git a/docs/source_en/Components/Advantage/Advantage.md b/docs/source_en/Components/Advantage/Advantage.md new file mode 100644 index 00000000..fed5028a --- /dev/null +++ b/docs/source_en/Components/Advantage/Advantage.md @@ -0,0 +1,61 @@ +# Advantage + +Advantage functions are components in reinforcement learning used to calculate the advantage of an action relative to the average performance. In RLHF training, advantage functions guide policy optimization. + +## Basic Interface + +```python +class Advantage: + + def __call__(self, + rewards: Union['torch.Tensor', List[float]], + num_generations: int = 1, + scale: Literal['group', 'batch', 'none'] = 'group', + **kwargs) -> 'torch.Tensor': + """ + Calculate advantage values + + Args: + rewards: List or tensor of reward values + num_generations: Number of samples generated per prompt + scale: Normalization method + - 'group': Normalize per group (GRPO) + - 'batch': Normalize across entire batch + - 'none': No normalization + + Returns: + Advantage tensor + """ + ... 
+``` + +## Available Advantage Functions + +Twinkle provides two advantage function implementations: + +### GRPOAdvantage + +GRPO (Group Relative Policy Optimization) advantage function calculates advantages by subtracting the group mean. + +- Simple and efficient, suitable for most scenarios +- Reduces variance and improves training stability +- Performs relative comparisons within groups + +See: [GRPOAdvantage](GRPOAdvantage.md) + +### RLOOAdvantage + +RLOO (Reinforcement Learning with Leave-One-Out) advantage function uses leave-one-out method to calculate baselines. + +- Theoretically superior, reduces bias +- Requires more samples (recommend 8 or more) +- More accurate counterfactual baseline estimation + +See: [RLOOAdvantage](RLOOAdvantage.md) + +## How to Choose + +- **GRPO**: Suitable for scenarios with fewer samples (around 4), high computational efficiency +- **RLOO**: Suitable for scenarios with more samples (8 or more), better theoretical performance + +> The choice of advantage function has a significant impact on RLHF training effectiveness. It's recommended to choose based on computational resources and sample quantity. diff --git a/docs/source_en/Components/Advantage/GRPOAdvantage.md b/docs/source_en/Components/Advantage/GRPOAdvantage.md new file mode 100644 index 00000000..381b7605 --- /dev/null +++ b/docs/source_en/Components/Advantage/GRPOAdvantage.md @@ -0,0 +1,68 @@ +# GRPOAdvantage + +GRPO (Group Relative Policy Optimization) advantage function calculates advantages by subtracting the group mean. 
+ +## Usage Example + +```python +from twinkle.advantage import GRPOAdvantage + +advantage_fn = GRPOAdvantage() + +# Assume 2 prompts, each generating 4 samples +rewards = [0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0] # 8 reward values +advantages = advantage_fn(rewards, num_generations=4, scale='group') + +# Advantages will be each group minus the group mean: +# Group 1: [0.0-0.5, 1.0-0.5, 0.0-0.5, 1.0-0.5] = [-0.5, 0.5, -0.5, 0.5] +# Group 2: [1.0-0.25, 0.0-0.25, 0.0-0.25, 0.0-0.25] = [0.75, -0.25, -0.25, -0.25] +``` + +## How It Works + +GRPO groups samples (each group corresponds to multiple generations from one prompt), then within each group: +1. Calculate the group mean reward +2. Advantage for each sample = reward - group mean +3. Optionally normalize the advantage values + +This method: +- Reduces variance and improves training stability +- Performs relative comparisons within groups, better aligned with relative nature of human preferences +- Avoids the impact of reward scale + +## Complete Training Example + +Using the advantage function in GRPO training: + +```python +from twinkle.advantage import GRPOAdvantage +from twinkle.model import TransformersModel +from twinkle.sampler import vLLMSampler +from twinkle.reward import MathReward + +# Create components +actor = TransformersModel(model_id='Qwen/Qwen2.5-7B-Instruct') +sampler = vLLMSampler(model_id='Qwen/Qwen2.5-7B-Instruct') +reward_fn = MathReward() +advantage_fn = GRPOAdvantage() + +# Training loop +for batch in dataloader: + # 1. Sample generation + response = sampler.sample(batch, num_samples=4) + + # 2. Calculate rewards + rewards = reward_fn(response.trajectories, batch.ground_truths) + + # 3. Calculate advantages + advantages = advantage_fn(rewards, num_generations=4) + + # 4. 
Policy optimization + loss = actor.forward_backward( + inputs=response.inputs, + advantages=advantages + ) + actor.clip_grad_and_step() +``` + +> The GRPO method is simple and efficient, suitable for most RLHF training scenarios. diff --git a/docs/source_en/Components/Advantage/RLOOAdvantage.md b/docs/source_en/Components/Advantage/RLOOAdvantage.md new file mode 100644 index 00000000..19308d35 --- /dev/null +++ b/docs/source_en/Components/Advantage/RLOOAdvantage.md @@ -0,0 +1,65 @@ +# RLOOAdvantage + +RLOO (Reinforcement Learning with Leave-One-Out) advantage function uses leave-one-out method to calculate baselines. + +## Usage Example + +```python +from twinkle.advantage import RLOOAdvantage + +advantage_fn = RLOOAdvantage() + +rewards = [0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0] +advantages = advantage_fn(rewards, num_generations=4) + +# For each sample, the baseline is the mean of all other samples +# First sample in first group: 0.0 - mean([1.0, 0.0, 1.0]) = 0.0 - 0.667 = -0.667 +# ... +``` + +## How It Works + +For each sample, RLOO: +1. Calculates the mean reward of all other samples in the group (leave-one-out baseline) +2. Advantage = sample reward - leave-one-out baseline +3. Optionally normalizes the values + +RLOO advantages: +- Avoids using the sample's own information as baseline, reducing bias +- More accurate counterfactual baseline estimation +- Better performance when there are more samples + +## Complete Training Example + +```python +from twinkle.advantage import RLOOAdvantage +from twinkle.model import TransformersModel +from twinkle.sampler import vLLMSampler +from twinkle.reward import MathReward + +# Create components +actor = TransformersModel(model_id='Qwen/Qwen2.5-7B-Instruct') +sampler = vLLMSampler(model_id='Qwen/Qwen2.5-7B-Instruct') +reward_fn = MathReward() +advantage_fn = RLOOAdvantage() + +# Training loop +for batch in dataloader: + # 1. 
Sample generation (generate more samples to improve RLOO effectiveness) + response = sampler.sample(batch, num_samples=8) + + # 2. Calculate rewards + rewards = reward_fn(response.trajectories, batch.ground_truths) + + # 3. Calculate advantages + advantages = advantage_fn(rewards, num_generations=8) + + # 4. Policy optimization + loss = actor.forward_backward( + inputs=response.inputs, + advantages=advantages + ) + actor.clip_grad_and_step() +``` + +> RLOO is theoretically superior but requires more samples (recommend 8 or more samples per prompt). diff --git a/docs/source_en/Components/Advantage/index.rst b/docs/source_en/Components/Advantage/index.rst new file mode 100644 index 00000000..9d38ea8b --- /dev/null +++ b/docs/source_en/Components/Advantage/index.rst @@ -0,0 +1,8 @@ +Advantage +=============== +.. toctree:: + :maxdepth: 1 + + Advantage.md + GRPOAdvantage.md + RLOOAdvantage.md diff --git a/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md b/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md new file mode 100644 index 00000000..f72bec83 --- /dev/null +++ b/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md @@ -0,0 +1,69 @@ +# CheckpointEngine + +CheckpointEngine is a component used to synchronize model weights between trainer and inference processes, primarily used in RLHF training to synchronize weights between Actor models and Rollout samplers. + +## Basic Interface + +```python +class CheckpointEngine(ABC): + """Checkpoint engine base class + + The checkpoint engine handles weight synchronization between trainer and inference processes. + """ + + @abstractmethod + def prepare(self) -> dict[str, Any]: + """Prepare for weight synchronization""" + ... + + @abstractmethod + def init_process_group(self, rank: int, world_size: int, **kwargs): + """Initialize process group""" + ... + + @abstractmethod + async def send_weights(self, weight_generator): + """Send weights (called in trainer process)""" + ... 
+ + @abstractmethod + def receive_weights(self) -> AsyncGenerator: + """Receive weights (called in inference process)""" + ... + + @abstractmethod + def finalize(self): + """Clean up resources""" + ... +``` + +## Available Checkpoint Engines + +Twinkle provides two checkpoint engine implementations: + +### NCCLCheckpointEngine + +A checkpoint engine that uses NCCL for high-speed weight transfer between GPUs. + +- High-Speed Transfer: Uses NCCL for GPU-to-GPU point-to-point high-speed transfer +- Zero-Copy: Direct transfer between GPU memories without going through CPU +- Bucketed Transfer: Supports bucketed transfer for large models + +See: [NCCLCheckpointEngine](NCCLCheckpointEngine.md) + +### HCCLCheckpointEngine + +A checkpoint engine that uses HCCL for weight transfer between Ascend NPUs. + +- NPU Optimized: Weight transfer optimized specifically for Ascend NPUs +- Efficient Communication: Uses HCCL for high-speed communication between NPUs +- Compatible Interface: Maintains consistent interface with NCCLCheckpointEngine + +See: [HCCLCheckpointEngine](HCCLCheckpointEngine.md) + +## How to Choose + +- **NCCLCheckpointEngine**: Suitable for GPU environments, provides the highest transfer performance +- **HCCLCheckpointEngine**: Suitable for Ascend NPU environments + +> Checkpoint engine is a key component of RLHF training infrastructure, ensuring that trainers and samplers use consistent model weights. diff --git a/docs/source_en/Components/Checkpoint Engine/HCCLCheckpointEngine.md b/docs/source_en/Components/Checkpoint Engine/HCCLCheckpointEngine.md new file mode 100644 index 00000000..585031ca --- /dev/null +++ b/docs/source_en/Components/Checkpoint Engine/HCCLCheckpointEngine.md @@ -0,0 +1,28 @@ +# HCCLCheckpointEngine + +A checkpoint engine that uses HCCL for weight transfer between Ascend NPUs. 
+ +## Usage Example + +```python +from twinkle.checkpoint_engine import HCCLCheckpointEngine + +engine = HCCLCheckpointEngine(bucket_size=512<<20) +# Usage is the same as NCCLCheckpointEngine +``` + +## Features + +- **NPU Optimized**: Weight transfer optimized specifically for Ascend NPUs +- **Efficient Communication**: Uses HCCL for high-speed communication between NPUs +- **Compatible Interface**: Maintains consistent interface with NCCLCheckpointEngine + +## Use Cases + +HCCLCheckpointEngine is specifically designed for Ascend NPU environments: + +- Training on Huawei Ascend NPUs +- Synchronizing model weights between NPUs +- Large-scale NPU cluster deployment + +> In Ascend NPU environments, HCCLCheckpointEngine provides performance comparable to NCCL. diff --git a/docs/source_en/Components/Checkpoint Engine/NCCLCheckpointEngine.md b/docs/source_en/Components/Checkpoint Engine/NCCLCheckpointEngine.md new file mode 100644 index 00000000..6959a5eb --- /dev/null +++ b/docs/source_en/Components/Checkpoint Engine/NCCLCheckpointEngine.md @@ -0,0 +1,42 @@ +# NCCLCheckpointEngine + +A checkpoint engine that uses NCCL for high-speed weight transfer between GPUs. 
+ +## Usage Example + +```python +from twinkle.checkpoint_engine import NCCLCheckpointEngine + +# In training process (rank 0) +engine = NCCLCheckpointEngine(bucket_size=512<<20) # 512MB bucket +engine.is_master = True +engine.prepare() +engine.init_process_group(rank=0, world_size=5) + +# Send weights +await engine.send_weights(model.named_parameters()) +engine.finalize() + +# In inference process (rank 1-4) +engine = NCCLCheckpointEngine(bucket_size=512<<20) +engine.prepare() +engine.init_process_group(rank=1, world_size=5, master_metadata=metadata) + +# Receive weights +async for name, tensor in engine.receive_weights(): + model.load_state_dict({name: tensor}, strict=False) +engine.finalize() +``` + +## Features + +- **High-Speed Transfer**: Uses NCCL for GPU-to-GPU point-to-point high-speed transfer +- **Zero-Copy**: Direct transfer between GPU memories without going through CPU +- **Bucketed Transfer**: Supports bucketed transfer for large models + +## Configuration Parameters + +- **bucket_size**: Weight bucket size, controls the amount of data transferred each time. Larger buckets can improve transfer efficiency but consume more memory +- **timeout**: Transfer timeout duration + +> NCCLCheckpointEngine is the recommended choice for GPU training, providing the highest transfer performance. diff --git a/docs/source_en/Components/Checkpoint Engine/index.rst b/docs/source_en/Components/Checkpoint Engine/index.rst new file mode 100644 index 00000000..bcd18842 --- /dev/null +++ b/docs/source_en/Components/Checkpoint Engine/index.rst @@ -0,0 +1,8 @@ +Checkpoint Engine +=============== +.. 
toctree:: + :maxdepth: 1 + + CheckpointEngine.md + NCCLCheckpointEngine.md + HCCLCheckpointEngine.md diff --git a/docs/source_en/Components/Data Format/InputFeature.md b/docs/source_en/Components/Data Format/InputFeature.md new file mode 100644 index 00000000..79954e29 --- /dev/null +++ b/docs/source_en/Components/Data Format/InputFeature.md @@ -0,0 +1,26 @@ +# Model Input + +The class used by Twinkle to represent model input is `InputFeature`, which is adapted to model structures such as transformers/megatron. + +```python +InputType = Union[List[List[int]], List[int], np.ndarray, Any] + +class InputFeature(TypedDict, total=False): + # Text-related fields + input_ids: InputType + attention_mask: InputType + position_ids: InputType + labels: InputType +``` + +InputFeature is essentially a Dict. Its input comes from the output of the `Template` component. + +- input_ids: Token list after List[Messages] is nested with a template +- attention_mask: Attention mask +- position_ids: Position encoding for sample distinction +- labels: Training labels, which have already undergone a one-token left shift + +In the case of packing or padding_free, fields such as input_ids are concatenated from lists of multiple samples. +In multimodal scenarios, InputFeature contains other multimodal fields. + +InputFeature is the standard interface for all template outputs and model inputs in Twinkle. diff --git a/docs/source_en/Components/Data Format/Message.md b/docs/source_en/Components/Data Format/Message.md new file mode 100644 index 00000000..f8d22256 --- /dev/null +++ b/docs/source_en/Components/Data Format/Message.md @@ -0,0 +1,43 @@ +# Message + +A message represents a single round of information in a model conversation. 
The message definition is: + +```python + +class ToolCall(TypedDict, total=False): + tool_name: str + arguments: str + +class Message(TypedDict, total=False): + role: Literal['system', 'user', 'assistant', 'tool'] + type: str + content: Union[str, List[Dict[str, str]]] + tool_calls: List[ToolCall] + reasoning_content: str + images: Optional[List[Union[str, Any]]] + videos: Optional[List[Union[str, Any]]] + audios: Optional[List[Union[str, Any]]] +``` + +Essentially, `Message` is a Dict. It contains several fields, with the following being strongly relevant to developers: + +- role: Message type, including four types: 'system', 'user', 'assistant', 'tool'. + - system: System instruction message, only appears in the 0th message + - user: User input message + - assistant: Model reply message + - tool: Tool call result, similar to user message input to the model +- content: Message body, if it contains multimodal information, then placeholders are needed: + - : Image placeholder + -