1+ # Copyright (c) 2025 ASLP-LAB
2+ # 2025 Huakang Chen (huakang@mail.nwpu.edu.cn)
3+ # 2025 Guobin Ma (guobin.ma@gmail.com)
4+ #
5+ # Licensed under the Stability AI License (the "License");
6+ # you may not use this file except in compliance with the License.
7+ # You may obtain a copy of the License at
8+ #
9+ # https://huggingface.co/stabilityai/stable-audio-open-1.0/blob/main/LICENSE.md
10+ #
11+ # Unless required by applicable law or agreed to in writing, software
12+ # distributed under the License is distributed on an "AS IS" BASIS,
13+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+ # See the License for the specific language governing permissions and
15+ # limitations under the License.
16+
17+ import torchaudio
18+ import librosa
19+ from mutagen .mp3 import MP3
20+ import torch
21+ from einops import rearrange
22+
23+ from diffrhythm_utils import (
24+ decode_audio ,
25+ get_lrc_token ,
26+ get_negative_style_prompt ,
27+ get_reference_latent ,
28+ prepare_model ,
29+ )
30+
31+
def inference(
    cfm_model,
    vae_model,
    cond,
    text,
    duration,
    style_prompt,
    negative_style_prompt,
    start_time,
    chunked=False,
):
    """Sample a latent song from the CFM model and decode it to int16 audio.

    Args:
        cfm_model: conditional flow-matching model exposing ``.sample(...)``.
        vae_model: VAE decoder passed through to ``decode_audio``.
        cond: reference latent conditioning tensor.
        text: tokenized lyrics prompt.
        duration: number of latent frames to generate.
        style_prompt: style embedding tensor.
        negative_style_prompt: negative style embedding for CFG.
        start_time: sampling start time(s) for the CFM sampler.
        chunked: decode the latent in chunks to reduce peak memory.

    Returns:
        torch.Tensor: int16 CPU tensor of shape [d, b*n] — peak-normalized
        audio with all batch items concatenated along time.
    """
    # Sampling needs no autograd state; inference_mode also skips
    # version-counter bookkeeping.
    with torch.inference_mode():
        generated, _ = cfm_model.sample(
            cond=cond,
            text=text,
            duration=duration,
            style_prompt=style_prompt,
            negative_style_prompt=negative_style_prompt,
            steps=32,
            cfg_strength=4.0,
            start_time=start_time,
        )

    generated = generated.to(torch.float32)
    latent = generated.transpose(1, 2)  # [b d t]

    output = decode_audio(latent, vae_model, chunked=chunked)

    # Rearrange audio batch to a single sequence
    output = rearrange(output, "b d n -> d (b n)")

    # Peak normalize, clip, convert to int16.
    # Guard the peak against zero so a silent decode yields zeros
    # instead of NaN from a 0/0 division.
    peak = torch.max(torch.abs(output)).clamp(min=1e-8)
    output = (
        output.to(torch.float32)
        .div(peak)
        .clamp(-1, 1)
        .mul(32767)
        .to(torch.int16)
        .cpu()
    )

    return output
74+
75+
class MultiLinePrompt:
    """ComfyUI helper node: pass a multi-line text prompt through, stripped."""

    @classmethod
    def INPUT_TYPES(cls):
        """Declare a single required multi-line string input."""
        return {
            "required": {
                "multi_line_prompt": (
                    "STRING",
                    {"multiline": True, "default": ""},
                ),
            },
        }

    CATEGORY = "MW-DiffRhythm"
    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("prompt",)
    FUNCTION = "promptgen"

    def promptgen(self, multi_line_prompt: str):
        # ComfyUI expects outputs as a tuple; strip surrounding whitespace.
        cleaned = multi_line_prompt.strip()
        return (cleaned,)
95+
96+
class DiffRhythmRun:
    """ComfyUI node that generates a song with the DiffRhythm CFM/VAE models."""

    # Pick the best available accelerator once, at class-definition time.
    # torch.backends.mps.is_available() is the documented MPS probe;
    # torch.mps.is_available() is not present on all torch builds and could
    # raise AttributeError at import time.
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the node's inputs for the ComfyUI graph editor."""
        return {
            "required": {
                "style_prompt": ("STRING", {
                    "multiline": True,
                    "default": ""}),
            },
            "optional": {
                "lyrics_prompt": ("STRING",),
                "style_audio": ("AUDIO",),
                "chunked": ("BOOLEAN", {"default": False, "tooltip": "Whether to use chunked decoding."}),
                "seed": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFFFFFFFFFFFF}),
            },
        }

    CATEGORY = "MW-DiffRhythm"
    RETURN_TYPES = ("AUDIO",)
    RETURN_NAMES = ("audio",)
    FUNCTION = "diffrhythmgen"

    def diffrhythmgen(
            self,
            style_prompt: str,
            lyrics_prompt: str = "",
            style_audio=None,
            chunked: bool = False,
            seed: int = 0):
        """Generate a song and return it as a ComfyUI AUDIO dict.

        Args:
            style_prompt: textual style description (used when no reference
                audio is supplied).
            lyrics_prompt: LRC-style lyrics text; may be empty.
            style_audio: optional ComfyUI AUDIO dict ({"waveform", "sample_rate"})
                used as the style reference instead of the text prompt.
            chunked: decode the latent in chunks to reduce peak memory.
            seed: accepted for ComfyUI caching; not used directly here.

        Returns:
            tuple: one AUDIO dict with a [1, d, n] waveform at 44.1 kHz.
        """
        # Only the 95 s model variant (2048 latent frames) is supported here;
        # the 285 s variant (6144 frames) is not currently available.
        max_frames = 2048
        cfm, tokenizer, muq, vae = prepare_model(self.device)

        lrc_prompt, start_time = get_lrc_token(lyrics_prompt, tokenizer, self.device)

        # Prefer a reference audio clip for the style embedding; otherwise
        # fall back to the textual style prompt.
        if style_audio is not None:
            prompt = self.get_style_prompt(muq, style_audio)
        else:
            prompt = self.get_style_prompt(muq, prompt=style_prompt)

        negative_style_prompt = get_negative_style_prompt(self.device)
        latent_prompt = get_reference_latent(self.device, max_frames)

        generated_song = inference(
            cfm_model=cfm,
            vae_model=vae,
            cond=latent_prompt,
            text=lrc_prompt,
            duration=max_frames,
            style_prompt=prompt,
            negative_style_prompt=negative_style_prompt,
            start_time=start_time,
            chunked=chunked,
        )

        # ComfyUI AUDIO expects a [batch, channels, samples] waveform.
        audio_tensor = generated_song.unsqueeze(0)
        return ({"waveform": audio_tensor, "sample_rate": 44100},)

    @torch.no_grad()
    def get_style_prompt(self, model, audio=None, prompt=None):
        """Compute the half-precision style embedding from text or audio.

        Args:
            model: MuQ-MuLan style encoder; must expose ``.device`` and be
                callable with ``texts=`` or ``wavs=``.
            audio: optional ComfyUI AUDIO dict with "waveform" and
                "sample_rate"; the middle 10 s are used.
            prompt: optional text style prompt; takes precedence over audio.

        Returns:
            torch.Tensor: float16 embedding (presumably [1, 512] — per the
            original comment; confirm against the encoder).

        Raises:
            ValueError: if neither input is given, or the audio is < 10 s.
        """
        mulan = model

        if prompt is not None:
            return mulan(texts=prompt).half()

        if audio is None:
            raise ValueError("Audio data or style prompt must be provided")

        waveform = audio["waveform"]
        sample_rate = audio["sample_rate"]

        # Normalize shape: drop the batch dim, then downmix stereo to mono.
        if len(waveform.shape) == 3:  # [1, channels, samples]
            waveform = waveform.squeeze(0)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(0, keepdim=True)

        # Audio length in seconds.
        audio_len = waveform.shape[-1] / sample_rate

        if audio_len < 10:
            raise ValueError(f"The audio is too short ({audio_len:.2f} s), it takes at least 10 seconds.")

        # Extract the middle 10-second segment.
        mid_time = int((audio_len // 2) * sample_rate)
        start_sample = mid_time - int(5 * sample_rate)
        end_sample = start_sample + int(10 * sample_rate)
        wav_segment = waveform[..., start_sample:end_sample]

        # The style encoder expects 24 kHz input.
        if sample_rate != 24000:
            wav_segment = torchaudio.transforms.Resample(sample_rate, 24000)(wav_segment)

        # Move to the encoder's device and ensure a leading channel dim.
        wav = wav_segment.to(model.device)
        if len(wav.shape) == 1:
            wav = wav.unsqueeze(0)

        # Gradients are already disabled by the @torch.no_grad() decorator.
        audio_emb = mulan(wavs=wav)  # [1, 512] per the original annotation

        return audio_emb.half()
216+
217+
# Registration table read by ComfyUI: node identifier -> implementing class.
NODE_CLASS_MAPPINGS = {
    "DiffRhythmRun": DiffRhythmRun,
    "MultiLinePrompt": MultiLinePrompt,
}

# Human-readable titles shown for each node in the ComfyUI editor.
NODE_DISPLAY_NAME_MAPPINGS = {
    "DiffRhythmRun": "DiffRhythm Run",
    "MultiLinePrompt": "Multi Line Prompt",
}
0 commit comments