From fc318dba344b928bee29e77ad50876d231f73d29 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:12:24 +0100 Subject: [PATCH 1/5] Add LEGO mode: generate instrumental stems over references (#19) * Add LEGO mode: --lego flag for dit-vae, example files, README docs --- .github/workflows/ci-build.yml | 2 +- README.md | 10 +++++++++ examples/lego.json | 6 ++++++ examples/lego.sh | 37 ++++++++++++++++++++++++++++++++++ tools/dit-vae.cpp | 35 +++++++++++++++++++++++++++----- 5 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 examples/lego.json create mode 100755 examples/lego.sh diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 78da84d..0a57c41 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -64,7 +64,7 @@ jobs: - name: Run clang-format (check mode) run: | find . \ - \( -path './.git' -o -path './ggml' -o -path './build' \) -prune -o \ + \( -path './.git' -o -path './ggml' -o -path './build' -o -path './vendor' -o -path './mp3' \) -prune -o \ -type f \( -name '*.c' -o -name '*.h' -o -name '*.cc' -o -name '*.cpp' -o -name '*.hpp' \) \ -print0 | xargs -0 clang-format --dry-run --Werror diff --git a/README.md b/README.md index d71b0a9..2c0ff48 100644 --- a/README.md +++ b/README.md @@ -258,6 +258,16 @@ EOF --vae models/vae-BF16.gguf ``` +**Lego** (`--lego ` + `--src-audio`): +generates a new instrument track layered over an existing backing track. +Only the **base model** (`acestep-v15-base`) supports lego mode. +The track name is passed on the CLI; set `audio_cover_strength=1.0` in the +request so the source audio guides all DiT steps. +See `examples/lego.json` and `examples/lego.sh`. + +Available track names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`, +`keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`. + ## Request JSON reference Only `caption` is required. 
All other fields default to "unset" which means diff --git a/examples/lego.json b/examples/lego.json new file mode 100644 index 0000000..d4138d6 --- /dev/null +++ b/examples/lego.json @@ -0,0 +1,6 @@ +{ + "caption": "electric guitar riff, funk guitar, house music, instrumental", + "audio_cover_strength": 1.0, + "inference_steps": 50, + "guidance_scale": 7.0 +} diff --git a/examples/lego.sh b/examples/lego.sh new file mode 100755 index 0000000..f954223 --- /dev/null +++ b/examples/lego.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Lego test: three-step self-contained pipeline. +# +# step zero: download the base DiT model if not already present +# (lego requires acestep-v15-base; turbo/sft do not support it) +# step one: generate a track from the simple prompt +# step two: apply lego guitar to that generated track + +set -eu + +# Step 0: ensure the base model is available +echo "=== Step 0: ensure base model ===" +(cd .. && ./models.sh --base) + +# Step 1: generate a source track with the simple prompt +echo "=== Step 1: generate track ===" +../build/ace-qwen3 \ + --request simple.json \ + --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf + +../build/dit-vae \ + --request simple0.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf \ + --wav + +# Step 2: lego guitar on the generated track (base model required) +echo "=== Step 2: lego guitar ===" +../build/dit-vae \ + --src-audio simple00.wav \ + --lego guitar \ + --request lego.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-base-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf \ + --wav diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index 585a089..6ac71a6 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -32,6 +32,11 @@ static void print_usage(const char * prog) { " --vae VAE GGUF file\n\n" "Reference audio:\n" " --src-audio Source audio (WAV or MP3, any sample rate)\n\n" + "Lego 
mode (base model only, requires --src-audio):\n" + " --lego Generate a track over the source audio context\n" + " Track names: vocals, backing_vocals, drums, bass,\n" + " guitar, keyboard, percussion, strings, synth,\n" + " fx, brass, woodwinds\n\n" "LoRA:\n" " --lora LoRA safetensors file or directory\n" " --lora-scale LoRA scaling factor (default: 1.0)\n\n" @@ -83,6 +88,7 @@ int main(int argc, char ** argv) { const char * dit_gguf = NULL; const char * vae_gguf = NULL; const char * src_audio_path = NULL; + const char * lego_track = NULL; // --lego const char * dump_dir = NULL; const char * lora_path = NULL; float lora_scale = 1.0f; @@ -107,6 +113,8 @@ int main(int argc, char ** argv) { vae_gguf = argv[++i]; } else if (strcmp(argv[i], "--src-audio") == 0 && i + 1 < argc) { src_audio_path = argv[++i]; + } else if (strcmp(argv[i], "--lego") == 0 && i + 1 < argc) { + lego_track = argv[++i]; } else if (strcmp(argv[i], "--lora") == 0 && i + 1 < argc) { lora_path = argv[++i]; } else if (strcmp(argv[i], "--lora-scale") == 0 && i + 1 < argc) { @@ -144,6 +152,10 @@ int main(int argc, char ** argv) { fprintf(stderr, "[CLI] ERROR: --batch must be 1..9\n"); return 1; } + if (lego_track && !src_audio_path) { + fprintf(stderr, "[CLI] ERROR: --lego requires --src-audio\n"); + return 1; + } if (!dit_gguf) { fprintf(stderr, "[CLI] ERROR: --dit required\n"); print_usage(argv[0]); @@ -410,12 +422,25 @@ int main(int argc, char ** argv) { // text2music = "Fill the audio semantic mask..." // cover = "Generate audio semantic tokens..." // repaint = "Repaint the mask area..." + // lego = "Generate the {track} track based on the audio context:" // Auto-switches to cover when audio_codes are present - bool is_cover = have_cover || !codes_vec.empty(); - const char * instruction = is_repaint ? "Repaint the mask area based on the given conditions:" : - is_cover ? 
"Generate audio semantic tokens based on the given conditions:" : - "Fill the audio semantic mask based on the given conditions:"; - char metas[512]; + bool is_cover = have_cover || !codes_vec.empty(); + + // Lego: build instruction from the track name supplied via --lego + char lego_instruction[256] = {}; + const char * instruction; + if (lego_track) { + snprintf(lego_instruction, sizeof(lego_instruction), + "Generate the %s track based on the audio context:", lego_track); + instruction = lego_instruction; + fprintf(stderr, "[Lego] track=%s\n", lego_track); + } else { + instruction = is_repaint ? "Repaint the mask area based on the given conditions:" : + is_cover ? "Generate audio semantic tokens based on the given conditions:" : + "Fill the audio semantic mask based on the given conditions:"; + } + + char metas[512]; snprintf(metas, sizeof(metas), "- bpm: %s\n- timesignature: %s\n- keyscale: %s\n- duration: %d seconds\n", bpm, timesig, keyscale, (int) duration); std::string text_str = std::string("# Instruction\n") + instruction + "\n\n" + "# Caption\n" + caption + From 715be9aedfac3c54efbdd84b5cd0683adc98293b Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Tue, 10 Mar 2026 14:21:18 +0100 Subject: [PATCH 2/5] Remove base model check from lego.sh Removed the echo statement for ensuring the base model. --- examples/lego.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/lego.sh b/examples/lego.sh index f954223..a55802c 100755 --- a/examples/lego.sh +++ b/examples/lego.sh @@ -9,9 +9,6 @@ set -eu # Step 0: ensure the base model is available -echo "=== Step 0: ensure base model ===" -(cd .. 
&& ./models.sh --base) - # Step 1: generate a source track with the simple prompt echo "=== Step 1: generate track ===" ../build/ace-qwen3 \ From f774da5c13a526c6f31a03d86c749128d07ba295 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Tue, 10 Mar 2026 14:23:11 +0100 Subject: [PATCH 3/5] Refactor lego.sh by removing echo statements Removed echo statements for steps in the script. --- examples/lego.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/lego.sh b/examples/lego.sh index a55802c..7fc019f 100755 --- a/examples/lego.sh +++ b/examples/lego.sh @@ -8,9 +8,7 @@ set -eu -# Step 0: ensure the base model is available # Step 1: generate a source track with the simple prompt -echo "=== Step 1: generate track ===" ../build/ace-qwen3 \ --request simple.json \ --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf @@ -23,7 +21,6 @@ echo "=== Step 1: generate track ===" --wav # Step 2: lego guitar on the generated track (base model required) -echo "=== Step 2: lego guitar ===" ../build/dit-vae \ --src-audio simple00.wav \ --lego guitar \ From 90c365a117b446db2952560e789baa73cfcc9c43 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Tue, 10 Mar 2026 15:33:46 +0100 Subject: [PATCH 4/5] Implement error check for --lego with DiT model Add error handling for --lego option requiring base DiT model --- tools/dit-vae.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index 6ac71a6..7f98958 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -198,6 +198,12 @@ int main(int argc, char ** argv) { if (gf_load(&gf, dit_gguf)) { is_turbo = gf_get_bool(gf, "acestep.is_turbo"); const void * sl_data = gf_get_data(gf, "silence_latent"); + if (lego_track && is_turbo) { + fprintf(stderr, "[CLI] ERROR: --lego requires the base DiT model\n"); + gf_close(&gf); + dit_ggml_free(&model); + return 1; + } if (sl_data) { silence_full.resize(15000 * 64); memcpy(silence_full.data(), sl_data, 15000 * 64 * sizeof(float)); From 
aaaa02785fa0205f5e043ff9a4488492631471ff Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 10 Mar 2026 21:41:58 +0100 Subject: [PATCH 5/5] Move lego mode from `--lego ` CLI flag to `"lego"` JSON request field (#21) * apply requested changes --- README.md | 37 +++++++++++++++++-- examples/lego.json | 8 ++-- examples/lego.sh | 1 - src/request.cpp | 9 +++++ src/request.h | 6 +++ tools/dit-vae.cpp | 91 ++++++++++++++++++++++++++++------------------ 6 files changed, 108 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 2c0ff48..0c115ea 100644 --- a/README.md +++ b/README.md @@ -258,13 +258,32 @@ EOF --vae models/vae-BF16.gguf ``` -**Lego** (`--lego ` + `--src-audio`): +**Lego** (`"lego"` in JSON + `--src-audio`): generates a new instrument track layered over an existing backing track. Only the **base model** (`acestep-v15-base`) supports lego mode. -The track name is passed on the CLI; set `audio_cover_strength=1.0` in the -request so the source audio guides all DiT steps. See `examples/lego.json` and `examples/lego.sh`. +```bash +cat > /tmp/lego.json << 'EOF' +{ + "caption": "electric guitar riff, funk guitar, house music, instrumental", + "lyrics": "[Instrumental]", + "lego": "guitar", + "inference_steps": 50, + "guidance_scale": 7.0, + "shift": 1.0 +} +EOF + +./build/dit-vae \ + --src-audio backing-track.wav \ + --request /tmp/lego.json \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-base-Q8_0.gguf \ + --vae models/vae-BF16.gguf \ + --wav +``` + Available track names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`, `keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`. @@ -295,7 +314,8 @@ the LLM fills them, or a sensible runtime default is applied. 
"shift": 3.0, "audio_cover_strength": 0.5, "repainting_start": -1, - "repainting_end": -1 + "repainting_end": -1, + "lego": "" } ``` @@ -363,6 +383,15 @@ the DiT regenerates the `[start, end)` time region while preserving everything else. `-1` on start means 0s (beginning), `-1` on end means source duration (end). Error if end <= start after resolve. `audio_cover_strength` is ignored. +**`lego`** (string, default `""` = inactive) +Track name for lego mode. Requires `--src-audio` and the **base model**. +Valid names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`, +`keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`. +When set, passes the source audio to the DiT as context and builds the +instruction `"Generate the {TRACK} track based on the audio context:"`. +`audio_cover_strength` is forced to 1.0 (all steps see the source audio). +Use `inference_steps=50`, `guidance_scale=7.0`, `shift=1.0` for base model. + ### LM sampling (ace-qwen3) **`lm_temperature`** (float, default `0.85`) diff --git a/examples/lego.json b/examples/lego.json index d4138d6..ab9409b 100644 --- a/examples/lego.json +++ b/examples/lego.json @@ -1,6 +1,8 @@ { - "caption": "electric guitar riff, funk guitar, house music, instrumental", - "audio_cover_strength": 1.0, + "caption": "", + "lyrics": "[Instrumental]", + "lego": "guitar", "inference_steps": 50, - "guidance_scale": 7.0 + "guidance_scale": 7.0, + "shift": 1.0 } diff --git a/examples/lego.sh b/examples/lego.sh index 7fc019f..e9e1daf 100755 --- a/examples/lego.sh +++ b/examples/lego.sh @@ -23,7 +23,6 @@ set -eu # Step 2: lego guitar on the generated track (base model required) ../build/dit-vae \ --src-audio simple00.wav \ - --lego guitar \ --request lego.json \ --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ --dit ../models/acestep-v15-base-Q8_0.gguf \ diff --git a/src/request.cpp b/src/request.cpp index 96a213b..5b97202 100644 --- a/src/request.cpp +++ b/src/request.cpp @@ -34,6 +34,7 @@ void 
request_init(AceRequest * r) { r->audio_cover_strength = 0.5f; r->repainting_start = -1.0f; r->repainting_end = -1.0f; + r->lego = ""; } // JSON string escape / unescape @@ -321,6 +322,8 @@ bool request_parse(AceRequest * r, const char * path) { r->repainting_start = (float) atof(v.c_str()); } else if (k == "repainting_end") { r->repainting_end = (float) atof(v.c_str()); + } else if (k == "lego") { + r->lego = v; } } @@ -356,6 +359,9 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, " \"audio_cover_strength\": %.2f,\n", r->audio_cover_strength); fprintf(f, " \"repainting_start\": %.1f,\n", r->repainting_start); fprintf(f, " \"repainting_end\": %.1f,\n", r->repainting_end); + if (!r->lego.empty()) { + fprintf(f, " \"lego\": \"%s\",\n", json_escape(r->lego).c_str()); + } // audio_codes last (no trailing comma) fprintf(f, " \"audio_codes\": \"%s\"\n", json_escape(r->audio_codes).c_str()); fprintf(f, "}\n"); @@ -380,5 +386,8 @@ void request_dump(const AceRequest * r, FILE * f) { if (r->repainting_start >= 0.0f || r->repainting_end >= 0.0f) { fprintf(f, " repaint: start=%.1f end=%.1f\n", r->repainting_start, r->repainting_end); } + if (!r->lego.empty()) { + fprintf(f, " lego: %s\n", r->lego.c_str()); + } fprintf(f, " audio_codes: %s\n", r->audio_codes.empty() ? "(none)" : "(present)"); } diff --git a/src/request.h b/src/request.h index 807147a..04b7b26 100644 --- a/src/request.h +++ b/src/request.h @@ -49,6 +49,12 @@ struct AceRequest { // -1 on start means 0s, -1 on end means source duration. float repainting_start; // -1 float repainting_end; // -1 + + // lego mode (requires --src-audio, base model only) + // Track name from TRACK_NAMES: vocals, backing_vocals, drums, bass, guitar, + // keyboard, percussion, strings, synth, fx, brass, woodwinds. + // Empty = not lego. Sets instruction and forces audio_cover_strength to 1.0.
+ std::string lego; // "" }; // Initialize all fields to defaults (matches Python GenerationParams defaults) diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index 7f98958..0426cfd 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -16,6 +16,7 @@ #include "vae-enc.h" #include "vae.h" +#include #include #include #include @@ -32,11 +33,6 @@ static void print_usage(const char * prog) { " --vae VAE GGUF file\n\n" "Reference audio:\n" " --src-audio Source audio (WAV or MP3, any sample rate)\n\n" - "Lego mode (base model only, requires --src-audio):\n" - " --lego Generate a track over the source audio context\n" - " Track names: vocals, backing_vocals, drums, bass,\n" - " guitar, keyboard, percussion, strings, synth,\n" - " fx, brass, woodwinds\n\n" "LoRA:\n" " --lora LoRA safetensors file or directory\n" " --lora-scale LoRA scaling factor (default: 1.0)\n\n" @@ -88,7 +84,6 @@ int main(int argc, char ** argv) { const char * dit_gguf = NULL; const char * vae_gguf = NULL; const char * src_audio_path = NULL; - const char * lego_track = NULL; // --lego const char * dump_dir = NULL; const char * lora_path = NULL; float lora_scale = 1.0f; @@ -113,8 +108,6 @@ int main(int argc, char ** argv) { vae_gguf = argv[++i]; } else if (strcmp(argv[i], "--src-audio") == 0 && i + 1 < argc) { src_audio_path = argv[++i]; - } else if (strcmp(argv[i], "--lego") == 0 && i + 1 < argc) { - lego_track = argv[++i]; } else if (strcmp(argv[i], "--lora") == 0 && i + 1 < argc) { lora_path = argv[++i]; } else if (strcmp(argv[i], "--lora-scale") == 0 && i + 1 < argc) { @@ -152,10 +145,6 @@ int main(int argc, char ** argv) { fprintf(stderr, "[CLI] ERROR: --batch must be 1..9\n"); return 1; } - if (lego_track && !src_audio_path) { - fprintf(stderr, "[CLI] ERROR: --lego requires --src-audio\n"); - return 1; - } if (!dit_gguf) { fprintf(stderr, "[CLI] ERROR: --dit required\n"); print_usage(argv[0]); @@ -198,12 +187,6 @@ int main(int argc, char ** argv) { if (gf_load(&gf, dit_gguf)) { is_turbo = 
gf_get_bool(gf, "acestep.is_turbo"); const void * sl_data = gf_get_data(gf, "silence_latent"); - if (lego_track && is_turbo) { - fprintf(stderr, "[CLI] ERROR: --lego requires the base DiT model\n"); - gf_close(&gf); - dit_ggml_free(&model); - return 1; - } if (sl_data) { silence_full.resize(15000 * 64); memcpy(silence_full.data(), sl_data, 15000 * 64 * sizeof(float)); @@ -301,11 +284,43 @@ int main(int argc, char ** argv) { fprintf(stderr, "[Request] ERROR: failed to parse %s, skipping\n", rpath); continue; } - if (req.caption.empty()) { + if (req.caption.empty() && req.lego.empty()) { fprintf(stderr, "[Request] ERROR: caption is empty in %s, skipping\n", rpath); continue; } + // Lego mode validation (base model only, requires --src-audio) + bool is_lego = !req.lego.empty(); + if (is_lego) { + if (!src_audio_path) { + fprintf(stderr, "[Lego] ERROR: lego requires --src-audio\n"); + return 1; + } + if (is_turbo) { + fprintf(stderr, "[Lego] ERROR: lego requires the base DiT model (turbo detected)\n"); + return 1; + } + // Reference project: TRACK_NAMES (constants.py) + static const char * allowed[] = { + "vocals", "backing_vocals", "drums", "bass", "guitar", "keyboard", + "percussion", "strings", "synth", "fx", "brass", "woodwinds", + }; + bool valid = false; + for (int k = 0; k < 12; k++) { + if (req.lego == allowed[k]) { + valid = true; + break; + } + } + if (!valid) { + fprintf(stderr, "[Lego] ERROR: '%s' is not a valid track name\n", req.lego.c_str()); + fprintf(stderr, + " Valid: vocals, backing_vocals, drums, bass, guitar, keyboard,\n" + " percussion, strings, synth, fx, brass, woodwinds\n"); + return 1; + } + } + // Extract params const char * caption = req.caption.c_str(); const char * lyrics = req.lyrics.c_str(); @@ -424,32 +439,36 @@ int main(int argc, char ** argv) { } // 2. 
Build formatted prompts - // Reference project uses opposite-sounding instructions (constants.py): + // Reference project instruction templates (constants.py TASK_INSTRUCTIONS): // text2music = "Fill the audio semantic mask..." // cover = "Generate audio semantic tokens..." // repaint = "Repaint the mask area..." - // lego = "Generate the {track} track based on the audio context:" + // lego = "Generate the {TRACK_NAME} track based on the audio context:" // Auto-switches to cover when audio_codes are present - bool is_cover = have_cover || !codes_vec.empty(); - - // Lego: build instruction from the track name supplied via --lego - char lego_instruction[256] = {}; - const char * instruction; - if (lego_track) { - snprintf(lego_instruction, sizeof(lego_instruction), - "Generate the %s track based on the audio context:", lego_track); - instruction = lego_instruction; - fprintf(stderr, "[Lego] track=%s\n", lego_track); + bool is_cover = have_cover || !codes_vec.empty(); + std::string instruction_str; + if (is_lego) { + // Lego mode: force audio_cover_strength=1.0 so all DiT steps see the source audio + req.audio_cover_strength = 1.0f; + fprintf(stderr, "[Lego] track=%s, cover path, strength=1.0\n", req.lego.c_str()); + // Reference project (task_utils.py:86): track name is UPPERCASE + std::string track_upper = req.lego; + for (char & c : track_upper) { + c = (char) toupper((unsigned char) c); + } + instruction_str = "Generate the " + track_upper + " track based on the audio context:"; + } else if (is_repaint) { + instruction_str = "Repaint the mask area based on the given conditions:"; + } else if (is_cover) { + instruction_str = "Generate audio semantic tokens based on the given conditions:"; } else { - instruction = is_repaint ? "Repaint the mask area based on the given conditions:" : - is_cover ? 
"Generate audio semantic tokens based on the given conditions:" : - "Fill the audio semantic mask based on the given conditions:"; + instruction_str = "Fill the audio semantic mask based on the given conditions:"; } char metas[512]; snprintf(metas, sizeof(metas), "- bpm: %s\n- timesignature: %s\n- keyscale: %s\n- duration: %d seconds\n", bpm, timesig, keyscale, (int) duration); - std::string text_str = std::string("# Instruction\n") + instruction + "\n\n" + "# Caption\n" + caption + + std::string text_str = std::string("# Instruction\n") + instruction_str + "\n\n" + "# Caption\n" + caption + "\n\n" + "# Metas\n" + metas + "<|endoftext|>\n"; std::string lyric_str = std::string("# Languages\n") + language + "\n\n# Lyric\n" + lyrics + "<|endoftext|>"; @@ -567,7 +586,7 @@ int main(int argc, char ** argv) { } // Build context: [T, ctx_ch] = src_latents[64] + chunk_mask[64] - // Cover: src = cover_latents, mask = 1.0 everywhere + // Cover/Lego: src = cover_latents, mask = 1.0 everywhere // Repaint: src = silence in region / cover outside, mask = 1.0 in region / 0.0 outside // Passthrough: detokenized FSQ codes + silence padding, mask = 1.0 // Text2music: silence only, mask = 1.0