From fc318dba344b928bee29e77ad50876d231f73d29 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Tue, 10 Mar 2026 14:12:24 +0100
Subject: [PATCH 1/5] Add LEGO mode: generate instrumental stems over
 references (#19)

* Add LEGO mode: --lego <track> flag for dit-vae, example files, README docs
* CI: exclude ./vendor and ./mp3 from the clang-format check
---
 .github/workflows/ci-build.yml |  2 +-
 README.md                      | 10 +++++++++
 examples/lego.json             |  6 ++++++
 examples/lego.sh               | 37 ++++++++++++++++++++++++++++++++++
 tools/dit-vae.cpp              | 35 +++++++++++++++++++++++++++-----
 5 files changed, 84 insertions(+), 6 deletions(-)
 create mode 100644 examples/lego.json
 create mode 100755 examples/lego.sh
diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml
index 78da84d..0a57c41 100644
--- a/.github/workflows/ci-build.yml
+++ b/.github/workflows/ci-build.yml
@@ -64,7 +64,7 @@ jobs:
       - name: Run clang-format (check mode)
         run: |
           find . \
-          \( -path './.git' -o -path './ggml' -o -path './build' \) -prune -o \
+          \( -path './.git' -o -path './ggml' -o -path './build' -o -path './vendor' -o -path './mp3' \) -prune -o \
           -type f \( -name '*.c' -o -name '*.h' -o -name '*.cc' -o -name '*.cpp' -o -name '*.hpp' \) \
           -print0 | xargs -0 clang-format --dry-run --Werror
 
diff --git a/README.md b/README.md
index d71b0a9..2c0ff48 100644
--- a/README.md
+++ b/README.md
@@ -258,6 +258,16 @@ EOF
     --vae models/vae-BF16.gguf
 ```
 
+**Lego** (`--lego <track>` + `--src-audio`):
+generates a new instrument track layered over an existing backing track.
+Only the **base model** (`acestep-v15-base`) supports lego mode.
+The track name is passed on the CLI; set `audio_cover_strength=1.0` in the
+request so the source audio guides all DiT steps.
+See `examples/lego.json` and `examples/lego.sh`.
+
+Available track names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`,
+`keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`.
+
 ## Request JSON reference
 
 Only `caption` is required. All other fields default to "unset" which means
diff --git a/examples/lego.json b/examples/lego.json
new file mode 100644
index 0000000..d4138d6
--- /dev/null
+++ b/examples/lego.json
@@ -0,0 +1,6 @@
+{
+    "caption": "electric guitar riff, funk guitar, house music, instrumental",
+    "audio_cover_strength": 1.0,
+    "inference_steps": 50,
+    "guidance_scale": 7.0
+}
diff --git a/examples/lego.sh b/examples/lego.sh
new file mode 100755
index 0000000..f954223
--- /dev/null
+++ b/examples/lego.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Lego test: three-step self-contained pipeline.
+#
+# step zero: download the base DiT model if not already present
+#            (lego requires acestep-v15-base; turbo/sft do not support it)
+# step one:  generate a track from the simple prompt
+# step two:  apply lego guitar to that generated track
+
+set -eu
+
+# Step 0: ensure the base model is available
+echo "=== Step 0: ensure base model ==="
+(cd .. && ./models.sh --base)
+
+# Step 1: generate a source track with the simple prompt
+echo "=== Step 1: generate track ==="
+../build/ace-qwen3 \
+    --request simple.json \
+    --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf
+
+../build/dit-vae \
+    --request simple0.json \
+    --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
+    --dit ../models/acestep-v15-turbo-Q8_0.gguf \
+    --vae ../models/vae-BF16.gguf \
+    --wav
+
+# Step 2: lego guitar on the generated track (base model required)
+echo "=== Step 2: lego guitar ==="
+../build/dit-vae \
+    --src-audio simple00.wav \
+    --lego guitar \
+    --request lego.json \
+    --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
+    --dit ../models/acestep-v15-base-Q8_0.gguf \
+    --vae ../models/vae-BF16.gguf \
+    --wav
diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp
index 585a089..6ac71a6 100644
--- a/tools/dit-vae.cpp
+++ b/tools/dit-vae.cpp
@@ -32,6 +32,11 @@ static void print_usage(const char * prog) {
             "  --vae <gguf>            VAE GGUF file\n\n"
             "Reference audio:\n"
             "  --src-audio <file>      Source audio (WAV or MP3, any sample rate)\n\n"
+            "Lego mode (base model only, requires --src-audio):\n"
+            "  --lego <track>          Generate a track over the source audio context\n"
+            "                          Track names: vocals, backing_vocals, drums, bass,\n"
+            "                          guitar, keyboard, percussion, strings, synth,\n"
+            "                          fx, brass, woodwinds\n\n"
             "LoRA:\n"
             "  --lora <path>           LoRA safetensors file or directory\n"
             "  --lora-scale <float>    LoRA scaling factor (default: 1.0)\n\n"
@@ -83,6 +88,7 @@ int main(int argc, char ** argv) {
     const char *              dit_gguf       = NULL;
     const char *              vae_gguf       = NULL;
     const char *              src_audio_path = NULL;
+    const char *              lego_track     = NULL;  // --lego <track>
     const char *              dump_dir       = NULL;
     const char *              lora_path      = NULL;
     float                     lora_scale     = 1.0f;
@@ -107,6 +113,8 @@ int main(int argc, char ** argv) {
             vae_gguf = argv[++i];
         } else if (strcmp(argv[i], "--src-audio") == 0 && i + 1 < argc) {
             src_audio_path = argv[++i];
+        } else if (strcmp(argv[i], "--lego") == 0 && i + 1 < argc) {
+            lego_track = argv[++i];
         } else if (strcmp(argv[i], "--lora") == 0 && i + 1 < argc) {
             lora_path = argv[++i];
         } else if (strcmp(argv[i], "--lora-scale") == 0 && i + 1 < argc) {
@@ -144,6 +152,10 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "[CLI] ERROR: --batch must be 1..9\n");
         return 1;
     }
+    if (lego_track && !src_audio_path) {
+        fprintf(stderr, "[CLI] ERROR: --lego requires --src-audio\n");
+        return 1;
+    }
     if (!dit_gguf) {
         fprintf(stderr, "[CLI] ERROR: --dit required\n");
         print_usage(argv[0]);
@@ -410,12 +422,25 @@ int main(int argc, char ** argv) {
         //   text2music = "Fill the audio semantic mask..."
         //   cover      = "Generate audio semantic tokens..."
         //   repaint    = "Repaint the mask area..."
+        //   lego       = "Generate the {track} track based on the audio context:"
         // Auto-switches to cover when audio_codes are present
-        bool         is_cover    = have_cover || !codes_vec.empty();
-        const char * instruction = is_repaint ? "Repaint the mask area based on the given conditions:" :
-                                   is_cover   ? "Generate audio semantic tokens based on the given conditions:" :
-                                                "Fill the audio semantic mask based on the given conditions:";
-        char         metas[512];
+        bool is_cover = have_cover || !codes_vec.empty();
+
+        // Lego: build instruction from the track name supplied via --lego <track>
+        char         lego_instruction[256] = {};
+        const char * instruction;
+        if (lego_track) {
+            snprintf(lego_instruction, sizeof(lego_instruction),
+                     "Generate the %s track based on the audio context:", lego_track);
+            instruction = lego_instruction;
+            fprintf(stderr, "[Lego] track=%s\n", lego_track);
+        } else {
+            instruction = is_repaint ? "Repaint the mask area based on the given conditions:" :
+                          is_cover   ? "Generate audio semantic tokens based on the given conditions:" :
+                                       "Fill the audio semantic mask based on the given conditions:";
+        }
+
+        char metas[512];
         snprintf(metas, sizeof(metas), "- bpm: %s\n- timesignature: %s\n- keyscale: %s\n- duration: %d seconds\n", bpm,
                  timesig, keyscale, (int) duration);
         std::string text_str = std::string("# Instruction\n") + instruction + "\n\n" + "# Caption\n" + caption +

From 715be9aedfac3c54efbdd84b5cd0683adc98293b Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Tue, 10 Mar 2026 14:21:18 +0100
Subject: [PATCH 2/5] Remove base model check from lego.sh

Removed the Step 0 echo statement and the `./models.sh --base` call that
ensured the base model was available.
---
 examples/lego.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/lego.sh b/examples/lego.sh
index f954223..a55802c 100755
--- a/examples/lego.sh
+++ b/examples/lego.sh
@@ -9,9 +9,6 @@
 set -eu
 
 # Step 0: ensure the base model is available
-echo "=== Step 0: ensure base model ==="
-(cd .. && ./models.sh --base)
-
 # Step 1: generate a source track with the simple prompt
 echo "=== Step 1: generate track ==="
 ../build/ace-qwen3 \

From f774da5c13a526c6f31a03d86c749128d07ba295 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Tue, 10 Mar 2026 14:23:11 +0100
Subject: [PATCH 3/5] Refactor lego.sh by removing echo statements

Removed the per-step echo statements and the stale "Step 0" comment from
the script.
---
 examples/lego.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/lego.sh b/examples/lego.sh
index a55802c..7fc019f 100755
--- a/examples/lego.sh
+++ b/examples/lego.sh
@@ -8,9 +8,7 @@
 
 set -eu
 
-# Step 0: ensure the base model is available
 # Step 1: generate a source track with the simple prompt
-echo "=== Step 1: generate track ==="
 ../build/ace-qwen3 \
     --request simple.json \
     --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf
@@ -23,7 +21,6 @@ echo "=== Step 1: generate track ==="
     --wav
 
 # Step 2: lego guitar on the generated track (base model required)
-echo "=== Step 2: lego guitar ==="
 ../build/dit-vae \
     --src-audio simple00.wav \
     --lego guitar \

From 90c365a117b446db2952560e789baa73cfcc9c43 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Tue, 10 Mar 2026 15:33:46 +0100
Subject: [PATCH 4/5] Implement error check for --lego with DiT model

Add error handling for the --lego option: exit with an error when the
loaded DiT model reports acestep.is_turbo, since lego requires the base
DiT model.
---
 tools/dit-vae.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp
index 6ac71a6..7f98958 100644
--- a/tools/dit-vae.cpp
+++ b/tools/dit-vae.cpp
@@ -198,6 +198,12 @@ int main(int argc, char ** argv) {
         if (gf_load(&gf, dit_gguf)) {
             is_turbo             = gf_get_bool(gf, "acestep.is_turbo");
             const void * sl_data = gf_get_data(gf, "silence_latent");
+            if (lego_track && is_turbo) {
+                fprintf(stderr, "[CLI] ERROR: --lego requires the base DiT model\n");
+                gf_close(&gf);
+                dit_ggml_free(&model);
+                return 1;
+            }
             if (sl_data) {
                 silence_full.resize(15000 * 64);
                 memcpy(silence_full.data(), sl_data, 15000 * 64 * sizeof(float));

From aaaa02785fa0205f5e043ff9a4488492631471ff Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Tue, 10 Mar 2026 21:41:58 +0100
Subject: [PATCH 5/5] Move lego mode from `--lego <track>` CLI flag to `"lego"`
 JSON request field (#21)

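* apply requested changes: replace the --lego CLI flag with a "lego"
  request field (init/parse/write/dump in src/request.*)
* validate the track name, uppercase it in the instruction, and force
  audio_cover_strength=1.0 in dit-vae
* allow an empty caption when "lego" is set; update README and examples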
---
 README.md          | 37 +++++++++++++++++--
 examples/lego.json |  8 ++--
 examples/lego.sh   |  1 -
 src/request.cpp    |  9 +++++
 src/request.h      |  6 +++
 tools/dit-vae.cpp  | 91 ++++++++++++++++++++++++++++------------------
 6 files changed, 108 insertions(+), 44 deletions(-)

diff --git a/README.md b/README.md
index 2c0ff48..0c115ea 100644
--- a/README.md
+++ b/README.md
@@ -258,13 +258,32 @@ EOF
     --vae models/vae-BF16.gguf
 ```
 
-**Lego** (`--lego <track>` + `--src-audio`):
+**Lego** (`"lego"` in JSON + `--src-audio`):
 generates a new instrument track layered over an existing backing track.
 Only the **base model** (`acestep-v15-base`) supports lego mode.
-The track name is passed on the CLI; set `audio_cover_strength=1.0` in the
-request so the source audio guides all DiT steps.
 See `examples/lego.json` and `examples/lego.sh`.
 
+```bash
+cat > /tmp/lego.json << 'EOF'
+{
+    "caption": "electric guitar riff, funk guitar, house music, instrumental",
+    "lyrics": "[Instrumental]",
+    "lego": "guitar",
+    "inference_steps": 50,
+    "guidance_scale": 7.0,
+    "shift": 1.0
+}
+EOF
+
+./build/dit-vae \
+    --src-audio backing-track.wav \
+    --request /tmp/lego.json \
+    --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
+    --dit models/acestep-v15-base-Q8_0.gguf \
+    --vae models/vae-BF16.gguf \
+    --wav
+```
+
 Available track names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`,
 `keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`.
 
@@ -295,7 +314,8 @@ the LLM fills them, or a sensible runtime default is applied.
     "shift":                3.0,
     "audio_cover_strength": 0.5,
     "repainting_start":    -1,
-    "repainting_end":      -1
+    "repainting_end":      -1,
+    "lego":                ""
 }
 ```
 
@@ -363,6 +383,15 @@ the DiT regenerates the `[start, end)` time region while preserving everything
 else. `-1` on start means 0s (beginning), `-1` on end means source duration
 (end). Error if end <= start after resolve. `audio_cover_strength` is ignored.
 
+**`lego`** (string, default `""` = inactive)
+Track name for lego mode. Requires `--src-audio` and the **base model**.
+Valid names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`,
+`keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`.
+When set, passes the source audio to the DiT as context and builds the
+instruction `"Generate the {TRACK} track based on the audio context:"`.
+`audio_cover_strength` is forced to 1.0 (all steps see the source audio).
+Use `inference_steps=50`, `guidance_scale=7.0`, `shift=1.0` for base model.
+
 ### LM sampling (ace-qwen3)
 
 **`lm_temperature`** (float, default `0.85`)
diff --git a/examples/lego.json b/examples/lego.json
index d4138d6..ab9409b 100644
--- a/examples/lego.json
+++ b/examples/lego.json
@@ -1,6 +1,8 @@
 {
-    "caption": "electric guitar riff, funk guitar, house music, instrumental",
-    "audio_cover_strength": 1.0,
+    "caption": "",
+    "lyrics": "[Instrumental]",
+    "lego": "guitar",
     "inference_steps": 50,
-    "guidance_scale": 7.0
+    "guidance_scale": 7.0,
+    "shift": 1.0
 }
diff --git a/examples/lego.sh b/examples/lego.sh
index 7fc019f..e9e1daf 100755
--- a/examples/lego.sh
+++ b/examples/lego.sh
@@ -23,7 +23,6 @@ set -eu
 # Step 2: lego guitar on the generated track (base model required)
 ../build/dit-vae \
     --src-audio simple00.wav \
-    --lego guitar \
     --request lego.json \
     --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
     --dit ../models/acestep-v15-base-Q8_0.gguf \
diff --git a/src/request.cpp b/src/request.cpp
index 96a213b..5b97202 100644
--- a/src/request.cpp
+++ b/src/request.cpp
@@ -34,6 +34,7 @@ void request_init(AceRequest * r) {
     r->audio_cover_strength = 0.5f;
     r->repainting_start     = -1.0f;
     r->repainting_end       = -1.0f;
+    r->lego                 = "";
 }
 
 // JSON string escape / unescape
@@ -321,6 +322,8 @@ bool request_parse(AceRequest * r, const char * path) {
             r->repainting_start = (float) atof(v.c_str());
         } else if (k == "repainting_end") {
             r->repainting_end = (float) atof(v.c_str());
+        } else if (k == "lego") {
+            r->lego = v;
         }
     }
 
@@ -356,6 +359,9 @@ bool request_write(const AceRequest * r, const char * path) {
     fprintf(f, "  \"audio_cover_strength\": %.2f,\n", r->audio_cover_strength);
     fprintf(f, "  \"repainting_start\": %.1f,\n", r->repainting_start);
     fprintf(f, "  \"repainting_end\": %.1f,\n", r->repainting_end);
+    if (!r->lego.empty()) {
+        fprintf(f, "  \"lego\": \"%s\",\n", json_escape(r->lego).c_str());
+    }
     // audio_codes last (no trailing comma)
     fprintf(f, "  \"audio_codes\": \"%s\"\n", json_escape(r->audio_codes).c_str());
     fprintf(f, "}\n");
@@ -380,5 +386,8 @@ void request_dump(const AceRequest * r, FILE * f) {
     if (r->repainting_start >= 0.0f || r->repainting_end >= 0.0f) {
         fprintf(f, "  repaint: start=%.1f end=%.1f\n", r->repainting_start, r->repainting_end);
     }
+    if (!r->lego.empty()) {
+        fprintf(f, "  lego: %s\n", r->lego.c_str());
+    }
     fprintf(f, "  audio_codes: %s\n", r->audio_codes.empty() ? "(none)" : "(present)");
 }
diff --git a/src/request.h b/src/request.h
index 807147a..04b7b26 100644
--- a/src/request.h
+++ b/src/request.h
@@ -49,6 +49,12 @@ struct AceRequest {
     // -1 on start means 0s, -1 on end means source duration.
     float repainting_start;  // -1
     float repainting_end;    // -1
+
+    // lego mode (requires --src-audio, base model only)
+    // Track name from TRACK_NAMES: vocals, backing_vocals, drums, bass, guitar,
+    // keyboard, percussion, strings, synth, fx, brass, woodwinds.
+    // Empty = not lego. Sets the instruction and forces audio_cover_strength to 1.0.
+    std::string lego;  // ""
 };
 
 // Initialize all fields to defaults (matches Python GenerationParams defaults)
diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp
index 7f98958..0426cfd 100644
--- a/tools/dit-vae.cpp
+++ b/tools/dit-vae.cpp
@@ -16,6 +16,7 @@
 #include "vae-enc.h"
 #include "vae.h"
 
+#include <cctype>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
@@ -32,11 +33,6 @@ static void print_usage(const char * prog) {
             "  --vae <gguf>            VAE GGUF file\n\n"
             "Reference audio:\n"
             "  --src-audio <file>      Source audio (WAV or MP3, any sample rate)\n\n"
-            "Lego mode (base model only, requires --src-audio):\n"
-            "  --lego <track>          Generate a track over the source audio context\n"
-            "                          Track names: vocals, backing_vocals, drums, bass,\n"
-            "                          guitar, keyboard, percussion, strings, synth,\n"
-            "                          fx, brass, woodwinds\n\n"
             "LoRA:\n"
             "  --lora <path>           LoRA safetensors file or directory\n"
             "  --lora-scale <float>    LoRA scaling factor (default: 1.0)\n\n"
@@ -88,7 +84,6 @@ int main(int argc, char ** argv) {
     const char *              dit_gguf       = NULL;
     const char *              vae_gguf       = NULL;
     const char *              src_audio_path = NULL;
-    const char *              lego_track     = NULL;  // --lego <track>
     const char *              dump_dir       = NULL;
     const char *              lora_path      = NULL;
     float                     lora_scale     = 1.0f;
@@ -113,8 +108,6 @@ int main(int argc, char ** argv) {
             vae_gguf = argv[++i];
         } else if (strcmp(argv[i], "--src-audio") == 0 && i + 1 < argc) {
             src_audio_path = argv[++i];
-        } else if (strcmp(argv[i], "--lego") == 0 && i + 1 < argc) {
-            lego_track = argv[++i];
         } else if (strcmp(argv[i], "--lora") == 0 && i + 1 < argc) {
             lora_path = argv[++i];
         } else if (strcmp(argv[i], "--lora-scale") == 0 && i + 1 < argc) {
@@ -152,10 +145,6 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "[CLI] ERROR: --batch must be 1..9\n");
         return 1;
     }
-    if (lego_track && !src_audio_path) {
-        fprintf(stderr, "[CLI] ERROR: --lego requires --src-audio\n");
-        return 1;
-    }
     if (!dit_gguf) {
         fprintf(stderr, "[CLI] ERROR: --dit required\n");
         print_usage(argv[0]);
@@ -198,12 +187,6 @@ int main(int argc, char ** argv) {
         if (gf_load(&gf, dit_gguf)) {
             is_turbo             = gf_get_bool(gf, "acestep.is_turbo");
             const void * sl_data = gf_get_data(gf, "silence_latent");
-            if (lego_track && is_turbo) {
-                fprintf(stderr, "[CLI] ERROR: --lego requires the base DiT model\n");
-                gf_close(&gf);
-                dit_ggml_free(&model);
-                return 1;
-            }
             if (sl_data) {
                 silence_full.resize(15000 * 64);
                 memcpy(silence_full.data(), sl_data, 15000 * 64 * sizeof(float));
@@ -301,11 +284,43 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "[Request] ERROR: failed to parse %s, skipping\n", rpath);
             continue;
         }
-        if (req.caption.empty()) {
+        if (req.caption.empty() && req.lego.empty()) {
             fprintf(stderr, "[Request] ERROR: caption is empty in %s, skipping\n", rpath);
             continue;
         }
 
+        // Lego mode validation (base model only, requires --src-audio)
+        bool is_lego = !req.lego.empty();
+        if (is_lego) {
+            if (!src_audio_path) {
+                fprintf(stderr, "[Lego] ERROR: lego requires --src-audio\n");
+                return 1;
+            }
+            if (is_turbo) {
+                fprintf(stderr, "[Lego] ERROR: lego requires the base DiT model (turbo detected)\n");
+                return 1;
+            }
+            // Reference project: TRACK_NAMES (constants.py)
+            static const char * allowed[] = {
+                "vocals",     "backing_vocals", "drums", "bass", "guitar", "keyboard",
+                "percussion", "strings",        "synth", "fx",   "brass",  "woodwinds",
+            };
+            bool valid = false;
+            for (int k = 0; k < 12; k++) {
+                if (req.lego == allowed[k]) {
+                    valid = true;
+                    break;
+                }
+            }
+            if (!valid) {
+                fprintf(stderr, "[Lego] ERROR: '%s' is not a valid track name\n", req.lego.c_str());
+                fprintf(stderr,
+                        "  Valid: vocals, backing_vocals, drums, bass, guitar, keyboard,\n"
+                        "         percussion, strings, synth, fx, brass, woodwinds\n");
+                return 1;
+            }
+        }
+
         // Extract params
         const char * caption     = req.caption.c_str();
         const char * lyrics      = req.lyrics.c_str();
@@ -424,32 +439,36 @@ int main(int argc, char ** argv) {
         }
 
         // 2. Build formatted prompts
-        // Reference project uses opposite-sounding instructions (constants.py):
+        // Reference project instruction templates (constants.py TASK_INSTRUCTIONS):
         //   text2music = "Fill the audio semantic mask..."
         //   cover      = "Generate audio semantic tokens..."
         //   repaint    = "Repaint the mask area..."
-        //   lego       = "Generate the {track} track based on the audio context:"
+        //   lego       = "Generate the {TRACK_NAME} track based on the audio context:"
         // Auto-switches to cover when audio_codes are present
-        bool is_cover = have_cover || !codes_vec.empty();
-
-        // Lego: build instruction from the track name supplied via --lego <track>
-        char         lego_instruction[256] = {};
-        const char * instruction;
-        if (lego_track) {
-            snprintf(lego_instruction, sizeof(lego_instruction),
-                     "Generate the %s track based on the audio context:", lego_track);
-            instruction = lego_instruction;
-            fprintf(stderr, "[Lego] track=%s\n", lego_track);
+        bool        is_cover = have_cover || !codes_vec.empty();
+        std::string instruction_str;
+        if (is_lego) {
+            // Lego mode: force audio_cover_strength=1.0 so all DiT steps see the source audio
+            req.audio_cover_strength = 1.0f;
+            fprintf(stderr, "[Lego] track=%s, cover path, strength=1.0\n", req.lego.c_str());
+            // Reference project (task_utils.py:86): track name is UPPERCASE
+            std::string track_upper = req.lego;
+            for (char & c : track_upper) {
+                c = (char) toupper((unsigned char) c);
+            }
+            instruction_str = "Generate the " + track_upper + " track based on the audio context:";
+        } else if (is_repaint) {
+            instruction_str = "Repaint the mask area based on the given conditions:";
+        } else if (is_cover) {
+            instruction_str = "Generate audio semantic tokens based on the given conditions:";
         } else {
-            instruction = is_repaint ? "Repaint the mask area based on the given conditions:" :
-                          is_cover   ? "Generate audio semantic tokens based on the given conditions:" :
-                                       "Fill the audio semantic mask based on the given conditions:";
+            instruction_str = "Fill the audio semantic mask based on the given conditions:";
         }
 
         char metas[512];
         snprintf(metas, sizeof(metas), "- bpm: %s\n- timesignature: %s\n- keyscale: %s\n- duration: %d seconds\n", bpm,
                  timesig, keyscale, (int) duration);
-        std::string text_str = std::string("# Instruction\n") + instruction + "\n\n" + "# Caption\n" + caption +
+        std::string text_str = std::string("# Instruction\n") + instruction_str + "\n\n" + "# Caption\n" + caption +
                                "\n\n" + "# Metas\n" + metas + "<|endoftext|>\n";
 
         std::string lyric_str = std::string("# Languages\n") + language + "\n\n# Lyric\n" + lyrics + "<|endoftext|>";
@@ -567,7 +586,7 @@ int main(int argc, char ** argv) {
         }
 
         // Build context: [T, ctx_ch] = src_latents[64] + chunk_mask[64]
-        // Cover:     src = cover_latents, mask = 1.0 everywhere
+        // Cover/Lego: src = cover_latents, mask = 1.0 everywhere
         // Repaint:   src = silence in region / cover outside, mask = 1.0 in region / 0.0 outside
         // Passthrough: detokenized FSQ codes + silence padding, mask = 1.0
         // Text2music: silence only, mask = 1.0