ace-step · ChuxiJ · Mar 21, 2026 · Mar 21, 2026 · Mar 21, 2026 · Mar 21, 2026
diff --git a/.github/ISSUE_TEMPLATE/lego-from-silence-timing.md b/.github/ISSUE_TEMPLATE/lego-from-silence-timing.md
@@ -0,0 +1,31 @@
+---
+name: Lego / from-silence timing
+about: from-silence 或 chunk 选区下 lego 任务的 repainting、audio_duration 或 Metas 异常
+title: "[generation] "
+labels: bug
+---
+
+## 现象
+
+<!-- 例如：from silence + 短选区生成异常、服务端 Metas duration 为 0、噪声等 -->
+
+## 期望行为
+
+- **From silence**：`repainting_start=0`，`repainting_end=-1`，`audio_duration` = **选区/clip 秒数**（> 0）。
+- **From context**：repaint 为时间轴上 clip 区间，`audio_duration` = 工程时间轴长度。
+
+详见仓库内 `docs/release_task_lego_mapping.md`（「Timing fields」「Regression to avoid」）。
+
+## 复现步骤
+
+1.
+2.
+
+## 环境
+
+- DAW 版本 / 分支：
+- ACE-Step API 版本（如已知）：
+
+## 请求片段（可打码）
+
+<!-- 可选：贴 `/release_task` 中与 repainting、audio_duration 相关的 JSON 字段 -->
diff --git a/docs/release_task_lego_mapping.md b/docs/release_task_lego_mapping.md
@@ -0,0 +1,29 @@
+# DAW → ACE-Step `/release_task` (task_type `lego`)
+
+This documents what **ACE-Step-DAW** sends for stem/lego generation so it is not a black box. The HTTP schema is defined by the ACE-Step server; the DAW builds the body in `generateClipInternal` (`src/services/generationPipeline.ts`) using `computeLegoTimingParams` (`src/services/legoApiTiming.ts`).
+
+## Timing fields
+
+| Scenario | `repainting_start` | `repainting_end` | `audio_duration` |
+|----------|-------------------|------------------|------------------|
+| **From silence** (`forceSilence: true`) | `0` | `-1` | **Clip length in seconds** = select-window length (same as `clip.duration` after the clip is placed on the timeline). |
+| **From context / cumulative** | Clip start (or `repaintRange.start`) | Clip end (or `repaintRange.end`) | **Project timeline duration** = `getAudioDuration()` (max clip end, floored at the project minimum). |
+
+## Chunk vs full instruction
+
+The DiT prompt uses either “Generate a segment…” or “Generate the … track…” based on whether the clip is a **segment** of the timeline:
+
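+- **From silence:** compare clip `[startTime, startTime + duration]` to the full project length; it is a segment if it starts after 0.5 s or ends more than 0.5 s before the project end.
+- **With context:** compare the repainting interval to the project length, using the same 0.5 s tolerance.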
+
+## Placeholder silence WAV
+
+`generateSilenceWav` produces a **short** (0.1s) silence file, which the DAW uploads as `src_audio` to save bandwidth. Target output length is **not** inferred from that file; the server must use the **`audio_duration`** parameter (and its own rules) for generation length and Metas.
+
+## Related API types
+
+TypeScript shapes: `LegoTaskParams` in `src/types/api.ts`.
+
+## Regression to avoid
+
+Earlier builds sent `audio_duration = getAudioDuration()` (full timeline) for from-silence while uploading a **0.1s** placeholder WAV. That mismatch could produce wrong Metas / bad generations on the server. From-silence must send **clip/select-window seconds** as `audio_duration`, with `repainting_start` = `0` and `repainting_end` = `-1`, as in the table above.
diff --git a/src/services/generationPipeline.ts b/src/services/generationPipeline.ts
@@ -13,6 +13,7 @@ import type { LegoTaskParams, CoverTaskParams, RepaintTaskParams, RepaintMode, T
 import type { InferredMetas } from '../types/project';
 import * as api from './aceStepApi';
 import { generateSilenceWav } from './silenceGenerator';
+import { computeLegoTimingParams } from './legoApiTiming';
 import { saveAudioBlob, loadAudioBlobByKey } from './audioFileManager';
 import { getAudioEngine } from '../hooks/useAudioEngine';
 import { toastError, toastInfo, toastSuccess } from '../hooks/useToast';
@@ -502,34 +503,36 @@ async function generateClipInternal(
   store.updateClipStatus(clipId, 'queued', { generationJobId: jobId });
 
   try {
-    // Use actual audio duration (without timeline padding) for generation
-    const audioDuration = useProjectStore.getState().getAudioDuration();
+    const projectTimelineDuration = useProjectStore.getState().getAudioDuration();
+    const timing = computeLegoTimingParams(
+      Boolean(options.forceSilence),
+      clip,
+      projectTimelineDuration,
+      options.repaintRange,
+    );
 
     // Determine src_audio — prefer a server-side path (no upload), then
-    // previous cumulative blob, then synthesized silence.
+    // previous cumulative blob, then synthesized silence (tiny placeholder WAV;
+    // target length is carried by `audio_duration` in task params).
     const srcBlob = options.srcAudioPath
       ? null
       : (options.forceSilence ? null : previousCumulativeBlob);
-    const srcAudioBlob = srcBlob ?? generateSilenceWav(audioDuration);
+    const srcAudioBlob = srcBlob ?? generateSilenceWav();
 
     logger.debug(
       `clip=${clipId} track=${track.trackName}`,
       options.srcAudioPath
         ? `srcAudioPath=${options.srcAudioPath}`
         : `srcAudio: ${srcBlob ? 'previousCumulative' : 'silence'}`,
       `forceSilence=${options.forceSilence ?? false}`,
-      `audioDuration=${audioDuration}s`,
+      `projectTimeline=${projectTimelineDuration}s`,
+      `apiAudioDuration=${timing.audio_duration}s`,
+      `repainting=${timing.repainting_start}..${timing.repainting_end}`,
     );
 
-    // Build instruction — detect chunk vs full mode based on whether the
-    // generation region covers the entire audio duration.  The backend's
-    // conditioning_text.py checks for "a segment" in the instruction to
-    // switch caption formatting (chunk omits Global: prefix).
+    // Chunk vs full: backend conditioning_text checks for "a segment" in the instruction.
     const trackLabel = track.trackName.toUpperCase().replace('_', ' ');
-    const repaintStart = options.repaintRange?.start ?? clip.startTime;
-    const repaintEnd = options.repaintRange?.end ?? (clip.startTime + clip.duration);
-    const isChunkMode = repaintStart > 0.5 || repaintEnd < audioDuration - 0.5;
-    const instruction = isChunkMode
+    const instruction = timing.isChunkMode
       ? `Generate a segment of the ${trackLabel} track based on the audio context:`
       : `Generate the ${trackLabel} track based on the audio context:`;
 
@@ -553,9 +556,9 @@ async function generateClipInternal(
       global_caption: effectiveGlobalCaption,
       lyrics: effectiveLyrics,
       instruction,
-      repainting_start: repaintStart,
-      repainting_end: repaintEnd,
-      audio_duration: audioDuration,
+      repainting_start: timing.repainting_start,
+      repainting_end: timing.repainting_end,
+      audio_duration: timing.audio_duration,
       bpm: resolvedBpm,
       key_scale: resolvedKey,
       time_signature: resolvedTimeSig,
@@ -1851,8 +1854,7 @@ export async function generateRepaintClip(opts: GenerateRepaintOptions): Promise
         srcBlob = (await loadAudioBlobByKey(clip.cumulativeMixKey)) ?? null;
       }
       if (!srcBlob) {
-        const audioDuration = store.getAudioDuration();
-        srcBlob = generateSilenceWav(audioDuration);
+        srcBlob = generateSilenceWav();
       }
 
       const globalCaption = opts.globalCaption || store.project?.globalCaption || '';
@@ -1950,8 +1952,7 @@ export async function regenerateTimelineRegion(opts: RegionRegenerateOptions): P
           srcBlob = (await loadAudioBlobByKey(clip.cumulativeMixKey)) ?? null;
         }
         if (!srcBlob) {
-          const audioDuration = store.getAudioDuration();
-          srcBlob = generateSilenceWav(audioDuration);
+          srcBlob = generateSilenceWav();
         }
 
         const outcome = await generateRepaintInternal(

diff --git a/src/services/legoApiTiming.ts b/src/services/legoApiTiming.ts
@@ -0,0 +1,61 @@
+/**
+ * Maps a timeline clip to ACE-Step `task_type: lego` timing fields on `/release_task`.
+ *
+ * From-silence generation uses a tiny placeholder WAV; the server must rely on
+ * `audio_duration` for target length / Metas. We therefore send:
+ * - `repainting_start`: 0
+ * - `repainting_end`: -1
+ * - `audio_duration`: clip duration (select window length), not the full timeline.
+ *
+ * Context / cumulative generation keeps repainting in timeline seconds and uses
+ * `audio_duration` = project timeline extent (same as previous DAW behavior).
+ */
+
+export interface LegoTimingClip { // minimal clip shape read by computeLegoTimingParams
+  startTime: number; // clip start on the project timeline, in seconds
+  duration: number; // clip length in seconds; becomes audio_duration for from-silence requests
+}
+
+export interface LegoTimingRepaintRange { // optional override for the repaint interval
+  start: number; // used as repainting_start instead of clip.startTime when not forceSilence
+  end: number; // used as repainting_end instead of the clip end when not forceSilence
+}
+
+export interface LegoTimingResult { // timing fields plus the chunk/full instruction flag
+  repainting_start: number; // 0 for forceSilence; otherwise repaintRange.start or clip.startTime
+  repainting_end: number; // -1 for forceSilence; otherwise repaintRange.end or the clip end
+  /** Seconds. For forceSilence: the clip length (at least 0.001); otherwise the project timeline duration. */
+  audio_duration: number;
+  /** Whether to use chunk-style DiT instruction ("segment" vs full track) */
+  isChunkMode: boolean;
+}
+
+const MIN_LEGAL_AUDIO_DURATION_SEC = 1e-3; // floor applied to the from-silence audio_duration
+
+export function computeLegoTimingParams( // builds the lego timing fields for one clip
+  forceSilence: boolean, // true selects the from-silence branch below
+  clip: LegoTimingClip,
+  projectTimelineDuration: number, // used for isChunkMode, and as audio_duration when not forceSilence
+  repaintRange?: LegoTimingRepaintRange, // ignored when forceSilence is true
+): LegoTimingResult {
+  const clipEnd = clip.startTime + clip.duration;
+
+  if (forceSilence) {
+    return {
+      repainting_start: 0,
+      repainting_end: -1,
+      audio_duration: Math.max(clip.duration, MIN_LEGAL_AUDIO_DURATION_SEC), // zero or negative durations are raised to the floor
+      isChunkMode: clip.startTime > 0.5 || clipEnd < projectTimelineDuration - 0.5, // chunk unless the clip reaches within 0.5 of both timeline ends
+    };
+  }
+
+  const repainting_start = repaintRange?.start ?? clip.startTime; // an explicit repaintRange takes precedence over the clip bounds
+  const repainting_end = repaintRange?.end ?? clipEnd;
+
+  return {
+    repainting_start,
+    repainting_end,
+    audio_duration: projectTimelineDuration, // full timeline length, not the clip length
+    isChunkMode: repainting_start > 0.5 || repainting_end < projectTimelineDuration - 0.5, // same 0.5 tolerance, applied to the repaint interval
+  };
+}
diff --git a/src/services/silenceGenerator.ts b/src/services/silenceGenerator.ts
@@ -5,15 +5,14 @@ const SILENCE_UPLOAD_CHANNELS = 1;
 const SILENCE_UPLOAD_DURATION = 0.1;
 
 /**
- * Generate a minimal silence WAV for upload.
- * The actual generation duration is controlled by the audio_duration API param,
- * so we only need a tiny placeholder (0.1s at 16kHz mono = ~3.2KB instead of
- * full-duration 48kHz stereo which can exceed 11MB).
+ * Generate a minimal silence WAV placeholder for upload.
  *
- * The full-quality version is still available via generateSilenceWavFull() for
- * local playback/mixing.
+ * The actual generation length is controlled by the `audio_duration` field on
+ * the lego/repaint task params (for lego, see `computeLegoTimingParams` in
+ * `legoApiTiming.ts`). This function only produces a tiny 0.1s 16kHz mono WAV
+ * (~3.2KB) so the backend has a valid `src_audio` file to parse.
  */
-export function generateSilenceWav(_durationSeconds: number): Blob {
+export function generateSilenceWav(): Blob {
   const numSamples = Math.ceil(SILENCE_UPLOAD_RATE * SILENCE_UPLOAD_DURATION);
   const bytesPerSample = BITS_PER_SAMPLE / 8;
   const blockAlign = SILENCE_UPLOAD_CHANNELS * bytesPerSample;

diff --git a/tests/unit/legoApiTiming.test.ts b/tests/unit/legoApiTiming.test.ts
@@ -0,0 +1,58 @@
+import { describe, expect, it } from 'vitest';
+import { computeLegoTimingParams } from '../../src/services/legoApiTiming';
+
+describe('computeLegoTimingParams', () => {
+  const projectDuration = 128; // project timeline length shared by every case
+
+  it('from-silence: repainting 0 / -1 and audio_duration = clip length', () => {
+    const t = computeLegoTimingParams(
+      true,
+      { startTime: 10, duration: 8 },
+      projectDuration,
+    );
+    expect(t.repainting_start).toBe(0);
+    expect(t.repainting_end).toBe(-1);
+    expect(t.audio_duration).toBe(8);
+    expect(t.isChunkMode).toBe(true); // clip 10..18 covers only part of the 128 s project
+  });
+
+  it('from-silence: full-timeline clip is not chunk mode', () => {
+    const t = computeLegoTimingParams(
+      true,
+      { startTime: 0, duration: projectDuration },
+      projectDuration,
+    );
+    expect(t.isChunkMode).toBe(false); // clip 0..128 matches the project at both ends
+    expect(t.audio_duration).toBe(projectDuration);
+  });
+
+  it('from-silence: clamps zero clip duration to a small positive', () => {
+    const t = computeLegoTimingParams(true, { startTime: 0, duration: 0 }, projectDuration);
+    expect(t.audio_duration).toBeGreaterThan(0);
+    expect(t.audio_duration).toBeLessThanOrEqual(0.001); // 0.001 mirrors MIN_LEGAL_AUDIO_DURATION_SEC
+  });
+
+  it('with context: repainting follows clip and audio_duration = project timeline', () => {
+    const t = computeLegoTimingParams(
+      false,
+      { startTime: 5, duration: 4 },
+      projectDuration,
+    );
+    expect(t.repainting_start).toBe(5);
+    expect(t.repainting_end).toBe(9); // startTime + duration
+    expect(t.audio_duration).toBe(projectDuration);
+    expect(t.isChunkMode).toBe(true); // repaint 5..9 is a sub-range of the project
+  });
+
+  it('respects repaintRange override when not forceSilence', () => {
+    const t = computeLegoTimingParams(
+      false,
+      { startTime: 0, duration: 20 },
+      projectDuration,
+      { start: 2, end: 18 }, // overrides the clip's own 0..20 bounds
+    );
+    expect(t.repainting_start).toBe(2);
+    expect(t.repainting_end).toBe(18);
+    expect(t.audio_duration).toBe(projectDuration);
+  });
+});