From 0ead9e0514ed4429cb3a427e9543e0618ff1cf89 Mon Sep 17 00:00:00 2001 From: chuxij Date: Sat, 21 Mar 2026 15:48:08 +0000 Subject: [PATCH 1/4] fix(generation): align lego from-silence timing with select window From-silence: repainting 0/-1 and audio_duration = clip length. Context paths unchanged. Adds legoApiTiming, tests, and docs. See docs/github-issues/from-silence-lego-audio-duration.md --- .../from-silence-lego-audio-duration.md | 39 ++++++++++++ docs/release_task_lego_mapping.md | 25 ++++++++ src/services/generationPipeline.ts | 35 ++++++----- src/services/legoApiTiming.ts | 61 +++++++++++++++++++ src/services/silenceGenerator.ts | 9 +-- tests/unit/legoApiTiming.test.ts | 58 ++++++++++++++++++ 6 files changed, 205 insertions(+), 22 deletions(-) create mode 100644 docs/github-issues/from-silence-lego-audio-duration.md create mode 100644 docs/release_task_lego_mapping.md create mode 100644 src/services/legoApiTiming.ts create mode 100644 tests/unit/legoApiTiming.test.ts diff --git a/docs/github-issues/from-silence-lego-audio-duration.md b/docs/github-issues/from-silence-lego-audio-duration.md new file mode 100644 index 00000000..17cff55a --- /dev/null +++ b/docs/github-issues/from-silence-lego-audio-duration.md @@ -0,0 +1,39 @@ +# Issue: from-silence lego 请求应发送选区时长与 repainting 0/-1 + +> **说明**:若无法在仓库创建 GitHub Issue,可将本文复制到 GitHub「New issue」;PR 描述中也会引用相同内容。 + +## 问题描述 + +在 **Generate from silence** + **chunk(选区短于整条时间轴)** 场景下,服务端 DiT 的 `text_prompt` 中 Metas 可能出现 `duration: 0 seconds`,生成结果异常(噪声等)。 + +## 根因(客户端) + +- `generateClipInternal` 将 `audio_duration` 设为 `getAudioDuration()`(整条工程时间轴长度),而不是 **当前 clip / Select Window 的长度**。 +- From-silence 路径上传的是 `generateSilenceWav` 生成的 **极短占位 WAV**(0.1s),与「要生成的段落长度」无关;服务端若按解码后的波形长度参与 Metas,会与真实选区不一致。 + +## 期望行为 + +与 ACE-Step API 约定对齐(from silence): + +| 字段 | 期望值 | +|------|--------| +| `repainting_start` | `0` | +| `repainting_end` | `-1` | +| `audio_duration` | **选区 / clip 时长(秒)**,且 `> 0` | + +有 cumulative **上下文** 
时:保持原有行为(repaint 区间为时间轴上的 clip 范围,`audio_duration` 为工程时间轴长度)。 + +## 方案 + +- 抽取 `computeLegoTimingParams`(`src/services/legoApiTiming.ts`)统一计算上述字段。 +- `generationPipeline` 中 lego 任务使用该结果。 +- 补充单元测试与 `docs/release_task_lego_mapping.md` 说明 DAW → `/release_task` 映射。 + +## 验收建议 + +1. From silence + 拖选一段短于总长的 Select Window,发起多轨/单轨生成,确认请求体中 `audio_duration` 等于选区秒数,`repainting_start` / `repainting_end` 为 `0` / `-1`。 +2. From context 路径回归:repaint 区间与 `audio_duration` 与改前一致。 + +## 类型 + +`bug` / `generation` diff --git a/docs/release_task_lego_mapping.md b/docs/release_task_lego_mapping.md new file mode 100644 index 00000000..1602f2ed --- /dev/null +++ b/docs/release_task_lego_mapping.md @@ -0,0 +1,25 @@ +# DAW → ACE-Step `/release_task` (task_type `lego`) + +This documents what **ACE-Step-DAW** sends for stem/lego generation so it is not a black box. The HTTP schema is defined by the ACE-Step server; the DAW builds the body in `generateClipInternal` (`src/services/generationPipeline.ts`) using `computeLegoTimingParams` (`src/services/legoApiTiming.ts`). + +## Timing fields + +| Scenario | `repainting_start` | `repainting_end` | `audio_duration` | +|----------|-------------------|------------------|------------------| +| **From silence** (`forceSilence: true`) | `0` | `-1` | **Clip length in seconds** = select-window length (same as `clip.duration` after the clip is placed on the timeline). | +| **From context / cumulative** | Clip start (or `repaintRange.start`) | Clip end (or `repaintRange.end`) | **Project timeline duration** = `getAudioDuration()` (max clip end, floored at the project minimum). | + +## Chunk vs full instruction + +The DiT prompt uses either “Generate a segment…” or “Generate the … track…” based on whether the clip is a **segment** of the timeline: + +- **From silence:** compare clip `[startTime, startTime + duration]` to the full project length. +- **With context:** compare the repainting interval to the project length. 
+ +## Placeholder silence WAV + +`generateSilenceWav` uploads a **short** (0.1s) silence file to save bandwidth. Target output length is **not** inferred from that file; the server must use the **`audio_duration`** parameter (and its own rules) for generation length and Metas. + +## Related API types + +TypeScript shapes: `LegoTaskParams` in `src/types/api.ts`. diff --git a/src/services/generationPipeline.ts b/src/services/generationPipeline.ts index 312f40d5..d8f123ef 100644 --- a/src/services/generationPipeline.ts +++ b/src/services/generationPipeline.ts @@ -13,6 +13,7 @@ import type { LegoTaskParams, CoverTaskParams, RepaintTaskParams, RepaintMode, T import type { InferredMetas } from '../types/project'; import * as api from './aceStepApi'; import { generateSilenceWav } from './silenceGenerator'; +import { computeLegoTimingParams } from './legoApiTiming'; import { saveAudioBlob, loadAudioBlobByKey } from './audioFileManager'; import { getAudioEngine } from '../hooks/useAudioEngine'; import { toastError, toastInfo, toastSuccess } from '../hooks/useToast'; @@ -502,15 +503,21 @@ async function generateClipInternal( store.updateClipStatus(clipId, 'queued', { generationJobId: jobId }); try { - // Use actual audio duration (without timeline padding) for generation - const audioDuration = useProjectStore.getState().getAudioDuration(); + const projectTimelineDuration = useProjectStore.getState().getAudioDuration(); + const timing = computeLegoTimingParams( + Boolean(options.forceSilence), + clip, + projectTimelineDuration, + options.repaintRange, + ); // Determine src_audio — prefer a server-side path (no upload), then - // previous cumulative blob, then synthesized silence. + // previous cumulative blob, then synthesized silence (tiny placeholder WAV; + // target length is carried by `audio_duration` in task params). const srcBlob = options.srcAudioPath ? null : (options.forceSilence ? null : previousCumulativeBlob); - const srcAudioBlob = srcBlob ?? 
generateSilenceWav(audioDuration); + const srcAudioBlob = srcBlob ?? generateSilenceWav(timing.audio_duration); logger.debug( `clip=${clipId} track=${track.trackName}`, @@ -518,18 +525,14 @@ async function generateClipInternal( ? `srcAudioPath=${options.srcAudioPath}` : `srcAudio: ${srcBlob ? 'previousCumulative' : 'silence'}`, `forceSilence=${options.forceSilence ?? false}`, - `audioDuration=${audioDuration}s`, + `projectTimeline=${projectTimelineDuration}s`, + `apiAudioDuration=${timing.audio_duration}s`, + `repainting=${timing.repainting_start}..${timing.repainting_end}`, ); - // Build instruction — detect chunk vs full mode based on whether the - // generation region covers the entire audio duration. The backend's - // conditioning_text.py checks for "a segment" in the instruction to - // switch caption formatting (chunk omits Global: prefix). + // Chunk vs full: backend conditioning_text checks for "a segment" in the instruction. const trackLabel = track.trackName.toUpperCase().replace('_', ' '); - const repaintStart = options.repaintRange?.start ?? clip.startTime; - const repaintEnd = options.repaintRange?.end ?? (clip.startTime + clip.duration); - const isChunkMode = repaintStart > 0.5 || repaintEnd < audioDuration - 0.5; - const instruction = isChunkMode + const instruction = timing.isChunkMode ? 
`Generate a segment of the ${trackLabel} track based on the audio context:` : `Generate the ${trackLabel} track based on the audio context:`; @@ -553,9 +556,9 @@ async function generateClipInternal( global_caption: effectiveGlobalCaption, lyrics: effectiveLyrics, instruction, - repainting_start: repaintStart, - repainting_end: repaintEnd, - audio_duration: audioDuration, + repainting_start: timing.repainting_start, + repainting_end: timing.repainting_end, + audio_duration: timing.audio_duration, bpm: resolvedBpm, key_scale: resolvedKey, time_signature: resolvedTimeSig, diff --git a/src/services/legoApiTiming.ts b/src/services/legoApiTiming.ts new file mode 100644 index 00000000..b13f9700 --- /dev/null +++ b/src/services/legoApiTiming.ts @@ -0,0 +1,61 @@ +/** + * Maps a timeline clip to ACE-Step `task_type: lego` timing fields on `/release_task`. + * + * From-silence generation uses a tiny placeholder WAV; the server must rely on + * `audio_duration` for target length / Metas. We therefore send: + * - `repainting_start`: 0 + * - `repainting_end`: -1 + * - `audio_duration`: clip duration (select window length), not the full timeline. + * + * Context / cumulative generation keeps repainting in timeline seconds and uses + * `audio_duration` = project timeline extent (same as previous DAW behavior). 
+ */ + +export interface LegoTimingClip { + startTime: number; + duration: number; +} + +export interface LegoTimingRepaintRange { + start: number; + end: number; +} + +export interface LegoTimingResult { + repainting_start: number; + repainting_end: number; + /** Seconds sent as audio_duration: with forceSilence this is clip.duration floored at MIN_LEGAL_AUDIO_DURATION_SEC; otherwise it is projectTimelineDuration. */ + audio_duration: number; + /** True when the compared region starts later than 0.5 s or ends more than 0.5 s before projectTimelineDuration; selects the chunk-style DiT instruction ("segment" vs full track). */ + isChunkMode: boolean; +} + +const MIN_LEGAL_AUDIO_DURATION_SEC = 1e-3; /* floor applied to clip.duration on the forceSilence path so a zero or negative duration is never returned */ + +/** Computes repainting_start, repainting_end, audio_duration and isChunkMode for one clip. With forceSilence the repaintRange argument is ignored and the result is 0, -1 and the floored clip.duration. Otherwise each repaint bound comes from repaintRange when given, falling back to the clip start or end, and audio_duration is projectTimelineDuration. */ export function computeLegoTimingParams( + forceSilence: boolean, + clip: LegoTimingClip, + projectTimelineDuration: number, + repaintRange?: LegoTimingRepaintRange, +): LegoTimingResult { + const clipEnd = clip.startTime + clip.duration; + + if (forceSilence) { + return { + repainting_start: 0, + repainting_end: -1, /* NOTE(review): presumably the server-side sentinel for the end of the audio; confirm against the ACE-Step API */ + audio_duration: Math.max(clip.duration, MIN_LEGAL_AUDIO_DURATION_SEC), /* NOTE(review): Math.max propagates NaN, so a NaN duration would pass through unclamped */ + isChunkMode: clip.startTime > 0.5 || clipEnd < projectTimelineDuration - 0.5, /* compares the clip span on the timeline, not the 0 and -1 values sent above */ + }; + } + + const repainting_start = repaintRange?.start ?? clip.startTime; + const repainting_end = repaintRange?.end ?? clipEnd; + + return { + repainting_start, + repainting_end, + audio_duration: projectTimelineDuration, + isChunkMode: repainting_start > 0.5 || repainting_end < projectTimelineDuration - 0.5, + }; +} diff --git a/src/services/silenceGenerator.ts b/src/services/silenceGenerator.ts index 4689ff8b..36d7dceb 100644 --- a/src/services/silenceGenerator.ts +++ b/src/services/silenceGenerator.ts @@ -6,12 +6,9 @@ const SILENCE_UPLOAD_DURATION = 0.1; /** * Generate a minimal silence WAV for upload. - * The actual generation duration is controlled by the audio_duration API param, - * so we only need a tiny placeholder (0.1s at 16kHz mono = ~3.2KB instead of - * full-duration 48kHz stereo which can exceed 11MB). - * - * The full-quality version is still available via generateSilenceWavFull() for - * local playback/mixing. 
+ * The actual generation length must be sent as `audio_duration` on the lego task + * (see `computeLegoTimingParams` in legoApiTiming.ts — from-silence uses clip/select-window + * seconds). This file only uploads a tiny placeholder (0.1s at 16kHz mono ≈ 3.2KB). */ export function generateSilenceWav(_durationSeconds: number): Blob { const numSamples = Math.ceil(SILENCE_UPLOAD_RATE * SILENCE_UPLOAD_DURATION); diff --git a/tests/unit/legoApiTiming.test.ts b/tests/unit/legoApiTiming.test.ts new file mode 100644 index 00000000..1d0e96eb --- /dev/null +++ b/tests/unit/legoApiTiming.test.ts @@ -0,0 +1,58 @@ +import { describe, expect, it } from 'vitest'; +import { computeLegoTimingParams } from '../../src/services/legoApiTiming'; + +/** Unit tests for computeLegoTimingParams covering the from-silence (first argument true) and context (first argument false) paths. */ describe('computeLegoTimingParams', () => { + const projectDuration = 128; /* project timeline length passed as the third argument in every case */ + + it('from-silence: repainting 0 / -1 and audio_duration = clip length', () => { + const t = computeLegoTimingParams( + true, + { startTime: 10, duration: 8 }, + projectDuration, + ); + expect(t.repainting_start).toBe(0); + expect(t.repainting_end).toBe(-1); + expect(t.audio_duration).toBe(8); + expect(t.isChunkMode).toBe(true); /* clip 10..18 sits inside 0..128, so it counts as a segment */ + }); + + it('from-silence: full-timeline clip is not chunk mode', () => { + const t = computeLegoTimingParams( + true, + { startTime: 0, duration: projectDuration }, + projectDuration, + ); + expect(t.isChunkMode).toBe(false); + expect(t.audio_duration).toBe(projectDuration); + }); + + it('from-silence: clamps zero clip duration to a small positive', () => { + const t = computeLegoTimingParams(true, { startTime: 0, duration: 0 }, projectDuration); + expect(t.audio_duration).toBeGreaterThan(0); + expect(t.audio_duration).toBeLessThanOrEqual(0.001); /* 0.001 mirrors MIN_LEGAL_AUDIO_DURATION_SEC (1e-3) in legoApiTiming.ts */ + }); + + it('with context: repainting follows clip and audio_duration = project timeline', () => { + const t = computeLegoTimingParams( + false, + { startTime: 5, duration: 4 }, + projectDuration, + ); + expect(t.repainting_start).toBe(5); + expect(t.repainting_end).toBe(9); + 
expect(t.audio_duration).toBe(projectDuration); + expect(t.isChunkMode).toBe(true); + }); + + it('respects repaintRange override when not forceSilence', () => { + const t = computeLegoTimingParams( + false, + { startTime: 0, duration: 20 }, + projectDuration, + { start: 2, end: 18 }, /* explicit range replaces the clip bounds 0..20 */ + ); + expect(t.repainting_start).toBe(2); + expect(t.repainting_end).toBe(18); + expect(t.audio_duration).toBe(projectDuration); + }); +}); From 1d9f826f5443fe8ad516a36e9cf1cdd336a559ac Mon Sep 17 00:00:00 2001 From: chuxij Date: Sat, 21 Mar 2026 15:50:02 +0000 Subject: [PATCH 2/4] chore(docs): remove GitHub issue draft from repo Track bugs on GitHub; keep behavior notes in docs/release_task_lego_mapping.md. --- .../from-silence-lego-audio-duration.md | 39 ------------------- docs/release_task_lego_mapping.md | 4 ++ 2 files changed, 4 insertions(+), 39 deletions(-) delete mode 100644 docs/github-issues/from-silence-lego-audio-duration.md diff --git a/docs/github-issues/from-silence-lego-audio-duration.md b/docs/github-issues/from-silence-lego-audio-duration.md deleted file mode 100644 index 17cff55a..00000000 --- a/docs/github-issues/from-silence-lego-audio-duration.md +++ /dev/null @@ -1,39 +0,0 @@ -# Issue: from-silence lego 请求应发送选区时长与 repainting 0/-1 - -> **说明**:若无法在仓库创建 GitHub Issue,可将本文复制到 GitHub「New issue」;PR 描述中也会引用相同内容。 - -## 问题描述 - -在 **Generate from silence** + **chunk(选区短于整条时间轴)** 场景下,服务端 DiT 的 `text_prompt` 中 Metas 可能出现 `duration: 0 seconds`,生成结果异常(噪声等)。 - -## 根因(客户端) - -- `generateClipInternal` 将 `audio_duration` 设为 `getAudioDuration()`(整条工程时间轴长度),而不是 **当前 clip / Select Window 的长度**。 -- From-silence 路径上传的是 `generateSilenceWav` 生成的 **极短占位 WAV**(0.1s),与「要生成的段落长度」无关;服务端若按解码后的波形长度参与 Metas,会与真实选区不一致。 - -## 期望行为 - -与 ACE-Step API 约定对齐(from silence): - -| 字段 | 期望值 | -|------|--------| -| `repainting_start` | `0` | -| `repainting_end` | `-1` | -| `audio_duration` | **选区 / clip 时长(秒)**,且 `> 0` | - -有 cumulative **上下文** 时:保持原有行为(repaint 区间为时间轴上的 clip 范围,`audio_duration` 为工程时间轴长度)。 
- -## 方案 - -- 抽取 `computeLegoTimingParams`(`src/services/legoApiTiming.ts`)统一计算上述字段。 -- `generationPipeline` 中 lego 任务使用该结果。 -- 补充单元测试与 `docs/release_task_lego_mapping.md` 说明 DAW → `/release_task` 映射。 - -## 验收建议 - -1. From silence + 拖选一段短于总长的 Select Window,发起多轨/单轨生成,确认请求体中 `audio_duration` 等于选区秒数,`repainting_start` / `repainting_end` 为 `0` / `-1`。 -2. From context 路径回归:repaint 区间与 `audio_duration` 与改前一致。 - -## 类型 - -`bug` / `generation` diff --git a/docs/release_task_lego_mapping.md b/docs/release_task_lego_mapping.md index 1602f2ed..7e691a5b 100644 --- a/docs/release_task_lego_mapping.md +++ b/docs/release_task_lego_mapping.md @@ -23,3 +23,7 @@ The DiT prompt uses either “Generate a segment…” or “Generate the … tr ## Related API types TypeScript shapes: `LegoTaskParams` in `src/types/api.ts`. + +## Regression to avoid + +Earlier builds sent `audio_duration = getAudioDuration()` (full timeline) for from-silence while uploading a **0.1s** placeholder WAV. That mismatch could produce wrong Metas / bad generations on the server. From-silence must send **clip/select-window seconds** as `audio_duration` and `repainting` `0` / `-1` as in the table above. From 326c437fe76771b6bb0d48043dbafbe7d9426866 Mon Sep 17 00:00:00 2001 From: chuxij Date: Sat, 21 Mar 2026 15:57:34 +0000 Subject: [PATCH 3/4] chore: add issue template for lego from-silence timing Normalized bug reports; see docs/release_task_lego_mapping.md for field semantics. 
--- .../lego-from-silence-timing.md | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/lego-from-silence-timing.md diff --git a/.github/ISSUE_TEMPLATE/lego-from-silence-timing.md b/.github/ISSUE_TEMPLATE/lego-from-silence-timing.md new file mode 100644 index 00000000..5a0f1fee --- /dev/null +++ b/.github/ISSUE_TEMPLATE/lego-from-silence-timing.md @@ -0,0 +1,31 @@ +--- +name: Lego / from-silence timing +about: from-silence 或 chunk 选区下 lego 任务的 repainting、audio_duration 或 Metas 异常 +title: "[generation] " +labels: bug +--- + +## 现象 + + + +## 期望行为 + +- **From silence**:`repainting_start=0`,`repainting_end=-1`,`audio_duration` = **选区/clip 秒数**(> 0)。 +- **From context**:repaint 为时间轴上 clip 区间,`audio_duration` = 工程时间轴长度。 + +详见仓库内 `docs/release_task_lego_mapping.md`(「Timing fields」「Regression to avoid」)。 + +## 复现步骤 + +1. +2. + +## 环境 + +- DAW 版本 / 分支: +- ACE-Step API 版本(如已知): + +## 请求片段(可打码) + + From 47ff091a2da8006689d01342d581fca68fcff85e Mon Sep 17 00:00:00 2001 From: chuxij Date: Sat, 21 Mar 2026 16:34:33 +0000 Subject: [PATCH 4/4] fix(generation): remove unused duration param from generateSilenceWav Address Copilot review: generateSilenceWav always produces a fixed 0.1s placeholder regardless of arguments. Remove the misleading _durationSeconds parameter and update all call sites. The actual generation duration is controlled by audio_duration in the task params (via computeLegoTimingParams). Made-with: Cursor --- src/services/generationPipeline.ts | 8 +++----- src/services/silenceGenerator.ts | 12 +++++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/services/generationPipeline.ts b/src/services/generationPipeline.ts index d8f123ef..d1a760c9 100644 --- a/src/services/generationPipeline.ts +++ b/src/services/generationPipeline.ts @@ -517,7 +517,7 @@ async function generateClipInternal( const srcBlob = options.srcAudioPath ? null : (options.forceSilence ? 
null : previousCumulativeBlob); - const srcAudioBlob = srcBlob ?? generateSilenceWav(timing.audio_duration); + const srcAudioBlob = srcBlob ?? generateSilenceWav(); logger.debug( `clip=${clipId} track=${track.trackName}`, @@ -1854,8 +1854,7 @@ export async function generateRepaintClip(opts: GenerateRepaintOptions): Promise srcBlob = (await loadAudioBlobByKey(clip.cumulativeMixKey)) ?? null; } if (!srcBlob) { - const audioDuration = store.getAudioDuration(); - srcBlob = generateSilenceWav(audioDuration); + srcBlob = generateSilenceWav(); } const globalCaption = opts.globalCaption || store.project?.globalCaption || ''; @@ -1953,8 +1952,7 @@ export async function regenerateTimelineRegion(opts: RegionRegenerateOptions): P srcBlob = (await loadAudioBlobByKey(clip.cumulativeMixKey)) ?? null; } if (!srcBlob) { - const audioDuration = store.getAudioDuration(); - srcBlob = generateSilenceWav(audioDuration); + srcBlob = generateSilenceWav(); } const outcome = await generateRepaintInternal( diff --git a/src/services/silenceGenerator.ts b/src/services/silenceGenerator.ts index 36d7dceb..f90743ec 100644 --- a/src/services/silenceGenerator.ts +++ b/src/services/silenceGenerator.ts @@ -5,12 +5,14 @@ const SILENCE_UPLOAD_CHANNELS = 1; const SILENCE_UPLOAD_DURATION = 0.1; /** - * Generate a minimal silence WAV for upload. - * The actual generation length must be sent as `audio_duration` on the lego task - * (see `computeLegoTimingParams` in legoApiTiming.ts — from-silence uses clip/select-window - * seconds). This file only uploads a tiny placeholder (0.1s at 16kHz mono ≈ 3.2KB). + * Generate a minimal silence WAV placeholder for upload. + * + * The actual generation length is controlled by the `audio_duration` field on + * the lego/repaint task params (see {@link computeLegoTimingParams} in + * `legoApiTiming.ts`). This function only produces a tiny 0.1s 16kHz mono WAV + * (~3.2KB) so the backend has a valid `src_audio` file to parse. 
*/ -export function generateSilenceWav(_durationSeconds: number): Blob { +export function generateSilenceWav(): Blob { const numSamples = Math.ceil(SILENCE_UPLOAD_RATE * SILENCE_UPLOAD_DURATION); const bytesPerSample = BITS_PER_SAMPLE / 8; const blockAlign = SILENCE_UPLOAD_CHANNELS * bytesPerSample;