diff --git a/.github/ISSUE_TEMPLATE/lego-from-silence-timing.md b/.github/ISSUE_TEMPLATE/lego-from-silence-timing.md new file mode 100644 index 00000000..5a0f1fee --- /dev/null +++ b/.github/ISSUE_TEMPLATE/lego-from-silence-timing.md @@ -0,0 +1,31 @@ +--- +name: Lego / from-silence timing +about: from-silence 或 chunk 选区下 lego 任务的 repainting、audio_duration 或 Metas 异常 +title: "[generation] " +labels: bug +--- + +## 现象 + + + +## 期望行为 + +- **From silence**:`repainting_start=0`,`repainting_end=-1`,`audio_duration` = **选区/clip 秒数**(> 0)。 +- **From context**:repaint 为时间轴上 clip 区间,`audio_duration` = 工程时间轴长度。 + +详见仓库内 `docs/release_task_lego_mapping.md`(「Timing fields」「Regression to avoid」)。 + +## 复现步骤 + +1. +2. + +## 环境 + +- DAW 版本 / 分支: +- ACE-Step API 版本(如已知): + +## 请求片段(可打码) + + diff --git a/docs/release_task_lego_mapping.md b/docs/release_task_lego_mapping.md new file mode 100644 index 00000000..7e691a5b --- /dev/null +++ b/docs/release_task_lego_mapping.md @@ -0,0 +1,29 @@ +# DAW → ACE-Step `/release_task` (task_type `lego`) + +This documents what **ACE-Step-DAW** sends for stem/lego generation so it is not a black box. The HTTP schema is defined by the ACE-Step server; the DAW builds the body in `generateClipInternal` (`src/services/generationPipeline.ts`) using `computeLegoTimingParams` (`src/services/legoApiTiming.ts`). + +## Timing fields + +| Scenario | `repainting_start` | `repainting_end` | `audio_duration` | +|----------|-------------------|------------------|------------------| +| **From silence** (`forceSilence: true`) | `0` | `-1` | **Clip length in seconds** = select-window length (same as `clip.duration` after the clip is placed on the timeline). | +| **From context / cumulative** | Clip start (or `repaintRange.start`) | Clip end (or `repaintRange.end`) | **Project timeline duration** = `getAudioDuration()` (max clip end, floored at the project minimum). 
| + +## Chunk vs full instruction + +The DiT prompt uses either “Generate a segment…” or “Generate the … track…” based on whether the clip is a **segment** of the timeline (i.e. it starts more than 0.5 s after 0 or ends more than 0.5 s before the project end): + +- **From silence:** compare clip `[startTime, startTime + duration]` to the full project length. +- **With context:** compare the repainting interval to the project length. + +## Placeholder silence WAV + +`generateSilenceWav` uploads a **short** (0.1s) silence file to save bandwidth. Target output length is **not** inferred from that file; the server must use the **`audio_duration`** parameter (and its own rules) for generation length and Metas. + +## Related API types + +TypeScript shapes: `LegoTaskParams` in `src/types/api.ts`. + +## Regression to avoid + +Earlier builds sent `audio_duration = getAudioDuration()` (full timeline) for from-silence while uploading a **0.1s** placeholder WAV. That mismatch could produce wrong Metas / bad generations on the server. From-silence must send **clip/select-window seconds** as `audio_duration` and `repainting_start` / `repainting_end` = `0` / `-1`, as in the table above. 
diff --git a/src/services/generationPipeline.ts b/src/services/generationPipeline.ts index 312f40d5..d1a760c9 100644 --- a/src/services/generationPipeline.ts +++ b/src/services/generationPipeline.ts @@ -13,6 +13,7 @@ import type { LegoTaskParams, CoverTaskParams, RepaintTaskParams, RepaintMode, T import type { InferredMetas } from '../types/project'; import * as api from './aceStepApi'; import { generateSilenceWav } from './silenceGenerator'; +import { computeLegoTimingParams } from './legoApiTiming'; import { saveAudioBlob, loadAudioBlobByKey } from './audioFileManager'; import { getAudioEngine } from '../hooks/useAudioEngine'; import { toastError, toastInfo, toastSuccess } from '../hooks/useToast'; @@ -502,15 +503,21 @@ async function generateClipInternal( store.updateClipStatus(clipId, 'queued', { generationJobId: jobId }); try { - // Use actual audio duration (without timeline padding) for generation - const audioDuration = useProjectStore.getState().getAudioDuration(); + const projectTimelineDuration = useProjectStore.getState().getAudioDuration(); + const timing = computeLegoTimingParams( + Boolean(options.forceSilence), + clip, + projectTimelineDuration, + options.repaintRange, + ); // Determine src_audio — prefer a server-side path (no upload), then - // previous cumulative blob, then synthesized silence. + // previous cumulative blob, then synthesized silence (tiny placeholder WAV; + // target length is carried by `audio_duration` in task params). const srcBlob = options.srcAudioPath ? null : (options.forceSilence ? null : previousCumulativeBlob); - const srcAudioBlob = srcBlob ?? generateSilenceWav(audioDuration); + const srcAudioBlob = srcBlob ?? generateSilenceWav(); logger.debug( `clip=${clipId} track=${track.trackName}`, @@ -518,18 +525,14 @@ async function generateClipInternal( ? `srcAudioPath=${options.srcAudioPath}` : `srcAudio: ${srcBlob ? 'previousCumulative' : 'silence'}`, `forceSilence=${options.forceSilence ?? 
false}`, - `audioDuration=${audioDuration}s`, + `projectTimeline=${projectTimelineDuration}s`, + `apiAudioDuration=${timing.audio_duration}s`, + `repainting=${timing.repainting_start}..${timing.repainting_end}`, ); - // Build instruction — detect chunk vs full mode based on whether the - // generation region covers the entire audio duration. The backend's - // conditioning_text.py checks for "a segment" in the instruction to - // switch caption formatting (chunk omits Global: prefix). + // Chunk vs full: backend conditioning_text checks for "a segment" in the instruction. const trackLabel = track.trackName.toUpperCase().replace('_', ' '); - const repaintStart = options.repaintRange?.start ?? clip.startTime; - const repaintEnd = options.repaintRange?.end ?? (clip.startTime + clip.duration); - const isChunkMode = repaintStart > 0.5 || repaintEnd < audioDuration - 0.5; - const instruction = isChunkMode + const instruction = timing.isChunkMode ? `Generate a segment of the ${trackLabel} track based on the audio context:` : `Generate the ${trackLabel} track based on the audio context:`; @@ -553,9 +556,9 @@ async function generateClipInternal( global_caption: effectiveGlobalCaption, lyrics: effectiveLyrics, instruction, - repainting_start: repaintStart, - repainting_end: repaintEnd, - audio_duration: audioDuration, + repainting_start: timing.repainting_start, + repainting_end: timing.repainting_end, + audio_duration: timing.audio_duration, bpm: resolvedBpm, key_scale: resolvedKey, time_signature: resolvedTimeSig, @@ -1851,8 +1854,7 @@ export async function generateRepaintClip(opts: GenerateRepaintOptions): Promise srcBlob = (await loadAudioBlobByKey(clip.cumulativeMixKey)) ?? 
null; } if (!srcBlob) { - const audioDuration = store.getAudioDuration(); - srcBlob = generateSilenceWav(audioDuration); + srcBlob = generateSilenceWav(); } const globalCaption = opts.globalCaption || store.project?.globalCaption || ''; @@ -1950,8 +1952,7 @@ export async function regenerateTimelineRegion(opts: RegionRegenerateOptions): P srcBlob = (await loadAudioBlobByKey(clip.cumulativeMixKey)) ?? null; } if (!srcBlob) { - const audioDuration = store.getAudioDuration(); - srcBlob = generateSilenceWav(audioDuration); + srcBlob = generateSilenceWav(); } const outcome = await generateRepaintInternal( diff --git a/src/services/legoApiTiming.ts b/src/services/legoApiTiming.ts new file mode 100644 index 00000000..b13f9700 --- /dev/null +++ b/src/services/legoApiTiming.ts @@ -0,0 +1,61 @@ +/** + * Maps a timeline clip to ACE-Step `task_type: lego` timing fields on `/release_task`. + * + * From-silence generation uses a tiny placeholder WAV; the server must rely on + * `audio_duration` for target length / Metas. We therefore send: + * - `repainting_start`: 0 + * - `repainting_end`: -1 + * - `audio_duration`: clip duration (select window length), not the full timeline. + * + * Context / cumulative generation keeps repainting in timeline seconds and uses + * `audio_duration` = project timeline extent (same as previous DAW behavior). 
+ */ + +export interface LegoTimingClip { + startTime: number; + duration: number; +} + +export interface LegoTimingRepaintRange { + start: number; + end: number; +} + +export interface LegoTimingResult { + repainting_start: number; + repainting_end: number; + /** Seconds: for forceSilence = clip length; else = project timeline duration */ + audio_duration: number; + /** Whether to use chunk-style DiT instruction ("segment" vs full track) */ + isChunkMode: boolean; +} + +const MIN_LEGAL_AUDIO_DURATION_SEC = 1e-3; + +export function computeLegoTimingParams( + forceSilence: boolean, + clip: LegoTimingClip, + projectTimelineDuration: number, + repaintRange?: LegoTimingRepaintRange, +): LegoTimingResult { + const clipEnd = clip.startTime + clip.duration; + + if (forceSilence) { + return { + repainting_start: 0, + repainting_end: -1, + audio_duration: Math.max(clip.duration, MIN_LEGAL_AUDIO_DURATION_SEC), + isChunkMode: clip.startTime > 0.5 || clipEnd < projectTimelineDuration - 0.5, + }; + } + + const repainting_start = repaintRange?.start ?? clip.startTime; + const repainting_end = repaintRange?.end ?? clipEnd; + + return { + repainting_start, + repainting_end, + audio_duration: projectTimelineDuration, + isChunkMode: repainting_start > 0.5 || repainting_end < projectTimelineDuration - 0.5, + }; +} diff --git a/src/services/silenceGenerator.ts b/src/services/silenceGenerator.ts index 4689ff8b..f90743ec 100644 --- a/src/services/silenceGenerator.ts +++ b/src/services/silenceGenerator.ts @@ -5,15 +5,14 @@ const SILENCE_UPLOAD_CHANNELS = 1; const SILENCE_UPLOAD_DURATION = 0.1; /** - * Generate a minimal silence WAV for upload. - * The actual generation duration is controlled by the audio_duration API param, - * so we only need a tiny placeholder (0.1s at 16kHz mono = ~3.2KB instead of - * full-duration 48kHz stereo which can exceed 11MB). + * Generate a minimal silence WAV placeholder for upload. 
* - * The full-quality version is still available via generateSilenceWavFull() for - * local playback/mixing. + * The actual generation length is controlled by the `audio_duration` field on + * the lego/repaint task params (see {@link computeLegoTimingParams} in + * `legoApiTiming.ts`). This function only produces a tiny 0.1s 16kHz mono WAV + * (~3.2KB) so the backend has a valid `src_audio` file to parse. */ -export function generateSilenceWav(_durationSeconds: number): Blob { +export function generateSilenceWav(): Blob { const numSamples = Math.ceil(SILENCE_UPLOAD_RATE * SILENCE_UPLOAD_DURATION); const bytesPerSample = BITS_PER_SAMPLE / 8; const blockAlign = SILENCE_UPLOAD_CHANNELS * bytesPerSample; diff --git a/tests/unit/legoApiTiming.test.ts b/tests/unit/legoApiTiming.test.ts new file mode 100644 index 00000000..1d0e96eb --- /dev/null +++ b/tests/unit/legoApiTiming.test.ts @@ -0,0 +1,58 @@ +import { describe, expect, it } from 'vitest'; +import { computeLegoTimingParams } from '../../src/services/legoApiTiming'; + +describe('computeLegoTimingParams', () => { + const projectDuration = 128; + + it('from-silence: repainting 0 / -1 and audio_duration = clip length', () => { + const t = computeLegoTimingParams( + true, + { startTime: 10, duration: 8 }, + projectDuration, + ); + expect(t.repainting_start).toBe(0); + expect(t.repainting_end).toBe(-1); + expect(t.audio_duration).toBe(8); + expect(t.isChunkMode).toBe(true); + }); + + it('from-silence: full-timeline clip is not chunk mode', () => { + const t = computeLegoTimingParams( + true, + { startTime: 0, duration: projectDuration }, + projectDuration, + ); + expect(t.isChunkMode).toBe(false); + expect(t.audio_duration).toBe(projectDuration); + }); + + it('from-silence: clamps zero clip duration to a small positive', () => { + const t = computeLegoTimingParams(true, { startTime: 0, duration: 0 }, projectDuration); + expect(t.audio_duration).toBeGreaterThan(0); + expect(t.audio_duration).toBeLessThanOrEqual(0.001); 
+ }); + + it('with context: repainting follows clip and audio_duration = project timeline', () => { + const t = computeLegoTimingParams( + false, + { startTime: 5, duration: 4 }, + projectDuration, + ); + expect(t.repainting_start).toBe(5); + expect(t.repainting_end).toBe(9); + expect(t.audio_duration).toBe(projectDuration); + expect(t.isChunkMode).toBe(true); + }); + + it('respects repaintRange override when not forceSilence', () => { + const t = computeLegoTimingParams( + false, + { startTime: 0, duration: 20 }, + projectDuration, + { start: 2, end: 18 }, + ); + expect(t.repainting_start).toBe(2); + expect(t.repainting_end).toBe(18); + expect(t.audio_duration).toBe(projectDuration); + }); +});