|
| 1 | +--- |
| 2 | +name: ace-step-inference |
| 3 | +version: "1.0" |
| 4 | +description: ACE-Step 1.5 music generation — GGML inference, text-to-music, covers, repainting, CUDA acceleration, 48kHz stereo output for REVITHION STUDIO |
| 5 | +tags: [ai, music-generation, ace-step, ggml, cuda, inference] |
| 6 | +category: ai-integration |
| 7 | +--- |
| 8 | + |
| 9 | +# ACE-Step 1.5 Music Generation Integration |
| 10 | + |
| 11 | +ACE-Step 1.5 is a diffusion-based music generation model that produces full stereo audio from text prompts, reference tracks, or partial audio inputs. It supports three generation modes: **text-to-music** (prompt-only), **covers** (style transfer from a reference), and **repainting** (inpainting/outpainting on existing audio). REVITHION STUDIO integrates ACE-Step through a GGML-quantized C++ backend with CUDA acceleration, outputting 48kHz/32-bit float stereo suitable for direct insertion into the DAW timeline. |
| 12 | + |
| 13 | +## Architecture Overview |
| 14 | + |
| 15 | +The inference pipeline consists of a text encoder (CLAP), a latent diffusion UNet, and a vocoder (BigVGAN). The GGML backend loads quantized weights (Q4_K_M or Q8_0) into GPU VRAM via CUDA, keeping the host CPU free for DAW audio processing. A dedicated inference thread communicates with the audio engine through a lock-free FIFO, ensuring zero-glitch playback during generation. |
| 16 | + |
| 17 | +## GGML Model Loading & CUDA Context |
| 18 | + |
| 19 | +```cpp |
| 20 | +#include <ggml/ggml.h> |
| 21 | +#include <ggml/ggml-cuda.h> |
| 22 | + |
| 23 | +struct AceStepContext { |
| 24 | + ggml_context* ctx = nullptr; |
| 25 | + ggml_backend_t backend = nullptr; |
| 26 | + ggml_backend_buffer_t buffer = nullptr; |
| 27 | + |
| 28 | + bool loadModel(const std::string& modelPath, int gpuLayers) { |
| 29 | + backend = ggml_backend_cuda_init(0); |
| 30 | + if (!backend) return false; |
| 31 | + |
| 32 | + struct ggml_init_params params = { |
| 33 | + .mem_size = 512 * 1024 * 1024, |
| 34 | + .mem_buffer = nullptr, |
| 35 | + .no_alloc = true |
| 36 | + }; |
| 37 | + ctx = ggml_init(params); |
| 38 | + |
| 39 | + // Load quantized weights onto GPU |
| 40 | + auto* model = ggml_model_load(modelPath.c_str(), ctx, backend, gpuLayers); |
| 41 | + return model != nullptr; |
| 42 | + } |
| 43 | + |
| 44 | + ~AceStepContext() { |
| 45 | + if (ctx) ggml_free(ctx); |
| 46 | + if (buffer) ggml_backend_buffer_free(buffer); |
| 47 | + if (backend) ggml_backend_free(backend); |
| 48 | + } |
| 49 | +}; |
| 50 | +``` |
| 51 | + |
| 52 | +## Text-to-Music Generation |
| 53 | + |
| 54 | +```cpp |
// Parameters for one generation request; shared by all three modes.
struct GenerationParams {
    std::string prompt;         // text conditioning fed to the text encoder
    float durationSec = 30.0f;  // requested clip length in seconds
    int steps = 100;            // diffusion denoising steps (more = slower, higher quality)
    float cfgScale = 7.0f;      // classifier-free guidance strength; keep <= 15 (see Anti-Patterns)
    int sampleRate = 48000;     // output rate; should match the DAW session rate
    int seed = -1; // -1 = random
};
| 63 | + |
| 64 | +std::vector<float> generateFromText(AceStepContext& ace, const GenerationParams& params) { |
| 65 | + auto tokens = ace.encodeText(params.prompt); |
| 66 | + |
| 67 | + // Diffusion loop with classifier-free guidance |
| 68 | + auto latent = ace.initNoise(params.durationSec, params.sampleRate, params.seed); |
| 69 | + for (int step = 0; step < params.steps; ++step) { |
| 70 | + auto conditioned = ace.denoise(latent, tokens, step, params.cfgScale); |
| 71 | + auto unconditioned = ace.denoise(latent, {}, step, params.cfgScale); |
| 72 | + latent = unconditioned + params.cfgScale * (conditioned - unconditioned); |
| 73 | + } |
| 74 | + |
| 75 | + return ace.vocoder(latent); // 48kHz stereo interleaved float |
| 76 | +} |
| 77 | +``` |
| 78 | + |
| 79 | +## Cover & Repainting Modes |
| 80 | + |
| 81 | +```cpp |
| 82 | +enum class AceMode { TextToMusic, Cover, Repaint }; |
| 83 | + |
| 84 | +std::vector<float> generateWithReference(AceStepContext& ace, |
| 85 | + const GenerationParams& params, |
| 86 | + AceMode mode, |
| 87 | + const float* refAudio, |
| 88 | + int refSamples, |
| 89 | + float strength = 0.75f) { |
| 90 | + auto latent = ace.encodeAudio(refAudio, refSamples); |
| 91 | + |
| 92 | + if (mode == AceMode::Cover) { |
| 93 | + // Partial noise injection preserving melodic structure |
| 94 | + int startStep = static_cast<int>(params.steps * (1.0f - strength)); |
| 95 | + latent = ace.addNoise(latent, startStep); |
| 96 | + return ace.denoiseFrom(latent, ace.encodeText(params.prompt), startStep, params); |
| 97 | + } |
| 98 | + |
| 99 | + if (mode == AceMode::Repaint) { |
| 100 | + auto mask = ace.buildTimeMask(params.durationSec, params.sampleRate); |
| 101 | + return ace.inpaint(latent, mask, ace.encodeText(params.prompt), params); |
| 102 | + } |
| 103 | + |
| 104 | + return generateFromText(ace, params); |
| 105 | +} |
| 106 | +``` |
| 107 | + |
| 108 | +## JUCE Integration — Async Generation Thread |
| 109 | + |
| 110 | +```cpp |
| 111 | +class AceStepProcessor : public juce::Thread { |
| 112 | + AceStepContext context; |
| 113 | + juce::AbstractFifo fifo { 48000 * 120 * 2 }; // 120s stereo buffer |
| 114 | + std::vector<float> ringBuffer; |
| 115 | + std::atomic<bool> generating { false }; |
| 116 | + |
| 117 | +public: |
| 118 | + AceStepProcessor() : Thread("ACE-Step-Inference") { |
| 119 | + ringBuffer.resize(static_cast<size_t>(fifo.getTotalSize())); |
| 120 | + } |
| 121 | + |
| 122 | + void startGeneration(const GenerationParams& params) { |
| 123 | + currentParams = params; |
| 124 | + generating = true; |
| 125 | + startThread(juce::Thread::Priority::normal); |
| 126 | + } |
| 127 | + |
| 128 | + void run() override { |
| 129 | + auto audio = generateFromText(context, currentParams); |
| 130 | + int written = 0; |
| 131 | + while (written < static_cast<int>(audio.size()) && !threadShouldExit()) { |
| 132 | + auto scope = fifo.write(static_cast<int>(audio.size()) - written); |
| 133 | + std::copy_n(audio.data() + written, scope.blockSize1, ringBuffer.data() + scope.startIndex1); |
| 134 | + written += scope.blockSize1 + scope.blockSize2; |
| 135 | + } |
| 136 | + generating = false; |
| 137 | + } |
| 138 | + |
| 139 | + void pullSamples(float* dest, int numSamples) { |
| 140 | + auto scope = fifo.read(numSamples); |
| 141 | + std::copy_n(ringBuffer.data() + scope.startIndex1, scope.blockSize1, dest); |
| 142 | + } |
| 143 | + |
| 144 | +private: |
| 145 | + GenerationParams currentParams; |
| 146 | +}; |
| 147 | +``` |
| 148 | + |
| 149 | +## Python API Bridge (ACE-Step HTTP) |
| 150 | + |
| 151 | +```python |
import httpx

async def generate_music(prompt: str, duration: float = 30.0,
                         mode: str = "text2music",
                         reference_path: str | None = None) -> bytes:
    """Call the ACE-Step API server at localhost:8001.

    Args:
        prompt: Text description of the music to generate.
        duration: Clip length in seconds.
        mode: One of "text2music", "cover", or "repaint".
        reference_path: Server-side path to the reference audio; required
            for cover/repaint modes.

    Returns:
        Raw 48 kHz float32 PCM bytes.

    Raises:
        ValueError: If mode is unknown, or a reference is missing for a
            mode that requires one.
        httpx.HTTPStatusError: If the server returns an error status.
    """
    valid_modes = {"text2music", "cover", "repaint"}
    if mode not in valid_modes:
        raise ValueError(f"unknown mode {mode!r}; expected one of {sorted(valid_modes)}")
    if mode != "text2music" and not reference_path:
        raise ValueError(f"mode {mode!r} requires reference_path")

    payload = {
        "prompt": prompt, "duration": duration, "mode": mode,
        "sample_rate": 48000, "cfg_scale": 7.0, "steps": 100
    }
    if reference_path:
        payload["reference_audio"] = reference_path

    async with httpx.AsyncClient(timeout=300) as client:
        resp = await client.post("http://localhost:8001/generate", json=payload)
        resp.raise_for_status()
        return resp.content  # Raw 48kHz float32 PCM
| 169 | +``` |
| 170 | + |
| 171 | +## Anti-Patterns |
| 172 | + |
| 173 | +- ❌ Don't run inference on the audio thread — always use a separate thread with FIFO handoff |
| 174 | +- ❌ Don't load full FP32 weights — quantize to Q4_K_M or Q8_0 so the model fits comfortably in GPU VRAM (even a 24GB card cannot hold the FP32 weights plus activations) |
| 175 | +- ❌ Don't generate at 44.1kHz then resample — generate natively at 48kHz to avoid aliasing artifacts |
| 176 | +- ❌ Don't block the UI thread waiting for generation — use async callbacks or polling |
| 177 | +- ❌ Don't skip CUDA device synchronization before reading output buffers |
| 178 | +- ❌ Don't use cfg_scale > 15 — it causes spectral collapse and harsh artifacts |
| 179 | + |
| 180 | +## Checklist |
| 181 | + |
| 182 | +- [ ] GGML backend initialized with CUDA device 0 before model load |
| 183 | +- [ ] Model weights quantized to Q4_K_M or Q8_0 and validated with checksum |
| 184 | +- [ ] Inference thread priority set below audio thread priority |
| 185 | +- [ ] Ring buffer sized for maximum generation duration (120s × 48kHz × 2ch) |
| 186 | +- [ ] Output sample rate matches DAW session rate (48kHz default) |
| 187 | +- [ ] VRAM usage monitored — abort generation if free VRAM < 2GB |
| 188 | +- [ ] Seed stored with generated clip for reproducibility |
| 189 | +- [ ] All three modes (text-to-music, cover, repaint) tested with reference audio |