// index.js — 138 lines (122 loc) · 4.75 KB
import {
log,
logAlways,
readTextFile,
encodeWAV,
phonemizeAndTokenize,
createDownloadButton,
cacheModelChunks,
cacheEntireModel,
} from "./helpers.js";
// Audio output context. BUGFIX: the legacy Safari fallback was misspelled
// "wexbkitAudioContext"; the vendor-prefixed name is webkitAudioContext.
const audioCtx = new (window.AudioContext || window.webkitAudioContext)();
const susresBtn = document.getElementById("susresBtn");
const userText = document.getElementById("userText");
const cacheOverride = document.getElementById("cacheOverride");
// Path to the model file (or chunk directory). Use ./kokoro-v0_19_chunks.onnx if you're using modelVersion = 0.
const modelChunksDir = "./model/model_quantized.onnx";
const cacheEntire = true; // change to false if modelChunksDir points at a chunk directory
const modelVersion = 1; // either 0 or 1; selects the tensor names below
const inputKeys = ["tokens", "input_ids"]; // model input name, indexed by modelVersion
const outputKeys = ["audio", "waveform"]; // model output name, indexed by modelVersion
let combinedBuffer; // cached model bytes — reruns of main() skip the reload
let session; // cached ort.InferenceSession — created once, reused
/**
 * Runs the full TTS pipeline: load/cache the ONNX model, phonemize and
 * tokenize the textarea contents, run inference per token chunk, then play
 * the resulting 24 kHz mono audio and offer it as a WAV download.
 * Reuses the module-level `combinedBuffer` and `session` across invocations.
 */
async function main() {
  ort.env.wasm.numThreads = navigator.hardwareConcurrency; // enables all cores, thanks ken107 for the suggestion!
  // ort.env.wasm.proxy = true;
  if (!combinedBuffer) { // when model bytes are already in memory, skip the fetch
    if (!cacheEntire) {
      combinedBuffer = await cacheModelChunks(modelChunksDir);
    } else {
      combinedBuffer = await cacheEntireModel(modelChunksDir);
    }
    logAlways("model loaded");
  }
  // Create the inference session once and keep it for later runs.
  if (!session) {
    session = await ort.InferenceSession.create(combinedBuffer);
  }
  const text = userText.value;
  const tokens = await phonemizeAndTokenize(text, "en"); // token count does not conform to kokoro.py (when delimiters are used) more testing needed.
  log(`tokens (${tokens[0].length}): ${tokens}`); //input_text->phenomizer->tokenize
  const voiceSelection = document.querySelector("#voices");
  const selectedVoice = voiceSelection[voiceSelection.selectedIndex].value;
  log(`selectedVoice : (${selectedVoice})`); //voice tensor
  // styleData[n][0] is the style vector for an input of n tokens (see the
  // single-array branch below) — presumably; verify against voices_json files.
  const styleData = await readTextFile(
    `./voices_json/${selectedVoice}.json`,
    cacheOverride
  );
  let results = null;
  if (Array.isArray(tokens[0])) {
    // tokens is an array of token sets: run each set and concatenate the audio.
    let combinedAudio = [];
    for (const tokenSet of tokens) {
      // BUGFIX: index styleData by this set's token count, not by the number
      // of sets (the old code used tokens.length, i.e. the chunk count).
      const style = new Float32Array(styleData[tokenSet.length][0]);
      log(`style: ${style}`);
      const feeds = {
        [inputKeys[modelVersion]]: new ort.Tensor("int64", tokenSet, [
          1,
          tokenSet.length,
        ]),
        style: new ort.Tensor("float32", style, [1, style.length]),
        speed: new ort.Tensor("float32", [1]),
      };
      const result = await session.run(feeds);
      console.log(result);
      combinedAudio = combinedAudio.concat(
        Array.from(result[outputKeys[modelVersion]].cpuData)
      );
    }
    results = { audio: { cpuData: new Float32Array(combinedAudio) } };
  } else {
    // tokens is a single flat array.
    const style = new Float32Array(styleData[tokens.length][0]);
    log(`style: ${style}`);
    const feeds = {
      [inputKeys[modelVersion]]: new ort.Tensor("int64", tokens, [
        1,
        tokens.length,
      ]),
      style: new ort.Tensor("float32", style, [1, style.length]),
      speed: new ort.Tensor("float32", [1]),
    };
    const result = await session.run(feeds);
    // BUGFIX: read the version-specific output name ("audio" or "waveform");
    // the old code accessed results.audio directly, which is undefined when
    // modelVersion = 1 names its output "waveform". Normalize so the code
    // below can always use results.audio.cpuData.
    results = { audio: { cpuData: result[outputKeys[modelVersion]].cpuData } };
  }
  if (results == null) {
    log("generation failed");
    throw new Error("Failed to generate audio results.");
  }
  // Get the length in samples for the audio
  const originalLength = results.audio.cpuData.length;
  // Calculate the sample offset for the start and end
  const sampleRate = 24000;
  const startCut = 0.0 * sampleRate; // seconds * sampleRate
  const endCut = 0.0 * sampleRate;
  const audioDuration = results.audio.cpuData.length / sampleRate;
  logAlways(`done! duration of audio: ${audioDuration}s`);
  // Ensure we don't go out of bounds
  const endIndex = originalLength - endCut;
  const length = endIndex - startCut;
  if (length > 0 && startCut < endIndex) {
    const slicedAudio = results.audio.cpuData.slice(startCut, endIndex);
    // Create a new AudioBuffer to hold the cut audio
    const buffer = audioCtx.createBuffer(1, slicedAudio.length, sampleRate);
    // Copy the sliced audio data into the new AudioBuffer
    buffer.copyToChannel(slicedAudio, 0);
    // Create a source to play the audio
    const source = audioCtx.createBufferSource();
    source.buffer = buffer;
    source.connect(audioCtx.destination);
    // Start playing the audio
    source.start();
    const wavBuffer = encodeWAV(buffer.getChannelData(0), buffer.sampleRate);
    createDownloadButton(wavBuffer, "output.wav", "audio/wav");
  } else {
    console.error(
      "Invalid audio slice parameters. Check that the start and end positions are correct."
    );
  }
  // BUGFIX: style.display takes a CSS string; the old `= true` assignment was
  // silently ignored by CSSStyleDeclaration, leaving the button hidden.
  susresBtn.style.display = "block";
}
// Resume the (possibly suspended) AudioContext on a user gesture, then run
// the pipeline. BUGFIX: the old handler mixed `await` with `.then()` and left
// main() as a floating promise, so its rejections were never handled here.
susresBtn.onclick = async function () {
  await audioCtx.resume();
  try {
    await main();
  } catch (err) {
    console.error("TTS generation failed:", err);
  }
};