// index.js — 138 lines (122 loc) · 4.75 KB
import {
log,
logAlways,
readTextFile,
encodeWAV,
phonemizeAndTokenize,
createDownloadButton,
cacheModelChunks,
cacheEntireModel,
} from "./helpers.js";
// Audio output context. BUGFIX: the legacy Safari fallback was misspelled
// "wexbkitAudioContext"; the vendor-prefixed name is webkitAudioContext.
const audioCtx = new (window.AudioContext || window.webkitAudioContext)();
const susresBtn = document.getElementById("susresBtn");
const userText = document.getElementById("userText");
const cacheOverride = document.getElementById("cacheOverride");
// Path to the model file (or chunk directory). Use ./kokoro-v0_19_chunks.onnx if you're using modelVersion = 0.
const modelChunksDir = "./model/model_quantized.onnx";
const cacheEntire = true; // change to false if modelChunksDir points at a chunk directory
const modelVersion = 1; // either 0 or 1; selects the tensor names below
const inputKeys = ["tokens", "input_ids"]; // model input name, indexed by modelVersion
const outputKeys = ["audio", "waveform"]; // model output name, indexed by modelVersion
let combinedBuffer; // cached model bytes — reruns of main() skip the reload
let session; // cached ort.InferenceSession — created once, reused
/**
 * Runs the full TTS pipeline: load/cache the ONNX model, phonemize and
 * tokenize the textarea contents, run inference per token chunk, then play
 * the resulting 24 kHz mono audio and offer it as a WAV download.
 * Reuses the module-level `combinedBuffer` and `session` across invocations.
 */
async function main() {
  ort.env.wasm.numThreads = navigator.hardwareConcurrency; // enables all cores, thanks ken107 for the suggestion!
  // ort.env.wasm.proxy = true;
  if (!combinedBuffer) { // when model bytes are already in memory, skip the fetch
    if (!cacheEntire) {
      combinedBuffer = await cacheModelChunks(modelChunksDir);
    } else {
      combinedBuffer = await cacheEntireModel(modelChunksDir);
    }
    logAlways("model loaded");
  }
  // Create the inference session once and keep it for later runs.
  if (!session) {
    session = await ort.InferenceSession.create(combinedBuffer);
  }
  const text = userText.value;
  const tokens = await phonemizeAndTokenize(text, "en"); // token count does not conform to kokoro.py (when delimiters are used) more testing needed.
  log(`tokens (${tokens[0].length}): ${tokens}`); //input_text->phenomizer->tokenize
  const voiceSelection = document.querySelector("#voices");
  const selectedVoice = voiceSelection[voiceSelection.selectedIndex].value;
  log(`selectedVoice : (${selectedVoice})`); //voice tensor
  // styleData[n][0] is the style vector for an input of n tokens (see the
  // single-array branch below) — presumably; verify against voices_json files.
  const styleData = await readTextFile(
    `./voices_json/${selectedVoice}.json`,
    cacheOverride
  );
  let results = null;
  if (Array.isArray(tokens[0])) {
    // tokens is an array of token sets: run each set and concatenate the audio.
    let combinedAudio = [];
    for (const tokenSet of tokens) {
      // BUGFIX: index styleData by this set's token count, not by the number
      // of sets (the old code used tokens.length, i.e. the chunk count).
      const style = new Float32Array(styleData[tokenSet.length][0]);
      log(`style: ${style}`);
      const feeds = {
        [inputKeys[modelVersion]]: new ort.Tensor("int64", tokenSet, [
          1,
          tokenSet.length,
        ]),
        style: new ort.Tensor("float32", style, [1, style.length]),
        speed: new ort.Tensor("float32", [1]),
      };
      const result = await session.run(feeds);
      console.log(result);
      combinedAudio = combinedAudio.concat(
        Array.from(result[outputKeys[modelVersion]].cpuData)
      );
    }
    results = { audio: { cpuData: new Float32Array(combinedAudio) } };
  } else {
    // tokens is a single flat array.
    const style = new Float32Array(styleData[tokens.length][0]);
    log(`style: ${style}`);
    const feeds = {
      [inputKeys[modelVersion]]: new ort.Tensor("int64", tokens, [
        1,
        tokens.length,
      ]),
      style: new ort.Tensor("float32", style, [1, style.length]),
      speed: new ort.Tensor("float32", [1]),
    };
    const result = await session.run(feeds);
    // BUGFIX: read the version-specific output name ("audio" or "waveform");
    // the old code accessed results.audio directly, which is undefined when
    // modelVersion = 1 names its output "waveform". Normalize so the code
    // below can always use results.audio.cpuData.
    results = { audio: { cpuData: result[outputKeys[modelVersion]].cpuData } };
  }
  if (results == null) {
    log("generation failed");
    throw new Error("Failed to generate audio results.");
  }
  // Get the length in samples for the audio
  const originalLength = results.audio.cpuData.length;
  // Calculate the sample offset for the start and end
  const sampleRate = 24000;
  const startCut = 0.0 * sampleRate; // seconds * sampleRate
  const endCut = 0.0 * sampleRate;
  const audioDuration = results.audio.cpuData.length / sampleRate;
  logAlways(`done! duration of audio: ${audioDuration}s`);
  // Ensure we don't go out of bounds
  const endIndex = originalLength - endCut;
  const length = endIndex - startCut;
  if (length > 0 && startCut < endIndex) {
    const slicedAudio = results.audio.cpuData.slice(startCut, endIndex);
    // Create a new AudioBuffer to hold the cut audio
    const buffer = audioCtx.createBuffer(1, slicedAudio.length, sampleRate);
    // Copy the sliced audio data into the new AudioBuffer
    buffer.copyToChannel(slicedAudio, 0);
    // Create a source to play the audio
    const source = audioCtx.createBufferSource();
    source.buffer = buffer;
    source.connect(audioCtx.destination);
    // Start playing the audio
    source.start();
    const wavBuffer = encodeWAV(buffer.getChannelData(0), buffer.sampleRate);
    createDownloadButton(wavBuffer, "output.wav", "audio/wav");
  } else {
    console.error(
      "Invalid audio slice parameters. Check that the start and end positions are correct."
    );
  }
  // BUGFIX: style.display takes a CSS string; the old `= true` assignment was
  // silently ignored by CSSStyleDeclaration, leaving the button hidden.
  susresBtn.style.display = "block";
}
// Resume the (possibly suspended) AudioContext on a user gesture, then run
// the pipeline. BUGFIX: the old handler mixed `await` with `.then()` and left
// main() as a floating promise, so its rejections were never handled here.
susresBtn.onclick = async function () {
  await audioCtx.resume();
  try {
    await main();
  } catch (err) {
    console.error("TTS generation failed:", err);
  }
};