diff --git a/main.js b/main.js index 52605b97..9d7dbd25 100644 --- a/main.js +++ b/main.js @@ -1107,6 +1107,7 @@ if (gotSingleInstanceLock) { googleCalendarManager.stop(); } if (ipcHandlers) { + ipcHandlers.cleanupAllStreaming(); ipcHandlers._cleanupTextEditMonitor(); } if (textEditMonitor) { diff --git a/package.json b/package.json index aaf9dd0d..33f496c2 100644 --- a/package.json +++ b/package.json @@ -60,6 +60,7 @@ "format:check": "eslint . && cd src && eslint . && prettier --check \"**/*.{js,jsx,ts,tsx,json,css,md}\"", "lint": "eslint . && cd src && eslint .", "typecheck": "cd src && tsc --noEmit", + "test": "node --test", "quality-check": "npm run format:check && npm run typecheck", "i18n:check": "node scripts/check-i18n.js", "preview": "cd src && vite preview", diff --git a/preload.js b/preload.js index a17e5b70..349b5471 100644 --- a/preload.js +++ b/preload.js @@ -299,6 +299,10 @@ contextBridge.exposeInMainWorld("electronAPI", { saveMistralKey: (key) => ipcRenderer.invoke("save-mistral-key", key), proxyMistralTranscription: (data) => ipcRenderer.invoke("proxy-mistral-transcription", data), + // Soniox API + getSonioxKey: () => ipcRenderer.invoke("get-soniox-key"), + saveSonioxKey: (key) => ipcRenderer.invoke("save-soniox-key", key), + // Custom endpoint API keys getCustomTranscriptionKey: () => ipcRenderer.invoke("get-custom-transcription-key"), saveCustomTranscriptionKey: (key) => ipcRenderer.invoke("save-custom-transcription-key", key), @@ -493,6 +497,30 @@ contextBridge.exposeInMainWorld("electronAPI", { (callback) => (_event, data) => callback(data) ), + // Soniox streaming + sonioxStreamingWarmup: (options) => ipcRenderer.invoke("soniox-streaming-warmup", options), + sonioxStreamingStart: (options) => ipcRenderer.invoke("soniox-streaming-start", options), + sonioxStreamingSend: (audioBuffer) => ipcRenderer.send("soniox-streaming-send", audioBuffer), + sonioxStreamingFinalize: () => ipcRenderer.send("soniox-streaming-finalize"), + sonioxStreamingStop: () => ipcRenderer.invoke("soniox-streaming-stop"), + sonioxStreamingStatus: () => ipcRenderer.invoke("soniox-streaming-status"), + onSonioxPartialTranscript: registerListener( + "soniox-streaming-partial", + (callback) => (_event, text) => callback(text) + ), + onSonioxFinalTranscript: registerListener( + "soniox-streaming-final", + (callback) => (_event, text) => callback(text) + ), + onSonioxError: registerListener( + "soniox-streaming-error", + (callback) => (_event, error) => callback(error) + ), + onSonioxSessionEnd: registerListener( + "soniox-streaming-session-end", + (callback) => (_event, data) => callback(data) + ), + // Usage limit events (for showing UpgradePrompt in ControlPanel) notifyLimitReached: (data) => ipcRenderer.send("limit-reached", data), onLimitReached: registerListener("limit-reached", (callback) => (_event, data) => callback(data)), diff --git a/src/assets/icons/providers/soniox.svg b/src/assets/icons/providers/soniox.svg new file mode 100644 index 00000000..2df2916e --- /dev/null +++ b/src/assets/icons/providers/soniox.svg @@ -0,0 +1 @@ + diff --git a/src/components/OnboardingFlow.tsx b/src/components/OnboardingFlow.tsx index c250c003..b03918b3 100644 --- a/src/components/OnboardingFlow.tsx +++ b/src/components/OnboardingFlow.tsx @@ -83,6 +83,8 @@ export default function OnboardingFlow({ onComplete }: OnboardingFlowProps) { openaiApiKey, groqApiKey, mistralApiKey, + sonioxApiKey, + setSonioxApiKey, customTranscriptionApiKey, setCustomTranscriptionApiKey, dictationKey, @@ -502,6 +504,8 @@ export default function OnboardingFlow({ onComplete }: OnboardingFlowProps) { setGroqApiKey={setGroqApiKey} mistralApiKey={mistralApiKey} setMistralApiKey={setMistralApiKey} + sonioxApiKey={sonioxApiKey} + setSonioxApiKey={setSonioxApiKey} customTranscriptionApiKey={customTranscriptionApiKey} setCustomTranscriptionApiKey={setCustomTranscriptionApiKey} cloudTranscriptionBaseUrl={cloudTranscriptionBaseUrl} @@ -725,6 +729,8 @@ export default function OnboardingFlow({ onComplete }: OnboardingFlowProps) { return groqApiKey.trim().length > 0; } else if (cloudTranscriptionProvider === "mistral") { return mistralApiKey.trim().length > 0; + } else if (cloudTranscriptionProvider === "soniox") { + return sonioxApiKey.trim().length > 0; } else if (cloudTranscriptionProvider === "custom") { // Custom can work without API key for local endpoints return true; diff --git a/src/components/SettingsPage.tsx b/src/components/SettingsPage.tsx index 9178d5bc..f2a5129f 100644 --- a/src/components/SettingsPage.tsx +++ b/src/components/SettingsPage.tsx @@ -172,6 +172,10 @@ interface TranscriptionSectionProps { setGroqApiKey: (key: string) => void; mistralApiKey: string; setMistralApiKey: (key: string) => void; + sonioxApiKey: string; + setSonioxApiKey: (key: string) => void; + sonioxSecondaryLanguage: string; + setSonioxSecondaryLanguage: (lang: string) => void; customTranscriptionApiKey: string; setCustomTranscriptionApiKey: (key: string) => void; cloudTranscriptionBaseUrl?: string; @@ -207,6 +211,10 @@ function TranscriptionSection({ setGroqApiKey, mistralApiKey, setMistralApiKey, + sonioxApiKey, + setSonioxApiKey, + sonioxSecondaryLanguage, + setSonioxSecondaryLanguage, customTranscriptionApiKey, setCustomTranscriptionApiKey, cloudTranscriptionBaseUrl, @@ -383,6 +391,10 @@ function TranscriptionSection({ setGroqApiKey={setGroqApiKey} mistralApiKey={mistralApiKey} setMistralApiKey={setMistralApiKey} + sonioxApiKey={sonioxApiKey} + setSonioxApiKey={setSonioxApiKey} + sonioxSecondaryLanguage={sonioxSecondaryLanguage} + setSonioxSecondaryLanguage={setSonioxSecondaryLanguage} customTranscriptionApiKey={customTranscriptionApiKey} setCustomTranscriptionApiKey={setCustomTranscriptionApiKey} cloudTranscriptionBaseUrl={cloudTranscriptionBaseUrl} @@ -676,6 +688,10 @@ export default function SettingsPage({ activeSection = "general" }: SettingsPage setGeminiApiKey, setGroqApiKey, setMistralApiKey, + sonioxApiKey, + setSonioxApiKey, + sonioxSecondaryLanguage, + setSonioxSecondaryLanguage, customTranscriptionApiKey, setCustomTranscriptionApiKey, customReasoningApiKey, @@ -2670,6 +2686,10 @@ EOF`, setGroqApiKey={setGroqApiKey} mistralApiKey={mistralApiKey} setMistralApiKey={setMistralApiKey} + sonioxApiKey={sonioxApiKey} + setSonioxApiKey={setSonioxApiKey} + sonioxSecondaryLanguage={sonioxSecondaryLanguage} + setSonioxSecondaryLanguage={setSonioxSecondaryLanguage} customTranscriptionApiKey={customTranscriptionApiKey} setCustomTranscriptionApiKey={setCustomTranscriptionApiKey} cloudTranscriptionBaseUrl={cloudTranscriptionBaseUrl} diff --git a/src/components/TranscriptionModelPicker.tsx b/src/components/TranscriptionModelPicker.tsx index c7c103eb..cdee8ffe 100644 --- a/src/components/TranscriptionModelPicker.tsx +++ b/src/components/TranscriptionModelPicker.tsx @@ -8,6 +8,8 @@ import { ProviderTabs } from "./ui/ProviderTabs"; import ModelCardList from "./ui/ModelCardList"; import { DownloadProgressBar } from "./ui/DownloadProgressBar"; import ApiKeyInput from "./ui/ApiKeyInput"; +import LanguageSelector, { type LanguageOption } from "./ui/LanguageSelector"; +import languageRegistry from "../config/languageRegistry.json"; import { ConfirmDialog } from "./ui/dialog"; import { useDialogs } from "../hooks/useDialogs"; import { useModelDownload, type DownloadProgress } from "../hooks/useModelDownload"; @@ -26,6 +28,7 @@ import { getProviderIcon, isMonochromeProvider } from "../utils/providerIcons"; import { API_ENDPOINTS, normalizeBaseUrl } from "../config/constants"; import { createExternalLinkHandler } from "../utils/externalLinks"; import { getCachedPlatform } from "../utils/platform"; +import { useSettingsStore } from "../stores/settingsStore"; import type { CudaWhisperStatus } from "../types/electron"; import logger from "../utils/logger"; @@ -199,16 +202,25 @@ interface TranscriptionModelPickerProps { setMistralApiKey: (key: string) => void; customTranscriptionApiKey?: string; setCustomTranscriptionApiKey?: (key: string) => void; + sonioxApiKey?: string; + setSonioxApiKey?: (key: string) => void; + sonioxSecondaryLanguage?: string; + setSonioxSecondaryLanguage?: (lang: string) => void; cloudTranscriptionBaseUrl?: string; setCloudTranscriptionBaseUrl?: (url: string) => void; className?: string; variant?: "onboarding" | "settings"; } +const SECONDARY_LANGUAGE_OPTIONS: LanguageOption[] = languageRegistry.languages + .filter((l) => l.code !== "auto") + .map(({ code, label, flag }) => ({ value: code, label, flag })); + const CLOUD_PROVIDER_TABS = [ { id: "openai", name: "OpenAI" }, { id: "groq", name: "Groq", recommended: true }, { id: "mistral", name: "Mistral" }, + { id: "soniox", name: "Soniox" }, { id: "custom", name: "Custom" }, ]; @@ -274,12 +286,18 @@ export default function TranscriptionModelPicker({ setMistralApiKey, customTranscriptionApiKey = "", setCustomTranscriptionApiKey, + sonioxApiKey = "", + setSonioxApiKey, + sonioxSecondaryLanguage = "", + setSonioxSecondaryLanguage, cloudTranscriptionBaseUrl = "", setCloudTranscriptionBaseUrl, className = "", variant = "settings", }: TranscriptionModelPickerProps) { const { t } = useTranslation(); + const preferredLanguage = useSettingsStore((s) => s.preferredLanguage); + const isAutoLanguage = !preferredLanguage || preferredLanguage === "auto"; const [localModels, setLocalModels] = useState([]); const [parakeetModels, setParakeetModels] = useState([]); const [internalLocalProvider, setInternalLocalProvider] = useState(selectedLocalProvider); @@ -393,8 +411,13 @@ export default function TranscriptionModelPicker({ } } } - } else if (selectedCloudProvider !== "custom" && !selectedCloudModel) { - const provider = cloudProviders.find((p) => p.id === selectedCloudProvider); + } else if ( + selectedCloudProvider !== "custom" && + !selectedCloudModel + ) { + const provider = cloudProviders.find( + (p) => p.id === selectedCloudProvider + ); if (provider?.models?.length) { onCloudModelSelect(provider.models[0].id); } @@ -527,7 +550,9 @@ export default function TranscriptionModelPicker({ const handleCloudProviderChange = useCallback( (providerId: string) => { onCloudProviderSelect(providerId); - const provider = cloudProviders.find((p) => p.id === providerId); + const provider = cloudProviders.find( + (p) => p.id === providerId + ); if (providerId === "custom") { onCloudModelSelect("whisper-1"); @@ -541,7 +566,12 @@ export default function TranscriptionModelPicker({ } } }, - [cloudProviders, onCloudProviderSelect, onCloudModelSelect, setCloudTranscriptionBaseUrl] + [ + cloudProviders, + onCloudProviderSelect, + onCloudModelSelect, + setCloudTranscriptionBaseUrl, + ] ); const handleLocalProviderChange = useCallback( @@ -837,7 +867,11 @@ export default function TranscriptionModelPicker({ setCloudTranscriptionBaseUrl?.(e.target.value)} + onChange={(e) => + setCloudTranscriptionBaseUrl?.( + e.target.value + ) + } onBlur={handleBaseUrlBlur} placeholder="https://your-api.example.com/v1" className="h-8 text-sm" @@ -846,7 +880,10 @@ export default function TranscriptionModelPicker({ {})} + setApiKey={ + setCustomTranscriptionApiKey || + (() => {}) + } label={t("transcription.apiKeyOptional")} helpText="" /> @@ -857,7 +894,9 @@ export default function TranscriptionModelPicker({ onCloudModelSelect(e.target.value)} + onChange={(e) => + onCloudModelSelect(e.target.value) + } placeholder="whisper-1" className="h-8 text-sm" /> @@ -877,7 +916,9 @@ export default function TranscriptionModelPicker({ groq: "https://console.groq.com/keys", mistral: "https://console.mistral.ai/api-keys", openai: "https://platform.openai.com/api-keys", - }[selectedCloudProvider] || "https://platform.openai.com/api-keys" + soniox: "https://console.soniox.com/", + }[selectedCloudProvider] || + "https://platform.openai.com/api-keys" )} className="text-xs text-primary/70 hover:text-primary transition-colors cursor-pointer" > @@ -886,22 +927,49 @@ export default function TranscriptionModelPicker({ + {selectedCloudProvider === "soniox" && setSonioxSecondaryLanguage && ( +
+ + setSonioxSecondaryLanguage(value === "none" ? "" : value)} + options={[ + { value: "none", label: t("common.none"), flag: "" }, + ...SECONDARY_LANGUAGE_OPTIONS, + ]} + className="min-w-32" + /> +
+ )} +
- + window.electronAPI.onDictationRealtimeError(cb), onSessionEnd: (cb) => window.electronAPI.onDictationRealtimeSessionEnd(cb), }, + soniox: { + warmup: (opts) => window.electronAPI.sonioxStreamingWarmup(opts), + start: (opts) => window.electronAPI.sonioxStreamingStart(opts), + send: (buf) => window.electronAPI.sonioxStreamingSend(buf), + finalize: () => window.electronAPI.sonioxStreamingFinalize(), + stop: () => window.electronAPI.sonioxStreamingStop(), + status: () => window.electronAPI.sonioxStreamingStatus(), + onPartial: (cb) => window.electronAPI.onSonioxPartialTranscript(cb), + onFinal: (cb) => window.electronAPI.onSonioxFinalTranscript(cb), + onError: (cb) => window.electronAPI.onSonioxError(cb), + onSessionEnd: (cb) => window.electronAPI.onSonioxSessionEnd(cb), + }, }; class AudioManager { @@ -221,7 +233,10 @@ registerProcessor("pcm-streaming-processor", PCMStreamingProcessor); } getStreamingProvider() { - const { cloudTranscriptionModel } = getSettings(); + const { cloudTranscriptionProvider, cloudTranscriptionModel } = getSettings(); + if (cloudTranscriptionProvider === "soniox") { + return STREAMING_PROVIDERS.soniox; + } if (REALTIME_MODELS.has(cloudTranscriptionModel)) { return STREAMING_PROVIDERS["openai-realtime"]; } @@ -2005,6 +2020,11 @@ registerProcessor("pcm-streaming-processor", PCMStreamingProcessor); const s = getSettings(); if (s.useLocalWhisper) return false; + // Soniox is always streaming (BYOK only) + if (s.cloudTranscriptionProvider === "soniox" && s.sonioxApiKey) { + return true; + } + if (REALTIME_MODELS.has(s.cloudTranscriptionModel)) { if (s.cloudTranscriptionMode === "byok") return !!s.openaiApiKey; if (s.cloudTranscriptionMode === "openwhispr") return !!(isSignedInOverride ?? s.isSignedIn); @@ -2271,12 +2291,14 @@ registerProcessor("pcm-streaming-processor", PCMStreamingProcessor); const result = await withSessionRefresh(async () => { const { preferredLanguage: preferredLang, + sonioxSecondaryLanguage, cloudTranscriptionModel, cloudTranscriptionMode, } = getSettings(); const res = await provider.start({ sampleRate: 16000, language: preferredLang && preferredLang !== "auto" ? preferredLang : undefined, + secondaryLanguage: preferredLang && preferredLang !== "auto" ? (sonioxSecondaryLanguage || undefined) : undefined, keyterms: this.getKeyterms(), model: cloudTranscriptionModel, mode: cloudTranscriptionMode === "byok" ? "byok" : "openwhispr", @@ -2577,6 +2599,7 @@ registerProcessor("pcm-streaming-processor", PCMStreamingProcessor); } } + finalText = finalText.trim(); if (finalText) { const tBeforePaste = performance.now(); const clientTotalMs = Math.round(tBeforePaste - t0); diff --git a/src/helpers/environment.js b/src/helpers/environment.js index d11f24f8..590a97f5 100644 --- a/src/helpers/environment.js +++ b/src/helpers/environment.js @@ -10,6 +10,7 @@ const PERSISTED_KEYS = [ "GEMINI_API_KEY", "GROQ_API_KEY", "MISTRAL_API_KEY", + "SONIOX_API_KEY", "CUSTOM_TRANSCRIPTION_API_KEY", "CUSTOM_REASONING_API_KEY", "LOCAL_TRANSCRIPTION_PROVIDER", @@ -107,6 +108,14 @@ class EnvironmentManager { return this._saveKey("MISTRAL_API_KEY", key); } + getSonioxKey() { + return this._getKey("SONIOX_API_KEY"); + } + + saveSonioxKey(key) { + return this._saveKey("SONIOX_API_KEY", key); + } + getCustomTranscriptionKey() { return this._getKey("CUSTOM_TRANSCRIPTION_API_KEY"); } diff --git a/src/helpers/ipcHandlers.js b/src/helpers/ipcHandlers.js index e872c521..d54940e1 100644 --- a/src/helpers/ipcHandlers.js +++ b/src/helpers/ipcHandlers.js @@ -10,6 +10,7 @@ const AssemblyAiStreaming = require("./assemblyAiStreaming"); const { i18nMain, changeLanguage } = require("./i18nMain"); const DeepgramStreaming = require("./deepgramStreaming"); const OpenAIRealtimeStreaming = require("./openaiRealtimeStreaming"); +const SonioxStreaming = require("./sonioxStreaming"); const AudioStorageManager = require("./audioStorage"); const MISTRAL_TRANSCRIPTION_URL = "https://api.mistral.ai/v1/audio/transcriptions"; @@ -108,6 +109,7 @@ class IPCHandlers { this.deepgramStreaming = null; this.openaiRealtimeStreaming = null; this._dictationStreaming = null; + this._sonioxStreaming = null; this._autoLearnEnabled = true; // Default on, synced from renderer this._autoLearnDebounceTimer = null; this._autoLearnLatestData = null; @@ -146,6 +148,34 @@ class IPCHandlers { } } + cleanupAllStreaming() { + const backends = [ + { name: "deepgram", instance: this.deepgramStreaming }, + { name: "openai-realtime", instance: this.openaiRealtimeStreaming }, + { name: "soniox", instance: this._sonioxStreaming }, + { name: "dictation", instance: this._dictationStreaming }, + { name: "assemblyai", instance: this.assemblyAiStreaming }, + ]; + for (const { name, instance } of backends) { + if (!instance) continue; + try { + if (typeof instance.cleanupAll === "function") { + instance.cleanupAll(); + } else { + instance.cleanup(); + } + debugLogger.debug(`Cleaned up ${name} streaming`); + } catch (err) { + debugLogger.debug(`Error cleaning up ${name} streaming`, { error: err.message }); + } + } + this.deepgramStreaming = null; + this.openaiRealtimeStreaming = null; + this._sonioxStreaming = null; + this._dictationStreaming = null; + this.assemblyAiStreaming = null; + } + _setupAudioCleanup() { const DEFAULT_RETENTION_DAYS = 30; const SIX_HOURS_MS = 6 * 60 * 60 * 1000; @@ -1424,6 +1454,14 @@ class IPCHandlers { return this.environmentManager.saveMistralKey(key); }); + ipcMain.handle("get-soniox-key", async () => { + return this.environmentManager.getSonioxKey(); + }); + + ipcMain.handle("save-soniox-key", async (event, key) => { + return this.environmentManager.saveSonioxKey(key); + }); + ipcMain.handle( "proxy-mistral-transcription", async (event, { audioBuffer, model, language, contextBias }) => { @@ -2277,12 +2315,18 @@ class IPCHandlers { }; const setupDictationCallbacks = (streaming, event) => { - streaming.onPartialTranscript = (text) => - event.sender.send("dictation-realtime-partial", text); - streaming.onFinalTranscript = (text) => event.sender.send("dictation-realtime-final", text); - streaming.onError = (err) => event.sender.send("dictation-realtime-error", err.message); - streaming.onSessionEnd = (data) => - event.sender.send("dictation-realtime-session-end", data || {}); + streaming.onPartialTranscript = (text) => { + if (!event.sender.isDestroyed()) event.sender.send("dictation-realtime-partial", text); + }; + streaming.onFinalTranscript = (text) => { + if (!event.sender.isDestroyed()) event.sender.send("dictation-realtime-final", text); + }; + streaming.onError = (err) => { + if (!event.sender.isDestroyed()) event.sender.send("dictation-realtime-error", err.message); + }; + streaming.onSessionEnd = (data) => { + if (!event.sender.isDestroyed()) event.sender.send("dictation-realtime-session-end", data || {}); + }; }; const connectDictationStreaming = async (event, options) => { @@ -2422,6 +2466,69 @@ class IPCHandlers { return { success: true, text: result.text || "" }; }); + // --- Soniox streaming --- + // Soniox cold-starts fast (~250ms), no warmup needed. + ipcMain.handle("soniox-streaming-warmup", async () => { + return { success: true }; + }); + + ipcMain.handle("soniox-streaming-start", async (event, options = {}) => { + try { + if (!this._sonioxStreaming?.isConnected) { + const apiKey = options.apiKey || this.environmentManager.getSonioxKey(); + if (!apiKey) { + return { success: false, error: "Soniox API key not configured", code: "NO_API" }; + } + + // Cold start: create new connection + this._sonioxStreaming = new SonioxStreaming(); + this._sonioxStreaming.onPartialTranscript = (text) => { + if (!event.sender.isDestroyed()) event.sender.send("soniox-streaming-partial", text); + }; + this._sonioxStreaming.onFinalTranscript = (text) => { + if (!event.sender.isDestroyed()) event.sender.send("soniox-streaming-final", text); + }; + this._sonioxStreaming.onError = (err) => { + if (!event.sender.isDestroyed()) event.sender.send("soniox-streaming-error", err.message); + }; + this._sonioxStreaming.onSessionEnd = (data) => { + if (!event.sender.isDestroyed()) event.sender.send("soniox-streaming-session-end", data || {}); + }; + + await this._sonioxStreaming.connect({ + apiKey, + model: options.model || "stt-rt-v4", + language: options.language, + secondaryLanguage: options.secondaryLanguage, + }); + } + return { success: true }; + } catch (err) { + return { success: false, error: err.message }; + } + }); + + ipcMain.on("soniox-streaming-send", (_event, audioBuffer) => { + this._sonioxStreaming?.sendAudio(Buffer.from(audioBuffer)); + }); + + ipcMain.on("soniox-streaming-finalize", () => { + this._sonioxStreaming?.finalize(); + }); + + ipcMain.handle("soniox-streaming-stop", async () => { + if (!this._sonioxStreaming) { + return { success: true, text: "" }; + } + const result = await this._sonioxStreaming.disconnect().catch(() => ({ text: "" })); + this._sonioxStreaming = null; + return { success: true, text: result.text || "" }; + }); + + ipcMain.handle("soniox-streaming-status", async () => { + return { connected: !!this._sonioxStreaming?.isConnected }; + }); + ipcMain.handle("update-transcription-text", async (_event, id, text, rawText) => { try { this.databaseManager.updateTranscriptionText(id, text, rawText); diff --git a/src/helpers/sonioxStreaming.js b/src/helpers/sonioxStreaming.js new file mode 100644 index 00000000..381d7cda --- /dev/null +++ b/src/helpers/sonioxStreaming.js @@ -0,0 +1,375 @@ +const WebSocket = require("ws"); +const debugLogger = require("./debugLogger"); + +const WEBSOCKET_TIMEOUT_MS = 15000; +const DISCONNECT_TIMEOUT_MS = 3000; +const KEEPALIVE_INTERVAL_MS = 5000; +const KEEPALIVE_IDLE_LIMIT_MS = 30000; // Stop keepalive if no audio sent for 30s +const COLD_START_BUFFER_MAX = 3 * 16000 * 2; // 3 seconds of 16-bit PCM at 16kHz +const SONIOX_WS_URL = "wss://stt-rt.soniox.com/transcribe-websocket"; + +// Filler words / hesitations to strip from assembled text. +// Soniox uses sub-word (BPE) tokenization, so fillers must be removed from the +// joined text rather than individual tokens. +const FILLER_WORD = "(?:uh+|um+|yyy+|eee+|mmm+|hmm+)"; +const FILLER_RE = new RegExp(`\\s*,?\\s*\\b${FILLER_WORD}\\b[,.]?\\s*`, "gi"); +const LEADING_FILLER_RE = new RegExp(`^\\s*,?\\s*\\b${FILLER_WORD}\\b`, "i"); +const POST_SENTENCE_CAP_RE = /([.!?]\s+)(\p{Ll})/gu; + +function removeFillers(text) { + const hadLeadingFiller = LEADING_FILLER_RE.test(text); + let result = text.replace(FILLER_RE, " "); + result = result.replace(/ +/g, " ").trim(); + result = result.replace(POST_SENTENCE_CAP_RE, (_, punct, letter) => + punct + letter.toUpperCase() + ); + if (hadLeadingFiller) { + result = result.replace(/^\p{Ll}/u, (c) => c.toUpperCase()); + } + return result; +} + +class SonioxStreaming { + constructor() { + this.ws = null; + this.isConnected = false; + this.finalTokens = []; + this.currentNonFinalText = ""; + this.coldStartBuffer = []; + this.coldStartBufferSize = 0; + this.onPartialTranscript = null; + this.onFinalTranscript = null; + this.onError = null; + this.onSessionEnd = null; + this.pendingResolve = null; + this.pendingReject = null; + this.connectionTimeout = null; + this.keepAliveInterval = null; + this.isDisconnecting = false; + this.audioBytesSent = 0; + this._finalizeSent = false; + this._lastAudioSentAt = 0; + } + + getFullTranscript() { + return removeFillers(this.finalTokens.map((t) => t.text).join("")); + } + + async connect(options = {}) { + const { apiKey, model, language, secondaryLanguage } = options; + if (!apiKey) throw new Error("Soniox API key is required"); + + if (this.isConnected) { + debugLogger.debug("Soniox already connected"); + return; + } + + this.finalTokens = []; + this.currentNonFinalText = ""; + this.audioBytesSent = 0; + this.coldStartBuffer = []; + this.coldStartBufferSize = 0; + this._finalizeSent = false; + + const toBase = (l) => l && l !== "auto" ? l.split("-")[0] : null; + const languageHints = + [toBase(language), toBase(secondaryLanguage)].filter(Boolean); + + debugLogger.debug("Soniox connecting", { model: model || "stt-rt-v4", languageHints }); + + const configMessage = { + api_key: apiKey, + model: model || "stt-rt-v4", + audio_format: "pcm_s16le", + sample_rate: 16000, + num_channels: 1, + language_hints: languageHints, + }; + + return new Promise((resolve, reject) => { + this.pendingResolve = resolve; + this.pendingReject = reject; + + this.connectionTimeout = setTimeout(() => { + this.cleanup(); + reject(new Error("Soniox WebSocket connection timeout")); + }, WEBSOCKET_TIMEOUT_MS); + + this.ws = new WebSocket(SONIOX_WS_URL); + + this.ws.on("open", () => { + debugLogger.debug("Soniox WebSocket opened, sending config"); + this.ws.send(JSON.stringify(configMessage)); + this.startKeepAlive(); + this.flushColdStartBuffer(); + + clearTimeout(this.connectionTimeout); + this.isConnected = true; + this.pendingResolve(); + this.pendingResolve = null; + this.pendingReject = null; + }); + + this.ws.on("message", (data) => { + this.handleMessage(data); + }); + + this.ws.on("error", (error) => { + debugLogger.error("Soniox WebSocket error", { error: error.message }); + this.cleanup(); + if (this.pendingReject) { + this.pendingReject(error); + this.pendingReject = null; + this.pendingResolve = null; + } + this.onError?.(error); + }); + + this.ws.on("close", (code, reason) => { + const wasActive = this.isConnected; + debugLogger.debug("Soniox WebSocket closed", { + code, + reason: reason?.toString(), + wasActive, + }); + if (this.pendingReject) { + this.pendingReject(new Error(`WebSocket closed before ready (code: ${code})`)); + this.pendingReject = null; + this.pendingResolve = null; + } + this.cleanup(); + if (wasActive && !this.isDisconnecting) { + this.onSessionEnd?.({ text: this.getFullTranscript() }); + } + }); + }); + } + + handleMessage(data) { + try { + const res = JSON.parse(data.toString()); + + if (res.error_code) { + debugLogger.error("Soniox error response", { + code: res.error_code, + message: res.error_message, + }); + this.onError?.(new Error(`Soniox error ${res.error_code}: ${res.error_message}`)); + return; + } + + if (res.finished) { + debugLogger.debug("Soniox session finished", { + finalTokens: this.finalTokens.length, + textLength: this.getFullTranscript().length, + }); + this.onSessionEnd?.({ text: this.getFullTranscript() }); + return; + } + + let nonFinalTexts = []; + let newFinalTokens = false; + for (const token of res.tokens || []) { + if (token.text === "") continue; + if (!token.text || !token.text.trim() || token.text === "\ufffd") continue; + if (token.is_final) { + this.finalTokens.push(token); + newFinalTokens = true; + } else { + nonFinalTexts.push(token.text); + } + } + + const rawFinal = this.finalTokens.map((t) => t.text).join(""); + this.currentNonFinalText = nonFinalTexts.join(""); + + this.onPartialTranscript?.( + removeFillers(rawFinal + this.currentNonFinalText) + ); + + if (newFinalTokens) { + this.onFinalTranscript?.(removeFillers(rawFinal)); + } + } catch (err) { + debugLogger.error("Soniox message parse error", { error: err.message }); + } + } + + flushColdStartBuffer() { + if (this.coldStartBuffer.length === 0) return; + + debugLogger.debug("Soniox flushing cold-start buffer", { + chunks: this.coldStartBuffer.length, + bytes: this.coldStartBufferSize, + }); + for (const buf of this.coldStartBuffer) { + this.ws.send(buf); + this.audioBytesSent += buf.length; + } + this.coldStartBuffer = []; + this.coldStartBufferSize = 0; + } + + sendAudio(pcmBuffer) { + if (!this.ws) return false; + + if ( + this.ws.readyState === WebSocket.CONNECTING && + this.coldStartBufferSize < COLD_START_BUFFER_MAX + ) { + const copy = Buffer.from(pcmBuffer); + this.coldStartBuffer.push(copy); + this.coldStartBufferSize += copy.length; + return false; + } + + if (this.ws.readyState !== WebSocket.OPEN) return false; + + this.flushColdStartBuffer(); + this.ws.send(pcmBuffer); + this.audioBytesSent += pcmBuffer.length; + this._lastAudioSentAt = Date.now(); + return true; + } + + finalize() { + if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return false; + + this._finalizeSent = true; + this.ws.send(JSON.stringify({ type: "finalize" })); + debugLogger.debug("Soniox finalize sent"); + return true; + } + + startKeepAlive() { + this.stopKeepAlive(); + this._lastAudioSentAt = Date.now(); + this.keepAliveInterval = setInterval(() => { + if (!this.ws || this.ws.readyState !== WebSocket.OPEN) { + this.stopKeepAlive(); + return; + } + if (Date.now() - this._lastAudioSentAt > KEEPALIVE_IDLE_LIMIT_MS) { + debugLogger.debug("Soniox idle timeout, closing connection"); + this.cleanup(); + this.onSessionEnd?.({ text: this.getFullTranscript() }); + return; + } + try { + this.ws.send(JSON.stringify({ type: "keepalive" })); + } catch (err) { + debugLogger.debug("Soniox keep-alive failed", { error: err.message }); + this.stopKeepAlive(); + } + }, KEEPALIVE_INTERVAL_MS); + } + + stopKeepAlive() { + if (this.keepAliveInterval) { + clearInterval(this.keepAliveInterval); + this.keepAliveInterval = null; + } + } + + async disconnect() { + debugLogger.debug("Soniox disconnect", { + audioBytesSent: this.audioBytesSent, + finalTokens: this.finalTokens.length, + textLength: this.getFullTranscript().length, + }); + + if (!this.ws) return { text: this.getFullTranscript() }; + + this.isDisconnecting = true; + + if (this.ws.readyState === WebSocket.OPEN && this.audioBytesSent > 0) { + if (!this._finalizeSent) { + await this.drainFinalTokens(); + } + await this.drainSessionEnd(); + } + + if (this.ws) { + this.ws.close(); + } + + const result = { text: this.getFullTranscript() }; + this.cleanup(); + this.isDisconnecting = false; + return result; + } + + drainFinalTokens() { + return new Promise((resolve) => { + const prevOnFinal = this.onFinalTranscript; + + const tid = setTimeout(() => { + debugLogger.debug("Soniox finalize timeout, using accumulated text"); + this.onFinalTranscript = prevOnFinal; + resolve(); + }, DISCONNECT_TIMEOUT_MS); + + this.onFinalTranscript = (text) => { + clearTimeout(tid); + this.onFinalTranscript = prevOnFinal; + prevOnFinal?.(text); + resolve(); + }; + + try { + this.ws.send(JSON.stringify({ type: "finalize" })); + } catch { + clearTimeout(tid); + this.onFinalTranscript = prevOnFinal; + resolve(); + } + }); + } + + drainSessionEnd() { + return new Promise((resolve) => { + const prevOnSessionEnd = this.onSessionEnd; + + const tid = setTimeout(() => { + debugLogger.debug("Soniox session end timeout, closing"); + this.onSessionEnd = prevOnSessionEnd; + resolve(); + }, DISCONNECT_TIMEOUT_MS); + + this.onSessionEnd = (result) => { + clearTimeout(tid); + this.onSessionEnd = prevOnSessionEnd; + prevOnSessionEnd?.(result); + resolve(); + }; + + try { + this.ws.send(""); + } catch { + clearTimeout(tid); + this.onSessionEnd = prevOnSessionEnd; + resolve(); + } + }); + } + + cleanup() { + this.stopKeepAlive(); + clearTimeout(this.connectionTimeout); + this.connectionTimeout = null; + + if (this.ws) { + try { + this.ws.close(); + } catch (err) { + // ignore + } + this.ws = null; + } + + this.isConnected = false; + } +} + +module.exports = SonioxStreaming; +module.exports.removeFillers = removeFillers; diff --git a/src/hooks/useSettings.ts b/src/hooks/useSettings.ts index 56d0c62f..07c255db 100644 --- a/src/hooks/useSettings.ts +++ b/src/hooks/useSettings.ts @@ -14,6 +14,7 @@ export interface TranscriptionSettings { allowLocalFallback: boolean; fallbackWhisperModel: string; preferredLanguage: string; + sonioxSecondaryLanguage: string; cloudTranscriptionProvider: string; cloudTranscriptionModel: string; cloudTranscriptionBaseUrl?: string; @@ -48,6 +49,7 @@ export interface ApiKeySettings { mistralApiKey: string; customTranscriptionApiKey: string; customReasoningApiKey: string; + sonioxApiKey: string; } export interface PrivacySettings { @@ -170,6 +172,8 @@ function useSettingsInternal() { allowLocalFallback: store.allowLocalFallback, fallbackWhisperModel: store.fallbackWhisperModel, preferredLanguage: store.preferredLanguage, + sonioxSecondaryLanguage: store.sonioxSecondaryLanguage, + setSonioxSecondaryLanguage: store.setSonioxSecondaryLanguage, cloudTranscriptionProvider: store.cloudTranscriptionProvider, cloudTranscriptionModel: store.cloudTranscriptionModel, cloudTranscriptionBaseUrl: store.cloudTranscriptionBaseUrl, @@ -213,6 +217,8 @@ function useSettingsInternal() { setGeminiApiKey: store.setGeminiApiKey, setGroqApiKey: store.setGroqApiKey, setMistralApiKey: store.setMistralApiKey, + sonioxApiKey: store.sonioxApiKey, + setSonioxApiKey: store.setSonioxApiKey, customTranscriptionApiKey: store.customTranscriptionApiKey, setCustomTranscriptionApiKey: store.setCustomTranscriptionApiKey, customReasoningApiKey: store.customReasoningApiKey, diff --git a/src/locales/de/translation.json b/src/locales/de/translation.json index 3bb3a6dd..e9176f48 100644 --- a/src/locales/de/translation.json +++ b/src/locales/de/translation.json @@ -131,7 +131,9 @@ "private": "Privat", "tap": "Tippen", "close": "Schließen", - "dismiss": "Verwerfen" + "dismiss": "Verwerfen", + "none": "Keine", + "secondaryLanguage": "Zweitsprache" }, "onboarding": { "steps": { @@ -1341,7 +1343,8 @@ "openai_whisper_1": "Originales Whisper-Modell", "groq_whisper_large_v3": "Hochpräzise Spracherkennung", "groq_whisper_large_v3_turbo": "216x Echtzeitgeschwindigkeit", - "mistral_voxtral_mini_latest": "Schnelle mehrsprachige Transkription" + "mistral_voxtral_mini_latest": "Schnelle mehrsprachige Transkription", + "soniox_stt_rt_v4": "Echtzeit-Spracherkennung mit hoher Genauigkeit" }, "cloud": { "openai_gpt_5_2": "Neuestes Flaggschiff-Modell für Reasoning", diff --git a/src/locales/en/translation.json b/src/locales/en/translation.json index 9cfa4475..e69b3ac1 100644 --- a/src/locales/en/translation.json +++ b/src/locales/en/translation.json @@ -131,7 +131,9 @@ "private": "Private", "tap": "Tap", "close": "Close", - "dismiss": "Dismiss" + "dismiss": "Dismiss", + "none": "None", + "secondaryLanguage": "Secondary language" }, "onboarding": { "steps": { @@ -1411,7 +1413,8 @@ "openai_whisper_1": "Original Whisper model", "groq_whisper_large_v3": "High accuracy speech recognition", "groq_whisper_large_v3_turbo": "216x real-time speed", - "mistral_voxtral_mini_latest": "Fast multilingual transcription" + "mistral_voxtral_mini_latest": "Fast multilingual transcription", + "soniox_stt_rt_v4": "Real-time streaming speech-to-text with high accuracy" }, "cloud": { "openai_gpt_5_2": "Latest flagship reasoning model", diff --git a/src/locales/es/translation.json b/src/locales/es/translation.json index 65c8f091..f991a894 100644 --- a/src/locales/es/translation.json +++ b/src/locales/es/translation.json @@ -131,7 +131,9 @@ "private": "Privado", "tap": "Pulsar", "close": "Cerrar", - "dismiss": "Descartar" + "dismiss": "Descartar", + "none": "Ninguno", + "secondaryLanguage": "Idioma secundario" }, "onboarding": { "steps": { @@ -1341,7 +1343,8 @@ "openai_whisper_1": "Modelo Whisper original", "groq_whisper_large_v3": "Reconocimiento de voz de alta precisión", "groq_whisper_large_v3_turbo": "Velocidad 216x en tiempo real", - "mistral_voxtral_mini_latest": "Transcripción multilingüe rápida" + "mistral_voxtral_mini_latest": "Transcripción multilingüe rápida", + "soniox_stt_rt_v4": "Transcripción de voz en tiempo real con alta precisión" }, "cloud": { "openai_gpt_5_2": "Modelo insignia de razonamiento más reciente", diff --git a/src/locales/fr/translation.json b/src/locales/fr/translation.json index 50b0ef89..6c95989f 100644 --- a/src/locales/fr/translation.json +++ b/src/locales/fr/translation.json @@ -131,7 +131,9 @@ "private": "Privé", "tap": "Appui", "close": "Fermer", - "dismiss": "Ignorer" + "dismiss": "Ignorer", + "none": "Aucun", + "secondaryLanguage": "Langue secondaire" }, "onboarding": { "steps": { @@ -1341,7 +1343,8 @@ "openai_whisper_1": "Modèle Whisper original", "groq_whisper_large_v3": "Reconnaissance vocale haute précision", "groq_whisper_large_v3_turbo": "Vitesse 216x en temps réel", - "mistral_voxtral_mini_latest": "Transcription multilingue rapide" + "mistral_voxtral_mini_latest": "Transcription multilingue rapide", + "soniox_stt_rt_v4": "Transcription vocale en temps réel avec une grande précision" }, "cloud": { "openai_gpt_5_2": "Dernier modèle phare pour le raisonnement", diff --git a/src/locales/it/translation.json b/src/locales/it/translation.json index 5577e040..fb904976 100644 --- a/src/locales/it/translation.json +++ b/src/locales/it/translation.json @@ -131,7 +131,9 @@ "private": "Privato", "tap": "Tocca", "close": "Chiudi", - "dismiss": "Ignora" + "dismiss": "Ignora", + "none": "Nessuno", + "secondaryLanguage": "Lingua secondaria" }, "onboarding": { "steps": { @@ -1341,7 +1343,8 @@ "openai_whisper_1": "Modello Whisper originale", "groq_whisper_large_v3_turbo": "Velocità 216x in tempo reale", "groq_whisper_large_v3": "Modello Large v3, veloce e preciso", - "mistral_voxtral_mini_latest": "Trascrizione multilingue veloce" + "mistral_voxtral_mini_latest": "Trascrizione multilingue veloce", + "soniox_stt_rt_v4": "Trascrizione vocale in tempo reale con elevata precisione" }, "cloud": { "openai_gpt_5_2": "Ultimo modello di punta per il ragionamento", diff --git a/src/locales/ja/translation.json b/src/locales/ja/translation.json index 6584d4fe..ddc87729 100644 --- a/src/locales/ja/translation.json +++ b/src/locales/ja/translation.json @@ -131,7 +131,9 @@ "private": "プライベート", "tap": "タップ", "close": "閉じる", - "dismiss": "閉じる" + "dismiss": "閉じる", + "none": "なし", + "secondaryLanguage": "補助言語" }, "onboarding": { "steps": { @@ -1341,7 +1343,8 @@ "openai_whisper_1": "オリジナル Whisper モデル", "groq_whisper_large_v3_turbo": "リアルタイムの 216 倍速", "groq_whisper_large_v3": "Large v3 モデル、高速かつ高精度", - "mistral_voxtral_mini_latest": "高速多言語文字起こし" + "mistral_voxtral_mini_latest": "高速多言語文字起こし", + "soniox_stt_rt_v4": "高精度リアルタイムストリーミング音声認識" }, "cloud": { "openai_gpt_5_2": "最新のフラッグシップ推論モデル", diff --git a/src/locales/pt/translation.json b/src/locales/pt/translation.json index d49d8e62..c745dbec 100644 --- a/src/locales/pt/translation.json +++ b/src/locales/pt/translation.json @@ -103,7 +103,9 @@ "private": "Privado", "tap": "Toque", "close": "Fechar", - "dismiss": "Dispensar" + "dismiss": "Dispensar", + "none": "Nenhum", + "secondaryLanguage": "Idioma secundário" }, "onboarding": { "steps": { @@ -1313,7 +1315,8 @@ "openai_whisper_1": "Modelo Whisper original", "groq_whisper_large_v3_turbo": "Velocidade 216x em tempo real", "groq_whisper_large_v3": "Modelo Large v3, rápido e preciso", - "mistral_voxtral_mini_latest": "Transcrição multilíngue rápida" + "mistral_voxtral_mini_latest": "Transcrição multilíngue rápida", + "soniox_stt_rt_v4": "Transcrição de voz em tempo real com alta precisão" }, "cloud": { "openai_gpt_5_2": "Modelo principal de raciocínio mais recente", diff --git a/src/locales/ru/translation.json b/src/locales/ru/translation.json index 0f98f327..27c8fa88 100644 --- a/src/locales/ru/translation.json +++ b/src/locales/ru/translation.json @@ -131,7 +131,9 @@ "private": "Приватный", "tap": "Нажатие", "close": "Закрыть", - "dismiss": "Скрыть" + "dismiss": "Скрыть", + "none": "Нет", + "secondaryLanguage": "Второй язык" }, "onboarding": { "steps": { @@ -1341,7 +1343,8 @@ "openai_whisper_1": "Оригинальная модель Whisper", "groq_whisper_large_v3_turbo": "Скорость в 216 раз быстрее реального времени", "groq_whisper_large_v3": "Модель Large v3, быстрая и точная", - "mistral_voxtral_mini_latest": "Быстрая многоязычная транскрипция" + "mistral_voxtral_mini_latest": "Быстрая многоязычная транскрипция", + "soniox_stt_rt_v4": "Распознавание речи в реальном времени с высокой точностью" }, "cloud": { "openai_gpt_5_2": "Новейшая флагманская модель с рассуждением", diff --git a/src/locales/zh-CN/translation.json b/src/locales/zh-CN/translation.json index b12e1c62..c550ef13 100644 --- a/src/locales/zh-CN/translation.json +++ b/src/locales/zh-CN/translation.json @@ -131,7 +131,9 @@ "private": "隐私", "tap": "点按", "close": "关闭", - "dismiss": "忽略" + "dismiss": "忽略", + "none": "无", + "secondaryLanguage": "辅助语言" }, "onboarding": { "steps": { @@ -1341,7 +1343,8 @@ "openai_whisper_1": "原始 Whisper 模型", "groq_whisper_large_v3_turbo": "216 倍实时速度", "groq_whisper_large_v3": "Large v3 模型,快速且精准", - "mistral_voxtral_mini_latest": "快速多语言转录" + "mistral_voxtral_mini_latest": "快速多语言转录", + "soniox_stt_rt_v4": "高精度实时流式语音转文字" }, "cloud": { "openai_gpt_5_2": "最新旗舰推理模型", diff --git a/src/locales/zh-TW/translation.json b/src/locales/zh-TW/translation.json index 80c45443..993add9a 100644 --- a/src/locales/zh-TW/translation.json +++ b/src/locales/zh-TW/translation.json @@ -131,7 +131,9 @@ "private": "私密", "tap": "點按", "close": "關閉", - "dismiss": "忽略" + "dismiss": "忽略", + "none": "無", + "secondaryLanguage": "輔助語言" }, "onboarding": { "steps": { @@ -1341,7 +1343,8 @@ "openai_whisper_1": "原始 Whisper 模型", "groq_whisper_large_v3_turbo": "216 倍即時速度", "groq_whisper_large_v3": "Large v3 模型,快速且精準", - "mistral_voxtral_mini_latest": "快速多語言轉錄" + "mistral_voxtral_mini_latest": "快速多語言轉錄", + "soniox_stt_rt_v4": "高精度即時串流語音轉文字" }, "cloud": { "openai_gpt_5_2": "最新旗艦推理模型", diff --git a/src/models/modelRegistryData.json b/src/models/modelRegistryData.json index 96fc97b1..b68d3c6c 100644 --- a/src/models/modelRegistryData.json +++ b/src/models/modelRegistryData.json @@ -159,6 +159,19 @@ "descriptionKey": "models.descriptions.transcription.mistral_voxtral_mini_latest" } ] + }, + { + "id": "soniox", + "name": "Soniox", + "baseUrl": "wss://stt-rt.soniox.com", + "models": [ + { + "id": "stt-rt-v4", + "name": "Soniox STT RT v4", + "description": "Real-time streaming speech-to-text with high accuracy", + "descriptionKey": "models.descriptions.transcription.soniox_stt_rt_v4" + } + ] } ], "cloudProviders": [ diff --git a/src/stores/settingsStore.ts b/src/stores/settingsStore.ts index bfb84313..bd9b9b79 100644 --- a/src/stores/settingsStore.ts +++ b/src/stores/settingsStore.ts @@ -113,6 +113,7 @@ export interface SettingsState setAllowLocalFallback: (value: boolean) => void; setFallbackWhisperModel: (value: string) => void; setPreferredLanguage: (value: string) => void; + setSonioxSecondaryLanguage: (value: string) => void; setCloudTranscriptionProvider: (value: string) => void; setCloudTranscriptionModel: (value: string) => void; setCloudTranscriptionBaseUrl: (value: string) => void; @@ -131,6 +132,7 @@ export interface SettingsState setGeminiApiKey: (key: string) => void; setGroqApiKey: (key: string) => void; setMistralApiKey: (key: string) => void; + setSonioxApiKey: (key: string) => void; setCustomTranscriptionApiKey: (key: string) => void; setCustomReasoningApiKey: (key: string) => void; @@ -228,6 +230,7 @@ export const useSettingsStore = create()((set, get) => ({ allowLocalFallback: readBoolean("allowLocalFallback", false), fallbackWhisperModel: readString("fallbackWhisperModel", "base"), preferredLanguage: readString("preferredLanguage", "auto"), + sonioxSecondaryLanguage: readString("sonioxSecondaryLanguage", ""), cloudTranscriptionProvider: readString("cloudTranscriptionProvider", "openai"), cloudTranscriptionModel: readString("cloudTranscriptionModel", "gpt-4o-mini-transcribe"), cloudTranscriptionBaseUrl: readString( @@ -252,6 +255,7 @@ export const useSettingsStore = create()((set, get) => ({ geminiApiKey: readString("geminiApiKey", ""), groqApiKey: readString("groqApiKey", ""), mistralApiKey: readString("mistralApiKey", ""), + sonioxApiKey: readString("sonioxApiKey", ""), customTranscriptionApiKey: readString("customTranscriptionApiKey", ""), customReasoningApiKey: readString("customReasoningApiKey", ""), @@ -323,6 +327,7 @@ export const useSettingsStore = create()((set, get) => ({ setAllowLocalFallback: createBooleanSetter("allowLocalFallback"), setFallbackWhisperModel: createStringSetter("fallbackWhisperModel"), setPreferredLanguage: createStringSetter("preferredLanguage"), + setSonioxSecondaryLanguage: createStringSetter("sonioxSecondaryLanguage"), setCloudTranscriptionProvider: createStringSetter("cloudTranscriptionProvider"), setCloudTranscriptionModel: createStringSetter("cloudTranscriptionModel"), setCloudTranscriptionBaseUrl: createStringSetter("cloudTranscriptionBaseUrl"), @@ -392,6 +397,12 @@ export const useSettingsStore = create()((set, get) => ({ window.electronAPI?.saveMistralKey?.(key); invalidateApiKeyCaches("mistral"); }, + setSonioxApiKey: (key: string) => { + if (isBrowser) localStorage.setItem("sonioxApiKey", key); + set({ sonioxApiKey: key }); + window.electronAPI?.saveSonioxKey?.(key); + invalidateApiKeyCaches(); + }, setCustomTranscriptionApiKey: (key: string) => { if (isBrowser) localStorage.setItem("customTranscriptionApiKey", key); set({ customTranscriptionApiKey: key }); @@ -546,6 +557,7 @@ export const useSettingsStore = create()((set, get) => ({ if (keys.geminiApiKey !== undefined) s.setGeminiApiKey(keys.geminiApiKey); if (keys.groqApiKey !== undefined) s.setGroqApiKey(keys.groqApiKey); if (keys.mistralApiKey !== undefined) s.setMistralApiKey(keys.mistralApiKey); + if (keys.sonioxApiKey !== undefined) s.setSonioxApiKey(keys.sonioxApiKey); if (keys.customTranscriptionApiKey !== undefined) s.setCustomTranscriptionApiKey(keys.customTranscriptionApiKey); if (keys.customReasoningApiKey !== undefined) @@ -632,6 +644,10 @@ export async function initializeSettings(): Promise { const envKey = await window.electronAPI.getMistralKey?.(); if (envKey) createStringSetter("mistralApiKey")(envKey); } + if (!state.sonioxApiKey) { + const envKey = await window.electronAPI.getSonioxKey?.(); + if (envKey) createStringSetter("sonioxApiKey")(envKey); + } if (!state.customTranscriptionApiKey) { const envKey = await window.electronAPI.getCustomTranscriptionKey?.(); if (envKey) createStringSetter("customTranscriptionApiKey")(envKey); diff --git a/src/types/electron.ts b/src/types/electron.ts index 4acdfd90..59db3c0e 100644 --- a/src/types/electron.ts +++ b/src/types/electron.ts @@ -677,6 +677,10 @@ declare global { // Mistral API key management getMistralKey: () => Promise; saveMistralKey: (key: string) => Promise; + + // Soniox API key management + getSonioxKey?: () => Promise; + saveSonioxKey?: (key: string) => Promise; proxyMistralTranscription: (data: { audioBuffer: ArrayBuffer; model?: string; @@ -1146,6 +1150,28 @@ declare global { onDictationRealtimeError?: (callback: (error: string) => void) => () => void; onDictationRealtimeSessionEnd?: (callback: (data: { text: string }) => void) => () => void; + // Soniox streaming + sonioxStreamingWarmup?: (options?: { + apiKey?: string; + model?: string; + language?: string; + }) => Promise<{ success: boolean; error?: string }>; + sonioxStreamingStart?: (options?: { + apiKey?: string; + model?: string; + language?: string; + }) => Promise<{ success: boolean; error?: string }>; + sonioxStreamingSend?: (audioBuffer: ArrayBuffer) => void; + sonioxStreamingFinalize?: () => void; + sonioxStreamingStop?: () => Promise<{ success: boolean; text?: string }>; + sonioxStreamingStatus?: () => Promise<{ connected: boolean }>; + onSonioxPartialTranscript?: (callback: (text: string) => void) => () => void; + onSonioxFinalTranscript?: (callback: (text: string) => void) => () => void; + onSonioxError?: (callback: (error: string) => void) => () => void; + onSonioxSessionEnd?: ( + callback: (data: { text?: string }) => void + ) => () => void; + // Desktop audio capture getDesktopSources?: (types: string[]) => Promise>; diff --git a/src/utils/byokDetection.ts b/src/utils/byokDetection.ts index a7b989d5..bce4e74e 100644 --- a/src/utils/byokDetection.ts +++ b/src/utils/byokDetection.ts @@ -3,5 +3,6 @@ export const hasStoredByokKey = () => localStorage.getItem("openaiApiKey") || localStorage.getItem("groqApiKey") || localStorage.getItem("mistralApiKey") || + localStorage.getItem("sonioxApiKey") || localStorage.getItem("customTranscriptionApiKey") ); diff --git a/src/utils/providerIcons.ts b/src/utils/providerIcons.ts index 3be07a38..da742a0c 100644 --- a/src/utils/providerIcons.ts +++ b/src/utils/providerIcons.ts @@ -8,6 +8,7 @@ import groqIcon from "@/assets/icons/providers/groq.svg"; import nvidiaIcon from "@/assets/icons/providers/nvidia.svg"; import openaiOssIcon from "@/assets/icons/providers/openai-oss.svg"; import gemmaIcon from "@/assets/icons/providers/gemma.svg"; +import sonioxIcon from "@/assets/icons/providers/soniox.svg"; export const PROVIDER_ICONS: Record = { openai: openaiIcon, @@ -21,6 +22,7 @@ export const PROVIDER_ICONS: Record = { nvidia: nvidiaIcon, "openai-oss": openaiOssIcon, gemma: gemmaIcon, + soniox: sonioxIcon, }; export function getProviderIcon(provider: string): string | undefined { diff --git a/tests/helpers/sonioxStreaming.test.js b/tests/helpers/sonioxStreaming.test.js new file mode 100644 index 00000000..49ef70d2 --- /dev/null +++ b/tests/helpers/sonioxStreaming.test.js @@ -0,0 +1,154 @@ +const { describe, it } = require("node:test"); +const assert = require("node:assert/strict"); +const { removeFillers } = require("../../src/helpers/sonioxStreaming"); + +describe("removeFillers", () => { + it("passes through normal text unchanged", () => { + assert.equal(removeFillers("Hello world."), "Hello world."); + }); + + it("removes filler mid-sentence", () => { + assert.equal(removeFillers("I uh think so"), "I think so"); + }); + + it("removes filler with trailing comma mid-sentence", () => { + assert.equal(removeFillers("I, um, think so"), "I think so"); + }); + + it("removes filler after period and capitalizes next word", () => { + assert.equal( + removeFillers("done. Yyy, let me check"), + "done. Let me check" + ); + }); + + it("removes filler after question mark and capitalizes", () => { + assert.equal( + removeFillers("right? Mmm, or maybe not"), + "right? Or maybe not" + ); + }); + + it("removes filler after exclamation mark and capitalizes", () => { + assert.equal( + removeFillers("wow! Um, that was great"), + "wow! That was great" + ); + }); + + it("removes standalone filler sentence (Hmm.)", () => { + assert.equal( + removeFillers("really? Hmm. Maybe so."), + "really? Maybe so." + ); + }); + + it("removes multiple fillers in one text", () => { + assert.equal( + removeFillers("OK so let's try. Yyy, does it work? Mmm, or not? Eee, let me check again."), + "OK so let's try. Does it work? Or not? Let me check again." + ); + }); + + it("removes filler at start of text and capitalizes", () => { + assert.equal(removeFillers("Uh, so anyway"), "So anyway"); + }); + + it("removes filler at end of text", () => { + assert.equal(removeFillers("That's all um"), "That's all"); + }); + + it("removes consecutive fillers", () => { + assert.equal(removeFillers("Well uh um ok"), "Well ok"); + }); + + it("handles text with only fillers", () => { + assert.equal(removeFillers("Uh um mmm"), ""); + }); + + it("handles empty string", () => { + assert.equal(removeFillers(""), ""); + }); + + it("is case-insensitive", () => { + assert.equal(removeFillers("So UH yeah"), "So yeah"); + assert.equal(removeFillers("So UHH yeah"), "So yeah"); + assert.equal(removeFillers("So YYY yeah"), "So yeah"); + }); + + it("handles filler variations with repeated letters", () => { + assert.equal(removeFillers("So uhhh yeah"), "So yeah"); + assert.equal(removeFillers("So ummm yeah"), "So yeah"); + assert.equal(removeFillers("So hmmm yeah"), "So yeah"); + assert.equal(removeFillers("So eeeee yeah"), "So yeah"); + assert.equal(removeFillers("So yyyy yeah"), "So yeah"); + }); + + // False positive protection: real words must NOT be removed + + it("preserves real words containing filler substrings", () => { + assert.equal(removeFillers("The umbrella is here."), "The umbrella is here."); + assert.equal(removeFillers("She is human."), "She is human."); + assert.equal(removeFillers("Check the ohms."), "Check the ohms."); + assert.equal(removeFillers("It is yummy."), "It is yummy."); + assert.equal(removeFillers("Hot summer day."), "Hot summer day."); + }); + + it("preserves 'Oh' as a real exclamation", () => { + assert.equal(removeFillers("Oh really?"), "Oh really?"); + assert.equal(removeFillers("Oh, that is nice."), "Oh, that is nice."); + assert.equal(removeFillers("oh no!"), "oh no!"); + assert.equal(removeFillers("Hello. Oh, nice!"), "Hello. Oh, nice!"); + }); + + it("preserves 'Ah' as a real exclamation", () => { + assert.equal(removeFillers("Ah, I see."), "Ah, I see."); + assert.equal(removeFillers("done. Ah, great."), "done. Ah, great."); + }); + + it("preserves short tokens like 'ee' and 'hm'", () => { + assert.equal(removeFillers("I see ee in the code"), "I see ee in the code"); + assert.equal(removeFillers("Hm, interesting."), "Hm, interesting."); + }); + + // Unicode capitalization + + it("capitalizes Unicode letters after filler at sentence boundary", () => { + assert.equal( + removeFillers("tak. Yyy, ćwiczenie drugie. Eee, ósmy punkt. Mmm, świetnie"), + "tak. Ćwiczenie drugie. Ósmy punkt. Świetnie" + ); + }); + + it("capitalizes accented Latin letters after filler", () => { + assert.equal( + removeFillers("bien. Um, él sabe"), + "bien. Él sabe" + ); + }); + + it("capitalizes Cyrillic letters after filler", () => { + assert.equal( + removeFillers("done. Uhh, это работает"), + "done. Это работает" + ); + }); + + it("does not capitalize mid-sentence after filler removal", () => { + assert.equal(removeFillers("I uh think so"), "I think so"); + assert.equal(removeFillers("let's um try"), "let's try"); + }); + + it("does not capitalize first letter when no leading filler was removed", () => { + assert.equal(removeFillers("iPhone is great"), "iPhone is great"); + }); + + // Realistic Soniox output + + it("handles realistic Soniox output with sub-word assembled fillers", () => { + assert.equal( + removeFillers("No dobra, to robimy test. Yyy, w takim razie: czy to będzie działać? Hmm. A może jednak nie będzie działać? Hmm. Ciekawe, czy to zadziała."), + "No dobra, to robimy test. W takim razie: czy to będzie działać? A może jednak nie będzie działać? Ciekawe, czy to zadziała." + ); + }); +});