diff --git a/crates/engine/src/oneshot.rs b/crates/engine/src/oneshot.rs index 395fa0c1..19e1bf1a 100644 --- a/crates/engine/src/oneshot.rs +++ b/crates/engine/src/oneshot.rs @@ -384,6 +384,11 @@ impl Engine { for _ in 0..max_steps { steps += 1; if let Some(def) = definition.nodes.get(cursor) { + // Skip synthetic oneshot nodes — they are not in the + // registry and are handled separately by the engine. + if def.kind == "streamkit::http_input" || def.kind == "streamkit::http_output" { + break; + } let temp = registry.create_node(&def.kind, def.params.as_ref())?; if let Some(ct) = temp.content_type() { found = Some(ct); diff --git a/docs/src/content/docs/reference/plugins/index.md b/docs/src/content/docs/reference/plugins/index.md index db0cd6a6..824150f3 100644 --- a/docs/src/content/docs/reference/plugins/index.md +++ b/docs/src/content/docs/reference/plugins/index.md @@ -14,12 +14,13 @@ curl http://localhost:4545/api/v1/plugins curl http://localhost:4545/api/v1/schema/nodes | jq '.[] | select(.kind | startswith("plugin::"))' ``` -## Official plugins (10) +## Official plugins (11) - [`plugin::native::helsinki`](./plugin-native-helsinki/) (original kind: `helsinki`) - [`plugin::native::kokoro`](./plugin-native-kokoro/) (original kind: `kokoro`) - [`plugin::native::matcha`](./plugin-native-matcha/) (original kind: `matcha`) - [`plugin::native::nllb`](./plugin-native-nllb/) (original kind: `nllb`) +- [`plugin::native::parakeet`](./plugin-native-parakeet/) (original kind: `parakeet`) - [`plugin::native::piper`](./plugin-native-piper/) (original kind: `piper`) - [`plugin::native::pocket-tts`](./plugin-native-pocket-tts/) (original kind: `pocket-tts`) - [`plugin::native::sensevoice`](./plugin-native-sensevoice/) (original kind: `sensevoice`) diff --git a/docs/src/content/docs/reference/plugins/plugin-native-parakeet.md b/docs/src/content/docs/reference/plugins/plugin-native-parakeet.md new file mode 100644 index 00000000..571625c4 --- /dev/null +++ 
b/docs/src/content/docs/reference/plugins/plugin-native-parakeet.md @@ -0,0 +1,144 @@ +--- +# SPDX-FileCopyrightText: © 2025 StreamKit Contributors +# SPDX-License-Identifier: MPL-2.0 +title: "plugin::native::parakeet" +description: "Fast speech-to-text transcription using NVIDIA Parakeet TDT, a transducer-based ASR model. Approximately 10x faster than Whisper on consumer hardware with competitive accuracy. Uses sherpa-onnx for inference. Requires 16kHz mono audio input." +--- + +`kind`: `plugin::native::parakeet` (original kind: `parakeet`) + +Fast speech-to-text transcription using NVIDIA Parakeet TDT, a transducer-based ASR model. Approximately 10x faster than Whisper on consumer hardware with competitive accuracy. Uses sherpa-onnx for inference. Requires 16kHz mono audio input. + +Source: `target/plugins/release/libparakeet.so` + +## Categories +- `ml` +- `speech` +- `transcription` + +## Pins +### Inputs +- `in` accepts `RawAudio(AudioFormat { sample_rate: 16000, channels: 1, sample_format: F32 })` (one) + +### Outputs +- `out` produces `Transcription` (broadcast) + +## Parameters +| Name | Type | Required | Default | Description | +| --- | --- | --- | --- | --- | +| `execution_provider` | `string enum[cpu, cuda, tensorrt]` | no | `cpu` | Execution provider (cpu, cuda, tensorrt) | +| `max_segment_duration_secs` | `number` | no | `30.0` | Maximum segment duration before forced transcription (seconds)<br />min: `5`<br />max: `120` | +| `min_silence_duration_ms` | `integer` | no | `700` | Minimum silence duration before transcription (milliseconds)<br />min: `100`<br />max: `5000` | +| `model_dir` | `string` | no | `models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8` | Path to Parakeet TDT model directory (contains encoder.int8.onnx, decoder.int8.onnx, joiner.int8.onnx, tokens.txt). IMPORTANT: Input audio must be 16kHz mono f32. | +| `num_threads` | `integer` | no | `4` | Number of threads for inference<br />min: `1`<br />max: `16` | +| `use_vad` | `boolean` | no | `true` | Enable VAD-based segmentation | +| `vad_model_path` | `string` | no | `models/silero_vad.onnx` | Path to Silero VAD ONNX model file | +| `vad_threshold` | `number` | no | `0.5` | VAD speech probability threshold (0.0-1.0)<br />min: `0`<br />max: `1` | + +## Example Pipeline + +```yaml +# +# skit:input_asset_tags=speech + +name: Speech-to-Text (Parakeet TDT) +description: Fast English speech transcription using NVIDIA Parakeet TDT (~10x faster than Whisper on CPU) +mode: oneshot +steps: + - kind: streamkit::http_input + + - kind: containers::ogg::demuxer + + - kind: audio::opus::decoder + + - kind: audio::resampler + params: + chunk_frames: 960 + output_frame_size: 960 + target_sample_rate: 16000 + + - kind: plugin::native::parakeet + params: + model_dir: models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8 + num_threads: 4 + use_vad: true + vad_model_path: models/silero_vad.onnx + vad_threshold: 0.5 + min_silence_duration_ms: 700 + + - kind: core::json_serialize + params: + pretty: false + newline_delimited: true + + - kind: streamkit::http_output + params: + content_type: application/json +``` + + +<details>
+<summary>Raw JSON Schema</summary> + +```json +{ + "properties": { + "execution_provider": { + "default": "cpu", + "description": "Execution provider (cpu, cuda, tensorrt)", + "enum": [ + "cpu", + "cuda", + "tensorrt" + ], + "type": "string" + }, + "max_segment_duration_secs": { + "default": 30.0, + "description": "Maximum segment duration before forced transcription (seconds)", + "maximum": 120.0, + "minimum": 5.0, + "type": "number" + }, + "min_silence_duration_ms": { + "default": 700, + "description": "Minimum silence duration before transcription (milliseconds)", + "maximum": 5000, + "minimum": 100, + "type": "integer" + }, + "model_dir": { + "default": "models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8", + "description": "Path to Parakeet TDT model directory (contains encoder.int8.onnx, decoder.int8.onnx, joiner.int8.onnx, tokens.txt). IMPORTANT: Input audio must be 16kHz mono f32.", + "type": "string" + }, + "num_threads": { + "default": 4, + "description": "Number of threads for inference", + "maximum": 16, + "minimum": 1, + "type": "integer" + }, + "use_vad": { + "default": true, + "description": "Enable VAD-based segmentation", + "type": "boolean" + }, + "vad_model_path": { + "default": "models/silero_vad.onnx", + "description": "Path to Silero VAD ONNX model file", + "type": "string" + }, + "vad_threshold": { + "default": 0.5, + "description": "VAD speech probability threshold (0.0-1.0)", + "maximum": 1.0, + "minimum": 0.0, + "type": "number" + } + }, + "type": "object" +} +``` + +</details>
diff --git a/justfile b/justfile index 71cddf27..bfb5cbf8 100644 --- a/justfile +++ b/justfile @@ -723,6 +723,39 @@ upload-sensevoice-plugin: build-plugin-native-sensevoice @curl -X POST -F "plugin=@{{plugins_target_dir}}/release/libsensevoice.so" \ http://127.0.0.1:4545/api/v1/plugins +# Build native Parakeet TDT STT plugin +[working-directory: 'plugins/native/parakeet'] +build-plugin-native-parakeet: + @echo "Building native Parakeet TDT STT plugin..." + @CARGO_TARGET_DIR={{plugins_target_dir}} cargo build --release + +# Upload Parakeet plugin to running server +[working-directory: 'plugins/native/parakeet'] +upload-parakeet-plugin: build-plugin-native-parakeet + @echo "Uploading Parakeet plugin to server..." + @curl -X POST -F "plugin=@{{plugins_target_dir}}/release/libparakeet.so" \ + http://127.0.0.1:4545/api/v1/plugins + +# Download Parakeet TDT models (skips files already present; fetches to "<file>.part" with curl --fail and renames on success, so an HTTP error page or an interrupted transfer is never mistaken for a complete model file) +download-parakeet-models: + @echo "Downloading Parakeet TDT models (~660MB)..." + @mkdir -p models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8 + @HF_BASE="https://huggingface.co/streamkit/parakeet-models/resolve/main" && \ + MODEL_DIR="models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8" && \ + for f in encoder.int8.onnx decoder.int8.onnx joiner.int8.onnx tokens.txt; do \ + if [ -f "$MODEL_DIR/$f" ]; then \ + echo "✓ $f already exists"; \ + else \ + echo "Downloading $f..." && \ + curl -fL -o "$MODEL_DIR/$f.part" "$HF_BASE/$f" && mv "$MODEL_DIR/$f.part" "$MODEL_DIR/$f" || exit 1; \ + fi; \ + done && \ + echo "✓ Parakeet TDT models ready at $MODEL_DIR (English)" + +# Setup Parakeet (install dependencies + download models) +setup-parakeet: install-sherpa-onnx download-parakeet-models download-silero-vad + @echo "✓ Parakeet TDT STT setup complete!" + # Download pre-converted NLLB models from Hugging Face download-nllb-models: @echo "Downloading pre-converted NLLB-200 models from Hugging Face..." 
@@ -792,6 +825,9 @@ download-models: download-whisper-models download-silero-vad download-kokoro-mod @echo "Optional: To download Pocket TTS models (gated; requires HF_TOKEN):" @echo " just download-pocket-tts-models" @echo "" + @echo "Optional: To download Parakeet TDT models (~660MB, CC-BY-4.0):" + @echo " just download-parakeet-models" + @echo "" @du -sh models/ # Setup VAD (install dependencies + download models) @@ -979,7 +1015,7 @@ install-plugin name: (build-plugin-native name) fi # Build all native plugin examples -build-plugins-native: build-plugin-native-gain build-plugin-native-whisper build-plugin-native-kokoro build-plugin-native-piper build-plugin-native-matcha build-plugin-native-pocket-tts build-plugin-native-sensevoice build-plugin-native-nllb build-plugin-native-vad build-plugin-native-helsinki build-plugin-native-supertonic build-plugin-native-slint build-plugin-native-aac-encoder +build-plugins-native: build-plugin-native-gain build-plugin-native-whisper build-plugin-native-kokoro build-plugin-native-piper build-plugin-native-matcha build-plugin-native-pocket-tts build-plugin-native-sensevoice build-plugin-native-nllb build-plugin-native-vad build-plugin-native-helsinki build-plugin-native-supertonic build-plugin-native-slint build-plugin-native-aac-encoder build-plugin-native-parakeet ## Combined @@ -1042,7 +1078,7 @@ copy-plugins-native: # Official native plugins (shared target dir). # For most plugins the lib stem matches the plugin id. 
- for name in whisper kokoro piper matcha vad sensevoice nllb helsinki supertonic slint; do + for name in whisper kokoro piper matcha vad sensevoice nllb helsinki supertonic slint parakeet; do copy_plugin "$name" "$name" "$PLUGINS_TARGET" done diff --git a/marketplace/official-plugins.json b/marketplace/official-plugins.json index c6bf7dde..128c63ac 100644 --- a/marketplace/official-plugins.json +++ b/marketplace/official-plugins.json @@ -143,6 +143,59 @@ } ] }, + { + "id": "parakeet", + "name": "Parakeet TDT", + "version": "0.1.0", + "node_kind": "parakeet", + "kind": "native", + "entrypoint": "libparakeet.so", + "artifact": "target/plugins/release/libparakeet.so", + "description": "Fast speech-to-text using NVIDIA Parakeet TDT via sherpa-onnx", + "license": "MPL-2.0", + "homepage": "https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2", + "models": [ + { + "id": "parakeet-tdt-0.6b-v2-int8", + "name": "Parakeet TDT 0.6B v2 (English, INT8)", + "default": true, + "source": "huggingface", + "repo_id": "streamkit/parakeet-models", + "revision": "main", + "files": [ + "encoder.int8.onnx", + "decoder.int8.onnx", + "joiner.int8.onnx", + "tokens.txt" + ], + "expected_size_bytes": 661190513, + "license": "CC-BY-4.0", + "license_url": "https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2", + "file_checksums": { + "encoder.int8.onnx": "a32b12d17bbbc309d0686fbbcc2987b5e9b8333a7da83fa6b089f0a2acd651ab", + "decoder.int8.onnx": "b6bb64963457237b900e496ee9994b59294526439fbcc1fecf705b31a15c6b4e", + "joiner.int8.onnx": "7946164367946e7f9f29a122407c3252b680dbae9a51343eb2488d057c3c43d2", + "tokens.txt": "ec182b70dd42113aff6c5372c75cac58c952443eb22322f57bbd7f53977d497d" + } + }, + { + "id": "silero-vad", + "name": "Silero VAD (v6.2)", + "default": true, + "source": "huggingface", + "repo_id": "streamkit/parakeet-models", + "revision": "main", + "files": [ + "silero_vad.onnx" + ], + "expected_size_bytes": 2327524, + "license": "MIT", + "license_url": 
"https://github.com/snakers4/silero-vad/blob/master/LICENSE", + "sha256": "1a153a22f4509e292a94e67d6f9b85e8deb25b4988682b7e174c65279d8788e3" + } + ], + "repo": "https://github.com/streamer45/streamkit" + }, { "id": "piper", "name": "Piper", diff --git a/plugins/native/parakeet/Cargo.lock b/plugins/native/parakeet/Cargo.lock new file mode 100644 index 00000000..8a3deca7 --- /dev/null +++ b/plugins/native/parakeet/Cargo.lock @@ -0,0 +1,1300 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "base64ct" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + 
+[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "cc" +version = "1.2.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ 
+ "cfg-if", +] + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "der" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b" +dependencies = [ + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "filetime" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +dependencies = [ + "cfg-if", + "libc", + "libredox", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", + "wasip3", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.0", + "serde", + "serde_core", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "libc" +version = "0.2.184" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" + +[[package]] +name = "libredox" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" +dependencies = [ + "bitflags", + "libc", + "plain", + "redox_syscall", +] + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "native-tls" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "ndarray" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "openssl" +version = "0.10.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "openssl-sys" +version = "0.9.112" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "ort" +version = "2.0.0-rc.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa7e49bd669d32d7bc2a15ec540a527e7764aec722a45467814005725bcd721" +dependencies = [ + "ndarray", + "ort-sys", + "smallvec 2.0.0-alpha.10", + "tracing", +] + +[[package]] +name = "ort-sys" +version = "2.0.0-rc.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2aba9f5c7c479925205799216e7e5d07cc1d4fa76ea8058c60a9a30f6a4e890" +dependencies = [ + "flate2", + "pkg-config", + "sha2", + "tar", + "ureq", +] + +[[package]] +name = "parakeet-plugin-native" +version = "0.1.0" +dependencies = [ + "cc", + "ndarray", + "once_cell", + "ort", + "serde", + "serde_json", + "streamkit-plugin-sdk-native", + "tracing", +] + +[[package]] +name = "pem-rfc7468" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9" +dependencies = [ + "base64ct", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "portable-atomic-util" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" +dependencies = [ + "portable-atomic", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "redox_syscall" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a" +dependencies = [ + "bitflags", +] + +[[package]] +name = "ref-cast" +version = "1.0.25" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "zeroize", +] + +[[package]] +name = "schannel" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "schemars" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +dependencies = [ + "dyn-clone", + "ref-cast", + "schemars_derive", + "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn", +] + +[[package]] +name = "security-framework" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" 
+dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "smallvec" +version = "2.0.0-alpha.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d44cfb396c3caf6fbfd0ab422af02631b69ddd96d2eff0b0f0724f9024051b" + +[[package]] +name = "socks" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" +dependencies = [ + "byteorder", + "libc", + "winapi", +] + +[[package]] +name = "streamkit-core" +version = "0.2.0" +dependencies = [ + "async-trait", + "base64", + "bytes", + "schemars", + "serde", + "serde_json", + "smallvec 1.15.1", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "ts-rs", +] + +[[package]] +name = "streamkit-plugin-sdk-native" +version = "0.2.0" +dependencies = [ + "async-trait", + "bytes", + "serde", + "serde_json", + "streamkit-core", + "tracing", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tar" +version = "0.4.45" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +dependencies = [ + "filetime", + "libc", + "xattr", +] + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + +[[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio" +version = "1.51.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c" +dependencies = [ + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", 
+ "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "ts-rs" +version = "12.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "756050066659291d47a554a9f558125db17428b073c5ffce1daf5dcb0f7231d8" +dependencies = [ + "serde_json", + "thiserror", + "ts-rs-macros", +] + +[[package]] +name = "ts-rs-macros" +version = "12.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d90eea51bc7988ef9e674bf80a85ba6804739e535e9cab48e4bb34a8b652aa" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "termcolor", +] + +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + 
+[[package]] +name = "ureq" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" +dependencies = [ + "base64", + "der", + "log", + "native-tls", + "percent-encoding", + "rustls-pki-types", + "socks", + "ureq-proto", + "utf8-zero", + "webpki-root-certs", +] + +[[package]] +name = "ureq-proto" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" +dependencies = [ + "base64", + "http", + "httparse", + "log", +] + +[[package]] +name = "utf8-zero" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + 
+[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "webpki-root-certs" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name 
= "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "xattr" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" +dependencies = [ + "libc", + "rustix", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/plugins/native/parakeet/Cargo.toml b/plugins/native/parakeet/Cargo.toml new file mode 100644 index 00000000..444994bb --- /dev/null +++ b/plugins/native/parakeet/Cargo.toml @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: © 2025 StreamKit Contributors +# +# SPDX-License-Identifier: MPL-2.0 + +[package] +name = "parakeet-plugin-native" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lib] +name = "parakeet" +crate-type = ["cdylib"] + +[dependencies] +streamkit-plugin-sdk-native = { path = "../../../sdks/plugin-sdk/native" } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +tracing = "0.1" + +# For VAD support +ort = "=2.0.0-rc.10" +ndarray = "0.16" + + +[lints.clippy] +# Categories +pedantic = { level = "warn", priority = -1 } +nursery = { level = "warn", priority = -1 } +# Safety +unwrap_used = "warn" +expect_used = "warn" +# Complexity +cognitive_complexity = "warn" +# Math +cast_possible_truncation = "warn" +cast_precision_loss = "warn" +cast_sign_loss = "warn" +# Allow-list (Noise reduction) +module_name_repetitions = "allow" +must_use_candidate = "allow" +doc_markdown = "allow" 
diff --git a/plugins/native/parakeet/README.md b/plugins/native/parakeet/README.md new file mode 100644 index 00000000..d5a81532 --- /dev/null +++ b/plugins/native/parakeet/README.md @@ -0,0 +1,189 @@ + + +# Parakeet TDT STT Native Plugin + +High-performance English speech-to-text plugin for StreamKit using [NVIDIA Parakeet TDT](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2) via sherpa-onnx. + +## Features + +- **Best-in-class Accuracy**: #1 on HuggingFace ASR leaderboard, lower WER than Whisper +- **Fast CPU Inference**: ~10x faster than Whisper Large V3 on CPU (35 min audio in ~18s on Apple Silicon) +- **INT8 Quantized**: 631 MiB (~661 MB) model runs well on consumer hardware +- **VAD-Based Segmentation**: Optional Silero VAD integration for natural speech boundaries +- **GPU Acceleration**: Supports CUDA and TensorRT execution providers +- **Model Caching**: Automatic deduplication across pipeline instances +- **Commercially Permissive**: CC-BY-4.0 license + +## Quick Start + +### 1. Install Dependencies + +```bash +# Install sherpa-onnx shared library +just install-sherpa-onnx + +# Download Parakeet models and Silero VAD (one-time, ~631 MiB) +just setup-parakeet +``` + +### 2. Build Plugin + +```bash +# Build the plugin +just build-plugin-native-parakeet + +# Copy to plugins directory +just copy-plugins-native + +# Or upload to running server +just upload-parakeet-plugin +``` + +### 3. Use in Pipeline + +```yaml +steps: + - kind: streamkit::http_input + - kind: containers::ogg::demuxer + - kind: audio::opus::decoder + - kind: audio::resampler + params: + target_sample_rate: 16000 # Parakeet requires 16kHz + chunk_frames: 960 + - kind: plugin::native::parakeet + params: + use_vad: true # Enable VAD segmentation + num_threads: 4 # CPU threads for inference + - kind: core::json_serialize + - kind: streamkit::http_output +``` + +See `samples/pipelines/oneshot/parakeet-stt.yml` for a complete example.
+ +## Configuration Parameters + +### Model Parameters (Cached) + +These parameters affect model loading and are used for caching: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `model_dir` | string | `models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8` | Path to model directory | +| `num_threads` | integer | `4` | CPU threads for inference (1-16) | +| `execution_provider` | string | `cpu` | ONNX Runtime provider (`cpu`, `cuda`, `tensorrt`) | + +### Processing Parameters (Per-Instance) + +These parameters can differ between instances sharing the same model: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `use_vad` | boolean | `true` | Enable VAD-based segmentation | +| `vad_model_path` | string | `models/silero_vad.onnx` | Path to Silero VAD model | +| `vad_threshold` | number | `0.5` | Speech detection threshold (0.0-1.0) | +| `min_silence_duration_ms` | integer | `700` | Minimum silence before segmenting (ms) | +| `max_segment_duration_secs` | number | `30.0` | Maximum segment duration (seconds) | + +## Audio Requirements + +Parakeet requires audio in the following format: + +- **Sample rate**: 16 kHz (use `audio::resampler` to convert) +- **Channels**: Mono (1 channel) +- **Format**: f32 samples + +## Model Architecture + +Parakeet TDT uses a **Token-and-Duration Transducer** architecture with three ONNX models: + +| File | Size | Description | +|------|------|-------------| +| `encoder.int8.onnx` | ~652 MB | FastConformer encoder (INT8 quantized) | +| `decoder.int8.onnx` | ~7 MB | Prediction network | +| `joiner.int8.onnx` | ~2 MB | Joint network | +| `tokens.txt` | ~9 KB | Token vocabulary | + +This contrasts with single-model architectures (SenseVoice, Whisper) — the transducer approach enables faster streaming-friendly decoding. 
+ +## VAD Segmentation + +VAD (Voice Activity Detection) segments audio into natural speech boundaries: + +**With VAD enabled** (`use_vad: true`, default): +- Detects speech vs. silence using Silero VAD +- Transcribes complete sentences when silence is detected +- Zero chunking artifacts, natural boundaries +- Best for conversational audio and streaming + +**With VAD disabled** (`use_vad: false`): +- Transcribes audio in fixed-duration segments +- Uses `max_segment_duration_secs` for chunking +- Best for continuous speech with minimal pauses + +## Model Caching + +The plugin automatically caches recognizers to avoid redundant model loading: + +**Cache Key**: `(model_dir, num_threads, execution_provider)` + +Multiple pipeline instances using the same model configuration share a single recognizer in memory. + +## Comparison with Other STT Plugins + +| Feature | Parakeet TDT | SenseVoice | Whisper | +|---------|-------------|------------|---------| +| Languages | English only (v2) | 5 languages | 99 languages | +| Model Size | 631 MiB (INT8) | 226 MB (INT8) | 140 MB (base.en-q5_1) | +| CPU Speed | ~10x faster than Whisper | ~5-10x realtime | ~10-15x realtime | +| Accuracy (WER) | Best (#1 HF leaderboard) | Good | Good | +| Architecture | Transducer (enc/dec/joiner) | Single model | Single model | +| License | CC-BY-4.0 | Apache 2.0 | MIT | +| Best For | Fast, accurate English STT | Asian languages | General multilingual | + +## Troubleshooting + +### Plugin fails to load + +``` +Error: Failed to load sherpa-onnx shared library +``` + +**Solution**: Install sherpa-onnx: +```bash +just install-sherpa-onnx +``` + +### Model not found + +``` +Error: model file not found: models/.../encoder.int8.onnx +``` + +**Solution**: Download models: +```bash +just download-parakeet-models +``` + +### Audio format error + +**Solution**: Add `audio::resampler` upstream: +```yaml +- kind: audio::resampler + params: + target_sample_rate: 16000 +``` + +## Model Attribution + +- **Parakeet
TDT Model**: [NVIDIA NeMo](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2) +- **Models**: [streamkit/parakeet-models](https://huggingface.co/streamkit/parakeet-models) (hosted from [csukuangfj/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8](https://huggingface.co/csukuangfj/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8)) +- **Silero VAD**: [snakers4/silero-vad](https://github.com/snakers4/silero-vad) (MIT) +- **License**: CC-BY-4.0 + +## License + +This plugin is licensed under MPL-2.0. The Parakeet TDT model is licensed under CC-BY-4.0. diff --git a/plugins/native/parakeet/build.rs b/plugins/native/parakeet/build.rs new file mode 100644 index 00000000..e567a85c --- /dev/null +++ b/plugins/native/parakeet/build.rs @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: © 2025 StreamKit Contributors +// +// SPDX-License-Identifier: MPL-2.0 + +// Allow: println! in build.rs is the standard way to communicate with Cargo, not logging +#![allow(clippy::disallowed_macros)] + +fn main() { + // Link against libsherpa-onnx-c-api (not libsherpa-onnx) + println!("cargo:rustc-link-lib=sherpa-onnx-c-api"); + + // Common library search paths + println!("cargo:rustc-link-search=native=/usr/local/lib"); + println!("cargo:rustc-link-search=native=/usr/lib"); + println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu"); + println!("cargo:rustc-link-search=native=/opt/homebrew/lib"); + + // Add rpath so the plugin can find sherpa-onnx at runtime + println!("cargo:rustc-link-arg=-Wl,-rpath,/usr/local/lib"); +} diff --git a/plugins/native/parakeet/plugin.yml b/plugins/native/parakeet/plugin.yml new file mode 100644 index 00000000..a13be801 --- /dev/null +++ b/plugins/native/parakeet/plugin.yml @@ -0,0 +1,43 @@ +id: parakeet +name: Parakeet TDT +version: 0.1.0 +node_kind: parakeet +kind: native +entrypoint: libparakeet.so +artifact: target/plugins/release/libparakeet.so +description: Fast speech-to-text using NVIDIA Parakeet TDT via sherpa-onnx +license: MPL-2.0 +homepage: 
https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2 +repo: https://github.com/streamer45/streamkit +models: +- id: parakeet-tdt-0.6b-v2-int8 + name: Parakeet TDT 0.6B v2 (English, INT8) + default: true + source: huggingface + repo_id: streamkit/parakeet-models + revision: main + files: + - encoder.int8.onnx + - decoder.int8.onnx + - joiner.int8.onnx + - tokens.txt + expected_size_bytes: 661190513 + license: CC-BY-4.0 + license_url: https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2 + file_checksums: + encoder.int8.onnx: a32b12d17bbbc309d0686fbbcc2987b5e9b8333a7da83fa6b089f0a2acd651ab + decoder.int8.onnx: b6bb64963457237b900e496ee9994b59294526439fbcc1fecf705b31a15c6b4e + joiner.int8.onnx: 7946164367946e7f9f29a122407c3252b680dbae9a51343eb2488d057c3c43d2 + tokens.txt: ec182b70dd42113aff6c5372c75cac58c952443eb22322f57bbd7f53977d497d +- id: silero-vad + name: Silero VAD (v6.2) + default: true + source: huggingface + repo_id: streamkit/parakeet-models + revision: main + files: + - silero_vad.onnx + expected_size_bytes: 2327524 + license: MIT + license_url: https://github.com/snakers4/silero-vad/blob/master/LICENSE + sha256: 1a153a22f4509e292a94e67d6f9b85e8deb25b4988682b7e174c65279d8788e3 diff --git a/plugins/native/parakeet/src/config.rs b/plugins/native/parakeet/src/config.rs new file mode 100644 index 00000000..23a42bda --- /dev/null +++ b/plugins/native/parakeet/src/config.rs @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: © 2025 StreamKit Contributors +// +// SPDX-License-Identifier: MPL-2.0 + +use serde::{Deserialize, Serialize}; + +/// Configuration for the Parakeet TDT STT plugin +#[derive(Serialize, Deserialize, Clone)] +pub struct ParakeetConfig { + /// Path to the Parakeet TDT model directory (contains encoder, decoder, joiner, tokens) + #[serde(default = "default_model_dir")] + pub model_dir: String, + + /// Number of threads for inference + #[serde(default = "default_num_threads")] + pub num_threads: i32, + + /// Execution provider (cpu, cuda, tensorrt) + 
#[serde(default = "default_execution_provider")] + pub execution_provider: String, + + /// Enable VAD-based segmentation + #[serde(default = "default_use_vad")] + pub use_vad: bool, + + /// Path to Silero VAD model (if use_vad = true) + #[serde(default = "default_vad_model_path")] + pub vad_model_path: String, + + /// VAD speech probability threshold (0.0-1.0) + #[serde(default = "default_vad_threshold")] + pub vad_threshold: f32, + + /// Minimum silence duration before triggering transcription (milliseconds) + #[serde(default = "default_min_silence_duration_ms")] + pub min_silence_duration_ms: u64, + + /// Maximum segment duration before forcing transcription (seconds) + #[serde(default = "default_max_segment_duration_secs")] + pub max_segment_duration_secs: f32, +} + +fn default_model_dir() -> String { + "models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8".to_string() +} + +const fn default_num_threads() -> i32 { + 4 +} + +fn default_execution_provider() -> String { + "cpu".to_string() +} + +const fn default_use_vad() -> bool { + true +} + +fn default_vad_model_path() -> String { + "models/silero_vad.onnx".to_string() +} + +const fn default_vad_threshold() -> f32 { + 0.5 +} + +const fn default_min_silence_duration_ms() -> u64 { + 700 +} + +const fn default_max_segment_duration_secs() -> f32 { + 30.0 +} + +impl Default for ParakeetConfig { + fn default() -> Self { + Self { + model_dir: default_model_dir(), + num_threads: default_num_threads(), + execution_provider: default_execution_provider(), + use_vad: default_use_vad(), + vad_model_path: default_vad_model_path(), + vad_threshold: default_vad_threshold(), + min_silence_duration_ms: default_min_silence_duration_ms(), + max_segment_duration_secs: default_max_segment_duration_secs(), + } + } +} diff --git a/plugins/native/parakeet/src/ffi.rs b/plugins/native/parakeet/src/ffi.rs new file mode 100644 index 00000000..597d466d --- /dev/null +++ b/plugins/native/parakeet/src/ffi.rs @@ -0,0 +1,236 @@ +// 
SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+//
+// SPDX-License-Identifier: MPL-2.0
+
+//! FFI bindings to sherpa-onnx C API for Parakeet TDT transducer
+//! Based on https://github.com/k2-fsa/sherpa-onnx/blob/master/sherpa-onnx/c-api/c-api.h
+//! Struct layout matches sherpa-onnx v1.12.17.
+
+use std::os::raw::{c_char, c_float, c_int};
+
+/// Opaque recognizer handle
+#[repr(C)]
+pub struct SherpaOnnxOfflineRecognizer {
+    _private: [u8; 0],
+}
+
+/// Opaque stream handle
+#[repr(C)]
+pub struct SherpaOnnxOfflineStream {
+    _private: [u8; 0],
+}
+
+/// Recognition result returned by `SherpaOnnxGetOfflineStreamResult`.
+///
+/// The value is handed out by sherpa-onnx as a pointer; read it through that
+/// pointer and release it with `SherpaOnnxDestroyOfflineRecognizerResult`.
+/// Do not construct or copy it by value on the Rust side.
+///
+/// Field order mirrors `SherpaOnnxOfflineRecognizerResult` in c-api.h:
+/// `text`, `timestamps`, `count`, `tokens`, `tokens_arr`, `json`, `lang`,
+/// `emotion`, `event`. Because the struct is `#[repr(C)]`, any drift from the
+/// header makes every field after the first mismatch read the wrong memory.
+// NOTE(review): newer headers may append further trailing fields; confirm
+// against the pinned v1.12.17 header before reading anything past `event`.
+#[repr(C)]
+pub struct SherpaOnnxOfflineRecognizerResult {
+    /// Recognized text as a C string.
+    pub text: *const c_char,
+    /// Per-token timestamps; presumably NULL when the model provides none.
+    pub timestamps: *const c_float,
+    /// Number of entries in `timestamps`.
+    pub count: c_int,
+    /// Token strings stored back to back, each NUL-terminated (per upstream header).
+    pub tokens: *const c_char,
+    /// Pointers to the start of each token inside `tokens` (per upstream header).
+    pub tokens_arr: *const *const c_char,
+    /// JSON rendering of the result.
+    pub json: *const c_char,
+    /// Recognized language, for models that report one.
+    pub lang: *const c_char,
+    /// Recognized emotion, for models that report one.
+    pub emotion: *const c_char,
+    /// Recognized audio event, for models that report one.
+    pub event: *const c_char,
+}
+
+/// Transducer model config (encoder + decoder + joiner)
+/// This is the primary model config used for Parakeet TDT.
+#[repr(C)]
+pub struct SherpaOnnxOfflineTransducerModelConfig {
+    pub encoder: *const c_char,
+    pub decoder: *const c_char,
+    pub joiner: *const c_char,
+}
+
+/// Paraformer model config (unused, but needed for struct layout)
+#[repr(C)]
+pub struct SherpaOnnxOfflineParaformerModelConfig {
+    pub model: *const c_char,
+}
+
+/// NeMo CTC model config (unused, but needed for struct layout)
+#[repr(C)]
+pub struct SherpaOnnxOfflineNemoEncDecCtcModelConfig {
+    pub model: *const c_char,
+}
+
+/// Whisper model config (unused, but needed for struct layout)
+#[repr(C)]
+pub struct SherpaOnnxOfflineWhisperModelConfig {
+    pub encoder: *const c_char,
+    pub decoder: *const c_char,
+    pub language: *const c_char,
+    pub task: *const c_char,
+    pub tail_paddings: c_int,
+}
+
+/// TDnn model config (unused, but needed for struct layout)
+#[repr(C)]
+pub struct SherpaOnnxOfflineTdnnModelConfig {
+    pub model: *const c_char,
+}
+
+/// SenseVoice model config (unused, but needed for struct layout)
+#[repr(C)]
+pub struct SherpaOnnxOfflineSenseVoiceModelConfig { + pub model: *const c_char, + pub language: *const c_char, + pub use_itn: c_int, +} + +/// Moonshine model config (unused, but needed for struct layout) +#[repr(C)] +pub struct SherpaOnnxOfflineMoonshineModelConfig { + pub preprocessor: *const c_char, + pub encoder: *const c_char, + pub uncached_decoder: *const c_char, + pub cached_decoder: *const c_char, +} + +/// FireRedAsr model config (unused, but needed for struct layout) +#[repr(C)] +pub struct SherpaOnnxOfflineFireRedAsrModelConfig { + pub encoder: *const c_char, + pub decoder: *const c_char, +} + +/// Dolphin model config (unused, but needed for struct layout) +#[repr(C)] +pub struct SherpaOnnxOfflineDolphinModelConfig { + pub model: *const c_char, +} + +/// ZipformerCtc model config (unused, but needed for struct layout) +#[repr(C)] +pub struct SherpaOnnxOfflineZipformerCtcModelConfig { + pub model: *const c_char, +} + +/// Canary model config (unused, but needed for struct layout) +#[repr(C)] +pub struct SherpaOnnxOfflineCanaryModelConfig { + pub encoder: *const c_char, + pub decoder: *const c_char, + pub src_lang: *const c_char, + pub tgt_lang: *const c_char, + pub use_pnc: c_int, +} + +/// WenetCtc model config (unused, but needed for struct layout) +#[repr(C)] +pub struct SherpaOnnxOfflineWenetCtcModelConfig { + pub model: *const c_char, +} + +/// OmnilingualAsrCtc model config (unused, but needed for struct layout) +#[repr(C)] +pub struct SherpaOnnxOfflineOmnilingualAsrCtcModelConfig { + pub model: *const c_char, +} + +/// LM configuration +#[repr(C)] +pub struct SherpaOnnxOfflineLMConfig { + pub model: *const c_char, + pub scale: c_float, +} + +/// Overall model configuration +/// Field order must match sherpa-onnx v1.12.17 c-api.h exactly. 
+#[repr(C)] +pub struct SherpaOnnxOfflineModelConfig { + pub transducer: SherpaOnnxOfflineTransducerModelConfig, + pub paraformer: SherpaOnnxOfflineParaformerModelConfig, + pub nemo_ctc: SherpaOnnxOfflineNemoEncDecCtcModelConfig, + pub whisper: SherpaOnnxOfflineWhisperModelConfig, + pub tdnn: SherpaOnnxOfflineTdnnModelConfig, + pub tokens: *const c_char, + pub num_threads: c_int, + pub debug: c_int, + pub provider: *const c_char, + pub model_type: *const c_char, + pub modeling_unit: *const c_char, + pub bpe_vocab: *const c_char, + pub telespeech_ctc: *const c_char, + pub sense_voice: SherpaOnnxOfflineSenseVoiceModelConfig, + pub moonshine: SherpaOnnxOfflineMoonshineModelConfig, + pub fire_red_asr: SherpaOnnxOfflineFireRedAsrModelConfig, + pub dolphin: SherpaOnnxOfflineDolphinModelConfig, + pub zipformer_ctc: SherpaOnnxOfflineZipformerCtcModelConfig, + pub canary: SherpaOnnxOfflineCanaryModelConfig, + pub wenet_ctc: SherpaOnnxOfflineWenetCtcModelConfig, + pub omnilingual: SherpaOnnxOfflineOmnilingualAsrCtcModelConfig, +} + +/// Homophone replacer config (unused, but needed for struct layout) +#[repr(C)] +pub struct SherpaOnnxHomophoneReplacerConfig { + pub dict_dir: *const c_char, + pub lexicon: *const c_char, + pub rule_fsts: *const c_char, +} + +/// Recognizer configuration +#[repr(C)] +pub struct SherpaOnnxOfflineRecognizerConfig { + pub feat_config: SherpaOnnxFeatureConfig, + pub model_config: SherpaOnnxOfflineModelConfig, + pub lm_config: SherpaOnnxOfflineLMConfig, + pub decoding_method: *const c_char, + pub max_active_paths: c_int, + pub hotwords_file: *const c_char, + pub hotwords_score: c_float, + pub rule_fsts: *const c_char, + pub rule_fars: *const c_char, + pub blank_penalty: c_float, + pub hr: SherpaOnnxHomophoneReplacerConfig, +} + +/// Feature extraction configuration +#[repr(C)] +pub struct SherpaOnnxFeatureConfig { + pub sample_rate: c_int, + pub feature_dim: c_int, +} + +extern "C" { + /// Create offline recognizer + pub fn 
SherpaOnnxCreateOfflineRecognizer( + config: *const SherpaOnnxOfflineRecognizerConfig, + ) -> *mut SherpaOnnxOfflineRecognizer; + + /// Destroy offline recognizer + pub fn SherpaOnnxDestroyOfflineRecognizer(recognizer: *mut SherpaOnnxOfflineRecognizer); + + /// Create offline stream + pub fn SherpaOnnxCreateOfflineStream( + recognizer: *const SherpaOnnxOfflineRecognizer, + ) -> *mut SherpaOnnxOfflineStream; + + /// Destroy offline stream + pub fn SherpaOnnxDestroyOfflineStream(stream: *mut SherpaOnnxOfflineStream); + + /// Accept waveform for offline stream + pub fn SherpaOnnxAcceptWaveformOffline( + stream: *mut SherpaOnnxOfflineStream, + sample_rate: c_int, + samples: *const c_float, + n: c_int, + ); + + /// Decode offline stream + pub fn SherpaOnnxDecodeOfflineStream( + recognizer: *mut SherpaOnnxOfflineRecognizer, + stream: *mut SherpaOnnxOfflineStream, + ); + + /// Get recognition result + pub fn SherpaOnnxGetOfflineStreamResult( + stream: *const SherpaOnnxOfflineStream, + ) -> *const SherpaOnnxOfflineRecognizerResult; + + /// Destroy recognition result + pub fn SherpaOnnxDestroyOfflineRecognizerResult( + result: *const SherpaOnnxOfflineRecognizerResult, + ); +} diff --git a/plugins/native/parakeet/src/lib.rs b/plugins/native/parakeet/src/lib.rs new file mode 100644 index 00000000..1799a313 --- /dev/null +++ b/plugins/native/parakeet/src/lib.rs @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: © 2025 StreamKit Contributors +// +// SPDX-License-Identifier: MPL-2.0 + +//! Parakeet TDT STT native plugin for StreamKit +//! +//! Provides fast English speech recognition using NVIDIA's Parakeet TDT +//! transducer model via sherpa-onnx. Approximately 10x faster than Whisper +//! on consumer hardware with competitive accuracy. 
+ +mod config; +mod ffi; +mod parakeet_node; +mod vad; + +use parakeet_node::ParakeetNode; +use streamkit_plugin_sdk_native::prelude::*; + +// Export the plugin entry point +native_plugin_entry!(ParakeetNode); diff --git a/plugins/native/parakeet/src/parakeet_node.rs b/plugins/native/parakeet/src/parakeet_node.rs new file mode 100644 index 00000000..3b24a2a0 --- /dev/null +++ b/plugins/native/parakeet/src/parakeet_node.rs @@ -0,0 +1,638 @@ +// SPDX-FileCopyrightText: © 2025 StreamKit Contributors +// +// SPDX-License-Identifier: MPL-2.0 + +use std::collections::{HashMap, VecDeque}; +use std::ffi::{CStr, CString}; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex}; +use streamkit_plugin_sdk_native::prelude::*; +use streamkit_plugin_sdk_native::streamkit_core::types::{ + AudioFormat, SampleFormat, TranscriptionData, TranscriptionSegment, +}; + +use crate::config::ParakeetConfig; +use crate::ffi; +use crate::vad::SileroVAD; + +/// Wrapper for recognizer pointer with proper cleanup +struct RecognizerWrapper { + recognizer: *mut ffi::SherpaOnnxOfflineRecognizer, +} + +impl RecognizerWrapper { + const fn new(recognizer: *mut ffi::SherpaOnnxOfflineRecognizer) -> Self { + Self { recognizer } + } + + const fn get(&self) -> *mut ffi::SherpaOnnxOfflineRecognizer { + self.recognizer + } +} + +unsafe impl Send for RecognizerWrapper {} +unsafe impl Sync for RecognizerWrapper {} + +impl Drop for RecognizerWrapper { + fn drop(&mut self) { + if !self.recognizer.is_null() { + unsafe { + ffi::SherpaOnnxDestroyOfflineRecognizer(self.recognizer); + } + } + } +} + +/// Cached recognizer +struct CachedRecognizer { + recognizer: Arc, +} + +/// Global cache of recognizers +/// Key: (model_dir, num_threads, execution_provider) +// Allow: Type complexity is acceptable here - composite key for caching recognizers +#[allow(clippy::type_complexity)] +static RECOGNIZER_CACHE: std::sync::LazyLock< + Mutex>, +> = std::sync::LazyLock::new(|| Mutex::new(HashMap::new())); + +pub struct 
ParakeetNode { + config: ParakeetConfig, + recognizer: Arc, + vad: Option, + + // Frame buffering (for VAD) + frame_buffer: VecDeque, + + // Speech segment buffering (for recognition) + speech_buffer: VecDeque, + segment_start_time_ms: u64, + + // Silence tracking + silence_frame_count: usize, + silence_threshold_frames: usize, + + // Time tracking + absolute_time_ms: u64, + + logger: Logger, +} + +// SAFETY: We ensure thread-safety through Arc +unsafe impl Send for ParakeetNode {} +unsafe impl Sync for ParakeetNode {} + +impl NativeProcessorNode for ParakeetNode { + fn metadata() -> NodeMetadata { + NodeMetadata::builder("parakeet") + .description( + "Fast speech-to-text transcription using NVIDIA Parakeet TDT, a transducer-based \ + ASR model. Approximately 10x faster than Whisper on consumer hardware with \ + competitive accuracy. Uses sherpa-onnx for inference. \ + Requires 16kHz mono audio input.", + ) + .input( + "in", + &[PacketType::RawAudio(AudioFormat { + sample_rate: 16000, // Requires 16kHz + channels: 1, // Requires mono + sample_format: SampleFormat::F32, + })], + ) + .output("out", PacketType::Transcription) + .param_schema(serde_json::json!({ + "type": "object", + "properties": { + "model_dir": { + "type": "string", + "description": "Path to Parakeet TDT model directory (contains encoder.int8.onnx, decoder.int8.onnx, joiner.int8.onnx, tokens.txt). 
IMPORTANT: Input audio must be 16kHz mono f32.", + "default": "models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8" + }, + "num_threads": { + "type": "integer", + "description": "Number of threads for inference", + "default": 4, + "minimum": 1, + "maximum": 16 + }, + "execution_provider": { + "type": "string", + "description": "Execution provider (cpu, cuda, tensorrt)", + "default": "cpu", + "enum": ["cpu", "cuda", "tensorrt"] + }, + "use_vad": { + "type": "boolean", + "description": "Enable VAD-based segmentation", + "default": true + }, + "vad_model_path": { + "type": "string", + "description": "Path to Silero VAD ONNX model file", + "default": "models/silero_vad.onnx" + }, + "vad_threshold": { + "type": "number", + "description": "VAD speech probability threshold (0.0-1.0)", + "default": 0.5, + "minimum": 0.0, + "maximum": 1.0 + }, + "min_silence_duration_ms": { + "type": "integer", + "description": "Minimum silence duration before transcription (milliseconds)", + "default": 700, + "minimum": 100, + "maximum": 5000 + }, + "max_segment_duration_secs": { + "type": "number", + "description": "Maximum segment duration before forced transcription (seconds)", + "default": 30.0, + "minimum": 5.0, + "maximum": 120.0 + } + } + })) + .category("ml") + .category("speech") + .category("transcription") + .build() + } + + fn new(params: Option, logger: Logger) -> Result { + plugin_info!(logger, "ParakeetNode::new() called"); + + let config: ParakeetConfig = if let Some(p) = params { + serde_json::from_value(p).map_err(|e| format!("Config parse error: {e}"))? + } else { + ParakeetConfig::default() + }; + + plugin_info!( + logger, + "Config: model_dir={}, num_threads={}, use_vad={}", + config.model_dir, + config.num_threads, + config.use_vad + ); + + // Build paths + let model_dir = PathBuf::from(&config.model_dir); + let model_dir = if model_dir.is_absolute() { + model_dir + } else { + std::env::current_dir() + .map_err(|e| format!("Failed to get current dir: {e}"))? 
+ .join(model_dir) + }; + + // Canonicalize + let model_dir = model_dir.canonicalize().map_err(|e| { + format!("Failed to canonicalize model dir '{}': {}", model_dir.display(), e) + })?; + + let model_dir_str = model_dir.to_string_lossy().to_string(); + + // Cache key: (model_dir, num_threads, execution_provider) + let cache_key = (model_dir_str, config.num_threads, config.execution_provider.clone()); + + plugin_info!( + logger, + "Cache key: dir='{}' threads={} provider='{}'", + cache_key.0, + cache_key.1, + cache_key.2 + ); + + // Check cache + let cached_recognizer = { + let cache = RECOGNIZER_CACHE + .lock() + .map_err(|e| format!("Failed to lock recognizer cache: {e}"))?; + + plugin_info!(logger, "Cache has {} entries", cache.len()); + cache.get(&cache_key).map(|cached| cached.recognizer.clone()) + }; + + let recognizer = if let Some(rec) = cached_recognizer { + plugin_info!(logger, "CACHE HIT: Reusing cached recognizer"); + rec + } else { + plugin_info!(logger, "CACHE MISS: Creating new recognizer"); + + let recognizer_ptr = unsafe { create_recognizer(&logger, &model_dir, &config)? 
}; + let recognizer_arc = Arc::new(RecognizerWrapper::new(recognizer_ptr)); + + // Insert into cache + plugin_info!(logger, "Inserting recognizer into cache"); + let cache_size = { + let mut cache = RECOGNIZER_CACHE + .lock() + .map_err(|e| format!("Failed to lock recognizer cache: {e}"))?; + + cache.insert(cache_key, CachedRecognizer { recognizer: recognizer_arc.clone() }); + cache.len() + }; + plugin_info!(logger, "Cache now has {} entries", cache_size); + + recognizer_arc + }; + + // Initialize VAD if enabled + let vad = if config.use_vad { + plugin_info!(logger, "Initializing Silero VAD"); + let vad_instance = SileroVAD::new(&config.vad_model_path, 16000, config.vad_threshold) + .map_err(|e| format!("Failed to initialize VAD: {e}"))?; + Some(vad_instance) + } else { + plugin_info!(logger, "VAD disabled"); + None + }; + + // Calculate silence threshold in frames (each frame is 32ms) + let silence_threshold_frames = (config.min_silence_duration_ms / 32) as usize; + + Ok(Self { + config, + recognizer, + vad, + frame_buffer: VecDeque::with_capacity(1024), + speech_buffer: VecDeque::with_capacity(16000 * 30), // 30 seconds max + segment_start_time_ms: 0, + silence_frame_count: 0, + silence_threshold_frames, + absolute_time_ms: 0, + logger, + }) + } + + fn process(&mut self, _pin: &str, packet: Packet, output: &OutputSender) -> Result<(), String> { + match packet { + Packet::Audio(frame) => { + // Validate audio format (must be 16kHz mono f32) + if frame.sample_rate != 16000 { + return Err(format!( + "Parakeet requires 16kHz audio, got {}Hz. Add audio_resampler upstream.", + frame.sample_rate + )); + } + if frame.channels != 1 { + return Err(format!( + "Parakeet requires mono audio, got {} channels. 
Add audio_resampler upstream.", + frame.channels + )); + } + + let has_vad = self.vad.is_some(); + + if has_vad { + // VAD-based processing + self.frame_buffer.extend(frame.samples.as_ref().as_slice().iter().copied()); + + // Process complete 512-sample frames through VAD + loop { + if self.frame_buffer.len() < 512 { + break; + } + + let vad_frame: Vec = self.frame_buffer.drain(..512).collect(); + + // Process chunk through VAD - scope the borrow + let is_speech = { + let vad = self.vad.as_mut().ok_or_else(|| { + "VAD not initialized but use_vad is true".to_string() + })?; + let probability = vad + .process_chunk(&vad_frame) + .map_err(|e| format!("VAD processing failed: {e}"))?; + probability >= self.config.vad_threshold + }; // vad borrow ends here + + let should_transcribe = if is_speech { + // Speech detected + self.silence_frame_count = 0; + + // Start new segment if needed + if self.speech_buffer.is_empty() { + self.segment_start_time_ms = self.absolute_time_ms; + } + + // Add to speech buffer + self.speech_buffer.extend(&vad_frame); + + // Check max duration + let segment_duration_ms = + self.absolute_time_ms - self.segment_start_time_ms; + // Allow: Config value is always positive, cast to u64 for duration comparison + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let max_duration_ms = + (self.config.max_segment_duration_secs * 1000.0) as u64; + segment_duration_ms >= max_duration_ms + } else { + // Silence detected + self.silence_frame_count += 1; + + // Check if we have buffered speech and enough silence + !self.speech_buffer.is_empty() + && self.silence_frame_count >= self.silence_threshold_frames + }; + + self.absolute_time_ms += 32; // 512 samples @ 16kHz = 32ms + + // Transcribe after releasing the borrow on vad + if should_transcribe { + self.transcribe_and_emit(output)?; + } + } + } else { + // No VAD: accumulate samples and transcribe when reaching max duration + if self.speech_buffer.is_empty() { + self.segment_start_time_ms 
= self.absolute_time_ms; + } + + self.speech_buffer.extend(frame.samples.as_ref().as_slice().iter().copied()); + + // Allow: Sample count and rate are always positive, cast is safe + #[allow( + clippy::cast_precision_loss, + clippy::cast_possible_truncation, + clippy::cast_sign_loss + )] + let duration_ms = (frame.samples.len() as f32 / 16.0) as u64; + self.absolute_time_ms += duration_ms; + + let segment_duration_ms = self.absolute_time_ms - self.segment_start_time_ms; + // Allow: Config value is always positive, cast to u64 for duration comparison + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let max_duration_ms = (self.config.max_segment_duration_secs * 1000.0) as u64; + + if segment_duration_ms >= max_duration_ms { + self.transcribe_and_emit(output)?; + } + } + + Ok(()) + }, + _ => Err("Parakeet plugin only accepts audio packets".to_string()), + } + } + + fn update_params(&mut self, _params: Option) -> Result<(), String> { + // Runtime parameter updates not yet implemented. 
+ Ok(()) + } + + fn flush(&mut self, output: &OutputSender) -> Result<(), String> { + plugin_info!(self.logger, "Flush called, buffer_len={}", self.speech_buffer.len()); + + if !self.speech_buffer.is_empty() { + self.transcribe_and_emit(output)?; + } + + Ok(()) + } + + fn cleanup(&mut self) { + if !self.speech_buffer.is_empty() { + plugin_warn!( + self.logger, + "Speech buffer not empty at cleanup: {} samples", + self.speech_buffer.len() + ); + } + } +} + +impl ParakeetNode { + /// Transcribe buffered speech segment and emit result + fn transcribe_and_emit(&mut self, output: &OutputSender) -> Result<(), String> { + if self.speech_buffer.is_empty() { + return Ok(()); + } + + let samples: Vec = self.speech_buffer.drain(..).collect(); + + // Allow: Sample count / sample rate for duration calculation + #[allow(clippy::cast_precision_loss)] + let duration_secs = samples.len() as f32 / 16000.0; + plugin_info!( + self.logger, + "Transcribing segment: {} samples ({:.2}s)", + samples.len(), + duration_secs + ); + + // Create stream + let stream = unsafe { ffi::SherpaOnnxCreateOfflineStream(self.recognizer.get()) }; + + if stream.is_null() { + return Err("Failed to create recognition stream".to_string()); + } + + // Accept waveform + // Allow: Sample count is guaranteed to fit in i32 for practical audio segments + #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] + unsafe { + ffi::SherpaOnnxAcceptWaveformOffline( + stream, + 16000, + samples.as_ptr(), + samples.len() as i32, + ); + } + + // Decode + unsafe { + ffi::SherpaOnnxDecodeOfflineStream(self.recognizer.get(), stream); + } + + // Get result + let result_ptr = unsafe { ffi::SherpaOnnxGetOfflineStreamResult(stream) }; + + if result_ptr.is_null() { + unsafe { + ffi::SherpaOnnxDestroyOfflineStream(stream); + } + return Err("Recognition returned null result".to_string()); + } + + let result = unsafe { &*result_ptr }; + + // Extract text + let text = if result.text.is_null() { + String::new() + } else { 
+ unsafe { CStr::from_ptr(result.text).to_string_lossy().into_owned() } + }; + + // Cleanup + unsafe { + ffi::SherpaOnnxDestroyOfflineRecognizerResult(result_ptr); + ffi::SherpaOnnxDestroyOfflineStream(stream); + } + + // Emit transcription if not empty + if !text.trim().is_empty() { + plugin_info!(self.logger, "Transcription: {}", text); + + let segment = TranscriptionSegment { + text: text.trim().to_string(), + start_time_ms: self.segment_start_time_ms, + end_time_ms: self.absolute_time_ms, + confidence: None, + }; + + output.send( + "out", + &Packet::Transcription(std::sync::Arc::new(TranscriptionData { + text: segment.text.clone(), + segments: vec![segment], + language: Some("en".to_string()), + metadata: None, + })), + )?; + } + + // Reset for next segment + self.silence_frame_count = 0; + + Ok(()) + } +} + +/// Create recognizer using sherpa-onnx C API with transducer (encoder/decoder/joiner) config +unsafe fn create_recognizer( + logger: &Logger, + model_dir: &Path, + config: &ParakeetConfig, +) -> Result<*mut ffi::SherpaOnnxOfflineRecognizer, String> { + plugin_info!(logger, "Creating recognizer for model_dir={}", model_dir.display()); + + let encoder_path = model_dir.join("encoder.int8.onnx"); + let decoder_path = model_dir.join("decoder.int8.onnx"); + let joiner_path = model_dir.join("joiner.int8.onnx"); + let tokens_path = model_dir.join("tokens.txt"); + + // Verify files exist + for (name, path) in [ + ("encoder", &encoder_path), + ("decoder", &decoder_path), + ("joiner", &joiner_path), + ("tokens", &tokens_path), + ] { + if !path.exists() { + return Err(format!("{} file not found: {}", name, path.display())); + } + plugin_info!(logger, "File exists: {}", name); + } + + // Create C strings - keep all alive until after the FFI call + let encoder_cstr = path_to_cstring(&encoder_path)?; + let decoder_cstr = path_to_cstring(&decoder_path)?; + let joiner_cstr = path_to_cstring(&joiner_path)?; + let tokens_cstr = path_to_cstring(&tokens_path)?; + + 
plugin_info!( + logger, + "Initializing recognizer with execution_provider='{}'", + config.execution_provider + ); + let provider_cstr = CString::new(config.execution_provider.as_str()) + .map_err(|_| "Invalid execution provider string".to_string())?; + let decoding_method_cstr = + CString::new("greedy_search").map_err(|_| "Invalid decoding method string".to_string())?; + + // Empty strings for unused fields (safer than null) + let empty_cstr = CString::new("").map_err(|_| "Invalid empty string".to_string())?; + + // Build config struct — populate transducer fields, leave all others empty + let recognizer_config = ffi::SherpaOnnxOfflineRecognizerConfig { + feat_config: ffi::SherpaOnnxFeatureConfig { sample_rate: 16000, feature_dim: 80 }, + model_config: ffi::SherpaOnnxOfflineModelConfig { + transducer: ffi::SherpaOnnxOfflineTransducerModelConfig { + encoder: encoder_cstr.as_ptr(), + decoder: decoder_cstr.as_ptr(), + joiner: joiner_cstr.as_ptr(), + }, + paraformer: ffi::SherpaOnnxOfflineParaformerModelConfig { model: empty_cstr.as_ptr() }, + nemo_ctc: ffi::SherpaOnnxOfflineNemoEncDecCtcModelConfig { model: empty_cstr.as_ptr() }, + whisper: ffi::SherpaOnnxOfflineWhisperModelConfig { + encoder: empty_cstr.as_ptr(), + decoder: empty_cstr.as_ptr(), + language: empty_cstr.as_ptr(), + task: empty_cstr.as_ptr(), + tail_paddings: 0, + }, + tdnn: ffi::SherpaOnnxOfflineTdnnModelConfig { model: empty_cstr.as_ptr() }, + tokens: tokens_cstr.as_ptr(), + num_threads: config.num_threads, + debug: 0, + provider: provider_cstr.as_ptr(), + model_type: empty_cstr.as_ptr(), + modeling_unit: empty_cstr.as_ptr(), + bpe_vocab: empty_cstr.as_ptr(), + telespeech_ctc: empty_cstr.as_ptr(), + sense_voice: ffi::SherpaOnnxOfflineSenseVoiceModelConfig { + model: empty_cstr.as_ptr(), + language: empty_cstr.as_ptr(), + use_itn: 0, + }, + moonshine: ffi::SherpaOnnxOfflineMoonshineModelConfig { + preprocessor: empty_cstr.as_ptr(), + encoder: empty_cstr.as_ptr(), + uncached_decoder: 
/// Convert a filesystem path into a NUL-terminated C string for the FFI.
///
/// # Errors
/// Returns an error when the path is not valid UTF-8 or contains an interior
/// NUL byte. A lossy conversion is deliberately avoided: it would replace the
/// offending bytes and hand the C library a *different* path than the one that
/// was checked on the Rust side.
fn path_to_cstring(path: &Path) -> Result<CString, String> {
    let utf8 = path
        .to_str()
        .ok_or_else(|| format!("Invalid path: '{}' is not valid UTF-8", path.display()))?;

    CString::new(utf8).map_err(|e| format!("Invalid path: {e}"))
}
SPDX-License-Identifier: MPL-2.0 + +//! Silero VAD v6 wrapper for voice activity detection +//! +//! This module provides a lightweight Rust wrapper around the Silero VAD v6 ONNX model +//! for detecting speech vs. silence in audio streams. + +use ndarray::{Array1, Array2, Array3}; +use ort::session::{builder::GraphOptimizationLevel, Session}; +use ort::value::Value; + +/// Silero VAD v6 for voice activity detection +/// +/// Processes audio in 512-sample chunks (32ms @ 16kHz) and maintains RNN state +/// and context between frames for temporal continuity. +#[derive(Debug)] +pub struct SileroVAD { + session: Session, + sample_rate: u32, + state: Array3, // RNN state [2, batch_size, 128] where batch_size=1 + context: Vec, // Context samples from previous frame (64 samples for v6) + threshold: f32, +} + +impl SileroVAD { + /// Create a new Silero VAD instance + /// + /// # Arguments + /// * `model_path` - Path to the silero_vad.onnx model file + /// * `sample_rate` - Audio sample rate (8000 or 16000) + /// * `threshold` - Speech probability threshold (0.0-1.0, default 0.5) + pub fn new(model_path: &str, sample_rate: u32, threshold: f32) -> Result { + // Validate sample rate + if sample_rate != 8000 && sample_rate != 16000 { + return Err(format!("Silero VAD only supports 8kHz or 16kHz, got {sample_rate}Hz")); + } + + // Load ONNX model + let session = Session::builder() + .map_err(|e| format!("Failed to create session builder: {e}"))? + .with_optimization_level(GraphOptimizationLevel::Level3) + .map_err(|e| format!("Failed to set optimization level: {e}"))? 
+ .commit_from_file(model_path) + .map_err(|e| format!("Failed to load VAD model from '{model_path}': {e}"))?; + + // Initialize RNN state [2, batch_size, 128] where batch_size=1 + let state = Array3::::zeros((2, 1, 128)); + + // Initialize context buffer with 64 zeros (window_size / 8 = 512 / 8 = 64) + let context = vec![0.0f32; 64]; + + Ok(Self { session, sample_rate, state, context, threshold }) + } + + /// Process a 512-sample audio chunk and return speech probability + /// + /// Silero VAD v6 requires context from the previous frame for temporal continuity. + /// The model expects [context_samples + window_samples] = [64 + 512] = 576 samples. + /// + /// # Arguments + /// * `audio` - Audio samples (exactly 512 samples) + /// + /// # Returns + /// Speech probability (0.0-1.0) + pub fn process_chunk(&mut self, audio: &[f32]) -> Result { + if audio.len() != 512 { + return Err(format!("Silero VAD expects exactly 512 samples, got {}", audio.len())); + } + + // Prepend context samples (64) to current audio (512) for effective window of 576 + let mut input_with_context = Vec::with_capacity(576); + input_with_context.extend_from_slice(&self.context); + input_with_context.extend_from_slice(audio); + + // Prepare input tensor with batch dimension: [batch_size, num_samples] = [1, 576] + let audio_input = Array2::from_shape_vec((1, 576), input_with_context) + .map_err(|e| format!("Failed to create audio input tensor: {e}"))?; + + // Sample rate as int64 scalar array + let sr_input = Array1::from_vec(vec![i64::from(self.sample_rate)]); + + // Convert to ort::Value + let input_value = Value::from_array(audio_input) + .map_err(|e| format!("Failed to convert audio to Value: {e}"))?; + + let state_value = Value::from_array(self.state.clone()) + .map_err(|e| format!("Failed to convert state to Value: {e}"))?; + + let sr_value = Value::from_array(sr_input) + .map_err(|e| format!("Failed to convert sample rate to Value: {e}"))?; + + // Run inference with inputs: input, state, sr 
+ let outputs = self + .session + .run(ort::inputs![input_value, state_value, sr_value]) + .map_err(|e| format!("VAD inference failed: {e}"))?; + + // Extract probability (first output) + let prob_view = outputs[0] + .try_extract_tensor::() + .map_err(|e| format!("Failed to extract probability: {e}"))?; + let probability = prob_view.1[0]; // Extract first element + + // Extract updated state (second output) + let state_view = outputs[1] + .try_extract_tensor::() + .map_err(|e| format!("Failed to extract state: {e}"))?; + let state_new = Array3::from_shape_vec((2, 1, 128), state_view.1.to_vec()) + .map_err(|e| format!("Failed to reshape state: {e}"))?; + + // Update state for next iteration + self.state = state_new; + + // Update context: save last 64 samples of current audio for next frame + self.context.copy_from_slice(&audio[audio.len() - 64..]); + + Ok(probability) + } + + /// Check if audio chunk contains speech + /// + /// # Arguments + /// * `audio` - Audio samples (exactly 512 samples) + /// + /// # Returns + /// `true` if speech detected, `false` if silence + #[allow(dead_code)] + pub fn is_speech(&mut self, audio: &[f32]) -> Result { + let probability = self.process_chunk(audio)?; + Ok(probability >= self.threshold) + } + + /// Reset VAD state (clears RNN state and context buffer) + #[allow(dead_code)] + pub fn reset(&mut self) { + self.state.fill(0.0); + self.context.fill(0.0); + } + + /// Update speech threshold + #[allow(dead_code)] + pub fn set_threshold(&mut self, threshold: f32) { + self.threshold = threshold.clamp(0.0, 1.0); + } + + /// Get current threshold + #[allow(dead_code)] + pub const fn threshold(&self) -> f32 { + self.threshold + } +} diff --git a/samples/pipelines/oneshot/parakeet-stt.yml b/samples/pipelines/oneshot/parakeet-stt.yml new file mode 100644 index 00000000..a73af762 --- /dev/null +++ b/samples/pipelines/oneshot/parakeet-stt.yml @@ -0,0 +1,41 @@ +name: Speech-to-Text (Parakeet TDT) +description: Fast English speech transcription 
using NVIDIA Parakeet TDT (~10x faster than Whisper on CPU) +mode: oneshot +client: + input: + type: file_upload + accept: "audio/opus" + asset_tags: + - speech + output: + type: transcription +steps: + - kind: streamkit::http_input + + - kind: containers::ogg::demuxer + + - kind: audio::opus::decoder + + - kind: audio::resampler + params: + chunk_frames: 960 + output_frame_size: 960 + target_sample_rate: 16000 + + - kind: plugin::native::parakeet + params: + model_dir: models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8 + num_threads: 4 + use_vad: true + vad_model_path: models/silero_vad.onnx + vad_threshold: 0.5 + min_silence_duration_ms: 700 + + - kind: core::json_serialize + params: + pretty: false + newline_delimited: true + + - kind: streamkit::http_output + params: + content_type: application/json