diff --git a/Cargo.lock b/Cargo.lock index e539f80..91e9c7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "alsa" version = "0.9.1" @@ -98,6 +104,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + [[package]] name = "arrayvec" version = "0.7.6" @@ -251,6 +263,21 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" +[[package]] +name = "cassowary" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" + +[[package]] +name = "castaway" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0abae9be0aaf9ea96a3b1b8b1b55c602ca751eba1b1500220cea4ecbafe7c0d5" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.2.7" @@ -390,6 +417,20 @@ dependencies = [ "memchr", ] +[[package]] +name = "compact_str" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "static_assertions", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -417,6 +458,20 @@ dependencies = [ "coreaudio-sys", ] +[[package]] +name = "coreaudio-rs" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aae284fbaf7d27aa0e292f7677dfbe26503b0d555026f702940805a630eac17" +dependencies = [ + "bitflags 1.3.2", + "libc", + "objc2-audio-toolbox", + "objc2-core-audio", + "objc2-core-audio-types", + "objc2-core-foundation", +] + [[package]] name = "coreaudio-sys" version = "0.2.16" @@ -434,13 +489,13 @@ checksum = "873dab07c8f743075e57f524c583985fbaf745602acbe916a01539364369a779" dependencies = [ "alsa", "core-foundation-sys", - "coreaudio-rs", + "coreaudio-rs 0.11.3", "dasp_sample", "jni", "js-sys", "libc", "mach2", - "ndk", + "ndk 0.8.0", "ndk-context", "oboe", "wasm-bindgen", @@ -449,6 +504,92 @@ dependencies = [ "windows", ] +[[package]] +name = "cpal" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbd307f43cc2a697e2d1f8bc7a1d824b5269e052209e28883e5bc04d095aaa3f" +dependencies = [ + "alsa", + "coreaudio-rs 0.13.0", + "dasp_sample", + "jni", + "js-sys", + "libc", + "mach2", + "ndk 0.9.0", + "ndk-context", + "num-derive", + "num-traits", + "objc2-audio-toolbox", + "objc2-core-audio", + "objc2-core-audio-types", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "windows", +] + +[[package]] +name = "crossterm" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" +dependencies = [ + "bitflags 2.6.0", + "crossterm_winapi", + "mio", + "parking_lot", + "rustix", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.101", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.101", +] + [[package]] name = "dasp_sample" version = "0.11.0" @@ -476,6 +617,16 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "dispatch2" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89a09f22a6c6069a18470eb92d2298acf25463f14256d24778e1230d789a2aec" +dependencies = [ + "bitflags 2.6.0", + "objc2", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -585,6 +736,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "foreign-types" version = "0.3.2" @@ -728,6 +885,11 @@ name = "hashbrown" version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "heck" @@ -993,6 +1155,12 @@ dependencies = [ "syn 2.0.101", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.0.3" @@ -1024,6 +1192,25 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "indoc" +version = "2.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" + +[[package]] +name = "instability" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf9fed6d91cfb734e7476a06bde8300a1b94e217e1b523b6f0cd1a01998c71d" +dependencies = [ + "darling", + "indoc", + "proc-macro2", + "quote", + "syn 2.0.101", +] + [[package]] name = "ipnet" version = "2.10.1" @@ -1175,17 +1362,23 @@ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" name = "llm" version = "1.3.1" dependencies = [ + "anyhow", "async-trait", "axum", "base64", "chrono", "clap", "colored", + "cpal 0.16.0", + "crossterm", "dirs", "either", "env_logger", "futures", + "hound", "log", + "parking_lot", + "ratatui", "regex", "reqwest", "rodio", @@ -1215,6 +1408,15 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown", +] + [[package]] name = "mach2" version = "0.4.2" @@ -1280,6 +1482,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", + "log", "wasi", "windows-sys 0.52.0", ] @@ -1310,7 +1513,21 @@ dependencies = [ "bitflags 2.6.0", "jni-sys", "log", - "ndk-sys", + "ndk-sys 0.5.0+25.2.9519653", + "num_enum", + "thiserror 1.0.69", +] + +[[package]] +name = "ndk" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3f42e7bbe13d351b6bead8286a43aac9534b82bd3cc43e47037f012ebfd62d4" +dependencies = [ + "bitflags 2.6.0", + "jni-sys", + "log", + "ndk-sys 0.6.0+11769913", "num_enum", "thiserror 1.0.69", ] @@ -1330,6 +1547,15 @@ dependencies = [ "jni-sys", ] +[[package]] +name = "ndk-sys" +version = "0.6.0+11769913" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee6cda3051665f1fb8d9e08fc35c96d5a244fb1be711a03b71118828afc9a873" +dependencies = [ + "jni-sys", +] + [[package]] name = "nibble_vec" version = "0.1.0" @@ -1402,6 +1628,78 @@ dependencies = [ "syn 2.0.101", ] +[[package]] +name = "objc2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88c6597e14493ab2e44ce58f2fdecf095a51f12ca57bec060a11c57332520551" +dependencies = [ + "objc2-encode", +] + +[[package]] +name = "objc2-audio-toolbox" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10cbe18d879e20a4aea544f8befe38bcf52255eb63d3f23eca2842f3319e4c07" +dependencies = [ + "bitflags 2.6.0", + "libc", + "objc2", + "objc2-core-audio", + "objc2-core-audio-types", + "objc2-core-foundation", + "objc2-foundation", +] + +[[package]] +name = "objc2-core-audio" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca44961e888e19313b808f23497073e3f6b3c22bb485056674c8b49f3b025c82" +dependencies = [ + "dispatch2", + "objc2", + "objc2-core-audio-types", + "objc2-core-foundation", +] + +[[package]] +name = "objc2-core-audio-types" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f1cc99bb07ad2ddb6527ddf83db6a15271bb036b3eb94b801cd44fdc666ee1" +dependencies = [ + "bitflags 2.6.0", + "objc2", +] + +[[package]] +name = "objc2-core-foundation" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" +dependencies = [ + "bitflags 2.6.0", + "dispatch2", + "objc2", +] + +[[package]] +name = "objc2-encode" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33" + +[[package]] +name = "objc2-foundation" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "900831247d2fe1a09a683278e5384cfb8c80c79fe6b166f9d14bfdde0ea1b03c" +dependencies = [ + "objc2", +] + [[package]] name = "object" version = "0.36.7" @@ -1418,7 +1716,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8b61bebd49e5d43f5f8cc7ee2891c16e0f41ec7954d36bcb6c14c5e0de867fb" dependencies = [ "jni", - "ndk", + "ndk 0.8.0", "ndk-context", "num-derive", "num-traits", @@ -1522,6 +1820,12 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "percent-encoding" version = "2.3.1" @@ -1637,7 +1941,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1689,6 +1993,27 @@ dependencies = [ "getrandom", ] +[[package]] +name = "ratatui" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabd94c2f37801c20583fc49dd5cd6b0ba68c716787c2dd6ed18571e1e63117b" +dependencies = [ + "bitflags 2.6.0", + "cassowary", + "compact_str", + "crossterm", + "indoc", + "instability", + "itertools", + "lru", + "paste", + "strum 0.26.3", + "unicode-segmentation", + "unicode-truncate", + "unicode-width 0.2.0", +] + [[package]] name = "redox_syscall" version = "0.5.9" @@ -1809,7 +2134,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7ceb6607dd738c99bc8cb28eff249b7cd5c8ec88b9db96c0608c1480d140fb1" dependencies = [ "claxon", - "cpal", + "cpal 0.15.3", "hound", "lewton", "symphonia", @@ -1912,7 +2237,7 @@ dependencies = [ "nix", "radix_trie", "unicode-segmentation", - "unicode-width", + "unicode-width 0.2.0", "utf8parse", "windows-sys 0.59.0", ] @@ -2043,6 +2368,27 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "signal-hook" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34db1a06d485c9142248b7a054f034b349b212551f3dfd19c94d45a754a217cd" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + [[package]] name = "signal-hook-registry" version = "1.4.2" @@ -2091,7 +2437,7 @@ checksum = "a0ef947f358b9c238923f764c72a4a9d42f2d637c46e059dbd319d6e7cfb4f82" dependencies = [ "lazy_static", "maplit", - "strum", + "strum 0.24.1", ] [[package]] @@ -2100,6 +2446,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.11.1" @@ -2112,7 +2464,16 @@ version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" dependencies = [ - "strum_macros", + "strum_macros 0.24.3", +] + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros 0.26.4", ] [[package]] @@ -2128,6 +2489,19 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.101", +] + [[package]] name = "subtle" version = "2.6.1" @@ -2567,6 +2941,23 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +[[package]] +name = "unicode-truncate" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3644627a5af5fa321c95b9b235a72fd24cd29c648c2c379431e6628655627bf" +dependencies = [ + "itertools", + "unicode-segmentation", + "unicode-width 0.1.14", +] + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + [[package]] name = "unicode-width" version = "0.2.0" @@ -2772,6 +3163,22 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.9" @@ -2781,6 +3188,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows" version = "0.54.0" diff --git a/Cargo.toml b/Cargo.toml index afd836a..2ef56d2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,7 @@ elevenlabs = [] agent = [] rodio = ["dep:rodio"] logging = ["dep:env_logger"] +audio-example = ["dep:cpal", "dep:hound", "dep:ratatui", "dep:crossterm", "dep:anyhow", "dep:parking_lot"] [dependencies] serde = { version = "1.0", features = ["derive"] } @@ -69,6 +70,12 @@ rodio = { version = "0.20.0", features = ["mp3", "wav"], optional = true } regex = "1.10" log = "0.4" env_logger = { version = "0.11", optional = true } +cpal = { version = "0.16", optional = true } +hound = { version = "3.5", optional = true } +parking_lot = { version = "0.12", optional = true } +anyhow = { version = "1", optional = true } +ratatui = { version = "0.29", optional = true } +crossterm = { version = "0.28", optional = true } chrono = {version = "0.4", default-features = false, features = ["serde"]} [[bin]] @@ -80,3 +87,8 @@ required-features = ["cli"] tokio = { version = "1.0", features = ["macros", "rt-multi-thread"] } rodio = { version = "0.20.1", default-features = false, features = ["symphonia-all"]} +[[example]] +name = "agent_audio_example" +path = "examples/agent_audio_example.rs" +required-features = ["audio-example"] + diff --git a/examples/agent_audio_example.rs b/examples/agent_audio_example.rs new file mode 100644 index 0000000..c691cf3 --- /dev/null +++ b/examples/agent_audio_example.rs @@ -0,0 +1,335 @@ +//! A voice assistant example that demonstrates how to use the LLM library with audio input. +//! +//! This example creates a terminal UI application that: +//! - Records audio from the default input device when spacebar is pressed +//! - Transcribes the audio using OpenAI's Whisper model +//! - Processes the transcribed text using a two-agent pipeline: +//! 1. A transcriber agent that creates a plan based on the audio input +//! 2. An assistant agent that executes the plan and responds to the user +//! +//! The UI shows: +//! - A scrollable list of messages from the agent pipeline +//! - Recording controls and status at the bottom +//! +//! # Usage +//! - Press SPACE to start/stop recording +//! - Press 'q' to quit +//! +//! # Required Environment Variables +//! - OPENAI_API_KEY: Your OpenAI API key + +use llm::{ + agent::AgentBuilder, + builder::{LLMBackend, LLMBuilder}, + chat::ChatMessage, + cond, + memory::{SharedMemory, SlidingWindowMemory}, +}; + +use crossterm::{ + event::{self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode}, + execute, + terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen}, +}; +use ratatui::{ + backend::CrosstermBackend, + layout::{Constraint, Direction, Layout}, + style::{Color, Modifier, Style}, + widgets::{Block, Borders, List, ListItem, Paragraph}, + Frame, Terminal, +}; + +use std::{ + collections::VecDeque, + io::{self, Cursor}, + sync::{Arc, Mutex}, +}; + +use tokio::sync::mpsc; + +/// Messages that can be sent to update the UI +#[derive(Debug, Clone)] +enum AppMessage { + /// A message from an agent, including role, content and whether it contains audio + Agent { role: String, content: String, audio: bool }, + /// A status message to display in the UI + Status(String), +} + +/// The main application state +struct App { + /// Queue of messages to display in the UI + messages: VecDeque, + /// Current status message + status: String, + /// Whether audio is currently being recorded + recording: bool, +} + +impl App { + /// Maximum number of messages to keep in history + const MAX: usize = 512; + + /// Creates a new application instance + fn new() -> Self { + Self { + messages: VecDeque::from(vec!["πŸš€ Voice Assistant Agent Started".into()]), + status: "Press SPACE to record, 'q' to quit".into(), + recording: false, + } + } + + /// Adds a new message to the history + fn push(&mut self, role: &str, content: &str, audio: bool) { + if self.messages.len() == Self::MAX { + self.messages.pop_front(); + } + let icon = if audio { "🎡" } else { "πŸ“" }; + let msg = format!("{icon} [{role}]: {content}"); + self.messages.push_back(msg); + } + + /// Updates the status message + fn set_status(&mut self, s: impl Into) { + self.status = s.into(); + } + + /// Updates the recording state and status message + fn set_recording(&mut self, rec: bool) { + self.recording = rec; + self.set_status(if rec { + "πŸ”΄ Recording… Press SPACE to stop" + } else { + "Press SPACE to record, 'q' to quit" + }); + } +} + +/// Draws the terminal UI +fn draw(f: &mut Frame, app: &App) { + let chunks = Layout::default() + .direction(Direction::Vertical) + .constraints([Constraint::Min(1), Constraint::Length(3)]) + .split(f.area()); + + let items: Vec<_> = app + .messages + .iter() + .map(|m| ListItem::new(m.clone())) + .collect(); + f.render_widget( + List::new(items).block( + Block::default() + .title("πŸ€– Agent Pipeline Messages") + .borders(Borders::ALL), + ), + chunks[0], + ); + + let style = if app.recording { + Style::default().fg(Color::Red).add_modifier(Modifier::BOLD) + } else { + Style::default().fg(Color::Green) + }; + f.render_widget( + Paragraph::new(app.status.clone()) + .block( + Block::default() + .title("πŸŽ™οΈ Recording Controls") + .borders(Borders::ALL), + ) + .style(style), + chunks[1], + ); +} + +/// Main application entry point +#[tokio::main] +async fn main() -> Result<(), Box> { + let memory = SharedMemory::new_reactive(SlidingWindowMemory::new(10)); + + let transcriber = Arc::new( + AgentBuilder::new() + .role("transcriber") + .on("user", cond!(has_audio)) + .llm( + LLMBuilder::new() + .backend(LLMBackend::OpenAI) + .api_key(std::env::var("OPENAI_API_KEY").unwrap_or("sk-TEST".into())) + .model("gpt-4o") + .system("Create a plan to answer the question. You are not allowed to change the content of the audio message. You are not allowed to add any other text to the transcribed text."), + ) + .stt( + LLMBuilder::new() + .backend(LLMBackend::OpenAI) + .api_key(std::env::var("OPENAI_API_KEY").unwrap_or("sk-TEST".into())) + .model("whisper-1"), + ) + .memory(memory.clone()) + .build()?, + ); + + let _assistant = AgentBuilder::new() + .role("assistant") + .on("transcriber", cond!(any)) + .llm( + LLMBuilder::new() + .backend(LLMBackend::OpenAI) + .api_key(std::env::var("OPENAI_API_KEY").unwrap_or("sk-TEST".into())) + .model("gpt-4o-search-preview") + .openai_enable_web_search(true) + .system("Execute the plan and respond to the user."), + ) + .memory(memory.clone()) + .build()?; + + use cpal::traits::{DeviceTrait, HostTrait, StreamTrait}; + let host = cpal::default_host(); + let input_dev = host + .default_input_device() + .expect("no input device available"); + let cfg = input_dev.default_input_config()?; + + enable_raw_mode()?; + let mut stdout = io::stdout(); + execute!(stdout, EnterAlternateScreen, EnableMouseCapture)?; + let backend = CrosstermBackend::new(stdout); + let mut terminal = Terminal::new(backend)?; + + let mut app = App::new(); + let (ui_tx, mut ui_rx) = mpsc::channel::(128); + + { + let ui_tx = ui_tx.clone(); + tokio::spawn(async move { + let mut sub = memory.subscribe(); + while let Ok(e) = sub.recv().await { + let audio = e.msg.has_audio(); + let content = e.msg.content; + let _ = ui_tx + .send(AppMessage::Agent { + role: e.role, + content, + audio, + }) + .await; + } + }); + } + + let samples = Arc::new(Mutex::new(Vec::::with_capacity(48_000 * 60))); + + let mut stream: Option = None; + + loop { + while let Ok(msg) = ui_rx.try_recv() { + match msg { + AppMessage::Agent { role, content, audio } => app.push(&role, &content, audio), + AppMessage::Status(s) => app.set_status(s), + } + } + + terminal.draw(|f| draw(f, &app))?; + + if event::poll(std::time::Duration::from_millis(50))? { + if let Event::Key(k) = event::read()? { + match k.code { + KeyCode::Char('q') => break, + KeyCode::Char(' ') => { + if app.recording { + if let Some(s) = stream.take() { + drop(s); + } + app.set_recording(false); + app.set_status("🎡 Processing audio…"); + + let pcm: Vec = { + let mut guard = samples.lock().unwrap(); + std::mem::take(&mut *guard) + }; + let cfg_clone = cfg.clone(); + let trans = Arc::clone(&transcriber); + let ui_tx = ui_tx.clone(); + + tokio::spawn(async move { + if pcm.is_empty() { + let _ = ui_tx + .send(AppMessage::Status("⚠️ No audio recorded".into())) + .await; + return; + } + + let wav_bytes = tokio::task::spawn_blocking(move || -> Vec { + let mut buf = Vec::::with_capacity(pcm.len() * 4 + 44); + let spec = hound::WavSpec { + channels: 1, + sample_rate: cfg_clone.sample_rate().0, + bits_per_sample: 32, + sample_format: hound::SampleFormat::Float, + }; + let mut writer = hound::WavWriter::new(Cursor::new(&mut buf), spec) + .expect("wav writer"); + for s in pcm { + writer.write_sample(s).unwrap(); + } + writer.finalize().unwrap(); + buf + }) + .await + .unwrap(); + + let result_msg = match trans + .chat(&[ChatMessage::user().audio(wav_bytes).build()]) + .await + { + Ok(_) => "βœ… Audio processed".to_string(), + Err(e) => format!("❌ Error: {e}"), + }; + + let _ = ui_tx.send(AppMessage::Status(result_msg.clone())).await; + + let success = result_msg.starts_with("βœ…"); + + if success { + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + let _ = ui_tx + .send(AppMessage::Status( + "Press SPACE to record, 'q' to quit".into(), + )) + .await; + } + }); + } else { + samples.lock().unwrap().clear(); + let samples_clone = Arc::clone(&samples); + let chans = cfg.channels() as usize; + + let s = input_dev.build_input_stream( + &cfg.clone().into(), + move |data: &[f32], _| { + let mut buf = samples_clone.lock().unwrap(); + buf.extend(data.iter().step_by(chans)); + }, + |err| eprintln!("Audio error: {err}"), + None, + )?; + s.play()?; + stream = Some(s); + app.set_recording(true); + } + } + _ => {} + } + } + } + } + + disable_raw_mode()?; + execute!( + terminal.backend_mut(), + LeaveAlternateScreen, + DisableMouseCapture + )?; + terminal.show_cursor()?; + Ok(()) +} diff --git a/src/agent/builder.rs b/src/agent/builder.rs index b4916b7..2961f7e 100644 --- a/src/agent/builder.rs +++ b/src/agent/builder.rs @@ -18,7 +18,7 @@ use tokio::sync::RwLock; /// - Maintain conversation context /// - Handle speech-to-text and text-to-speech capabilities pub struct AgentBuilder { - llm_builder: LLMBuilder, + llm_builder: Option, role: Option, role_triggers: Vec<(String, MessageCondition)>, max_cycles: Option, @@ -33,7 +33,7 @@ impl AgentBuilder { /// Creates a new AgentBuilder instance. pub fn new() -> Self { Self { - llm_builder: LLMBuilder::new(), + llm_builder: None, role: None, role_triggers: Vec::new(), max_cycles: None, @@ -84,7 +84,7 @@ impl AgentBuilder { /// Sets the underlying LLM configuration. pub fn llm(mut self, llm_builder: LLMBuilder) -> Self { - self.llm_builder = llm_builder; + self.llm_builder = Some(llm_builder); self } @@ -112,8 +112,23 @@ impl AgentBuilder { /// /// Returns an error if the underlying LLM configuration is invalid. pub fn build(self) -> Result, LLMError> { - // Build the base LLM provider - let base_provider = self.llm_builder.build()?; + let (base_provider, stt_provider) = match (self.llm_builder, self.stt_builder) { + (Some(llm), Some(stt)) => { + // Both LLM and STT - use LLM as base, STT as separate provider + (llm.build()?, Some(Arc::from(stt.build()?))) + }, + (Some(llm), None) => { + // LLM only + (llm.build()?, None) + }, + (None, Some(stt)) => { + // STT only + (stt.build()?, None) + }, + (None, None) => { + return Err(LLMError::InvalidRequest("No provider configured".into())); + } + }; // If memory is configured, wrap with ChatWithMemory including agent capabilities if let Some(memory) = self.memory { @@ -125,6 +140,7 @@ impl AgentBuilder { self.role, self.role_triggers, self.max_cycles, + stt_provider, ); Ok(Box::new(agent_provider)) } else { diff --git a/src/backends/anthropic.rs b/src/backends/anthropic.rs index 0763bcc..9dcff6a 100644 --- a/src/backends/anthropic.rs +++ b/src/backends/anthropic.rs @@ -402,6 +402,7 @@ impl ChatProvider for Anthropic { tool_output: Some(r.function.arguments.clone()), }) .collect(), + MessageType::Audio(_) => vec![], }, }) .collect(); diff --git a/src/backends/azure_openai.rs b/src/backends/azure_openai.rs index f373b9c..209ab35 100644 --- a/src/backends/azure_openai.rs +++ b/src/backends/azure_openai.rs @@ -91,6 +91,7 @@ impl<'a> From<&'a ChatMessage> for AzureOpenAIChatMessage<'a> { } MessageType::ToolUse(_) => None, MessageType::ToolResult(_) => None, + MessageType::Audio(_) => None, }, tool_calls: match &chat_msg.message_type { MessageType::ToolUse(calls) => { diff --git a/src/backends/google.rs b/src/backends/google.rs index bd87799..03ed957 100644 --- a/src/backends/google.rs +++ b/src/backends/google.rs @@ -562,6 +562,7 @@ impl ChatProvider for Google { }) }) .collect(), + MessageType::Audio(_) => vec![], }, }); } @@ -737,6 +738,7 @@ impl ChatProvider for Google { }) }) .collect(), + MessageType::Audio(_) => vec![], }, }); } diff --git a/src/backends/openai.rs b/src/backends/openai.rs index c03e31e..aba6cf2 100644 --- a/src/backends/openai.rs +++ b/src/backends/openai.rs @@ -703,6 +703,7 @@ fn chat_message_to_api_message(chat_msg: ChatMessage) -> OpenAIChatMessage<'stat } MessageType::ToolUse(_) => None, MessageType::ToolResult(_) => None, + MessageType::Audio(_) => None, }, tool_calls: match &chat_msg.message_type { MessageType::ToolUse(calls) => { diff --git a/src/builder.rs b/src/builder.rs index b2fb6e9..38a790a 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -899,6 +899,7 @@ impl LLMBuilder { None, Vec::new(), None, + None, )); } diff --git a/src/chat/mod.rs b/src/chat/mod.rs index 1cb33dd..1a95f29 100644 --- a/src/chat/mod.rs +++ b/src/chat/mod.rs @@ -58,6 +58,8 @@ pub enum MessageType { ToolUse(Vec), /// Tool result ToolResult(Vec), + /// Audio data for speech-to-text processing + Audio(Vec), } /// The type of reasoning effort for a message in a chat conversation. @@ -343,6 +345,19 @@ impl ChatMessage { pub fn assistant() -> ChatMessageBuilder { ChatMessageBuilder::new(ChatRole::Assistant) } + + /// Check if this message contains audio data + pub fn has_audio(&self) -> bool { + matches!(self.message_type, MessageType::Audio(_)) + } + + /// Get audio data if this is an audio message + pub fn audio_data(&self) -> Option<&[u8]> { + match &self.message_type { + MessageType::Audio(data) => Some(data), + _ => None, + } + } } /// Builder for ChatMessage @@ -399,6 +414,12 @@ impl ChatMessageBuilder { self } + /// Set the message type as Audio + pub fn audio(mut self, audio_data: Vec) -> Self { + self.message_type = MessageType::Audio(audio_data); + self + } + /// Build the ChatMessage pub fn build(self) -> ChatMessage { ChatMessage { diff --git a/src/memory/chat_wrapper.rs b/src/memory/chat_wrapper.rs index 7bdc47a..ba6e666 100644 --- a/src/memory/chat_wrapper.rs +++ b/src/memory/chat_wrapper.rs @@ -22,7 +22,7 @@ pub struct ChatWithMemory { memory: Arc>>, role: Option, role_triggers: Vec<(String, MessageCondition)>, - + stt_provider: Option>, max_cycles: Option, cycle_counter: std::sync::Arc, } @@ -34,12 +34,14 @@ impl ChatWithMemory { /// * `memory` – Conversation memory store /// * `role` – Optional agent role /// * `role_triggers` – Reactive rules + /// * `stt_provider` – Optional speech-to-text provider pub fn new( provider: Arc, memory: Arc>>, role: Option, role_triggers: Vec<(String, MessageCondition)>, max_cycles: Option, + stt_provider: Option>, ) -> Self { use std::sync::atomic::AtomicU32; @@ -48,6 +50,7 @@ impl ChatWithMemory { memory: memory.clone(), role, role_triggers: role_triggers.clone(), + stt_provider, max_cycles, cycle_counter: std::sync::Arc::new(AtomicU32::new(0)), }; @@ -56,6 +59,10 @@ impl ChatWithMemory { wrapper.spawn_reactive_listener(); } + if wrapper.stt_provider.is_some() { + wrapper.spawn_stt_pipeline(); + } + wrapper } @@ -124,6 +131,39 @@ impl ChatWithMemory { }); } + /// Spawn a background pipeline that automatically transcribes audio messages. + fn spawn_stt_pipeline(&self) { + let memory = self.memory.clone(); + let stt_provider = self.stt_provider.clone().expect("STT provider should exist"); + + tokio::spawn(async move { + let mut receiver = { + let guard = memory.read().await; + match guard.get_event_receiver() { + Some(r) => r, + None => return, + } + }; + + while let Ok(event) = receiver.recv().await { + if let Some(audio_data) = event.msg.audio_data() { + match stt_provider.transcribe(audio_data.to_vec()).await { + Ok(transcription) => { + let mut transcribed_msg = event.msg.clone(); + transcribed_msg.content = transcription; + + let mut guard = memory.write().await; + if let Err(e) = guard.remember_with_role(&transcribed_msg, event.role.clone()).await { + eprintln!("STT memory save error: {}", e); + } + } + Err(e) => eprintln!("STT transcription error: {}", e), + } + } + } + }); + } + /// Access the wrapped provider. pub fn inner(&self) -> &dyn LLMProvider { self.provider.as_ref() @@ -174,10 +214,37 @@ impl ChatProvider for ChatWithMemory { context = mem.recall("", None).await?; } - context.extend_from_slice(messages); + // Auto-transcribe audio messages if STT provider available + let processed_messages = if self.stt_provider.is_some() { + let mut processed = Vec::new(); + for msg in messages.iter() { + if msg.has_audio() { + if let Some(audio_data) = msg.audio_data() { + let transcription = if let Some(stt) = &self.stt_provider { + stt.transcribe(audio_data.to_vec()).await? + } else { + self.provider.transcribe(audio_data.to_vec()).await? + }; + + let transcribed = match msg.role { + ChatRole::User => ChatMessage::user().content(transcription).build(), + ChatRole::Assistant => ChatMessage::assistant().content(transcription).build(), + }; + processed.push(transcribed); + } else { + processed.push(msg.clone()); + } + } else { + processed.push(msg.clone()); + } + } + processed + } else { + messages.to_vec() + }; + context.extend_from_slice(&processed_messages); let response = self.provider.chat_with_tools(&context, tools).await?; - // record assistant reply once if let Some(text) = response.text() { let memory = self.memory.clone(); let tag = self.role.clone(); diff --git a/src/memory/cond_macros.rs b/src/memory/cond_macros.rs index 7250e20..4eeecfc 100644 --- a/src/memory/cond_macros.rs +++ b/src/memory/cond_macros.rs @@ -8,6 +8,7 @@ macro_rules! cond { (role_not $v:literal) => { $crate::memory::MessageCondition::RoleNot($v.into()) }; (len_gt $v:literal) => { $crate::memory::MessageCondition::LenGt($v) }; (regex $v:literal) => { $crate::memory::MessageCondition::Regex($v.into()) }; + (has_audio) => { $crate::memory::MessageCondition::HasAudio }; ($left:tt && $($rest:tt)+) => { $crate::memory::MessageCondition::All(vec![ diff --git a/src/memory/mod.rs b/src/memory/mod.rs index 9093b0c..1125c96 100644 --- a/src/memory/mod.rs +++ b/src/memory/mod.rs @@ -54,6 +54,8 @@ pub enum MessageCondition { AnyOf(Vec), /// Trigger if message content matches regex Regex(String), + /// Trigger if message contains audio data + HasAudio, } impl MessageCondition { @@ -72,6 +74,7 @@ impl MessageCondition { MessageCondition::All(inner) => inner.iter().all(|c| c.matches(event)), MessageCondition::AnyOf(inner) => inner.iter().any(|c| c.matches(event)), MessageCondition::Regex(regex) => Regex::new(regex).map(|re| re.is_match(&event.msg.content)).unwrap_or(false), + MessageCondition::HasAudio => event.msg.has_audio(), } } } diff --git a/src/memory/sliding_window.rs b/src/memory/sliding_window.rs index 34329a9..b37ab87 100644 --- a/src/memory/sliding_window.rs +++ b/src/memory/sliding_window.rs @@ -178,7 +178,9 @@ impl MemoryProvider for SlidingWindowMemory { limit: Option, ) -> Result, LLMError> { let limit = limit.unwrap_or(self.messages.len()); - Ok(self.recent_messages(limit)) + let mut messages = self.recent_messages(limit); + messages.retain(|m| !m.has_audio()); + Ok(messages) } async fn clear(&mut self) -> Result<(), LLMError> {