diff --git a/crates/openfang-cli/src/main.rs b/crates/openfang-cli/src/main.rs index 7bfa88ac7..b9206f006 100644 --- a/crates/openfang-cli/src/main.rs +++ b/crates/openfang-cli/src/main.rs @@ -2101,18 +2101,14 @@ fn cmd_doctor(json: bool, repair: bool) { ui::check_ok(".env file (permissions fixed to 0600)"); } repaired = true; - } else { - if !json { - ui::check_warn(&format!( - ".env file has loose permissions ({:o}), should be 0600", - mode - )); - } - } - } else { - if !json { - ui::check_ok(".env file"); + } else if !json { + ui::check_warn(&format!( + ".env file has loose permissions ({:o}), should be 0600", + mode + )); } + } else if !json { + ui::check_ok(".env file"); } } #[cfg(not(unix))] diff --git a/crates/openfang-hands/bundled/clip/HAND.toml b/crates/openfang-hands/bundled/clip/HAND.toml index ac7958338..09d070ad4 100644 --- a/crates/openfang-hands/bundled/clip/HAND.toml +++ b/crates/openfang-hands/bundled/clip/HAND.toml @@ -67,6 +67,11 @@ default = "auto" value = "auto" label = "Auto-detect (best available)" +[[settings.options]] +value = "parakeet_mlx" +label = "Parakeet MLX (local)" +binary = "parakeet-mlx" + [[settings.options]] value = "whisper_local" label = "Local Whisper" @@ -310,7 +315,14 @@ curl -s -X POST "https://api.deepgram.com/v1/listen?model=nova-2&smart_format=tr ``` Parse `results.channels[0].alternatives[0].words` for word-level timing. -### Path E: Local Whisper (stt_provider = whisper_local or auto fallback) +### Path E: Parakeet MLX local pipeline (stt_provider = parakeet_mlx) +``` +ffmpeg -i source.mp4 -vn -ar 16000 -ac 1 -y audio.wav +uv run --with parakeet-mlx python3 -c "from parakeet_mlx import from_pretrained; import json, sys; model = from_pretrained('mlx-community/parakeet-tdt-0.6b-v3'); result = model.transcribe(sys.argv[1]); print(json.dumps({'text': result.text}))" audio.wav +``` +This produces JSON output with the transcript text using the local MLX model. + +### Path F: Local Whisper (stt_provider = whisper_local or auto fallback) ``` ffmpeg -i source.mp4 -vn -ar 16000 -ac 1 -y audio.wav whisper audio.wav --model small --output_format json --word_timestamps true --language en @@ -318,7 +330,7 @@ whisper audio.wav --model small --output_format json --word_timestamps true --la This produces `audio.json` with segments containing word-level timing. If `whisper` is not found, try `whisper-ctranslate2` (same flags, 4x faster). -### Path F: No subtitles, no STT (fallback) +### Path G: No subtitles, no STT (fallback) Fall back to ffmpeg scene detection + silence detection. Scene detection — run ffmpeg and look for `pts_time:` values in the output: diff --git a/crates/openfang-hands/src/registry.rs b/crates/openfang-hands/src/registry.rs index ada1fe38d..a45608d1c 100644 --- a/crates/openfang-hands/src/registry.rs +++ b/crates/openfang-hands/src/registry.rs @@ -555,6 +555,10 @@ fn check_chromium_available() -> bool { /// Check if a binary is on PATH (cross-platform). fn which_binary(name: &str) -> bool { + if name == "parakeet-mlx" { + return which_binary("uv"); + } + let path_var = std::env::var("PATH").unwrap_or_default(); let separator = if cfg!(windows) { ';' } else { ':' }; let extensions: Vec<&str> = if cfg!(windows) { @@ -860,4 +864,9 @@ mod tests { }; assert!(!req.optional); } + + #[test] + fn parakeet_mlx_option_uses_uv() { + assert_eq!(which_binary("parakeet-mlx"), which_binary("uv")); + } } diff --git a/crates/openfang-kernel/src/whatsapp_gateway.rs b/crates/openfang-kernel/src/whatsapp_gateway.rs index a4214a744..d72c3b4b8 100644 --- a/crates/openfang-kernel/src/whatsapp_gateway.rs +++ b/crates/openfang-kernel/src/whatsapp_gateway.rs @@ -270,8 +270,6 @@ mod tests { #[test] fn test_embedded_files_not_empty() { - assert!(!GATEWAY_INDEX_JS.is_empty()); - assert!(!GATEWAY_PACKAGE_JSON.is_empty()); assert!(GATEWAY_INDEX_JS.contains("WhatsApp")); assert!(GATEWAY_PACKAGE_JSON.contains("@openfang/whatsapp-gateway")); } diff --git a/crates/openfang-runtime/src/copilot_oauth.rs b/crates/openfang-runtime/src/copilot_oauth.rs index b63d69a21..a48f1e895 100644 --- a/crates/openfang-runtime/src/copilot_oauth.rs +++ b/crates/openfang-runtime/src/copilot_oauth.rs @@ -150,6 +150,6 @@ mod tests { fn test_constants() { assert!(GITHUB_DEVICE_CODE_URL.starts_with("https://")); assert!(GITHUB_TOKEN_URL.starts_with("https://")); - assert!(!COPILOT_CLIENT_ID.is_empty()); + assert!(COPILOT_CLIENT_ID.starts_with("Iv1.")); } } diff --git a/crates/openfang-runtime/src/media_understanding.rs b/crates/openfang-runtime/src/media_understanding.rs index b4f7dc1af..abcdec3f2 100644 --- a/crates/openfang-runtime/src/media_understanding.rs +++ b/crates/openfang-runtime/src/media_understanding.rs @@ -5,8 +5,10 @@ use openfang_types::media::{ MediaAttachment, MediaConfig, MediaSource, MediaType, MediaUnderstanding, }; +use std::path::{Path, PathBuf}; use std::sync::Arc; use tokio::sync::Semaphore; +use tokio::time::{timeout, Duration}; use tracing::info; /// Media understanding engine. @@ -75,6 +77,10 @@ impl MediaEngine { let _permit = self.semaphore.acquire().await.map_err(|e| e.to_string())?; + if provider == "parakeet-mlx" { + return transcribe_with_parakeet_mlx(attachment).await; + } + // Derive a proper filename with extension from mime_type // (Whisper APIs require an extension to detect format) let ext = match attachment.mime_type.as_str() { @@ -237,6 +243,123 @@ impl MediaEngine { } } +async fn transcribe_with_parakeet_mlx( + attachment: &MediaAttachment, +) -> Result { + let audio_path = materialize_audio_for_parakeet(attachment).await?; + + let script = r#" +import json +import sys +from parakeet_mlx import from_pretrained + +MODEL_ID = "mlx-community/parakeet-tdt-0.6b-v3" + +model = from_pretrained(MODEL_ID) +result = model.transcribe(sys.argv[1]) +print(json.dumps({ + "text": result.text, + "model": MODEL_ID, +})) +"#; + + let mut cmd = tokio::process::Command::new("uv"); + cmd.args([ + "run", + "--with", + "parakeet-mlx", + "python3", + "-c", + script, + &audio_path.to_string_lossy(), + ]); + cmd.env("PYTHONUNBUFFERED", "1"); + cmd.kill_on_drop(true); + + let output = timeout(Duration::from_secs(900), cmd.output()) + .await + .map_err(|_| "parakeet-mlx transcription timed out after 15 minutes".to_string())? + .map_err(|e| format!("Failed to launch parakeet-mlx via uv: {e}"))?; + + cleanup_temp_audio(&audio_path, attachment).await; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + let stdout = String::from_utf8_lossy(&output.stdout); + let detail = if !stderr.trim().is_empty() { + stderr.trim() + } else { + stdout.trim() + }; + return Err(format!("parakeet-mlx transcription failed: {detail}")); + } + + let stdout = String::from_utf8(output.stdout) + .map_err(|e| format!("parakeet-mlx returned non-UTF8 output: {e}"))?; + let parsed: serde_json::Value = serde_json::from_str(stdout.trim()) + .map_err(|e| format!("Failed to parse parakeet-mlx output: {e}"))?; + + let transcription = parsed["text"] + .as_str() + .ok_or("parakeet-mlx output missing text field")? + .trim() + .to_string(); + if transcription.is_empty() { + return Err("parakeet-mlx returned empty transcription".into()); + } + + let model = parsed["model"] + .as_str() + .unwrap_or("mlx-community/parakeet-tdt-0.6b-v3"); + + Ok(MediaUnderstanding { + media_type: MediaType::Audio, + description: transcription, + provider: "parakeet-mlx".to_string(), + model: model.to_string(), + }) +} + +async fn materialize_audio_for_parakeet(attachment: &MediaAttachment) -> Result { + match &attachment.source { + MediaSource::FilePath { path } => Ok(PathBuf::from(path)), + MediaSource::Base64 { data, mime_type } => { + use base64::Engine; + let decoded = base64::engine::general_purpose::STANDARD + .decode(data) + .map_err(|e| format!("Failed to decode base64 audio: {e}"))?; + let ext = audio_extension_from_mime(mime_type); + let path = std::env::temp_dir() + .join(format!("openfang_parakeet_{}.{}", uuid::Uuid::new_v4(), ext)); + tokio::fs::write(&path, decoded) + .await + .map_err(|e| format!("Failed to write temp audio file: {e}"))?; + Ok(path) + } + MediaSource::Url { url } => Err(format!( + "URL-based audio source not supported for parakeet-mlx transcription: {url}" + )), + } +} + +async fn cleanup_temp_audio(path: &Path, attachment: &MediaAttachment) { + if matches!(attachment.source, MediaSource::Base64 { .. }) { + let _ = tokio::fs::remove_file(path).await; + } +} + +fn audio_extension_from_mime(mime_type: &str) -> &'static str { + match mime_type { + "audio/wav" | "audio/x-wav" => "wav", + "audio/mpeg" | "audio/mp3" => "mp3", + "audio/ogg" => "ogg", + "audio/webm" => "webm", + "audio/mp4" | "audio/m4a" => "m4a", + "audio/flac" => "flac", + _ => "wav", + } +} + /// Detect which vision provider is available based on environment variables. fn detect_vision_provider() -> Option<&'static str> { if std::env::var("ANTHROPIC_API_KEY").is_ok() { @@ -253,6 +376,15 @@ fn detect_vision_provider() -> Option<&'static str> { /// Detect which audio transcription provider is available. fn detect_audio_provider() -> Option<&'static str> { + if std::env::var("OPENFANG_ENABLE_PARAKEET_MLX").is_ok() + || std::process::Command::new("uv") + .arg("--version") + .output() + .map(|o| o.status.success()) + .unwrap_or(false) + { + return Some("parakeet-mlx"); + } if std::env::var("GROQ_API_KEY").is_ok() { return Some("groq"); } @@ -275,6 +407,7 @@ fn default_vision_model(provider: &str) -> &str { /// Get the default audio model for a provider. fn default_audio_model(provider: &str) -> &str { match provider { + "parakeet-mlx" => "mlx-community/parakeet-tdt-0.6b-v3", "groq" => "whisper-large-v3-turbo", "openai" => "whisper-1", _ => "unknown", @@ -405,6 +538,10 @@ mod tests { #[test] fn test_default_audio_models() { + assert_eq!( + default_audio_model("parakeet-mlx"), + "mlx-community/parakeet-tdt-0.6b-v3" + ); assert_eq!(default_audio_model("groq"), "whisper-large-v3-turbo"); assert_eq!(default_audio_model("openai"), "whisper-1"); }