RightNow-AI · yaroslavyaroslav · Mar 14, 2026
diff --git a/crates/openfang-cli/src/main.rs b/crates/openfang-cli/src/main.rs
@@ -2101,18 +2101,14 @@ fn cmd_doctor(json: bool, repair: bool) {
                             ui::check_ok(".env file (permissions fixed to 0600)");
                         }
                         repaired = true;
-                    } else {
-                        if !json {
-                            ui::check_warn(&format!(
-                                ".env file has loose permissions ({:o}), should be 0600",
-                                mode
-                            ));
-                        }
-                    }
-                } else {
-                    if !json {
-                        ui::check_ok(".env file");
+                    } else if !json {
+                        ui::check_warn(&format!(
+                            ".env file has loose permissions ({:o}), should be 0600",
+                            mode
+                        ));
                     }
+                } else if !json {
+                    ui::check_ok(".env file");
                 }
             }
             #[cfg(not(unix))]

diff --git a/crates/openfang-hands/bundled/clip/HAND.toml b/crates/openfang-hands/bundled/clip/HAND.toml
@@ -67,6 +67,11 @@ default = "auto"
 value = "auto"
 label = "Auto-detect (best available)"
 
+[[settings.options]]
+value = "parakeet_mlx"
+label = "Parakeet MLX (local)"
+binary = "parakeet-mlx"
+
 [[settings.options]]
 value = "whisper_local"
 label = "Local Whisper"
@@ -310,15 +315,22 @@ curl -s -X POST "https://api.deepgram.com/v1/listen?model=nova-2&smart_format=tr
 ```
 Parse `results.channels[0].alternatives[0].words` for word-level timing.
 
-### Path E: Local Whisper (stt_provider = whisper_local or auto fallback)
+### Path E: Parakeet MLX local pipeline (stt_provider = parakeet_mlx)
+```
+ffmpeg -i source.mp4 -vn -ar 16000 -ac 1 -y audio.wav
+uv run --with parakeet-mlx python3 -c "from parakeet_mlx import from_pretrained; import json, sys; model = from_pretrained('mlx-community/parakeet-tdt-0.6b-v3'); result = model.transcribe(sys.argv[1]); print(json.dumps({'text': result.text}))" audio.wav
+```
+This produces JSON output with the transcript text using the local MLX model.
+
+### Path F: Local Whisper (stt_provider = whisper_local or auto fallback)
 ```
 ffmpeg -i source.mp4 -vn -ar 16000 -ac 1 -y audio.wav
 whisper audio.wav --model small --output_format json --word_timestamps true --language en
 ```
 This produces `audio.json` with segments containing word-level timing.
 If `whisper` is not found, try `whisper-ctranslate2` (same flags, 4x faster).
 
-### Path F: No subtitles, no STT (fallback)
+### Path G: No subtitles, no STT (fallback)
 Fall back to ffmpeg scene detection + silence detection.
 
 Scene detection — run ffmpeg and look for `pts_time:` values in the output:

diff --git a/crates/openfang-hands/src/registry.rs b/crates/openfang-hands/src/registry.rs
@@ -555,6 +555,10 @@ fn check_chromium_available() -> bool {
 
 /// Check if a binary is on PATH (cross-platform).
 fn which_binary(name: &str) -> bool {
+    if name == "parakeet-mlx" {
+        return which_binary("uv");
+    }
+
     let path_var = std::env::var("PATH").unwrap_or_default();
     let separator = if cfg!(windows) { ';' } else { ':' };
     let extensions: Vec<&str> = if cfg!(windows) {
@@ -860,4 +864,9 @@ mod tests {
         };
         assert!(!req.optional);
     }
+
+    #[test]
+    fn parakeet_mlx_option_uses_uv() {
+        assert_eq!(which_binary("parakeet-mlx"), which_binary("uv"));
+    }
 }
diff --git a/crates/openfang-kernel/src/whatsapp_gateway.rs b/crates/openfang-kernel/src/whatsapp_gateway.rs
@@ -270,8 +270,6 @@ mod tests {
 
     #[test]
     fn test_embedded_files_not_empty() {
-        assert!(!GATEWAY_INDEX_JS.is_empty());
-        assert!(!GATEWAY_PACKAGE_JSON.is_empty());
         assert!(GATEWAY_INDEX_JS.contains("WhatsApp"));
         assert!(GATEWAY_PACKAGE_JSON.contains("@openfang/whatsapp-gateway"));
     }

diff --git a/crates/openfang-runtime/src/copilot_oauth.rs b/crates/openfang-runtime/src/copilot_oauth.rs
@@ -150,6 +150,6 @@ mod tests {
     fn test_constants() {
         assert!(GITHUB_DEVICE_CODE_URL.starts_with("https://"));
         assert!(GITHUB_TOKEN_URL.starts_with("https://"));
-        assert!(!COPILOT_CLIENT_ID.is_empty());
+        assert!(COPILOT_CLIENT_ID.starts_with("Iv1."));
     }
 }
diff --git a/crates/openfang-runtime/src/media_understanding.rs b/crates/openfang-runtime/src/media_understanding.rs
@@ -5,8 +5,10 @@
 use openfang_types::media::{
     MediaAttachment, MediaConfig, MediaSource, MediaType, MediaUnderstanding,
 };
+use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use tokio::sync::Semaphore;
+use tokio::time::{timeout, Duration};
 use tracing::info;
 
 /// Media understanding engine.
@@ -75,6 +77,10 @@ impl MediaEngine {
 
         let _permit = self.semaphore.acquire().await.map_err(|e| e.to_string())?;
 
+        if provider == "parakeet-mlx" {
+            return transcribe_with_parakeet_mlx(attachment).await;
+        }
+
         // Derive a proper filename with extension from mime_type
         // (Whisper APIs require an extension to detect format)
         let ext = match attachment.mime_type.as_str() {
@@ -237,6 +243,123 @@ impl MediaEngine {
     }
 }
 
+async fn transcribe_with_parakeet_mlx(
+    attachment: &MediaAttachment,
+) -> Result<MediaUnderstanding, String> {
+    let audio_path = materialize_audio_for_parakeet(attachment).await?;
+
+    let script = r#"
+import json
+import sys
+from parakeet_mlx import from_pretrained
+
+MODEL_ID = "mlx-community/parakeet-tdt-0.6b-v3"
+
+model = from_pretrained(MODEL_ID)
+result = model.transcribe(sys.argv[1])
+print(json.dumps({
+    "text": result.text,
+    "model": MODEL_ID,
+}))
+"#;
+
+    let mut cmd = tokio::process::Command::new("uv");
+    cmd.args([
+        "run",
+        "--with",
+        "parakeet-mlx",
+        "python3",
+        "-c",
+        script,
+        &audio_path.to_string_lossy(),
+    ]);
+    cmd.env("PYTHONUNBUFFERED", "1");
+    cmd.kill_on_drop(true);
+
+    let output = timeout(Duration::from_secs(900), cmd.output())
+        .await
+        .map_err(|_| "parakeet-mlx transcription timed out after 15 minutes".to_string())?
+        .map_err(|e| format!("Failed to launch parakeet-mlx via uv: {e}"))?;
+
+    cleanup_temp_audio(&audio_path, attachment).await;
+
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let detail = if !stderr.trim().is_empty() {
+            stderr.trim()
+        } else {
+            stdout.trim()
+        };
+        return Err(format!("parakeet-mlx transcription failed: {detail}"));
+    }
+
+    let stdout = String::from_utf8(output.stdout)
+        .map_err(|e| format!("parakeet-mlx returned non-UTF8 output: {e}"))?;
+    let parsed: serde_json::Value = serde_json::from_str(stdout.trim())
+        .map_err(|e| format!("Failed to parse parakeet-mlx output: {e}"))?;
+
+    let transcription = parsed["text"]
+        .as_str()
+        .ok_or("parakeet-mlx output missing text field")?
+        .trim()
+        .to_string();
+    if transcription.is_empty() {
+        return Err("parakeet-mlx returned empty transcription".into());
+    }
+
+    let model = parsed["model"]
+        .as_str()
+        .unwrap_or("mlx-community/parakeet-tdt-0.6b-v3");
+
+    Ok(MediaUnderstanding {
+        media_type: MediaType::Audio,
+        description: transcription,
+        provider: "parakeet-mlx".to_string(),
+        model: model.to_string(),
+    })
+}
+
+async fn materialize_audio_for_parakeet(attachment: &MediaAttachment) -> Result<PathBuf, String> {
+    match &attachment.source {
+        MediaSource::FilePath { path } => Ok(PathBuf::from(path)),
+        MediaSource::Base64 { data, mime_type } => {
+            use base64::Engine;
+            let decoded = base64::engine::general_purpose::STANDARD
+                .decode(data)
+                .map_err(|e| format!("Failed to decode base64 audio: {e}"))?;
+            let ext = audio_extension_from_mime(mime_type);
+            let path = std::env::temp_dir()
+                .join(format!("openfang_parakeet_{}.{}", uuid::Uuid::new_v4(), ext));
+            tokio::fs::write(&path, decoded)
+                .await
+                .map_err(|e| format!("Failed to write temp audio file: {e}"))?;
+            Ok(path)
+        }
+        MediaSource::Url { url } => Err(format!(
+            "URL-based audio source not supported for parakeet-mlx transcription: {url}"
+        )),
+    }
+}
+
+async fn cleanup_temp_audio(path: &Path, attachment: &MediaAttachment) {
+    if matches!(attachment.source, MediaSource::Base64 { .. }) {
+        let _ = tokio::fs::remove_file(path).await;
+    }
+}
+
+fn audio_extension_from_mime(mime_type: &str) -> &'static str {
+    match mime_type {
+        "audio/wav" | "audio/x-wav" => "wav",
+        "audio/mpeg" | "audio/mp3" => "mp3",
+        "audio/ogg" => "ogg",
+        "audio/webm" => "webm",
+        "audio/mp4" | "audio/m4a" => "m4a",
+        "audio/flac" => "flac",
+        _ => "wav",
+    }
+}
+
 /// Detect which vision provider is available based on environment variables.
 fn detect_vision_provider() -> Option<&'static str> {
     if std::env::var("ANTHROPIC_API_KEY").is_ok() {
@@ -253,6 +376,15 @@ fn detect_vision_provider() -> Option<&'static str> {
 
 /// Detect which audio transcription provider is available.
 fn detect_audio_provider() -> Option<&'static str> {
+    if std::env::var("OPENFANG_ENABLE_PARAKEET_MLX").is_ok()
+        || std::process::Command::new("uv")
+            .arg("--version")
+            .output()
+            .map(|o| o.status.success())
+            .unwrap_or(false)
+    {
+        return Some("parakeet-mlx");
+    }
     if std::env::var("GROQ_API_KEY").is_ok() {
         return Some("groq");
     }
@@ -275,6 +407,7 @@ fn default_vision_model(provider: &str) -> &str {
 /// Get the default audio model for a provider.
 fn default_audio_model(provider: &str) -> &str {
     match provider {
+        "parakeet-mlx" => "mlx-community/parakeet-tdt-0.6b-v3",
         "groq" => "whisper-large-v3-turbo",
         "openai" => "whisper-1",
         _ => "unknown",
@@ -405,6 +538,10 @@ mod tests {
 
     #[test]
     fn test_default_audio_models() {
+        assert_eq!(
+            default_audio_model("parakeet-mlx"),
+            "mlx-community/parakeet-tdt-0.6b-v3"
+        );
         assert_eq!(default_audio_model("groq"), "whisper-large-v3-turbo");
         assert_eq!(default_audio_model("openai"), "whisper-1");
     }