diff --git a/Package.swift b/Package.swift index 1917090..0dde7bb 100644 --- a/Package.swift +++ b/Package.swift @@ -8,10 +8,14 @@ let package = Package( .macOS(.v14) ], products: [ - .executable(name: "MiniWhisper", targets: ["MiniWhisper"]) + .executable(name: "MiniWhisper", targets: ["MiniWhisper"]), + .executable(name: "MiniWhisperDebug", targets: ["MiniWhisperDebug"]) ], dependencies: [ - .package(url: "https://github.com/FluidInference/FluidAudio.git", from: "0.9.1") + .package( + url: "https://github.com/FluidInference/FluidAudio.git", + .upToNextMinor(from: "0.12.6") + ) ], targets: [ .executableTarget( @@ -26,6 +30,17 @@ let package = Package( .enableExperimentalFeature("StrictConcurrency") ] ), + .executableTarget( + name: "MiniWhisperDebug", + dependencies: [ + "FluidAudio", + "whisper" + ], + path: "Sources/MiniWhisperDebug", + swiftSettings: [ + .enableExperimentalFeature("StrictConcurrency") + ] + ), .binaryTarget( name: "whisper", url: "https://github.com/andyhtran/MiniWhisper/releases/download/whisper-xcframework-1.0/whisper.xcframework.zip", diff --git a/README.md b/README.md index 3f90f18..0f97f5d 100644 --- a/README.md +++ b/README.md @@ -60,9 +60,16 @@ just dev just dev # Build, package, and launch just build # Debug build only just release # Release build + .app bundle +just debug-tool # Build local debug transcription CLI just clean # Remove build artifacts ``` +Run the local debug CLI on an existing audio file or recording directory: + +```bash +just debug-transcribe ~/Code/debug-stt/whisper_cpp.wav --engine whisper --preset current-app +``` + ## Release Signing, notarization, and publishing require [`asc`](https://github.com/rudrankriyam/App-Store-Connect-CLI): diff --git a/Scripts/build-app.sh b/Scripts/build-app.sh index 72d1f05..35ee0d6 100755 --- a/Scripts/build-app.sh +++ b/Scripts/build-app.sh @@ -21,7 +21,7 @@ fi echo "Building $APP_NAME ($BUILD_CONFIG)..." 
-swift build -c "$BUILD_CONFIG" +swift build -c "$BUILD_CONFIG" --product "$APP_NAME" echo "Creating app bundle..." rm -rf "$APP_BUNDLE" diff --git a/Sources/MiniWhisper/AppState.swift b/Sources/MiniWhisper/AppState.swift index cfe0b13..ec15f44 100644 --- a/Sources/MiniWhisper/AppState.swift +++ b/Sources/MiniWhisper/AppState.swift @@ -1,6 +1,7 @@ import Foundation import Observation import AppKit +import ServiceManagement import UserNotifications @Observable @@ -37,6 +38,8 @@ final class AppState: Sendable { var isModelDownloading: Bool { whisper.isDownloading } var modelDownloadProgress: Double { whisper.downloadProgress } + var launchAtLoginEnabled: Bool { SMAppService.mainApp.status == .enabled } + var launchAtLoginSupported: Bool { SMAppService.mainApp.status != .notFound } // MARK: - Initialization @@ -154,8 +157,80 @@ final class AppState: Sendable { guard recorder.state.isRecording else { return } stopDurationChecks() onRecordingEnded?() - recorder.cancelRecording() + + let duration = recorder.currentDuration + let sampleRate = recorder.actualSampleRate + let recordingId = currentRecordingId ?? Recording.generateId() currentRecordingId = nil + + guard let audioURL = recorder.stopRecording() else { + recorder.reset() + return + } + recorder.reset() + + let fileSize = (try? FileManager.default.attributesOfItem(atPath: audioURL.path)[.size] as? Int64) ?? 0 + let recording = Recording( + id: recordingId, + createdAt: Date(), + recording: RecordingInfo( + duration: duration, + sampleRate: sampleRate, + channels: 1, + fileSize: fileSize, + inputDevice: recorder.systemDefaultDeviceName + ), + transcription: nil, + configuration: RecordingConfiguration( + voiceModel: transcriptionMode == .english ? 
"Parakeet" : "Whisper", + language: "en" + ), + status: .cancelled + ) + + do { + try recordingStore.saveWithExistingAudio(recording) + } catch { + toast.showError(title: "Cancel Save Failed", message: error.localizedDescription) + } + } + + func retranscribe(_ recording: Recording) { + guard recorder.state.isIdle else { + toast.showError(title: "Busy", message: "Wait for the current recording/transcription to finish.") + return + } + guard recording.canRetranscribe else { + toast.showError(title: "Cannot Re-transcribe", message: "Audio file is no longer available.") + return + } + + recorder.state = .processing + + Task { + await retranscribeCancelledRecording(recording) + } + } + + func setLaunchAtLogin(_ enabled: Bool) { + let service = SMAppService.mainApp + guard service.status != .notFound else { + toast.showError( + title: "Start on Login Unavailable", + message: "This is available only when running the bundled app." + ) + return + } + + do { + if enabled { + try service.register() + } else { + try service.unregister() + } + } catch { + toast.showError(title: "Start on Login Failed", message: error.localizedDescription) + } } // MARK: - Transcription @@ -213,7 +288,8 @@ final class AppState: Sendable { configuration: RecordingConfiguration( voiceModel: result.model, language: result.language - ) + ), + status: .completed ) try recordingStore.saveWithExistingAudio(recording) @@ -243,13 +319,80 @@ final class AppState: Sendable { configuration: RecordingConfiguration( voiceModel: transcriptionMode == .english ? "Parakeet" : "Whisper", language: "en" - ) + ), + status: .failed ) // Use saveMetadataOnly for failed recordings since audio may not exist try? 
recordingStore.saveFailedRecording(recording) } } + private func retranscribeCancelledRecording(_ recording: Recording) async { + do { + let result: TranscriptionResult + switch transcriptionMode { + case .english: + result = try await parakeet.transcribe(audioURL: recording.audioURL) + case .multilingual: + result = try await whisper.transcribe(audioURL: recording.audioURL) + } + + guard recorder.state == .processing else { return } + + guard !result.text.isEmpty else { + recorder.reset() + toast.showError(title: "Empty Transcription", message: "No speech detected in recording.") + return + } + + let finalText: String + if replacementSettings.enabled { + let processor = ReplacementProcessor(rules: replacementSettings.enabledRules) + finalText = processor.apply(to: result.text) + } else { + finalText = result.text + } + + pasteboard.copyAndPaste(finalText) + + let fileSize = (try? FileManager.default.attributesOfItem(atPath: recording.audioURL.path)[.size] as? Int64) ?? 0 + let updatedRecording = Recording( + id: recording.id, + createdAt: recording.createdAt, + recording: RecordingInfo( + duration: recording.recording.duration, + sampleRate: recording.recording.sampleRate, + channels: recording.recording.channels, + fileSize: fileSize, + inputDevice: recording.recording.inputDevice + ), + transcription: RecordingTranscription( + text: finalText, + segments: result.segments, + language: result.language, + model: result.model, + transcriptionDuration: result.duration + ), + configuration: RecordingConfiguration( + voiceModel: result.model, + language: result.language + ), + status: .completed + ) + + try recordingStore.saveWithExistingAudio(updatedRecording) + analyticsStore.record( + duration: recording.recording.duration, + wordCount: result.text.split(separator: " ").count + ) + recorder.reset() + } catch { + guard recorder.state == .processing else { return } + recorder.reset() + toast.showError(title: "Re-transcription Failed", message: 
error.localizedDescription) + } + } + // MARK: - Duration Monitoring private func startDurationChecks() { diff --git a/Sources/MiniWhisper/Models/Recording.swift b/Sources/MiniWhisper/Models/Recording.swift index 7d1fdc0..5043f4a 100644 --- a/Sources/MiniWhisper/Models/Recording.swift +++ b/Sources/MiniWhisper/Models/Recording.swift @@ -1,5 +1,11 @@ import Foundation +enum RecordingStatus: String, Codable, Equatable, Hashable, Sendable { + case completed + case failed + case cancelled +} + struct RecordingInfo: Codable, Equatable, Hashable, Sendable { let duration: TimeInterval let sampleRate: Double @@ -27,6 +33,43 @@ struct Recording: Codable, Identifiable, Equatable, Hashable, Sendable { let recording: RecordingInfo var transcription: RecordingTranscription? let configuration: RecordingConfiguration + var status: RecordingStatus + + init( + id: String, + createdAt: Date, + recording: RecordingInfo, + transcription: RecordingTranscription?, + configuration: RecordingConfiguration, + status: RecordingStatus = .completed + ) { + self.id = id + self.createdAt = createdAt + self.recording = recording + self.transcription = transcription + self.configuration = configuration + self.status = status + } + + enum CodingKeys: String, CodingKey { + case id + case createdAt + case recording + case transcription + case configuration + case status + } + + init(from decoder: any Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + id = try container.decode(String.self, forKey: .id) + createdAt = try container.decode(Date.self, forKey: .createdAt) + recording = try container.decode(RecordingInfo.self, forKey: .recording) + transcription = try container.decodeIfPresent(RecordingTranscription.self, forKey: .transcription) + configuration = try container.decode(RecordingConfiguration.self, forKey: .configuration) + status = try container.decodeIfPresent(RecordingStatus.self, forKey: .status) + ?? (transcription == nil ? 
.failed : .completed) + } var audioURL: URL { storageDirectory.appendingPathComponent("audio.wav") @@ -40,6 +83,10 @@ struct Recording: Codable, Identifiable, Equatable, Hashable, Sendable { FileManager.default.fileExists(atPath: audioURL.path) } + var canRetranscribe: Bool { + status == .cancelled && transcription == nil && hasAudioFile + } + static var baseDirectory: URL { let docs = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first! return docs.appendingPathComponent("MiniWhisper/recordings") diff --git a/Sources/MiniWhisper/Services/RecordingStore.swift b/Sources/MiniWhisper/Services/RecordingStore.swift index c8efbcf..19c26be 100644 --- a/Sources/MiniWhisper/Services/RecordingStore.swift +++ b/Sources/MiniWhisper/Services/RecordingStore.swift @@ -31,12 +31,14 @@ final class RecordingStore: Sendable { try saveMetadata(recording) try saveTranscriptionFiles(recording) + recordings.removeAll { $0.id == recording.id } recordings.insert(recording, at: 0) performRetention() } func saveFailedRecording(_ recording: Recording) throws { try saveMetadata(recording) + recordings.removeAll { $0.id == recording.id } recordings.insert(recording, at: 0) performRetention() } @@ -77,6 +79,13 @@ final class RecordingStore: Sendable { Array(recordings.prefix(3)) } + var recentHistoryItems: [Recording] { + let filtered = recordings.filter { recording in + recording.transcription != nil || recording.status == .cancelled + } + return Array(filtered.prefix(3)) + } + // MARK: - Retention func performRetention() { diff --git a/Sources/MiniWhisper/Services/WhisperProvider.swift b/Sources/MiniWhisper/Services/WhisperProvider.swift index 23451d6..0d86b0c 100644 --- a/Sources/MiniWhisper/Services/WhisperProvider.swift +++ b/Sources/MiniWhisper/Services/WhisperProvider.swift @@ -2,7 +2,13 @@ import Foundation @preconcurrency import AVFoundation import whisper +enum WhisperLanguageMode: Sendable { + case auto + case fixed(String) +} + struct 
WhisperTranscriptionOptions: Sendable { + let language: WhisperLanguageMode let detectLanguage: Bool let noTimestamps: Bool let singleSegment: Bool @@ -10,8 +16,9 @@ struct WhisperTranscriptionOptions: Sendable { static func `default`() -> WhisperTranscriptionOptions { WhisperTranscriptionOptions( + language: .fixed("en"), detectLanguage: false, - noTimestamps: true, + noTimestamps: false, singleSegment: false, threadCount: max(1, Int32(ProcessInfo.processInfo.activeProcessorCount - 2)) ) @@ -51,7 +58,19 @@ final class WhisperContext: @unchecked Sendable { func transcribe(samples: [Float]) -> (text: String, language: String) { let options = Self.transcriptionOptions() var params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY) - params.language = nil + var languageCString: UnsafeMutablePointer? + switch options.language { + case .auto: + params.language = nil + case .fixed(let language): + languageCString = strdup(language) + params.language = languageCString.map { UnsafePointer($0) } + } + defer { + if let languageCString { + free(languageCString) + } + } params.detect_language = options.detectLanguage params.print_special = false params.print_progress = false diff --git a/Sources/MiniWhisper/Views/MenuBarView.swift b/Sources/MiniWhisper/Views/MenuBarView.swift index debd273..ee5fc69 100644 --- a/Sources/MiniWhisper/Views/MenuBarView.swift +++ b/Sources/MiniWhisper/Views/MenuBarView.swift @@ -364,6 +364,7 @@ private struct FooterBarView: View { @State private var showHistory = false @State private var showReplacements = false @State private var showModelPicker = false + @State private var showLaunchAtLogin = false var body: some View { HStack(spacing: 16) { @@ -423,11 +424,25 @@ private struct FooterBarView: View { .frame(width: 28, height: 28) } .buttonStyle(.plain) - .help("Recent Transcriptions") + .help("Recent History") .popover(isPresented: $showHistory, arrowEdge: .bottom) { HistoryPopoverView() } + Button { + showLaunchAtLogin.toggle() + } label: { + 
Image(systemName: appState.launchAtLoginEnabled ? "power.circle.fill" : "power.circle") + .font(.system(size: 14)) + .foregroundColor(appState.launchAtLoginEnabled ? .accentColor : .secondary) + .frame(width: 28, height: 28) + } + .buttonStyle(.plain) + .help("Start on Login") + .popover(isPresented: $showLaunchAtLogin, arrowEdge: .bottom) { + LaunchAtLoginPopoverView() + } + Button { NSApplication.shared.terminate(nil) } label: { @@ -444,6 +459,39 @@ private struct FooterBarView: View { } } +private struct LaunchAtLoginPopoverView: View { + @Environment(AppState.self) private var appState + + var body: some View { + VStack(alignment: .leading, spacing: 10) { + Text("Launch at Login") + .font(.system(size: 11, weight: .semibold)) + .foregroundColor(.secondary) + .textCase(.uppercase) + .tracking(0.5) + + if appState.launchAtLoginSupported { + Toggle( + "Start MiniWhisper when you log in", + isOn: Binding( + get: { appState.launchAtLoginEnabled }, + set: { appState.setLaunchAtLogin($0) } + ) + ) + .toggleStyle(.switch) + .font(.system(size: 13)) + } else { + Text("Unavailable in this runtime. 
Build/run the bundled app to enable login item registration.") + .font(.system(size: 12)) + .foregroundColor(.secondary) + .fixedSize(horizontal: false, vertical: true) + } + } + .padding(12) + .frame(width: 280) + } +} + // MARK: - History Popover private struct HistoryPopoverView: View { @@ -458,8 +506,8 @@ private struct HistoryPopoverView: View { .tracking(0.5) .padding(.horizontal, 10) - if appState.recordingStore.recentRecordings.isEmpty { - Text("No recordings yet") + if appState.recordingStore.recentHistoryItems.isEmpty { + Text("No recent transcripts") .font(.system(size: 13)) .foregroundColor(.secondary.opacity(0.7)) .italic() @@ -467,8 +515,8 @@ private struct HistoryPopoverView: View { .padding(.horizontal, 10) } else { VStack(spacing: 2) { - ForEach(appState.recordingStore.recentRecordings) { recording in - HistoryPopoverRow(recording: recording, pasteboard: appState.pasteboard) + ForEach(appState.recordingStore.recentHistoryItems) { recording in + HistoryPopoverRow(recording: recording) } } } @@ -479,35 +527,53 @@ private struct HistoryPopoverView: View { } private struct HistoryPopoverRow: View { + @Environment(AppState.self) private var appState let recording: Recording - let pasteboard: PasteboardService @State private var copied = false @State private var isHovering = false var body: some View { - Button { + Group { if let text = recording.transcription?.text { - pasteboard.copy(text) - copied = true - DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) { - copied = false + Button { + appState.pasteboard.copy(text) + copied = true + DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) { + copied = false + } + } label: { + rowContent } + .buttonStyle(.plain) + } else { + rowContent } - } label: { - HStack(spacing: 8) { - VStack(alignment: .leading, spacing: 3) { - Text(recording.transcription?.text ?? "No transcription") - .font(.system(size: 13)) - .lineLimit(2) - .foregroundColor(recording.transcription != nil ? 
.primary : .secondary) + } + .animation(.easeInOut(duration: 0.12), value: isHovering) + .animation(.spring(response: 0.25, dampingFraction: 0.7), value: copied) + .onHover { hovering in + withAnimation(.easeInOut(duration: 0.12)) { + isHovering = hovering + } + } + } - Text(formatDate(recording.createdAt)) - .font(.system(size: 10)) - .foregroundColor(.secondary.opacity(0.7)) - } + private var rowContent: some View { + HStack(spacing: 8) { + VStack(alignment: .leading, spacing: 3) { + Text(primaryText) + .font(.system(size: 13)) + .lineLimit(2) + .foregroundColor(recording.transcription != nil ? .primary : .secondary) - Spacer(minLength: 12) + Text(formatDate(recording.createdAt)) + .font(.system(size: 10)) + .foregroundColor(.secondary.opacity(0.7)) + } + Spacer(minLength: 12) + + if recording.transcription != nil { if copied { Image(systemName: "checkmark.circle.fill") .foregroundColor(.green) @@ -519,23 +585,43 @@ private struct HistoryPopoverRow: View { .foregroundColor(.secondary) .transition(.opacity.combined(with: .scale(scale: 0.8))) } + } else if recording.status == .cancelled { + if recording.canRetranscribe { + Button("Re-transcribe") { + appState.retranscribe(recording) + } + .buttonStyle(.plain) + .font(.system(size: 11, weight: .medium)) + .foregroundColor(isReTranscribeDisabled ? .secondary : .accentColor) + .disabled(isReTranscribeDisabled) + } else { + Text("Audio expired") + .font(.system(size: 11)) + .foregroundColor(.secondary) + } } - .padding(.horizontal, 10) - .padding(.vertical, 8) - .background( - RoundedRectangle(cornerRadius: 10) - .fill(isHovering ? 
Color.primary.opacity(0.06) : Color.clear) - ) - .contentShape(Rectangle()) } - .buttonStyle(.plain) - .animation(.easeInOut(duration: 0.12), value: isHovering) - .animation(.spring(response: 0.25, dampingFraction: 0.7), value: copied) - .onHover { hovering in - withAnimation(.easeInOut(duration: 0.12)) { - isHovering = hovering - } + .padding(.horizontal, 10) + .padding(.vertical, 8) + .background( + RoundedRectangle(cornerRadius: 10) + .fill(isHovering ? Color.primary.opacity(0.06) : Color.clear) + ) + .contentShape(Rectangle()) + } + + private var primaryText: String { + if let text = recording.transcription?.text { + return text + } + if recording.status == .cancelled { + return "Canceled recording" } + return "No transcription" + } + + private var isReTranscribeDisabled: Bool { + recording.canRetranscribe == false || appState.recorder.state.isRecording || appState.recorder.state == .processing } private func formatDate(_ date: Date) -> String { diff --git a/Sources/MiniWhisperDebug/main.swift b/Sources/MiniWhisperDebug/main.swift new file mode 100644 index 0000000..26bf1d4 --- /dev/null +++ b/Sources/MiniWhisperDebug/main.swift @@ -0,0 +1,919 @@ +import AVFoundation +import Darwin +@preconcurrency import FluidAudio +import Foundation +import whisper + +private struct WordTiming: Codable, Equatable, Hashable, Sendable { + let word: String + let start: TimeInterval + let end: TimeInterval + let probability: Float +} + +private struct TranscriptionSegment: Codable, Equatable, Hashable, Sendable { + let start: TimeInterval + let end: TimeInterval + let text: String + let words: [WordTiming]? 
+} + +private struct TranscriptionResult: Sendable { + let text: String + let segments: [TranscriptionSegment] + let language: String + let duration: TimeInterval + let model: String +} + +private enum CLIError: LocalizedError { + case usage(String) + case missingValue(String) + case invalidValue(flag: String, value: String) + case invalidInputPath(String) + case invalidPresetForEngine(engine: DebugEngine, preset: DebugPreset) + case modelLoadFailed + case resampleFailed + case transcriptionFailed(Int32) + + var errorDescription: String? { + switch self { + case .usage(let message): + return message + case .missingValue(let flag): + return "Missing value for \(flag)" + case .invalidValue(let flag, let value): + return "Invalid value '\(value)' for \(flag)" + case .invalidInputPath(let path): + return "Input path does not exist or is unsupported: \(path)" + case .invalidPresetForEngine(let engine, let preset): + return "Preset '\(preset.rawValue)' is not valid for engine '\(engine.rawValue)'" + case .modelLoadFailed: + return "Failed to load Whisper model" + case .resampleFailed: + return "Failed to resample audio to 16kHz" + case .transcriptionFailed(let code): + return "Whisper transcription failed with status \(code)" + } + } +} + +private enum DebugEngine: String { + case whisper + case parakeet +} + +private enum DebugPreset: String { + case currentApp = "current-app" + case candidateFix = "candidate-fix" + case nanovoxLike = "nanovox-like" + case `default` +} + +private enum WhisperLanguageMode: Sendable { + case auto + case fixed(String) +} + +private struct WhisperTranscriptionOptions: Sendable { + var language: WhisperLanguageMode + var detectLanguage: Bool + var noTimestamps: Bool + var singleSegment: Bool + var tokenTimestamps: Bool + var splitOnWord: Bool + var maxLen: Int32 + var threadCount: Int32 + + static func appDefault() -> WhisperTranscriptionOptions { + WhisperTranscriptionOptions( + language: .auto, + detectLanguage: false, + noTimestamps: 
true, + singleSegment: false, + tokenTimestamps: false, + splitOnWord: false, + maxLen: 0, + threadCount: max(1, Int32(ProcessInfo.processInfo.activeProcessorCount - 2)) + ) + } + + static func candidateFixDefault() -> WhisperTranscriptionOptions { + WhisperTranscriptionOptions( + language: .fixed("en"), + detectLanguage: false, + noTimestamps: false, + singleSegment: false, + tokenTimestamps: false, + splitOnWord: false, + maxLen: 0, + threadCount: max(1, Int32(ProcessInfo.processInfo.activeProcessorCount - 2)) + ) + } + + static func nanoVoxLike() -> WhisperTranscriptionOptions { + WhisperTranscriptionOptions( + language: .fixed("en"), + detectLanguage: false, + noTimestamps: false, + singleSegment: false, + tokenTimestamps: true, + splitOnWord: false, + maxLen: 1, + threadCount: max(1, Int32(ProcessInfo.processInfo.activeProcessorCount - 2)) + ) + } +} + +private struct ParsedArgs { + let inputPath: String + let engine: DebugEngine + let preset: DebugPreset + let outputDir: URL? + let noTimestamps: Bool? + let tokenTimestamps: Bool? + let splitOnWord: Bool? + let maxLen: Int32? + let detectLanguage: Bool? + let singleSegment: Bool? + let language: WhisperLanguageMode? +} + +private struct ResolvedInput { + let sourceURL: URL + let audioURL: URL + let sourceKind: String +} + +private struct SegmentArtifact: Codable { + let totalDuration: TimeInterval + let wordTimestampsEnabled: Bool + let segments: [TranscriptionSegment] +} + +private struct DebugRunArtifact: Codable { + let createdAt: Date + let engine: String + let preset: String + let sourcePath: String + let sourceKind: String + let audioPath: String + let outputDir: String + let overrides: [String: String] + let model: String +} + +private struct DebugMetadataArtifact: Codable { + struct RecordingInfo: Codable { + let duration: TimeInterval + let sampleRate: Double + let channels: Int + let fileSize: Int64 + let inputDevice: String? 
+ } + + struct TranscriptionInfo: Codable { + let text: String + let language: String + let transcriptionDuration: TimeInterval + let segments: [TranscriptionSegment] + let model: String + } + + struct Configuration: Codable { + let voiceModel: String + let language: String + } + + let id: String + let createdAt: Date + let recording: RecordingInfo + let transcription: TranscriptionInfo + let configuration: Configuration +} + +private final class AudioBufferInputState: @unchecked Sendable { + let buffer: AVAudioPCMBuffer + var consumed = false + + init(buffer: AVAudioPCMBuffer) { + self.buffer = buffer + } +} + +private struct WhisperDecodeResult: Sendable { + let text: String + let language: String + let segments: [TranscriptionSegment] +} + +private final class WhisperContext { + private let ctx: OpaquePointer + + private init(ctx: OpaquePointer) { + self.ctx = ctx + } + + static func load(from path: String) throws -> WhisperContext { + var contextParams = whisper_context_default_params() + contextParams.use_gpu = true + + guard let ctx = whisper_init_from_file_with_params(path, contextParams) else { + throw CLIError.modelLoadFailed + } + + return WhisperContext(ctx: ctx) + } + + func transcribe(samples: [Float], options: WhisperTranscriptionOptions) throws -> WhisperDecodeResult { + var params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY) + var languageCString: UnsafeMutablePointer? 
+ switch options.language { + case .auto: + params.language = nil + case .fixed(let language): + languageCString = strdup(language) + params.language = UnsafePointer(languageCString) + } + defer { + if let languageCString { + free(languageCString) + } + } + + params.detect_language = options.detectLanguage + params.print_special = false + params.print_progress = false + params.print_realtime = false + params.print_timestamps = false + params.no_timestamps = options.noTimestamps + params.single_segment = options.singleSegment + params.token_timestamps = options.tokenTimestamps + params.split_on_word = options.splitOnWord + params.max_len = options.maxLen + params.n_threads = options.threadCount + + let resultCode = samples.withUnsafeBufferPointer { ptr in + whisper_full(ctx, params, ptr.baseAddress, Int32(ptr.count)) + } + + guard resultCode == 0 else { + throw CLIError.transcriptionFailed(resultCode) + } + + let segmentCount = whisper_full_n_segments(ctx) + var text = "" + var segments: [TranscriptionSegment] = [] + + for i in 0..? + + private static let modelFileName = "ggml-large-v3-turbo-q5_0.bin" + private static let modelURL = URL(string: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q5_0.bin")! + + static var modelsDirectory: URL { + let appSupport = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first! 
+ return appSupport.appendingPathComponent("MiniWhisper/models") + } + + static var modelPath: URL { + modelsDirectory.appendingPathComponent(modelFileName) + } + + func initialize() async throws { + if context != nil { return } + + if let task = initTask { + try await task.value + return + } + + let task = Task { + try await Self.ensureModelExists() + context = try WhisperContext.load(from: Self.modelPath.path) + } + initTask = task + + do { + try await task.value + } catch { + initTask = nil + throw error + } + } + + func transcribe(audioURL: URL, options: WhisperTranscriptionOptions) async throws -> TranscriptionResult { + if context == nil { + try await initialize() + } + + guard let context else { + throw CLIError.modelLoadFailed + } + + let samples = try resampleTo16kHz(audioURL: audioURL) + let audioDuration = Double(samples.count) / 16_000.0 + + let decode = try context.transcribe(samples: samples, options: options) + let trimmed = decode.text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines) + var segments = decode.segments + if segments.isEmpty && !trimmed.isEmpty { + segments = [TranscriptionSegment(start: 0, end: audioDuration, text: trimmed, words: nil)] + } + + return TranscriptionResult( + text: trimmed, + segments: segments, + language: decode.language, + duration: segments.last?.end ?? audioDuration, + model: "whisper-large-v3-turbo" + ) + } + + private static func ensureModelExists() async throws { + let fm = FileManager.default + if fm.fileExists(atPath: modelPath.path) { + return + } + + try fm.createDirectory(at: modelsDirectory, withIntermediateDirectories: true) + let (downloadedURL, response) = try await URLSession.shared.download(from: modelURL) + guard let http = response as? HTTPURLResponse, (200 ... 299).contains(http.statusCode) else { + throw CLIError.modelLoadFailed + } + + let temporary = modelPath.appendingPathExtension("download") + try? 
fm.removeItem(at: temporary) + if fm.fileExists(atPath: modelPath.path) { + return + } + try fm.moveItem(at: downloadedURL, to: temporary) + try fm.moveItem(at: temporary, to: modelPath) + } + + private func resampleTo16kHz(audioURL: URL) throws -> [Float] { + let audioFile = try AVAudioFile(forReading: audioURL) + + guard let inputFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: audioFile.fileFormat.sampleRate, + channels: 1, + interleaved: false + ) else { + throw CLIError.resampleFailed + } + + guard let outputFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: 16000, + channels: 1, + interleaved: false + ) else { + throw CLIError.resampleFailed + } + + let frameCount = AVAudioFrameCount(audioFile.length) + guard let inputBuffer = AVAudioPCMBuffer(pcmFormat: inputFormat, frameCapacity: frameCount) else { + throw CLIError.resampleFailed + } + + if audioFile.fileFormat.channelCount != 1 || audioFile.fileFormat.commonFormat != .pcmFormatFloat32 { + guard let converter = AVAudioConverter(from: audioFile.fileFormat, to: inputFormat) else { + throw CLIError.resampleFailed + } + guard let readBuffer = AVAudioPCMBuffer(pcmFormat: audioFile.fileFormat, frameCapacity: frameCount) else { + throw CLIError.resampleFailed + } + + try audioFile.read(into: readBuffer) + inputBuffer.frameLength = frameCount + + let state = AudioBufferInputState(buffer: readBuffer) + var conversionError: NSError? 
+ converter.convert(to: inputBuffer, error: &conversionError) { _, outStatus in + if state.consumed { + outStatus.pointee = .endOfStream + return nil + } + state.consumed = true + outStatus.pointee = .haveData + return state.buffer + } + if let conversionError { + throw conversionError + } + } else { + try audioFile.read(into: inputBuffer) + } + + if audioFile.fileFormat.sampleRate == 16000 { + guard let ptr = inputBuffer.floatChannelData?[0] else { + throw CLIError.resampleFailed + } + return Array(UnsafeBufferPointer(start: ptr, count: Int(inputBuffer.frameLength))) + } + + guard let resampler = AVAudioConverter(from: inputFormat, to: outputFormat) else { + throw CLIError.resampleFailed + } + + let ratio = 16000.0 / audioFile.fileFormat.sampleRate + let outputFrameCount = AVAudioFrameCount(Double(inputBuffer.frameLength) * ratio) + guard let outputBuffer = AVAudioPCMBuffer(pcmFormat: outputFormat, frameCapacity: outputFrameCount) else { + throw CLIError.resampleFailed + } + + let state = AudioBufferInputState(buffer: inputBuffer) + var resampleError: NSError? + resampler.convert(to: outputBuffer, error: &resampleError) { _, outStatus in + if state.consumed { + outStatus.pointee = .endOfStream + return nil + } + state.consumed = true + outStatus.pointee = .haveData + return state.buffer + } + if let resampleError { + throw resampleError + } + + guard let ptr = outputBuffer.floatChannelData?[0] else { + throw CLIError.resampleFailed + } + return Array(UnsafeBufferPointer(start: ptr, count: Int(outputBuffer.frameLength))) + } +} + +@MainActor +private final class ParakeetEngine { + private var asrManager: AsrManager? + private var initTask: Task? 
+ + func initialize() async throws { + if asrManager != nil { return } + + if let task = initTask { + try await task.value + return + } + + let task = Task { + let models = try await AsrModels.downloadAndLoad(version: .v3) + let manager = AsrManager(config: .default) + try await manager.initialize(models: models) + asrManager = manager + } + initTask = task + + do { + try await task.value + } catch { + initTask = nil + throw error + } + } + + func transcribe(audioURL: URL) async throws -> TranscriptionResult { + if asrManager == nil { + try await initialize() + } + + guard let manager = asrManager else { + throw CLIError.modelLoadFailed + } + + let result = try await manager.transcribe(audioURL, source: .microphone) + let segments = convertToSegments(result) + + return TranscriptionResult( + text: result.text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines), + segments: segments, + language: "en", + duration: segments.last?.end ?? result.duration, + model: "parakeet-tdt-v3" + ) + } + + private func convertToSegments(_ result: ASRResult) -> [TranscriptionSegment] { + guard let tokenTimings = result.tokenTimings, !tokenTimings.isEmpty else { + return [TranscriptionSegment( + start: 0, + end: result.duration, + text: result.text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines), + words: nil + )] + } + + var words: [WordTiming] = [] + var currentWord = "" + var wordStart: TimeInterval = 0 + var wordEnd: TimeInterval = 0 + var confidences: [Float] = [] + + for timing in tokenTimings { + let token = timing.token + let startsNewWord = token.hasPrefix(" ") || token.hasPrefix("▁") || (words.isEmpty && currentWord.isEmpty) + + if startsNewWord && !currentWord.isEmpty { + let average = confidences.isEmpty ? 
1.0 : confidences.reduce(0, +) / Float(confidences.count) + words.append(WordTiming(word: currentWord, start: wordStart, end: wordEnd, probability: average)) + currentWord = "" + confidences = [] + } + + let clean = token.trimmingCharacters(in: CharacterSet(charactersIn: " ▁")) + if currentWord.isEmpty { + wordStart = timing.startTime + } + currentWord += clean + wordEnd = timing.endTime + confidences.append(timing.confidence) + } + + if !currentWord.isEmpty { + let average = confidences.isEmpty ? 1.0 : confidences.reduce(0, +) / Float(confidences.count) + words.append(WordTiming(word: currentWord, start: wordStart, end: wordEnd, probability: average)) + } + + if words.isEmpty { + return [] + } + + return [TranscriptionSegment( + start: words.first?.start ?? 0, + end: words.last?.end ?? result.duration, + text: result.text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines), + words: words + )] + } +} + +@main +struct MiniWhisperDebugCLI { + static func main() async { + do { + let parsed = try parseArgs(Array(CommandLine.arguments.dropFirst())) + let resolvedInput = try resolveInput(parsed.inputPath) + let outputDir = parsed.outputDir ?? 
defaultOutputDir( + input: resolvedInput, + engine: parsed.engine, + preset: parsed.preset + ) + + try FileManager.default.createDirectory(at: outputDir, withIntermediateDirectories: true) + let transcription = try await runTranscription(parsed: parsed, input: resolvedInput) + let audioInfo = try loadAudioInfo(audioURL: resolvedInput.audioURL) + + try writeArtifacts( + outputDir: outputDir, + input: resolvedInput, + transcription: transcription, + audioInfo: audioInfo, + parsed: parsed + ) + + print("transcription complete") + print("engine: \(parsed.engine.rawValue)") + print("preset: \(parsed.preset.rawValue)") + print("audio: \(resolvedInput.audioURL.path)") + print("output: \(outputDir.path)") + } catch { + FileHandle.standardError.write(Data("error: \(error.localizedDescription)\n".utf8)) + exit(1) + } + } + + private static func parseArgs(_ args: [String]) throws -> ParsedArgs { + guard args.first == "transcribe" else { + throw CLIError.usage(usageText) + } + + var inputPath: String? + var engine: DebugEngine = .whisper + var preset: DebugPreset? + var outputDir: URL? + var noTimestamps: Bool? + var tokenTimestamps: Bool? + var splitOnWord: Bool? + var maxLen: Int32? + var detectLanguage: Bool? + var singleSegment: Bool? + var language: WhisperLanguageMode? 
+ + var index = 1 + while index < args.count { + let arg = args[index] + if arg.hasPrefix("--") { + switch arg { + case "--engine": + let value = try parseStringValue(args, at: &index, flag: arg) + guard let parsed = DebugEngine(rawValue: value) else { + throw CLIError.invalidValue(flag: arg, value: value) + } + engine = parsed + case "--preset": + let value = try parseStringValue(args, at: &index, flag: arg) + guard let parsed = DebugPreset(rawValue: value) else { + throw CLIError.invalidValue(flag: arg, value: value) + } + preset = parsed + case "--output-dir": + let value = try parseStringValue(args, at: &index, flag: arg) + outputDir = URL(fileURLWithPath: (value as NSString).expandingTildeInPath) + case "--no-timestamps": + noTimestamps = try parseBoolValue(args, at: &index, flag: arg) + case "--token-timestamps": + tokenTimestamps = try parseBoolValue(args, at: &index, flag: arg) + case "--split-on-word": + splitOnWord = try parseBoolValue(args, at: &index, flag: arg) + case "--max-len": + let value = try parseStringValue(args, at: &index, flag: arg) + guard let parsed = Int32(value) else { + throw CLIError.invalidValue(flag: arg, value: value) + } + maxLen = parsed + case "--detect-language": + detectLanguage = try parseBoolValue(args, at: &index, flag: arg) + case "--single-segment": + singleSegment = try parseBoolValue(args, at: &index, flag: arg) + case "--language": + let value = try parseStringValue(args, at: &index, flag: arg) + if value == "auto" { + language = .auto + } else { + language = .fixed(value) + } + case "--help": + throw CLIError.usage(usageText) + default: + throw CLIError.invalidValue(flag: "flag", value: arg) + } + } else if inputPath == nil { + inputPath = arg + } else { + throw CLIError.usage(usageText) + } + index += 1 + } + + guard let inputPath else { + throw CLIError.usage(usageText) + } + + let effectivePreset = preset ?? (engine == .whisper ? 
.currentApp : .default) + if engine == .parakeet && effectivePreset != .default { + throw CLIError.invalidPresetForEngine(engine: engine, preset: effectivePreset) + } + + return ParsedArgs( + inputPath: inputPath, + engine: engine, + preset: effectivePreset, + outputDir: outputDir, + noTimestamps: noTimestamps, + tokenTimestamps: tokenTimestamps, + splitOnWord: splitOnWord, + maxLen: maxLen, + detectLanguage: detectLanguage, + singleSegment: singleSegment, + language: language + ) + } + + private static func parseStringValue(_ args: [String], at index: inout Int, flag: String) throws -> String { + let next = index + 1 + guard next < args.count else { + throw CLIError.missingValue(flag) + } + index = next + return args[next] + } + + private static func parseBoolValue(_ args: [String], at index: inout Int, flag: String) throws -> Bool { + let value = try parseStringValue(args, at: &index, flag: flag).lowercased() + switch value { + case "1", "true", "yes", "y": + return true + case "0", "false", "no", "n": + return false + default: + throw CLIError.invalidValue(flag: flag, value: value) + } + } + + private static func resolveInput(_ rawPath: String) throws -> ResolvedInput { + let expandedPath = (rawPath as NSString).expandingTildeInPath + let path = URL(fileURLWithPath: expandedPath) + var isDirectory: ObjCBool = false + + guard FileManager.default.fileExists(atPath: path.path, isDirectory: &isDirectory) else { + throw CLIError.invalidInputPath(rawPath) + } + + if isDirectory.boolValue { + let audioURL = path.appendingPathComponent("audio.wav") + guard FileManager.default.fileExists(atPath: audioURL.path) else { + throw CLIError.invalidInputPath(rawPath) + } + return ResolvedInput(sourceURL: path, audioURL: audioURL, sourceKind: "recording_directory") + } + + return ResolvedInput(sourceURL: path, audioURL: path, sourceKind: "audio_file") + } + + private static func defaultOutputDir(input: ResolvedInput, engine: DebugEngine, preset: DebugPreset) -> URL { + let 
baseDir: URL + if input.sourceKind == "recording_directory" { + baseDir = input.sourceURL.appendingPathComponent("debug-runs", isDirectory: true) + } else { + baseDir = input.sourceURL.deletingLastPathComponent().appendingPathComponent("debug-runs", isDirectory: true) + } + + let formatter = DateFormatter() + formatter.locale = Locale(identifier: "en_US_POSIX") + formatter.dateFormat = "yyyyMMdd-HHmmss" + let stamp = formatter.string(from: Date()) + return baseDir.appendingPathComponent("\(stamp)-\(engine.rawValue)-\(preset.rawValue)", isDirectory: true) + } + + @MainActor + private static func runTranscription(parsed: ParsedArgs, input: ResolvedInput) async throws -> TranscriptionResult { + switch parsed.engine { + case .parakeet: + let engine = ParakeetEngine() + return try await engine.transcribe(audioURL: input.audioURL) + case .whisper: + let engine = WhisperEngine() + var options = whisperOptions(for: parsed.preset) + if let noTimestamps = parsed.noTimestamps { + options.noTimestamps = noTimestamps + } + if let tokenTimestamps = parsed.tokenTimestamps { + options.tokenTimestamps = tokenTimestamps + } + if let splitOnWord = parsed.splitOnWord { + options.splitOnWord = splitOnWord + } + if let maxLen = parsed.maxLen { + options.maxLen = maxLen + } + if let detectLanguage = parsed.detectLanguage { + options.detectLanguage = detectLanguage + } + if let singleSegment = parsed.singleSegment { + options.singleSegment = singleSegment + } + if let language = parsed.language { + options.language = language + } + return try await engine.transcribe(audioURL: input.audioURL, options: options) + } + } + + private static func whisperOptions(for preset: DebugPreset) -> WhisperTranscriptionOptions { + switch preset { + case .currentApp: + return .appDefault() + case .candidateFix: + return .candidateFixDefault() + case .nanovoxLike: + return .nanoVoxLike() + case .default: + return .appDefault() + } + } + + private static func loadAudioInfo(audioURL: URL) throws -> (duration: 
TimeInterval, sampleRate: Double, channels: Int, fileSize: Int64) { + let audioFile = try AVAudioFile(forReading: audioURL) + let duration = Double(audioFile.length) / audioFile.fileFormat.sampleRate + let channels = Int(audioFile.fileFormat.channelCount) + let sampleRate = audioFile.fileFormat.sampleRate + let fileSize = (try? FileManager.default.attributesOfItem(atPath: audioURL.path)[.size] as? Int64) ?? 0 + return (duration, sampleRate, channels, fileSize) + } + + private static func writeArtifacts( + outputDir: URL, + input: ResolvedInput, + transcription: TranscriptionResult, + audioInfo: (duration: TimeInterval, sampleRate: Double, channels: Int, fileSize: Int64), + parsed: ParsedArgs + ) throws { + try transcription.text.write(to: outputDir.appendingPathComponent("transcript.txt"), atomically: true, encoding: .utf8) + + let segments = SegmentArtifact( + totalDuration: audioInfo.duration, + wordTimestampsEnabled: transcription.segments.contains { $0.words?.isEmpty == false }, + segments: transcription.segments + ) + try writeJSON(segments, to: outputDir.appendingPathComponent("segments.json")) + + let recordingID = String(Int(Date().timeIntervalSince1970 * 1000)) + let metadata = DebugMetadataArtifact( + id: recordingID, + createdAt: Date(), + recording: .init( + duration: audioInfo.duration, + sampleRate: audioInfo.sampleRate, + channels: audioInfo.channels, + fileSize: audioInfo.fileSize, + inputDevice: nil + ), + transcription: .init( + text: transcription.text, + language: transcription.language, + transcriptionDuration: transcription.duration, + segments: transcription.segments, + model: transcription.model + ), + configuration: .init( + voiceModel: transcription.model, + language: transcription.language + ) + ) + try writeJSON(metadata, to: outputDir.appendingPathComponent("metadata.json")) + + var overrides: [String: String] = [:] + if let value = parsed.noTimestamps { overrides["no_timestamps"] = String(value) } + if let value = 
parsed.tokenTimestamps { overrides["token_timestamps"] = String(value) }
+        if let value = parsed.splitOnWord { overrides["split_on_word"] = String(value) }
+        if let value = parsed.maxLen { overrides["max_len"] = String(value) }
+        if let value = parsed.detectLanguage { overrides["detect_language"] = String(value) }
+        if let value = parsed.singleSegment { overrides["single_segment"] = String(value) }
+        if let value = parsed.language {
+            switch value {
+            case .auto:
+                overrides["language"] = "auto"
+            case .fixed(let language):
+                overrides["language"] = language
+            }
+        }
+
+        let run = DebugRunArtifact(
+            createdAt: Date(),
+            engine: parsed.engine.rawValue,
+            preset: parsed.preset.rawValue,
+            sourcePath: input.sourceURL.path,
+            sourceKind: input.sourceKind,
+            audioPath: input.audioURL.path,
+            outputDir: outputDir.path,
+            overrides: overrides,
+            model: transcription.model
+        )
+        try writeJSON(run, to: outputDir.appendingPathComponent("debug-run.json"))
+    }
+
+    private static func writeJSON<T: Encodable>(_ value: T, to url: URL) throws {
+        let encoder = JSONEncoder()
+        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+        encoder.dateEncodingStrategy = .iso8601
+        try encoder.encode(value).write(to: url)
+    }
+
+    private static var usageText: String {
+        """
+        Usage:
+          MiniWhisperDebug transcribe <input> [options]
+
+        Input:
+          Audio file path or MiniWhisper recording directory containing audio.wav
+
+        Options:
+          --engine whisper|parakeet          Transcription engine (default: whisper)
+          --preset current-app|candidate-fix|nanovox-like|default
+                                             Whisper preset (default: current-app for whisper, default for parakeet)
+          --output-dir <path>                Output directory for transcript and JSON artifacts
+          --language auto|<code>             Whisper language override
+          --no-timestamps <true|false>       Whisper no_timestamps override
+          --token-timestamps <true|false>    Whisper token_timestamps override
+          --split-on-word <true|false>       Whisper split_on_word override
+          --max-len <n>                      Whisper max_len override
+          --detect-language <true|false>     Whisper detect_language override
+          --single-segment <true|false>      Whisper 
single_segment override + --help Show this message + """ + } +} diff --git a/Tests/MiniWhisperTests/WhisperProviderTests.swift b/Tests/MiniWhisperTests/WhisperProviderTests.swift index 82cc2d2..160436e 100644 --- a/Tests/MiniWhisperTests/WhisperProviderTests.swift +++ b/Tests/MiniWhisperTests/WhisperProviderTests.swift @@ -2,11 +2,17 @@ import Testing @testable import MiniWhisper struct WhisperProviderTests { - @Test func transcriptionUsesAutoDetectWithoutDetectionOnlyMode() { + @Test func transcriptionUsesEnglishWithTimestampedSegments() { let options = WhisperContext.transcriptionOptions() + switch options.language { + case .fixed(let language): + #expect(language == "en") + case .auto: + Issue.record("Expected Whisper language to be pinned to English") + } #expect(!options.detectLanguage) - #expect(options.noTimestamps) + #expect(!options.noTimestamps) #expect(!options.singleSegment) #expect(options.threadCount >= 1) } diff --git a/justfile b/justfile index b0fdd2a..1f98e0a 100644 --- a/justfile +++ b/justfile @@ -28,7 +28,15 @@ dev: kill build package # Debug build [group('build')] build: - swift build + swift build --product MiniWhisper + +[group('build')] +debug-tool: + swift build --product MiniWhisperDebug + +[group('build')] +debug-transcribe +args: + swift run MiniWhisperDebug transcribe {{args}} # Create .app bundle (debug) [group('build')]