Skip to content

Commit 308c56f

Browse files
committed
fix(voice): optimize audio capture, fix text overwrite, and force simplified Chinese
- MacAudioCaptureService: pre-allocate the capture buffer (180 s), cache ObjC selectors, use ArrayPool in the tap callback, and merge resampling and PCM conversion into one pass
- SpeechInputCoordinator: window partial transcription (last 10 s) and final transcription (last 30 s) to bound Whisper processing time
- OverlayViewModel: track pre-recording text and append transcription results instead of overwriting them on successive recording sessions
- WhisperSpeechToTextEngine: add WithPrompt for the zh language to guide Whisper toward Simplified Chinese output

Made-with: Cursor
1 parent d44d5ef commit 308c56f

File tree

4 files changed

+166
-78
lines changed

4 files changed

+166
-78
lines changed

src/LiveLingo.Core/Speech/WhisperSpeechToTextEngine.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ private async Task<WhisperProcessor> GetOrLoadProcessorAsync(string? language, C
8484
{
8585
_logger?.LogDebug("Whisper language set to {Language}", language);
8686
builder.WithLanguage(language);
87+
if (language.Equals("zh", StringComparison.OrdinalIgnoreCase))
88+
builder.WithPrompt("以下是普通话的句子。");
8789
}
8890
else
8991
{

src/LiveLingo.Desktop/Platform/macOS/MacAudioCaptureService.cs

Lines changed: 137 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
using System.Buffers;
12
using System.Runtime.InteropServices;
23
using System.Runtime.Versioning;
34
using LiveLingo.Core.Speech;
@@ -9,13 +10,16 @@ internal sealed class MacAudioCaptureService : IAudioCaptureService
910
{
1011
private const int TargetSampleRate = 16000;
1112
private const int TargetChannels = 1;
12-
private const int MaxRecordingSeconds = 60;
13+
private const int MaxRecordingSeconds = 180;
1314
private const int MaxCaptureBytes = MaxRecordingSeconds * TargetSampleRate * TargetChannels * 2;
1415
private const uint TapBufferFrames = 4096;
1516

17+
private static readonly Lazy<TapSelectors> s_selectors = new(TapSelectors.Resolve);
18+
1619
private IntPtr _engine;
1720
private IntPtr _inputNode;
18-
private readonly MemoryStream _capturedData = new();
21+
private readonly byte[] _captureBuffer = new byte[MaxCaptureBytes];
22+
private int _capturePosition;
1923
private readonly object _gate = new();
2024
private bool _isRecording;
2125

@@ -61,7 +65,7 @@ public Task StartAsync(CancellationToken ct = default)
6165
throw new InvalidOperationException("Already recording.");
6266
}
6367

64-
_capturedData.SetLength(0);
68+
_capturePosition = 0;
6569
_activeInstance = this;
6670

6771
var engineClass = MacNativeMethods.objc_getClass("AVAudioEngine");
@@ -105,24 +109,30 @@ public Task<AudioCaptureResult> StopAsync(CancellationToken ct = default)
105109
var stopSel = MacNativeMethods.sel_registerName("stop");
106110
MacNativeMethods.objc_msgSend(_engine, stopSel);
107111

108-
lock (_gate) _isRecording = false;
112+
int length;
113+
lock (_gate)
114+
{
115+
_isRecording = false;
116+
length = _capturePosition;
117+
}
109118
_activeInstance = null;
110119
CleanupEngine();
111120

112-
var pcm = _capturedData.ToArray();
113-
var duration = TimeSpan.FromSeconds(
114-
(double)pcm.Length / (TargetSampleRate * TargetChannels * 2));
121+
var pcm = new byte[length];
122+
Buffer.BlockCopy(_captureBuffer, 0, pcm, 0, length);
123+
var duration = TimeSpan.FromSeconds((double)length / (TargetSampleRate * TargetChannels * 2));
115124
return Task.FromResult(new AudioCaptureResult(pcm, TargetSampleRate, TargetChannels, duration));
116125
}
117126

118127
/// <summary>
/// Returns a snapshot copy of the PCM bytes captured so far.
/// </summary>
/// <returns>
/// A new <see cref="AudioCaptureResult"/> holding a private copy of the captured bytes,
/// or <c>null</c> when no recording is in progress or nothing has been captured yet.
/// </returns>
public AudioCaptureResult? GetCurrentBuffer()
{
    lock (_gate)
    {
        var capturedBytes = _capturePosition;
        if (!_isRecording || capturedBytes == 0)
        {
            return null;
        }

        // Copy while holding the gate: the tap callback appends to the capture
        // buffer under the same lock, so the snapshot cannot be torn.
        var snapshot = new byte[capturedBytes];
        Array.Copy(_captureBuffer, snapshot, capturedBytes);

        // 2 = bytes per 16-bit sample.
        const int bytesPerSecond = TargetSampleRate * TargetChannels * 2;
        var elapsed = TimeSpan.FromSeconds((double)capturedBytes / bytesPerSecond);

        return new AudioCaptureResult(snapshot, TargetSampleRate, TargetChannels, elapsed);
    }
}
@@ -149,7 +159,7 @@ private void InstallTapOnInputNode()
149159
_inputNode, tapSel,
150160
0,
151161
TapBufferFrames,
152-
IntPtr.Zero, // nil = use the node's native hardware format
162+
IntPtr.Zero,
153163
_blockPtr);
154164
}
155165

@@ -188,60 +198,73 @@ private static void TapCallbackStatic(IntPtr block, IntPtr pcmBuffer, IntPtr whe
188198

189199
try
190200
{
191-
var frameLengthSel = MacNativeMethods.sel_registerName("frameLength");
192-
var frameLength = MacAudioNative.objc_msgSend_uint(pcmBuffer, frameLengthSel);
193-
if (frameLength == 0) return;
201+
var sel = s_selectors.Value;
194202

195-
var formatSel = MacNativeMethods.sel_registerName("format");
196-
var bufferFormat = MacNativeMethods.objc_msgSend(pcmBuffer, formatSel);
203+
var frameLength = (int)MacAudioNative.objc_msgSend_uint(pcmBuffer, sel.FrameLength);
204+
if (frameLength == 0) return;
197205

198-
var sampleRateSel = MacNativeMethods.sel_registerName("sampleRate");
206+
var bufferFormat = MacNativeMethods.objc_msgSend(pcmBuffer, sel.Format);
199207
var sourceSampleRate = (int)Math.Round(
200-
MacAudioNative.objc_msgSend_double(bufferFormat, sampleRateSel));
201-
202-
var channelCountSel = MacNativeMethods.sel_registerName("channelCount");
203-
var channelCount = MacAudioNative.objc_msgSend_uint(bufferFormat, channelCountSel);
204-
205-
var floatDataSel = MacNativeMethods.sel_registerName("floatChannelData");
206-
var floatDataPtr = MacNativeMethods.objc_msgSend(pcmBuffer, floatDataSel);
208+
MacAudioNative.objc_msgSend_double(bufferFormat, sel.SampleRate));
209+
var channelCount = (int)MacAudioNative.objc_msgSend_uint(bufferFormat, sel.ChannelCount);
210+
var floatDataPtr = MacNativeMethods.objc_msgSend(pcmBuffer, sel.FloatChannelData);
207211
if (floatDataPtr == IntPtr.Zero) return;
208212

209213
var channel0Ptr = Marshal.ReadIntPtr(floatDataPtr);
210214
if (channel0Ptr == IntPtr.Zero) return;
211-
var channel0 = new float[frameLength];
212-
Marshal.Copy(channel0Ptr, channel0, 0, (int)frameLength);
213215

214-
float[] mono;
215-
if (channelCount >= 2)
216+
var ch0 = ArrayPool<float>.Shared.Rent(frameLength);
217+
float[]? ch1 = null;
218+
try
216219
{
217-
var channel1Ptr = Marshal.ReadIntPtr(floatDataPtr, IntPtr.Size);
218-
var channel1 = new float[frameLength];
219-
if (channel1Ptr != IntPtr.Zero)
220-
Marshal.Copy(channel1Ptr, channel1, 0, (int)frameLength);
221-
mono = new float[frameLength];
222-
for (var i = 0; i < (int)frameLength; i++)
223-
mono[i] = (channel0[i] + channel1[i]) * 0.5f;
220+
Marshal.Copy(channel0Ptr, ch0, 0, frameLength);
221+
222+
if (channelCount >= 2)
223+
{
224+
var channel1Ptr = Marshal.ReadIntPtr(floatDataPtr, IntPtr.Size);
225+
if (channel1Ptr != IntPtr.Zero)
226+
{
227+
ch1 = ArrayPool<float>.Shared.Rent(frameLength);
228+
Marshal.Copy(channel1Ptr, ch1, 0, frameLength);
229+
for (var i = 0; i < frameLength; i++)
230+
ch0[i] = (ch0[i] + ch1[i]) * 0.5f;
231+
}
232+
}
233+
234+
var ratio = (double)sourceSampleRate / TargetSampleRate;
235+
var outSamples = sourceSampleRate == TargetSampleRate
236+
? frameLength
237+
: (int)(frameLength / ratio);
238+
if (outSamples <= 0) return;
239+
240+
var pcmByteCount = outSamples * 2;
241+
var pcmBytes = ArrayPool<byte>.Shared.Rent(pcmByteCount);
242+
try
243+
{
244+
WritePcm16(ch0, frameLength, sourceSampleRate, pcmBytes, outSamples);
245+
246+
lock (instance._gate)
247+
{
248+
if (!instance._isRecording) return;
249+
var space = MaxCaptureBytes - instance._capturePosition;
250+
var toWrite = Math.Min(pcmByteCount, space);
251+
if (toWrite > 0)
252+
{
253+
Buffer.BlockCopy(pcmBytes, 0,
254+
instance._captureBuffer, instance._capturePosition, toWrite);
255+
instance._capturePosition += toWrite;
256+
}
257+
}
258+
}
259+
finally
260+
{
261+
ArrayPool<byte>.Shared.Return(pcmBytes);
262+
}
224263
}
225-
else
226-
{
227-
mono = channel0;
228-
}
229-
230-
var resampled = ResampleLinear(mono, sourceSampleRate, TargetSampleRate);
231-
232-
var pcmBytes = new byte[resampled.Length * 2];
233-
for (var i = 0; i < resampled.Length; i++)
234-
{
235-
var clamped = Math.Clamp(resampled[i], -1f, 1f);
236-
var int16 = (short)(clamped * 32767f);
237-
pcmBytes[i * 2] = (byte)(int16 & 0xFF);
238-
pcmBytes[i * 2 + 1] = (byte)((int16 >> 8) & 0xFF);
239-
}
240-
241-
lock (instance._gate)
264+
finally
242265
{
243-
if (instance._isRecording && instance._capturedData.Length < MaxCaptureBytes)
244-
instance._capturedData.Write(pcmBytes, 0, pcmBytes.Length);
266+
ArrayPool<float>.Shared.Return(ch0);
267+
if (ch1 is not null) ArrayPool<float>.Shared.Return(ch1);
245268
}
246269
}
247270
catch
@@ -250,29 +273,42 @@ private static void TapCallbackStatic(IntPtr block, IntPtr pcmBuffer, IntPtr whe
250273
}
251274
}
252275

253-
private static float[] ResampleLinear(float[] input, int sourceRate, int targetRate)
276+
/// <summary>
/// Converts mono float samples to 16-bit little-endian PCM, linearly resampling to
/// <see cref="TargetSampleRate"/> when <paramref name="sourceRate"/> differs from it.
/// </summary>
/// <param name="mono">Mono samples; may be longer than <paramref name="monoLength"/> (the caller rents it from a pool).</param>
/// <param name="monoLength">Number of valid samples at the start of <paramref name="mono"/>.</param>
/// <param name="sourceRate">Sample rate of <paramref name="mono"/> in Hz.</param>
/// <param name="dest">Destination buffer; receives two bytes per output sample.</param>
/// <param name="outSamples">Number of output samples to produce.</param>
private static void WritePcm16(
    float[] mono, int monoLength,
    int sourceRate,
    byte[] dest, int outSamples)
{
    if (sourceRate == TargetSampleRate)
    {
        // Rates already match: straight 1:1 conversion, no interpolation.
        for (var n = 0; n < outSamples; n++)
        {
            StoreSample(dest, n, mono[n]);
        }

        return;
    }

    var step = (double)sourceRate / TargetSampleRate;
    for (var n = 0; n < outSamples; n++)
    {
        var position = n * step;
        var lower = (int)position;
        var weight = (float)(position - lower);

        // Positions past the last valid sample produce silence.
        float value = 0;
        if (lower + 1 < monoLength)
        {
            value = mono[lower] * (1f - weight) + mono[lower + 1] * weight;
        }
        else if (lower < monoLength)
        {
            value = mono[lower];
        }

        StoreSample(dest, n, value);
    }

    // Clamps to [-1, 1], scales to the int16 range and writes low byte first.
    static void StoreSample(byte[] target, int index, float value)
    {
        var quantized = (short)(Math.Clamp(value, -1f, 1f) * 32767f);
        target[index * 2] = (byte)(quantized & 0xFF);
        target[index * 2 + 1] = (byte)((quantized >> 8) & 0xFF);
    }
}
277313

278314
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
@@ -295,6 +331,33 @@ private struct BlockDescriptor
295331
public ulong size;
296332
}
297333

334+
/// <summary>
/// Holds the Objective-C selectors the tap callback sends to the PCM buffer and its
/// format object, so each one is registered once per instance instead of on every callback.
/// </summary>
private sealed class TapSelectors
{
    // Field initializers run in declaration order, which keeps the selector
    // registration order: frameLength, format, sampleRate, channelCount, floatChannelData.
    public readonly IntPtr FrameLength = MacNativeMethods.sel_registerName("frameLength");
    public readonly IntPtr Format = MacNativeMethods.sel_registerName("format");
    public readonly IntPtr SampleRate = MacNativeMethods.sel_registerName("sampleRate");
    public readonly IntPtr ChannelCount = MacNativeMethods.sel_registerName("channelCount");
    public readonly IntPtr FloatChannelData = MacNativeMethods.sel_registerName("floatChannelData");

    // Instances are only created through Resolve().
    private TapSelectors()
    {
    }

    /// <summary>Registers all selectors and returns them as a new instance.</summary>
    public static TapSelectors Resolve() => new();
}
360+
298361
#endregion
299362

300363
private void CleanupEngine()

src/LiveLingo.Desktop/Services/Speech/SpeechInputCoordinator.cs

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ public sealed class SpeechInputCoordinator : ISpeechInputCoordinator
99
{
1010
private static readonly TimeSpan VadPollInterval = TimeSpan.FromMilliseconds(100);
1111
private static readonly TimeSpan FallbackTranscriptionInterval = TimeSpan.FromSeconds(5);
12+
private const int PartialWindowSeconds = 10;
13+
private const int FinalWindowSeconds = 30;
14+
private const int BytesPerSample = 2;
1215

1316
private readonly IAudioCaptureService _audioCapture;
1417
private readonly ISpeechToTextEngine _sttEngine;
@@ -129,7 +132,8 @@ public async Task<SpeechInputResult> StopAndTranscribeAsync(string? language = n
129132
var audio = await _audioCapture.StopAsync(_sessionCts.Token);
130133

131134
var lang = language ?? _recordingLanguage;
132-
var result = await _sttEngine.TranscribeAsync(audio, lang, _sessionCts.Token);
135+
var windowedAudio = CreateWindow(audio, FinalWindowSeconds);
136+
var result = await _sttEngine.TranscribeAsync(windowedAudio, lang, _sessionCts.Token);
133137
SetState(VoiceInputState.Idle);
134138
return new SpeechInputResult(true, result.Text, SpeechInputErrorCode.None);
135139
}
@@ -192,7 +196,8 @@ private async Task RunVadDrivenTranscriptionLoopAsync(CancellationToken ct)
192196

193197
try
194198
{
195-
var result = await _sttEngine.TranscribeAsync(buffer, _recordingLanguage, ct);
199+
var windowBuffer = CreateWindow(buffer, PartialWindowSeconds);
200+
var result = await _sttEngine.TranscribeAsync(windowBuffer, _recordingLanguage, ct);
196201
if (!string.IsNullOrWhiteSpace(result.Text))
197202
PartialTranscription?.Invoke(result.Text);
198203
}
@@ -213,6 +218,20 @@ private async Task RunVadDrivenTranscriptionLoopAsync(CancellationToken ct)
213218
}
214219
}
215220

221+
/// <summary>
/// Returns the trailing <paramref name="windowSeconds"/> seconds of <paramref name="full"/>,
/// or <paramref name="full"/> itself when it is not longer than the window.
/// </summary>
/// <param name="full">Captured audio; its PCM data is treated as interleaved samples of <c>BytesPerSample</c> bytes each.</param>
/// <param name="windowSeconds">Maximum window length in seconds; must not be negative.</param>
/// <returns>
/// The original result when it already fits in the window; otherwise a new result whose
/// PCM data is a copy of the last whole frames covering <paramref name="windowSeconds"/> seconds.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="windowSeconds"/> is negative.</exception>
private static AudioCaptureResult CreateWindow(AudioCaptureResult full, int windowSeconds)
{
    if (windowSeconds < 0)
    {
        throw new ArgumentOutOfRangeException(
            nameof(windowSeconds), windowSeconds, "Window length must not be negative.");
    }

    var bytesPerFrame = full.Channels * BytesPerSample;
    var bytesPerSecond = full.SampleRate * bytesPerFrame;
    var maxWindowBytes = windowSeconds * bytesPerSecond;
    if (full.PcmData.Length <= maxWindowBytes)
    {
        return full;
    }

    // Ignore a trailing partial frame so the window always starts on a frame boundary;
    // starting mid-sample would shift every following sample's bytes and corrupt the audio.
    // For frame-aligned input this is zero and the window is exactly the last maxWindowBytes.
    var trailingPartialFrame = bytesPerFrame > 0 ? full.PcmData.Length % bytesPerFrame : 0;
    var windowStart = full.PcmData.Length - trailingPartialFrame - maxWindowBytes;

    var windowPcm = new byte[maxWindowBytes];
    Buffer.BlockCopy(full.PcmData, windowStart, windowPcm, 0, maxWindowBytes);

    var duration = TimeSpan.FromSeconds((double)maxWindowBytes / bytesPerSecond);
    return new AudioCaptureResult(windowPcm, full.SampleRate, full.Channels, duration);
}
234+
216235
private static float[] ConvertPcmToFloat(byte[] pcm, int byteOffset, int byteCount)
217236
{
218237
var sampleCount = byteCount / 2;

src/LiveLingo.Desktop/ViewModels/OverlayViewModel.cs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ public partial class OverlayViewModel : ObservableObject
6666
[ObservableProperty] private bool _isVoiceAvailable;
6767
[ObservableProperty] private bool _showSttDownloadLink;
6868
[ObservableProperty] private bool _isVoiceLanguagePickerOpen;
69+
private string _preRecordingText = string.Empty;
6970
[ObservableProperty] private LanguageInfo? _selectedVoiceLanguage;
7071

7172
public string CopyLabel => L("overlay.copy");
@@ -597,7 +598,7 @@ private async Task ToggleVoiceInputAsync()
597598
{
598599
if (!string.IsNullOrWhiteSpace(result.Text))
599600
{
600-
SourceText = result.Text;
601+
SourceText = _preRecordingText + result.Text;
601602
if (SelectedVoiceLanguage is not null)
602603
{
603604
SelectedSourceLanguage = SelectedVoiceLanguage;
@@ -618,6 +619,9 @@ private async Task ToggleVoiceInputAsync()
618619
else if (VoiceState == VoiceInputState.Idle || VoiceState == VoiceInputState.Error)
619620
{
620621
VoiceStatusText = string.Empty;
622+
_preRecordingText = SourceText;
623+
if (!string.IsNullOrEmpty(_preRecordingText) && !_preRecordingText.EndsWith(' '))
624+
_preRecordingText += " ";
621625
SelectedVoiceLanguage = SelectedSourceLanguage;
622626
var result = await _speechCoordinator.StartRecordingAsync(SelectedVoiceLanguage?.Code);
623627
if (!result.Success)
@@ -674,7 +678,7 @@ private void SubscribeSpeechCoordinator()
674678
/// <summary>
/// Shows an in-progress transcription by appending it to the text captured in
/// <c>_preRecordingText</c>, so earlier content is kept rather than overwritten.
/// </summary>
/// <param name="text">Partial transcription text reported for the current recording.</param>
private void HandlePartialTranscription(string text)
{
    // Partial results are only applied while a recording is active.
    if (VoiceState != VoiceInputState.Recording)
    {
        return;
    }

    SourceText = string.Concat(_preRecordingText, text);
}
679683

680684
private void HandleVoiceStateChanged(VoiceInputState state)

0 commit comments

Comments
 (0)