Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ audio.wav
__pycache__/
dist
build
main.spec
main.spec
venv
9 changes: 9 additions & 0 deletions .sample_env
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,17 @@ OPENAI_BASE_URL="http://localhost:7000/v1"
OPENAI_MODEL_NAME="Systran/faster-distil-whisper-large-v3"
# OPENAI_MODEL_NAME="deepdml/faster-whisper-large-v3-turbo-ct2"

# Language Settings
UTTERTYPE_LANGUAGE="en"
#UTTERTYPE_SECOND_LANGUAGE="ru"

# Hotkey Configuration
UTTERTYPE_RECORD_HOTKEYS="<ctrl>+<alt>+v"
#UTTERTYPE_RECORD_HOTKEYS_SECOND_LANGUAGE="<ctrl>+<alt>+r"

# Alternative hotkeys for macOS:
# UTTERTYPE_RECORD_HOTKEYS="<cmd>+<ctrl>"
# UTTERTYPE_RECORD_HOTKEYS_SECOND_LANGUAGE="<cmd>+<shift>"

# Minimum duration of speech to send to API in case of silence
UTTERTYPE_MIN_TRANSCRIPTION_SIZE_MS=10000 # defaults to: 1500
24 changes: 23 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,5 +124,27 @@ OR

When the program first runs, you will likely need to give it sufficient permissions. On macOS, this will include adding terminal to accessibility under `Privacy and Security > Accessibility`, giving it permission to monitor the keyboard, and finally giving it permission to record using the microphone.

## Language Support

uttertype now supports dual language speech recognition with dedicated hotkeys for each language!

### Configuration
Add language settings to your `.env` file:

```env
# Language configuration
UTTERTYPE_LANGUAGE=en # Primary language
UTTERTYPE_SECOND_LANGUAGE=ru # Secondary language

# Hotkey configuration
UTTERTYPE_RECORD_HOTKEYS=<ctrl>+<alt>+v # Primary language hotkey
UTTERTYPE_RECORD_HOTKEYS_SECOND_LANGUAGE=<ctrl>+<alt>+r # Secondary language hotkey
```

### Usage
- **Primary Language**: Hold your primary hotkey (default: `Ctrl+Alt+V`) and speak in your primary language
- **Secondary Language**: Hold your secondary hotkey (default: `Ctrl+Alt+R`) and speak in your secondary language
- The console shows which language was used for each transcription by prefixing it with the language code


## Usage
To start transcription, press and hold the registered hotkey to start recording. To stop the recording, lift your registered hotkey. On macOS, the registered hotkey is the globe icon by default. For other operating systems, this will have to be manually configured in `main.py` as described earlier.
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add back the usage section.

63 changes: 50 additions & 13 deletions key_listener.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,56 @@ def release(self, key):
self.press(key)


def create_keylistener(transcriber, env_var="UTTERTYPE_RECORD_HOTKEYS"):
key_code = os.getenv(env_var, "")

if (sys.platform == "darwin") and (key_code in ["<globe>", ""]):
return HoldGlobeKey(
on_activate=transcriber.start_recording,
on_deactivate=transcriber.stop_recording,
class MultiHotKeyListener:
    """Dispatch key events to one hold-to-record hotkey per language.

    Two hotkeys are registered from the environment:

    * primary: ``UTTERTYPE_RECORD_HOTKEYS`` with ``UTTERTYPE_LANGUAGE``
    * secondary: ``UTTERTYPE_RECORD_HOTKEYS_SECOND_LANGUAGE`` with
      ``UTTERTYPE_SECOND_LANGUAGE``

    Activating a hotkey sets the transcriber's language and starts recording;
    deactivating it stops recording.
    """

    # Fallback combinations used when the variable is unset or empty.
    DEFAULT_PRIMARY_HOTKEY = "<ctrl>+<alt>+v"
    DEFAULT_SECONDARY_HOTKEY = "<ctrl>+<alt>+r"

    def __init__(self, transcriber):
        """Store *transcriber* and build both hotkeys from the environment."""
        self.transcriber = transcriber
        self.hotkeys = []
        self._setup_hotkeys()

    @staticmethod
    def _hotkey_from_env(env_var, default):
        """Return the value of *env_var*, or *default* when unset or empty."""
        return os.getenv(env_var, "") or default

    @staticmethod
    def _is_globe_key(key_code, platform=None):
        """Return True when *key_code* selects the macOS globe key.

        On ``darwin`` both ``"<globe>"`` and an empty/unset value select the
        globe key. *platform* defaults to ``sys.platform``.
        """
        if platform is None:
            platform = sys.platform
        return platform == "darwin" and key_code in ("<globe>", "")

    def _setup_hotkeys(self):
        """Create the primary and secondary hotkeys and add them to ``hotkeys``."""
        primary_lang = os.getenv("UTTERTYPE_LANGUAGE", "en")
        secondary_lang = os.getenv("UTTERTYPE_SECOND_LANGUAGE", "ru")

        # Read with an empty default so "unset" stays distinguishable: on macOS
        # it must select the globe key, elsewhere it falls back to the default
        # combination instead of handing an empty string to HoldHotKey.parse.
        raw_primary = os.getenv("UTTERTYPE_RECORD_HOTKEYS", "")
        if self._is_globe_key(raw_primary):
            primary_hotkey = HoldGlobeKey(
                on_activate=lambda: self._start_recording(primary_lang),
                on_deactivate=self.transcriber.stop_recording,
            )
        else:
            primary_hotkey = HoldHotKey(
                HoldHotKey.parse(raw_primary or self.DEFAULT_PRIMARY_HOTKEY),
                on_activate=lambda: self._start_recording(primary_lang),
                on_deactivate=self.transcriber.stop_recording,
            )
        self.hotkeys.append(primary_hotkey)

        secondary_key = self._hotkey_from_env(
            "UTTERTYPE_RECORD_HOTKEYS_SECOND_LANGUAGE",
            self.DEFAULT_SECONDARY_HOTKEY,
        )
        secondary_hotkey = HoldHotKey(
            HoldHotKey.parse(secondary_key),
            on_activate=lambda: self._start_recording(secondary_lang),
            on_deactivate=self.transcriber.stop_recording,
        )
        self.hotkeys.append(secondary_hotkey)

    def _start_recording(self, language):
        """Set *language* on the transcriber, then start recording."""
        self.transcriber.set_language(language)
        self.transcriber.start_recording()

    def press(self, key):
        """Forward a key-press event to every registered hotkey."""
        for hotkey in self.hotkeys:
            hotkey.press(key)

    def release(self, key):
        """Forward a key-release event to every registered hotkey."""
        for hotkey in self.hotkeys:
            hotkey.release(key)

key_code = key_code if key_code else "<ctrl>+<alt>+v"

return HoldHotKey(
HoldHotKey.parse(key_code),
on_activate=transcriber.start_recording,
on_deactivate=transcriber.stop_recording,
)
def create_keylistener(transcriber, env_var="UTTERTYPE_RECORD_HOTKEYS"):
    """Build the multi-hotkey listener that drives *transcriber*.

    ``env_var`` is accepted but not consulted here: the listener is
    constructed from *transcriber* alone.
    """
    listener = MultiHotKeyListener(transcriber)
    return listener
21 changes: 20 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import os
from pynput import keyboard
from transcriber import WhisperAPITranscriber
from table_interface import ConsoleTable
Expand All @@ -11,15 +12,33 @@ async def main():
load_dotenv()

transcriber = WhisperAPITranscriber.create()
# Set initial language from environment variable if provided
initial_language = os.getenv('UTTERTYPE_LANGUAGE', 'en')
transcriber.set_language(initial_language)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would prefer if this is done in the APITranscriber class itself to keep main.py simple. One of the goals of the repo is also to keep things extremely readable, especially main.py.


hotkey = create_keylistener(transcriber)

keyboard.Listener(on_press=hotkey.press, on_release=hotkey.release).start()
console_table = ConsoleTable()

# Get language configuration for display
primary_lang = os.getenv('UTTERTYPE_LANGUAGE', 'en')
secondary_lang = os.getenv('UTTERTYPE_SECOND_LANGUAGE', 'ru')
primary_key = os.getenv('UTTERTYPE_RECORD_HOTKEYS', '<ctrl>+<alt>+v')
secondary_key = os.getenv('UTTERTYPE_RECORD_HOTKEYS_SECOND_LANGUAGE', '<ctrl>+<alt>+r')

print(f"UtterType started with dual language support")
print(f"Primary language ({primary_lang.upper()}): {primary_key}")
print(f"Secondary language ({secondary_lang.upper()}): {secondary_key}")
print("Hold the respective hotkey to record in the corresponding language")
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you also move these print statements and env fetches somewhere else? Perhaps in the default transcriber class?


with console_table:
async for transcription, audio_duration_ms in transcriber.get_transcriptions():
current_lang = transcriber.get_language().upper()
print(f"[{current_lang}] Transcribed: {transcription.strip()}")
manual_type(transcription.strip())
console_table.insert(
transcription,
f"[{current_lang}] {transcription}",
round(0.0001 * audio_duration_ms / 1000, 6),
)

Expand Down
106 changes: 106 additions & 0 deletions test_dual_hotkeys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Test script for dual language hotkey functionality
"""

class MockAudioTranscriber:
    """Stand-in transcriber that only tracks language and recording state.

    Every state change is echoed to stdout so a script run shows the
    sequence of calls.
    """

    def __init__(self):
        self.recording = False
        self.language = "en"  # Default language

    def _mark_recording(self, active, message):
        """Update the recording flag, then print *message*."""
        self.recording = active
        print(message)

    def set_language(self, language: str):
        """Remember *language* and announce the switch."""
        self.language = language
        print(f"Language switched to: {language}")

    def get_language(self) -> str:
        """Return the language most recently set."""
        return self.language

    def start_recording(self):
        """Flag the mock as recording and report the active language."""
        self._mark_recording(True, f"Started recording in {self.language}")

    def stop_recording(self):
        """Flag the mock as no longer recording."""
        self._mark_recording(False, "Stopped recording")

def test_dual_hotkey_logic():
    """Walk the mock transcriber through both hotkey flows and language switches.

    Only ``MockAudioTranscriber`` is exercised: each "hotkey press" is
    simulated by calling ``set_language`` and ``start_recording`` directly,
    and each release by calling ``stop_recording``.
    """
    print("Testing dual language hotkey functionality...")

    transcriber = MockAudioTranscriber()

    # Simulate primary language hotkey press
    print("\n--- Simulating primary language hotkey ---")
    transcriber.set_language("en")
    transcriber.start_recording()
    assert transcriber.get_language() == "en", "Should be primary language"
    assert transcriber.recording, "Should be recording"
    transcriber.stop_recording()
    # Releasing the hotkey must clear the flag before the next press.
    assert not transcriber.recording, "Should have stopped recording"

    # Simulate secondary language hotkey press
    print("\n--- Simulating secondary language hotkey ---")
    transcriber.set_language("ru")
    transcriber.start_recording()
    assert transcriber.get_language() == "ru", "Should be secondary language"
    assert transcriber.recording, "Should be recording"
    transcriber.stop_recording()
    assert not transcriber.recording, "Should have stopped recording"

    # Test switching between languages
    print("\n--- Testing language switching ---")
    transcriber.set_language("fr")
    assert transcriber.get_language() == "fr", "Should accept any language"

    transcriber.set_language("de")
    assert transcriber.get_language() == "de", "Should accept any language"

    print("✅ All dual hotkey tests passed!")

def test_environment_variables():
    """Check that the four configuration variables resolve to usable values.

    Each variable is read from the environment with its documented fallback,
    printed, and then checked for a minimum length (two characters for
    language codes, non-empty for hotkeys).
    """
    import os

    def lookup(name, fallback):
        # Same lookup as os.getenv: the environment value if present, else fallback.
        return os.environ.get(name, fallback)

    print("Testing environment variable defaults...")

    # Resolve everything up front, in the same order the values are reported.
    lang_one = lookup("UTTERTYPE_LANGUAGE", "en")
    lang_two = lookup("UTTERTYPE_SECOND_LANGUAGE", "ru")
    keys_one = lookup("UTTERTYPE_RECORD_HOTKEYS", "<ctrl>+<alt>+v")
    keys_two = lookup("UTTERTYPE_RECORD_HOTKEYS_SECOND_LANGUAGE", "<ctrl>+<alt>+r")

    print(f"Primary language: {lang_one}")
    print(f"Secondary language: {lang_two}")
    print(f"Primary hotkey: {keys_one}")
    print(f"Secondary hotkey: {keys_two}")

    # No specific language is required, only a plausible code length.
    assert not len(lang_one) < 2, "Primary language should be valid"
    assert not len(lang_two) < 2, "Secondary language should be valid"
    # A non-empty string is truthy, which is the same check as len(...) > 0.
    assert keys_one, "Primary hotkey should be configured"
    assert keys_two, "Secondary hotkey should be configured"

    print("✅ Environment variable tests passed!")

if __name__ == "__main__":
    # Script entry point: run both checks, then print a usage summary.
    try:
        test_dual_hotkey_logic()
        test_environment_variables()
    except Exception as e:
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
        # Exit non-zero so a shell or CI job sees the failure; without this
        # the script reported the error and still finished with status 0.
        raise SystemExit(1)
    else:
        print("\n🎉 All tests passed! Dual language functionality is working correctly.")
        print("\n📝 Implementation summary:")
        print("✅ Configurable primary and secondary languages")
        print("✅ Separate hotkeys for each language")
        print("✅ Automatic language switching when using hotkeys")
        print("✅ Environment variable configuration")
        print("\n🚀 Ready to use:")
        print("1. Configure languages and hotkeys in .env file")
        print("2. Run: python main.py")
        print("3. Hold primary hotkey to record in primary language")
        print("4. Hold secondary hotkey to record in secondary language")
16 changes: 14 additions & 2 deletions transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(self):
self.event_loop = asyncio.get_event_loop()
self.vad = webrtcvad.Vad(1) # Voice Activity Detector, mode can be 0 to 3
self.transcriptions = asyncio.Queue()
self.language = "en" # Default language is English

def start_recording(self):
"""Start recording audio from the microphone."""
Expand Down Expand Up @@ -116,6 +117,14 @@ def _frames_to_wav(self):
wf.close()
return buffer

def set_language(self, language: str):
    """Store *language* on the instance as the current transcription language."""
    selected = language
    self.language = selected

def get_language(self) -> str:
    """Return the language currently stored on the instance."""
    current = self.language
    return current

def transcribe_audio(self, audio: io.BytesIO) -> str:
    """Placeholder that always raises; subclasses supply the real transcription.

    Raises:
        NotImplementedError: on every call.
    """
    message = "Please use a subclass of AudioTranscriber"
    raise NotImplementedError(message)

Expand Down Expand Up @@ -146,12 +155,15 @@ def create(*args, **kwargs):

def transcribe_audio(self, audio: io.BytesIO) -> str:
try:
# Use default prompt for technical speech
prompt = "The following is normal speech or technical speech from an engineer."

transcription = self.client.audio.transcriptions.create(
model=self.model_name,
file=audio,
response_format="text",
language="en",
prompt="The following is normal speech or technical speech from an engineer.",
language=self.language,
prompt=prompt,
)
return transcription
except Exception as e:
Expand Down