# language-flashcard-generator/main.py at master · dav3rin/language-flashcard-generator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
from pathlib import Path
from typing import Optional

import typer
from dotenv import load_dotenv
from gtts import gTTS

# Load variables from a .env file (python-dotenv) at import time, before any
# command reads os.environ (the azure command reads SPEECH_ENDPOINT and
# SPEECH_RESOURCE_ID).
load_dotenv()

# Root Typer application; each @app.command() function below registers a sub-command on it.
app = typer.Typer(help="Language flashcard generator with TTS and AI enhancement.")


@app.command()
def google(
    text: str = typer.Argument(help="Text to convert to speech."),
    output: Path = typer.Option("output.mp3", "--output", "-o", help="Output file path."),
    lang: str = typer.Option("en", "--lang", "-l", help="Language code (e.g. en, es, fr, ja)."),
    slow: bool = typer.Option(False, "--slow", "-s", help="Speak slowly."),
) -> None:
    """Convert text to speech using Google TTS."""
    # The docstring is left as a single line: Typer uses it as the command's
    # --help text, so it is user-facing output.
    #
    # text   -- phrase to synthesize
    # output -- destination audio file (passed to gTTS as a plain string)
    # lang   -- gTTS language code
    # slow   -- ask gTTS for the slower reading speed
    destination = str(output)

    # Build the gTTS request and write the audio file in a single step.
    gTTS(text=text, lang=lang, slow=slow).save(destination)

    typer.echo(f"Saved audio to {output}")


@app.command()
def azure(
    text: str = typer.Argument(help="Text to convert to speech."),
    output: Path = typer.Option("output.mp3", "--output", "-o", help="Output file path."),
    voice: Optional[str] = typer.Option(None, "--voice", "-v", help="Azure neural voice name (e.g. zu-ZA-ThandoNeural)."),
    lang: str = typer.Option("en-US", "--lang", "-l", help="Language code (e.g. en-US, zu-ZA, ja-JP)."),
    rate: str = typer.Option("medium", "--rate", "-r", help="Speech rate: x-slow, slow, medium, fast, x-fast, or percentage (e.g. +50%, -30%)."),
) -> None:
    """Convert text to speech using Azure TTS."""
    # The docstring is left as a single line: Typer uses it as the command's
    # --help text, so it is user-facing output.
    #
    # text   -- plain text to speak; it is XML-escaped before being placed in SSML
    # output -- destination file, written by the Speech SDK as 16 kHz / 32 kbit/s mono MP3
    # voice  -- explicit voice name; when omitted only the synthesis language is configured
    # lang   -- locale used for xml:lang and, without --voice, for the synthesis language
    # rate   -- value of the SSML <prosody rate="..."> attribute
    #
    # Exits with status 1 when the required environment variables are missing
    # or when synthesis does not complete.
    from xml.sax.saxutils import escape

    import azure.cognitiveservices.speech as speechsdk
    from azure.identity import DefaultAzureCredential

    endpoint = os.environ.get("SPEECH_ENDPOINT")
    resource_id = os.environ.get("SPEECH_RESOURCE_ID")
    if not endpoint or not resource_id:
        typer.echo("Error: SPEECH_ENDPOINT and SPEECH_RESOURCE_ID environment variables are required.", err=True)
        raise typer.Exit(1)

    speech_config = speechsdk.SpeechConfig(endpoint=endpoint)

    # Authenticate via az cli / DefaultAzureCredential
    aad_token = (
        DefaultAzureCredential()
        .get_token("https://cognitiveservices.azure.com/.default")
        .token
    )
    speech_config.authorization_token = f"aad#{resource_id}#{aad_token}"

    if voice:
        speech_config.speech_synthesis_voice_name = voice
    else:
        speech_config.speech_synthesis_language = lang

    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
    )

    audio_config = speechsdk.audio.AudioOutputConfig(filename=str(output))
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

    def xml_attr(value: object) -> str:
        """Escape a value for use inside a double-quoted XML attribute."""
        return escape(str(value), {'"': "&quot;"})

    # NOTE(review): without --voice this falls back to whatever voice name the
    # config reports (possibly empty) — confirm the service accepts that.
    voice_name = voice or speech_config.speech_synthesis_voice_name

    # User input is escaped so characters such as & < > " cannot produce
    # malformed SSML. Consequence: SSML tags typed into `text` are spoken
    # literally rather than interpreted.
    ssml = (
        f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{xml_attr(lang)}">'
        f'<voice name="{xml_attr(voice_name)}">'
        f'<prosody rate="{xml_attr(rate)}">{escape(text)}</prosody>'
        f'</voice></speak>'
    )
    result = synthesizer.speak_ssml_async(ssml).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        typer.echo(f"Saved audio to {output}")
        return

    # cancellation_details may be None when the result was not a cancellation;
    # report the bare reason instead of failing with AttributeError.
    details = result.cancellation_details
    if details is None:
        typer.echo(f"Synthesis failed: {result.reason}", err=True)
    else:
        typer.echo(f"Synthesis failed: {details.reason} — {details.error_details}", err=True)
    raise typer.Exit(1)


@app.command()
def build_deck(
    config: Path = typer.Argument(help="Path to YAML config file (e.g. configs/zulu_common.yaml)."),
    size: Optional[int] = typer.Option(None, "--size", "-s", help="Limit to N cards."),
    render_only: bool = typer.Option(False, "--render-only", help="Re-render .mochi from cached cards only."),
    start: int = typer.Option(0, "--start", help="Start from card index (for resume)."),
) -> None:
    """Build a Mochi flashcard deck from frequency lists using Claude AI and Azure TTS."""
    # The docstring is left as a single line: Typer uses it as the command's
    # --help text, so it is user-facing output.
    #
    # config      -- YAML file whose top-level mapping becomes EnhancementConfig keyword arguments
    # size        -- forwarded to FlashcardEnhancer.build
    # render_only -- forwarded to FlashcardEnhancer.build
    # start       -- forwarded to FlashcardEnhancer.build
    #
    # Exits with status 1 when the config is missing or is not a mapping, or
    # when the build reports failure.
    import yaml

    from enhancer import FlashcardEnhancer
    from models import EnhancementConfig

    if not config.exists():
        typer.echo(f"Config file not found: {config}", err=True)
        raise typer.Exit(1)

    # Decode explicitly as UTF-8 so non-ASCII vocabulary in the config does
    # not depend on the platform's default locale encoding.
    with open(config, "r", encoding="utf-8") as f:
        config_data = yaml.safe_load(f)

    # An empty file loads as None and a list/scalar document cannot be
    # unpacked with **; report that instead of raising a bare TypeError.
    if not isinstance(config_data, dict):
        typer.echo(f"Config file must contain a YAML mapping: {config}", err=True)
        raise typer.Exit(1)

    enhancement_config = EnhancementConfig(**config_data)
    enhancer = FlashcardEnhancer(enhancement_config)
    success = enhancer.build(size=size, render_only=render_only, start=start)

    if not success:
        raise typer.Exit(1)


@app.command()
def clean(
    config: Path = typer.Argument(help="Path to YAML config file."),
) -> None:
    """Remove empty (0-byte) audio files left by failed TTS generation."""
    # The docstring is left as a single line: Typer uses it as the command's
    # --help text, so it is user-facing output.
    #
    # config -- YAML file whose top-level mapping becomes EnhancementConfig keyword arguments
    #
    # Exits with status 1 when the config is missing or is not a mapping.
    import yaml

    from enhancer import FlashcardEnhancer
    from models import EnhancementConfig

    if not config.exists():
        typer.echo(f"Config file not found: {config}", err=True)
        raise typer.Exit(1)

    # Decode explicitly as UTF-8 so non-ASCII vocabulary in the config does
    # not depend on the platform's default locale encoding.
    with open(config, "r", encoding="utf-8") as f:
        config_data = yaml.safe_load(f)

    # An empty file loads as None and a list/scalar document cannot be
    # unpacked with **; report that instead of raising a bare TypeError.
    if not isinstance(config_data, dict):
        typer.echo(f"Config file must contain a YAML mapping: {config}", err=True)
        raise typer.Exit(1)

    enhancement_config = EnhancementConfig(**config_data)
    enhancer = FlashcardEnhancer(enhancement_config)
    enhancer.clean_empty_audio()


# Run the Typer CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    app()