-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
145 lines (116 loc) · 5.5 KB
/
main.py
File metadata and controls
145 lines (116 loc) · 5.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
Markdown Podcast Narrator
Converts a Markdown document into a podcast-style audio file
using Qwen3-TTS, Kokoro, or macOS 'say' (which is also the fallback
when the selected neural engine is unavailable).
Strategy:
- macOS 'say':
Sends the entire document in a single call with [[slnc N]]
embedded pause commands. This helps maintain consistent
pacing and emotion across the full narration.
- Qwen3-TTS:
Uses section-level chunks (grouped by headings). Each section
is large enough to give the neural model context for a stable
narrator tone. Sections are stitched together with PCM silence
to create natural pauses.
- Kokoro:
Uses medium-sized chunks (grouped by paragraphs or short
sections) to keep generation fast while preserving natural
phrasing. Audio segments are concatenated with short pauses
so the narration flows smoothly. Kokoro is optimized for
fast local inference while still producing natural speech.
"""
import sys
from pathlib import Path
import click
from parser import MarkdownParser
from narrator import Narrator
def _fail(message: str):
    """Print *message* to stderr and terminate the process with exit status 1.

    Never returns: ``sys.exit`` raises ``SystemExit``, which is not an
    ``Exception`` subclass and therefore passes through ``except Exception``.
    """
    click.echo(message, err=True)
    sys.exit(1)


# NOTE: click renders the docstring of ``cli`` as the ``--help`` text, so it is
# kept short and user-facing; implementation notes live in comments instead.
#
# Flow: resolve the output path -> parse the Markdown into speech tokens ->
# initialise the requested TTS backend (falling back to macOS 'say' if it is
# unavailable) -> synthesise audio using the chunking strategy that matches
# the backend -> report the size of the written file.
#
# ``output_file``, ``model``, ``instruct`` and ``kokoro_voice`` are ``None``
# when the corresponding option is not given on the command line.
@click.command()
# dir_okay=False: a directory can never be read as a Markdown document, so let
# click reject it up front instead of failing later inside read_text().
@click.argument("input_file", type=click.Path(exists=True, dir_okay=False))
@click.option("-o", "--output", "output_file", default=None,
              help="Output audio file path (default: output.wav)")
@click.option("--speaker", default="Ryan", help="Qwen3-TTS speaker name")
@click.option("--rate", default=0.95, type=float, help="Speech rate multiplier (0.5-2.0)")
@click.option("--fallback", is_flag=True, help="Use macOS 'say' instead of neural TTS")
@click.option("--engine", default="qwen", type=click.Choice(["qwen", "kokoro", "macos"]),
              help="TTS engine to use (default: qwen)")
@click.option("--model", default=None,
              help="Qwen3-TTS model ID (default: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice)")
@click.option("--instruct", default=None,
              help="Narrator style instruction for Qwen3-TTS (e.g. 'Speak slowly and calmly')")
@click.option("--kokoro-voice", default=None,
              help="Kokoro voice name (default: af_heart)")
def cli(input_file: str, output_file: str, speaker: str, rate: float,
        fallback: bool, engine: str, model: str, instruct: str,
        kokoro_voice: str):
    """Convert a Markdown file to podcast-style audio narration.

    INPUT_FILE: Path to the Markdown (.md) file to convert.
    """
    try:
        # --- Resolve output path ---
        # Only WAV output is produced; any other extension is rewritten.
        output_file = output_file or "output.wav"
        output_path = Path(output_file)
        if output_path.suffix.lower() != ".wav":
            output_file = str(output_path.with_suffix(".wav"))
            click.echo("Note: output format changed to .wav")

        # --- Parse ---
        click.echo("Parsing markdown...")
        content = Path(input_file).read_text(encoding="utf-8")
        parser = MarkdownParser()
        tokens = parser.parse_to_speech_tokens(content)
        if not tokens:
            _fail("Error: no content found in markdown file")

        # --- Init TTS ---
        # --fallback takes precedence over whatever --engine requested.
        if fallback:
            engine = "macos"

        click.echo(f"Initializing TTS ({engine})...")
        narrator = Narrator(engine=engine, model_id=model)
        if not narrator.initialize():
            if engine == "macos":
                # macOS 'say' is already the last resort; nothing left to try.
                _fail("Error: TTS initialization failed")
            click.echo(f"{engine} unavailable, falling back to macOS 'say'...")
            narrator = Narrator(engine="macos")
            if not narrator.initialize():
                _fail("Error: no TTS backend available")

        narrator.set_voice_params(rate=rate, speaker=speaker, instruct=instruct,
                                  kokoro_voice=kokoro_voice)

        # --- Choose chunk strategy based on backend ---
        # Ask the narrator rather than inspecting ``engine``: after a fallback
        # the active backend may differ from the one originally requested.
        if narrator.is_neural:
            # Section-level chunks for neural TTS — consistent emotion
            sections = parser.tokens_to_section_chunks(tokens)
            if not sections:
                _fail("Error: no speakable content")

            lengths = [len(text) for text, _ in sections]
            click.echo(f"Prepared {len(sections)} sections ({sum(lengths)} chars, max {max(lengths)}/section)")
            click.echo("Generating audio (section-by-section)...")

            def on_progress(current: int, total: int):
                # Carriage return without newline so each update overwrites
                # the previous one on the same terminal line.
                click.echo(f" Section {current}/{total}...", nl=False)
                click.echo("\r", nl=False)

            ok = narrator.synthesize_sections(sections, output_file, on_progress)
            click.echo()  # finish the in-place progress line
        else:
            # Fine-grained chunks for macOS 'say' — single call with [[slnc]]
            chunks = parser.tokens_to_speech_chunks(tokens)
            if not chunks:
                _fail("Error: no speakable content")

            total_chars = sum(len(text) for text, _ in chunks)
            click.echo(f"Prepared {len(chunks)} chunks ({total_chars} chars)")
            click.echo("Generating audio (single narration)...")
            ok = narrator.synthesize_chunks(chunks, output_file)

        if not ok:
            _fail("Error: audio generation failed")

        size_kb = Path(output_file).stat().st_size / 1024
        click.echo(f"Saved to {output_file} ({size_kb:.0f} KB)")
    except Exception as e:
        # Top-level CLI boundary: report any unexpected failure as a one-line
        # error instead of a traceback. SystemExit from _fail() is not caught.
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()