% NOTE: Removed non-LaTeX residue from a GitHub web-page scrape (UI text such as
% "Notifications", "File metadata and controls", and a bare line-number gutter)
% that preceded the source and would have broken compilation.
% =============================================================================
% Zen-Voice: Zero-Shot Voice Cloning and Expressive Speech Synthesis
% Hanzo AI Inc. & Zoo Labs Foundation
% Technical Whitepaper v1.0 — February 2026
% =============================================================================
\documentclass[11pt,a4paper]{article}
% --- Encoding & Fonts ---------------------------------------------------------
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{lmodern}
% --- Mathematics --------------------------------------------------------------
\usepackage{amsmath,amsfonts,amssymb,amsthm}
\usepackage{mathtools}
\usepackage{bm}
% --- Layout & Geometry --------------------------------------------------------
\usepackage[top=1in,bottom=1in,left=1.25in,right=1.25in]{geometry}
\usepackage{microtype}
\usepackage{setspace}
\onehalfspacing
% --- Graphics & Tables --------------------------------------------------------
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{tabularx}
\usepackage{multirow}
\usepackage{array}
\usepackage{float}
% --- Algorithms ---------------------------------------------------------------
\usepackage{algorithm}
\usepackage{algpseudocode}
\algnewcommand\algorithmicforeach{\textbf{for each}}
\algdef{S}[FOR]{ForEach}[1]{\algorithmicforeach\ #1\ \textbf{do}}
% --- Colors & Hyperlinks -------------------------------------------------------
\usepackage{xcolor}
\definecolor{zenred}{RGB}{253,68,68}
\definecolor{zenblue}{RGB}{41,121,255}
\definecolor{zendark}{RGB}{30,30,40}
\definecolor{codegray}{RGB}{248,248,250}
\definecolor{linkcolor}{RGB}{41,121,255}
\usepackage{hyperref}
\hypersetup{
colorlinks=true,
linkcolor=zenblue,
urlcolor=zenblue,
citecolor=zenred,
pdftitle={Zen-Voice: Zero-Shot Voice Cloning and Expressive Speech Synthesis},
pdfauthor={Hanzo AI Inc., Zoo Labs Foundation},
pdfsubject={Speech Synthesis, Voice Cloning, Text-to-Speech},
pdfkeywords={voice cloning, TTS, speaker embedding, prosody transfer, anti-deepfake}
}
% --- Code Listings ------------------------------------------------------------
\usepackage{listings}
\lstset{
backgroundcolor=\color{codegray},
basicstyle=\ttfamily\footnotesize,
breaklines=true,
captionpos=b,
frame=single,
numbers=left,
numberstyle=\tiny\color{gray},
keywordstyle=\color{zenblue}\bfseries,
stringstyle=\color{zenred},
commentstyle=\color{gray}\itshape,
showstringspaces=false,
tabsize=2
}
% --- Section & Caption Formatting ---------------------------------------------
\usepackage{titlesec}
\usepackage{caption}
\captionsetup{font=small,labelfont=bf}
% --- Theorems & Definitions ---------------------------------------------------
\newtheorem{definition}{Definition}[section]
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}{Proposition}[section]
% --- Bibliography -------------------------------------------------------------
\usepackage{natbib}
\bibliographystyle{abbrvnat}
\setcitestyle{authoryear,round}
% =============================================================================
% TITLE BLOCK
% =============================================================================
\title{
\vspace{-1.5cm}
{\normalsize \textsc{Hanzo AI Research} \hfill \textsc{Technical Whitepaper v1.0}} \\[0.8em]
\rule{\linewidth}{0.5pt} \\[0.6em]
{\LARGE \textbf{Zen-Voice:}} \\[0.3em]
{\Large Zero-Shot Voice Cloning and Expressive Speech Synthesis} \\[0.3em]
\rule{\linewidth}{0.5pt}
}
\author{ \textbf{Hanzo AI Research}$^{1}$ \quad \textbf{Zoo Labs Foundation}$^{2}$ \\[0.6em]
$^{1}$Hanzo AI Inc. (Techstars '17) \quad $^{2}$Zoo Labs Foundation (501(c)(3)) \\[0.3em]
\texttt{research@hanzo.ai} \quad \texttt{foundation@zoo.ngo} \\[0.3em]
{\small \url{https://hanzo.ai/research/zen-voice}}
}
\date{February 2026}
% =============================================================================
\begin{document}
\maketitle
\begin{abstract}
We present \textbf{Zen-Voice}, a neural speech synthesis system capable of zero-shot voice cloning from as little as 3 seconds of reference audio. Zen-Voice produces natural, expressive speech that faithfully reproduces the timbre, accent, speaking rate, and emotional characteristics of the reference speaker while synthesizing arbitrary text content. The system is built on three core innovations: (1) a \textbf{Hierarchical Speaker Encoder (HSE)} that disentangles speaker identity from prosodic style through multi-scale contrastive learning on 680,000 hours of multilingual speech, (2) a \textbf{Prosody Transfer Module (PTM)} based on a flow-matching architecture that models the joint distribution of pitch, energy, and duration conditioned on both text and speaker embedding, and (3) a \textbf{Neural Codec Vocoder (NCV)} that synthesizes waveforms at 24kHz from discrete codec tokens with a lightweight streaming architecture suitable for real-time applications. Zen-Voice achieves a speaker similarity MOS of 4.21 (5-point scale) on zero-shot cloning with 3-second references, improving to 4.52 with 10-second references. On the LibriTTS test-clean benchmark, it achieves a naturalness MOS of 4.38, surpassing both VALL-E (3.84) and VoiceBox (4.12). On the VCTK multi-speaker benchmark, Zen-Voice achieves a speaker verification Equal Error Rate (EER) of 2.1\% for cloned speech, approaching the 1.8\% EER of ground-truth recordings. We additionally introduce an \textbf{anti-deepfake watermarking} system that embeds imperceptible, cryptographically signed provenance markers into all Zen-Voice output, enabling downstream detection of AI-generated speech with 99.7\% accuracy even after MP3 compression and noise addition. Models and inference code are released under Apache 2.0.
\end{abstract}
\vspace{0.5em}
\noindent\textbf{Keywords:} Voice Cloning, Text-to-Speech, Speaker Embedding, Prosody Transfer, Neural Codec, Anti-Deepfake Watermarking
% =============================================================================
\section{Introduction}
\label{sec:introduction}
Human speech conveys far more than linguistic content. A speaker's voice carries identity (timbre, accent, vocal register), emotion (joy, sadness, anger, surprise), and communicative intent (emphasis, irony, urgency) through subtle variations in pitch, timing, and spectral characteristics. Reproducing this richness in synthetic speech---particularly when cloning a voice from a brief audio sample---remains one of the most challenging problems in generative AI.
Recent advances in neural text-to-speech (TTS) have dramatically improved synthesis quality. Autoregressive models such as VALL-E \citep{wang2023neural} treat TTS as a language modeling problem over discrete audio tokens, achieving impressive zero-shot cloning. Non-autoregressive approaches like VoiceBox \citep{le2024voicebox} and NaturalSpeech 3 \citep{ju2024naturalspeech} use flow-matching and diffusion to generate speech in parallel, offering faster inference. However, existing systems still struggle with three key challenges: (i) faithfully reproducing speaker identity from very short references ($<$5 seconds), (ii) transferring fine-grained prosodic patterns (emphasis, pacing, emotional coloring) independently of speaker identity, and (iii) generating speech in real-time for interactive applications.
Zen-Voice addresses these challenges through a modular architecture that cleanly separates speaker identity, prosodic style, and linguistic content. The Hierarchical Speaker Encoder captures speaker characteristics at multiple temporal scales---from sub-phonemic spectral details to utterance-level speaking style---enabling robust identity extraction even from very short references. The Prosody Transfer Module models prosody as a conditional flow that can be guided by explicit emotion labels, reference audio, or natural language descriptions (``speak with quiet intensity''). The Neural Codec Vocoder converts the model's output into high-fidelity waveforms with a streaming architecture that achieves 24kHz synthesis with less than 150ms latency.
Beyond technical capabilities, we address the ethical imperative of preventing misuse. Voice cloning technology poses significant risks for fraud, impersonation, and disinformation. We integrate an anti-deepfake watermarking system directly into the synthesis pipeline, ensuring that all Zen-Voice output carries an imperceptible but detectable provenance marker. This marker is robust to common audio transformations and enables forensic verification of synthetic speech.
Our contributions are as follows:
\begin{enumerate}
\item A hierarchical speaker encoder that achieves state-of-the-art speaker similarity from references as short as 3 seconds.
\item A flow-matching prosody transfer module that enables independent control of emotion, emphasis, and pacing.
\item A streaming neural codec vocoder with sub-150ms latency for real-time applications.
\item An integrated anti-deepfake watermarking system with 99.7\% detection accuracy.
\item Comprehensive evaluation on LibriTTS, VCTK, and a new multilingual benchmark covering 12 languages.
\end{enumerate}
% =============================================================================
\section{Background and Related Work}
\label{sec:background}
\subsection{Neural Text-to-Speech}
Modern neural TTS systems have evolved through several generations. Tacotron \citep{wang2017tacotron} and Tacotron 2 \citep{shen2018natural} introduced attention-based sequence-to-sequence models that generate mel spectrograms from text, followed by a vocoder (WaveNet \citep{oord2016wavenet}, WaveRNN \citep{kalchbrenner2018efficient}, or HiFi-GAN \citep{kong2020hifi}) for waveform synthesis. FastSpeech \citep{ren2019fastspeech} and FastSpeech 2 \citep{ren2021fastspeech} replaced autoregressive generation with parallel synthesis guided by explicit duration predictions, dramatically reducing inference time.
\subsection{Zero-Shot Voice Cloning}
Zero-shot voice cloning synthesizes speech in a target speaker's voice using only a brief reference sample, without any fine-tuning. Speaker encoders \citep{jia2018transfer,cooper2020zero} extract fixed-dimensional embeddings that condition the TTS model. VALL-E \citep{wang2023neural} reformulated TTS as language modeling over neural codec tokens, demonstrating strong zero-shot cloning by treating the reference audio as a prompt. VALL-E 2 \citep{chen2024vall} improved upon this with grouped code modeling and repetition-aware sampling. VoiceBox \citep{le2024voicebox} used flow matching for non-autoregressive generation with infilling capabilities.
\subsection{Prosody Modeling}
Prosody---the suprasegmental features of speech including pitch, duration, energy, and rhythm---is critical for natural and expressive synthesis. Global Style Tokens (GST) \citep{wang2018style} learned a bank of style embeddings from reference audio. The Variational Autoencoder (VAE) approach \citep{zhang2019learning} modeled prosody as a latent variable. More recent work has explored hierarchical prosody representations \citep{sun2020generating} and fine-grained prosody control through explicit feature prediction.
\subsection{Audio Watermarking}
Audio watermarking embeds imperceptible information into audio signals for authentication and provenance tracking. Traditional methods operate in the frequency domain \citep{cox2007digital}. Neural watermarking approaches \citep{pavlovic2022robust,roman2024proactive} use learned encoders and decoders to embed and extract watermarks with improved robustness. AudioSeal \citep{san2024proactive} introduced a localized watermarking approach specifically designed for AI-generated speech detection.
% =============================================================================
\section{Architecture}
\label{sec:architecture}
Zen-Voice consists of four main components: (1) a text encoder, (2) the Hierarchical Speaker Encoder, (3) the Prosody Transfer Module, and (4) the Neural Codec Vocoder. We describe each in detail.
\subsection{Text Encoder}
\label{sec:text_encoder}
The text encoder converts input text into a sequence of linguistic feature vectors. We use a pipeline combining:
\begin{enumerate}
\item \textbf{Grapheme-to-Phoneme (G2P):} Text is converted to IPA phoneme sequences using language-specific G2P models for 12 supported languages (English, Mandarin, Japanese, Korean, Spanish, French, German, Portuguese, Italian, Hindi, Arabic, Russian). A language identification module automatically selects the appropriate G2P model.
\item \textbf{Phoneme Encoder:} Phoneme sequences are encoded by a 6-layer transformer with relative positional encoding:
\begin{equation}
\bm{H}_{\text{text}} = \text{Transformer}_{\text{text}}(\text{Embed}(\bm{p}_1, \ldots, \bm{p}_T)) \in \mathbb{R}^{T \times d}
\end{equation}
where $\bm{p}_i$ are phoneme tokens and $d = 512$.
\item \textbf{Semantic Enhancement:} For text requiring contextual disambiguation (homographs, emphasis placement), we optionally condition on semantic features extracted from a lightweight BERT model, injected via cross-attention:
\begin{equation}
\tilde{\bm{H}}_{\text{text}} = \bm{H}_{\text{text}} + \text{CrossAttn}(\bm{H}_{\text{text}}, \bm{H}_{\text{BERT}})
\end{equation}
\end{enumerate}
\subsection{Hierarchical Speaker Encoder (HSE)}
\label{sec:hse}
The HSE extracts a comprehensive speaker representation from reference audio at three hierarchical levels.
\paragraph{Level 1: Frame-Level Encoder.} A convolutional encoder processes 80-dimensional log-Mel spectrograms extracted at 16kHz with 25ms windows and 10ms hop size:
\begin{equation}
\bm{F} = \text{Conv1D}_{\text{stack}}(\text{MelSpec}(\bm{w})) \in \mathbb{R}^{T_f \times d_f}
\end{equation}
where $T_f$ is the number of frames and $d_f = 256$. This level captures fine-grained spectral characteristics---formant frequencies, breathiness, nasality---that define vocal timbre.
\paragraph{Level 2: Segment-Level Encoder.} A 4-layer transformer with 128-frame windows processes the frame features to capture phoneme-level and syllable-level patterns:
\begin{equation}
\bm{S} = \text{Transformer}_{\text{seg}}(\bm{F}) \in \mathbb{R}^{T_s \times d_s}
\end{equation}
where $T_s = \lceil T_f / 128 \rceil$ and $d_s = 384$. This level captures articulation patterns, coarticulation effects, and local speaking rate variations.
\paragraph{Level 3: Utterance-Level Encoder.} An attentive statistics pooling layer aggregates segment features into a fixed-dimensional speaker embedding:
\begin{equation}
\bm{e}_{\text{spk}} = \text{AttentivePooling}(\bm{S}) = \bm{W}_{\text{proj}} \sum_{i=1}^{T_s} \alpha_i \cdot [\bm{s}_i; \sigma_i] \in \mathbb{R}^{d_e}
\end{equation}
where $\alpha_i = \text{softmax}(\bm{v}^\top \tanh(\bm{W}\bm{s}_i + \bm{b}))$ are attention weights, $\sigma_i$ are local standard deviations, and $\bm{W}_{\text{proj}} \in \mathbb{R}^{d_e \times 2d_s}$ projects the concatenated statistics to $d_e = 512$ dimensions. This captures global speaker characteristics: average pitch range, speaking tempo, and overall voice quality.
\paragraph{Multi-Scale Contrastive Training.} The HSE is trained with a hierarchical contrastive loss on 680,000 hours of multilingual speech data:
\begin{equation}
\mathcal{L}_{\text{HSE}} = \lambda_1 \mathcal{L}_{\text{frame}}^{\text{contrast}} + \lambda_2 \mathcal{L}_{\text{segment}}^{\text{contrast}} + \lambda_3 \mathcal{L}_{\text{utterance}}^{\text{contrast}}
\end{equation}
where each level uses the InfoNCE loss \citep{oord2018representation} with augmented positive pairs (same speaker, different utterance) and in-batch negatives. We set $\lambda_1 = 0.2$, $\lambda_2 = 0.3$, $\lambda_3 = 0.5$.
\paragraph{Speaker Disentanglement.} To separate speaker identity from content and prosody, we attach an auxiliary phoneme classifier to the speaker embedding through a gradient reversal layer \citep{ganin2016domain}: the classifier is trained to predict phonemes from $\bm{e}_{\text{spk}}$, while the reversed gradient drives the encoder to remove phoneme information. Equivalently, the encoder minimizes
\begin{equation}
\mathcal{L}_{\text{disentangle}} = -\gamma \cdot \mathcal{L}_{\text{phoneme\_clf}}(\bm{e}_{\text{spk}})
\end{equation}
where $\gamma = 0.1$ scales the reversed gradient and $\mathcal{L}_{\text{phoneme\_clf}}$ is the cross-entropy loss of the phoneme classifier operating on the speaker embedding.
\subsection{Prosody Transfer Module (PTM)}
\label{sec:ptm}
The PTM generates prosodic features---pitch contour $f_0(t)$, energy envelope $e(t)$, and phoneme durations $d(t)$---conditioned on text, speaker identity, and optional prosodic guidance.
\paragraph{Flow-Matching Formulation.} We model prosody generation as a conditional flow matching problem \citep{lipman2023flow}. Let $\bm{z}_0 \sim \mathcal{N}(0, I)$ be a noise sample and $\bm{z}_1 = [f_0, e, d]$ be the target prosody features. Along the linear interpolation path
\begin{equation}
\bm{z}_t = (1-t)\bm{z}_0 + t\bm{z}_1, \qquad \frac{d\bm{z}_t}{dt} = \bm{z}_1 - \bm{z}_0,
\end{equation}
a neural vector field $\bm{v}_\theta(\bm{z}_t, t, \bm{c})$ is trained to regress this target velocity,
where $\bm{c} = [\tilde{\bm{H}}_{\text{text}}; \bm{e}_{\text{spk}}; \bm{e}_{\text{style}}]$ is the conditioning vector. The training objective is:
\begin{equation}
\mathcal{L}_{\text{flow}} = \mathbb{E}_{t, \bm{z}_0, \bm{z}_1} \left[\|\bm{v}_\theta(\bm{z}_t, t, \bm{c}) - (\bm{z}_1 - \bm{z}_0)\|_2^2\right]
\end{equation}
\paragraph{Style Conditioning.} The style embedding $\bm{e}_{\text{style}}$ can be derived from three sources:
\begin{itemize}
\item \textbf{Reference audio:} A prosody encoder extracts style features from a reference utterance.
\item \textbf{Emotion labels:} A learned embedding table maps categorical emotions (neutral, happy, sad, angry, fearful, surprised, disgusted) to style vectors.
\item \textbf{Natural language descriptions:} A text encoder maps descriptions like ``whispered, with building excitement'' to style vectors via CLIP-like contrastive training on (description, audio) pairs.
\end{itemize}
\paragraph{Architecture.} The PTM uses a DiT (Diffusion Transformer) architecture \citep{peebles2023scalable} with 12 layers, 8 attention heads, and 512-dimensional hidden states. Conditioning is injected through adaptive layer normalization (adaLN-Zero).
\begin{algorithm}[t]
\caption{Zen-Voice Inference Pipeline}
\label{alg:inference}
\begin{algorithmic}[1]
\Require Text $s$, reference audio $\bm{w}_{\text{ref}}$, optional style guidance
\State $\bm{H}_{\text{text}} \leftarrow \text{TextEncoder}(s)$ \Comment{Phoneme encoding}
\State $\bm{e}_{\text{spk}} \leftarrow \text{HSE}(\bm{w}_{\text{ref}})$ \Comment{Speaker embedding}
\State $\bm{e}_{\text{style}} \leftarrow \text{StyleEncoder}(\text{guidance})$ \Comment{Prosody guidance}
\State $\bm{c} \leftarrow [\bm{H}_{\text{text}}; \bm{e}_{\text{spk}}; \bm{e}_{\text{style}}]$ \Comment{Conditioning}
\State $\bm{z}_0 \sim \mathcal{N}(0, I)$ \Comment{Sample noise}
\For{$n = 0$ to $N-1$} \Comment{Euler ODE integration, $t = n\Delta t$, $\Delta t = 1/N$}
\State $\bm{z}_{t+\Delta t} \leftarrow \bm{z}_t + \Delta t \cdot \bm{v}_\theta(\bm{z}_t, t, \bm{c})$
\EndFor
\State $[f_0, e, d] \leftarrow \bm{z}_1$ \Comment{Extract prosody}
\State $\bm{y} \leftarrow \text{NCV}(\bm{H}_{\text{text}}, f_0, e, d, \bm{e}_{\text{spk}})$ \Comment{Waveform synthesis}
\State \Return $\bm{y}$
\end{algorithmic}
\end{algorithm}
\subsection{Neural Codec Vocoder (NCV)}
\label{sec:ncv}
The NCV converts linguistic features, prosodic parameters, and speaker embeddings into high-fidelity audio waveforms.
\paragraph{Codec Token Generation.} We use a residual vector quantization (RVQ) scheme with 8 codebooks, each containing 1024 codes:
\begin{equation}
\bm{q}_l = \text{Quantize}_l\left(\bm{r}_{l-1}\right), \quad \bm{r}_l = \bm{r}_{l-1} - \text{Dequantize}_l(\bm{q}_l)
\end{equation}
where $\bm{r}_0$ is the input acoustic representation and $l \in \{1, \ldots, 8\}$ indexes the codebook level.
\paragraph{Token Prediction.} A 6-layer transformer predicts codec tokens from conditioned features:
\begin{equation}
p(\bm{q}_l | \bm{q}_{<l}, \bm{H}_{\text{text}}, f_0, e, d, \bm{e}_{\text{spk}}) = \text{Transformer}_{\text{codec}}(\bm{q}_{<l}, \bm{c}_{\text{acoustic}})
\end{equation}
We use a grouped prediction scheme: codebooks 1--2 are predicted autoregressively, while codebooks 3--8 are predicted in parallel.
\paragraph{Waveform Decoder.} Codec tokens are decoded to 24kHz waveforms using a HiFi-GAN-style generator \citep{kong2020hifi} with multi-period and multi-scale discriminators:
\begin{equation}
\bm{y} = \text{HiFiGAN}(\text{Dequantize}(\bm{q}_1, \ldots, \bm{q}_8)) \in \mathbb{R}^{L}
\end{equation}
\paragraph{Streaming Architecture.} For real-time applications, the NCV operates in streaming mode with a lookahead of 80ms (2 codec frames). Causal convolutions replace non-causal ones, and the transformer uses sliding window attention of 32 frames. This achieves end-to-end latency of 148ms on a single NVIDIA A10G GPU.
% =============================================================================
\section{Anti-Deepfake Watermarking}
\label{sec:watermark}
Given the potential for misuse of voice cloning technology, we integrate a mandatory watermarking system into the synthesis pipeline.
\subsection{Watermark Design}
The watermarking system embeds a 128-bit payload into the synthesized audio:
\begin{itemize}
\item \textbf{Bits 1--32:} Model identifier and version hash
\item \textbf{Bits 33--64:} Timestamp (Unix epoch, second precision)
\item \textbf{Bits 65--96:} User/API key fingerprint
\item \textbf{Bits 97--128:} HMAC-SHA256 truncated signature over bits 1--96
\end{itemize}
\paragraph{Embedding.} The watermark is embedded using a learned encoder $W_{\text{enc}}$ that modifies the codec tokens before waveform decoding:
\begin{equation}
\tilde{\bm{q}} = \bm{q} + W_{\text{enc}}(\bm{q}, \bm{m})
\end{equation}
where $\bm{m} \in \{0, 1\}^{128}$ is the watermark payload. The training loss is:
\begin{equation}
\mathcal{L}_{\text{wm}} = \lambda_{\text{det}} \cdot \mathcal{L}_{\text{BCE}}(W_{\text{dec}}(\tilde{\bm{y}}), \bm{m}) + \lambda_{\text{qual}} \cdot \|\tilde{\bm{y}} - \bm{y}\|_1 + \lambda_{\text{percept}} \cdot \mathcal{L}_{\text{STFT}}(\tilde{\bm{y}}, \bm{y})
\end{equation}
where $\mathcal{L}_{\text{STFT}}$ is a multi-resolution STFT loss ensuring imperceptibility.
\paragraph{Robustness Training.} During training, we apply a differentiable augmentation pipeline between embedding and detection: MP3 compression (64--320 kbps), Opus and AAC codec simulation, Gaussian and environmental noise (SNR 5--40 dB), resampling (8kHz--48kHz), time stretching ($\pm$20\%), pitch shifting ($\pm$4 semitones), dynamic range compression, and room impulse response convolution.
\subsection{Detection Performance}
\begin{table}[t]
\centering
\caption{Watermark detection accuracy under various audio transformations.}
\label{tab:watermark}
\begin{tabular}{lcc}
\toprule
\textbf{Transformation} & \textbf{Bit Acc. (\%)} & \textbf{Payload Recovery (\%)} \\
\midrule
None (clean) & 99.9 & 99.8 \\
MP3 128 kbps & 99.4 & 99.1 \\
MP3 64 kbps & 98.1 & 96.8 \\
Opus 32 kbps & 97.3 & 95.2 \\
Gaussian noise (SNR 20 dB) & 98.8 & 97.4 \\
Gaussian noise (SNR 10 dB) & 95.2 & 89.6 \\
Resample 8kHz $\rightarrow$ 24kHz & 97.6 & 95.8 \\
Time stretch $\pm$10\% & 98.2 & 96.4 \\
Pitch shift $\pm$2 semitones & 97.8 & 95.9 \\
RIR convolution (medium room) & 98.5 & 97.1 \\
Combined (MP3 + noise + RIR) & 94.8 & 87.3 \\
\midrule
\textbf{AI-generated detection (binary)} & \multicolumn{2}{c}{\textbf{99.7\% accuracy}} \\
\bottomrule
\end{tabular}
\end{table}
\subsection{Perceptual Impact}
The watermark introduces minimal perceptual degradation: A/B testing with 200 listeners showed no statistically significant preference between watermarked and non-watermarked audio ($p = 0.42$, two-tailed binomial test). The signal-to-watermark ratio (SWR) averages 38.2 dB, well above the perceptual threshold.
% =============================================================================
\section{Training}
\label{sec:training}
\subsection{Data}
\begin{table}[t]
\centering
\caption{Training data composition for Zen-Voice.}
\label{tab:data}
\begin{tabular}{llrc}
\toprule
\textbf{Component} & \textbf{Dataset} & \textbf{Hours} & \textbf{Languages} \\
\midrule
\multirow{4}{*}{HSE Pre-training} & VoxCeleb 1\&2 & 7,400 & en \\
& Common Voice 16.0 & 28,000 & 12 \\
& MLS & 50,000 & 8 \\
& Internal (licensed) & 594,600 & 12 \\
\midrule
\multirow{3}{*}{TTS Training} & LibriTTS-R & 585 & en \\
& VCTK & 44 & en \\
& Internal studio recordings & 12,000 & 12 \\
\midrule
Prosody & Expressive audiobooks & 8,400 & en, zh, ja \\
\midrule
Watermark & Synthetic + real mix & 50,000 & 12 \\
\midrule
\multicolumn{2}{l}{\textbf{Total (deduplicated)}} & \textbf{680,000} & 12 \\
\bottomrule
\end{tabular}
\end{table}
\subsection{Training Pipeline}
Training follows a four-stage curriculum:
\paragraph{Stage 1: Speaker Encoder Pre-training (2 weeks, 32 A100 GPUs).} The HSE is trained with the multi-scale contrastive objective on the full 680K-hour dataset. We use AdamW with learning rate $3 \times 10^{-4}$, batch size 4096, and cosine schedule with 5000 warm-up steps.
\paragraph{Stage 2: TTS Model Training (3 weeks, 64 A100 GPUs).} The text encoder, prosody transfer module, and codec predictor are jointly trained on the TTS subset (12,629 hours). The HSE is frozen during this stage. We use AdamW with learning rate $1 \times 10^{-4}$, batch size 256 utterances, and a two-phase schedule: 100K steps on clean studio data, then 200K steps on the full mix with data augmentation.
\paragraph{Stage 3: Vocoder Training (1 week, 16 A100 GPUs).} The HiFi-GAN vocoder is trained with multi-period and multi-scale discriminator losses:
\begin{equation}
\mathcal{L}_{\text{vocoder}} = \mathcal{L}_{\text{adv}} + \lambda_{\text{fm}} \mathcal{L}_{\text{feature}} + \lambda_{\text{mel}} \mathcal{L}_{\text{mel}}
\end{equation}
with $\lambda_{\text{fm}} = 2.0$ and $\lambda_{\text{mel}} = 45.0$.
\paragraph{Stage 4: Watermark Integration (3 days, 8 A100 GPUs).} The watermark encoder and decoder are trained end-to-end with the frozen vocoder, optimizing for detection accuracy under augmentation while minimizing perceptual impact.
\subsection{Emotion and Expressiveness Training}
For emotional speech synthesis, we curate a subset of 2,400 hours of speech with emotion annotations across seven categories. Annotations are obtained through human labeling (800 hours) and a pre-trained speech emotion recognition model validated against human judgments (Cohen's $\kappa = 0.78$). The PTM is fine-tuned with an emotion classification auxiliary loss:
\begin{equation}
\mathcal{L}_{\text{emotion}} = \mathcal{L}_{\text{flow}} + \mu \cdot \text{CE}(\text{EmotionClf}(\bm{z}_1), y_{\text{emotion}})
\end{equation}
where $\mu = 0.1$ and $y_{\text{emotion}}$ is the ground-truth emotion label.
% =============================================================================
\section{Evaluation}
\label{sec:evaluation}
We evaluate Zen-Voice on three dimensions: synthesis quality (naturalness), speaker similarity (cloning fidelity), and prosodic expressiveness.
\subsection{Benchmarks and Metrics}
\paragraph{Datasets.}
\begin{itemize}
\item \textbf{LibriTTS test-clean:} 500 utterances from 39 speakers, standard TTS evaluation set.
\item \textbf{VCTK:} 109 speakers with diverse accents, 400 utterances per speaker.
\item \textbf{ZenVoice-Eval:} Our multilingual benchmark with 1200 utterances across 12 languages, 100 speakers, balanced by gender and age.
\end{itemize}
\paragraph{Metrics.}
\begin{itemize}
\item \textbf{MOS (Mean Opinion Score):} 5-point Likert scale rated by 200 native speakers.
\item \textbf{UTMOS:} Automated MOS prediction using the UTokyo-SaruLab model \citep{saeki2022utmos}.
\item \textbf{Speaker Verification EER:} Equal Error Rate of a pre-trained ECAPA-TDNN \citep{desplanques2020ecapa} speaker verification model.
\item \textbf{Word Error Rate (WER):} Intelligibility measured via Whisper-large-v3 transcription.
\item \textbf{F0 RMSE:} Root mean square error of pitch contour relative to reference.
\end{itemize}
\subsection{Baselines}
\begin{itemize}
\item \textbf{VALL-E} \citep{wang2023neural}: Autoregressive neural codec language model.
\item \textbf{VALL-E 2} \citep{chen2024vall}: Improved VALL-E with grouped code modeling.
\item \textbf{VoiceBox} \citep{le2024voicebox}: Flow-matching TTS with infilling.
\item \textbf{NaturalSpeech 3} \citep{ju2024naturalspeech}: Factorized diffusion with discrete tokens.
\item \textbf{XTTS v2} \citep{casanova2024xtts}: Open-source multilingual TTS.
\end{itemize}
\subsection{Results}
\begin{table}[t]
\centering
\caption{Speech quality on LibriTTS test-clean (3-second reference).}
\label{tab:quality}
\begin{tabular}{lcccc}
\toprule
\textbf{Method} & \textbf{Nat. MOS} & \textbf{Sim. MOS} & \textbf{UTMOS} & \textbf{WER (\%)} \\
\midrule
Ground Truth & 4.52 & -- & 4.31 & 2.1 \\
\midrule
VALL-E & 3.84 & 3.62 & 3.71 & 5.8 \\
VALL-E 2 & 4.02 & 3.81 & 3.89 & 4.2 \\
VoiceBox & 4.12 & 3.94 & 4.01 & 3.6 \\
NaturalSpeech 3 & 4.18 & 4.02 & 4.08 & 3.3 \\
XTTS v2 & 3.91 & 3.72 & 3.82 & 4.8 \\
\midrule
Zen-Voice & \textbf{4.38} & \textbf{4.21} & \textbf{4.22} & \textbf{2.7} \\
\bottomrule
\end{tabular}
\end{table}
\begin{table}[t]
\centering
\caption{Speaker verification EER on VCTK (lower is better).}
\label{tab:speaker_ver}
\begin{tabular}{lccc}
\toprule
\textbf{Method} & \textbf{3-sec ref} & \textbf{5-sec ref} & \textbf{10-sec ref} \\
\midrule
Ground Truth & \multicolumn{3}{c}{1.8\%} \\
\midrule
VALL-E & 8.4\% & 6.2\% & 4.8\% \\
VALL-E 2 & 6.1\% & 4.8\% & 3.6\% \\
VoiceBox & 5.3\% & 4.1\% & 3.2\% \\
NaturalSpeech 3 & 4.7\% & 3.6\% & 2.8\% \\
\midrule
Zen-Voice & \textbf{3.4\%} & \textbf{2.6\%} & \textbf{2.1\%} \\
\bottomrule
\end{tabular}
\end{table}
\begin{table}[t]
\centering
\caption{Multilingual evaluation on ZenVoice-Eval (10-second reference).}
\label{tab:multilingual}
\begin{tabular}{lcccc}
\toprule
\textbf{Language} & \textbf{Nat. MOS} & \textbf{Sim. MOS} & \textbf{WER (\%)} & \textbf{F0 RMSE} \\
\midrule
English & 4.41 & 4.52 & 2.4 & 18.3 \\
Mandarin & 4.32 & 4.38 & 3.1 & 22.1 \\
Japanese & 4.28 & 4.31 & 3.8 & 19.7 \\
Korean & 4.18 & 4.22 & 4.2 & 21.4 \\
Spanish & 4.35 & 4.41 & 2.8 & 17.8 \\
French & 4.31 & 4.36 & 3.2 & 18.9 \\
German & 4.27 & 4.33 & 3.5 & 20.2 \\
Portuguese & 4.22 & 4.28 & 3.9 & 19.3 \\
Italian & 4.29 & 4.34 & 3.3 & 18.6 \\
Hindi & 4.08 & 4.12 & 5.1 & 24.3 \\
Arabic & 4.04 & 4.08 & 5.6 & 25.1 \\
Russian & 4.15 & 4.19 & 4.4 & 22.8 \\
\midrule
\textbf{Average} & \textbf{4.24} & \textbf{4.30} & \textbf{3.8} & \textbf{20.7} \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:quality} shows Zen-Voice achieves a naturalness MOS of 4.38 on LibriTTS, closing the gap to ground-truth (4.52) more than any prior method. Speaker similarity MOS of 4.21 with just 3 seconds of reference significantly outperforms all baselines.
Table~\ref{tab:speaker_ver} demonstrates speaker verification EER of 2.1\% with 10-second references, approaching the ground-truth EER of 1.8\%.
Table~\ref{tab:multilingual} shows consistent quality across 12 languages, with highest performance on English and slightly lower on Hindi and Arabic where training data is more limited.
\subsection{Emotion Transfer Evaluation}
\begin{table}[t]
\centering
\caption{Emotion recognition accuracy of synthesized emotional speech.}
\label{tab:emotion}
\begin{tabular}{lccccccc}
\toprule
\textbf{Method} & \textbf{Neu} & \textbf{Hap} & \textbf{Sad} & \textbf{Ang} & \textbf{Fear} & \textbf{Sur} & \textbf{Avg} \\
\midrule
Reference audio & 92.1 & 87.3 & 84.6 & 89.2 & 78.4 & 82.1 & 85.6 \\
\midrule
VALL-E & 78.3 & 52.1 & 48.7 & 54.2 & 41.3 & 45.8 & 53.4 \\
NaturalSpeech 3 & 84.2 & 68.4 & 62.1 & 71.3 & 55.2 & 58.7 & 66.7 \\
Zen-Voice & \textbf{89.4} & \textbf{78.2} & \textbf{74.8} & \textbf{81.3} & \textbf{68.4} & \textbf{72.1} & \textbf{77.4} \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:emotion} evaluates emotional expressiveness using a pre-trained speech emotion recognition model. Zen-Voice achieves 77.4\% average emotion recognition accuracy, significantly outperforming baselines and approaching the 85.6\% accuracy on real emotional speech.
\subsection{Latency Analysis}
\begin{table}[t]
\centering
\caption{Inference latency for 10-second utterances on NVIDIA A10G GPU.}
\label{tab:latency}
\begin{tabular}{lccc}
\toprule
\textbf{Method} & \textbf{First Token (ms)} & \textbf{RTF} & \textbf{Streaming} \\
\midrule
VALL-E & 1,240 & 0.82 & No \\
VALL-E 2 & 680 & 0.51 & No \\
VoiceBox & 320 & 0.24 & No \\
NaturalSpeech 3 & 410 & 0.31 & No \\
\midrule
Zen-Voice (batch) & 280 & 0.18 & No \\
Zen-Voice (stream) & \textbf{148} & \textbf{0.21} & \textbf{Yes} \\
\bottomrule
\end{tabular}
\end{table}
Zen-Voice achieves a real-time factor (RTF) of 0.18 in batch mode and 0.21 in streaming mode, both well under 1.0. In streaming mode, the first audio chunk is produced after only 148\,ms.
% =============================================================================
\section{Ablation Studies}
\label{sec:ablation}
\subsection{Speaker Encoder Design}
\begin{table}[t]
\centering
\caption{Speaker encoder architecture ablation on VCTK (3-second reference).}
\label{tab:spk_ablation}
\begin{tabular}{lccc}
\toprule
\textbf{Architecture} & \textbf{Sim. MOS} & \textbf{EER (\%)} & \textbf{Params} \\
\midrule
ECAPA-TDNN (frozen) & 3.72 & 6.8 & 6.2M \\
x-vector & 3.68 & 7.2 & 4.8M \\
Frame-level only & 3.91 & 5.1 & 12M \\
Segment-level only & 3.98 & 4.4 & 18M \\
HSE (no disentangle) & 4.12 & 3.8 & 32M \\
HSE (full) & \textbf{4.21} & \textbf{3.4} & 32M \\
\bottomrule
\end{tabular}
\end{table}
The hierarchical design contributes a 0.30 MOS improvement in speaker similarity over the frame-level-only variant, and disentanglement adds a further 0.09 MOS.
\subsection{Prosody Module Design}
\begin{table}[t]
\centering
\caption{Prosody generation method ablation on LibriTTS.}
\label{tab:prosody_ablation}
\begin{tabular}{lccc}
\toprule
\textbf{Method} & \textbf{Nat. MOS} & \textbf{F0 RMSE (Hz)} & \textbf{Steps} \\
\midrule
Duration predictor only & 4.02 & 32.1 & 1 \\
VAE prosody & 4.14 & 26.8 & 1 \\
Diffusion (DDPM, 50 steps) & 4.28 & 21.4 & 50 \\
Flow matching (10 steps) & 4.35 & 19.2 & 10 \\
Flow matching (25 steps) & \textbf{4.38} & \textbf{18.3} & 25 \\
\bottomrule
\end{tabular}
\end{table}
Flow matching achieves the best quality at 25 ODE steps, with 10 steps providing a favorable quality--speed trade-off.
\subsection{Reference Length Sensitivity}
\begin{table}[t]
\centering
\caption{Speaker similarity vs. reference audio length on VCTK.}
\label{tab:ref_length}
\begin{tabular}{lcccccc}
\toprule
\textbf{Ref Length} & \textbf{1s} & \textbf{3s} & \textbf{5s} & \textbf{10s} & \textbf{30s} & \textbf{60s} \\
\midrule
Sim. MOS & 3.82 & 4.21 & 4.38 & 4.52 & 4.61 & 4.63 \\
EER (\%) & 7.2 & 3.4 & 2.6 & 2.1 & 1.9 & 1.9 \\
\bottomrule
\end{tabular}
\end{table}
Performance improves significantly from 1 to 10 seconds with diminishing returns beyond 30 seconds.
% =============================================================================
\section{Discussion}
\label{sec:discussion}
\subsection{Strengths}
Zen-Voice's modular architecture provides three distinct advantages: (1) the separation of speaker identity and prosody enables independent control; (2) the flow-matching formulation provides deterministic, high-quality prosody generation in few ODE steps; (3) the streaming architecture enables real-time applications without sacrificing quality.
\subsection{Limitations}
\begin{itemize}
\item \textbf{Singing voice:} Zen-Voice produces suboptimal results for singing, where pitch accuracy and vibrato control are critical.
\item \textbf{Extremely short references:} Below 3 seconds, speaker similarity degrades noticeably.
\item \textbf{Cross-lingual cloning:} Accent artifacts can occur when cloning into a language not present in the reference.
\item \textbf{Watermark removal:} Targeted adversarial attacks could potentially remove the watermark.
\end{itemize}
\subsection{Ethical Considerations}
Voice cloning poses significant ethical risks. We mitigate these through:
\begin{enumerate}
\item \textbf{Mandatory watermarking:} All API output carries provenance markers that cannot be disabled.
\item \textbf{Consent verification:} API users must attest consent from the voice owner.
\item \textbf{Voice blocklist:} Protected voices are rejected by speaker verification at inference time.
\item \textbf{Detection tools:} The watermark detector is released as a free, open-source tool.
\end{enumerate}
% =============================================================================
\section{Conclusion}
\label{sec:conclusion}
We presented Zen-Voice, a neural speech synthesis system achieving state-of-the-art zero-shot voice cloning from as little as 3 seconds of reference audio. The hierarchical speaker encoder, flow-matching prosody transfer module, and streaming neural codec vocoder combine to produce natural, expressive speech that faithfully reproduces target speaker characteristics. Integrated anti-deepfake watermarking ensures responsible deployment.
Zen-Voice achieves a naturalness MOS of 4.38 on LibriTTS test-clean, a speaker similarity MOS of 4.21 with 3-second references, and a speaker verification EER of 2.1\% with 10-second references. The streaming architecture delivers sub-150\,ms latency for real-time conversational applications.
Models and inference code are available at \url{https://github.com/hanzoai/zen-voice} under Apache 2.0. The watermark detection tool is released at \url{https://github.com/hanzoai/zen-voice-detect}.
% =============================================================================
% REFERENCES
% =============================================================================
\begin{thebibliography}{28}
% NOTE(review): verify this author list --- it appears to match the authors of
% YourTTS (Casanova et al., 2022) rather than the XTTS (Interspeech 2024) paper.
\bibitem[Casanova et~al.(2024)]{casanova2024xtts}
Casanova, E., Weber, J., Shulby, C.~D., Junior, A.~C., G{\"o}lge, E., and Ponti, M.~A.
\newblock XTTS: A massively multilingual zero-shot text-to-speech model.
\newblock In \emph{Proceedings of Interspeech}, 2024.
\bibitem[Chen et~al.(2024)]{chen2024vall}
Chen, S., Wu, S., Wang, C., Chen, S., Wu, Y., Liu, S., Zhou, L., Liu, J., Kanda, N., Yoshioka, T., et~al.
\newblock VALL-E 2: Neural codec language models are human parity zero-shot text to speech synthesizers.
\newblock \emph{arXiv preprint arXiv:2406.05370}, 2024.
\bibitem[Cooper et~al.(2020)]{cooper2020zero}
Cooper, E., Lai, C.-I., Yasuda, Y., Fang, F., Wang, X., Chen, N., and Yamagishi, J.
\newblock Zero-shot multi-speaker text-to-speech with state-of-the-art neural speaker embeddings.
\newblock In \emph{Proceedings of ICASSP}, pp.\ 6184--6188, 2020.
\bibitem[Cox et~al.(2007)]{cox2007digital}
Cox, I., Miller, M., Bloom, J., Fridrich, J., and Kalker, T.
\newblock \emph{Digital Watermarking and Steganography}.
\newblock Morgan Kaufmann, 2nd edition, 2007.
\bibitem[Desplanques et~al.(2020)]{desplanques2020ecapa}
Desplanques, B., Thienpondt, J., and Demuynck, K.
\newblock ECAPA-TDNN: Emphasized channel attention, propagation and aggregation in TDNN based speaker verification.
\newblock In \emph{Proceedings of Interspeech}, pp.\ 3830--3834, 2020.
\bibitem[Ganin et~al.(2016)]{ganin2016domain}
Ganin, Y., Ustinova, E., Ajakan, H., Germain, P., Larochelle, H., Laviolette, F., Marchand, M., and Lempitsky, V.
\newblock Domain-adversarial training of neural networks.
\newblock \emph{JMLR}, 17(59):1--35, 2016.
\bibitem[Jia et~al.(2018)]{jia2018transfer}
Jia, Y., Zhang, Y., Weiss, R., Wang, Q., Shen, J., Ren, F., Chen, Z., Nguyen, P., Pang, R., Lopez~Moreno, I., and Wu, Y.
\newblock Transfer learning from speaker verification to multispeaker text-to-speech synthesis.
\newblock In \emph{NeurIPS}, pp.\ 4480--4490, 2018.
\bibitem[Ju et~al.(2024)]{ju2024naturalspeech}
Ju, Z., Wang, Y., Shen, K., Tan, X., Xin, D., Yang, D., Liu, Y., Leng, Y., Song, K., Tang, S., et~al.
\newblock NaturalSpeech 3: Zero-shot speech synthesis with factorized codec and diffusion models.
\newblock In \emph{ICML}, 2024.
\bibitem[Kalchbrenner et~al.(2018)]{kalchbrenner2018efficient}
Kalchbrenner, N., Elsen, E., Simonyan, K., Noury, S., Casagrande, N., Lockhart, E., Stimberg, F., Oord, A., Dieleman, S., and Kavukcuoglu, K.
\newblock Efficient neural audio synthesis.
\newblock In \emph{ICML}, pp.\ 2410--2419, 2018.
\bibitem[Kong et~al.(2020)]{kong2020hifi}
Kong, J., Kim, J., and Bae, J.
\newblock HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis.
\newblock In \emph{NeurIPS}, pp.\ 17022--17033, 2020.
\bibitem[Le et~al.(2024)]{le2024voicebox}
Le, M., Vyas, A., Shi, B., Karrer, B., Sari, L., Moritz, R., Williamson, M., Manohar, V., Adi, Y., Mahadeokar, J., and Hsu, W.-N.
\newblock Voicebox: Text-guided multilingual universal speech generation at scale.
\newblock In \emph{NeurIPS}, 2024.
\bibitem[Lipman et~al.(2023)]{lipman2023flow}
Lipman, Y., Chen, R.~T.~Q., Ben-Hamu, H., Nickel, M., and Le, M.
\newblock Flow matching for generative modeling.
\newblock In \emph{ICLR}, 2023.
\bibitem[Oord et~al.(2016)]{oord2016wavenet}
Oord, A.~v.~d., Dieleman, S., Zen, H., Simonyan, K., Vinyals, O., Graves, A., Kalchbrenner, N., Senior, A., and Kavukcuoglu, K.
\newblock WaveNet: A generative model for raw audio.
\newblock \emph{arXiv preprint arXiv:1609.03499}, 2016.
\bibitem[Oord et~al.(2018)]{oord2018representation}
Oord, A.~v.~d., Li, Y., and Vinyals, O.
\newblock Representation learning with contrastive predictive coding.
\newblock \emph{arXiv preprint arXiv:1807.03748}, 2018.
\bibitem[Pavlovic and Koeppl(2022)]{pavlovic2022robust}
Pavlovic, N. and Koeppl, H.
\newblock Robust audio watermarking with deep neural networks.
\newblock In \emph{IEEE WIFS}, 2022.
\bibitem[Peebles and Xie(2023)]{peebles2023scalable}
Peebles, W. and Xie, S.
\newblock Scalable diffusion models with transformers.
\newblock In \emph{ICCV}, pp.\ 4195--4205, 2023.
\bibitem[Ren et~al.(2019)]{ren2019fastspeech}
Ren, Y., Ruan, Y., Tan, X., Qin, T., Zhao, S., Zhao, Z., and Liu, T.-Y.
\newblock FastSpeech: Fast, robust and controllable text to speech.
\newblock In \emph{NeurIPS}, 2019.
\bibitem[Ren et~al.(2021)]{ren2021fastspeech}
Ren, Y., Hu, C., Tan, X., Qin, T., Zhao, S., Zhao, Z., and Liu, T.-Y.
\newblock FastSpeech 2: Fast and high-quality end-to-end text to speech.
\newblock In \emph{ICLR}, 2021.
\bibitem[Roman et~al.(2024)]{roman2024proactive}
San~Roman, R., Fernandez, P., Elsahar, H., D{\'e}fossez, A., Furon, T., and Tuli, T.
\newblock Proactive detection of voice cloning with localized watermarking.
\newblock In \emph{ICML}, 2024.
\bibitem[Saeki et~al.(2022)]{saeki2022utmos}
Saeki, T., Xin, D., Nakata, W., Koriyama, T., Takamichi, S., and Saruwatari, H.
\newblock UTMOS: UTokyo-SaruLab system for VoiceMOS Challenge 2022.
\newblock In \emph{Proceedings of Interspeech}, 2022.
% NOTE(review): this entry appears to duplicate \texttt{roman2024proactive} above
% (same authors, venue, and year; ``AudioSeal'' is the system name of that paper).
% Confirm whether both keys are cited, or merge them into a single entry.
% Also verify the author ``Tuli, T.'' --- the AudioSeal author list reads ``Tran, T.''
\bibitem[San~Roman et~al.(2024)]{san2024proactive}
San~Roman, R., Fernandez, P., D{\'e}fossez, A., Furon, T., Tuli, T., and Elsahar, H.
\newblock AudioSeal: Proactive localized watermarking.
\newblock In \emph{ICML}, 2024.
\bibitem[Shen et~al.(2018)]{shen2018natural}
Shen, J., Pang, R., Weiss, R.~J., Schuster, M., Jaitly, N., Yang, Z., Chen, Z., Zhang, Y., Wang, Y., Skerry-Ryan, R., et~al.
\newblock Natural TTS synthesis by conditioning WaveNet on mel spectrogram predictions.
\newblock In \emph{ICASSP}, pp.\ 4779--4783, 2018.
\bibitem[Sun et~al.(2020)]{sun2020generating}
Sun, G., Zhang, Y., Weiss, R.~J., Cao, Y., Zen, H., and Wu, Y.
\newblock Generating diverse and natural text-to-speech samples using a quantized fine-grained VAE and autoregressive prosody prior.
\newblock In \emph{ICASSP}, pp.\ 6699--6703, 2020.
\bibitem[Wang et~al.(2017)]{wang2017tacotron}
Wang, Y., Skerry-Ryan, R., Stanton, D., Wu, Y., Weiss, R.~J., Jaitly, N., Yang, Z., Xiao, Y., Chen, Z., Bengio, S., et~al.
\newblock Tacotron: Towards end-to-end speech synthesis.
\newblock In \emph{Interspeech}, pp.\ 4006--4010, 2017.
\bibitem[Wang et~al.(2018)]{wang2018style}
Wang, Y., Stanton, D., Zhang, Y., Skerry-Ryan, R., Battenberg, E., Shor, J., Xiao, Y., Jia, Y., Ren, F., and Saurous, R.~A.
\newblock Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis.
\newblock In \emph{ICML}, pp.\ 5180--5189, 2018.
\bibitem[Wang et~al.(2023)]{wang2023neural}
Wang, C., Chen, S., Wu, Y., Zhang, Z., Zhou, L., Liu, S., Chen, Z., Liu, Y., Wang, H., Li, J., et~al.
\newblock Neural codec language models are zero-shot text to speech synthesizers.
\newblock \emph{arXiv preprint arXiv:2301.02111}, 2023.
\bibitem[Zhang et~al.(2019)]{zhang2019learning}
Zhang, Y., Pan, S., He, L., and Ling, Z.-H.
\newblock Learning latent representations for style control and transfer in end-to-end speech synthesis.
\newblock In \emph{ICASSP}, pp.\ 6945--6949, 2019.
\end{thebibliography}
% =============================================================================
\appendix
\section{Model Hyperparameters}
\label{app:hyperparams}
\begin{table}[h]
\centering
\caption{Zen-Voice module hyperparameters.}
\label{tab:hyperparams}
\begin{tabular}{llc}
\toprule
\textbf{Module} & \textbf{Parameter} & \textbf{Value} \\
\midrule
\multirow{4}{*}{Text Encoder} & Layers & 6 \\
& Hidden dim & 512 \\
& Attention heads & 8 \\
& FFN dim & 2048 \\
\midrule
\multirow{5}{*}{HSE} & Frame encoder channels & [64, 128, 256] \\
& Segment transformer layers & 4 \\
& Speaker embedding dim & 512 \\
& Total parameters & 32M \\
& Contrastive temperature & 0.07 \\
\midrule
\multirow{4}{*}{Prosody Transfer} & DiT layers & 12 \\
& Hidden dim & 512 \\
& ODE steps (inference) & 25 \\
& Classifier-free guidance & 2.0 \\
\midrule
\multirow{4}{*}{Neural Codec} & RVQ codebooks & 8 \\
& Codebook size & 1024 \\
& Codec frame rate & 50 Hz \\
& Streaming lookahead & 80 ms \\
\midrule
\multirow{3}{*}{Watermark} & Payload bits & 128 \\
& Encoder layers & 4 \\
& SWR target & $\geq$35 dB \\
\bottomrule
\end{tabular}
\end{table}
\section{Computational Requirements}
\label{app:compute}
\begin{table}[h]
\centering
\caption{Inference computational requirements by deployment configuration.}
\label{tab:compute_req}
\begin{tabular}{lccc}
\toprule
\textbf{Configuration} & \textbf{GPU} & \textbf{VRAM} & \textbf{RTF} \\
\midrule
Full model (FP16) & A100 80GB & 14.2 GB & 0.12 \\
Full model (FP16) & A10G 24GB & 14.2 GB & 0.18 \\
INT8 quantized & T4 16GB & 8.1 GB & 0.34 \\
INT4 quantized & L4 24GB & 5.2 GB & 0.42 \\
Streaming (FP16) & A10G 24GB & 14.8 GB & 0.21 \\
CPU (INT4) & 32-core Xeon & 6.8 GB RAM & 2.8 \\
\bottomrule
\end{tabular}
\end{table}
The INT8 quantized model on T4 achieves real-time synthesis (RTF $<$ 1.0) at approximately \$0.0004 per second of generated audio.
\end{document}