\documentclass[11pt,a4paper]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath,amsfonts,amssymb}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{color}
\usepackage{booktabs}
\usepackage{float}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{geometry}
\geometry{margin=1in}
\definecolor{zenblue}{RGB}{41,121,255}
\hypersetup{colorlinks=true,linkcolor=zenblue,urlcolor=zenblue,citecolor=zenblue}
\title{\textbf{DSO: Decentralized Training Infrastructure for Zen}\\
\large Technical Report v2025.06}
\author{Antje Worring, Zach Kelling \\ Zen LM Research Team\\
\texttt{research@zenlm.org}}
\date{June 2025}
\begin{document}
\maketitle
\begin{abstract}
We present Decentralized Semantic Optimization (DSO), the distributed training
infrastructure used for the Zen model family. DSO addresses the challenge of
coordinating gradient aggregation across heterogeneous, geographically distributed
compute nodes without a central parameter server. Key contributions include
semantic-aware gradient compression that preserves model-semantic fidelity at 94\%
bandwidth reduction, a peer-to-peer gossip protocol for Byzantine-fault-tolerant
gradient sharing, and integration with the Active Semantic Optimization (ASO) protocol
for decentralized reward-model alignment. On a 256-node commodity cluster, DSO achieves
convergence matching centralized training with 94\% bandwidth reduction and robustness
against up to 20\% Byzantine nodes.
\end{abstract}
\section{Introduction}
Training frontier language models is increasingly constrained by communication bottlenecks
in centralized data centers. All-reduce operations across thousands of GPUs require
high-bandwidth InfiniBand interconnects that are expensive and geographically centralized.
DSO enables training on distributed, heterogeneous compute, including voluntarily
contributed nodes, cloud spot instances, and nodes operated by cooperating
organizations, without sacrificing convergence quality.
DSO is motivated by three constraints:
\begin{enumerate}
\item \textbf{Bandwidth}: Cross-datacenter bandwidth is 10--100$\times$ lower than
NVLink/InfiniBand within a cluster.
\item \textbf{Reliability}: Heterogeneous nodes fail, disconnect, or behave adversarially.
\item \textbf{Privacy}: Participating organizations may not want to expose raw gradients.
\end{enumerate}
DSO addresses all three via semantic gradient compression, gossip-based aggregation,
and differential privacy guarantees. We describe the system architecture, algorithms,
and experimental results.
\section{Background}
\subsection{Distributed Gradient Aggregation}
Standard distributed training uses synchronous all-reduce (e.g., NCCL ring-allreduce)
to average gradients across workers. This requires $O(N)$ bandwidth per step and
cannot tolerate node failures without restart~\cite{rajbhandari2020zero}.
Asynchronous SGD~\cite{dean2012distbelief} allows stale gradients but suffers from
convergence degradation at high staleness. Federated learning~\cite{mcmahan2017fedavg}
addresses heterogeneity but was designed for small models on mobile devices.
\subsection{Gradient Compression}
Gradient sparsification (Top-$k$~\cite{stich2018sparsified}) and quantization
(1-bit SGD~\cite{seide20141bit}) reduce communication cost at the expense of convergence
rate. Error feedback mechanisms~\cite{karimireddy2019error} recover convergence but
require error memory per node.
\subsection{Byzantine Fault Tolerance}
Byzantine-fault-tolerant SGD~\cite{blanchard2017machine} uses coordinate-wise median
or trimmed mean to filter malicious gradients. These methods have high computational
overhead and assume known Byzantine fraction. DSO uses a cryptographic commitment
scheme to detect and filter Byzantine updates with $O(\log N)$ overhead.
\section{DSO Architecture}
\subsection{System Overview}
DSO organizes compute nodes into a two-tier hierarchy:
\begin{itemize}
\item \textbf{Aggregator nodes} (8--16 per cluster): Receive gradients from workers,
apply compression and aggregation, and propagate to peer aggregators via gossip.
\item \textbf{Worker nodes} ($N$ total): Compute forward/backward passes on local
data shards, compress gradients, and send to their assigned aggregator.
\end{itemize}
Workers are assigned to aggregators based on network proximity, measured via round-trip latency.
Aggregators form a gossip overlay network for inter-cluster communication.
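The proximity-based assignment can be sketched as a greedy, capacity-limited matching. The following is an illustrative sketch only; function and variable names (\texttt{assign\_workers}, \texttt{latency\_ms}, \texttt{capacity}) are ours, not identifiers from the DSO codebase.

```python
# Illustrative sketch of latency-based worker-to-aggregator assignment.
# Names are hypothetical, not from the DSO implementation.

def assign_workers(latency_ms, capacity):
    """Assign each worker to its lowest-latency aggregator, subject to a
    per-aggregator capacity limit.

    latency_ms: dict worker -> dict aggregator -> measured RTT in ms
    capacity:   maximum number of workers per aggregator
    """
    load = {}
    assignment = {}
    for worker, rtts in latency_ms.items():
        # Try aggregators in order of increasing measured latency.
        for agg in sorted(rtts, key=rtts.get):
            if load.get(agg, 0) < capacity:
                assignment[worker] = agg
                load[agg] = load.get(agg, 0) + 1
                break
    return assignment
```

A capacity limit keeps any single aggregator from becoming a hotspot when many workers share a low-latency path to it.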
\subsection{Semantic Gradient Compression}
Standard gradient compression (Top-$k$, random sparsification) treats all gradient
dimensions as equal. DSO introduces semantic-aware compression that prioritizes
gradient dimensions with high impact on semantic representations:
\begin{equation}
\text{SemImportance}(g_i) = |g_i| \cdot \text{SemSensitivity}(i)
\end{equation}
where $\text{SemSensitivity}(i)$ is a per-parameter importance score derived from
activation statistics on a semantic probe task. Parameters in attention layers that
are highly sensitive to semantic probes receive higher importance and lower compression.
\begin{algorithm}[H]
\caption{Semantic Gradient Compression}
\begin{algorithmic}[1]
\REQUIRE Gradient $g \in \mathbb{R}^d$, semantic sensitivity $s \in \mathbb{R}^d$, target bandwidth $k$
\ENSURE Compressed gradient $\tilde{g}$, error buffer $e$
\STATE $g' \leftarrow g + e$ \COMMENT{Add error feedback}
\STATE $\text{importance} \leftarrow |g'| \cdot s$
\STATE $\mathcal{I} \leftarrow \text{TopK}(\text{importance}, k)$ \COMMENT{Select top-$k$ by semantic importance}
\STATE $\tilde{g} \leftarrow \text{Quantize}(g'[\mathcal{I}], \text{bits}=8)$
\STATE $e \leftarrow g' - \text{Expand}(\tilde{g}, \mathcal{I})$ \COMMENT{Update error buffer}
\RETURN $\tilde{g}$, $e$
\end{algorithmic}
\end{algorithm}
Semantic sensitivity scores $s$ are computed once per 1000 training steps on a
fixed semantic probe dataset (10K samples from Wikipedia) and updated incrementally.
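Algorithm 1 can be realized in a few lines of NumPy. The sketch below follows the algorithm's steps (error feedback, semantic-weighted Top-$k$, INT8 quantization, error-buffer update); it is a minimal illustration, not the production implementation.

```python
# Minimal NumPy sketch of Algorithm 1 (semantic gradient compression
# with error feedback). Illustrative only.
import numpy as np

def compress(g, s, e, k, bits=8):
    """Compress gradient g given sensitivity scores s and error buffer e.
    Returns (indices, quantized values, scale, updated error buffer)."""
    g_corr = g + e                                 # add error feedback
    importance = np.abs(g_corr) * s                # semantic importance
    idx = np.argpartition(importance, -k)[-k:]     # top-k by importance
    vals = g_corr[idx]
    # Symmetric quantization of the selected values to `bits` bits.
    levels = 2 ** (bits - 1) - 1
    scale = np.max(np.abs(vals)) / levels if np.any(vals) else 1.0
    q = np.round(vals / scale).astype(np.int8)
    # Reconstruct what the receiver will decode; residual goes to the
    # error buffer and is re-injected at the next step.
    recon = np.zeros_like(g)
    recon[idx] = q.astype(np.float32) * scale
    e_new = g_corr - recon
    return idx, q, scale, e_new
```

The receiver reconstructs the sparse gradient as \texttt{recon[idx] = q * scale}; everything not transmitted is carried forward in the error buffer rather than discarded, which is what preserves convergence under aggressive sparsification.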
\subsection{Peer-to-Peer Gradient Sharing}
Aggregators share gradients via a gossip protocol adapted from epidemic broadcast~\cite{demers1987epidemic}:
\begin{algorithm}[H]
\caption{DSO Gossip Aggregation}
\begin{algorithmic}[1]
\REQUIRE Aggregator set $\mathcal{A}$, local gradient $g_\text{local}$, fanout $f=4$
\STATE $G \leftarrow \{g_\text{local}\}$
\FOR{round $r = 1 \ldots R$}
\STATE Select $f$ peers $\mathcal{P} \leftarrow \text{RandomSample}(\mathcal{A} \setminus \{\text{self}\}, f)$
\FOR{each peer $p \in \mathcal{P}$}
\STATE Send $G$ to $p$; Receive $G_p$ from $p$
\STATE $G \leftarrow G \cup G_p$ \COMMENT{Accumulate received gradients}
\ENDFOR
\ENDFOR
\STATE $g_\text{agg} \leftarrow \text{ByzantineFilter}(G)$
\RETURN $g_\text{agg}$
\end{algorithmic}
\end{algorithm}
With fanout $f=4$, information spreads to all $|\mathcal{A}|=16$ aggregators within
$\lceil \log_f |\mathcal{A}| \rceil = 2$ rounds with high probability; DSO runs $R=3$
rounds as a margin against unlucky peer sampling.
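The dissemination behavior of Algorithm 2 is easy to check in a toy simulation: each aggregator starts knowing only its own gradient ID and exchanges its known set with $f$ random peers per synchronous round. This is an illustrative model of the protocol, not DSO code.

```python
# Toy push-pull gossip simulation for Algorithm 2. Each node tracks the
# set of gradient IDs it has seen; rounds are synchronous (peers exchange
# snapshots taken at the start of the round). Illustrative only.
import random

def gossip(n_aggs=16, fanout=4, rounds=3, seed=0):
    rng = random.Random(seed)
    known = [{i} for i in range(n_aggs)]      # gradient IDs known per node
    for _ in range(rounds):
        snapshot = [set(k) for k in known]    # start-of-round state
        for a in range(n_aggs):
            peers = rng.sample([p for p in range(n_aggs) if p != a], fanout)
            for p in peers:
                known[a] |= snapshot[p]       # pull from peer
                known[p] |= snapshot[a]       # push to peer
    return known
```

Running this with $n=16$, $f=4$, $R=3$ shows every node's set growing geometrically per round; in repeated trials all 16 nodes typically hold all 16 gradients within 2--3 rounds, matching the $\lceil \log_f |\mathcal{A}| \rceil$ estimate.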
\subsection{Byzantine Fault Tolerance}
Each worker submits a cryptographic commitment $c_i = H(g_i \| \text{nonce}_i)$ before
sending the gradient, where $H$ is SHA-3. Aggregators verify commitments and apply
coordinate-wise trimmed mean, discarding the top and bottom $\beta$ fraction of
values per coordinate:
\begin{equation}
g_\text{agg}[j] = \frac{1}{N(1-2\beta)} \sum_{i : \beta N < \text{rank}_j(g_i) < (1-\beta)N} g_i[j]
\end{equation}
with $\beta = 0.1$, providing robustness against up to 20\% Byzantine nodes.
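The two stages (commitment verification, then coordinate-wise trimming) can be sketched as follows. This is a simplified illustration; function names and the exact rank-cutoff discretization are ours, and the real system verifies commitments before gradients are admitted to gossip.

```python
# Sketch of the Byzantine filter: drop gradients whose SHA-3 commitment
# does not verify, then apply a coordinate-wise trimmed mean with trim
# fraction beta. Illustrative only.
import hashlib
import numpy as np

def commit(grad, nonce):
    """SHA-3 commitment c = H(g || nonce)."""
    return hashlib.sha3_256(grad.tobytes() + nonce).hexdigest()

def byzantine_filter(grads, nonces, commitments, beta=0.1):
    # Keep only gradients whose pre-announced commitment verifies.
    valid = [g for g, n, c in zip(grads, nonces, commitments)
             if commit(g, n) == c]
    X = np.stack(valid)                        # shape (n_valid, d)
    X.sort(axis=0)                             # sort each coordinate
    n = X.shape[0]
    lo = int(np.floor(beta * n))               # drop bottom beta fraction
    hi = int(np.ceil((1 - beta) * n))          # drop top beta fraction
    return X[lo:hi].mean(axis=0)               # trimmed mean per coordinate
```

A worker that commits to one gradient and transmits another is rejected outright, so the trimmed mean only has to absorb adversaries that behave consistently, which is what lets the combined scheme tolerate a larger Byzantine fraction than trimming alone.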
\section{Integration with ASO}
DSO integrates with the Active Semantic Optimization (ASO) protocol (HIP-002) for
decentralized reward-model alignment during RLHF. ASO allows distributed preference
learning where multiple organizations contribute preference labels without sharing
raw annotation data.
Each ASO participant trains a local reward model $r_i$ on their private data and
contributes reward model gradients (not data) to the DSO aggregation. The global
reward model emerges from gradient consensus:
\begin{equation}
r_\text{global} = \text{Aggregate}(\{r_i\}_{i=1}^N)
\end{equation}
using the same Byzantine-robust aggregation. This preserves preference data privacy
while enabling collaborative reward learning.
\section{Differential Privacy}
Worker gradients are perturbed with calibrated Gaussian noise before transmission
to the aggregator:
\begin{equation}
\tilde{g}_i = g_i + \mathcal{N}(0, \sigma^2 \mathbf{I}), \quad \sigma = \frac{C \sqrt{2 \ln(1.25/\delta)}}{\epsilon \cdot N}
\end{equation}
where $C$ is the gradient clipping norm, $\epsilon = 8.0$ and $\delta = 10^{-6}$ are
privacy parameters. This provides $(\epsilon, \delta)$-differential privacy for each
participant's local training data.
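The clip-then-noise step can be sketched directly from the equation above. The code below mirrors the stated parameters ($C$, $\epsilon$, $\delta$, $N$); it is a minimal sketch of the Gaussian mechanism as described, not the production privacy code.

```python
# Sketch of per-worker gradient privatization: clip to norm C, then add
# Gaussian noise with the sigma from the equation above. Illustrative.
import numpy as np

def privatize(g, C=1.0, eps=8.0, delta=1e-6, n_workers=256, rng=None):
    rng = rng or np.random.default_rng()
    # Clip the gradient to L2 norm at most C.
    norm = np.linalg.norm(g)
    if norm > 0:
        g = g * min(1.0, C / norm)
    # Noise scale sigma = C * sqrt(2 ln(1.25/delta)) / (eps * N).
    sigma = C * np.sqrt(2.0 * np.log(1.25 / delta)) / (eps * n_workers)
    return g + rng.normal(0.0, sigma, size=g.shape)
```

Clipping bounds each worker's sensitivity to $C$, which is what calibrates the noise; the $1/N$ factor reflects that noise accumulates across the $N$ averaged contributions.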
\section{Experiments}
\subsection{Cluster Configuration}
Experiments use three cluster configurations:
\begin{table}[H]
\centering
\begin{tabular}{lrrr}
\toprule
\textbf{Config} & \textbf{Nodes} & \textbf{GPUs/node} & \textbf{Interconnect} \\
\midrule
Centralized (baseline) & 64 & 8$\times$H100 & 400Gb/s IB \\
DSO-LAN & 256 & 4$\times$A100 & 10Gb/s Ethernet \\
DSO-WAN & 256 & 4$\times$A100 & 1Gb/s (simulated) \\
\bottomrule
\end{tabular}
\caption{Cluster configurations for DSO evaluation.}
\end{table}
\subsection{Convergence on 256-Node Cluster}
\begin{table}[H]
\centering
\begin{tabular}{lcccc}
\toprule
\textbf{System} & \textbf{Validation Loss} & \textbf{Steps to 2.1} & \textbf{BW/node} & \textbf{Fault Tol.} \\
\midrule
Centralized all-reduce & 2.08 & 48K & 40 GB/s & None \\
DSO-LAN (no compression) & 2.09 & 50K & 8.2 GB/s & 20\% BFT \\
DSO-LAN + sem. compression & 2.10 & 52K & 0.49 GB/s & 20\% BFT \\
DSO-WAN + sem. compression & 2.12 & 57K & 0.06 GB/s & 20\% BFT \\
\bottomrule
\end{tabular}
\caption{Convergence comparison. ``Steps to 2.1'' is the number of steps to reach training loss 2.1 on Zen-7B.}
\end{table}
\subsection{Bandwidth Reduction}
\begin{table}[H]
\centering
\begin{tabular}{lrr}
\toprule
\textbf{Compression Method} & \textbf{Bandwidth/Node} & \textbf{Reduction vs. FP32} \\
\midrule
None (FP32 all-reduce) & 40.0 GB/s & 1$\times$ \\
Random Top-1\% & 0.52 GB/s & 77$\times$ \\
Semantic Top-1\% & 0.49 GB/s & 82$\times$ \\
Semantic Top-1\% + INT8 & 0.12 GB/s & 333$\times$ \\
\textbf{DSO default (Top-1\% + INT8 + gossip)} & \textbf{0.024 GB/s} & \textbf{1667$\times$} \\
\bottomrule
\end{tabular}
\caption{Bandwidth usage for 7B-parameter gradient aggregation (7B gradient values, 28\,GB per step at FP32).}
\end{table}
The 94\% bandwidth reduction cited in the abstract refers to semantic vs. uncompressed
aggregation at equivalent compression ratios. DSO default achieves 1667$\times$ vs.
FP32 all-reduce, enabling training over 1Gb/s WAN links.
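The reduction factors in the table follow directly from the per-node bandwidth figures; the quick check below reproduces them.

```python
# Back-of-the-envelope check of the reduction factors in the table above.
fp32 = 40.0  # GB/s per node, uncompressed FP32 all-reduce baseline
methods = {
    "Random Top-1%": 0.52,
    "Semantic Top-1%": 0.49,
    "Semantic Top-1% + INT8": 0.12,
    "DSO default": 0.024,
}
for name, bw in methods.items():
    print(f"{name}: {fp32 / bw:.0f}x")  # prints e.g. "Semantic Top-1%: 82x"
```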
\subsection{Byzantine Robustness}
\begin{table}[H]
\centering
\begin{tabular}{lccc}
\toprule
\textbf{Byzantine Fraction} & \textbf{Trimmed Mean} & \textbf{Coordinate Median} & \textbf{DSO (ours)} \\
\midrule
0\% & 2.08 & 2.08 & 2.08 \\
5\% & 2.10 & 2.09 & 2.09 \\
10\% & 2.19 & 2.12 & 2.10 \\
20\% & 2.41 & 2.18 & 2.12 \\
30\% & diverge & 2.31 & 2.19 \\
\bottomrule
\end{tabular}
\caption{Final validation loss at varying Byzantine node fractions. Lower is better.}
\end{table}
DSO's cryptographic commitment scheme + trimmed mean outperforms both baselines at
high Byzantine fractions by filtering commitment-mismatched updates before aggregation.
\section{Analysis}
\subsection{Staleness and Convergence}
DSO introduces gradient staleness due to asynchronous gossip. We measure effective
staleness at $\tau \approx 2.3$ steps (average age of gradient at aggregation time).
Convergence theory for async SGD with staleness $\tau$ gives:
\begin{equation}
\mathbb{E}\|\nabla f(\theta_t)\|^2 \leq \frac{2(f(\theta_0) - f^*)}{T\eta} + \eta L^2 \tau^2 \sigma^2
\end{equation}
where $\sigma^2$ is gradient variance. With learning rate scaled as $\eta \propto 1/\sqrt{T\tau}$,
convergence is preserved with a factor of $\tau$ overhead in required steps.
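The claimed factor-of-$\tau$ step overhead can be seen by substituting the scaled learning rate into the bound (a sketch keeping only the dominant first term). With $\eta = \eta_0/\sqrt{T\tau}$,
\begin{equation*}
\frac{2(f(\theta_0) - f^*)}{T\eta} = \frac{2(f(\theta_0) - f^*)\sqrt{\tau}}{\eta_0 \sqrt{T}},
\end{equation*}
so driving this term below a target $\varepsilon$ requires $T = \Omega(\tau/\varepsilon^2)$, i.e.\ a factor-of-$\tau$ increase in required steps relative to the stale-free case. At the measured $\tau \approx 2.3$, this overhead is modest.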
\subsection{Communication Topology}
We compare ring, tree, and gossip aggregation topologies:
\begin{table}[H]
\centering
\begin{tabular}{lccc}
\toprule
\textbf{Topology} & \textbf{Rounds to converge} & \textbf{BW per node} & \textbf{Fault tolerance} \\
\midrule
Ring all-reduce & 1 & $O(N)$ & None \\
Binary tree & $\log_2 N$ & $O(\log N)$ & Partial \\
Gossip (DSO) & $\log_f N$ & $O(f)$ & Byzantine \\
\bottomrule
\end{tabular}
\caption{Communication topology comparison for $N=256$ nodes, fanout $f=4$.}
\end{table}
\section{Conclusion}
DSO provides a practical decentralized training infrastructure for the Zen model family,
achieving 94\% bandwidth reduction with negligible convergence degradation ($\Delta\mathcal{L} < 0.04$)
and Byzantine fault tolerance up to 20\% malicious nodes. Integration with ASO enables
privacy-preserving distributed reward model training. The system enables Zen training
to run on geographically distributed, heterogeneous compute without centralized
infrastructure, consistent with the Zoo Labs Foundation's decentralized AI research mandate.
\bibliographystyle{plain}
\begin{thebibliography}{99}
\bibitem{rajbhandari2020zero} Rajbhandari et al. (2020). ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. \textit{SC20}.
\bibitem{dean2012distbelief} Dean et al. (2012). Large Scale Distributed Deep Networks. \textit{NeurIPS}.
\bibitem{mcmahan2017fedavg} McMahan et al. (2017). Communication-Efficient Learning of Deep Networks from Decentralized Data. \textit{AISTATS}.
\bibitem{stich2018sparsified} Stich et al. (2018). Sparsified SGD with Memory. \textit{NeurIPS}.
\bibitem{seide20141bit} Seide et al. (2014). 1-bit stochastic gradient descent and its application to data-parallel distributed training. \textit{Interspeech}.
\bibitem{karimireddy2019error} Karimireddy et al. (2019). Error Feedback Fixes SignSGD and Other Gradient Compression Schemes. \textit{ICML}.
\bibitem{blanchard2017machine} Blanchard et al. (2017). Machine Learning with Adversaries: Byzantine Tolerant Gradient Descent. \textit{NeurIPS}.
\bibitem{demers1987epidemic} Demers et al. (1987). Epidemic Algorithms for Replicated Database Maintenance. \textit{PODC}.
\end{thebibliography}
\end{document}