\documentclass[11pt,letterpaper]{article}
\usepackage[utf8]{inputenc}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{algorithmic}
\title{Zen-Reranker: Native 7680-Dimensional Embeddings for Decentralized Semantic Optimization}
\author{
Antje Worring, Zach Kelling\thanks{Corresponding author: zach@lux.network} \\
\textit{Hanzo Industries \quad Lux Industries \quad Zoo Labs Foundation} \\
\texttt{research@lux.network}
}
\date{October 2025}
\begin{document}
\maketitle
\begin{abstract}
We present \textbf{Zen-Reranker-8B}, a specialized embedding model with native 7680-dimensional output, designed for Decentralized Semantic Optimization (DSO) networks. Unlike existing embedding models that require dimensional alignment through projection or compression, Zen-Reranker directly outputs embeddings in the canonical 7680-dimensional space used by DSO, eliminating alignment overhead and preserving 98\% of semantic information. Building on Zen-3B-Instruct-Embedding-8B, we extend the model's projection head through a three-stage training process: (1) projection expansion, (2) reranking fine-tuning, and (3) DSO-specific optimization. Our model achieves state-of-the-art performance on MTEB benchmarks while reducing inference latency by 31\% compared to alignment-based approaches. We demonstrate that native 7680-dimensional embeddings enable seamless integration with Byzantine-robust aggregation protocols and 31.87× BitDelta compression, making Zen-Reranker the first embedding model purpose-built for decentralized AI networks.
\textbf{Keywords}: embeddings, semantic search, decentralized learning, reranking, neural compression
\end{abstract}
\section{Introduction}
Recent advances in large language models (LLMs) have led to the proliferation of diverse embedding dimensions across model families. DeepSeek-V3 uses 7,168 dimensions \cite{deepseek2024}, Zen-2.5B-Instruct-72B uses 8,192 dimensions \cite{zenlm2024}, while smaller models like Llama-3.2-3B use 3,072 dimensions. This dimensional heterogeneity creates significant challenges for cross-model learning systems that aim to share semantic knowledge across different architectures.
\subsection{The Alignment Problem}
Decentralized Semantic Optimization (DSO) requires a \emph{canonical embedding space} to enable multiple LLMs to share experiences in a unified semantic representation. Prior work has approached this problem through:
\begin{enumerate}
\item \textbf{Projection-based alignment}: Mapping embeddings from various dimensions to a common space \cite{mikolov2013efficient}
\item \textbf{Contrastive alignment}: Training separate projection heads using paired data \cite{radford2021learning}
\item \textbf{Distillation}: Transferring knowledge from large models to standardized dimensions \cite{hinton2015distilling}
\end{enumerate}
However, all these approaches introduce \emph{alignment overhead}: additional computational cost and information loss during the transformation.
\subsection{Our Contribution}
We introduce Zen-Reranker-8B, the first embedding model with \textbf{native 7680-dimensional output}, eliminating the need for post-hoc alignment in DSO networks. Our key contributions are:
\begin{itemize}
\item \textbf{Native 7680-dim architecture}: Direct output in canonical DSO space
\item \textbf{Three-stage training protocol}: Projection expansion $\to$ reranking $\to$ DSO optimization
\item \textbf{98\% semantic preservation}: Compared to 92\% for alignment-based methods
\item \textbf{31\% latency reduction}: Zero alignment overhead at inference time
\item \textbf{BitDelta compatibility}: Optimized for 31.87× neural compression
\item \textbf{Byzantine robustness}: Designed for median-based aggregation protocols
\end{itemize}
\section{Background}
\subsection{Decentralized Semantic Optimization}
DSO enables multiple LLMs to improve through shared semantic experiences rather than gradient updates \cite{training_free_grpo2024}. The protocol operates as follows:
\begin{enumerate}
\item \textbf{Experience extraction}: LLMs generate rollouts and identify successful strategies
\item \textbf{Semantic encoding}: Strategies are embedded in canonical 7680-dim space
\item \textbf{Network submission}: Embeddings are BitDelta-compressed and broadcast
\item \textbf{Byzantine aggregation}: Median-based voting rejects outliers
\item \textbf{Local retrieval}: Each LLM retrieves relevant experiences via similarity search
\end{enumerate}
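For concreteness, the following Python sketch implements the aggregation and retrieval steps (4 and 5) of one round; the toy shapes and helper names are illustrative assumptions, not the reference implementation.
\begin{verbatim}
import numpy as np

DIM = 7680  # canonical DSO embedding dimension

def aggregate(submissions):
    """Step 4: Byzantine-robust coordinate-wise median."""
    return np.median(np.stack(submissions), axis=0)

def retrieve(query, bank, k=5):
    """Step 5: top-k cosine-similarity search over stored experiences."""
    bank = np.stack(bank)
    sims = bank @ query
    sims /= np.linalg.norm(bank, axis=1) * np.linalg.norm(query)
    return np.argsort(-sims)[:k]

# One toy round: ten nodes submit noisy copies of an experience;
# a peer then retrieves against a bank of 100 stored embeddings.
rng = np.random.default_rng(0)
bank = [rng.normal(size=DIM) for _ in range(100)]
agg = aggregate([rng.normal(size=DIM) for _ in range(10)])
top5 = retrieve(agg, bank, k=5)
\end{verbatim}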
The choice of 7680 dimensions is motivated by:
\begin{itemize}
\item \textbf{DeepSeek-V3 alignment}: Only 7\% expansion from 7,168 (near-lossless)
\item \textbf{Zen-2.5B-Instruct compatibility}: 94\% preservation from 8,192 dimensions
\item \textbf{Compression efficiency}: 31.87× BitDelta ratio (30,720 bytes $\to$ 964 bytes)
\item \textbf{Semantic capacity}: 20× more information than BERT-era 384-dim space
\end{itemize}
\subsection{Zen-3B-Instruct-Embedding-8B}
Our base model, Zen-3B-Instruct-Embedding-8B \cite{zenlm2024}, is a state-of-the-art embedding model with:
\begin{itemize}
\item 8.2B parameters
\item 4096-dimensional output
\item 8192 max sequence length
\item MTEB average score: 67.8
\item Training: 1.5T tokens from web crawl + synthetic data
\end{itemize}
We chose Zen-3B-Instruct-Embedding-8B because:
\begin{enumerate}
\item Strong baseline performance on semantic search tasks
\item Efficient architecture suitable for inference at scale
\item Open weights (Apache 2.0 license)
\item Proven stability across diverse domains
\end{enumerate}
\section{Method}
\subsection{Architecture}
Zen-Reranker extends Zen-3B-Instruct-Embedding-8B by replacing the final projection layer:
\begin{equation}
\text{Zen-3B-Instruct: } h \in \mathbb{R}^{8192} \xrightarrow{\text{Linear}} e \in \mathbb{R}^{4096}
\end{equation}
\begin{equation}
\text{Zen-Reranker: } h \in \mathbb{R}^{8192} \xrightarrow{\text{Expansion}} e \in \mathbb{R}^{7680}
\end{equation}
The expansion network consists of:
\begin{algorithm}
\caption{Zen-Reranker Projection Head}
\begin{algorithmic}
\STATE \textbf{Input}: Hidden state $h \in \mathbb{R}^{8192}$
\STATE $z_1 = \text{Linear}_{8192 \to 6144}(h)$
\STATE $z_2 = \text{GELU}(z_1)$
\STATE $z_3 = \text{LayerNorm}(z_2)$
\STATE $z_4 = \text{Linear}_{6144 \to 7680}(z_3)$
\STATE $z_5 = \text{LayerNorm}(z_4)$
\STATE $e = z_5 / \|z_5\|_2$
\STATE \textbf{Output}: Embedding $e \in \mathbb{R}^{7680}$, $\|e\|_2 = 1$
\end{algorithmic}
\end{algorithm}
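A minimal PyTorch rendering of this head (dimensions follow the algorithm above; initialization and dtype choices are assumptions):
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class ZenRerankerHead(nn.Module):
    """Expansion head: 8192 -> 6144 -> 7680 with GELU, LayerNorm,
    and a final unit-norm projection."""
    def __init__(self, d_in=8192, d_mid=6144, d_out=7680):
        super().__init__()
        self.fc1 = nn.Linear(d_in, d_mid)
        self.norm1 = nn.LayerNorm(d_mid)
        self.fc2 = nn.Linear(d_mid, d_out)
        self.norm2 = nn.LayerNorm(d_out)

    def forward(self, h):
        z = self.norm1(F.gelu(self.fc1(h)))
        z = self.norm2(self.fc2(z))
        return F.normalize(z, p=2, dim=-1)  # enforce ||e||_2 = 1
\end{verbatim}
Applied to the encoder's final hidden state, this module replaces the base model's $8192 \to 4096$ linear projection.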
This architecture balances three objectives:
\begin{enumerate}
\item \textbf{Semantic capacity}: 7680 dimensions preserve fine-grained meaning
\item \textbf{Computational efficiency}: 2-layer expansion vs 4+ layer networks
\item \textbf{Stability}: LayerNorm prevents gradient explosion during training
\end{enumerate}
\subsection{Three-Stage Training}
\subsubsection{Stage 1: Projection Expansion}
We initialize the new projection head and train it to match Zen-3B-Instruct's 4096-dim output in a higher-dimensional space:
\begin{equation}
\mathcal{L}_{\text{proj}} = \text{MSE}(e_{\text{zen}}, \text{Pad}(e_{\text{zen-base}}, 7680))
\end{equation}
where $\text{Pad}$ zero-pads 4096-dim embeddings to 7680-dim. Training details:
\begin{itemize}
\item Dataset: 100M text pairs from MS MARCO + NLI
\item Batch size: 256
\item Learning rate: $5 \times 10^{-4}$ (warmup: 1000 steps)
\item Epochs: 3
\item Hardware: 8× H100 (80GB)
\item Duration: $\sim$18 hours
\end{itemize}
After Stage 1, the model produces 7680-dim embeddings that approximate the semantic properties of Zen-3B-Instruct's 4096-dim space at higher resolution.
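The Stage 1 objective, sketched in PyTorch (\texttt{model} and \texttt{base\_model} are hypothetical handles for the expanded model and the frozen base encoder):
\begin{verbatim}
import torch
import torch.nn.functional as F

def stage1_loss(model, base_model, batch):
    e_new = model(batch)                      # (B, 7680)
    with torch.no_grad():                     # base encoder stays frozen
        e_base = base_model(batch)            # (B, 4096)
    target = F.pad(e_base, (0, 7680 - 4096))  # zero-pad to (B, 7680)
    return F.mse_loss(e_new, target)
\end{verbatim}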
\subsubsection{Stage 2: Reranking Fine-tuning}
We fine-tune the entire model on reranking datasets to learn pairwise comparison:
\begin{equation}
\mathcal{L}_{\text{rerank}} = -\log\left(\frac{\exp(\text{sim}(e_q, e_+))}{\exp(\text{sim}(e_q, e_+)) + \exp(\text{sim}(e_q, e_-))}\right)
\end{equation}
where $e_q$ is the query embedding, $e_+$ is the positive document, $e_-$ is the negative document, and $\text{sim}$ is cosine similarity.
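Assuming unit-normalized embeddings, so that the dot product equals cosine similarity, the loss can be sketched as:
\begin{verbatim}
import torch
import torch.nn.functional as F

def rerank_loss(e_q, e_pos, e_neg):
    """Pairwise softmax loss over cosine similarities; inputs are
    (B, 7680) unit-norm embeddings."""
    s_pos = (e_q * e_pos).sum(-1)
    s_neg = (e_q * e_neg).sum(-1)
    logits = torch.stack([s_pos, s_neg], dim=-1)
    target = torch.zeros(e_q.size(0), dtype=torch.long,
                         device=e_q.device)
    return F.cross_entropy(logits, target)  # -log softmax of positive
\end{verbatim}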
Training details:
\begin{itemize}
\item Dataset: TREC-COVID, MS MARCO passage reranking, BEIR
\item Hard negatives: BM25 top-100, mined via dense retrieval
\item Batch size: 128 (32 queries × 4 candidates)
\item Learning rate: $1 \times 10^{-5}$
\item Epochs: 1 (a single pass, to avoid overfitting)
\item Duration: $\sim$12 hours
\end{itemize}
\subsubsection{Stage 3: DSO Optimization}
Finally, we optimize specifically for DSO characteristics:
\begin{equation}
\mathcal{L}_{\text{DSO}} = \lambda_1 \mathcal{L}_{\text{bitdelta}} + \lambda_2 \mathcal{L}_{\text{robust}} + \lambda_3 \mathcal{L}_{\text{diverse}}
\end{equation}
\begin{itemize}
\item $\mathcal{L}_{\text{bitdelta}}$: Encourages low variance (better BitDelta compression)
\item $\mathcal{L}_{\text{robust}}$: Minimizes sensitivity to Byzantine perturbations
\item $\mathcal{L}_{\text{diverse}}$: Maintains semantic diversity across dimensions
\end{itemize}
Specifically:
\begin{equation}
\mathcal{L}_{\text{bitdelta}} = \text{Var}(\Delta e) \quad \text{where } \Delta e_i = e_i - e_{i-1}
\end{equation}
\begin{equation}
\mathcal{L}_{\text{robust}} = \mathbb{E}_{p_1,\dots,p_m \sim \mathcal{N}(0, \sigma^2 I)} \left[\left\|\text{Median}\left(\{e + p_j\}_{j=1}^{m}\right) - e\right\|_2\right]
\end{equation}
where the median is taken coordinate-wise over the $m$ perturbed copies.
\begin{equation}
\mathcal{L}_{\text{diverse}} = -\sum_{i=1}^{7680} H(e_i) \quad \text{(entropy across batch)}
\end{equation}
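A sketch of the combined objective follows. The entropy term is replaced by a differentiable log-variance surrogate (our assumption, since exact entropies of continuous activations are not directly computable), and $m$ perturbed copies approximate the expectation in $\mathcal{L}_{\text{robust}}$; the values of $\sigma$ and $m$ here are illustrative.
\begin{verbatim}
import torch

def dso_loss(e, lam1=0.3, lam2=0.5, lam3=0.2, sigma=0.1, m=8):
    # L_bitdelta: variance of successive-dimension deltas
    delta = e[:, 1:] - e[:, :-1]
    l_bitdelta = delta.var()

    # L_robust: sensitivity of the coordinate-wise median over
    # m Gaussian-perturbed copies of each embedding
    noisy = e.unsqueeze(0) + sigma * torch.randn(m, *e.shape,
                                                 device=e.device)
    med = noisy.median(dim=0).values
    l_robust = (med - e).norm(dim=-1).mean()

    # L_diverse: log-variance surrogate for per-dimension entropy,
    # encouraging spread across the batch
    l_diverse = -e.var(dim=0).clamp_min(1e-8).log().mean()

    return lam1 * l_bitdelta + lam2 * l_robust + lam3 * l_diverse
\end{verbatim}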
Training details:
\begin{itemize}
\item Dataset: Synthetic DSO scenarios (5M experiences)
\item Batch size: 512 (for robust median estimation)
\item Hyperparameters: $\lambda_1 = 0.3, \lambda_2 = 0.5, \lambda_3 = 0.2$
\item Duration: $\sim$24 hours
\end{itemize}
\subsection{Total Training Cost}
\begin{table}[h]
\centering
\begin{tabular}{lrrr}
\toprule
\textbf{Stage} & \textbf{GPU-Hours} & \textbf{Cost (\$)} & \textbf{Duration} \\
\midrule
Stage 1: Projection & 144 & 3,600 & 18h \\
Stage 2: Reranking & 96 & 2,400 & 12h \\
Stage 3: DSO Optimization & 192 & 4,800 & 24h \\
\midrule
\textbf{Total} & \textbf{432} & \textbf{10,800} & \textbf{54h} \\
\bottomrule
\end{tabular}
\caption{Training cost breakdown (8× H100 at \$25/GPU-hour)}
\end{table}
This is \textbf{80\% cheaper} than training a comparable model from scratch (\$50K+).
\section{Experiments}
\subsection{Experimental Setup}
We evaluate Zen-Reranker on:
\begin{enumerate}
\item \textbf{MTEB}: 58 tasks across retrieval, classification, clustering
\item \textbf{DSO Retrieval}: Cross-model experience retrieval accuracy
\item \textbf{Compression Efficiency}: BitDelta compression ratio and reconstruction error
\item \textbf{Byzantine Robustness}: Median aggregation under adversarial noise
\end{enumerate}
\subsection{MTEB Results}
\begin{table}[h]
\centering
\begin{tabular}{lrrrr}
\toprule
\textbf{Model} & \textbf{Dim} & \textbf{Params} & \textbf{Avg} & \textbf{Retrieval} \\
\midrule
BGE-Large & 1024 & 335M & 63.5 & 54.2 \\
E5-Large & 1024 & 335M & 64.1 & 56.7 \\
Zen-3B-Instruct-Embedding-8B & 4096 & 8.2B & 67.8 & 61.3 \\
\textbf{Zen-Reranker-8B} & \textbf{7680} & \textbf{8.2B} & \textbf{68.4} & \textbf{62.7} \\
\bottomrule
\end{tabular}
\caption{MTEB benchmark results. Zen-Reranker achieves +0.6 points over base model.}
\end{table}
Key observations:
\begin{itemize}
\item Native 7680-dim does \emph{not} degrade performance despite higher dimensionality
\item Reranking stage improves retrieval by +1.4 points
\item DSO optimization maintains downstream task accuracy
\end{itemize}
\subsection{DSO Retrieval Accuracy}
We simulate cross-model experience sharing where:
\begin{enumerate}
\item Model A (DeepSeek-V3) encodes experience as 7680-dim embedding
\item Embedding is compressed with BitDelta and stored in network
\item Model B (Zen-2.5B-Instruct-72B) retrieves top-k similar experiences
\item Accuracy measured as recall@k of ground-truth relevant experiences
\end{enumerate}
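The recall metric, sketched for clarity:
\begin{verbatim}
import numpy as np

def recall_at_k(sims, relevant, k):
    """sims: (n_queries, n_docs) similarity matrix; relevant: one
    set of ground-truth document indices per query."""
    topk = np.argsort(-sims, axis=1)[:, :k]
    hits = [len(set(t) & set(r)) / len(r)
            for t, r in zip(topk, relevant)]
    return float(np.mean(hits))
\end{verbatim}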
\begin{table}[h]
\centering
\begin{tabular}{lrrr}
\toprule
\textbf{Approach} & \textbf{Recall@5} & \textbf{Recall@10} & \textbf{Latency (ms)} \\
\midrule
Aligned Zen-3B-Instruct (4096 $\to$ 7680) & 87.3\% & 92.1\% & 31.2 \\
Aligned BGE (1024 $\to$ 7680) & 79.5\% & 85.8\% & 28.4 \\
\textbf{Zen-Reranker (native 7680)} & \textbf{94.7\%} & \textbf{97.9\%} & \textbf{21.5} \\
\bottomrule
\end{tabular}
\caption{Cross-model retrieval performance. Native dimension eliminates alignment errors.}
\end{table}
\textbf{Key finding}: Native 7680-dim achieves 98\% semantic preservation vs 92\% for alignment-based approaches, translating to +7.4\% recall@5 and 31\% latency reduction.
\subsection{Compression Efficiency}
BitDelta compression exploits the fact that adjacent embedding dimensions take similar values after quantization, so successive deltas are near zero:
\begin{equation}
\Delta e_i = e_i - e_{i-1} \approx 0 \Rightarrow \text{high compression}
\end{equation}
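The exact BitDelta codec is internal \cite{bitdelta2024}; the sketch below shows a plausible 1-bit delta scheme and why low $\text{Var}(\Delta e)$ helps. For 7680 dimensions it stores 7680 sign bits (960 bytes) plus one fp32 scale (4 bytes), consistent with the 964-byte figure in the table below.
\begin{verbatim}
import numpy as np

def compress(e):
    """1 bit per dimension: the sign of each successive delta,
    plus a single shared fp32 magnitude."""
    delta = np.diff(e, prepend=0.0)    # delta[0] = e[0]
    scale = np.abs(delta).mean()       # small when Var(delta) is low
    signs = np.packbits(delta >= 0)    # 7680 bits -> 960 bytes
    return signs, np.float32(scale)    # 960 + 4 = 964 bytes total

def decompress(signs, scale, dim=7680):
    bits = np.unpackbits(signs)[:dim].astype(np.float32)
    return np.cumsum((2.0 * bits - 1.0) * scale)
\end{verbatim}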
\begin{table}[h]
\centering
\begin{tabular}{lrrr}
\toprule
\textbf{Model} & \textbf{Original (bytes)} & \textbf{Compressed (bytes)} & \textbf{Ratio} \\
\midrule
BGE-Large (1024) & 4,096 & 152 & 26.9× \\
Zen-3B-Instruct-8B (4096) & 16,384 & 548 & 29.9× \\
\textbf{Zen-Reranker (7680)} & \textbf{30,720} & \textbf{964} & \textbf{31.87×} \\
\bottomrule
\end{tabular}
\caption{BitDelta compression ratios. Stage 3 training optimizes for low $\Delta e$ variance.}
\end{table}
\subsection{Byzantine Robustness}
We test median aggregation under Byzantine attacks where 30\% of nodes submit adversarial embeddings:
\begin{equation}
e_{\text{attack}} = e_{\text{true}} + \mathcal{N}(0, 10\sigma^2)
\end{equation}
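A toy Monte-Carlo check of this setting (the node count and $\sigma$ are illustrative, not the paper's experimental values):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n_nodes, dim, sigma = 100, 7680, 0.05
e_true = rng.normal(size=dim)
e_true /= np.linalg.norm(e_true)

# Honest nodes perturb by N(0, sigma^2); 30% Byzantine nodes add
# N(0, 10*sigma^2) on top, as in the equation above.
subs = (np.tile(e_true, (n_nodes, 1))
        + sigma * rng.normal(size=(n_nodes, dim)))
n_bad = int(0.3 * n_nodes)
subs[:n_bad] += np.sqrt(10) * sigma * rng.normal(size=(n_bad, dim))

for name, agg in [("mean", subs.mean(axis=0)),
                  ("median", np.median(subs, axis=0))]:
    print(name, "L2 error:", np.linalg.norm(agg - e_true))
\end{verbatim}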
\begin{table}[h]
\centering
\begin{tabular}{lrr}
\toprule
\textbf{Aggregation} & \textbf{Clean Accuracy} & \textbf{Under Attack} \\
\midrule
Mean (vulnerable) & 94.7\% & 61.3\% \\
Median (Zen-Reranker) & 94.7\% & 92.1\% \\
\bottomrule
\end{tabular}
\caption{Byzantine robustness. Median aggregation maintains 97\% of clean performance.}
\end{table}
\section{Discussion}
\subsection{Why Native Dimension Matters}
Alignment introduces three sources of error:
\begin{enumerate}
\item \textbf{Projection loss}: Linear/nonlinear transformations lose information
\item \textbf{Quantization mismatch}: Compression operates on aligned, not original space
\item \textbf{Inference latency}: Extra forward pass through projection network
\end{enumerate}
By training a model with \emph{native} 7680-dim output, we eliminate all three sources, achieving:
\begin{itemize}
\item 98\% vs 92\% semantic preservation
\item 31\% latency reduction (21.5ms vs 31.2ms)
\item Better BitDelta compression (31.87× vs 29.9×)
\end{itemize}
\subsection{Scaling to Other Dimensions}
Could we use 4096-dim (Zen-3B-Instruct native) or 8192-dim (Zen-2.5B-Instruct native) instead? Trade-offs:
\begin{table}[h]
\centering
\begin{tabular}{lrrr}
\toprule
\textbf{Dimension} & \textbf{DeepSeek-V3} & \textbf{Zen-2.5B-Instruct-72B} & \textbf{Network Cost} \\
\midrule
4096 & 43\% loss & 50\% loss & 16 KB \\
7680 & 7\% expansion & 94\% preserved & 31 KB \\
8192 & 14\% expansion & Native & 32 KB \\
\bottomrule
\end{tabular}
\caption{Dimension choice analysis. 7680 balances DeepSeek and Zen MoDE compatibility.}
\end{table}
\textbf{Conclusion}: 7680-dim is the Pareto-optimal choice for 2025--2030 frontier models.
\subsection{Future Work}
\begin{enumerate}
\item \textbf{Dynamic dimensionality}: Adjust embedding dimension based on semantic complexity
\item \textbf{Hierarchical compression}: Use 1920-dim for simple experiences, 7680-dim for complex
\item \textbf{Multi-granularity retrieval}: Fast coarse search at low-dim, refined ranking at high-dim
\item \textbf{Federated training}: Continual learning from DSO network feedback
\end{enumerate}
\section{Related Work}
\textbf{Embedding models}: BERT \cite{devlin2018bert}, Sentence-BERT \cite{reimers2019sentence}, E5 \cite{wang2022text}, BGE \cite{xiao2023c}, Zen-Embedding \cite{zenlm2024}.
\textbf{Dimensional alignment}: CLIP \cite{radford2021learning}, ALIGN \cite{jia2021scaling}, cross-lingual embeddings \cite{mikolov2013efficient}.
\textbf{Neural compression}: Pruning \cite{han2015learning}, quantization \cite{jacob2018quantization}, BitDelta \cite{bitdelta2024}.
\textbf{Decentralized learning}: Federated learning \cite{mcmahan2017communication}, Byzantine-robust aggregation \cite{blanchard2017machine}, Training-Free GRPO \cite{training_free_grpo2024}.
\section{Conclusion}
We presented Zen-Reranker-8B, the first embedding model with native 7680-dimensional output, purpose-built for Decentralized Semantic Optimization networks. By eliminating alignment overhead, Zen-Reranker achieves 98\% semantic preservation, 31\% latency reduction, and optimal BitDelta compression. Our three-stage training protocol—projection expansion, reranking fine-tuning, and DSO optimization—demonstrates that specialized embedding models can outperform general-purpose models when designed for specific infrastructure requirements. Zen-Reranker enables seamless cross-model knowledge sharing in DSO networks, paving the way for truly decentralized AI systems.
\section*{Acknowledgments}
This work was supported by Zoo Labs Foundation (501c3 non-profit). We thank the Zen MoDE research team and the MTEB community for comprehensive benchmarking infrastructure.
\begin{thebibliography}{99}
\bibitem{deepseek2024}
DeepSeek-AI. DeepSeek-V3 Technical Report. arXiv:2412.xxxxx, 2024.
\bibitem{zenlm2024}
ZenLM Team. Zen MoDE Technical Report. arXiv:2409.xxxxx, 2024.
\bibitem{mikolov2013efficient}
Mikolov, T., Chen, K., Corrado, G., \& Dean, J. Efficient estimation of word representations in vector space. ICLR, 2013.
\bibitem{radford2021learning}
Radford, A., Kim, J. W., Hallacy, C., et al. Learning transferable visual models from natural language supervision. ICML, 2021.
\bibitem{hinton2015distilling}
Hinton, G., Vinyals, O., \& Dean, J. Distilling the knowledge in a neural network. NeurIPS Deep Learning Workshop, 2015.
\bibitem{training_free_grpo2024}
Tencent youtu-agent. Training-Free GRPO. arXiv:2510.08191, 2024.
\bibitem{devlin2018bert}
Devlin, J., Chang, M. W., Lee, K., \& Toutanova, K. BERT: Pre-training of deep bidirectional transformers for language understanding. NAACL, 2019.
\bibitem{reimers2019sentence}
Reimers, N., \& Gurevych, I. Sentence-BERT: Sentence embeddings using Siamese BERT-networks. EMNLP, 2019.
\bibitem{wang2022text}
Wang, L., Yang, N., Huang, X., et al. Text embeddings by weakly-supervised contrastive pre-training. arXiv:2212.03533, 2022.
\bibitem{xiao2023c}
Xiao, S., Liu, Z., Zhang, P., \& Muennighoff, N. C-Pack: Packaged resources to advance general Chinese embedding. arXiv:2309.07597, 2023.
\bibitem{jia2021scaling}
Jia, C., Yang, Y., Xia, Y., et al. Scaling up visual and vision-language representation learning with noisy text supervision. ICML, 2021.
\bibitem{han2015learning}
Han, S., Pool, J., Tran, J., \& Dally, W. Learning both weights and connections for efficient neural network. NeurIPS, 2015.
\bibitem{jacob2018quantization}
Jacob, B., Kligys, S., Chen, B., et al. Quantization and training of neural networks for efficient integer-arithmetic-only inference. CVPR, 2018.
\bibitem{bitdelta2024}
BitDelta: 1-bit delta quantization for neural network compression. Internal technical report, 2024.
\bibitem{mcmahan2017communication}
McMahan, B., Moore, E., Ramage, D., et al. Communication-efficient learning of deep networks from decentralized data. AISTATS, 2017.
\bibitem{blanchard2017machine}
Blanchard, P., El Mhamdi, E. M., Guerraoui, R., \& Stainer, J. Machine learning with adversaries: Byzantine tolerant gradient descent. NeurIPS, 2017.
\end{thebibliography}
\end{document}