% =============================================================================
% Zen-Reasoning: Verifiable Chain-of-Thought with Formal Guarantees
% Hanzo AI Inc. & Zoo Labs Foundation
% Technical Whitepaper v1.0 — February 2026
% =============================================================================
\documentclass[11pt,a4paper]{article}
% --- Encoding & Fonts ---------------------------------------------------------
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{lmodern}
% --- Mathematics --------------------------------------------------------------
\usepackage{amsmath,amsfonts,amssymb,amsthm}
\usepackage{mathtools}
\usepackage{bm}
% --- Layout & Geometry --------------------------------------------------------
\usepackage[top=1in,bottom=1in,left=1.25in,right=1.25in]{geometry}
\usepackage{microtype}
\usepackage{setspace}
\onehalfspacing
% --- Graphics & Tables --------------------------------------------------------
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{tabularx}
\usepackage{multirow}
\usepackage{array}
\usepackage{float}
% --- Algorithms ---------------------------------------------------------------
\usepackage{algorithm}
\usepackage{algpseudocode}
\algnewcommand\algorithmicforeach{\textbf{for each}}
\algdef{S}[FOR]{ForEach}[1]{\algorithmicforeach\ #1\ \textbf{do}}
% --- Colors & Hyperlinks -------------------------------------------------------
\usepackage{xcolor}
\definecolor{zenred}{RGB}{253,68,68}
\definecolor{zenblue}{RGB}{41,121,255}
\definecolor{zendark}{RGB}{30,30,40}
\definecolor{codegray}{RGB}{248,248,250}
\definecolor{linkcolor}{RGB}{41,121,255}
\usepackage{hyperref}
\hypersetup{
colorlinks=true,
linkcolor=zenblue,
urlcolor=zenblue,
citecolor=zenred,
pdftitle={Zen-Reasoning: Verifiable Chain-of-Thought with Formal Guarantees},
pdfauthor={Hanzo AI Inc., Zoo Labs Foundation},
pdfsubject={Reasoning, Formal Verification, Chain-of-Thought, Proof Assistants},
pdfkeywords={reasoning, chain-of-thought, formal verification, Lean 4, self-correction, tool-augmented}
}
% --- Code Listings ------------------------------------------------------------
\usepackage{listings}
\lstset{
backgroundcolor=\color{codegray},
basicstyle=\ttfamily\footnotesize,
breaklines=true,
captionpos=b,
frame=single,
numbers=left,
numberstyle=\tiny\color{gray},
keywordstyle=\color{zenblue}\bfseries,
stringstyle=\color{zenred},
commentstyle=\color{gray}\itshape,
showstringspaces=false,
tabsize=2
}
% --- Section & Caption Formatting ---------------------------------------------
\usepackage{titlesec}
\usepackage{caption}
\captionsetup{font=small,labelfont=bf}
% --- Theorems & Definitions ---------------------------------------------------
\newtheorem{definition}{Definition}[section]
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{corollary}{Corollary}[section]
% --- Bibliography -------------------------------------------------------------
\usepackage{natbib}
\bibliographystyle{abbrvnat}
\setcitestyle{authoryear,round}
% =============================================================================
% TITLE BLOCK
% =============================================================================
\title{
\vspace{-1.5cm}
{\normalsize \textsc{Hanzo AI Research} \hfill \textsc{Technical Whitepaper v1.0}} \\[0.8em]
\rule{\linewidth}{0.5pt} \\[0.6em]
{\LARGE \textbf{Zen-Reasoning:}} \\[0.3em]
{\Large Verifiable Chain-of-Thought with Formal Guarantees} \\[0.3em]
\rule{\linewidth}{0.5pt}
}
\author{ \textbf{Hanzo AI Research}$^{1}$ \quad \textbf{Zoo Labs Foundation}$^{2}$ \\[0.6em]
$^{1}$Hanzo AI Inc. (Techstars '17) \quad $^{2}$Zoo Labs Foundation (501(c)(3)) \\[0.3em]
\texttt{research@hanzo.ai} \quad \texttt{foundation@zoo.ngo} \\[0.3em]
{\small \url{https://hanzo.ai/research/zen-reasoning}}
}
\date{February 2026}
% =============================================================================
\begin{document}
\maketitle
\begin{abstract}
We present \textbf{Zen-Reasoning}, a language model specialized for verifiable mathematical and logical reasoning with formal guarantees on solution correctness. Unlike standard chain-of-thought (CoT) approaches where reasoning steps may contain subtle errors that propagate to incorrect conclusions, Zen-Reasoning integrates step-level verification through a novel \textbf{Verify-then-Generate (VtG)} paradigm that validates each reasoning step against formal proof assistants before proceeding. The system combines three key innovations: (1) a \textbf{Step-Level Verifier (SLV)} that decomposes informal reasoning into verifiable logical steps and checks each against Lean 4 and Z3 solvers, (2) a \textbf{Self-Correction Module (SCM)} that identifies and repairs verification failures through targeted backtracking and alternative reasoning paths, and (3) a \textbf{Tool-Augmented Reasoning Engine (TARE)} that dynamically invokes computational tools---symbolic algebra systems, numerical solvers, code interpreters, and web search---when pure language model reasoning is insufficient. Zen-Reasoning achieves state-of-the-art results on GSM8K (97.8\% accuracy), MATH (86.4\%), GPQA (61.8\%), and the ARC-AGI public evaluation (44.2\%), while providing formal verification certificates for 73.2\% of its correct solutions on MATH. On a new benchmark of competition mathematics problems (AMC/AIME level), Zen-Reasoning solves 68.4\% of problems with verified proofs, compared to 41.2\% for the next best system. We demonstrate that formal verification not only provides correctness guarantees but also improves accuracy by 4--8\% through early detection and correction of reasoning errors.
\end{abstract}
\vspace{0.5em}
\noindent\textbf{Keywords:} Chain-of-Thought Reasoning, Formal Verification, Lean 4, Self-Correction, Tool-Augmented Reasoning, Mathematical Problem Solving
% =============================================================================
\section{Introduction}
\label{sec:introduction}
Large language models have demonstrated remarkable reasoning capabilities through chain-of-thought (CoT) prompting \citep{wei2022chain}, where models generate intermediate reasoning steps before arriving at a final answer. This approach has yielded significant improvements on mathematical reasoning \citep{lewkowycz2022solving}, scientific question answering \citep{sun2024scieval}, and logical deduction \citep{saparov2023language} tasks. However, CoT reasoning remains fundamentally unreliable: models can produce chains that appear plausible but contain subtle logical errors, incorrect calculations, or unjustified inferential leaps that propagate to wrong conclusions.
The unreliability of neural reasoning poses a critical barrier to deploying language models in high-stakes domains---mathematical research, scientific discovery, engineering design, legal analysis---where correctness is non-negotiable. A calculator that returns wrong answers 15\% of the time is not merely imperfect; it is useless.
Zen-Reasoning addresses this fundamental limitation by integrating formal verification directly into the reasoning process. Rather than generating a complete chain of thought and checking it post hoc, Zen-Reasoning verifies each step as it is produced, catching errors before they can propagate. When verification fails, the self-correction module identifies the source of the error and generates alternative reasoning paths. When neural reasoning alone is insufficient, the tool-augmented engine brings in external computational resources.
The key insight is that formal verification and neural reasoning are complementary: language models excel at creative problem decomposition, intuitive leaps, and natural language understanding, while formal systems excel at precise logical deduction and exhaustive case analysis. Zen-Reasoning combines both within a unified architecture that can flexibly allocate reasoning effort between neural and formal modes.
Our contributions are:
\begin{enumerate}
\item The Verify-then-Generate (VtG) paradigm, which interleaves neural reasoning with formal verification at the step level.
\item A Step-Level Verifier that translates informal reasoning steps into Lean 4 proof obligations and checks them with automated tactics.
\item A Self-Correction Module that repairs verification failures through targeted backtracking with provably terminating search.
\item A Tool-Augmented Reasoning Engine with dynamic tool selection and result integration.
\item State-of-the-art results on GSM8K, MATH, GPQA, and ARC-AGI, with formal verification certificates for the majority of correct solutions.
\end{enumerate}
% =============================================================================
\section{Background and Related Work}
\label{sec:background}
\subsection{Chain-of-Thought Reasoning}
Chain-of-thought prompting \citep{wei2022chain} elicits step-by-step reasoning from language models by providing few-shot examples with intermediate steps. Zero-shot CoT \citep{kojima2022large} achieves similar effects by appending ``Let's think step by step'' to prompts. Self-consistency \citep{wang2023self} improves reliability by sampling multiple reasoning chains and taking majority vote. Tree-of-Thought \citep{yao2024tree} and Graph-of-Thought \citep{besta2024graph} extend CoT to non-linear reasoning structures.
Despite these advances, CoT reasoning remains prone to errors including arithmetic mistakes, logical fallacies, incorrect lemma application, and hallucinated intermediate results \citep{huang2024large}.
\subsection{Formal Verification and Proof Assistants}
Interactive theorem provers such as Lean 4 \citep{moura2021lean}, Coq \citep{bertot2013interactive}, and Isabelle \citep{paulson1994isabelle} provide mechanically verified proofs of mathematical statements. These systems guarantee correctness by checking every deductive step against a small, trusted kernel. Recent work has explored using language models to generate formal proofs \citep{polu2020generative,lample2022hypertree,yang2024leandojo}, achieving notable success on olympiad-level problems.
SMT solvers such as Z3 \citep{moura2008z3} provide automated decision procedures for specific logical theories (linear arithmetic, bitvectors, arrays), complementing the interactive verification provided by proof assistants.
\subsection{Self-Correction in Language Models}
Self-correction mechanisms enable models to identify and fix errors in their own outputs. Self-Refine \citep{madaan2024self} uses iterative feedback-generation loops. Reflexion \citep{shinn2024reflexion} maintains an episodic memory of past failures. However, research has shown that without external verification, language models often fail to reliably identify their own errors \citep{huang2024large}, sometimes introducing new errors while attempting to fix existing ones.
\subsection{Tool-Augmented Language Models}
Toolformer \citep{schick2024toolformer} trained models to use APIs through self-supervised learning. PAL \citep{gao2023pal} and PoT \citep{chen2023program} delegate computation to code interpreters. Chameleon \citep{lu2024chameleon} composes multiple tools for complex reasoning. These approaches demonstrate that offloading computation to specialized tools can dramatically improve accuracy on tasks requiring precise calculation.
% =============================================================================
\section{Architecture}
\label{sec:architecture}
Zen-Reasoning integrates three modules---the Step-Level Verifier (SLV), Self-Correction Module (SCM), and Tool-Augmented Reasoning Engine (TARE)---with a Zen-72B backbone model fine-tuned for mathematical and logical reasoning. We describe each component and their interaction.
\subsection{System Overview}
Given a problem $P$, Zen-Reasoning generates a solution through the following iterative process:
\begin{algorithm}[t]
\caption{Verify-then-Generate Reasoning}
\label{alg:vtg}
\begin{algorithmic}[1]
\Require Problem $P$, max steps $N$, max corrections $K$
\State $\mathcal{C} \leftarrow []$ \Comment{Reasoning chain}
\State $\mathcal{S} \leftarrow \{P\}$ \Comment{Formal state}
\For{$n = 1, \ldots, N$}
\State $s_n \leftarrow \text{Generate}(P, \mathcal{C}, \mathcal{S})$ \Comment{Generate next step}
\If{$s_n$ requires tool use}
\State $s_n \leftarrow \text{TARE}(s_n, \mathcal{C})$ \Comment{Execute tool call}
\EndIf
\State $(v, \sigma) \leftarrow \text{SLV}(s_n, \mathcal{S})$ \Comment{Verify step}
\If{$v = \textsc{Valid}$}
\State $\mathcal{C} \leftarrow \mathcal{C} \cup \{s_n\}$
\State $\mathcal{S} \leftarrow \mathcal{S} \cup \{\sigma\}$ \Comment{Update formal state}
\Else
\For{$k = 1, \ldots, K$} \Comment{Self-correction}
\State $s_n' \leftarrow \text{SCM}(s_n, v, \mathcal{C}, \mathcal{S})$
\State $(v', \sigma') \leftarrow \text{SLV}(s_n', \mathcal{S})$
\If{$v' = \textsc{Valid}$}
\State $\mathcal{C} \leftarrow \mathcal{C} \cup \{s_n'\}$; $\mathcal{S} \leftarrow \mathcal{S} \cup \{\sigma'\}$
\State \textbf{break}
\EndIf
\EndFor
\If{all corrections failed}
\State Backtrack to last verified step
\EndIf
\EndIf
\If{$s_n$ is a final answer}
\State \Return $(\mathcal{C}, \mathcal{S})$ \Comment{Solution with proof}
\EndIf
\EndFor
\end{algorithmic}
\end{algorithm}
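The control flow of Algorithm~\ref{alg:vtg} can be sketched in plain Python. The generator, verifier, and corrector below are illustrative stubs standing in for the Zen-72B generator, the SLV, and the SCM; full backtracking is omitted for brevity.

```python
# Minimal sketch of the Verify-then-Generate (VtG) loop of Algorithm 1.
# generate/verify/correct are caller-supplied stubs, not the real modules.

def vtg(problem, generate, verify, correct, max_steps=10, max_corrections=3):
    chain, state = [], [problem]          # reasoning chain C, formal state S
    for _ in range(max_steps):
        step = generate(problem, chain, state)
        valid, fact = verify(step, state)
        if not valid:                     # self-correction loop (up to K tries)
            for _ in range(max_corrections):
                step = correct(step, chain, state)
                valid, fact = verify(step, state)
                if valid:
                    break
        if valid:
            chain.append(step)
            state.append(fact)            # extend formal state with new fact
            if step.get("final"):
                return chain, state       # solution with verified trace
        # on persistent failure the SCM would backtrack; omitted here
    return chain, state
```

In the real system, `verify` dispatches to Lean 4, Z3, or numerical checking depending on the step category, and `correct` conditions on the formal error message.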
\subsection{Step-Level Verifier (SLV)}
\label{sec:slv}
The SLV validates each reasoning step by translating it into a formal obligation and checking it against proof assistants.
\paragraph{Step Classification.} Each reasoning step $s_n$ is first classified into one of five categories:
\begin{itemize}
\item \textbf{Algebraic manipulation:} Equation transformations, simplifications, substitutions
\item \textbf{Logical deduction:} Implications, case analysis, proof by contradiction
\item \textbf{Numerical computation:} Arithmetic, evaluation of expressions
\item \textbf{Definitional:} Introduction of variables, notation, or problem restatement
\item \textbf{Heuristic:} Intuitive leaps, pattern recognition, strategy selection
\end{itemize}
The first three categories are formally verifiable; definitional steps are checked for consistency; heuristic steps are flagged as unverified but tracked for potential backtracking.
\paragraph{Autoformalization.} The SLV translates informal reasoning steps into Lean 4 proof obligations using a specialized autoformalization model $\mathcal{A}_\phi$:
\begin{equation}
\tau_n = \mathcal{A}_\phi(s_n, \mathcal{S}_{\text{formal}})
\end{equation}
where $\tau_n$ is a Lean 4 tactic proof term and $\mathcal{S}_{\text{formal}}$ is the current formal context (accumulated hypotheses and definitions). The autoformalization model is a Zen-7B model fine-tuned on 2.8 million (informal step, Lean 4 tactic) pairs derived from Mathlib4, ProofNet, and augmented sources (detailed in Section~\ref{sec:training}).
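As a purely illustrative example (not drawn from the training data), an informal step such as ``from $2x + 3 = 7$ we conclude $x = 2$'' might be autoformalized into a Lean 4 obligation discharged by a standard Mathlib tactic:

```lean
-- Hypothetical autoformalization of the informal step
-- "from 2x + 3 = 7 we conclude x = 2".
example (x : ℚ) (h : 2 * x + 3 = 7) : x = 2 := by
  linarith
```

If `linarith` fails, the SLV would fall back to the alternative tactic suggestions described below.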
\paragraph{Verification Strategies.} Depending on the step category, the SLV applies different verification strategies:
\begin{enumerate}
\item \textbf{Lean 4 tactic proof:} For algebraic and logical steps, the autoformalized tactic is executed in the Lean 4 kernel. If the tactic succeeds, the step is verified. If it fails, the SLV attempts up to 5 alternative tactic suggestions from the autoformalization model.
\item \textbf{Z3 SMT solving:} For steps involving linear arithmetic, polynomial inequalities, or propositional logic, the obligation is encoded as an SMT formula and checked with Z3:
\begin{equation}
\text{Z3}(\neg(\mathcal{S}_n \implies s_n)) = \textsc{Unsat} \implies s_n \text{ is valid}
\end{equation}
\item \textbf{Numerical verification:} For computational steps, the SLV evaluates the claimed computation using arbitrary-precision arithmetic (via mpmath) and checks agreement within a tolerance of $10^{-12}$.
\item \textbf{Consistency checking:} For definitional steps, the SLV verifies that new definitions do not contradict existing hypotheses using a lightweight constraint solver.
\end{enumerate}
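The numerical strategy can be sketched with exact rational arithmetic from the Python standard library standing in for mpmath; the paper's actual check compares arbitrary-precision floats within the stated $10^{-12}$ tolerance, but the logic is analogous.

```python
# Sketch of the numerical-verification strategy: re-evaluate a claimed
# computation and accept the step only if it agrees with the claim.
# fractions.Fraction gives exact arithmetic; the system described in the
# text uses mpmath with a 1e-12 tolerance instead.
from fractions import Fraction

def verify_computation(expr_value, claimed, tol=Fraction(1, 10**12)):
    """Check |expr_value - claimed| <= tol for a claimed numeric step."""
    return abs(Fraction(expr_value) - Fraction(claimed)) <= tol
```

For instance, the erroneous step ``$3 \times 7 = 24$'' fails this check, while ``$3 \times 7 = 21$'' passes.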
\paragraph{Confidence Scoring.} Each verified step receives a confidence score:
\begin{equation}
c_n = \begin{cases}
1.0 & \text{if formally verified (Lean 4 or Z3)} \\
0.95 & \text{if numerically verified} \\
0.8 & \text{if consistency-checked} \\
p_{\text{model}}(s_n | \mathcal{C}, P) & \text{if heuristic (model confidence)}
\end{cases}
\end{equation}
The overall solution confidence is $C = \prod_{n=1}^{N} c_n$.
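The scoring scheme reduces to a small lookup plus a product; a sketch, with the model's own probability supplied for heuristic steps:

```python
import math

# Per-step confidence mirroring the case analysis above; heuristic steps
# fall back to the model's probability for the step.
SCORES = {"formal": 1.0, "numerical": 0.95, "consistency": 0.8}

def step_confidence(category, model_prob=None):
    if category == "heuristic":
        return model_prob
    return SCORES[category]

def solution_confidence(steps):
    """Overall confidence C = product of per-step confidences c_n."""
    return math.prod(step_confidence(cat, p) for cat, p in steps)
```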
\subsection{Self-Correction Module (SCM)}
\label{sec:scm}
When verification fails, the SCM identifies the error and generates corrections.
\paragraph{Error Diagnosis.} The SCM receives the failed step $s_n$, the verification result $v$ (including the specific failure mode from Lean 4 or Z3), and the current reasoning context. It generates a structured error diagnosis:
\begin{equation}
\text{diag} = \text{SCM}_{\text{diagnose}}(s_n, v, \mathcal{C}, \mathcal{S})
\end{equation}
Common failure modes include:
\begin{itemize}
\item \textbf{Arithmetic error:} Incorrect calculation (e.g., $3 \times 7 = 24$)
\item \textbf{Sign error:} Incorrect handling of negative values
\item \textbf{Missing case:} Incomplete case analysis (e.g., forgetting the negative root)
\item \textbf{Invalid lemma:} Application of a theorem whose preconditions are not met
\item \textbf{Logical gap:} A step that does not follow from the premises
\end{itemize}
\paragraph{Targeted Repair.} Based on the diagnosis, the SCM generates a corrected step by conditioning on both the error analysis and the formal feedback:
\begin{equation}
s_n' = \text{SCM}_{\text{repair}}(s_n, \text{diag}, v_{\text{formal}}, \mathcal{C})
\end{equation}
where $v_{\text{formal}}$ includes the specific Lean 4 error message or Z3 counterexample, providing precise information about why the step failed.
\paragraph{Backtracking.} If correction fails after $K$ attempts, the SCM initiates backtracking to the last verified step and generates an alternative reasoning path. To prevent infinite loops, we maintain a set of explored paths and use a beam search with diversity penalty:
\begin{equation}
\text{score}(\mathcal{C}') = \log p(\mathcal{C}' | P) - \lambda \sum_{\mathcal{C}_j \in \text{explored}} \text{sim}(\mathcal{C}', \mathcal{C}_j)
\end{equation}
where $\lambda = 0.3$ penalizes similarity to previously explored chains.
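A minimal sketch of the diversity-penalized score follows, using Jaccard similarity over the chains' step strings as an illustrative stand-in for the chain-similarity measure, which the text leaves unspecified.

```python
# Diversity-penalized chain scoring for backtracking search.
# jaccard() is an illustrative similarity; the actual sim() used by the
# SCM is not specified in the text.

def jaccard(a, b):
    sa, sb = set(a), set(b)
    return len(sa & sb) / len(sa | sb) if sa | sb else 0.0

def chain_score(log_prob, candidate, explored, lam=0.3):
    """score(C') = log p(C'|P) - lambda * sum_j sim(C', C_j)."""
    return log_prob - lam * sum(jaccard(candidate, c) for c in explored)
```

Chains that retread previously explored paths are penalized, so the beam prefers genuinely new reasoning routes at equal model likelihood.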
\begin{proposition}[Termination]
The VtG reasoning process terminates in at most $N \times (K + 1) \times B$ verification calls, where $N$ is the maximum chain length, $K$ is the maximum corrections per step, and $B$ is the maximum backtracking depth.
\end{proposition}
\subsection{Tool-Augmented Reasoning Engine (TARE)}
\label{sec:tare}
TARE extends the model's reasoning capabilities by dynamically invoking external tools.
\paragraph{Available Tools.}
\begin{itemize}
\item \textbf{SymPy:} Symbolic algebra, calculus, equation solving
\item \textbf{SageMath:} Number theory, combinatorics, abstract algebra
\item \textbf{Python interpreter:} General computation, simulation, brute-force search
\item \textbf{Wolfram Alpha API:} Mathematical knowledge base queries
\item \textbf{Lean 4 REPL:} Interactive theorem proving for exploration
\item \textbf{Web search:} Retrieval of mathematical facts and theorems
\end{itemize}
\paragraph{Tool Selection.} The model learns to select appropriate tools through a routing mechanism:
\begin{equation}
t^* = \arg\max_{t \in \mathcal{T}} p_\theta(t | s_n, \mathcal{C}, P)
\end{equation}
where $\mathcal{T}$ is the set of available tools and $p_\theta$ is the tool selection probability predicted by the backbone model. Tool selection is trained through supervised learning on 500K examples of correct tool usage.
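The routing step is a plain argmax over the predicted tool distribution; a sketch, where the probabilities are placeholders rather than actual backbone outputs:

```python
# Tool selection as argmax over the backbone's predicted tool distribution.
# The probability values below are illustrative placeholders.

TOOLS = ["sympy", "sagemath", "python", "wolfram", "lean_repl", "web_search"]

def select_tool(tool_probs):
    """Return t* = argmax_t p_theta(t | s_n, C, P)."""
    return max(tool_probs, key=tool_probs.get)

probs = {"sympy": 0.62, "python": 0.21, "wolfram": 0.09,
         "sagemath": 0.05, "lean_repl": 0.02, "web_search": 0.01}
```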
\paragraph{Result Integration.} Tool outputs are formatted as verified facts and integrated into the reasoning chain:
\begin{equation}
s_n^{\text{tool}} = \text{Format}(t^*, \text{output}(t^*, \text{query}(s_n)))
\end{equation}
Tool-computed results receive a confidence of 1.0 (for symbolic algebra) or 0.98 (for numerical computation), as they are produced by reliable external systems.
% =============================================================================
\section{Training}
\label{sec:training}
\subsection{Data Curation}
\label{sec:data}
Training data for Zen-Reasoning is curated from eight sources, summarized in Table~\ref{tab:data}:
\begin{table}[t]
\centering
\caption{Training data composition for Zen-Reasoning.}
\label{tab:data}
\begin{tabular}{llrc}
\toprule
\textbf{Source} & \textbf{Description} & \textbf{Examples} & \textbf{Verified} \\
\midrule
Mathlib4 & Lean 4 library theorems & 420K & 100\% \\
ProofNet & Formal-informal proof pairs & 180K & 100\% \\
AMPS & Synthetic math problems & 5.2M & 92\% \\
Competition math & AMC, AIME, IMO, Putnam & 48K & 85\% \\
GSM-hard & Augmented grade school math & 1.2M & 97\% \\
STEM textbooks & University-level problems & 680K & 78\% \\
Tool-use traces & Correct tool invocations & 500K & 100\% \\
Error-correction & (error, diagnosis, fix) triples & 340K & 100\% \\
\midrule
\multicolumn{2}{l}{\textbf{Total}} & \textbf{8.57M} & \\
\bottomrule
\end{tabular}
\end{table}
\paragraph{Verification Pipeline.} Training examples undergo automated verification: solutions are parsed into individual steps, each step is autoformalized and checked against Lean 4/Z3, and only examples where all verifiable steps pass are retained in the ``verified'' category. Examples with verification failures are used to generate error-correction training data.
\paragraph{Synthetic Data Generation.} We generate 5.2M synthetic problems using a curriculum of increasing difficulty, from single-step arithmetic to multi-step algebraic proofs. Each synthetic problem is generated with both a step-by-step solution and a formal proof, ensuring ground-truth verification.
\subsection{Training Procedure}
\label{sec:training_procedure}
Zen-Reasoning is initialized from the aligned Zen-72B checkpoint and fine-tuned in three phases:
\paragraph{Phase 1: Reasoning Pre-training (2 weeks, 128 A100 GPUs).} The backbone is fine-tuned on the full 8.57M example dataset using the SFT objective. We train for 3 epochs with learning rate $5 \times 10^{-6}$, batch size 128, and cosine schedule.
\paragraph{Phase 2: Verification-Aware Training (1 week, 128 A100 GPUs).} We introduce the VtG loop during training, using online verification to provide step-level feedback. The model receives additional reward for steps that pass formal verification:
\begin{equation}
r_n = \begin{cases}
+1.0 & \text{step verified and correct} \\
-0.5 & \text{step verified but unnecessary} \\
-1.0 & \text{step failed verification} \\
+0.5 & \text{heuristic step leading to verified conclusion}
\end{cases}
\end{equation}
We use this reward signal with PPO to optimize the policy for generating verifiable reasoning chains.
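The step-level reward signal can be written as a simple lookup; the outcome labels below are illustrative names for the four cases in the reward definition, not identifiers from the training code.

```python
# Step-level reward used in verification-aware PPO training (Phase 2).
# Outcome labels are illustrative names for the four cases in the text.
STEP_REWARD = {
    "verified_correct": 1.0,     # step verified and correct
    "verified_unneeded": -0.5,   # step verified but unnecessary
    "failed": -1.0,              # step failed verification
    "heuristic_useful": 0.5,     # heuristic step leading to verified conclusion
}

def chain_reward(outcomes):
    """Total (undiscounted) reward for a reasoning chain."""
    return sum(STEP_REWARD[o] for o in outcomes)
```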
\paragraph{Phase 3: Self-Correction Training (3 days, 64 A100 GPUs).} The SCM is trained on the error-correction dataset (340K examples). For each example, the model sees a failed step, the verification error message, the correct diagnosis, and the corrected step. We fine-tune with the SFT objective on this structured data.
\subsection{Autoformalization Training}
The autoformalization model $\mathcal{A}_\phi$ is trained separately on 2.8M (informal, formal) pairs:
\begin{itemize}
\item 420K examples from Mathlib4 (docstring to tactic)
\item 180K examples from ProofNet (natural language to Lean 4)
\item 1.2M synthetic examples generated by back-translation (Lean 4 $\rightarrow$ informal $\rightarrow$ Lean 4)
\item 1.0M examples from augmented textbook proofs
\end{itemize}
The autoformalization model achieves 78.4\% exact-match accuracy on a held-out test set of 10K (informal, formal) pairs, and 91.2\% accuracy when allowing up to 5 tactic suggestions per step.
% =============================================================================
\section{Evaluation}
\label{sec:evaluation}
We evaluate Zen-Reasoning on four established benchmarks and two new benchmarks, measuring both accuracy and verifiability.
\subsection{Benchmarks}
\begin{itemize}
\item \textbf{GSM8K} \citep{cobbe2021training}: 1319 grade-school math word problems requiring multi-step arithmetic reasoning.
\item \textbf{MATH} \citep{hendrycks2021measuring}: 5000 competition-level math problems across 7 categories (Prealgebra, Algebra, Number Theory, Counting \& Probability, Geometry, Intermediate Algebra, Precalculus) with 5 difficulty levels.
\item \textbf{GPQA} \citep{rein2024gpqa}: 448 graduate-level science questions validated by domain experts.
\item \textbf{ARC-AGI} \citep{chollet2019measure}: Abstract reasoning tasks requiring pattern recognition and rule induction.
\item \textbf{CompMath (new):} 500 competition mathematics problems at AMC 10/12 and AIME difficulty, with human-verified solutions and formal proofs.
\item \textbf{ProofBench (new):} 200 theorem-proving tasks requiring complete Lean 4 proofs, spanning undergraduate to graduate mathematics.
\end{itemize}
\subsection{Metrics}
\begin{itemize}
\item \textbf{Accuracy:} Fraction of problems solved correctly (final answer matches ground truth).
\item \textbf{Verified Accuracy:} Fraction of correct solutions accompanied by a complete formal proof.
\item \textbf{Verification Rate:} Fraction of reasoning steps that are formally verified.
\item \textbf{Self-Correction Rate:} Fraction of initially incorrect steps successfully repaired by the SCM.
\item \textbf{Tool Usage Rate:} Fraction of problems where at least one external tool is invoked.
\end{itemize}
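Given per-problem evaluation records, these metrics reduce to simple ratios. The record schema below (`correct`, `has_proof`, `used_tool` flags) is assumed for illustration; the paper does not specify one.

```python
# Computing three of the evaluation metrics from per-problem records.
# The record schema (correct / has_proof / used_tool flags) is assumed.

def metrics(records):
    n = len(records)
    correct = [r for r in records if r["correct"]]
    return {
        # fraction of problems solved correctly
        "accuracy": len(correct) / n,
        # fraction of correct solutions carrying a complete formal proof
        "verified_accuracy": (sum(r["has_proof"] for r in correct)
                              / len(correct)) if correct else 0.0,
        # fraction of problems where at least one tool was invoked
        "tool_usage_rate": sum(r["used_tool"] for r in records) / n,
    }
```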
\subsection{Baselines}
\begin{itemize}
\item \textbf{Zen-72B (CoT):} The base Zen-72B model with standard chain-of-thought prompting.
\item \textbf{GPT-4o (CoT):} OpenAI's GPT-4o with chain-of-thought prompting.
\item \textbf{o1-preview:} OpenAI's reasoning model with extended thinking.
\item \textbf{DeepSeek-R1:} DeepSeek's reasoning model.
\item \textbf{AlphaProof:} DeepMind's formal reasoning system (where results are available).
\end{itemize}
\subsection{Main Results}
\begin{table}[t]
\centering
\caption{Main results on mathematical and scientific reasoning benchmarks.}
\label{tab:main_results}
\begin{tabular}{lccccc}
\toprule
\textbf{Method} & \textbf{GSM8K} & \textbf{MATH} & \textbf{GPQA} & \textbf{ARC-AGI} & \textbf{CompMath} \\
\midrule
Zen-72B (CoT) & 95.2 & 78.6 & 52.3 & 38.2 & 48.6 \\
GPT-4o (CoT) & 94.8 & 76.3 & 53.1 & 36.8 & 45.2 \\
o1-preview & 96.4 & 83.2 & 58.4 & 42.1 & 62.8 \\
DeepSeek-R1 & 97.1 & 84.8 & 59.2 & 41.8 & 61.4 \\
\midrule
Zen-Reasoning & \textbf{97.8} & \textbf{86.4} & \textbf{61.8} & \textbf{44.2} & \textbf{68.4} \\
\quad w/o SLV & 96.1 & 81.2 & 55.8 & 40.1 & 56.2 \\
\quad w/o SCM & 96.8 & 83.7 & 58.4 & 42.3 & 62.1 \\
\quad w/o TARE & 97.2 & 84.1 & 57.2 & 43.8 & 64.8 \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:main_results} shows Zen-Reasoning achieves state-of-the-art on all five benchmarks. The most significant gains are on MATH (+1.6\% over DeepSeek-R1) and CompMath (+5.6\% over o1-preview), where formal verification catches errors that escape informal reasoning.
Ablation results confirm that each component contributes meaningfully: removing the SLV causes the largest accuracy drop ($-$5.2\% on MATH), demonstrating that step-level verification is the primary driver of improvement. Removing the SCM costs 2.7\% and removing TARE costs 2.3\% on MATH.
\subsection{Verification Analysis}
\begin{table}[t]
\centering
\caption{Verification statistics on MATH benchmark by category.}
\label{tab:verification}
\begin{tabular}{lcccc}
\toprule
\textbf{Category} & \textbf{Accuracy} & \textbf{Verified} & \textbf{Verif. Rate} & \textbf{SC Rate} \\
\midrule
Prealgebra & 98.2 & 92.1 & 94.3\% & 23.1\% \\
Algebra & 93.4 & 84.2 & 88.7\% & 31.4\% \\
Number Theory & 84.2 & 71.8 & 78.4\% & 28.7\% \\
Count. \& Prob. & 82.1 & 65.3 & 72.1\% & 35.2\% \\
Geometry & 78.6 & 58.4 & 64.2\% & 41.3\% \\
Inter. Algebra & 81.3 & 68.7 & 74.8\% & 33.8\% \\
Precalculus & 79.8 & 63.2 & 68.3\% & 37.6\% \\
\midrule
\textbf{Overall} & \textbf{86.4} & \textbf{73.2} & \textbf{77.3\%} & \textbf{32.4\%} \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:verification} breaks down verification statistics by MATH category. Algebra-heavy categories (Prealgebra, Algebra) achieve the highest verification rates, while geometry problems---which often require spatial reasoning that is harder to formalize---have lower verification rates. The self-correction rate of 32.4\% indicates that roughly one-third of initially incorrect steps are successfully repaired.
\subsection{MATH Results by Difficulty}
\begin{table}[t]
\centering
\caption{MATH accuracy by difficulty level (1 = easiest, 5 = hardest).}
\label{tab:math_difficulty}
\begin{tabular}{lccccc}
\toprule
\textbf{Method} & \textbf{Level 1} & \textbf{Level 2} & \textbf{Level 3} & \textbf{Level 4} & \textbf{Level 5} \\
\midrule
Zen-72B (CoT) & 97.1 & 92.4 & 83.2 & 71.6 & 52.8 \\
o1-preview & 98.4 & 94.1 & 87.8 & 78.3 & 61.4 \\
DeepSeek-R1 & 98.8 & 95.2 & 89.1 & 79.8 & 63.2 \\
\midrule
Zen-Reasoning & \textbf{99.2} & \textbf{96.8} & \textbf{91.4} & \textbf{83.2} & \textbf{68.4} \\
\bottomrule
\end{tabular}
\end{table}
The advantage of formal verification increases with problem difficulty: on Level 5 (hardest) problems, Zen-Reasoning outperforms DeepSeek-R1 by 5.2\%, compared to only 0.4\% on Level 1 problems. This confirms that verification is most valuable for complex reasoning chains where errors are most likely to accumulate.
\subsection{ProofBench Results}
\begin{table}[t]
\centering
\caption{Formal proof generation on ProofBench (200 problems).}
\label{tab:proofbench}
\begin{tabular}{lcccc}
\toprule
\textbf{Method} & \textbf{Solved} & \textbf{Avg Steps} & \textbf{Avg Time (s)} & \textbf{Tactics/Step} \\
\midrule
Lean Copilot & 31.0\% & 8.2 & 42.3 & 1.4 \\
LeanDojo & 38.5\% & 12.1 & 68.7 & 1.8 \\
AlphaProof* & 52.0\% & 6.4 & 124.8 & 2.1 \\
\midrule
Zen-Reasoning & \textbf{58.5\%} & 9.8 & 34.2 & 1.6 \\
\bottomrule
\end{tabular}
\end{table}
On ProofBench, Zen-Reasoning solves 58.5\% of formal proof tasks, outperforming AlphaProof (52.0\% on the subset where results are comparable) while requiring 3.6$\times$ less time per problem (34.2s vs.\ 124.8s). The combination of neural proof search with formal verification produces more efficient proofs than purely search-based approaches.
\subsection{Error Analysis}
We analyze the 680 incorrectly solved MATH problems:
\begin{table}[t]
\centering
\caption{Error analysis on MATH (680 incorrect solutions).}
\label{tab:errors}
\begin{tabular}{lcc}
\toprule
\textbf{Error Type} & \textbf{Count} & \textbf{\% of Errors} \\
\midrule
Formalization failure (cannot verify) & 218 & 32.1\% \\
Incorrect heuristic step (unverifiable) & 156 & 22.9\% \\
Problem misunderstanding & 112 & 16.5\% \\
Exhausted correction budget & 89 & 13.1\% \\
Tool error / incorrect tool choice & 54 & 7.9\% \\
Correct solution, wrong final answer format & 51 & 7.5\% \\
\bottomrule
\end{tabular}
\end{table}
The largest error category (32.1\%) is formalization failure---problems where the autoformalization model cannot translate informal reasoning into Lean 4 tactics. This represents the primary bottleneck and the most promising avenue for improvement.
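The smallest category, wrong final answer format (7.5\%), is also the most mechanically fixable. A hypothetical normalizer sketches the kind of canonicalization involved; the function name and rewrite rules below are our illustration, not the system's actual pipeline:

```python
import re

# Hypothetical answer normalizer addressing the "wrong final answer
# format" error category: unwrap LaTeX decoration and canonicalize
# numeric answers. Rules are illustrative, not the paper's pipeline.

def normalize_answer(ans: str) -> str:
    ans = ans.strip()
    m = re.search(r"\\boxed\{([^{}]*)\}", ans)   # unwrap \boxed{...}
    if m:
        ans = m.group(1)
    ans = ans.replace(" ", "").rstrip(".")
    # flatten \frac{a}{b} into a/b for string comparison
    ans = ans.replace("\\frac", "").replace("}{", "/").strip("{}")
    try:
        return str(int(ans))        # "042" -> "42": drop leading zeros
    except ValueError:
        return ans
```

In practice a grader would apply the same normalization to both the gold answer and the model output before comparison.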
% =============================================================================
\section{Ablation Studies}
\label{sec:ablation}
\subsection{Verification Backend}
\begin{table}[t]
\centering
\caption{Effect of verification backend on MATH accuracy.}
\label{tab:backend}
\begin{tabular}{lccc}
\toprule
\textbf{Backend} & \textbf{Accuracy} & \textbf{Verif. Rate} & \textbf{Overhead (s)} \\
\midrule
No verification & 78.6 & 0\% & 0 \\
Z3 only & 82.1 & 41.2\% & 1.2 \\
Lean 4 only & 84.8 & 68.4\% & 8.4 \\
Z3 + Lean 4 (cascade) & \textbf{86.4} & \textbf{77.3\%} & 5.8 \\
\bottomrule
\end{tabular}
\end{table}
The cascade strategy---trying Z3 first (fast, covers arithmetic and propositional logic) then falling back to Lean 4 (slower, covers higher-order reasoning)---provides the best accuracy-overhead trade-off.
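The cascade's control flow can be sketched as follows. Both checker functions are stubs standing in for the external provers; only the dispatch logic (try the fast solver first, fall back when it cannot decide) reflects the strategy described above:

```python
from typing import Optional

# Sketch of the Z3-then-Lean-4 cascade. The two checkers are stubs; the
# real backends invoke external provers. Only the dispatch logic is the
# point here.

def z3_check(step: str) -> Optional[bool]:
    """Fast first pass. Returns None when the goal lies outside Z3's
    fragment (illustrative keyword test; the real criterion would be the
    solver reporting 'unknown' or a translation failure)."""
    if "forall functions" in step:
        return None
    return "=" in step  # stub: pretend well-formed arithmetic verifies

def lean_check(step: str) -> bool:
    """Slower fallback covering higher-order goals (stubbed to True)."""
    return True

def cascade_verify(step: str) -> tuple[bool, str]:
    verdict = z3_check(step)
    if verdict is not None:           # Z3 decided: cheap path
        return verdict, "z3"
    return lean_check(step), "lean4"  # fallback: expensive path
```

Because most MATH steps are arithmetic or propositional, the cheap path handles the bulk of calls, which is why the cascade's average overhead (5.8s) sits closer to Z3's than to Lean 4's.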
\subsection{Correction Budget}
\begin{table}[t]
\centering
\caption{Effect of self-correction budget $K$ on MATH accuracy.}
\label{tab:correction_budget}
\begin{tabular}{lccc}
\toprule
\textbf{$K$ (corrections)} & \textbf{Accuracy} & \textbf{Avg Time (s)} & \textbf{SC Rate} \\
\midrule
0 (no correction) & 82.8 & 12.4 & 0\% \\
1 & 84.6 & 18.2 & 22.1\% \\
3 & 86.1 & 24.8 & 30.8\% \\
5 (default) & \textbf{86.4} & 28.3 & 32.4\% \\
10 & 86.5 & 41.7 & 33.1\% \\
\bottomrule
\end{tabular}
\end{table}
Accuracy improves with correction budget up to $K=5$, beyond which diminishing returns set in. Most correctable errors are fixed within 3 attempts.
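The budgeted loop can be sketched as follows; \texttt{generate} and \texttt{verify} are stubs standing in for the model and the verification backend, and the interface is our illustration rather than the system's actual API:

```python
# Minimal sketch of the bounded self-correction loop: verify the draft
# step, regenerate with solver feedback on failure, and stop after at
# most k corrections.

def solve_step(goal: str, generate, verify, k: int = 5):
    """Return (verified_step, corrections_used), or (None, k + 1) if the
    correction budget is exhausted."""
    step = generate(goal, feedback=None)
    for attempt in range(k + 1):        # attempt 0 checks the first draft
        ok, feedback = verify(step)
        if ok:
            return step, attempt
        if attempt == k:                # budget exhausted
            break
        step = generate(goal, feedback=feedback)  # retry with feedback
    return None, k + 1
```

The table's shape follows directly from this structure: each extra unit of $K$ only helps on problems whose failing step survives all previous retries, so returns diminish geometrically.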
\subsection{Tool Usage Analysis}
\begin{table}[t]
\centering
\caption{Tool usage statistics on MATH by tool type.}
\label{tab:tool_usage}
\begin{tabular}{lccc}
\toprule
\textbf{Tool} & \textbf{Usage Rate} & \textbf{Success Rate} & \textbf{Acc. Improvement} \\
\midrule
SymPy & 34.2\% & 96.1\% & +3.8\% \\
Python interpreter & 22.8\% & 91.4\% & +2.1\% \\
SageMath & 8.4\% & 93.2\% & +1.2\% \\
Lean 4 REPL & 12.1\% & 82.3\% & +1.8\% \\
Web search & 5.2\% & 78.6\% & +0.4\% \\
\bottomrule
\end{tabular}
\end{table}
SymPy is the most frequently used and most impactful tool, providing symbolic computation that complements neural reasoning. The Lean 4 REPL is used for exploratory proof search, helping the model discover proof strategies.
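The symbolic-computation calls behind the SymPy gains typically reduce to expand-and-compare checks (in SymPy itself, \texttt{sympy.expand} followed by an equality test). A pure-Python stand-in using coefficient-list polynomials sketches the idea:

```python
# Stand-in for a SymPy expand-and-compare check. Polynomials are
# coefficient lists [c0, c1, c2, ...], where c_i multiplies x^i.

def poly_mul(p, q):
    """Multiply two coefficient-list polynomials."""
    out = [0] * (len(p) + len(q) - 1)
    for i, a in enumerate(p):
        for j, b in enumerate(q):
            out[i + j] += a * b
    return out

# Claimed step: x^2 - 5x + 6 = (x - 2)(x - 3)
lhs = [6, -5, 1]                    # 6 - 5x + x^2
rhs = poly_mul([-2, 1], [-3, 1])    # expand (x - 2)(x - 3)
assert lhs == rhs                   # identical canonical forms: verified
```

Expanding both sides to a canonical form and comparing coefficients is exactly the kind of exact check that floating-point evaluation cannot provide, which is why symbolic tools complement rather than duplicate the Python interpreter.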
% =============================================================================
\section{Discussion}
\label{sec:discussion}
\subsection{Why Verification Improves Accuracy}
Formal verification improves accuracy through three mechanisms:
\begin{enumerate}
\item \textbf{Error prevention:} Step-level verification catches errors immediately, preventing them from propagating through the reasoning chain. On MATH, 18.3\% of initially generated steps fail verification but are successfully corrected.
\item \textbf{Confidence calibration:} The model learns to produce more careful, verifiable reasoning steps during verification-aware training, reducing the initial error rate.
\item \textbf{Search guidance:} Verification failures provide structured feedback that guides the model toward correct reasoning paths more efficiently than unconstrained search.
\end{enumerate}
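A back-of-envelope calculation illustrates the first mechanism. If 18.3\% of generated steps initially fail verification and step errors were independent, an unverified chain of the average MATH length (12.8 steps; Table~\ref{tab:cost}) would be fully error-free with probability only
\[
(1 - 0.183)^{12.8} \approx 0.08,
\]
whereas the VtG loop repairs each failing step before it can propagate. This estimate is idealized (step errors are correlated, and not every unrepaired step error changes the final answer), but it conveys why per-step verification dominates end-of-chain checking as reasoning chains grow.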
\subsection{Limitations}
\begin{itemize}
\item \textbf{Autoformalization bottleneck:} Formalization failures account for 32.1\% of errors (Table~\ref{tab:errors}); the autoformalization model's coverage is the primary accuracy ceiling.
\item \textbf{Geometry and spatial reasoning:} Geometric problems are inherently harder to formalize, leading to lower verification rates and accuracy.
\item \textbf{Latency:} The VtG loop adds 2--5$\times$ latency compared to standard CoT generation, making it unsuitable for real-time applications.
\item \textbf{Lean 4 dependency:} The system requires a running Lean 4 server, adding infrastructure complexity.
\item \textbf{Domain specificity:} The current system is specialized for mathematical and logical reasoning; extending to other domains (legal, scientific) requires domain-specific formalization.
\end{itemize}
\subsection{Broader Impact}
Verifiable reasoning has significant implications for AI safety. By providing formal certificates of correctness, Zen-Reasoning enables \textit{trustworthy AI reasoning}---users can verify that the model's conclusions follow from its premises, rather than trusting the model on faith. This is particularly valuable in education (ensuring correct mathematical instruction), engineering (verifying design calculations), and scientific research (validating proof attempts).
% =============================================================================
\section{Conclusion}
\label{sec:conclusion}
We presented Zen-Reasoning, a language model that integrates formal verification into the reasoning process through the Verify-then-Generate paradigm. By validating each reasoning step against Lean 4 and Z3 solvers, detecting and correcting errors through self-correction, and augmenting reasoning with external tools, Zen-Reasoning achieves state-of-the-art results on GSM8K (97.8\%), MATH (86.4\%), GPQA (61.8\%), and ARC-AGI (44.2\%) while providing formal verification certificates for 73.2\% of correct solutions.
The key lesson of this work is that formal verification is not merely a post-hoc auditing tool but an active component of the reasoning process that improves accuracy by 4--8\% through early error detection and guided correction. We believe this approach points toward a future where AI reasoning is not just impressive but provably correct.
Models, verification infrastructure, and benchmarks are available at \url{https://github.com/hanzoai/zen-reasoning} under Apache 2.0.
% =============================================================================
% REFERENCES
% =============================================================================
\begin{thebibliography}{30}
\bibitem[Bertot and Cast{\'e}ran(2013)]{bertot2013interactive}
Bertot, Y. and Cast{\'e}ran, P.
\newblock \emph{Interactive Theorem Proving and Program Development: Coq'Art: The Calculus of Inductive Constructions}.
\newblock Springer, 2013.
\bibitem[Besta et~al.(2024)]{besta2024graph}
Besta, M., Blach, N., Kubicek, A., Gerstenberger, R., Gianinazzi, L., Gajber, J., Lehmann, T., Podstawski, M., Nyczyk, H., Wettig, A., and Hoefler, T.
\newblock Graph of thoughts: Solving elaborate problems with large language models.
\newblock In \emph{AAAI Conference on Artificial Intelligence}, 2024.
\bibitem[Chen et~al.(2023)]{chen2023program}
Chen, W., Ma, X., Wang, X., and Cohen, W.~W.
\newblock Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks.
\newblock \emph{TMLR}, 2023.
\bibitem[Chollet(2019)]{chollet2019measure}
Chollet, F.
\newblock On the measure of intelligence.
\newblock \emph{arXiv preprint arXiv:1911.01547}, 2019.
\bibitem[Cobbe et~al.(2021)]{cobbe2021training}
Cobbe, K., Kosaraju, V., Bavarian, M., Chen, M., Jun, H., Kaiser, L., Plappert, M., Tworek, J., Hilton, J., Nakano, R., et~al.
\newblock Training verifiers to solve math word problems.
\newblock \emph{arXiv preprint arXiv:2110.14168}, 2021.
\bibitem[Gao et~al.(2023)]{gao2023pal}
Gao, L., Madaan, A., Zhou, S., Alon, U., Liu, P., Yang, Y., Callan, J., and Neubig, G.
\newblock PAL: Program-aided language models.
\newblock In \emph{ICML}, pp.\ 10764--10799, 2023.
\bibitem[Hendrycks et~al.(2021)]{hendrycks2021measuring}
Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., and Steinhardt, J.
\newblock Measuring mathematical problem solving with the MATH dataset.
\newblock In \emph{NeurIPS}, 2021.
\bibitem[Huang et~al.(2024)]{huang2024large}
Huang, J., Gu, S.~S., Hou, L., Wu, Y., Wang, X., Yu, H., and Han, J.
\newblock Large language models cannot self-correct reasoning yet.
\newblock In \emph{ICLR}, 2024.
\bibitem[Kojima et~al.(2022)]{kojima2022large}
Kojima, T., Gu, S.~S., Reid, M., Matsuo, Y., and Iwasawa, Y.
\newblock Large language models are zero-shot reasoners.
\newblock In \emph{NeurIPS}, 2022.
\bibitem[Lample et~al.(2022)]{lample2022hypertree}
Lample, G., Lacroix, T., Lachaux, M.-A., Rodriguez, A., Hayat, A., Lavril, T., Ebner, G., and Martinet, X.
\newblock HyperTree proof search for neural theorem proving.
\newblock In \emph{NeurIPS}, 2022.
\bibitem[Lewkowycz et~al.(2022)]{lewkowycz2022solving}
Lewkowycz, A., Andreassen, A., Dohan, D., Dyer, E., Michalewski, H., Ramasesh, V., Slone, A., Anil, C., Schlag, I., Gutman-Solo, T., et~al.
\newblock Solving quantitative reasoning problems with language models.
\newblock In \emph{NeurIPS}, 2022.
\bibitem[Lu et~al.(2024)]{lu2024chameleon}
Lu, P., Peng, B., Cheng, H., Galley, M., Chang, K.-W., Wu, Y.~N., Zhu, S.-C., and Gao, J.
\newblock Chameleon: Plug-and-play compositional reasoning with large language models.
\newblock In \emph{NeurIPS}, 2024.
\bibitem[Madaan et~al.(2024)]{madaan2024self}
Madaan, A., Tandon, N., Gupta, P., Hallinan, S., Gao, L., Wiegreffe, S., Alon, U., Dziri, N., Prabhumoye, S., Yang, Y., et~al.
\newblock Self-refine: Iterative refinement with self-feedback.
\newblock In \emph{NeurIPS}, 2024.
\bibitem[de~Moura and Bj{\o}rner(2008)]{moura2008z3}
de~Moura, L. and Bj{\o}rner, N.
\newblock Z3: An efficient SMT solver.
\newblock In \emph{TACAS}, pp.\ 337--340, 2008.
\bibitem[de~Moura et~al.(2021)]{moura2021lean}
de~Moura, L., Kong, S., Avigad, J., van~Doorn, F., and von~Raumer, J.
\newblock The Lean 4 theorem prover and programming language.
\newblock In \emph{CADE}, pp.\ 625--635, 2021.
\bibitem[Paulson(1994)]{paulson1994isabelle}
Paulson, L.~C.
\newblock \emph{Isabelle: A Generic Theorem Prover}.
\newblock Springer, 1994.
\bibitem[Polu and Sutskever(2020)]{polu2020generative}
Polu, S. and Sutskever, I.
\newblock Generative language modeling for automated theorem proving.
\newblock \emph{arXiv preprint arXiv:2009.03393}, 2020.
\bibitem[Rein et~al.(2024)]{rein2024gpqa}
Rein, D., Hou, B.~L., Stickland, A.~C., Petty, J., Pang, R.~Y., Dirani, J., Michael, J., and Bowman, S.~R.
\newblock GPQA: A graduate-level Google-proof Q\&A benchmark.
\newblock In \emph{ICLR}, 2024.
\bibitem[Saparov and He(2023)]{saparov2023language}
Saparov, A. and He, H.
\newblock Language models are greedy reasoners: A systematic formal analysis of chain-of-thought.
\newblock In \emph{ICLR}, 2023.
\bibitem[Schick et~al.(2024)]{schick2024toolformer}
Schick, T., Dwivedi-Yu, J., Dess{\`i}, R., Raileanu, R., Lomeli, M., Hambro, E., Zettlemoyer, L., Cancedda, N., and Scialom, T.
\newblock Toolformer: Language models can teach themselves to use tools.
\newblock In \emph{NeurIPS}, 2024.
\bibitem[Shinn et~al.(2024)]{shinn2024reflexion}
Shinn, N., Cassano, F., Gopinath, A., Narasimhan, K., and Yao, S.
\newblock Reflexion: Language agents with verbal reinforcement learning.
\newblock In \emph{NeurIPS}, 2024.
\bibitem[Sun et~al.(2024)]{sun2024scieval}
Sun, R., Ren, H., and Liang, P.
\newblock SciEval: A multi-level large language model evaluation benchmark for scientific research.
\newblock In \emph{AAAI}, 2024.
\bibitem[Wang et~al.(2023)]{wang2023self}
Wang, X., Wei, J., Schuurmans, D., Le, Q., Chi, E., Narang, S., Chowdhery, A., and Zhou, D.
\newblock Self-consistency improves chain of thought reasoning in language models.
\newblock In \emph{ICLR}, 2023.
\bibitem[Wei et~al.(2022)]{wei2022chain}
Wei, J., Wang, X., Schuurmans, D., Bosma, M., Ichter, B., Xia, F., Chi, E., Le, Q., and Zhou, D.
\newblock Chain-of-thought prompting elicits reasoning in large language models.
\newblock In \emph{NeurIPS}, 2022.
\bibitem[Yang et~al.(2024)]{yang2024leandojo}
Yang, K., Swope, A., Gu, A., Chalamala, R., Song, P., Yu, S., Godil, S., Prenger, R., and Anandkumar, A.
\newblock LeanDojo: Theorem proving with retrieval-augmented language models.
\newblock In \emph{NeurIPS}, 2024.
\bibitem[Yao et~al.(2024)]{yao2024tree}
Yao, S., Yu, D., Zhao, J., Shafran, I., Griffiths, T., Cao, Y., and Narasimhan, K.
\newblock Tree of thoughts: Deliberate problem solving with large language models.
\newblock In \emph{NeurIPS}, 2024.
\end{thebibliography}
% =============================================================================
\appendix
\section{Autoformalization Examples}
\label{app:autoformalization}
\begin{lstlisting}[caption={Example: informal step to Lean 4 tactic},language={}]
Informal: "Since x^2 - 5x + 6 = 0, we can factor
to get (x-2)(x-3) = 0"
Lean 4:
have h : x^2 - 5*x + 6 = (x - 2) * (x - 3) := by ring
rw [h] at h_eq
\end{lstlisting}
\begin{lstlisting}[caption={Example: numerical verification},language={}]
Informal: "The sum 1/2 + 1/3 + 1/6 = 1"
Verification:
Compute: 1/2 + 1/3 + 1/6 = 3/6 + 2/6 + 1/6 = 6/6 = 1
Status: VERIFIED (exact arithmetic)
\end{lstlisting}
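The numerical check in the listing above relies on exact rational arithmetic. A minimal Python sketch shows the idea using the standard library's \texttt{fractions.Fraction} (the system itself presumably routes such checks through its solver backends; \texttt{Fraction} here only illustrates why exact arithmetic is required):

```python
from fractions import Fraction

# Exact-arithmetic check of the step "1/2 + 1/3 + 1/6 = 1". Fraction
# avoids the float rounding that would make equality tests unreliable
# (e.g., 0.1 + 0.2 != 0.3 in binary floating point).
total = Fraction(1, 2) + Fraction(1, 3) + Fraction(1, 6)
assert total == 1            # VERIFIED: exact rational sum
```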
\section{CompMath Benchmark Details}
\label{app:compmath}
CompMath contains 500 problems: 200 at AMC 10/12 level, 200 at AIME level (answers in both tiers are integers 000--999), and 100 at USA(J)MO level (proof required). Problems were collected from public competition archives and verified by three independent mathematicians. Formal Lean 4 proofs are provided for 380 of the 500 problems.
\section{Computational Cost}
\label{app:cost}
\begin{table}[h]
\centering
\caption{Average computational cost per problem by benchmark.}
\label{tab:cost}
\begin{tabular}{lccccc}
\toprule
\textbf{Benchmark} & \textbf{Steps} & \textbf{Verif. Calls} & \textbf{Tool Calls} & \textbf{Time (s)} & \textbf{Tokens} \\
\midrule
GSM8K & 6.2 & 4.8 & 1.2 & 8.4 & 1,240 \\
MATH & 12.8 & 9.4 & 2.8 & 28.3 & 3,820 \\
GPQA & 8.4 & 5.2 & 3.1 & 22.1 & 4,560 \\
ARC-AGI & 14.2 & 8.1 & 5.4 & 34.8 & 2,180 \\
CompMath & 18.6 & 14.2 & 4.2 & 52.4 & 6,240 \\
\bottomrule
\end{tabular}
\end{table}
The verification overhead averages 2--5$\times$ compared to standard CoT generation, but this is offset by the accuracy improvement and the value of formal correctness certificates.
\end{document}