From b055180b1ff8739e5374a6c236f92a4758b767f7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shinaoka Date: Fri, 13 Mar 2026 11:03:46 +0900 Subject: [PATCH] docs: tighten mathematical AD notes --- docs/math/cholesky.md | 8 +--- docs/math/det.md | 26 +++++------ docs/math/eig.md | 19 ++------ docs/math/eigen.md | 9 ---- docs/math/index.md | 5 +-- docs/math/inv.md | 13 +----- docs/math/lstsq.md | 11 ----- docs/math/lu.md | 12 ++--- docs/math/matrix_exp.md | 12 ----- docs/math/norm.md | 10 ----- docs/math/pinv.md | 11 +---- docs/math/qr.md | 27 +++--------- docs/math/scalar_ops.md | 88 ++++++++++++++++++++++++------------- docs/math/solve.md | 18 -------- docs/math/svd.md | 38 +++++----------- tests/test_math_registry.py | 20 ++++++--- 16 files changed, 115 insertions(+), 212 deletions(-) diff --git a/docs/math/cholesky.md b/docs/math/cholesky.md index cd0e300..a34a1c5 100644 --- a/docs/math/cholesky.md +++ b/docs/math/cholesky.md @@ -65,13 +65,7 @@ $$ This is the adjoint of the JVP map and keeps $\bar{A}$ Hermitian. -## Implementation Correspondence - -- `tenferro-rs/docs/AD/cholesky.md` uses the same $\varphi / \varphi^*$ pair to - express both JVP and VJP. -- PyTorch's `cholesky_jvp` and `cholesky_backward` implement the same - triangular-solve sandwich rather than explicit inverses. -- Never form $L^{-1}$ explicitly; use triangular solves on the left and right. +Never form $L^{-1}$ explicitly; use triangular solves on the left and right. ## Verification diff --git a/docs/math/det.md b/docs/math/det.md index 2c88b0a..ee70414 100644 --- a/docs/math/det.md +++ b/docs/math/det.md @@ -31,7 +31,7 @@ $$ - complex case: $$ -\bar{A} = \overline{\bar{d} \cdot \det(A)} \cdot A^{-\mathsf{H}}. +\bar{A} = \bar{d} \cdot \overline{\det(A)} \cdot A^{-\mathsf{H}}. 
$$ ## Singular matrix handling @@ -42,9 +42,9 @@ still makes sense: - rank $N-1$: the adjugate is rank 1 and can be reconstructed from an SVD - rank $\le N-2$: the adjugate vanishes -PyTorch's `linalg_det_backward` handles this regime by reconstructing the -leave-one-out singular-value products together with the orientation/phase factor -coming from $U$ and $V^{\mathsf{H}}$. +The rank-$N-1$ adjugate can be reconstructed from the leave-one-out singular +value products together with the orientation/phase factor carried by the +singular vectors. ## 2. `slogdet` @@ -64,12 +64,13 @@ $$ ### Reverse Rule -For the differentiable log-magnitude path: +Given cotangents $\bar{s}$ for the sign output and $\bar{\ell}$ for the +log-magnitude output: - real case: $$ -\bar{A} = \overline{\operatorname{logabsdet}} \cdot A^{-\mathsf{T}} +\bar{A} = \bar{\ell} \cdot A^{-\mathsf{T}} $$ - complex case: @@ -77,21 +78,14 @@ $$ $$ \bar{A} = g \cdot A^{-\mathsf{H}}, \qquad -g = \overline{\operatorname{logabsdet}} -- i \operatorname{Im}(\overline{\operatorname{sign}}^* \operatorname{sign}). +g = \bar{\ell} - i \operatorname{Im}(\bar{s}^* s), $$ +where $s = \operatorname{sign}(\det(A))$. + `slogdet` is not differentiable at singular matrices because $\operatorname{logabsdet} = -\infty$ there. -## Implementation Correspondence - -- `tenferro-rs/docs/AD/det.md` keeps both `det` and `slogdet` in one note and - discusses the singular adjugate path explicitly. -- PyTorch's `linalg_det_jvp`, `linalg_det_backward`, `slogdet_jvp`, and - `slogdet_backward` implement the same split and use solves rather than - explicit inverses. - ## Verification - compare primal `det(A)` and `slogdet(A)` with direct evaluation diff --git a/docs/math/eig.md b/docs/math/eig.md index a42485c..216b43b 100644 --- a/docs/math/eig.md +++ b/docs/math/eig.md @@ -59,8 +59,8 @@ $$ ### Normalization correction -PyTorch and `tenferro-rs` both normalize eigenvectors to unit norm. 
Therefore -the raw tangent must be projected back onto that gauge: +If eigenvectors are normalized to unit norm, the raw tangent must be projected +back onto that gauge: $$ \dot{V} = @@ -114,8 +114,8 @@ $$ \operatorname{Im}(\operatorname{diag}(V^\dagger \bar{V})) = 0. $$ -PyTorch's `linalg_eig_backward` checks this condition numerically and raises for -ill-defined losses. +Losses that violate this condition are ill-defined for derivatives through the +eigenvector phase gauge. ## Relationship to the Hermitian Case @@ -123,17 +123,6 @@ When $A$ is Hermitian, $V$ is unitary, $V^{-1} = V^\dagger$, and eigenvalues are real. The formulas simplify to the structured rule documented in [`eigen.md`](./eigen.md). -## Implementation Correspondence - -- `tenferro-rs/docs/AD/eig.md` uses the $V^{-1}\dot{A}V$ and - $V^{-\dagger} G V^\dagger$ formulation with an explicit normalization - correction. -- PyTorch's `linalg_eig_jvp` and `linalg_eig_backward` implement the same rule. - Their comments explicitly note that the uncorrected textbook formulas are - missing the normalization term. -- For real inputs with complex outputs, PyTorch applies the usual - `handle_r_to_c` projection back to the real cotangent domain. - ## Verification ### Forward reconstruction diff --git a/docs/math/eigen.md b/docs/math/eigen.md index 00b8a9b..661f7b5 100644 --- a/docs/math/eigen.md +++ b/docs/math/eigen.md @@ -96,15 +96,6 @@ $$ with the understanding that the skew-Hermitian gauge is projected away. -## Implementation Correspondence - -- `tenferro-rs/docs/AD/eigen.md` writes the reverse rule through the explicit - Hermitian inner matrix $D$; this note keeps that structure. -- PyTorch does not have a separate Hermitian kernel. It calls - `linalg_eig_backward(..., is_hermitian=true)` and - `linalg_eig_jvp(..., is_hermitian=true)`, which reduce to the same formulas - with $V^{-1} = V^\dagger$. 
- ## Verification ### Forward reconstruction diff --git a/docs/math/index.md b/docs/math/index.md index 523e1da..a9557e2 100644 --- a/docs/math/index.md +++ b/docs/math/index.md @@ -6,9 +6,8 @@ - the machine-readable oracle database The mathematical notes under `docs/math/` are the human-facing source of truth -for known AD rules in this repository. They are maintained to preserve the full -derivation detail migrated from `tenferro-rs/docs/AD/` while adding explicit -correspondence to PyTorch's manual autograd formulas where relevant. +for known AD rules in this repository. They are maintained to preserve full +derivation detail without collapsing the rules into implementation summaries. Standalone linalg operations are documented as one note per operation, while shared scalar and wrapper formulas are grouped where that keeps the corpus diff --git a/docs/math/inv.md b/docs/math/inv.md index f42ca5d..8cd855d 100644 --- a/docs/math/inv.md +++ b/docs/math/inv.md @@ -40,17 +40,8 @@ immediately recovers - JVP: $\dot{B} = -B\,\dot{A}\,B$ - VJP: $\bar{A} = -B^{\mathsf{H}}\,\bar{B}\,B^{\mathsf{H}}$ -The same relationship is used in PyTorch and downstream libraries to avoid -duplicating logic. - -## Implementation Correspondence - -- `tenferro-rs/docs/AD/inv.md` writes the inverse rule directly and then points - back to solve as the conceptual source. -- PyTorch exposes the inverse derivative via solve-style formulas in - `derivatives.yaml` and related linear-solve kernels. -- For higher-order AD, prefer `solve` over explicit multiplication by a cached - inverse. +For higher-order AD, it is often more stable to treat the inverse as an +implicit linear solve rather than as a primitive cached matrix product. ## Verification diff --git a/docs/math/lstsq.md b/docs/math/lstsq.md index 5f96e5a..b41e5cc 100644 --- a/docs/math/lstsq.md +++ b/docs/math/lstsq.md @@ -117,17 +117,6 @@ $$ Since $A z = Q y$, the formulas for $\bar{b}$ and $\bar{A}$ follow. 
-## Implementation Correspondence - -- `tenferro-rs/docs/AD/lstsq.md` uses the QR-based derivation above, which makes - the residual correction term explicit. -- PyTorch's `linalg_lstsq_solution_jvp` and `linalg_lstsq_backward` currently - route the solution term through `pinv_jvp` / `pinv_backward`, while the - residual term is added directly. The resulting adjoint matches the same - least-squares geometry. -- The residual JVP in PyTorch uses Danskin's theorem, treating the minimizer as - fixed when differentiating the residual objective itself. - ## Verification ### Forward check diff --git a/docs/math/lu.md b/docs/math/lu.md index f23e14e..4948664 100644 --- a/docs/math/lu.md +++ b/docs/math/lu.md @@ -201,15 +201,9 @@ P^T U^{-\dagger}. $$ -## Implementation Correspondence - -- `tenferro-rs/docs/AD/lu.md` writes the rule in exactly this block-structured - way, with separate square, wide, and tall cases. -- PyTorch's `linalg_lu_backward` and `linalg_lu_jvp` implement the same three - cases using `tril(-1)` / `triu()` projections and triangular solves rather - than explicit inverses. -- All $L^{-1} X$ and $X U^{-1}$ operations should be implemented as triangular - solves. +All appearances of $L^{-1}X$, $XU^{-1}$, $L^{-\dagger}X$, and $XU^{-\dagger}$ +should be interpreted as triangular solves rather than as explicit inverse +formation. ## Verification diff --git a/docs/math/matrix_exp.md b/docs/math/matrix_exp.md index f229a49..ac88087 100644 --- a/docs/math/matrix_exp.md +++ b/docs/math/matrix_exp.md @@ -58,9 +58,6 @@ f\!\begin{pmatrix} A & E \\ 0 & A \end{pmatrix} = \begin{pmatrix} f(A) & L_f(A, E) \\ 0 & f(A) \end{pmatrix}. $$ -PyTorch factors this pattern through the helper -`differential_analytic_matrix_function`. 
- ## Computational cost | Method | Cost relative to $\exp(A)$ | @@ -69,15 +66,6 @@ PyTorch factors this pattern through the helper | Dedicated Fr\'echet scaling-and-squaring | about $3\times$ | | Eigendecomposition shortcut | cheaper on paper, but unstable for non-normal $A$ | -## Implementation Correspondence - -- `tenferro-rs/docs/AD/matrix_exp.md` uses the block-exponential construction - as the main derivation. -- PyTorch's `differential_analytic_matrix_function` and - `linalg_matrix_exp_differential` implement the same Mathias 1996 identity. -- The block matrix approach is simple but more expensive than a dedicated - scaling-and-squaring Fr\'echet implementation. - ## Verification - compare the block-matrix Fr\'echet derivative against finite differences diff --git a/docs/math/norm.md b/docs/math/norm.md index 7ea4f19..119dcc7 100644 --- a/docs/math/norm.md +++ b/docs/math/norm.md @@ -95,16 +95,6 @@ $$ For multiplicity $k > 1$, the subgradient is the average over the active singular-vector dyads. -## Implementation Correspondence - -- `tenferro-rs/docs/AD/norm.md` separates vector norms, Frobenius norm, nuclear - norm, and spectral norm explicitly. This note preserves that structure. -- PyTorch's `norm_backward` and `norm_jvp` implement the scalar/vector $p$-norm - cases directly, including the tie-handling for $p = \infty$. -- `linalg_vector_norm_backward` is a thin wrapper around the same formulas. -- Matrix nuclear and spectral norms are implemented in PyTorch by decomposition - into SVD-derived primitives rather than a dedicated manual formula. - ## Numerical Notes - Nonsmooth points, especially zero inputs and repeated top singular values, diff --git a/docs/math/pinv.md b/docs/math/pinv.md index 7cb61bb..4ed6019 100644 --- a/docs/math/pinv.md +++ b/docs/math/pinv.md @@ -52,15 +52,8 @@ $$ This is the adjoint counterpart of the same three-term structure. 
-## Implementation Correspondence - -- `tenferro-rs/docs/AD/pinv.md` follows the classical Golub-Pereyra formulas and - makes the projector interpretation explicit. -- PyTorch's `pinv_jvp` and `pinv_backward` implement algebraically equivalent - forms but branch on $M \leq N$ versus $M > N$ to reduce intermediate matrix - sizes. -- The `atol` / `rtol` thresholding used to define the primal pseudoinverse is - treated as fixed metadata, not as a differentiable branch. +The `atol` / `rtol` thresholding used to define the primal pseudoinverse is +treated as fixed metadata, not as a differentiable branch. ## Verification diff --git a/docs/math/qr.md b/docs/math/qr.md index 233efa8..550999f 100644 --- a/docs/math/qr.md +++ b/docs/math/qr.md @@ -1,8 +1,7 @@ # QR AD Notes -This note covers the reduced QR rule that is materialized in the DB and keeps -the transpose-dual LQ formulas from `tenferro-rs/docs/AD/qr.md` so that no -derivation detail is lost in the migration. +This note covers the reduced QR rule materialized in the DB together with the +transpose-dual LQ formulas. ## QR Forward Definition @@ -56,8 +55,7 @@ $$ \end{cases} $$ -This is the adjoint helper appearing in PyTorch's `linalg_qr_backward` for the -$M < N$ case. +This is the adjoint helper for the $M < N$ case. ## Reverse Rule @@ -92,8 +90,7 @@ $$ \bar{A} = B R^{-\dagger}. $$ -Implementation-wise this is a right solve with $R^\dagger$. PyTorch expresses -the same step as +This is a right solve with $R^\dagger$. An equivalent form is $$ \bar{A} = @@ -143,7 +140,7 @@ $$ \bar{A} = \pi^\*(\bar{A}_{\mathrm{lead}}) + Q \bar{R}. $$ -PyTorch's `linalg_qr_backward` implements the same case as +Equivalently, $$ \bar{A} = Q \bar{R} + \pi^\*\!\left( @@ -153,8 +150,6 @@ $$ ## Forward Rule -PyTorch's `linalg_qr_jvp` uses the same case split. 
- ### Case $M \geq N$ Define $\operatorname{sym}(X) = X + X^\dagger$ and @@ -211,8 +206,7 @@ $$ ## LQ Reverse Rule -The transpose-dual LQ formulas are retained here because the original -`tenferro-rs` note grouped QR and LQ together. +The transpose-dual LQ formulas are included for completeness. ### LQ Forward Definition @@ -319,15 +313,6 @@ $$ with random Hermitian operators independent of $A$. -## Implementation Correspondence - -- `tenferro-rs/docs/AD/qr.md` writes the rule in terms of `copyltu`, - `trilImInvAdjSkew`, and the QR/LQ duality. This note keeps those helpers. -- PyTorch's `linalg_qr_backward` uses the same two reduced-QR cases: - full-rank via `syminvadj(... ) R^{-H}` and wide reduced QR via the - `pi*`-embedded `trilImInvAdjSkew` formula. -- PyTorch's `linalg_qr_jvp` mirrors the same case split in forward mode. - ## References 1. M. Seeger, A. Hetzel, Z. Dai, E. Meissner, N. D. Lawrence, diff --git a/docs/math/scalar_ops.md b/docs/math/scalar_ops.md index 9e13e3b..0c5d6ca 100644 --- a/docs/math/scalar_ops.md +++ b/docs/math/scalar_ops.md @@ -2,20 +2,8 @@ ## Scope -This note records the shared scalar AD formulas implemented in -`chainrules-scalarops` together with the tensor-level wrappers used by -`tenferro-dyadtensor`. - -## PyTorch Baseline - -The local comparison baseline is PyTorch's manual autograd formulas: - -- `tools/autograd/derivatives.yaml` -- `torch/csrc/autograd/FunctionsManual.cpp` -- `docs/source/notes/autograd.rst` - -In particular, the scalar wrappers here follow the same `handle_r_to_c` -real-input projection convention used by PyTorch. +This note records shared scalar AD formulas together with the tensor-level +wrappers built from them. 
## Complex Gradient Convention @@ -24,7 +12,6 @@ For real-valued losses: - gradients follow the conjugate-Wirtinger convention - VJP formulas include complex conjugation where required - real inputs project complex intermediates back to the real domain - (`handle_r_to_c`) ## Scalar Basis Rules @@ -33,10 +20,13 @@ primal output. ### Core arithmetic -- `add`: $(dx_1, dx_2) = (g, g)$ -- `sub`: $(dx_1, dx_2) = (g, -g)$ +- `add`: for `x_1 + \alpha x_2`, $(dx_1, dx_2) = (g, \overline{\alpha}\, g)$ +- `sub`: for `x_1 - \alpha x_2`, $(dx_1, dx_2) = (g, -\overline{\alpha}\, g)$ - `mul`: $(dx_1, dx_2) = (g \cdot \overline{x_2}, g \cdot \overline{x_1})$ -- `div`: quotient rule with conjugated denominator factors +- `div`: + - numerator path: $dx_1 = g / \overline{x_2}$ + - denominator path: $dx_2 = -g \cdot \overline{x_1 / x_2^2}$ + - integer-style rounding modes are treated as nondifferentiable branches ### Analytic unary wrappers @@ -44,16 +34,18 @@ primal output. - `sqrt`: $dx = g / (2 \overline{\sqrt{x}})$ - `exp`: $dx = g \cdot \overline{y}$ - `log`: $dx = g / \overline{x}$ -- `expm1`: derivative factor `exp(x)` -- `log1p`: derivative factor `1 / (1 + x)` -- `sin`: derivative factor `cos(x)` -- `cos`: derivative factor `-sin(x)` -- `tanh`: derivative factor `1 - y^2` +- `expm1`: $dx = g \cdot \overline{\exp(x)}$ +- `log1p`: $dx = g / \overline{(1 + x)}$ +- `sin`: $dx = g \cdot \overline{\cos(x)}$ +- `cos`: $dx = -g \cdot \overline{\sin(x)}$ +- `tanh`: $dx = g \cdot \overline{(1 - y^2)}$ ### Parameterized wrappers -- `atan2`: standard real partials over $a^2 + b^2$ -- `powf`: fixed scalar-exponent rule +- `atan2`: for real inputs $(a, b)$, + $da = g \, b / (a^2 + b^2)$ and $db = -g \, a / (a^2 + b^2)$, with the + zero-denominator singularity masked by the implementation convention +- `powf`: for fixed exponent $p$, $dx = g \cdot \overline{(p x^{p-1})}$ - `powi`: integer-exponent specialization of `powf` - `pow`: - base path: $dx = g \cdot \overline{a x^{a-1}}$ @@ -72,20 
+64,56 @@ Tensor-level wrappers built on top of the scalar basis include: ### `sum_ad` -Every element receives the same cotangent. +For a reduction over index set $\mathcal{I}$, + +$$ +y = \sum_{i \in \mathcal{I}} x_i +\quad \Longrightarrow \quad +\bar{x}_i = \bar{y} +$$ + +for every reduced element, with the cotangent broadcast back to the input +shape. ### `mean_ad` -Every element receives the cotangent divided by the number of reduced entries. +If $n$ entries are reduced, + +$$ +y = \frac{1}{n} \sum_{i \in \mathcal{I}} x_i +\quad \Longrightarrow \quad +\bar{x}_i = \frac{\bar{y}}{n}. +$$ ### `var_ad` -Differentiate through the centered residual -$x - \operatorname{mean}(x)$. +Let $\mu = \operatorname{mean}(x)$ over the reduced axes and let `correction` +denote the Bessel-style offset used by the variance operator. Then + +$$ +\operatorname{var}(x) = \frac{1}{n - \mathrm{correction}} \sum_i |x_i - \mu|^2, +$$ + +so away from the singular degrees-of-freedom boundary, + +$$ +\bar{x} += \frac{2}{n - \mathrm{correction}} \, \bar{v} \, (x - \mu). +$$ + +At $n - \mathrm{correction} \le 0$, the operator is singular and the derivative +inherits the same NaN / infinity boundary behavior as the primal convention. ### `std_ad` -Combine the variance rule with the derivative of `sqrt`. +For $\sigma = \sqrt{v}$ with $v = \operatorname{var}(x)$, + +$$ +\bar{v} = \frac{\bar{\sigma}}{2 \sigma}, +$$ + +masked at $\sigma = 0$, and then the variance rule is applied to propagate back +to $x$. ## Published DB Families Using This Note diff --git a/docs/math/solve.md b/docs/math/solve.md index 7f91a1d..9ac9859 100644 --- a/docs/math/solve.md +++ b/docs/math/solve.md @@ -32,8 +32,6 @@ $$ \dot{X} = A^{-1}(\dot{B} - \dot{A} X). $$ -This is exactly the JVP implemented by PyTorch's `linalg_solve_jvp`. - ## Reverse Rule Given a cotangent $\bar{X}$: @@ -59,8 +57,6 @@ $$ \bar{A} = -G X^{\mathsf{H}}. $$ -This is the same adjoint implemented by PyTorch's `linalg_solve_backward`. 
- ## Triangular Solve When $A$ is triangular, the same formulas apply with triangular solves replacing @@ -79,8 +75,6 @@ $$ $$ For unit-triangular matrices, the diagonal of $\bar{A}$ is additionally zeroed. -This matches PyTorch's `triangular_solve_jvp` and -`linalg_solve_triangular_backward`. ## Right-side solve @@ -106,15 +100,6 @@ $$ - `tensorsolve` is the indexed tensor analogue of the same implicit-system rule. -## Implementation Correspondence - -- `tenferro-rs/docs/AD/solve.md` writes both the left/right solve identities and - the triangular projection rules explicitly. -- PyTorch's `linalg_solve_jvp` and `linalg_solve_backward` implement the same - two equations $dX = A^{-1}(dB - dA X)$ and $gA = -gB X^H$. -- Higher-order AD should solve against $A^\dagger$ directly rather than expose - saved LU factors as differentiable objects. - ## Verification ### Forward residual @@ -134,9 +119,6 @@ $$ 1. M. B. Giles, "An extended collection of matrix derivative results for forward and reverse mode AD," 2008. -2. PyTorch `FunctionsManual.cpp`: `linalg_solve_jvp`, - `linalg_solve_backward`, `triangular_solve_jvp`, - `linalg_solve_triangular_backward`. ## DB Families diff --git a/docs/math/svd.md b/docs/math/svd.md index 6b4c4fd..84bb915 100644 --- a/docs/math/svd.md +++ b/docs/math/svd.md @@ -14,10 +14,9 @@ the thin SVD uses - $\Sigma = \operatorname{diag}(\sigma_1, \ldots, \sigma_K)$ with $\sigma_i > 0$ - $V \in \mathbb{C}^{N \times K}$ with $V^\dagger V = I_K$ -PyTorch's `_linalg_svd` may be called with `full_matrices=True`, but its AD -formulas narrow back to the leading $K$ singular vectors before applying the -differential rules. The thin factors are therefore the mathematical source of -truth for the note and for the oracle DB. +If a decomposition is returned with full orthonormal factors, the AD rules +still depend only on the leading thin factors. The thin SVD is therefore the +mathematical source of truth for this note and for the oracle DB. 
## Reverse Rule @@ -57,8 +56,8 @@ $$ S_{\text{inv},i} = \frac{\sigma_i}{\sigma_i^2 + \eta} \approx \frac{1}{\sigma_i}. $$ -PyTorch writes the formulas using $E$, while `tenferro-rs` writes them using -$F$. They are the same off the diagonal. +The matrices $E$ and $F$ encode the same off-diagonal inverse-gap +information. ### Step 2: Inner matrix split @@ -97,9 +96,7 @@ $$ \Gamma_{\bar{V}} = \Sigma (K + K^\dagger). $$ -This is the right-singular-vector analogue of the $\bar{U}$ path. PyTorch's -`svd_backward` combines the same information through -$S ((V^\dagger \bar{V}) / E)$ inside its skew formulation. +This is the right-singular-vector analogue of the $\bar{U}$ path. #### From $\bar{S}$ @@ -117,7 +114,7 @@ $$ \bar{A}_{\text{core}} = U \Gamma V^\dagger. $$ -Equivalently, PyTorch writes the same expression as +Equivalently, the same expression can be written as $$ \bar{A}_{\text{core}} = @@ -172,9 +169,9 @@ $$ \operatorname{Im}(\operatorname{diag}(U^\dagger \bar{U} + V^\dagger \bar{V})) = 0. $$ -PyTorch's `svd_backward` checks this numerically and raises an error when the -loss depends on the singular-vector phase. The DB's `gauge_ill_defined` family -records those expected failures. +Losses that violate this condition are ill-defined for derivatives through the +singular-vector phase gauge. The DB's `gauge_ill_defined` family records those +expected failures. ## Forward Rule @@ -223,9 +220,8 @@ $$ dV \mathrel{+}= (I_N - V V^\dagger)(dA)^\dagger U \operatorname{diag}(S_{\text{inv}}). $$ -This is the form implemented in PyTorch's `linalg_svd_jvp`, up to the -convention that PyTorch returns `Vh = V^\dagger` and thus reports -$dVh = (dV)^\dagger$ directly. +Equivalent formulations may return $V^\dagger$ instead of $V$ and therefore +report $d(V^\dagger) = (dV)^\dagger$ directly. ## Numerical and Domain Notes @@ -262,16 +258,6 @@ Representative scalar test functions: where $H$ is a random Hermitian matrix independent of $A$. 
-## Implementation Correspondence - -- `tenferro-rs/docs/AD/svd.md` writes the reverse rule by splitting - $\Gamma_{\bar{U}}$, $\Gamma_{\bar{V}}$, and $\Gamma_{\bar{S}}`, and by making - the `F` and `S_inv` helpers explicit. This note keeps that structure. -- PyTorch's `svd_backward` uses the equivalent $E$-matrix formulation together - with skew/sym operators and an explicit gauge check in the complex case. -- PyTorch's `linalg_svd_jvp` uses the same thin-factor formulas and only pads - zeros back out when it must return `full_matrices=True` shaped tangents. - ## References 1. J. Townsend, "Differentiating the Singular Value Decomposition," 2016. diff --git a/tests/test_math_registry.py b/tests/test_math_registry.py index a5bea6d..24a38a7 100644 --- a/tests/test_math_registry.py +++ b/tests/test_math_registry.py @@ -299,14 +299,15 @@ def test_repo_det_note_retains_slogdet_and_singular_handling(self) -> None: self.assertIn("slogdet", text) self.assertIn("orientation/phase factor", text) - def test_repo_matrix_exp_note_retains_block_matrix_and_pytorch_mapping(self) -> None: + def test_repo_matrix_exp_note_retains_block_matrix_and_generality(self) -> None: text = ( Path(__file__).resolve().parents[1] / "docs" / "math" / "matrix_exp.md" ).read_text(encoding="utf-8") self.assertIn("Mathias 1996", text) self.assertIn("2N \\times 2N", text) - self.assertIn("differential_analytic_matrix_function", text) + self.assertIn("The same block-matrix technique works for any analytic matrix function", text) + self.assertIn("L_f(A, E)", text) self.assertIn("Computational cost", text) def test_repo_dyadtensor_reverse_note_retains_pullback_bridge_details(self) -> None: @@ -322,20 +323,29 @@ def test_repo_dyadtensor_reverse_note_retains_pullback_bridge_details(self) -> N self.assertIn("eig_ad(...).run()", text) self.assertIn("register_bridge_rule", text) - def test_repo_scalar_ops_note_retains_pytorch_baseline_and_reduction_wrappers(self) -> None: + def 
test_repo_scalar_ops_note_retains_complex_convention_and_reduction_wrappers(self) -> None: text = ( Path(__file__).resolve().parents[1] / "docs" / "math" / "scalar_ops.md" ).read_text(encoding="utf-8") - self.assertIn("PyTorch Baseline", text) - self.assertIn("handle_r_to_c", text) + self.assertIn("Complex Gradient Convention", text) + self.assertIn("conjugate-Wirtinger", text) + self.assertIn("real inputs project complex intermediates back to the real domain", text) self.assertIn("mean_ad", text) self.assertIn("var_ad", text) self.assertIn("std_ad", text) + self.assertIn("correction", text) + self.assertIn("atan2", text) self.assertIn("powf", text) self.assertIn("powi", text) self.assertIn("Tensor-Composite Rules", text) + def test_repo_math_notes_omit_implementation_correspondence_sections(self) -> None: + note_dir = Path(__file__).resolve().parents[1] / "docs" / "math" + for note_path in note_dir.glob("*.md"): + text = note_path.read_text(encoding="utf-8") + self.assertNotIn("## Implementation Correspondence", text, note_path.name) + def test_repo_eig_and_eigen_notes_are_distinct(self) -> None: note_dir = Path(__file__).resolve().parents[1] / "docs" / "math" eig_text = (note_dir / "eig.md").read_text(encoding="utf-8")