From e9016797c77b6b1d998cf8d9df8f91f40c4a1650 Mon Sep 17 00:00:00 2001
From: Vincent Neiger <vneiger@users.noreply.github.com>
Date: Mon, 1 Dec 2025 22:44:07 +0100
Subject: [PATCH 1/8] nmod poly mat: mul_geometric, working version

---
 flint-extras/src/nmod_extra.h                 |   2 +-
 flint-extras/src/nmod_extra/nmod_find_root.c  |  10 +-
 .../nmod_poly_mat_middle_product_geometric.c  |   4 +-
 ...tric.bak => nmod_poly_mat_mul_geometric.c} | 111 +++++-------------
 .../src/nmod_poly_mat_extra/test/main.c       |   8 +-
 .../test/t-mul_geometric.c                    |  88 ++++++++++++++
 .../test/test_mul_geometric.bak               |  67 -----------
 flint-extras/src/nmod_poly_mat_multiply.h     |   1 -
 8 files changed, 132 insertions(+), 159 deletions(-)
 rename flint-extras/src/nmod_poly_mat_extra/{nmod_poly_mat_mul_geometric.bak => nmod_poly_mat_mul_geometric.c} (56%)
 create mode 100644 flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c
 delete mode 100644 flint-extras/src/nmod_poly_mat_extra/test/test_mul_geometric.bak

diff --git a/flint-extras/src/nmod_extra.h b/flint-extras/src/nmod_extra.h
index 34432a4d..b5103230 100644
--- a/flint-extras/src/nmod_extra.h
+++ b/flint-extras/src/nmod_extra.h
@@ -43,7 +43,7 @@ ulong inverse_mod_power_of_two(ulong p, int k);
 /* finds an element of order at least n                       */
 /* returns 0 if not found                                     */
 /*------------------------------------------------------------*/
-ulong nmod_find_root(long n, nmod_t mod);
+ulong nmod_find_root(ulong n, nmod_t mod);
 
 
 /*------------------------------------------------------------*/
diff --git a/flint-extras/src/nmod_extra/nmod_find_root.c b/flint-extras/src/nmod_extra/nmod_find_root.c
index 2f704f11..4036c58f 100644
--- a/flint-extras/src/nmod_extra/nmod_find_root.c
+++ b/flint-extras/src/nmod_extra/nmod_find_root.c
@@ -18,13 +18,13 @@
 /* finds an element of order at least n                       */
 /* returns 0 if not found                                     */
 /*------------------------------------------------------------*/
-ulong nmod_find_root(slong n, nmod_t mod)
+ulong nmod_find_root(ulong n, nmod_t mod)
 {
-    slong q;
-    for(q = 2; q < (slong) mod.n; q++)
+    ulong q;
+    for(q = 2; q < mod.n; q++)
     {
-        slong k = 1;
-        slong qk = q;
+        ulong k = 1;
+        ulong qk = q;
         while (qk != 1 && k < n)
         {
             qk = nmod_mul(qk, q, mod);
diff --git a/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_middle_product_geometric.c b/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_middle_product_geometric.c
index c285ac52..a9353ece 100644
--- a/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_middle_product_geometric.c
+++ b/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_middle_product_geometric.c
@@ -76,7 +76,7 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
 
 #ifdef DIRTY_ALLOC_MATRIX
     // we alloc the memory for all matrices at once
-    nn_ptr tmp = (nn_ptr) malloc((m*k + k*n + m*n) * ellC * sizeof(ulong));
+    nn_ptr tmp = (nn_ptr) flint_malloc((m*k + k*n + m*n) * ellC * sizeof(ulong));
     nn_ptr bak;
 
     bak = tmp;
@@ -185,7 +185,7 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
 
 
 #ifdef DIRTY_ALLOC_MATRIX
-    free(tmp);
+    flint_free(tmp);
 #else
     for (i = 0; i < ellC; i++)
     {
diff --git a/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_mul_geometric.bak b/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_mul_geometric.c
similarity index 56%
rename from flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_mul_geometric.bak
rename to flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_mul_geometric.c
index be028858..42c00c15 100644
--- a/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_mul_geometric.bak
+++ b/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_mul_geometric.c
@@ -1,25 +1,23 @@
-#include <stdlib.h>
 #include <flint/nmod.h>
 #include <flint/nmod_vec.h>
 #include <flint/nmod_mat.h>
+#include <flint/nmod_poly.h>
 #include <flint/nmod_poly_mat.h>
 
-#include "nmod_extra.h" // for nmod_mat_mul_pml
-#include "nmod_mat_extra.h" // for nmod_mat_mul_pml
-#include "nmod_poly_extra.h" // for geom progression
+#include "nmod_extra.h" // for nmod_find_root
 
 #include "nmod_poly_mat_multiply.h"
 
 /** Multiplication for polynomial matrices
  *  sets C = A * B
  *  output can alias input
- *  ASSUME: existence of primitive root (assumption not checked)
- *  uses geometric evaluation and interpolation
+ *  ASSUMPTION (not checked): existence of element of "large enough" order
+ *  uses evaluation and interpolation at a geometric progression
  */
 void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, const nmod_poly_mat_t B)
 {
     nmod_mat_t *mod_A, *mod_B, *mod_C;
-    ulong ellA, ellB, ellC, order;
+    ulong ellA, ellB, ellC;
     ulong i, j, ell, m, k, n;
     ulong p, w;
     nn_ptr val;
@@ -57,61 +55,21 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
         return;
     }
 
-
     ellC = ellA + ellB - 1;  // length(C) = length(A) + length(B) - 1
-    order = ellC;
     nmod_init(&mod, p);
-    w = nmod_find_root(order, mod);
-    nmod_geometric_progression_init_set(F, w, order, mod);
+    w = nmod_find_root(2*ellC, mod);
+    nmod_geometric_progression_init(F, w, ellC, mod);
 
     mod_A = FLINT_ARRAY_ALLOC(ellC, nmod_mat_t);
     mod_B = FLINT_ARRAY_ALLOC(ellC, nmod_mat_t);
     mod_C = FLINT_ARRAY_ALLOC(ellC, nmod_mat_t);
     val = _nmod_vec_init(ellC);
 
-
 #ifdef DIRTY_ALLOC_MATRIX
     // we alloc the memory for all matrices at once
-    nn_ptr tmp = (nn_ptr) malloc((m*k + k*n + m*n) * ellC * sizeof(ulong));
-    nn_ptr bak;
-#if __FLINT_VERSION < 3 || (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR < 3)
-    nn_ptr *tmp_rows = (nn_ptr *) malloc((m + k + m) * ellC * sizeof(nn_ptr));
-    nn_ptr *bak_rows;
-#endif
-
-#if __FLINT_VERSION < 3 || (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR < 3)
-    bak_rows = tmp_rows;
-    j = 0;
-    for (i = 0; i < m*ellC; i++)
-    {
-        tmp_rows[i] = tmp + j;
-        j += k;
-    }
-    tmp_rows += m*ellC;
-
-    for (i = 0; i < k*ellC; i++)
-    {
-        tmp_rows[i] = tmp + j;
-        j += n;
-    }
-    tmp_rows += k*ellC;
-
-    for (i = 0; i < m*ellC; i++)
-    {
-        tmp_rows[i] = tmp + j;
-        j += n;
-    }
-    tmp_rows = bak_rows;
-
-    bak_rows = tmp_rows;
-#endif
-
-    bak = tmp;
+    nn_ptr tmp = (nn_ptr) flint_malloc((m*k + k*n + m*n) * ellC * sizeof(ulong));
     for (i = 0; i < ellC; i++)
     {
-#if __FLINT_VERSION < 3 || (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR < 3)
-        mod_A[i]->rows = tmp_rows + i*m;
-#endif
         mod_A[i]->entries = tmp + i*m*k;
         mod_A[i]->r = m;
         mod_A[i]->c = k;
@@ -119,16 +77,10 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
         mod_A[i]->mod.norm = mod.norm;
         mod_A[i]->mod.ninv = mod.ninv;
     }
-#if __FLINT_VERSION < 3 || (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR < 3)
-    tmp_rows += ellC*m;
-#endif
     tmp += ellC*m*k;
 
     for (i = 0; i < ellC; i++)
     {
-#if __FLINT_VERSION < 3 || (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR < 3)
-        mod_B[i]->rows = tmp_rows + i*k;
-#endif
         mod_B[i]->entries = tmp + i*k*n;
         mod_B[i]->r = k;
         mod_B[i]->c = n;
@@ -136,16 +88,10 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
         mod_B[i]->mod.norm = mod.norm;
         mod_B[i]->mod.ninv = mod.ninv;
     }
-#if __FLINT_VERSION < 3 || (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR < 3)
-    tmp_rows += ellC*k;
-#endif
     tmp += ellC*k*n;
 
     for (i = 0; i < ellC; i++)
     {
-#if __FLINT_VERSION < 3 || (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR < 3)
-        mod_C[i]->rows = tmp_rows + i*m;
-#endif
         mod_C[i]->entries = tmp + i*m*n;
         mod_C[i]->r = m;
         mod_C[i]->c = n;
@@ -153,10 +99,7 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
         mod_C[i]->mod.norm = mod.norm;
         mod_C[i]->mod.ninv = mod.ninv;
     }
-#if __FLINT_VERSION < 3 || (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR < 3)
-    tmp_rows = bak_rows;
-#endif
-    tmp = bak;
+    tmp = tmp - ellC*m*k - ellC*k*n;
 #else
     for (i = 0; i < ellC; i++)
     {
@@ -166,41 +109,49 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
     }
 #endif
 
+    flint_printf("starting work\n");
+    nmod_poly_struct * pol;
 
-    for (i = 0; i < n; i++)
+    for (i = 0; i < m; i++)
+    {
         for (j = 0; j < k; j++)
         {
-            nmod_geometric_progression_evaluate(val, nmod_poly_mat_entry(A, i, j), F);
+            pol = nmod_poly_mat_entry(A, i, j);
+            _nmod_poly_evaluate_geometric_nmod_vec_fast_precomp(val, pol->coeffs, pol->length, F, ellC, mod);
             for (ell = 0; ell < ellC; ell++)
                 nmod_mat_entry(mod_A[ell], i, j) = val[ell];
         }
+    }
 
     for (i = 0; i < k; i++)
-        for (j = 0; j < m; j++)
+    {
+        for (j = 0; j < n; j++)
         {
-            nmod_geometric_progression_evaluate(val, nmod_poly_mat_entry(B, i, j), F);
+            pol = nmod_poly_mat_entry(B, i, j);
+            _nmod_poly_evaluate_geometric_nmod_vec_fast_precomp(val, pol->coeffs, pol->length, F, ellC, mod);
             for (ell = 0; ell < ellC; ell++)
                 nmod_mat_entry(mod_B[ell], i, j) = val[ell];
         }
-
+    }
+    flint_printf("eval ok\n");
 
     for (ell = 0; ell < ellC; ell++)
-            nmod_mat_mul_pml(mod_C[ell], mod_A[ell], mod_B[ell]);
-
+        nmod_mat_mul(mod_C[ell], mod_A[ell], mod_B[ell]);
+    flint_printf("matmul ok\n");
 
-    for (i = 0; i < n; i++)
-        for (j = 0; j < m; j++)
+    for (i = 0; i < m; i++)
+    {
+        for (j = 0; j < n; j++)
         {
             for (ell = 0; ell < ellC; ell++)
                 val[ell] = nmod_mat_entry(mod_C[ell], i, j);
-            nmod_geometric_progression_interpolate(nmod_poly_mat_entry(C, i, j), val, F);
+            nmod_poly_interpolate_geometric_nmod_vec_fast_precomp(nmod_poly_mat_entry(C, i, j), val, F, ellC);
         }
+    }
+    flint_printf("interp ok\n");
 
 #ifdef DIRTY_ALLOC_MATRIX
-#if __FLINT_VERSION < 3 || (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR < 3)
-    free(tmp_rows);
-#endif
-    free(tmp);
+    flint_free(tmp);
 #else
     for (i = 0; i < ellC; i++)
     {
diff --git a/flint-extras/src/nmod_poly_mat_extra/test/main.c b/flint-extras/src/nmod_poly_mat_extra/test/main.c
index dc2e407a..ba7f76d1 100644
--- a/flint-extras/src/nmod_poly_mat_extra/test/main.c
+++ b/flint-extras/src/nmod_poly_mat_extra/test/main.c
@@ -18,7 +18,7 @@
 #include "t-dixon.c"
 #include "t-hermite_normal_form.c"
 /* #include "t-middle_product_geometric.c" */ /* TODO */
-/* #include "t-mul_geometric.c" */ /* TODO */
+#include "t-mul_geometric.c"
 #include "t-mbasis.c"
 #include "t-pmbasis.c"
 #include "t-mul_waksman.c"
@@ -32,13 +32,15 @@ test_struct tests[] =
     TEST_FUNCTION(nmod_poly_mat_det),
     TEST_FUNCTION(nmod_poly_mat_dixon),
     TEST_FUNCTION(nmod_poly_mat_hnf),
-    /* TEST_FUNCTION(nmod_poly_mat_middle_product_geometric), */  /* TODO */
-    /* TEST_FUNCTION(nmod_poly_mat_mul_geometric), */  /* TODO */
     TEST_FUNCTION(nmod_poly_mat_mbasis),
     TEST_FUNCTION(nmod_poly_mat_pmbasis),
     /* TEST_FUNCTION(nmod_poly_mat_mul_waksman), */  /* TODO */
     TEST_FUNCTION(nmod_poly_mat_rand),
     TEST_FUNCTION(nmod_poly_mat_weak_popov_form),
+#if (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR >= 4)
+    /* TEST_FUNCTION(nmod_poly_mat_middle_product_geometric), */  /* TODO */
+    TEST_FUNCTION(nmod_poly_mat_mul_geometric),
+#endif  /* FLINT_VERSION */
 };
 
 /* main function *************************************************************/
diff --git a/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c b/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c
new file mode 100644
index 00000000..da34f248
--- /dev/null
+++ b/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c
@@ -0,0 +1,88 @@
+/*
+    Copyright (C) 2025 Vincent Neiger, Éric Schost
+
+    This file is part of PML.
+
+    PML is free software: you can redistribute it and/or modify it under
+    the terms of the GNU General Public License version 2.0 (GPL-2.0-or-later)
+    as published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version. See
+    <https://www.gnu.org/licenses/>.
+*/
+
+#include <flint/nmod_poly_mat.h>
+#include <flint/test_helpers.h>
+
+#include "nmod_poly_mat_utils.h" // for rand
+#include "nmod_poly_mat_multiply.h"
+
+/* TODO more primes, more variations of degrees, etc. */
+
+#if (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR >= 4)
+
+/*--------------------------------------------------------------*/
+/* compares nmod_poly_mat_mul_geometric with nmod_poly_mat_mul  */
+/*--------------------------------------------------------------*/
+int test_mat_mul_geometric(ulong bits, ulong m, ulong n, ulong p, ulong deg, flint_rand_t state)
+{
+    ulong prime = n_randprime(state, bits, 1);
+
+    nmod_poly_mat_t A, B, C1, C2;
+
+    nmod_poly_mat_init(A, m, n, prime);
+    nmod_poly_mat_init(B, n, p, prime);
+    nmod_poly_mat_init(C1, m, p, prime);
+    nmod_poly_mat_init(C2, m, p, prime);
+
+    nmod_poly_mat_rand(A, state, deg);
+    nmod_poly_mat_rand(B, state, deg);
+    nmod_poly_mat_rand(C1, state, deg);
+    nmod_poly_mat_rand(C2, state, deg);
+
+    flint_printf("OK\n");
+    nmod_poly_mat_mul(C1, A, B);
+    flint_printf("OK\n");
+    nmod_poly_mat_mul_geometric(C2, A, B);
+    flint_printf("OK\n");
+
+    int res = nmod_poly_mat_equal(C1, C2);
+
+    nmod_poly_mat_clear(C1);
+    nmod_poly_mat_clear(C2);
+    nmod_poly_mat_clear(B);
+    nmod_poly_mat_clear(A);
+
+    return res;
+}
+
+TEST_FUNCTION_START(nmod_poly_mat_mul_geometric, state)
+{
+    int i, result;
+
+    for (i = 0; i < 100 * flint_test_multiplier(); i++)
+    {
+        /* FIXME field must be "large enough" : add proper test/fallback */
+        ulong bits = 16 + n_randint(state, 49);
+        ulong m = 1 + n_randint(state, 50);
+        ulong n = 1 + n_randint(state, 50);
+        ulong p = 1 + n_randint(state, 50);
+        ulong deg = 0 + n_randint(state, 50);
+
+        flint_printf("~~~ test i = %d ~~~\n"
+                     "m = %wu, n = %wu, p = %wu\n"
+                     "deg = %wu, n_bits = %wu\n",
+                     i, m, n, p, deg, bits);
+
+        result = test_mat_mul_geometric(bits, m, n, p, deg, state);
+
+        if (!result)
+            TEST_FUNCTION_FAIL(
+                    "m = %wu, n = %wu, p = %wu\n"
+                    "deg = %wu, n_bits = %wu\n",
+                    m, n, p, deg, bits);
+    }
+
+    TEST_FUNCTION_END(state);
+}
+
+#endif
diff --git a/flint-extras/src/nmod_poly_mat_extra/test/test_mul_geometric.bak b/flint-extras/src/nmod_poly_mat_extra/test/test_mul_geometric.bak
deleted file mode 100644
index 2ab3a89c..00000000
--- a/flint-extras/src/nmod_poly_mat_extra/test/test_mul_geometric.bak
+++ /dev/null
@@ -1,67 +0,0 @@
-#include <stdlib.h>
-#include <flint/nmod_poly_mat.h>
-
-#include "nmod_poly_mat_utils.h" // for rand
-#include "nmod_poly_mat_multiply.h"
-
-
-/*--------------------------------------------------------------*/
-/* multiplies matrices using different implementations          */
-/*--------------------------------------------------------------*/
-void test_nmod_poly_mat_mul(ulong m, ulong n, ulong p, ulong deg)
-{
-    flint_rand_t state;
-    nmod_poly_mat_t A, B, C1, C2;
-    ulong modulus;
-
-    modulus = 1108307720798209;
-    flint_rand_init(state);
-
-    nmod_poly_mat_init(A, m, n, modulus);
-    nmod_poly_mat_init(B, n, p, modulus);
-    nmod_poly_mat_init(C1, m, p, modulus);
-    nmod_poly_mat_init(C2, m, p, modulus);
-
-    nmod_poly_mat_rand(A, state, deg);
-    nmod_poly_mat_rand(B, state, deg);
-    nmod_poly_mat_rand(C1, state, deg);
-    nmod_poly_mat_rand(C2, state, deg);
-
-    nmod_poly_mat_mul(C1, A, B);
-    nmod_poly_mat_mul_geometric(C2, A, B);
-
-    if (!nmod_poly_mat_equal(C1, C2))
-    {
-        printf("error at m=%lu n=%lu p=%lu deg=%lu\n", m, n, p, deg);
-        exit(-1);
-    }
-    
-    nmod_poly_mat_clear(C1);
-    nmod_poly_mat_clear(C2);
-    nmod_poly_mat_clear(B);
-    nmod_poly_mat_clear(A);
-    flint_rand_clear(state);
-}
-
-/*--------------------------------------------------------------*/
-/* main calls tets                                              */
-/*--------------------------------------------------------------*/
-int main()
-{
-    ulong i;
-    flint_set_num_threads(1);
-
-    for (i = 1; i < 300; i += 1)
-        test_nmod_poly_mat_mul(i, i, i, 2);
-
-    for (i = 1; i < 300; i += 1)
-        test_nmod_poly_mat_mul(i, i, i, 200);
-
-    for (i = 1; i < 300; i += 5)
-        test_nmod_poly_mat_mul(i, i, i, 2000);
-
-    for (i = 1; i < 100; i += 20)
-        test_nmod_poly_mat_mul(i, i, i, 20000);
-
-    return 0;
-}
diff --git a/flint-extras/src/nmod_poly_mat_multiply.h b/flint-extras/src/nmod_poly_mat_multiply.h
index 03459b3a..b2839d39 100644
--- a/flint-extras/src/nmod_poly_mat_multiply.h
+++ b/flint-extras/src/nmod_poly_mat_multiply.h
@@ -14,7 +14,6 @@
 #define NMOD_POLY_MAT_MULTIPLY_H
 
 #include <flint/nmod_types.h>
-
 #include "pml.h"
 
 // several functions allocate arrays of matrices

From 983c3b19f7a87f7c9a50736f0e3c105e25490148 Mon Sep 17 00:00:00 2001
From: Vincent Neiger <vneiger@users.noreply.github.com>
Date: Tue, 2 Dec 2025 00:28:52 +0100
Subject: [PATCH 2/8] nmod poly mat mul via geometric eval/interp

---
 .../src/nmod32_vec/profile/p-dot_mdot.c       |  12 +-
 ...ly_mat_mul_geometric.c => mul_geometric.c} |  21 +-
 .../src/nmod_poly_mat_extra/profile/p-mul.c   | 259 ++++++++++++++++++
 .../test/t-mul_geometric.c                    |  11 +-
 ntl-extras/mat_lzz_pX_extra/timings/Makefile  |   4 +-
 .../timings/time_polmatmul.cpp                |   2 +-
 6 files changed, 282 insertions(+), 27 deletions(-)
 rename flint-extras/src/nmod_poly_mat_extra/{nmod_poly_mat_mul_geometric.c => mul_geometric.c} (90%)
 create mode 100644 flint-extras/src/nmod_poly_mat_extra/profile/p-mul.c

diff --git a/flint-extras/src/nmod32_vec/profile/p-dot_mdot.c b/flint-extras/src/nmod32_vec/profile/p-dot_mdot.c
index cd354110..36981736 100644
--- a/flint-extras/src/nmod32_vec/profile/p-dot_mdot.c
+++ b/flint-extras/src/nmod32_vec/profile/p-dot_mdot.c
@@ -35,9 +35,9 @@ void time_##fun(time_args targs, flint_rand_t state) \
     \
     double FLINT_SET_BUT_UNUSED(tcpu), twall; \
     \
-    TIMEIT_START \
+    TIMEIT_START; \
     res = _nmod32_vec_##fun(v1, v2, len, mod, arg); \
-    TIMEIT_STOP_VALUES(tcpu, twall) \
+    TIMEIT_STOP_VALUES(tcpu, twall); \
     \
     printf("%.2e", twall); \
     \
@@ -151,9 +151,9 @@ void time_dot_msolve_avx2(time_args targs, flint_rand_t state)
 
     double FLINT_SET_BUT_UNUSED(tcpu), twall;
 
-    TIMEIT_START
+    TIMEIT_START;
     res = _nmod32_vec_dot_msolve_avx2(v1, v2, len, mod.n);
-    TIMEIT_STOP_VALUES(tcpu, twall)
+    TIMEIT_STOP_VALUES(tcpu, twall);
 
     printf("%.2e", twall);
 
@@ -219,10 +219,10 @@ void time_##fun(time_args targs, flint_rand_t state)    \
                                                         \
     double FLINT_SET_BUT_UNUSED(tcpu), twall;           \
                                                         \
-    TIMEIT_START                                        \
+    TIMEIT_START;                                       \
     _nmod32_vec_##fun(res, mat, vec,                    \
                       nrows, len, len, mod);            \
-    TIMEIT_STOP_VALUES(tcpu, twall)                     \
+    TIMEIT_STOP_VALUES(tcpu, twall);                    \
                                                         \
     printf("%.2e", twall);                              \
                                                         \
diff --git a/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_mul_geometric.c b/flint-extras/src/nmod_poly_mat_extra/mul_geometric.c
similarity index 90%
rename from flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_mul_geometric.c
rename to flint-extras/src/nmod_poly_mat_extra/mul_geometric.c
index 42c00c15..d03f022f 100644
--- a/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_mul_geometric.c
+++ b/flint-extras/src/nmod_poly_mat_extra/mul_geometric.c
@@ -67,39 +67,42 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
 
 #ifdef DIRTY_ALLOC_MATRIX
     // we alloc the memory for all matrices at once
-    nn_ptr tmp = (nn_ptr) flint_malloc((m*k + k*n + m*n) * ellC * sizeof(ulong));
+    nn_ptr tmp = flint_malloc((m*k + k*n + m*n) * ellC * sizeof(ulong));
+    nn_ptr ptr = tmp;
     for (i = 0; i < ellC; i++)
     {
-        mod_A[i]->entries = tmp + i*m*k;
+        mod_A[i]->entries = ptr + i*m*k;
+        mod_A[i]->stride = k;
         mod_A[i]->r = m;
         mod_A[i]->c = k;
         mod_A[i]->mod.n = mod.n;
         mod_A[i]->mod.norm = mod.norm;
         mod_A[i]->mod.ninv = mod.ninv;
     }
-    tmp += ellC*m*k;
+    ptr += ellC*m*k;
 
     for (i = 0; i < ellC; i++)
     {
-        mod_B[i]->entries = tmp + i*k*n;
+        mod_B[i]->entries = ptr + i*k*n;
+        mod_B[i]->stride = n;
         mod_B[i]->r = k;
         mod_B[i]->c = n;
         mod_B[i]->mod.n = mod.n;
         mod_B[i]->mod.norm = mod.norm;
         mod_B[i]->mod.ninv = mod.ninv;
     }
-    tmp += ellC*k*n;
+    ptr += ellC*k*n;
 
     for (i = 0; i < ellC; i++)
     {
-        mod_C[i]->entries = tmp + i*m*n;
+        mod_C[i]->entries = ptr + i*m*n;
+        mod_C[i]->stride = n;
         mod_C[i]->r = m;
         mod_C[i]->c = n;
         mod_C[i]->mod.n = mod.n;
         mod_C[i]->mod.norm = mod.norm;
         mod_C[i]->mod.ninv = mod.ninv;
     }
-    tmp = tmp - ellC*m*k - ellC*k*n;
 #else
     for (i = 0; i < ellC; i++)
     {
@@ -109,7 +112,6 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
     }
 #endif
 
-    flint_printf("starting work\n");
     nmod_poly_struct * pol;
 
     for (i = 0; i < m; i++)
@@ -133,11 +135,9 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
                 nmod_mat_entry(mod_B[ell], i, j) = val[ell];
         }
     }
-    flint_printf("eval ok\n");
 
     for (ell = 0; ell < ellC; ell++)
         nmod_mat_mul(mod_C[ell], mod_A[ell], mod_B[ell]);
-    flint_printf("matmul ok\n");
 
     for (i = 0; i < m; i++)
     {
@@ -148,7 +148,6 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
             nmod_poly_interpolate_geometric_nmod_vec_fast_precomp(nmod_poly_mat_entry(C, i, j), val, F, ellC);
         }
     }
-    flint_printf("interp ok\n");
 
 #ifdef DIRTY_ALLOC_MATRIX
     flint_free(tmp);
diff --git a/flint-extras/src/nmod_poly_mat_extra/profile/p-mul.c b/flint-extras/src/nmod_poly_mat_extra/profile/p-mul.c
new file mode 100644
index 00000000..8851a374
--- /dev/null
+++ b/flint-extras/src/nmod_poly_mat_extra/profile/p-mul.c
@@ -0,0 +1,259 @@
+#include <stdlib.h>  // for atoi
+
+#include <flint/ulong_extras.h>
+#include <flint/profiler.h>
+#include <flint/nmod.h>
+#include <flint/nmod_vec.h>
+#include <flint/nmod_poly_mat.h>
+
+#include "nmod_poly_mat_utils.h"
+#include "nmod_poly_mat_multiply.h"
+
+#define MEASURE_SAMPLE 0
+
+typedef struct
+{
+    slong rdim;  /* row outer dimension */
+    slong idim;  /* inner dimension */
+    slong cdim;  /* column outer dimension */
+    slong deg;   /* degree */
+    slong modn;  /* modulus */
+}
+time_args;
+
+#define TIME_MUL(fun)                                   \
+void time_##fun(time_args targs, flint_rand_t state)    \
+{                                                       \
+    const slong rdim = targs.rdim;                      \
+    const slong idim = targs.idim;                      \
+    const slong cdim = targs.cdim;                      \
+    const slong deg = targs.deg;                        \
+    const slong n = targs.modn;                         \
+                                                        \
+    nmod_t mod;                                         \
+    nmod_init(&mod, n);                                 \
+                                                        \
+    nmod_poly_mat_t A;                                  \
+    nmod_poly_mat_init(A, rdim, idim, n);               \
+    nmod_poly_mat_rand(A, state, deg);                  \
+    nmod_poly_mat_t B;                                  \
+    nmod_poly_mat_init(B, idim, cdim, n);               \
+    nmod_poly_mat_rand(B, state, deg);                  \
+    nmod_poly_mat_t C;                                  \
+    nmod_poly_mat_init(C, rdim, cdim, n);               \
+                                                        \
+    double FLINT_SET_BUT_UNUSED(tcpu), twall;           \
+                                                        \
+    TIMEIT_START;                                       \
+    nmod_poly_mat_##fun(C, A, B);                       \
+    TIMEIT_STOP_VALUES(tcpu, twall);                    \
+                                                        \
+    printf("%.2e", twall);                              \
+                                                        \
+    nmod_poly_mat_clear(A);                             \
+    nmod_poly_mat_clear(B);                             \
+    nmod_poly_mat_clear(C);                             \
+}
+
+TIME_MUL(mul)
+TIME_MUL(mul_geometric)
+
+/*-------------------------*/
+/*  main                   */
+/*-------------------------*/
+
+int main(int argc, char ** argv)
+{
+    flint_rand_t state;
+    flint_rand_init(state);
+    flint_rand_set_seed(state, time(NULL), time(NULL)+129384125L);
+
+    // modulus bitsize
+    const slong nbits = 7;
+    const ulong bits[] = {12, 24, 30, 40, 50, 60, 63};
+
+    // matrix dimensions (all square for the moment)
+    const slong ndims = 10;
+    const ulong dims[] = {2, 4, 6, 8, 11, 15, 20, 30, 50, 100};
+
+    // matrix degrees
+    const slong ndegs = 12;
+    const ulong degs[] = {5, 10, 20, 40, 80, 160, 320, 640, 1280, 2560, 5120, 10240};
+
+    // bench functions
+    const slong nfuns = 2;
+    typedef void (*timefun) (time_args, flint_rand_t);
+    const timefun funs[] = {
+        time_mul,                      // 0
+        time_mul_geometric,            // 1
+    };
+
+    // TODO
+    //typedef void (*samplefun) (void*, ulong);
+    //const samplefun sfuns[] = {
+    //    sample_mul,                      // 0
+    //    sample_mul_geometric,            // 1
+    //};
+
+    const char * description[] = {
+        "#0  --> mul                          ",
+        "#1  --> mul_geometric                ",
+    };
+
+    if (argc == 1)  // show usage
+    {
+        printf("Usage: `%s [nbits] [dim] [deg] [fun]`\n", argv[0]);
+        printf("   Each argument is optional; no argument shows this help.\n");
+        printf("   - nbits: number of bits (in (1..64]) for the modulus, chosen as nextprime(2**(nbits-1))\n");
+        printf("        (nbits == -1 launches full suite)\n");
+        printf("   - dim: matrices are dim x dim\n");
+        printf("   - deg: matrices are random of degree < deg\n");
+        printf("   - fun: id number of the timed function (see below)\n");
+        printf("\nAvailable functions:\n");
+        for (slong j = 0; j < nfuns; j++)
+            printf("   %s\n", description[j]);
+
+        return 0;
+    }
+
+    printf("#warmup...\n");
+    for (slong i = 0; i < 3; i++)
+    {
+        time_args targs = {4, 4, 4, 1000, UWORD(1) << 20};
+        time_mul(targs, state);
+        printf(" ");
+    }
+    printf("\n\n");
+
+    if (argc == 2 && atoi(argv[1]) == -1)  // launching full suite
+    {
+        printf("           dim");
+        for (slong i = 0; i < ndims; i++)
+            printf("%17ld", dims[i]);
+        printf("\n");
+        printf("bits fun deg\n");
+        for (slong j = 0; j < nbits; j++)
+        {
+            const slong b = bits[j];
+            ulong n;
+            n = n_nextprime(UWORD(1) << (b-1), 0);
+            for (slong ifun = 0; ifun < nfuns; ifun++)
+            {
+                for (slong d = 0; d < ndegs; d++)
+                {
+                    printf("%-5ld#%-3ld%-8ld", b, ifun, degs[d]);
+                    for (slong i = 0; i < ndims; i++)
+                    {
+                        time_args targs = {dims[i], dims[i], dims[i], degs[d], n};
+
+#if MEASURE_SAMPLE
+                        const samplefun sfun = sfuns[ifun];
+                        double min, max;
+                        prof_repeat(&min, &max, sfun, (void*) &targs);
+                        printf("%.2e", min/1000000);
+#else
+                        const timefun tfun = funs[ifun];
+                        tfun(targs, state);
+#endif
+                        printf(" ");
+                    }
+                    printf("\n");  // end the row for this (bits, fun, deg)
+                }
+            }
+        }
+    }
+    else if (argc == 2)  // nbits is given
+    {
+        printf("       dim");
+        for (slong i = 0; i < ndims; i++)
+            printf("%17ld", dims[i]);
+        printf("\n");
+        printf("bits fun deg\n");
+        const slong b = atoi(argv[1]);
+        ulong n;
+        n = n_nextprime(UWORD(1) << (b-1), 0);
+        for (slong ifun = 0; ifun < nfuns; ifun++)
+        {
+            const timefun tfun = funs[ifun];
+            for (slong d = 0; d < ndegs; d++)
+            {
+                printf("%-5ld#%-3ld%-8ld", b, ifun, degs[d]);
+                for (slong i = 0; i < ndims; i++)
+                {
+                    time_args targs = {dims[i], dims[i], dims[i], degs[d], n};
+                    tfun(targs, state);
+                    printf(" ");
+                }
+                printf("\n");  // end the row for this (bits, fun, deg)
+            }
+        }
+    }
+    else if (argc == 3)  // nbits + dim given
+    {
+        const slong dim = atoi(argv[2]);
+        printf("       dim");
+        printf("%17ld", dim);
+        printf("\n");
+        printf("bits fun deg\n");
+        const slong b = atoi(argv[1]);
+        ulong n;
+        n = n_nextprime(UWORD(1) << (b-1), 0);
+        for (slong ifun = 0; ifun < nfuns; ifun++)
+        {
+            const timefun tfun = funs[ifun];
+            for (slong d = 0; d < ndegs; d++)
+            {
+                printf("%-5ld#%-3ld%-8ld", b, ifun, degs[d]);
+                time_args targs = {dim, dim, dim, degs[d], n};
+                tfun(targs, state);
+                printf(" ");
+                printf("\n");
+            }
+        }
+    }
+    else if (argc == 4)  // nbits + dim + deg given
+    {
+        const slong dim = atoi(argv[2]);
+        const slong deg = atoi(argv[3]);
+        printf("       dim");
+        printf("%17ld", dim);
+        printf("\n");
+        printf("bits fun deg\n");
+        const slong b = atoi(argv[1]);
+        ulong n;
+        n = n_nextprime(UWORD(1) << (b-1), 0);
+        for (slong ifun = 0; ifun < nfuns; ifun++)
+        {
+            const timefun tfun = funs[ifun];
+            printf("%-5ld#%-3ld%-8ld", b, ifun, deg);
+            time_args targs = {dim, dim, dim, deg, n};
+            tfun(targs, state);
+            printf(" ");
+            printf("\n");
+        }
+    }
+    else if (argc == 5)  // nbits + dim + deg + fun given
+    {
+        const slong b = atoi(argv[1]);
+        const slong dim = atoi(argv[2]);
+        const slong deg = atoi(argv[3]);
+        const slong ifun = atoi(argv[4]);
+        if (ifun < 0 || ifun >= nfuns)  // funs[] only has ids 0..nfuns-1
+        {
+            fprintf(stderr, "fun id out of range (expected 0..%ld)\n", nfuns - 1);
+            flint_rand_clear(state);
+            return 1;
+        }
+        const timefun tfun = funs[ifun];
+        printf("       dim%17ld\nbits fun deg\n", dim);
+        const ulong n = n_nextprime(UWORD(1) << (b-1), 0);
+
+        printf("%-5ld#%-3ld%-8ld", b, ifun, deg);
+        time_args targs = {dim, dim, dim, deg, n};
+        tfun(targs, state);
+        printf(" \n");
+    }
+
+    flint_rand_clear(state);
+    return 0;
+}
diff --git a/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c b/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c
index da34f248..bf457575 100644
--- a/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c
+++ b/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c
@@ -39,11 +39,8 @@ int test_mat_mul_geometric(ulong bits, ulong m, ulong n, ulong p, ulong deg, fli
     nmod_poly_mat_rand(C1, state, deg);
     nmod_poly_mat_rand(C2, state, deg);
 
-    flint_printf("OK\n");
     nmod_poly_mat_mul(C1, A, B);
-    flint_printf("OK\n");
     nmod_poly_mat_mul_geometric(C2, A, B);
-    flint_printf("OK\n");
 
     int res = nmod_poly_mat_equal(C1, C2);
 
@@ -68,10 +65,10 @@ TEST_FUNCTION_START(nmod_poly_mat_mul_geometric, state)
         ulong p = 1 + n_randint(state, 50);
         ulong deg = 0 + n_randint(state, 50);
 
-        flint_printf("~~~ test i = %d ~~~\n"
-                     "m = %wu, n = %wu, p = %wu\n"
-                     "deg = %wu, n_bits = %wu\n",
-                     i, m, n, p, deg, bits);
+        /* flint_printf("~~~ test i = %d ~~~\n" */
+        /*              "m = %wu, n = %wu, p = %wu\n" */
+        /*              "deg = %wu, n_bits = %wu\n", */
+        /*              i, m, n, p, deg, bits); */
 
         result = test_mat_mul_geometric(bits, m, n, p, deg, state);
 
diff --git a/ntl-extras/mat_lzz_pX_extra/timings/Makefile b/ntl-extras/mat_lzz_pX_extra/timings/Makefile
index 94bcecca..12593894 100644
--- a/ntl-extras/mat_lzz_pX_extra/timings/Makefile
+++ b/ntl-extras/mat_lzz_pX_extra/timings/Makefile
@@ -12,10 +12,10 @@ RUN = $(patsubst %.cpp, %.dat, $(patsubst time_%, %, $(SRC)))
 
 CXX = g++
 CXXFLAGS = -Wall -std=c++11 -O3 -march=native -I$(INC_DIR)
-LIBS = -L/dsk/l1/vneiger/lib -lntl -lpthread -lgmp -lm
+LIBS = -L/dsk/l1/vneiger/lib -L/usr/local/lib -lntl -lpthread -lgmp -lm
 
 LBCXXFLAGS = -DHAVE_CONFIG_H -O2 -Wall -g -DNDEBUG -U_LB_DEBUG -DDISABLE_COMMENTATOR -fopenmp -fabi-version=6 -I/usr/local/include -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mavx2 -mfma
-LBLIBS = -lntl -L/usr/local/lib -lopenblas -lpthread -lgfortran -lgivaro -lgmp -lgmpxx -fopenmp
+LBLIBS = -L/usr/local/lib -lntl -lopenblas -lpthread -lgfortran -lgivaro -lgmp -lgmpxx -fopenmp
 
 .PHONY: clean
 
diff --git a/ntl-extras/mat_lzz_pX_extra/timings/time_polmatmul.cpp b/ntl-extras/mat_lzz_pX_extra/timings/time_polmatmul.cpp
index dd8cf9b9..d015fb2d 100644
--- a/ntl-extras/mat_lzz_pX_extra/timings/time_polmatmul.cpp
+++ b/ntl-extras/mat_lzz_pX_extra/timings/time_polmatmul.cpp
@@ -261,7 +261,7 @@ void benchmark_nbits_dim_deg(long nbits, long dim, long deg, bool fftprime)
         std::cout << "Bench square polynomial matrix multiplication:" << std::endl;
         zz_p::init(GenPrime_long(nbits));
         std::cout << "p = " << zz_p::modulus() << "  (prime, bit length = " << NumBits(zz_p::modulus()) << ")" << std::endl;
-        std::cout << "dim\tdim\tdim\tdegree\tmult\t3pri\tev-geo\tvdmd\tvdmd2" << std::endl;
+        std::cout << "dim\tdim\tdim\tdegl\tdegr\tmult\t3pri\tev-geo\tvdmd\tvdmd2" << std::endl;
         benchmark_polmatmul(dim,dim,dim,deg,deg);
     }
 }

From 11d24951f1f14a565427d1d073531bc448f11fe6 Mon Sep 17 00:00:00 2001
From: Vincent Neiger <vneiger@users.noreply.github.com>
Date: Tue, 2 Dec 2025 09:59:06 +0100
Subject: [PATCH 3/8] require FLINT v3.4.0

---
 .github/workflows/CI.yml                             | 7 +++----
 README.md                                            | 3 ++-
 flint-extras/src/nmod_poly_mat_extra/mul_geometric.c | 8 ++++++--
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index cfd2116d..c4e97c50 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -14,10 +14,9 @@ env:
   GLOBAL_MULTIPLIER: 1
 
 concurrency:
-  # group by workflow and ref; the last slightly strange component ensures that for pull
-  # requests, we limit to 1 concurrent job, but for the main branch we don't
+  # Group by workflow and ref; limit to 1 concurrent job per ref, except on main
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/main' || github.run_number }}
-  # Cancel intermediate builds, but only if it is a pull request build.
+  # Cancel intermediate builds for pull requests
   cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
 
 jobs:
@@ -128,7 +127,7 @@ jobs:
           # install FLINT dev version
           git clone https://github.com/flintlib/flint.git
           cd flint
-          git checkout v3.2.0
+          git checkout v3.4.0
           ./bootstrap.sh
           ./configure
           make
diff --git a/README.md b/README.md
index 38517a66..ea361f80 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ matrices, structured matrices, and their applications.
 Version 0.5
 
 Warning: the FLINT-based part of PML (flint-extras folder) is work in progress.
+Currently, it requires FLINT version 3.4.0 (November 2025) or later.
  
 ## Authors
 
@@ -61,7 +62,7 @@ Public License), see `flint-extras/COPYING_FLINT` for the license.
 ## Installation (FLINT-based version)
 
 The FLINT-based version is work in progress, and currently compiles with FLINT
-3.2.0 or later versions, including the current development version.
+3.4.0 or later versions, including the current development version.
 
 The build system is similar to (and directly derived from) FLINT's one. In
 short, for a standard configuration, the following steps should work:
diff --git a/flint-extras/src/nmod_poly_mat_extra/mul_geometric.c b/flint-extras/src/nmod_poly_mat_extra/mul_geometric.c
index d03f022f..31f73c9a 100644
--- a/flint-extras/src/nmod_poly_mat_extra/mul_geometric.c
+++ b/flint-extras/src/nmod_poly_mat_extra/mul_geometric.c
@@ -10,8 +10,11 @@
 
 /** Multiplication for polynomial matrices
  *  sets C = A * B
- *  output can alias input
+ *  output can alias input  (TODO make this consistent with existing functions)
  *  ASSUMPTION (not checked): existence of element of "large enough" order
+ *  TODO -> fail flag when element not found
+ *  FIXME -> version underscore with provided geometric progression?
+ *           (if it is to be used with various degrees, the interpolation may need reworking to allow a more flexible npoints)
  *  uses evaluation and interpolation at a geometric progression
  */
 void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, const nmod_poly_mat_t B)
@@ -57,7 +60,7 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
 
     ellC = ellA + ellB - 1;  // length(C) = length(A) + length(B) - 1
     nmod_init(&mod, p);
-    w = nmod_find_root(2*ellC, mod);
+    w = nmod_find_root(2*ellC, mod); /* TODO 2*ellC ok? */
     nmod_geometric_progression_init(F, w, ellC, mod);
 
     mod_A = FLINT_ARRAY_ALLOC(ellC, nmod_mat_t);
@@ -66,6 +69,7 @@ void nmod_poly_mat_mul_geometric(nmod_poly_mat_t C, const nmod_poly_mat_t A, con
     val = _nmod_vec_init(ellC);
 
 #ifdef DIRTY_ALLOC_MATRIX
+    /* TODO check if still has any interest after FLINT's new mat storage */
     // we alloc the memory for all matrices at once
     nn_ptr tmp = flint_malloc((m*k + k*n + m*n) * ellC * sizeof(ulong));
     nn_ptr ptr = tmp;

From 060f6720ee474e11124afcfbfde67a404a7d51b0 Mon Sep 17 00:00:00 2001
From: Vincent Neiger <vneiger@users.noreply.github.com>
Date: Tue, 2 Dec 2025 21:40:38 +0100
Subject: [PATCH 4/8] middle product works

---
 .github/workflows/CI.yml                      | 28 ++++++------
 .../nmod_poly_mat_middle_product_geometric.c  | 43 ++++++++++---------
 .../src/nmod_poly_mat_extra/test/main.c       |  8 ++--
 .../test/t-middle_product_geometric.c         | 14 +++---
 .../test/t-mul_geometric.c                    |  4 --
 5 files changed, 46 insertions(+), 51 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index c4e97c50..51e7fa34 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -39,7 +39,7 @@ jobs:
           echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}"
           echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}" >> $GITHUB_ENV
 
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: "Setup"
         run: |
@@ -57,7 +57,7 @@ jobs:
           echo "MAKE=make -j$(expr $(nproc) + 1) --output-sync=target" >> $GITHUB_ENV
 
           # install FLINT dev version
-          git clone https://github.com/flintlib/flint.git
+          git clone --depth=1 https://github.com/flintlib/flint.git
           cd flint
           ./bootstrap.sh
           ./configure
@@ -89,10 +89,10 @@ jobs:
           $MAKE check
 
   ##############################################################################
-  # ubuntu 24.04, clang, flint 3.2.0
+  # ubuntu 24.04, clang, flint 3.4.0
   ##############################################################################
   ubuntu-clang-flint320:
-    name: Ubuntu 24.04, clang, FLINT 3.2.0 (x10)
+    name: Ubuntu 24.04, clang, FLINT 3.4.0 (x10)
 
     runs-on: ubuntu-24.04
 
@@ -107,7 +107,7 @@ jobs:
           echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}"
           echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}" >> $GITHUB_ENV
 
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: "Setup"
         run: |
@@ -125,7 +125,7 @@ jobs:
           echo "MAKE=make -j$(expr $(nproc) + 1) --output-sync=target" >> $GITHUB_ENV
 
           # install FLINT dev version
-          git clone https://github.com/flintlib/flint.git
+          git clone --depth=1 https://github.com/flintlib/flint.git
           cd flint
           git checkout v3.4.0
           ./bootstrap.sh
@@ -176,7 +176,7 @@ jobs:
           echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}"
           echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}" >> $GITHUB_ENV
 
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: "Setup"
         run: |
@@ -194,7 +194,7 @@ jobs:
           echo "MAKE=make -j$(expr $(nproc) + 1) --output-sync=target" >> $GITHUB_ENV
 
           # install FLINT dev version
-          git clone https://github.com/flintlib/flint.git
+          git clone --depth=1 https://github.com/flintlib/flint.git
           cd flint
           ./bootstrap.sh
           ./configure
@@ -245,7 +245,7 @@ jobs:
           echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}"
           echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}" >> $GITHUB_ENV
 
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: "Setup"
         run: |
@@ -307,7 +307,7 @@ jobs:
           echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}"
           echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}" >> $GITHUB_ENV
 
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: "Setup"
         run: |
@@ -325,7 +325,7 @@ jobs:
           echo "MAKE=gmake -j$(expr $(nproc) + 1) -l 10 --output-sync=target" >> $GITHUB_ENV
 
           # install FLINT dev version
-          git clone https://github.com/flintlib/flint.git
+          git clone --depth=1 https://github.com/flintlib/flint.git
           cd flint
           ./bootstrap.sh
           ./configure \
@@ -379,7 +379,7 @@ jobs:
   #    FLINT_TEST_MULTIPLIER: "0.5"
 
   #  steps:
-  #    - uses: actions/checkout@v4
+  #    - uses: actions/checkout@v6
 
   #    - name: "Setup MinGW"
   #      uses: msys2/setup-msys2@v2
@@ -443,7 +443,7 @@ jobs:
   #        echo "TIMEOUT=$TIMEOUT | Out-File -Append -FilePath $env:GITHUB_ENV"
   #      shell: powershell
 
-  #    - uses: actions/checkout@v4
+  #    - uses: actions/checkout@v6
 
   #    - name: "Setup cache for dependencies"
   #      uses: actions/github-script@v7
@@ -508,7 +508,7 @@ jobs:
   #        echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}"
   #        echo "FLINT_TEST_MULTIPLIER=${FLINT_TEST_MULTIPLIER}" >> $GITHUB_ENV
 
-  #    - uses: actions/checkout@v4
+  #    - uses: actions/checkout@v6
 
   #    - name: "Setup latest Alpine Linux"
   #      uses: jirutka/setup-alpine@v1
diff --git a/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_middle_product_geometric.c b/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_middle_product_geometric.c
index a9353ece..3aca1fb8 100644
--- a/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_middle_product_geometric.c
+++ b/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_middle_product_geometric.c
@@ -5,8 +5,6 @@
 #include "nmod_extra.h"
 #include "nmod_poly_mat_multiply.h"
 
-#if (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR >= 4)
-
 /** Middle product for polynomial matrices
  *  sets C = ((A * B) div x^dA) mod x^(dB+1)
  *  output can alias input
@@ -48,7 +46,6 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
         return;
     }
 
-
     // length = 0 iff matrix is zero
     ellA = nmod_poly_mat_max_length(A);
     ellB = nmod_poly_mat_max_length(B);
@@ -64,7 +61,7 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
     ellC = ellA + ellB - 1;  // length(C) = length(A) + length(B) - 1
     order = ellC;
     nmod_init(&mod, p);
-    w = nmod_find_root(order, mod);
+    w = nmod_find_root(2*order, mod);  /* TODO check necessary order */
     nmod_geometric_progression_init(F, w, order, mod);
 
     mod_A = FLINT_ARRAY_ALLOC(ellC, nmod_mat_t);
@@ -72,7 +69,7 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
     mod_C = FLINT_ARRAY_ALLOC(ellC, nmod_mat_t);
     val = _nmod_vec_init(ellC);
     val2 = _nmod_vec_init(ellC);
-    nmod_poly_init2(tmp_poly, mod.n, dA+1);
+    nmod_poly_init2(tmp_poly, mod.n, ellC);
 
 #ifdef DIRTY_ALLOC_MATRIX
     // we alloc the memory for all matrices at once
@@ -83,6 +80,7 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
     for (i = 0; i < ellC; i++)
     {
         mod_A[i]->entries = tmp + i*m*k;
+        mod_A[i]->stride = k;
         mod_A[i]->r = m;
         mod_A[i]->c = k;
         mod_A[i]->mod.n = mod.n;
@@ -94,6 +92,7 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
     for (i = 0; i < ellC; i++)
     {
         mod_B[i]->entries = tmp + i*k*n;
+        mod_B[i]->stride = n;
         mod_B[i]->r = k;
         mod_B[i]->c = n;
         mod_B[i]->mod.n = mod.n;
@@ -105,6 +104,7 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
     for (i = 0; i < ellC; i++)
     {
         mod_C[i]->entries = tmp + i*m*n;
+        mod_C[i]->stride = n;
         mod_C[i]->r = m;
         mod_C[i]->c = n;
         mod_C[i]->mod.n = mod.n;
@@ -121,14 +121,15 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
     }
 #endif
 
-    for (i = 0; i < n; i++)
+    for (i = 0; i < m; i++)
+    {
         for (j = 0; j < k; j++)
         {
-            ulong deg = nmod_poly_degree(nmod_poly_mat_entry(A, i, j));
+            ulong len = nmod_poly_mat_entry(A, i, j)->length;
             nn_ptr src = nmod_poly_mat_entry(A, i, j)->coeffs;
             nn_ptr dest = tmp_poly->coeffs;
             v = dA;
-            for (u = 0; u <= deg; u++, v--)
+            for (u = 0; u < len; u++, v--)
                 dest[v] = src[u];
             for (; v >= 0; v--)
                 dest[v] = 0;
@@ -139,33 +140,35 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
             for (ell = 0; ell < ellC; ell++)
                 nmod_mat_entry(mod_A[ell], i, j) = val[ell];
         }
+    }
 
 
     for (i = 0; i < k; i++)
+    {
         for (j = 0; j < n; j++)
         {
-            ulong deg = nmod_poly_degree(nmod_poly_mat_entry(B, i, j));
+            ulong len = nmod_poly_mat_entry(B, i, j)->length;
             nn_ptr src = nmod_poly_mat_entry(B, i, j)->coeffs;
-            for (u = 0; u <= deg; u++)
+            for (u = 0; u < len; u++)
                 val2[u] = src[u];
             for (; u < ellC; u++)
                 val2[u] = 0;
             nmod_poly_interpolate_geometric_nmod_vec_fast_precomp(tmp_poly, val2, F, ellC);
-            deg = nmod_poly_degree(tmp_poly);
+            len = tmp_poly->length;
             src = tmp_poly->coeffs;
-            for (ell = 0; ell <= deg; ell++)
+            for (ell = 0; ell < len; ell++)
                 nmod_mat_entry(mod_B[ell], i, j) = src[ell];
             for (; ell < ellC; ell++)
                 nmod_mat_entry(mod_B[ell], i, j) = 0;
         }
+    }
 
     for (ell = 0; ell < ellC; ell++)
         nmod_mat_mul(mod_C[ell], mod_A[ell], mod_B[ell]);
 
-
-    nmod_poly_fit_length(tmp_poly, ellC);
-    for (i = 0; i < n; i++)
-        for (j = 0; j < m; j++)
+    for (i = 0; i < m; i++)
+    {
+        for (j = 0; j < n; j++)
         {
             nn_ptr dest = tmp_poly->coeffs;
             for (ell = 0; ell < ellC; ell++)
@@ -177,12 +180,12 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
 
             nmod_poly_realloc(nmod_poly_mat_entry(C, i, j), dB + 1);
             nmod_poly_mat_entry(C, i, j)->length = dB + 1;
-            dest = nmod_poly_mat_entry(A, i, j)->coeffs;
+            dest = nmod_poly_mat_entry(C, i, j)->coeffs;
             for (u = 0; u <= dB; u++)
                 dest[u] = val2[u];
-            _nmod_poly_normalise(nmod_poly_mat_entry(A, i, j));
+            _nmod_poly_normalise(nmod_poly_mat_entry(C, i, j));
         }
-
+    }
 
 #ifdef DIRTY_ALLOC_MATRIX
     flint_free(tmp);
@@ -203,5 +206,3 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
     _nmod_vec_clear(val);
     nmod_geometric_progression_clear(F);
 }
-
-#endif  /* (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR >= 4) */
diff --git a/flint-extras/src/nmod_poly_mat_extra/test/main.c b/flint-extras/src/nmod_poly_mat_extra/test/main.c
index ba7f76d1..75661fb7 100644
--- a/flint-extras/src/nmod_poly_mat_extra/test/main.c
+++ b/flint-extras/src/nmod_poly_mat_extra/test/main.c
@@ -17,7 +17,7 @@
 #include "t-det.c"
 #include "t-dixon.c"
 #include "t-hermite_normal_form.c"
-/* #include "t-middle_product_geometric.c" */ /* TODO */
+#include "t-middle_product_geometric.c"
 #include "t-mul_geometric.c"
 #include "t-mbasis.c"
 #include "t-pmbasis.c"
@@ -29,6 +29,8 @@
 
 test_struct tests[] =
 {
+    TEST_FUNCTION(nmod_poly_mat_middle_product_geometric),
+    TEST_FUNCTION(nmod_poly_mat_mul_geometric),
     TEST_FUNCTION(nmod_poly_mat_det),
     TEST_FUNCTION(nmod_poly_mat_dixon),
     TEST_FUNCTION(nmod_poly_mat_hnf),
@@ -37,10 +39,6 @@ test_struct tests[] =
     /* TEST_FUNCTION(nmod_poly_mat_mul_waksman), */  /* TODO */
     TEST_FUNCTION(nmod_poly_mat_rand),
     TEST_FUNCTION(nmod_poly_mat_weak_popov_form),
-#if (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR >= 4)
-    /* TEST_FUNCTION(nmod_poly_mat_middle_product_geometric), */  /* TODO */
-    TEST_FUNCTION(nmod_poly_mat_mul_geometric),
-#endif  /* FLINT_VERSION */
 };
 
 /* main function *************************************************************/
diff --git a/flint-extras/src/nmod_poly_mat_extra/test/t-middle_product_geometric.c b/flint-extras/src/nmod_poly_mat_extra/test/t-middle_product_geometric.c
index 26ad40dc..03fe629d 100644
--- a/flint-extras/src/nmod_poly_mat_extra/test/t-middle_product_geometric.c
+++ b/flint-extras/src/nmod_poly_mat_extra/test/t-middle_product_geometric.c
@@ -16,8 +16,6 @@
 #include "nmod_poly_mat_utils.h" // for rand
 #include "nmod_poly_mat_multiply.h"
 
-#if (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR >= 4)
-
 /*--------------------------------------------------------------*/
 /* middle product using different implementations               */
 /*--------------------------------------------------------------*/
@@ -57,22 +55,24 @@ TEST_FUNCTION_START(nmod_poly_mat_middle_product_geometric, state)
     for (i = 0; i < 100 * flint_test_multiplier(); i++)
     {
         /* FIXME field must be "large enough" : add proper test/fallback */
-        ulong bits = 16 + n_randint(state, 63);
+        ulong bits = 16 + n_randint(state, 48);
         ulong m = 1 + n_randint(state, 50);
         ulong n = 1 + n_randint(state, 50);
         ulong p = 1 + n_randint(state, 50);
         ulong deg = 0 + n_randint(state, 50);
 
+        /* flint_printf("m = %wu, n = %wu, p = %wu\n" */
+        /*              "deg = %wu, n_bits = %wu\n", */
+        /*              m, n, p, deg, bits); */
+
         result = test_mat_middle_product_geometric(bits, m, n, p, deg, state);
 
         if (!result)
             TEST_FUNCTION_FAIL(
                     "m = %wu, n = %wu, p = %wu\n"
-                    "n_bits = %wu\n",
-                    m, n, p, bits);
+                    "deg = %wu, n_bits = %wu\n",
+                    m, n, p, deg, bits);
     }
 
     TEST_FUNCTION_END(state);
 }
-
-#endif
diff --git a/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c b/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c
index bf457575..ad3de381 100644
--- a/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c
+++ b/flint-extras/src/nmod_poly_mat_extra/test/t-mul_geometric.c
@@ -18,8 +18,6 @@
 
 /* TODO more primes, more variations of degrees, etc. */
 
-#if (__FLINT_VERSION == 3 && __FLINT_VERSION_MINOR >= 4)
-
 /*--------------------------------------------------------------*/
 /* middle product using different implementations               */
 /*--------------------------------------------------------------*/
@@ -81,5 +79,3 @@ TEST_FUNCTION_START(nmod_poly_mat_mul_geometric, state)
 
     TEST_FUNCTION_END(state);
 }
-
-#endif

From ec6e79464130d0dc5bd206bb7a44e11a121a4803 Mon Sep 17 00:00:00 2001
From: Vincent Neiger <vneiger@users.noreply.github.com>
Date: Tue, 2 Dec 2025 21:42:05 +0100
Subject: [PATCH 5/8] minor fixes

---
 ...at_middle_product_geometric.c => middle_product_geometric.c} | 0
 .../src/nmod_poly_mat_extra/test/t-middle_product_geometric.c   | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename flint-extras/src/nmod_poly_mat_extra/{nmod_poly_mat_middle_product_geometric.c => middle_product_geometric.c} (100%)

diff --git a/flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_middle_product_geometric.c b/flint-extras/src/nmod_poly_mat_extra/middle_product_geometric.c
similarity index 100%
rename from flint-extras/src/nmod_poly_mat_extra/nmod_poly_mat_middle_product_geometric.c
rename to flint-extras/src/nmod_poly_mat_extra/middle_product_geometric.c
diff --git a/flint-extras/src/nmod_poly_mat_extra/test/t-middle_product_geometric.c b/flint-extras/src/nmod_poly_mat_extra/test/t-middle_product_geometric.c
index 03fe629d..df7750fc 100644
--- a/flint-extras/src/nmod_poly_mat_extra/test/t-middle_product_geometric.c
+++ b/flint-extras/src/nmod_poly_mat_extra/test/t-middle_product_geometric.c
@@ -55,7 +55,7 @@ TEST_FUNCTION_START(nmod_poly_mat_middle_product_geometric, state)
     for (i = 0; i < 100 * flint_test_multiplier(); i++)
     {
         /* FIXME field must be "large enough" : add proper test/fallback */
-        ulong bits = 16 + n_randint(state, 48);
+        ulong bits = 16 + n_randint(state, 49);
         ulong m = 1 + n_randint(state, 50);
         ulong n = 1 + n_randint(state, 50);
         ulong p = 1 + n_randint(state, 50);

From b44be463edcec23ca54963ffa565e8d69f9d23fb Mon Sep 17 00:00:00 2001
From: Vincent Neiger <vneiger@users.noreply.github.com>
Date: Tue, 2 Dec 2025 23:04:41 +0100
Subject: [PATCH 6/8] profile

---
 .../middle_product_geometric.c                |   9 +-
 .../profile/p-middle_product.c                | 259 ++++++++++++++++++
 .../timings/time_middle_product.cpp           |  57 ++--
 3 files changed, 298 insertions(+), 27 deletions(-)
 create mode 100644 flint-extras/src/nmod_poly_mat_extra/profile/p-middle_product.c

diff --git a/flint-extras/src/nmod_poly_mat_extra/middle_product_geometric.c b/flint-extras/src/nmod_poly_mat_extra/middle_product_geometric.c
index 3aca1fb8..6d1abe27 100644
--- a/flint-extras/src/nmod_poly_mat_extra/middle_product_geometric.c
+++ b/flint-extras/src/nmod_poly_mat_extra/middle_product_geometric.c
@@ -16,7 +16,7 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
                                             const ulong dA, const ulong dB)
 {
     nmod_mat_t *mod_A, *mod_B, *mod_C;
-    ulong ellA, ellB, ellC, order;
+    ulong ellC, order;
     ulong i, j, ell, m, k, n, u;
     long v;
     ulong p, w;
@@ -47,10 +47,7 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
     }
 
     // length = 0 iff matrix is zero
-    ellA = nmod_poly_mat_max_length(A);
-    ellB = nmod_poly_mat_max_length(B);
-
-    if (ellA == 0 || ellB == 0)
+    if (nmod_poly_mat_max_length(A) == 0 || nmod_poly_mat_max_length(B) == 0)
     {
         nmod_poly_mat_zero(C);
         return;
@@ -58,7 +55,7 @@ void nmod_poly_mat_middle_product_geometric(nmod_poly_mat_t C, const nmod_poly_m
 
     nmod_init(&mod, p);
 
-    ellC = ellA + ellB - 1;  // length(C) = length(A) + length(B) - 1
+    ellC = dA + dB + 1;  // length(C) = length(A) + length(B) - 1
     order = ellC;
     nmod_init(&mod, p);
     w = nmod_find_root(2*order, mod);  /* TODO check necessary order */
diff --git a/flint-extras/src/nmod_poly_mat_extra/profile/p-middle_product.c b/flint-extras/src/nmod_poly_mat_extra/profile/p-middle_product.c
new file mode 100644
index 00000000..93ed4fe7
--- /dev/null
+++ b/flint-extras/src/nmod_poly_mat_extra/profile/p-middle_product.c
@@ -0,0 +1,259 @@
+#include <stdlib.h>  // for atoi
+
+#include <flint/ulong_extras.h>
+#include <flint/profiler.h>
+#include <flint/nmod.h>
+#include <flint/nmod_vec.h>
+#include <flint/nmod_poly_mat.h>
+
+#include "nmod_poly_mat_utils.h"
+#include "nmod_poly_mat_multiply.h"
+
+#define MEASURE_SAMPLE 0
+
+typedef struct  /* arguments of one timing run, consumed by TIME_TMUL */
+{
+    slong rdim;  /* row outer dimension: rows of A and of C */
+    slong idim;  /* inner dimension: columns of A, rows of B */
+    slong cdim;  /* column outer dimension: columns of B and of C */
+    slong deg;   /* degree parameter: A is drawn with deg, B with 2*deg-1 */
+    slong modn;  /* modulus used to initialise A, B and C */
+}
+time_args;
+
+/* Defines time_<fun>(targs, state): builds random A (rdim x idim) and B  */
+/* (idim x cdim) modulo targs.modn, times nmod_poly_mat_<fun>(C, A, B,    */
+/* deg-1, deg-1) between TIMEIT_START/STOP and prints the wall time.      */
+#define TIME_TMUL(fun)                                  \
+void time_##fun(time_args targs, flint_rand_t state)    \
+{                                                       \
+    const slong rdim = targs.rdim;                      \
+    const slong idim = targs.idim;                      \
+    const slong cdim = targs.cdim;                      \
+    const slong deg = targs.deg;                        \
+    const ulong n = (ulong) targs.modn;                 \
+                                                        \
+    nmod_poly_mat_t A;                                  \
+    nmod_poly_mat_init(A, rdim, idim, n);               \
+    nmod_poly_mat_rand(A, state, deg);                  \
+    nmod_poly_mat_t B;                                  \
+    nmod_poly_mat_init(B, idim, cdim, n);               \
+    nmod_poly_mat_rand(B, state, 2*deg-1);              \
+    nmod_poly_mat_t C;                                  \
+    nmod_poly_mat_init(C, rdim, cdim, n);               \
+                                                        \
+    double FLINT_SET_BUT_UNUSED(tcpu), twall;           \
+                                                        \
+    TIMEIT_START;                                       \
+    nmod_poly_mat_##fun(C, A, B, deg-1, deg-1);         \
+    TIMEIT_STOP_VALUES(tcpu, twall);                    \
+                                                        \
+    printf("%.2e", twall);                              \
+                                                        \
+    nmod_poly_mat_clear(A);                             \
+    nmod_poly_mat_clear(B);                             \
+    nmod_poly_mat_clear(C);                             \
+}
+
+TIME_TMUL(middle_product_naive)
+TIME_TMUL(middle_product_geometric)
+
+/*------------------------------------------------------------*/
+/*  main: parse [nbits] [dim] [deg] [fun] and print timings   */
+/*------------------------------------------------------------*/
+
+int main(int argc, char ** argv)
+{
+    flint_rand_t state;
+    flint_rand_init(state);
+    flint_rand_set_seed(state, time(NULL), time(NULL)+129384125L);
+
+    // modulus bitsize
+    const slong nbits = 7;
+    const ulong bits[] = {12, 24, 30, 40, 50, 60, 63};
+
+    // matrix dimensions (all square for the moment)
+    const slong ndims = 10;
+    const ulong dims[] = {2, 4, 6, 8, 11, 15, 20, 30, 50, 100};
+
+    // matrix degrees
+    const slong ndegs = 12;
+    const ulong degs[] = {5, 10, 20, 40, 80, 160, 320, 640, 1280, 2560, 5120, 10240};
+
+    // bench functions
+    const slong nfuns = 2;
+    typedef void (*timefun) (time_args, flint_rand_t);
+    const timefun funs[] = {
+        time_middle_product_naive,                // 0
+        time_middle_product_geometric,            // 1
+    };
+
+    // TODO
+    //typedef void (*samplefun) (void*, ulong);
+    //const samplefun sfuns[] = {
+    //    sample_mul,                      // 0
+    //    sample_mul_geometric,            // 1
+    //};
+
+    const char * description[] = {
+        "#0  --> middle_product_naive                    ",
+        "#1  --> middle_product_geometric                ",
+    };
+
+    if (argc == 1)  // show usage
+    {
+        printf("Usage: `%s [nbits] [dim] [deg] [fun]`\n", argv[0]);
+        printf("   Each argument is optional; no argument shows this help.\n");
+        printf("   - nbits: number of bits (in (1..64]) for the modulus, chosen as nextprime(2**(nbits-1))\n");
+        printf("        (nbits == -1 launches full suite)\n");
+        printf("   - dim: matrices are dim x dim\n");
+        printf("   - deg: matrices are random of degree < deg\n");
+        printf("   - fun: id number of the timed function (see below),\n");
+        printf("\nAvailable functions:\n");
+        for (slong j = 0; j < nfuns; j++)
+            printf("   %s\n", description[j]);
+        flint_rand_clear(state);
+        return 0;
+    }
+
+    printf("#warmup...\n");
+    for (slong i = 0; i < 3; i++)
+    {
+        time_args targs = {4, 4, 4, 1000, UWORD(1) << 20};
+        time_middle_product_naive(targs, state);
+        printf(" ");
+    }
+    printf("\n\n");
+
+    if (argc == 2 && atoi(argv[1]) == -1)  // launching full suite
+    {
+        printf("           dim");
+        for (slong i = 0; i < ndims; i++)
+            printf("%17ld", dims[i]);
+        printf("\n");
+        printf("bits fun deg\n");
+        for (slong j = 0; j < nbits; j++)
+        {
+            const slong b = bits[j];
+            ulong n;
+            n = n_nextprime(UWORD(1) << (b-1), 0);
+            for (slong ifun = 0; ifun < nfuns; ifun++)
+            {
+                for (slong d = 0; d < ndegs; d++)
+                {
+                    printf("%-5ld#%-3ld%-8ld", b, ifun, degs[d]);
+                    for (slong i = 0; i < ndims; i++)
+                    {
+                        time_args targs = {dims[i], dims[i], dims[i], degs[d], n};
+
+#if MEASURE_SAMPLE
+                        const samplefun sfun = sfuns[ifun];
+                        double min, max;
+                        prof_repeat(&min, &max, sfun, (void*) &targs);
+                        printf("%.2e", min/1000000);
+#else
+                        const timefun tfun = funs[ifun];
+                        tfun(targs, state);
+#endif
+                        printf(" ");
+                    }
+                    printf("\n");  // end the row of this degree
+                }
+            }
+        }
+    }
+    else if (argc == 2)  // nbits is given
+    {
+        printf("       dim");
+        for (slong i = 0; i < ndims; i++)
+            printf("%17ld", dims[i]);
+        printf("\n");
+        printf("bits fun deg\n");
+        const slong b = atoi(argv[1]);
+        ulong n;
+        n = n_nextprime(UWORD(1) << (b-1), 0);
+        for (slong ifun = 0; ifun < nfuns; ifun++)
+        {
+            const timefun tfun = funs[ifun];
+            for (slong d = 0; d < ndegs; d++)
+            {
+                printf("%-5ld#%-3ld%-8ld", b, ifun, degs[d]);
+                for (slong i = 0; i < ndims; i++)
+                {
+                    time_args targs = {dims[i], dims[i], dims[i], degs[d], n};
+                    tfun(targs, state);
+                    printf(" ");
+                }
+                printf("\n");  // end the row of this degree
+            }
+        }
+    }
+    else if (argc == 3)  // nbits + dim given
+    {
+        const slong dim = atoi(argv[2]);
+        printf("       dim");
+        printf("%17ld", dim);
+        printf("\n");
+        printf("bits fun deg\n");
+        const slong b = atoi(argv[1]);
+        ulong n;
+        n = n_nextprime(UWORD(1) << (b-1), 0);
+        for (slong ifun = 0; ifun < nfuns; ifun++)
+        {
+            const timefun tfun = funs[ifun];
+            for (slong d = 0; d < ndegs; d++)
+            {
+                printf("%-5ld#%-3ld%-8ld", b, ifun, degs[d]);
+                time_args targs = {dim, dim, dim, degs[d], n};
+                tfun(targs, state);
+                printf(" ");
+                printf("\n");
+            }
+        }
+    }
+    else if (argc == 4)  // nbits + dim + deg given
+    {
+        const slong dim = atoi(argv[2]);
+        const slong deg = atoi(argv[3]);
+        printf("       dim");
+        printf("%17ld", dim);
+        printf("\n");
+        printf("bits fun deg\n");
+        const slong b = atoi(argv[1]);
+        ulong n;
+        n = n_nextprime(UWORD(1) << (b-1), 0);
+        for (slong ifun = 0; ifun < nfuns; ifun++)
+        {
+            const timefun tfun = funs[ifun];
+            printf("%-5ld#%-3ld%-8ld", b, ifun, deg);
+            time_args targs = {dim, dim, dim, deg, n};
+            tfun(targs, state);
+            printf(" ");
+            printf("\n");
+        }
+    }
+    else if (argc == 5)  // nbits + dim + deg + fun given
+    {
+        const slong b = atoi(argv[1]);
+        const slong dim = atoi(argv[2]);
+        const slong deg = atoi(argv[3]);
+        const slong ifun = atoi(argv[4]);
+        if (ifun < 0 || ifun >= nfuns)  // funs[] only has ids 0..nfuns-1
+        {
+            printf("invalid fun id %ld (expected 0..%ld)\n", ifun, nfuns - 1);
+            flint_rand_clear(state);
+            return 1;
+        }
+        const timefun tfun = funs[ifun];
+        printf("       dim%17ld\n", dim);
+        printf("bits fun deg\n");
+        const ulong n = n_nextprime(UWORD(1) << (b-1), 0);
+        printf("%-5ld#%-3ld%-8ld", b, ifun, deg);
+        time_args targs = {dim, dim, dim, deg, n};
+        tfun(targs, state);
+        printf(" \n");
+    }
+
+    flint_rand_clear(state);
+    return 0;
+}
diff --git a/ntl-extras/mat_lzz_pX_extra/timings/time_middle_product.cpp b/ntl-extras/mat_lzz_pX_extra/timings/time_middle_product.cpp
index 8d374497..eeaa6b39 100644
--- a/ntl-extras/mat_lzz_pX_extra/timings/time_middle_product.cpp
+++ b/ntl-extras/mat_lzz_pX_extra/timings/time_middle_product.cpp
@@ -26,8 +26,8 @@ void one_check(long sz, long deg)
             random(a, sz, sz, dA + 1);
             random(b, sz, sz, dB + 1);
             random(c, sz, sz, dA + dB + 1);
-            
-            cout << sz << " " << dA << " " << dB << " ";
+
+            cout << sz << "\t" << dA << "\t" << dB << "\t";
 
             if (is_FFT_prime())
             {
@@ -99,19 +99,20 @@ void one_check(long sz, long deg)
                 }
                 while ((GetWallTime()-t_geom) <= thres);
                 t_geom = (GetWallTime()-t_geom) / nb;
-                
 
-                t_dense = GetWallTime();
-                nb = 0;
-                do
-                {
-                    middle_product_evaluate_dense(b, a, c, dA, dB);
-                    nb++;
-                }
-                while ((GetWallTime()-t_dense) <= thres);
-                t_dense = (GetWallTime()-t_dense) / nb;
-                
-                
+
+                t_dense=0.0;
+                /* t_dense = GetWallTime(); */
+                /* nb = 0; */
+                /* do */
+                /* { */
+                /*     middle_product_evaluate_dense(b, a, c, dA, dB); */
+                /*     nb++; */
+                /* } */
+                /* while ((GetWallTime()-t_dense) <= thres); */
+                /* t_dense = (GetWallTime()-t_dense) / nb; */
+
+
                 t_3_primes = GetWallTime();
                 nb = 0;
                 do
@@ -157,7 +158,7 @@ void one_check(long sz, long deg)
                 while ((GetWallTime()-t_naive) <= thres);
                 t_naive = (GetWallTime()-t_naive) / nb;
 
-                cout << t_geom << " " << t_dense << " " << t_3_primes << "   " << t_middle << "   " << t_direct << " " << t_naive;
+                cout << t_geom << "\t" << t_dense << "\t" << t_3_primes << "\t" << t_middle << "\t" << t_direct << "\t" << t_naive;
             }
             cout << endl;
         }
@@ -188,7 +189,7 @@ void check(long p)
         for (size_t j = 0; j < degrees.size(); j++)
             one_check(sizes[i], degrees[j]);
 
-    
+
 }
 
 /*------------------------------------------------------------*/
@@ -196,12 +197,42 @@ void check(long p)
 /*------------------------------------------------------------*/
 int main(int argc, char ** argv)
 {
-    std::cout << std::fixed;
-    std::cout << std::setprecision(8);
+    // print timings in scientific notation with one decimal digit
+    std::cout << std::scientific;
+    std::cout << std::setprecision(1);
     warmup();
-    // check(0);
-    // check(23068673);
-    check(288230376151711813);
+    if (argc == 1)
+    {
+        // no argument: run the full check() sweep for each value below
+        // check(0);
+        // check(23068673);
+        check(1108307720798209);
+        check(288230376151711813);
+    }
+    else if (argc == 3)
+    {
+        // "dim deg" given: time a single instance; parse with strtol so that
+        // malformed input is rejected (atoi would silently turn it into 0)
+        char * end_dim = NULL;
+        char * end_deg = NULL;
+        const long dim = strtol(argv[1], &end_dim, 10);
+        const long deg = strtol(argv[2], &end_deg, 10);
+        if (end_dim == argv[1] || *end_dim != '\0' || dim <= 0
+            || end_deg == argv[2] || *end_deg != '\0' || deg < 0)
+        {
+            std::cerr << "Usage: " << argv[0] << " dim deg" << std::endl;
+            return 1;
+        }
+        zz_p::init(1108307720798209);
+        cout << "dim\tdeg\tdeg\tgeom\tdense\t3primes\tmiddle\tdirect\tnaive" << endl;
+        one_check(dim, deg);
+    }
+    else
+    {
+        // wrong number of arguments: report on stderr and exit with failure
+        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "EXE") << " dim deg" << std::endl;
+        return 1;
+    }
     return 0;
 }
 

From a6409e98c12f4baf1c16dfd5f901dabf92ab64e8 Mon Sep 17 00:00:00 2001
From: Vincent Neiger <vneiger@users.noreply.github.com>
Date: Tue, 2 Dec 2025 23:53:11 +0100
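Subject: [PATCH 7/8] fix retrieving depth 1 specific tag

With `git clone --depth=1` only the tip of the default branch is fetched,
so the subsequent `git checkout v3.4.0` cannot find the tag. Clone the tag
directly with `--branch v3.4.0 --single-branch` instead, and rename the CI
job from ubuntu-clang-flint320 to ubuntu-clang-flint340 to match the FLINT
version it actually builds.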

---
 .github/workflows/CI.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 51e7fa34..a80071ea 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -91,7 +91,7 @@ jobs:
   ##############################################################################
   # ubuntu 24.04, clang, flint 3.4.0
   ##############################################################################
-  ubuntu-clang-flint320:
+  ubuntu-clang-flint340:
     name: Ubuntu 24.04, clang, FLINT 3.4.0 (x10)
 
     runs-on: ubuntu-24.04
@@ -125,9 +125,8 @@ jobs:
           echo "MAKE=make -j$(expr $(nproc) + 1) --output-sync=target" >> $GITHUB_ENV
 
           # install FLINT dev version
-          git clone --depth=1 https://github.com/flintlib/flint.git
+          git clone --depth=1 https://github.com/flintlib/flint.git --branch v3.4.0 --single-branch
           cd flint
-          git checkout v3.4.0
           ./bootstrap.sh
           ./configure
           make

From d979cfbe08b01e52eaf52e507101f2ba2cba821b Mon Sep 17 00:00:00 2001
From: Vincent Neiger <vneiger@users.noreply.github.com>
Date: Fri, 5 Dec 2025 15:56:05 +0100
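Subject: [PATCH 8/8] minor fix in config files

Raise the minimum required FLINT version from 3.2.0 to 3.4.0 in both
configure.ac (PML_CHECK_FLINT_H) and the pkg-config template pml.pc.in.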

---
 flint-extras/configure.ac | 2 +-
 flint-extras/pml.pc.in    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/flint-extras/configure.ac b/flint-extras/configure.ac
index fd6bfb17..acd97eb0 100644
--- a/flint-extras/configure.ac
+++ b/flint-extras/configure.ac
@@ -979,7 +979,7 @@ FLINT_CHECK_MPFR_H(4,1,0)
 # check FLINT
 ################################################################################
 
-PML_CHECK_FLINT_H(3,2,0)
+PML_CHECK_FLINT_H(3,4,0)
 
 ################################################################################
 # check headers
diff --git a/flint-extras/pml.pc.in b/flint-extras/pml.pc.in
index deb39d6b..b03321cd 100644
--- a/flint-extras/pml.pc.in
+++ b/flint-extras/pml.pc.in
@@ -7,6 +7,6 @@ Name: @PACKAGE_NAME@
 Description: Polynomial Matrix Library / FLINT
 Version: @PACKAGE_VERSION@
 URL: https://github.com/vneiger/pml
-Requires: gmp >= 6.2.1 mpfr >= 4.1.0 flint >= 3.2.0
+Requires: gmp >= 6.2.1 mpfr >= 4.1.0 flint >= 3.4.0
 Cflags: -I${includedir}
 Libs: -L${libdir} -lpml