From 61fc72a4a431cbfd42f22e2af76177c73431c3e6 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" <68491+gpshead@users.noreply.github.com> Date: Thu, 1 Jan 2026 22:03:05 -0800 Subject: [PATCH 1/2] gh-124951: Optimize base64 encode & decode for an easy 2-3x speedup [no SIMD] (GH-143262) Optimize base64 encoding/decoding by eliminating loop-carried dependencies. Key changes: - Add `base64_encode_trio()` and `base64_decode_quad()` helper functions that process complete groups independently - Add `base64_encode_fast()` and `base64_decode_fast()` wrappers - Update `b2a_base64` and `a2b_base64` to use fast path for complete groups Performance gains (encode/decode speedup vs main, PGO builds): ``` 64 bytes 64K 1M Zen2: 1.2x/1.8x 1.7x/2.8x 1.5x/2.8x Zen4: 1.2x/1.7x 1.6x/3.0x 1.5x/3.0x [old data, likely faster] M4: 1.3x/1.9x 2.3x/2.8x 2.4x/2.9x [old data, likely faster] RPi5-32: 1.2x/1.2x 2.4x/2.4x 2.0x/2.1x ``` Based on my exploratory work done in https://github.com/python/cpython/compare/main...gpshead:cpython:claude/vectorize-base64-c-S7Hku See PR and issue for further thoughts on sometimes MUCH faster SIMD vectorized versions of this. --- Doc/whatsnew/3.15.rst | 7 + ...-12-29-00-42-26.gh-issue-124951.OsC5K4.rst | 3 + Modules/binascii.c | 147 +++++++++++++++--- 3 files changed, 134 insertions(+), 23 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 11f08031ec54f2..dc4248655b4b2d 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -428,6 +428,13 @@ argparse inline code when color output is enabled. (Contributed by Savannah Ostrowski in :gh:`142390`.) +base64 & binascii +----------------- + +* CPython's underlying base64 implementation now encodes 2x faster and decodes 3x + faster thanks to simple CPU pipelining optimizations. + (Contributed by Gregory P. Smith & Serhiy Storchaka in :gh:`143262`.) 
+ calendar -------- diff --git a/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst b/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst new file mode 100644 index 00000000000000..10c7f8632d736b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst @@ -0,0 +1,3 @@ +The base64 implementation behind the :mod:`binascii`, :mod:`base64`, and +related codec has been optimized for modern pipelined CPU architectures and +now performs 2-3x faster across all platforms. diff --git a/Modules/binascii.c b/Modules/binascii.c index 13e4bc5be03ebd..a0a2960eef5ab0 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -76,11 +76,12 @@ get_binascii_state(PyObject *module) } -static const unsigned char table_a2b_base64[] = { +/* Align to 64 bytes for L1 cache line friendliness */ +static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = { -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63, - 52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1, 0,-1,-1, /* Note PAD->0 */ + 52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,64,-1,-1, /* PAD->64 detected by fast path */ -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14, 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1, -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40, @@ -101,9 +102,91 @@ static const unsigned char table_a2b_base64[] = { /* Max binary chunk size; limited only by available memory */ #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2) -static const unsigned char table_b2a_base64[] = +/* + * Fast base64 encoding/decoding helpers. + * + * Process complete groups without loop-carried dependencies. + */ + +/* Align to 64 bytes for L1 cache line friendliness */ +static const unsigned char table_b2a_base64[] Py_ALIGNED(64) = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +/* Encode 3 bytes into 4 base64 characters. 
*/ +static inline void +base64_encode_trio(const unsigned char *in, unsigned char *out, + const unsigned char *table) +{ + unsigned int combined = ((unsigned int)in[0] << 16) | + ((unsigned int)in[1] << 8) | + (unsigned int)in[2]; + out[0] = table[(combined >> 18) & 0x3f]; + out[1] = table[(combined >> 12) & 0x3f]; + out[2] = table[(combined >> 6) & 0x3f]; + out[3] = table[combined & 0x3f]; +} + +/* Encode multiple complete 3-byte groups. + * Returns the number of input bytes processed (always a multiple of 3). + */ +static inline Py_ssize_t +base64_encode_fast(const unsigned char *in, Py_ssize_t in_len, + unsigned char *out, const unsigned char *table) +{ + Py_ssize_t n_trios = in_len / 3; + const unsigned char *in_end = in + n_trios * 3; + + while (in < in_end) { + base64_encode_trio(in, out, table); + in += 3; + out += 4; + } + + return n_trios * 3; +} + +/* Decode 4 base64 characters into 3 bytes. + * Returns 1 on success, 0 if any character is invalid. + */ +static inline int +base64_decode_quad(const unsigned char *in, unsigned char *out, + const unsigned char *table) +{ + unsigned char v0 = table[in[0]]; + unsigned char v1 = table[in[1]]; + unsigned char v2 = table[in[2]]; + unsigned char v3 = table[in[3]]; + + if ((v0 | v1 | v2 | v3) & 0xc0) { + return 0; + } + + out[0] = (v0 << 2) | (v1 >> 4); + out[1] = (v1 << 4) | (v2 >> 2); + out[2] = (v2 << 6) | v3; + return 1; +} + +/* Decode multiple complete 4-character groups (no padding allowed). + * Returns the number of input characters processed. + * Stops at the first invalid character, padding, or incomplete group. 
+ */ +static inline Py_ssize_t +base64_decode_fast(const unsigned char *in, Py_ssize_t in_len, + unsigned char *out, const unsigned char *table) +{ + Py_ssize_t n_quads = in_len / 4; + Py_ssize_t i; + + for (i = 0; i < n_quads; i++) { + if (!base64_decode_quad(in + i * 4, out + i * 3, table)) { + break; + } + } + + return i * 4; +} + static const unsigned short crctab_hqx[256] = { 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, @@ -403,10 +486,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) goto error_end; } + size_t i = 0; /* Current position in input */ + + /* Fast path: use optimized decoder for complete quads. + * This works for both strict and non-strict mode for valid input. + * The fast path stops at padding, invalid chars, or incomplete groups. + */ + if (ascii_len >= 4) { + Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len, + bin_data, table_a2b_base64); + if (fast_chars > 0) { + i = (size_t)fast_chars; + bin_data += (fast_chars / 4) * 3; + } + } + + /* Slow path: handle remaining input (padding, invalid chars, partial groups) */ int quad_pos = 0; unsigned char leftchar = 0; int pads = 0; - for (size_t i = 0; i < ascii_len; i++) { + for (; i < ascii_len; i++) { unsigned char this_ch = ascii_data[i]; /* Check for pad sequences and ignore @@ -533,9 +632,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline) /*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/ { const unsigned char *bin_data; - int leftbits = 0; - unsigned char this_ch; - unsigned int leftchar = 0; Py_ssize_t bin_len; binascii_state *state; @@ -566,26 +662,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline) } unsigned char *ascii_data = PyBytesWriter_GetData(writer); - for( ; bin_len > 0 ; bin_len--, bin_data++ ) { - /* Shift the data into our buffer */ - leftchar = (leftchar << 8) | *bin_data; - leftbits += 8; - - /* See if there are 6-bit 
groups ready */ - while ( leftbits >= 6 ) { - this_ch = (leftchar >> (leftbits-6)) & 0x3f; - leftbits -= 6; - *ascii_data++ = table_b2a_base64[this_ch]; - } - } - if ( leftbits == 2 ) { - *ascii_data++ = table_b2a_base64[(leftchar&3) << 4]; + /* Use the optimized fast path for complete 3-byte groups */ + Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data, + table_b2a_base64); + bin_data += fast_bytes; + ascii_data += (fast_bytes / 3) * 4; + bin_len -= fast_bytes; + + /* Handle remaining 0-2 bytes */ + if (bin_len == 1) { + /* 1 byte remaining: produces 2 base64 chars + 2 padding */ + unsigned int val = bin_data[0]; + *ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f]; + *ascii_data++ = table_b2a_base64[(val << 4) & 0x3f]; *ascii_data++ = BASE64_PAD; *ascii_data++ = BASE64_PAD; - } else if ( leftbits == 4 ) { - *ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2]; + } + else if (bin_len == 2) { + /* 2 bytes remaining: produces 3 base64 chars + 1 padding */ + unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1]; + *ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f]; + *ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f]; + *ascii_data++ = table_b2a_base64[(val << 2) & 0x3f]; *ascii_data++ = BASE64_PAD; } + if (newline) *ascii_data++ = '\n'; /* Append a courtesy newline */ From 9712dc1d9eb03ffa96ed746d20bd43239d251ba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 2 Jan 2026 10:51:05 +0100 Subject: [PATCH 2/2] gh-143310: fix crash in Tcl object conversion with concurrent mutations (#143321) --- Lib/test/test_tcl.py | 24 ++++++- ...-01-01-11-21-57.gh-issue-143310.8rxtH3.rst | 3 + Modules/_tkinter.c | 63 +++++++++++++------ 3 files changed, 68 insertions(+), 22 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-01-01-11-21-57.gh-issue-143310.8rxtH3.rst diff --git a/Lib/test/test_tcl.py b/Lib/test/test_tcl.py index d479f7d7515d9b..ef281f6d1fe53a 
100644 --- a/Lib/test/test_tcl.py +++ b/Lib/test/test_tcl.py @@ -40,6 +40,9 @@ def setUp(self): self.interp = Tcl() self.wantobjects = self.interp.tk.wantobjects() + def passValue(self, value): + return self.interp.call('set', '_', value) + def testEval(self): tcl = self.interp tcl.eval('set a 1') @@ -490,8 +493,7 @@ def test_expr_bignum(self): self.assertIsInstance(result, str) def test_passing_values(self): - def passValue(value): - return self.interp.call('set', '_', value) + passValue = self.passValue self.assertEqual(passValue(True), True if self.wantobjects else '1') self.assertEqual(passValue(False), False if self.wantobjects else '0') @@ -537,6 +539,24 @@ def passValue(value): self.assertEqual(passValue(['a', ['b', 'c']]), ('a', ('b', 'c')) if self.wantobjects else 'a {b c}') + def test_set_object_concurrent_mutation_in_sequence_conversion(self): + # Prevent SIGSEGV when the object to convert is concurrently mutated. + # See: https://github.com/python/cpython/issues/143310. + + string = "value" + + class Value: + def __str__(self): + values.clear() + return string + + class List(list): + pass + + expect = (string, "pad") if self.wantobjects else f"{string} pad" + self.assertEqual(self.passValue(values := [Value(), "pad"]), expect) + self.assertEqual(self.passValue(values := List([Value(), "pad"])), expect) + def test_user_command(self): result = None def testfunc(arg): diff --git a/Misc/NEWS.d/next/Library/2026-01-01-11-21-57.gh-issue-143310.8rxtH3.rst b/Misc/NEWS.d/next/Library/2026-01-01-11-21-57.gh-issue-143310.8rxtH3.rst new file mode 100644 index 00000000000000..32d4862179d587 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-01-01-11-21-57.gh-issue-143310.8rxtH3.rst @@ -0,0 +1,3 @@ +:mod:`tkinter`: fix a crash when a Python :class:`list` is mutated during +the conversion to a Tcl object (e.g., when setting a Tcl variable). +Patch by Bénédikt Tran.
diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c index 8cea7b59fe730e..e6c0c1ff46ce52 100644 --- a/Modules/_tkinter.c +++ b/Modules/_tkinter.c @@ -947,6 +947,40 @@ asBignumObj(PyObject *value) return result; } +static Tcl_Obj* AsObj(PyObject *value); + +static Tcl_Obj* +TupleAsObj(PyObject *value, int wrapped) +{ + Tcl_Obj *result = NULL; + Py_ssize_t size = PyTuple_GET_SIZE(value); + if (size == 0) { + return Tcl_NewListObj(0, NULL); + } + if (!CHECK_SIZE(size, sizeof(Tcl_Obj *))) { + PyErr_SetString(PyExc_OverflowError, + wrapped ? "list is too long" : "tuple is too long"); + return NULL; + } + Tcl_Obj **argv = (Tcl_Obj **)PyMem_Malloc(((size_t)size) * sizeof(Tcl_Obj *)); + if (argv == NULL) { + PyErr_NoMemory(); + return NULL; + } + for (Py_ssize_t i = 0; i < size; i++) { + Tcl_Obj *item = AsObj(PyTuple_GET_ITEM(value, i)); + if (item == NULL) { + goto exit; + } + argv[i] = item; + } + result = Tcl_NewListObj((int)size, argv); + +exit: + PyMem_Free(argv); + return result; +} + static Tcl_Obj* AsObj(PyObject *value) { @@ -993,28 +1027,17 @@ AsObj(PyObject *value) if (PyFloat_Check(value)) return Tcl_NewDoubleObj(PyFloat_AS_DOUBLE(value)); - if (PyTuple_Check(value) || PyList_Check(value)) { - Tcl_Obj **argv; - Py_ssize_t size, i; - - size = PySequence_Fast_GET_SIZE(value); - if (size == 0) - return Tcl_NewListObj(0, NULL); - if (!CHECK_SIZE(size, sizeof(Tcl_Obj *))) { - PyErr_SetString(PyExc_OverflowError, - PyTuple_Check(value) ? 
"tuple is too long" : - "list is too long"); + if (PyTuple_Check(value)) { + return TupleAsObj(value, false); + } + + if (PyList_Check(value)) { + PyObject *value_as_tuple = PyList_AsTuple(value); + if (value_as_tuple == NULL) { return NULL; } - argv = (Tcl_Obj **) PyMem_Malloc(((size_t)size) * sizeof(Tcl_Obj *)); - if (!argv) { - PyErr_NoMemory(); - return NULL; - } - for (i = 0; i < size; i++) - argv[i] = AsObj(PySequence_Fast_GET_ITEM(value,i)); - result = Tcl_NewListObj((int)size, argv); - PyMem_Free(argv); + result = TupleAsObj(value_as_tuple, true); + Py_DECREF(value_as_tuple); return result; }