From 0e2cdd313ba5c67c5e2e21d993399b890e687c63 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+AA-Turner@users.noreply.github.com> Date: Tue, 7 Oct 2025 13:29:18 +0100 Subject: [PATCH 1/5] gh-139436: Remove ``dist-pdf`` from the docs archives rebuild target (#139437) --- Doc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/Makefile b/Doc/Makefile index 84578c5c57f478..f6f4c721080c42 100644 --- a/Doc/Makefile +++ b/Doc/Makefile @@ -184,7 +184,7 @@ venv: fi .PHONY: dist-no-html -dist-no-html: dist-text dist-pdf dist-epub dist-texinfo +dist-no-html: dist-text dist-epub dist-texinfo .PHONY: dist dist: From 7094f09f547dc1a520c666dc6ce11b7b69a1b8da Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Tue, 7 Oct 2025 14:04:37 +0100 Subject: [PATCH 2/5] GH-139291: Fix C stack limits by factoring out finding hardware stack limits (GH-139294) --- Python/ceval.c | 60 +++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 0ccaacaf3ed5b1..1b52128c858ecb 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -438,31 +438,26 @@ int pthread_attr_destroy(pthread_attr_t *a) #endif - -void -_Py_InitializeRecursionLimits(PyThreadState *tstate) +static void +hardware_stack_limits(uintptr_t *top, uintptr_t *base) { - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; #ifdef WIN32 ULONG_PTR low, high; GetCurrentThreadStackLimits(&low, &high); - _tstate->c_stack_top = (uintptr_t)high; + *top = (uintptr_t)high; ULONG guarantee = 0; SetThreadStackGuarantee(&guarantee); - _tstate->c_stack_hard_limit = ((uintptr_t)low) + guarantee + _PyOS_STACK_MARGIN_BYTES; - _tstate->c_stack_soft_limit = _tstate->c_stack_hard_limit + _PyOS_STACK_MARGIN_BYTES; + *base = (uintptr_t)low + guarantee; #elif defined(__APPLE__) pthread_t this_thread = pthread_self(); void *stack_addr = pthread_get_stackaddr_np(this_thread); // top of the stack size_t stack_size = pthread_get_stacksize_np(this_thread); - _tstate->c_stack_top = (uintptr_t)stack_addr; - _tstate->c_stack_hard_limit = _tstate->c_stack_top - stack_size; - _tstate->c_stack_soft_limit = _tstate->c_stack_hard_limit + _PyOS_STACK_MARGIN_BYTES; + *top = (uintptr_t)stack_addr; + *base = ((uintptr_t)stack_addr) - stack_size; #else - uintptr_t here_addr = _Py_get_machine_stack_pointer(); -/// XXX musl supports HAVE_PTHRED_GETATTR_NP, but the resulting stack size -/// (on alpine at least) is much smaller than expected and imposes undue limits -/// compared to the old stack size estimation. (We assume musl is not glibc.) + /// XXX musl supports HAVE_PTHRED_GETATTR_NP, but the resulting stack size + /// (on alpine at least) is much smaller than expected and imposes undue limits + /// compared to the old stack size estimation. (We assume musl is not glibc.) # if defined(HAVE_PTHREAD_GETATTR_NP) && !defined(_AIX) && \ !defined(__NetBSD__) && (defined(__GLIBC__) || !defined(__linux__)) size_t stack_size, guard_size; @@ -475,26 +470,35 @@ _Py_InitializeRecursionLimits(PyThreadState *tstate) err |= pthread_attr_destroy(&attr); } if (err == 0) { - uintptr_t base = ((uintptr_t)stack_addr) + guard_size; - _tstate->c_stack_top = base + stack_size; -#ifdef _Py_THREAD_SANITIZER - // Thread sanitizer crashes if we use a bit more than half the stack. 
- _tstate->c_stack_soft_limit = base + (stack_size / 2); -#else - _tstate->c_stack_soft_limit = base + _PyOS_STACK_MARGIN_BYTES * 2; -#endif - _tstate->c_stack_hard_limit = base + _PyOS_STACK_MARGIN_BYTES; - assert(_tstate->c_stack_soft_limit < here_addr); - assert(here_addr < _tstate->c_stack_top); + *base = ((uintptr_t)stack_addr) + guard_size; + *top = (uintptr_t)stack_addr + stack_size; return; } # endif - _tstate->c_stack_top = _Py_SIZE_ROUND_UP(here_addr, 4096); - _tstate->c_stack_soft_limit = _tstate->c_stack_top - Py_C_STACK_SIZE; - _tstate->c_stack_hard_limit = _tstate->c_stack_top - (Py_C_STACK_SIZE + _PyOS_STACK_MARGIN_BYTES); + uintptr_t here_addr = _Py_get_machine_stack_pointer(); + uintptr_t top_addr = _Py_SIZE_ROUND_UP(here_addr, 4096); + *top = top_addr; + *base = top_addr - Py_C_STACK_SIZE; #endif } +void +_Py_InitializeRecursionLimits(PyThreadState *tstate) +{ + uintptr_t top; + uintptr_t base; + hardware_stack_limits(&top, &base); +#ifdef _Py_THREAD_SANITIZER + // Thread sanitizer crashes if we use more than half the stack. + uintptr_t stacksize = top - base; + base += stacksize/2; +#endif + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; + _tstate->c_stack_top = top; + _tstate->c_stack_hard_limit = base + _PyOS_STACK_MARGIN_BYTES; + _tstate->c_stack_soft_limit = base + _PyOS_STACK_MARGIN_BYTES * 2; +} + /* The function _Py_EnterRecursiveCallTstate() only calls _Py_CheckRecursiveCall() if the recursion_depth reaches recursion_limit. */ int From 96c59a6e427fab32d0bca89b77febca8cba8aada Mon Sep 17 00:00:00 2001 From: danigm Date: Tue, 7 Oct 2025 16:54:31 +0200 Subject: [PATCH 3/5] gh-138497: Support LLVM_VERSION configuration via env (#138498) Co-authored-by: Savannah Ostrowski --- ...-09-04-12-16-31.gh-issue-138497.Y_5YXh.rst | 4 ++ Tools/jit/README.md | 2 +- Tools/jit/_llvm.py | 49 ++++++++++++------- Tools/jit/_targets.py | 19 +++++-- Tools/jit/build.py | 3 ++ configure | 2 +- configure.ac | 2 +- 7 files changed, 56 insertions(+), 25 deletions(-) create mode 100644 Misc/NEWS.d/next/Build/2025-09-04-12-16-31.gh-issue-138497.Y_5YXh.rst diff --git a/Misc/NEWS.d/next/Build/2025-09-04-12-16-31.gh-issue-138497.Y_5YXh.rst b/Misc/NEWS.d/next/Build/2025-09-04-12-16-31.gh-issue-138497.Y_5YXh.rst new file mode 100644 index 00000000000000..7eb0770996877a --- /dev/null +++ b/Misc/NEWS.d/next/Build/2025-09-04-12-16-31.gh-issue-138497.Y_5YXh.rst @@ -0,0 +1,4 @@ +The LLVM version used by the JIT at build time can now be modified using +the ``LLVM_VERSION`` environment variable. Use this at your own risk, as +there is only one officially supported LLVM version. For more information, +please check ``Tools/jit/README.md``. diff --git a/Tools/jit/README.md b/Tools/jit/README.md index ffc762d3828bfb..35c7ffd7a283f8 100644 --- a/Tools/jit/README.md +++ b/Tools/jit/README.md @@ -9,7 +9,7 @@ Python 3.11 or newer is required to build the JIT. The JIT compiler does not require end users to install any third-party dependencies, but part of it must be *built* using LLVM[^why-llvm]. You are *not* required to build the rest of CPython using LLVM, or even the same version of LLVM (in fact, this is uncommon). -LLVM version 19 is required. Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-19`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code. +LLVM version 19 is the officially supported version. 
You can modify if needed using the `LLVM_VERSION` env var during configure. Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-19`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code. It's easy to install all of the required tools: diff --git a/Tools/jit/_llvm.py b/Tools/jit/_llvm.py index f09a8404871b24..bc3b50ffe61634 100644 --- a/Tools/jit/_llvm.py +++ b/Tools/jit/_llvm.py @@ -10,8 +10,8 @@ import _targets -_LLVM_VERSION = 19 -_LLVM_VERSION_PATTERN = re.compile(rf"version\s+{_LLVM_VERSION}\.\d+\.\d+\S*\s+") + +_LLVM_VERSION = "19" _EXTERNALS_LLVM_TAG = "llvm-19.1.7.0" _P = typing.ParamSpec("_P") @@ -56,53 +56,66 @@ async def _run(tool: str, args: typing.Iterable[str], echo: bool = False) -> str @_async_cache -async def _check_tool_version(name: str, *, echo: bool = False) -> bool: +async def _check_tool_version( + name: str, llvm_version: str, *, echo: bool = False +) -> bool: output = await _run(name, ["--version"], echo=echo) - return bool(output and _LLVM_VERSION_PATTERN.search(output)) + _llvm_version_pattern = re.compile(rf"version\s+{llvm_version}\.\d+\.\d+\S*\s+") + return bool(output and _llvm_version_pattern.search(output)) @_async_cache -async def _get_brew_llvm_prefix(*, echo: bool = False) -> str | None: - output = await _run("brew", ["--prefix", f"llvm@{_LLVM_VERSION}"], echo=echo) +async def _get_brew_llvm_prefix(llvm_version: str, *, echo: bool = False) -> str | None: + output = await _run("brew", ["--prefix", f"llvm@{llvm_version}"], echo=echo) return output and output.removesuffix("\n") @_async_cache -async def _find_tool(tool: str, *, echo: bool = False) -> str | None: +async def _find_tool(tool: str, llvm_version: str, *, echo: bool = False) -> str | None: # Unversioned executables: path = tool - if await _check_tool_version(path, echo=echo): + if await _check_tool_version(path, llvm_version, echo=echo): return path # Versioned executables: - path = f"{tool}-{_LLVM_VERSION}" - if await _check_tool_version(path, echo=echo): + path = f"{tool}-{llvm_version}" + if await _check_tool_version(path, llvm_version, echo=echo): return path # PCbuild externals: externals = os.environ.get("EXTERNALS_DIR", _targets.EXTERNALS) path = os.path.join(externals, _EXTERNALS_LLVM_TAG, "bin", tool) - if await _check_tool_version(path, echo=echo): + if await _check_tool_version(path, llvm_version, echo=echo): return path # Homebrew-installed executables: - prefix = await _get_brew_llvm_prefix(echo=echo) + prefix = await _get_brew_llvm_prefix(llvm_version, echo=echo) if prefix is not None: path = os.path.join(prefix, "bin", tool) - if await _check_tool_version(path, echo=echo): + if await _check_tool_version(path, llvm_version, echo=echo): return path # Nothing found: return None async def maybe_run( - tool: str, args: typing.Iterable[str], echo: bool = False + tool: str, + args: typing.Iterable[str], + echo: bool = False, + llvm_version: str = _LLVM_VERSION, ) -> str | None: """Run an LLVM tool if it can be found. 
Otherwise, return None.""" - path = await _find_tool(tool, echo=echo) + + path = await _find_tool(tool, llvm_version, echo=echo) return path and await _run(path, args, echo=echo) -async def run(tool: str, args: typing.Iterable[str], echo: bool = False) -> str: +async def run( + tool: str, + args: typing.Iterable[str], + echo: bool = False, + llvm_version: str = _LLVM_VERSION, +) -> str: """Run an LLVM tool if it can be found. Otherwise, raise RuntimeError.""" - output = await maybe_run(tool, args, echo=echo) + + output = await maybe_run(tool, args, echo=echo, llvm_version=llvm_version) if output is None: - raise RuntimeError(f"Can't find {tool}-{_LLVM_VERSION}!") + raise RuntimeError(f"Can't find {tool}-{llvm_version}!") return output diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 2f3969e7d0540c..9fc3522d23d982 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -50,6 +50,7 @@ class _Target(typing.Generic[_S, _R]): debug: bool = False verbose: bool = False cflags: str = "" + llvm_version: str = _llvm._LLVM_VERSION known_symbols: dict[str, int] = dataclasses.field(default_factory=dict) pyconfig_dir: pathlib.Path = pathlib.Path.cwd().resolve() @@ -81,7 +82,9 @@ def _compute_digest(self) -> str: async def _parse(self, path: pathlib.Path) -> _stencils.StencilGroup: group = _stencils.StencilGroup() args = ["--disassemble", "--reloc", f"{path}"] - output = await _llvm.maybe_run("llvm-objdump", args, echo=self.verbose) + output = await _llvm.maybe_run( + "llvm-objdump", args, echo=self.verbose, llvm_version=self.llvm_version + ) if output is not None: # Make sure that full paths don't leak out (for reproducibility): long, short = str(path), str(path.name) @@ -99,7 +102,9 @@ async def _parse(self, path: pathlib.Path) -> _stencils.StencilGroup: "--sections", f"{path}", ] - output = await _llvm.run("llvm-readobj", args, echo=self.verbose) + output = await _llvm.run( + "llvm-readobj", args, echo=self.verbose, llvm_version=self.llvm_version + ) # --elf-output-style=JSON is only *slightly* broken on Mach-O... output = output.replace("PrivateExtern\n", "\n") output = output.replace("Extern\n", "\n") @@ -175,12 +180,16 @@ async def _compile( # Allow user-provided CFLAGS to override any defaults *shlex.split(self.cflags), ] - await _llvm.run("clang", args_s, echo=self.verbose) + await _llvm.run( + "clang", args_s, echo=self.verbose, llvm_version=self.llvm_version + ) self.optimizer( s, label_prefix=self.label_prefix, symbol_prefix=self.symbol_prefix ).run() args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"] - await _llvm.run("clang", args_o, echo=self.verbose) + await _llvm.run( + "clang", args_o, echo=self.verbose, llvm_version=self.llvm_version + ) return await self._parse(o) async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: @@ -224,6 +233,8 @@ def build( if not self.stable: warning = f"JIT support for {self.triple} is still experimental!" request = "Please report any issues you encounter.".center(len(warning)) + if self.llvm_version != _llvm._LLVM_VERSION: + request = f"Warning! Building with an LLVM version other than {_llvm._LLVM_VERSION} is not supported." 
outline = "=" * len(warning) print("\n".join(["", outline, warning, request, outline, ""])) digest = f"// {self._compute_digest()}\n" diff --git a/Tools/jit/build.py b/Tools/jit/build.py index a0733005929bf2..127d93b317fb09 100644 --- a/Tools/jit/build.py +++ b/Tools/jit/build.py @@ -42,6 +42,7 @@ parser.add_argument( "--cflags", help="additional flags to pass to the compiler", default="" ) + parser.add_argument("--llvm-version", help="LLVM version to use") args = parser.parse_args() for target in args.target: target.debug = args.debug @@ -49,6 +50,8 @@ target.verbose = args.verbose target.cflags = args.cflags target.pyconfig_dir = args.pyconfig_dir + if args.llvm_version: + target.llvm_version = args.llvm_version target.build( comment=comment, force=args.force, diff --git a/configure b/configure index 0d1f6a29e9b432..d80340e3015bee 100755 --- a/configure +++ b/configure @@ -10875,7 +10875,7 @@ then : else case e in #( e) as_fn_append CFLAGS_NODIST " $jit_flags" - REGEN_JIT_COMMAND="\$(PYTHON_FOR_REGEN) \$(srcdir)/Tools/jit/build.py ${ARCH_TRIPLES:-$host} --output-dir . --pyconfig-dir . --cflags=\"$CFLAGS_JIT\"" + REGEN_JIT_COMMAND="\$(PYTHON_FOR_REGEN) \$(srcdir)/Tools/jit/build.py ${ARCH_TRIPLES:-$host} --output-dir . --pyconfig-dir . --cflags=\"$CFLAGS_JIT\" --llvm-version=\"$LLVM_VERSION\"" if test "x$Py_DEBUG" = xtrue then : as_fn_append REGEN_JIT_COMMAND " --debug" diff --git a/configure.ac b/configure.ac index 7b5da6e0d15682..1e0c0f71b7c281 100644 --- a/configure.ac +++ b/configure.ac @@ -2786,7 +2786,7 @@ AS_VAR_IF([jit_flags], [], [AS_VAR_APPEND([CFLAGS_NODIST], [" $jit_flags"]) AS_VAR_SET([REGEN_JIT_COMMAND], - ["\$(PYTHON_FOR_REGEN) \$(srcdir)/Tools/jit/build.py ${ARCH_TRIPLES:-$host} --output-dir . --pyconfig-dir . --cflags=\"$CFLAGS_JIT\""]) + ["\$(PYTHON_FOR_REGEN) \$(srcdir)/Tools/jit/build.py ${ARCH_TRIPLES:-$host} --output-dir . --pyconfig-dir . 
--cflags=\"$CFLAGS_JIT\" --llvm-version=\"$LLVM_VERSION\""]) AS_VAR_IF([Py_DEBUG], [true], [AS_VAR_APPEND([REGEN_JIT_COMMAND], [" --debug"])], From 539461d9ec8e5322ead638f7be733fd196aa6c79 Mon Sep 17 00:00:00 2001 From: Tomasz Pytel Date: Tue, 7 Oct 2025 12:28:15 -0400 Subject: [PATCH 4/5] gh-139516: Fix lambda colon start format spec in f-string in tokenizer (#139657) --- Lib/test/test_fstring.py | 7 +++++++ Lib/test/test_tokenize.py | 17 +++++++++++++++++ ...25-10-06-13-15-26.gh-issue-139516.d9Pkur.rst | 1 + Parser/lexer/lexer.c | 2 +- Parser/lexer/state.h | 2 ++ 5 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-10-06-13-15-26.gh-issue-139516.d9Pkur.rst diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index 41cefe0e286d50..05d0cbd2445c4c 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1859,6 +1859,13 @@ def __format__(self, format): # Test multiple format specs in same raw f-string self.assertEqual(rf"{UnchangedFormat():\xFF} {UnchangedFormat():\n}", '\\xFF \\n') + def test_gh139516(self): + with temp_cwd(): + script = 'script.py' + with open(script, 'wb') as f: + f.write('''def f(a): pass\nf"{f(a=lambda: 'à'\n)}"'''.encode()) + assert_python_ok(script) + if __name__ == '__main__': unittest.main() diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index d90a7659c4237c..8fdd03f347b632 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1216,6 +1216,23 @@ def test_multiline_non_ascii_fstring_with_expr(self): FSTRING_END "\'\'\'" (3, 1) (3, 4) """) + # gh-139516, the '\n' is explicit to ensure no trailing whitespace which would invalidate the test + self.check_tokenize('''f"{f(a=lambda: 'à'\n)}"''', """\ + FSTRING_START \'f"\' (1, 0) (1, 2) + OP '{' (1, 2) (1, 3) + NAME 'f' (1, 3) (1, 4) + OP '(' (1, 4) (1, 5) + NAME 'a' (1, 5) (1, 6) + OP '=' (1, 6) (1, 7) + NAME 'lambda' (1, 7) (1, 13) + OP ':' (1, 13) (1, 14) + STRING "\'à\'" (1, 15) (1, 18) + NL '\\n' (1, 18) (1, 19) + OP ')' (2, 0) (2, 1) + OP '}' (2, 1) (2, 2) + FSTRING_END \'"\' (2, 2) (2, 3) + """) + class GenerateTokensTest(TokenizeTest): def check_tokenize(self, s, expected): # Format the tokens in s in a table format. diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-06-13-15-26.gh-issue-139516.d9Pkur.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-06-13-15-26.gh-issue-139516.d9Pkur.rst new file mode 100644 index 00000000000000..a709112306025f --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-06-13-15-26.gh-issue-139516.d9Pkur.rst @@ -0,0 +1 @@ +Fix lambda colon erroneously start format spec in f-string in tokenizer. 
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 81363cf8e810fe..a69994e9b3d005 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -1376,7 +1376,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c)); } - if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) { + if( c == '=' && INSIDE_FSTRING_EXPR_AT_TOP(current_tok)) { current_tok->in_debug = 1; } diff --git a/Parser/lexer/state.h b/Parser/lexer/state.h index 5e8cac7249b21c..877127125a7652 100644 --- a/Parser/lexer/state.h +++ b/Parser/lexer/state.h @@ -9,6 +9,8 @@ #define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0) #define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0) +#define INSIDE_FSTRING_EXPR_AT_TOP(tok) \ + (tok->curly_bracket_depth - tok->curly_bracket_expr_start_depth == 1) enum decoding_state { STATE_INIT, From 162997bb70e067668c039700141770687bc8f267 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 7 Oct 2025 20:15:26 +0300 Subject: [PATCH 5/5] gh-139700: Check consistency of the zip64 end of central directory record (GH-139702) Support records with "zip64 extensible data" if there are no bytes prepended to the ZIP file. --- Lib/test/test_zipfile/test_core.py | 82 ++++++++++++++++++- Lib/zipfile/__init__.py | 51 +++++++----- ...-10-07-19-31-34.gh-issue-139700.vNHU1O.rst | 3 + 3 files changed, 113 insertions(+), 23 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2025-10-07-19-31-34.gh-issue-139700.vNHU1O.rst diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index c033059a515db6..6acfefc74d6665 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -898,6 +898,8 @@ def make_zip64_file( self, file_size_64_set=False, file_size_extra=False, compress_size_64_set=False, compress_size_extra=False, header_offset_64_set=False, header_offset_extra=False, + extensible_data=b'', + end_of_central_dir_size=None, offset_to_end_of_central_dir=None, ): """Generate bytes sequence for a zip with (incomplete) zip64 data. @@ -951,6 +953,12 @@ def make_zip64_file( central_dir_size = struct.pack(' 2: inferred = concat + offset_cd @@ -289,16 +286,15 @@ def _EndRecData64(fpin, offset, endrec): """ Read the ZIP64 end-of-archive records and use that to update endrec """ - try: - fpin.seek(offset - sizeEndCentDir64Locator, 2) - except OSError: - # If the seek fails, the file is not large enough to contain a ZIP64 + offset -= sizeEndCentDir64Locator + if offset < 0: + # The file is not large enough to contain a ZIP64 # end-of-archive record, so just return the end record we were given. return endrec - + fpin.seek(offset) data = fpin.read(sizeEndCentDir64Locator) if len(data) != sizeEndCentDir64Locator: - return endrec + raise OSError("Unknown I/O error") sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) if sig != stringEndArchive64Locator: return endrec @@ -306,16 +302,33 @@ def _EndRecData64(fpin, offset, endrec): if diskno != 0 or disks > 1: raise BadZipFile("zipfiles that span multiple disks are not supported") - # Assume no 'zip64 extensible data' - fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) + offset -= sizeEndCentDir64 + if reloff > offset: + raise BadZipFile("Corrupt zip64 end of central directory locator") + # First, check the assumption that there is no prepended data. 
+ fpin.seek(reloff) + extrasz = offset - reloff data = fpin.read(sizeEndCentDir64) if len(data) != sizeEndCentDir64: - return endrec + raise OSError("Unknown I/O error") + if not data.startswith(stringEndArchive64) and reloff != offset: + # Since we already have seen the Zip64 EOCD Locator, it's + # possible we got here because there is prepended data. + # Assume no 'zip64 extensible data' + fpin.seek(offset) + extrasz = 0 + data = fpin.read(sizeEndCentDir64) + if len(data) != sizeEndCentDir64: + raise OSError("Unknown I/O error") + if not data.startswith(stringEndArchive64): + raise BadZipFile("Zip64 end of central directory record not found") + sig, sz, create_version, read_version, disk_num, disk_dir, \ dircount, dircount2, dirsize, diroffset = \ struct.unpack(structEndArchive64, data) - if sig != stringEndArchive64: - return endrec + if (diroffset + dirsize != reloff or + sz + 12 != sizeEndCentDir64 + extrasz): + raise BadZipFile("Corrupt zip64 end of central directory record") # Update the original endrec using data from the ZIP64 record endrec[_ECD_SIGNATURE] = sig @@ -325,6 +338,7 @@ def _EndRecData64(fpin, offset, endrec): endrec[_ECD_ENTRIES_TOTAL] = dircount2 endrec[_ECD_SIZE] = dirsize endrec[_ECD_OFFSET] = diroffset + endrec[_ECD_LOCATION] = offset - extrasz return endrec @@ -358,7 +372,7 @@ def _EndRecData(fpin): endrec.append(filesize - sizeEndCentDir) # Try to read the "Zip64 end of central directory" structure - return _EndRecData64(fpin, -sizeEndCentDir, endrec) + return _EndRecData64(fpin, filesize - sizeEndCentDir, endrec) # Either this is not a ZIP file, or it is a ZIP file with an archive # comment. Search the end of the file for the "end of central directory" @@ -382,8 +396,7 @@ def _EndRecData(fpin): endrec.append(maxCommentStart + start) # Try to read the "Zip64 end of central directory" structure - return _EndRecData64(fpin, maxCommentStart + start - filesize, - endrec) + return _EndRecData64(fpin, maxCommentStart + start, endrec) # Unable to find a valid end of central directory structure return None @@ -2142,7 +2155,7 @@ def _write_end_record(self): " would require ZIP64 extensions") zip64endrec = struct.pack( structEndArchive64, stringEndArchive64, - 44, 45, 45, 0, 0, centDirCount, centDirCount, + sizeEndCentDir64 - 12, 45, 45, 0, 0, centDirCount, centDirCount, centDirSize, centDirOffset) self.fp.write(zip64endrec) diff --git a/Misc/NEWS.d/next/Security/2025-10-07-19-31-34.gh-issue-139700.vNHU1O.rst b/Misc/NEWS.d/next/Security/2025-10-07-19-31-34.gh-issue-139700.vNHU1O.rst new file mode 100644 index 00000000000000..a8e7a1f1878c6b --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-10-07-19-31-34.gh-issue-139700.vNHU1O.rst @@ -0,0 +1,3 @@ +Check consistency of the zip64 end of central directory record. Support +records with "zip64 extensible data" if there are no bytes prepended to the +ZIP file.
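
For reference (not part of the patch), here is a minimal sketch of the two consistency conditions the reworked _EndRecData64() now enforces, using zipfile's module-level names and purely illustrative numbers: the zip64 end-of-central-directory record must begin exactly where the central directory ends, and its stored size field (which, per the ZIP specification, excludes the 12-byte signature-plus-size prefix) must account for any "zip64 extensible data" (extrasz) sitting between the record and its locator.

    import struct

    # Layout of the zip64 end-of-central-directory (EOCD64) record as zipfile
    # defines it; the numeric field values below are illustrative only.
    structEndArchive64 = "<4sQ2H2L4Q"
    stringEndArchive64 = b"PK\x06\x06"
    sizeEndCentDir64 = struct.calcsize(structEndArchive64)  # 56 bytes

    dirsize, diroffset, count = 98, 4096, 3
    record = struct.pack(
        structEndArchive64, stringEndArchive64,
        sizeEndCentDir64 - 12,   # stored size excludes signature + size field
        45, 45, 0, 0, count, count, dirsize, diroffset)

    reloff = diroffset + dirsize   # where the EOCD64 record must begin
    extrasz = 0                    # length of optional zip64 extensible data
    sig, sz, *_mid, dirsize2, diroffset2 = struct.unpack(structEndArchive64, record)
    assert diroffset2 + dirsize2 == reloff        # central directory ends at the record
    assert sz + 12 == sizeEndCentDir64 + extrasz  # size field is self-consistent

If either condition fails for a real archive, the new code raises BadZipFile ("Corrupt zip64 end of central directory record") instead of silently falling back to the 32-bit end record as before.
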