diff --git a/Doc/library/http.cookiejar.rst b/Doc/library/http.cookiejar.rst index 251aea891c3f98..fcb0069b760e59 100644 --- a/Doc/library/http.cookiejar.rst +++ b/Doc/library/http.cookiejar.rst @@ -12,7 +12,7 @@ -------------- The :mod:`http.cookiejar` module defines classes for automatic handling of HTTP -cookies. It is useful for accessing web sites that require small pieces of data +cookies. It is useful for accessing websites that require small pieces of data -- :dfn:`cookies` -- to be set on the client machine by an HTTP response from a web server, and then returned to the server in later HTTP requests. diff --git a/Doc/library/urllib.robotparser.rst b/Doc/library/urllib.robotparser.rst index 016fcdc75da67a..674f646c633bbd 100644 --- a/Doc/library/urllib.robotparser.rst +++ b/Doc/library/urllib.robotparser.rst @@ -19,7 +19,7 @@ This module provides a single class, :class:`RobotFileParser`, which answers questions about whether or not a particular user agent can fetch a URL on the -web site that published the :file:`robots.txt` file. For more details on the +website that published the :file:`robots.txt` file. For more details on the structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html. diff --git a/Doc/tutorial/index.rst b/Doc/tutorial/index.rst index d0bf77dc40d0a1..20fe161be4acc2 100644 --- a/Doc/tutorial/index.rst +++ b/Doc/tutorial/index.rst @@ -15,7 +15,7 @@ together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms. The Python interpreter and the extensive standard library are freely available -in source or binary form for all major platforms from the Python web site, +in source or binary form for all major platforms from the Python website, https://www.python.org/, and may be freely distributed. The same site also contains distributions of and pointers to many free third party Python modules, programs and tools, and additional documentation. diff --git a/Doc/tutorial/whatnow.rst b/Doc/tutorial/whatnow.rst index dbe2d7fc09927e..359cf80a7b2ecf 100644 --- a/Doc/tutorial/whatnow.rst +++ b/Doc/tutorial/whatnow.rst @@ -30,7 +30,7 @@ the set are: More Python resources: -* https://www.python.org: The major Python web site. It contains code, +* https://www.python.org: The major Python website. It contains code, documentation, and pointers to Python-related pages around the web. * https://docs.python.org: Fast access to Python's documentation. diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 2d7d81d491c157..9748e036bf2874 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -274,8 +274,13 @@ extern void _PyLineTable_InitAddressRange( /** API for traversing the line number table. */ PyAPI_FUNC(int) _PyLineTable_NextAddressRange(PyCodeAddressRange *range); extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range); -// This is used in dump_frame() in traceback.c without an attached tstate. -extern int _PyCode_Addr2LineNoTstate(PyCodeObject *co, int addr); + +// Similar to PyCode_Addr2Line(), but return -1 if the code object is invalid +// and can be called without an attached tstate. Used by dump_frame() in +// Python/traceback.c. The function uses heuristics to detect freed memory, +// it's not 100% reliable. +extern int _PyCode_SafeAddr2Line(PyCodeObject *co, int addr); + /** API for executors */ extern void _PyCode_Clear_Executors(PyCodeObject *code); diff --git a/Include/internal/pycore_interpframe.h b/Include/internal/pycore_interpframe.h index 2ee3696317c727..8949d6cc2fc4bb 100644 --- a/Include/internal/pycore_interpframe.h +++ b/Include/internal/pycore_interpframe.h @@ -24,6 +24,36 @@ static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) { return (PyCodeObject *)executable; } +// Similar to _PyFrame_GetCode(), but return NULL if the frame is invalid or +// freed. Used by dump_frame() in Python/traceback.c. The function uses +// heuristics to detect freed memory, it's not 100% reliable. +static inline PyCodeObject* +_PyFrame_SafeGetCode(_PyInterpreterFrame *f) +{ + // globals and builtins may be NULL on a legit frame, but it's unlikely. + // It's more likely that it's a sign of an invalid frame. + if (f->f_globals == NULL || f->f_builtins == NULL) { + return NULL; + } + + if (PyStackRef_IsNull(f->f_executable)) { + return NULL; + } + void *ptr; + memcpy(&ptr, &f->f_executable, sizeof(f->f_executable)); + if (_PyMem_IsPtrFreed(ptr)) { + return NULL; + } + PyObject *executable = PyStackRef_AsPyObjectBorrow(f->f_executable); + if (_PyObject_IsFreed(executable)) { + return NULL; + } + if (!PyCode_Check(executable)) { + return NULL; + } + return (PyCodeObject *)executable; +} + static inline _Py_CODEUNIT * _PyFrame_GetBytecode(_PyInterpreterFrame *f) { @@ -37,6 +67,31 @@ _PyFrame_GetBytecode(_PyInterpreterFrame *f) #endif } +// Similar to PyUnstable_InterpreterFrame_GetLasti(), but return NULL if the +// frame is invalid or freed. Used by dump_frame() in Python/traceback.c. The +// function uses heuristics to detect freed memory, it's not 100% reliable. +static inline int +_PyFrame_SafeGetLasti(struct _PyInterpreterFrame *f) +{ + // Code based on _PyFrame_GetBytecode() but replace _PyFrame_GetCode() + // with _PyFrame_SafeGetCode(). + PyCodeObject *co = _PyFrame_SafeGetCode(f); + if (co == NULL) { + return -1; + } + + _Py_CODEUNIT *bytecode; +#ifdef Py_GIL_DISABLED + _PyCodeArray *tlbc = _PyCode_GetTLBCArray(co); + assert(f->tlbc_index >= 0 && f->tlbc_index < tlbc->size); + bytecode = (_Py_CODEUNIT *)tlbc->entries[f->tlbc_index]; +#else + bytecode = _PyCode_CODE(co); +#endif + + return (int)(f->instr_ptr - bytecode) * sizeof(_Py_CODEUNIT); +} + static inline PyFunctionObject *_PyFrame_GetFunction(_PyInterpreterFrame *f) { PyObject *func = PyStackRef_AsPyObjectBorrow(f->f_funcobj); assert(PyFunction_Check(func)); diff --git a/Include/internal/pycore_pymem.h b/Include/internal/pycore_pymem.h index f3f2ae0a140828..ed943b510567a4 100644 --- a/Include/internal/pycore_pymem.h +++ b/Include/internal/pycore_pymem.h @@ -54,15 +54,17 @@ static inline int _PyMem_IsPtrFreed(const void *ptr) { uintptr_t value = (uintptr_t)ptr; #if SIZEOF_VOID_P == 8 - return (value == 0 + return (value <= 0xff // NULL, 0x1, 0x2, ..., 0xff || value == (uintptr_t)0xCDCDCDCDCDCDCDCD || value == (uintptr_t)0xDDDDDDDDDDDDDDDD - || value == (uintptr_t)0xFDFDFDFDFDFDFDFD); + || value == (uintptr_t)0xFDFDFDFDFDFDFDFD + || value >= (uintptr_t)0xFFFFFFFFFFFFFF00); // -0xff, ..., -2, -1 #elif SIZEOF_VOID_P == 4 - return (value == 0 + return (value <= 0xff || value == (uintptr_t)0xCDCDCDCD || value == (uintptr_t)0xDDDDDDDD - || value == (uintptr_t)0xFDFDFDFD); + || value == (uintptr_t)0xFDFDFDFD + || value >= (uintptr_t)0xFFFFFF00); #else # error "unknown pointer size" #endif diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index a5316391297f4c..ecec16e9005f3b 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -397,9 +397,12 @@ def __init__(self, lexicon, flags=0): s = _parser.State() s.flags = flags for phrase, action in lexicon: + sub_pattern = _parser.parse(phrase, flags) + if sub_pattern.state.groups != 1: + raise ValueError("Cannot use capturing groups in re.Scanner") gid = s.opengroup() p.append(_parser.SubPattern(s, [ - (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))), + (SUBPATTERN, (gid, 0, 0, sub_pattern)), ])) s.closegroup(gid, p[-1]) p = _parser.SubPattern(s, [(BRANCH, (None, p))]) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 5fc95087f2b6ad..9f6f04bf6b8347 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1639,6 +1639,24 @@ def s_int(scanner, token): return int(token) (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 'op+', 'bar'], '')) + def test_bug_gh140797(self): + # gh140797: Capturing groups are not allowed in re.Scanner + + msg = r"Cannot use capturing groups in re\.Scanner" + # Capturing group throws an error + with self.assertRaisesRegex(ValueError, msg): + Scanner([("(a)b", None)]) + + # Named Group + with self.assertRaisesRegex(ValueError, msg): + Scanner([("(?Pa)", None)]) + + # Non-capturing groups should pass normally + s = Scanner([("(?:a)b", lambda scanner, token: token)]) + result, rem = s.scan("ab") + self.assertEqual(result,['ab']) + self.assertEqual(rem,'') + def test_bug_448951(self): # bug 448951 (similar to 429357, but with single char match) # (Also test greedy matches.) diff --git a/Misc/NEWS.d/next/Library/2025-11-02-19-23-32.gh-issue-140815.McEG-T.rst b/Misc/NEWS.d/next/Library/2025-11-02-19-23-32.gh-issue-140815.McEG-T.rst new file mode 100644 index 00000000000000..18c4d3836efef1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-11-02-19-23-32.gh-issue-140815.McEG-T.rst @@ -0,0 +1,2 @@ +:mod:`faulthandler` now detects if a frame or a code object is invalid or +freed. Patch by Victor Stinner. diff --git a/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst b/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst new file mode 100644 index 00000000000000..493b740261e64c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst @@ -0,0 +1,2 @@ +The undocumented :class:`!re.Scanner` class now forbids regular expressions containing capturing groups in its lexicon patterns. Patterns using capturing groups could +previously lead to crashes with segmentation fault. Use non-capturing groups (?:...) instead. diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 0d264a6e346f95..fc3f5d9dde0bc1 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -1005,8 +1005,8 @@ PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno) * source location tracking (co_lines/co_positions) ******************/ -int -_PyCode_Addr2LineNoTstate(PyCodeObject *co, int addrq) +static int +_PyCode_Addr2Line(PyCodeObject *co, int addrq) { if (addrq < 0) { return co->co_firstlineno; @@ -1020,12 +1020,29 @@ _PyCode_Addr2LineNoTstate(PyCodeObject *co, int addrq) return _PyCode_CheckLineNumber(addrq, &bounds); } +int +_PyCode_SafeAddr2Line(PyCodeObject *co, int addrq) +{ + if (addrq < 0) { + return co->co_firstlineno; + } + if (co->_co_monitoring && co->_co_monitoring->lines) { + return _Py_Instrumentation_GetLine(co, addrq/sizeof(_Py_CODEUNIT)); + } + if (!(addrq >= 0 && addrq < _PyCode_NBYTES(co))) { + return -1; + } + PyCodeAddressRange bounds; + _PyCode_InitAddressRange(co, &bounds); + return _PyCode_CheckLineNumber(addrq, &bounds); +} + int PyCode_Addr2Line(PyCodeObject *co, int addrq) { int lineno; Py_BEGIN_CRITICAL_SECTION(co); - lineno = _PyCode_Addr2LineNoTstate(co, addrq); + lineno = _PyCode_Addr2Line(co, addrq); Py_END_CRITICAL_SECTION(); return lineno; } diff --git a/Python/traceback.c b/Python/traceback.c index ef67368550aafb..521d6322a5c439 100644 --- a/Python/traceback.c +++ b/Python/traceback.c @@ -1028,14 +1028,24 @@ _Py_DumpWideString(int fd, wchar_t *str) /* Write a frame into the file fd: "File "xxx", line xxx in xxx". - This function is signal safe. */ + This function is signal safe. -static void + Return 0 on success. Return -1 if the frame is invalid. */ + +static int dump_frame(int fd, _PyInterpreterFrame *frame) { - assert(frame->owner < FRAME_OWNED_BY_INTERPRETER); + if (frame->owner == FRAME_OWNED_BY_INTERPRETER) { + /* Ignore trampoline frame */ + return 0; + } - PyCodeObject *code =_PyFrame_GetCode(frame); + PyCodeObject *code = _PyFrame_SafeGetCode(frame); + if (code == NULL) { + return -1; + } + + int res = 0; PUTS(fd, " File "); if (code->co_filename != NULL && PyUnicode_Check(code->co_filename)) @@ -1043,29 +1053,36 @@ dump_frame(int fd, _PyInterpreterFrame *frame) PUTS(fd, "\""); _Py_DumpASCII(fd, code->co_filename); PUTS(fd, "\""); - } else { + } + else { PUTS(fd, "???"); + res = -1; } - int lasti = PyUnstable_InterpreterFrame_GetLasti(frame); - int lineno = _PyCode_Addr2LineNoTstate(code, lasti); + PUTS(fd, ", line "); + int lasti = _PyFrame_SafeGetLasti(frame); + int lineno = -1; + if (lasti >= 0) { + lineno = _PyCode_SafeAddr2Line(code, lasti); + } if (lineno >= 0) { _Py_DumpDecimal(fd, (size_t)lineno); } else { PUTS(fd, "???"); + res = -1; } - PUTS(fd, " in "); - if (code->co_name != NULL - && PyUnicode_Check(code->co_name)) { + PUTS(fd, " in "); + if (code->co_name != NULL && PyUnicode_Check(code->co_name)) { _Py_DumpASCII(fd, code->co_name); } else { PUTS(fd, "???"); + res = -1; } - PUTS(fd, "\n"); + return res; } static int @@ -1108,17 +1125,6 @@ dump_traceback(int fd, PyThreadState *tstate, int write_header) unsigned int depth = 0; while (1) { - if (frame->owner == FRAME_OWNED_BY_INTERPRETER) { - /* Trampoline frame */ - frame = frame->previous; - if (frame == NULL) { - break; - } - - /* Can't have more than one shim frame in a row */ - assert(frame->owner != FRAME_OWNED_BY_INTERPRETER); - } - if (MAX_FRAME_DEPTH <= depth) { if (MAX_FRAME_DEPTH < depth) { PUTS(fd, "plus "); @@ -1128,7 +1134,15 @@ dump_traceback(int fd, PyThreadState *tstate, int write_header) break; } - dump_frame(fd, frame); + if (_PyMem_IsPtrFreed(frame)) { + PUTS(fd, " \n"); + break; + } + if (dump_frame(fd, frame) < 0) { + PUTS(fd, " \n"); + break; + } + frame = frame->previous; if (frame == NULL) { break;