From a4421f5f9345e3a31b0deebe58bc5263ee9133b8 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Fri, 20 Feb 2026 13:11:24 +0000 Subject: [PATCH 1/5] Implement `unicodedata.block()` --- Doc/library/unicodedata.rst | 12 + Doc/whatsnew/3.15.rst | 5 + Lib/test/test_unicodedata.py | 82 ++ ...6-02-20-13-03-10.gh-issue-66802.OYcAi_.rst | 3 + Modules/clinic/unicodedata.c.h | 38 +- Modules/unicodedata.c | 41 +- Modules/unicodedata_db.h | 703 ++++++++++++++++++ Tools/unicode/makeunicodedata.py | 40 + 8 files changed, 922 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index 2fc8b1d8b52341..30029664f6f105 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -130,6 +130,18 @@ following functions: `Unicode Standard Annex #11 `_. +.. function:: block(chr, /) + + Returns the `block + <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_ + assigned to the character *chr*. For example:: + + >>> unicodedata.block('S') + 'Basic Latin' + + .. versionadded:: next + + .. function:: mirrored(chr, /) Returns the mirrored property assigned to the character *chr* as diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index feccc496fad0e0..d47df4e7c0547e 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -1134,6 +1134,11 @@ unicodedata of the character which are related to the above algorithm. (Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.) +* Add :func:`~unicodedata.block` function to return the `Unicode block + <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_ + assigned to a character. + (Contributed by Stan Ulbrych in :gh:`66802`.) 
+ unittest -------- diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 1d03e7d9fec717..e30db356c775b1 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -973,6 +973,88 @@ def graphemes(*args): 'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'), ['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3']) + def test_block(self): + self.assertEqual(self.db.block('\u1159'), 'Hangul Jamo') + self.assertEqual(self.db.block('\u11F9'), 'Hangul Jamo') + self.assertEqual(self.db.block('\uD788'), 'Hangul Syllables') + self.assertEqual(self.db.block('\uD7A3'), 'Hangul Syllables') + # New in 5.0.0 + self.assertEqual(self.db.block('\u05BA'), 'Hebrew') + self.assertEqual(self.db.block('\u20EF'), 'Combining Diacritical Marks for Symbols') + # New in 5.1.0 + self.assertEqual(self.db.block('\u2064'), 'General Punctuation') + self.assertEqual(self.db.block('\uAA4D'), 'Cham') + # New in 5.2.0 + self.assertEqual(self.db.block('\u0816'), 'Samaritan') + self.assertEqual(self.db.block('\uA97C'), 'Hangul Jamo Extended-A') + self.assertEqual(self.db.block('\uD7C6'), 'Hangul Jamo Extended-B') + self.assertEqual(self.db.block('\uD7FB'), 'Hangul Jamo Extended-B') + # New in 6.0.0 + self.assertEqual(self.db.block('\u093A'), 'Devanagari') + self.assertEqual(self.db.block('\U00011002'), 'Brahmi') + # New in 6.1.0 + self.assertEqual(self.db.block('\U000E0FFF'), 'No_Block') + self.assertEqual(self.db.block('\U00016F7E'), 'Miao') + # New in 6.2.0 + self.assertEqual(self.db.block('\U0001F1E6'), 'Enclosed Alphanumeric Supplement') + self.assertEqual(self.db.block('\U0001F1FF'), 'Enclosed Alphanumeric Supplement') + # New in 6.3.0 + self.assertEqual(self.db.block('\u180E'), 'Mongolian') + self.assertEqual(self.db.block('\u1A1B'), 'Buginese') + # New in 7.0.0 + self.assertEqual(self.db.block('\u0E33'), 'Thai') + self.assertEqual(self.db.block('\u0EB3'), 'Lao') + self.assertEqual(self.db.block('\U0001BCA3'), 'Shorthand Format Controls') + 
self.assertEqual(self.db.block('\U0001E8D6'), 'Mende Kikakui') + self.assertEqual(self.db.block('\U0001163E'), 'Modi') + # New in 8.0.0 + self.assertEqual(self.db.block('\u08E3'), 'Arabic Extended-A') + self.assertEqual(self.db.block('\U00011726'), 'Ahom') + # New in 9.0.0 + self.assertEqual(self.db.block('\u0600'), 'Arabic') + self.assertEqual(self.db.block('\U000E007F'), 'Tags') + self.assertEqual(self.db.block('\U00011CB4'), 'Marchen') + self.assertEqual(self.db.block('\u200D'), 'General Punctuation') + # New in 10.0.0 + self.assertEqual(self.db.block('\U00011D46'), 'Masaram Gondi') + self.assertEqual(self.db.block('\U00011D47'), 'Masaram Gondi') + self.assertEqual(self.db.block('\U00011A97'), 'Soyombo') + # New in 11.0.0 + self.assertEqual(self.db.block('\U000110CD'), 'Kaithi') + self.assertEqual(self.db.block('\u07FD'), 'NKo') + self.assertEqual(self.db.block('\U00011EF6'), 'Makasar') + # New in 12.0.0 + self.assertEqual(self.db.block('\U00011A84'), 'Soyombo') + self.assertEqual(self.db.block('\U00013438'), 'Egyptian Hieroglyph Format Controls') + self.assertEqual(self.db.block('\U0001E2EF'), 'Wancho') + self.assertEqual(self.db.block('\U00016F87'), 'Miao') + # New in 13.0.0 + self.assertEqual(self.db.block('\U00011941'), 'Dives Akuru') + self.assertEqual(self.db.block('\U00016FE4'), 'Ideographic Symbols and Punctuation') + self.assertEqual(self.db.block('\U00011942'), 'Dives Akuru') + # New in 14.0.0 + self.assertEqual(self.db.block('\u0891'), 'Arabic Extended-B') + self.assertEqual(self.db.block('\U0001E2AE'), 'Toto') + # New in 15.0.0 + self.assertEqual(self.db.block('\U00011F02'), 'Kawi') + self.assertEqual(self.db.block('\U0001343F'), 'Egyptian Hieroglyph Format Controls') + self.assertEqual(self.db.block('\U0001E4EF'), 'Nag Mundari') + self.assertEqual(self.db.block('\U00011F3F'), 'Kawi') + # New in 16.0.0 + self.assertEqual(self.db.block('\U000113D1'), 'Tulu-Tigalari') + self.assertEqual(self.db.block('\U0001E5EF'), 'Ol Onal') + 
self.assertEqual(self.db.block('\U0001612C'), 'Gurung Khema') + self.assertEqual(self.db.block('\U00016D63'), 'Kirat Rai') + # New in 17.0.0 + self.assertEqual(self.db.block('\u1AEB'), 'Combining Diacritical Marks Extended') + self.assertEqual(self.db.block('\U00011B67'), 'Sharada Supplement') + + self.assertRaises(TypeError, self.db.block) + self.assertRaises(TypeError, self.db.block, b'x') + self.assertRaises(TypeError, self.db.block, 120) + self.assertRaises(TypeError, self.db.block, '') + self.assertRaises(TypeError, self.db.block, 'xx') + class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest): db = unicodedata.ucd_3_2_0 diff --git a/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst b/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst new file mode 100644 index 00000000000000..04e150671b5f7a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst @@ -0,0 +1,3 @@ +Add :func:`unicodedata.block` function to return the `Unicode block +<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_ of a +character. 
diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h index 8e2dd7a0ce5663..5443893079b1af 100644 --- a/Modules/clinic/unicodedata.c.h +++ b/Modules/clinic/unicodedata.c.h @@ -691,6 +691,42 @@ unicodedata_iter_graphemes(PyObject *module, PyObject *const *args, Py_ssize_t n return return_value; } +PyDoc_STRVAR(unicodedata_block__doc__, +"block($module, chr, /)\n" +"--\n" +"\n" +"Return block assigned to the character chr."); + +#define UNICODEDATA_BLOCK_METHODDEF \ + {"block", (PyCFunction)unicodedata_block, METH_O, unicodedata_block__doc__}, + +static PyObject * +unicodedata_block_impl(PyObject *module, int chr); + +static PyObject * +unicodedata_block(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + int chr; + + if (!PyUnicode_Check(arg)) { + _PyArg_BadArgument("block", "argument", "a unicode character", arg); + goto exit; + } + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_Format(PyExc_TypeError, + "block(): argument must be a unicode character, " + "not a string of length %zd", + PyUnicode_GET_LENGTH(arg)); + goto exit; + } + chr = PyUnicode_READ_CHAR(arg, 0); + return_value = unicodedata_block_impl(module, chr); + +exit: + return return_value; +} + PyDoc_STRVAR(unicodedata_grapheme_cluster_break__doc__, "grapheme_cluster_break($module, chr, /)\n" "--\n" @@ -798,4 +834,4 @@ unicodedata_extended_pictographic(PyObject *module, PyObject *arg) exit: return return_value; } -/*[clinic end generated code: output=0f09cc90f06ace76 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=482a87df218f07c1 input=a9049054013a1b77]*/ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 1ed9760874b2a6..744d07493f0246 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -96,6 +96,27 @@ _getrecord_ex(Py_UCS4 code) return &_PyUnicode_Database_Records[index]; } +static const char * +_getrecord_block(Py_UCS4 code) +{ + int l = 0, h = BLOCK_COUNT - 1; + while (l <= h) { + int m = (l + h) / 2; + if (code < 
_PyUnicode_Blocks[m].s) { + h = m - 1; + } + else if (code > _PyUnicode_Blocks[m].e) { + l = m + 1; + } + else { + return _PyUnicode_BlockNames[_PyUnicode_Blocks[m].name]; + } + } + // Otherwise, return the deefault value per + // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189 + return "No_Block"; +} + typedef struct { PyObject *SegmentType; PyObject *GraphemeBreakIteratorType; @@ -2066,6 +2087,23 @@ unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr, return (PyObject*)gbi; } +/*[clinic input] +unicodedata.block + + chr: int(accept={str}) + / + +Return block assigned to the character chr. +[clinic start generated code]*/ + +static PyObject * +unicodedata_block_impl(PyObject *module, int chr) +/*[clinic end generated code: output=5f8b40c49eaec75a input=0834cf2642d6eaae]*/ +{ + Py_UCS4 c = (Py_UCS4)chr; + return PyUnicode_FromString(_getrecord_block(c)); +} + /*[clinic input] unicodedata.grapheme_cluster_break @@ -2128,6 +2166,7 @@ unicodedata_extended_pictographic_impl(PyObject *module, int chr) // an UCD instance. static PyMethodDef unicodedata_functions[] = { // Module only functions. + UNICODEDATA_BLOCK_METHODDEF UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF @@ -2137,7 +2176,7 @@ static PyMethodDef unicodedata_functions[] = { // The following definitions are shared between the module // and the UCD class. 
-#define DB_methods (unicodedata_functions + 6) +#define DB_methods (unicodedata_functions + 7) UNICODEDATA_UCD_DECIMAL_METHODDEF UNICODEDATA_UCD_DIGIT_METHODDEF diff --git a/Modules/unicodedata_db.h b/Modules/unicodedata_db.h index 3cc5776a1f240d..14f861eec56e96 100644 --- a/Modules/unicodedata_db.h +++ b/Modules/unicodedata_db.h @@ -796,6 +796,709 @@ const char * const _PyUnicode_IndicConjunctBreakNames[] = { "Extend", NULL }; +static const char * const _PyUnicode_BlockNames[] = { + "Basic Latin", + "Latin-1 Supplement", + "Latin Extended-A", + "Latin Extended-B", + "IPA Extensions", + "Spacing Modifier Letters", + "Combining Diacritical Marks", + "Greek and Coptic", + "Cyrillic", + "Cyrillic Supplement", + "Armenian", + "Hebrew", + "Arabic", + "Syriac", + "Arabic Supplement", + "Thaana", + "NKo", + "Samaritan", + "Mandaic", + "Syriac Supplement", + "Arabic Extended-B", + "Arabic Extended-A", + "Devanagari", + "Bengali", + "Gurmukhi", + "Gujarati", + "Oriya", + "Tamil", + "Telugu", + "Kannada", + "Malayalam", + "Sinhala", + "Thai", + "Lao", + "Tibetan", + "Myanmar", + "Georgian", + "Hangul Jamo", + "Ethiopic", + "Ethiopic Supplement", + "Cherokee", + "Unified Canadian Aboriginal Syllabics", + "Ogham", + "Runic", + "Tagalog", + "Hanunoo", + "Buhid", + "Tagbanwa", + "Khmer", + "Mongolian", + "Unified Canadian Aboriginal Syllabics Extended", + "Limbu", + "Tai Le", + "New Tai Lue", + "Khmer Symbols", + "Buginese", + "Tai Tham", + "Combining Diacritical Marks Extended", + "Balinese", + "Sundanese", + "Batak", + "Lepcha", + "Ol Chiki", + "Cyrillic Extended-C", + "Georgian Extended", + "Sundanese Supplement", + "Vedic Extensions", + "Phonetic Extensions", + "Phonetic Extensions Supplement", + "Combining Diacritical Marks Supplement", + "Latin Extended Additional", + "Greek Extended", + "General Punctuation", + "Superscripts and Subscripts", + "Currency Symbols", + "Combining Diacritical Marks for Symbols", + "Letterlike Symbols", + "Number Forms", + "Arrows", + 
"Mathematical Operators", + "Miscellaneous Technical", + "Control Pictures", + "Optical Character Recognition", + "Enclosed Alphanumerics", + "Box Drawing", + "Block Elements", + "Geometric Shapes", + "Miscellaneous Symbols", + "Dingbats", + "Miscellaneous Mathematical Symbols-A", + "Supplemental Arrows-A", + "Braille Patterns", + "Supplemental Arrows-B", + "Miscellaneous Mathematical Symbols-B", + "Supplemental Mathematical Operators", + "Miscellaneous Symbols and Arrows", + "Glagolitic", + "Latin Extended-C", + "Coptic", + "Georgian Supplement", + "Tifinagh", + "Ethiopic Extended", + "Cyrillic Extended-A", + "Supplemental Punctuation", + "CJK Radicals Supplement", + "Kangxi Radicals", + "Ideographic Description Characters", + "CJK Symbols and Punctuation", + "Hiragana", + "Katakana", + "Bopomofo", + "Hangul Compatibility Jamo", + "Kanbun", + "Bopomofo Extended", + "CJK Strokes", + "Katakana Phonetic Extensions", + "Enclosed CJK Letters and Months", + "CJK Compatibility", + "CJK Unified Ideographs Extension A", + "Yijing Hexagram Symbols", + "CJK Unified Ideographs", + "Yi Syllables", + "Yi Radicals", + "Lisu", + "Vai", + "Cyrillic Extended-B", + "Bamum", + "Modifier Tone Letters", + "Latin Extended-D", + "Syloti Nagri", + "Common Indic Number Forms", + "Phags-pa", + "Saurashtra", + "Devanagari Extended", + "Kayah Li", + "Rejang", + "Hangul Jamo Extended-A", + "Javanese", + "Myanmar Extended-B", + "Cham", + "Myanmar Extended-A", + "Tai Viet", + "Meetei Mayek Extensions", + "Ethiopic Extended-A", + "Latin Extended-E", + "Cherokee Supplement", + "Meetei Mayek", + "Hangul Syllables", + "Hangul Jamo Extended-B", + "High Surrogates", + "High Private Use Surrogates", + "Low Surrogates", + "Private Use Area", + "CJK Compatibility Ideographs", + "Alphabetic Presentation Forms", + "Arabic Presentation Forms-A", + "Variation Selectors", + "Vertical Forms", + "Combining Half Marks", + "CJK Compatibility Forms", + "Small Form Variants", + "Arabic Presentation Forms-B", + 
"Halfwidth and Fullwidth Forms", + "Specials", + "Linear B Syllabary", + "Linear B Ideograms", + "Aegean Numbers", + "Ancient Greek Numbers", + "Ancient Symbols", + "Phaistos Disc", + "Lycian", + "Carian", + "Coptic Epact Numbers", + "Old Italic", + "Gothic", + "Old Permic", + "Ugaritic", + "Old Persian", + "Deseret", + "Shavian", + "Osmanya", + "Osage", + "Elbasan", + "Caucasian Albanian", + "Vithkuqi", + "Todhri", + "Linear A", + "Latin Extended-F", + "Cypriot Syllabary", + "Imperial Aramaic", + "Palmyrene", + "Nabataean", + "Hatran", + "Phoenician", + "Lydian", + "Sidetic", + "Meroitic Hieroglyphs", + "Meroitic Cursive", + "Kharoshthi", + "Old South Arabian", + "Old North Arabian", + "Manichaean", + "Avestan", + "Inscriptional Parthian", + "Inscriptional Pahlavi", + "Psalter Pahlavi", + "Old Turkic", + "Old Hungarian", + "Hanifi Rohingya", + "Garay", + "Rumi Numeral Symbols", + "Yezidi", + "Arabic Extended-C", + "Old Sogdian", + "Sogdian", + "Old Uyghur", + "Chorasmian", + "Elymaic", + "Brahmi", + "Kaithi", + "Sora Sompeng", + "Chakma", + "Mahajani", + "Sharada", + "Sinhala Archaic Numbers", + "Khojki", + "Multani", + "Khudawadi", + "Grantha", + "Tulu-Tigalari", + "Newa", + "Tirhuta", + "Siddham", + "Modi", + "Mongolian Supplement", + "Takri", + "Myanmar Extended-C", + "Ahom", + "Dogra", + "Warang Citi", + "Dives Akuru", + "Nandinagari", + "Zanabazar Square", + "Soyombo", + "Unified Canadian Aboriginal Syllabics Extended-A", + "Pau Cin Hau", + "Devanagari Extended-A", + "Sharada Supplement", + "Sunuwar", + "Bhaiksuki", + "Marchen", + "Masaram Gondi", + "Gunjala Gondi", + "Tolong Siki", + "Makasar", + "Kawi", + "Lisu Supplement", + "Tamil Supplement", + "Cuneiform", + "Cuneiform Numbers and Punctuation", + "Early Dynastic Cuneiform", + "Cypro-Minoan", + "Egyptian Hieroglyphs", + "Egyptian Hieroglyph Format Controls", + "Egyptian Hieroglyphs Extended-A", + "Anatolian Hieroglyphs", + "Gurung Khema", + "Bamum Supplement", + "Mro", + "Tangsa", + "Bassa Vah", + 
"Pahawh Hmong", + "Kirat Rai", + "Medefaidrin", + "Beria Erfe", + "Miao", + "Ideographic Symbols and Punctuation", + "Tangut", + "Tangut Components", + "Khitan Small Script", + "Tangut Supplement", + "Tangut Components Supplement", + "Kana Extended-B", + "Kana Supplement", + "Kana Extended-A", + "Small Kana Extension", + "Nushu", + "Duployan", + "Shorthand Format Controls", + "Symbols for Legacy Computing Supplement", + "Miscellaneous Symbols Supplement", + "Znamenny Musical Notation", + "Byzantine Musical Symbols", + "Musical Symbols", + "Ancient Greek Musical Notation", + "Kaktovik Numerals", + "Mayan Numerals", + "Tai Xuan Jing Symbols", + "Counting Rod Numerals", + "Mathematical Alphanumeric Symbols", + "Sutton SignWriting", + "Latin Extended-G", + "Glagolitic Supplement", + "Cyrillic Extended-D", + "Nyiakeng Puachue Hmong", + "Toto", + "Wancho", + "Nag Mundari", + "Ol Onal", + "Tai Yo", + "Ethiopic Extended-B", + "Mende Kikakui", + "Adlam", + "Indic Siyaq Numbers", + "Ottoman Siyaq Numbers", + "Arabic Mathematical Alphabetic Symbols", + "Mahjong Tiles", + "Domino Tiles", + "Playing Cards", + "Enclosed Alphanumeric Supplement", + "Enclosed Ideographic Supplement", + "Miscellaneous Symbols and Pictographs", + "Emoticons", + "Ornamental Dingbats", + "Transport and Map Symbols", + "Alchemical Symbols", + "Geometric Shapes Extended", + "Supplemental Arrows-C", + "Supplemental Symbols and Pictographs", + "Chess Symbols", + "Symbols and Pictographs Extended-A", + "Symbols for Legacy Computing", + "CJK Unified Ideographs Extension B", + "CJK Unified Ideographs Extension C", + "CJK Unified Ideographs Extension D", + "CJK Unified Ideographs Extension E", + "CJK Unified Ideographs Extension F", + "CJK Unified Ideographs Extension I", + "CJK Compatibility Ideographs Supplement", + "CJK Unified Ideographs Extension G", + "CJK Unified Ideographs Extension H", + "CJK Unified Ideographs Extension J", + "Tags", + "Variation Selectors Supplement", + "Supplementary Private Use 
Area-A", + "Supplementary Private Use Area-B", +}; +typedef struct { + Py_UCS4 s; + Py_UCS4 e; + unsigned short name; +} _PyUnicode_Block; +static const _PyUnicode_Block _PyUnicode_Blocks[] = { + {0x0000, 0x007F, 0}, + {0x0080, 0x00FF, 1}, + {0x0100, 0x017F, 2}, + {0x0180, 0x024F, 3}, + {0x0250, 0x02AF, 4}, + {0x02B0, 0x02FF, 5}, + {0x0300, 0x036F, 6}, + {0x0370, 0x03FF, 7}, + {0x0400, 0x04FF, 8}, + {0x0500, 0x052F, 9}, + {0x0530, 0x058F, 10}, + {0x0590, 0x05FF, 11}, + {0x0600, 0x06FF, 12}, + {0x0700, 0x074F, 13}, + {0x0750, 0x077F, 14}, + {0x0780, 0x07BF, 15}, + {0x07C0, 0x07FF, 16}, + {0x0800, 0x083F, 17}, + {0x0840, 0x085F, 18}, + {0x0860, 0x086F, 19}, + {0x0870, 0x089F, 20}, + {0x08A0, 0x08FF, 21}, + {0x0900, 0x097F, 22}, + {0x0980, 0x09FF, 23}, + {0x0A00, 0x0A7F, 24}, + {0x0A80, 0x0AFF, 25}, + {0x0B00, 0x0B7F, 26}, + {0x0B80, 0x0BFF, 27}, + {0x0C00, 0x0C7F, 28}, + {0x0C80, 0x0CFF, 29}, + {0x0D00, 0x0D7F, 30}, + {0x0D80, 0x0DFF, 31}, + {0x0E00, 0x0E7F, 32}, + {0x0E80, 0x0EFF, 33}, + {0x0F00, 0x0FFF, 34}, + {0x1000, 0x109F, 35}, + {0x10A0, 0x10FF, 36}, + {0x1100, 0x11FF, 37}, + {0x1200, 0x137F, 38}, + {0x1380, 0x139F, 39}, + {0x13A0, 0x13FF, 40}, + {0x1400, 0x167F, 41}, + {0x1680, 0x169F, 42}, + {0x16A0, 0x16FF, 43}, + {0x1700, 0x171F, 44}, + {0x1720, 0x173F, 45}, + {0x1740, 0x175F, 46}, + {0x1760, 0x177F, 47}, + {0x1780, 0x17FF, 48}, + {0x1800, 0x18AF, 49}, + {0x18B0, 0x18FF, 50}, + {0x1900, 0x194F, 51}, + {0x1950, 0x197F, 52}, + {0x1980, 0x19DF, 53}, + {0x19E0, 0x19FF, 54}, + {0x1A00, 0x1A1F, 55}, + {0x1A20, 0x1AAF, 56}, + {0x1AB0, 0x1AFF, 57}, + {0x1B00, 0x1B7F, 58}, + {0x1B80, 0x1BBF, 59}, + {0x1BC0, 0x1BFF, 60}, + {0x1C00, 0x1C4F, 61}, + {0x1C50, 0x1C7F, 62}, + {0x1C80, 0x1C8F, 63}, + {0x1C90, 0x1CBF, 64}, + {0x1CC0, 0x1CCF, 65}, + {0x1CD0, 0x1CFF, 66}, + {0x1D00, 0x1D7F, 67}, + {0x1D80, 0x1DBF, 68}, + {0x1DC0, 0x1DFF, 69}, + {0x1E00, 0x1EFF, 70}, + {0x1F00, 0x1FFF, 71}, + {0x2000, 0x206F, 72}, + {0x2070, 0x209F, 73}, + {0x20A0, 0x20CF, 74}, + {0x20D0, 
0x20FF, 75}, + {0x2100, 0x214F, 76}, + {0x2150, 0x218F, 77}, + {0x2190, 0x21FF, 78}, + {0x2200, 0x22FF, 79}, + {0x2300, 0x23FF, 80}, + {0x2400, 0x243F, 81}, + {0x2440, 0x245F, 82}, + {0x2460, 0x24FF, 83}, + {0x2500, 0x257F, 84}, + {0x2580, 0x259F, 85}, + {0x25A0, 0x25FF, 86}, + {0x2600, 0x26FF, 87}, + {0x2700, 0x27BF, 88}, + {0x27C0, 0x27EF, 89}, + {0x27F0, 0x27FF, 90}, + {0x2800, 0x28FF, 91}, + {0x2900, 0x297F, 92}, + {0x2980, 0x29FF, 93}, + {0x2A00, 0x2AFF, 94}, + {0x2B00, 0x2BFF, 95}, + {0x2C00, 0x2C5F, 96}, + {0x2C60, 0x2C7F, 97}, + {0x2C80, 0x2CFF, 98}, + {0x2D00, 0x2D2F, 99}, + {0x2D30, 0x2D7F, 100}, + {0x2D80, 0x2DDF, 101}, + {0x2DE0, 0x2DFF, 102}, + {0x2E00, 0x2E7F, 103}, + {0x2E80, 0x2EFF, 104}, + {0x2F00, 0x2FDF, 105}, + {0x2FF0, 0x2FFF, 106}, + {0x3000, 0x303F, 107}, + {0x3040, 0x309F, 108}, + {0x30A0, 0x30FF, 109}, + {0x3100, 0x312F, 110}, + {0x3130, 0x318F, 111}, + {0x3190, 0x319F, 112}, + {0x31A0, 0x31BF, 113}, + {0x31C0, 0x31EF, 114}, + {0x31F0, 0x31FF, 115}, + {0x3200, 0x32FF, 116}, + {0x3300, 0x33FF, 117}, + {0x3400, 0x4DBF, 118}, + {0x4DC0, 0x4DFF, 119}, + {0x4E00, 0x9FFF, 120}, + {0xA000, 0xA48F, 121}, + {0xA490, 0xA4CF, 122}, + {0xA4D0, 0xA4FF, 123}, + {0xA500, 0xA63F, 124}, + {0xA640, 0xA69F, 125}, + {0xA6A0, 0xA6FF, 126}, + {0xA700, 0xA71F, 127}, + {0xA720, 0xA7FF, 128}, + {0xA800, 0xA82F, 129}, + {0xA830, 0xA83F, 130}, + {0xA840, 0xA87F, 131}, + {0xA880, 0xA8DF, 132}, + {0xA8E0, 0xA8FF, 133}, + {0xA900, 0xA92F, 134}, + {0xA930, 0xA95F, 135}, + {0xA960, 0xA97F, 136}, + {0xA980, 0xA9DF, 137}, + {0xA9E0, 0xA9FF, 138}, + {0xAA00, 0xAA5F, 139}, + {0xAA60, 0xAA7F, 140}, + {0xAA80, 0xAADF, 141}, + {0xAAE0, 0xAAFF, 142}, + {0xAB00, 0xAB2F, 143}, + {0xAB30, 0xAB6F, 144}, + {0xAB70, 0xABBF, 145}, + {0xABC0, 0xABFF, 146}, + {0xAC00, 0xD7AF, 147}, + {0xD7B0, 0xD7FF, 148}, + {0xD800, 0xDB7F, 149}, + {0xDB80, 0xDBFF, 150}, + {0xDC00, 0xDFFF, 151}, + {0xE000, 0xF8FF, 152}, + {0xF900, 0xFAFF, 153}, + {0xFB00, 0xFB4F, 154}, + {0xFB50, 0xFDFF, 155}, + {0xFE00, 
0xFE0F, 156}, + {0xFE10, 0xFE1F, 157}, + {0xFE20, 0xFE2F, 158}, + {0xFE30, 0xFE4F, 159}, + {0xFE50, 0xFE6F, 160}, + {0xFE70, 0xFEFF, 161}, + {0xFF00, 0xFFEF, 162}, + {0xFFF0, 0xFFFF, 163}, + {0x10000, 0x1007F, 164}, + {0x10080, 0x100FF, 165}, + {0x10100, 0x1013F, 166}, + {0x10140, 0x1018F, 167}, + {0x10190, 0x101CF, 168}, + {0x101D0, 0x101FF, 169}, + {0x10280, 0x1029F, 170}, + {0x102A0, 0x102DF, 171}, + {0x102E0, 0x102FF, 172}, + {0x10300, 0x1032F, 173}, + {0x10330, 0x1034F, 174}, + {0x10350, 0x1037F, 175}, + {0x10380, 0x1039F, 176}, + {0x103A0, 0x103DF, 177}, + {0x10400, 0x1044F, 178}, + {0x10450, 0x1047F, 179}, + {0x10480, 0x104AF, 180}, + {0x104B0, 0x104FF, 181}, + {0x10500, 0x1052F, 182}, + {0x10530, 0x1056F, 183}, + {0x10570, 0x105BF, 184}, + {0x105C0, 0x105FF, 185}, + {0x10600, 0x1077F, 186}, + {0x10780, 0x107BF, 187}, + {0x10800, 0x1083F, 188}, + {0x10840, 0x1085F, 189}, + {0x10860, 0x1087F, 190}, + {0x10880, 0x108AF, 191}, + {0x108E0, 0x108FF, 192}, + {0x10900, 0x1091F, 193}, + {0x10920, 0x1093F, 194}, + {0x10940, 0x1095F, 195}, + {0x10980, 0x1099F, 196}, + {0x109A0, 0x109FF, 197}, + {0x10A00, 0x10A5F, 198}, + {0x10A60, 0x10A7F, 199}, + {0x10A80, 0x10A9F, 200}, + {0x10AC0, 0x10AFF, 201}, + {0x10B00, 0x10B3F, 202}, + {0x10B40, 0x10B5F, 203}, + {0x10B60, 0x10B7F, 204}, + {0x10B80, 0x10BAF, 205}, + {0x10C00, 0x10C4F, 206}, + {0x10C80, 0x10CFF, 207}, + {0x10D00, 0x10D3F, 208}, + {0x10D40, 0x10D8F, 209}, + {0x10E60, 0x10E7F, 210}, + {0x10E80, 0x10EBF, 211}, + {0x10EC0, 0x10EFF, 212}, + {0x10F00, 0x10F2F, 213}, + {0x10F30, 0x10F6F, 214}, + {0x10F70, 0x10FAF, 215}, + {0x10FB0, 0x10FDF, 216}, + {0x10FE0, 0x10FFF, 217}, + {0x11000, 0x1107F, 218}, + {0x11080, 0x110CF, 219}, + {0x110D0, 0x110FF, 220}, + {0x11100, 0x1114F, 221}, + {0x11150, 0x1117F, 222}, + {0x11180, 0x111DF, 223}, + {0x111E0, 0x111FF, 224}, + {0x11200, 0x1124F, 225}, + {0x11280, 0x112AF, 226}, + {0x112B0, 0x112FF, 227}, + {0x11300, 0x1137F, 228}, + {0x11380, 0x113FF, 229}, + {0x11400, 0x1147F, 230}, + 
{0x11480, 0x114DF, 231}, + {0x11580, 0x115FF, 232}, + {0x11600, 0x1165F, 233}, + {0x11660, 0x1167F, 234}, + {0x11680, 0x116CF, 235}, + {0x116D0, 0x116FF, 236}, + {0x11700, 0x1174F, 237}, + {0x11800, 0x1184F, 238}, + {0x118A0, 0x118FF, 239}, + {0x11900, 0x1195F, 240}, + {0x119A0, 0x119FF, 241}, + {0x11A00, 0x11A4F, 242}, + {0x11A50, 0x11AAF, 243}, + {0x11AB0, 0x11ABF, 244}, + {0x11AC0, 0x11AFF, 245}, + {0x11B00, 0x11B5F, 246}, + {0x11B60, 0x11B7F, 247}, + {0x11BC0, 0x11BFF, 248}, + {0x11C00, 0x11C6F, 249}, + {0x11C70, 0x11CBF, 250}, + {0x11D00, 0x11D5F, 251}, + {0x11D60, 0x11DAF, 252}, + {0x11DB0, 0x11DEF, 253}, + {0x11EE0, 0x11EFF, 254}, + {0x11F00, 0x11F5F, 255}, + {0x11FB0, 0x11FBF, 256}, + {0x11FC0, 0x11FFF, 257}, + {0x12000, 0x123FF, 258}, + {0x12400, 0x1247F, 259}, + {0x12480, 0x1254F, 260}, + {0x12F90, 0x12FFF, 261}, + {0x13000, 0x1342F, 262}, + {0x13430, 0x1345F, 263}, + {0x13460, 0x143FF, 264}, + {0x14400, 0x1467F, 265}, + {0x16100, 0x1613F, 266}, + {0x16800, 0x16A3F, 267}, + {0x16A40, 0x16A6F, 268}, + {0x16A70, 0x16ACF, 269}, + {0x16AD0, 0x16AFF, 270}, + {0x16B00, 0x16B8F, 271}, + {0x16D40, 0x16D7F, 272}, + {0x16E40, 0x16E9F, 273}, + {0x16EA0, 0x16EDF, 274}, + {0x16F00, 0x16F9F, 275}, + {0x16FE0, 0x16FFF, 276}, + {0x17000, 0x187FF, 277}, + {0x18800, 0x18AFF, 278}, + {0x18B00, 0x18CFF, 279}, + {0x18D00, 0x18D7F, 280}, + {0x18D80, 0x18DFF, 281}, + {0x1AFF0, 0x1AFFF, 282}, + {0x1B000, 0x1B0FF, 283}, + {0x1B100, 0x1B12F, 284}, + {0x1B130, 0x1B16F, 285}, + {0x1B170, 0x1B2FF, 286}, + {0x1BC00, 0x1BC9F, 287}, + {0x1BCA0, 0x1BCAF, 288}, + {0x1CC00, 0x1CEBF, 289}, + {0x1CEC0, 0x1CEFF, 290}, + {0x1CF00, 0x1CFCF, 291}, + {0x1D000, 0x1D0FF, 292}, + {0x1D100, 0x1D1FF, 293}, + {0x1D200, 0x1D24F, 294}, + {0x1D2C0, 0x1D2DF, 295}, + {0x1D2E0, 0x1D2FF, 296}, + {0x1D300, 0x1D35F, 297}, + {0x1D360, 0x1D37F, 298}, + {0x1D400, 0x1D7FF, 299}, + {0x1D800, 0x1DAAF, 300}, + {0x1DF00, 0x1DFFF, 301}, + {0x1E000, 0x1E02F, 302}, + {0x1E030, 0x1E08F, 303}, + {0x1E100, 0x1E14F, 304}, + 
{0x1E290, 0x1E2BF, 305}, + {0x1E2C0, 0x1E2FF, 306}, + {0x1E4D0, 0x1E4FF, 307}, + {0x1E5D0, 0x1E5FF, 308}, + {0x1E6C0, 0x1E6FF, 309}, + {0x1E7E0, 0x1E7FF, 310}, + {0x1E800, 0x1E8DF, 311}, + {0x1E900, 0x1E95F, 312}, + {0x1EC70, 0x1ECBF, 313}, + {0x1ED00, 0x1ED4F, 314}, + {0x1EE00, 0x1EEFF, 315}, + {0x1F000, 0x1F02F, 316}, + {0x1F030, 0x1F09F, 317}, + {0x1F0A0, 0x1F0FF, 318}, + {0x1F100, 0x1F1FF, 319}, + {0x1F200, 0x1F2FF, 320}, + {0x1F300, 0x1F5FF, 321}, + {0x1F600, 0x1F64F, 322}, + {0x1F650, 0x1F67F, 323}, + {0x1F680, 0x1F6FF, 324}, + {0x1F700, 0x1F77F, 325}, + {0x1F780, 0x1F7FF, 326}, + {0x1F800, 0x1F8FF, 327}, + {0x1F900, 0x1F9FF, 328}, + {0x1FA00, 0x1FA6F, 329}, + {0x1FA70, 0x1FAFF, 330}, + {0x1FB00, 0x1FBFF, 331}, + {0x20000, 0x2A6DF, 332}, + {0x2A700, 0x2B73F, 333}, + {0x2B740, 0x2B81F, 334}, + {0x2B820, 0x2CEAF, 335}, + {0x2CEB0, 0x2EBEF, 336}, + {0x2EBF0, 0x2EE5F, 337}, + {0x2F800, 0x2FA1F, 338}, + {0x30000, 0x3134F, 339}, + {0x31350, 0x323AF, 340}, + {0x323B0, 0x3347F, 341}, + {0xE0000, 0xE007F, 342}, + {0xE0100, 0xE01EF, 343}, + {0xF0000, 0xFFFFF, 344}, + {0x100000, 0x10FFFF, 345}, +}; +#define BLOCK_COUNT 346 + static const char *decomp_prefix[] = { "", "", diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 11f626ca0aba7a..e7103a0d82ccec 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -60,6 +60,7 @@ CASE_FOLDING = "CaseFolding%s.txt" GRAPHEME_CLUSTER_BREAK = "auxiliary/GraphemeBreakProperty%s.txt" EMOJI_DATA = "emoji/emoji-data%s.txt" +BLOCKS = "Blocks%s.txt" # Private Use Areas -- in planes 1, 15, 16 PUA_1 = range(0xE000, 0xF900) @@ -392,6 +393,34 @@ def makeunicodedata(unicode, trace): fprint(" NULL") fprint("};") + # Generate block tables + names = [] + name_to_index = {} + blocks = [] + for s, e, name in unicode.blocks: + if name not in name_to_index: + name_to_index[name] = len(names) + names.append(name) + blocks.append((s, e, name_to_index[name])) + + fprint("static const 
char * const _PyUnicode_BlockNames[] = {") + for name in names: + fprint(' "%s",' % name) + fprint("};") + + fprint("typedef struct {") + fprint(" Py_UCS4 s;") + fprint(" Py_UCS4 e;") + fprint(" unsigned short name;") + fprint("} _PyUnicode_Block;") + + fprint("static const _PyUnicode_Block _PyUnicode_Blocks[] = {") + for s, e, name in blocks: + fprint(" {0x%04X, 0x%04X, %d}," % (s, e, name)) + fprint("};") + fprint(f"#define BLOCK_COUNT {len(blocks)}") + fprint() + fprint("static const char *decomp_prefix[] = {") for name in decomp_prefix: fprint(" \"%s\"," % name) @@ -1205,6 +1234,17 @@ def __init__(self, version, ideograph_check=True): ext_picts[char] = True self.ext_picts = ext_picts + # See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189 + self.blocks = [] + for record in UcdFile(BLOCKS, version).records(): + rng = record[0] + name = record[1] + if '..' in rng: + s, e = [int(c, 16) for c in rng.split('..')] + else: + s = e = int(rng, 16) + self.blocks.append((s, e, name)) + self.blocks.sort() def uselatin1(self): # restrict character range to ISO Latin 1 From e80bac4f3f844f407f34b753b55bfcbc89740954 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Fri, 20 Feb 2026 16:44:29 +0000 Subject: [PATCH 2/5] Remove helper --- Modules/unicodedata.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 744d07493f0246..82b5fdfbc952b5 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -96,27 +96,6 @@ _getrecord_ex(Py_UCS4 code) return &_PyUnicode_Database_Records[index]; } -static const char * -_getrecord_block(Py_UCS4 code) -{ - int l = 0, h = BLOCK_COUNT - 1; - while (l <= h) { - int m = (l + h) / 2; - if (code < _PyUnicode_Blocks[m].s) { - h = m - 1; - } - else if (code > _PyUnicode_Blocks[m].e) { - l = m + 1; - } - else { - return _PyUnicode_BlockNames[_PyUnicode_Blocks[m].name]; - } - } - // Otherwise, return the deefault 
value per - // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189 - return "No_Block"; -} - typedef struct { PyObject *SegmentType; PyObject *GraphemeBreakIteratorType; @@ -2101,7 +2080,22 @@ unicodedata_block_impl(PyObject *module, int chr) /*[clinic end generated code: output=5f8b40c49eaec75a input=0834cf2642d6eaae]*/ { Py_UCS4 c = (Py_UCS4)chr; - return PyUnicode_FromString(_getrecord_block(c)); + int l = 0, h = BLOCK_COUNT - 1; + while (l <= h) { + int m = (l + h) / 2; + if (c < _PyUnicode_Blocks[m].s) { + h = m - 1; + } + else if (c > _PyUnicode_Blocks[m].e) { + l = m + 1; + } + else { + return PyUnicode_FromString(_PyUnicode_BlockNames[_PyUnicode_Blocks[m].name]); + } + } + // Otherwise, return the deefault value per + // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189 + return PyUnicode_FromString("No_Block"); } /*[clinic input] From 76f753828198cc891d59bb210eb18644c189c76b Mon Sep 17 00:00:00 2001 From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> Date: Sat, 21 Feb 2026 09:58:32 +0000 Subject: [PATCH 3/5] Update Modules/unicodedata.c Co-authored-by: Ezio Melotti --- Modules/unicodedata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 82b5fdfbc952b5..05ac686858dc3b 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -2093,7 +2093,7 @@ unicodedata_block_impl(PyObject *module, int chr) return PyUnicode_FromString(_PyUnicode_BlockNames[_PyUnicode_Blocks[m].name]); } } - // Otherwise, return the deefault value per + // Otherwise, return the default value per // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189 return PyUnicode_FromString("No_Block"); } From 1cfc852c4a3bf52c8463f04083934fb3249a530d Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 21 Feb 2026 10:05:26 +0000 Subject: [PATCH 4/5] Ezio's review --- Lib/test/test_unicodedata.py | 8 ++++++++ Tools/unicode/makeunicodedata.py | 8 ++------ 2 
files changed, 10 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index e30db356c775b1..f1eaa6b1f89efa 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -974,6 +974,11 @@ def graphemes(*args): ['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3']) def test_block(self): + self.assertEqual(self.db.block('\u0000'), 'Basic Latin') + self.assertEqual(self.db.block('\u0041'), 'Basic Latin') + self.assertEqual(self.db.block('\u007F'), 'Basic Latin') + self.assertEqual(self.db.block('\u0080'), 'Latin-1 Supplement') + self.assertEqual(self.db.block('\u00FF'), 'Latin-1 Supplement') self.assertEqual(self.db.block('\u1159'), 'Hangul Jamo') self.assertEqual(self.db.block('\u11F9'), 'Hangul Jamo') self.assertEqual(self.db.block('\uD788'), 'Hangul Syllables') @@ -1048,6 +1053,9 @@ def test_block(self): # New in 17.0.0 self.assertEqual(self.db.block('\u1AEB'), 'Combining Diacritical Marks Extended') self.assertEqual(self.db.block('\U00011B67'), 'Sharada Supplement') + # Unassigned + self.assertEqual(self.db.block('\U00100000'), 'Supplementary Private Use Area-B') + self.assertEqual(self.db.block('\U0010FFFF'), 'Supplementary Private Use Area-B') self.assertRaises(TypeError, self.db.block) self.assertRaises(TypeError, self.db.block, b'x') diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index e7103a0d82ccec..bd2735d21eacaa 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -1237,12 +1237,8 @@ def __init__(self, version, ideograph_check=True): # See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189 self.blocks = [] for record in UcdFile(BLOCKS, version).records(): - rng = record[0] - name = record[1] - if '..' 
in rng: - s, e = [int(c, 16) for c in rng.split('..')] - else: - s = e = int(rng, 16) + start_end, name = record + s, e = [int(c, 16) for c in start_end.split('..')] self.blocks.append((s, e, name)) self.blocks.sort() From e108e27ceb501b63c4cd939c3e1c201b667f906a Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 21 Feb 2026 11:10:19 +0000 Subject: [PATCH 5/5] Stop being lazy --- Doc/library/unicodedata.rst | 2 +- Doc/whatsnew/3.15.rst | 2 +- Lib/test/test_unicodedata.py | 1 + ...026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst | 2 +- Modules/unicodedata.c | 17 +++++++++-------- Modules/unicodedata_db.h | 4 ++-- Tools/unicode/makeunicodedata.py | 16 ++++++++-------- 7 files changed, 23 insertions(+), 21 deletions(-) diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index 30029664f6f105..d5f0405efbecc6 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -133,7 +133,7 @@ following functions: .. function:: block(chr, /) Returns the `block - `_ + `_ assigned to the character *chr*. For example:: >>> unicodedata.block('S') diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index d47df4e7c0547e..50cd4cf089d750 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -1135,7 +1135,7 @@ unicodedata (Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.) * Add :func:`~unicodedata.block` function to return the `Unicode block - `_ + `_ assigned to a character. (Contributed by Stan Ulbrych in :gh:`66802`.) 
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index f1eaa6b1f89efa..8d4ba677faaa6f 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -1057,6 +1057,7 @@ def test_block(self): self.assertEqual(self.db.block('\U00100000'), 'Supplementary Private Use Area-B') self.assertEqual(self.db.block('\U0010FFFF'), 'Supplementary Private Use Area-B') + def test_block_invalid_input(self): self.assertRaises(TypeError, self.db.block) self.assertRaises(TypeError, self.db.block, b'x') self.assertRaises(TypeError, self.db.block, 120) diff --git a/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst b/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst index 04e150671b5f7a..68a25262c7d7f7 100644 --- a/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst +++ b/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst @@ -1,3 +1,3 @@ Add :func:`unicodedata.block` function to return the `Unicode block -`_ of a +`_ of a character. 
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 05ac686858dc3b..f20726a937ce38 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -2080,17 +2080,18 @@ unicodedata_block_impl(PyObject *module, int chr) /*[clinic end generated code: output=5f8b40c49eaec75a input=0834cf2642d6eaae]*/ { Py_UCS4 c = (Py_UCS4)chr; - int l = 0, h = BLOCK_COUNT - 1; - while (l <= h) { - int m = (l + h) / 2; - if (c < _PyUnicode_Blocks[m].s) { - h = m - 1; + int lo = 0, hi = BLOCK_COUNT - 1; + while (lo <= hi) { + int mid = (lo + hi) / 2; + if (c < _PyUnicode_Blocks[mid].start) { + hi = mid - 1; } - else if (c > _PyUnicode_Blocks[m].e) { - l = m + 1; + else if (c > _PyUnicode_Blocks[mid].end) { + lo = mid + 1; } else { - return PyUnicode_FromString(_PyUnicode_BlockNames[_PyUnicode_Blocks[m].name]); + size_t name = _PyUnicode_Blocks[mid].name; + return PyUnicode_FromString(_PyUnicode_BlockNames[name]); } } // Otherwise, return the default value per diff --git a/Modules/unicodedata_db.h b/Modules/unicodedata_db.h index 14f861eec56e96..9e88f5cca7115b 100644 --- a/Modules/unicodedata_db.h +++ b/Modules/unicodedata_db.h @@ -1145,8 +1145,8 @@ static const char * const _PyUnicode_BlockNames[] = { "Supplementary Private Use Area-B", }; typedef struct { - Py_UCS4 s; - Py_UCS4 e; + Py_UCS4 start; + Py_UCS4 end; unsigned short name; } _PyUnicode_Block; static const _PyUnicode_Block _PyUnicode_Blocks[] = { diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index bd2735d21eacaa..5db850ca2d1f0c 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -397,11 +397,11 @@ def makeunicodedata(unicode, trace): names = [] name_to_index = {} blocks = [] - for s, e, name in unicode.blocks: + for start, end, name in unicode.blocks: if name not in name_to_index: name_to_index[name] = len(names) names.append(name) - blocks.append((s, e, name_to_index[name])) + blocks.append((start, end, name_to_index[name])) fprint("static 
const char * const _PyUnicode_BlockNames[] = {") for name in names: @@ -409,14 +409,14 @@ def makeunicodedata(unicode, trace): fprint("};") fprint("typedef struct {") - fprint(" Py_UCS4 s;") - fprint(" Py_UCS4 e;") + fprint(" Py_UCS4 start;") + fprint(" Py_UCS4 end;") fprint(" unsigned short name;") fprint("} _PyUnicode_Block;") fprint("static const _PyUnicode_Block _PyUnicode_Blocks[] = {") - for s, e, name in blocks: - fprint(" {0x%04X, 0x%04X, %d}," % (s, e, name)) + for start, end, name in blocks: + fprint(" {0x%04X, 0x%04X, %d}," % (start, end, name)) fprint("};") fprint(f"#define BLOCK_COUNT {len(blocks)}") fprint() @@ -1238,8 +1238,8 @@ def __init__(self, version, ideograph_check=True): self.blocks = [] for record in UcdFile(BLOCKS, version).records(): start_end, name = record - s, e = [int(c, 16) for c in start_end.split('..')] - self.blocks.append((s, e, name)) + start, end = [int(c, 16) for c in start_end.split('..')] + self.blocks.append((start, end, name)) self.blocks.sort() def uselatin1(self):