Skip to content

Commit f1f61bf

Browse files
gh-66802: Add unicodedata.block() function (#145042)
Closes #66802
1 parent 6940c1d commit f1f61bf

File tree

8 files changed

+922
-2
lines changed

8 files changed

+922
-2
lines changed

Doc/library/unicodedata.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,18 @@ following functions:
130130
`Unicode Standard Annex #11 <https://www.unicode.org/reports/tr11/>`_.
131131

132132

133+
.. function:: block(chr, /)
134+
135+
Returns the `block
136+
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
137+
assigned to the character *chr*, or ``'No_Block'`` if *chr* does not belong to any block. For example::
138+
139+
>>> unicodedata.block('S')
140+
'Basic Latin'
141+
142+
.. versionadded:: next
143+
144+
133145
.. function:: mirrored(chr, /)
134146

135147
Returns the mirrored property assigned to the character *chr* as

Doc/whatsnew/3.15.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,6 +1134,11 @@ unicodedata
11341134
of the character which are related to the above algorithm.
11351135
(Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)
11361136

1137+
* Add :func:`~unicodedata.block` function to return the `Unicode block
1138+
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
1139+
assigned to a character.
1140+
(Contributed by Stan Ulbrych in :gh:`66802`.)
1141+
11371142

11381143
unittest
11391144
--------

Lib/test/test_unicodedata.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,97 @@ def graphemes(*args):
973973
'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
974974
['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
975975

976+
def test_block(self):
    """Check block() against a table of (character, block name) pairs.

    Each pair runs in its own subTest, so one wrong block name does not
    hide mismatches for the code points that follow it.  The
    "New in X" comments are the grouping used when the cases were
    written.
    """
    expected_blocks = (
        ('\u0000', 'Basic Latin'),
        ('\u0041', 'Basic Latin'),
        ('\u007F', 'Basic Latin'),
        ('\u0080', 'Latin-1 Supplement'),
        ('\u00FF', 'Latin-1 Supplement'),
        ('\u1159', 'Hangul Jamo'),
        ('\u11F9', 'Hangul Jamo'),
        ('\uD788', 'Hangul Syllables'),
        ('\uD7A3', 'Hangul Syllables'),
        # New in 5.0.0
        ('\u05BA', 'Hebrew'),
        ('\u20EF', 'Combining Diacritical Marks for Symbols'),
        # New in 5.1.0
        ('\u2064', 'General Punctuation'),
        ('\uAA4D', 'Cham'),
        # New in 5.2.0
        ('\u0816', 'Samaritan'),
        ('\uA97C', 'Hangul Jamo Extended-A'),
        ('\uD7C6', 'Hangul Jamo Extended-B'),
        ('\uD7FB', 'Hangul Jamo Extended-B'),
        # New in 6.0.0
        ('\u093A', 'Devanagari'),
        ('\U00011002', 'Brahmi'),
        # New in 6.1.0
        ('\U000E0FFF', 'No_Block'),
        ('\U00016F7E', 'Miao'),
        # New in 6.2.0
        ('\U0001F1E6', 'Enclosed Alphanumeric Supplement'),
        ('\U0001F1FF', 'Enclosed Alphanumeric Supplement'),
        # New in 6.3.0
        ('\u180E', 'Mongolian'),
        ('\u1A1B', 'Buginese'),
        # New in 7.0.0
        ('\u0E33', 'Thai'),
        ('\u0EB3', 'Lao'),
        ('\U0001BCA3', 'Shorthand Format Controls'),
        ('\U0001E8D6', 'Mende Kikakui'),
        ('\U0001163E', 'Modi'),
        # New in 8.0.0
        ('\u08E3', 'Arabic Extended-A'),
        ('\U00011726', 'Ahom'),
        # New in 9.0.0
        ('\u0600', 'Arabic'),
        ('\U000E007F', 'Tags'),
        ('\U00011CB4', 'Marchen'),
        ('\u200D', 'General Punctuation'),
        # New in 10.0.0
        ('\U00011D46', 'Masaram Gondi'),
        ('\U00011D47', 'Masaram Gondi'),
        ('\U00011A97', 'Soyombo'),
        # New in 11.0.0
        ('\U000110CD', 'Kaithi'),
        ('\u07FD', 'NKo'),
        ('\U00011EF6', 'Makasar'),
        # New in 12.0.0
        ('\U00011A84', 'Soyombo'),
        ('\U00013438', 'Egyptian Hieroglyph Format Controls'),
        ('\U0001E2EF', 'Wancho'),
        ('\U00016F87', 'Miao'),
        # New in 13.0.0
        ('\U00011941', 'Dives Akuru'),
        ('\U00016FE4', 'Ideographic Symbols and Punctuation'),
        ('\U00011942', 'Dives Akuru'),
        # New in 14.0.0
        ('\u0891', 'Arabic Extended-B'),
        ('\U0001E2AE', 'Toto'),
        # New in 15.0.0
        ('\U00011F02', 'Kawi'),
        ('\U0001343F', 'Egyptian Hieroglyph Format Controls'),
        ('\U0001E4EF', 'Nag Mundari'),
        ('\U00011F3F', 'Kawi'),
        # New in 16.0.0
        ('\U000113D1', 'Tulu-Tigalari'),
        ('\U0001E5EF', 'Ol Onal'),
        ('\U0001612C', 'Gurung Khema'),
        ('\U00016D63', 'Kirat Rai'),
        # New in 17.0.0
        ('\u1AEB', 'Combining Diacritical Marks Extended'),
        ('\U00011B67', 'Sharada Supplement'),
        # Highest plane (private use); U+10FFFF is the last code point.
        ('\U00100000', 'Supplementary Private Use Area-B'),
        ('\U0010FFFF', 'Supplementary Private Use Area-B'),
    )
    for char, block_name in expected_blocks:
        # Label the subtest with the code point so a failure names the
        # offending character rather than just the two block strings.
        with self.subTest(char=f'U+{ord(char):04X}'):
            self.assertEqual(self.db.block(char), block_name)
1059+
1060+
def test_block_invalid_input(self):
    """block() raises TypeError unless given exactly one 1-character str."""
    block = self.db.block

    # Missing argument.
    with self.assertRaises(TypeError):
        block()

    # Wrong type: bytes and int are not accepted in place of str.
    with self.assertRaises(TypeError):
        block(b'x')
    with self.assertRaises(TypeError):
        block(120)

    # Wrong length: the empty string and multi-character strings.
    with self.assertRaises(TypeError):
        block('')
    with self.assertRaises(TypeError):
        block('xx')
1066+
9761067

9771068
class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
9781069
db = unicodedata.ucd_3_2_0
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Add :func:`unicodedata.block` function to return the `Unicode block
2+
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_ of a
3+
character.

Modules/clinic/unicodedata.c.h

Lines changed: 37 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/unicodedata.c

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2066,6 +2066,39 @@ unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
20662066
return (PyObject*)gbi;
20672067
}
20682068

2069+
/*[clinic input]
2070+
unicodedata.block
2071+
2072+
chr: int(accept={str})
2073+
/
2074+
2075+
Return block assigned to the character chr.
2076+
[clinic start generated code]*/
2077+
2078+
static PyObject *
2079+
unicodedata_block_impl(PyObject *module, int chr)
2080+
/*[clinic end generated code: output=5f8b40c49eaec75a input=0834cf2642d6eaae]*/
2081+
{
2082+
Py_UCS4 c = (Py_UCS4)chr;
2083+
int lo = 0, hi = BLOCK_COUNT - 1;
2084+
while (lo <= hi) {
2085+
int mid = (lo + hi) / 2;
2086+
if (c < _PyUnicode_Blocks[mid].start) {
2087+
hi = mid - 1;
2088+
}
2089+
else if (c > _PyUnicode_Blocks[mid].end) {
2090+
lo = mid + 1;
2091+
}
2092+
else {
2093+
size_t name = _PyUnicode_Blocks[mid].name;
2094+
return PyUnicode_FromString(_PyUnicode_BlockNames[name]);
2095+
}
2096+
}
2097+
// Otherwise, return the default value per
2098+
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189
2099+
return PyUnicode_FromString("No_Block");
2100+
}
2101+
20692102
/*[clinic input]
20702103
unicodedata.grapheme_cluster_break
20712104
@@ -2128,6 +2161,7 @@ unicodedata_extended_pictographic_impl(PyObject *module, int chr)
21282161
// an UCD instance.
21292162
static PyMethodDef unicodedata_functions[] = {
21302163
// Module only functions.
2164+
UNICODEDATA_BLOCK_METHODDEF
21312165
UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF
21322166
UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF
21332167
UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF
@@ -2137,7 +2171,7 @@ static PyMethodDef unicodedata_functions[] = {
21372171

21382172
// The following definitions are shared between the module
21392173
// and the UCD class.
2140-
#define DB_methods (unicodedata_functions + 6)
2174+
#define DB_methods (unicodedata_functions + 7)
21412175

21422176
UNICODEDATA_UCD_DECIMAL_METHODDEF
21432177
UNICODEDATA_UCD_DIGIT_METHODDEF

0 commit comments

Comments
 (0)