Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Doc/library/unicodedata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,18 @@ following functions:
`Unicode Standard Annex #11 <https://www.unicode.org/reports/tr11/>`_.


.. function:: block(chr, /)

Returns the `block
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
assigned to the character *chr*. For example::

>>> unicodedata.block('S')
'Basic Latin'

.. versionadded:: next


.. function:: mirrored(chr, /)

Returns the mirrored property assigned to the character *chr* as
Expand Down
5 changes: 5 additions & 0 deletions Doc/whatsnew/3.15.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1134,6 +1134,11 @@ unicodedata
of the character which are related to the above algorithm.
(Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)

* Add :func:`~unicodedata.block` function to return the `Unicode block
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
assigned to a character.
(Contributed by Stan Ulbrych in :gh:`66802`.)


unittest
--------
Expand Down
91 changes: 91 additions & 0 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,97 @@ def graphemes(*args):
'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])

def test_block(self):
    # Table of (character, expected block name) samples.  The groups keep
    # the "New in X.Y.Z" labels used elsewhere in this test file; they
    # presumably name the Unicode version relevant to each code point.
    samples = (
        # Block boundaries in the BMP
        ('\u0000', 'Basic Latin'),
        ('\u0041', 'Basic Latin'),
        ('\u007F', 'Basic Latin'),
        ('\u0080', 'Latin-1 Supplement'),
        ('\u00FF', 'Latin-1 Supplement'),
        ('\u1159', 'Hangul Jamo'),
        ('\u11F9', 'Hangul Jamo'),
        ('\uD788', 'Hangul Syllables'),
        ('\uD7A3', 'Hangul Syllables'),
        # New in 5.0.0
        ('\u05BA', 'Hebrew'),
        ('\u20EF', 'Combining Diacritical Marks for Symbols'),
        # New in 5.1.0
        ('\u2064', 'General Punctuation'),
        ('\uAA4D', 'Cham'),
        # New in 5.2.0
        ('\u0816', 'Samaritan'),
        ('\uA97C', 'Hangul Jamo Extended-A'),
        ('\uD7C6', 'Hangul Jamo Extended-B'),
        ('\uD7FB', 'Hangul Jamo Extended-B'),
        # New in 6.0.0
        ('\u093A', 'Devanagari'),
        ('\U00011002', 'Brahmi'),
        # New in 6.1.0
        ('\U000E0FFF', 'No_Block'),
        ('\U00016F7E', 'Miao'),
        # New in 6.2.0
        ('\U0001F1E6', 'Enclosed Alphanumeric Supplement'),
        ('\U0001F1FF', 'Enclosed Alphanumeric Supplement'),
        # New in 6.3.0
        ('\u180E', 'Mongolian'),
        ('\u1A1B', 'Buginese'),
        # New in 7.0.0
        ('\u0E33', 'Thai'),
        ('\u0EB3', 'Lao'),
        ('\U0001BCA3', 'Shorthand Format Controls'),
        ('\U0001E8D6', 'Mende Kikakui'),
        ('\U0001163E', 'Modi'),
        # New in 8.0.0
        ('\u08E3', 'Arabic Extended-A'),
        ('\U00011726', 'Ahom'),
        # New in 9.0.0
        ('\u0600', 'Arabic'),
        ('\U000E007F', 'Tags'),
        ('\U00011CB4', 'Marchen'),
        ('\u200D', 'General Punctuation'),
        # New in 10.0.0
        ('\U00011D46', 'Masaram Gondi'),
        ('\U00011D47', 'Masaram Gondi'),
        ('\U00011A97', 'Soyombo'),
        # New in 11.0.0
        ('\U000110CD', 'Kaithi'),
        ('\u07FD', 'NKo'),
        ('\U00011EF6', 'Makasar'),
        # New in 12.0.0
        ('\U00011A84', 'Soyombo'),
        ('\U00013438', 'Egyptian Hieroglyph Format Controls'),
        ('\U0001E2EF', 'Wancho'),
        ('\U00016F87', 'Miao'),
        # New in 13.0.0
        ('\U00011941', 'Dives Akuru'),
        ('\U00016FE4', 'Ideographic Symbols and Punctuation'),
        ('\U00011942', 'Dives Akuru'),
        # New in 14.0.0
        ('\u0891', 'Arabic Extended-B'),
        ('\U0001E2AE', 'Toto'),
        # New in 15.0.0
        ('\U00011F02', 'Kawi'),
        ('\U0001343F', 'Egyptian Hieroglyph Format Controls'),
        ('\U0001E4EF', 'Nag Mundari'),
        ('\U00011F3F', 'Kawi'),
        # New in 16.0.0
        ('\U000113D1', 'Tulu-Tigalari'),
        ('\U0001E5EF', 'Ol Onal'),
        ('\U0001612C', 'Gurung Khema'),
        ('\U00016D63', 'Kirat Rai'),
        # New in 17.0.0
        ('\u1AEB', 'Combining Diacritical Marks Extended'),
        ('\U00011B67', 'Sharada Supplement'),
        # First and last code points of plane 16
        ('\U00100000', 'Supplementary Private Use Area-B'),
        ('\U0010FFFF', 'Supplementary Private Use Area-B'),
    )
    # Plain assertEqual (no subTest) so the run still stops at the first
    # mismatch, exactly like a straight-line sequence of assertions.
    for char, block_name in samples:
        self.assertEqual(self.db.block(char), block_name)

def test_block_invalid_input(self):
self.assertRaises(TypeError, self.db.block)
self.assertRaises(TypeError, self.db.block, b'x')
self.assertRaises(TypeError, self.db.block, 120)
self.assertRaises(TypeError, self.db.block, '')
self.assertRaises(TypeError, self.db.block, 'xx')


class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
db = unicodedata.ucd_3_2_0
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add :func:`unicodedata.block` function to return the `Unicode block
<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_ of a
character.
38 changes: 37 additions & 1 deletion Modules/clinic/unicodedata.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 35 additions & 1 deletion Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -1493,7 +1493,7 @@
}

if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) {
Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen);

Check warning on line 1496 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows (free-threading) / Build and test (x64)

'function': conversion from 'size_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1496 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows / Build and test (x64)

'function': conversion from 'size_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1496 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows / Build and test (arm64)

'function': conversion from 'size_t' to 'int', possible loss of data [C:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1496 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows (free-threading) / Build and test (arm64)

'function': conversion from 'size_t' to 'int', possible loss of data [C:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]
if (find_prefix_id(v) != i) {
return 0;
}
Expand Down Expand Up @@ -2066,6 +2066,39 @@
return (PyObject*)gbi;
}

/*[clinic input]
unicodedata.block

chr: int(accept={str})
/

Return block assigned to the character chr.
[clinic start generated code]*/

static PyObject *
unicodedata_block_impl(PyObject *module, int chr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you tried other approaches for this and compared their performance?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

| | Table with ranges & bin search (PR) | Adding to each record |
|---|---|---|
| unique properties | 395 | 1829 |
| size of unicodedata_db.h (bytes) | 626,364 | 759,474 |
| simple bench* | 498 nsec per loop | 490 nsec per loop |

* ./python -m timeit -v -r 10 -n 1000000 -s "import unicodedata; b=unicodedata.block" "b('\u0041');b('\u4E00');b('\U0010FFFF')"

The lookup difference is minor, and we save quite a bit of memory (~17%), so I think this approach is better.

/*[clinic end generated code: output=5f8b40c49eaec75a input=0834cf2642d6eaae]*/
{
// Look up the block containing code point `chr` by binary search over
// _PyUnicode_Blocks, and return its name as a new str object.
// NOTE(review): the search assumes the table entries are sorted by `start`
// and do not overlap -- presumably guaranteed by the table generator; confirm.
Py_UCS4 c = (Py_UCS4)chr;
// [lo, hi] is the inclusive range of table indices still to be searched.
int lo = 0, hi = BLOCK_COUNT - 1;
while (lo <= hi) {
int mid = (lo + hi) / 2;
if (c < _PyUnicode_Blocks[mid].start) {
// c lies before this block: continue in the lower half.
hi = mid - 1;
}
else if (c > _PyUnicode_Blocks[mid].end) {
// c lies after this block (`end` is inclusive): continue in the upper half.
lo = mid + 1;
}
else {
// start <= c <= end: `name` is an index into _PyUnicode_BlockNames.
size_t name = _PyUnicode_Blocks[mid].name;
return PyUnicode_FromString(_PyUnicode_BlockNames[name]);
}
}
// Otherwise, return the default value per
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189
return PyUnicode_FromString("No_Block");
}

/*[clinic input]
unicodedata.grapheme_cluster_break

Expand Down Expand Up @@ -2128,6 +2161,7 @@
// an UCD instance.
static PyMethodDef unicodedata_functions[] = {
// Module only functions.
UNICODEDATA_BLOCK_METHODDEF
UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF
UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF
UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF
Expand All @@ -2137,7 +2171,7 @@

// The following definitions are shared between the module
// and the UCD class.
#define DB_methods (unicodedata_functions + 6)
#define DB_methods (unicodedata_functions + 7)

UNICODEDATA_UCD_DECIMAL_METHODDEF
UNICODEDATA_UCD_DIGIT_METHODDEF
Expand Down
Loading
Loading