Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mypyc/doc/str_operations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ Methods
* ``s1.find(s2: str)``
* ``s1.find(s2: str, start: int)``
* ``s1.find(s2: str, start: int, end: int)``
* ``s.isspace()``
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was not documented in the str.isspace() PR, so I have added it now.

* ``s.isalnum()``
* ``s.join(x: Iterable)``
* ``s.lstrip()``
* ``s.lstrip(chars: str)``
Expand Down
1 change: 1 addition & 0 deletions mypyc/lib-rt/CPy.h
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,7 @@ Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged st
CPyTagged CPyStr_Ord(PyObject *obj);
PyObject *CPyStr_Multiply(PyObject *str, CPyTagged count);
bool CPyStr_IsSpace(PyObject *str);
bool CPyStr_IsAlnum(PyObject *str);

// Bytes operations

Expand Down
23 changes: 23 additions & 0 deletions mypyc/lib-rt/str_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -654,3 +654,26 @@ bool CPyStr_IsSpace(PyObject *str) {
}
return true;
}

// Runtime helper for str.isalnum(): returns true when the string is non-empty and
// every character is alphanumeric, and false otherwise.
// Assumes str is a unicode object; no type check is performed here (TODO confirm callers).
bool CPyStr_IsAlnum(PyObject *str) {
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
// An empty string is never alphanumeric.
if (len == 0) return false;

// Pure-ASCII strings: walk the 1-byte buffer and test each byte with Py_ISALNUM.
// NOTE(review): Py_ISALNUM is presumably ASCII-only, so this guard should stay
// IS_ASCII unless the loop switches to Py_UNICODE_ISALNUM -- confirm before
// widening it to every 1-byte-kind string.
if (PyUnicode_IS_ASCII(str)) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be PyUnicode_KIND(obj) == PyUnicode_1BYTE_KIND instead? This would be needed if the loop below was split into dedicated 2/4 byte loops.

const Py_UCS1 *data = PyUnicode_1BYTE_DATA(str);
for (Py_ssize_t i = 0; i < len; i++) {
if (!Py_ISALNUM(data[i]))
return false;
}
return true;
}

// General path: read each code point according to the string kind and classify it
// with Py_UNICODE_ISALNUM; the first non-alphanumeric code point ends the scan with
// false, and a full pass returns true.
int kind = PyUnicode_KIND(str);
const void *data = PyUnicode_DATA(str);
for (Py_ssize_t i = 0; i < len; i++) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Performance might increase if there was a separate loop for 2 byte and 4 byte kinds. This way the read operation wouldn't need to branch based on kind, which might result in better code. Can you try this out?

Py_UCS4 ch = PyUnicode_READ(kind, data, i);
if (!Py_UNICODE_ISALNUM(ch))
return false;
}
return true;
}
9 changes: 9 additions & 0 deletions mypyc/primitives/str_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,15 @@
error_kind=ERR_NEVER,
)

# str.isalnum()
# Maps the method onto the C helper CPyStr_IsAlnum, which takes the str receiver and
# returns a bool; error_kind=ERR_NEVER declares that the call never raises.
method_op(
name="isalnum",
arg_types=[str_rprimitive],
return_type=bool_rprimitive,
c_function_name="CPyStr_IsAlnum",
error_kind=ERR_NEVER,
)


# obj.decode()
method_op(
name="decode",
Expand Down
1 change: 1 addition & 0 deletions mypyc/test-data/fixtures/ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def removesuffix(self, suffix: str, /) -> str: ...
def islower(self) -> bool: ...
def count(self, substr: str, start: Optional[int] = None, end: Optional[int] = None) -> int: pass
def isspace(self) -> bool: ...
def isalnum(self) -> bool: ...

class float:
def __init__(self, x: object) -> None: pass
Expand Down
11 changes: 11 additions & 0 deletions mypyc/test-data/irbuild-str.test
Original file line number Diff line number Diff line change
Expand Up @@ -983,3 +983,14 @@ def is_space(x):
L0:
r0 = CPyStr_IsSpace(x)
return r0

[case testStrIsAlnum]
def is_alnum(x: str) -> bool:
return x.isalnum()
[out]
def is_alnum(x):
x :: str
r0 :: bool
L0:
r0 = CPyStr_IsAlnum(x)
return r0
30 changes: 30 additions & 0 deletions mypyc/test-data/run-strings.test
Original file line number Diff line number Diff line change
Expand Up @@ -1269,3 +1269,33 @@ def test_isspace() -> None:
c = chr(i)
a: Any = c
assert c.isspace() == a.isspace()

[case testIsAlnum]
def test_isalnum_basic() -> None:
    """Check str.isalnum() on ASCII-only literals."""
    # Letters, digits, and mixtures of the two are alphanumeric.
    for text in ("abc", "ABC", "abc123", "123"):
        assert text.isalnum()
    # The empty string, and anything holding a space or punctuation, is not.
    for text in ("", " ", "abc!", "hello world", "abc-123"):
        assert not text.isalnum()

def test_isalnum_unicode() -> None:
    """Check str.isalnum() on non-ASCII input (UCS-1, UCS-2 and UCS-4 characters)."""
    # Single chars: letters and digits from various scripts
    single_alnum = (
        "\u00E9",  # é (UCS-1 Latin letter)
        "\u0660",  # ٠ (UCS-2 Arabic-Indic digit)
        "\u4E2D",  # 中 (UCS-2 CJK ideograph)
        "\U00010400",  # 𐐀 (UCS-4 Deseret capital letter long I)
    )
    for text in single_alnum:
        assert text.isalnum()

    single_not_alnum = (
        "\u2000",  # EN QUAD (whitespace)
        "\u0021",  # !
        "\u00B6",  # ¶ (pilcrow sign, punctuation)
    )
    for text in single_not_alnum:
        assert not text.isalnum()

    # Mixed Unicode letters and digits — all alnum
    mixed_alnum = (
        "\u00E9\u0660",  # é٠
        "\u4E2D\u0041\u0660",  # 中A٠
    )
    for text in mixed_alnum:
        assert text.isalnum()

    # Unicode letter/digit mixed with punctuation — not alnum
    mixed_not_alnum = (
        "\u00E9!",
        "\u4E2D\u2000",  # CJK + whitespace
    )
    for text in mixed_not_alnum:
        assert not text.isalnum()