From 84139edecd1674cce379b8357f664dea0f318572 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Wed, 24 Dec 2025 13:07:15 -0800 Subject: [PATCH] Fix comma --- unicodetools/data/linkification/dev/LinkEmail.txt | 7 ++++--- .../java/org/unicode/utilities/LinkUtilities.java | 15 +++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/unicodetools/data/linkification/dev/LinkEmail.txt b/unicodetools/data/linkification/dev/LinkEmail.txt index c1f1d8116..3cd024cd2 100644 --- a/unicodetools/data/linkification/dev/LinkEmail.txt +++ b/unicodetools/data/linkification/dev/LinkEmail.txt @@ -1,5 +1,5 @@ # LinkEmail.txt -# Date: 2025-12-20, 21:02:29 GMT +# Date: 2025-12-24, 21:06:25 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -26,7 +26,8 @@ # 0021 # 1.1 (!) EXCLAMATION MARK 0023..0027 # 1.1 [5] (#..') NUMBER SIGN..APOSTROPHE -002A..0039 # 1.1 [16] (*..9) ASTERISK..DIGIT NINE +002A..002B # 1.1 [2] (*..+) ASTERISK..PLUS SIGN +002D..0039 # 1.1 [13] (-..9) HYPHEN-MINUS..DIGIT NINE 003D # 1.1 (=) EQUALS SIGN 003F # 1.1 (?) QUESTION MARK 0041..005A # 1.1 [26] (A..Z) LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z @@ -1292,4 +1293,4 @@ FFDA..FFDC # 1.1 [3] (ᅳ..ᅵ) HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN 323B0..33479 # 17.0 [4298] (𲎰..𳑹) CJK UNIFIED IDEOGRAPH-323B0..CJK UNIFIED IDEOGRAPH-33479 E0100..E01EF # 4.0 [240] (U+E0100..U+E01EF) VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 149241 +# Total code points: 149240 diff --git a/unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java b/unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java index df6e95413..efdaa4443 100644 --- a/unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java +++ b/unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java @@ -190,14 +190,17 @@ private LinkTermination(String uset) { } } - // Note: the source standards are painful to read. - // https://en.wikipedia.org/wiki/Email_address#Local-part is much easier + // https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 has the full list for ASCII part + // See also https://en.wikipedia.org/wiki/Email_address#Local-part + // We add dot (ascii '.'), and then check after for the special dot constraints. - static final UnicodeSet EMAIL_EXCLUDES = - new UnicodeSet("[\\u0020 ; \\: \" ( ) \\[ \\] @ \\\\ < >]").freeze(); + static final UnicodeSet EMAIL_ASCII_INCLUDES = + new UnicodeSet("[[a-zA-Z][0-9][_ \\- ! ? ' \\{ \\} * / \\& # % ` \\^ + = | ~ \\$]]") + .add('.') + .freeze(); static final UnicodeSet validEmailLocalPart = - new UnicodeSet("[\\p{XID_Continue}\\p{block=basic_latin}-\\p{Cc}]") - .removeAll(EMAIL_EXCLUDES) + new UnicodeSet("[\\p{XID_Continue}-\\p{block=basic_latin}]") + .addAll(EMAIL_ASCII_INCLUDES) .freeze(); public static final UnicodeProperty LinkEmail = new UnicodeSetProperty()