diff --git a/unicodetools/data/linkification/dev/LinkEmail.txt b/unicodetools/data/linkification/dev/LinkEmail.txt index b4a520a7c..70613a565 100644 --- a/unicodetools/data/linkification/dev/LinkEmail.txt +++ b/unicodetools/data/linkification/dev/LinkEmail.txt @@ -1,5 +1,5 @@ # LinkEmail.txt -# Date: 2025-12-24, 02:37:15 GMT +# Date: 2025-12-24, 21:06:25 GMT # © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html @@ -26,7 +26,8 @@ # 0021 # 1.1 (!) EXCLAMATION MARK 0023..0027 # 1.1 [5] (#..') NUMBER SIGN..APOSTROPHE -002A..0039 # 1.1 [16] (*..9) ASTERISK..DIGIT NINE +002A..002B # 1.1 [2] (*..+) ASTERISK..PLUS SIGN +002D..0039 # 1.1 [13] (-..9) HYPHEN-MINUS..DIGIT NINE 003D # 1.1 (=) EQUALS SIGN 003F # 1.1 (?) QUESTION MARK 0041..005A # 1.1 [26] (A..Z) LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z @@ -1331,4 +1332,4 @@ FFDA..FFDC # 1.1 [3] (ᅳ..ᅵ) HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN 3D000..3FC3F # 18.0 [11328] (U+3D000..U+3FC3F) SEAL CHARACTER-3D000..SEAL CHARACTER-3FC3F E0100..E01EF # 4.0 [240] (U+E0100..U+E01EF) VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 162119 +# Total code points: 149240 diff --git a/unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java b/unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java index 4b5ad875f..2302b2ca0 100644 --- a/unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java +++ b/unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java @@ -194,11 +194,14 @@ private LinkTermination(String uset) { } } - // Note: the source standards are painful to read. 
- // https://en.wikipedia.org/wiki/Email_address#Local-part is much easier + // https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 (atext) lists the ASCII part + // See also https://en.wikipedia.org/wiki/Email_address#Local-part + // We also add the ASCII dot ('.'); its special constraints are checked separately, later. - static final UnicodeSet EMAIL_EXCLUDES = - new UnicodeSet("[\\u0020 ; \\: \" ( ) \\[ \\] @ \\\\ < >]").freeze(); + static final UnicodeSet EMAIL_ASCII_INCLUDES = + new UnicodeSet("[[a-zA-Z][0-9][_ \\- ! ? ' \\{ \\} * / \\& # % ` \\^ + = | ~ \\$]]") + .add('.') + .freeze(); static final UnicodeSet validEmailLocalPart = new UnicodeSet( "[\\p{XID_Continue}\\p{block=basic_latin}-\\p{Cc}]",