From fe6fd4ebc9ffdd4cd9ff71f468dadc70d463ffd6 Mon Sep 17 00:00:00 2001 From: "xuan.huang" <5563315+Huxpro@users.noreply.github.com> Date: Wed, 1 Apr 2026 00:46:00 -0400 Subject: [PATCH 1/2] fix: avoid Unicode property escape parse failures --- src/analysis.ts | 20 ++++++++++++++++---- src/measurement.ts | 32 ++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/analysis.ts b/src/analysis.ts index a22d881e..a4200d90 100644 --- a/src/analysis.ts +++ b/src/analysis.ts @@ -94,9 +94,21 @@ export function setAnalysisLocale(locale?: string): void { sharedWordSegmenter = null } -const arabicScriptRe = /\p{Script=Arabic}/u -const combiningMarkRe = /\p{M}/u -const decimalDigitRe = /\p{Nd}/u +const arabicScriptFallbackRe = /[\u0600-\u0604\u0606-\u060B\u060D-\u061A\u061C-\u061E\u0620-\u063F\u0641-\u064A\u0656-\u066F\u0671-\u06DC\u06DE-\u06FF\u0750-\u077F\u0870-\u0891\u0897-\u08E1\u08E3-\u08FF\uFB50-\uFD3D\uFD40-\uFDCF\uFDF0-\uFDFF\uFE70-\uFE74\uFE76-\uFEFC\u{10E60}-\u{10E7E}\u{10EC2}-\u{10EC7}\u{10ED0}-\u{10ED8}\u{10EFA}-\u{10EFF}\u{1EE00}-\u{1EE03}\u{1EE05}-\u{1EE1F}\u{1EE21}\u{1EE22}\u{1EE24}\u{1EE27}\u{1EE29}-\u{1EE32}\u{1EE34}-\u{1EE37}\u{1EE39}\u{1EE3B}\u{1EE42}\u{1EE47}\u{1EE49}\u{1EE4B}\u{1EE4D}-\u{1EE4F}\u{1EE51}\u{1EE52}\u{1EE54}\u{1EE57}\u{1EE59}\u{1EE5B}\u{1EE5D}\u{1EE5F}\u{1EE61}\u{1EE62}\u{1EE64}\u{1EE67}-\u{1EE6A}\u{1EE6C}-\u{1EE72}\u{1EE74}-\u{1EE77}\u{1EE79}-\u{1EE7C}\u{1EE7E}\u{1EE80}-\u{1EE89}\u{1EE8B}-\u{1EE9B}\u{1EEA1}-\u{1EEA3}\u{1EEA5}-\u{1EEA9}\u{1EEAB}-\u{1EEBB}\u{1EEF0}\u{1EEF1}]/u +const combiningMarkFallbackRe = /[\u0300-\u036F\u0483-\u0489\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3\u07FD\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u0897-\u089F\u08CA-\u08E1\u08E3-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u09FE\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3\u0AFA-\u0AFF\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B55-\u0B57\u0B62\u0B63\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C00-\u0C04\u0C3C\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63\u0C81-\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3\u0CF3\u0D00-\u0D03\u0D3B\u0D3C\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63\u0D81-\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EBC\u0EC8-\u0ECE\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F8D-\u0F97\u0F99-\u0FBC\u0FC6\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F\u109A-\u109D\u135D-\u135F\u1712-\u1715\u1732-\u1734\u1752\u1753\u1772\u1773\u17B4-\u17D3\u17DD\u180B-\u180D\u180F\u1885\u1886\u18A9\u1920-\u192B\u1930-\u193B\u1A17-\u1A1B\u1A55-\u1A5E\u1A60-\u1A7C\u1A7F\u1AB0-\u1ADD\u1AE0-\u1AEB\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAD\u1BE6-\u1BF3\u1C24-\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE8\u1CED\u1CF4\u1CF7-\u1CF9\u1DC0-\u1DFF\u20D0-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F-\uA672\uA674-\uA67D\uA69E\uA69F\uA6F0\uA6F1\uA802\uA806\uA80B\uA823-\uA827\uA82C\uA880\uA881\uA8B4-\uA8C5\uA8E0-\uA8F1\uA8FF\uA926-\uA92D\uA947-\uA953\uA980-\uA983\uA9B3-\uA9C0\uA9E5\uAA29-\uAA36\uAA43\uAA4C\uAA4D\uAA7B-\uAA7D\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uAAEB-\uAAEF\uAAF5\uAAF6\uABE3-\uABEA\uABEC\uABED\uFB1E\uFE00-\uFE0F\uFE20-\uFE2F\u{101FD}\u{102E0}\u{10376}-\u{1037A}\u{10A01}-\u{10A03}\u{10A05}\u{10A06}\u{10A0C}-\u{10A0F}\u{10A38}-\u{10A3A}\u{10A3F}\u{10AE5}\u{10AE6}\u{10D24}-\u{10D27}\u{10D69}-\u{10D6D}\u{10EAB}\u{10EAC}\u{10EFA}-\u{10EFF}\u{10F46}-\u{10F50}\u{10F82}-\u{10F85}\u{11000}-\u{11002}\u{11038}-\u{11046}\u{11070}\u{11073}\u{11074}\u{1107F}-\u{11082}\u{110B0}-\u{110BA}\u{110C2}\u{11100}-\u{11102}\u{11127}-\u{11134}\u{11145}\u{11146}\u{11173}\u{11180}-\u{11182}\u{111B3}-\u{111C0}\u{111C9}-\u{111CC}\u{111CE}\u{111CF}\u{1122C}-\u{11237}\u{1123E}\u{11241}\u{112DF}-\u{112EA}\u{11300}-\u{11303}\u{1133B}\u{1133C}\u{1133E}-\u{11344}\u{11347}\u{11348}\u{1134B}-\u{1134D}\u{11357}\u{11362}\u{11363}\u{11366}-\u{1136C}\u{11370}-\u{11374}\u{113B8}-\u{113C0}\u{113C2}\u{113C5}\u{113C7}-\u{113CA}\u{113CC}-\u{113D0}\u{113D2}\u{113E1}\u{113E2}\u{11435}-\u{11446}\u{1145E}\u{114B0}-\u{114C3}\u{115AF}-\u{115B5}\u{115B8}-\u{115C0}\u{115DC}\u{115DD}\u{11630}-\u{11640}\u{116AB}-\u{116B7}\u{1171D}-\u{1172B}\u{1182C}-\u{1183A}\u{11930}-\u{11935}\u{11937}\u{11938}\u{1193B}-\u{1193E}\u{11940}\u{11942}\u{11943}\u{119D1}-\u{119D7}\u{119DA}-\u{119E0}\u{119E4}\u{11A01}-\u{11A0A}\u{11A33}-\u{11A39}\u{11A3B}-\u{11A3E}\u{11A47}\u{11A51}-\u{11A5B}\u{11A8A}-\u{11A99}\u{11B60}-\u{11B67}\u{11C2F}-\u{11C36}\u{11C38}-\u{11C3F}\u{11C92}-\u{11CA7}\u{11CA9}-\u{11CB6}\u{11D31}-\u{11D36}\u{11D3A}\u{11D3C}\u{11D3D}\u{11D3F}-\u{11D45}\u{11D47}\u{11D8A}-\u{11D8E}\u{11D90}\u{11D91}\u{11D93}-\u{11D97}\u{11EF3}-\u{11EF6}\u{11F00}\u{11F01}\u{11F03}\u{11F34}-\u{11F3A}\u{11F3E}-\u{11F42}\u{11F5A}\u{13440}\u{13447}-\u{13455}\u{1611E}-\u{1612F}\u{16AF0}-\u{16AF4}\u{16B30}-\u{16B36}\u{16F4F}\u{16F51}-\u{16F87}\u{16F8F}-\u{16F92}\u{16FE4}\u{16FF0}\u{16FF1}\u{1BC9D}\u{1BC9E}\u{1CF00}-\u{1CF2D}\u{1CF30}-\u{1CF46}\u{1D165}-\u{1D169}\u{1D16D}-\u{1D172}\u{1D17B}-\u{1D182}\u{1D185}-\u{1D18B}\u{1D1AA}-\u{1D1AD}\u{1D242}-\u{1D244}\u{1DA00}-\u{1DA36}\u{1DA3B}-\u{1DA6C}\u{1DA75}\u{1DA84}\u{1DA9B}-\u{1DA9F}\u{1DAA1}-\u{1DAAF}\u{1E000}-\u{1E006}\u{1E008}-\u{1E018}\u{1E01B}-\u{1E021}\u{1E023}\u{1E024}\u{1E026}-\u{1E02A}\u{1E08F}\u{1E130}-\u{1E136}\u{1E2AE}\u{1E2EC}-\u{1E2EF}\u{1E4EC}-\u{1E4EF}\u{1E5EE}\u{1E5EF}\u{1E6E3}\u{1E6E6}\u{1E6EE}\u{1E6EF}\u{1E6F5}\u{1E8D0}-\u{1E8D6}\u{1E944}-\u{1E94A}\u{E0100}-\u{E01EF}]/u +const decimalDigitFallbackRe = /[0-9\u0660-\u0669\u06F0-\u06F9\u07C0-\u07C9\u0966-\u096F\u09E6-\u09EF\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE6-\u0BEF\u0C66-\u0C6F\u0CE6-\u0CEF\u0D66-\u0D6F\u0DE6-\u0DEF\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F29\u1040-\u1049\u1090-\u1099\u17E0-\u17E9\u1810-\u1819\u1946-\u194F\u19D0-\u19D9\u1A80-\u1A89\u1A90-\u1A99\u1B50-\u1B59\u1BB0-\u1BB9\u1C40-\u1C49\u1C50-\u1C59\uA620-\uA629\uA8D0-\uA8D9\uA900-\uA909\uA9D0-\uA9D9\uA9F0-\uA9F9\uAA50-\uAA59\uABF0-\uABF9\uFF10-\uFF19\u{104A0}-\u{104A9}\u{10D30}-\u{10D39}\u{10D40}-\u{10D49}\u{11066}-\u{1106F}\u{110F0}-\u{110F9}\u{11136}-\u{1113F}\u{111D0}-\u{111D9}\u{112F0}-\u{112F9}\u{11450}-\u{11459}\u{114D0}-\u{114D9}\u{11650}-\u{11659}\u{116C0}-\u{116C9}\u{116D0}-\u{116E3}\u{11730}-\u{11739}\u{118E0}-\u{118E9}\u{11950}-\u{11959}\u{11BF0}-\u{11BF9}\u{11C50}-\u{11C59}\u{11D50}-\u{11D59}\u{11DA0}-\u{11DA9}\u{11DE0}-\u{11DE9}\u{11F50}-\u{11F59}\u{16130}-\u{16139}\u{16A60}-\u{16A69}\u{16AC0}-\u{16AC9}\u{16B50}-\u{16B59}\u{16D70}-\u{16D79}\u{1CCF0}-\u{1CCF9}\u{1D7CE}-\u{1D7FF}\u{1E140}-\u{1E149}\u{1E2F0}-\u{1E2F9}\u{1E4F0}-\u{1E4F9}\u{1E5F1}-\u{1E5FA}\u{1E950}-\u{1E959}\u{1FBF0}-\u{1FBF9}]/u + +function createUnicodePropertyRegex(source: string, fallback: RegExp): RegExp { + try { + return new RegExp(source, 'u') + } catch { + return fallback + } +} + +const arabicScriptRe = createUnicodePropertyRegex('\\p{Script=Arabic}', arabicScriptFallbackRe) +const combiningMarkRe = createUnicodePropertyRegex('\\p{M}', combiningMarkFallbackRe) +const decimalDigitRe = createUnicodePropertyRegex('\\p{Nd}', decimalDigitFallbackRe) function containsArabicScript(text: string): boolean { return arabicScriptRe.test(text) @@ -303,7 +315,7 @@ function endsWithMyanmarMedialGlue(segment: string): boolean { function splitLeadingSpaceAndMarks(segment: string): { space: string, marks: string } | null { if (segment.length < 2 || segment[0] !== ' ') return null const marks = segment.slice(1) - if (/^\p{M}+$/u.test(marks)) { + if (marks.length > 0 && Array.from(marks).every((mark) => combiningMarkRe.test(mark))) { return { space: ' ', marks } } return null diff --git a/src/measurement.ts b/src/measurement.ts index b2fb6d57..4ec39b56 100644 --- a/src/measurement.ts +++ b/src/measurement.ts @@ -19,11 +19,35 @@ let measureContext: CanvasRenderingContext2D | OffscreenCanvasRenderingContext2D const segmentMetricCaches = new Map>() let cachedEngineProfile: EngineProfile | null = null -const emojiPresentationRe = /\p{Emoji_Presentation}/u -const maybeEmojiRe = /[\p{Emoji_Presentation}\p{Extended_Pictographic}\p{Regional_Indicator}\uFE0F\u20E3]/u +const maybeEmojiFallbackRe = /[\xA9\xAE\u203C\u2049\u20E3\u2122\u2139\u2194-\u2199\u21A9\u21AA\u231A\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA\u25AB\u25B6\u25C0\u25FB-\u25FE\u2600-\u2604\u260E\u2611\u2614\u2615\u2618\u261D\u2620\u2622\u2623\u2626\u262A\u262E\u262F\u2638-\u263A\u2640\u2642\u2648-\u2653\u265F\u2660\u2663\u2665\u2666\u2668\u267B\u267E\u267F\u2692-\u2697\u2699\u269B\u269C\u26A0\u26A1\u26A7\u26AA\u26AB\u26B0\u26B1\u26BD\u26BE\u26C4\u26C5\u26C8\u26CE\u26CF\u26D1\u26D3\u26D4\u26E9\u26EA\u26F0-\u26F5\u26F7-\u26FA\u26FD\u2702\u2705\u2708-\u270D\u270F\u2712\u2714\u2716\u271D\u2721\u2728\u2733\u2734\u2744\u2747\u274C\u274E\u2753-\u2755\u2757\u2763\u2764\u2795-\u2797\u27A1\u27B0\u27BF\u2934\u2935\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299\uFE0F\u{1F004}\u{1F02C}-\u{1F02F}\u{1F094}-\u{1F09F}\u{1F0AF}\u{1F0B0}\u{1F0C0}\u{1F0CF}\u{1F0D0}\u{1F0F6}-\u{1F0FF}\u{1F170}\u{1F171}\u{1F17E}\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1AE}-\u{1F1FF}\u{1F201}-\u{1F20F}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F23C}-\u{1F23F}\u{1F249}-\u{1F25F}\u{1F266}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F8}-\u{1F4FD}\u{1F3F7}-\u{1F3FA}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}\u{1F596}\u{1F5A4}\u{1F5A5}\u{1F5A8}\u{1F5B1}\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6D5}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6F0}\u{1F6F3}-\u{1F6FF}\u{1F7DA}-\u{1F7FF}\u{1F80C}-\u{1F80F}\u{1F848}-\u{1F84F}\u{1F85A}-\u{1F85F}\u{1F888}-\u{1F88F}\u{1F8AE}\u{1F8AF}\u{1F8BC}-\u{1F8BF}\u{1F8C2}-\u{1F8CF}\u{1F8D9}-\u{1F8FF}\u{1F90C}-\u{1F93A}\u{1F93C}-\u{1F945}\u{1F947}-\u{1F9FF}\u{1FA58}-\u{1FA5F}\u{1FA6E}-\u{1FAFF}\u{1FC00}-\u{1FFFD}]/u +let emojiPresentationRe: RegExp | null = null +let maybeEmojiRe: RegExp | null = null let sharedGraphemeSegmenter: Intl.Segmenter | null = null const emojiCorrectionCache = new Map() +function getEmojiPresentationRe(): RegExp { + if (emojiPresentationRe !== null) return emojiPresentationRe + try { + emojiPresentationRe = new RegExp('\\p{Emoji_Presentation}', 'u') + } catch { + emojiPresentationRe = maybeEmojiFallbackRe + } + return emojiPresentationRe +} + +function getMaybeEmojiRe(): RegExp { + if (maybeEmojiRe !== null) return maybeEmojiRe + try { + maybeEmojiRe = new RegExp( + '[\\p{Emoji_Presentation}\\p{Extended_Pictographic}\\p{Regional_Indicator}\\uFE0F\\u20E3]', + 'u', + ) + } catch { + maybeEmojiRe = maybeEmojiFallbackRe + } + return maybeEmojiRe +} + export function getMeasureContext(): CanvasRenderingContext2D | OffscreenCanvasRenderingContext2D { if (measureContext !== null) return measureContext @@ -113,11 +137,11 @@ function getSharedGraphemeSegmenter(): Intl.Segmenter { } function isEmojiGrapheme(g: string): boolean { - return emojiPresentationRe.test(g) || g.includes('\uFE0F') + return getEmojiPresentationRe().test(g) || g.includes('\uFE0F') } export function textMayContainEmoji(text: string): boolean { - return maybeEmojiRe.test(text) + return getMaybeEmojiRe().test(text) } function getEmojiCorrection(font: string, fontSize: number): number { From 4c5d8a7ed6cc3e2fe2c4dd132cd326ca872ae89c Mon Sep 17 00:00:00 2001 From: "xuan.huang" <5563315+Huxpro@users.noreply.github.com> Date: Wed, 1 Apr 2026 04:29:28 -0400 Subject: [PATCH 2/2] feat: add a compat entrypoint for portable regex support --- package.json | 5 + src/analysis.ts | 20 +- src/compat/analysis.ts | 1019 +++++++++++++++++++++++++++++++++++ src/compat/layout.ts | 717 +++++++++++++++++++++++++ src/compat/line-break.ts | 1056 +++++++++++++++++++++++++++++++++++++ src/compat/measurement.ts | 255 +++++++++ src/layout.test.ts | 37 ++ src/measurement.ts | 32 +- 8 files changed, 3097 insertions(+), 44 deletions(-) create mode 100644 src/compat/analysis.ts create mode 100644 src/compat/layout.ts create mode 100644 src/compat/line-break.ts create mode 100644 src/compat/measurement.ts diff --git a/package.json b/package.json index bda5a13a..1f78bcf2 100644 --- a/package.json +++ b/package.json @@ -14,6 +14,11 @@ "import": "./dist/layout.js", "default": "./dist/layout.js" }, + "./compat": { + "types": "./dist/compat/layout.d.ts", + "import": "./dist/compat/layout.js", + "default": "./dist/compat/layout.js" + }, "./demos/*": "./pages/demos/*", "./assets/*": "./pages/assets/*", "./package.json": "./package.json" diff --git a/src/analysis.ts b/src/analysis.ts index a4200d90..a22d881e 100644 --- a/src/analysis.ts +++ b/src/analysis.ts @@ -94,21 +94,9 @@ export function setAnalysisLocale(locale?: string): void { sharedWordSegmenter = null } -const arabicScriptFallbackRe = /[\u0600-\u0604\u0606-\u060B\u060D-\u061A\u061C-\u061E\u0620-\u063F\u0641-\u064A\u0656-\u066F\u0671-\u06DC\u06DE-\u06FF\u0750-\u077F\u0870-\u0891\u0897-\u08E1\u08E3-\u08FF\uFB50-\uFD3D\uFD40-\uFDCF\uFDF0-\uFDFF\uFE70-\uFE74\uFE76-\uFEFC\u{10E60}-\u{10E7E}\u{10EC2}-\u{10EC7}\u{10ED0}-\u{10ED8}\u{10EFA}-\u{10EFF}\u{1EE00}-\u{1EE03}\u{1EE05}-\u{1EE1F}\u{1EE21}\u{1EE22}\u{1EE24}\u{1EE27}\u{1EE29}-\u{1EE32}\u{1EE34}-\u{1EE37}\u{1EE39}\u{1EE3B}\u{1EE42}\u{1EE47}\u{1EE49}\u{1EE4B}\u{1EE4D}-\u{1EE4F}\u{1EE51}\u{1EE52}\u{1EE54}\u{1EE57}\u{1EE59}\u{1EE5B}\u{1EE5D}\u{1EE5F}\u{1EE61}\u{1EE62}\u{1EE64}\u{1EE67}-\u{1EE6A}\u{1EE6C}-\u{1EE72}\u{1EE74}-\u{1EE77}\u{1EE79}-\u{1EE7C}\u{1EE7E}\u{1EE80}-\u{1EE89}\u{1EE8B}-\u{1EE9B}\u{1EEA1}-\u{1EEA3}\u{1EEA5}-\u{1EEA9}\u{1EEAB}-\u{1EEBB}\u{1EEF0}\u{1EEF1}]/u -const combiningMarkFallbackRe = /[\u0300-\u036F\u0483-\u0489\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3\u07FD\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u0897-\u089F\u08CA-\u08E1\u08E3-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u09FE\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3\u0AFA-\u0AFF\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B55-\u0B57\u0B62\u0B63\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C00-\u0C04\u0C3C\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63\u0C81-\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3\u0CF3\u0D00-\u0D03\u0D3B\u0D3C\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63\u0D81-\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EBC\u0EC8-\u0ECE\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F8D-\u0F97\u0F99-\u0FBC\u0FC6\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F\u109A-\u109D\u135D-\u135F\u1712-\u1715\u1732-\u1734\u1752\u1753\u1772\u1773\u17B4-\u17D3\u17DD\u180B-\u180D\u180F\u1885\u1886\u18A9\u1920-\u192B\u1930-\u193B\u1A17-\u1A1B\u1A55-\u1A5E\u1A60-\u1A7C\u1A7F\u1AB0-\u1ADD\u1AE0-\u1AEB\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAD\u1BE6-\u1BF3\u1C24-\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE8\u1CED\u1CF4\u1CF7-\u1CF9\u1DC0-\u1DFF\u20D0-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F-\uA672\uA674-\uA67D\uA69E\uA69F\uA6F0\uA6F1\uA802\uA806\uA80B\uA823-\uA827\uA82C\uA880\uA881\uA8B4-\uA8C5\uA8E0-\uA8F1\uA8FF\uA926-\uA92D\uA947-\uA953\uA980-\uA983\uA9B3-\uA9C0\uA9E5\uAA29-\uAA36\uAA43\uAA4C\uAA4D\uAA7B-\uAA7D\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uAAEB-\uAAEF\uAAF5\uAAF6\uABE3-\uABEA\uABEC\uABED\uFB1E\uFE00-\uFE0F\uFE20-\uFE2F\u{101FD}\u{102E0}\u{10376}-\u{1037A}\u{10A01}-\u{10A03}\u{10A05}\u{10A06}\u{10A0C}-\u{10A0F}\u{10A38}-\u{10A3A}\u{10A3F}\u{10AE5}\u{10AE6}\u{10D24}-\u{10D27}\u{10D69}-\u{10D6D}\u{10EAB}\u{10EAC}\u{10EFA}-\u{10EFF}\u{10F46}-\u{10F50}\u{10F82}-\u{10F85}\u{11000}-\u{11002}\u{11038}-\u{11046}\u{11070}\u{11073}\u{11074}\u{1107F}-\u{11082}\u{110B0}-\u{110BA}\u{110C2}\u{11100}-\u{11102}\u{11127}-\u{11134}\u{11145}\u{11146}\u{11173}\u{11180}-\u{11182}\u{111B3}-\u{111C0}\u{111C9}-\u{111CC}\u{111CE}\u{111CF}\u{1122C}-\u{11237}\u{1123E}\u{11241}\u{112DF}-\u{112EA}\u{11300}-\u{11303}\u{1133B}\u{1133C}\u{1133E}-\u{11344}\u{11347}\u{11348}\u{1134B}-\u{1134D}\u{11357}\u{11362}\u{11363}\u{11366}-\u{1136C}\u{11370}-\u{11374}\u{113B8}-\u{113C0}\u{113C2}\u{113C5}\u{113C7}-\u{113CA}\u{113CC}-\u{113D0}\u{113D2}\u{113E1}\u{113E2}\u{11435}-\u{11446}\u{1145E}\u{114B0}-\u{114C3}\u{115AF}-\u{115B5}\u{115B8}-\u{115C0}\u{115DC}\u{115DD}\u{11630}-\u{11640}\u{116AB}-\u{116B7}\u{1171D}-\u{1172B}\u{1182C}-\u{1183A}\u{11930}-\u{11935}\u{11937}\u{11938}\u{1193B}-\u{1193E}\u{11940}\u{11942}\u{11943}\u{119D1}-\u{119D7}\u{119DA}-\u{119E0}\u{119E4}\u{11A01}-\u{11A0A}\u{11A33}-\u{11A39}\u{11A3B}-\u{11A3E}\u{11A47}\u{11A51}-\u{11A5B}\u{11A8A}-\u{11A99}\u{11B60}-\u{11B67}\u{11C2F}-\u{11C36}\u{11C38}-\u{11C3F}\u{11C92}-\u{11CA7}\u{11CA9}-\u{11CB6}\u{11D31}-\u{11D36}\u{11D3A}\u{11D3C}\u{11D3D}\u{11D3F}-\u{11D45}\u{11D47}\u{11D8A}-\u{11D8E}\u{11D90}\u{11D91}\u{11D93}-\u{11D97}\u{11EF3}-\u{11EF6}\u{11F00}\u{11F01}\u{11F03}\u{11F34}-\u{11F3A}\u{11F3E}-\u{11F42}\u{11F5A}\u{13440}\u{13447}-\u{13455}\u{1611E}-\u{1612F}\u{16AF0}-\u{16AF4}\u{16B30}-\u{16B36}\u{16F4F}\u{16F51}-\u{16F87}\u{16F8F}-\u{16F92}\u{16FE4}\u{16FF0}\u{16FF1}\u{1BC9D}\u{1BC9E}\u{1CF00}-\u{1CF2D}\u{1CF30}-\u{1CF46}\u{1D165}-\u{1D169}\u{1D16D}-\u{1D172}\u{1D17B}-\u{1D182}\u{1D185}-\u{1D18B}\u{1D1AA}-\u{1D1AD}\u{1D242}-\u{1D244}\u{1DA00}-\u{1DA36}\u{1DA3B}-\u{1DA6C}\u{1DA75}\u{1DA84}\u{1DA9B}-\u{1DA9F}\u{1DAA1}-\u{1DAAF}\u{1E000}-\u{1E006}\u{1E008}-\u{1E018}\u{1E01B}-\u{1E021}\u{1E023}\u{1E024}\u{1E026}-\u{1E02A}\u{1E08F}\u{1E130}-\u{1E136}\u{1E2AE}\u{1E2EC}-\u{1E2EF}\u{1E4EC}-\u{1E4EF}\u{1E5EE}\u{1E5EF}\u{1E6E3}\u{1E6E6}\u{1E6EE}\u{1E6EF}\u{1E6F5}\u{1E8D0}-\u{1E8D6}\u{1E944}-\u{1E94A}\u{E0100}-\u{E01EF}]/u -const decimalDigitFallbackRe = /[0-9\u0660-\u0669\u06F0-\u06F9\u07C0-\u07C9\u0966-\u096F\u09E6-\u09EF\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE6-\u0BEF\u0C66-\u0C6F\u0CE6-\u0CEF\u0D66-\u0D6F\u0DE6-\u0DEF\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F29\u1040-\u1049\u1090-\u1099\u17E0-\u17E9\u1810-\u1819\u1946-\u194F\u19D0-\u19D9\u1A80-\u1A89\u1A90-\u1A99\u1B50-\u1B59\u1BB0-\u1BB9\u1C40-\u1C49\u1C50-\u1C59\uA620-\uA629\uA8D0-\uA8D9\uA900-\uA909\uA9D0-\uA9D9\uA9F0-\uA9F9\uAA50-\uAA59\uABF0-\uABF9\uFF10-\uFF19\u{104A0}-\u{104A9}\u{10D30}-\u{10D39}\u{10D40}-\u{10D49}\u{11066}-\u{1106F}\u{110F0}-\u{110F9}\u{11136}-\u{1113F}\u{111D0}-\u{111D9}\u{112F0}-\u{112F9}\u{11450}-\u{11459}\u{114D0}-\u{114D9}\u{11650}-\u{11659}\u{116C0}-\u{116C9}\u{116D0}-\u{116E3}\u{11730}-\u{11739}\u{118E0}-\u{118E9}\u{11950}-\u{11959}\u{11BF0}-\u{11BF9}\u{11C50}-\u{11C59}\u{11D50}-\u{11D59}\u{11DA0}-\u{11DA9}\u{11DE0}-\u{11DE9}\u{11F50}-\u{11F59}\u{16130}-\u{16139}\u{16A60}-\u{16A69}\u{16AC0}-\u{16AC9}\u{16B50}-\u{16B59}\u{16D70}-\u{16D79}\u{1CCF0}-\u{1CCF9}\u{1D7CE}-\u{1D7FF}\u{1E140}-\u{1E149}\u{1E2F0}-\u{1E2F9}\u{1E4F0}-\u{1E4F9}\u{1E5F1}-\u{1E5FA}\u{1E950}-\u{1E959}\u{1FBF0}-\u{1FBF9}]/u - -function createUnicodePropertyRegex(source: string, fallback: RegExp): RegExp { - try { - return new RegExp(source, 'u') - } catch { - return fallback - } -} - -const arabicScriptRe = createUnicodePropertyRegex('\\p{Script=Arabic}', arabicScriptFallbackRe) -const combiningMarkRe = createUnicodePropertyRegex('\\p{M}', combiningMarkFallbackRe) -const decimalDigitRe = createUnicodePropertyRegex('\\p{Nd}', decimalDigitFallbackRe) +const arabicScriptRe = /\p{Script=Arabic}/u +const combiningMarkRe = /\p{M}/u +const decimalDigitRe = /\p{Nd}/u function containsArabicScript(text: string): boolean { return arabicScriptRe.test(text) @@ -315,7 +303,7 @@ function endsWithMyanmarMedialGlue(segment: string): boolean { function splitLeadingSpaceAndMarks(segment: string): { space: string, marks: string } | null { if (segment.length < 2 || segment[0] !== ' ') return null const marks = segment.slice(1) - if (marks.length > 0 && Array.from(marks).every((mark) => combiningMarkRe.test(mark))) { + if (/^\p{M}+$/u.test(marks)) { return { space: ' ', marks } } return null diff --git a/src/compat/analysis.ts b/src/compat/analysis.ts new file mode 100644 index 00000000..a4200d90 --- /dev/null +++ b/src/compat/analysis.ts @@ -0,0 +1,1019 @@ +export type WhiteSpaceMode = 'normal' | 'pre-wrap' + +export type SegmentBreakKind = + | 'text' + | 'space' + | 'preserved-space' + | 'tab' + | 'glue' + | 'zero-width-break' + | 'soft-hyphen' + | 'hard-break' + +type SegmentationPiece = { + text: string + isWordLike: boolean + kind: SegmentBreakKind + start: number +} + +export type MergedSegmentation = { + len: number + texts: string[] + isWordLike: boolean[] + kinds: SegmentBreakKind[] + starts: number[] +} + +export type AnalysisChunk = { + startSegmentIndex: number + endSegmentIndex: number + consumedEndSegmentIndex: number +} + +export type TextAnalysis = { normalized: string, chunks: AnalysisChunk[] } & MergedSegmentation + +export type AnalysisProfile = { + carryCJKAfterClosingQuote: boolean +} + +const collapsibleWhitespaceRunRe = /[ \t\n\r\f]+/g +const needsWhitespaceNormalizationRe = /[\t\n\r\f]| {2,}|^ | $/ + +type WhiteSpaceProfile = { + mode: WhiteSpaceMode + preserveOrdinarySpaces: boolean + preserveHardBreaks: boolean +} + +function getWhiteSpaceProfile(whiteSpace?: WhiteSpaceMode): WhiteSpaceProfile { + const mode = whiteSpace ?? 'normal' + return mode === 'pre-wrap' + ? { mode, preserveOrdinarySpaces: true, preserveHardBreaks: true } + : { mode, preserveOrdinarySpaces: false, preserveHardBreaks: false } +} + +export function normalizeWhitespaceNormal(text: string): string { + if (!needsWhitespaceNormalizationRe.test(text)) return text + + let normalized = text.replace(collapsibleWhitespaceRunRe, ' ') + if (normalized.charCodeAt(0) === 0x20) { + normalized = normalized.slice(1) + } + if (normalized.length > 0 && normalized.charCodeAt(normalized.length - 1) === 0x20) { + normalized = normalized.slice(0, -1) + } + return normalized +} + +function normalizeWhitespacePreWrap(text: string): string { + if (!/[\r\f]/.test(text)) return text.replace(/\r\n/g, '\n') + return text + .replace(/\r\n/g, '\n') + .replace(/[\r\f]/g, '\n') +} + +let sharedWordSegmenter: Intl.Segmenter | null = null +let segmenterLocale: string | undefined + +function getSharedWordSegmenter(): Intl.Segmenter { + if (sharedWordSegmenter === null) { + sharedWordSegmenter = new Intl.Segmenter(segmenterLocale, { granularity: 'word' }) + } + return sharedWordSegmenter +} + +export function clearAnalysisCaches(): void { + sharedWordSegmenter = null +} + +export function setAnalysisLocale(locale?: string): void { + const nextLocale = locale && locale.length > 0 ? locale : undefined + if (segmenterLocale === nextLocale) return + segmenterLocale = nextLocale + sharedWordSegmenter = null +} + +const arabicScriptFallbackRe = /[\u0600-\u0604\u0606-\u060B\u060D-\u061A\u061C-\u061E\u0620-\u063F\u0641-\u064A\u0656-\u066F\u0671-\u06DC\u06DE-\u06FF\u0750-\u077F\u0870-\u0891\u0897-\u08E1\u08E3-\u08FF\uFB50-\uFD3D\uFD40-\uFDCF\uFDF0-\uFDFF\uFE70-\uFE74\uFE76-\uFEFC\u{10E60}-\u{10E7E}\u{10EC2}-\u{10EC7}\u{10ED0}-\u{10ED8}\u{10EFA}-\u{10EFF}\u{1EE00}-\u{1EE03}\u{1EE05}-\u{1EE1F}\u{1EE21}\u{1EE22}\u{1EE24}\u{1EE27}\u{1EE29}-\u{1EE32}\u{1EE34}-\u{1EE37}\u{1EE39}\u{1EE3B}\u{1EE42}\u{1EE47}\u{1EE49}\u{1EE4B}\u{1EE4D}-\u{1EE4F}\u{1EE51}\u{1EE52}\u{1EE54}\u{1EE57}\u{1EE59}\u{1EE5B}\u{1EE5D}\u{1EE5F}\u{1EE61}\u{1EE62}\u{1EE64}\u{1EE67}-\u{1EE6A}\u{1EE6C}-\u{1EE72}\u{1EE74}-\u{1EE77}\u{1EE79}-\u{1EE7C}\u{1EE7E}\u{1EE80}-\u{1EE89}\u{1EE8B}-\u{1EE9B}\u{1EEA1}-\u{1EEA3}\u{1EEA5}-\u{1EEA9}\u{1EEAB}-\u{1EEBB}\u{1EEF0}\u{1EEF1}]/u +const combiningMarkFallbackRe = /[\u0300-\u036F\u0483-\u0489\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3\u07FD\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u0897-\u089F\u08CA-\u08E1\u08E3-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u09FE\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3\u0AFA-\u0AFF\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B55-\u0B57\u0B62\u0B63\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C00-\u0C04\u0C3C\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63\u0C81-\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3\u0CF3\u0D00-\u0D03\u0D3B\u0D3C\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63\u0D81-\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EBC\u0EC8-\u0ECE\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F8D-\u0F97\u0F99-\u0FBC\u0FC6\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F\u109A-\u109D\u135D-\u135F\u1712-\u1715\u1732-\u1734\u1752\u1753\u1772\u1773\u17B4-\u17D3\u17DD\u180B-\u180D\u180F\u1885\u1886\u18A9\u1920-\u192B\u1930-\u193B\u1A17-\u1A1B\u1A55-\u1A5E\u1A60-\u1A7C\u1A7F\u1AB0-\u1ADD\u1AE0-\u1AEB\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAD\u1BE6-\u1BF3\u1C24-\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE8\u1CED\u1CF4\u1CF7-\u1CF9\u1DC0-\u1DFF\u20D0-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F-\uA672\uA674-\uA67D\uA69E\uA69F\uA6F0\uA6F1\uA802\uA806\uA80B\uA823-\uA827\uA82C\uA880\uA881\uA8B4-\uA8C5\uA8E0-\uA8F1\uA8FF\uA926-\uA92D\uA947-\uA953\uA980-\uA983\uA9B3-\uA9C0\uA9E5\uAA29-\uAA36\uAA43\uAA4C\uAA4D\uAA7B-\uAA7D\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uAAEB-\uAAEF\uAAF5\uAAF6\uABE3-\uABEA\uABEC\uABED\uFB1E\uFE00-\uFE0F\uFE20-\uFE2F\u{101FD}\u{102E0}\u{10376}-\u{1037A}\u{10A01}-\u{10A03}\u{10A05}\u{10A06}\u{10A0C}-\u{10A0F}\u{10A38}-\u{10A3A}\u{10A3F}\u{10AE5}\u{10AE6}\u{10D24}-\u{10D27}\u{10D69}-\u{10D6D}\u{10EAB}\u{10EAC}\u{10EFA}-\u{10EFF}\u{10F46}-\u{10F50}\u{10F82}-\u{10F85}\u{11000}-\u{11002}\u{11038}-\u{11046}\u{11070}\u{11073}\u{11074}\u{1107F}-\u{11082}\u{110B0}-\u{110BA}\u{110C2}\u{11100}-\u{11102}\u{11127}-\u{11134}\u{11145}\u{11146}\u{11173}\u{11180}-\u{11182}\u{111B3}-\u{111C0}\u{111C9}-\u{111CC}\u{111CE}\u{111CF}\u{1122C}-\u{11237}\u{1123E}\u{11241}\u{112DF}-\u{112EA}\u{11300}-\u{11303}\u{1133B}\u{1133C}\u{1133E}-\u{11344}\u{11347}\u{11348}\u{1134B}-\u{1134D}\u{11357}\u{11362}\u{11363}\u{11366}-\u{1136C}\u{11370}-\u{11374}\u{113B8}-\u{113C0}\u{113C2}\u{113C5}\u{113C7}-\u{113CA}\u{113CC}-\u{113D0}\u{113D2}\u{113E1}\u{113E2}\u{11435}-\u{11446}\u{1145E}\u{114B0}-\u{114C3}\u{115AF}-\u{115B5}\u{115B8}-\u{115C0}\u{115DC}\u{115DD}\u{11630}-\u{11640}\u{116AB}-\u{116B7}\u{1171D}-\u{1172B}\u{1182C}-\u{1183A}\u{11930}-\u{11935}\u{11937}\u{11938}\u{1193B}-\u{1193E}\u{11940}\u{11942}\u{11943}\u{119D1}-\u{119D7}\u{119DA}-\u{119E0}\u{119E4}\u{11A01}-\u{11A0A}\u{11A33}-\u{11A39}\u{11A3B}-\u{11A3E}\u{11A47}\u{11A51}-\u{11A5B}\u{11A8A}-\u{11A99}\u{11B60}-\u{11B67}\u{11C2F}-\u{11C36}\u{11C38}-\u{11C3F}\u{11C92}-\u{11CA7}\u{11CA9}-\u{11CB6}\u{11D31}-\u{11D36}\u{11D3A}\u{11D3C}\u{11D3D}\u{11D3F}-\u{11D45}\u{11D47}\u{11D8A}-\u{11D8E}\u{11D90}\u{11D91}\u{11D93}-\u{11D97}\u{11EF3}-\u{11EF6}\u{11F00}\u{11F01}\u{11F03}\u{11F34}-\u{11F3A}\u{11F3E}-\u{11F42}\u{11F5A}\u{13440}\u{13447}-\u{13455}\u{1611E}-\u{1612F}\u{16AF0}-\u{16AF4}\u{16B30}-\u{16B36}\u{16F4F}\u{16F51}-\u{16F87}\u{16F8F}-\u{16F92}\u{16FE4}\u{16FF0}\u{16FF1}\u{1BC9D}\u{1BC9E}\u{1CF00}-\u{1CF2D}\u{1CF30}-\u{1CF46}\u{1D165}-\u{1D169}\u{1D16D}-\u{1D172}\u{1D17B}-\u{1D182}\u{1D185}-\u{1D18B}\u{1D1AA}-\u{1D1AD}\u{1D242}-\u{1D244}\u{1DA00}-\u{1DA36}\u{1DA3B}-\u{1DA6C}\u{1DA75}\u{1DA84}\u{1DA9B}-\u{1DA9F}\u{1DAA1}-\u{1DAAF}\u{1E000}-\u{1E006}\u{1E008}-\u{1E018}\u{1E01B}-\u{1E021}\u{1E023}\u{1E024}\u{1E026}-\u{1E02A}\u{1E08F}\u{1E130}-\u{1E136}\u{1E2AE}\u{1E2EC}-\u{1E2EF}\u{1E4EC}-\u{1E4EF}\u{1E5EE}\u{1E5EF}\u{1E6E3}\u{1E6E6}\u{1E6EE}\u{1E6EF}\u{1E6F5}\u{1E8D0}-\u{1E8D6}\u{1E944}-\u{1E94A}\u{E0100}-\u{E01EF}]/u +const decimalDigitFallbackRe = /[0-9\u0660-\u0669\u06F0-\u06F9\u07C0-\u07C9\u0966-\u096F\u09E6-\u09EF\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE6-\u0BEF\u0C66-\u0C6F\u0CE6-\u0CEF\u0D66-\u0D6F\u0DE6-\u0DEF\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F29\u1040-\u1049\u1090-\u1099\u17E0-\u17E9\u1810-\u1819\u1946-\u194F\u19D0-\u19D9\u1A80-\u1A89\u1A90-\u1A99\u1B50-\u1B59\u1BB0-\u1BB9\u1C40-\u1C49\u1C50-\u1C59\uA620-\uA629\uA8D0-\uA8D9\uA900-\uA909\uA9D0-\uA9D9\uA9F0-\uA9F9\uAA50-\uAA59\uABF0-\uABF9\uFF10-\uFF19\u{104A0}-\u{104A9}\u{10D30}-\u{10D39}\u{10D40}-\u{10D49}\u{11066}-\u{1106F}\u{110F0}-\u{110F9}\u{11136}-\u{1113F}\u{111D0}-\u{111D9}\u{112F0}-\u{112F9}\u{11450}-\u{11459}\u{114D0}-\u{114D9}\u{11650}-\u{11659}\u{116C0}-\u{116C9}\u{116D0}-\u{116E3}\u{11730}-\u{11739}\u{118E0}-\u{118E9}\u{11950}-\u{11959}\u{11BF0}-\u{11BF9}\u{11C50}-\u{11C59}\u{11D50}-\u{11D59}\u{11DA0}-\u{11DA9}\u{11DE0}-\u{11DE9}\u{11F50}-\u{11F59}\u{16130}-\u{16139}\u{16A60}-\u{16A69}\u{16AC0}-\u{16AC9}\u{16B50}-\u{16B59}\u{16D70}-\u{16D79}\u{1CCF0}-\u{1CCF9}\u{1D7CE}-\u{1D7FF}\u{1E140}-\u{1E149}\u{1E2F0}-\u{1E2F9}\u{1E4F0}-\u{1E4F9}\u{1E5F1}-\u{1E5FA}\u{1E950}-\u{1E959}\u{1FBF0}-\u{1FBF9}]/u + +function createUnicodePropertyRegex(source: string, fallback: RegExp): RegExp { + try { + return new RegExp(source, 'u') + } catch { + return fallback + } +} + +const arabicScriptRe = createUnicodePropertyRegex('\\p{Script=Arabic}', arabicScriptFallbackRe) +const combiningMarkRe = createUnicodePropertyRegex('\\p{M}', combiningMarkFallbackRe) +const decimalDigitRe = createUnicodePropertyRegex('\\p{Nd}', decimalDigitFallbackRe) + +function containsArabicScript(text: string): boolean { + return arabicScriptRe.test(text) +} + +export function isCJK(s: string): boolean { + for (const ch of s) { + const c = ch.codePointAt(0)! + if ((c >= 0x4E00 && c <= 0x9FFF) || + (c >= 0x3400 && c <= 0x4DBF) || + (c >= 0x20000 && c <= 0x2A6DF) || + (c >= 0x2A700 && c <= 0x2B73F) || + (c >= 0x2B740 && c <= 0x2B81F) || + (c >= 0x2B820 && c <= 0x2CEAF) || + (c >= 0x2CEB0 && c <= 0x2EBEF) || + (c >= 0x30000 && c <= 0x3134F) || + (c >= 0xF900 && c <= 0xFAFF) || + (c >= 0x2F800 && c <= 0x2FA1F) || + (c >= 0x3000 && c <= 0x303F) || + (c >= 0x3040 && c <= 0x309F) || + (c >= 0x30A0 && c <= 0x30FF) || + (c >= 0xAC00 && c <= 0xD7AF) || + (c >= 0xFF00 && c <= 0xFFEF)) { + return true + } + } + return false +} + +export const kinsokuStart = new Set([ + '\uFF0C', + '\uFF0E', + '\uFF01', + '\uFF1A', + '\uFF1B', + '\uFF1F', + '\u3001', + '\u3002', + '\u30FB', + '\uFF09', + '\u3015', + '\u3009', + '\u300B', + '\u300D', + '\u300F', + '\u3011', + '\u3017', + '\u3019', + '\u301B', + '\u30FC', + '\u3005', + '\u303B', + '\u309D', + '\u309E', + '\u30FD', + '\u30FE', +]) + +export const kinsokuEnd = new Set([ + '"', + '(', '[', '{', + '“', '‘', '«', '‹', + '\uFF08', + '\u3014', + '\u3008', + '\u300A', + '\u300C', + '\u300E', + '\u3010', + '\u3016', + '\u3018', + '\u301A', +]) + +const forwardStickyGlue = new Set([ + "'", '’', +]) + +export const leftStickyPunctuation = new Set([ + '.', ',', '!', '?', ':', ';', + '\u060C', + '\u061B', + '\u061F', + '\u0964', + '\u0965', + '\u104A', + '\u104B', + '\u104C', + '\u104D', + '\u104F', + ')', ']', '}', + '%', + '"', + '”', '’', '»', '›', + '…', +]) + +const arabicNoSpaceTrailingPunctuation = new Set([ + ':', + '.', + '\u060C', + '\u061B', +]) + +const myanmarMedialGlue = new Set([ + '\u104F', +]) + +const closingQuoteChars = new Set([ + '”', '’', '»', '›', + '\u300D', + '\u300F', + '\u3011', + '\u300B', + '\u3009', + '\u3015', + '\uFF09', +]) + +function isLeftStickyPunctuationSegment(segment: string): boolean { + if (isEscapedQuoteClusterSegment(segment)) return true + let sawPunctuation = false + for (const ch of segment) { + if (leftStickyPunctuation.has(ch)) { + sawPunctuation = true + continue + } + if (sawPunctuation && combiningMarkRe.test(ch)) continue + return false + } + return sawPunctuation +} + +function isCJKLineStartProhibitedSegment(segment: string): boolean { + for (const ch of segment) { + if (!kinsokuStart.has(ch) && !leftStickyPunctuation.has(ch)) return false + } + return segment.length > 0 +} + +function isForwardStickyClusterSegment(segment: string): boolean { + if (isEscapedQuoteClusterSegment(segment)) return true + for (const ch of segment) { + if (!kinsokuEnd.has(ch) && !forwardStickyGlue.has(ch) && !combiningMarkRe.test(ch)) return false + } + return segment.length > 0 +} + +function isEscapedQuoteClusterSegment(segment: string): boolean { + let sawQuote = false + for (const ch of segment) { + if (ch === '\\' || combiningMarkRe.test(ch)) continue + if (kinsokuEnd.has(ch) || leftStickyPunctuation.has(ch) || forwardStickyGlue.has(ch)) { + sawQuote = true + continue + } + return false + } + return sawQuote +} + +function splitTrailingForwardStickyCluster(text: string): { head: string, tail: string } | null { + const chars = Array.from(text) + let splitIndex = chars.length + + while (splitIndex > 0) { + const ch = chars[splitIndex - 1]! + if (combiningMarkRe.test(ch)) { + splitIndex-- + continue + } + if (kinsokuEnd.has(ch) || forwardStickyGlue.has(ch)) { + splitIndex-- + continue + } + break + } + + if (splitIndex <= 0 || splitIndex === chars.length) return null + return { + head: chars.slice(0, splitIndex).join(''), + tail: chars.slice(splitIndex).join(''), + } +} + +function isRepeatedSingleCharRun(segment: string, ch: string): boolean { + if (segment.length === 0) return false + for (const part of segment) { + if (part !== ch) return false + } + return true +} + +function endsWithArabicNoSpacePunctuation(segment: string): boolean { + if (!containsArabicScript(segment) || segment.length === 0) return false + return arabicNoSpaceTrailingPunctuation.has(segment[segment.length - 1]!) +} + +function endsWithMyanmarMedialGlue(segment: string): boolean { + if (segment.length === 0) return false + return myanmarMedialGlue.has(segment[segment.length - 1]!) +} + +function splitLeadingSpaceAndMarks(segment: string): { space: string, marks: string } | null { + if (segment.length < 2 || segment[0] !== ' ') return null + const marks = segment.slice(1) + if (marks.length > 0 && Array.from(marks).every((mark) => combiningMarkRe.test(mark))) { + return { space: ' ', marks } + } + return null +} + +export function endsWithClosingQuote(text: string): boolean { + for (let i = text.length - 1; i >= 0; i--) { + const ch = text[i]! + if (closingQuoteChars.has(ch)) return true + if (!leftStickyPunctuation.has(ch)) return false + } + return false +} + +function classifySegmentBreakChar(ch: string, whiteSpaceProfile: WhiteSpaceProfile): SegmentBreakKind { + if (whiteSpaceProfile.preserveOrdinarySpaces || whiteSpaceProfile.preserveHardBreaks) { + if (ch === ' ') return 'preserved-space' + if (ch === '\t') return 'tab' + if (whiteSpaceProfile.preserveHardBreaks && ch === '\n') return 'hard-break' + } + if (ch === ' ') return 'space' + if (ch === '\u00A0' || ch === '\u202F' || ch === '\u2060' || ch === '\uFEFF') { + return 'glue' + } + if (ch === '\u200B') return 'zero-width-break' + if (ch === '\u00AD') return 'soft-hyphen' + return 'text' +} + +function splitSegmentByBreakKind( + segment: string, + isWordLike: boolean, + start: number, + whiteSpaceProfile: WhiteSpaceProfile, +): SegmentationPiece[] { + const pieces: SegmentationPiece[] = [] + let currentKind: SegmentBreakKind | null = null + let currentText = '' + let currentStart = start + let currentWordLike = false + let offset = 0 + + for (const ch of segment) { + const kind = classifySegmentBreakChar(ch, whiteSpaceProfile) + const wordLike = kind === 'text' && isWordLike + + if (currentKind !== null && kind === currentKind && wordLike === currentWordLike) { + currentText += ch + offset += ch.length + continue + } + + if (currentKind !== null) { + pieces.push({ + text: currentText, + isWordLike: currentWordLike, + kind: currentKind, + start: currentStart, + }) + } + + currentKind = kind + currentText = ch + currentStart = start + offset + currentWordLike = wordLike + offset += ch.length + } + + if (currentKind !== null) { + pieces.push({ + text: currentText, + isWordLike: currentWordLike, + kind: currentKind, + start: currentStart, + }) + } + + return pieces +} + +function isTextRunBoundary(kind: SegmentBreakKind): boolean { + return ( + kind === 'space' || + kind === 'preserved-space' || + kind === 'zero-width-break' || + kind === 'hard-break' + ) +} + +const urlSchemeSegmentRe = /^[A-Za-z][A-Za-z0-9+.-]*:$/ + +function isUrlLikeRunStart(segmentation: MergedSegmentation, index: number): boolean { + const text = segmentation.texts[index]! + if (text.startsWith('www.')) return true + return ( + urlSchemeSegmentRe.test(text) && + index + 1 < segmentation.len && + segmentation.kinds[index + 1] === 'text' && + segmentation.texts[index + 1] === '//' + ) +} + +function isUrlQueryBoundarySegment(text: string): boolean { + return text.includes('?') && (text.includes('://') || text.startsWith('www.')) +} + +function mergeUrlLikeRuns(segmentation: MergedSegmentation): MergedSegmentation { + const texts = segmentation.texts.slice() + const isWordLike = segmentation.isWordLike.slice() + const kinds = segmentation.kinds.slice() + const starts = segmentation.starts.slice() + + for (let i = 0; i < segmentation.len; i++) { + if (kinds[i] !== 'text' || !isUrlLikeRunStart(segmentation, i)) continue + + let j = i + 1 + while (j < segmentation.len && !isTextRunBoundary(kinds[j]!)) { + texts[i] += texts[j]! + isWordLike[i] = true + const endsQueryPrefix = texts[j]!.includes('?') + kinds[j] = 'text' + texts[j] = '' + j++ + if (endsQueryPrefix) break + } + } + + let compactLen = 0 + for (let read = 0; read < texts.length; read++) { + const text = texts[read]! + if (text.length === 0) continue + if (compactLen !== read) { + texts[compactLen] = text + isWordLike[compactLen] = isWordLike[read]! + kinds[compactLen] = kinds[read]! + starts[compactLen] = starts[read]! + } + compactLen++ + } + + texts.length = compactLen + isWordLike.length = compactLen + kinds.length = compactLen + starts.length = compactLen + + return { + len: compactLen, + texts, + isWordLike, + kinds, + starts, + } +} + +function mergeUrlQueryRuns(segmentation: MergedSegmentation): MergedSegmentation { + const texts: string[] = [] + const isWordLike: boolean[] = [] + const kinds: SegmentBreakKind[] = [] + const starts: number[] = [] + + for (let i = 0; i < segmentation.len; i++) { + const text = segmentation.texts[i]! + texts.push(text) + isWordLike.push(segmentation.isWordLike[i]!) + kinds.push(segmentation.kinds[i]!) + starts.push(segmentation.starts[i]!) + + if (!isUrlQueryBoundarySegment(text)) continue + + const nextIndex = i + 1 + if ( + nextIndex >= segmentation.len || + isTextRunBoundary(segmentation.kinds[nextIndex]!) + ) { + continue + } + + let queryText = '' + const queryStart = segmentation.starts[nextIndex]! + let j = nextIndex + while (j < segmentation.len && !isTextRunBoundary(segmentation.kinds[j]!)) { + queryText += segmentation.texts[j]! + j++ + } + + if (queryText.length > 0) { + texts.push(queryText) + isWordLike.push(true) + kinds.push('text') + starts.push(queryStart) + i = j - 1 + } + } + + return { + len: texts.length, + texts, + isWordLike, + kinds, + starts, + } +} + +const numericJoinerChars = new Set([ + ':', '-', '/', '×', ',', '.', '+', + '\u2013', + '\u2014', +]) + +const asciiPunctuationChainSegmentRe = /^[A-Za-z0-9_]+[,:;]*$/ +const asciiPunctuationChainTrailingJoinersRe = /[,:;]+$/ + +function segmentContainsDecimalDigit(text: string): boolean { + for (const ch of text) { + if (decimalDigitRe.test(ch)) return true + } + return false +} + +function isNumericRunSegment(text: string): boolean { + if (text.length === 0) return false + for (const ch of text) { + if (decimalDigitRe.test(ch) || numericJoinerChars.has(ch)) continue + return false + } + return true +} + +function mergeNumericRuns(segmentation: MergedSegmentation): MergedSegmentation { + const texts: string[] = [] + const isWordLike: boolean[] = [] + const kinds: SegmentBreakKind[] = [] + const starts: number[] = [] + + for (let i = 0; i < segmentation.len; i++) { + const text = segmentation.texts[i]! + const kind = segmentation.kinds[i]! + + if (kind === 'text' && isNumericRunSegment(text) && segmentContainsDecimalDigit(text)) { + let mergedText = text + let j = i + 1 + while ( + j < segmentation.len && + segmentation.kinds[j] === 'text' && + isNumericRunSegment(segmentation.texts[j]!) + ) { + mergedText += segmentation.texts[j]! + j++ + } + + texts.push(mergedText) + isWordLike.push(true) + kinds.push('text') + starts.push(segmentation.starts[i]!) + i = j - 1 + continue + } + + texts.push(text) + isWordLike.push(segmentation.isWordLike[i]!) + kinds.push(kind) + starts.push(segmentation.starts[i]!) + } + + return { + len: texts.length, + texts, + isWordLike, + kinds, + starts, + } +} + +function mergeAsciiPunctuationChains(segmentation: MergedSegmentation): MergedSegmentation { + const texts: string[] = [] + const isWordLike: boolean[] = [] + const kinds: SegmentBreakKind[] = [] + const starts: number[] = [] + + for (let i = 0; i < segmentation.len; i++) { + const text = segmentation.texts[i]! + const kind = segmentation.kinds[i]! + const wordLike = segmentation.isWordLike[i]! + + if (kind === 'text' && wordLike && asciiPunctuationChainSegmentRe.test(text)) { + let mergedText = text + let j = i + 1 + + while ( + asciiPunctuationChainTrailingJoinersRe.test(mergedText) && + j < segmentation.len && + segmentation.kinds[j] === 'text' && + segmentation.isWordLike[j] && + asciiPunctuationChainSegmentRe.test(segmentation.texts[j]!) + ) { + mergedText += segmentation.texts[j]! + j++ + } + + texts.push(mergedText) + isWordLike.push(true) + kinds.push('text') + starts.push(segmentation.starts[i]!) + i = j - 1 + continue + } + + texts.push(text) + isWordLike.push(wordLike) + kinds.push(kind) + starts.push(segmentation.starts[i]!) + } + + return { + len: texts.length, + texts, + isWordLike, + kinds, + starts, + } +} + +function splitHyphenatedNumericRuns(segmentation: MergedSegmentation): MergedSegmentation { + const texts: string[] = [] + const isWordLike: boolean[] = [] + const kinds: SegmentBreakKind[] = [] + const starts: number[] = [] + + for (let i = 0; i < segmentation.len; i++) { + const text = segmentation.texts[i]! + if (segmentation.kinds[i] === 'text' && text.includes('-')) { + const parts = text.split('-') + let shouldSplit = parts.length > 1 + for (let j = 0; j < parts.length; j++) { + const part = parts[j]! + if (!shouldSplit) break + if ( + part.length === 0 || + !segmentContainsDecimalDigit(part) || + !isNumericRunSegment(part) + ) { + shouldSplit = false + } + } + + if (shouldSplit) { + let offset = 0 + for (let j = 0; j < parts.length; j++) { + const part = parts[j]! + const splitText = j < parts.length - 1 ? `${part}-` : part + texts.push(splitText) + isWordLike.push(true) + kinds.push('text') + starts.push(segmentation.starts[i]! + offset) + offset += splitText.length + } + continue + } + } + + texts.push(text) + isWordLike.push(segmentation.isWordLike[i]!) + kinds.push(segmentation.kinds[i]!) + starts.push(segmentation.starts[i]!) + } + + return { + len: texts.length, + texts, + isWordLike, + kinds, + starts, + } +} + +function mergeGlueConnectedTextRuns(segmentation: MergedSegmentation): MergedSegmentation { + const texts: string[] = [] + const isWordLike: boolean[] = [] + const kinds: SegmentBreakKind[] = [] + const starts: number[] = [] + + let read = 0 + while (read < segmentation.len) { + let text = segmentation.texts[read]! + let wordLike = segmentation.isWordLike[read]! + let kind = segmentation.kinds[read]! + let start = segmentation.starts[read]! + + if (kind === 'glue') { + let glueText = text + const glueStart = start + read++ + while (read < segmentation.len && segmentation.kinds[read] === 'glue') { + glueText += segmentation.texts[read]! + read++ + } + + if (read < segmentation.len && segmentation.kinds[read] === 'text') { + text = glueText + segmentation.texts[read]! + wordLike = segmentation.isWordLike[read]! + kind = 'text' + start = glueStart + read++ + } else { + texts.push(glueText) + isWordLike.push(false) + kinds.push('glue') + starts.push(glueStart) + continue + } + } else { + read++ + } + + if (kind === 'text') { + while (read < segmentation.len && segmentation.kinds[read] === 'glue') { + let glueText = '' + while (read < segmentation.len && segmentation.kinds[read] === 'glue') { + glueText += segmentation.texts[read]! + read++ + } + + if (read < segmentation.len && segmentation.kinds[read] === 'text') { + text += glueText + segmentation.texts[read]! + wordLike = wordLike || segmentation.isWordLike[read]! + read++ + continue + } + + text += glueText + } + } + + texts.push(text) + isWordLike.push(wordLike) + kinds.push(kind) + starts.push(start) + } + + return { + len: texts.length, + texts, + isWordLike, + kinds, + starts, + } +} + +function carryTrailingForwardStickyAcrossCJKBoundary(segmentation: MergedSegmentation): MergedSegmentation { + const texts = segmentation.texts.slice() + const isWordLike = segmentation.isWordLike.slice() + const kinds = segmentation.kinds.slice() + const starts = segmentation.starts.slice() + + for (let i = 0; i < texts.length - 1; i++) { + if (kinds[i] !== 'text' || kinds[i + 1] !== 'text') continue + if (!isCJK(texts[i]!) || !isCJK(texts[i + 1]!)) continue + + const split = splitTrailingForwardStickyCluster(texts[i]!) + if (split === null) continue + + texts[i] = split.head + texts[i + 1] = split.tail + texts[i + 1]! + starts[i + 1] = starts[i]! + split.head.length + } + + return { + len: texts.length, + texts, + isWordLike, + kinds, + starts, + } +} + + +function buildMergedSegmentation( + normalized: string, + profile: AnalysisProfile, + whiteSpaceProfile: WhiteSpaceProfile, +): MergedSegmentation { + const wordSegmenter = getSharedWordSegmenter() + let mergedLen = 0 + const mergedTexts: string[] = [] + const mergedWordLike: boolean[] = [] + const mergedKinds: SegmentBreakKind[] = [] + const mergedStarts: number[] = [] + + for (const s of wordSegmenter.segment(normalized)) { + for (const piece of splitSegmentByBreakKind(s.segment, s.isWordLike ?? false, s.index, whiteSpaceProfile)) { + const isText = piece.kind === 'text' + + if ( + profile.carryCJKAfterClosingQuote && + isText && + mergedLen > 0 && + mergedKinds[mergedLen - 1] === 'text' && + isCJK(piece.text) && + isCJK(mergedTexts[mergedLen - 1]!) && + endsWithClosingQuote(mergedTexts[mergedLen - 1]!) + ) { + mergedTexts[mergedLen - 1] += piece.text + mergedWordLike[mergedLen - 1] = mergedWordLike[mergedLen - 1]! || piece.isWordLike + } else if ( + isText && + mergedLen > 0 && + mergedKinds[mergedLen - 1] === 'text' && + isCJKLineStartProhibitedSegment(piece.text) && + isCJK(mergedTexts[mergedLen - 1]!) + ) { + mergedTexts[mergedLen - 1] += piece.text + mergedWordLike[mergedLen - 1] = mergedWordLike[mergedLen - 1]! || piece.isWordLike + } else if ( + isText && + mergedLen > 0 && + mergedKinds[mergedLen - 1] === 'text' && + endsWithMyanmarMedialGlue(mergedTexts[mergedLen - 1]!) + ) { + mergedTexts[mergedLen - 1] += piece.text + mergedWordLike[mergedLen - 1] = mergedWordLike[mergedLen - 1]! || piece.isWordLike + } else if ( + isText && + mergedLen > 0 && + mergedKinds[mergedLen - 1] === 'text' && + piece.isWordLike && + containsArabicScript(piece.text) && + endsWithArabicNoSpacePunctuation(mergedTexts[mergedLen - 1]!) + ) { + mergedTexts[mergedLen - 1] += piece.text + mergedWordLike[mergedLen - 1] = true + } else if ( + isText && + !piece.isWordLike && + mergedLen > 0 && + mergedKinds[mergedLen - 1] === 'text' && + piece.text.length === 1 && + piece.text !== '-' && + piece.text !== '—' && + isRepeatedSingleCharRun(mergedTexts[mergedLen - 1]!, piece.text) + ) { + mergedTexts[mergedLen - 1] += piece.text + } else if ( + isText && + !piece.isWordLike && + mergedLen > 0 && + mergedKinds[mergedLen - 1] === 'text' && + ( + isLeftStickyPunctuationSegment(piece.text) || + (piece.text === '-' && mergedWordLike[mergedLen - 1]!) + ) + ) { + mergedTexts[mergedLen - 1] += piece.text + } else { + mergedTexts[mergedLen] = piece.text + mergedWordLike[mergedLen] = piece.isWordLike + mergedKinds[mergedLen] = piece.kind + mergedStarts[mergedLen] = piece.start + mergedLen++ + } + } + } + + for (let i = 1; i < mergedLen; i++) { + if ( + mergedKinds[i] === 'text' && + !mergedWordLike[i]! && + isEscapedQuoteClusterSegment(mergedTexts[i]!) && + mergedKinds[i - 1] === 'text' + ) { + mergedTexts[i - 1] += mergedTexts[i]! + mergedWordLike[i - 1] = mergedWordLike[i - 1]! || mergedWordLike[i]! + mergedTexts[i] = '' + } + } + + for (let i = mergedLen - 2; i >= 0; i--) { + if (mergedKinds[i] === 'text' && !mergedWordLike[i]! && isForwardStickyClusterSegment(mergedTexts[i]!)) { + let j = i + 1 + while (j < mergedLen && mergedTexts[j] === '') j++ + if (j < mergedLen && mergedKinds[j] === 'text') { + mergedTexts[j] = mergedTexts[i]! + mergedTexts[j]! + mergedStarts[j] = mergedStarts[i]! + mergedTexts[i] = '' + } + } + } + + let compactLen = 0 + for (let read = 0; read < mergedLen; read++) { + const text = mergedTexts[read]! + if (text.length === 0) continue + if (compactLen !== read) { + mergedTexts[compactLen] = text + mergedWordLike[compactLen] = mergedWordLike[read]! + mergedKinds[compactLen] = mergedKinds[read]! + mergedStarts[compactLen] = mergedStarts[read]! + } + compactLen++ + } + + mergedTexts.length = compactLen + mergedWordLike.length = compactLen + mergedKinds.length = compactLen + mergedStarts.length = compactLen + + const compacted = mergeGlueConnectedTextRuns({ + len: compactLen, + texts: mergedTexts, + isWordLike: mergedWordLike, + kinds: mergedKinds, + starts: mergedStarts, + }) + const withMergedUrls = carryTrailingForwardStickyAcrossCJKBoundary( + mergeAsciiPunctuationChains( + splitHyphenatedNumericRuns(mergeNumericRuns(mergeUrlQueryRuns(mergeUrlLikeRuns(compacted)))), + ), + ) + + for (let i = 0; i < withMergedUrls.len - 1; i++) { + const split = splitLeadingSpaceAndMarks(withMergedUrls.texts[i]!) + if (split === null) continue + if ( + (withMergedUrls.kinds[i] !== 'space' && withMergedUrls.kinds[i] !== 'preserved-space') || + withMergedUrls.kinds[i + 1] !== 'text' || + !containsArabicScript(withMergedUrls.texts[i + 1]!) + ) { + continue + } + + withMergedUrls.texts[i] = split.space + withMergedUrls.isWordLike[i] = false + withMergedUrls.kinds[i] = withMergedUrls.kinds[i] === 'preserved-space' ? 'preserved-space' : 'space' + withMergedUrls.texts[i + 1] = split.marks + withMergedUrls.texts[i + 1]! + withMergedUrls.starts[i + 1] = withMergedUrls.starts[i]! + split.space.length + } + + return withMergedUrls +} + +function compileAnalysisChunks(segmentation: MergedSegmentation, whiteSpaceProfile: WhiteSpaceProfile): AnalysisChunk[] { + if (segmentation.len === 0) return [] + if (!whiteSpaceProfile.preserveHardBreaks) { + return [{ + startSegmentIndex: 0, + endSegmentIndex: segmentation.len, + consumedEndSegmentIndex: segmentation.len, + }] + } + + const chunks: AnalysisChunk[] = [] + let startSegmentIndex = 0 + + for (let i = 0; i < segmentation.len; i++) { + if (segmentation.kinds[i] !== 'hard-break') continue + + chunks.push({ + startSegmentIndex, + endSegmentIndex: i, + consumedEndSegmentIndex: i + 1, + }) + startSegmentIndex = i + 1 + } + + if (startSegmentIndex < segmentation.len) { + chunks.push({ + startSegmentIndex, + endSegmentIndex: segmentation.len, + consumedEndSegmentIndex: segmentation.len, + }) + } + + return chunks +} + +export function analyzeText( + text: string, + profile: AnalysisProfile, + whiteSpace: WhiteSpaceMode = 'normal', +): TextAnalysis { + const whiteSpaceProfile = getWhiteSpaceProfile(whiteSpace) + const normalized = whiteSpaceProfile.mode === 'pre-wrap' + ? normalizeWhitespacePreWrap(text) + : normalizeWhitespaceNormal(text) + if (normalized.length === 0) { + return { + normalized, + chunks: [], + len: 0, + texts: [], + isWordLike: [], + kinds: [], + starts: [], + } + } + const segmentation = buildMergedSegmentation(normalized, profile, whiteSpaceProfile) + return { + normalized, + chunks: compileAnalysisChunks(segmentation, whiteSpaceProfile), + ...segmentation, + } +} diff --git a/src/compat/layout.ts b/src/compat/layout.ts new file mode 100644 index 00000000..9fa8929c --- /dev/null +++ b/src/compat/layout.ts @@ -0,0 +1,717 @@ +// Text measurement for browser environments using canvas measureText. +// +// Problem: DOM-based text measurement (getBoundingClientRect, offsetHeight) +// forces synchronous layout reflow. When components independently measure text, +// each measurement triggers a reflow of the entire document. This creates +// read/write interleaving that can cost 30ms+ per frame for 500 text blocks. +// +// Solution: two-phase measurement centered around canvas measureText. +// prepare(text, font) — segments text via Intl.Segmenter, measures each word +// via canvas, caches widths, and does one cached DOM calibration read per +// font when emoji correction is needed. Call once when text first appears. +// layout(prepared, maxWidth, lineHeight) — walks cached word widths with pure +// arithmetic to count lines and compute height. Call on every resize. +// ~0.0002ms per text. +// +// i18n: Intl.Segmenter handles CJK (per-character breaking), Thai, Arabic, etc. +// Bidi: simplified rich-path metadata for mixed LTR/RTL custom rendering. +// Punctuation merging: "better." measured as one unit (matches CSS behavior). +// Trailing whitespace: hangs past line edge without triggering breaks (CSS behavior). +// overflow-wrap: pre-measured grapheme widths enable character-level word breaking. +// +// Emoji correction: Chrome/Firefox canvas measures emoji wider than DOM at font +// sizes <24px on macOS (Apple Color Emoji). The inflation is constant per emoji +// grapheme at a given size, font-independent. Auto-detected by comparing canvas +// vs actual DOM emoji width (one cached DOM read per font). Safari canvas and +// DOM agree (both wider than fontSize), so correction = 0 there. +// +// Limitations: +// - system-ui font: canvas resolves to different optical variants than DOM on macOS. +// Use named fonts (Helvetica, Inter, etc.) for guaranteed accuracy. +// See RESEARCH.md "Discovery: system-ui font resolution mismatch". +// +// Based on Sebastian Markbage's text-layout research (github.com/chenglou/text-layout). + +import { computeSegmentLevels } from '../bidi.js' +import { + analyzeText, + clearAnalysisCaches, + endsWithClosingQuote, + isCJK, + kinsokuEnd, + kinsokuStart, + leftStickyPunctuation, + setAnalysisLocale, + type AnalysisChunk, + type SegmentBreakKind, + type TextAnalysis, + type WhiteSpaceMode, +} from './analysis.js' +import { + clearMeasurementCaches, + getCorrectedSegmentWidth, + getEngineProfile, + getFontMeasurementState, + getSegmentGraphemePrefixWidths, + getSegmentGraphemeWidths, + getSegmentMetrics, + textMayContainEmoji, +} from './measurement.js' +import { + countPreparedLines, + layoutNextLineRange as stepPreparedLineRange, + walkPreparedLines, + type InternalLayoutLine, +} from './line-break.js' + +let sharedGraphemeSegmenter: Intl.Segmenter | null = null +// Rich-path only. Reuses grapheme splits while materializing multiple lines +// from the same prepared handle, without pushing that cache into the API. +let sharedLineTextCaches = new WeakMap>() + +function getSharedGraphemeSegmenter(): Intl.Segmenter { + if (sharedGraphemeSegmenter === null) { + sharedGraphemeSegmenter = new Intl.Segmenter(undefined, { granularity: 'grapheme' }) + } + return sharedGraphemeSegmenter +} + +// --- Public types --- + +declare const preparedTextBrand: unique symbol + +type PreparedCore = { + widths: number[] // Segment widths, e.g. [42.5, 4.4, 37.2] + lineEndFitAdvances: number[] // Width contribution when a line ends after this segment + lineEndPaintAdvances: number[] // Painted width contribution when a line ends after this segment + kinds: SegmentBreakKind[] // Break behavior per segment, e.g. ['text', 'space', 'text'] + simpleLineWalkFastPath: boolean // Normal text can use the simpler old line walker across all layout APIs + segLevels: Int8Array | null // Rich-path bidi metadata for custom rendering; layout() never reads it + breakableWidths: (number[] | null)[] // Grapheme widths for overflow-wrap segments, else null + breakablePrefixWidths: (number[] | null)[] // Cumulative grapheme prefix widths for narrow browser-policy shims + discretionaryHyphenWidth: number // Visible width added when a soft hyphen is chosen as the break + tabStopAdvance: number // Absolute advance between tab stops for pre-wrap tab segments + chunks: PreparedLineChunk[] // Precompiled hard-break chunks for line walking +} + +// Keep the main prepared handle opaque so the public API does not accidentally +// calcify around the current parallel-array representation. +export type PreparedText = { + readonly [preparedTextBrand]: true +} + +type InternalPreparedText = PreparedText & PreparedCore + +// Rich/diagnostic variant that still exposes the structural segment data. +// Treat this as the unstable escape hatch for experiments and custom rendering. +export type PreparedTextWithSegments = InternalPreparedText & { + segments: string[] // Segment text aligned with the parallel arrays, e.g. ['hello', ' ', 'world'] +} + +export type LayoutCursor = { + segmentIndex: number // Segment index in `segments` + graphemeIndex: number // Grapheme index within that segment; `0` at segment boundaries +} + +export type LayoutResult = { + lineCount: number // Number of wrapped lines, e.g. 3 + height: number // Total block height, e.g. lineCount * lineHeight = 57 +} + +export type LayoutLine = { + text: string // Full text content of this line, e.g. 'hello world' + width: number // Measured width of this line, e.g. 87.5 + start: LayoutCursor // Inclusive start cursor in prepared segments/graphemes + end: LayoutCursor // Exclusive end cursor in prepared segments/graphemes +} + +export type LayoutLineRange = { + width: number // Measured width of this line, e.g. 87.5 + start: LayoutCursor // Inclusive start cursor in prepared segments/graphemes + end: LayoutCursor // Exclusive end cursor in prepared segments/graphemes +} + +export type LayoutLinesResult = LayoutResult & { + lines: LayoutLine[] // Per-line text/width pairs for custom rendering +} + +export type PrepareProfile = { + analysisMs: number + measureMs: number + totalMs: number + analysisSegments: number + preparedSegments: number + breakableSegments: number +} + +export type PrepareOptions = { + whiteSpace?: WhiteSpaceMode +} + +export type PreparedLineChunk = { + startSegmentIndex: number + endSegmentIndex: number + consumedEndSegmentIndex: number +} + +// --- Public API --- + +function createEmptyPrepared(includeSegments: boolean): InternalPreparedText | PreparedTextWithSegments { + if (includeSegments) { + return { + widths: [], + lineEndFitAdvances: [], + lineEndPaintAdvances: [], + kinds: [], + simpleLineWalkFastPath: true, + segLevels: null, + breakableWidths: [], + breakablePrefixWidths: [], + discretionaryHyphenWidth: 0, + tabStopAdvance: 0, + chunks: [], + segments: [], + } as unknown as PreparedTextWithSegments + } + return { + widths: [], + lineEndFitAdvances: [], + lineEndPaintAdvances: [], + kinds: [], + simpleLineWalkFastPath: true, + segLevels: null, + breakableWidths: [], + breakablePrefixWidths: [], + discretionaryHyphenWidth: 0, + tabStopAdvance: 0, + chunks: [], + } as unknown as InternalPreparedText +} + +function measureAnalysis( + analysis: TextAnalysis, + font: string, + includeSegments: boolean, +): InternalPreparedText | PreparedTextWithSegments { + const graphemeSegmenter = getSharedGraphemeSegmenter() + const engineProfile = getEngineProfile() + const { cache, emojiCorrection } = getFontMeasurementState( + font, + textMayContainEmoji(analysis.normalized), + ) + const discretionaryHyphenWidth = getCorrectedSegmentWidth('-', getSegmentMetrics('-', cache), emojiCorrection) + const spaceWidth = getCorrectedSegmentWidth(' ', getSegmentMetrics(' ', cache), emojiCorrection) + const tabStopAdvance = spaceWidth * 8 + + if (analysis.len === 0) return createEmptyPrepared(includeSegments) + + const widths: number[] = [] + const lineEndFitAdvances: number[] = [] + const lineEndPaintAdvances: number[] = [] + const kinds: SegmentBreakKind[] = [] + let simpleLineWalkFastPath = analysis.chunks.length <= 1 + const segStarts = includeSegments ? [] as number[] : null + const breakableWidths: (number[] | null)[] = [] + const breakablePrefixWidths: (number[] | null)[] = [] + const segments = includeSegments ? [] as string[] : null + const preparedStartByAnalysisIndex = Array.from({ length: analysis.len }) + const preparedEndByAnalysisIndex = Array.from({ length: analysis.len }) + + function pushMeasuredSegment( + text: string, + width: number, + lineEndFitAdvance: number, + lineEndPaintAdvance: number, + kind: SegmentBreakKind, + start: number, + breakable: number[] | null, + breakablePrefix: number[] | null, + ): void { + if (kind !== 'text' && kind !== 'space' && kind !== 'zero-width-break') { + simpleLineWalkFastPath = false + } + widths.push(width) + lineEndFitAdvances.push(lineEndFitAdvance) + lineEndPaintAdvances.push(lineEndPaintAdvance) + kinds.push(kind) + segStarts?.push(start) + breakableWidths.push(breakable) + breakablePrefixWidths.push(breakablePrefix) + if (segments !== null) segments.push(text) + } + + for (let mi = 0; mi < analysis.len; mi++) { + preparedStartByAnalysisIndex[mi] = widths.length + const segText = analysis.texts[mi]! + const segWordLike = analysis.isWordLike[mi]! + const segKind = analysis.kinds[mi]! + const segStart = analysis.starts[mi]! + + if (segKind === 'soft-hyphen') { + pushMeasuredSegment( + segText, + 0, + discretionaryHyphenWidth, + discretionaryHyphenWidth, + segKind, + segStart, + null, + null, + ) + preparedEndByAnalysisIndex[mi] = widths.length + continue + } + + if (segKind === 'hard-break') { + pushMeasuredSegment(segText, 0, 0, 0, segKind, segStart, null, null) + preparedEndByAnalysisIndex[mi] = widths.length + continue + } + + if (segKind === 'tab') { + pushMeasuredSegment(segText, 0, 0, 0, segKind, segStart, null, null) + preparedEndByAnalysisIndex[mi] = widths.length + continue + } + + const segMetrics = getSegmentMetrics(segText, cache) + + if (segKind === 'text' && segMetrics.containsCJK) { + let unitText = '' + let unitStart = 0 + + for (const gs of graphemeSegmenter.segment(segText)) { + const grapheme = gs.segment + + if (unitText.length === 0) { + unitText = grapheme + unitStart = gs.index + continue + } + + if ( + kinsokuEnd.has(unitText) || + kinsokuStart.has(grapheme) || + leftStickyPunctuation.has(grapheme) || + (engineProfile.carryCJKAfterClosingQuote && + isCJK(grapheme) && + endsWithClosingQuote(unitText)) + ) { + unitText += grapheme + continue + } + + const unitMetrics = getSegmentMetrics(unitText, cache) + const w = getCorrectedSegmentWidth(unitText, unitMetrics, emojiCorrection) + pushMeasuredSegment(unitText, w, w, w, 'text', segStart + unitStart, null, null) + + unitText = grapheme + unitStart = gs.index + } + + if (unitText.length > 0) { + const unitMetrics = getSegmentMetrics(unitText, cache) + const w = getCorrectedSegmentWidth(unitText, unitMetrics, emojiCorrection) + pushMeasuredSegment(unitText, w, w, w, 'text', segStart + unitStart, null, null) + } + preparedEndByAnalysisIndex[mi] = widths.length + continue + } + + const w = getCorrectedSegmentWidth(segText, segMetrics, emojiCorrection) + const lineEndFitAdvance = + segKind === 'space' || segKind === 'preserved-space' || segKind === 'zero-width-break' + ? 0 + : w + const lineEndPaintAdvance = + segKind === 'space' || segKind === 'zero-width-break' + ? 0 + : w + + if (segWordLike && segText.length > 1) { + const graphemeWidths = getSegmentGraphemeWidths(segText, segMetrics, cache, emojiCorrection) + const graphemePrefixWidths = engineProfile.preferPrefixWidthsForBreakableRuns + ? getSegmentGraphemePrefixWidths(segText, segMetrics, cache, emojiCorrection) + : null + pushMeasuredSegment( + segText, + w, + lineEndFitAdvance, + lineEndPaintAdvance, + segKind, + segStart, + graphemeWidths, + graphemePrefixWidths, + ) + } else { + pushMeasuredSegment( + segText, + w, + lineEndFitAdvance, + lineEndPaintAdvance, + segKind, + segStart, + null, + null, + ) + } + preparedEndByAnalysisIndex[mi] = widths.length + } + + const chunks = mapAnalysisChunksToPreparedChunks(analysis.chunks, preparedStartByAnalysisIndex, preparedEndByAnalysisIndex) + const segLevels = segStarts === null ? null : computeSegmentLevels(analysis.normalized, segStarts) + if (segments !== null) { + return { + widths, + lineEndFitAdvances, + lineEndPaintAdvances, + kinds, + simpleLineWalkFastPath, + segLevels, + breakableWidths, + breakablePrefixWidths, + discretionaryHyphenWidth, + tabStopAdvance, + chunks, + segments, + } as unknown as PreparedTextWithSegments + } + return { + widths, + lineEndFitAdvances, + lineEndPaintAdvances, + kinds, + simpleLineWalkFastPath, + segLevels, + breakableWidths, + breakablePrefixWidths, + discretionaryHyphenWidth, + tabStopAdvance, + chunks, + } as unknown as InternalPreparedText +} + +function mapAnalysisChunksToPreparedChunks( + chunks: AnalysisChunk[], + preparedStartByAnalysisIndex: number[], + preparedEndByAnalysisIndex: number[], +): PreparedLineChunk[] { + const preparedChunks: PreparedLineChunk[] = [] + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]! + const startSegmentIndex = + chunk.startSegmentIndex < preparedStartByAnalysisIndex.length + ? preparedStartByAnalysisIndex[chunk.startSegmentIndex]! + : preparedEndByAnalysisIndex[preparedEndByAnalysisIndex.length - 1] ?? 0 + const endSegmentIndex = + chunk.endSegmentIndex < preparedStartByAnalysisIndex.length + ? preparedStartByAnalysisIndex[chunk.endSegmentIndex]! + : preparedEndByAnalysisIndex[preparedEndByAnalysisIndex.length - 1] ?? 0 + const consumedEndSegmentIndex = + chunk.consumedEndSegmentIndex < preparedStartByAnalysisIndex.length + ? preparedStartByAnalysisIndex[chunk.consumedEndSegmentIndex]! + : preparedEndByAnalysisIndex[preparedEndByAnalysisIndex.length - 1] ?? 0 + + preparedChunks.push({ + startSegmentIndex, + endSegmentIndex, + consumedEndSegmentIndex, + }) + } + return preparedChunks +} + +function prepareInternal( + text: string, + font: string, + includeSegments: boolean, + options?: PrepareOptions, +): InternalPreparedText | PreparedTextWithSegments { + const analysis = analyzeText(text, getEngineProfile(), options?.whiteSpace) + return measureAnalysis(analysis, font, includeSegments) +} + +// Diagnostic-only helper used by the browser benchmark harness to separate the +// text-analysis and measurement phases without duplicating the prepare logic. +export function profilePrepare(text: string, font: string, options?: PrepareOptions): PrepareProfile { + const t0 = performance.now() + const analysis = analyzeText(text, getEngineProfile(), options?.whiteSpace) + const t1 = performance.now() + const prepared = measureAnalysis(analysis, font, false) as InternalPreparedText + const t2 = performance.now() + + let breakableSegments = 0 + for (const widths of prepared.breakableWidths) { + if (widths !== null) breakableSegments++ + } + + return { + analysisMs: t1 - t0, + measureMs: t2 - t1, + totalMs: t2 - t0, + analysisSegments: analysis.len, + preparedSegments: prepared.widths.length, + breakableSegments, + } +} + +// Prepare text for layout. Segments the text, measures each segment via canvas, +// and stores the widths for fast relayout at any width. Call once per text block +// (e.g. when a comment first appears). The result is width-independent — the +// same PreparedText can be laid out at any maxWidth and lineHeight via layout(). +// +// Steps: +// 1. Normalize collapsible whitespace (CSS white-space: normal behavior) +// 2. Segment via Intl.Segmenter (handles CJK, Thai, etc.) +// 3. Merge punctuation into preceding word ("better." as one unit) +// 4. Split CJK words into individual graphemes (per-character line breaks) +// 5. Measure each segment via canvas measureText, cache by (segment, font) +// 6. Pre-measure graphemes of long words (for overflow-wrap: break-word) +// 7. Correct emoji canvas inflation (auto-detected per font size) +// 8. Optionally compute rich-path bidi metadata for custom renderers +export function prepare(text: string, font: string, options?: PrepareOptions): PreparedText { + return prepareInternal(text, font, false, options) as PreparedText +} + +// Rich variant used by callers that need enough information to render the +// laid-out lines themselves. +export function prepareWithSegments(text: string, font: string, options?: PrepareOptions): PreparedTextWithSegments { + return prepareInternal(text, font, true, options) as PreparedTextWithSegments +} + +function getInternalPrepared(prepared: PreparedText): InternalPreparedText { + return prepared as InternalPreparedText +} + +// Layout prepared text at a given max width and caller-provided lineHeight. +// Pure arithmetic on cached widths — no canvas calls, no DOM reads, no string +// operations, no allocations. +// ~0.0002ms per text block. Call on every resize. +// +// Line breaking rules (matching CSS white-space: normal + overflow-wrap: break-word): +// - Break before any non-space segment that would overflow the line +// - Trailing whitespace hangs past the line edge (doesn't trigger breaks) +// - Segments wider than maxWidth are broken at grapheme boundaries +export function layout(prepared: PreparedText, maxWidth: number, lineHeight: number): LayoutResult { + // Keep the resize hot path specialized. `layoutWithLines()` shares the same + // break semantics but also tracks line ranges; the extra bookkeeping is too + // expensive to pay on every hot-path `layout()` call. + const lineCount = countPreparedLines(getInternalPrepared(prepared), maxWidth) + return { lineCount, height: lineCount * lineHeight } +} + +function getSegmentGraphemes( + segmentIndex: number, + segments: string[], + cache: Map, +): string[] { + let graphemes = cache.get(segmentIndex) + if (graphemes !== undefined) return graphemes + + graphemes = [] + const graphemeSegmenter = getSharedGraphemeSegmenter() + for (const gs of graphemeSegmenter.segment(segments[segmentIndex]!)) { + graphemes.push(gs.segment) + } + cache.set(segmentIndex, graphemes) + return graphemes +} + +function getLineTextCache(prepared: PreparedTextWithSegments): Map { + let cache = sharedLineTextCaches.get(prepared) + if (cache !== undefined) return cache + + cache = new Map() + sharedLineTextCaches.set(prepared, cache) + return cache +} + +function lineHasDiscretionaryHyphen( + kinds: SegmentBreakKind[], + startSegmentIndex: number, + startGraphemeIndex: number, + endSegmentIndex: number, +): boolean { + return ( + endSegmentIndex > 0 && + kinds[endSegmentIndex - 1] === 'soft-hyphen' && + !(startSegmentIndex === endSegmentIndex && startGraphemeIndex > 0) + ) +} + +function buildLineTextFromRange( + segments: string[], + kinds: SegmentBreakKind[], + cache: Map, + startSegmentIndex: number, + startGraphemeIndex: number, + endSegmentIndex: number, + endGraphemeIndex: number, +): string { + let text = '' + const endsWithDiscretionaryHyphen = lineHasDiscretionaryHyphen( + kinds, + startSegmentIndex, + startGraphemeIndex, + endSegmentIndex, + ) + + for (let i = startSegmentIndex; i < endSegmentIndex; i++) { + if (kinds[i] === 'soft-hyphen' || kinds[i] === 'hard-break') continue + if (i === startSegmentIndex && startGraphemeIndex > 0) { + text += getSegmentGraphemes(i, segments, cache).slice(startGraphemeIndex).join('') + } else { + text += segments[i]! + } + } + + if (endGraphemeIndex > 0) { + if (endsWithDiscretionaryHyphen) text += '-' + text += getSegmentGraphemes(endSegmentIndex, segments, cache).slice( + startSegmentIndex === endSegmentIndex ? startGraphemeIndex : 0, + endGraphemeIndex, + ).join('') + } else if (endsWithDiscretionaryHyphen) { + text += '-' + } + + return text +} + +function createLayoutLine( + prepared: PreparedTextWithSegments, + cache: Map, + width: number, + startSegmentIndex: number, + startGraphemeIndex: number, + endSegmentIndex: number, + endGraphemeIndex: number, +): LayoutLine { + return { + text: buildLineTextFromRange( + prepared.segments, + prepared.kinds, + cache, + startSegmentIndex, + startGraphemeIndex, + endSegmentIndex, + endGraphemeIndex, + ), + width, + start: { + segmentIndex: startSegmentIndex, + graphemeIndex: startGraphemeIndex, + }, + end: { + segmentIndex: endSegmentIndex, + graphemeIndex: endGraphemeIndex, + }, + } +} + +function materializeLayoutLine( + prepared: PreparedTextWithSegments, + cache: Map, + line: InternalLayoutLine, +): LayoutLine { + return createLayoutLine( + prepared, + cache, + line.width, + line.startSegmentIndex, + line.startGraphemeIndex, + line.endSegmentIndex, + line.endGraphemeIndex, + ) +} + +function toLayoutLineRange(line: InternalLayoutLine): LayoutLineRange { + return { + width: line.width, + start: { + segmentIndex: line.startSegmentIndex, + graphemeIndex: line.startGraphemeIndex, + }, + end: { + segmentIndex: line.endSegmentIndex, + graphemeIndex: line.endGraphemeIndex, + }, + } +} + +function stepLineRange( + prepared: PreparedTextWithSegments, + start: LayoutCursor, + maxWidth: number, +): LayoutLineRange | null { + const line = stepPreparedLineRange(prepared, start, maxWidth) + if (line === null) return null + return toLayoutLineRange(line) +} + +function materializeLine( + prepared: PreparedTextWithSegments, + line: LayoutLineRange, +): LayoutLine { + return createLayoutLine( + prepared, + getLineTextCache(prepared), + line.width, + line.start.segmentIndex, + line.start.graphemeIndex, + line.end.segmentIndex, + line.end.graphemeIndex, + ) +} + +// Batch low-level line geometry pass. This is the non-materializing counterpart +// to layoutWithLines(), useful for shrinkwrap and other aggregate geometry work. +export function walkLineRanges( + prepared: PreparedTextWithSegments, + maxWidth: number, + onLine: (line: LayoutLineRange) => void, +): number { + if (prepared.widths.length === 0) return 0 + + return walkPreparedLines(getInternalPrepared(prepared), maxWidth, line => { + onLine(toLayoutLineRange(line)) + }) +} + +export function layoutNextLine( + prepared: PreparedTextWithSegments, + start: LayoutCursor, + maxWidth: number, +): LayoutLine | null { + const line = stepLineRange(prepared, start, maxWidth) + if (line === null) return null + return materializeLine(prepared, line) +} + +// Rich layout API for callers that want the actual line contents and widths. +// Caller still supplies lineHeight at layout time. Mirrors layout()'s break +// decisions, but keeps extra per-line bookkeeping so it should stay off the +// resize hot path. +export function layoutWithLines(prepared: PreparedTextWithSegments, maxWidth: number, lineHeight: number): LayoutLinesResult { + const lines: LayoutLine[] = [] + if (prepared.widths.length === 0) return { lineCount: 0, height: 0, lines } + + const graphemeCache = getLineTextCache(prepared) + const lineCount = walkPreparedLines(getInternalPrepared(prepared), maxWidth, line => { + lines.push(materializeLayoutLine(prepared, graphemeCache, line)) + }) + + return { lineCount, height: lineCount * lineHeight, lines } +} + +export function clearCache(): void { + clearAnalysisCaches() + sharedGraphemeSegmenter = null + sharedLineTextCaches = new WeakMap>() + clearMeasurementCaches() +} + +export function setLocale(locale?: string): void { + setAnalysisLocale(locale) + clearCache() +} diff --git a/src/compat/line-break.ts b/src/compat/line-break.ts new file mode 100644 index 00000000..57fa1131 --- /dev/null +++ b/src/compat/line-break.ts @@ -0,0 +1,1056 @@ +import type { SegmentBreakKind } from './analysis.js' +import { getEngineProfile } from './measurement.js' + +export type LineBreakCursor = { + segmentIndex: number + graphemeIndex: number +} + +export type PreparedLineBreakData = { + widths: number[] + lineEndFitAdvances: number[] + lineEndPaintAdvances: number[] + kinds: SegmentBreakKind[] + simpleLineWalkFastPath: boolean + breakableWidths: (number[] | null)[] + breakablePrefixWidths: (number[] | null)[] + discretionaryHyphenWidth: number + tabStopAdvance: number + chunks: { + startSegmentIndex: number + endSegmentIndex: number + consumedEndSegmentIndex: number + }[] +} + +export type InternalLayoutLine = { + startSegmentIndex: number + startGraphemeIndex: number + endSegmentIndex: number + endGraphemeIndex: number + width: number +} + +function canBreakAfter(kind: SegmentBreakKind): boolean { + return ( + kind === 'space' || + kind === 'preserved-space' || + kind === 'tab' || + kind === 'zero-width-break' || + kind === 'soft-hyphen' + ) +} + +function isSimpleCollapsibleSpace(kind: SegmentBreakKind): boolean { + return kind === 'space' +} + +function getTabAdvance(lineWidth: number, tabStopAdvance: number): number { + if (tabStopAdvance <= 0) return 0 + + const remainder = lineWidth % tabStopAdvance + if (Math.abs(remainder) <= 1e-6) return tabStopAdvance + return tabStopAdvance - remainder +} + +function getBreakableAdvance( + graphemeWidths: number[], + graphemePrefixWidths: number[] | null, + graphemeIndex: number, + preferPrefixWidths: boolean, +): number { + if (!preferPrefixWidths || graphemePrefixWidths === null) { + return graphemeWidths[graphemeIndex]! + } + return graphemePrefixWidths[graphemeIndex]! - (graphemeIndex > 0 ? graphemePrefixWidths[graphemeIndex - 1]! : 0) +} + +function fitSoftHyphenBreak( + graphemeWidths: number[], + initialWidth: number, + maxWidth: number, + lineFitEpsilon: number, + discretionaryHyphenWidth: number, + cumulativeWidths: boolean, +): { fitCount: number, fittedWidth: number } { + let fitCount = 0 + let fittedWidth = initialWidth + + while (fitCount < graphemeWidths.length) { + const nextWidth = cumulativeWidths + ? initialWidth + graphemeWidths[fitCount]! + : fittedWidth + graphemeWidths[fitCount]! + const nextLineWidth = fitCount + 1 < graphemeWidths.length + ? nextWidth + discretionaryHyphenWidth + : nextWidth + if (nextLineWidth > maxWidth + lineFitEpsilon) break + fittedWidth = nextWidth + fitCount++ + } + + return { fitCount, fittedWidth } +} + +function findChunkIndexForStart(prepared: PreparedLineBreakData, segmentIndex: number): number { + for (let i = 0; i < prepared.chunks.length; i++) { + const chunk = prepared.chunks[i]! + if (segmentIndex < chunk.consumedEndSegmentIndex) return i + } + return -1 +} + +export function normalizeLineStart( + prepared: PreparedLineBreakData, + start: LineBreakCursor, +): LineBreakCursor | null { + let segmentIndex = start.segmentIndex + const graphemeIndex = start.graphemeIndex + + if (segmentIndex >= prepared.widths.length) return null + if (graphemeIndex > 0) return start + + const chunkIndex = findChunkIndexForStart(prepared, segmentIndex) + if (chunkIndex < 0) return null + + const chunk = prepared.chunks[chunkIndex]! + if (chunk.startSegmentIndex === chunk.endSegmentIndex && segmentIndex === chunk.startSegmentIndex) { + return { segmentIndex, graphemeIndex: 0 } + } + + if (segmentIndex < chunk.startSegmentIndex) segmentIndex = chunk.startSegmentIndex + while (segmentIndex < chunk.endSegmentIndex) { + const kind = prepared.kinds[segmentIndex]! + if (kind !== 'space' && kind !== 'zero-width-break' && kind !== 'soft-hyphen') { + return { segmentIndex, graphemeIndex: 0 } + } + segmentIndex++ + } + + if (chunk.consumedEndSegmentIndex >= prepared.widths.length) return null + return { segmentIndex: chunk.consumedEndSegmentIndex, graphemeIndex: 0 } +} + +export function countPreparedLines(prepared: PreparedLineBreakData, maxWidth: number): number { + if (prepared.simpleLineWalkFastPath) { + return countPreparedLinesSimple(prepared, maxWidth) + } + return walkPreparedLines(prepared, maxWidth) +} + +function countPreparedLinesSimple(prepared: PreparedLineBreakData, maxWidth: number): number { + const { widths, kinds, breakableWidths, breakablePrefixWidths } = prepared + if (widths.length === 0) return 0 + + const engineProfile = getEngineProfile() + const lineFitEpsilon = engineProfile.lineFitEpsilon + + let lineCount = 0 + let lineW = 0 + let hasContent = false + + function placeOnFreshLine(segmentIndex: number): void { + const w = widths[segmentIndex]! + if (w > maxWidth && breakableWidths[segmentIndex] !== null) { + const gWidths = breakableWidths[segmentIndex]! + const gPrefixWidths = breakablePrefixWidths[segmentIndex] ?? null + lineW = 0 + for (let g = 0; g < gWidths.length; g++) { + const gw = getBreakableAdvance( + gWidths, + gPrefixWidths, + g, + engineProfile.preferPrefixWidthsForBreakableRuns, + ) + if (lineW > 0 && lineW + gw > maxWidth + lineFitEpsilon) { + lineCount++ + lineW = gw + } else { + if (lineW === 0) lineCount++ + lineW += gw + } + } + } else { + lineW = w + lineCount++ + } + hasContent = true + } + + for (let i = 0; i < widths.length; i++) { + const w = widths[i]! + const kind = kinds[i]! + + if (!hasContent) { + placeOnFreshLine(i) + continue + } + + const newW = lineW + w + if (newW > maxWidth + lineFitEpsilon) { + if (isSimpleCollapsibleSpace(kind)) continue + lineW = 0 + hasContent = false + placeOnFreshLine(i) + continue + } + + lineW = newW + } + + if (!hasContent) return lineCount + 1 + return lineCount +} + +function walkPreparedLinesSimple( + prepared: PreparedLineBreakData, + maxWidth: number, + onLine?: (line: InternalLayoutLine) => void, +): number { + const { widths, kinds, breakableWidths, breakablePrefixWidths } = prepared + if (widths.length === 0) return 0 + + const engineProfile = getEngineProfile() + const lineFitEpsilon = engineProfile.lineFitEpsilon + + let lineCount = 0 + let lineW = 0 + let hasContent = false + let lineStartSegmentIndex = 0 + let lineStartGraphemeIndex = 0 + let lineEndSegmentIndex = 0 + let lineEndGraphemeIndex = 0 + let pendingBreakSegmentIndex = -1 + let pendingBreakPaintWidth = 0 + + function clearPendingBreak(): void { + pendingBreakSegmentIndex = -1 + pendingBreakPaintWidth = 0 + } + + function emitCurrentLine( + endSegmentIndex = lineEndSegmentIndex, + endGraphemeIndex = lineEndGraphemeIndex, + width = lineW, + ): void { + lineCount++ + onLine?.({ + startSegmentIndex: lineStartSegmentIndex, + startGraphemeIndex: lineStartGraphemeIndex, + endSegmentIndex, + endGraphemeIndex, + width, + }) + lineW = 0 + hasContent = false + clearPendingBreak() + } + + function startLineAtSegment(segmentIndex: number, width: number): void { + hasContent = true + lineStartSegmentIndex = segmentIndex + lineStartGraphemeIndex = 0 + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + lineW = width + } + + function startLineAtGrapheme(segmentIndex: number, graphemeIndex: number, width: number): void { + hasContent = true + lineStartSegmentIndex = segmentIndex + lineStartGraphemeIndex = graphemeIndex + lineEndSegmentIndex = segmentIndex + lineEndGraphemeIndex = graphemeIndex + 1 + lineW = width + } + + function appendWholeSegment(segmentIndex: number, width: number): void { + if (!hasContent) { + startLineAtSegment(segmentIndex, width) + return + } + lineW += width + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + } + + function updatePendingBreak(segmentIndex: number, segmentWidth: number): void { + if (!canBreakAfter(kinds[segmentIndex]!)) return + pendingBreakSegmentIndex = segmentIndex + 1 + pendingBreakPaintWidth = lineW - segmentWidth + } + + function appendBreakableSegment(segmentIndex: number): void { + appendBreakableSegmentFrom(segmentIndex, 0) + } + + function appendBreakableSegmentFrom(segmentIndex: number, startGraphemeIndex: number): void { + const gWidths = breakableWidths[segmentIndex]! + const gPrefixWidths = breakablePrefixWidths[segmentIndex] ?? null + for (let g = startGraphemeIndex; g < gWidths.length; g++) { + const gw = getBreakableAdvance( + gWidths, + gPrefixWidths, + g, + engineProfile.preferPrefixWidthsForBreakableRuns, + ) + + if (!hasContent) { + startLineAtGrapheme(segmentIndex, g, gw) + continue + } + + if (lineW + gw > maxWidth + lineFitEpsilon) { + emitCurrentLine() + startLineAtGrapheme(segmentIndex, g, gw) + } else { + lineW += gw + lineEndSegmentIndex = segmentIndex + lineEndGraphemeIndex = g + 1 + } + } + + if (hasContent && lineEndSegmentIndex === segmentIndex && lineEndGraphemeIndex === gWidths.length) { + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + } + } + + let i = 0 + while (i < widths.length) { + const w = widths[i]! + const kind = kinds[i]! + + if (!hasContent) { + if (w > maxWidth && breakableWidths[i] !== null) { + appendBreakableSegment(i) + } else { + startLineAtSegment(i, w) + } + updatePendingBreak(i, w) + i++ + continue + } + + const newW = lineW + w + if (newW > maxWidth + lineFitEpsilon) { + if (canBreakAfter(kind)) { + appendWholeSegment(i, w) + emitCurrentLine(i + 1, 0, lineW - w) + i++ + continue + } + + if (pendingBreakSegmentIndex >= 0) { + emitCurrentLine(pendingBreakSegmentIndex, 0, pendingBreakPaintWidth) + continue + } + + if (w > maxWidth && breakableWidths[i] !== null) { + emitCurrentLine() + appendBreakableSegment(i) + i++ + continue + } + + emitCurrentLine() + continue + } + + appendWholeSegment(i, w) + updatePendingBreak(i, w) + i++ + } + + if (hasContent) emitCurrentLine() + return lineCount +} + +export function walkPreparedLines( + prepared: PreparedLineBreakData, + maxWidth: number, + onLine?: (line: InternalLayoutLine) => void, +): number { + if (prepared.simpleLineWalkFastPath) { + return walkPreparedLinesSimple(prepared, maxWidth, onLine) + } + + const { + widths, + lineEndFitAdvances, + lineEndPaintAdvances, + kinds, + breakableWidths, + breakablePrefixWidths, + discretionaryHyphenWidth, + tabStopAdvance, + chunks, + } = prepared + if (widths.length === 0 || chunks.length === 0) return 0 + + const engineProfile = getEngineProfile() + const lineFitEpsilon = engineProfile.lineFitEpsilon + + let lineCount = 0 + let lineW = 0 + let hasContent = false + let lineStartSegmentIndex = 0 + let lineStartGraphemeIndex = 0 + let lineEndSegmentIndex = 0 + let lineEndGraphemeIndex = 0 + let pendingBreakSegmentIndex = -1 + let pendingBreakFitWidth = 0 + let pendingBreakPaintWidth = 0 + let pendingBreakKind: SegmentBreakKind | null = null + + function clearPendingBreak(): void { + pendingBreakSegmentIndex = -1 + pendingBreakFitWidth = 0 + pendingBreakPaintWidth = 0 + pendingBreakKind = null + } + + function emitCurrentLine( + endSegmentIndex = lineEndSegmentIndex, + endGraphemeIndex = lineEndGraphemeIndex, + width = lineW, + ): void { + lineCount++ + onLine?.({ + startSegmentIndex: lineStartSegmentIndex, + startGraphemeIndex: lineStartGraphemeIndex, + endSegmentIndex, + endGraphemeIndex, + width, + }) + lineW = 0 + hasContent = false + clearPendingBreak() + } + + function startLineAtSegment(segmentIndex: number, width: number): void { + hasContent = true + lineStartSegmentIndex = segmentIndex + lineStartGraphemeIndex = 0 + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + lineW = width + } + + function startLineAtGrapheme(segmentIndex: number, graphemeIndex: number, width: number): void { + hasContent = true + lineStartSegmentIndex = segmentIndex + lineStartGraphemeIndex = graphemeIndex + lineEndSegmentIndex = segmentIndex + lineEndGraphemeIndex = graphemeIndex + 1 + lineW = width + } + + function appendWholeSegment(segmentIndex: number, width: number): void { + if (!hasContent) { + startLineAtSegment(segmentIndex, width) + return + } + lineW += width + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + } + + function updatePendingBreakForWholeSegment(segmentIndex: number, segmentWidth: number): void { + if (!canBreakAfter(kinds[segmentIndex]!)) return + const fitAdvance = kinds[segmentIndex] === 'tab' ? 0 : lineEndFitAdvances[segmentIndex]! + const paintAdvance = kinds[segmentIndex] === 'tab' ? segmentWidth : lineEndPaintAdvances[segmentIndex]! + pendingBreakSegmentIndex = segmentIndex + 1 + pendingBreakFitWidth = lineW - segmentWidth + fitAdvance + pendingBreakPaintWidth = lineW - segmentWidth + paintAdvance + pendingBreakKind = kinds[segmentIndex]! + } + + function appendBreakableSegment(segmentIndex: number): void { + appendBreakableSegmentFrom(segmentIndex, 0) + } + + function appendBreakableSegmentFrom(segmentIndex: number, startGraphemeIndex: number): void { + const gWidths = breakableWidths[segmentIndex]! + const gPrefixWidths = breakablePrefixWidths[segmentIndex] ?? null + for (let g = startGraphemeIndex; g < gWidths.length; g++) { + const gw = getBreakableAdvance( + gWidths, + gPrefixWidths, + g, + engineProfile.preferPrefixWidthsForBreakableRuns, + ) + + if (!hasContent) { + startLineAtGrapheme(segmentIndex, g, gw) + continue + } + + if (lineW + gw > maxWidth + lineFitEpsilon) { + emitCurrentLine() + startLineAtGrapheme(segmentIndex, g, gw) + } else { + lineW += gw + lineEndSegmentIndex = segmentIndex + lineEndGraphemeIndex = g + 1 + } + } + + if (hasContent && lineEndSegmentIndex === segmentIndex && lineEndGraphemeIndex === gWidths.length) { + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + } + } + + function continueSoftHyphenBreakableSegment(segmentIndex: number): boolean { + if (pendingBreakKind !== 'soft-hyphen') return false + const gWidths = breakableWidths[segmentIndex]! + if (gWidths === null) return false + const fitWidths = engineProfile.preferPrefixWidthsForBreakableRuns + ? breakablePrefixWidths[segmentIndex] ?? gWidths + : gWidths + const usesPrefixWidths = fitWidths !== gWidths + const { fitCount, fittedWidth } = fitSoftHyphenBreak( + fitWidths, + lineW, + maxWidth, + lineFitEpsilon, + discretionaryHyphenWidth, + usesPrefixWidths, + ) + if (fitCount === 0) return false + + lineW = fittedWidth + lineEndSegmentIndex = segmentIndex + lineEndGraphemeIndex = fitCount + clearPendingBreak() + + if (fitCount === gWidths.length) { + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + return true + } + + emitCurrentLine( + segmentIndex, + fitCount, + fittedWidth + discretionaryHyphenWidth, + ) + appendBreakableSegmentFrom(segmentIndex, fitCount) + return true + } + + function emitEmptyChunk(chunk: { startSegmentIndex: number, consumedEndSegmentIndex: number }): void { + lineCount++ + onLine?.({ + startSegmentIndex: chunk.startSegmentIndex, + startGraphemeIndex: 0, + endSegmentIndex: chunk.consumedEndSegmentIndex, + endGraphemeIndex: 0, + width: 0, + }) + clearPendingBreak() + } + + for (let chunkIndex = 0; chunkIndex < chunks.length; chunkIndex++) { + const chunk = chunks[chunkIndex]! + if (chunk.startSegmentIndex === chunk.endSegmentIndex) { + emitEmptyChunk(chunk) + continue + } + + hasContent = false + lineW = 0 + lineStartSegmentIndex = chunk.startSegmentIndex + lineStartGraphemeIndex = 0 + lineEndSegmentIndex = chunk.startSegmentIndex + lineEndGraphemeIndex = 0 + clearPendingBreak() + + let i = chunk.startSegmentIndex + while (i < chunk.endSegmentIndex) { + const kind = kinds[i]! + const w = kind === 'tab' ? getTabAdvance(lineW, tabStopAdvance) : widths[i]! + + if (kind === 'soft-hyphen') { + if (hasContent) { + lineEndSegmentIndex = i + 1 + lineEndGraphemeIndex = 0 + pendingBreakSegmentIndex = i + 1 + pendingBreakFitWidth = lineW + discretionaryHyphenWidth + pendingBreakPaintWidth = lineW + discretionaryHyphenWidth + pendingBreakKind = kind + } + i++ + continue + } + + if (!hasContent) { + if (w > maxWidth && breakableWidths[i] !== null) { + appendBreakableSegment(i) + } else { + startLineAtSegment(i, w) + } + updatePendingBreakForWholeSegment(i, w) + i++ + continue + } + + const newW = lineW + w + if (newW > maxWidth + lineFitEpsilon) { + const currentBreakFitWidth = lineW + (kind === 'tab' ? 0 : lineEndFitAdvances[i]!) + const currentBreakPaintWidth = lineW + (kind === 'tab' ? w : lineEndPaintAdvances[i]!) + + if ( + pendingBreakKind === 'soft-hyphen' && + engineProfile.preferEarlySoftHyphenBreak && + pendingBreakFitWidth <= maxWidth + lineFitEpsilon + ) { + emitCurrentLine(pendingBreakSegmentIndex, 0, pendingBreakPaintWidth) + continue + } + + if (pendingBreakKind === 'soft-hyphen' && continueSoftHyphenBreakableSegment(i)) { + i++ + continue + } + + if (canBreakAfter(kind) && currentBreakFitWidth <= maxWidth + lineFitEpsilon) { + appendWholeSegment(i, w) + emitCurrentLine(i + 1, 0, currentBreakPaintWidth) + i++ + continue + } + + if (pendingBreakSegmentIndex >= 0 && pendingBreakFitWidth <= maxWidth + lineFitEpsilon) { + emitCurrentLine(pendingBreakSegmentIndex, 0, pendingBreakPaintWidth) + continue + } + + if (w > maxWidth && breakableWidths[i] !== null) { + emitCurrentLine() + appendBreakableSegment(i) + i++ + continue + } + + emitCurrentLine() + continue + } + + appendWholeSegment(i, w) + updatePendingBreakForWholeSegment(i, w) + i++ + } + + if (hasContent) { + const finalPaintWidth = + pendingBreakSegmentIndex === chunk.consumedEndSegmentIndex + ? pendingBreakPaintWidth + : lineW + emitCurrentLine(chunk.consumedEndSegmentIndex, 0, finalPaintWidth) + } + } + + return lineCount +} + +export function layoutNextLineRange( + prepared: PreparedLineBreakData, + start: LineBreakCursor, + maxWidth: number, +): InternalLayoutLine | null { + const normalizedStart = normalizeLineStart(prepared, start) + if (normalizedStart === null) return null + + if (prepared.simpleLineWalkFastPath) { + return layoutNextLineRangeSimple(prepared, normalizedStart, maxWidth) + } + + const chunkIndex = findChunkIndexForStart(prepared, normalizedStart.segmentIndex) + if (chunkIndex < 0) return null + + const chunk = prepared.chunks[chunkIndex]! + if (chunk.startSegmentIndex === chunk.endSegmentIndex) { + return { + startSegmentIndex: chunk.startSegmentIndex, + startGraphemeIndex: 0, + endSegmentIndex: chunk.consumedEndSegmentIndex, + endGraphemeIndex: 0, + width: 0, + } + } + + const { + widths, + lineEndFitAdvances, + lineEndPaintAdvances, + kinds, + breakableWidths, + breakablePrefixWidths, + discretionaryHyphenWidth, + tabStopAdvance, + } = prepared + const engineProfile = getEngineProfile() + const lineFitEpsilon = engineProfile.lineFitEpsilon + + let lineW = 0 + let hasContent = false + const lineStartSegmentIndex = normalizedStart.segmentIndex + const lineStartGraphemeIndex = normalizedStart.graphemeIndex + let lineEndSegmentIndex = lineStartSegmentIndex + let lineEndGraphemeIndex = lineStartGraphemeIndex + let pendingBreakSegmentIndex = -1 + let pendingBreakFitWidth = 0 + let pendingBreakPaintWidth = 0 + let pendingBreakKind: SegmentBreakKind | null = null + + function clearPendingBreak(): void { + pendingBreakSegmentIndex = -1 + pendingBreakFitWidth = 0 + pendingBreakPaintWidth = 0 + pendingBreakKind = null + } + + function finishLine( + endSegmentIndex = lineEndSegmentIndex, + endGraphemeIndex = lineEndGraphemeIndex, + width = lineW, + ): InternalLayoutLine | null { + if (!hasContent) return null + + return { + startSegmentIndex: lineStartSegmentIndex, + startGraphemeIndex: lineStartGraphemeIndex, + endSegmentIndex, + endGraphemeIndex, + width, + } + } + + function startLineAtSegment(segmentIndex: number, width: number): void { + hasContent = true + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + lineW = width + } + + function startLineAtGrapheme(segmentIndex: number, graphemeIndex: number, width: number): void { + hasContent = true + lineEndSegmentIndex = segmentIndex + lineEndGraphemeIndex = graphemeIndex + 1 + lineW = width + } + + function appendWholeSegment(segmentIndex: number, width: number): void { + if (!hasContent) { + startLineAtSegment(segmentIndex, width) + return + } + lineW += width + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + } + + function updatePendingBreakForWholeSegment(segmentIndex: number, segmentWidth: number): void { + if (!canBreakAfter(kinds[segmentIndex]!)) return + const fitAdvance = kinds[segmentIndex] === 'tab' ? 0 : lineEndFitAdvances[segmentIndex]! + const paintAdvance = kinds[segmentIndex] === 'tab' ? segmentWidth : lineEndPaintAdvances[segmentIndex]! + pendingBreakSegmentIndex = segmentIndex + 1 + pendingBreakFitWidth = lineW - segmentWidth + fitAdvance + pendingBreakPaintWidth = lineW - segmentWidth + paintAdvance + pendingBreakKind = kinds[segmentIndex]! + } + + function appendBreakableSegmentFrom(segmentIndex: number, startGraphemeIndex: number): InternalLayoutLine | null { + const gWidths = breakableWidths[segmentIndex]! + const gPrefixWidths = breakablePrefixWidths[segmentIndex] ?? null + for (let g = startGraphemeIndex; g < gWidths.length; g++) { + const gw = getBreakableAdvance( + gWidths, + gPrefixWidths, + g, + engineProfile.preferPrefixWidthsForBreakableRuns, + ) + + if (!hasContent) { + startLineAtGrapheme(segmentIndex, g, gw) + continue + } + + if (lineW + gw > maxWidth + lineFitEpsilon) { + return finishLine() + } + + lineW += gw + lineEndSegmentIndex = segmentIndex + lineEndGraphemeIndex = g + 1 + } + + if (hasContent && lineEndSegmentIndex === segmentIndex && lineEndGraphemeIndex === gWidths.length) { + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + } + return null + } + + function maybeFinishAtSoftHyphen(segmentIndex: number): InternalLayoutLine | null { + if (pendingBreakKind !== 'soft-hyphen' || pendingBreakSegmentIndex < 0) return null + + const gWidths = breakableWidths[segmentIndex] ?? null + if (gWidths !== null) { + const fitWidths = engineProfile.preferPrefixWidthsForBreakableRuns + ? breakablePrefixWidths[segmentIndex] ?? gWidths + : gWidths + const usesPrefixWidths = fitWidths !== gWidths + const { fitCount, fittedWidth } = fitSoftHyphenBreak( + fitWidths, + lineW, + maxWidth, + lineFitEpsilon, + discretionaryHyphenWidth, + usesPrefixWidths, + ) + + if (fitCount === gWidths.length) { + lineW = fittedWidth + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + clearPendingBreak() + return null + } + + if (fitCount > 0) { + return finishLine( + segmentIndex, + fitCount, + fittedWidth + discretionaryHyphenWidth, + ) + } + } + + if (pendingBreakFitWidth <= maxWidth + lineFitEpsilon) { + return finishLine(pendingBreakSegmentIndex, 0, pendingBreakPaintWidth) + } + + return null + } + + for (let i = normalizedStart.segmentIndex; i < chunk.endSegmentIndex; i++) { + const kind = kinds[i]! + const startGraphemeIndex = i === normalizedStart.segmentIndex ? normalizedStart.graphemeIndex : 0 + const w = kind === 'tab' ? getTabAdvance(lineW, tabStopAdvance) : widths[i]! + + if (kind === 'soft-hyphen' && startGraphemeIndex === 0) { + if (hasContent) { + lineEndSegmentIndex = i + 1 + lineEndGraphemeIndex = 0 + pendingBreakSegmentIndex = i + 1 + pendingBreakFitWidth = lineW + discretionaryHyphenWidth + pendingBreakPaintWidth = lineW + discretionaryHyphenWidth + pendingBreakKind = kind + } + continue + } + + if (!hasContent) { + if (startGraphemeIndex > 0) { + const line = appendBreakableSegmentFrom(i, startGraphemeIndex) + if (line !== null) return line + } else if (w > maxWidth && breakableWidths[i] !== null) { + const line = appendBreakableSegmentFrom(i, 0) + if (line !== null) return line + } else { + startLineAtSegment(i, w) + } + updatePendingBreakForWholeSegment(i, w) + continue + } + + const newW = lineW + w + if (newW > maxWidth + lineFitEpsilon) { + const currentBreakFitWidth = lineW + (kind === 'tab' ? 0 : lineEndFitAdvances[i]!) + const currentBreakPaintWidth = lineW + (kind === 'tab' ? w : lineEndPaintAdvances[i]!) + + if ( + pendingBreakKind === 'soft-hyphen' && + engineProfile.preferEarlySoftHyphenBreak && + pendingBreakFitWidth <= maxWidth + lineFitEpsilon + ) { + return finishLine(pendingBreakSegmentIndex, 0, pendingBreakPaintWidth) + } + + const softBreakLine = maybeFinishAtSoftHyphen(i) + if (softBreakLine !== null) return softBreakLine + + if (canBreakAfter(kind) && currentBreakFitWidth <= maxWidth + lineFitEpsilon) { + appendWholeSegment(i, w) + return finishLine(i + 1, 0, currentBreakPaintWidth) + } + + if (pendingBreakSegmentIndex >= 0 && pendingBreakFitWidth <= maxWidth + lineFitEpsilon) { + return finishLine(pendingBreakSegmentIndex, 0, pendingBreakPaintWidth) + } + + if (w > maxWidth && breakableWidths[i] !== null) { + const currentLine = finishLine() + if (currentLine !== null) return currentLine + const line = appendBreakableSegmentFrom(i, 0) + if (line !== null) return line + } + + return finishLine() + } + + appendWholeSegment(i, w) + updatePendingBreakForWholeSegment(i, w) + } + + if (pendingBreakSegmentIndex === chunk.consumedEndSegmentIndex && lineEndGraphemeIndex === 0) { + return finishLine(chunk.consumedEndSegmentIndex, 0, pendingBreakPaintWidth) + } + + return finishLine(chunk.consumedEndSegmentIndex, 0, lineW) +} + +function layoutNextLineRangeSimple( + prepared: PreparedLineBreakData, + normalizedStart: LineBreakCursor, + maxWidth: number, +): InternalLayoutLine | null { + const { widths, kinds, breakableWidths, breakablePrefixWidths } = prepared + const engineProfile = getEngineProfile() + const lineFitEpsilon = engineProfile.lineFitEpsilon + + let lineW = 0 + let hasContent = false + const lineStartSegmentIndex = normalizedStart.segmentIndex + const lineStartGraphemeIndex = normalizedStart.graphemeIndex + let lineEndSegmentIndex = lineStartSegmentIndex + let lineEndGraphemeIndex = lineStartGraphemeIndex + let pendingBreakSegmentIndex = -1 + let pendingBreakPaintWidth = 0 + + function finishLine( + endSegmentIndex = lineEndSegmentIndex, + endGraphemeIndex = lineEndGraphemeIndex, + width = lineW, + ): InternalLayoutLine | null { + if (!hasContent) return null + + return { + startSegmentIndex: lineStartSegmentIndex, + startGraphemeIndex: lineStartGraphemeIndex, + endSegmentIndex, + endGraphemeIndex, + width, + } + } + + function startLineAtSegment(segmentIndex: number, width: number): void { + hasContent = true + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + lineW = width + } + + function startLineAtGrapheme(segmentIndex: number, graphemeIndex: number, width: number): void { + hasContent = true + lineEndSegmentIndex = segmentIndex + lineEndGraphemeIndex = graphemeIndex + 1 + lineW = width + } + + function appendWholeSegment(segmentIndex: number, width: number): void { + if (!hasContent) { + startLineAtSegment(segmentIndex, width) + return + } + lineW += width + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + } + + function updatePendingBreak(segmentIndex: number, segmentWidth: number): void { + if (!canBreakAfter(kinds[segmentIndex]!)) return + pendingBreakSegmentIndex = segmentIndex + 1 + pendingBreakPaintWidth = lineW - segmentWidth + } + + function appendBreakableSegmentFrom(segmentIndex: number, startGraphemeIndex: number): InternalLayoutLine | null { + const gWidths = breakableWidths[segmentIndex]! + const gPrefixWidths = breakablePrefixWidths[segmentIndex] ?? null + for (let g = startGraphemeIndex; g < gWidths.length; g++) { + const gw = getBreakableAdvance( + gWidths, + gPrefixWidths, + g, + engineProfile.preferPrefixWidthsForBreakableRuns, + ) + + if (!hasContent) { + startLineAtGrapheme(segmentIndex, g, gw) + continue + } + + if (lineW + gw > maxWidth + lineFitEpsilon) { + return finishLine() + } + + lineW += gw + lineEndSegmentIndex = segmentIndex + lineEndGraphemeIndex = g + 1 + } + + if (hasContent && lineEndSegmentIndex === segmentIndex && lineEndGraphemeIndex === gWidths.length) { + lineEndSegmentIndex = segmentIndex + 1 + lineEndGraphemeIndex = 0 + } + return null + } + + for (let i = normalizedStart.segmentIndex; i < widths.length; i++) { + const w = widths[i]! + const kind = kinds[i]! + const startGraphemeIndex = i === normalizedStart.segmentIndex ? normalizedStart.graphemeIndex : 0 + + if (!hasContent) { + if (startGraphemeIndex > 0) { + const line = appendBreakableSegmentFrom(i, startGraphemeIndex) + if (line !== null) return line + } else if (w > maxWidth && breakableWidths[i] !== null) { + const line = appendBreakableSegmentFrom(i, 0) + if (line !== null) return line + } else { + startLineAtSegment(i, w) + } + updatePendingBreak(i, w) + continue + } + + const newW = lineW + w + if (newW > maxWidth + lineFitEpsilon) { + if (canBreakAfter(kind)) { + appendWholeSegment(i, w) + return finishLine(i + 1, 0, lineW - w) + } + + if (pendingBreakSegmentIndex >= 0) { + return finishLine(pendingBreakSegmentIndex, 0, pendingBreakPaintWidth) + } + + if (w > maxWidth && breakableWidths[i] !== null) { + const currentLine = finishLine() + if (currentLine !== null) return currentLine + const line = appendBreakableSegmentFrom(i, 0) + if (line !== null) return line + } + + return finishLine() + } + + appendWholeSegment(i, w) + updatePendingBreak(i, w) + } + + return finishLine() +} diff --git a/src/compat/measurement.ts b/src/compat/measurement.ts new file mode 100644 index 00000000..4ec39b56 --- /dev/null +++ b/src/compat/measurement.ts @@ -0,0 +1,255 @@ +import { isCJK } from './analysis.js' + +export type SegmentMetrics = { + width: number + containsCJK: boolean + emojiCount?: number + graphemeWidths?: number[] | null + graphemePrefixWidths?: number[] | null +} + +export type EngineProfile = { + lineFitEpsilon: number + carryCJKAfterClosingQuote: boolean + preferPrefixWidthsForBreakableRuns: boolean + preferEarlySoftHyphenBreak: boolean +} + +let measureContext: CanvasRenderingContext2D | OffscreenCanvasRenderingContext2D | null = null +const segmentMetricCaches = new Map>() +let cachedEngineProfile: EngineProfile | null = null + +const maybeEmojiFallbackRe = /[\xA9\xAE\u203C\u2049\u20E3\u2122\u2139\u2194-\u2199\u21A9\u21AA\u231A\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA\u25AB\u25B6\u25C0\u25FB-\u25FE\u2600-\u2604\u260E\u2611\u2614\u2615\u2618\u261D\u2620\u2622\u2623\u2626\u262A\u262E\u262F\u2638-\u263A\u2640\u2642\u2648-\u2653\u265F\u2660\u2663\u2665\u2666\u2668\u267B\u267E\u267F\u2692-\u2697\u2699\u269B\u269C\u26A0\u26A1\u26A7\u26AA\u26AB\u26B0\u26B1\u26BD\u26BE\u26C4\u26C5\u26C8\u26CE\u26CF\u26D1\u26D3\u26D4\u26E9\u26EA\u26F0-\u26F5\u26F7-\u26FA\u26FD\u2702\u2705\u2708-\u270D\u270F\u2712\u2714\u2716\u271D\u2721\u2728\u2733\u2734\u2744\u2747\u274C\u274E\u2753-\u2755\u2757\u2763\u2764\u2795-\u2797\u27A1\u27B0\u27BF\u2934\u2935\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299\uFE0F\u{1F004}\u{1F02C}-\u{1F02F}\u{1F094}-\u{1F09F}\u{1F0AF}\u{1F0B0}\u{1F0C0}\u{1F0CF}\u{1F0D0}\u{1F0F6}-\u{1F0FF}\u{1F170}\u{1F171}\u{1F17E}\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1AE}-\u{1F1FF}\u{1F201}-\u{1F20F}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F23C}-\u{1F23F}\u{1F249}-\u{1F25F}\u{1F266}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F8}-\u{1F4FD}\u{1F3F7}-\u{1F3FA}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}\u{1F596}\u{1F5A4}\u{1F5A5}\u{1F5A8}\u{1F5B1}\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6D5}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6F0}\u{1F6F3}-\u{1F6FF}\u{1F7DA}-\u{1F7FF}\u{1F80C}-\u{1F80F}\u{1F848}-\u{1F84F}\u{1F85A}-\u{1F85F}\u{1F888}-\u{1F88F}\u{1F8AE}\u{1F8AF}\u{1F8BC}-\u{1F8BF}\u{1F8C2}-\u{1F8CF}\u{1F8D9}-\u{1F8FF}\u{1F90C}-\u{1F93A}\u{1F93C}-\u{1F945}\u{1F947}-\u{1F9FF}\u{1FA58}-\u{1FA5F}\u{1FA6E}-\u{1FAFF}\u{1FC00}-\u{1FFFD}]/u +let emojiPresentationRe: RegExp | null = null +let maybeEmojiRe: RegExp | null = null +let sharedGraphemeSegmenter: Intl.Segmenter | null = null +const emojiCorrectionCache = new Map() + +function getEmojiPresentationRe(): RegExp { + if (emojiPresentationRe !== null) return emojiPresentationRe + try { + emojiPresentationRe = new RegExp('\\p{Emoji_Presentation}', 'u') + } catch { + emojiPresentationRe = maybeEmojiFallbackRe + } + return emojiPresentationRe +} + +function getMaybeEmojiRe(): RegExp { + if (maybeEmojiRe !== null) return maybeEmojiRe + try { + maybeEmojiRe = new RegExp( + '[\\p{Emoji_Presentation}\\p{Extended_Pictographic}\\p{Regional_Indicator}\\uFE0F\\u20E3]', + 'u', + ) + } catch { + maybeEmojiRe = maybeEmojiFallbackRe + } + return maybeEmojiRe +} + +export function getMeasureContext(): CanvasRenderingContext2D | OffscreenCanvasRenderingContext2D { + if (measureContext !== null) return measureContext + + if (typeof OffscreenCanvas !== 'undefined') { + measureContext = new OffscreenCanvas(1, 1).getContext('2d')! + return measureContext + } + + if (typeof document !== 'undefined') { + measureContext = document.createElement('canvas').getContext('2d')! + return measureContext + } + + throw new Error('Text measurement requires OffscreenCanvas or a DOM canvas context.') +} + +export function getSegmentMetricCache(font: string): Map { + let cache = segmentMetricCaches.get(font) + if (!cache) { + cache = new Map() + segmentMetricCaches.set(font, cache) + } + return cache +} + +export function getSegmentMetrics(seg: string, cache: Map): SegmentMetrics { + let metrics = cache.get(seg) + if (metrics === undefined) { + const ctx = getMeasureContext() + metrics = { + width: ctx.measureText(seg).width, + containsCJK: isCJK(seg), + } + cache.set(seg, metrics) + } + return metrics +} + +export function getEngineProfile(): EngineProfile { + if (cachedEngineProfile !== null) return cachedEngineProfile + + if (typeof navigator === 'undefined') { + cachedEngineProfile = { + lineFitEpsilon: 0.005, + carryCJKAfterClosingQuote: false, + preferPrefixWidthsForBreakableRuns: false, + preferEarlySoftHyphenBreak: false, + } + return cachedEngineProfile + } + + const ua = navigator.userAgent + const vendor = navigator.vendor + const isSafari = + vendor === 'Apple Computer, Inc.' && + ua.includes('Safari/') && + !ua.includes('Chrome/') && + !ua.includes('Chromium/') && + !ua.includes('CriOS/') && + !ua.includes('FxiOS/') && + !ua.includes('EdgiOS/') + const isChromium = + ua.includes('Chrome/') || + ua.includes('Chromium/') || + ua.includes('CriOS/') || + ua.includes('Edg/') + + cachedEngineProfile = { + lineFitEpsilon: isSafari ? 1 / 64 : 0.005, + carryCJKAfterClosingQuote: isChromium, + preferPrefixWidthsForBreakableRuns: isSafari, + preferEarlySoftHyphenBreak: isSafari, + } + return cachedEngineProfile +} + +export function parseFontSize(font: string): number { + const m = font.match(/(\d+(?:\.\d+)?)\s*px/) + return m ? parseFloat(m[1]!) : 16 +} + +function getSharedGraphemeSegmenter(): Intl.Segmenter { + if (sharedGraphemeSegmenter === null) { + sharedGraphemeSegmenter = new Intl.Segmenter(undefined, { granularity: 'grapheme' }) + } + return sharedGraphemeSegmenter +} + +function isEmojiGrapheme(g: string): boolean { + return getEmojiPresentationRe().test(g) || g.includes('\uFE0F') +} + +export function textMayContainEmoji(text: string): boolean { + return getMaybeEmojiRe().test(text) +} + +function getEmojiCorrection(font: string, fontSize: number): number { + let correction = emojiCorrectionCache.get(font) + if (correction !== undefined) return correction + + const ctx = getMeasureContext() + ctx.font = font + const canvasW = ctx.measureText('\u{1F600}').width + correction = 0 + if ( + canvasW > fontSize + 0.5 && + typeof document !== 'undefined' && + document.body !== null + ) { + const span = document.createElement('span') + span.style.font = font + span.style.display = 'inline-block' + span.style.visibility = 'hidden' + span.style.position = 'absolute' + span.textContent = '\u{1F600}' + document.body.appendChild(span) + const domW = span.getBoundingClientRect().width + document.body.removeChild(span) + if (canvasW - domW > 0.5) { + correction = canvasW - domW + } + } + emojiCorrectionCache.set(font, correction) + return correction +} + +function countEmojiGraphemes(text: string): number { + let count = 0 + const graphemeSegmenter = getSharedGraphemeSegmenter() + for (const g of graphemeSegmenter.segment(text)) { + if (isEmojiGrapheme(g.segment)) count++ + } + return count +} + +function getEmojiCount(seg: string, metrics: SegmentMetrics): number { + if (metrics.emojiCount === undefined) { + metrics.emojiCount = countEmojiGraphemes(seg) + } + return metrics.emojiCount +} + +export function getCorrectedSegmentWidth(seg: string, metrics: SegmentMetrics, emojiCorrection: number): number { + if (emojiCorrection === 0) return metrics.width + return metrics.width - getEmojiCount(seg, metrics) * emojiCorrection +} + +export function getSegmentGraphemeWidths( + seg: string, + metrics: SegmentMetrics, + cache: Map, + emojiCorrection: number, +): number[] | null { + if (metrics.graphemeWidths !== undefined) return metrics.graphemeWidths + + const widths: number[] = [] + const graphemeSegmenter = getSharedGraphemeSegmenter() + for (const gs of graphemeSegmenter.segment(seg)) { + const graphemeMetrics = getSegmentMetrics(gs.segment, cache) + widths.push(getCorrectedSegmentWidth(gs.segment, graphemeMetrics, emojiCorrection)) + } + + metrics.graphemeWidths = widths.length > 1 ? widths : null + return metrics.graphemeWidths +} + +export function getSegmentGraphemePrefixWidths( + seg: string, + metrics: SegmentMetrics, + cache: Map, + emojiCorrection: number, +): number[] | null { + if (metrics.graphemePrefixWidths !== undefined) return metrics.graphemePrefixWidths + + const prefixWidths: number[] = [] + const graphemeSegmenter = getSharedGraphemeSegmenter() + let prefix = '' + for (const gs of graphemeSegmenter.segment(seg)) { + prefix += gs.segment + const prefixMetrics = getSegmentMetrics(prefix, cache) + prefixWidths.push(getCorrectedSegmentWidth(prefix, prefixMetrics, emojiCorrection)) + } + + metrics.graphemePrefixWidths = prefixWidths.length > 1 ? prefixWidths : null + return metrics.graphemePrefixWidths +} + +export function getFontMeasurementState(font: string, needsEmojiCorrection: boolean): { + cache: Map + fontSize: number + emojiCorrection: number +} { + const ctx = getMeasureContext() + ctx.font = font + const cache = getSegmentMetricCache(font) + const fontSize = parseFontSize(font) + const emojiCorrection = needsEmojiCorrection ? getEmojiCorrection(font, fontSize) : 0 + return { cache, fontSize, emojiCorrection } +} + +export function clearMeasurementCaches(): void { + segmentMetricCaches.clear() + emojiCorrectionCache.clear() + sharedGraphemeSegmenter = null +} diff --git a/src/layout.test.ts b/src/layout.test.ts index 3b5d01bb..69af9e4e 100644 --- a/src/layout.test.ts +++ b/src/layout.test.ts @@ -119,6 +119,43 @@ beforeEach(() => { }) describe('prepare invariants', () => { + test('package exposes a compat entrypoint with the layout API', async () => { + const pkg = await Bun.file(new URL('../package.json', import.meta.url)).json() as { + exports?: Record + } + + expect(pkg.exports?.['./compat']).toBeDefined() + + const compatMod = await import('./compat/layout.ts') + expect(typeof compatMod.prepare).toBe('function') + expect(typeof compatMod.prepareWithSegments).toBe('function') + expect(typeof compatMod.layout).toBe('function') + expect(typeof compatMod.layoutWithLines).toBe('function') + expect(typeof compatMod.layoutNextLine).toBe('function') + expect(typeof compatMod.walkLineRanges).toBe('function') + expect(typeof compatMod.clearCache).toBe('function') + expect(typeof compatMod.setLocale).toBe('function') + }) + + test('compat entrypoint matches the default layout behavior for representative text', async () => { + const compatMod = await import('./compat/layout.ts') + const text = 'Hello مرحبا world 👋' + const maxWidth = 90 + + const prepared = prepareWithSegments(text, FONT) + const compatPrepared = compatMod.prepareWithSegments(text, FONT) + + expect(compatPrepared.segments).toEqual(prepared.segments) + expect(compatPrepared.kinds).toEqual(prepared.kinds) + expect(compatPrepared.segLevels).toEqual(prepared.segLevels) + expect(compatMod.layout(compatPrepared, maxWidth, LINE_HEIGHT)).toEqual( + layout(prepared, maxWidth, LINE_HEIGHT), + ) + expect(compatMod.layoutWithLines(compatPrepared, maxWidth, LINE_HEIGHT)).toEqual( + layoutWithLines(prepared, maxWidth, LINE_HEIGHT), + ) + }) + test('whitespace-only input stays empty', () => { const prepared = prepare(' \t\n ', FONT) expect(layout(prepared, 200, LINE_HEIGHT)).toEqual({ lineCount: 0, height: 0 }) diff --git a/src/measurement.ts b/src/measurement.ts index 4ec39b56..b2fb6d57 100644 --- a/src/measurement.ts +++ b/src/measurement.ts @@ -19,35 +19,11 @@ let measureContext: CanvasRenderingContext2D | OffscreenCanvasRenderingContext2D const segmentMetricCaches = new Map>() let cachedEngineProfile: EngineProfile | null = null -const maybeEmojiFallbackRe = /[\xA9\xAE\u203C\u2049\u20E3\u2122\u2139\u2194-\u2199\u21A9\u21AA\u231A\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA\u25AB\u25B6\u25C0\u25FB-\u25FE\u2600-\u2604\u260E\u2611\u2614\u2615\u2618\u261D\u2620\u2622\u2623\u2626\u262A\u262E\u262F\u2638-\u263A\u2640\u2642\u2648-\u2653\u265F\u2660\u2663\u2665\u2666\u2668\u267B\u267E\u267F\u2692-\u2697\u2699\u269B\u269C\u26A0\u26A1\u26A7\u26AA\u26AB\u26B0\u26B1\u26BD\u26BE\u26C4\u26C5\u26C8\u26CE\u26CF\u26D1\u26D3\u26D4\u26E9\u26EA\u26F0-\u26F5\u26F7-\u26FA\u26FD\u2702\u2705\u2708-\u270D\u270F\u2712\u2714\u2716\u271D\u2721\u2728\u2733\u2734\u2744\u2747\u274C\u274E\u2753-\u2755\u2757\u2763\u2764\u2795-\u2797\u27A1\u27B0\u27BF\u2934\u2935\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299\uFE0F\u{1F004}\u{1F02C}-\u{1F02F}\u{1F094}-\u{1F09F}\u{1F0AF}\u{1F0B0}\u{1F0C0}\u{1F0CF}\u{1F0D0}\u{1F0F6}-\u{1F0FF}\u{1F170}\u{1F171}\u{1F17E}\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1AE}-\u{1F1FF}\u{1F201}-\u{1F20F}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F23C}-\u{1F23F}\u{1F249}-\u{1F25F}\u{1F266}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F8}-\u{1F4FD}\u{1F3F7}-\u{1F3FA}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}\u{1F596}\u{1F5A4}\u{1F5A5}\u{1F5A8}\u{1F5B1}\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6D5}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6F0}\u{1F6F3}-\u{1F6FF}\u{1F7DA}-\u{1F7FF}\u{1F80C}-\u{1F80F}\u{1F848}-\u{1F84F}\u{1F85A}-\u{1F85F}\u{1F888}-\u{1F88F}\u{1F8AE}\u{1F8AF}\u{1F8BC}-\u{1F8BF}\u{1F8C2}-\u{1F8CF}\u{1F8D9}-\u{1F8FF}\u{1F90C}-\u{1F93A}\u{1F93C}-\u{1F945}\u{1F947}-\u{1F9FF}\u{1FA58}-\u{1FA5F}\u{1FA6E}-\u{1FAFF}\u{1FC00}-\u{1FFFD}]/u -let emojiPresentationRe: RegExp | null = null -let maybeEmojiRe: RegExp | null = null +const emojiPresentationRe = /\p{Emoji_Presentation}/u +const maybeEmojiRe = /[\p{Emoji_Presentation}\p{Extended_Pictographic}\p{Regional_Indicator}\uFE0F\u20E3]/u let sharedGraphemeSegmenter: Intl.Segmenter | null = null const emojiCorrectionCache = new Map() -function getEmojiPresentationRe(): RegExp { - if (emojiPresentationRe !== null) return emojiPresentationRe - try { - emojiPresentationRe = new RegExp('\\p{Emoji_Presentation}', 'u') - } catch { - emojiPresentationRe = maybeEmojiFallbackRe - } - return emojiPresentationRe -} - -function getMaybeEmojiRe(): RegExp { - if (maybeEmojiRe !== null) return maybeEmojiRe - try { - maybeEmojiRe = new RegExp( - '[\\p{Emoji_Presentation}\\p{Extended_Pictographic}\\p{Regional_Indicator}\\uFE0F\\u20E3]', - 'u', - ) - } catch { - maybeEmojiRe = maybeEmojiFallbackRe - } - return maybeEmojiRe -} - export function getMeasureContext(): CanvasRenderingContext2D | OffscreenCanvasRenderingContext2D { if (measureContext !== null) return measureContext @@ -137,11 +113,11 @@ function getSharedGraphemeSegmenter(): Intl.Segmenter { } function isEmojiGrapheme(g: string): boolean { - return getEmojiPresentationRe().test(g) || g.includes('\uFE0F') + return emojiPresentationRe.test(g) || g.includes('\uFE0F') } export function textMayContainEmoji(text: string): boolean { - return getMaybeEmojiRe().test(text) + return maybeEmojiRe.test(text) } function getEmojiCorrection(font: string, fontSize: number): number {