diff --git a/README.md b/README.md index ee00d5cf..f266bef5 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,8 @@ const { lines } = layoutWithLines(prepared, 320, 26) // 320px max width, 26px li for (let i = 0; i < lines.length; i++) ctx.fillText(lines[i].text, 0, i * 26) ``` +Each rich line also carries `sourceOffset` / `sourceLength`, so callers can map a rendered line back to the original input without reconstructing whitespace normalization themselves. + - `walkLineRanges()` gives you line widths and cursors without building the text strings: ```ts @@ -88,6 +90,16 @@ while (true) { } ``` +- `cursorToSourceOffset()` / `cursorRangeToSourceSpan()` convert rich cursors back into original-source offsets: + +```ts +const line = layoutNextLine(prepared, cursor, width) +if (line) { + const start = cursorToSourceOffset(prepared, line.start) + const { sourceOffset, sourceLength } = cursorRangeToSourceSpan(prepared, line.start, line.end) +} +``` + This usage allows rendering to canvas, SVG, WebGL and (eventually) server-side. ### API Glossary @@ -109,6 +121,8 @@ type LayoutLine = { width: number // Measured width of this line, e.g. 87.5 start: LayoutCursor // Inclusive start cursor in prepared segments/graphemes end: LayoutCursor // Exclusive end cursor in prepared segments/graphemes + sourceOffset: number // Source offset in the original input covered by this line + sourceLength: number // Source span length in the original input covered by this line } type LayoutLineRange = { width: number // Measured width of this line, e.g. 87.5 @@ -125,6 +139,8 @@ Other helpers: ```ts clearCache(): void // clears Pretext's shared internal caches used by prepare() and prepareWithSegments(). Useful if your app cycles through many different fonts or text variants and you want to release the accumulated cache setLocale(locale?: string): void // optional (by default we use the current locale). Sets locale for future prepare() and prepareWithSegments(). 
/**
 * Normalized text paired with a lookup from normalized offsets back to
 * offsets in the original input.
 */
type NormalizedTextWithSourceBoundaries = {
  text: string
  // sourceBoundaries[k] is the source offset corresponding to normalized
  // offset k. Every producer below emits text.length + 1 entries, so both the
  // start and the (exclusive) end of any normalized range can be looked up.
  sourceBoundaries: number[]
}

/** True for the characters collapsed in `normal` mode: space, tab, LF, CR, FF. */
function isCollapsibleWhitespaceChar(ch: string): boolean {
  return ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r' || ch === '\f'
}

/**
 * `white-space: normal` normalization that also records source boundaries.
 *
 * Leading and trailing whitespace runs are dropped and every interior run is
 * replaced by a single space. The boundary recorded after a collapsed space is
 * the source offset just past the whole run, so the space's source span covers
 * the entire run it replaced.
 *
 * @param text Original input text.
 * @returns The normalized text and one source boundary per normalized offset
 *          (including the end offset).
 */
function normalizeWhitespaceNormalWithSourceBoundaries(text: string): NormalizedTextWithSourceBoundaries {
  // Fast path: when the shared pre-check does not match, the text is returned
  // unchanged and every offset maps to itself.
  if (!needsWhitespaceNormalizationRe.test(text)) {
    return {
      text,
      sourceBoundaries: Array.from({ length: text.length + 1 }, (_, offset) => offset),
    }
  }

  const pieces: string[] = []
  const sourceBoundaries: number[] = []
  let i = 0

  // Skip the leading run; normalized offset 0 maps to the first kept character.
  while (i < text.length && isCollapsibleWhitespaceChar(text[i]!)) i++
  sourceBoundaries.push(i)

  while (i < text.length) {
    const ch = text[i]!
    if (!isCollapsibleWhitespaceChar(ch)) {
      pieces.push(ch)
      i++
      sourceBoundaries.push(i)
      continue
    }

    // Consume the whole run before deciding whether it survives.
    while (i < text.length && isCollapsibleWhitespaceChar(text[i]!)) i++
    // A run that reaches the end of the input is trailing whitespace: emit nothing.
    if (i >= text.length) break
    pieces.push(' ')
    sourceBoundaries.push(i)
  }

  return { text: pieces.join(''), sourceBoundaries }
}

/**
 * `white-space: pre-wrap` normalization that also records source boundaries.
 *
 * Delegates to `normalizeWhitespacePreWrap` so the text produced here cannot
 * drift from the plain path, and derives the boundaries from the original
 * (un-normalized) input.
 *
 * @param text Original input text.
 */
function normalizeWhitespacePreWrapWithSourceBoundaries(text: string): NormalizedTextWithSourceBoundaries {
  return {
    text: normalizeWhitespacePreWrap(text),
    sourceBoundaries: buildPreWrapSourceBoundaries(text),
  }
}

/**
 * Converts CRLF, lone CR and FF to `\n`; all other characters are kept as-is.
 *
 * @param text Original input text.
 */
function normalizeWhitespacePreWrap(text: string): string {
  // Without '\r' or '\f' there is nothing to rewrite (a CRLF always contains '\r').
  if (!/[\r\f]/.test(text)) return text
  return text
    .replace(/\r\n/g, '\n')
    .replace(/[\r\f]/g, '\n')
}

/**
 * Builds the normalized-offset -> source-offset table for pre-wrap text.
 *
 * Each source code unit contributes one boundary, except a CRLF pair, which
 * becomes a single normalized `\n` and therefore contributes one boundary that
 * advances two source units.
 *
 * @param text Original (un-normalized) input text.
 * @returns Boundaries starting at 0 and ending at `text.length`.
 */
function buildPreWrapSourceBoundaries(text: string): number[] {
  const sourceBoundaries = [0]
  let i = 0

  while (i < text.length) {
    // Reading one past the end yields undefined, which never equals '\n'.
    i += text[i] === '\r' && text[i + 1] === '\n' ? 2 : 1
    sourceBoundaries.push(i)
  }

  return sourceBoundaries
}
normalizeWhitespacePreWrap(text) - : normalizeWhitespaceNormal(text) + const normalizedResult = includeSourceBoundaries + ? (whiteSpaceProfile.mode === 'pre-wrap' + ? normalizeWhitespacePreWrapWithSourceBoundaries(text) + : normalizeWhitespaceNormalWithSourceBoundaries(text)) + : null + const normalized = normalizedResult + ? normalizedResult.text + : (whiteSpaceProfile.mode === 'pre-wrap' + ? normalizeWhitespacePreWrap(text) + : normalizeWhitespaceNormal(text)) + const sourceBoundariesPayload = normalizedResult + ? { sourceBoundaries: normalizedResult.sourceBoundaries } + : {} if (normalized.length === 0) { return { normalized, @@ -996,12 +1094,14 @@ export function analyzeText( isWordLike: [], kinds: [], starts: [], + ...sourceBoundariesPayload, } } const segmentation = buildMergedSegmentation(normalized, profile, whiteSpaceProfile) return { normalized, chunks: compileAnalysisChunks(segmentation, whiteSpaceProfile), + ...sourceBoundariesPayload, ...segmentation, } } diff --git a/src/layout.test.ts b/src/layout.test.ts index 3b5d01bb..2129a2eb 100644 --- a/src/layout.test.ts +++ b/src/layout.test.ts @@ -17,6 +17,8 @@ let layout: LayoutModule['layout'] let layoutWithLines: LayoutModule['layoutWithLines'] let layoutNextLine: LayoutModule['layoutNextLine'] let walkLineRanges: LayoutModule['walkLineRanges'] +let cursorToSourceOffset: LayoutModule['cursorToSourceOffset'] +let cursorRangeToSourceSpan: LayoutModule['cursorRangeToSourceSpan'] let clearCache: LayoutModule['clearCache'] let setLocale: LayoutModule['setLocale'] let countPreparedLines: LineBreakModule['countPreparedLines'] @@ -107,6 +109,8 @@ beforeAll(async () => { layoutWithLines, layoutNextLine, walkLineRanges, + cursorToSourceOffset, + cursorRangeToSourceSpan, clearCache, setLocale, } = mod) @@ -127,6 +131,8 @@ describe('prepare invariants', () => { test('collapses ordinary whitespace runs and trims the edges', () => { const prepared = prepareWithSegments(' Hello\t \n World ', FONT) 
expect(prepared.segments).toEqual(['Hello', ' ', 'World']) + expect(prepared.segmentSourceOffsets).toEqual([2, 7, 12]) + expect(prepared.segmentSourceLengths).toEqual([5, 5, 5]) }) test('pre-wrap mode keeps ordinary spaces instead of collapsing them', () => { @@ -147,6 +153,11 @@ describe('prepare invariants', () => { expect(prepared.kinds).toEqual(['text', 'hard-break', 'text']) }) + test('pre-wrap mode also normalizes CRLF on the simple prepare path', () => { + const prepared = prepare('Hello\r\nWorld', FONT, { whiteSpace: 'pre-wrap' }) + expect(layout(prepared, 200, LINE_HEIGHT).lineCount).toBe(2) + }) + test('pre-wrap mode keeps tabs as explicit segments', () => { const prepared = prepareWithSegments('Hello\tWorld', FONT, { whiteSpace: 'pre-wrap' }) expect(prepared.segments).toEqual(['Hello', '\t', 'World']) @@ -413,6 +424,8 @@ describe('layout invariants', () => { width: widthOfHello, start: { segmentIndex: 0, graphemeIndex: 0 }, end: { segmentIndex: 1, graphemeIndex: 0 }, + sourceOffset: 0, + sourceLength: 5, }]) }) @@ -430,6 +443,22 @@ describe('layout invariants', () => { expect(rich.lines.map(line => line.text).join('')).toBe('Superlongword') expect(rich.lines[0]!.start).toEqual({ segmentIndex: 0, graphemeIndex: 0 }) expect(rich.lines.at(-1)!.end).toEqual({ segmentIndex: 1, graphemeIndex: 0 }) + expect(cursorToSourceOffset(prepared, rich.lines[0]!.end)).toBe(rich.lines[0]!.sourceOffset + rich.lines[0]!.sourceLength) + }) + + test('rich lines preserve source spans through collapsed whitespace normalization', () => { + const source = ' foo bar ' + const prepared = prepareWithSegments(source, FONT) + const lines = layoutWithLines(prepared, 200, LINE_HEIGHT) + expect(lines.lines).toHaveLength(1) + expect(lines.lines[0]!.text).toBe('foo bar') + expect(lines.lines[0]!.sourceOffset).toBe(2) + expect(lines.lines[0]!.sourceLength).toBe(9) + expect(source.slice(lines.lines[0]!.sourceOffset, lines.lines[0]!.sourceOffset + lines.lines[0]!.sourceLength)).toBe('foo 
bar') + expect(cursorRangeToSourceSpan(prepared, lines.lines[0]!.start, lines.lines[0]!.end)).toEqual({ + sourceOffset: 2, + sourceLength: 9, + }) }) test('mixed-direction text is a stable smoke test', () => { @@ -587,6 +616,7 @@ describe('layout invariants', () => { start: { segmentIndex: number, graphemeIndex: number } end: { segmentIndex: number, graphemeIndex: number } }> = [] + const sourceSpans: Array<{ sourceOffset: number, sourceLength: number }> = [] const lineCount = walkLineRanges(prepared, width, line => { actual.push({ @@ -594,6 +624,7 @@ describe('layout invariants', () => { start: { ...line.start }, end: { ...line.end }, }) + sourceSpans.push(cursorRangeToSourceSpan(prepared, line.start, line.end)) }) expect(lineCount).toBe(expected.lineCount) @@ -602,6 +633,10 @@ describe('layout invariants', () => { start: line.start, end: line.end, }))) + expect(sourceSpans).toEqual(expected.lines.map(line => ({ + sourceOffset: line.sourceOffset, + sourceLength: line.sourceLength, + }))) }) test('countPreparedLines stays aligned with the walked line counter', () => { diff --git a/src/layout.ts b/src/layout.ts index 465a0673..dc8f7b79 100644 --- a/src/layout.ts +++ b/src/layout.ts @@ -106,6 +106,8 @@ type InternalPreparedText = PreparedText & PreparedCore // Treat this as the unstable escape hatch for experiments and custom rendering. export type PreparedTextWithSegments = InternalPreparedText & { segments: string[] // Segment text aligned with the parallel arrays, e.g. ['hello', ' ', 'world'] + segmentSourceOffsets: number[] // Source offset per segment in the original input + segmentSourceLengths: number[] // Source span length per segment in the original input } export type LayoutCursor = { @@ -123,6 +125,8 @@ export type LayoutLine = { width: number // Measured width of this line, e.g. 
87.5 start: LayoutCursor // Inclusive start cursor in prepared segments/graphemes end: LayoutCursor // Exclusive end cursor in prepared segments/graphemes + sourceOffset: number // Source offset of the covered text in the original input + sourceLength: number // Source span length of the covered text in the original input } export type LayoutLineRange = { @@ -171,6 +175,8 @@ function createEmptyPrepared(includeSegments: boolean): InternalPreparedText | P tabStopAdvance: 0, chunks: [], segments: [], + segmentSourceOffsets: [], + segmentSourceLengths: [], } as unknown as PreparedTextWithSegments } return { @@ -214,6 +220,9 @@ function measureAnalysis( const breakableWidths: (number[] | null)[] = [] const breakablePrefixWidths: (number[] | null)[] = [] const segments = includeSegments ? [] as string[] : null + const segmentSourceOffsets = includeSegments ? [] as number[] : null + const segmentSourceLengths = includeSegments ? [] as number[] : null + const sourceBoundaries = includeSegments ? analysis.sourceBoundaries ?? null : null const preparedStartByAnalysisIndex = Array.from({ length: analysis.len }) const preparedEndByAnalysisIndex = Array.from({ length: analysis.len }) @@ -224,6 +233,8 @@ function measureAnalysis( lineEndPaintAdvance: number, kind: SegmentBreakKind, start: number, + sourceOffset: number, + sourceLength: number, breakable: number[] | null, breakablePrefix: number[] | null, ): void { @@ -238,6 +249,10 @@ function measureAnalysis( breakableWidths.push(breakable) breakablePrefixWidths.push(breakablePrefix) if (segments !== null) segments.push(text) + if (segmentSourceOffsets !== null && segmentSourceLengths !== null) { + segmentSourceOffsets.push(sourceOffset) + segmentSourceLengths.push(sourceLength) + } } for (let mi = 0; mi < analysis.len; mi++) { @@ -246,6 +261,9 @@ function measureAnalysis( const segWordLike = analysis.isWordLike[mi]! const segKind = analysis.kinds[mi]! const segStart = analysis.starts[mi]! 
+ const segSourceOffset = sourceBoundaries?.[segStart] ?? segStart + const segSourceEnd = sourceBoundaries?.[segStart + segText.length] ?? (segStart + segText.length) + const segSourceLength = segSourceEnd - segSourceOffset if (segKind === 'soft-hyphen') { pushMeasuredSegment( @@ -255,6 +273,8 @@ function measureAnalysis( discretionaryHyphenWidth, segKind, segStart, + segSourceOffset, + segSourceLength, null, null, ) @@ -263,13 +283,13 @@ function measureAnalysis( } if (segKind === 'hard-break') { - pushMeasuredSegment(segText, 0, 0, 0, segKind, segStart, null, null) + pushMeasuredSegment(segText, 0, 0, 0, segKind, segStart, segSourceOffset, segSourceLength, null, null) preparedEndByAnalysisIndex[mi] = widths.length continue } if (segKind === 'tab') { - pushMeasuredSegment(segText, 0, 0, 0, segKind, segStart, null, null) + pushMeasuredSegment(segText, 0, 0, 0, segKind, segStart, segSourceOffset, segSourceLength, null, null) preparedEndByAnalysisIndex[mi] = widths.length continue } @@ -303,7 +323,9 @@ function measureAnalysis( const unitMetrics = getSegmentMetrics(unitText, cache) const w = getCorrectedSegmentWidth(unitText, unitMetrics, emojiCorrection) - pushMeasuredSegment(unitText, w, w, w, 'text', segStart + unitStart, null, null) + const unitSourceOffset = sourceBoundaries?.[segStart + unitStart] ?? (segStart + unitStart) + const unitSourceEnd = sourceBoundaries?.[segStart + unitStart + unitText.length] ?? 
(segStart + unitStart + unitText.length) + pushMeasuredSegment(unitText, w, w, w, 'text', segStart + unitStart, unitSourceOffset, unitSourceEnd - unitSourceOffset, null, null) unitText = grapheme unitStart = gs.index @@ -312,7 +334,9 @@ function measureAnalysis( if (unitText.length > 0) { const unitMetrics = getSegmentMetrics(unitText, cache) const w = getCorrectedSegmentWidth(unitText, unitMetrics, emojiCorrection) - pushMeasuredSegment(unitText, w, w, w, 'text', segStart + unitStart, null, null) + const unitSourceOffset = sourceBoundaries?.[segStart + unitStart] ?? (segStart + unitStart) + const unitSourceEnd = sourceBoundaries?.[segStart + unitStart + unitText.length] ?? (segStart + unitStart + unitText.length) + pushMeasuredSegment(unitText, w, w, w, 'text', segStart + unitStart, unitSourceOffset, unitSourceEnd - unitSourceOffset, null, null) } preparedEndByAnalysisIndex[mi] = widths.length continue @@ -340,6 +364,8 @@ function measureAnalysis( lineEndPaintAdvance, segKind, segStart, + segSourceOffset, + segSourceLength, graphemeWidths, graphemePrefixWidths, ) @@ -351,6 +377,8 @@ function measureAnalysis( lineEndPaintAdvance, segKind, segStart, + segSourceOffset, + segSourceLength, null, null, ) @@ -374,6 +402,8 @@ function measureAnalysis( tabStopAdvance, chunks, segments, + segmentSourceOffsets, + segmentSourceLengths, } as unknown as PreparedTextWithSegments } return { @@ -427,7 +457,7 @@ function prepareInternal( includeSegments: boolean, options?: PrepareOptions, ): InternalPreparedText | PreparedTextWithSegments { - const analysis = analyzeText(text, getEngineProfile(), options?.whiteSpace) + const analysis = analyzeText(text, getEngineProfile(), options?.whiteSpace, includeSegments) return measureAnalysis(analysis, font, includeSegments) } @@ -526,6 +556,55 @@ function getLineTextCache(prepared: PreparedTextWithSegments): Map, + cursor: LayoutCursor, +): number { + if (prepared.segments.length === 0) return 0 + if (cursor.segmentIndex <= 0 && 
/**
 * Resolves two cursors to source offsets and returns the span between them.
 *
 * The span is ordered: when `start` resolves to a later offset than `end`, the
 * two are swapped so `sourceLength` is not negative.
 *
 * @param prepared Prepared text carrying per-segment source offsets.
 * @param cache Grapheme cache forwarded to `cursorToSourceOffsetWithCache`.
 * @param start One edge of the range.
 * @param end The other edge of the range.
 */
function cursorRangeToSourceSpanWithCache(
  prepared: PreparedTextWithSegments,
  // NOTE(review): typed via the helper that produces the cache so the two stay
  // in sync — confirm this matches the annotation used by sibling helpers.
  cache: ReturnType<typeof getLineTextCache>,
  start: LayoutCursor,
  end: LayoutCursor,
): { sourceOffset: number; sourceLength: number } {
  const startOffset = cursorToSourceOffsetWithCache(prepared, cache, start)
  const endOffset = cursorToSourceOffsetWithCache(prepared, cache, end)

  if (startOffset <= endOffset) {
    return { sourceOffset: startOffset, sourceLength: endOffset - startOffset }
  }
  // Cursors arrived in reverse order: report the span from the smaller offset.
  return { sourceOffset: endOffset, sourceLength: startOffset - endOffset }
}

/**
 * Converts a rich cursor back into an offset in the original input text.
 *
 * @param prepared Result of `prepareWithSegments()`.
 * @param cursor Cursor in prepared segments/graphemes.
 * @returns Offset into the original input.
 */
export function cursorToSourceOffset(
  prepared: PreparedTextWithSegments,
  cursor: LayoutCursor,
): number {
  const cache = getLineTextCache(prepared)
  return cursorToSourceOffsetWithCache(prepared, cache, cursor)
}

/**
 * Converts a rich cursor range back into the span of original input it covers.
 *
 * @param prepared Result of `prepareWithSegments()`.
 * @param start Cursor at one edge of the range.
 * @param end Cursor at the other edge of the range.
 * @returns `sourceOffset` and `sourceLength` in the original input.
 */
export function cursorRangeToSourceSpan(
  prepared: PreparedTextWithSegments,
  start: LayoutCursor,
  end: LayoutCursor,
): { sourceOffset: number; sourceLength: number } {
  const cache = getLineTextCache(prepared)
  return cursorRangeToSourceSpanWithCache(prepared, cache, start, end)
}