chenglou · somnai-dreams · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026
diff --git a/README.md b/README.md
@@ -63,6 +63,8 @@ const { lines } = layoutWithLines(prepared, 320, 26) // 320px max width, 26px li
 for (let i = 0; i < lines.length; i++) ctx.fillText(lines[i].text, 0, i * 26)
 ```
 
+Each rich line also carries `sourceOffset` / `sourceLength`, so callers can map a rendered line back to the original input without reconstructing whitespace normalization themselves.
+
 - `walkLineRanges()` gives you line widths and cursors without building the text strings:
 
 ```ts
@@ -88,6 +90,16 @@ while (true) {
 }
 ```
 
+- `cursorToSourceOffset()` / `cursorRangeToSourceSpan()` convert rich cursors back into original-source offsets:
+
+```ts
+const line = layoutNextLine(prepared, cursor, width)
+if (line) {
+  const start = cursorToSourceOffset(prepared, line.start)
+  const { sourceOffset, sourceLength } = cursorRangeToSourceSpan(prepared, line.start, line.end)
+}
+```
+
 This usage allows rendering to canvas, SVG, WebGL and (eventually) server-side.
 
 ### API Glossary
@@ -109,6 +121,8 @@ type LayoutLine = {
   width: number // Measured width of this line, e.g. 87.5
   start: LayoutCursor // Inclusive start cursor in prepared segments/graphemes
   end: LayoutCursor // Exclusive end cursor in prepared segments/graphemes
+  sourceOffset: number // Start offset of this line's span in the original (pre-normalization) input, e.g. 2
+  sourceLength: number // Length of that span in the original input, counting any whitespace that was collapsed inside it, e.g. 9
 }
 type LayoutLineRange = {
   width: number // Measured width of this line, e.g. 87.5
@@ -125,6 +139,8 @@ Other helpers:
 ```ts
 clearCache(): void // clears Pretext's shared internal caches used by prepare() and prepareWithSegments(). Useful if your app cycles through many different fonts or text variants and you want to release the accumulated cache
 setLocale(locale?: string): void // optional (by default we use the current locale). Sets locale for future prepare() and prepareWithSegments(). Internally, it also calls clearCache(). Setting a new locale doesn't affect existing prepare() and prepareWithSegments() states (no mutations to them)
+cursorToSourceOffset(prepared: PreparedTextWithSegments, cursor: LayoutCursor): number // converts a rich cursor back into an offset in the original input text
+cursorRangeToSourceSpan(prepared: PreparedTextWithSegments, start: LayoutCursor, end: LayoutCursor): { sourceOffset: number, sourceLength: number } // converts a rich cursor range back into the original input span
 ```
 
 ## Caveats

diff --git a/src/analysis.ts b/src/analysis.ts
@@ -31,7 +31,12 @@ export type AnalysisChunk = {
   consumedEndSegmentIndex: number
 }
 
-export type TextAnalysis = { normalized: string, chunks: AnalysisChunk[] } & MergedSegmentation
+export type TextAnalysis = { normalized: string, chunks: AnalysisChunk[], sourceBoundaries?: number[] } & MergedSegmentation // sourceBoundaries is only attached when analyzeText() is called with includeSourceBoundaries = true; entry k is the offset in the original text for index k of `normalized`
+
+type NormalizedTextWithSourceBoundaries = { // result of a whitespace-normalizing pass that also tracks where each output character came from
+  text: string // the normalized text
+  sourceBoundaries: number[] // sourceBoundaries[k] = offset in the original input matching index k of `text`; the builders in this file emit text.length + 1 entries
+}
 
 export type AnalysisProfile = {
   carryCJKAfterClosingQuote: boolean
@@ -66,13 +71,95 @@ export function normalizeWhitespaceNormal(text: string): string {
   return normalized
 }
 
+function normalizeWhitespaceNormalWithSourceBoundaries(text: string): NormalizedTextWithSourceBoundaries { // collapses each run of space/tab/LF/CR/FF to one ' ', drops leading and trailing runs, and records the source offset of every boundary in the output
+  if (!needsWhitespaceNormalizationRe.test(text)) {
+    const sourceBoundaries = new Array<number>(text.length + 1) // fast path: text is returned unchanged, so the map is the identity 0..text.length
+    for (let i = 0; i <= text.length; i++) sourceBoundaries[i] = i
+    return { text, sourceBoundaries }
+  }
+  // Slow path: rebuild the text one output character at a time, pushing one boundary after each.
+  const pieces: string[] = []
+  const sourceBoundaries: number[] = []
+  let i = 0
+  // Skip leading whitespace so the first boundary points at the first kept character.
+  while (i < text.length) {
+    const ch = text[i]!
+    const isWhitespace = ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r' || ch === '\f'
+    if (!isWhitespace) break
+    i++
+  }
+
+  sourceBoundaries.push(i) // boundary 0 of the output = first non-whitespace offset (or text.length if the input is all whitespace)
+
+  while (i < text.length) {
+    const ch = text[i]!
+    const isWhitespace = ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r' || ch === '\f'
+    if (isWhitespace) {
+      while (i < text.length) { // consume the whole whitespace run
+        const next = text[i]!
+        if (next !== ' ' && next !== '\t' && next !== '\n' && next !== '\r' && next !== '\f') break
+        i++
+      }
+      if (i >= text.length) break // the run reaches the end of input: emit nothing, which trims the trailing edge
+      pieces.push(' ')
+      sourceBoundaries.push(i) // the single output space spans the entire source run, so its end boundary is the end of that run
+      continue
+    }
+    // Ordinary character: copied through, advancing source and output by one each.
+    pieces.push(ch)
+    i++
+    sourceBoundaries.push(i)
+  }
+
+  return { text: pieces.join(''), sourceBoundaries }
+}
+
+function normalizeWhitespacePreWrapWithSourceBoundaries(text: string): NormalizedTextWithSourceBoundaries { // pre-wrap normalization (CRLF, lone CR and FF become '\n') paired with a boundary map back to the original text
+  if (!/[\r\f]/.test(text)) { // no CR or FF present: the text needs no rewriting
+    return {
+      text,
+      sourceBoundaries: buildPreWrapSourceBoundaries(text),
+    }
+  }
+
+  return {
+    text: text
+      .replace(/\r\n/g, '\n') // CRLF first, so the pair becomes one '\n' rather than two
+      .replace(/[\r\f]/g, '\n'),
+    sourceBoundaries: buildPreWrapSourceBoundaries(text), // built from the original text, not the rewritten one
+  }
+}
+
 function normalizeWhitespacePreWrap(text: string): string {
-  if (!/[\r\f]/.test(text)) return text.replace(/\r\n/g, '\n')
+  if (!/[\r\f]/.test(text)) return text // without any '\r' there can be no '\r\n', so the text is already normalized
   return text
     .replace(/\r\n/g, '\n')
     .replace(/[\r\f]/g, '\n')
 }
 
+function buildPreWrapSourceBoundaries(text: string): number[] { // returns the source offset at every boundary of the pre-wrap-normalized text; a CRLF pair counts as one output character
+  const sourceBoundaries = [0]
+  let i = 0
+
+  while (i < text.length) {
+    const ch = text[i]!
+    if (ch === '\r' && i + 1 < text.length && text[i + 1] === '\n') {
+      i += 2 // CRLF: two source characters map to a single output position
+      sourceBoundaries.push(i)
+      continue
+    }
+    if (ch === '\r' || ch === '\f') { // NOTE(review): this branch does the same step as the fall-through below; kept as written
+      i += 1
+      sourceBoundaries.push(i)
+      continue
+    }
+    i += 1 // every other character maps one-to-one
+    sourceBoundaries.push(i)
+  }
+
+  return sourceBoundaries
+}
+
 let sharedWordSegmenter: Intl.Segmenter | null = null
 let segmenterLocale: string | undefined
 
@@ -982,11 +1069,22 @@ export function analyzeText(
   text: string,
   profile: AnalysisProfile,
   whiteSpace: WhiteSpaceMode = 'normal',
+  includeSourceBoundaries = false,
 ): TextAnalysis {
   const whiteSpaceProfile = getWhiteSpaceProfile(whiteSpace)
-  const normalized = whiteSpaceProfile.mode === 'pre-wrap'
-    ? normalizeWhitespacePreWrap(text)
-    : normalizeWhitespaceNormal(text)
+  const normalizedResult = includeSourceBoundaries
+    ? (whiteSpaceProfile.mode === 'pre-wrap'
+        ? normalizeWhitespacePreWrapWithSourceBoundaries(text)
+        : normalizeWhitespaceNormalWithSourceBoundaries(text))
+    : null
+  const normalized = normalizedResult
+    ? normalizedResult.text
+    : (whiteSpaceProfile.mode === 'pre-wrap'
+        ? normalizeWhitespacePreWrap(text)
+        : normalizeWhitespaceNormal(text))
+  const sourceBoundariesPayload = normalizedResult
+    ? { sourceBoundaries: normalizedResult.sourceBoundaries }
+    : {}
   if (normalized.length === 0) {
     return {
       normalized,
@@ -996,12 +1094,14 @@ export function analyzeText(
       isWordLike: [],
       kinds: [],
       starts: [],
+      ...sourceBoundariesPayload,
     }
   }
   const segmentation = buildMergedSegmentation(normalized, profile, whiteSpaceProfile)
   return {
     normalized,
     chunks: compileAnalysisChunks(segmentation, whiteSpaceProfile),
+    ...sourceBoundariesPayload,
     ...segmentation,
   }
 }
diff --git a/src/layout.test.ts b/src/layout.test.ts
@@ -17,6 +17,8 @@ let layout: LayoutModule['layout']
 let layoutWithLines: LayoutModule['layoutWithLines']
 let layoutNextLine: LayoutModule['layoutNextLine']
 let walkLineRanges: LayoutModule['walkLineRanges']
+let cursorToSourceOffset: LayoutModule['cursorToSourceOffset']
+let cursorRangeToSourceSpan: LayoutModule['cursorRangeToSourceSpan']
 let clearCache: LayoutModule['clearCache']
 let setLocale: LayoutModule['setLocale']
 let countPreparedLines: LineBreakModule['countPreparedLines']
@@ -107,6 +109,8 @@ beforeAll(async () => {
     layoutWithLines,
     layoutNextLine,
     walkLineRanges,
+    cursorToSourceOffset,
+    cursorRangeToSourceSpan,
     clearCache,
     setLocale,
   } = mod)
@@ -127,6 +131,8 @@ describe('prepare invariants', () => {
   test('collapses ordinary whitespace runs and trims the edges', () => {
     const prepared = prepareWithSegments('  Hello\t \n  World  ', FONT)
     expect(prepared.segments).toEqual(['Hello', ' ', 'World'])
+    expect(prepared.segmentSourceOffsets).toEqual([2, 7, 12]) // 'Hello' starts after the 2 leading spaces; the whitespace run starts at 7; 'World' starts at 12
+    expect(prepared.segmentSourceLengths).toEqual([5, 5, 5]) // the single ' ' segment covers all 5 source characters of the run '\t \n  '
   })
 
   test('pre-wrap mode keeps ordinary spaces instead of collapsing them', () => {
@@ -147,6 +153,11 @@ describe('prepare invariants', () => {
     expect(prepared.kinds).toEqual(['text', 'hard-break', 'text'])
   })
 
+  test('pre-wrap mode also normalizes CRLF on the simple prepare path', () => {
+    const prepared = prepare('Hello\r\nWorld', FONT, { whiteSpace: 'pre-wrap' }) // uses prepare(), not prepareWithSegments()
+    expect(layout(prepared, 200, LINE_HEIGHT).lineCount).toBe(2) // the CRLF pair must yield exactly two lines, not three
+  })
+
   test('pre-wrap mode keeps tabs as explicit segments', () => {
     const prepared = prepareWithSegments('Hello\tWorld', FONT, { whiteSpace: 'pre-wrap' })
     expect(prepared.segments).toEqual(['Hello', '\t', 'World'])
@@ -413,6 +424,8 @@ describe('layout invariants', () => {
       width: widthOfHello,
       start: { segmentIndex: 0, graphemeIndex: 0 },
       end: { segmentIndex: 1, graphemeIndex: 0 },
+      sourceOffset: 0,
+      sourceLength: 5,
     }])
   })
 
@@ -430,6 +443,22 @@ describe('layout invariants', () => {
     expect(rich.lines.map(line => line.text).join('')).toBe('Superlongword')
     expect(rich.lines[0]!.start).toEqual({ segmentIndex: 0, graphemeIndex: 0 })
     expect(rich.lines.at(-1)!.end).toEqual({ segmentIndex: 1, graphemeIndex: 0 })
+    expect(cursorToSourceOffset(prepared, rich.lines[0]!.end)).toBe(rich.lines[0]!.sourceOffset + rich.lines[0]!.sourceLength)
+  })
+
+  test('rich lines preserve source spans through collapsed whitespace normalization', () => {
+    const source = '  foo   bar  ' // 2 leading spaces, a 3-space run in the middle, 2 trailing spaces
+    const prepared = prepareWithSegments(source, FONT)
+    const lines = layoutWithLines(prepared, 200, LINE_HEIGHT)
+    expect(lines.lines).toHaveLength(1)
+    expect(lines.lines[0]!.text).toBe('foo bar')
+    expect(lines.lines[0]!.sourceOffset).toBe(2) // trimmed leading spaces are excluded from the span
+    expect(lines.lines[0]!.sourceLength).toBe(9) // 'foo   bar' in the source: the collapsed run still counts as 3 characters
+    expect(source.slice(lines.lines[0]!.sourceOffset, lines.lines[0]!.sourceOffset + lines.lines[0]!.sourceLength)).toBe('foo   bar')
+    expect(cursorRangeToSourceSpan(prepared, lines.lines[0]!.start, lines.lines[0]!.end)).toEqual({ // the cursor-based helper must agree with the fields stored on the line
+      sourceOffset: 2,
+      sourceLength: 9,
+    })
   })
 
   test('mixed-direction text is a stable smoke test', () => {
@@ -587,13 +616,15 @@ describe('layout invariants', () => {
       start: { segmentIndex: number, graphemeIndex: number }
       end: { segmentIndex: number, graphemeIndex: number }
     }> = []
+    const sourceSpans: Array<{ sourceOffset: number, sourceLength: number }> = []
 
     const lineCount = walkLineRanges(prepared, width, line => {
       actual.push({
         width: line.width,
         start: { ...line.start },
         end: { ...line.end },
       })
+      sourceSpans.push(cursorRangeToSourceSpan(prepared, line.start, line.end))
     })
 
     expect(lineCount).toBe(expected.lineCount)
@@ -602,6 +633,10 @@ describe('layout invariants', () => {
       start: line.start,
       end: line.end,
     })))
+    expect(sourceSpans).toEqual(expected.lines.map(line => ({
+      sourceOffset: line.sourceOffset,
+      sourceLength: line.sourceLength,
+    })))
   })
 
   test('countPreparedLines stays aligned with the walked line counter', () => {