From 27553f25e55577f96093ff694b228ffa84af250d Mon Sep 17 00:00:00 2001 From: Jason Held Date: Mon, 19 Jan 2026 13:37:30 -0700 Subject: [PATCH 1/4] Single carriage return, update tests, add new tests --- src/compose/resolve-block-scalar.ts | 19 ++--- src/compose/resolve-flow-scalar.ts | 37 +++++++--- src/parse/cst.ts | 1 + src/parse/lexer.ts | 55 +++++++++----- tests/cst.ts | 16 +++++ tests/doc/parse.ts | 108 +++++++++++++++++++++++++++- tests/doc/types.ts | 5 +- tests/lexer.ts | 29 ++++++++ 8 files changed, 233 insertions(+), 37 deletions(-) diff --git a/src/compose/resolve-block-scalar.ts b/src/compose/resolve-block-scalar.ts index 1fb3036b..9c359a75 100644 --- a/src/compose/resolve-block-scalar.ts +++ b/src/compose/resolve-block-scalar.ts @@ -201,13 +201,16 @@ function parseBlockScalarHeader( /** @returns Array of lines split up as `[indent, content]` */ function splitLines(source: string) { - const split = source.split(/\n( *)/) - const first = split[0] - const m = first.match(/^( *)/) - const line0: [string, string] = m?.[1] - ? [m[1], first.slice(m[1].length)] - : ['', first] - const lines = [line0] - for (let i = 1; i < split.length; i += 2) lines.push([split[i], split[i + 1]]) + // split on all line break types: \n, \r\n, or standalone \r + const split = source.split(/\r?\n|\r(?!\n)/) + const lines: [string, string][] = [] + for (const line of split) { + const m = line.match(/^( *)/) + if (m?.[1]) { + lines.push([m[1], line.slice(m[1].length)]) + } else { + lines.push(['', line]) + } + } return lines } diff --git a/src/compose/resolve-flow-scalar.ts b/src/compose/resolve-flow-scalar.ts index 5568d63c..d76b722d 100644 --- a/src/compose/resolve-flow-scalar.ts +++ b/src/compose/resolve-flow-scalar.ts @@ -112,11 +112,12 @@ function foldLines(source: string) { */ let first: RegExp, line: RegExp try { - first = new RegExp('(.*?)(? wsStart ? source.slice(wsStart, i + 1) : ch } else { res += ch @@ -190,15 +195,27 @@ function doubleQuotedValue(source: string, onError: FlowScalarErrorHandler) { /** * Fold a single newline into a space, multiple newlines to N - 1 newlines. - * Presumes `source[offset] === '\n'` + * Presumes `source[offset] === '\n'` or `source[offset] === '\r'` */ function foldNewline(source: string, offset: number) { let fold = '' let ch = source[offset + 1] while (ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r') { - if (ch === '\r' && source[offset + 2] !== '\n') break - if (ch === '\n') fold += '\n' - offset += 1 + if (ch === '\r') { + // \r\n counts as one newline, standalone \r also counts as one + if (source[offset + 2] === '\n') { + fold += '\n' + offset += 2 + } else { + fold += '\n' + offset += 1 + } + } else if (ch === '\n') { + fold += '\n' + offset += 1 + } else { + offset += 1 + } ch = source[offset + 1] } if (!fold) fold = ' ' diff --git a/src/parse/cst.ts b/src/parse/cst.ts index a945ea03..f725842f 100644 --- a/src/parse/cst.ts +++ b/src/parse/cst.ts @@ -207,6 +207,7 @@ export function tokenType(source: string): TokenType | null { case '': case '\n': case '\r\n': + case '\r': return 'newline' case '-': return 'seq-item-ind' diff --git a/src/parse/lexer.ts b/src/parse/lexer.ts index e66751e6..a44092a0 100644 --- a/src/parse/lexer.ts +++ b/src/parse/lexer.ts @@ -17,7 +17,7 @@ const flowIndicatorChars = new Set(',[]{}') const invalidAnchorChars = new Set(' ,[]{}\n\r\t') const isNotAnchorChar = (ch: string) => !ch || invalidAnchorChars.has(ch) -const blockScalarHeader = /([|>][^\s#]*)([ \t]*)((?:.|\r(?!\n))*)$/my +const blockScalarHeader = /([|>][^\s#]*)([ \t]*)([^\r\n]*)$/my const blockStart = /([-?:])(?=[ \n\r\t]|$)([ \t]*)/y const directiveLine = /(%.*?)(?:([ \t]+)(#.*)?)?$/my const docMarker = /[-.]{3}(?=[ \n\r\t]|$)(?:([ \t]+)(#.*)?)?/y @@ -91,7 +91,10 @@ class Lexer { while (ch === ' ') ch = this.source[++indent + offset] if (ch === '\r') { const next = this.source[indent + offset + 1] + // \r\n is a single line break if (next === '\n') return offset + indent + 1 + // standalone \r is also a line break per YAML 1.2 spec + return offset + indent } return ch === '\n' || indent >= this.indentNext ? offset + indent : -1 } @@ -204,11 +207,13 @@ class Lexer { this.blockScalar() return this.lineStart() case '\r': + // \r\n and standalone \r are both line breaks if (this.charAt(1) === '\n') { this.count(2) - return this.lineStart() + } else { + this.count(1) } - // fallthrough + return this.lineStart() default: this.plainScalar() return 'document' @@ -307,8 +312,8 @@ class Lexer { break } case '\r': - if (this.charAt(1) === '\n') break - // fallthrough + // standalone \r is a line break, handled by newline() in loop + break default: this.flowKey = false this.plainScalar() @@ -355,12 +360,12 @@ class Lexer { } // Only looking for newlines within the quotes const qb = this.source.substring(0, end) - let nl = qb.indexOf('\n', this.pos) + let nl = this.findLineBreak(qb, this.pos) if (nl !== -1) { while (nl !== -1) { const cs = this.continueScalar(nl + 1) if (cs === -1) break - nl = qb.indexOf('\n', cs) + nl = this.findLineBreak(qb, cs) } if (nl !== -1) { // this is an error caused by an unexpected unindent @@ -399,8 +404,10 @@ class Lexer { indent = 0 break case '\r': - if (this.source[i + 1] === '\n') break - // fallthrough + nl = i + indent = 0 + if (this.source[i + 1] === '\n') i++ // skip \n in \r\n + break default: break loop } @@ -414,7 +421,7 @@ class Lexer { do { const cs = this.continueScalar(nl + 1) if (cs === -1) break - nl = this.source.indexOf('\n', cs) + nl = this.findLineBreak(this.source, cs) } while (nl !== -1) if (nl === -1) nl = this.source.length } @@ -460,10 +467,11 @@ class Lexer { i += 1 ch = '\n' next = this.source[i + 1] - } else end = i + } + // standalone \r is also a line break } if (next === '#' || (inFlow && flowIndicatorChars.has(next))) break - if (ch === '\n') { + if (ch === '\n' || ch === '\r') { const cs = this.continueScalar(i + 1) if (cs === -1) break i = Math.max(i, cs - 2) // to advance, but still account for ' #' @@ -501,16 +509,31 @@ class Lexer { private toLineEnd(): number { let i = this.pos let ch = this.source[i] - while (ch && ch !== '\n') ch = this.source[++i] - if (this.source[i - 1] === '\r') --i + // stop at \n or standalone \r + while (ch && ch !== '\n' && ch !== '\r') ch = this.source[++i] return this.toIndex(i, false) } + private findLineBreak(str: string, pos: number): number { + for (let i = pos; i < str.length; i++) { + const ch = str[i] + if (ch === '\n') return i + if (ch === '\r') { + if (str[i + 1] === '\n') return i + 1 // \r\n is a single line break + return i // standalone \r + } + } + return -1 + } + private newline(): number { const ch = this.source[this.pos] if (ch === '\n') return this.count(1) - else if (ch === '\r' && this.charAt(1) === '\n') return this.count(2) - else return 0 + if (ch === '\r') { + if (this.charAt(1) === '\n') return this.count(2) + return this.count(1) + } + return 0 } private spaces(allowTabs: boolean): number { diff --git a/tests/cst.ts b/tests/cst.ts index f2cccedb..e48de845 100644 --- a/tests/cst.ts +++ b/tests/cst.ts @@ -201,3 +201,19 @@ test('Line comment before unindented block-seq in block-map (#525)', () => { const [doc] = Array.from(new Parser().parse(src)) expect(CST.stringify(doc)).toBe(src) }) + +describe('standalone CR line break handling (#595)', () => { + test('tokenType recognizes standalone CR as newline', () => { + expect(CST.tokenType('\r')).toBe('newline') + expect(CST.tokenType('\n')).toBe('newline') + expect(CST.tokenType('\r\n')).toBe('newline') + }) + + test('Parser produces newline tokens for CR', () => { + const tokens = Array.from(new Parser().parse('a: 1\rb: 2')) + expect(tokens).toHaveLength(1) + const doc = tokens[0] as CST.Document + expect(doc.type).toBe('document') + expect(doc.value?.type).toBe('block-map') + }) +}) diff --git a/tests/doc/parse.ts b/tests/doc/parse.ts index 627b9d91..b20c626a 100644 --- a/tests/doc/parse.ts +++ b/tests/doc/parse.ts @@ -28,7 +28,7 @@ describe('scalars', () => { test('carriage returns in double-quotes', () => { const src = '"a\nb\n\rc\n\r\nd\n\r\n\re\n\r\n\r\nf"' - expect(YAML.parse(src)).toBe('a b \rc\nd\n\re\n\nf') + expect(YAML.parse(src)).toBe('a b\nc\nd\n\ne\n\nf') }) }) @@ -944,3 +944,109 @@ describe('stringKeys', () => { expect(doc.errors).toMatchObject([{ code: 'NON_STRING_KEY' }]) }) }) + +describe('standalone CR line break handling (#595)', () => { + describe('basic document parsing', () => { + test('CR-separated key-value pairs', () => { + expect(YAML.parse('a: 1\rb: 2\rc: 3')).toEqual({ a: 1, b: 2, c: 3 }) + }) + + test('CR produces same result as LF', () => { + const crDoc = 'a: 1\rb: 2' + const lfDoc = 'a: 1\nb: 2' + expect(YAML.parse(crDoc)).toEqual(YAML.parse(lfDoc)) + }) + + test('mixed CR, LF, and CRLF line breaks', () => { + expect(YAML.parse('a: 1\rb: 2\nc: 3\r\nd: 4')).toEqual({ a: 1, b: 2, c: 3, d: 4 }) + }) + + test('CR in block sequence', () => { + expect(YAML.parse('- a\r- b\r- c')).toEqual(['a', 'b', 'c']) + }) + + test('CR at end of document', () => { + expect(YAML.parse('foo: bar\r')).toEqual({ foo: 'bar' }) + }) + }) + + describe('double-quoted strings', () => { + test('unescaped CR folds to space', () => { + expect(YAML.parse('"a\rb"')).toBe('a b') + }) + + test('multiple unescaped CRs fold to newlines', () => { + expect(YAML.parse('"a\r\rb"')).toBe('a\nb') + expect(YAML.parse('"a\r\r\rb"')).toBe('a\n\nb') + }) + + test('CR matches LF folding behavior', () => { + expect(YAML.parse('"a\rb"')).toBe(YAML.parse('"a\nb"')) + expect(YAML.parse('"a\r\rb"')).toBe(YAML.parse('"a\n\nb"')) + }) + + test('escaped CR is line continuation', () => { + expect(YAML.parse('"a\\\rb"')).toBe('ab') + expect(YAML.parse('"a\\\r b"')).toBe('ab') // trims following whitespace + }) + + test('escaped CR matches escaped LF behavior', () => { + expect(YAML.parse('"a\\\rb"')).toBe(YAML.parse('"a\\\nb"')) + }) + }) + + describe('single-quoted strings', () => { + test('CR folds to space in single-quoted string', () => { + expect(YAML.parse("'a\rb'")).toBe('a b') + }) + + test('multiple CRs fold correctly', () => { + expect(YAML.parse("'a\r\rb'")).toBe('a\nb') + }) + }) + + describe('block scalars', () => { + test('literal block scalar with CR', () => { + expect(YAML.parse('|\ra\rb')).toBe('a\nb\n') + }) + + test('folded block scalar with CR', () => { + expect(YAML.parse('>\ra\rb')).toBe('a b\n') + }) + + test('block scalar content with CR line breaks', () => { + expect(YAML.parse('|\r line1\r line2')).toBe('line1\nline2\n') + }) + }) + + describe('flow collections', () => { + test('CR in flow sequence', () => { + expect(YAML.parse('[\r1\r,\r2\r]')).toEqual([1, 2]) + }) + + test('CR in flow mapping', () => { + expect(YAML.parse('{\ra: 1\r,\rb: 2\r}')).toEqual({ a: 1, b: 2 }) + }) + }) + + describe('comments', () => { + test('CR before comment', () => { + expect(YAML.parse('foo\r# comment')).toBe('foo') + }) + + test('CR after comment', () => { + expect(YAML.parse('a: 1 # comment\rb: 2')).toEqual({ a: 1, b: 2 }) + }) + }) + + describe('LF followed by CR (\\n\\r)', () => { + test('\\n\\r is two separate line breaks', () => { + // \n\r = LF + CR = two line breaks, folds to one newline + expect(YAML.parse('"a\n\rb"')).toBe('a\nb') + }) + + test('\\n\\r in document structure', () => { + expect(YAML.parse('a: 1\n\rb: 2')).toEqual({ a: 1, b: 2 }) + }) + }) +}) diff --git a/tests/doc/types.ts b/tests/doc/types.ts index 4f91a23e..fb3c654d 100644 --- a/tests/doc/types.ts +++ b/tests/doc/types.ts @@ -114,9 +114,10 @@ describe('tags', () => { }) test('CR in tag shorthand (#501)', () => { + // \r is now a line break, so !\r! is parsed as ! + newline + ! const doc = parseDocument(': | !\r!') - const err = doc.errors.find(err => err.code === 'TAG_RESOLVE_FAILED') - expect(err).not.toBeFalsy() + const err = doc.errors.find(err => err.code === 'MULTILINE_IMPLICIT_KEY') + expect(err).toBeDefined() }) }) diff --git a/tests/lexer.ts b/tests/lexer.ts index ca998af9..b8761e51 100644 --- a/tests/lexer.ts +++ b/tests/lexer.ts @@ -76,3 +76,32 @@ test('trailing comments on ...', () => { '\n' ]) }) + +test('standalone CR treated as line break in quoted string (#595)', () => { + const lfTokens = lex('text: "a\n\n\n\n b"') + const crTokens = lex('text: "a\r\r\r\r b"') + expect(crTokens.length).toEqual(lfTokens.length) +}) + +test('plain scalar + standalone CR + comment', () => { + const src = 'foo\r# bar' + expect(lex(src)).toEqual([DOC, SCALAR, 'foo', '\r', '# bar']) +}) + +test('standalone CR in document', () => { + const src = 'a: 1\rb: 2' + expect(lex(src)).toEqual([DOC, SCALAR, 'a', ':', ' ', SCALAR, '1', '\r', SCALAR, 'b', ':', ' ', SCALAR, '2']) +}) + +test('standalone CR in flow collection', () => { + const src = '[\r1\r]' + expect(lex(src)).toEqual([DOC, '[', '\r', SCALAR, '1', '\r', ']']) +}) + +test('mixed CR and LF line breaks', () => { + const src = 'a: 1\rb: 2\nc: 3\r\nd: 4' + const tokens = lex(src) + expect(tokens).toContain('\r') + expect(tokens).toContain('\n') + expect(tokens).toContain('\r\n') +}) From 90db68faf93384354ad45a02befc17161269685b Mon Sep 17 00:00:00 2001 From: Jason Held Date: Mon, 19 Jan 2026 13:45:57 -0700 Subject: [PATCH 2/4] Resolve prettier issues --- src/compose/resolve-flow-scalar.ts | 5 ++++- tests/doc/parse.ts | 7 ++++++- tests/lexer.ts | 17 ++++++++++++++++- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/compose/resolve-flow-scalar.ts b/src/compose/resolve-flow-scalar.ts index d76b722d..bee78ff3 100644 --- a/src/compose/resolve-flow-scalar.ts +++ b/src/compose/resolve-flow-scalar.ts @@ -114,7 +114,10 @@ function foldLines(source: string) { try { // match all line breaks: \r\n, \n, or standalone \r first = new RegExp('(.*?)(? { }) test('mixed CR, LF, and CRLF line breaks', () => { - expect(YAML.parse('a: 1\rb: 2\nc: 3\r\nd: 4')).toEqual({ a: 1, b: 2, c: 3, d: 4 }) + expect(YAML.parse('a: 1\rb: 2\nc: 3\r\nd: 4')).toEqual({ + a: 1, + b: 2, + c: 3, + d: 4 + }) }) test('CR in block sequence', () => { diff --git a/tests/lexer.ts b/tests/lexer.ts index b8761e51..cdc1cb0b 100644 --- a/tests/lexer.ts +++ b/tests/lexer.ts @@ -90,7 +90,22 @@ test('plain scalar + standalone CR + comment', () => { test('standalone CR in document', () => { const src = 'a: 1\rb: 2' - expect(lex(src)).toEqual([DOC, SCALAR, 'a', ':', ' ', SCALAR, '1', '\r', SCALAR, 'b', ':', ' ', SCALAR, '2']) + expect(lex(src)).toEqual([ + DOC, + SCALAR, + 'a', + ':', + ' ', + SCALAR, + '1', + '\r', + SCALAR, + 'b', + ':', + ' ', + SCALAR, + '2' + ]) }) test('standalone CR in flow collection', () => { From aa12fc9a1639c7c7af62ebde2f9c056e418482cb Mon Sep 17 00:00:00 2001 From: Jason Held Date: Mon, 19 Jan 2026 13:51:58 -0700 Subject: [PATCH 3/4] Revert logic change --- src/compose/resolve-block-scalar.ts | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/compose/resolve-block-scalar.ts b/src/compose/resolve-block-scalar.ts index 9c359a75..2ef0ae31 100644 --- a/src/compose/resolve-block-scalar.ts +++ b/src/compose/resolve-block-scalar.ts @@ -201,16 +201,13 @@ function parseBlockScalarHeader( /** @returns Array of lines split up as `[indent, content]` */ function splitLines(source: string) { - // split on all line break types: \n, \r\n, or standalone \r - const split = source.split(/\r?\n|\r(?!\n)/) - const lines: [string, string][] = [] - for (const line of split) { - const m = line.match(/^( *)/) - if (m?.[1]) { - lines.push([m[1], line.slice(m[1].length)]) - } else { - lines.push(['', line]) - } - } + const split = source.split(/(?:\r?\n|\r(?!\n))( *)/) + const first = split[0] + const m = first.match(/^( *)/) + const line0: [string, string] = m?.[1] + ? [m[1], first.slice(m[1].length)] + : ['', first] + const lines = [line0] + for (let i = 1; i < split.length; i += 2) lines.push([split[i], split[i + 1]]) return lines } From 9b636a1b808b9781021b1898ed6fc47bac916811 Mon Sep 17 00:00:00 2001 From: Jason Held Date: Mon, 19 Jan 2026 14:17:15 -0700 Subject: [PATCH 4/4] Clean up code --- src/compose/resolve-flow-scalar.ts | 26 +++++++------------------- src/parse/lexer.ts | 13 ++++--------- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/src/compose/resolve-flow-scalar.ts b/src/compose/resolve-flow-scalar.ts index bee78ff3..789b683f 100644 --- a/src/compose/resolve-flow-scalar.ts +++ b/src/compose/resolve-flow-scalar.ts @@ -159,16 +159,12 @@ function doubleQuotedValue(source: string, onError: FlowScalarErrorHandler) { let next = source[++i] const cc = escapeCodes[next] if (cc) res += cc - else if (next === '\n') { - // skip escaped newlines, but still trim the following line - next = source[i + 1] - while (next === ' ' || next === '\t') next = source[++i + 1] - } else if (next === '\r' && source[i + 1] === '\n') { - // skip escaped CRLF newlines, but still trim the following line + else if (next === '\r' && source[i + 1] === '\n') { + // skip escaped CRLF, but still trim the following line next = source[++i + 1] while (next === ' ' || next === '\t') next = source[++i + 1] - } else if (next === '\r') { - // skip escaped standalone CR, but still trim the following line + } else if (next === '\n' || next === '\r') { + // skip escaped newline (LF or CR), but still trim the following line next = source[i + 1] while (next === ' ' || next === '\t') next = source[++i + 1] } else if (next === 'x' || next === 'u' || next === 'U') { @@ -204,18 +200,10 @@ function foldNewline(source: string, offset: number) { let fold = '' let ch = source[offset + 1] while (ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r') { - if (ch === '\r') { - // \r\n counts as one newline, standalone \r also counts as one - if (source[offset + 2] === '\n') { - fold += '\n' - offset += 2 - } else { - fold += '\n' - offset += 1 - } - } else if (ch === '\n') { + if (ch === '\n' || ch === '\r') { fold += '\n' - offset += 1 + if (ch === '\r' && source[offset + 2] === '\n') offset += 2 + else offset += 1 } else { offset += 1 } diff --git a/src/parse/lexer.ts b/src/parse/lexer.ts index a44092a0..277083cf 100644 --- a/src/parse/lexer.ts +++ b/src/parse/lexer.ts @@ -515,15 +515,10 @@ class Lexer { } private findLineBreak(str: string, pos: number): number { - for (let i = pos; i < str.length; i++) { - const ch = str[i] - if (ch === '\n') return i - if (ch === '\r') { - if (str[i + 1] === '\n') return i + 1 // \r\n is a single line break - return i // standalone \r - } - } - return -1 + const nl = str.indexOf('\n', pos) + const cr = str.indexOf('\r', pos) + if (cr !== -1 && (nl === -1 || cr < nl - 1)) return cr + return nl } private newline(): number {