diff --git a/src/compose/resolve-block-scalar.ts b/src/compose/resolve-block-scalar.ts
index 1fb3036b..2ef0ae31 100644
--- a/src/compose/resolve-block-scalar.ts
+++ b/src/compose/resolve-block-scalar.ts
@@ -201,7 +201,7 @@ function parseBlockScalarHeader(
 
 /** @returns Array of lines split up as `[indent, content]` */
 function splitLines(source: string) {
-  const split = source.split(/\n( *)/)
+  const split = source.split(/(?:\r?\n|\r(?!\n))( *)/)
   const first = split[0]
   const m = first.match(/^( *)/)
   const line0: [string, string] = m?.[1]
diff --git a/src/compose/resolve-flow-scalar.ts b/src/compose/resolve-flow-scalar.ts
index 5568d63c..789b683f 100644
--- a/src/compose/resolve-flow-scalar.ts
+++ b/src/compose/resolve-flow-scalar.ts
@@ -112,11 +112,15 @@ function foldLines(source: string) {
    */
   let first: RegExp, line: RegExp
   try {
-    first = new RegExp('(.*?)(? wsStart ? source.slice(wsStart, i + 1) : ch
       } else {
         res += ch
@@ -190,15 +194,19 @@ function doubleQuotedValue(source: string, onError: FlowScalarErrorHandler) {
 
 /**
  * Fold a single newline into a space, multiple newlines to N - 1 newlines.
- * Presumes `source[offset] === '\n'`
+ * Presumes `source[offset] === '\n'` or `source[offset] === '\r'`
  */
 function foldNewline(source: string, offset: number) {
   let fold = ''
   let ch = source[offset + 1]
   while (ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r') {
-    if (ch === '\r' && source[offset + 2] !== '\n') break
-    if (ch === '\n') fold += '\n'
-    offset += 1
+    if (ch === '\n' || ch === '\r') {
+      fold += '\n'
+      if (ch === '\r' && source[offset + 2] === '\n') offset += 2
+      else offset += 1
+    } else {
+      offset += 1
+    }
     ch = source[offset + 1]
   }
   if (!fold) fold = ' '
diff --git a/src/parse/cst.ts b/src/parse/cst.ts
index a945ea03..f725842f 100644
--- a/src/parse/cst.ts
+++ b/src/parse/cst.ts
@@ -207,6 +207,7 @@ export function tokenType(source: string): TokenType | null {
     case '':
     case '\n':
    case '\r\n':
+    case '\r':
      return 'newline'
    case '-':
      return 'seq-item-ind'
diff --git a/src/parse/lexer.ts b/src/parse/lexer.ts
index e66751e6..277083cf 100644
--- a/src/parse/lexer.ts
+++ b/src/parse/lexer.ts
@@ -17,7 +17,7 @@ const flowIndicatorChars = new Set(',[]{}')
 const invalidAnchorChars = new Set(' ,[]{}\n\r\t')
 const isNotAnchorChar = (ch: string) => !ch || invalidAnchorChars.has(ch)
 
-const blockScalarHeader = /([|>][^\s#]*)([ \t]*)((?:.|\r(?!\n))*)$/my
+const blockScalarHeader = /([|>][^\s#]*)([ \t]*)([^\r\n]*)$/my
 const blockStart = /([-?:])(?=[ \n\r\t]|$)([ \t]*)/y
 const directiveLine = /(%.*?)(?:([ \t]+)(#.*)?)?$/my
 const docMarker = /[-.]{3}(?=[ \n\r\t]|$)(?:([ \t]+)(#.*)?)?/y
@@ -91,7 +91,10 @@ class Lexer {
       while (ch === ' ') ch = this.source[++indent + offset]
       if (ch === '\r') {
         const next = this.source[indent + offset + 1]
+        // \r\n is a single line break
         if (next === '\n') return offset + indent + 1
+        // standalone \r is also a line break per YAML 1.2 spec
+        return offset + indent
       }
       return ch === '\n' || indent >= this.indentNext ? offset + indent : -1
     }
@@ -204,11 +207,13 @@ class Lexer {
         this.blockScalar()
         return this.lineStart()
       case '\r':
+        // \r\n and standalone \r are both line breaks
        if (this.charAt(1) === '\n') {
          this.count(2)
-          return this.lineStart()
+        } else {
+          this.count(1)
        }
-      // fallthrough
+        return this.lineStart()
      default:
        this.plainScalar()
        return 'document'
@@ -307,8 +312,8 @@ class Lexer {
         break
       }
      case '\r':
-        if (this.charAt(1) === '\n') break
-      // fallthrough
+        // standalone \r is a line break, handled by newline() in loop
+        break
      default:
        this.flowKey = false
        this.plainScalar()
@@ -355,12 +360,12 @@ class Lexer {
     }
     // Only looking for newlines within the quotes
     const qb = this.source.substring(0, end)
-    let nl = qb.indexOf('\n', this.pos)
+    let nl = this.findLineBreak(qb, this.pos)
     if (nl !== -1) {
       while (nl !== -1) {
         const cs = this.continueScalar(nl + 1)
         if (cs === -1) break
-        nl = qb.indexOf('\n', cs)
+        nl = this.findLineBreak(qb, cs)
       }
       if (nl !== -1) {
         // this is an error caused by an unexpected unindent
@@ -399,8 +404,10 @@ class Lexer {
           indent = 0
           break
         case '\r':
-          if (this.source[i + 1] === '\n') break
-          // fallthrough
+          nl = i
+          indent = 0
+          if (this.source[i + 1] === '\n') i++ // skip \n in \r\n
+          break
         default:
           break loop
       }
@@ -414,7 +421,7 @@ class Lexer {
       do {
         const cs = this.continueScalar(nl + 1)
         if (cs === -1) break
-        nl = this.source.indexOf('\n', cs)
+        nl = this.findLineBreak(this.source, cs)
       } while (nl !== -1)
       if (nl === -1) nl = this.source.length
     }
@@ -460,10 +467,11 @@ class Lexer {
             i += 1
             ch = '\n'
             next = this.source[i + 1]
-          } else end = i
+          }
+          // standalone \r is also a line break
         }
         if (next === '#' || (inFlow && flowIndicatorChars.has(next))) break
-        if (ch === '\n') {
+        if (ch === '\n' || ch === '\r') {
           const cs = this.continueScalar(i + 1)
           if (cs === -1) break
           i = Math.max(i, cs - 2) // to advance, but still account for ' #'
@@ -501,16 +509,26 @@ class Lexer {
   private toLineEnd(): number {
     let i = this.pos
     let ch = this.source[i]
-    while (ch && ch !== '\n') ch = this.source[++i]
-    if (this.source[i - 1] === '\r') --i
+    // stop at \n or standalone \r
+    while (ch && ch !== '\n' && ch !== '\r') ch = this.source[++i]
     return this.toIndex(i, false)
   }
 
+  private findLineBreak(str: string, pos: number): number {
+    const nl = str.indexOf('\n', pos)
+    const cr = str.indexOf('\r', pos)
+    if (cr !== -1 && (nl === -1 || cr < nl - 1)) return cr
+    return nl
+  }
+
   private newline(): number {
     const ch = this.source[this.pos]
     if (ch === '\n') return this.count(1)
-    else if (ch === '\r' && this.charAt(1) === '\n') return this.count(2)
-    else return 0
+    if (ch === '\r') {
+      if (this.charAt(1) === '\n') return this.count(2)
+      return this.count(1)
+    }
+    return 0
   }
 
   private spaces(allowTabs: boolean): number {
diff --git a/tests/cst.ts b/tests/cst.ts
index f2cccedb..e48de845 100644
--- a/tests/cst.ts
+++ b/tests/cst.ts
@@ -201,3 +201,19 @@ test('Line comment before unindented block-seq in block-map (#525)', () => {
   const [doc] = Array.from(new Parser().parse(src))
   expect(CST.stringify(doc)).toBe(src)
 })
+
+describe('standalone CR line break handling (#595)', () => {
+  test('tokenType recognizes standalone CR as newline', () => {
+    expect(CST.tokenType('\r')).toBe('newline')
+    expect(CST.tokenType('\n')).toBe('newline')
+    expect(CST.tokenType('\r\n')).toBe('newline')
+  })
+
+  test('Parser produces newline tokens for CR', () => {
+    const tokens = Array.from(new Parser().parse('a: 1\rb: 2'))
+    expect(tokens).toHaveLength(1)
+    const doc = tokens[0] as CST.Document
+    expect(doc.type).toBe('document')
+    expect(doc.value?.type).toBe('block-map')
+  })
+})
diff --git a/tests/doc/parse.ts b/tests/doc/parse.ts
index 627b9d91..aa2e2cd4 100644
--- a/tests/doc/parse.ts
+++ b/tests/doc/parse.ts
@@ -28,7 +28,7 @@ describe('scalars', () => {
 
   test('carriage returns in double-quotes', () => {
     const src = '"a\nb\n\rc\n\r\nd\n\r\n\re\n\r\n\r\nf"'
-    expect(YAML.parse(src)).toBe('a b \rc\nd\n\re\n\nf')
+    expect(YAML.parse(src)).toBe('a b\nc\nd\n\ne\n\nf')
   })
 })
@@ -944,3 +944,114 @@ describe('stringKeys', () => {
     expect(doc.errors).toMatchObject([{ code: 'NON_STRING_KEY' }])
   })
 })
+
+describe('standalone CR line break handling (#595)', () => {
+  describe('basic document parsing', () => {
+    test('CR-separated key-value pairs', () => {
+      expect(YAML.parse('a: 1\rb: 2\rc: 3')).toEqual({ a: 1, b: 2, c: 3 })
+    })
+
+    test('CR produces same result as LF', () => {
+      const crDoc = 'a: 1\rb: 2'
+      const lfDoc = 'a: 1\nb: 2'
+      expect(YAML.parse(crDoc)).toEqual(YAML.parse(lfDoc))
+    })
+
+    test('mixed CR, LF, and CRLF line breaks', () => {
+      expect(YAML.parse('a: 1\rb: 2\nc: 3\r\nd: 4')).toEqual({
+        a: 1,
+        b: 2,
+        c: 3,
+        d: 4
+      })
+    })
+
+    test('CR in block sequence', () => {
+      expect(YAML.parse('- a\r- b\r- c')).toEqual(['a', 'b', 'c'])
+    })
+
+    test('CR at end of document', () => {
+      expect(YAML.parse('foo: bar\r')).toEqual({ foo: 'bar' })
+    })
+  })
+
+  describe('double-quoted strings', () => {
+    test('unescaped CR folds to space', () => {
+      expect(YAML.parse('"a\rb"')).toBe('a b')
+    })
+
+    test('multiple unescaped CRs fold to newlines', () => {
+      expect(YAML.parse('"a\r\rb"')).toBe('a\nb')
+      expect(YAML.parse('"a\r\r\rb"')).toBe('a\n\nb')
+    })
+
+    test('CR matches LF folding behavior', () => {
+      expect(YAML.parse('"a\rb"')).toBe(YAML.parse('"a\nb"'))
+      expect(YAML.parse('"a\r\rb"')).toBe(YAML.parse('"a\n\nb"'))
+    })
+
+    test('escaped CR is line continuation', () => {
+      expect(YAML.parse('"a\\\rb"')).toBe('ab')
+      expect(YAML.parse('"a\\\r b"')).toBe('ab') // trims following whitespace
+    })
+
+    test('escaped CR matches escaped LF behavior', () => {
+      expect(YAML.parse('"a\\\rb"')).toBe(YAML.parse('"a\\\nb"'))
+    })
+  })
+
+  describe('single-quoted strings', () => {
+    test('CR folds to space in single-quoted string', () => {
+      expect(YAML.parse("'a\rb'")).toBe('a b')
+    })
+
+    test('multiple CRs fold correctly', () => {
+      expect(YAML.parse("'a\r\rb'")).toBe('a\nb')
+    })
+  })
+
+  describe('block scalars', () => {
+    test('literal block scalar with CR', () => {
+      expect(YAML.parse('|\ra\rb')).toBe('a\nb\n')
+    })
+
+    test('folded block scalar with CR', () => {
+      expect(YAML.parse('>\ra\rb')).toBe('a b\n')
+    })
+
+    test('block scalar content with CR line breaks', () => {
+      expect(YAML.parse('|\r line1\r line2')).toBe('line1\nline2\n')
+    })
+  })
+
+  describe('flow collections', () => {
+    test('CR in flow sequence', () => {
+      expect(YAML.parse('[\r1\r,\r2\r]')).toEqual([1, 2])
+    })
+
+    test('CR in flow mapping', () => {
+      expect(YAML.parse('{\ra: 1\r,\rb: 2\r}')).toEqual({ a: 1, b: 2 })
+    })
+  })
+
+  describe('comments', () => {
+    test('CR before comment', () => {
+      expect(YAML.parse('foo\r# comment')).toBe('foo')
+    })
+
+    test('CR after comment', () => {
+      expect(YAML.parse('a: 1 # comment\rb: 2')).toEqual({ a: 1, b: 2 })
+    })
+  })
+
+  describe('LF followed by CR (\\n\\r)', () => {
+    test('\\n\\r is two separate line breaks', () => {
+      // \n\r = LF + CR = two line breaks, folds to one newline
+      expect(YAML.parse('"a\n\rb"')).toBe('a\nb')
+    })
+
+    test('\\n\\r in document structure', () => {
+      expect(YAML.parse('a: 1\n\rb: 2')).toEqual({ a: 1, b: 2 })
+    })
+  })
+})
diff --git a/tests/doc/types.ts b/tests/doc/types.ts
index 4f91a23e..fb3c654d 100644
--- a/tests/doc/types.ts
+++ b/tests/doc/types.ts
@@ -114,9 +114,10 @@ describe('tags', () => {
   })
 
   test('CR in tag shorthand (#501)', () => {
+    // \r is now a line break, so !\r! is parsed as ! + newline + !
     const doc = parseDocument(': | !\r!')
-    const err = doc.errors.find(err => err.code === 'TAG_RESOLVE_FAILED')
-    expect(err).not.toBeFalsy()
+    const err = doc.errors.find(err => err.code === 'MULTILINE_IMPLICIT_KEY')
+    expect(err).toBeDefined()
   })
 })
 
diff --git a/tests/lexer.ts b/tests/lexer.ts
index ca998af9..cdc1cb0b 100644
--- a/tests/lexer.ts
+++ b/tests/lexer.ts
@@ -76,3 +76,47 @@ test('trailing comments on ...', () => {
     '\n'
   ])
 })
+
+test('standalone CR treated as line break in quoted string (#595)', () => {
+  const lfTokens = lex('text: "a\n\n\n\n b"')
+  const crTokens = lex('text: "a\r\r\r\r b"')
+  expect(crTokens.length).toEqual(lfTokens.length)
+})
+
+test('plain scalar + standalone CR + comment', () => {
+  const src = 'foo\r# bar'
+  expect(lex(src)).toEqual([DOC, SCALAR, 'foo', '\r', '# bar'])
+})
+
+test('standalone CR in document', () => {
+  const src = 'a: 1\rb: 2'
+  expect(lex(src)).toEqual([
+    DOC,
+    SCALAR,
+    'a',
+    ':',
+    ' ',
+    SCALAR,
+    '1',
+    '\r',
+    SCALAR,
+    'b',
+    ':',
+    ' ',
+    SCALAR,
+    '2'
+  ])
+})
+
+test('standalone CR in flow collection', () => {
+  const src = '[\r1\r]'
+  expect(lex(src)).toEqual([DOC, '[', '\r', SCALAR, '1', '\r', ']'])
+})
+
+test('mixed CR and LF line breaks', () => {
+  const src = 'a: 1\rb: 2\nc: 3\r\nd: 4'
+  const tokens = lex(src)
+  expect(tokens).toContain('\r')
+  expect(tokens).toContain('\n')
+  expect(tokens).toContain('\r\n')
+})
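
Usage sketch (not part of the patch; assumes the package's usual `import * as YAML from 'yaml'` entry point): with this change applied, a lone `\r` is treated as a line break by the parser, matching `\n` and `\r\n`, as the tests above assert.

    import * as YAML from 'yaml'

    // A standalone CR separates block-map entries just like LF or CRLF
    YAML.parse('a: 1\rb: 2\r\nc: 3\nd: 4') // => { a: 1, b: 2, c: 3, d: 4 }

    // In double-quoted scalars, one CR folds to a space
    // and N consecutive CRs fold to N - 1 newlines
    YAML.parse('"a\rb"')   // => 'a b'
    YAML.parse('"a\r\rb"') // => 'a\nb'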