diff --git a/CHANGELOG.md b/CHANGELOG.md index e2204f1..454bd4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,32 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.8] - 2025-12-31 + +### Changed + +- **Java 25 Grammar Sync** - Synced grammar improvements from jbct-cli + - Added cut operators (`^`) after discriminating keywords for better error messages + - Added keyword helper rules with word boundaries (`ClassKW`, `InterfaceKW`, `EnumKW`, etc.) + - Added token boundaries to `PrimType`, `Modifier`, `Literal`, `Primary` rules + - Updated `RefType` lookahead to handle `Type.@Annotation Inner` correctly + - Added `TypeExpr` rule for `Type.class` and `Type::new` expressions + - Updated operator rules with lookaheads to prevent compound operator conflicts + - Added `RecordDecl` lookahead to distinguish from methods/fields named 'record' + +### Fixed + +- **Farthest Failure Tracking** - Error positions are now reported at the furthest position reached during parsing instead of 1:1 after backtracking + - Added `furthestPos`/`furthestFailure` tracking to both AST and CST generated parsers + - Replaced null checks with `Option` in CST parser generator for consistency + - Fixed infinite recursion in AST parser when whitespace rule contained `*` quantifier + - Fixed "unexpected input" errors to also use furthest failure position + +### Added + +- **Error Position Tests** - 3 new tests verifying farthest failure tracking in generated parsers +- Test count: 305 → 308 + ## [0.1.7] - 2025-12-30 ### Changed diff --git a/pom.xml b/pom.xml index d8978f5..07ac5b2 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.pragmatica-lite peglib - 0.1.7 + 0.1.8 jar Peglib diff --git a/src/main/java/org/pragmatica/peg/generator/ParserGenerator.java 
b/src/main/java/org/pragmatica/peg/generator/ParserGenerator.java index 3548a63..4a19846 100644 --- a/src/main/java/org/pragmatica/peg/generator/ParserGenerator.java +++ b/src/main/java/org/pragmatica/peg/generator/ParserGenerator.java @@ -91,12 +91,14 @@ private boolean matchesPattern(char c, String pattern, boolean caseInsensitive) private final String packageName; private final String className; private final ErrorReporting errorReporting; + private boolean inWhitespaceRuleGeneration; private ParserGenerator(Grammar grammar, String packageName, String className, ErrorReporting errorReporting) { this.grammar = grammar; this.packageName = packageName; this.className = className; this.errorReporting = errorReporting; + this.inWhitespaceRuleGeneration = false; } public static ParserGenerator create(Grammar grammar, String packageName, String className) { @@ -195,6 +197,10 @@ private void generateParseContext(StringBuilder sb) { private Map cache; private Map captures; private boolean packratEnabled = true; + private int furthestPos; + private int furthestLine; + private int furthestColumn; + private String furthestExpected; /** * Enable or disable packrat memoization. @@ -211,6 +217,10 @@ private void init(String input) { this.column = 1; this.cache = packratEnabled ? new HashMap<>() : null; this.captures = new HashMap<>(); + this.furthestPos = 0; + this.furthestLine = 1; + this.furthestColumn = 1; + this.furthestExpected = ""; } private boolean isAtEnd() { @@ -248,6 +258,17 @@ private long cacheKey(int ruleId, int position) { return ((long) ruleId << 32) | position; } + private void trackFailure(String expected) { + if (pos > furthestPos) { + furthestPos = pos; + furthestLine = line; + furthestColumn = column; + furthestExpected = expected; + } else if (pos == furthestPos && !furthestExpected.contains(expected)) { + furthestExpected = furthestExpected.isEmpty() ? 
expected : furthestExpected + " or " + expected; + } + } + """); } @@ -266,12 +287,17 @@ public Result parse(String input) { init(input); var result = parse_%s(); if (result.isFailure()) { - return Result.failure(new ParseError(line, column, "expected " + result.expected.orElse("valid input"))); + var errorLine = furthestPos > 0 ? furthestLine : line; + var errorColumn = furthestPos > 0 ? furthestColumn : column; + var expected = !furthestExpected.isEmpty() ? furthestExpected : result.expected.or("valid input"); + return Result.failure(new ParseError(errorLine, errorColumn, "expected " + expected)); } if (!isAtEnd()) { - return Result.failure(new ParseError(line, column, "unexpected input")); + var errorLine = furthestPos > 0 ? furthestLine : line; + var errorColumn = furthestPos > 0 ? furthestColumn : column; + return Result.failure(new ParseError(errorLine, errorColumn, "unexpected input")); } - return Result.success(result.value.orElse(null)); + return Result.success(result.value.or(null)); } """.formatted(sanitize(startRuleName))); @@ -447,8 +473,10 @@ private void generateExpressionCode(StringBuilder sb, .append(" = false;\n"); int i = 0; for (var elem : seq.elements()) { - sb.append(pad) - .append("skipWhitespace();\n"); + if (!inWhitespaceRuleGeneration) { + sb.append(pad) + .append("skipWhitespace();\n"); + } generateExpressionCode(sb, elem, "elem" + id + "_" + i, indent, counter); // Check for cut failure propagation sb.append(pad) @@ -655,8 +683,10 @@ private void generateExpressionCode(StringBuilder sb, .append(" int ") .append(beforePos) .append("Column = column;\n"); - sb.append(pad) - .append(" skipWhitespace();\n"); + if (!inWhitespaceRuleGeneration) { + sb.append(pad) + .append(" skipWhitespace();\n"); + } generateExpressionCode(sb, zom.expression(), zomElem, indent + 1, counter); sb.append(pad) .append(" if (") @@ -712,8 +742,10 @@ private void generateExpressionCode(StringBuilder sb, .append(" int ") .append(beforePos) .append("Column = 
column;\n"); - sb.append(pad) - .append(" skipWhitespace();\n"); + if (!inWhitespaceRuleGeneration) { + sb.append(pad) + .append(" skipWhitespace();\n"); + } generateExpressionCode(sb, oom.expression(), oomElem, indent + 2, counter); sb.append(pad) .append(" if (") @@ -829,10 +861,12 @@ private void generateExpressionCode(StringBuilder sb, .append(" int ") .append(beforePos) .append("Column = column;\n"); - sb.append(pad) - .append(" if (") - .append(repCount) - .append(" > 0) skipWhitespace();\n"); + if (!inWhitespaceRuleGeneration) { + sb.append(pad) + .append(" if (") + .append(repCount) + .append(" > 0) skipWhitespace();\n"); + } generateExpressionCode(sb, rep.expression(), repElem, indent + 1, counter); sb.append(pad) .append(" if (") @@ -1082,12 +1116,14 @@ private void skipWhitespace() { sb.append(" while (!isAtEnd()) {\n"); sb.append(" int wsBeforePos = pos;\n"); int[] wsCounter = {0}; + inWhitespaceRuleGeneration = true; generateExpressionCode(sb, grammar.whitespace() .unwrap(), "wsResult", 3, wsCounter); + inWhitespaceRuleGeneration = false; sb.append(" if (wsResult.isFailure() || pos == wsBeforePos) break;\n"); sb.append(" }\n"); } @@ -1096,6 +1132,7 @@ private void skipWhitespace() { private ParseResult matchLiteral(String text, boolean caseInsensitive) { if (remaining() < text.length()) { + trackFailure("'" + text + "'"); return ParseResult.failure("'" + text + "'"); } for (int i = 0; i < text.length(); i++) { @@ -1103,10 +1140,12 @@ private ParseResult matchLiteral(String text, boolean caseInsensitive) { char actual = peek(i); if (caseInsensitive) { if (Character.toLowerCase(expected) != Character.toLowerCase(actual)) { + trackFailure("'" + text + "'"); return ParseResult.failure("'" + text + "'"); } } else { if (expected != actual) { + trackFailure("'" + text + "'"); return ParseResult.failure("'" + text + "'"); } } @@ -1127,6 +1166,7 @@ private ParseResult matchDictionary(List words, boolean caseInsensitive) } } if (longestMatch == null) { + 
trackFailure("dictionary word"); return ParseResult.failure("dictionary word"); } for (int i = 0; i < longestLen; i++) { @@ -1141,12 +1181,14 @@ private ParseResult matchDictionary(List words, boolean caseInsensitive) private ParseResult matchCharClass(String pattern, boolean negated, boolean caseInsensitive) { if (isAtEnd()) { + trackFailure("[" + (negated ? "^" : "") + pattern + "]"); return ParseResult.failure("character class"); } char c = peek(); boolean matches = matchesPattern(c, pattern, caseInsensitive); if (negated) matches = !matches; if (!matches) { + trackFailure("[" + (negated ? "^" : "") + pattern + "]"); return ParseResult.failure("character class"); } advance(); @@ -1159,6 +1201,7 @@ private ParseResult matchCharClass(String pattern, boolean negated, boolean case private ParseResult matchAny() { if (isAtEnd()) { + trackFailure("any character"); return ParseResult.failure("any character"); } char c = advance(); @@ -1652,20 +1695,20 @@ public String formatSimple() { } public record ParseResultWithDiagnostics( - CstNode node, + Option node, List diagnostics, String source ) { public static ParseResultWithDiagnostics success(CstNode node, String source) { - return new ParseResultWithDiagnostics(node, List.of(), source); + return new ParseResultWithDiagnostics(Option.some(node), List.of(), source); } - public static ParseResultWithDiagnostics withErrors(CstNode node, List diagnostics, String source) { + public static ParseResultWithDiagnostics withErrors(Option node, List diagnostics, String source) { return new ParseResultWithDiagnostics(node, List.copyOf(diagnostics), source); } public boolean isSuccess() { - return node != null && diagnostics.isEmpty(); + return node.isPresent() && diagnostics.isEmpty(); } public boolean hasErrors() { @@ -1673,7 +1716,7 @@ public boolean hasErrors() { } public boolean hasNode() { - return node != null; + return node.isPresent(); } public String formatDiagnostics(String filename) { @@ -1720,6 +1763,8 @@ private void 
generateCstParseContext(StringBuilder sb) { private Map captures; private boolean inTokenBoundary; private boolean packratEnabled = true; + private Option furthestFailure; + private Option furthestExpected; /** * Enable or disable packrat memoization. @@ -1732,8 +1777,6 @@ public void setPackratEnabled(boolean enabled) { if (errorReporting == ErrorReporting.ADVANCED) { sb.append(""" private List diagnostics; - private SourceLocation furthestFailure; - private String furthestExpected; """); } sb.append(""" @@ -1746,12 +1789,12 @@ private void init(String input) { this.cache = packratEnabled ? new HashMap<>() : null; this.captures = new HashMap<>(); this.inTokenBoundary = false; + this.furthestFailure = Option.none(); + this.furthestExpected = Option.none(); """); if (errorReporting == ErrorReporting.ADVANCED) { sb.append(""" this.diagnostics = new ArrayList<>(); - this.furthestFailure = null; - this.furthestExpected = null; """); } sb.append(""" @@ -1802,17 +1845,19 @@ private void restoreLocation(SourceLocation loc) { this.column = loc.column(); } + private void trackFailure(String expected) { + var loc = location(); + if (furthestFailure.isEmpty() || loc.offset() > furthestFailure.unwrap().offset()) { + furthestFailure = Option.some(loc); + furthestExpected = Option.some(expected); + } else if (loc.offset() == furthestFailure.unwrap().offset() && !furthestExpected.or("").contains(expected)) { + furthestExpected = Option.some(furthestExpected.or("").isEmpty() ? 
expected : furthestExpected.or("") + " or " + expected); + } + } + """); if (errorReporting == ErrorReporting.ADVANCED) { sb.append(""" - private void trackFailure(String expected) { - var loc = location(); - if (furthestFailure == null || loc.offset() > furthestFailure.offset()) { - furthestFailure = loc; - furthestExpected = expected; - } - } - private SourceSpan skipToRecoveryPoint() { var start = location(); while (!isAtEnd()) { @@ -1854,11 +1899,14 @@ public Result parse(String input) { var leadingTrivia = skipWhitespace(); var result = parse_%s(leadingTrivia); if (result.isFailure()) { - return Result.failure(new ParseError(location(), "expected " + result.expected.or("valid input"))); + var errorLoc = furthestFailure.or(location()); + var expected = furthestExpected.filter(s -> !s.isEmpty()).or(result.expected.or("valid input")); + return Result.failure(new ParseError(errorLoc, "expected " + expected)); } var trailingTrivia = skipWhitespace(); // Capture trailing trivia if (!isAtEnd()) { - return Result.failure(new ParseError(location(), "unexpected input")); + var errorLoc = furthestFailure.or(location()); + return Result.failure(new ParseError(errorLoc, "unexpected input")); } // Attach trailing trivia to root node var rootNode = attachTrailingTrivia(result.node.unwrap(), trailingTrivia); @@ -1900,39 +1948,39 @@ public ParseResultWithDiagnostics parseWithDiagnostics(String input) { if (result.isFailure()) { // Record the failure and attempt recovery - var errorLoc = furthestFailure != null ? furthestFailure : location(); + var errorLoc = furthestFailure.or(location()); var errorSpan = SourceSpan.of(errorLoc, errorLoc); - addDiagnostic("expected " + (furthestExpected != null ? 
furthestExpected : result.expected.or("valid input")), errorSpan); + var expected = furthestExpected.filter(s -> !s.isEmpty()).or(result.expected.or("valid input")); + addDiagnostic("expected " + expected, errorSpan); // Skip to recovery point and try to continue var skippedSpan = skipToRecoveryPoint(); if (skippedSpan.length() > 0) { var skippedText = skippedSpan.extract(input); - var expected = furthestExpected != null ? furthestExpected : result.expected.or("valid input"); var errorNode = new CstNode.Error(skippedSpan, skippedText, expected, leadingTrivia, List.of()); - return ParseResultWithDiagnostics.withErrors(errorNode, diagnostics, input); + return ParseResultWithDiagnostics.withErrors(Option.some(errorNode), diagnostics, input); } - return ParseResultWithDiagnostics.withErrors(null, diagnostics, input); + return ParseResultWithDiagnostics.withErrors(Option.none(), diagnostics, input); } var trailingTrivia = skipWhitespace(); if (!isAtEnd()) { - // Unexpected trailing input - var errorStart = location(); + // Unexpected trailing input - use furthest failure position for error + var errorLoc = furthestFailure.or(location()); var skippedSpan = skipToRecoveryPoint(); - var skippedText = skippedSpan.extract(input); - addDiagnostic("unexpected input", skippedSpan, "expected end of input"); + var errorSpan = SourceSpan.of(errorLoc, skippedSpan.end()); + addDiagnostic("unexpected input", errorSpan, "expected end of input"); // Attach error node to result var rootNode = attachTrailingTrivia(result.node.unwrap(), trailingTrivia); - return ParseResultWithDiagnostics.withErrors(rootNode, diagnostics, input); + return ParseResultWithDiagnostics.withErrors(Option.some(rootNode), diagnostics, input); } var rootNode = attachTrailingTrivia(result.node.unwrap(), trailingTrivia); if (diagnostics.isEmpty()) { return ParseResultWithDiagnostics.success(rootNode, input); } - return ParseResultWithDiagnostics.withErrors(rootNode, diagnostics, input); + return 
ParseResultWithDiagnostics.withErrors(Option.some(rootNode), diagnostics, input); } """.formatted(sanitizedName)); @@ -3016,6 +3064,7 @@ private CstParseResult matchDictionaryCst(List words, boolean caseInsens sb.append(""" private CstParseResult matchLiteralCst(String text, boolean caseInsensitive) { if (remaining() < text.length()) { + trackFailure("'" + text + "'"); return CstParseResult.failure("'" + text + "'"); } var startLoc = location(); @@ -3024,10 +3073,12 @@ private CstParseResult matchLiteralCst(String text, boolean caseInsensitive) { char actual = peek(i); if (caseInsensitive) { if (Character.toLowerCase(expected) != Character.toLowerCase(actual)) { + trackFailure("'" + text + "'"); return CstParseResult.failure("'" + text + "'"); } } else { if (expected != actual) { + trackFailure("'" + text + "'"); return CstParseResult.failure("'" + text + "'"); } } @@ -3050,6 +3101,7 @@ private CstParseResult matchDictionaryCst(List words, boolean caseInsens } } if (longestMatch == null) { + trackFailure("dictionary word"); return CstParseResult.failure("dictionary word"); } var startLoc = location(); @@ -3064,91 +3116,48 @@ private CstParseResult matchDictionaryCst(List words, boolean caseInsens """); } sb.append(MATCHES_WORD_METHOD); - // Generate matchCharClassCst and matchAnyCst with trackFailure for ADVANCED mode - if (errorReporting == ErrorReporting.ADVANCED) { - sb.append(""" + // trackFailure is now always available (moved out of ADVANCED-only block) + sb.append(""" - private CstParseResult matchCharClassCst(String pattern, boolean negated, boolean caseInsensitive) { - if (isAtEnd()) { - trackFailure("character class"); - return CstParseResult.failure("character class"); - } - var startLoc = location(); - char c = peek(); - boolean matches = matchesPattern(c, pattern, caseInsensitive); - if (negated) matches = !matches; - if (!matches) { - trackFailure("character class"); - return CstParseResult.failure("character class"); - } - advance(); - var text = 
String.valueOf(c); - var span = SourceSpan.of(startLoc, location()); - var node = new CstNode.Terminal(span, RULE_PEG_CHAR_CLASS, text, List.of(), List.of()); - return CstParseResult.success(node, text, location()); + private CstParseResult matchCharClassCst(String pattern, boolean negated, boolean caseInsensitive) { + if (isAtEnd()) { + trackFailure("[" + (negated ? "^" : "") + pattern + "]"); + return CstParseResult.failure("character class"); } - - """); - }else { - sb.append(""" - - private CstParseResult matchCharClassCst(String pattern, boolean negated, boolean caseInsensitive) { - if (isAtEnd()) { - return CstParseResult.failure("character class"); - } - var startLoc = location(); - char c = peek(); - boolean matches = matchesPattern(c, pattern, caseInsensitive); - if (negated) matches = !matches; - if (!matches) { - return CstParseResult.failure("character class"); - } - advance(); - var text = String.valueOf(c); - var span = SourceSpan.of(startLoc, location()); - var node = new CstNode.Terminal(span, RULE_PEG_CHAR_CLASS, text, List.of(), List.of()); - return CstParseResult.success(node, text, location()); + var startLoc = location(); + char c = peek(); + boolean matches = matchesPattern(c, pattern, caseInsensitive); + if (negated) matches = !matches; + if (!matches) { + trackFailure("[" + (negated ? 
"^" : "") + pattern + "]"); + return CstParseResult.failure("character class"); } + advance(); + var text = String.valueOf(c); + var span = SourceSpan.of(startLoc, location()); + var node = new CstNode.Terminal(span, RULE_PEG_CHAR_CLASS, text, List.of(), List.of()); + return CstParseResult.success(node, text, location()); + } - """); - } + """); sb.append(MATCHES_PATTERN_METHOD); - if (errorReporting == ErrorReporting.ADVANCED) { - sb.append(""" - - private CstParseResult matchAnyCst() { - if (isAtEnd()) { - trackFailure("any character"); - return CstParseResult.failure("any character"); - } - var startLoc = location(); - char c = advance(); - var text = String.valueOf(c); - var span = SourceSpan.of(startLoc, location()); - var node = new CstNode.Terminal(span, RULE_PEG_ANY, text, List.of(), List.of()); - return CstParseResult.success(node, text, location()); - } - - // === CST Parse Result === - """); - }else { - sb.append(""" + sb.append(""" - private CstParseResult matchAnyCst() { - if (isAtEnd()) { - return CstParseResult.failure("any character"); - } - var startLoc = location(); - char c = advance(); - var text = String.valueOf(c); - var span = SourceSpan.of(startLoc, location()); - var node = new CstNode.Terminal(span, RULE_PEG_ANY, text, List.of(), List.of()); - return CstParseResult.success(node, text, location()); + private CstParseResult matchAnyCst() { + if (isAtEnd()) { + trackFailure("any character"); + return CstParseResult.failure("any character"); } + var startLoc = location(); + char c = advance(); + var text = String.valueOf(c); + var span = SourceSpan.of(startLoc, location()); + var node = new CstNode.Terminal(span, RULE_PEG_ANY, text, List.of(), List.of()); + return CstParseResult.success(node, text, location()); + } - // === CST Parse Result === - """); - } + // === CST Parse Result === + """); sb.append(""" private static final class CstParseResult { diff --git a/src/test/java/org/pragmatica/peg/examples/Java25GrammarExample.java 
b/src/test/java/org/pragmatica/peg/examples/Java25GrammarExample.java index e013a06..967a052 100644 --- a/src/test/java/org/pragmatica/peg/examples/Java25GrammarExample.java +++ b/src/test/java/org/pragmatica/peg/examples/Java25GrammarExample.java @@ -33,34 +33,44 @@ class Java25GrammarExample { // Java 25 Grammar (PEG-compatible adaptation of JLS Chapter 19) + // Cut operator (^) commits to current alternative, preventing backtracking. + // Used after discriminating keywords to improve error messages and performance. static final String JAVA_GRAMMAR = """ # === Compilation Units (JLS 7.3-7.8) === CompilationUnit <- ModuleDecl / OrdinaryUnit OrdinaryUnit <- PackageDecl? ImportDecl* TypeDecl* - PackageDecl <- Annotation* 'package' QualifiedName ';' - ImportDecl <- 'import' 'module' QualifiedName ';' / 'import' 'static'? QualifiedName ('.' '*')? ';' + PackageDecl <- Annotation* 'package' ^ QualifiedName ';' + ImportDecl <- 'import' ^ ('module' QualifiedName ';' / 'static'? QualifiedName ('.' '*')? ';') # === Module Declarations (JLS 7.7) === - ModuleDecl <- Annotation* 'open'? 'module' QualifiedName '{' ModuleDirective* '}' + ModuleDecl <- Annotation* 'open'? 'module' ^ QualifiedName '{' ModuleDirective* '}' ModuleDirective <- RequiresDirective / ExportsDirective / OpensDirective / UsesDirective / ProvidesDirective - RequiresDirective <- 'requires' ('transitive' / 'static')* QualifiedName ';' - ExportsDirective <- 'exports' QualifiedName ('to' QualifiedName (',' QualifiedName)*)? ';' - OpensDirective <- 'opens' QualifiedName ('to' QualifiedName (',' QualifiedName)*)? ';' - UsesDirective <- 'uses' QualifiedName ';' - ProvidesDirective <- 'provides' QualifiedName 'with' QualifiedName (',' QualifiedName)* ';' + RequiresDirective <- 'requires' ^ ('transitive' / 'static')* QualifiedName ';' + ExportsDirective <- 'exports' ^ QualifiedName ('to' QualifiedName (',' QualifiedName)*)? ';' + OpensDirective <- 'opens' ^ QualifiedName ('to' QualifiedName (',' QualifiedName)*)? 
';' + UsesDirective <- 'uses' ^ QualifiedName ';' + ProvidesDirective <- 'provides' ^ QualifiedName 'with' QualifiedName (',' QualifiedName)* ';' TypeDecl <- Annotation* Modifier* TypeKind TypeKind <- ClassDecl / InterfaceDecl / EnumDecl / RecordDecl / AnnotationDecl - ClassDecl <- 'class' Identifier TypeParams? ('extends' Type)? ImplementsClause? PermitsClause? ClassBody - InterfaceDecl <- 'interface' Identifier TypeParams? ('extends' TypeList)? PermitsClause? ClassBody - AnnotationDecl <- '@' 'interface' Identifier AnnotationBody + ClassDecl <- ClassKW ^ Identifier TypeParams? ('extends' Type)? ImplementsClause? PermitsClause? ClassBody + InterfaceDecl <- InterfaceKW ^ Identifier TypeParams? ('extends' TypeList)? PermitsClause? ClassBody + AnnotationDecl <- '@' InterfaceKW ^ Identifier AnnotationBody + # Type declaration keywords with word boundary (prevents 'class' matching prefix of 'className') + ClassKW <- < 'class' ![a-zA-Z0-9_$] > + InterfaceKW <- < 'interface' ![a-zA-Z0-9_$] > AnnotationBody <- '{' AnnotationMember* '}' AnnotationMember <- Annotation* Modifier* (AnnotationElemDecl / FieldDecl / TypeKind) / ';' AnnotationElemDecl <- Type Identifier '(' ')' ('default' AnnotationElem)? ';' - EnumDecl <- 'enum' Identifier ImplementsClause? EnumBody - RecordDecl <- 'record' Identifier TypeParams? '(' RecordComponents? ')' ImplementsClause? RecordBody - ImplementsClause <- 'implements' TypeList - PermitsClause <- 'permits' TypeList + EnumDecl <- EnumKW ^ Identifier ImplementsClause? EnumBody + # Lookahead ensures this is a record declaration (record Name(...)) not: + # - method call: record(...) + # - field/variable of type 'record': record field; + RecordDecl <- RecordKW &(Identifier TypeParams? '(') Identifier ^ TypeParams? '(' RecordComponents? ')' ImplementsClause? 
RecordBody + EnumKW <- < 'enum' ![a-zA-Z0-9_$] > + RecordKW <- < 'record' ![a-zA-Z0-9_$] > + ImplementsClause <- 'implements' ^ TypeList + PermitsClause <- 'permits' ^ TypeList TypeList <- Type (',' Type)* TypeParams <- '<' TypeParam (',' TypeParam)* '>' TypeParam <- Identifier ('extends' Type ('&' Type)*)? @@ -82,90 +92,103 @@ class Java25GrammarExample { VarDecls <- VarDecl (',' VarDecl)* VarDecl <- Identifier Dims? ('=' VarInit)? VarInit <- '{' (VarInit (',' VarInit)* ','?)? '}' / Expr - MethodDecl <- TypeParams? Type Identifier '(' Params? ')' Dims? Throws? (Block / ';') + MethodDecl <- TypeParams? Type Identifier '(' ^ Params? ')' Dims? Throws? (Block / ';') Params <- Param (',' Param)* Param <- Annotation* Modifier* Type '...'? Identifier Dims? - Throws <- 'throws' TypeList - ConstructorDecl <- TypeParams? Identifier '(' Params? ')' Throws? Block + Throws <- 'throws' ^ TypeList + ConstructorDecl <- TypeParams? Identifier '(' ^ Params? ')' Throws? Block # === Blocks and Statements (JLS 14) === Block <- '{' BlockStmt* '}' BlockStmt <- LocalVar / LocalTypeDecl / Stmt LocalTypeDecl <- Annotation* Modifier* TypeKind LocalVar <- Modifier* LocalVarType VarDecls ';' - LocalVarType <- 'var' / Type + LocalVarType <- < 'var' ![a-zA-Z0-9_$] > / Type # Statement keywords use helper rules to combine keyword + word boundary as single token # This prevents the parser from skipping whitespace before the boundary check + # Cut operator after keyword rules commits to that statement type Stmt <- Block - / 'if' '(' Expr ')' Stmt ('else' Stmt)? - / 'while' '(' Expr ')' Stmt - / 'for' '(' ForCtrl ')' Stmt - / 'do' Stmt 'while' '(' Expr ')' ';' - / 'try' ResourceSpec? Block Catch* Finally? - / 'switch' '(' Expr ')' SwitchBlock + / IfKW ^ '(' Expr ')' Stmt ('else' Stmt)? + / WhileKW ^ '(' Expr ')' Stmt + / ForKW ^ '(' ForCtrl ')' Stmt + / DoKW ^ Stmt 'while' '(' Expr ')' ';' + / TryKW ^ ResourceSpec? Block Catch* Finally? + / SwitchKW ^ '(' Expr ')' SwitchBlock / ReturnKW Expr? 
';' / ThrowKW Expr ';' / BreakKW Identifier? ';' / ContinueKW Identifier? ';' / AssertKW Expr (':' Expr)? ';' - / 'synchronized' '(' Expr ')' Block + / SynchronizedKW ^ '(' Expr ')' Block / YieldKW Expr ';' / Identifier ':' Stmt / Expr ';' / ';' # Helper rules: keyword with word boundary INSIDE token (prevents whitespace skip before boundary check) + IfKW <- < 'if' ![a-zA-Z0-9_$] > + WhileKW <- < 'while' ![a-zA-Z0-9_$] > + ForKW <- < 'for' ![a-zA-Z0-9_$] > + DoKW <- < 'do' ![a-zA-Z0-9_$] > + TryKW <- < 'try' ![a-zA-Z0-9_$] > + SwitchKW <- < 'switch' ![a-zA-Z0-9_$] > + SynchronizedKW <- < 'synchronized' ![a-zA-Z0-9_$] > ReturnKW <- < 'return' ![a-zA-Z0-9_$] > ThrowKW <- < 'throw' ![a-zA-Z0-9_$] > BreakKW <- < 'break' ![a-zA-Z0-9_$] > ContinueKW <- < 'continue' ![a-zA-Z0-9_$] > AssertKW <- < 'assert' ![a-zA-Z0-9_$] > YieldKW <- < 'yield' ![a-zA-Z0-9_$] > + CatchKW <- < 'catch' ![a-zA-Z0-9_$] > + FinallyKW <- < 'finally' ![a-zA-Z0-9_$] > + WhenKW <- < 'when' ![a-zA-Z0-9_$] > ForCtrl <- ForInit? ';' Expr? ';' ExprList? / LocalVarType Identifier ':' Expr ForInit <- LocalVarNoSemi / ExprList LocalVarNoSemi <- Modifier* LocalVarType VarDecls ResourceSpec <- '(' Resource (';' Resource)* ';'? ')' Resource <- Modifier* LocalVarType Identifier '=' Expr / QualifiedName - Catch <- 'catch' '(' Modifier* Type ('|' Type)* Identifier ')' Block - Finally <- 'finally' Block + Catch <- CatchKW ^ '(' Modifier* Type ('|' Type)* Identifier ')' Block + Finally <- FinallyKW ^ Block SwitchBlock <- '{' SwitchRule* '}' - SwitchRule <- SwitchLabel '->' (Expr ';' / Block / 'throw' Expr ';') / SwitchLabel ':' BlockStmt* + SwitchRule <- SwitchLabel '->' (Expr ';' / Block / ThrowKW Expr ';') / SwitchLabel ':' BlockStmt* # === Switch Labels and Patterns (JLS 14.11, 14.30) === - SwitchLabel <- 'case' 'null' (',' 'default')? / 'case' CaseItem (',' CaseItem)* Guard? / 'default' + SwitchLabel <- 'case' ^ ('null' (',' 'default')? / CaseItem (',' CaseItem)* Guard?) 
/ 'default' CaseItem <- Pattern / QualifiedName &('->' / ',' / ':' / 'when') / Expr Pattern <- RecordPattern / TypePattern TypePattern <- &(LocalVarType Identifier) LocalVarType Identifier / '_' RecordPattern <- RefType '(' PatternList? ')' PatternList <- Pattern (',' Pattern)* - Guard <- 'when' Expr + Guard <- WhenKW Expr Expr <- Assignment Assignment <- Ternary (('=' / '>>>=' / '>>=' / '<<=' / '+=' / '-=' / '*=' / '/=' / '%=' / '&=' / '|=' / '^=') Assignment)? Ternary <- LogOr ('?' Expr ':' Ternary)? LogOr <- LogAnd ('||' LogAnd)* LogAnd <- BitOr ('&&' BitOr)* - BitOr <- BitXor (!'||' '|' BitXor)* - BitXor <- BitAnd ('^' BitAnd)* - BitAnd <- Equality (!'&&' '&' Equality)* + BitOr <- BitXor (!'||' !'|=' '|' BitXor)* + BitXor <- BitAnd (!'^=' '^' BitAnd)* + BitAnd <- Equality (!'&&' !'&=' '&' Equality)* Equality <- Relational (('==' / '!=') Relational)* Relational <- Shift (('<=' / '>=' / '<' / '>') Shift / 'instanceof' (Pattern / Type))? - Shift <- Additive (('<<' / '>>>' / '>>') Additive)* - Additive <- Multiplicative (('+' / '-' !'>') Multiplicative)* - Multiplicative <- Unary (('*' / '/' / '%') Unary)* + Shift <- Additive ((!'<<=' '<<' / !'>>>=' '>>>' / !'>>=' !'>>>=' '>>') Additive)* + Additive <- Multiplicative ((!'+=' '+' / !'-=' !'->' '-') Multiplicative)* + Multiplicative <- Unary ((!'*=' '*' / !'/=' '/' / !'%=' '%') Unary)* Unary <- ('++' / '--' / '+' / '-' / '!' / '~') Unary / '(' Type ('&' Type)* ')' Unary / Postfix Postfix <- Primary PostOp* PostOp <- '.' TypeArgs? Identifier ('(' Args? ')')? / '.' 'class' / '.' 'this' / '[' Expr ']' / '(' Args? ')' / '++' / '--' / '::' TypeArgs? (Identifier / 'new') - Primary <- Literal / 'this' / 'super' / 'new' TypeArgs? Type ('(' Args? ')' ClassBody? / Dims? VarInit?) / 'switch' '(' Expr ')' SwitchBlock / Lambda / '(' Expr ')' / QualifiedName + Primary <- Literal / < 'this' ![a-zA-Z0-9_$] > / < 'super' ![a-zA-Z0-9_$] > / < 'new' ![a-zA-Z0-9_$] > TypeArgs? Type ('(' Args? ')' ClassBody? / Dims? VarInit?) 
/ SwitchKW '(' Expr ')' SwitchBlock / Lambda / '(' Expr ')' / TypeExpr / QualifiedName + TypeExpr <- Type ('.' < 'class' > / '::' TypeArgs? (< 'new' > / Identifier)) Lambda <- LambdaParams '->' (Expr / Block) LambdaParams <- Identifier / '_' / '(' LambdaParam? (',' LambdaParam)* ')' - LambdaParam <- Annotation* Modifier* (('var' / Type) &('...' / Identifier / '_'))? '...'? (Identifier / '_') + LambdaParam <- Annotation* Modifier* ((< 'var' ![a-zA-Z0-9_$] > / Type) &('...' / Identifier / '_'))? '...'? (Identifier / '_') Args <- Expr (',' Expr)* ExprList <- Expr (',' Expr)* # === Types with Type-Use Annotations (JSR 308 / JLS 4.11) === Type <- Annotation* (PrimType / RefType) Dims? - PrimType <- 'boolean' / 'byte' / 'short' / 'int' / 'long' / 'float' / 'double' / 'char' / 'void' - RefType <- AnnotatedTypeName ('.' AnnotatedTypeName)* + PrimType <- < ('boolean' / 'byte' / 'short' / 'int' / 'long' / 'float' / 'double' / 'char' / 'void') ![a-zA-Z0-9_$] > + # Use lookahead BEFORE consuming '.' to avoid capturing it when followed by keyword (e.g., 'HashMap.class') + RefType <- AnnotatedTypeName (&('.' ('@' / Identifier)) '.' AnnotatedTypeName)* AnnotatedTypeName <- Annotation* Identifier TypeArgs? Dims <- (Annotation* '[' ']')+ TypeArgs <- '<' '>' / '<' TypeArg (',' TypeArg)* '>' @@ -175,12 +198,12 @@ class Java25GrammarExample { QualifiedName <- Identifier (&('.' Identifier) '.' Identifier)* Identifier <- !Keyword < [a-zA-Z_$] [a-zA-Z0-9_$]* > - Modifier <- 'public' / 'protected' / 'private' / 'static' / 'final' / 'abstract' / 'native' / 'synchronized' / 'transient' / 'volatile' / 'strictfp' / 'default' / 'sealed' / 'non-sealed' + Modifier <- < ('public' / 'protected' / 'private' / 'static' / 'final' / 'abstract' / 'native' / 'synchronized' / 'transient' / 'volatile' / 'strictfp' / 'default' / 'sealed' / 'non-sealed') ![a-zA-Z0-9_$] > Annotation <- '@' !'interface' QualifiedName ('(' AnnotationValue? ')')? 
AnnotationValue <- Identifier '=' AnnotationElem (',' Identifier '=' AnnotationElem)* / AnnotationElem AnnotationElem <- Annotation / '{' (AnnotationElem (',' AnnotationElem)* ','?)? '}' / Ternary - Literal <- 'null' / 'true' / 'false' / CharLit / StringLit / NumLit + Literal <- < ('null' / 'true' / 'false') ![a-zA-Z0-9_$] > / CharLit / StringLit / NumLit CharLit <- < '\\'' ([^'\\\\] / '\\\\' .)* '\\'' > StringLit <- < '\"\"\"' (!'\"\"\"' .)* '\"\"\"' > / < '"' ([^"\\\\] / '\\\\' .)* '"' > NumLit <- < '0' [xX] [0-9a-fA-F_]+ [lL]? > / < '0' [bB] [01_]+ [lL]? > / < [0-9][0-9_]* ('.' [0-9_]*)? ([eE] [+\\-]? [0-9_]+)? [fFdDlL]? > / < '.' [0-9_]+ ([eE] [+\\-]? [0-9_]+)? [fFdD]? > diff --git a/src/test/java/org/pragmatica/peg/generator/ParserGeneratorTest.java b/src/test/java/org/pragmatica/peg/generator/ParserGeneratorTest.java index e1dd565..2100072 100644 --- a/src/test/java/org/pragmatica/peg/generator/ParserGeneratorTest.java +++ b/src/test/java/org/pragmatica/peg/generator/ParserGeneratorTest.java @@ -3,6 +3,11 @@ import org.junit.jupiter.api.Test; import org.pragmatica.peg.PegParser; +import javax.tools.ToolProvider; +import java.net.URL; +import java.net.URLClassLoader; +import java.nio.file.Files; + import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.*; @@ -341,4 +346,128 @@ void generateCst_basicMode_noErrorCaseInSwitch() { // No Error case in switch assertFalse(source.contains("case CstNode.Error")); } + + @Test + void generatedAstParser_errorReportsAtFurthestPosition() throws Exception { + // Grammar that requires 'start' followed by letters - no backtracking possible + var result = PegParser.generateParser(""" + Root <- 'start' [a-z]+ + %whitespace <- [ ]* + """, + "test.furthest", + "FurthestAstParser" + ); + + assertTrue(result.isSuccess()); + var source = result.unwrap(); + + // Compile and run + var parser = compileAndInstantiate(source, "test.furthest.FurthestAstParser"); + var parseMethod = 
parser.getClass().getMethod("parse", String.class); + + // Parse valid input - should succeed + var validResult = parseMethod.invoke(parser, "start abc"); + assertThat(validResult.toString()).contains("Success"); + + // Parse input with error after 'start' - number instead of letters + // "start 123" - error is at position 6 (the '1'), not at 1:1 + var errorResult = parseMethod.invoke(parser, "start 123"); + var errorString = errorResult.toString(); + assertThat(errorString).contains("Failure"); + // Should report error at column 7 (after "start "), not at column 1 + assertThat(errorString).doesNotContain("column=1,"); + assertThat(errorString).contains("column=7"); + } + + @Test + void generatedCstParser_errorReportsAtFurthestPosition() throws Exception { + var result = PegParser.generateCstParser(""" + Root <- 'start' [a-z]+ + %whitespace <- [ ]* + """, + "test.furthest.cst", + "FurthestCstParser", + ErrorReporting.BASIC + ); + + assertTrue(result.isSuccess()); + var source = result.unwrap(); + + // Compile and run + var parser = compileAndInstantiate(source, "test.furthest.cst.FurthestCstParser"); + var parseMethod = parser.getClass().getMethod("parse", String.class); + + // Parse input with error after 'start' + var errorResult = parseMethod.invoke(parser, "start 123"); + var errorString = errorResult.toString(); + assertThat(errorString).contains("Failure"); + // Should report error at furthest position, not at 1:1 + assertThat(errorString).doesNotContain("at 1:1"); + assertThat(errorString).contains("1:7"); + } + + @Test + void generatedCstParser_advanced_errorReportsAtFurthestPosition() throws Exception { + var result = PegParser.generateCstParser(""" + Root <- 'start' [a-z]+ + %whitespace <- [ ]* + """, + "test.furthest.adv", + "FurthestAdvParser", + ErrorReporting.ADVANCED + ); + + assertTrue(result.isSuccess()); + var source = result.unwrap(); + + // Compile and run + var parser = compileAndInstantiate(source, "test.furthest.adv.FurthestAdvParser"); + var 
parseWithDiagMethod = parser.getClass().getMethod("parseWithDiagnostics", String.class); + + // Parse input with error after 'start' + var diagResult = parseWithDiagMethod.invoke(parser, "start 123"); + + // Check diagnostics + var formatMethod = diagResult.getClass().getMethod("formatDiagnostics", String.class); + var formatted = (String) formatMethod.invoke(diagResult, "test.txt"); + + // Should report error at furthest position (line 1, column 7) + assertThat(formatted).contains("1:7"); + assertThat(formatted).doesNotContain("1:1"); + } + + // Helper to compile and instantiate a generated parser + private Object compileAndInstantiate(String source, String className) throws Exception { + var tempDir = Files.createTempDirectory("peglib-test"); + var packagePath = className.substring(0, className.lastIndexOf('.')).replace('.', '/'); + var simpleClassName = className.substring(className.lastIndexOf('.') + 1); + + var packageDir = tempDir.resolve(packagePath); + Files.createDirectories(packageDir); + + var sourceFile = packageDir.resolve(simpleClassName + ".java"); + Files.writeString(sourceFile, source); + + // Compile with error capture + var compiler = ToolProvider.getSystemJavaCompiler(); + var errStream = new java.io.ByteArrayOutputStream(); + var result = compiler.run(null, null, errStream, + "-d", tempDir.toString(), + "-cp", System.getProperty("java.class.path"), + sourceFile.toString() + ); + + if (result != 0) { + System.err.println("=== Generated source ==="); + System.err.println(source); + System.err.println("=== Compilation errors ==="); + System.err.println(errStream); + throw new RuntimeException("Compilation failed for " + className + ": " + errStream); + } + + // Load and instantiate + var classLoader = new URLClassLoader(new URL[]{tempDir.toUri().toURL()}); + var parserClass = classLoader.loadClass(className); + return parserClass.getDeclaredConstructor().newInstance(); + } } diff --git a/src/test/resources/ParseError.java 
b/src/test/resources/ParseError.java new file mode 100644 index 0000000..6ea0972 --- /dev/null +++ b/src/test/resources/ParseError.java @@ -0,0 +1,15 @@ +package test; + +class ParseError { + void validMethod() { + int x = 1; + } + + void anotherValid() { + String s = "hello"; + } + + void brokenMethod() { + int y = ; // syntax error - missing expression + } +}