diff --git a/experimental/parser/lex.go b/experimental/parser/lex.go
index 9184e9f4..144b62de 100644
--- a/experimental/parser/lex.go
+++ b/experimental/parser/lex.go
@@ -96,7 +96,9 @@ func Lex(ctx token.Context, errs *report.Report) {
 			} else {
 				text = l.SeekEOF()
 			}
+			l.Push(len("//")+len(text), token.Comment)
+
 		case r == '/' && l.Peek() == '*':
 			l.cursor++ // Skip the *.
@@ -116,7 +118,9 @@ func Lex(ctx token.Context, errs *report.Report) {
 				l.Error(ErrUnmatched{Span: l.SpanFrom(l.cursor - 2)})
 				text = l.SeekEOF()
 			}
+			l.Push(len("/*")+len(text), token.Comment)
+
 		case r == '*' && l.Peek() == '/':
 			l.cursor++ // Skip the /.
diff --git a/experimental/parser/lex_state.go b/experimental/parser/lex_state.go
index b5fc91ee..e72feae1 100644
--- a/experimental/parser/lex_state.go
+++ b/experimental/parser/lex_state.go
@@ -15,6 +15,7 @@
 package parser
 
 import (
+	"slices"
 	"strings"
 	"unicode/utf8"
@@ -31,11 +32,12 @@ type lexer struct {
 	cursor, count int
 
 	braces []token.ID
-}
 
-func (l *lexer) Push(length int, kind token.Kind) token.Token {
-	l.count++
-	return l.Stream.Push(length, kind)
+	prev token.ID // The last non-skippable token.
+
+	firstCommentSincePrev  token.ID
+	firstCommentOnSameLine bool
+	parStart, parEnd       token.ID
 }
 
 func (l *lexer) Cursor() int {
@@ -107,6 +109,192 @@ func (l *lexer) SpanFrom(start int) report.Span {
 	return l.Span(start, l.cursor)
 }
 
+func (l *lexer) Push(length int, kind token.Kind) token.Token {
+	l.count++
+	prev := l.prev.In(l.Context)
+	tok := l.Stream.Push(length, kind)
+	// NOTE: tok will have the Stream rather than l.Context as its context,
+	// which will cause issues when we call NewCursor below.
+	tok = tok.ID().In(l.Context)
+
+	// NOTE: For the purposes of attributing comments, we need to know what
+	// line certain offsets are at. Although we could track this as we advance
+	// the cursor, we instead use other methods to determine whether two
+	// tokens are on the same line. This is for a couple of reasons.
+	//
+	// 1. Getting a line number from the line index is O(log n), but we can
+	//    instead use strings.Index and friends in some places without going
+	//    quadratic.
+	//
+	// 2. Tracking lines while advancing would force us to examine every
+	//    character directly, locking us out of functions like strings.Index,
+	//    which are much more efficient than the naive for loop.
+
+	switch {
+	case tok.Kind() == token.Comment:
+		isLineComment := strings.HasPrefix(tok.Text(), "//")
+
+		if l.firstCommentSincePrev.Nil() {
+			l.firstCommentSincePrev = tok.ID()
+
+			if !prev.Nil() && l.newLinesBetween(prev, tok, 1) == 0 {
+				// The first comment is always in a paragraph by itself if
+				// there is no newline between it and the previous token.
+				l.firstCommentOnSameLine = true
+				break
+			}
+		}
+
+		if !isLineComment {
+			// Block comments cannot be made into paragraphs, so we must
+			// interrupt the current paragraph.
+			l.fuseParagraph()
+			break
+		}
+
+		// Start building up a line comment paragraph if there isn't one
+		// currently.
+		if l.parStart.Nil() {
+			l.parStart = tok.ID()
+		}
+		l.parEnd = tok.ID()
+
+	case tok.Kind() == token.Space:
+		// Note that line comments contain their trailing newlines, except
+		// for a line comment at the end of the file. Thus, seeing a newline
+		// in a space token means that we are interrupting a line comment
+		// paragraph, so we must fuse the current paragraph.
+		if strings.Contains(tok.Text(), "\n") {
+			l.fuseParagraph()
+		}
+
+	default:
+		l.fuseParagraph()
+		//nolint:dupword // False positive due to comments describing an algorithm.
+		if !l.firstCommentSincePrev.Nil() {
+			comments := token.NewCursor(l.firstCommentSincePrev.In(l.Context), tok)
+			var first, second, penultimate, last token.Token
+			for { // Don't use comments.Done() here; that tosses comment tokens.
+				next := comments.PopSkippable()
+				if next.Nil() {
+					break
+				} else if next.Kind() == token.Comment {
+					switch {
+					case first.Nil():
+						first = next
+					case second.Nil():
+						second = next
+					}
+					penultimate = last
+					last = next
+				}
+			}
+
+			// Determine whether we need to donate first to the previous
+			// token as its trailing comment.
+			var donate bool
+			switch {
+			case prev.Nil():
+				donate = false
+			case l.firstCommentOnSameLine:
+				donate = true
+			case l.newLinesBetween(prev, first, 2) < 2:
+				// Now we need to check the remaining three criteria for
+				// donation. Donate if any of the following holds:
+				//
+				// 1. There is more than one comment.
+				// 2. The current token is one of the closers ), ], or }
+				//    (but not >).
+				// 3. The line of the current token minus the end line of
+				//    the first comment is greater than one.
+				switch {
+				case !second.Nil():
+					donate = true
+				case slices.Contains([]string{")", "]", "}"}, tok.Text()):
+					donate = true
+				case l.newLinesBetween(first, tok, 2) > 1:
+					donate = true
+				}
+			}
+
+			if donate {
+				prev.Comments().SetTrailing(first)
+				first = second
+			}
+
+			// The leading comment must have precisely one newline between
+			// it and the new token.
+			if !first.Nil() && !last.Nil() && l.newLinesBetween(last, tok, 2) == 1 {
+				tok.Comments().SetLeading(last)
+				last = penultimate
+			}
+
+			// Check if we have any detached comments left. This is the case
+			// when first and last are both non-nil and first <= last. If we
+			// donated the only comment, second will have been nil, so first
+			// is now nil.
+			//
+			// If we attached the only remaining comment after donating a
+			// comment, we would have had the following value evolution for
+			// first, second, penultimate, and last:
+			//
+			//   before donate: a, b, a, b
+			//   after donate:  b, b, a, b
+			//   after attach:  b, b, a, a
+			//
+			// Thus, when we check b <= a, the check fails, and we find that
+			// we have nothing left to attach.
+			if !first.Nil() && !last.Nil() && first.ID() <= last.ID() {
+				tok.Comments().SetDetachedRange(first, last)
+			}
+
+			l.firstCommentSincePrev = 0
+			l.firstCommentOnSameLine = false
+		}
+
+		l.prev = tok.ID()
+	}
+	return tok
+}
+
+func (l *lexer) fuseParagraph() {
+	if !l.parStart.Nil() && l.parEnd != l.parStart {
+		token.Fuse(
+			l.parStart.In(l.Context),
+			l.parEnd.In(l.Context),
+		)
+	}
+	l.parStart = 0
+}
+
+// newLinesBetween counts the number of \n characters between the end of a
+// and the start of b, up to max.
+//
+// The final rune of a is included in this count, since comments may end in a
+// \n rune.
+//
+//nolint:revive,predeclared // Complains about redefining max.
+func (l *lexer) newLinesBetween(a, b token.Token, max int) int {
+	end := a.Span().End
+	if end != 0 {
+		// Account for the final rune of a.
+		end--
+	}
+
+	start := b.Span().Start
+	between := l.Text()[end:start]
+
+	var total int
+	for total < max {
+		var found bool
+		_, between, found = strings.Cut(between, "\n")
+		if !found {
+			break
+		}
+
+		total++
+	}
+	return total
+}
+
 // mustProgress returns a progress checker for this lexer.
func (l *lexer) mustProgress() mustProgress { return mustProgress{l, -1} diff --git a/experimental/parser/lex_test.go b/experimental/parser/lex_test.go index ba46f3f6..fd572be1 100644 --- a/experimental/parser/lex_test.go +++ b/experimental/parser/lex_test.go @@ -26,6 +26,7 @@ import ( "github.com/bufbuild/protocompile/experimental/report" "github.com/bufbuild/protocompile/experimental/token" "github.com/bufbuild/protocompile/internal/golden" + "github.com/bufbuild/protocompile/internal/iters" ) func TestRender(t *testing.T) { @@ -87,6 +88,22 @@ func TestRender(t *testing.T) { } } + comments := tok.Comments() + iters.Enumerate(comments.Detached())(func(i int, t token.Token) bool { + if i == 0 { + fmt.Fprintf(&tsv, "\t\tdetached:%v", t.ID()) + } else { + fmt.Fprintf(&tsv, ",%v", t.ID()) + } + return true + }) + if leading := comments.Leading(); !leading.Nil() { + fmt.Fprintf(&tsv, "\t\tleading:%v", leading.ID()) + } + if trailing := comments.Trailing(); !trailing.Nil() { + fmt.Fprintf(&tsv, "\t\ttrailing:%v", trailing.ID()) + } + tsv.WriteByte('\n') return true }) diff --git a/experimental/parser/testdata/lexer/comments/attribution.proto b/experimental/parser/testdata/lexer/comments/attribution.proto new file mode 100644 index 00000000..709ce9d6 --- /dev/null +++ b/experimental/parser/testdata/lexer/comments/attribution.proto @@ -0,0 +1,35 @@ +// This, as expected, is a leading comment for Foo. +message Foo { + // This is the TRAILING comment for Foo. (It is NOT + // a detached comment for baz.) + + // leading comment for baz + string baz = 1; + // trailing comment for baz +} +// This is NOT a trailing comment. It's also not considered +// a detached comment for Bar. It is discarded. + +// This IS a detached comment for Bar. + +// A leading comment for Bar. +message Bar { +} + +string name = 1; // trailing comment for name +// leading comment for id +uint64 id = 2; + +previousToken // this comment +// won't get merged into a +// group with these two lines +/* block comments */ /* are always their own groups */ // line comments +// can usually get joined into +// groups with adjacent lines + + // empty lines separate groups +// indentation does not impact grouping +/* a single block + * comment can span lines + */ +currentToken \ No newline at end of file diff --git a/experimental/parser/testdata/lexer/comments/attribution.proto.tokens.tsv b/experimental/parser/testdata/lexer/comments/attribution.proto.tokens.tsv new file mode 100644 index 00000000..7e9c7886 --- /dev/null +++ b/experimental/parser/testdata/lexer/comments/attribution.proto.tokens.tsv @@ -0,0 +1,78 @@ +# kind offsets linecol text +0 Comment 000:052 001:001 "// This, as expected, is a leading comment for Foo.\n" +1 Ident 052:059 002:001 "message" leading:Token(0) +2 Space 059:060 002:008 " " +3 Ident 060:063 002:009 "Foo" +4 Space 063:064 002:012 " " +5 Punct 064:233 002:013 "{" close:Token(23) trailing:Token(7) +6 Space 065:068 002:014 "\n " +7 Comment 068:154 003:003 "// This is the TRAILING comment for Foo. 
(It is NOT\n" close:Token(9) +8 Space 120:122 004:001 " " +9 Comment 068:154 003:003 "// a detached comment for baz.)\n" open:Token(7) +10 Space 154:157 005:001 "\n " +11 Comment 157:184 006:003 "// leading comment for baz\n" +12 Space 184:186 007:001 " " +13 Ident 186:192 007:003 "string" leading:Token(11) +14 Space 192:193 007:009 " " +15 Ident 193:196 007:010 "baz" +16 Space 196:197 007:013 " " +17 Punct 197:198 007:014 "=" +18 Space 198:199 007:015 " " +19 Number 199:200 007:016 "1" int:1 +20 Punct 200:201 007:017 ";" trailing:Token(22) +21 Space 201:204 007:018 "\n " +22 Comment 204:232 008:003 "// trailing comment for baz\n" +23 Punct 064:233 002:013 "}" open:Token(5) trailing:Token(25) +24 Space 233:234 009:002 "\n" +25 Comment 234:342 010:001 "// This is NOT a trailing comment. It's also not considered\n" close:Token(26) +26 Comment 234:342 010:001 "// a detached comment for Bar. It is discarded.\n" open:Token(25) +27 Space 342:343 012:001 "\n" +28 Comment 343:382 013:001 "// This IS a detached comment for Bar.\n" +29 Space 382:383 014:001 "\n" +30 Comment 383:413 015:001 "// A leading comment for Bar.\n" +31 Ident 413:420 016:001 "message" detached:Token(28) leading:Token(30) +32 Space 420:421 016:008 " " +33 Ident 421:424 016:009 "Bar" +34 Space 424:425 016:012 " " +35 Punct 425:428 016:013 "{" close:Token(37) +36 Space 426:427 016:014 "\n" +37 Punct 425:428 016:013 "}" open:Token(35) +38 Space 428:430 017:002 "\n\n" +39 Ident 430:436 019:001 "string" +40 Space 436:437 019:007 " " +41 Ident 437:441 019:008 "name" +42 Space 441:442 019:012 " " +43 Punct 442:443 019:013 "=" +44 Space 443:444 019:014 " " +45 Number 444:445 019:015 "1" int:1 +46 Punct 445:446 019:016 ";" trailing:Token(48) +47 Space 446:447 019:017 " " +48 Comment 447:476 019:018 "// trailing comment for name\n" +49 Comment 476:502 020:001 "// leading comment for id\n" +50 Ident 502:508 021:001 "uint64" leading:Token(49) +51 Space 508:509 021:007 " " +52 Ident 509:511 021:008 "id" +53 Space 511:512 021:010 " " +54 Punct 512:513 021:011 "=" +55 Space 513:514 021:012 " " +56 Number 514:515 021:013 "2" int:2 +57 Punct 515:516 021:014 ";" +58 Space 516:518 021:015 "\n\n" +59 Ident 518:531 023:001 "previousToken" trailing:Token(61) +60 Space 531:532 023:014 " " +61 Comment 532:548 023:015 "// this comment\n" +62 Comment 548:605 024:001 "// won't get merged into a\n" close:Token(63) +63 Comment 548:605 024:001 "// group with these two lines\n" open:Token(62) +64 Comment 605:625 026:001 "/* block comments */" +65 Space 625:626 026:021 " " +66 Comment 626:659 026:022 "/* are always their own groups */" +67 Space 659:660 026:055 " " +68 Comment 660:738 026:056 "// line comments\n" close:Token(70) +69 Comment 677:708 027:001 "// can usually get joined into\n" +70 Comment 660:738 026:056 "// groups with adjacent lines\n" open:Token(68) +71 Space 738:742 029:001 "\n " +72 Comment 742:813 030:004 "// empty lines separate groups\n" close:Token(73) +73 Comment 742:813 030:004 "// indentation does not impact grouping\n" open:Token(72) +74 Comment 813:860 032:001 "/* a single block\n * comment can span lines\n */" +75 Space 860:861 034:004 "\n" +76 Ident 861:873 035:001 "currentToken" detached:Token(62),Token(64),Token(66),Token(68),Token(72) leading:Token(74) diff --git a/experimental/parser/testdata/lexer/comments/nested.proto.tokens.tsv b/experimental/parser/testdata/lexer/comments/nested.proto.tokens.tsv index bfbf324e..33cbf845 100644 --- a/experimental/parser/testdata/lexer/comments/nested.proto.tokens.tsv +++ 
b/experimental/parser/testdata/lexer/comments/nested.proto.tokens.tsv @@ -1,4 +1,4 @@ # kind offsets linecol text 0 Comment 000:039 001:001 "/*\n Nesting\n /* is not allowed */" 1 Space 039:040 003:025 "\n" -2 Unrecognized 040:042 004:001 "*/" +2 Unrecognized 040:042 004:001 "*/" leading:Token(0) diff --git a/experimental/parser/testdata/lexer/comments/ok.proto.tokens.tsv b/experimental/parser/testdata/lexer/comments/ok.proto.tokens.tsv index 1b577040..e402c620 100644 --- a/experimental/parser/testdata/lexer/comments/ok.proto.tokens.tsv +++ b/experimental/parser/testdata/lexer/comments/ok.proto.tokens.tsv @@ -1,6 +1,6 @@ # kind offsets linecol text -0 Comment 000:024 001:001 "// Single line comment.\n" -1 Comment 024:045 002:001 "//go:style-intrinsic\n" +0 Comment 000:045 001:001 "// Single line comment.\n" close:Token(1) +1 Comment 000:045 001:001 "//go:style-intrinsic\n" open:Token(0) 2 Comment 045:072 003:001 "/*\n Multiline comment\n*/" 3 Space 072:074 005:003 "\n\n" 4 Comment 074:077 007:001 "//\n" diff --git a/experimental/parser/testdata/lexer/smoke.proto.tokens.tsv b/experimental/parser/testdata/lexer/smoke.proto.tokens.tsv index db0142db..2c1fde87 100644 --- a/experimental/parser/testdata/lexer/smoke.proto.tokens.tsv +++ b/experimental/parser/testdata/lexer/smoke.proto.tokens.tsv @@ -1,5 +1,5 @@ # kind offsets linecol text -0 Comment 000:046 001:001 "// Copyright 2020-2024 Buf Technologies, Inc.\n" +0 Comment 000:605 001:001 "// Copyright 2020-2024 Buf Technologies, Inc.\n" close:Token(12) 1 Comment 046:049 002:001 "//\n" 2 Comment 049:116 003:001 "// Licensed under the Apache License, Version 2.0 (the \"License\");\n" 3 Comment 116:184 004:001 "// you may not use this file except in compliance with the License.\n" @@ -11,9 +11,9 @@ 9 Comment 355:424 010:001 "// distributed under the License is distributed on an \"AS IS\" BASIS,\n" 10 Comment 424:500 011:001 "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" 11 Comment 500:571 012:001 "// See the License for the specific language governing permissions and\n" -12 Comment 571:605 013:001 "// limitations under the License.\n" +12 Comment 000:605 001:001 "// limitations under the License.\n" open:Token(0) 13 Space 605:606 014:001 "\n" -14 Ident 606:612 015:001 "syntax" +14 Ident 606:612 015:001 "syntax" detached:Token(0) 15 Space 612:613 015:007 " " 16 Punct 613:614 015:008 "=" 17 Space 614:615 015:009 " " @@ -28,7 +28,7 @@ 26 Punct 641:642 017:016 ";" 27 Space 642:644 017:017 "\n\n" 28 Comment 644:670 019:001 "// This is a doc comment.\n" -29 Ident 670:677 020:001 "message" +29 Ident 670:677 020:001 "message" leading:Token(28) 30 Space 677:678 020:008 " " 31 Ident 678:681 020:009 "Foo" 32 Space 681:682 020:012 " " diff --git a/experimental/token/comment.go b/experimental/token/comment.go new file mode 100644 index 00000000..68a38000 --- /dev/null +++ b/experimental/token/comment.go @@ -0,0 +1,242 @@ +// Copyright 2020-2024 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package token
+
+import (
+	"strings"
+	"unicode"
+
+	"github.com/bufbuild/protocompile/internal/iter"
+	"github.com/bufbuild/protocompile/internal/iters"
+)
+
+// Comments provides access to manipulating a [Token]'s attached comments.
+//
+// Comments are represented as tokens. When many line comments appear
+// together, they will be grouped into a paragraph. These comment paragraphs
+// are represented as token trees. To obtain the contents of a comment,
+// consistent with how Protobuf requires them to be formatted for
+// SourceCodeInfo, use [Token.CommentLines].
+//
+// These functions are placed in a separate struct to avoid cluttering
+// [Token]'s method set.
+type Comments struct {
+	Token Token
+}
+
+type rawComments struct {
+	// start and end are the range of all detached comments.
+	start, end ID
+
+	leading, trailing ID
+}
+
+// Detached returns an iterator over this token's detached comments.
+func (c Comments) Detached() iter.Seq[Token] {
+	s := c.Token.Context().Stream()
+	raw := s.comments[c.Token.ID()]
+
+	return func(yield func(Token) bool) {
+		if raw.start.Nil() {
+			return
+		}
+
+		c := NewCursor(
+			raw.start.In(c.Token.Context()),
+			raw.end.In(c.Token.Context()),
+		)
+
+		for { // Can't use Done(), that will skip comments.
+			tok := c.PopSkippable()
+			if tok.Nil() || (tok.Kind() == Comment && !yield(tok)) {
+				break
+			}
+		}
+	}
+}
+
+// SetDetachedRange sets the range containing this token's detached comments.
+//
+// Panics if t, start, or end is synthetic, if start and end are not both nil
+// or both non-nil, or if the underlying stream has been frozen.
+func (c Comments) SetDetachedRange(start, end Token) {
+	if start.IsSynthetic() || end.IsSynthetic() {
+		panic("protocompile/token: cannot use synthetic tokens in Comments.SetDetachedRange")
+	}
+	if start.Nil() != end.Nil() {
+		panic("protocompile/token: both start/end passed to SetDetachedRange must be nil or non-nil")
+	}
+
+	c.mutate(func(raw *rawComments) {
+		raw.start = start.ID()
+		raw.end = end.ID()
+	})
+}
+
+// Leading returns this token's leading comment, if it has one.
+func (c Comments) Leading() Token {
+	s := c.Token.Context().Stream()
+	return s.comments[c.Token.ID()].leading.In(c.Token.Context())
+}
+
+// SetLeading sets the leading comment for this token.
+//
+// Panics if t is synthetic or the underlying stream has been frozen.
+func (c Comments) SetLeading(comment Token) {
+	c.mutate(func(raw *rawComments) {
+		raw.leading = comment.ID()
+	})
+}
+
+// Trailing returns this token's trailing comment, if it has one.
+func (c Comments) Trailing() Token {
+	s := c.Token.Context().Stream()
+	return s.comments[c.Token.ID()].trailing.In(c.Token.Context())
+}
+
+// SetTrailing sets the trailing comment for this token.
+//
+// Panics if t is synthetic or the underlying stream has been frozen.
+func (c Comments) SetTrailing(comment Token) {
+	c.mutate(func(raw *rawComments) {
+		raw.trailing = comment.ID()
+	})
+}
+
+// mutate performs a mutation on the comments struct for this token.
+func (c Comments) mutate(cb func(*rawComments)) {
+	s := c.Token.Context().Stream()
+	if c.Token.IsSynthetic() {
+		panic("protocompile/token: modifying comments on a synthetic token is not yet implemented")
+	}
+	s.mustNotBeFrozen()
+
+	if s.comments == nil {
+		s.comments = make(map[ID]rawComments)
+	}
+
+	// My kingdom for maps.Upsert.
+	raw := s.comments[c.Token.id]
+	cb(&raw)
+	s.comments[c.Token.id] = raw
+}
+
+type commentFormatter struct {
+	lines  []string
+	margin string
+	quirks bool
+}
+
+func (c *commentFormatter) appendComment(text string) {
+	if text, ok := strings.CutPrefix(text, "//"); ok {
+		// Pull off a trailing newline. A comment just before EOF will not
+		// have one.
+		text = strings.TrimSuffix(text, "\n")
+
+		// If not in quirks mode, we want to pull a uniform amount of comment
+		// margin off of each line in this paragraph. We take this to be the
+		// whitespace prefix of the first line.
+		if !c.quirks {
+			if c.margin == "" {
+				c.margin, text = trimSpace(text)
+			} else {
+				text = strings.TrimPrefix(text, c.margin)
+			}
+		}
+
+		c.lines = append(c.lines, text)
+		return
+	}
+
+	// This is a block comment. We need to remove the /**/, and for each
+	// line beyond the first, we must also remove leading whitespace and a
+	// *.
+	text = strings.TrimPrefix(strings.TrimSuffix(text, "*/"), "/*")
+
+	// First, append all of the lines in the comment without modification.
+	start := len(c.lines)
+	c.lines = iters.AppendSeq(c.lines, iters.SplitString(text, "\n"))
+	lines := c.lines[start:]
+
+	// When in quirks mode, all we need to do is strip whitespace and the
+	// asterisk.
+	if c.quirks {
+		for i, line := range lines[1:] {
+			_, line = trimSpace(line)
+			line = strings.TrimPrefix(line, "*")
+
+			lines[i+1] = line
+		}
+		return
+	}
+
+	// Otherwise, we only want to remove the same amount of space from each
+	// line, *and* we want to remove asterisks only if every line other than
+	// the first has them.
+	var margin string
+	haveStars := true
+	for i, line := range lines[1:] {
+		if margin == "" {
+			margin, line = trimSpace(line)
+		} else {
+			line = strings.TrimPrefix(line, margin)
+		}
+
+		if !strings.HasPrefix(line, "*") {
+			haveStars = false
+		}
+
+		lines[i+1] = line
+	}
+
+	// Now we can remove the asterisks. Note that we remove an asterisk
+	// from *all* lines, because many comment styles have a leading /**.
+	//
+	// TODO: for single-line block comments, we may want to handle Doxygen's
+	// /*< xyz */ comments in the future.
+	if haveStars {
+		var margin string
+		for i, line := range lines {
+			line = strings.TrimPrefix(line, "*")
+
+			// *Also* remove margin after the asterisk!
+ if margin == "" { + margin, line = trimSpace(line) + } else { + line = strings.TrimPrefix(line, margin) + } + + lines[i] = line + } + } +} + +func trimSpace(s string) (space, rest string) { + suffix := strings.TrimLeftFunc(s, func(r rune) bool { + return unicode.Is(unicode.Pattern_White_Space, r) + }) + return s[:len(s)-len(suffix)], suffix +} diff --git a/experimental/token/cursor.go b/experimental/token/cursor.go index cb1d4f70..7e2100f7 100644 --- a/experimental/token/cursor.go +++ b/experimental/token/cursor.go @@ -55,7 +55,11 @@ func NewCursor(start, end Token) *Cursor { panic(fmt.Sprintf("protocompile/token: passed nil token to NewCursor: %v, %v", start, end)) } if start.Context() != end.Context() { - panic("protocompile/token: passed tokens from different context to NewCursor") + panic(fmt.Sprintf( + "protocompile/token: passed tokens from different context to NewCursor: %q, %q", + start.Context().Stream().Path(), + end.Context().Stream().Path(), + )) } if start.IsSynthetic() || end.IsSynthetic() { panic("protocompile/token: passed synthetic token to NewCursor") diff --git a/experimental/token/stream.go b/experimental/token/stream.go index ed834e14..df458877 100644 --- a/experimental/token/stream.go +++ b/experimental/token/stream.go @@ -60,6 +60,8 @@ type Stream struct { // All values in this map are string, uint64, or float64. literals map[ID]any + comments map[ID]rawComments + // If true, no further mutations (except for synthetic tokens) are // permitted. frozen bool @@ -108,13 +110,18 @@ func (s *Stream) Freeze() { s.frozen = true } +// mustNotBeFrozen panics if s is frozen. +func (s *Stream) mustNotBeFrozen() { + if s.frozen { + panic("protocompile/token: attempted to mutate frozen stream") + } +} + // Push mints the next token referring to a piece of the input source. // // Panics if this stream is frozen. func (s *Stream) Push(length int, kind Kind) Token { - if s.frozen { - panic("protocompile/token: attempted to mutate frozen stream") - } + s.mustNotBeFrozen() if length < 0 || length > math.MaxInt32 { panic(fmt.Sprintf("protocompile/token: Push() called with invalid length: %d", length)) diff --git a/experimental/token/token.go b/experimental/token/token.go index f090534d..4abed24c 100644 --- a/experimental/token/token.go +++ b/experimental/token/token.go @@ -241,8 +241,8 @@ func SetValue[T Value](token Token, value T) { } stream := token.Context().Stream() - if token.nat() != nil && stream.frozen { - panic("protocompile/token: attempted to mutate frozen stream") + if !token.IsSynthetic() { + stream.mustNotBeFrozen() } if stream.literals == nil { @@ -264,8 +264,8 @@ func ClearValue(token Token) { } stream := token.Context().Stream() - if token.nat() != nil && stream.frozen { - panic("protocompile/token: attempted to mutate frozen stream") + if !token.IsSynthetic() { + stream.mustNotBeFrozen() } delete(stream.literals, token.id) @@ -279,9 +279,7 @@ func Fuse(open, close Token) { //nolint:predeclared,revive // For close. if open.Context().Stream() != close.Context().Stream() { panic("protocompile/token: attempted to fuse tokens from different streams") } - if open.Context().Stream().frozen { - panic("protocompile/token: attempted to mutate frozen stream") - } + open.Context().Stream().mustNotBeFrozen() impl1 := open.nat() if impl1 == nil { @@ -329,6 +327,47 @@ func (t Token) Children() *Cursor { } } +// Comments returns the attached comments for this token. +// +// This is a helper to aid in writing code like t.Comments().Leading(). 
+func (t Token) Comments() Comments { + return Comments{t} +} + +// CommentLines returns the lines of a comment token with punctuation removed. +// If quirks is set, this will match the quirky documented behavior of protoc. +// Otherwise, it will use a more intelligent algorithm that avoids wrecking the +// indentation of complex comments. +// +// Returns nil if t is not a comment. +func (t Token) CommentLines(quirks bool) []string { + if t.Kind() != Comment { + return nil + } + + cf := commentFormatter{quirks: quirks} + + a, b := t.StartEnd() + if a == b { + cf.appendComment(a.Text()) + return cf.lines + } + + cf.appendComment(a.Text()) + children := a.Children() + for { // Can't use Done(), that skips comments. + next := children.PopSkippable() + if next.Nil() { + break + } + if next.Kind() == Comment { + cf.appendComment(next.Text()) + } + } + cf.appendComment(b.Text()) + return cf.lines +} + // Name converts this token into its corresponding identifier name, potentially // performing normalization. // diff --git a/experimental/token/token_test.go b/experimental/token/token_test.go index 39643c0f..4d6e9e1a 100644 --- a/experimental/token/token_test.go +++ b/experimental/token/token_test.go @@ -21,6 +21,7 @@ import ( "github.com/bufbuild/protocompile/experimental/report" "github.com/bufbuild/protocompile/experimental/token" + "github.com/bufbuild/protocompile/internal/iters" ) type Context struct { @@ -74,7 +75,7 @@ func TestLeafTokens(t *testing.T) { assert.True(tok.IsLeaf()) assert.Equal(text, tok.Text()) assert.Equal(token.Ident, abc.Kind()) - tokensEq(t, collect(tok.Children().Rest())) + tokensEq(t, iters.Collect(tok.Children().Rest())) } assertIdent(abc, 0, 3, "abc") @@ -86,7 +87,7 @@ func TestLeafTokens(t *testing.T) { assert.True(jkl.IsLeaf()) assert.True(jkl.IsSynthetic()) assert.Equal("jkl", jkl.Text()) - tokensEq(t, collect(jkl.Children().Rest())) + tokensEq(t, iters.Collect(jkl.Children().Rest())) } func TestTreeTokens(t *testing.T) { @@ -133,11 +134,11 @@ func TestTreeTokens(t *testing.T) { tokenEq(t, start, open) tokenEq(t, end, close) - tokensEq(t, collect(open2.Children().Rest()), x) - tokensEq(t, collect(close2.Children().Rest()), x) + tokensEq(t, iters.Collect(open2.Children().Rest()), x) + tokensEq(t, iters.Collect(close2.Children().Rest()), x) - tokensEq(t, collect(open.Children().Rest()), def, open2, comma, ghi) - tokensEq(t, collect(close.Children().Rest()), def, open2, comma, ghi) + tokensEq(t, iters.Collect(open.Children().Rest()), def, open2, comma, ghi) + tokensEq(t, iters.Collect(close.Children().Rest()), def, open2, comma, ghi) open3 := s.NewPunct("(") close3 := s.NewPunct(")") @@ -152,7 +153,56 @@ func TestTreeTokens(t *testing.T) { tokenEq(t, start, open3) tokenEq(t, end, close3) - tokensEq(t, collect(close3.Children().Rest()), def, open2) + tokensEq(t, iters.Collect(close3.Children().Rest()), def, open2) +} + +func TestCommentText(t *testing.T) { + t.Parallel() + assert := assert.New(t) + + ctx := NewContext(` +// Foo +// Bar +// Baz + +// abcd xyz + +/* Foo + Bar + Baz +*/ + +/** Foo + * Bar + * Baz */ +`) + s := ctx.Stream() + + s.Push(1, token.Space) + + c1 := s.Push(7, token.Comment) + _ = s.Push(7, token.Comment) + c2 := s.Push(7, token.Comment) + s.Push(1, token.Space) + token.Fuse(c1, c2) + + c3 := s.Push(12, token.Comment) + s.Push(1, token.Space) + + c4 := s.Push(23, token.Comment) + s.Push(2, token.Space) + + c5 := s.Push(26, token.Comment) + + assert.Equal([]string{" Foo", " Bar", " Baz"}, c1.CommentLines(true)) + assert.Equal([]string{" abcd 
xyz"}, c3.CommentLines(true)) + assert.Equal([]string{" Foo", "Bar", "Baz", ""}, c4.CommentLines(true)) + assert.Equal([]string{"* Foo", " Bar", " Baz "}, c5.CommentLines(true)) + + assert.Equal([]string{"Foo", "Bar", "Baz"}, c1.CommentLines(false)) + assert.Equal([]string{"abcd xyz"}, c3.CommentLines(false)) + assert.Equal([]string{" Foo", "Bar", "Baz", ""}, c4.CommentLines(false)) + assert.Equal([]string{"Foo", "Bar", "Baz "}, c5.CommentLines(false)) } // tokenEq is the singular version of tokensEq. @@ -172,12 +222,3 @@ func tokensEq(t *testing.T, tokens []token.Token, expected ...token.Token) { } assert.Equal(t, b, a) } - -// collect is a polyfill for [slices.Collect]. -func collect[T any](iter func(func(T) bool)) (s []T) { - iter(func(t T) bool { - s = append(s, t) - return true - }) - return -} diff --git a/internal/iters/map.go b/internal/iters/map.go new file mode 100644 index 00000000..6068d95b --- /dev/null +++ b/internal/iters/map.go @@ -0,0 +1,32 @@ +// Copyright 2020-2024 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iters + +import "github.com/bufbuild/protocompile/internal/iter" + +// Enumerate adapts a Seq into a Seq2 that counts how many elements have been +// yielded so far. +func Enumerate[T any](seq iter.Seq[T]) iter.Seq2[int, T] { + return func(yield func(int, T) bool) { + var total int + seq(func(v T) bool { + if !yield(total, v) { + return false + } + total++ + return true + }) + } +} diff --git a/internal/iters/partition.go b/internal/iters/partition.go index 95295d5e..35a9ca4d 100644 --- a/internal/iters/partition.go +++ b/internal/iters/partition.go @@ -14,7 +14,11 @@ package iters -import "github.com/bufbuild/protocompile/internal/iter" +import ( + "strings" + + "github.com/bufbuild/protocompile/internal/iter" +) // Partition returns an iterator of subslices of s such that each yielded // slice is delimited according to delimit. Also yields the starting index of @@ -43,3 +47,24 @@ func Partition[T any](s []T, delimit func(a, b *T) bool) iter.Seq2[int, []T] { } } } + +// SplitString returns an iterator over pieces of s separated by substr. +// +// This function is equivalent to slices.Values(strings.Split(s, substr)), +// but it avoids an allocation. +func SplitString(s string, substr string) iter.Seq[string] { + return func(yield func(s string) bool) { + if s == "" { + yield(s) + return + } + + for { + before, after, ok := strings.Cut(s, substr) + if !yield(before) || !ok { + break + } + s = after + } + } +}