Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cmd/zoekt-git-index/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"log"
"os"
"path/filepath"
"runtime/debug"
"runtime/pprof"
"strings"

Expand Down Expand Up @@ -54,6 +55,16 @@ func run() int {
// Tune GOMAXPROCS to match Linux container CPU quota.
_, _ = maxprocs.Set()

// Optimize GC for batch indexing: disable percentage-based GC and rely
// on a memory limit instead. This reduces GC/madvise overhead by ~30%
// for large repos. Both can be overridden via GOMEMLIMIT / GOGC env vars.
if os.Getenv("GOMEMLIMIT") == "" {
debug.SetMemoryLimit(2 << 30) // 2 GiB default
}
if os.Getenv("GOGC") == "" {
debug.SetGCPercent(-1)
}

if *cpuProfile != "" {
f, err := os.Create(*cpuProfile)
if err != nil {
Expand Down
38 changes: 34 additions & 4 deletions gitindex/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -682,9 +682,42 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
// builder. Large blobs are skipped without reading content into memory.
// keys must correspond 1:1 (in order) with the ids passed to newCatfileReader.
// The reader is always closed when this function returns.
// contentSlab reduces per-file heap allocations by sub-slicing from a
// shared buffer. Each returned slice has its capacity capped (3-index
// slice) so appending to one file's content cannot overwrite adjacent
// data. Files larger than the slab get their own allocation.
type contentSlab struct {
buf []byte
cap int
}

func newContentSlab(slabCap int) contentSlab {
return contentSlab{
buf: make([]byte, 0, slabCap),
cap: slabCap,
}
}

// alloc returns a byte slice of length n. The caller must write into it
// immediately (the bytes are uninitialized when sourced from the slab).
func (s *contentSlab) alloc(n int) []byte {
if n > s.cap {
return make([]byte, n)
}
if len(s.buf)+n > cap(s.buf) {
s.buf = make([]byte, n, s.cap)
return s.buf[:n:n]
}
off := len(s.buf)
s.buf = s.buf[:off+n]
return s.buf[off : off+n : off+n]
}

func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]BlobLocation, opts Options, builder *index.Builder) error {
defer cr.Close()

slab := newContentSlab(16 << 20) // 16 MB per slab

for idx, key := range keys {
size, missing, excluded, err := cr.Next()
if err != nil {
Expand All @@ -707,10 +740,7 @@ func indexCatfileBlobs(cr *catfileReader, keys []fileKey, repos map[fileKey]Blob
// Skip without reading content into memory.
doc = skippedDoc(key, branches, index.SkipReasonTooLarge)
} else {
// Pre-allocate and read the full blob content in one call.
// io.ReadFull is preferred over io.LimitedReader here as it
// avoids the intermediate allocation and the size is known.
content := make([]byte, size)
content := slab.alloc(size)
if _, err := io.ReadFull(cr, content); err != nil {
return fmt.Errorf("read blob %s: %w", keyFullPath, err)
}
Expand Down
72 changes: 72 additions & 0 deletions gitindex/slab_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package gitindex

import "testing"

func TestContentSlab(t *testing.T) {
t.Run("fits in slab", func(t *testing.T) {
s := newContentSlab(1024)
b := s.alloc(100)
if len(b) != 100 {
t.Fatalf("len = %d, want 100", len(b))
}
if cap(b) != 100 {
t.Fatalf("cap = %d, want 100 (3-index slice)", cap(b))
}
})

t.Run("cap is capped so append cannot corrupt adjacent data", func(t *testing.T) {
s := newContentSlab(1024)
a := s.alloc(10)
copy(a, []byte("aaaaaaaaaa"))

b := s.alloc(10)
copy(b, []byte("bbbbbbbbbb"))

// Appending to a must not overwrite b.
a = append(a, 'X') // triggers new backing array since cap==len
if string(b) != "bbbbbbbbbb" {
t.Fatalf("adjacent data corrupted: got %q", b)
}
_ = a
})

t.Run("slab rollover", func(t *testing.T) {
s := newContentSlab(64)
a := s.alloc(60)
if len(a) != 60 || cap(a) != 60 {
t.Fatalf("a: len=%d cap=%d", len(a), cap(a))
}
// Next alloc doesn't fit in remaining 4 bytes → new slab.
b := s.alloc(10)
if len(b) != 10 || cap(b) != 10 {
t.Fatalf("b: len=%d cap=%d", len(b), cap(b))
}
// a and b should not share backing arrays.
copy(a, make([]byte, 60))
copy(b, []byte("0123456789"))
if string(b) != "0123456789" {
t.Fatal("rollover corrupted data")
}
})

t.Run("oversized allocation", func(t *testing.T) {
s := newContentSlab(64)
b := s.alloc(128)
if len(b) != 128 {
t.Fatalf("len = %d, want 128", len(b))
}
// Oversized alloc should not consume slab space.
c := s.alloc(32)
if len(c) != 32 || cap(c) != 32 {
t.Fatalf("c: len=%d cap=%d", len(c), cap(c))
}
})

t.Run("zero size", func(t *testing.T) {
s := newContentSlab(64)
b := s.alloc(0)
if len(b) != 0 {
t.Fatalf("len = %d, want 0", len(b))
}
})
}
23 changes: 19 additions & 4 deletions index/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ import (

"github.com/bmatcuk/doublestar"
"github.com/dustin/go-humanize"
"github.com/go-enry/go-enry/v2"
"github.com/rs/xid"
"golang.org/x/sys/unix"

Expand Down Expand Up @@ -625,6 +624,11 @@ func (b *Builder) Add(doc Document) error {
doc.SkipReason = skip
}

// Pre-compute file category while content is still available.
// This overlaps with catfile I/O and avoids redundant enry calls
// later in sortDocuments and ShardBuilder.Add.
DetermineFileCategory(&doc)

b.todo = append(b.todo, &doc)

if doc.SkipReason == SkipReasonNone {
Expand Down Expand Up @@ -888,18 +892,21 @@ func rank(d *Document, origIdx int) []float64 {
skipped = 1.0
}

// Use pre-computed Category from DetermineFileCategory instead of
// calling enry.IsGenerated/IsVendor/IsTest again. The category is
// computed before sorting in buildShard, avoiding redundant regex work.
generated := 0.0
if enry.IsGenerated(d.Name, d.Content) {
if d.Category == FileCategoryGenerated {
generated = 1.0
}

vendor := 0.0
if enry.IsVendor(d.Name) {
if d.Category == FileCategoryVendored {
vendor = 1.0
}

test := 0.0
if enry.IsTest(d.Name) {
if d.Category == FileCategoryTest {
test = 1.0
}

Expand Down Expand Up @@ -935,6 +942,14 @@ func rank(d *Document, origIdx int) []float64 {
}

func sortDocuments(todo []*Document) {
// Pre-compute file categories so rank() can use cached values
// instead of calling enry functions redundantly.
for _, t := range todo {
if t.Category == FileCategoryMissing {
DetermineFileCategory(t)
}
}

rs := make([]rankedDoc, 0, len(todo))
for i, t := range todo {
rd := rankedDoc{t, rank(t, i)}
Expand Down
25 changes: 20 additions & 5 deletions index/shard_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ func newPostingsBuilder(shardMaxBytes int) *postingsBuilder {
}
}


// reset clears the builder for reuse. All postingList allocations
// (backing arrays, map entries, ASCII array slots) are retained so the
// next shard build avoids re-allocating them.
Expand Down Expand Up @@ -235,8 +236,15 @@ func (s *postingsBuilder) newSearchableString(data []byte, byteSections []Docume
s.postings[ng] = pl
}
}
m := binary.PutUvarint(buf[:], uint64(newOff-pl.lastOff))
pl.data = append(pl.data, buf[:m]...)
delta := uint64(newOff - pl.lastOff)
if delta < 0x80 {
// Single-byte varint fast path: ~80% of deltas are < 128.
// append(slice, byte) is cheaper than append(slice, slice...).
pl.data = append(pl.data, byte(delta))
} else {
m := binary.PutUvarint(buf[:], delta)
pl.data = append(pl.data, buf[:m]...)
}
pl.lastOff = newOff
}
s.runeCount += runeIndex
Expand Down Expand Up @@ -536,8 +544,13 @@ func DetermineLanguageIfUnknown(doc *Document) {

// Add a file which only occurs in certain branches.
func (b *ShardBuilder) Add(doc Document) error {
if index := bytes.IndexByte(doc.Content, 0); index > 0 {
doc.SkipReason = SkipReasonBinary
// Skip binary check and category detection if already computed
// (e.g., by Builder.Add which calls DocChecker.Check and
// DetermineFileCategory before docs reach buildShard).
if doc.Category == FileCategoryMissing {
if index := bytes.IndexByte(doc.Content, 0); index > 0 {
doc.SkipReason = SkipReasonBinary
}
}

if doc.SkipReason != SkipReasonNone {
Expand All @@ -547,7 +560,9 @@ func (b *ShardBuilder) Add(doc Document) error {
}

DetermineLanguageIfUnknown(&doc)
DetermineFileCategory(&doc)
if doc.Category == FileCategoryMissing {
DetermineFileCategory(&doc)
}

sort.Sort(symbolSlice{doc.Symbols, doc.SymbolsMetaData})
var last DocumentSection
Expand Down
Loading