From 4ef828e33f9c36b19db3d5885af1e2af0421328a Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 16 Jun 2025 15:28:24 +0530 Subject: [PATCH 01/25] passing zap config via new plugin APIs --- index/scorch/merge.go | 10 +++++----- index/scorch/persister.go | 4 ++-- index/scorch/scorch.go | 4 +++- index/scorch/segment_plugin.go | 8 ++++++++ 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index e17288410..e2267bf81 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -372,8 +372,8 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) prevBytesReadTotal := cumulateBytesRead(segmentsToMerge) - newDocNums, _, err := s.segPlugin.Merge(segmentsToMerge, docsToDrop, path, - cw.cancelCh, s) + newDocNums, _, err := s.segPlugin.MergeEx(segmentsToMerge, docsToDrop, path, + cw.cancelCh, s, s.segmentConfig) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) @@ -391,7 +391,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, return fmt.Errorf("merging failed: %v", err) } - seg, err = s.segPlugin.Open(path) + seg, err = s.segPlugin.OpenEx(path, s.segmentConfig) if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) @@ -540,7 +540,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, // the newly merged segment is already flushed out to disk, just needs // to be opened using mmap. newDocIDs, _, err := - s.segPlugin.Merge(segsBatch, dropsBatch, path, s.closeCh, s) + s.segPlugin.MergeEx(segsBatch, dropsBatch, path, s.closeCh, s, s.segmentConfig) if err != nil { em.Lock() errs = append(errs, err) @@ -555,7 +555,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, s.markIneligibleForRemoval(filename) newMergedSegmentIDs[id] = newSegmentID newDocIDsSet[id] = newDocIDs - newMergedSegments[id], err = s.segPlugin.Open(path) + newMergedSegments[id], err = s.segPlugin.OpenEx(path, s.segmentConfig) if err != nil { em.Lock() errs = append(errs, err) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index d0c013a1d..2bc88f4f7 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -804,7 +804,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot, exclude map[uint } }() for segmentID, path := range newSegmentPaths { - newSegments[segmentID], err = s.segPlugin.Open(path) + newSegments[segmentID], err = s.segPlugin.OpenEx(path, s.segmentConfig) if err != nil { return fmt.Errorf("error opening new segment at %s, %v", path, err) } @@ -1016,7 +1016,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return nil, fmt.Errorf("segment path missing") } segmentPath := s.path + string(os.PathSeparator) + string(pathBytes) - seg, err := s.segPlugin.Open(segmentPath) + segment, err := s.segPlugin.OpenEx(segmentPath, s.segmentConfig) if err != nil { return nil, fmt.Errorf("error opening bolt segment: %v", err) } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 329de598e..72435a91c 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -45,6 +45,7 @@ type Scorch struct { readOnly bool version uint8 config map[string]interface{} + segmentConfig map[string]interface{} analysisQueue *index.AnalysisQueue path string @@ -154,6 +155,7 @@ func NewScorch(storeName string, forceMergeRequestCh: make(chan *mergerCtrl, 1), segPlugin: 
defaultSegmentPlugin,
 		copyScheduled:       map[string]int{},
+		segmentConfig:       make(map[string]interface{}),
 	}
 
 	forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config)
@@ -497,7 +499,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
 	stats := newFieldStats()
 
 	if len(analysisResults) > 0 {
-		newSegment, bufBytes, err = s.segPlugin.New(analysisResults)
+		newSegment, bufBytes, err = s.segPlugin.NewEx(analysisResults, s.segmentConfig)
 		if err != nil {
 			return err
 		}
diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go
index c44f9cf7b..4ff249a0b 100644
--- a/index/scorch/segment_plugin.go
+++ b/index/scorch/segment_plugin.go
@@ -46,10 +46,14 @@ type SegmentPlugin interface {
 	// New takes a set of Documents and turns them into a new Segment
 	New(results []index.Document) (segment.Segment, uint64, error)
 
+	NewEx(results []index.Document, config map[string]interface{}) (segment.Segment, uint64, error)
+
 	// Open attempts to open the file at the specified path and
 	// return the corresponding Segment
 	Open(path string) (segment.Segment, error)
 
+	OpenEx(path string, config map[string]interface{}) (segment.Segment, error)
+
 	// Merge takes a set of Segments, and creates a new segment on disk at
 	// the specified path.
 	// Drops is a set of bitmaps (one for each segment) indicating which
@@ -67,6 +71,10 @@ type SegmentPlugin interface {
 	Merge(segments []segment.Segment, drops []*roaring.Bitmap, path string,
 		closeCh chan struct{}, s segment.StatsReporter) (
 		[][]uint64, uint64, error)
+
+	MergeEx(segments []segment.Segment, drops []*roaring.Bitmap, path string,
+		closeCh chan struct{}, s segment.StatsReporter, config map[string]interface{}) (
+		[][]uint64, uint64, error)
 }
 
 var supportedSegmentPlugins map[string]map[uint32]SegmentPlugin

From 4a9ce7c5fd12bdbd43727cbf5521d31f67f615b1 Mon Sep 17 00:00:00 2001
From: Thejas-bhat
Date: Thu, 15 Jan 2026 14:47:31 -0800
Subject: [PATCH 02/25] merge conflict resolve

---
 index/scorch/persister.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 2bc88f4f7..0571bd1e4 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -1016,7 +1016,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro
 		return nil, fmt.Errorf("segment path missing")
 	}
 	segmentPath := s.path + string(os.PathSeparator) + string(pathBytes)
-	segment, err := s.segPlugin.OpenEx(segmentPath, s.segmentConfig)
+	seg, err := s.segPlugin.OpenEx(segmentPath, s.segmentConfig)
 	if err != nil {
 		return nil, fmt.Errorf("error opening bolt segment: %v", err)
 	}

From dd5ccbb6a46a22597b5256e2f297e8eff9c4e394 Mon Sep 17 00:00:00 2001
From: Thejas-bhat
Date: Mon, 2 Feb 2026 11:39:41 -0800
Subject: [PATCH 03/25] *Ex -> *Using naming

---
 index/scorch/merge.go          | 8 ++++----
 index/scorch/persister.go      | 4 ++--
 index/scorch/scorch.go         | 2 +-
 index/scorch/segment_plugin.go | 6 +++---
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/index/scorch/merge.go b/index/scorch/merge.go
index e2267bf81..bca9bbb81 100644
--- a/index/scorch/merge.go
+++ b/index/scorch/merge.go
@@ -372,7 +372,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context,
 	atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1)
 	prevBytesReadTotal := cumulateBytesRead(segmentsToMerge)
 
-	newDocNums, _, err := s.segPlugin.MergeEx(segmentsToMerge, docsToDrop, path,
+	newDocNums, _, err := s.segPlugin.MergeUsing(segmentsToMerge, docsToDrop, path,
 		cw.cancelCh, s, s.segmentConfig)
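		// Illustration (hedged editor's sketch, not part of this patch): the
		// *Using variants keep the original behavior and add a trailing config
		// parameter, so scorch can hand per-index settings to the plugin at
		// every entry point. The shapes introduced in segment_plugin.go are:
		//
		//	NewUsing(results []index.Document, config map[string]interface{}) (segment.Segment, uint64, error)
		//	OpenUsing(path string, config map[string]interface{}) (segment.Segment, error)
		//	MergeUsing(segments []segment.Segment, drops []*roaring.Bitmap, path string,
		//		closeCh chan struct{}, s segment.StatsReporter, config map[string]interface{}) ([][]uint64, uint64, error)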
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) @@ -391,7 +391,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, return fmt.Errorf("merging failed: %v", err) } - seg, err = s.segPlugin.OpenEx(path, s.segmentConfig) + seg, err = s.segPlugin.OpenUsing(path, s.segmentConfig) if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) @@ -540,7 +540,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, // the newly merged segment is already flushed out to disk, just needs // to be opened using mmap. newDocIDs, _, err := - s.segPlugin.MergeEx(segsBatch, dropsBatch, path, s.closeCh, s, s.segmentConfig) + s.segPlugin.MergeUsing(segsBatch, dropsBatch, path, s.closeCh, s, s.segmentConfig) if err != nil { em.Lock() errs = append(errs, err) @@ -555,7 +555,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, s.markIneligibleForRemoval(filename) newMergedSegmentIDs[id] = newSegmentID newDocIDsSet[id] = newDocIDs - newMergedSegments[id], err = s.segPlugin.OpenEx(path, s.segmentConfig) + newMergedSegments[id], err = s.segPlugin.OpenUsing(path, s.segmentConfig) if err != nil { em.Lock() errs = append(errs, err) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 0571bd1e4..977097097 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -804,7 +804,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot, exclude map[uint } }() for segmentID, path := range newSegmentPaths { - newSegments[segmentID], err = s.segPlugin.OpenEx(path, s.segmentConfig) + newSegments[segmentID], err = s.segPlugin.OpenUsing(path, s.segmentConfig) if err != nil { return fmt.Errorf("error opening new segment at %s, %v", path, err) } @@ -1016,7 +1016,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return nil, fmt.Errorf("segment path missing") } segmentPath := s.path + string(os.PathSeparator) + string(pathBytes) - seg, err := s.segPlugin.OpenEx(segmentPath, s.segmentConfig) + seg, err := s.segPlugin.OpenUsing(segmentPath, s.segmentConfig) if err != nil { return nil, fmt.Errorf("error opening bolt segment: %v", err) } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 72435a91c..efe052935 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -499,7 +499,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { stats := newFieldStats() if len(analysisResults) > 0 { - newSegment, bufBytes, err = s.segPlugin.NewEx(analysisResults, s.segmentConfig) + newSegment, bufBytes, err = s.segPlugin.NewUsing(analysisResults, s.segmentConfig) if err != nil { return err } diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index 4ff249a0b..16be8e440 100644 --- a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -46,13 +46,13 @@ type SegmentPlugin interface { // New takes a set of Documents and turns them into a new Segment New(results []index.Document) (segment.Segment, uint64, error) - NewEx(results []index.Document, config map[string]interface{}) (segment.Segment, uint64, error) + NewUsing(results []index.Document, config map[string]interface{}) (segment.Segment, uint64, error) // Open attempts to open the file at the specified path and // return the corresponding Segment Open(path string) (segment.Segment, error) - OpenEx(path string, config map[string]interface{}) (segment.Segment, error) + OpenUsing(path string, config map[string]interface{}) (segment.Segment, 
error) // Merge takes a set of Segments, and creates a new segment on disk at // the specified path. @@ -72,7 +72,7 @@ type SegmentPlugin interface { closeCh chan struct{}, s segment.StatsReporter) ( [][]uint64, uint64, error) - MergeEx(segments []segment.Segment, drops []*roaring.Bitmap, path string, + MergeUsing(segments []segment.Segment, drops []*roaring.Bitmap, path string, closeCh chan struct{}, s segment.StatsReporter, config map[string]interface{}) ( [][]uint64, uint64, error) } From b0876250608ee44cd1934dab4831accd296e4661 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Wed, 4 Feb 2026 13:46:12 -0800 Subject: [PATCH 04/25] go mod changes --- go.mod | 14 +++++++------- go.sum | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index 5448bac80..4736f4f88 100644 --- a/go.mod +++ b/go.mod @@ -19,13 +19,13 @@ require ( github.com/blevesearch/stempel v0.2.0 github.com/blevesearch/upsidedown_store_api v1.0.2 github.com/blevesearch/vellum v1.2.0 - github.com/blevesearch/zapx/v11 v11.4.2 - github.com/blevesearch/zapx/v12 v12.4.2 - github.com/blevesearch/zapx/v13 v13.4.2 - github.com/blevesearch/zapx/v14 v14.4.2 - github.com/blevesearch/zapx/v15 v15.4.2 - github.com/blevesearch/zapx/v16 v16.3.0 - github.com/blevesearch/zapx/v17 v17.0.1 + github.com/blevesearch/zapx/v11 v11.4.3 + github.com/blevesearch/zapx/v12 v12.4.3 + github.com/blevesearch/zapx/v13 v13.4.3 + github.com/blevesearch/zapx/v14 v14.4.3 + github.com/blevesearch/zapx/v15 v15.4.3 + github.com/blevesearch/zapx/v16 v16.3.1 + github.com/blevesearch/zapx/v17 v17.0.2-0.20260204210735-148661f2ddf6 github.com/couchbase/moss v0.2.0 github.com/spf13/cobra v1.10.2 go.etcd.io/bbolt v1.4.0 diff --git a/go.sum b/go.sum index 8207f8975..b8eba81f8 100644 --- a/go.sum +++ b/go.sum @@ -33,20 +33,20 @@ github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMG github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ= github.com/blevesearch/vellum v1.2.0 h1:xkDiOEsHc2t3Cp0NsNZZ36pvc130sCzcGKOPMzXe+e0= github.com/blevesearch/vellum v1.2.0/go.mod h1:uEcfBJz7mAOf0Kvq6qoEKQQkLODBF46SINYNkZNae4k= -github.com/blevesearch/zapx/v11 v11.4.2 h1:l46SV+b0gFN+Rw3wUI1YdMWdSAVhskYuvxlcgpQFljs= -github.com/blevesearch/zapx/v11 v11.4.2/go.mod h1:4gdeyy9oGa/lLa6D34R9daXNUvfMPZqUYjPwiLmekwc= -github.com/blevesearch/zapx/v12 v12.4.2 h1:fzRbhllQmEMUuAQ7zBuMvKRlcPA5ESTgWlDEoB9uQNE= -github.com/blevesearch/zapx/v12 v12.4.2/go.mod h1:TdFmr7afSz1hFh/SIBCCZvcLfzYvievIH6aEISCte58= -github.com/blevesearch/zapx/v13 v13.4.2 h1:46PIZCO/ZuKZYgxI8Y7lOJqX3Irkc3N8W82QTK3MVks= -github.com/blevesearch/zapx/v13 v13.4.2/go.mod h1:knK8z2NdQHlb5ot/uj8wuvOq5PhDGjNYQQy0QDnopZk= -github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT7fWYz0= -github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= -github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= -github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= -github.com/blevesearch/zapx/v16 v16.3.0 h1:hF6VlN15E9CB40RMPyqOIhlDw1OOo9RItumhKMQktxw= -github.com/blevesearch/zapx/v16 v16.3.0/go.mod h1:zCFjv7McXWm1C8rROL+3mUoD5WYe2RKsZP3ufqcYpLY= -github.com/blevesearch/zapx/v17 v17.0.1 h1:kdojyNDiC4abVvsSwequvqYTBuLEXoG3c0UKyxe1+GM= -github.com/blevesearch/zapx/v17 v17.0.1/go.mod h1:gvr+JMDB9XvQUkT+CaYJhY7aMlez5EmXbkzOBCVyc7U= +github.com/blevesearch/zapx/v11 v11.4.3 
h1:PTZOO5loKpHC/x/GzmPZNa9cw7GZIQxd5qRjwij9tHY= +github.com/blevesearch/zapx/v11 v11.4.3/go.mod h1:4gdeyy9oGa/lLa6D34R9daXNUvfMPZqUYjPwiLmekwc= +github.com/blevesearch/zapx/v12 v12.4.3 h1:eElXvAaAX4m04t//CGBQAtHNPA+Q6A1hHZVrN3LSFYo= +github.com/blevesearch/zapx/v12 v12.4.3/go.mod h1:TdFmr7afSz1hFh/SIBCCZvcLfzYvievIH6aEISCte58= +github.com/blevesearch/zapx/v13 v13.4.3 h1:qsdhRhaSpVnqDFlRiH9vG5+KJ+dE7KAW9WyZz/KXAiE= +github.com/blevesearch/zapx/v13 v13.4.3/go.mod h1:knK8z2NdQHlb5ot/uj8wuvOq5PhDGjNYQQy0QDnopZk= +github.com/blevesearch/zapx/v14 v14.4.3 h1:GY4Hecx0C6UTmiNC2pKdeA2rOKiLR5/rwpU9WR51dgM= +github.com/blevesearch/zapx/v14 v14.4.3/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= +github.com/blevesearch/zapx/v15 v15.4.3 h1:iJiMJOHrz216jyO6lS0m9RTCEkprUnzvqAI2lc/0/CU= +github.com/blevesearch/zapx/v15 v15.4.3/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= +github.com/blevesearch/zapx/v16 v16.3.1 h1:ERxZUSC9UcuKggCQ6b3y4sTkyL4WnGOWuopzglR874g= +github.com/blevesearch/zapx/v16 v16.3.1/go.mod h1:zCFjv7McXWm1C8rROL+3mUoD5WYe2RKsZP3ufqcYpLY= +github.com/blevesearch/zapx/v17 v17.0.2-0.20260204210735-148661f2ddf6 h1:eqJh5al0dcPq6VsY6C+G4kva5BBffzMG+sN/SWg2/Eg= +github.com/blevesearch/zapx/v17 v17.0.2-0.20260204210735-148661f2ddf6/go.mod h1:gvr+JMDB9XvQUkT+CaYJhY7aMlez5EmXbkzOBCVyc7U= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps= github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k= github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= From 1287ed8b33d379608e83943ce7a03a7f141aef22 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 16 Jun 2025 15:23:42 +0530 Subject: [PATCH 05/25] fastmerge wip --- index/scorch/snapshot_index.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 3422d9a14..7e8ca0de0 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -66,13 +66,16 @@ func init() { } type IndexSnapshot struct { - parent *Scorch - segment []*SegmentSnapshot - offsets []uint64 - internal map[string][]byte - epoch uint64 - size uint64 - creator string + parent *Scorch + + // POC: trainData is ephemeral + trainData [][]float32 + segment []*SegmentSnapshot + offsets []uint64 + internal map[string][]byte + epoch uint64 + size uint64 + creator string m sync.Mutex // Protects the fields that follow. refs int64 From 0d74ce2e24269935a5b62183b47d3ba5bd716c62 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Tue, 17 Jun 2025 12:12:16 +0530 Subject: [PATCH 06/25] use callbacks to collect and use train data while merging --- index/scorch/introducer.go | 4 ++++ index/scorch/merge.go | 19 +++++++++++++++++++ index/scorch/snapshot_index.go | 2 +- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index ef26532b0..7965cc5c3 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -360,6 +360,10 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { creator: "introduceMerge", } + if len(nextMerge.trainData) > 0 { + newSnapshot.trainData = append(root.trainData, nextMerge.trainData...) 
+ } + var running, docsToPersistCount, memSegments, fileSegments uint64 var droppedSegmentFiles []string newSegmentDeleted := make([]*roaring.Bitmap, len(nextMerge.new)) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index bca9bbb81..31cac6e61 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -17,6 +17,7 @@ package scorch import ( "context" "fmt" + "math" "os" "strings" "sync" @@ -481,6 +482,7 @@ type mergedSegmentHistory struct { type segmentMerge struct { id []uint64 new []segment.Segment + trainData [][]float32 mergedSegHistory map[uint64]*mergedSegmentHistory notifyCh chan *mergeTaskIntroStatus mmaped uint32 @@ -527,6 +529,22 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, var em sync.Mutex var errs []error + var trainingSample [][]float32 + collectTrainData := func(segTrainData [][]float32) { + trainingSample = append(trainingSample, segTrainData...) + } + + numDocs, err := snapshot.DocCount() + if err != nil { + return nil, nil, err + } + trainingSampleSize := math.Ceil(4 * math.Sqrt(float64(numDocs)) * 39) + + // collect train data only if needed + if len(snapshot.trainData) < int(trainingSampleSize) { + s.segmentConfig["collectTrainDataCallback"] = collectTrainData + } + s.segmentConfig["trainData"] = snapshot.trainData // deploy the workers to merge and flush the batches of segments concurrently // and create a new file segment for i := 0; i < numFlushes; i++ { @@ -601,6 +619,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, mergedSegHistory: make(map[uint64]*mergedSegmentHistory, numSegments), notifyCh: make(chan *mergeTaskIntroStatus), newCount: newMergedCount, + trainData: trainingSample, } // create a history map which maps the old in-memory segments with the specific diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 7e8ca0de0..056b013d6 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -68,7 +68,7 @@ func init() { type IndexSnapshot struct { parent *Scorch - // POC: trainData is ephemeral + // POC: trainData is ephemeral and read-only just like []*SegmentSnapshot trainData [][]float32 segment []*SegmentSnapshot offsets []uint64 From 2aa894921d53358b1b090856a88638046bded445 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Wed, 18 Jun 2025 16:49:32 +0530 Subject: [PATCH 07/25] serialized float array --- index/scorch/merge.go | 10 ++++++---- index/scorch/snapshot_index.go | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 31cac6e61..879003b32 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -19,6 +19,7 @@ import ( "fmt" "math" "os" + "slices" "strings" "sync" "sync/atomic" @@ -482,7 +483,7 @@ type mergedSegmentHistory struct { type segmentMerge struct { id []uint64 new []segment.Segment - trainData [][]float32 + trainData []float32 mergedSegHistory map[uint64]*mergedSegmentHistory notifyCh chan *mergeTaskIntroStatus mmaped uint32 @@ -529,9 +530,10 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, var em sync.Mutex var errs []error - var trainingSample [][]float32 - collectTrainData := func(segTrainData [][]float32) { - trainingSample = append(trainingSample, segTrainData...) + var trainingSample []float32 + collectTrainData := func(segTrainData []float32) { + // append a clone of the training sample + trainingSample = append(trainingSample, slices.Clone(segTrainData)...) 
} numDocs, err := snapshot.DocCount() diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 056b013d6..152647089 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -69,7 +69,7 @@ type IndexSnapshot struct { parent *Scorch // POC: trainData is ephemeral and read-only just like []*SegmentSnapshot - trainData [][]float32 + trainData []float32 segment []*SegmentSnapshot offsets []uint64 internal map[string][]byte From 0f88485cbfd8f405e7d9249b8627e10654dfbc8f Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Tue, 8 Jul 2025 11:46:06 +0530 Subject: [PATCH 08/25] collect training sample on the file path as well --- index/scorch/merge.go | 21 ++++++++++++++++----- index/scorch/persister.go | 4 ++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 879003b32..80e4ef03f 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -365,6 +365,11 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, var seg segment.Segment var filename string + var trainingSample []float32 + collectTrainData := func(segTrainData []float32) { + // append a clone of the training sample + trainingSample = append(trainingSample, slices.Clone(segTrainData)...) + } if len(segmentsToMerge) > 0 { filename = zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) @@ -419,6 +424,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, newCount: seg.Count(), notifyCh: make(chan *mergeTaskIntroStatus), mmaped: 1, + trainData: trainingSample, } s.fireEvent(EventKindMergeTaskIntroductionStart, 0) @@ -536,17 +542,22 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, trainingSample = append(trainingSample, slices.Clone(segTrainData)...) 
}
 	if len(segmentsToMerge) > 0 {
 		filename = zapFileName(newSegmentID)
 		s.markIneligibleForRemoval(filename)
@@ -419,6 +424,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context,
 			newCount:  seg.Count(),
 			notifyCh:  make(chan *mergeTaskIntroStatus),
 			mmaped:    1,
+			trainData: trainingSample,
 		}
 
 		s.fireEvent(EventKindMergeTaskIntroductionStart, 0)
@@ -536,17 +542,22 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot,
 		trainingSample = append(trainingSample, slices.Clone(segTrainData)...)
 	}
 
-	numDocs, err := snapshot.DocCount()
-	if err != nil {
-		return nil, nil, err
-	}
+	// numDocs, err := snapshot.DocCount()
+	// if err != nil {
+	// 	return nil, nil, err
+	// }
+
+	// hardcoding the total docs for now, need to get it from CB level
+	numDocs := 1000000
 	trainingSampleSize := math.Ceil(4 * math.Sqrt(float64(numDocs)) * 39)
 
 	// collect train data only if needed
 	if len(snapshot.trainData) < int(trainingSampleSize) {
 		s.segmentConfig["collectTrainDataCallback"] = collectTrainData
+	} else {
+		s.segmentConfig["trainData"] = snapshot.trainData
 	}
-	s.segmentConfig["trainData"] = snapshot.trainData
+
 	// deploy the workers to merge and flush the batches of segments concurrently
 	// and create a new file segment
 	for i := 0; i < numFlushes; i++ {
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 977097097..eb0d9b187 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -1005,6 +1005,10 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
 				rv.MergeUpdateFieldsInfo(segmentSnapshot.updatedFields)
 			}
 			running += segmentSnapshot.segment.Count()
+			// persistedSegment, ok := segmentSnapshot.segment.(segment.PersistedSegment)
+			// if ok {
+			// 	fmt.Println("segment path", persistedSegment.Path())
+			// }
 		}
 	}
 	return rv, nil

From fce3d3b4dcd9308ca830dd846c9723223f3575c2 Mon Sep 17 00:00:00 2001
From: Thejas-bhat
Date: Thu, 21 Aug 2025 19:52:06 +0530
Subject: [PATCH 09/25] cleanup debug logs

---
 index/scorch/introducer.go     | 10 +++++++++-
 index/scorch/merge.go          |  6 +++---
 index/scorch/persister.go      |  4 ----
 index/scorch/snapshot_index.go | 14 ++++++++------
 4 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go
index 7965cc5c3..6afda3347 100644
--- a/index/scorch/introducer.go
+++ b/index/scorch/introducer.go
@@ -129,6 +129,10 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
 		creator: "introduceSegment",
 	}
 
+	if len(root.trainData) > 0 {
+		newSnapshot.trainData = root.trainData
+	}
+
 	// iterate through current segments
 	var running uint64
 	var docsToPersistCount, memSegments, fileSegments uint64
@@ -284,6 +288,10 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) {
 		creator: "introducePersist",
 	}
 
+	if len(root.trainData) > 0 {
+		newIndexSnapshot.trainData = root.trainData
+	}
+
 	var docsToPersistCount, memSegments, fileSegments uint64
 	for i, segmentSnapshot := range root.segment {
 		// see if this segment has been replaced
@@ -361,7 +369,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
 	}
 
 	if len(nextMerge.trainData) > 0 {
-		newSnapshot.trainData = append(root.trainData, nextMerge.trainData...)
+		newSnapshot.trainData = nextMerge.trainData
 	}
 
 	var running, docsToPersistCount, memSegments, fileSegments uint64
diff --git a/index/scorch/merge.go b/index/scorch/merge.go
index 80e4ef03f..4ae23297f 100644
--- a/index/scorch/merge.go
+++ b/index/scorch/merge.go
@@ -539,7 +539,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot,
 	var trainingSample []float32
 	collectTrainData := func(segTrainData []float32) {
 		// append a clone of the training sample
-		trainingSample = append(trainingSample, slices.Clone(segTrainData)...)
+		trainingSample = append(trainingSample, segTrainData...)
}
 
 	// numDocs, err := snapshot.DocCount()
 	// if err != nil {
 	// 	return nil, nil, err
 	// }
 
 	// hardcoding the total docs for now, need to get it from CB level
 	numDocs := 1000000
-	trainingSampleSize := math.Ceil(4 * math.Sqrt(float64(numDocs)) * 39)
+	trainingSampleSize := math.Ceil(4 * math.Sqrt(float64(numDocs)) * 50)
 
 	// collect train data only if needed
-	if len(snapshot.trainData) < int(trainingSampleSize) {
+	if len(snapshot.trainData)/768 < int(trainingSampleSize) {
 		s.segmentConfig["collectTrainDataCallback"] = collectTrainData
 	} else {
 		s.segmentConfig["trainData"] = snapshot.trainData
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index eb0d9b187..977097097 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -1005,10 +1005,6 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
 				rv.MergeUpdateFieldsInfo(segmentSnapshot.updatedFields)
 			}
 			running += segmentSnapshot.segment.Count()
-			// persistedSegment, ok := segmentSnapshot.segment.(segment.PersistedSegment)
-			// if ok {
-			// 	fmt.Println("segment path", persistedSegment.Path())
-			// }
 		}
 	}
 	return rv, nil
diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go
index 152647089..dbd666cd0 100644
--- a/index/scorch/snapshot_index.go
+++ b/index/scorch/snapshot_index.go
@@ -70,12 +70,14 @@ type IndexSnapshot struct {
 
 	// POC: trainData is ephemeral and read-only just like []*SegmentSnapshot
 	trainData []float32
-	segment   []*SegmentSnapshot
-	offsets   []uint64
-	internal  map[string][]byte
-	epoch     uint64
-	size      uint64
-	creator   string
+	// trainSegments []*SegmentSnapshot // either store []float32 or []faissIndexes aka centroid indexes
+
+	segment  []*SegmentSnapshot
+	offsets  []uint64
+	internal map[string][]byte
+	epoch    uint64
+	size     uint64
+	creator  string
 
 	m sync.Mutex // Protects the fields that follow.
refs int64 From 7cfa17bc209636c0b56d88a3cee6bcb653f7721a Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Tue, 28 Oct 2025 16:30:10 -0700 Subject: [PATCH 10/25] vector sources API --- mapping/mapping.go | 1 + mapping/mapping_no_vectors.go | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/mapping/mapping.go b/mapping/mapping.go index 7ff2f9927..e3a14b738 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -58,6 +58,7 @@ type IndexMapping interface { AnalyzerNamed(name string) analysis.Analyzer FieldMappingForPath(path string) FieldMapping + VectorSources() []string } // A SynonymMapping extends the IndexMapping interface to provide diff --git a/mapping/mapping_no_vectors.go b/mapping/mapping_no_vectors.go index cbe9d81bc..3e7e0403c 100644 --- a/mapping/mapping_no_vectors.go +++ b/mapping/mapping_no_vectors.go @@ -42,3 +42,10 @@ func validateFieldMapping(field *FieldMapping, path []string, fieldAliasCtx map[string]*FieldMapping) error { return validateFieldType(field) } + +// ----------------------------------------------------------------------------- +// vector source functions + +func (im *IndexMappingImpl) VectorSources() []string { + return []string{"vector indexing is not implemented"} +} From 853d687a9f44e5935a6d77f72dd76a56d7920513 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Wed, 26 Nov 2025 11:14:04 -0800 Subject: [PATCH 11/25] batch training support --- go.mod | 4 ++ index.go | 4 ++ index/scorch/introducer.go | 12 ------ index/scorch/merge.go | 32 -------------- index/scorch/persister.go | 5 +++ index/scorch/scorch.go | 76 ++++++++++++++++++++++++++++++++++ index/scorch/snapshot_index.go | 4 -- index_alias_impl.go | 19 +++++++++ index_impl.go | 17 ++++++++ mapping/mapping.go | 1 - mapping/mapping_no_vectors.go | 7 ---- 11 files changed, 125 insertions(+), 56 deletions(-) diff --git a/go.mod b/go.mod index 4736f4f88..cb73edb28 100644 --- a/go.mod +++ b/go.mod @@ -43,3 +43,7 @@ require ( github.com/spf13/pflag v1.0.9 // indirect golang.org/x/sys v0.40.0 // indirect ) + +replace github.com/blevesearch/scorch_segment_api/v2 => /Users/thejas.orkombu/fts/blevesearch/scorch_segment_api + +replace github.com/blevesearch/bleve_index_api => /Users/thejas.orkombu/fts/blevesearch/bleve_index_api \ No newline at end of file diff --git a/index.go b/index.go index 2f1ba5fbf..c083787c4 100644 --- a/index.go +++ b/index.go @@ -396,3 +396,7 @@ type InsightsIndex interface { // CentroidCardinalities returns the centroids (clusters) from IVF indexes ordered by data density. 
CentroidCardinalities(field string, limit int, desceding bool) ([]index.CentroidCardinality, error) } +type VectorIndex interface { + Index + Train(*Batch) error +} diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 6afda3347..ef26532b0 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -129,10 +129,6 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { creator: "introduceSegment", } - if len(root.trainData) > 0 { - newSnapshot.trainData = root.trainData - } - // iterate through current segments var running uint64 var docsToPersistCount, memSegments, fileSegments uint64 @@ -288,10 +284,6 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { creator: "introducePersist", } - if len(root.trainData) > 0 { - newIndexSnapshot.trainData = root.trainData - } - var docsToPersistCount, memSegments, fileSegments uint64 for i, segmentSnapshot := range root.segment { // see if this segment has been replaced @@ -368,10 +360,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { creator: "introduceMerge", } - if len(nextMerge.trainData) > 0 { - newSnapshot.trainData = nextMerge.trainData - } - var running, docsToPersistCount, memSegments, fileSegments uint64 var droppedSegmentFiles []string newSegmentDeleted := make([]*roaring.Bitmap, len(nextMerge.new)) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 4ae23297f..bca9bbb81 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -17,9 +17,7 @@ package scorch import ( "context" "fmt" - "math" "os" - "slices" "strings" "sync" "sync/atomic" @@ -365,11 +363,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, var seg segment.Segment var filename string - var trainingSample []float32 - collectTrainData := func(segTrainData []float32) { - // append a clone of the training sample - trainingSample = append(trainingSample, slices.Clone(segTrainData)...) - } if len(segmentsToMerge) > 0 { filename = zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) @@ -424,7 +417,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, newCount: seg.Count(), notifyCh: make(chan *mergeTaskIntroStatus), mmaped: 1, - trainData: trainingSample, } s.fireEvent(EventKindMergeTaskIntroductionStart, 0) @@ -489,7 +481,6 @@ type mergedSegmentHistory struct { type segmentMerge struct { id []uint64 new []segment.Segment - trainData []float32 mergedSegHistory map[uint64]*mergedSegmentHistory notifyCh chan *mergeTaskIntroStatus mmaped uint32 @@ -536,28 +527,6 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, var em sync.Mutex var errs []error - var trainingSample []float32 - collectTrainData := func(segTrainData []float32) { - // append a clone of the training sample - trainingSample = append(trainingSample, segTrainData...) 
-	}
-
-	// numDocs, err := snapshot.DocCount()
-	// if err != nil {
-	// 	return nil, nil, err
-	// }
-
-	// hardcoding the total docs for now, need to get it from CB level
-	numDocs := 1000000
-	trainingSampleSize := math.Ceil(4 * math.Sqrt(float64(numDocs)) * 50)
-
-	// collect train data only if needed
-	if len(snapshot.trainData)/768 < int(trainingSampleSize) {
-		s.segmentConfig["collectTrainDataCallback"] = collectTrainData
-	} else {
-		s.segmentConfig["trainData"] = snapshot.trainData
-	}
-
 	// deploy the workers to merge and flush the batches of segments concurrently
 	// and create a new file segment
 	for i := 0; i < numFlushes; i++ {
@@ -632,7 +601,6 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot,
 		mergedSegHistory: make(map[uint64]*mergedSegmentHistory, numSegments),
 		notifyCh:         make(chan *mergeTaskIntroStatus),
 		newCount:         newMergedCount,
-		trainData:        trainingSample,
 	}
 
 	// create a history map which maps the old in-memory segments with the specific
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 977097097..919daec70 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -575,6 +575,11 @@ func copyToDirectory(srcPath string, d index.Directory) (int64, error) {
 		return 0, fmt.Errorf("GetWriter err: %v", err)
 	}
 
+	// skip
+	if dest == nil {
+		return 0, nil
+	}
+
 	sourceFileStat, err := os.Stat(srcPath)
 	if err != nil {
 		return 0, err
diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go
index efe052935..0da7f2318 100644
--- a/index/scorch/scorch.go
+++ b/index/scorch/scorch.go
@@ -19,6 +19,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -27,6 +28,7 @@ import (
 	"github.com/blevesearch/bleve/v2/registry"
 	"github.com/blevesearch/bleve/v2/util"
 	index "github.com/blevesearch/bleve_index_api"
+	"github.com/blevesearch/go-faiss"
 	segment "github.com/blevesearch/scorch_segment_api/v2"
 	bolt "go.etcd.io/bbolt"
 )
@@ -78,6 +80,8 @@ type Scorch struct {
 	persisterNotifier chan *epochWatcher
 	rootBolt          *bolt.DB
 	asyncTasks        sync.WaitGroup
+	// not a real searchable segment, singleton
+	centroidIndex *SegmentSnapshot
 
 	onEvent      func(event Event) bool
 	onAsyncError func(err error, path string)
@@ -170,6 +174,12 @@ func NewScorch(storeName string,
 		}
 	}
 
+	// "pretraining": true
+	segConfig, ok := config["segmentConfig"].(map[string]interface{})
+	if ok {
+		rv.segmentConfig = segConfig
+	}
+
 	typ, ok := config["spatialPlugin"].(string)
 	if ok {
 		if err := rv.loadSpatialAnalyzerPlugin(typ); err != nil {
@@ -534,6 +544,72 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
 	return err
 }
 
+func (s *Scorch) Train(batch *index.Batch) error {
+	s.rootLock.Lock()
+	defer s.rootLock.Unlock()
+	if s.centroidIndex != nil {
+		// singleton API
+		return nil
+	}
+	var trainData []index.Document
+	if s.centroidIndex == nil {
+		for key, doc := range batch.IndexOps {
+			if strings.HasPrefix(key, index.TrainDataPrefix) {
+				trainData = append(trainData, doc)
+			}
+		}
+	}
+
+	// just builds a new vector index out of the train data provided
+	// it'll be an IVF index so the centroids are computed at this stage and
+	// this template will be used in the indexing down the line to index
+	// the data vectors. s.segmentConfig will mark this as a training phase
+	// and zap will handle it accordingly.
+	//
+	// note: this might index text data too, how to handle this? s.segmentConfig?
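+	// (illustrative sketch of the wiring, using keys that appear later in
+	// this series rather than in this patch) the phase can be signalled
+	// through the same config map the plugin already receives:
+	//
+	//	s.segmentConfig["training"] = true   // plugin builds centroids only
+	//	seg, _, err := s.segPlugin.NewEx(trainData, s.segmentConfig)
+	//	s.segmentConfig["training"] = false  // back to regular indexing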
+ // todo: updates/deletes -> data drift detection + seg, _, err := s.segPlugin.NewEx(trainData, s.segmentConfig) + if err != nil { + return err + } + filename := "centroid_index.zap" + path := filepath.Join(s.path, filename) + + switch seg := seg.(type) { + case segment.UnpersistedSegment: + err = persistToDirectory(seg, nil, path) + if err != nil { + return err + } + default: + return fmt.Errorf("segment is not a unpersisted segment") + } + + // persist and open the segment mmap mode. + persistedSegment, err := s.segPlugin.OpenEx(path, s.segmentConfig) + if err != nil { + return err + } + s.centroidIndex = &SegmentSnapshot{ + segment: persistedSegment, + } + s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex + return nil +} + +func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { + // return the coarse quantizer of the centroid index belonging to the field + centroidIndexSegment, ok := s.centroidIndex.segment.(segment.CentroidIndexSegment) + if !ok { + return nil, fmt.Errorf("segment is not a centroid index segment") + } + coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) + if err != nil { + return nil, err + } + return coarseQuantizer, nil +} + func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, internalOps map[string][]byte, persistedCallback index.BatchCallback, stats *fieldStats, ) error { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index dbd666cd0..cf0273534 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -68,10 +68,6 @@ func init() { type IndexSnapshot struct { parent *Scorch - // POC: trainData is ephemeral and read-only just like []*SegmentSnapshot - trainData []float32 - // trainSegments []*SegmentSnapshot // either store []float32 or []faissIndexes aka centroid indexes - segment []*SegmentSnapshot offsets []uint64 internal map[string][]byte diff --git a/index_alias_impl.go b/index_alias_impl.go index 8212c74b9..ee7fbf2a6 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -103,6 +103,25 @@ func (i *indexAliasImpl) IndexSynonym(id string, collection string, definition * return ErrorSynonymSearchNotSupported } +func (i *indexAliasImpl) Train(batch *Batch) error { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + err := i.isAliasToSingleIndex() + if err != nil { + return err + } + + if vi, ok := i.indexes[0].(VectorIndex); ok { + return vi.Train(batch) + } + return fmt.Errorf("not a vector index") +} + func (i *indexAliasImpl) Delete(id string) error { i.mutex.RLock() defer i.mutex.RUnlock() diff --git a/index_impl.go b/index_impl.go index 586dacb3b..bd43a4c3e 100644 --- a/index_impl.go +++ b/index_impl.go @@ -369,6 +369,20 @@ func (i *indexImpl) IndexSynonym(id string, collection string, definition *Synon return err } +func (i *indexImpl) Train(batch *Batch) error { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + if vi, ok := i.i.(VectorIndex); ok { + return vi.Train(batch) + } + return fmt.Errorf("not a vector index") +} + // IndexAdvanced takes a document.Document object // skips the mapping and indexes it. 
func (i *indexImpl) IndexAdvanced(doc *document.Document) (err error) { @@ -1416,6 +1430,7 @@ func (m *searchHitSorter) Less(i, j int) bool { return c < 0 } +// CopyTo (index.Directory, filter) func (i *indexImpl) CopyTo(d index.Directory) (err error) { i.mutex.RLock() defer i.mutex.RUnlock() @@ -1429,6 +1444,8 @@ func (i *indexImpl) CopyTo(d index.Directory) (err error) { return fmt.Errorf("index implementation does not support copy reader") } + // copyIndex.Copy() -> copies the centroid index + copyReader := copyIndex.CopyReader() if copyReader == nil { return fmt.Errorf("index's copyReader is nil") diff --git a/mapping/mapping.go b/mapping/mapping.go index e3a14b738..7ff2f9927 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -58,7 +58,6 @@ type IndexMapping interface { AnalyzerNamed(name string) analysis.Analyzer FieldMappingForPath(path string) FieldMapping - VectorSources() []string } // A SynonymMapping extends the IndexMapping interface to provide diff --git a/mapping/mapping_no_vectors.go b/mapping/mapping_no_vectors.go index 3e7e0403c..cbe9d81bc 100644 --- a/mapping/mapping_no_vectors.go +++ b/mapping/mapping_no_vectors.go @@ -42,10 +42,3 @@ func validateFieldMapping(field *FieldMapping, path []string, fieldAliasCtx map[string]*FieldMapping) error { return validateFieldType(field) } - -// ----------------------------------------------------------------------------- -// vector source functions - -func (im *IndexMappingImpl) VectorSources() []string { - return []string{"vector indexing is not implemented"} -} From 65e171ff304b30069471561d3fe5cd2c9d111dc3 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Wed, 26 Nov 2025 11:15:13 -0800 Subject: [PATCH 12/25] wip: batch training + interfaces to reuse pre-trained file --- index.go | 5 + index/scorch/persister.go | 29 +++++- index/scorch/scorch.go | 165 ++++++++++++++++++++++++++++++++- index/scorch/snapshot_index.go | 4 + index_alias_impl.go | 5 +- index_impl.go | 38 +++++++- util/keys.go | 1 + 7 files changed, 233 insertions(+), 14 deletions(-) diff --git a/index.go b/index.go index c083787c4..21d016610 100644 --- a/index.go +++ b/index.go @@ -353,6 +353,11 @@ type IndexCopyable interface { CopyTo(d index.Directory) error } +type IndexFileCopyable interface { + UpdateFileInBolt(key []byte, value []byte) error + CopyFile(file string, d index.IndexDirectory) error +} + // FileSystemDirectory is the default implementation for the // index.Directory interface. 
type FileSystemDirectory string
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 919daec70..4ded4c23c 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -575,11 +575,6 @@ func copyToDirectory(srcPath string, d index.Directory) (int64, error) {
 		return 0, fmt.Errorf("GetWriter err: %v", err)
 	}
 
-	// skip
-	if dest == nil {
-		return 0, nil
-	}
-
 	sourceFileStat, err := os.Stat(srcPath)
 	if err != nil {
 		return 0, err
@@ -858,10 +853,34 @@ func zapFileName(epoch uint64) string {
 	return fmt.Sprintf("%012x.zap", epoch)
 }
 
+func (s *Scorch) updateCentroidIndex(bucket *bolt.Bucket) error {
+	if bucket == nil {
+		return nil
+	}
+	segmentSnapshot, err := s.loadSegment(bucket)
+	if err != nil {
+		return err
+	}
+	s.rootLock.Lock()
+	defer s.rootLock.Unlock()
+
+	s.centroidIndex = segmentSnapshot
+	return nil
+}
+
 // bolt snapshot code
 
 func (s *Scorch) loadFromBolt() error {
 	err := s.rootBolt.View(func(tx *bolt.Tx) error {
+		centroidIndexBucket := tx.Bucket(util.BoltCentroidIndexKey)
+		if centroidIndexBucket == nil {
+			return nil
+		}
+		err := s.updateCentroidIndex(centroidIndexBucket)
+		if err != nil {
+			return err
+		}
+
 		snapshots := tx.Bucket(util.BoltSnapshotsBucket)
 		if snapshots == nil {
 			return nil
diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go
index 0da7f2318..74aa2e99f 100644
--- a/index/scorch/scorch.go
+++ b/index/scorch/scorch.go
@@ -15,8 +15,10 @@
 package scorch
 
 import (
+	"bytes"
 	"encoding/json"
 	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 	"strings"
@@ -544,7 +546,19 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
 	return err
 }
 
+func (s *Scorch) getInternal(key []byte) ([]byte, error) {
+	s.rootLock.RLock()
+	defer s.rootLock.RUnlock()
+	if string(key) == "_centroid_index_complete" {
+		return []byte(fmt.Sprintf("%t", s.centroidIndex != nil)), nil
+	}
+	return nil, nil
+}
+
+// min 39 per centroid, recommended 50
+// max 256
 func (s *Scorch) Train(batch *index.Batch) error {
+	// is the lock really needed?
 	s.rootLock.Lock()
 	defer s.rootLock.Unlock()
 	if s.centroidIndex != nil {
@@ -588,10 +602,17 @@ func (s *Scorch) Train(batch *index.Batch) error {
 	// note: this might index text data too, how to handle this? s.segmentConfig?
 	// todo: updates/deletes -> data drift detection
-	seg, _, err := s.segPlugin.NewEx(trainData, s.segmentConfig)
+
+	s.segmentConfig["training"] = true
+	seg, n, err := s.segPlugin.NewEx(trainData, s.segmentConfig)
 	if err != nil {
 		return err
 	}
-	filename := "centroid_index.zap"
+	// reset the training flag once completed
+	s.segmentConfig["training"] = false
+	// not suffixing with .zap since the current garbage collection is tailored to .zap ext files
+	// we don't want to gc this file ever.
+	filename := "centroid_index"
 	path := filepath.Join(s.path, filename)
 
 	switch seg := seg.(type) {
 	case segment.UnpersistedSegment:
 		err = persistToDirectory(seg, nil, path)
 		if err != nil {
 			return err
 		}
 	default:
 		return fmt.Errorf("segment is not a unpersisted segment")
 	}
 
 	// persist and open the segment mmap mode.
 	persistedSegment, err := s.segPlugin.OpenEx(path, s.segmentConfig)
 	if err != nil {
 		return err
 	}
 	s.centroidIndex = &SegmentSnapshot{
 		segment: persistedSegment,
 	}
-	s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex
+
+	fmt.Println("number of bytes written to centroid index", n)
+	// s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex
+	// updateBolt(tx, centroid)
+	// filename := "centroid_index"
+	// path := filepath.Join(s.path, filename)
+	// f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0600)
+	// if err != nil {
+	// 	return err
+	// }
+
+	// bufw := bufio.NewWriter(f)
+	// _, err = bufw.Write([]byte(strings.Join([]string{"centroid_index1", path}, " ")))
+	// if err != nil {
+	// 	return err
+	// }
+	// err = bufw.Flush()
+	// if err != nil {
+	// 	return err
+	// }
+	// err = f.Sync()
+	// if err != nil {
+	// 	return err
+	// }
+	// err = f.Close()
+	// if err != nil {
+	// 	return err
+	// }
+
+	tx, err := s.rootBolt.Begin(true)
+	if err != nil {
+		return err
+	}
+	defer tx.Rollback()
+
+	snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket)
+	if err != nil {
+		return err
+	}
+
+	err = snapshotsBucket.Put(util.BoltCentroidIndexKey, []byte(path))
+	if err != nil {
+		return err
+	}
+
+	err = tx.Commit()
+	if err != nil {
+		return err
+	}
+
 	return nil
 }
@@ -1055,6 +1129,91 @@ func (s *Scorch) CopyReader() index.CopyReader {
 	return rv
 }
 
+func (s *Scorch) updateCentroidIndexInBolt(tx *bolt.Tx) error {
+	centroidIndexBucket, err := tx.CreateBucketIfNotExists(util.BoltCentroidIndexKey)
+	if err != nil {
+		return err
+	}
+
+	err = centroidIndexBucket.Put(util.BoltPathKey, []byte("centroid_index.zap"))
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (s *Scorch) UpdateFileInBolt(key []byte, value []byte) error {
+	tx, err := s.rootBolt.Begin(true)
+	if err != nil {
+		return err
+	}
+	defer tx.Rollback()
+
+	snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket)
+	if err != nil {
+		return err
+	}
+
+	// currently this is specific to centroid index file update
+	if bytes.Equal(key, util.BoltCentroidIndexKey) {
+		// guard against duplicate updates
+		existingValue := snapshotsBucket.Get(key)
+		if existingValue != nil {
+			return fmt.Errorf("key already exists")
+		}
+
+		err = snapshotsBucket.Put(key, value)
+		if err != nil {
+			return err
+		}
+	}
+
+	err = tx.Commit()
+	if err != nil {
+		return err
+	}
+
+	err = s.rootBolt.Sync()
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// CopyFile copies a specific file to a destination directory which has an access to a bleve index
+// doing a io.Copy() isn't enough because the file needs to be tracked in bolt file as well
+func (s *Scorch) CopyFile(file string, d index.IndexDirectory) error {
+	s.rootLock.Lock()
+	defer s.rootLock.Unlock()
+
+	// this code is currently specific to centroid index file but is future proofed for other files
+	// to be updated in the dest's bolt
+	if strings.HasSuffix(file, "centroid_index") {
+		// centroid index file - this is outside the snapshots domain so the bolt update is different
+		err := d.UpdateFileInBolt(util.BoltCentroidIndexKey, []byte(file))
+		if err != nil {
+			return err
+		}
+	}
+
+	dest, err := d.GetWriter(filepath.Join("store", file))
+	if err != nil {
+		return err
+	}
+
+	source, err := os.Open(filepath.Join(s.path, file))
+	if err != nil {
+		return err
+	}
+
+	defer source.Close()
+	defer dest.Close()
+	_, err = io.Copy(dest, source)
+	return
err +} + // external API to fire a scorch event (EventKindIndexStart) externally from bleve func (s *Scorch) FireIndexEvent() { s.fireEvent(EventKindIndexStart, 0) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index cf0273534..3585b31d8 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -467,6 +467,10 @@ func (is *IndexSnapshot) Fields() ([]string, error) { } func (is *IndexSnapshot) GetInternal(key []byte) ([]byte, error) { + _, ok := is.internal[string(key)] + if !ok { + return is.parent.getInternal(key) + } return is.internal[string(key)], nil } diff --git a/index_alias_impl.go b/index_alias_impl.go index ee7fbf2a6..8cc1d90ed 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -110,14 +110,13 @@ func (i *indexAliasImpl) Train(batch *Batch) error { if !i.open { return ErrorIndexClosed } - err := i.isAliasToSingleIndex() if err != nil { return err } - if vi, ok := i.indexes[0].(VectorIndex); ok { - return vi.Train(batch) + if vi, ok := i.indexes[0].(index.VectorIndex); ok { + return vi.Train(batch.internal) } return fmt.Errorf("not a vector index") } diff --git a/index_impl.go b/index_impl.go index bd43a4c3e..3391d78c5 100644 --- a/index_impl.go +++ b/index_impl.go @@ -377,8 +377,8 @@ func (i *indexImpl) Train(batch *Batch) error { return ErrorIndexClosed } - if vi, ok := i.i.(VectorIndex); ok { - return vi.Train(batch) + if vi, ok := i.i.(index.VectorIndex); ok { + return vi.Train(batch.internal) } return fmt.Errorf("not a vector index") } @@ -1430,6 +1430,38 @@ func (m *searchHitSorter) Less(i, j int) bool { return c < 0 } +func (i *indexImpl) CopyFile(file string, d index.IndexDirectory) (err error) { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + copyIndex, ok := i.i.(index.IndexFileCopyable) + if !ok { + return fmt.Errorf("index implementation does not support copy reader") + } + + return copyIndex.CopyFile(file, d) +} + +func (i *indexImpl) UpdateFileInBolt(key []byte, value []byte) error { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + copyIndex, ok := i.i.(index.IndexFileCopyable) + if !ok { + return fmt.Errorf("index implementation does not support file copy") + } + + return copyIndex.UpdateFileInBolt(key, value) +} + // CopyTo (index.Directory, filter) func (i *indexImpl) CopyTo(d index.Directory) (err error) { i.mutex.RLock() @@ -1459,7 +1491,7 @@ func (i *indexImpl) CopyTo(d index.Directory) (err error) { err = copyReader.CopyTo(d) if err != nil { - return fmt.Errorf("error copying index metadata: %v", err) + return fmt.Errorf("error copying index data: %v", err) } // copy the metadata diff --git a/util/keys.go b/util/keys.go index b71a7f48b..11c918865 100644 --- a/util/keys.go +++ b/util/keys.go @@ -17,6 +17,7 @@ package util var ( // Bolt keys BoltSnapshotsBucket = []byte{'s'} + BoltCentroidIndexKey = []byte{'c'} BoltPathKey = []byte{'p'} BoltDeletedKey = []byte{'d'} BoltInternalKey = []byte{'i'} From 2544512e582ad47624e7e4bb3321e00fae0fd3bc Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 11 Dec 2025 14:43:28 -0800 Subject: [PATCH 13/25] bug fix, debug logging --- centroid_index_test.go | 74 ++++++++++++++++++++++++++++++++++++++ go.mod | 20 ++++++++++- index.go | 2 ++ index/scorch/persister.go | 24 +++++++------ index/scorch/scorch.go | 75 ++++++++++++++------------------------- index_alias_impl.go | 12 +------ index_impl.go | 2 ++ 7 files changed, 138 insertions(+), 71 deletions(-) create mode 100644 
centroid_index_test.go diff --git a/centroid_index_test.go b/centroid_index_test.go new file mode 100644 index 000000000..a7334236b --- /dev/null +++ b/centroid_index_test.go @@ -0,0 +1,74 @@ +//go:build vectors +// +build vectors + +package bleve + +import ( + "encoding/json" + "fmt" + "os" + "testing" + + "github.com/blevesearch/bleve/v2/analysis/lang/en" + "github.com/blevesearch/bleve/v2/mapping" + index "github.com/blevesearch/bleve_index_api" +) + +func loadSiftData() ([]map[string]interface{}, error) { + fileContent, err := os.ReadFile("~/fts/data/datasets/vec-sift-bucket.json") + if err != nil { + return nil, err + } + var documents []map[string]interface{} + err = json.Unmarshal(fileContent, &documents) + if err != nil { + return nil, err + } + return documents, nil +} + +func TestCentroidIndex(t *testing.T) { + _, _, err := readDatasetAndQueries(testInputCompressedFile) + if err != nil { + t.Fatal(err) + } + documents, err := loadSiftData() + if err != nil { + t.Fatal(err) + } + contentFieldMapping := NewTextFieldMapping() + contentFieldMapping.Analyzer = en.AnalyzerName + + vecFieldMappingL2 := mapping.NewVectorFieldMapping() + vecFieldMappingL2.Dims = 128 + vecFieldMappingL2.Similarity = index.EuclideanDistance + + indexMappingL2Norm := NewIndexMapping() + indexMappingL2Norm.DefaultMapping.AddFieldMappingsAt("content", contentFieldMapping) + indexMappingL2Norm.DefaultMapping.AddFieldMappingsAt("vector", vecFieldMappingL2) + + idx, err := newIndexUsing(t.TempDir(), indexMappingL2Norm, Config.DefaultIndexType, Config.DefaultKVStore, nil) + if err != nil { + t.Fatal(err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + batch := idx.NewBatch() + for _, doc := range documents[:100000] { + docId := fmt.Sprintf("%s:%s", index.TrainDataPrefix, doc["id"]) + err = batch.Index(docId, doc) + if err != nil { + t.Fatal(err) + } + } + + err = idx.Train(batch) + if err != nil { + t.Fatal(err) + } +} diff --git a/go.mod b/go.mod index cb73edb28..c2ec2c4e6 100644 --- a/go.mod +++ b/go.mod @@ -44,6 +44,24 @@ require ( golang.org/x/sys v0.40.0 // indirect ) +replace github.com/blevesearch/bleve/v2 => /Users/thejas.orkombu/fts/blevesearch/bleve + +replace github.com/blevesearch/zapx/v11 => /Users/thejas.orkombu/fts/blevesearch/zapx11 + +replace github.com/blevesearch/zapx/v12 => /Users/thejas.orkombu/fts/blevesearch/zapx12 + +replace github.com/blevesearch/zapx/v13 => /Users/thejas.orkombu/fts/blevesearch/zapx13 + +replace github.com/blevesearch/zapx/v14 => /Users/thejas.orkombu/fts/blevesearch/zapx14 + +replace github.com/blevesearch/zapx/v15 => /Users/thejas.orkombu/fts/blevesearch/zapx15 + +replace github.com/blevesearch/zapx/v16 => /Users/thejas.orkombu/fts/blevesearch/zapx + replace github.com/blevesearch/scorch_segment_api/v2 => /Users/thejas.orkombu/fts/blevesearch/scorch_segment_api -replace github.com/blevesearch/bleve_index_api => /Users/thejas.orkombu/fts/blevesearch/bleve_index_api \ No newline at end of file +replace github.com/blevesearch/go-faiss => /Users/thejas.orkombu/fts/blevesearch/go-faiss + +replace github.com/blevesearch/bleve_index_api => /Users/thejas.orkombu/fts/blevesearch/bleve_index_api + +replace github.com/blevesearch/sear => /Users/thejas.orkombu/fts/blevesearch/sear diff --git a/index.go b/index.go index 21d016610..bd5421d85 100644 --- a/index.go +++ b/index.go @@ -51,10 +51,12 @@ func (b *Batch) Index(id string, data interface{}) error { eventIndex.FireIndexEvent() } doc := document.NewDocument(id) + // fmt.Printf("data is 
before mapping %#v\n", data)
 	err := b.index.Mapping().MapDocument(doc, data)
 	if err != nil {
 		return err
 	}
+	// fmt.Printf("data is after mapping %#v\n", doc)
 	b.internal.Update(doc)
 
 	b.lastDocSize = uint64(doc.Size() +
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 4ded4c23c..3a4bdacc4 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -857,13 +857,14 @@ func (s *Scorch) updateCentroidIndex(bucket *bolt.Bucket) error {
 	if bucket == nil {
 		return nil
 	}
+	fmt.Println("updateCentroidIndex bucket", bucket != nil)
 	segmentSnapshot, err := s.loadSegment(bucket)
 	if err != nil {
 		return err
 	}
 	s.rootLock.Lock()
 	defer s.rootLock.Unlock()
-
+	fmt.Println("updateCentroidIndex", segmentSnapshot.segment != nil)
 	s.centroidIndex = segmentSnapshot
 	return nil
 }
@@ -872,15 +873,6 @@ func (s *Scorch) updateCentroidIndex(bucket *bolt.Bucket) error {
 // bolt snapshot code
 
 func (s *Scorch) loadFromBolt() error {
 	err := s.rootBolt.View(func(tx *bolt.Tx) error {
-		centroidIndexBucket := tx.Bucket(util.BoltCentroidIndexKey)
-		if centroidIndexBucket == nil {
-			return nil
-		}
-		err := s.updateCentroidIndex(centroidIndexBucket)
-		if err != nil {
-			return err
-		}
-
 		snapshots := tx.Bucket(util.BoltSnapshotsBucket)
 		if snapshots == nil {
 			return nil
@@ -897,6 +889,12 @@ func (s *Scorch) loadFromBolt() error {
 				s.AddEligibleForRemoval(snapshotEpoch)
 				continue
 			}
+			// fmt.Println("loadFromBolt key %s", k)
+			// if k[0] == util.BoltCentroidIndexKey[0] {
+			// 	fmt.Println("loadFromBolt centroid index key", string(k))
+
+			// 	continue
+			// }
 			snapshot := snapshots.Bucket(k)
 			if snapshot == nil {
 				log.Printf("snapshot key, but bucket missing %x, continuing", k)
@@ -928,6 +926,12 @@ func (s *Scorch) loadFromBolt() error {
 			foundRoot = true
 		}
+
+		centroidIndexBucket := snapshots.Bucket(util.BoltCentroidIndexKey)
+		err := s.updateCentroidIndex(centroidIndexBucket)
+		if err != nil {
+			return err
+		}
 		return nil
 	})
 	if err != nil {
diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go
index 74aa2e99f..ca86da936 100644
--- a/index/scorch/scorch.go
+++ b/index/scorch/scorch.go
@@ -555,8 +555,6 @@ func (s *Scorch) getInternal(key []byte) ([]byte, error) {
 	return nil, nil
 }
 
-// min 39 per centroid, recommended 50
-// max 256
 func (s *Scorch) Train(batch *index.Batch) error {
 	// is the lock really needed?
 	s.rootLock.Lock()
 	defer s.rootLock.Unlock()
 	if s.centroidIndex != nil {
@@ -568,6 +566,12 @@ func (s *Scorch) Train(batch *index.Batch) error {
 	var trainData []index.Document
 	if s.centroidIndex == nil {
 		for key, doc := range batch.IndexOps {
+			if doc != nil {
+				// insert _id field
+				// no need to track updates/deletes over here since
+				// the API is singleton
+				doc.AddIDField()
+			}
 			if strings.HasPrefix(key, index.TrainDataPrefix) {
 				trainData = append(trainData, doc)
 			}
 		}
 	}
 
 	// just builds a new vector index out of the train data provided
 	// it'll be an IVF index so the centroids are computed at this stage and
 	// this template will be used in the indexing down the line to index
 	// the data vectors. s.segmentConfig will mark this as a training phase
 	// and zap will handle it accordingly.
 	//
 	// note: this might index text data too, how to handle this? s.segmentConfig?
// todo: updates/deletes -> data drift detection + s.segmentConfig["training"] = true seg, n, err := s.segPlugin.NewEx(trainData, s.segmentConfig) if err != nil { @@ -620,33 +619,14 @@ func (s *Scorch) Train(batch *index.Batch) error { } fmt.Println("number of bytes written to centroid index", n) - // s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex - // updateBolt(tx, cetntroid) - // filename := "centroid_index" - // path := filepath.Join(s.path, filename) - // f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0600) - // if err != nil { - // return err - // } - - // bufw := bufio.NewWriter(f) - // _, err = bufw.Write([]byte(strings.Join([]string{"centroid_index1", path}, " "))) - // if err != nil { - // return err - // } - // err = bufw.Flush() - // if err != nil { - // return err - // } - // err = f.Sync() - // if err != nil { - // return err - // } - // err = f.Close() - // if err != nil { - // return err - // } + s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex + // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint + // where we can be sure that the centroid index is available for the indexing operations downstream + // + // note: when the scale increases massively especially with real world dimensions of 1536+, this API + // will have to be refactored to persist in a more resource efficient way. so having this bolt related + // code will help in tracking the progress a lot better and avoid any redudant data streaming operations. tx, err := s.rootBolt.Begin(true) if err != nil { return err @@ -658,7 +638,11 @@ func (s *Scorch) Train(batch *index.Batch) error { return err } - err = snapshotsBucket.Put(util.BoltCentroidIndexKey, []byte(path)) + centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) + if err != nil { + return err + } + err = centroidBucket.Put(util.BoltPathKey, []byte(filename)) if err != nil { return err } @@ -675,7 +659,7 @@ func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { // return the coarse quantizer of the centroid index belonging to the field centroidIndexSegment, ok := s.centroidIndex.segment.(segment.CentroidIndexSegment) if !ok { - return nil, fmt.Errorf("segment is not a centroid index segment") + return nil, fmt.Errorf("segment is not a centroid index segment", s.centroidIndex.segment != nil) } coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) if err != nil { @@ -1129,20 +1113,6 @@ func (s *Scorch) CopyReader() index.CopyReader { return rv } -func (s *Scorch) updateCentroidIndexInBolt(tx *bolt.Tx) error { - centroidIndexBucket, err := tx.CreateBucketIfNotExists(util.BoltCentroidIndexKey) - if err != nil { - return err - } - - err = centroidIndexBucket.Put(util.BoltPathKey, []byte("centroid_index.zap")) - if err != nil { - return err - } - - return nil -} - func (s *Scorch) UpdateFileInBolt(key []byte, value []byte) error { tx, err := s.rootBolt.Begin(true) if err != nil { @@ -1157,13 +1127,20 @@ func (s *Scorch) UpdateFileInBolt(key []byte, value []byte) error { // currently this is specific to centroid index file update if bytes.Equal(key, util.BoltCentroidIndexKey) { - // guard against duplicate updates - existingValue := snapshotsBucket.Get(key) + // todo: guard against duplicate updates + centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) + if err != nil { + return err + } + if centroidBucket == nil { + return fmt.Errorf("centroid bucket not found") + } + 
existingValue := centroidBucket.Get(util.BoltPathKey) if existingValue != nil { - return fmt.Errorf("key already exists") + return fmt.Errorf("key already exists %v %v", s.path, string(existingValue)) } - err = snapshotsBucket.Put(key, value) + err = centroidBucket.Put(util.BoltPathKey, value) if err != nil { return err } @@ -1194,7 +1171,7 @@ func (s *Scorch) CopyFile(file string, d index.IndexDirectory) error { // centroid index file - this is outside the snapshots domain so the bolt update is different err := d.UpdateFileInBolt(util.BoltCentroidIndexKey, []byte(file)) if err != nil { - return err + return fmt.Errorf("error updating dest index bolt: %w", err) } } diff --git a/index_alias_impl.go b/index_alias_impl.go index 8cc1d90ed..16f20ac45 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -107,17 +107,7 @@ func (i *indexAliasImpl) Train(batch *Batch) error { i.mutex.RLock() defer i.mutex.RUnlock() - if !i.open { - return ErrorIndexClosed - } - err := i.isAliasToSingleIndex() - if err != nil { - return err - } - - if vi, ok := i.indexes[0].(index.VectorIndex); ok { - return vi.Train(batch.internal) - } + // TODO: implement this return fmt.Errorf("not a vector index") } diff --git a/index_impl.go b/index_impl.go index 3391d78c5..ec567083f 100644 --- a/index_impl.go +++ b/index_impl.go @@ -326,11 +326,13 @@ func (i *indexImpl) Index(id string, data interface{}) (err error) { i.FireIndexEvent() + // fmt.Printf("data is %#v\n", data) doc := document.NewDocument(id) err = i.m.MapDocument(doc, data) if err != nil { return } + // fmt.Printf("data is after mapping %#v\n", doc) err = i.i.Update(doc) return } From 3b0470bbe92b1aefa218e1dcffee8a14608162da Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 15 Dec 2025 11:35:30 -0800 Subject: [PATCH 14/25] wip: implement async trainer loop with incremental training support --- index/scorch/scorch.go | 173 +++++++++++++++++++++++++++++------------ util/keys.go | 1 + 2 files changed, 125 insertions(+), 49 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index ca86da936..888bc9847 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -16,6 +16,7 @@ package scorch import ( "bytes" + "encoding/binary" "encoding/json" "fmt" "io" @@ -84,6 +85,7 @@ type Scorch struct { asyncTasks sync.WaitGroup // not a real searchable segment, singleton centroidIndex *SegmentSnapshot + train chan *trainRequest onEvent func(event Event) bool onAsyncError func(err error, path string) @@ -101,6 +103,12 @@ func (t ScorchErrorType) Error() string { return string(t) } +type trainRequest struct { + sample segment.Segment + vecCount int + ackCh chan error +} + // ErrType values for ScorchError const ( ErrAsyncPanic = ScorchErrorType("async panic error") @@ -549,13 +557,118 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { func (s *Scorch) getInternal(key []byte) ([]byte, error) { s.rootLock.RLock() defer s.rootLock.RUnlock() + // todo: return the total number of vectors that have been processed so far in training + // in cbft use that as a checkpoint to resume training for n-x samples. if string(key) == "_centroid_index_complete" { return []byte(fmt.Sprintf("%t", s.centroidIndex != nil)), nil } return nil, nil } +// this is not a routine that will be running throughout the lifetime of the index. It's purpose +// is to only train the vector index before the data ingestion starts. 
+func (s *Scorch) trainerLoop() { + // some init stuff + s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex + var totalSamplesProcessed int + filename := "centroid_index" + path := filepath.Join(s.path, filename) + buf := make([]byte, binary.MaxVarintLen64) + for { + select { + case <-s.closeCh: + return + case trainReq := <-s.train: + sampleSeg := trainReq.sample + if s.centroidIndex == nil { + // new centroid index + s.centroidIndex = &SegmentSnapshot{ + segment: sampleSeg, + } + switch seg := sampleSeg.(type) { + case segment.UnpersistedSegment: + err := persistToDirectory(seg, nil, path) + if err != nil { + // clean up this ugly ass error handling code + trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err) + close(trainReq.ackCh) + } + default: + fmt.Errorf("segment is not a unpersisted segment") + close(s.closeCh) + } + } else { + // merge the new segment with the existing one, no need to persist? + // persist in a tmp file and then rename - is that a fair strategy? + _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, + []*roaring.Bitmap{nil, nil}, "centroid_index.tmp", s.closeCh, nil, s.segmentConfig) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) + close(trainReq.ackCh) + } + + // close the existing centroid segment - it's supposed to be gc'd at this point + s.centroidIndex.segment.Close() + err = os.Rename(filepath.Join(s.path, "centroid_index.tmp"), filepath.Join(s.path, "centroid_index")) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err) + close(trainReq.ackCh) + } + } + + totalSamplesProcessed += trainReq.vecCount + // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint + // where we can be sure that the centroid index is available for the indexing operations downstream + // + // note: when the scale increases massively especially with real world dimensions of 1536+, this API + // will have to be refactored to persist in a more resource efficient way. so having this bolt related + // code will help in tracking the progress a lot better and avoid any redudant data streaming operations. 
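The bolt transaction that follows is the checkpoint this comment describes. For reference, the same write can be phrased through bbolt's Update wrapper, which owns the Begin/Commit/Rollback lifecycle; the sketch below assumes the single-byte names from util/keys.go as they stand at this point in the series ('s' snapshots bucket, 'c' centroid bucket, 'p' path key):

    package main

    import (
        "log"

        bolt "go.etcd.io/bbolt"
    )

    // checkpoint records the centroid index filename under
    // snapshots/centroid/path so a restart can find and reopen it.
    func checkpoint(db *bolt.DB, filename string) error {
        return db.Update(func(tx *bolt.Tx) error {
            snapshots, err := tx.CreateBucketIfNotExists([]byte{'s'})
            if err != nil {
                return err
            }
            centroid, err := snapshots.CreateBucketIfNotExists([]byte{'c'})
            if err != nil {
                return err
            }
            return centroid.Put([]byte{'p'}, []byte(filename))
        })
    }

    func main() {
        db, err := bolt.Open("root.bolt", 0600, nil)
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()
        if err := checkpoint(db, "centroid_index"); err != nil {
            log.Fatal(err)
        }
    }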
+ tx, err := s.rootBolt.Begin(true) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error starting bolt transaction: %v", err) + close(trainReq.ackCh) + } + defer tx.Rollback() + + snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error creating snapshots bucket: %v", err) + close(trainReq.ackCh) + } + + centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error creating centroid bucket: %v", err) + close(trainReq.ackCh) + } + + err = centroidBucket.Put(util.BoltPathKey, []byte(filename)) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error updating centroid bucket: %v", err) + close(trainReq.ackCh) + } + + // total number of vectors that have been processed so far for the training + n := binary.PutUvarint(buf, uint64(totalSamplesProcessed)) + err = centroidBucket.Put(util.BoltVecSamplesProcessedKey, buf[:n]) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error updating vec samples processed: %v", err) + close(trainReq.ackCh) + } + + err = tx.Commit() + if err != nil { + trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) + close(trainReq.ackCh) + } + + close(trainReq.ackCh) + } + } +} + func (s *Scorch) Train(batch *index.Batch) error { + // is the lock really needed? s.rootLock.Lock() defer s.rootLock.Unlock() @@ -586,7 +699,6 @@ func (s *Scorch) Train(batch *index.Batch) error { // // note: this might index text data too, how to handle this? s.segmentConfig? // todo: updates/deletes -> data drift detection - s.segmentConfig["training"] = true seg, n, err := s.segPlugin.NewEx(trainData, s.segmentConfig) if err != nil { @@ -594,65 +706,28 @@ func (s *Scorch) Train(batch *index.Batch) error { } // reset the training flag once completed s.segmentConfig["training"] = false - // not suffixing with .zap since the current garbage collection is tailored to .zap ext files - // we don't want to gc this file ever. - filename := "centroid_index" - path := filepath.Join(s.path, filename) - switch seg := seg.(type) { - case segment.UnpersistedSegment: - err = persistToDirectory(seg, nil, path) - if err != nil { - return err - } - default: - return fmt.Errorf("segment is not a unpersisted segment") + trainReq := &trainRequest{ + sample: seg, + vecCount: len(trainData), // todo: multivector support + ackCh: make(chan error), } - // persist and open the segment mmap mode. - persistedSegment, err := s.segPlugin.OpenEx(path, s.segmentConfig) + s.train <- trainReq + err = <-trainReq.ackCh if err != nil { return err } - s.centroidIndex = &SegmentSnapshot{ - segment: persistedSegment, - } - fmt.Println("number of bytes written to centroid index", n) - s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex - - // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint - // where we can be sure that the centroid index is available for the indexing operations downstream - // - // note: when the scale increases massively especially with real world dimensions of 1536+, this API - // will have to be refactored to persist in a more resource efficient way. so having this bolt related - // code will help in tracking the progress a lot better and avoid any redudant data streaming operations. 
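The samples-processed counter above goes into bolt as a uvarint. A round-trip sketch of that encoding, matching the PutUvarint call in the loop; the read side is what a resuming trainer would decode to skip the already-consumed prefix of the sample stream:

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func main() {
        // write side: encode the running total into a reusable buffer
        buf := make([]byte, binary.MaxVarintLen64)
        n := binary.PutUvarint(buf, 100000)
        stored := buf[:n] // this slice is what lands in the bolt bucket

        // read side: decode the checkpoint; read <= 0 signals a corrupt value
        decoded, read := binary.Uvarint(stored)
        if read <= 0 {
            panic("corrupt samples-processed counter")
        }
        fmt.Println(decoded) // 100000
    }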
- tx, err := s.rootBolt.Begin(true) + centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) if err != nil { return err } - defer tx.Rollback() - - snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) - if err != nil { - return err - } - - centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) - if err != nil { - return err - } - err = centroidBucket.Put(util.BoltPathKey, []byte(filename)) - if err != nil { - return err - } - - err = tx.Commit() - if err != nil { - return err + s.centroidIndex = &SegmentSnapshot{ + segment: centroidIndex, } - - return nil + fmt.Println("number of bytes written to centroid index", n) + return err } func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { diff --git a/util/keys.go b/util/keys.go index 11c918865..67415e782 100644 --- a/util/keys.go +++ b/util/keys.go @@ -18,6 +18,7 @@ var ( // Bolt keys BoltSnapshotsBucket = []byte{'s'} BoltCentroidIndexKey = []byte{'c'} + BoltVecSamplesProcessedKey = []byte{'v'} BoltPathKey = []byte{'p'} BoltDeletedKey = []byte{'d'} BoltInternalKey = []byte{'i'} From edde0cad9b737e23b9739eb9088888df1b204ec2 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 15 Dec 2025 11:36:44 -0800 Subject: [PATCH 15/25] regulate train function using EventKindIndexStart --- index/scorch/scorch.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 888bc9847..30bf7ea02 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -668,6 +668,8 @@ func (s *Scorch) trainerLoop() { } func (s *Scorch) Train(batch *index.Batch) error { + // regulate the Train function + s.FireIndexEvent() // is the lock really needed? s.rootLock.Lock() From 758ed77c2f5fd88e26f978a16e88396c727d22f5 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Fri, 9 Jan 2026 13:50:07 -0800 Subject: [PATCH 16/25] incremental training bug fixes + better recoverability --- index/scorch/scorch.go | 85 ++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 30bf7ea02..29900b770 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -281,6 +281,9 @@ func (s *Scorch) Open() error { s.asyncTasks.Add(1) go s.introducerLoop() + s.asyncTasks.Add(1) + go s.trainerLoop() + if !s.readOnly && s.path != "" { s.asyncTasks.Add(1) go s.persisterLoop() @@ -356,6 +359,7 @@ func (s *Scorch) openBolt() error { s.persisterNotifier = make(chan *epochWatcher, 1) s.closeCh = make(chan struct{}) s.forceMergeRequestCh = make(chan *mergerCtrl, 1) + s.train = make(chan *trainRequest) if !s.readOnly && s.path != "" { err := s.removeOldZapFiles() // Before persister or merger create any new files. @@ -565,9 +569,21 @@ func (s *Scorch) getInternal(key []byte) ([]byte, error) { return nil, nil } +func moveFile(sourcePath, destPath string) error { + // rename is supposed to be atomic on the same filesystem + err := os.Rename(sourcePath, destPath) + if err != nil { + return fmt.Errorf("error renaming file: %v", err) + } + return nil +} + // this is not a routine that will be running throughout the lifetime of the index. It's purpose // is to only train the vector index before the data ingestion starts. 
func (s *Scorch) trainerLoop() { + defer func() { + s.asyncTasks.Done() + }() // some init stuff s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex var totalSamplesProcessed int @@ -581,10 +597,6 @@ func (s *Scorch) trainerLoop() { case trainReq := <-s.train: sampleSeg := trainReq.sample if s.centroidIndex == nil { - // new centroid index - s.centroidIndex = &SegmentSnapshot{ - segment: sampleSeg, - } switch seg := sampleSeg.(type) { case segment.UnpersistedSegment: err := persistToDirectory(seg, nil, path) @@ -592,30 +604,35 @@ func (s *Scorch) trainerLoop() { // clean up this ugly ass error handling code trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err) close(trainReq.ackCh) + return } default: fmt.Errorf("segment is not a unpersisted segment") close(s.closeCh) + return } } else { // merge the new segment with the existing one, no need to persist? // persist in a tmp file and then rename - is that a fair strategy? + fmt.Println("merging centroid index") + s.segmentConfig["training"] = true _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, - []*roaring.Bitmap{nil, nil}, "centroid_index.tmp", s.closeCh, nil, s.segmentConfig) + []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, "centroid_index.tmp"), s.closeCh, nil, s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) close(trainReq.ackCh) } + // reset the training flag once completed + s.segmentConfig["training"] = false // close the existing centroid segment - it's supposed to be gc'd at this point s.centroidIndex.segment.Close() - err = os.Rename(filepath.Join(s.path, "centroid_index.tmp"), filepath.Join(s.path, "centroid_index")) + err = moveFile(filepath.Join(s.path, "centroid_index.tmp"), filepath.Join(s.path, "centroid_index")) if err != nil { trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err) close(trainReq.ackCh) } } - totalSamplesProcessed += trainReq.vecCount // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint // where we can be sure that the centroid index is available for the indexing operations downstream @@ -627,6 +644,7 @@ func (s *Scorch) trainerLoop() { if err != nil { trainReq.ackCh <- fmt.Errorf("error starting bolt transaction: %v", err) close(trainReq.ackCh) + return } defer tx.Rollback() @@ -634,18 +652,21 @@ func (s *Scorch) trainerLoop() { if err != nil { trainReq.ackCh <- fmt.Errorf("error creating snapshots bucket: %v", err) close(trainReq.ackCh) + return } centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) if err != nil { trainReq.ackCh <- fmt.Errorf("error creating centroid bucket: %v", err) close(trainReq.ackCh) + return } err = centroidBucket.Put(util.BoltPathKey, []byte(filename)) if err != nil { trainReq.ackCh <- fmt.Errorf("error updating centroid bucket: %v", err) close(trainReq.ackCh) + return } // total number of vectors that have been processed so far for the training @@ -654,14 +675,25 @@ func (s *Scorch) trainerLoop() { if err != nil { trainReq.ackCh <- fmt.Errorf("error updating vec samples processed: %v", err) close(trainReq.ackCh) + return } err = tx.Commit() if err != nil { trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) close(trainReq.ackCh) + return } + centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) + 
close(trainReq.ackCh) + return + } + s.centroidIndex = &SegmentSnapshot{ + segment: centroidIndex, + } close(trainReq.ackCh) } } @@ -671,25 +703,20 @@ func (s *Scorch) Train(batch *index.Batch) error { // regulate the Train function s.FireIndexEvent() - // is the lock really needed? - s.rootLock.Lock() - defer s.rootLock.Unlock() - if s.centroidIndex != nil { - // singleton API - return nil - } + // // is the lock really needed? + // s.rootLock.Lock() + // defer s.rootLock.Unlock() + var trainData []index.Document - if s.centroidIndex == nil { - for key, doc := range batch.IndexOps { - if doc != nil { - // insert _id field - // no need to track updates/deletes over here since - // the API is singleton - doc.AddIDField() - } - if strings.HasPrefix(key, index.TrainDataPrefix) { - trainData = append(trainData, doc) - } + for key, doc := range batch.IndexOps { + if doc != nil { + // insert _id field + // no need to track updates/deletes over here since + // the API is singleton + doc.AddIDField() + } + if strings.HasPrefix(key, index.TrainDataPrefix) { + trainData = append(trainData, doc) } } @@ -701,13 +728,10 @@ func (s *Scorch) Train(batch *index.Batch) error { // // note: this might index text data too, how to handle this? s.segmentConfig? // todo: updates/deletes -> data drift detection - s.segmentConfig["training"] = true seg, n, err := s.segPlugin.NewEx(trainData, s.segmentConfig) if err != nil { return err } - // reset the training flag once completed - s.segmentConfig["training"] = false trainReq := &trainRequest{ sample: seg, @@ -718,6 +742,7 @@ func (s *Scorch) Train(batch *index.Batch) error { s.train <- trainReq err = <-trainReq.ackCh if err != nil { + fmt.Println("error training", err) return err } @@ -728,6 +753,10 @@ func (s *Scorch) Train(batch *index.Batch) error { s.centroidIndex = &SegmentSnapshot{ segment: centroidIndex, } + _, err = s.getCentroidIndex("emb") + if err != nil { + return err + } fmt.Println("number of bytes written to centroid index", n) return err } From 734e2727eb15df922ad792c3aa1eb19aefdf1ac2 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 15 Jan 2026 13:28:53 -0800 Subject: [PATCH 17/25] cleanup: --- index/scorch/merge.go | 1 + index/scorch/persister.go | 2 +- index/scorch/scorch.go | 17 ++++++----------- index/scorch/segment_plugin.go | 1 + 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index bca9bbb81..e7f7cce3f 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -537,6 +537,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, filename := zapFileName(newSegmentID) path := s.path + string(os.PathSeparator) + filename + fmt.Println("version while merging", s.segPlugin.Version()) // the newly merged segment is already flushed out to disk, just needs // to be opened using mmap. 
newDocIDs, _, err := diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 3a4bdacc4..00c26a5d2 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -425,7 +425,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot, po *persiste var totSize int var numSegsToFlushOut int var totDocs uint64 - + fmt.Println("persister doing its thing") // legacy behaviour of merge + flush of all in-memory segments in one-shot if legacyFlushBehaviour(po.MaxSizeInMemoryMergePerWorker, po.NumPersisterWorkers) { val := &flushable{ diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 29900b770..8153f65fb 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -616,6 +616,7 @@ func (s *Scorch) trainerLoop() { // persist in a tmp file and then rename - is that a fair strategy? fmt.Println("merging centroid index") s.segmentConfig["training"] = true + fmt.Println("version while merging", s.segPlugin.Version()) _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, "centroid_index.tmp"), s.closeCh, nil, s.segmentConfig) if err != nil { @@ -685,6 +686,8 @@ func (s *Scorch) trainerLoop() { return } + // update the centroid index pointer + fmt.Println("version", s.segPlugin.Version()) centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) @@ -703,10 +706,6 @@ func (s *Scorch) Train(batch *index.Batch) error { // regulate the Train function s.FireIndexEvent() - // // is the lock really needed? - // s.rootLock.Lock() - // defer s.rootLock.Unlock() - var trainData []index.Document for key, doc := range batch.IndexOps { if doc != nil { @@ -745,14 +744,8 @@ func (s *Scorch) Train(batch *index.Batch) error { fmt.Println("error training", err) return err } + fmt.Println("got centroid index") - centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) - if err != nil { - return err - } - s.centroidIndex = &SegmentSnapshot{ - segment: centroidIndex, - } _, err = s.getCentroidIndex("emb") if err != nil { return err @@ -767,6 +760,8 @@ func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { if !ok { return nil, fmt.Errorf("segment is not a centroid index segment", s.centroidIndex.segment != nil) } + + fmt.Println("getting coarse quantizer", field) coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) if err != nil { return nil, err diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index 16be8e440..deeeffd3f 100644 --- a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -101,6 +101,7 @@ func RegisterSegmentPlugin(plugin SegmentPlugin, makeDefault bool) { } supportedSegmentPlugins[plugin.Type()][plugin.Version()] = plugin if makeDefault { + fmt.Println("registering default segment plugin", plugin.Type(), plugin.Version()) defaultSegmentPlugin = plugin } } From a41f99c5c76da1cbcdc858ed4d8f0d5c634c0c68 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 26 Jan 2026 12:59:07 -0800 Subject: [PATCH 18/25] cleanup and refactor the code to have the foundational stuff --- index/scorch/merge.go | 2 ++ index/scorch/persister.go | 6 ++---- index/scorch/scorch.go | 42 ++++++++++++++++++++------------------- util/keys.go | 2 +- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/index/scorch/merge.go 
b/index/scorch/merge.go index e7f7cce3f..e75e1c23e 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -335,6 +335,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) mergedSegHistory := make(map[uint64]*mergedSegmentHistory, len(task.Segments)) + var files []string for _, planSegment := range task.Segments { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { oldMap[segSnapshot.id] = segSnapshot @@ -350,6 +351,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, } else { segmentsToMerge = append(segmentsToMerge, segSnapshot.segment) docsToDrop = append(docsToDrop, segSnapshot.deleted) + files = append(files, persistedSeg.Path()) } // track the files getting merged for unsetting the // removal ineligibility. This helps to unflip files diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 00c26a5d2..1d118bb50 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -853,18 +853,16 @@ func zapFileName(epoch uint64) string { return fmt.Sprintf("%012x.zap", epoch) } -func (s *Scorch) updateCentroidIndex(bucket *bolt.Bucket) error { +func (s *Scorch) loadCentroidIndex(bucket *bolt.Bucket) error { if bucket == nil { return nil } - fmt.Println("updateCentroidIndex bucket", bucket != nil) segmentSnapshot, err := s.loadSegment(bucket) if err != nil { return err } s.rootLock.Lock() defer s.rootLock.Unlock() - fmt.Println("updateCentroidIndex", segmentSnapshot.segment != nil) s.centroidIndex = segmentSnapshot return nil } @@ -928,7 +926,7 @@ func (s *Scorch) loadFromBolt() error { } centroidIndexBucket := snapshots.Bucket(util.BoltCentroidIndexKey) - err := s.updateCentroidIndex(centroidIndexBucket) + err := s.loadCentroidIndex(centroidIndexBucket) if err != nil { return err } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 8153f65fb..cb5367117 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -16,7 +16,6 @@ package scorch import ( "bytes" - "encoding/binary" "encoding/json" "fmt" "io" @@ -26,6 +25,7 @@ import ( "sync" "sync/atomic" "time" + "unsafe" "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/registry" @@ -563,7 +563,8 @@ func (s *Scorch) getInternal(key []byte) ([]byte, error) { defer s.rootLock.RUnlock() // todo: return the total number of vectors that have been processed so far in training // in cbft use that as a checkpoint to resume training for n-x samples. - if string(key) == "_centroid_index_complete" { + switch string(key) { + case string(util.BoltTrainCompleteKey): return []byte(fmt.Sprintf("%t", s.centroidIndex != nil)), nil } return nil, nil @@ -578,6 +579,10 @@ func moveFile(sourcePath, destPath string) error { return nil } +func boolToByte(b bool) byte { + return *(*byte)(unsafe.Pointer(&b)) +} + // this is not a routine that will be running throughout the lifetime of the index. It's purpose // is to only train the vector index before the data ingestion starts. func (s *Scorch) trainerLoop() { @@ -589,7 +594,6 @@ func (s *Scorch) trainerLoop() { var totalSamplesProcessed int filename := "centroid_index" path := filepath.Join(s.path, filename) - buf := make([]byte, binary.MaxVarintLen64) for { select { case <-s.closeCh: @@ -614,9 +618,7 @@ func (s *Scorch) trainerLoop() { } else { // merge the new segment with the existing one, no need to persist? // persist in a tmp file and then rename - is that a fair strategy? 
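On the "tmp file then rename" question the comment raises: os.Rename replaces the destination atomically when source and destination sit on the same filesystem, so a concurrent reader sees either the old centroid index or the new one, never a torn write. A minimal sketch of the strategy; a fully durable variant would also fsync the file and its parent directory before renaming:

    package main

    import (
        "log"
        "os"
    )

    // replaceAtomically stages data in path+".tmp" and swaps it into place.
    // The rename is atomic on POSIX filesystems within a single mount.
    func replaceAtomically(path string, data []byte) error {
        tmp := path + ".tmp"
        if err := os.WriteFile(tmp, data, 0600); err != nil {
            return err
        }
        return os.Rename(tmp, path)
    }

    func main() {
        if err := replaceAtomically("centroid_index", []byte("...")); err != nil {
            log.Fatal(err)
        }
    }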
- fmt.Println("merging centroid index") s.segmentConfig["training"] = true - fmt.Println("version while merging", s.segPlugin.Version()) _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, "centroid_index.tmp"), s.closeCh, nil, s.segmentConfig) if err != nil { @@ -647,7 +649,11 @@ func (s *Scorch) trainerLoop() { close(trainReq.ackCh) return } - defer tx.Rollback() + defer func() { + if err != nil { + _ = tx.Rollback() + } + }() snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) if err != nil { @@ -670,16 +676,14 @@ func (s *Scorch) trainerLoop() { return } - // total number of vectors that have been processed so far for the training - n := binary.PutUvarint(buf, uint64(totalSamplesProcessed)) - err = centroidBucket.Put(util.BoltVecSamplesProcessedKey, buf[:n]) + err = tx.Commit() if err != nil { - trainReq.ackCh <- fmt.Errorf("error updating vec samples processed: %v", err) + trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) close(trainReq.ackCh) return } - err = tx.Commit() + err = s.rootBolt.Sync() if err != nil { trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) close(trainReq.ackCh) @@ -687,7 +691,6 @@ func (s *Scorch) trainerLoop() { } // update the centroid index pointer - fmt.Println("version", s.segPlugin.Version()) centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) @@ -706,6 +709,8 @@ func (s *Scorch) Train(batch *index.Batch) error { // regulate the Train function s.FireIndexEvent() + // batch.InternalOps + var trainData []index.Document for key, doc := range batch.IndexOps { if doc != nil { @@ -741,15 +746,9 @@ func (s *Scorch) Train(batch *index.Batch) error { s.train <- trainReq err = <-trainReq.ackCh if err != nil { - fmt.Println("error training", err) return err } - fmt.Println("got centroid index") - _, err = s.getCentroidIndex("emb") - if err != nil { - return err - } fmt.Println("number of bytes written to centroid index", n) return err } @@ -761,7 +760,6 @@ func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { return nil, fmt.Errorf("segment is not a centroid index segment", s.centroidIndex.segment != nil) } - fmt.Println("getting coarse quantizer", field) coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) if err != nil { return nil, err @@ -1219,7 +1217,11 @@ func (s *Scorch) UpdateFileInBolt(key []byte, value []byte) error { if err != nil { return err } - defer tx.Rollback() + defer func() { + if err != nil { + _ = tx.Rollback() + } + }() snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) if err != nil { diff --git a/util/keys.go b/util/keys.go index 67415e782..a1f3bfbbf 100644 --- a/util/keys.go +++ b/util/keys.go @@ -18,7 +18,7 @@ var ( // Bolt keys BoltSnapshotsBucket = []byte{'s'} BoltCentroidIndexKey = []byte{'c'} - BoltVecSamplesProcessedKey = []byte{'v'} + BoltTrainCompleteKey = []byte{'t'} BoltPathKey = []byte{'p'} BoltDeletedKey = []byte{'d'} BoltInternalKey = []byte{'i'} From 0331a9317f18b765814daab93b9afc370ed662c1 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 29 Jan 2026 11:02:24 -0800 Subject: [PATCH 19/25] refactor file transfer --- index.go | 7 --- index/scorch/merge.go | 1 - index/scorch/persister.go | 6 -- index/scorch/scorch.go | 100 ++------------------------------- 
index/scorch/segment_plugin.go | 1 - index/scorch/snapshot_index.go | 3 +- index_impl.go | 37 ------------ 7 files changed, 7 insertions(+), 148 deletions(-) diff --git a/index.go b/index.go index bd5421d85..c083787c4 100644 --- a/index.go +++ b/index.go @@ -51,12 +51,10 @@ func (b *Batch) Index(id string, data interface{}) error { eventIndex.FireIndexEvent() } doc := document.NewDocument(id) - // fmt.Printf("data is before mapping %#v\n", data) err := b.index.Mapping().MapDocument(doc, data) if err != nil { return err } - // fmt.Printf("data is after mapping %#v\n", doc) b.internal.Update(doc) b.lastDocSize = uint64(doc.Size() + @@ -355,11 +353,6 @@ type IndexCopyable interface { CopyTo(d index.Directory) error } -type IndexFileCopyable interface { - UpdateFileInBolt(key []byte, value []byte) error - CopyFile(file string, d index.IndexDirectory) error -} - // FileSystemDirectory is the default implementation for the // index.Directory interface. type FileSystemDirectory string diff --git a/index/scorch/merge.go b/index/scorch/merge.go index e75e1c23e..32de86bd4 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -539,7 +539,6 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, filename := zapFileName(newSegmentID) path := s.path + string(os.PathSeparator) + filename - fmt.Println("version while merging", s.segPlugin.Version()) // the newly merged segment is already flushed out to disk, just needs // to be opened using mmap. newDocIDs, _, err := diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 1d118bb50..fb250001b 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -425,7 +425,6 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot, po *persiste var totSize int var numSegsToFlushOut int var totDocs uint64 - fmt.Println("persister doing its thing") // legacy behaviour of merge + flush of all in-memory segments in one-shot if legacyFlushBehaviour(po.MaxSizeInMemoryMergePerWorker, po.NumPersisterWorkers) { val := &flushable{ @@ -887,12 +886,7 @@ func (s *Scorch) loadFromBolt() error { s.AddEligibleForRemoval(snapshotEpoch) continue } - // fmt.Println("loadFromBolt key %s", k) - // if k[0] == util.BoltCentroidIndexKey[0] { - // fmt.Println("loadFromBolt centroid index key", string(k)) - // continue - // } snapshot := snapshots.Bucket(k) if snapshot == nil { log.Printf("snapshot key, but bucket missing %x, continuing", k) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index cb5367117..5fa8483e7 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -15,10 +15,8 @@ package scorch import ( - "bytes" "encoding/json" "fmt" - "io" "os" "path/filepath" "strings" @@ -83,7 +81,7 @@ type Scorch struct { persisterNotifier chan *epochWatcher rootBolt *bolt.DB asyncTasks sync.WaitGroup - // not a real searchable segment, singleton + // not a real searchable segment centroidIndex *SegmentSnapshot train chan *trainRequest @@ -184,7 +182,6 @@ func NewScorch(storeName string, } } - // "pretraining": true segConfig, ok := config["segmentConfig"].(map[string]interface{}) if ok { rv.segmentConfig = segConfig @@ -592,7 +589,7 @@ func (s *Scorch) trainerLoop() { // some init stuff s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex var totalSamplesProcessed int - filename := "centroid_index" + filename := index.CentroidIndexFileName path := filepath.Join(s.path, filename) for { select { @@ -620,7 +617,7 @@ func (s *Scorch) trainerLoop() { // persist in a tmp file and then 
rename - is that a fair strategy? s.segmentConfig["training"] = true _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, - []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, "centroid_index.tmp"), s.closeCh, nil, s.segmentConfig) + []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, index.CentroidIndexFileName+".tmp"), s.closeCh, nil, s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) close(trainReq.ackCh) @@ -630,7 +627,7 @@ func (s *Scorch) trainerLoop() { // close the existing centroid segment - it's supposed to be gc'd at this point s.centroidIndex.segment.Close() - err = moveFile(filepath.Join(s.path, "centroid_index.tmp"), filepath.Join(s.path, "centroid_index")) + err = moveFile(filepath.Join(s.path, index.CentroidIndexFileName+".tmp"), filepath.Join(s.path, index.CentroidIndexFileName)) if err != nil { trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err) close(trainReq.ackCh) @@ -691,7 +688,7 @@ func (s *Scorch) trainerLoop() { } // update the centroid index pointer - centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) + centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, index.centroidIndexFileName), s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) close(trainReq.ackCh) @@ -709,8 +706,6 @@ func (s *Scorch) Train(batch *index.Batch) error { // regulate the Train function s.FireIndexEvent() - // batch.InternalOps - var trainData []index.Document for key, doc := range batch.IndexOps { if doc != nil { @@ -732,7 +727,7 @@ func (s *Scorch) Train(batch *index.Batch) error { // // note: this might index text data too, how to handle this? s.segmentConfig? 
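Train selects its samples purely by key prefix. A self-contained sketch of that filter, with a placeholder "train:" prefix standing in for index.TrainDataPrefix, whose actual value lives in the bleve_index_api fork this series builds against:

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        const trainDataPrefix = "train:" // stand-in for index.TrainDataPrefix

        // a mixed batch: only prefixed keys become training samples
        ops := map[string]string{
            "train:doc1": "sample vector doc",
            "doc2":       "regular doc",
        }

        var trainData []string
        for key, doc := range ops {
            if strings.HasPrefix(key, trainDataPrefix) {
                trainData = append(trainData, doc)
            }
        }
        fmt.Println(len(trainData)) // 1
    }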
// todo: updates/deletes -> data drift detection - seg, n, err := s.segPlugin.NewEx(trainData, s.segmentConfig) + seg, _, err := s.segPlugin.NewEx(trainData, s.segmentConfig) if err != nil { return err } @@ -749,7 +744,6 @@ func (s *Scorch) Train(batch *index.Batch) error { return err } - fmt.Println("number of bytes written to centroid index", n) return err } @@ -1212,88 +1206,6 @@ func (s *Scorch) CopyReader() index.CopyReader { return rv } -func (s *Scorch) UpdateFileInBolt(key []byte, value []byte) error { - tx, err := s.rootBolt.Begin(true) - if err != nil { - return err - } - defer func() { - if err != nil { - _ = tx.Rollback() - } - }() - - snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) - if err != nil { - return err - } - - // currently this is specific to centroid index file update - if bytes.Equal(key, util.BoltCentroidIndexKey) { - // todo: guard against duplicate updates - centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) - if err != nil { - return err - } - if centroidBucket == nil { - return fmt.Errorf("centroid bucket not found") - } - existingValue := centroidBucket.Get(util.BoltPathKey) - if existingValue != nil { - return fmt.Errorf("key already exists %v %v", s.path, string(existingValue)) - } - - err = centroidBucket.Put(util.BoltPathKey, value) - if err != nil { - return err - } - } - - err = tx.Commit() - if err != nil { - return err - } - - err = s.rootBolt.Sync() - if err != nil { - return err - } - - return nil -} - -// CopyFile copies a specific file to a destination directory which has an access to a bleve index -// doing a io.Copy() isn't enough because the file needs to be tracked in bolt file as well -func (s *Scorch) CopyFile(file string, d index.IndexDirectory) error { - s.rootLock.Lock() - defer s.rootLock.Unlock() - - // this code is currently specific to centroid index file but is future proofed for other files - // to be updated in the dest's bolt - if strings.HasSuffix(file, "centroid_index") { - // centroid index file - this is outside the snapshots domain so the bolt update is different - err := d.UpdateFileInBolt(util.BoltCentroidIndexKey, []byte(file)) - if err != nil { - return fmt.Errorf("error updating dest index bolt: %w", err) - } - } - - dest, err := d.GetWriter(filepath.Join("store", file)) - if err != nil { - return err - } - - source, err := os.Open(filepath.Join(s.path, file)) - if err != nil { - return err - } - - defer source.Close() - defer dest.Close() - _, err = io.Copy(dest, source) - return err -} - // external API to fire a scorch event (EventKindIndexStart) externally from bleve func (s *Scorch) FireIndexEvent() { s.fireEvent(EventKindIndexStart, 0) diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index deeeffd3f..16be8e440 100644 --- a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -101,7 +101,6 @@ func RegisterSegmentPlugin(plugin SegmentPlugin, makeDefault bool) { } supportedSegmentPlugins[plugin.Type()][plugin.Version()] = plugin if makeDefault { - fmt.Println("registering default segment plugin", plugin.Type(), plugin.Version()) defaultSegmentPlugin = plugin } } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 3585b31d8..688f9d903 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -66,8 +66,7 @@ func init() { } type IndexSnapshot struct { - parent *Scorch - + parent *Scorch segment []*SegmentSnapshot offsets []uint64 internal 
map[string][]byte diff --git a/index_impl.go b/index_impl.go index ec567083f..0d7e1dd4d 100644 --- a/index_impl.go +++ b/index_impl.go @@ -326,13 +326,11 @@ func (i *indexImpl) Index(id string, data interface{}) (err error) { i.FireIndexEvent() - // fmt.Printf("data is %#v\n", data) doc := document.NewDocument(id) err = i.m.MapDocument(doc, data) if err != nil { return } - // fmt.Printf("data is after mapping %#v\n", doc) err = i.i.Update(doc) return } @@ -1432,39 +1430,6 @@ func (m *searchHitSorter) Less(i, j int) bool { return c < 0 } -func (i *indexImpl) CopyFile(file string, d index.IndexDirectory) (err error) { - i.mutex.RLock() - defer i.mutex.RUnlock() - - if !i.open { - return ErrorIndexClosed - } - - copyIndex, ok := i.i.(index.IndexFileCopyable) - if !ok { - return fmt.Errorf("index implementation does not support copy reader") - } - - return copyIndex.CopyFile(file, d) -} - -func (i *indexImpl) UpdateFileInBolt(key []byte, value []byte) error { - i.mutex.RLock() - defer i.mutex.RUnlock() - - if !i.open { - return ErrorIndexClosed - } - - copyIndex, ok := i.i.(index.IndexFileCopyable) - if !ok { - return fmt.Errorf("index implementation does not support file copy") - } - - return copyIndex.UpdateFileInBolt(key, value) -} - -// CopyTo (index.Directory, filter) func (i *indexImpl) CopyTo(d index.Directory) (err error) { i.mutex.RLock() defer i.mutex.RUnlock() @@ -1478,8 +1443,6 @@ func (i *indexImpl) CopyTo(d index.Directory) (err error) { return fmt.Errorf("index implementation does not support copy reader") } - // copyIndex.Copy() -> copies the centroid index - copyReader := copyIndex.CopyReader() if copyReader == nil { return fmt.Errorf("index's copyReader is nil") From 8bb49d0f2d0535367ab1528f11eea2ee7b2522b9 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 29 Jan 2026 12:55:50 -0800 Subject: [PATCH 20/25] fix var name --- index/scorch/scorch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 5fa8483e7..ad6c7f558 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -688,7 +688,7 @@ func (s *Scorch) trainerLoop() { } // update the centroid index pointer - centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, index.centroidIndexFileName), s.segmentConfig) + centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, index.CentroidIndexFileName), s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) close(trainReq.ackCh) From cca945a0ada953a61d9a718f5bfbc0c5b15f3b7d Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 29 Jan 2026 13:38:16 -0800 Subject: [PATCH 21/25] refactor the trainer --- index/scorch/persister.go | 18 +-- index/scorch/scorch.go | 216 ++---------------------------- index/scorch/train.go | 261 +++++++++++++++++++++++++++++++++++++ index/scorch/train_noop.go | 25 ++++ util/keys.go | 4 +- 5 files changed, 305 insertions(+), 219 deletions(-) create mode 100644 index/scorch/train.go create mode 100644 index/scorch/train_noop.go diff --git a/index/scorch/persister.go b/index/scorch/persister.go index fb250001b..4aad12900 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -852,18 +852,8 @@ func zapFileName(epoch uint64) string { return fmt.Sprintf("%012x.zap", epoch) } -func (s *Scorch) loadCentroidIndex(bucket *bolt.Bucket) error { - if bucket == nil { - return nil - } - segmentSnapshot, err := s.loadSegment(bucket) - if err != nil { - return err - } - s.rootLock.Lock() - defer 
s.rootLock.Unlock() - s.centroidIndex = segmentSnapshot - return nil +func (s *Scorch) loadTrainedData(bucket *bolt.Bucket) error { + return s.trainer.loadTrainedData(bucket) } // bolt snapshot code @@ -919,8 +909,8 @@ func (s *Scorch) loadFromBolt() error { foundRoot = true } - centroidIndexBucket := snapshots.Bucket(util.BoltCentroidIndexKey) - err := s.loadCentroidIndex(centroidIndexBucket) + trainerBucket := snapshots.Bucket(util.BoltTrainerKey) + err := s.trainer.loadTrainedData(trainerBucket) if err != nil { return err } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index ad6c7f558..0a004eb8d 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -19,7 +19,6 @@ import ( "fmt" "os" "path/filepath" - "strings" "sync" "sync/atomic" "time" @@ -29,7 +28,6 @@ import ( "github.com/blevesearch/bleve/v2/registry" "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" - "github.com/blevesearch/go-faiss" segment "github.com/blevesearch/scorch_segment_api/v2" bolt "go.etcd.io/bbolt" ) @@ -81,9 +79,8 @@ type Scorch struct { persisterNotifier chan *epochWatcher rootBolt *bolt.DB asyncTasks sync.WaitGroup - // not a real searchable segment - centroidIndex *SegmentSnapshot - train chan *trainRequest + + trainer trainer onEvent func(event Event) bool onAsyncError func(err error, path string) @@ -95,18 +92,19 @@ type Scorch struct { spatialPlugin index.SpatialAnalyzerPlugin } +type trainer interface { + trainLoop() + train(batch *index.Batch) error + loadTrainedData(*bolt.Bucket) error + getInternal(key []byte) ([]byte, error) +} + type ScorchErrorType string func (t ScorchErrorType) Error() string { return string(t) } -type trainRequest struct { - sample segment.Segment - vecCount int - ackCh chan error -} - // ErrType values for ScorchError const ( ErrAsyncPanic = ScorchErrorType("async panic error") @@ -279,7 +277,7 @@ func (s *Scorch) Open() error { go s.introducerLoop() s.asyncTasks.Add(1) - go s.trainerLoop() + go s.trainer.trainLoop() if !s.readOnly && s.path != "" { s.asyncTasks.Add(1) @@ -356,7 +354,6 @@ func (s *Scorch) openBolt() error { s.persisterNotifier = make(chan *epochWatcher, 1) s.closeCh = make(chan struct{}) s.forceMergeRequestCh = make(chan *mergerCtrl, 1) - s.train = make(chan *trainRequest) if !s.readOnly && s.path != "" { err := s.removeOldZapFiles() // Before persister or merger create any new files. @@ -558,207 +555,20 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { func (s *Scorch) getInternal(key []byte) ([]byte, error) { s.rootLock.RLock() defer s.rootLock.RUnlock() - // todo: return the total number of vectors that have been processed so far in training - // in cbft use that as a checkpoint to resume training for n-x samples. + switch string(key) { case string(util.BoltTrainCompleteKey): - return []byte(fmt.Sprintf("%t", s.centroidIndex != nil)), nil + return s.trainer.getInternal(key) } return nil, nil } -func moveFile(sourcePath, destPath string) error { - // rename is supposed to be atomic on the same filesystem - err := os.Rename(sourcePath, destPath) - if err != nil { - return fmt.Errorf("error renaming file: %v", err) - } - return nil -} - func boolToByte(b bool) byte { return *(*byte)(unsafe.Pointer(&b)) } -// this is not a routine that will be running throughout the lifetime of the index. It's purpose -// is to only train the vector index before the data ingestion starts. 
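Behind the trainer interface introduced in this patch, Train still hands work to the loop goroutine through a request that carries its own ack channel, so all centroid-index state stays owned by one goroutine while callers get a synchronous error back. A stripped-down sketch of that handshake, with the segment payload elided:

    package main

    import (
        "errors"
        "fmt"
    )

    type trainRequest struct {
        vecCount int
        ackCh    chan error
    }

    // trainerLoop owns all trainer state; closing ackCh (with or without a
    // prior error send) is the completion signal the caller waits on.
    func trainerLoop(trainCh chan *trainRequest, closeCh chan struct{}) {
        for {
            select {
            case <-closeCh:
                return
            case req := <-trainCh:
                if req.vecCount == 0 {
                    req.ackCh <- errors.New("empty sample")
                }
                close(req.ackCh)
            }
        }
    }

    func main() {
        trainCh := make(chan *trainRequest)
        closeCh := make(chan struct{})
        go trainerLoop(trainCh, closeCh)

        req := &trainRequest{vecCount: 128, ackCh: make(chan error)}
        trainCh <- req
        // a receive on the closed channel yields nil when no error was sent
        if err := <-req.ackCh; err != nil {
            fmt.Println("train failed:", err)
        }
        close(closeCh)
    }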
-func (s *Scorch) trainerLoop() { - defer func() { - s.asyncTasks.Done() - }() - // some init stuff - s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex - var totalSamplesProcessed int - filename := index.CentroidIndexFileName - path := filepath.Join(s.path, filename) - for { - select { - case <-s.closeCh: - return - case trainReq := <-s.train: - sampleSeg := trainReq.sample - if s.centroidIndex == nil { - switch seg := sampleSeg.(type) { - case segment.UnpersistedSegment: - err := persistToDirectory(seg, nil, path) - if err != nil { - // clean up this ugly ass error handling code - trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err) - close(trainReq.ackCh) - return - } - default: - fmt.Errorf("segment is not a unpersisted segment") - close(s.closeCh) - return - } - } else { - // merge the new segment with the existing one, no need to persist? - // persist in a tmp file and then rename - is that a fair strategy? - s.segmentConfig["training"] = true - _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, - []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, index.CentroidIndexFileName+".tmp"), s.closeCh, nil, s.segmentConfig) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) - close(trainReq.ackCh) - } - // reset the training flag once completed - s.segmentConfig["training"] = false - - // close the existing centroid segment - it's supposed to be gc'd at this point - s.centroidIndex.segment.Close() - err = moveFile(filepath.Join(s.path, index.CentroidIndexFileName+".tmp"), filepath.Join(s.path, index.CentroidIndexFileName)) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err) - close(trainReq.ackCh) - } - } - totalSamplesProcessed += trainReq.vecCount - // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint - // where we can be sure that the centroid index is available for the indexing operations downstream - // - // note: when the scale increases massively especially with real world dimensions of 1536+, this API - // will have to be refactored to persist in a more resource efficient way. so having this bolt related - // code will help in tracking the progress a lot better and avoid any redudant data streaming operations. 
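The transaction code being carved out of scorch.go here keeps the rollback defer that fires only on error. bbolt also tolerates the simpler unconditional defer tx.Rollback() (rolling back an already-committed transaction just returns ErrTxClosed), but the conditional form makes intent explicit; it relies on err being the captured named return, as in this sketch:

    package main

    import (
        "log"

        bolt "go.etcd.io/bbolt"
    )

    // writeKey shows the conditional-rollback shape: the deferred closure
    // reads the named return err, so the rollback runs only on failure paths.
    func writeKey(db *bolt.DB, key, val []byte) (err error) {
        tx, err := db.Begin(true)
        if err != nil {
            return err
        }
        defer func() {
            if err != nil {
                _ = tx.Rollback()
            }
        }()

        b, err := tx.CreateBucketIfNotExists([]byte{'s'})
        if err != nil {
            return err
        }
        if err = b.Put(key, val); err != nil {
            return err
        }
        return tx.Commit() // on failure, the defer sees the non-nil err
    }

    func main() {
        db, err := bolt.Open("root.bolt", 0600, nil)
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()
        if err := writeKey(db, []byte{'p'}, []byte("centroid_index")); err != nil {
            log.Fatal(err)
        }
    }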
- tx, err := s.rootBolt.Begin(true) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error starting bolt transaction: %v", err) - close(trainReq.ackCh) - return - } - defer func() { - if err != nil { - _ = tx.Rollback() - } - }() - - snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error creating snapshots bucket: %v", err) - close(trainReq.ackCh) - return - } - - centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error creating centroid bucket: %v", err) - close(trainReq.ackCh) - return - } - - err = centroidBucket.Put(util.BoltPathKey, []byte(filename)) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error updating centroid bucket: %v", err) - close(trainReq.ackCh) - return - } - - err = tx.Commit() - if err != nil { - trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) - close(trainReq.ackCh) - return - } - - err = s.rootBolt.Sync() - if err != nil { - trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) - close(trainReq.ackCh) - return - } - - // update the centroid index pointer - centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, index.CentroidIndexFileName), s.segmentConfig) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) - close(trainReq.ackCh) - return - } - s.centroidIndex = &SegmentSnapshot{ - segment: centroidIndex, - } - close(trainReq.ackCh) - } - } -} - func (s *Scorch) Train(batch *index.Batch) error { - // regulate the Train function - s.FireIndexEvent() - - var trainData []index.Document - for key, doc := range batch.IndexOps { - if doc != nil { - // insert _id field - // no need to track updates/deletes over here since - // the API is singleton - doc.AddIDField() - } - if strings.HasPrefix(key, index.TrainDataPrefix) { - trainData = append(trainData, doc) - } - } - - // just builds a new vector index out of the train data provided - // it'll be an IVF index so the centroids are computed at this stage and - // this template will be used in the indexing down the line to index - // the data vectors. s.segmentConfig will mark this as a training phase - // and zap will handle it accordingly. - // - // note: this might index text data too, how to handle this? s.segmentConfig? 
- // todo: updates/deletes -> data drift detection - seg, _, err := s.segPlugin.NewEx(trainData, s.segmentConfig) - if err != nil { - return err - } - - trainReq := &trainRequest{ - sample: seg, - vecCount: len(trainData), // todo: multivector support - ackCh: make(chan error), - } - - s.train <- trainReq - err = <-trainReq.ackCh - if err != nil { - return err - } - - return err -} - -func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { - // return the coarse quantizer of the centroid index belonging to the field - centroidIndexSegment, ok := s.centroidIndex.segment.(segment.CentroidIndexSegment) - if !ok { - return nil, fmt.Errorf("segment is not a centroid index segment", s.centroidIndex.segment != nil) - } - - coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) - if err != nil { - return nil, err - } - return coarseQuantizer, nil + return s.trainer.train(batch) } func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, diff --git a/index/scorch/train.go b/index/scorch/train.go new file mode 100644 index 000000000..2c2777fcf --- /dev/null +++ b/index/scorch/train.go @@ -0,0 +1,261 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package scorch + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/RoaringBitmap/roaring/v2" + "github.com/blevesearch/bleve/v2/util" + index "github.com/blevesearch/bleve_index_api" + "github.com/blevesearch/go-faiss" + segment "github.com/blevesearch/scorch_segment_api/v2" + bolt "go.etcd.io/bbolt" +) + +type trainRequest struct { + sample segment.Segment + vecCount int + ackCh chan error +} + +type vectorTrainer struct { + parent *Scorch + + // not a real searchable segment + centroidIndex *SegmentSnapshot + trainCh chan *trainRequest +} + +func moveFile(sourcePath, destPath string) error { + // rename is supposed to be atomic on the same filesystem + err := os.Rename(sourcePath, destPath) + if err != nil { + return fmt.Errorf("error renaming file: %v", err) + } + return nil +} + +// this is not a routine that will be running throughout the lifetime of the index. It's purpose +// is to only train the vector index before the data ingestion starts. 
+func (t *vectorTrainer) trainerLoop() { + defer func() { + t.parent.asyncTasks.Done() + }() + // initialize stuff + t.parent.segmentConfig["getCentroidIndexCallback"] = t.getCentroidIndex + t.trainCh = make(chan *trainRequest) + var totalSamplesProcessed int + filename := index.CentroidIndexFileName + path := filepath.Join(t.parent.path, filename) + for { + select { + case <-t.parent.closeCh: + return + case trainReq := <-t.trainCh: + sampleSeg := trainReq.sample + if t.centroidIndex == nil { + switch seg := sampleSeg.(type) { + case segment.UnpersistedSegment: + err := persistToDirectory(seg, nil, path) + if err != nil { + // clean up this ugly ass error handling code + trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err) + close(trainReq.ackCh) + return + } + default: + fmt.Errorf("segment is not a unpersisted segment") + close(t.parent.closeCh) + return + } + } else { + // merge the new segment with the existing one, no need to persist? + // persist in a tmp file and then rename - is that a fair strategy? + t.parent.segmentConfig["training"] = true + _, _, err := t.parent.segPlugin.MergeEx([]segment.Segment{t.centroidIndex.segment, sampleSeg}, + []*roaring.Bitmap{nil, nil}, filepath.Join(t.parent.path, filename+".tmp"), t.parent.closeCh, nil, t.parent.segmentConfig) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) + close(trainReq.ackCh) + } + // reset the training flag once completed + t.parent.segmentConfig["training"] = false + + // close the existing centroid segment - it's supposed to be gc'd at this point + t.centroidIndex.segment.Close() + err = moveFile(filepath.Join(t.parent.path, filename+".tmp"), filepath.Join(t.parent.path, filename)) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err) + close(trainReq.ackCh) + } + } + totalSamplesProcessed += trainReq.vecCount + // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint + // where we can be sure that the centroid index is available for the indexing operations downstream + // + // note: when the scale increases massively especially with real world dimensions of 1536+, this API + // will have to be refactored to persist in a more resource efficient way. so having this bolt related + // code will help in tracking the progress a lot better and avoid any redudant data streaming operations. 
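The getCentroidIndexCallback entry this loop installs is a function value carried through the config map; the plugin side has to recover it with a type assertion that matches the stored function type exactly. A toy sketch of the pattern, using a string-returning stand-in for the real func(string) (*faiss.IndexImpl, error):

    package main

    import "fmt"

    func main() {
        config := map[string]interface{}{}

        // producer side: drop the callback into the shared config
        config["getCentroidIndexCallback"] = func(field string) (string, error) {
            return "coarse-quantizer-for-" + field, nil
        }

        // consumer side: the asserted type must match the stored type exactly
        cb, ok := config["getCentroidIndexCallback"].(func(string) (string, error))
        if !ok {
            panic("callback missing or of unexpected type")
        }
        fmt.Println(cb("vector"))
    }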
+			tx, err := t.parent.rootBolt.Begin(true)
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error starting bolt transaction: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+			defer func() {
+				if err != nil {
+					_ = tx.Rollback()
+				}
+			}()
+
+			snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket)
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error creating snapshots bucket: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+
+			trainerBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltTrainerKey)
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error creating centroid bucket: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+
+			err = trainerBucket.Put(util.BoltPathKey, []byte(filename))
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error updating centroid bucket: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+
+			err = tx.Commit()
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+
+			err = t.parent.rootBolt.Sync()
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+
+			// update the centroid index pointer
+			centroidIndex, err := t.parent.segPlugin.OpenEx(filepath.Join(t.parent.path, index.CentroidIndexFileName), t.parent.segmentConfig)
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+			t.centroidIndex = &SegmentSnapshot{
+				segment: centroidIndex,
+			}
+			close(trainReq.ackCh)
+		}
+	}
+}
+
+func (t *vectorTrainer) loadTrainedData(bucket *bolt.Bucket) error {
+	if bucket == nil {
+		return nil
+	}
+	segmentSnapshot, err := t.parent.loadSegment(bucket)
+	if err != nil {
+		return err
+	}
+	t.parent.rootLock.Lock()
+	defer t.parent.rootLock.Unlock()
+	t.centroidIndex = segmentSnapshot
+	return nil
+}
+
+func (t *vectorTrainer) train(batch *index.Batch) error {
+	// regulate the Train function
+	t.parent.FireIndexEvent()
+
+	var trainData []index.Document
+	for key, doc := range batch.IndexOps {
+		if doc != nil {
+			// insert _id field
+			// no need to track updates/deletes over here since
+			// the API is a singleton
+			doc.AddIDField()
+		}
+		if strings.HasPrefix(key, index.TrainDataPrefix) {
+			trainData = append(trainData, doc)
+		}
+	}
+
+	// just builds a new vector index out of the train data provided
+	// it'll be an IVF index so the centroids are computed at this stage and
+	// this template will be used in the indexing down the line to index
+	// the data vectors. s.segmentConfig will mark this as a training phase
+	// and zap will handle it accordingly.
+	//
+	// note: this might index text data too, how to handle this? s.segmentConfig?
+	// todo: updates/deletes -> data drift detection
+	seg, _, err := t.parent.segPlugin.NewEx(trainData, t.parent.segmentConfig)
+	if err != nil {
+		return err
+	}
+
+	trainReq := &trainRequest{
+		sample:   seg,
+		vecCount: len(trainData), // todo: multivector support
+		ackCh:    make(chan error),
+	}
+
+	t.trainCh <- trainReq
+	err = <-trainReq.ackCh
+	if err != nil {
+		return err
+	}
+
+	return err
+}
+
+func (t *vectorTrainer) getInternal(key []byte) ([]byte, error) {
+	// todo: return the total number of vectors that have been processed so far in training
+	// in cbft use that as a checkpoint to resume training for n-x samples.
+	switch string(key) {
+	case string(util.BoltTrainCompleteKey):
+		return []byte(fmt.Sprintf("%t", t.centroidIndex != nil)), nil
+	}
+	return nil, nil
+}
+
+func (t *vectorTrainer) getCentroidIndex(field string) (*faiss.IndexImpl, error) {
+	// return the coarse quantizer of the centroid index belonging to the field
+	centroidIndexSegment, ok := t.centroidIndex.segment.(segment.CentroidIndexSegment)
+	if !ok {
+		return nil, fmt.Errorf("segment is not a centroid index segment", t.centroidIndex.segment != nil)
+	}
+
+	coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field)
+	if err != nil {
+		return nil, err
+	}
+	return coarseQuantizer, nil
+}
diff --git a/index/scorch/train_noop.go b/index/scorch/train_noop.go
new file mode 100644
index 000000000..a16353c10
--- /dev/null
+++ b/index/scorch/train_noop.go
@@ -0,0 +1,25 @@
+//go:build !vectors
+// +build !vectors
+
+package scorch
+
+import (
+	"fmt"
+
+	index "github.com/blevesearch/bleve_index_api"
+	bolt "go.etcd.io/bbolt"
+)
+
+type noopTrainer struct {
+}
+
+func (t *noopTrainer) trainLoop() {}
+
+func (t *noopTrainer) train(batch *index.Batch) error {
+	return fmt.Errorf("training is not supported with this build")
+}
+
+func (t *noopTrainer) loadTrainedData(bucket *bolt.Bucket) error {
+	// noop
+	return nil
+}
diff --git a/util/keys.go b/util/keys.go
index a1f3bfbbf..ce8965da2 100644
--- a/util/keys.go
+++ b/util/keys.go
@@ -17,8 +17,8 @@ package util
 var (
 	// Bolt keys
 	BoltSnapshotsBucket = []byte{'s'}
-	BoltCentroidIndexKey = []byte{'c'}
-	BoltTrainCompleteKey = []byte{'t'}
+	BoltTrainerKey       = []byte{'t'}
+	BoltTrainCompleteKey = []byte{'c'}
 	BoltPathKey    = []byte{'p'}
 	BoltDeletedKey = []byte{'d'}
 	BoltInternalKey = []byte{'i'}

From 873c08651e21ecc6467bb2a291986a019934efef Mon Sep 17 00:00:00 2001
From: Thejas-bhat
Date: Thu, 29 Jan 2026 13:49:07 -0800
Subject: [PATCH 22/25] fix trainer impls

---
 index/scorch/train.go      | 2 +-
 index/scorch/train_noop.go | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/index/scorch/train.go b/index/scorch/train.go
index 2c2777fcf..5d0a00a0b 100644
--- a/index/scorch/train.go
+++ b/index/scorch/train.go
@@ -56,7 +56,7 @@ func moveFile(sourcePath, destPath string) error {
 
 // this is not a routine that will be running throughout the lifetime of the index. Its purpose
 // is only to train the vector index before the data ingestion starts.
-func (t *vectorTrainer) trainerLoop() { +func (t *vectorTrainer) trainLoop() { defer func() { t.parent.asyncTasks.Done() }() diff --git a/index/scorch/train_noop.go b/index/scorch/train_noop.go index a16353c10..60a8d09fd 100644 --- a/index/scorch/train_noop.go +++ b/index/scorch/train_noop.go @@ -23,3 +23,7 @@ func (t *noopTrainer) loadTrainedData(bucket *bolt.Bucket) error { // noop return nil } + +func (t *noopTrainer) getInternal(key []byte) ([]byte, error) { + return nil, nil +} From 26f7c12fecbbbdd07775cf26f8d0f5e13a4a19fd Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 29 Jan 2026 13:56:57 -0800 Subject: [PATCH 23/25] fix trainer init --- index/scorch/scorch.go | 2 ++ index/scorch/train.go | 8 +++++++- index/scorch/train_noop.go | 4 ++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 0a004eb8d..77ce94044 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -222,6 +222,8 @@ func NewScorch(storeName string, return nil, err } + rv.trainer = initTrainer(rv) + return rv, nil } diff --git a/index/scorch/train.go b/index/scorch/train.go index 5d0a00a0b..8150ff0ea 100644 --- a/index/scorch/train.go +++ b/index/scorch/train.go @@ -37,6 +37,13 @@ type trainRequest struct { ackCh chan error } +func initTrainer(s *Scorch) *vectorTrainer { + return &vectorTrainer{ + parent: s, + trainCh: make(chan *trainRequest), + } +} + type vectorTrainer struct { parent *Scorch @@ -62,7 +69,6 @@ func (t *vectorTrainer) trainLoop() { }() // initialize stuff t.parent.segmentConfig["getCentroidIndexCallback"] = t.getCentroidIndex - t.trainCh = make(chan *trainRequest) var totalSamplesProcessed int filename := index.CentroidIndexFileName path := filepath.Join(t.parent.path, filename) diff --git a/index/scorch/train_noop.go b/index/scorch/train_noop.go index 60a8d09fd..1f2a51bf8 100644 --- a/index/scorch/train_noop.go +++ b/index/scorch/train_noop.go @@ -10,6 +10,10 @@ import ( bolt "go.etcd.io/bbolt" ) +func initTrainer(s *Scorch) *noopTrainer { + return &noopTrainer{} +} + type noopTrainer struct { } From b267d4b1053150e8dd01c02a096d30b468735d8b Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 2 Feb 2026 12:50:10 -0800 Subject: [PATCH 24/25] merge conflict resolve --- index/scorch/train.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/index/scorch/train.go b/index/scorch/train.go index 8150ff0ea..4e31a3be4 100644 --- a/index/scorch/train.go +++ b/index/scorch/train.go @@ -97,7 +97,7 @@ func (t *vectorTrainer) trainLoop() { // merge the new segment with the existing one, no need to persist? // persist in a tmp file and then rename - is that a fair strategy? 
t.parent.segmentConfig["training"] = true - _, _, err := t.parent.segPlugin.MergeEx([]segment.Segment{t.centroidIndex.segment, sampleSeg}, + _, _, err := t.parent.segPlugin.MergeUsing([]segment.Segment{t.centroidIndex.segment, sampleSeg}, []*roaring.Bitmap{nil, nil}, filepath.Join(t.parent.path, filename+".tmp"), t.parent.closeCh, nil, t.parent.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) @@ -169,7 +169,7 @@ func (t *vectorTrainer) trainLoop() { } // update the centroid index pointer - centroidIndex, err := t.parent.segPlugin.OpenEx(filepath.Join(t.parent.path, index.CentroidIndexFileName), t.parent.segmentConfig) + centroidIndex, err := t.parent.segPlugin.OpenUsing(filepath.Join(t.parent.path, index.CentroidIndexFileName), t.parent.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) close(trainReq.ackCh) @@ -222,7 +222,7 @@ func (t *vectorTrainer) train(batch *index.Batch) error { // // note: this might index text data too, how to handle this? s.segmentConfig? // todo: updates/deletes -> data drift detection - seg, _, err := t.parent.segPlugin.NewEx(trainData, t.parent.segmentConfig) + seg, _, err := t.parent.segPlugin.NewUsing(trainData, t.parent.segmentConfig) if err != nil { return err } From ce43814cf11a1293647e7748a4d254795fee2913 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Wed, 4 Feb 2026 14:34:13 -0800 Subject: [PATCH 25/25] cleanup + refactor code --- index/scorch/merge.go | 2 - index/scorch/persister.go | 1 - index/scorch/scorch.go | 21 +++++++-- index/scorch/train_noop.go | 14 ++++++ index/scorch/{train.go => train_vector.go} | 53 +++++++++++----------- index_alias_impl.go | 12 ++++- 6 files changed, 68 insertions(+), 35 deletions(-) rename index/scorch/{train.go => train_vector.go} (82%) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 32de86bd4..bca9bbb81 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -335,7 +335,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) mergedSegHistory := make(map[uint64]*mergedSegmentHistory, len(task.Segments)) - var files []string for _, planSegment := range task.Segments { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { oldMap[segSnapshot.id] = segSnapshot @@ -351,7 +350,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, } else { segmentsToMerge = append(segmentsToMerge, segSnapshot.segment) docsToDrop = append(docsToDrop, segSnapshot.deleted) - files = append(files, persistedSeg.Path()) } // track the files getting merged for unsetting the // removal ineligibility. 
This helps to unflip files
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 4aad12900..3df4ac2e6 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -876,7 +876,6 @@ func (s *Scorch) loadFromBolt() error {
 				s.AddEligibleForRemoval(snapshotEpoch)
 				continue
 			}
-
 			snapshot := snapshots.Bucket(k)
 			if snapshot == nil {
 				log.Printf("snapshot key, but bucket missing %x, continuing", k)
diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go
index 77ce94044..afe2878f0 100644
--- a/index/scorch/scorch.go
+++ b/index/scorch/scorch.go
@@ -22,7 +22,6 @@ import (
 	"sync"
 	"sync/atomic"
 	"time"
-	"unsafe"
 
 	"github.com/RoaringBitmap/roaring/v2"
 	"github.com/blevesearch/bleve/v2/registry"
@@ -92,10 +91,26 @@ type Scorch struct {
 	spatialPlugin index.SpatialAnalyzerPlugin
 }
 
+// trainer interface is used for training an index that has the concept
+// of "learning". Naturally, a vector index is one such thing that would
+// implement this interface. There can be multiple implementations of the
+// training itself even for the same index type.
+//
+// this component is not supposed to interact with the other master routines
+// of scorch and will be used only for training the index before the actual data
+// ingestion starts. The routine should also be released once the
+// training is marked as complete - which can be done using the BoltTrainCompleteKey
+// key and a bool value. However, the struct is still maintained so that the pointer
+// to the instance can be used in the later stages of the index lifecycle.
 type trainer interface {
+	// ephemeral
 	trainLoop()
+	// for the training state and the ingestion of the samples
 	train(batch *index.Batch) error
+
+	// to load the metadata from the bolt under the BoltTrainerKey
 	loadTrainedData(*bolt.Bucket) error
+	// to fetch the internal data from the component
 	getInternal(key []byte) ([]byte, error)
 }
@@ -565,10 +580,6 @@ func (s *Scorch) getInternal(key []byte) ([]byte, error) {
 	return nil, nil
 }
 
-func boolToByte(b bool) byte {
-	return *(*byte)(unsafe.Pointer(&b))
-}
-
 func (s *Scorch) Train(batch *index.Batch) error {
 	return s.trainer.train(batch)
 }
diff --git a/index/scorch/train_noop.go b/index/scorch/train_noop.go
index 1f2a51bf8..d82b342c6 100644
--- a/index/scorch/train_noop.go
+++ b/index/scorch/train_noop.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2026 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 //go:build !vectors
 // +build !vectors
 
diff --git a/index/scorch/train.go b/index/scorch/train_vector.go
similarity index 82%
rename from index/scorch/train.go
rename to index/scorch/train_vector.go
index 4e31a3be4..74cc6b4ed 100644
--- a/index/scorch/train.go
+++ b/index/scorch/train_vector.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Couchbase, Inc.
+// Copyright (c) 2026 Couchbase, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
 
 	"github.com/RoaringBitmap/roaring/v2"
 	"github.com/blevesearch/bleve/v2/util"
@@ -47,7 +48,9 @@ func initTrainer(s *Scorch) *vectorTrainer {
 type vectorTrainer struct {
 	parent *Scorch
 
-	// not a real searchable segment
+	m sync.Mutex
+	// not a searchable segment in the sense that it won't return
+	// the data vectors. can return centroid vectors
 	centroidIndex *SegmentSnapshot
 	trainCh       chan *trainRequest
 }
@@ -68,10 +71,8 @@ func (t *vectorTrainer) trainLoop() {
 		t.parent.asyncTasks.Done()
 	}()
 	// initialize stuff
-	t.parent.segmentConfig["getCentroidIndexCallback"] = t.getCentroidIndex
-	var totalSamplesProcessed int
-	filename := index.CentroidIndexFileName
-	path := filepath.Join(t.parent.path, filename)
+	t.parent.segmentConfig[index.CentroidIndexCallback] = t.getCentroidIndex
+	path := filepath.Join(t.parent.path, index.CentroidIndexFileName)
 	for {
 		select {
 		case <-t.parent.closeCh:
@@ -83,44 +84,41 @@ func (t *vectorTrainer) trainLoop() {
 				case segment.UnpersistedSegment:
 					err := persistToDirectory(seg, nil, path)
 					if err != nil {
-						// todo: clean up this error handling code
 						trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err)
 						close(trainReq.ackCh)
 						return
 					}
 				default:
-					fmt.Errorf("segment is not an unpersisted segment")
-					close(t.parent.closeCh)
-					return
 				}
 			} else {
 				// merge the new segment with the existing one, no need to persist?
 				// persist in a tmp file and then rename - is that a fair strategy?
-				t.parent.segmentConfig["training"] = true
+				t.parent.segmentConfig[index.TrainingKey] = true
 				_, _, err := t.parent.segPlugin.MergeUsing([]segment.Segment{t.centroidIndex.segment, sampleSeg},
-					[]*roaring.Bitmap{nil, nil}, filepath.Join(t.parent.path, filename+".tmp"), t.parent.closeCh, nil, t.parent.segmentConfig)
+					[]*roaring.Bitmap{nil, nil}, path+".tmp", t.parent.closeCh, nil, t.parent.segmentConfig)
 				if err != nil {
 					trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err)
 					close(trainReq.ackCh)
 				}
 				// reset the training flag once completed
-				t.parent.segmentConfig["training"] = false
+				t.parent.segmentConfig[index.TrainingKey] = false
 
 				// close the existing centroid segment - it's supposed to be gc'd at this point
 				t.centroidIndex.segment.Close()
-				err = moveFile(filepath.Join(t.parent.path, filename+".tmp"), filepath.Join(t.parent.path, filename))
+				err = moveFile(path+".tmp", path)
 				if err != nil {
 					trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err)
 					close(trainReq.ackCh)
 				}
 			}
-			totalSamplesProcessed += trainReq.vecCount
 			// a bolt transaction is necessary for the failover-recovery scenario and also serves as a checkpoint
 			// where we can be sure that the centroid index is available for the indexing operations downstream
 			//
 			// note: when the scale increases massively especially with real world dimensions of 1536+, this API
 			// will have to be refactored to persist in a more resource efficient way. so having this bolt related
 			// code will help in tracking the progress a lot better and avoid any redundant data streaming operations.
+ // + // todo: rethink the frequency of bolt writes tx, err := t.parent.rootBolt.Begin(true) if err != nil { trainReq.ackCh <- fmt.Errorf("error starting bolt transaction: %v", err) @@ -147,7 +145,7 @@ func (t *vectorTrainer) trainLoop() { return } - err = trainerBucket.Put(util.BoltPathKey, []byte(filename)) + err = trainerBucket.Put(util.BoltPathKey, []byte(index.CentroidIndexFileName)) if err != nil { trainReq.ackCh <- fmt.Errorf("error updating centroid bucket: %v", err) close(trainReq.ackCh) @@ -163,26 +161,29 @@ func (t *vectorTrainer) trainLoop() { err = t.parent.rootBolt.Sync() if err != nil { - trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) + trainReq.ackCh <- fmt.Errorf("error on bolt sync: %v", err) close(trainReq.ackCh) return } // update the centroid index pointer - centroidIndex, err := t.parent.segPlugin.OpenUsing(filepath.Join(t.parent.path, index.CentroidIndexFileName), t.parent.segmentConfig) + centroidIndex, err := t.parent.segPlugin.OpenUsing(path, t.parent.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) close(trainReq.ackCh) return } + t.m.Lock() t.centroidIndex = &SegmentSnapshot{ segment: centroidIndex, } + t.m.Unlock() close(trainReq.ackCh) } } } +// loads the metadata specific to the centroid index from boltdb func (t *vectorTrainer) loadTrainedData(bucket *bolt.Bucket) error { if bucket == nil { return nil @@ -191,8 +192,8 @@ func (t *vectorTrainer) loadTrainedData(bucket *bolt.Bucket) error { if err != nil { return err } - t.parent.rootLock.Lock() - defer t.parent.rootLock.Unlock() + t.m.Lock() + defer t.m.Unlock() t.centroidIndex = segmentSnapshot return nil } @@ -215,10 +216,10 @@ func (t *vectorTrainer) train(batch *index.Batch) error { } // just builds a new vector index out of the train data provided - // it'll be an IVF index so the centroids are computed at this stage and - // this template will be used in the indexing down the line to index - // the data vectors. s.segmentConfig will mark this as a training phase - // and zap will handle it accordingly. + // this is not necessarily the final train data since this is submitted + // as a request to the trainer component to be merged. once the training + // is complete, the template will be used for other operations down the line + // like merge and search. // // note: this might index text data too, how to handle this? s.segmentConfig? 
// todo: updates/deletes -> data drift detection @@ -236,7 +237,7 @@ func (t *vectorTrainer) train(batch *index.Batch) error { t.trainCh <- trainReq err = <-trainReq.ackCh if err != nil { - return err + return fmt.Errorf("train_vector: train() err'd out with: %w", err) } return err @@ -256,7 +257,7 @@ func (t *vectorTrainer) getCentroidIndex(field string) (*faiss.IndexImpl, error) // return the coarse quantizer of the centroid index belonging to the field centroidIndexSegment, ok := t.centroidIndex.segment.(segment.CentroidIndexSegment) if !ok { - return nil, fmt.Errorf("segment is not a centroid index segment", t.centroidIndex.segment != nil) + return nil, fmt.Errorf("segment is not a centroid index segment") } coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) diff --git a/index_alias_impl.go b/index_alias_impl.go index 16f20ac45..2839752e2 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -106,8 +106,18 @@ func (i *indexAliasImpl) IndexSynonym(id string, collection string, definition * func (i *indexAliasImpl) Train(batch *Batch) error { i.mutex.RLock() defer i.mutex.RUnlock() + if !i.open { + return ErrorIndexClosed + } - // TODO: implement this + err := i.isAliasToSingleIndex() + if err != nil { + return err + } + + if vi, ok := i.indexes[0].(VectorIndex); ok { + return vi.Train(batch) + } return fmt.Errorf("not a vector index") }
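
A few worked sketches of the mechanisms in this series, for context. The trainer handshake is a request/ack pattern: train() submits a trainRequest and blocks on its ackCh until trainLoop has durably published the new centroid index, which it does by writing to a temp file and renaming it into place (the same atomicity moveFile relies on). Below is a minimal, self-contained sketch of that pattern; the names (trainReq, payload, this trainerLoop signature) are stand-ins, not the actual types from this series.

    package main

    import (
    	"fmt"
    	"os"
    )

    // trainReq mirrors the shape of this series' trainRequest: the submitter
    // blocks on ackCh until the loop has durably published the result.
    type trainReq struct {
    	payload []byte
    	ackCh   chan error
    }

    // trainerLoop is a simplified stand-in for vectorTrainer.trainLoop: write
    // the new state to a temp file, rename it over the live file, then ack.
    func trainerLoop(reqs <-chan trainReq, closeCh <-chan struct{}, path string) {
    	for {
    		select {
    		case <-closeCh:
    			return
    		case req := <-reqs:
    			tmp := path + ".tmp"
    			err := os.WriteFile(tmp, req.payload, 0o644)
    			if err == nil {
    				err = os.Rename(tmp, path) // the atomic publish step
    			}
    			req.ackCh <- err
    			close(req.ackCh)
    		}
    	}
    }

    func main() {
    	reqs := make(chan trainReq)
    	closeCh := make(chan struct{})
    	go trainerLoop(reqs, closeCh, "centroid.index")

    	req := trainReq{payload: []byte("centroids"), ackCh: make(chan error)}
    	reqs <- req
    	if err := <-req.ackCh; err != nil { // train() looks synchronous to callers
    		fmt.Println("train failed:", err)
    	}
    	close(closeCh)
    }

Because the rename replaces the file within a single filesystem, a crash leaves either the old centroid index or the fully written new one, never a half-written file.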
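The bolt checkpoint that trainLoop writes after each sample reduces to: a trainer sub-bucket nested under the snapshots bucket, holding the path of the centroid index file, followed by Commit and an fsync via Sync. A sketch of that sequence follows; the single-byte keys mirror the util/keys.go values in this series ('s' snapshots, 't' trainer, 'p' path), while the function and file names are invented for illustration.

    package main

    import (
    	"log"

    	bolt "go.etcd.io/bbolt"
    )

    // checkpoint mirrors trainLoop's bolt usage: nested buckets, a path key,
    // Commit, then Sync so the checkpoint survives a crash.
    func checkpoint(db *bolt.DB, filename string) error {
    	tx, err := db.Begin(true)
    	if err != nil {
    		return err
    	}
    	// Rollback after a successful Commit just returns ErrTxClosed.
    	defer func() { _ = tx.Rollback() }()

    	snapshots, err := tx.CreateBucketIfNotExists([]byte{'s'})
    	if err != nil {
    		return err
    	}
    	trainer, err := snapshots.CreateBucketIfNotExists([]byte{'t'})
    	if err != nil {
    		return err
    	}
    	if err = trainer.Put([]byte{'p'}, []byte(filename)); err != nil {
    		return err
    	}
    	if err = tx.Commit(); err != nil {
    		return err
    	}
    	return db.Sync()
    }

    func main() {
    	db, err := bolt.Open("trainer.bolt", 0o600, nil)
    	if err != nil {
    		log.Fatal(err)
    	}
    	defer db.Close()
    	if err := checkpoint(db, "centroid.index"); err != nil {
    		log.Fatal(err)
    	}
    }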
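Training completeness is reported through the internal-data path: vectorTrainer.getInternal answers util.BoltTrainCompleteKey with the %t-formatted string "true" or "false", depending on whether the centroid index exists yet. Assuming a handle whose GetInternal is routed through to the trainer (an assumption about how this series plumbs Scorch.getInternal), a caller-side gate might look like the following compilable snippet.

    package scorchexample

    import (
    	"github.com/blevesearch/bleve/v2/util"
    )

    // internalReader is satisfied by anything exposing GetInternal, e.g. a
    // bleve index handle; whether the key actually reaches the trainer
    // depends on the internal-key plumbing described above.
    type internalReader interface {
    	GetInternal(key []byte) ([]byte, error)
    }

    func trainingComplete(r internalReader) (bool, error) {
    	v, err := r.GetInternal(util.BoltTrainCompleteKey)
    	if err != nil {
    		return false, err
    	}
    	// vectorTrainer.getInternal formats the flag with %t, so the value
    	// is the literal string "true" once the centroid index exists.
    	return string(v) == "true", nil
    }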