From 4ef828e33f9c36b19db3d5885af1e2af0421328a Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 16 Jun 2025 15:28:24 +0530 Subject: [PATCH 01/25] passing zap config via new plugin APIs --- index/scorch/merge.go | 10 +++++----- index/scorch/persister.go | 4 ++-- index/scorch/scorch.go | 4 +++- index/scorch/segment_plugin.go | 8 ++++++++ 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index e17288410..e2267bf81 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -372,8 +372,8 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) prevBytesReadTotal := cumulateBytesRead(segmentsToMerge) - newDocNums, _, err := s.segPlugin.Merge(segmentsToMerge, docsToDrop, path, - cw.cancelCh, s) + newDocNums, _, err := s.segPlugin.MergeEx(segmentsToMerge, docsToDrop, path, + cw.cancelCh, s, s.segmentConfig) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) @@ -391,7 +391,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, return fmt.Errorf("merging failed: %v", err) } - seg, err = s.segPlugin.Open(path) + seg, err = s.segPlugin.OpenEx(path, s.segmentConfig) if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) @@ -540,7 +540,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, // the newly merged segment is already flushed out to disk, just needs // to be opened using mmap. newDocIDs, _, err := - s.segPlugin.Merge(segsBatch, dropsBatch, path, s.closeCh, s) + s.segPlugin.MergeEx(segsBatch, dropsBatch, path, s.closeCh, s, s.segmentConfig) if err != nil { em.Lock() errs = append(errs, err) @@ -555,7 +555,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, s.markIneligibleForRemoval(filename) newMergedSegmentIDs[id] = newSegmentID newDocIDsSet[id] = newDocIDs - newMergedSegments[id], err = s.segPlugin.Open(path) + newMergedSegments[id], err = s.segPlugin.OpenEx(path, s.segmentConfig) if err != nil { em.Lock() errs = append(errs, err) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index d0c013a1d..2bc88f4f7 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -804,7 +804,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot, exclude map[uint } }() for segmentID, path := range newSegmentPaths { - newSegments[segmentID], err = s.segPlugin.Open(path) + newSegments[segmentID], err = s.segPlugin.OpenEx(path, s.segmentConfig) if err != nil { return fmt.Errorf("error opening new segment at %s, %v", path, err) } @@ -1016,7 +1016,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return nil, fmt.Errorf("segment path missing") } segmentPath := s.path + string(os.PathSeparator) + string(pathBytes) - seg, err := s.segPlugin.Open(segmentPath) + segment, err := s.segPlugin.OpenEx(segmentPath, s.segmentConfig) if err != nil { return nil, fmt.Errorf("error opening bolt segment: %v", err) } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 329de598e..72435a91c 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -45,6 +45,7 @@ type Scorch struct { readOnly bool version uint8 config map[string]interface{} + segmentConfig map[string]interface{} analysisQueue *index.AnalysisQueue path string @@ -154,6 +155,7 @@ func NewScorch(storeName string, forceMergeRequestCh: make(chan *mergerCtrl, 1), segPlugin: 
defaultSegmentPlugin,
 		copyScheduled:       map[string]int{},
+		segmentConfig:       make(map[string]interface{}),
 	}
 
 	forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config)
@@ -497,7 +499,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
 	stats := newFieldStats()
 
 	if len(analysisResults) > 0 {
-		newSegment, bufBytes, err = s.segPlugin.New(analysisResults)
+		newSegment, bufBytes, err = s.segPlugin.NewEx(analysisResults, s.segmentConfig)
 		if err != nil {
 			return err
 		}
diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go
index c44f9cf7b..4ff249a0b 100644
--- a/index/scorch/segment_plugin.go
+++ b/index/scorch/segment_plugin.go
@@ -46,10 +46,14 @@ type SegmentPlugin interface {
 	// New takes a set of Documents and turns them into a new Segment
 	New(results []index.Document) (segment.Segment, uint64, error)
 
+	NewEx(results []index.Document, config map[string]interface{}) (segment.Segment, uint64, error)
+
 	// Open attempts to open the file at the specified path and
 	// return the corresponding Segment
 	Open(path string) (segment.Segment, error)
 
+	OpenEx(path string, config map[string]interface{}) (segment.Segment, error)
+
 	// Merge takes a set of Segments, and creates a new segment on disk at
 	// the specified path.
 	// Drops is a set of bitmaps (one for each segment) indicating which
@@ -67,6 +71,10 @@ type SegmentPlugin interface {
 	Merge(segments []segment.Segment, drops []*roaring.Bitmap, path string,
 		closeCh chan struct{}, s segment.StatsReporter) (
 		[][]uint64, uint64, error)
+
+	MergeEx(segments []segment.Segment, drops []*roaring.Bitmap, path string,
+		closeCh chan struct{}, s segment.StatsReporter, config map[string]interface{}) (
+		[][]uint64, uint64, error)
 }
 
 var supportedSegmentPlugins map[string]map[uint32]SegmentPlugin

From 4a9ce7c5fd12bdbd43727cbf5521d31f67f615b1 Mon Sep 17 00:00:00 2001
From: Thejas-bhat
Date: Thu, 15 Jan 2026 14:47:31 -0800
Subject: [PATCH 02/25] merge conflict resolve

---
 index/scorch/persister.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 2bc88f4f7..0571bd1e4 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -1016,7 +1016,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro
 		return nil, fmt.Errorf("segment path missing")
 	}
 	segmentPath := s.path + string(os.PathSeparator) + string(pathBytes)
-	segment, err := s.segPlugin.OpenEx(segmentPath, s.segmentConfig)
+	seg, err := s.segPlugin.OpenEx(segmentPath, s.segmentConfig)
 	if err != nil {
 		return nil, fmt.Errorf("error opening bolt segment: %v", err)
 	}

From dd5ccbb6a46a22597b5256e2f297e8eff9c4e394 Mon Sep 17 00:00:00 2001
From: Thejas-bhat
Date: Mon, 2 Feb 2026 11:39:41 -0800
Subject: [PATCH 03/25] *Ex -> *Using naming

---
 index/scorch/merge.go          | 8 ++++----
 index/scorch/persister.go      | 4 ++--
 index/scorch/scorch.go         | 2 +-
 index/scorch/segment_plugin.go | 6 +++---
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/index/scorch/merge.go b/index/scorch/merge.go
index e2267bf81..bca9bbb81 100644
--- a/index/scorch/merge.go
+++ b/index/scorch/merge.go
@@ -372,7 +372,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context,
 	atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1)
 	prevBytesReadTotal := cumulateBytesRead(segmentsToMerge)
 
-	newDocNums, _, err := s.segPlugin.MergeEx(segmentsToMerge, docsToDrop, path,
+	newDocNums, _, err := s.segPlugin.MergeUsing(segmentsToMerge, docsToDrop, path,
 		cw.cancelCh, s, s.segmentConfig)
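		// Illustration (hedged editor's sketch, not part of this patch): the
		// *Using variants keep the original behavior and add a trailing config
		// parameter, so scorch can hand per-index settings to the plugin at
		// every entry point. The shapes introduced in segment_plugin.go are:
		//
		//	NewUsing(results []index.Document, config map[string]interface{}) (segment.Segment, uint64, error)
		//	OpenUsing(path string, config map[string]interface{}) (segment.Segment, error)
		//	MergeUsing(segments []segment.Segment, drops []*roaring.Bitmap, path string,
		//		closeCh chan struct{}, s segment.StatsReporter, config map[string]interface{}) ([][]uint64, uint64, error)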
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) @@ -391,7 +391,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, return fmt.Errorf("merging failed: %v", err) } - seg, err = s.segPlugin.OpenEx(path, s.segmentConfig) + seg, err = s.segPlugin.OpenUsing(path, s.segmentConfig) if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) @@ -540,7 +540,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, // the newly merged segment is already flushed out to disk, just needs // to be opened using mmap. newDocIDs, _, err := - s.segPlugin.MergeEx(segsBatch, dropsBatch, path, s.closeCh, s, s.segmentConfig) + s.segPlugin.MergeUsing(segsBatch, dropsBatch, path, s.closeCh, s, s.segmentConfig) if err != nil { em.Lock() errs = append(errs, err) @@ -555,7 +555,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, s.markIneligibleForRemoval(filename) newMergedSegmentIDs[id] = newSegmentID newDocIDsSet[id] = newDocIDs - newMergedSegments[id], err = s.segPlugin.OpenEx(path, s.segmentConfig) + newMergedSegments[id], err = s.segPlugin.OpenUsing(path, s.segmentConfig) if err != nil { em.Lock() errs = append(errs, err) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 0571bd1e4..977097097 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -804,7 +804,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot, exclude map[uint } }() for segmentID, path := range newSegmentPaths { - newSegments[segmentID], err = s.segPlugin.OpenEx(path, s.segmentConfig) + newSegments[segmentID], err = s.segPlugin.OpenUsing(path, s.segmentConfig) if err != nil { return fmt.Errorf("error opening new segment at %s, %v", path, err) } @@ -1016,7 +1016,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return nil, fmt.Errorf("segment path missing") } segmentPath := s.path + string(os.PathSeparator) + string(pathBytes) - seg, err := s.segPlugin.OpenEx(segmentPath, s.segmentConfig) + seg, err := s.segPlugin.OpenUsing(segmentPath, s.segmentConfig) if err != nil { return nil, fmt.Errorf("error opening bolt segment: %v", err) } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 72435a91c..efe052935 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -499,7 +499,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { stats := newFieldStats() if len(analysisResults) > 0 { - newSegment, bufBytes, err = s.segPlugin.NewEx(analysisResults, s.segmentConfig) + newSegment, bufBytes, err = s.segPlugin.NewUsing(analysisResults, s.segmentConfig) if err != nil { return err } diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index 4ff249a0b..16be8e440 100644 --- a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -46,13 +46,13 @@ type SegmentPlugin interface { // New takes a set of Documents and turns them into a new Segment New(results []index.Document) (segment.Segment, uint64, error) - NewEx(results []index.Document, config map[string]interface{}) (segment.Segment, uint64, error) + NewUsing(results []index.Document, config map[string]interface{}) (segment.Segment, uint64, error) // Open attempts to open the file at the specified path and // return the corresponding Segment Open(path string) (segment.Segment, error) - OpenEx(path string, config map[string]interface{}) (segment.Segment, error) + OpenUsing(path string, config map[string]interface{}) (segment.Segment, 
error) // Merge takes a set of Segments, and creates a new segment on disk at // the specified path. @@ -72,7 +72,7 @@ type SegmentPlugin interface { closeCh chan struct{}, s segment.StatsReporter) ( [][]uint64, uint64, error) - MergeEx(segments []segment.Segment, drops []*roaring.Bitmap, path string, + MergeUsing(segments []segment.Segment, drops []*roaring.Bitmap, path string, closeCh chan struct{}, s segment.StatsReporter, config map[string]interface{}) ( [][]uint64, uint64, error) } From b0876250608ee44cd1934dab4831accd296e4661 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Wed, 4 Feb 2026 13:46:12 -0800 Subject: [PATCH 04/25] go mod changes --- go.mod | 14 +++++++------- go.sum | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index 5448bac80..4736f4f88 100644 --- a/go.mod +++ b/go.mod @@ -19,13 +19,13 @@ require ( github.com/blevesearch/stempel v0.2.0 github.com/blevesearch/upsidedown_store_api v1.0.2 github.com/blevesearch/vellum v1.2.0 - github.com/blevesearch/zapx/v11 v11.4.2 - github.com/blevesearch/zapx/v12 v12.4.2 - github.com/blevesearch/zapx/v13 v13.4.2 - github.com/blevesearch/zapx/v14 v14.4.2 - github.com/blevesearch/zapx/v15 v15.4.2 - github.com/blevesearch/zapx/v16 v16.3.0 - github.com/blevesearch/zapx/v17 v17.0.1 + github.com/blevesearch/zapx/v11 v11.4.3 + github.com/blevesearch/zapx/v12 v12.4.3 + github.com/blevesearch/zapx/v13 v13.4.3 + github.com/blevesearch/zapx/v14 v14.4.3 + github.com/blevesearch/zapx/v15 v15.4.3 + github.com/blevesearch/zapx/v16 v16.3.1 + github.com/blevesearch/zapx/v17 v17.0.2-0.20260204210735-148661f2ddf6 github.com/couchbase/moss v0.2.0 github.com/spf13/cobra v1.10.2 go.etcd.io/bbolt v1.4.0 diff --git a/go.sum b/go.sum index 8207f8975..b8eba81f8 100644 --- a/go.sum +++ b/go.sum @@ -33,20 +33,20 @@ github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMG github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ= github.com/blevesearch/vellum v1.2.0 h1:xkDiOEsHc2t3Cp0NsNZZ36pvc130sCzcGKOPMzXe+e0= github.com/blevesearch/vellum v1.2.0/go.mod h1:uEcfBJz7mAOf0Kvq6qoEKQQkLODBF46SINYNkZNae4k= -github.com/blevesearch/zapx/v11 v11.4.2 h1:l46SV+b0gFN+Rw3wUI1YdMWdSAVhskYuvxlcgpQFljs= -github.com/blevesearch/zapx/v11 v11.4.2/go.mod h1:4gdeyy9oGa/lLa6D34R9daXNUvfMPZqUYjPwiLmekwc= -github.com/blevesearch/zapx/v12 v12.4.2 h1:fzRbhllQmEMUuAQ7zBuMvKRlcPA5ESTgWlDEoB9uQNE= -github.com/blevesearch/zapx/v12 v12.4.2/go.mod h1:TdFmr7afSz1hFh/SIBCCZvcLfzYvievIH6aEISCte58= -github.com/blevesearch/zapx/v13 v13.4.2 h1:46PIZCO/ZuKZYgxI8Y7lOJqX3Irkc3N8W82QTK3MVks= -github.com/blevesearch/zapx/v13 v13.4.2/go.mod h1:knK8z2NdQHlb5ot/uj8wuvOq5PhDGjNYQQy0QDnopZk= -github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT7fWYz0= -github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= -github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= -github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= -github.com/blevesearch/zapx/v16 v16.3.0 h1:hF6VlN15E9CB40RMPyqOIhlDw1OOo9RItumhKMQktxw= -github.com/blevesearch/zapx/v16 v16.3.0/go.mod h1:zCFjv7McXWm1C8rROL+3mUoD5WYe2RKsZP3ufqcYpLY= -github.com/blevesearch/zapx/v17 v17.0.1 h1:kdojyNDiC4abVvsSwequvqYTBuLEXoG3c0UKyxe1+GM= -github.com/blevesearch/zapx/v17 v17.0.1/go.mod h1:gvr+JMDB9XvQUkT+CaYJhY7aMlez5EmXbkzOBCVyc7U= +github.com/blevesearch/zapx/v11 v11.4.3 
h1:PTZOO5loKpHC/x/GzmPZNa9cw7GZIQxd5qRjwij9tHY= +github.com/blevesearch/zapx/v11 v11.4.3/go.mod h1:4gdeyy9oGa/lLa6D34R9daXNUvfMPZqUYjPwiLmekwc= +github.com/blevesearch/zapx/v12 v12.4.3 h1:eElXvAaAX4m04t//CGBQAtHNPA+Q6A1hHZVrN3LSFYo= +github.com/blevesearch/zapx/v12 v12.4.3/go.mod h1:TdFmr7afSz1hFh/SIBCCZvcLfzYvievIH6aEISCte58= +github.com/blevesearch/zapx/v13 v13.4.3 h1:qsdhRhaSpVnqDFlRiH9vG5+KJ+dE7KAW9WyZz/KXAiE= +github.com/blevesearch/zapx/v13 v13.4.3/go.mod h1:knK8z2NdQHlb5ot/uj8wuvOq5PhDGjNYQQy0QDnopZk= +github.com/blevesearch/zapx/v14 v14.4.3 h1:GY4Hecx0C6UTmiNC2pKdeA2rOKiLR5/rwpU9WR51dgM= +github.com/blevesearch/zapx/v14 v14.4.3/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= +github.com/blevesearch/zapx/v15 v15.4.3 h1:iJiMJOHrz216jyO6lS0m9RTCEkprUnzvqAI2lc/0/CU= +github.com/blevesearch/zapx/v15 v15.4.3/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= +github.com/blevesearch/zapx/v16 v16.3.1 h1:ERxZUSC9UcuKggCQ6b3y4sTkyL4WnGOWuopzglR874g= +github.com/blevesearch/zapx/v16 v16.3.1/go.mod h1:zCFjv7McXWm1C8rROL+3mUoD5WYe2RKsZP3ufqcYpLY= +github.com/blevesearch/zapx/v17 v17.0.2-0.20260204210735-148661f2ddf6 h1:eqJh5al0dcPq6VsY6C+G4kva5BBffzMG+sN/SWg2/Eg= +github.com/blevesearch/zapx/v17 v17.0.2-0.20260204210735-148661f2ddf6/go.mod h1:gvr+JMDB9XvQUkT+CaYJhY7aMlez5EmXbkzOBCVyc7U= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps= github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k= github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= From 1287ed8b33d379608e83943ce7a03a7f141aef22 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 16 Jun 2025 15:23:42 +0530 Subject: [PATCH 05/25] fastmerge wip --- index/scorch/snapshot_index.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 3422d9a14..7e8ca0de0 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -66,13 +66,16 @@ func init() { } type IndexSnapshot struct { - parent *Scorch - segment []*SegmentSnapshot - offsets []uint64 - internal map[string][]byte - epoch uint64 - size uint64 - creator string + parent *Scorch + + // POC: trainData is ephemeral + trainData [][]float32 + segment []*SegmentSnapshot + offsets []uint64 + internal map[string][]byte + epoch uint64 + size uint64 + creator string m sync.Mutex // Protects the fields that follow. refs int64 From 0d74ce2e24269935a5b62183b47d3ba5bd716c62 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Tue, 17 Jun 2025 12:12:16 +0530 Subject: [PATCH 06/25] use callbacks to collect and use train data while merging --- index/scorch/introducer.go | 4 ++++ index/scorch/merge.go | 19 +++++++++++++++++++ index/scorch/snapshot_index.go | 2 +- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index ef26532b0..7965cc5c3 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -360,6 +360,10 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { creator: "introduceMerge", } + if len(nextMerge.trainData) > 0 { + newSnapshot.trainData = append(root.trainData, nextMerge.trainData...) 
+ } + var running, docsToPersistCount, memSegments, fileSegments uint64 var droppedSegmentFiles []string newSegmentDeleted := make([]*roaring.Bitmap, len(nextMerge.new)) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index bca9bbb81..31cac6e61 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -17,6 +17,7 @@ package scorch import ( "context" "fmt" + "math" "os" "strings" "sync" @@ -481,6 +482,7 @@ type mergedSegmentHistory struct { type segmentMerge struct { id []uint64 new []segment.Segment + trainData [][]float32 mergedSegHistory map[uint64]*mergedSegmentHistory notifyCh chan *mergeTaskIntroStatus mmaped uint32 @@ -527,6 +529,22 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, var em sync.Mutex var errs []error + var trainingSample [][]float32 + collectTrainData := func(segTrainData [][]float32) { + trainingSample = append(trainingSample, segTrainData...) + } + + numDocs, err := snapshot.DocCount() + if err != nil { + return nil, nil, err + } + trainingSampleSize := math.Ceil(4 * math.Sqrt(float64(numDocs)) * 39) + + // collect train data only if needed + if len(snapshot.trainData) < int(trainingSampleSize) { + s.segmentConfig["collectTrainDataCallback"] = collectTrainData + } + s.segmentConfig["trainData"] = snapshot.trainData // deploy the workers to merge and flush the batches of segments concurrently // and create a new file segment for i := 0; i < numFlushes; i++ { @@ -601,6 +619,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, mergedSegHistory: make(map[uint64]*mergedSegmentHistory, numSegments), notifyCh: make(chan *mergeTaskIntroStatus), newCount: newMergedCount, + trainData: trainingSample, } // create a history map which maps the old in-memory segments with the specific diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 7e8ca0de0..056b013d6 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -68,7 +68,7 @@ func init() { type IndexSnapshot struct { parent *Scorch - // POC: trainData is ephemeral + // POC: trainData is ephemeral and read-only just like []*SegmentSnapshot trainData [][]float32 segment []*SegmentSnapshot offsets []uint64 From 2aa894921d53358b1b090856a88638046bded445 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Wed, 18 Jun 2025 16:49:32 +0530 Subject: [PATCH 07/25] serialized float array --- index/scorch/merge.go | 10 ++++++---- index/scorch/snapshot_index.go | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 31cac6e61..879003b32 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -19,6 +19,7 @@ import ( "fmt" "math" "os" + "slices" "strings" "sync" "sync/atomic" @@ -482,7 +483,7 @@ type mergedSegmentHistory struct { type segmentMerge struct { id []uint64 new []segment.Segment - trainData [][]float32 + trainData []float32 mergedSegHistory map[uint64]*mergedSegmentHistory notifyCh chan *mergeTaskIntroStatus mmaped uint32 @@ -529,9 +530,10 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, var em sync.Mutex var errs []error - var trainingSample [][]float32 - collectTrainData := func(segTrainData [][]float32) { - trainingSample = append(trainingSample, segTrainData...) + var trainingSample []float32 + collectTrainData := func(segTrainData []float32) { + // append a clone of the training sample + trainingSample = append(trainingSample, slices.Clone(segTrainData)...) 
} numDocs, err := snapshot.DocCount() diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 056b013d6..152647089 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -69,7 +69,7 @@ type IndexSnapshot struct { parent *Scorch // POC: trainData is ephemeral and read-only just like []*SegmentSnapshot - trainData [][]float32 + trainData []float32 segment []*SegmentSnapshot offsets []uint64 internal map[string][]byte From 0f88485cbfd8f405e7d9249b8627e10654dfbc8f Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Tue, 8 Jul 2025 11:46:06 +0530 Subject: [PATCH 08/25] collect training sample on the file path as well --- index/scorch/merge.go | 21 ++++++++++++++++----- index/scorch/persister.go | 4 ++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 879003b32..80e4ef03f 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -365,6 +365,11 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, var seg segment.Segment var filename string + var trainingSample []float32 + collectTrainData := func(segTrainData []float32) { + // append a clone of the training sample + trainingSample = append(trainingSample, slices.Clone(segTrainData)...) + } if len(segmentsToMerge) > 0 { filename = zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) @@ -419,6 +424,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, newCount: seg.Count(), notifyCh: make(chan *mergeTaskIntroStatus), mmaped: 1, + trainData: trainingSample, } s.fireEvent(EventKindMergeTaskIntroductionStart, 0) @@ -536,17 +542,22 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, trainingSample = append(trainingSample, slices.Clone(segTrainData)...) 
}
 	if len(segmentsToMerge) > 0 {
 		filename = zapFileName(newSegmentID)
 		s.markIneligibleForRemoval(filename)
@@ -419,6 +424,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context,
 			newCount:  seg.Count(),
 			notifyCh:  make(chan *mergeTaskIntroStatus),
 			mmaped:    1,
+			trainData: trainingSample,
 		}
 
 		s.fireEvent(EventKindMergeTaskIntroductionStart, 0)
@@ -536,17 +542,22 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot,
 		trainingSample = append(trainingSample, slices.Clone(segTrainData)...)
 	}
 
-	numDocs, err := snapshot.DocCount()
-	if err != nil {
-		return nil, nil, err
-	}
+	// numDocs, err := snapshot.DocCount()
+	// if err != nil {
+	// 	return nil, nil, err
+	// }
+
+	// hardcoding the total docs for now, need to get it from CB level
+	numDocs := 1000000
 	trainingSampleSize := math.Ceil(4 * math.Sqrt(float64(numDocs)) * 39)
 
 	// collect train data only if needed
 	if len(snapshot.trainData) < int(trainingSampleSize) {
 		s.segmentConfig["collectTrainDataCallback"] = collectTrainData
+	} else {
+		s.segmentConfig["trainData"] = snapshot.trainData
 	}
-	s.segmentConfig["trainData"] = snapshot.trainData
+
 	// deploy the workers to merge and flush the batches of segments concurrently
 	// and create a new file segment
 	for i := 0; i < numFlushes; i++ {
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 977097097..eb0d9b187 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -1005,6 +1005,10 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
 				rv.MergeUpdateFieldsInfo(segmentSnapshot.updatedFields)
 			}
 			running += segmentSnapshot.segment.Count()
+			// persistedSegment, ok := segmentSnapshot.segment.(segment.PersistedSegment)
+			// if ok {
+			// 	fmt.Println("segment path", persistedSegment.Path())
+			// }
 		}
 	}
 	return rv, nil

From fce3d3b4dcd9308ca830dd846c9723223f3575c2 Mon Sep 17 00:00:00 2001
From: Thejas-bhat
Date: Thu, 21 Aug 2025 19:52:06 +0530
Subject: [PATCH 09/25] cleanup debug logs

---
 index/scorch/introducer.go     | 10 +++++++++-
 index/scorch/merge.go          |  6 +++---
 index/scorch/persister.go      |  4 ----
 index/scorch/snapshot_index.go | 14 ++++++++------
 4 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go
index 7965cc5c3..6afda3347 100644
--- a/index/scorch/introducer.go
+++ b/index/scorch/introducer.go
@@ -129,6 +129,10 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
 		creator: "introduceSegment",
 	}
 
+	if len(root.trainData) > 0 {
+		newSnapshot.trainData = root.trainData
+	}
+
 	// iterate through current segments
 	var running uint64
 	var docsToPersistCount, memSegments, fileSegments uint64
@@ -284,6 +288,10 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) {
 		creator: "introducePersist",
 	}
 
+	if len(root.trainData) > 0 {
+		newIndexSnapshot.trainData = root.trainData
+	}
+
 	var docsToPersistCount, memSegments, fileSegments uint64
 	for i, segmentSnapshot := range root.segment {
 		// see if this segment has been replaced
@@ -361,7 +369,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
 	}
 
 	if len(nextMerge.trainData) > 0 {
-		newSnapshot.trainData = append(root.trainData, nextMerge.trainData...)
+		newSnapshot.trainData = nextMerge.trainData
 	}
 
 	var running, docsToPersistCount, memSegments, fileSegments uint64
diff --git a/index/scorch/merge.go b/index/scorch/merge.go
index 80e4ef03f..4ae23297f 100644
--- a/index/scorch/merge.go
+++ b/index/scorch/merge.go
@@ -539,7 +539,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot,
 	var trainingSample []float32
 	collectTrainData := func(segTrainData []float32) {
 		// append a clone of the training sample
-		trainingSample = append(trainingSample, slices.Clone(segTrainData)...)
+		trainingSample = append(trainingSample, segTrainData...)
}
 
 	// numDocs, err := snapshot.DocCount()
 	// if err != nil {
 	// 	return nil, nil, err
 	// }
 
 	// hardcoding the total docs for now, need to get it from CB level
 	numDocs := 1000000
-	trainingSampleSize := math.Ceil(4 * math.Sqrt(float64(numDocs)) * 39)
+	trainingSampleSize := math.Ceil(4 * math.Sqrt(float64(numDocs)) * 50)
 
 	// collect train data only if needed
-	if len(snapshot.trainData) < int(trainingSampleSize) {
+	if len(snapshot.trainData)/768 < int(trainingSampleSize) {
 		s.segmentConfig["collectTrainDataCallback"] = collectTrainData
 	} else {
 		s.segmentConfig["trainData"] = snapshot.trainData
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index eb0d9b187..977097097 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -1005,10 +1005,6 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
 				rv.MergeUpdateFieldsInfo(segmentSnapshot.updatedFields)
 			}
 			running += segmentSnapshot.segment.Count()
-			// persistedSegment, ok := segmentSnapshot.segment.(segment.PersistedSegment)
-			// if ok {
-			// 	fmt.Println("segment path", persistedSegment.Path())
-			// }
 		}
 	}
 	return rv, nil
diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go
index 152647089..dbd666cd0 100644
--- a/index/scorch/snapshot_index.go
+++ b/index/scorch/snapshot_index.go
@@ -70,12 +70,14 @@ type IndexSnapshot struct {
 
 	// POC: trainData is ephemeral and read-only just like []*SegmentSnapshot
 	trainData []float32
-	segment   []*SegmentSnapshot
-	offsets   []uint64
-	internal  map[string][]byte
-	epoch     uint64
-	size      uint64
-	creator   string
+	// trainSegments []*SegmentSnapshot // either store []float32 or []faissIndexes aka centroid indexes
+
+	segment  []*SegmentSnapshot
+	offsets  []uint64
+	internal map[string][]byte
+	epoch    uint64
+	size     uint64
+	creator  string
 
 	m sync.Mutex // Protects the fields that follow.
refs int64 From 7cfa17bc209636c0b56d88a3cee6bcb653f7721a Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Tue, 28 Oct 2025 16:30:10 -0700 Subject: [PATCH 10/25] vector sources API --- mapping/mapping.go | 1 + mapping/mapping_no_vectors.go | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/mapping/mapping.go b/mapping/mapping.go index 7ff2f9927..e3a14b738 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -58,6 +58,7 @@ type IndexMapping interface { AnalyzerNamed(name string) analysis.Analyzer FieldMappingForPath(path string) FieldMapping + VectorSources() []string } // A SynonymMapping extends the IndexMapping interface to provide diff --git a/mapping/mapping_no_vectors.go b/mapping/mapping_no_vectors.go index cbe9d81bc..3e7e0403c 100644 --- a/mapping/mapping_no_vectors.go +++ b/mapping/mapping_no_vectors.go @@ -42,3 +42,10 @@ func validateFieldMapping(field *FieldMapping, path []string, fieldAliasCtx map[string]*FieldMapping) error { return validateFieldType(field) } + +// ----------------------------------------------------------------------------- +// vector source functions + +func (im *IndexMappingImpl) VectorSources() []string { + return []string{"vector indexing is not implemented"} +} From 853d687a9f44e5935a6d77f72dd76a56d7920513 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Wed, 26 Nov 2025 11:14:04 -0800 Subject: [PATCH 11/25] batch training support --- go.mod | 4 ++ index.go | 4 ++ index/scorch/introducer.go | 12 ------ index/scorch/merge.go | 32 -------------- index/scorch/persister.go | 5 +++ index/scorch/scorch.go | 76 ++++++++++++++++++++++++++++++++++ index/scorch/snapshot_index.go | 4 -- index_alias_impl.go | 19 +++++++++ index_impl.go | 17 ++++++++ mapping/mapping.go | 1 - mapping/mapping_no_vectors.go | 7 ---- 11 files changed, 125 insertions(+), 56 deletions(-) diff --git a/go.mod b/go.mod index 4736f4f88..cb73edb28 100644 --- a/go.mod +++ b/go.mod @@ -43,3 +43,7 @@ require ( github.com/spf13/pflag v1.0.9 // indirect golang.org/x/sys v0.40.0 // indirect ) + +replace github.com/blevesearch/scorch_segment_api/v2 => /Users/thejas.orkombu/fts/blevesearch/scorch_segment_api + +replace github.com/blevesearch/bleve_index_api => /Users/thejas.orkombu/fts/blevesearch/bleve_index_api \ No newline at end of file diff --git a/index.go b/index.go index 2f1ba5fbf..c083787c4 100644 --- a/index.go +++ b/index.go @@ -396,3 +396,7 @@ type InsightsIndex interface { // CentroidCardinalities returns the centroids (clusters) from IVF indexes ordered by data density. 
CentroidCardinalities(field string, limit int, desceding bool) ([]index.CentroidCardinality, error) } +type VectorIndex interface { + Index + Train(*Batch) error +} diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 6afda3347..ef26532b0 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -129,10 +129,6 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { creator: "introduceSegment", } - if len(root.trainData) > 0 { - newSnapshot.trainData = root.trainData - } - // iterate through current segments var running uint64 var docsToPersistCount, memSegments, fileSegments uint64 @@ -288,10 +284,6 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { creator: "introducePersist", } - if len(root.trainData) > 0 { - newIndexSnapshot.trainData = root.trainData - } - var docsToPersistCount, memSegments, fileSegments uint64 for i, segmentSnapshot := range root.segment { // see if this segment has been replaced @@ -368,10 +360,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { creator: "introduceMerge", } - if len(nextMerge.trainData) > 0 { - newSnapshot.trainData = nextMerge.trainData - } - var running, docsToPersistCount, memSegments, fileSegments uint64 var droppedSegmentFiles []string newSegmentDeleted := make([]*roaring.Bitmap, len(nextMerge.new)) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 4ae23297f..bca9bbb81 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -17,9 +17,7 @@ package scorch import ( "context" "fmt" - "math" "os" - "slices" "strings" "sync" "sync/atomic" @@ -365,11 +363,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, var seg segment.Segment var filename string - var trainingSample []float32 - collectTrainData := func(segTrainData []float32) { - // append a clone of the training sample - trainingSample = append(trainingSample, slices.Clone(segTrainData)...) - } if len(segmentsToMerge) > 0 { filename = zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) @@ -424,7 +417,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, newCount: seg.Count(), notifyCh: make(chan *mergeTaskIntroStatus), mmaped: 1, - trainData: trainingSample, } s.fireEvent(EventKindMergeTaskIntroductionStart, 0) @@ -489,7 +481,6 @@ type mergedSegmentHistory struct { type segmentMerge struct { id []uint64 new []segment.Segment - trainData []float32 mergedSegHistory map[uint64]*mergedSegmentHistory notifyCh chan *mergeTaskIntroStatus mmaped uint32 @@ -536,28 +527,6 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, var em sync.Mutex var errs []error - var trainingSample []float32 - collectTrainData := func(segTrainData []float32) { - // append a clone of the training sample - trainingSample = append(trainingSample, segTrainData...) 
-	}
-
-	// numDocs, err := snapshot.DocCount()
-	// if err != nil {
-	// 	return nil, nil, err
-	// }
-
-	// hardcoding the total docs for now, need to get it from CB level
-	numDocs := 1000000
-	trainingSampleSize := math.Ceil(4 * math.Sqrt(float64(numDocs)) * 50)
-
-	// collect train data only if needed
-	if len(snapshot.trainData)/768 < int(trainingSampleSize) {
-		s.segmentConfig["collectTrainDataCallback"] = collectTrainData
-	} else {
-		s.segmentConfig["trainData"] = snapshot.trainData
-	}
-
 	// deploy the workers to merge and flush the batches of segments concurrently
 	// and create a new file segment
 	for i := 0; i < numFlushes; i++ {
@@ -632,7 +601,6 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot,
 		mergedSegHistory: make(map[uint64]*mergedSegmentHistory, numSegments),
 		notifyCh:         make(chan *mergeTaskIntroStatus),
 		newCount:         newMergedCount,
-		trainData:        trainingSample,
 	}
 
 	// create a history map which maps the old in-memory segments with the specific
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 977097097..919daec70 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -575,6 +575,11 @@ func copyToDirectory(srcPath string, d index.Directory) (int64, error) {
 		return 0, fmt.Errorf("GetWriter err: %v", err)
 	}
 
+	// skip
+	if dest == nil {
+		return 0, nil
+	}
+
 	sourceFileStat, err := os.Stat(srcPath)
 	if err != nil {
 		return 0, err
diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go
index efe052935..0da7f2318 100644
--- a/index/scorch/scorch.go
+++ b/index/scorch/scorch.go
@@ -19,6 +19,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -27,6 +28,7 @@ import (
 	"github.com/blevesearch/bleve/v2/registry"
 	"github.com/blevesearch/bleve/v2/util"
 	index "github.com/blevesearch/bleve_index_api"
+	"github.com/blevesearch/go-faiss"
 	segment "github.com/blevesearch/scorch_segment_api/v2"
 	bolt "go.etcd.io/bbolt"
 )
@@ -78,6 +80,8 @@ type Scorch struct {
 	persisterNotifier chan *epochWatcher
 	rootBolt          *bolt.DB
 	asyncTasks        sync.WaitGroup
+	// not a real searchable segment, singleton
+	centroidIndex *SegmentSnapshot
 
 	onEvent      func(event Event) bool
 	onAsyncError func(err error, path string)
@@ -170,6 +174,12 @@ func NewScorch(storeName string,
 		}
 	}
 
+	// "pretraining": true
+	segConfig, ok := config["segmentConfig"].(map[string]interface{})
+	if ok {
+		rv.segmentConfig = segConfig
+	}
+
 	typ, ok := config["spatialPlugin"].(string)
 	if ok {
 		if err := rv.loadSpatialAnalyzerPlugin(typ); err != nil {
@@ -534,6 +544,72 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
 	return err
 }
 
+func (s *Scorch) Train(batch *index.Batch) error {
+	s.rootLock.Lock()
+	defer s.rootLock.Unlock()
+	if s.centroidIndex != nil {
+		// singleton API
+		return nil
+	}
+	var trainData []index.Document
+	if s.centroidIndex == nil {
+		for key, doc := range batch.IndexOps {
+			if strings.HasPrefix(key, index.TrainDataPrefix) {
+				trainData = append(trainData, doc)
+			}
+		}
+	}
+
+	// just builds a new vector index out of the train data provided
+	// it'll be an IVF index so the centroids are computed at this stage and
+	// this template will be used in the indexing down the line to index
+	// the data vectors. s.segmentConfig will mark this as a training phase
+	// and zap will handle it accordingly.
+	//
+	// note: this might index text data too, how to handle this? s.segmentConfig?
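+	// (illustrative sketch of the wiring, using keys that appear later in
+	// this series rather than in this patch) the phase can be signalled
+	// through the same config map the plugin already receives:
+	//
+	//	s.segmentConfig["training"] = true   // plugin builds centroids only
+	//	seg, _, err := s.segPlugin.NewEx(trainData, s.segmentConfig)
+	//	s.segmentConfig["training"] = false  // back to regular indexing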
+ // todo: updates/deletes -> data drift detection + seg, _, err := s.segPlugin.NewEx(trainData, s.segmentConfig) + if err != nil { + return err + } + filename := "centroid_index.zap" + path := filepath.Join(s.path, filename) + + switch seg := seg.(type) { + case segment.UnpersistedSegment: + err = persistToDirectory(seg, nil, path) + if err != nil { + return err + } + default: + return fmt.Errorf("segment is not a unpersisted segment") + } + + // persist and open the segment mmap mode. + persistedSegment, err := s.segPlugin.OpenEx(path, s.segmentConfig) + if err != nil { + return err + } + s.centroidIndex = &SegmentSnapshot{ + segment: persistedSegment, + } + s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex + return nil +} + +func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { + // return the coarse quantizer of the centroid index belonging to the field + centroidIndexSegment, ok := s.centroidIndex.segment.(segment.CentroidIndexSegment) + if !ok { + return nil, fmt.Errorf("segment is not a centroid index segment") + } + coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) + if err != nil { + return nil, err + } + return coarseQuantizer, nil +} + func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, internalOps map[string][]byte, persistedCallback index.BatchCallback, stats *fieldStats, ) error { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index dbd666cd0..cf0273534 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -68,10 +68,6 @@ func init() { type IndexSnapshot struct { parent *Scorch - // POC: trainData is ephemeral and read-only just like []*SegmentSnapshot - trainData []float32 - // trainSegments []*SegmentSnapshot // either store []float32 or []faissIndexes aka centroid indexes - segment []*SegmentSnapshot offsets []uint64 internal map[string][]byte diff --git a/index_alias_impl.go b/index_alias_impl.go index 8212c74b9..ee7fbf2a6 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -103,6 +103,25 @@ func (i *indexAliasImpl) IndexSynonym(id string, collection string, definition * return ErrorSynonymSearchNotSupported } +func (i *indexAliasImpl) Train(batch *Batch) error { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + err := i.isAliasToSingleIndex() + if err != nil { + return err + } + + if vi, ok := i.indexes[0].(VectorIndex); ok { + return vi.Train(batch) + } + return fmt.Errorf("not a vector index") +} + func (i *indexAliasImpl) Delete(id string) error { i.mutex.RLock() defer i.mutex.RUnlock() diff --git a/index_impl.go b/index_impl.go index 586dacb3b..bd43a4c3e 100644 --- a/index_impl.go +++ b/index_impl.go @@ -369,6 +369,20 @@ func (i *indexImpl) IndexSynonym(id string, collection string, definition *Synon return err } +func (i *indexImpl) Train(batch *Batch) error { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + if vi, ok := i.i.(VectorIndex); ok { + return vi.Train(batch) + } + return fmt.Errorf("not a vector index") +} + // IndexAdvanced takes a document.Document object // skips the mapping and indexes it. 
func (i *indexImpl) IndexAdvanced(doc *document.Document) (err error) { @@ -1416,6 +1430,7 @@ func (m *searchHitSorter) Less(i, j int) bool { return c < 0 } +// CopyTo (index.Directory, filter) func (i *indexImpl) CopyTo(d index.Directory) (err error) { i.mutex.RLock() defer i.mutex.RUnlock() @@ -1429,6 +1444,8 @@ func (i *indexImpl) CopyTo(d index.Directory) (err error) { return fmt.Errorf("index implementation does not support copy reader") } + // copyIndex.Copy() -> copies the centroid index + copyReader := copyIndex.CopyReader() if copyReader == nil { return fmt.Errorf("index's copyReader is nil") diff --git a/mapping/mapping.go b/mapping/mapping.go index e3a14b738..7ff2f9927 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -58,7 +58,6 @@ type IndexMapping interface { AnalyzerNamed(name string) analysis.Analyzer FieldMappingForPath(path string) FieldMapping - VectorSources() []string } // A SynonymMapping extends the IndexMapping interface to provide diff --git a/mapping/mapping_no_vectors.go b/mapping/mapping_no_vectors.go index 3e7e0403c..cbe9d81bc 100644 --- a/mapping/mapping_no_vectors.go +++ b/mapping/mapping_no_vectors.go @@ -42,10 +42,3 @@ func validateFieldMapping(field *FieldMapping, path []string, fieldAliasCtx map[string]*FieldMapping) error { return validateFieldType(field) } - -// ----------------------------------------------------------------------------- -// vector source functions - -func (im *IndexMappingImpl) VectorSources() []string { - return []string{"vector indexing is not implemented"} -} From 65e171ff304b30069471561d3fe5cd2c9d111dc3 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Wed, 26 Nov 2025 11:15:13 -0800 Subject: [PATCH 12/25] wip: batch training + interfaces to reuse pre-trained file --- index.go | 5 + index/scorch/persister.go | 29 +++++- index/scorch/scorch.go | 165 ++++++++++++++++++++++++++++++++- index/scorch/snapshot_index.go | 4 + index_alias_impl.go | 5 +- index_impl.go | 38 +++++++- util/keys.go | 1 + 7 files changed, 233 insertions(+), 14 deletions(-) diff --git a/index.go b/index.go index c083787c4..21d016610 100644 --- a/index.go +++ b/index.go @@ -353,6 +353,11 @@ type IndexCopyable interface { CopyTo(d index.Directory) error } +type IndexFileCopyable interface { + UpdateFileInBolt(key []byte, value []byte) error + CopyFile(file string, d index.IndexDirectory) error +} + // FileSystemDirectory is the default implementation for the // index.Directory interface. 
type FileSystemDirectory string
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 919daec70..4ded4c23c 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -575,11 +575,6 @@ func copyToDirectory(srcPath string, d index.Directory) (int64, error) {
 		return 0, fmt.Errorf("GetWriter err: %v", err)
 	}
 
-	// skip
-	if dest == nil {
-		return 0, nil
-	}
-
 	sourceFileStat, err := os.Stat(srcPath)
 	if err != nil {
 		return 0, err
@@ -858,10 +853,34 @@ func zapFileName(epoch uint64) string {
 	return fmt.Sprintf("%012x.zap", epoch)
 }
 
+func (s *Scorch) updateCentroidIndex(bucket *bolt.Bucket) error {
+	if bucket == nil {
+		return nil
+	}
+	segmentSnapshot, err := s.loadSegment(bucket)
+	if err != nil {
+		return err
+	}
+	s.rootLock.Lock()
+	defer s.rootLock.Unlock()
+
+	s.centroidIndex = segmentSnapshot
+	return nil
+}
+
 // bolt snapshot code
 
 func (s *Scorch) loadFromBolt() error {
 	err := s.rootBolt.View(func(tx *bolt.Tx) error {
+		centroidIndexBucket := tx.Bucket(util.BoltCentroidIndexKey)
+		if centroidIndexBucket == nil {
+			return nil
+		}
+		err := s.updateCentroidIndex(centroidIndexBucket)
+		if err != nil {
+			return err
+		}
+
 		snapshots := tx.Bucket(util.BoltSnapshotsBucket)
 		if snapshots == nil {
 			return nil
diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go
index 0da7f2318..74aa2e99f 100644
--- a/index/scorch/scorch.go
+++ b/index/scorch/scorch.go
@@ -15,8 +15,10 @@
 package scorch
 
 import (
+	"bytes"
 	"encoding/json"
 	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 	"strings"
@@ -544,7 +546,19 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
 	return err
 }
 
+func (s *Scorch) getInternal(key []byte) ([]byte, error) {
+	s.rootLock.RLock()
+	defer s.rootLock.RUnlock()
+	if string(key) == "_centroid_index_complete" {
+		return []byte(fmt.Sprintf("%t", s.centroidIndex != nil)), nil
+	}
+	return nil, nil
+}
+
+// min 39 per centroid, recommended 50
+// max 256
 func (s *Scorch) Train(batch *index.Batch) error {
+	// is the lock really needed?
 	s.rootLock.Lock()
 	defer s.rootLock.Unlock()
 	if s.centroidIndex != nil {
@@ -588,10 +602,17 @@ func (s *Scorch) Train(batch *index.Batch) error {
 	// note: this might index text data too, how to handle this? s.segmentConfig?
 	// todo: updates/deletes -> data drift detection
-	seg, _, err := s.segPlugin.NewEx(trainData, s.segmentConfig)
+
+	s.segmentConfig["training"] = true
+	seg, n, err := s.segPlugin.NewEx(trainData, s.segmentConfig)
 	if err != nil {
 		return err
 	}
-	filename := "centroid_index.zap"
+	// reset the training flag once completed
+	s.segmentConfig["training"] = false
+	// not suffixing with .zap since the current garbage collection is tailored to .zap ext files
+	// we don't want to gc this file ever.
+	filename := "centroid_index"
 	path := filepath.Join(s.path, filename)
 
 	switch seg := seg.(type) {
 	case segment.UnpersistedSegment:
 		err = persistToDirectory(seg, nil, path)
 		if err != nil {
 			return err
 		}
 	default:
 		return fmt.Errorf("segment is not a unpersisted segment")
 	}
 
 	// persist and open the segment mmap mode.
 	persistedSegment, err := s.segPlugin.OpenEx(path, s.segmentConfig)
 	if err != nil {
 		return err
 	}
 	s.centroidIndex = &SegmentSnapshot{
 		segment: persistedSegment,
 	}
-	s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex
+
+	fmt.Println("number of bytes written to centroid index", n)
+	// s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex
+	// updateBolt(tx, centroid)
+	// filename := "centroid_index"
+	// path := filepath.Join(s.path, filename)
+	// f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0600)
+	// if err != nil {
+	// 	return err
+	// }
+
+	// bufw := bufio.NewWriter(f)
+	// _, err = bufw.Write([]byte(strings.Join([]string{"centroid_index1", path}, " ")))
+	// if err != nil {
+	// 	return err
+	// }
+	// err = bufw.Flush()
+	// if err != nil {
+	// 	return err
+	// }
+	// err = f.Sync()
+	// if err != nil {
+	// 	return err
+	// }
+	// err = f.Close()
+	// if err != nil {
+	// 	return err
+	// }
+
+	tx, err := s.rootBolt.Begin(true)
+	if err != nil {
+		return err
+	}
+	defer tx.Rollback()
+
+	snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket)
+	if err != nil {
+		return err
+	}
+
+	err = snapshotsBucket.Put(util.BoltCentroidIndexKey, []byte(path))
+	if err != nil {
+		return err
+	}
+
+	err = tx.Commit()
+	if err != nil {
+		return err
+	}
+
 	return nil
 }
@@ -1055,6 +1129,91 @@ func (s *Scorch) CopyReader() index.CopyReader {
 	return rv
 }
 
+func (s *Scorch) updateCentroidIndexInBolt(tx *bolt.Tx) error {
+	centroidIndexBucket, err := tx.CreateBucketIfNotExists(util.BoltCentroidIndexKey)
+	if err != nil {
+		return err
+	}
+
+	err = centroidIndexBucket.Put(util.BoltPathKey, []byte("centroid_index.zap"))
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (s *Scorch) UpdateFileInBolt(key []byte, value []byte) error {
+	tx, err := s.rootBolt.Begin(true)
+	if err != nil {
+		return err
+	}
+	defer tx.Rollback()
+
+	snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket)
+	if err != nil {
+		return err
+	}
+
+	// currently this is specific to centroid index file update
+	if bytes.Equal(key, util.BoltCentroidIndexKey) {
+		// guard against duplicate updates
+		existingValue := snapshotsBucket.Get(key)
+		if existingValue != nil {
+			return fmt.Errorf("key already exists")
+		}
+
+		err = snapshotsBucket.Put(key, value)
+		if err != nil {
+			return err
+		}
+	}
+
+	err = tx.Commit()
+	if err != nil {
+		return err
+	}
+
+	err = s.rootBolt.Sync()
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// CopyFile copies a specific file to a destination directory which has an access to a bleve index
+// doing a io.Copy() isn't enough because the file needs to be tracked in bolt file as well
+func (s *Scorch) CopyFile(file string, d index.IndexDirectory) error {
+	s.rootLock.Lock()
+	defer s.rootLock.Unlock()
+
+	// this code is currently specific to centroid index file but is future proofed for other files
+	// to be updated in the dest's bolt
+	if strings.HasSuffix(file, "centroid_index") {
+		// centroid index file - this is outside the snapshots domain so the bolt update is different
+		err := d.UpdateFileInBolt(util.BoltCentroidIndexKey, []byte(file))
+		if err != nil {
+			return err
+		}
+	}
+
+	dest, err := d.GetWriter(filepath.Join("store", file))
+	if err != nil {
+		return err
+	}
+
+	source, err := os.Open(filepath.Join(s.path, file))
+	if err != nil {
+		return err
+	}
+
+	defer source.Close()
+	defer dest.Close()
+	_, err = io.Copy(dest, source)
+	return
err +} + // external API to fire a scorch event (EventKindIndexStart) externally from bleve func (s *Scorch) FireIndexEvent() { s.fireEvent(EventKindIndexStart, 0) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index cf0273534..3585b31d8 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -467,6 +467,10 @@ func (is *IndexSnapshot) Fields() ([]string, error) { } func (is *IndexSnapshot) GetInternal(key []byte) ([]byte, error) { + _, ok := is.internal[string(key)] + if !ok { + return is.parent.getInternal(key) + } return is.internal[string(key)], nil } diff --git a/index_alias_impl.go b/index_alias_impl.go index ee7fbf2a6..8cc1d90ed 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -110,14 +110,13 @@ func (i *indexAliasImpl) Train(batch *Batch) error { if !i.open { return ErrorIndexClosed } - err := i.isAliasToSingleIndex() if err != nil { return err } - if vi, ok := i.indexes[0].(VectorIndex); ok { - return vi.Train(batch) + if vi, ok := i.indexes[0].(index.VectorIndex); ok { + return vi.Train(batch.internal) } return fmt.Errorf("not a vector index") } diff --git a/index_impl.go b/index_impl.go index bd43a4c3e..3391d78c5 100644 --- a/index_impl.go +++ b/index_impl.go @@ -377,8 +377,8 @@ func (i *indexImpl) Train(batch *Batch) error { return ErrorIndexClosed } - if vi, ok := i.i.(VectorIndex); ok { - return vi.Train(batch) + if vi, ok := i.i.(index.VectorIndex); ok { + return vi.Train(batch.internal) } return fmt.Errorf("not a vector index") } @@ -1430,6 +1430,38 @@ func (m *searchHitSorter) Less(i, j int) bool { return c < 0 } +func (i *indexImpl) CopyFile(file string, d index.IndexDirectory) (err error) { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + copyIndex, ok := i.i.(index.IndexFileCopyable) + if !ok { + return fmt.Errorf("index implementation does not support copy reader") + } + + return copyIndex.CopyFile(file, d) +} + +func (i *indexImpl) UpdateFileInBolt(key []byte, value []byte) error { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + copyIndex, ok := i.i.(index.IndexFileCopyable) + if !ok { + return fmt.Errorf("index implementation does not support file copy") + } + + return copyIndex.UpdateFileInBolt(key, value) +} + // CopyTo (index.Directory, filter) func (i *indexImpl) CopyTo(d index.Directory) (err error) { i.mutex.RLock() @@ -1459,7 +1491,7 @@ func (i *indexImpl) CopyTo(d index.Directory) (err error) { err = copyReader.CopyTo(d) if err != nil { - return fmt.Errorf("error copying index metadata: %v", err) + return fmt.Errorf("error copying index data: %v", err) } // copy the metadata diff --git a/util/keys.go b/util/keys.go index b71a7f48b..11c918865 100644 --- a/util/keys.go +++ b/util/keys.go @@ -17,6 +17,7 @@ package util var ( // Bolt keys BoltSnapshotsBucket = []byte{'s'} + BoltCentroidIndexKey = []byte{'c'} BoltPathKey = []byte{'p'} BoltDeletedKey = []byte{'d'} BoltInternalKey = []byte{'i'} From 2544512e582ad47624e7e4bb3321e00fae0fd3bc Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 11 Dec 2025 14:43:28 -0800 Subject: [PATCH 13/25] bug fix, debug logging --- centroid_index_test.go | 74 ++++++++++++++++++++++++++++++++++++++ go.mod | 20 ++++++++++- index.go | 2 ++ index/scorch/persister.go | 24 +++++++------ index/scorch/scorch.go | 75 ++++++++++++++------------------------- index_alias_impl.go | 12 +------ index_impl.go | 2 ++ 7 files changed, 138 insertions(+), 71 deletions(-) create mode 100644 
centroid_index_test.go diff --git a/centroid_index_test.go b/centroid_index_test.go new file mode 100644 index 000000000..a7334236b --- /dev/null +++ b/centroid_index_test.go @@ -0,0 +1,74 @@ +//go:build vectors +// +build vectors + +package bleve + +import ( + "encoding/json" + "fmt" + "os" + "testing" + + "github.com/blevesearch/bleve/v2/analysis/lang/en" + "github.com/blevesearch/bleve/v2/mapping" + index "github.com/blevesearch/bleve_index_api" +) + +func loadSiftData() ([]map[string]interface{}, error) { + fileContent, err := os.ReadFile("~/fts/data/datasets/vec-sift-bucket.json") + if err != nil { + return nil, err + } + var documents []map[string]interface{} + err = json.Unmarshal(fileContent, &documents) + if err != nil { + return nil, err + } + return documents, nil +} + +func TestCentroidIndex(t *testing.T) { + _, _, err := readDatasetAndQueries(testInputCompressedFile) + if err != nil { + t.Fatal(err) + } + documents, err := loadSiftData() + if err != nil { + t.Fatal(err) + } + contentFieldMapping := NewTextFieldMapping() + contentFieldMapping.Analyzer = en.AnalyzerName + + vecFieldMappingL2 := mapping.NewVectorFieldMapping() + vecFieldMappingL2.Dims = 128 + vecFieldMappingL2.Similarity = index.EuclideanDistance + + indexMappingL2Norm := NewIndexMapping() + indexMappingL2Norm.DefaultMapping.AddFieldMappingsAt("content", contentFieldMapping) + indexMappingL2Norm.DefaultMapping.AddFieldMappingsAt("vector", vecFieldMappingL2) + + idx, err := newIndexUsing(t.TempDir(), indexMappingL2Norm, Config.DefaultIndexType, Config.DefaultKVStore, nil) + if err != nil { + t.Fatal(err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + batch := idx.NewBatch() + for _, doc := range documents[:100000] { + docId := fmt.Sprintf("%s:%s", index.TrainDataPrefix, doc["id"]) + err = batch.Index(docId, doc) + if err != nil { + t.Fatal(err) + } + } + + err = idx.Train(batch) + if err != nil { + t.Fatal(err) + } +} diff --git a/go.mod b/go.mod index cb73edb28..c2ec2c4e6 100644 --- a/go.mod +++ b/go.mod @@ -44,6 +44,24 @@ require ( golang.org/x/sys v0.40.0 // indirect ) +replace github.com/blevesearch/bleve/v2 => /Users/thejas.orkombu/fts/blevesearch/bleve + +replace github.com/blevesearch/zapx/v11 => /Users/thejas.orkombu/fts/blevesearch/zapx11 + +replace github.com/blevesearch/zapx/v12 => /Users/thejas.orkombu/fts/blevesearch/zapx12 + +replace github.com/blevesearch/zapx/v13 => /Users/thejas.orkombu/fts/blevesearch/zapx13 + +replace github.com/blevesearch/zapx/v14 => /Users/thejas.orkombu/fts/blevesearch/zapx14 + +replace github.com/blevesearch/zapx/v15 => /Users/thejas.orkombu/fts/blevesearch/zapx15 + +replace github.com/blevesearch/zapx/v16 => /Users/thejas.orkombu/fts/blevesearch/zapx + replace github.com/blevesearch/scorch_segment_api/v2 => /Users/thejas.orkombu/fts/blevesearch/scorch_segment_api -replace github.com/blevesearch/bleve_index_api => /Users/thejas.orkombu/fts/blevesearch/bleve_index_api \ No newline at end of file +replace github.com/blevesearch/go-faiss => /Users/thejas.orkombu/fts/blevesearch/go-faiss + +replace github.com/blevesearch/bleve_index_api => /Users/thejas.orkombu/fts/blevesearch/bleve_index_api + +replace github.com/blevesearch/sear => /Users/thejas.orkombu/fts/blevesearch/sear diff --git a/index.go b/index.go index 21d016610..bd5421d85 100644 --- a/index.go +++ b/index.go @@ -51,10 +51,12 @@ func (b *Batch) Index(id string, data interface{}) error { eventIndex.FireIndexEvent() } doc := document.NewDocument(id) + // fmt.Printf("data is 
before mapping %#v\n", data)
 	err := b.index.Mapping().MapDocument(doc, data)
 	if err != nil {
 		return err
 	}
+	// fmt.Printf("data is after mapping %#v\n", doc)
 	b.internal.Update(doc)
 
 	b.lastDocSize = uint64(doc.Size() +
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 4ded4c23c..3a4bdacc4 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -857,13 +857,14 @@ func (s *Scorch) updateCentroidIndex(bucket *bolt.Bucket) error {
 	if bucket == nil {
 		return nil
 	}
+	fmt.Println("updateCentroidIndex bucket", bucket != nil)
 	segmentSnapshot, err := s.loadSegment(bucket)
 	if err != nil {
 		return err
 	}
 	s.rootLock.Lock()
 	defer s.rootLock.Unlock()
-
+	fmt.Println("updateCentroidIndex", segmentSnapshot.segment != nil)
 	s.centroidIndex = segmentSnapshot
 	return nil
 }
@@ -872,15 +873,6 @@ func (s *Scorch) updateCentroidIndex(bucket *bolt.Bucket) error {
 // bolt snapshot code
 
 func (s *Scorch) loadFromBolt() error {
 	err := s.rootBolt.View(func(tx *bolt.Tx) error {
-		centroidIndexBucket := tx.Bucket(util.BoltCentroidIndexKey)
-		if centroidIndexBucket == nil {
-			return nil
-		}
-		err := s.updateCentroidIndex(centroidIndexBucket)
-		if err != nil {
-			return err
-		}
-
 		snapshots := tx.Bucket(util.BoltSnapshotsBucket)
 		if snapshots == nil {
 			return nil
@@ -897,6 +889,12 @@ func (s *Scorch) loadFromBolt() error {
 				s.AddEligibleForRemoval(snapshotEpoch)
 				continue
 			}
+			// fmt.Println("loadFromBolt key %s", k)
+			// if k[0] == util.BoltCentroidIndexKey[0] {
+			// 	fmt.Println("loadFromBolt centroid index key", string(k))
+
+			// 	continue
+			// }
 			snapshot := snapshots.Bucket(k)
 			if snapshot == nil {
 				log.Printf("snapshot key, but bucket missing %x, continuing", k)
@@ -928,6 +926,12 @@ func (s *Scorch) loadFromBolt() error {
 			foundRoot = true
 		}
+
+		centroidIndexBucket := snapshots.Bucket(util.BoltCentroidIndexKey)
+		err := s.updateCentroidIndex(centroidIndexBucket)
+		if err != nil {
+			return err
+		}
 		return nil
 	})
 	if err != nil {
diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go
index 74aa2e99f..ca86da936 100644
--- a/index/scorch/scorch.go
+++ b/index/scorch/scorch.go
@@ -555,8 +555,6 @@ func (s *Scorch) getInternal(key []byte) ([]byte, error) {
 	return nil, nil
 }
 
-// min 39 per centroid, recommended 50
-// max 256
 func (s *Scorch) Train(batch *index.Batch) error {
 	// is the lock really needed?
 	s.rootLock.Lock()
 	defer s.rootLock.Unlock()
 	if s.centroidIndex != nil {
@@ -568,6 +566,12 @@ func (s *Scorch) Train(batch *index.Batch) error {
 	var trainData []index.Document
 	if s.centroidIndex == nil {
 		for key, doc := range batch.IndexOps {
+			if doc != nil {
+				// insert _id field
+				// no need to track updates/deletes over here since
+				// the API is singleton
+				doc.AddIDField()
+			}
 			if strings.HasPrefix(key, index.TrainDataPrefix) {
 				trainData = append(trainData, doc)
 			}
 		}
 	}
 
 	// just builds a new vector index out of the train data provided
 	// it'll be an IVF index so the centroids are computed at this stage and
 	// this template will be used in the indexing down the line to index
 	// the data vectors. s.segmentConfig will mark this as a training phase
 	// and zap will handle it accordingly.
 	//
 	// note: this might index text data too, how to handle this? s.segmentConfig?
// todo: updates/deletes -> data drift detection + s.segmentConfig["training"] = true seg, n, err := s.segPlugin.NewEx(trainData, s.segmentConfig) if err != nil { @@ -620,33 +619,14 @@ func (s *Scorch) Train(batch *index.Batch) error { } fmt.Println("number of bytes written to centroid index", n) - // s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex - // updateBolt(tx, cetntroid) - // filename := "centroid_index" - // path := filepath.Join(s.path, filename) - // f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0600) - // if err != nil { - // return err - // } - - // bufw := bufio.NewWriter(f) - // _, err = bufw.Write([]byte(strings.Join([]string{"centroid_index1", path}, " "))) - // if err != nil { - // return err - // } - // err = bufw.Flush() - // if err != nil { - // return err - // } - // err = f.Sync() - // if err != nil { - // return err - // } - // err = f.Close() - // if err != nil { - // return err - // } + s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex + // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint + // where we can be sure that the centroid index is available for the indexing operations downstream + // + // note: when the scale increases massively especially with real world dimensions of 1536+, this API + // will have to be refactored to persist in a more resource efficient way. so having this bolt related + // code will help in tracking the progress a lot better and avoid any redudant data streaming operations. tx, err := s.rootBolt.Begin(true) if err != nil { return err @@ -658,7 +638,11 @@ func (s *Scorch) Train(batch *index.Batch) error { return err } - err = snapshotsBucket.Put(util.BoltCentroidIndexKey, []byte(path)) + centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) + if err != nil { + return err + } + err = centroidBucket.Put(util.BoltPathKey, []byte(filename)) if err != nil { return err } @@ -675,7 +659,7 @@ func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { // return the coarse quantizer of the centroid index belonging to the field centroidIndexSegment, ok := s.centroidIndex.segment.(segment.CentroidIndexSegment) if !ok { - return nil, fmt.Errorf("segment is not a centroid index segment") + return nil, fmt.Errorf("segment is not a centroid index segment", s.centroidIndex.segment != nil) } coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) if err != nil { @@ -1129,20 +1113,6 @@ func (s *Scorch) CopyReader() index.CopyReader { return rv } -func (s *Scorch) updateCentroidIndexInBolt(tx *bolt.Tx) error { - centroidIndexBucket, err := tx.CreateBucketIfNotExists(util.BoltCentroidIndexKey) - if err != nil { - return err - } - - err = centroidIndexBucket.Put(util.BoltPathKey, []byte("centroid_index.zap")) - if err != nil { - return err - } - - return nil -} - func (s *Scorch) UpdateFileInBolt(key []byte, value []byte) error { tx, err := s.rootBolt.Begin(true) if err != nil { @@ -1157,13 +1127,20 @@ func (s *Scorch) UpdateFileInBolt(key []byte, value []byte) error { // currently this is specific to centroid index file update if bytes.Equal(key, util.BoltCentroidIndexKey) { - // guard against duplicate updates - existingValue := snapshotsBucket.Get(key) + // todo: guard against duplicate updates + centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) + if err != nil { + return err + } + if centroidBucket == nil { + return fmt.Errorf("centroid bucket not found") + } + 
existingValue := centroidBucket.Get(util.BoltPathKey) if existingValue != nil { - return fmt.Errorf("key already exists") + return fmt.Errorf("key already exists %v %v", s.path, string(existingValue)) } - err = snapshotsBucket.Put(key, value) + err = centroidBucket.Put(util.BoltPathKey, value) if err != nil { return err } @@ -1194,7 +1171,7 @@ func (s *Scorch) CopyFile(file string, d index.IndexDirectory) error { // centroid index file - this is outside the snapshots domain so the bolt update is different err := d.UpdateFileInBolt(util.BoltCentroidIndexKey, []byte(file)) if err != nil { - return err + return fmt.Errorf("error updating dest index bolt: %w", err) } } diff --git a/index_alias_impl.go b/index_alias_impl.go index 8cc1d90ed..16f20ac45 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -107,17 +107,7 @@ func (i *indexAliasImpl) Train(batch *Batch) error { i.mutex.RLock() defer i.mutex.RUnlock() - if !i.open { - return ErrorIndexClosed - } - err := i.isAliasToSingleIndex() - if err != nil { - return err - } - - if vi, ok := i.indexes[0].(index.VectorIndex); ok { - return vi.Train(batch.internal) - } + // TODO: implement this return fmt.Errorf("not a vector index") } diff --git a/index_impl.go b/index_impl.go index 3391d78c5..ec567083f 100644 --- a/index_impl.go +++ b/index_impl.go @@ -326,11 +326,13 @@ func (i *indexImpl) Index(id string, data interface{}) (err error) { i.FireIndexEvent() + // fmt.Printf("data is %#v\n", data) doc := document.NewDocument(id) err = i.m.MapDocument(doc, data) if err != nil { return } + // fmt.Printf("data is after mapping %#v\n", doc) err = i.i.Update(doc) return } From 3b0470bbe92b1aefa218e1dcffee8a14608162da Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 15 Dec 2025 11:35:30 -0800 Subject: [PATCH 14/25] wip: implement async trainer loop with incremental training support --- index/scorch/scorch.go | 173 +++++++++++++++++++++++++++++------------ util/keys.go | 1 + 2 files changed, 125 insertions(+), 49 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index ca86da936..888bc9847 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -16,6 +16,7 @@ package scorch import ( "bytes" + "encoding/binary" "encoding/json" "fmt" "io" @@ -84,6 +85,7 @@ type Scorch struct { asyncTasks sync.WaitGroup // not a real searchable segment, singleton centroidIndex *SegmentSnapshot + train chan *trainRequest onEvent func(event Event) bool onAsyncError func(err error, path string) @@ -101,6 +103,12 @@ func (t ScorchErrorType) Error() string { return string(t) } +type trainRequest struct { + sample segment.Segment + vecCount int + ackCh chan error +} + // ErrType values for ScorchError const ( ErrAsyncPanic = ScorchErrorType("async panic error") @@ -549,13 +557,118 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { func (s *Scorch) getInternal(key []byte) ([]byte, error) { s.rootLock.RLock() defer s.rootLock.RUnlock() + // todo: return the total number of vectors that have been processed so far in training + // in cbft use that as a checkpoint to resume training for n-x samples. if string(key) == "_centroid_index_complete" { return []byte(fmt.Sprintf("%t", s.centroidIndex != nil)), nil } return nil, nil } +// this is not a routine that will be running throughout the lifetime of the index. It's purpose +// is to only train the vector index before the data ingestion starts. 
+func (s *Scorch) trainerLoop() { + // some init stuff + s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex + var totalSamplesProcessed int + filename := "centroid_index" + path := filepath.Join(s.path, filename) + buf := make([]byte, binary.MaxVarintLen64) + for { + select { + case <-s.closeCh: + return + case trainReq := <-s.train: + sampleSeg := trainReq.sample + if s.centroidIndex == nil { + // new centroid index + s.centroidIndex = &SegmentSnapshot{ + segment: sampleSeg, + } + switch seg := sampleSeg.(type) { + case segment.UnpersistedSegment: + err := persistToDirectory(seg, nil, path) + if err != nil { + // clean up this ugly ass error handling code + trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err) + close(trainReq.ackCh) + } + default: + fmt.Errorf("segment is not a unpersisted segment") + close(s.closeCh) + } + } else { + // merge the new segment with the existing one, no need to persist? + // persist in a tmp file and then rename - is that a fair strategy? + _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, + []*roaring.Bitmap{nil, nil}, "centroid_index.tmp", s.closeCh, nil, s.segmentConfig) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) + close(trainReq.ackCh) + } + + // close the existing centroid segment - it's supposed to be gc'd at this point + s.centroidIndex.segment.Close() + err = os.Rename(filepath.Join(s.path, "centroid_index.tmp"), filepath.Join(s.path, "centroid_index")) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err) + close(trainReq.ackCh) + } + } + + totalSamplesProcessed += trainReq.vecCount + // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint + // where we can be sure that the centroid index is available for the indexing operations downstream + // + // note: when the scale increases massively especially with real world dimensions of 1536+, this API + // will have to be refactored to persist in a more resource efficient way. so having this bolt related + // code will help in tracking the progress a lot better and avoid any redudant data streaming operations. 
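The bolt transaction that follows is the checkpoint this comment describes. For reference, the same write can be phrased through bbolt's Update wrapper, which owns the Begin/Commit/Rollback lifecycle; the sketch below assumes the single-byte names from util/keys.go as they stand at this point in the series ('s' snapshots bucket, 'c' centroid bucket, 'p' path key):

    package main

    import (
        "log"

        bolt "go.etcd.io/bbolt"
    )

    // checkpoint records the centroid index filename under
    // snapshots/centroid/path so a restart can find and reopen it.
    func checkpoint(db *bolt.DB, filename string) error {
        return db.Update(func(tx *bolt.Tx) error {
            snapshots, err := tx.CreateBucketIfNotExists([]byte{'s'})
            if err != nil {
                return err
            }
            centroid, err := snapshots.CreateBucketIfNotExists([]byte{'c'})
            if err != nil {
                return err
            }
            return centroid.Put([]byte{'p'}, []byte(filename))
        })
    }

    func main() {
        db, err := bolt.Open("root.bolt", 0600, nil)
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()
        if err := checkpoint(db, "centroid_index"); err != nil {
            log.Fatal(err)
        }
    }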
+ tx, err := s.rootBolt.Begin(true) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error starting bolt transaction: %v", err) + close(trainReq.ackCh) + } + defer tx.Rollback() + + snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error creating snapshots bucket: %v", err) + close(trainReq.ackCh) + } + + centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error creating centroid bucket: %v", err) + close(trainReq.ackCh) + } + + err = centroidBucket.Put(util.BoltPathKey, []byte(filename)) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error updating centroid bucket: %v", err) + close(trainReq.ackCh) + } + + // total number of vectors that have been processed so far for the training + n := binary.PutUvarint(buf, uint64(totalSamplesProcessed)) + err = centroidBucket.Put(util.BoltVecSamplesProcessedKey, buf[:n]) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error updating vec samples processed: %v", err) + close(trainReq.ackCh) + } + + err = tx.Commit() + if err != nil { + trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) + close(trainReq.ackCh) + } + + close(trainReq.ackCh) + } + } +} + func (s *Scorch) Train(batch *index.Batch) error { + // is the lock really needed? s.rootLock.Lock() defer s.rootLock.Unlock() @@ -586,7 +699,6 @@ func (s *Scorch) Train(batch *index.Batch) error { // // note: this might index text data too, how to handle this? s.segmentConfig? // todo: updates/deletes -> data drift detection - s.segmentConfig["training"] = true seg, n, err := s.segPlugin.NewEx(trainData, s.segmentConfig) if err != nil { @@ -594,65 +706,28 @@ func (s *Scorch) Train(batch *index.Batch) error { } // reset the training flag once completed s.segmentConfig["training"] = false - // not suffixing with .zap since the current garbage collection is tailored to .zap ext files - // we don't want to gc this file ever. - filename := "centroid_index" - path := filepath.Join(s.path, filename) - switch seg := seg.(type) { - case segment.UnpersistedSegment: - err = persistToDirectory(seg, nil, path) - if err != nil { - return err - } - default: - return fmt.Errorf("segment is not a unpersisted segment") + trainReq := &trainRequest{ + sample: seg, + vecCount: len(trainData), // todo: multivector support + ackCh: make(chan error), } - // persist and open the segment mmap mode. - persistedSegment, err := s.segPlugin.OpenEx(path, s.segmentConfig) + s.train <- trainReq + err = <-trainReq.ackCh if err != nil { return err } - s.centroidIndex = &SegmentSnapshot{ - segment: persistedSegment, - } - fmt.Println("number of bytes written to centroid index", n) - s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex - - // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint - // where we can be sure that the centroid index is available for the indexing operations downstream - // - // note: when the scale increases massively especially with real world dimensions of 1536+, this API - // will have to be refactored to persist in a more resource efficient way. so having this bolt related - // code will help in tracking the progress a lot better and avoid any redudant data streaming operations. 
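The samples-processed counter above goes into bolt as a uvarint. A round-trip sketch of that encoding, matching the PutUvarint call in the loop; the read side is what a resuming trainer would decode to skip the already-consumed prefix of the sample stream:

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func main() {
        // write side: encode the running total into a reusable buffer
        buf := make([]byte, binary.MaxVarintLen64)
        n := binary.PutUvarint(buf, 100000)
        stored := buf[:n] // this slice is what lands in the bolt bucket

        // read side: decode the checkpoint; read <= 0 signals a corrupt value
        decoded, read := binary.Uvarint(stored)
        if read <= 0 {
            panic("corrupt samples-processed counter")
        }
        fmt.Println(decoded) // 100000
    }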
- tx, err := s.rootBolt.Begin(true) + centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) if err != nil { return err } - defer tx.Rollback() - - snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) - if err != nil { - return err - } - - centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) - if err != nil { - return err - } - err = centroidBucket.Put(util.BoltPathKey, []byte(filename)) - if err != nil { - return err - } - - err = tx.Commit() - if err != nil { - return err + s.centroidIndex = &SegmentSnapshot{ + segment: centroidIndex, } - - return nil + fmt.Println("number of bytes written to centroid index", n) + return err } func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { diff --git a/util/keys.go b/util/keys.go index 11c918865..67415e782 100644 --- a/util/keys.go +++ b/util/keys.go @@ -18,6 +18,7 @@ var ( // Bolt keys BoltSnapshotsBucket = []byte{'s'} BoltCentroidIndexKey = []byte{'c'} + BoltVecSamplesProcessedKey = []byte{'v'} BoltPathKey = []byte{'p'} BoltDeletedKey = []byte{'d'} BoltInternalKey = []byte{'i'} From edde0cad9b737e23b9739eb9088888df1b204ec2 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 15 Dec 2025 11:36:44 -0800 Subject: [PATCH 15/25] regulate train function using EventKindIndexStart --- index/scorch/scorch.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 888bc9847..30bf7ea02 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -668,6 +668,8 @@ func (s *Scorch) trainerLoop() { } func (s *Scorch) Train(batch *index.Batch) error { + // regulate the Train function + s.FireIndexEvent() // is the lock really needed? s.rootLock.Lock() From 758ed77c2f5fd88e26f978a16e88396c727d22f5 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Fri, 9 Jan 2026 13:50:07 -0800 Subject: [PATCH 16/25] incremental training bug fixes + better recoverability --- index/scorch/scorch.go | 85 ++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 30bf7ea02..29900b770 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -281,6 +281,9 @@ func (s *Scorch) Open() error { s.asyncTasks.Add(1) go s.introducerLoop() + s.asyncTasks.Add(1) + go s.trainerLoop() + if !s.readOnly && s.path != "" { s.asyncTasks.Add(1) go s.persisterLoop() @@ -356,6 +359,7 @@ func (s *Scorch) openBolt() error { s.persisterNotifier = make(chan *epochWatcher, 1) s.closeCh = make(chan struct{}) s.forceMergeRequestCh = make(chan *mergerCtrl, 1) + s.train = make(chan *trainRequest) if !s.readOnly && s.path != "" { err := s.removeOldZapFiles() // Before persister or merger create any new files. @@ -565,9 +569,21 @@ func (s *Scorch) getInternal(key []byte) ([]byte, error) { return nil, nil } +func moveFile(sourcePath, destPath string) error { + // rename is supposed to be atomic on the same filesystem + err := os.Rename(sourcePath, destPath) + if err != nil { + return fmt.Errorf("error renaming file: %v", err) + } + return nil +} + // this is not a routine that will be running throughout the lifetime of the index. It's purpose // is to only train the vector index before the data ingestion starts. 
func (s *Scorch) trainerLoop() { + defer func() { + s.asyncTasks.Done() + }() // some init stuff s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex var totalSamplesProcessed int @@ -581,10 +597,6 @@ func (s *Scorch) trainerLoop() { case trainReq := <-s.train: sampleSeg := trainReq.sample if s.centroidIndex == nil { - // new centroid index - s.centroidIndex = &SegmentSnapshot{ - segment: sampleSeg, - } switch seg := sampleSeg.(type) { case segment.UnpersistedSegment: err := persistToDirectory(seg, nil, path) @@ -592,30 +604,35 @@ func (s *Scorch) trainerLoop() { // clean up this ugly ass error handling code trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err) close(trainReq.ackCh) + return } default: fmt.Errorf("segment is not a unpersisted segment") close(s.closeCh) + return } } else { // merge the new segment with the existing one, no need to persist? // persist in a tmp file and then rename - is that a fair strategy? + fmt.Println("merging centroid index") + s.segmentConfig["training"] = true _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, - []*roaring.Bitmap{nil, nil}, "centroid_index.tmp", s.closeCh, nil, s.segmentConfig) + []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, "centroid_index.tmp"), s.closeCh, nil, s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) close(trainReq.ackCh) } + // reset the training flag once completed + s.segmentConfig["training"] = false // close the existing centroid segment - it's supposed to be gc'd at this point s.centroidIndex.segment.Close() - err = os.Rename(filepath.Join(s.path, "centroid_index.tmp"), filepath.Join(s.path, "centroid_index")) + err = moveFile(filepath.Join(s.path, "centroid_index.tmp"), filepath.Join(s.path, "centroid_index")) if err != nil { trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err) close(trainReq.ackCh) } } - totalSamplesProcessed += trainReq.vecCount // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint // where we can be sure that the centroid index is available for the indexing operations downstream @@ -627,6 +644,7 @@ func (s *Scorch) trainerLoop() { if err != nil { trainReq.ackCh <- fmt.Errorf("error starting bolt transaction: %v", err) close(trainReq.ackCh) + return } defer tx.Rollback() @@ -634,18 +652,21 @@ func (s *Scorch) trainerLoop() { if err != nil { trainReq.ackCh <- fmt.Errorf("error creating snapshots bucket: %v", err) close(trainReq.ackCh) + return } centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) if err != nil { trainReq.ackCh <- fmt.Errorf("error creating centroid bucket: %v", err) close(trainReq.ackCh) + return } err = centroidBucket.Put(util.BoltPathKey, []byte(filename)) if err != nil { trainReq.ackCh <- fmt.Errorf("error updating centroid bucket: %v", err) close(trainReq.ackCh) + return } // total number of vectors that have been processed so far for the training @@ -654,14 +675,25 @@ func (s *Scorch) trainerLoop() { if err != nil { trainReq.ackCh <- fmt.Errorf("error updating vec samples processed: %v", err) close(trainReq.ackCh) + return } err = tx.Commit() if err != nil { trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) close(trainReq.ackCh) + return } + centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) + 
close(trainReq.ackCh) + return + } + s.centroidIndex = &SegmentSnapshot{ + segment: centroidIndex, + } close(trainReq.ackCh) } } @@ -671,25 +703,20 @@ func (s *Scorch) Train(batch *index.Batch) error { // regulate the Train function s.FireIndexEvent() - // is the lock really needed? - s.rootLock.Lock() - defer s.rootLock.Unlock() - if s.centroidIndex != nil { - // singleton API - return nil - } + // // is the lock really needed? + // s.rootLock.Lock() + // defer s.rootLock.Unlock() + var trainData []index.Document - if s.centroidIndex == nil { - for key, doc := range batch.IndexOps { - if doc != nil { - // insert _id field - // no need to track updates/deletes over here since - // the API is singleton - doc.AddIDField() - } - if strings.HasPrefix(key, index.TrainDataPrefix) { - trainData = append(trainData, doc) - } + for key, doc := range batch.IndexOps { + if doc != nil { + // insert _id field + // no need to track updates/deletes over here since + // the API is singleton + doc.AddIDField() + } + if strings.HasPrefix(key, index.TrainDataPrefix) { + trainData = append(trainData, doc) } } @@ -701,13 +728,10 @@ func (s *Scorch) Train(batch *index.Batch) error { // // note: this might index text data too, how to handle this? s.segmentConfig? // todo: updates/deletes -> data drift detection - s.segmentConfig["training"] = true seg, n, err := s.segPlugin.NewEx(trainData, s.segmentConfig) if err != nil { return err } - // reset the training flag once completed - s.segmentConfig["training"] = false trainReq := &trainRequest{ sample: seg, @@ -718,6 +742,7 @@ func (s *Scorch) Train(batch *index.Batch) error { s.train <- trainReq err = <-trainReq.ackCh if err != nil { + fmt.Println("error training", err) return err } @@ -728,6 +753,10 @@ func (s *Scorch) Train(batch *index.Batch) error { s.centroidIndex = &SegmentSnapshot{ segment: centroidIndex, } + _, err = s.getCentroidIndex("emb") + if err != nil { + return err + } fmt.Println("number of bytes written to centroid index", n) return err } From 734e2727eb15df922ad792c3aa1eb19aefdf1ac2 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 15 Jan 2026 13:28:53 -0800 Subject: [PATCH 17/25] cleanup: --- index/scorch/merge.go | 1 + index/scorch/persister.go | 2 +- index/scorch/scorch.go | 17 ++++++----------- index/scorch/segment_plugin.go | 1 + 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index bca9bbb81..e7f7cce3f 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -537,6 +537,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, filename := zapFileName(newSegmentID) path := s.path + string(os.PathSeparator) + filename + fmt.Println("version while merging", s.segPlugin.Version()) // the newly merged segment is already flushed out to disk, just needs // to be opened using mmap. 
newDocIDs, _, err := diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 3a4bdacc4..00c26a5d2 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -425,7 +425,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot, po *persiste var totSize int var numSegsToFlushOut int var totDocs uint64 - + fmt.Println("persister doing its thing") // legacy behaviour of merge + flush of all in-memory segments in one-shot if legacyFlushBehaviour(po.MaxSizeInMemoryMergePerWorker, po.NumPersisterWorkers) { val := &flushable{ diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 29900b770..8153f65fb 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -616,6 +616,7 @@ func (s *Scorch) trainerLoop() { // persist in a tmp file and then rename - is that a fair strategy? fmt.Println("merging centroid index") s.segmentConfig["training"] = true + fmt.Println("version while merging", s.segPlugin.Version()) _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, "centroid_index.tmp"), s.closeCh, nil, s.segmentConfig) if err != nil { @@ -685,6 +686,8 @@ func (s *Scorch) trainerLoop() { return } + // update the centroid index pointer + fmt.Println("version", s.segPlugin.Version()) centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) @@ -703,10 +706,6 @@ func (s *Scorch) Train(batch *index.Batch) error { // regulate the Train function s.FireIndexEvent() - // // is the lock really needed? - // s.rootLock.Lock() - // defer s.rootLock.Unlock() - var trainData []index.Document for key, doc := range batch.IndexOps { if doc != nil { @@ -745,14 +744,8 @@ func (s *Scorch) Train(batch *index.Batch) error { fmt.Println("error training", err) return err } + fmt.Println("got centroid index") - centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) - if err != nil { - return err - } - s.centroidIndex = &SegmentSnapshot{ - segment: centroidIndex, - } _, err = s.getCentroidIndex("emb") if err != nil { return err @@ -767,6 +760,8 @@ func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { if !ok { return nil, fmt.Errorf("segment is not a centroid index segment", s.centroidIndex.segment != nil) } + + fmt.Println("getting coarse quantizer", field) coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) if err != nil { return nil, err diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index 16be8e440..deeeffd3f 100644 --- a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -101,6 +101,7 @@ func RegisterSegmentPlugin(plugin SegmentPlugin, makeDefault bool) { } supportedSegmentPlugins[plugin.Type()][plugin.Version()] = plugin if makeDefault { + fmt.Println("registering default segment plugin", plugin.Type(), plugin.Version()) defaultSegmentPlugin = plugin } } From a41f99c5c76da1cbcdc858ed4d8f0d5c634c0c68 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 26 Jan 2026 12:59:07 -0800 Subject: [PATCH 18/25] cleanup and refactor the code to have the foundational stuff --- index/scorch/merge.go | 2 ++ index/scorch/persister.go | 6 ++---- index/scorch/scorch.go | 42 ++++++++++++++++++++------------------- util/keys.go | 2 +- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/index/scorch/merge.go 
b/index/scorch/merge.go index e7f7cce3f..e75e1c23e 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -335,6 +335,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) mergedSegHistory := make(map[uint64]*mergedSegmentHistory, len(task.Segments)) + var files []string for _, planSegment := range task.Segments { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { oldMap[segSnapshot.id] = segSnapshot @@ -350,6 +351,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, } else { segmentsToMerge = append(segmentsToMerge, segSnapshot.segment) docsToDrop = append(docsToDrop, segSnapshot.deleted) + files = append(files, persistedSeg.Path()) } // track the files getting merged for unsetting the // removal ineligibility. This helps to unflip files diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 00c26a5d2..1d118bb50 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -853,18 +853,16 @@ func zapFileName(epoch uint64) string { return fmt.Sprintf("%012x.zap", epoch) } -func (s *Scorch) updateCentroidIndex(bucket *bolt.Bucket) error { +func (s *Scorch) loadCentroidIndex(bucket *bolt.Bucket) error { if bucket == nil { return nil } - fmt.Println("updateCentroidIndex bucket", bucket != nil) segmentSnapshot, err := s.loadSegment(bucket) if err != nil { return err } s.rootLock.Lock() defer s.rootLock.Unlock() - fmt.Println("updateCentroidIndex", segmentSnapshot.segment != nil) s.centroidIndex = segmentSnapshot return nil } @@ -928,7 +926,7 @@ func (s *Scorch) loadFromBolt() error { } centroidIndexBucket := snapshots.Bucket(util.BoltCentroidIndexKey) - err := s.updateCentroidIndex(centroidIndexBucket) + err := s.loadCentroidIndex(centroidIndexBucket) if err != nil { return err } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 8153f65fb..cb5367117 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -16,7 +16,6 @@ package scorch import ( "bytes" - "encoding/binary" "encoding/json" "fmt" "io" @@ -26,6 +25,7 @@ import ( "sync" "sync/atomic" "time" + "unsafe" "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/registry" @@ -563,7 +563,8 @@ func (s *Scorch) getInternal(key []byte) ([]byte, error) { defer s.rootLock.RUnlock() // todo: return the total number of vectors that have been processed so far in training // in cbft use that as a checkpoint to resume training for n-x samples. - if string(key) == "_centroid_index_complete" { + switch string(key) { + case string(util.BoltTrainCompleteKey): return []byte(fmt.Sprintf("%t", s.centroidIndex != nil)), nil } return nil, nil @@ -578,6 +579,10 @@ func moveFile(sourcePath, destPath string) error { return nil } +func boolToByte(b bool) byte { + return *(*byte)(unsafe.Pointer(&b)) +} + // this is not a routine that will be running throughout the lifetime of the index. It's purpose // is to only train the vector index before the data ingestion starts. func (s *Scorch) trainerLoop() { @@ -589,7 +594,6 @@ func (s *Scorch) trainerLoop() { var totalSamplesProcessed int filename := "centroid_index" path := filepath.Join(s.path, filename) - buf := make([]byte, binary.MaxVarintLen64) for { select { case <-s.closeCh: @@ -614,9 +618,7 @@ func (s *Scorch) trainerLoop() { } else { // merge the new segment with the existing one, no need to persist? // persist in a tmp file and then rename - is that a fair strategy? 
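On the "tmp file then rename" question the comment raises: os.Rename replaces the destination atomically when source and destination sit on the same filesystem, so a concurrent reader sees either the old centroid index or the new one, never a torn write. A minimal sketch of the strategy; a fully durable variant would also fsync the file and its parent directory before renaming:

    package main

    import (
        "log"
        "os"
    )

    // replaceAtomically stages data in path+".tmp" and swaps it into place.
    // The rename is atomic on POSIX filesystems within a single mount.
    func replaceAtomically(path string, data []byte) error {
        tmp := path + ".tmp"
        if err := os.WriteFile(tmp, data, 0600); err != nil {
            return err
        }
        return os.Rename(tmp, path)
    }

    func main() {
        if err := replaceAtomically("centroid_index", []byte("...")); err != nil {
            log.Fatal(err)
        }
    }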
- fmt.Println("merging centroid index") s.segmentConfig["training"] = true - fmt.Println("version while merging", s.segPlugin.Version()) _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, "centroid_index.tmp"), s.closeCh, nil, s.segmentConfig) if err != nil { @@ -647,7 +649,11 @@ func (s *Scorch) trainerLoop() { close(trainReq.ackCh) return } - defer tx.Rollback() + defer func() { + if err != nil { + _ = tx.Rollback() + } + }() snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) if err != nil { @@ -670,16 +676,14 @@ func (s *Scorch) trainerLoop() { return } - // total number of vectors that have been processed so far for the training - n := binary.PutUvarint(buf, uint64(totalSamplesProcessed)) - err = centroidBucket.Put(util.BoltVecSamplesProcessedKey, buf[:n]) + err = tx.Commit() if err != nil { - trainReq.ackCh <- fmt.Errorf("error updating vec samples processed: %v", err) + trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) close(trainReq.ackCh) return } - err = tx.Commit() + err = s.rootBolt.Sync() if err != nil { trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) close(trainReq.ackCh) @@ -687,7 +691,6 @@ func (s *Scorch) trainerLoop() { } // update the centroid index pointer - fmt.Println("version", s.segPlugin.Version()) centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) @@ -706,6 +709,8 @@ func (s *Scorch) Train(batch *index.Batch) error { // regulate the Train function s.FireIndexEvent() + // batch.InternalOps + var trainData []index.Document for key, doc := range batch.IndexOps { if doc != nil { @@ -741,15 +746,9 @@ func (s *Scorch) Train(batch *index.Batch) error { s.train <- trainReq err = <-trainReq.ackCh if err != nil { - fmt.Println("error training", err) return err } - fmt.Println("got centroid index") - _, err = s.getCentroidIndex("emb") - if err != nil { - return err - } fmt.Println("number of bytes written to centroid index", n) return err } @@ -761,7 +760,6 @@ func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { return nil, fmt.Errorf("segment is not a centroid index segment", s.centroidIndex.segment != nil) } - fmt.Println("getting coarse quantizer", field) coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) if err != nil { return nil, err @@ -1219,7 +1217,11 @@ func (s *Scorch) UpdateFileInBolt(key []byte, value []byte) error { if err != nil { return err } - defer tx.Rollback() + defer func() { + if err != nil { + _ = tx.Rollback() + } + }() snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) if err != nil { diff --git a/util/keys.go b/util/keys.go index 67415e782..a1f3bfbbf 100644 --- a/util/keys.go +++ b/util/keys.go @@ -18,7 +18,7 @@ var ( // Bolt keys BoltSnapshotsBucket = []byte{'s'} BoltCentroidIndexKey = []byte{'c'} - BoltVecSamplesProcessedKey = []byte{'v'} + BoltTrainCompleteKey = []byte{'t'} BoltPathKey = []byte{'p'} BoltDeletedKey = []byte{'d'} BoltInternalKey = []byte{'i'} From 0331a9317f18b765814daab93b9afc370ed662c1 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 29 Jan 2026 11:02:24 -0800 Subject: [PATCH 19/25] refactor file transfer --- index.go | 7 --- index/scorch/merge.go | 1 - index/scorch/persister.go | 6 -- index/scorch/scorch.go | 100 ++------------------------------- 
index/scorch/segment_plugin.go | 1 - index/scorch/snapshot_index.go | 3 +- index_impl.go | 37 ------------ 7 files changed, 7 insertions(+), 148 deletions(-) diff --git a/index.go b/index.go index bd5421d85..c083787c4 100644 --- a/index.go +++ b/index.go @@ -51,12 +51,10 @@ func (b *Batch) Index(id string, data interface{}) error { eventIndex.FireIndexEvent() } doc := document.NewDocument(id) - // fmt.Printf("data is before mapping %#v\n", data) err := b.index.Mapping().MapDocument(doc, data) if err != nil { return err } - // fmt.Printf("data is after mapping %#v\n", doc) b.internal.Update(doc) b.lastDocSize = uint64(doc.Size() + @@ -355,11 +353,6 @@ type IndexCopyable interface { CopyTo(d index.Directory) error } -type IndexFileCopyable interface { - UpdateFileInBolt(key []byte, value []byte) error - CopyFile(file string, d index.IndexDirectory) error -} - // FileSystemDirectory is the default implementation for the // index.Directory interface. type FileSystemDirectory string diff --git a/index/scorch/merge.go b/index/scorch/merge.go index e75e1c23e..32de86bd4 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -539,7 +539,6 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, filename := zapFileName(newSegmentID) path := s.path + string(os.PathSeparator) + filename - fmt.Println("version while merging", s.segPlugin.Version()) // the newly merged segment is already flushed out to disk, just needs // to be opened using mmap. newDocIDs, _, err := diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 1d118bb50..fb250001b 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -425,7 +425,6 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot, po *persiste var totSize int var numSegsToFlushOut int var totDocs uint64 - fmt.Println("persister doing its thing") // legacy behaviour of merge + flush of all in-memory segments in one-shot if legacyFlushBehaviour(po.MaxSizeInMemoryMergePerWorker, po.NumPersisterWorkers) { val := &flushable{ @@ -887,12 +886,7 @@ func (s *Scorch) loadFromBolt() error { s.AddEligibleForRemoval(snapshotEpoch) continue } - // fmt.Println("loadFromBolt key %s", k) - // if k[0] == util.BoltCentroidIndexKey[0] { - // fmt.Println("loadFromBolt centroid index key", string(k)) - // continue - // } snapshot := snapshots.Bucket(k) if snapshot == nil { log.Printf("snapshot key, but bucket missing %x, continuing", k) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index cb5367117..5fa8483e7 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -15,10 +15,8 @@ package scorch import ( - "bytes" "encoding/json" "fmt" - "io" "os" "path/filepath" "strings" @@ -83,7 +81,7 @@ type Scorch struct { persisterNotifier chan *epochWatcher rootBolt *bolt.DB asyncTasks sync.WaitGroup - // not a real searchable segment, singleton + // not a real searchable segment centroidIndex *SegmentSnapshot train chan *trainRequest @@ -184,7 +182,6 @@ func NewScorch(storeName string, } } - // "pretraining": true segConfig, ok := config["segmentConfig"].(map[string]interface{}) if ok { rv.segmentConfig = segConfig @@ -592,7 +589,7 @@ func (s *Scorch) trainerLoop() { // some init stuff s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex var totalSamplesProcessed int - filename := "centroid_index" + filename := index.CentroidIndexFileName path := filepath.Join(s.path, filename) for { select { @@ -620,7 +617,7 @@ func (s *Scorch) trainerLoop() { // persist in a tmp file and then 
rename - is that a fair strategy? s.segmentConfig["training"] = true _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, - []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, "centroid_index.tmp"), s.closeCh, nil, s.segmentConfig) + []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, index.CentroidIndexFileName+".tmp"), s.closeCh, nil, s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) close(trainReq.ackCh) @@ -630,7 +627,7 @@ func (s *Scorch) trainerLoop() { // close the existing centroid segment - it's supposed to be gc'd at this point s.centroidIndex.segment.Close() - err = moveFile(filepath.Join(s.path, "centroid_index.tmp"), filepath.Join(s.path, "centroid_index")) + err = moveFile(filepath.Join(s.path, index.CentroidIndexFileName+".tmp"), filepath.Join(s.path, index.CentroidIndexFileName)) if err != nil { trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err) close(trainReq.ackCh) @@ -691,7 +688,7 @@ func (s *Scorch) trainerLoop() { } // update the centroid index pointer - centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, "centroid_index"), s.segmentConfig) + centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, index.centroidIndexFileName), s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) close(trainReq.ackCh) @@ -709,8 +706,6 @@ func (s *Scorch) Train(batch *index.Batch) error { // regulate the Train function s.FireIndexEvent() - // batch.InternalOps - var trainData []index.Document for key, doc := range batch.IndexOps { if doc != nil { @@ -732,7 +727,7 @@ func (s *Scorch) Train(batch *index.Batch) error { // // note: this might index text data too, how to handle this? s.segmentConfig? 
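Train selects its samples purely by key prefix. A self-contained sketch of that filter, with a placeholder "train:" prefix standing in for index.TrainDataPrefix, whose actual value lives in the bleve_index_api fork this series builds against:

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        const trainDataPrefix = "train:" // stand-in for index.TrainDataPrefix

        // a mixed batch: only prefixed keys become training samples
        ops := map[string]string{
            "train:doc1": "sample vector doc",
            "doc2":       "regular doc",
        }

        var trainData []string
        for key, doc := range ops {
            if strings.HasPrefix(key, trainDataPrefix) {
                trainData = append(trainData, doc)
            }
        }
        fmt.Println(len(trainData)) // 1
    }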
// todo: updates/deletes -> data drift detection - seg, n, err := s.segPlugin.NewEx(trainData, s.segmentConfig) + seg, _, err := s.segPlugin.NewEx(trainData, s.segmentConfig) if err != nil { return err } @@ -749,7 +744,6 @@ func (s *Scorch) Train(batch *index.Batch) error { return err } - fmt.Println("number of bytes written to centroid index", n) return err } @@ -1212,88 +1206,6 @@ func (s *Scorch) CopyReader() index.CopyReader { return rv } -func (s *Scorch) UpdateFileInBolt(key []byte, value []byte) error { - tx, err := s.rootBolt.Begin(true) - if err != nil { - return err - } - defer func() { - if err != nil { - _ = tx.Rollback() - } - }() - - snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) - if err != nil { - return err - } - - // currently this is specific to centroid index file update - if bytes.Equal(key, util.BoltCentroidIndexKey) { - // todo: guard against duplicate updates - centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) - if err != nil { - return err - } - if centroidBucket == nil { - return fmt.Errorf("centroid bucket not found") - } - existingValue := centroidBucket.Get(util.BoltPathKey) - if existingValue != nil { - return fmt.Errorf("key already exists %v %v", s.path, string(existingValue)) - } - - err = centroidBucket.Put(util.BoltPathKey, value) - if err != nil { - return err - } - } - - err = tx.Commit() - if err != nil { - return err - } - - err = s.rootBolt.Sync() - if err != nil { - return err - } - - return nil -} - -// CopyFile copies a specific file to a destination directory which has an access to a bleve index -// doing a io.Copy() isn't enough because the file needs to be tracked in bolt file as well -func (s *Scorch) CopyFile(file string, d index.IndexDirectory) error { - s.rootLock.Lock() - defer s.rootLock.Unlock() - - // this code is currently specific to centroid index file but is future proofed for other files - // to be updated in the dest's bolt - if strings.HasSuffix(file, "centroid_index") { - // centroid index file - this is outside the snapshots domain so the bolt update is different - err := d.UpdateFileInBolt(util.BoltCentroidIndexKey, []byte(file)) - if err != nil { - return fmt.Errorf("error updating dest index bolt: %w", err) - } - } - - dest, err := d.GetWriter(filepath.Join("store", file)) - if err != nil { - return err - } - - source, err := os.Open(filepath.Join(s.path, file)) - if err != nil { - return err - } - - defer source.Close() - defer dest.Close() - _, err = io.Copy(dest, source) - return err -} - // external API to fire a scorch event (EventKindIndexStart) externally from bleve func (s *Scorch) FireIndexEvent() { s.fireEvent(EventKindIndexStart, 0) diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index deeeffd3f..16be8e440 100644 --- a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -101,7 +101,6 @@ func RegisterSegmentPlugin(plugin SegmentPlugin, makeDefault bool) { } supportedSegmentPlugins[plugin.Type()][plugin.Version()] = plugin if makeDefault { - fmt.Println("registering default segment plugin", plugin.Type(), plugin.Version()) defaultSegmentPlugin = plugin } } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 3585b31d8..688f9d903 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -66,8 +66,7 @@ func init() { } type IndexSnapshot struct { - parent *Scorch - + parent *Scorch segment []*SegmentSnapshot offsets []uint64 internal 
map[string][]byte diff --git a/index_impl.go b/index_impl.go index ec567083f..0d7e1dd4d 100644 --- a/index_impl.go +++ b/index_impl.go @@ -326,13 +326,11 @@ func (i *indexImpl) Index(id string, data interface{}) (err error) { i.FireIndexEvent() - // fmt.Printf("data is %#v\n", data) doc := document.NewDocument(id) err = i.m.MapDocument(doc, data) if err != nil { return } - // fmt.Printf("data is after mapping %#v\n", doc) err = i.i.Update(doc) return } @@ -1432,39 +1430,6 @@ func (m *searchHitSorter) Less(i, j int) bool { return c < 0 } -func (i *indexImpl) CopyFile(file string, d index.IndexDirectory) (err error) { - i.mutex.RLock() - defer i.mutex.RUnlock() - - if !i.open { - return ErrorIndexClosed - } - - copyIndex, ok := i.i.(index.IndexFileCopyable) - if !ok { - return fmt.Errorf("index implementation does not support copy reader") - } - - return copyIndex.CopyFile(file, d) -} - -func (i *indexImpl) UpdateFileInBolt(key []byte, value []byte) error { - i.mutex.RLock() - defer i.mutex.RUnlock() - - if !i.open { - return ErrorIndexClosed - } - - copyIndex, ok := i.i.(index.IndexFileCopyable) - if !ok { - return fmt.Errorf("index implementation does not support file copy") - } - - return copyIndex.UpdateFileInBolt(key, value) -} - -// CopyTo (index.Directory, filter) func (i *indexImpl) CopyTo(d index.Directory) (err error) { i.mutex.RLock() defer i.mutex.RUnlock() @@ -1478,8 +1443,6 @@ func (i *indexImpl) CopyTo(d index.Directory) (err error) { return fmt.Errorf("index implementation does not support copy reader") } - // copyIndex.Copy() -> copies the centroid index - copyReader := copyIndex.CopyReader() if copyReader == nil { return fmt.Errorf("index's copyReader is nil") From 8bb49d0f2d0535367ab1528f11eea2ee7b2522b9 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 29 Jan 2026 12:55:50 -0800 Subject: [PATCH 20/25] fix var name --- index/scorch/scorch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 5fa8483e7..ad6c7f558 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -688,7 +688,7 @@ func (s *Scorch) trainerLoop() { } // update the centroid index pointer - centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, index.centroidIndexFileName), s.segmentConfig) + centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, index.CentroidIndexFileName), s.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) close(trainReq.ackCh) From cca945a0ada953a61d9a718f5bfbc0c5b15f3b7d Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 29 Jan 2026 13:38:16 -0800 Subject: [PATCH 21/25] refactor the trainer --- index/scorch/persister.go | 18 +-- index/scorch/scorch.go | 216 ++---------------------------- index/scorch/train.go | 261 +++++++++++++++++++++++++++++++++++++ index/scorch/train_noop.go | 25 ++++ util/keys.go | 4 +- 5 files changed, 305 insertions(+), 219 deletions(-) create mode 100644 index/scorch/train.go create mode 100644 index/scorch/train_noop.go diff --git a/index/scorch/persister.go b/index/scorch/persister.go index fb250001b..4aad12900 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -852,18 +852,8 @@ func zapFileName(epoch uint64) string { return fmt.Sprintf("%012x.zap", epoch) } -func (s *Scorch) loadCentroidIndex(bucket *bolt.Bucket) error { - if bucket == nil { - return nil - } - segmentSnapshot, err := s.loadSegment(bucket) - if err != nil { - return err - } - s.rootLock.Lock() - defer 
s.rootLock.Unlock() - s.centroidIndex = segmentSnapshot - return nil +func (s *Scorch) loadTrainedData(bucket *bolt.Bucket) error { + return s.trainer.loadTrainedData(bucket) } // bolt snapshot code @@ -919,8 +909,8 @@ func (s *Scorch) loadFromBolt() error { foundRoot = true } - centroidIndexBucket := snapshots.Bucket(util.BoltCentroidIndexKey) - err := s.loadCentroidIndex(centroidIndexBucket) + trainerBucket := snapshots.Bucket(util.BoltTrainerKey) + err := s.trainer.loadTrainedData(trainerBucket) if err != nil { return err } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index ad6c7f558..0a004eb8d 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -19,7 +19,6 @@ import ( "fmt" "os" "path/filepath" - "strings" "sync" "sync/atomic" "time" @@ -29,7 +28,6 @@ import ( "github.com/blevesearch/bleve/v2/registry" "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" - "github.com/blevesearch/go-faiss" segment "github.com/blevesearch/scorch_segment_api/v2" bolt "go.etcd.io/bbolt" ) @@ -81,9 +79,8 @@ type Scorch struct { persisterNotifier chan *epochWatcher rootBolt *bolt.DB asyncTasks sync.WaitGroup - // not a real searchable segment - centroidIndex *SegmentSnapshot - train chan *trainRequest + + trainer trainer onEvent func(event Event) bool onAsyncError func(err error, path string) @@ -95,18 +92,19 @@ type Scorch struct { spatialPlugin index.SpatialAnalyzerPlugin } +type trainer interface { + trainLoop() + train(batch *index.Batch) error + loadTrainedData(*bolt.Bucket) error + getInternal(key []byte) ([]byte, error) +} + type ScorchErrorType string func (t ScorchErrorType) Error() string { return string(t) } -type trainRequest struct { - sample segment.Segment - vecCount int - ackCh chan error -} - // ErrType values for ScorchError const ( ErrAsyncPanic = ScorchErrorType("async panic error") @@ -279,7 +277,7 @@ func (s *Scorch) Open() error { go s.introducerLoop() s.asyncTasks.Add(1) - go s.trainerLoop() + go s.trainer.trainLoop() if !s.readOnly && s.path != "" { s.asyncTasks.Add(1) @@ -356,7 +354,6 @@ func (s *Scorch) openBolt() error { s.persisterNotifier = make(chan *epochWatcher, 1) s.closeCh = make(chan struct{}) s.forceMergeRequestCh = make(chan *mergerCtrl, 1) - s.train = make(chan *trainRequest) if !s.readOnly && s.path != "" { err := s.removeOldZapFiles() // Before persister or merger create any new files. @@ -558,207 +555,20 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { func (s *Scorch) getInternal(key []byte) ([]byte, error) { s.rootLock.RLock() defer s.rootLock.RUnlock() - // todo: return the total number of vectors that have been processed so far in training - // in cbft use that as a checkpoint to resume training for n-x samples. + switch string(key) { case string(util.BoltTrainCompleteKey): - return []byte(fmt.Sprintf("%t", s.centroidIndex != nil)), nil + return s.trainer.getInternal(key) } return nil, nil } -func moveFile(sourcePath, destPath string) error { - // rename is supposed to be atomic on the same filesystem - err := os.Rename(sourcePath, destPath) - if err != nil { - return fmt.Errorf("error renaming file: %v", err) - } - return nil -} - func boolToByte(b bool) byte { return *(*byte)(unsafe.Pointer(&b)) } -// this is not a routine that will be running throughout the lifetime of the index. It's purpose -// is to only train the vector index before the data ingestion starts. 
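Behind the trainer interface introduced in this patch, Train still hands work to the loop goroutine through a request that carries its own ack channel, so all centroid-index state stays owned by one goroutine while callers get a synchronous error back. A stripped-down sketch of that handshake, with the segment payload elided:

    package main

    import (
        "errors"
        "fmt"
    )

    type trainRequest struct {
        vecCount int
        ackCh    chan error
    }

    // trainerLoop owns all trainer state; closing ackCh (with or without a
    // prior error send) is the completion signal the caller waits on.
    func trainerLoop(trainCh chan *trainRequest, closeCh chan struct{}) {
        for {
            select {
            case <-closeCh:
                return
            case req := <-trainCh:
                if req.vecCount == 0 {
                    req.ackCh <- errors.New("empty sample")
                }
                close(req.ackCh)
            }
        }
    }

    func main() {
        trainCh := make(chan *trainRequest)
        closeCh := make(chan struct{})
        go trainerLoop(trainCh, closeCh)

        req := &trainRequest{vecCount: 128, ackCh: make(chan error)}
        trainCh <- req
        // a receive on the closed channel yields nil when no error was sent
        if err := <-req.ackCh; err != nil {
            fmt.Println("train failed:", err)
        }
        close(closeCh)
    }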
-func (s *Scorch) trainerLoop() { - defer func() { - s.asyncTasks.Done() - }() - // some init stuff - s.segmentConfig["getCentroidIndexCallback"] = s.getCentroidIndex - var totalSamplesProcessed int - filename := index.CentroidIndexFileName - path := filepath.Join(s.path, filename) - for { - select { - case <-s.closeCh: - return - case trainReq := <-s.train: - sampleSeg := trainReq.sample - if s.centroidIndex == nil { - switch seg := sampleSeg.(type) { - case segment.UnpersistedSegment: - err := persistToDirectory(seg, nil, path) - if err != nil { - // clean up this ugly ass error handling code - trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err) - close(trainReq.ackCh) - return - } - default: - fmt.Errorf("segment is not a unpersisted segment") - close(s.closeCh) - return - } - } else { - // merge the new segment with the existing one, no need to persist? - // persist in a tmp file and then rename - is that a fair strategy? - s.segmentConfig["training"] = true - _, _, err := s.segPlugin.MergeEx([]segment.Segment{s.centroidIndex.segment, sampleSeg}, - []*roaring.Bitmap{nil, nil}, filepath.Join(s.path, index.CentroidIndexFileName+".tmp"), s.closeCh, nil, s.segmentConfig) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) - close(trainReq.ackCh) - } - // reset the training flag once completed - s.segmentConfig["training"] = false - - // close the existing centroid segment - it's supposed to be gc'd at this point - s.centroidIndex.segment.Close() - err = moveFile(filepath.Join(s.path, index.CentroidIndexFileName+".tmp"), filepath.Join(s.path, index.CentroidIndexFileName)) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err) - close(trainReq.ackCh) - } - } - totalSamplesProcessed += trainReq.vecCount - // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint - // where we can be sure that the centroid index is available for the indexing operations downstream - // - // note: when the scale increases massively especially with real world dimensions of 1536+, this API - // will have to be refactored to persist in a more resource efficient way. so having this bolt related - // code will help in tracking the progress a lot better and avoid any redudant data streaming operations. 
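The transaction code being carved out of scorch.go here keeps the rollback defer that fires only on error. bbolt also tolerates the simpler unconditional defer tx.Rollback() (rolling back an already-committed transaction just returns ErrTxClosed), but the conditional form makes intent explicit; it relies on err being the captured named return, as in this sketch:

    package main

    import (
        "log"

        bolt "go.etcd.io/bbolt"
    )

    // writeKey shows the conditional-rollback shape: the deferred closure
    // reads the named return err, so the rollback runs only on failure paths.
    func writeKey(db *bolt.DB, key, val []byte) (err error) {
        tx, err := db.Begin(true)
        if err != nil {
            return err
        }
        defer func() {
            if err != nil {
                _ = tx.Rollback()
            }
        }()

        b, err := tx.CreateBucketIfNotExists([]byte{'s'})
        if err != nil {
            return err
        }
        if err = b.Put(key, val); err != nil {
            return err
        }
        return tx.Commit() // on failure, the defer sees the non-nil err
    }

    func main() {
        db, err := bolt.Open("root.bolt", 0600, nil)
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()
        if err := writeKey(db, []byte{'p'}, []byte("centroid_index")); err != nil {
            log.Fatal(err)
        }
    }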
- tx, err := s.rootBolt.Begin(true) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error starting bolt transaction: %v", err) - close(trainReq.ackCh) - return - } - defer func() { - if err != nil { - _ = tx.Rollback() - } - }() - - snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error creating snapshots bucket: %v", err) - close(trainReq.ackCh) - return - } - - centroidBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltCentroidIndexKey) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error creating centroid bucket: %v", err) - close(trainReq.ackCh) - return - } - - err = centroidBucket.Put(util.BoltPathKey, []byte(filename)) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error updating centroid bucket: %v", err) - close(trainReq.ackCh) - return - } - - err = tx.Commit() - if err != nil { - trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) - close(trainReq.ackCh) - return - } - - err = s.rootBolt.Sync() - if err != nil { - trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) - close(trainReq.ackCh) - return - } - - // update the centroid index pointer - centroidIndex, err := s.segPlugin.OpenEx(filepath.Join(s.path, index.CentroidIndexFileName), s.segmentConfig) - if err != nil { - trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) - close(trainReq.ackCh) - return - } - s.centroidIndex = &SegmentSnapshot{ - segment: centroidIndex, - } - close(trainReq.ackCh) - } - } -} - func (s *Scorch) Train(batch *index.Batch) error { - // regulate the Train function - s.FireIndexEvent() - - var trainData []index.Document - for key, doc := range batch.IndexOps { - if doc != nil { - // insert _id field - // no need to track updates/deletes over here since - // the API is singleton - doc.AddIDField() - } - if strings.HasPrefix(key, index.TrainDataPrefix) { - trainData = append(trainData, doc) - } - } - - // just builds a new vector index out of the train data provided - // it'll be an IVF index so the centroids are computed at this stage and - // this template will be used in the indexing down the line to index - // the data vectors. s.segmentConfig will mark this as a training phase - // and zap will handle it accordingly. - // - // note: this might index text data too, how to handle this? s.segmentConfig? 
- // todo: updates/deletes -> data drift detection - seg, _, err := s.segPlugin.NewEx(trainData, s.segmentConfig) - if err != nil { - return err - } - - trainReq := &trainRequest{ - sample: seg, - vecCount: len(trainData), // todo: multivector support - ackCh: make(chan error), - } - - s.train <- trainReq - err = <-trainReq.ackCh - if err != nil { - return err - } - - return err -} - -func (s *Scorch) getCentroidIndex(field string) (*faiss.IndexImpl, error) { - // return the coarse quantizer of the centroid index belonging to the field - centroidIndexSegment, ok := s.centroidIndex.segment.(segment.CentroidIndexSegment) - if !ok { - return nil, fmt.Errorf("segment is not a centroid index segment", s.centroidIndex.segment != nil) - } - - coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) - if err != nil { - return nil, err - } - return coarseQuantizer, nil + return s.trainer.train(batch) } func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, diff --git a/index/scorch/train.go b/index/scorch/train.go new file mode 100644 index 000000000..2c2777fcf --- /dev/null +++ b/index/scorch/train.go @@ -0,0 +1,261 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package scorch + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/RoaringBitmap/roaring/v2" + "github.com/blevesearch/bleve/v2/util" + index "github.com/blevesearch/bleve_index_api" + "github.com/blevesearch/go-faiss" + segment "github.com/blevesearch/scorch_segment_api/v2" + bolt "go.etcd.io/bbolt" +) + +type trainRequest struct { + sample segment.Segment + vecCount int + ackCh chan error +} + +type vectorTrainer struct { + parent *Scorch + + // not a real searchable segment + centroidIndex *SegmentSnapshot + trainCh chan *trainRequest +} + +func moveFile(sourcePath, destPath string) error { + // rename is supposed to be atomic on the same filesystem + err := os.Rename(sourcePath, destPath) + if err != nil { + return fmt.Errorf("error renaming file: %v", err) + } + return nil +} + +// this is not a routine that will be running throughout the lifetime of the index. It's purpose +// is to only train the vector index before the data ingestion starts. 
+func (t *vectorTrainer) trainerLoop() { + defer func() { + t.parent.asyncTasks.Done() + }() + // initialize stuff + t.parent.segmentConfig["getCentroidIndexCallback"] = t.getCentroidIndex + t.trainCh = make(chan *trainRequest) + var totalSamplesProcessed int + filename := index.CentroidIndexFileName + path := filepath.Join(t.parent.path, filename) + for { + select { + case <-t.parent.closeCh: + return + case trainReq := <-t.trainCh: + sampleSeg := trainReq.sample + if t.centroidIndex == nil { + switch seg := sampleSeg.(type) { + case segment.UnpersistedSegment: + err := persistToDirectory(seg, nil, path) + if err != nil { + // clean up this ugly ass error handling code + trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err) + close(trainReq.ackCh) + return + } + default: + fmt.Errorf("segment is not a unpersisted segment") + close(t.parent.closeCh) + return + } + } else { + // merge the new segment with the existing one, no need to persist? + // persist in a tmp file and then rename - is that a fair strategy? + t.parent.segmentConfig["training"] = true + _, _, err := t.parent.segPlugin.MergeEx([]segment.Segment{t.centroidIndex.segment, sampleSeg}, + []*roaring.Bitmap{nil, nil}, filepath.Join(t.parent.path, filename+".tmp"), t.parent.closeCh, nil, t.parent.segmentConfig) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) + close(trainReq.ackCh) + } + // reset the training flag once completed + t.parent.segmentConfig["training"] = false + + // close the existing centroid segment - it's supposed to be gc'd at this point + t.centroidIndex.segment.Close() + err = moveFile(filepath.Join(t.parent.path, filename+".tmp"), filepath.Join(t.parent.path, filename)) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err) + close(trainReq.ackCh) + } + } + totalSamplesProcessed += trainReq.vecCount + // a bolt transaction is necessary for failover-recovery scenario and also serves as a checkpoint + // where we can be sure that the centroid index is available for the indexing operations downstream + // + // note: when the scale increases massively especially with real world dimensions of 1536+, this API + // will have to be refactored to persist in a more resource efficient way. so having this bolt related + // code will help in tracking the progress a lot better and avoid any redudant data streaming operations. 
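The getCentroidIndexCallback entry this loop installs is a function value carried through the config map; the plugin side has to recover it with a type assertion that matches the stored function type exactly. A toy sketch of the pattern, using a string-returning stand-in for the real func(string) (*faiss.IndexImpl, error):

    package main

    import "fmt"

    func main() {
        config := map[string]interface{}{}

        // producer side: drop the callback into the shared config
        config["getCentroidIndexCallback"] = func(field string) (string, error) {
            return "coarse-quantizer-for-" + field, nil
        }

        // consumer side: the asserted type must match the stored type exactly
        cb, ok := config["getCentroidIndexCallback"].(func(string) (string, error))
        if !ok {
            panic("callback missing or of unexpected type")
        }
        fmt.Println(cb("vector"))
    }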
+			tx, err := t.parent.rootBolt.Begin(true)
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error starting bolt transaction: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+			defer func() {
+				if err != nil {
+					_ = tx.Rollback()
+				}
+			}()
+
+			snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket)
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error creating snapshots bucket: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+
+			trainerBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltTrainerKey)
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error creating centroid bucket: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+
+			err = trainerBucket.Put(util.BoltPathKey, []byte(filename))
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error updating centroid bucket: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+
+			err = tx.Commit()
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+
+			err = t.parent.rootBolt.Sync()
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+
+			// update the centroid index pointer
+			centroidIndex, err := t.parent.segPlugin.OpenEx(filepath.Join(t.parent.path, index.CentroidIndexFileName), t.parent.segmentConfig)
+			if err != nil {
+				trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err)
+				close(trainReq.ackCh)
+				return
+			}
+			t.centroidIndex = &SegmentSnapshot{
+				segment: centroidIndex,
+			}
+			close(trainReq.ackCh)
+		}
+	}
+}
+
+func (t *vectorTrainer) loadTrainedData(bucket *bolt.Bucket) error {
+	if bucket == nil {
+		return nil
+	}
+	segmentSnapshot, err := t.parent.loadSegment(bucket)
+	if err != nil {
+		return err
+	}
+	t.parent.rootLock.Lock()
+	defer t.parent.rootLock.Unlock()
+	t.centroidIndex = segmentSnapshot
+	return nil
+}
+
+func (t *vectorTrainer) train(batch *index.Batch) error {
+	// regulate the Train function
+	t.parent.FireIndexEvent()
+
+	var trainData []index.Document
+	for key, doc := range batch.IndexOps {
+		if doc != nil {
+			// insert _id field
+			// no need to track updates/deletes over here since
+			// the API is a singleton
+			doc.AddIDField()
+		}
+		if strings.HasPrefix(key, index.TrainDataPrefix) {
+			trainData = append(trainData, doc)
+		}
+	}
+
+	// just builds a new vector index out of the train data provided
+	// it'll be an IVF index so the centroids are computed at this stage and
+	// this template will be used in the indexing down the line to index
+	// the data vectors. s.segmentConfig will mark this as a training phase
+	// and zap will handle it accordingly.
+	//
+	// note: this might index text data too, how to handle this? s.segmentConfig?
+	// todo: updates/deletes -> data drift detection
+	seg, _, err := t.parent.segPlugin.NewEx(trainData, t.parent.segmentConfig)
+	if err != nil {
+		return err
+	}
+
+	trainReq := &trainRequest{
+		sample:   seg,
+		vecCount: len(trainData), // todo: multivector support
+		ackCh:    make(chan error),
+	}
+
+	t.trainCh <- trainReq
+	err = <-trainReq.ackCh
+	if err != nil {
+		return err
+	}
+
+	return err
+}
+
+func (t *vectorTrainer) getInternal(key []byte) ([]byte, error) {
+	// todo: return the total number of vectors that have been processed so far in training
+	// in cbft use that as a checkpoint to resume training for n-x samples.
+	switch string(key) {
+	case string(util.BoltTrainCompleteKey):
+		return []byte(fmt.Sprintf("%t", t.centroidIndex != nil)), nil
+	}
+	return nil, nil
+}
+
+func (t *vectorTrainer) getCentroidIndex(field string) (*faiss.IndexImpl, error) {
+	// return the coarse quantizer of the centroid index belonging to the field
+	centroidIndexSegment, ok := t.centroidIndex.segment.(segment.CentroidIndexSegment)
+	if !ok {
+		return nil, fmt.Errorf("segment is not a centroid index segment", t.centroidIndex.segment != nil)
+	}
+
+	coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field)
+	if err != nil {
+		return nil, err
+	}
+	return coarseQuantizer, nil
+}
diff --git a/index/scorch/train_noop.go b/index/scorch/train_noop.go
new file mode 100644
index 000000000..a16353c10
--- /dev/null
+++ b/index/scorch/train_noop.go
@@ -0,0 +1,25 @@
+//go:build !vectors
+// +build !vectors
+
+package scorch
+
+import (
+	"fmt"
+
+	index "github.com/blevesearch/bleve_index_api"
+	bolt "go.etcd.io/bbolt"
+)
+
+type noopTrainer struct {
+}
+
+func (t *noopTrainer) trainLoop() {}
+
+func (t *noopTrainer) train(batch *index.Batch) error {
+	return fmt.Errorf("training is not supported with this build")
+}
+
+func (t *noopTrainer) loadTrainedData(bucket *bolt.Bucket) error {
+	// noop
+	return nil
+}
diff --git a/util/keys.go b/util/keys.go
index a1f3bfbbf..ce8965da2 100644
--- a/util/keys.go
+++ b/util/keys.go
@@ -17,8 +17,8 @@ package util
 var (
 	// Bolt keys
 	BoltSnapshotsBucket = []byte{'s'}
-	BoltCentroidIndexKey = []byte{'c'}
-	BoltTrainCompleteKey = []byte{'t'}
+	BoltTrainerKey       = []byte{'t'}
+	BoltTrainCompleteKey = []byte{'c'}
 	BoltPathKey    = []byte{'p'}
 	BoltDeletedKey = []byte{'d'}
 	BoltInternalKey = []byte{'i'}

From 873c08651e21ecc6467bb2a291986a019934efef Mon Sep 17 00:00:00 2001
From: Thejas-bhat
Date: Thu, 29 Jan 2026 13:49:07 -0800
Subject: [PATCH 22/25] fix trainer impls

---
 index/scorch/train.go      | 2 +-
 index/scorch/train_noop.go | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/index/scorch/train.go b/index/scorch/train.go
index 2c2777fcf..5d0a00a0b 100644
--- a/index/scorch/train.go
+++ b/index/scorch/train.go
@@ -56,7 +56,7 @@ func moveFile(sourcePath, destPath string) error {
 
 // this is not a routine that will be running throughout the lifetime of the index. Its purpose
 // is only to train the vector index before the data ingestion starts.
-func (t *vectorTrainer) trainerLoop() { +func (t *vectorTrainer) trainLoop() { defer func() { t.parent.asyncTasks.Done() }() diff --git a/index/scorch/train_noop.go b/index/scorch/train_noop.go index a16353c10..60a8d09fd 100644 --- a/index/scorch/train_noop.go +++ b/index/scorch/train_noop.go @@ -23,3 +23,7 @@ func (t *noopTrainer) loadTrainedData(bucket *bolt.Bucket) error { // noop return nil } + +func (t *noopTrainer) getInternal(key []byte) ([]byte, error) { + return nil, nil +} From 26f7c12fecbbbdd07775cf26f8d0f5e13a4a19fd Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Thu, 29 Jan 2026 13:56:57 -0800 Subject: [PATCH 23/25] fix trainer init --- index/scorch/scorch.go | 2 ++ index/scorch/train.go | 8 +++++++- index/scorch/train_noop.go | 4 ++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 0a004eb8d..77ce94044 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -222,6 +222,8 @@ func NewScorch(storeName string, return nil, err } + rv.trainer = initTrainer(rv) + return rv, nil } diff --git a/index/scorch/train.go b/index/scorch/train.go index 5d0a00a0b..8150ff0ea 100644 --- a/index/scorch/train.go +++ b/index/scorch/train.go @@ -37,6 +37,13 @@ type trainRequest struct { ackCh chan error } +func initTrainer(s *Scorch) *vectorTrainer { + return &vectorTrainer{ + parent: s, + trainCh: make(chan *trainRequest), + } +} + type vectorTrainer struct { parent *Scorch @@ -62,7 +69,6 @@ func (t *vectorTrainer) trainLoop() { }() // initialize stuff t.parent.segmentConfig["getCentroidIndexCallback"] = t.getCentroidIndex - t.trainCh = make(chan *trainRequest) var totalSamplesProcessed int filename := index.CentroidIndexFileName path := filepath.Join(t.parent.path, filename) diff --git a/index/scorch/train_noop.go b/index/scorch/train_noop.go index 60a8d09fd..1f2a51bf8 100644 --- a/index/scorch/train_noop.go +++ b/index/scorch/train_noop.go @@ -10,6 +10,10 @@ import ( bolt "go.etcd.io/bbolt" ) +func initTrainer(s *Scorch) *noopTrainer { + return &noopTrainer{} +} + type noopTrainer struct { } From b267d4b1053150e8dd01c02a096d30b468735d8b Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 2 Feb 2026 12:50:10 -0800 Subject: [PATCH 24/25] merge conflict resolve --- index/scorch/train.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/index/scorch/train.go b/index/scorch/train.go index 8150ff0ea..4e31a3be4 100644 --- a/index/scorch/train.go +++ b/index/scorch/train.go @@ -97,7 +97,7 @@ func (t *vectorTrainer) trainLoop() { // merge the new segment with the existing one, no need to persist? // persist in a tmp file and then rename - is that a fair strategy? 
t.parent.segmentConfig["training"] = true - _, _, err := t.parent.segPlugin.MergeEx([]segment.Segment{t.centroidIndex.segment, sampleSeg}, + _, _, err := t.parent.segPlugin.MergeUsing([]segment.Segment{t.centroidIndex.segment, sampleSeg}, []*roaring.Bitmap{nil, nil}, filepath.Join(t.parent.path, filename+".tmp"), t.parent.closeCh, nil, t.parent.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err) @@ -169,7 +169,7 @@ func (t *vectorTrainer) trainLoop() { } // update the centroid index pointer - centroidIndex, err := t.parent.segPlugin.OpenEx(filepath.Join(t.parent.path, index.CentroidIndexFileName), t.parent.segmentConfig) + centroidIndex, err := t.parent.segPlugin.OpenUsing(filepath.Join(t.parent.path, index.CentroidIndexFileName), t.parent.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) close(trainReq.ackCh) @@ -222,7 +222,7 @@ func (t *vectorTrainer) train(batch *index.Batch) error { // // note: this might index text data too, how to handle this? s.segmentConfig? // todo: updates/deletes -> data drift detection - seg, _, err := t.parent.segPlugin.NewEx(trainData, t.parent.segmentConfig) + seg, _, err := t.parent.segPlugin.NewUsing(trainData, t.parent.segmentConfig) if err != nil { return err } From ce43814cf11a1293647e7748a4d254795fee2913 Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Wed, 4 Feb 2026 14:34:13 -0800 Subject: [PATCH 25/25] cleanup + refactor code --- index/scorch/merge.go | 2 - index/scorch/persister.go | 1 - index/scorch/scorch.go | 21 +++++++-- index/scorch/train_noop.go | 14 ++++++ index/scorch/{train.go => train_vector.go} | 53 +++++++++++----------- index_alias_impl.go | 12 ++++- 6 files changed, 68 insertions(+), 35 deletions(-) rename index/scorch/{train.go => train_vector.go} (82%) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 32de86bd4..bca9bbb81 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -335,7 +335,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) mergedSegHistory := make(map[uint64]*mergedSegmentHistory, len(task.Segments)) - var files []string for _, planSegment := range task.Segments { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { oldMap[segSnapshot.id] = segSnapshot @@ -351,7 +350,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, } else { segmentsToMerge = append(segmentsToMerge, segSnapshot.segment) docsToDrop = append(docsToDrop, segSnapshot.deleted) - files = append(files, persistedSeg.Path()) } // track the files getting merged for unsetting the // removal ineligibility. 
This helps to unflip files
diff --git a/index/scorch/persister.go b/index/scorch/persister.go
index 4aad12900..3df4ac2e6 100644
--- a/index/scorch/persister.go
+++ b/index/scorch/persister.go
@@ -876,7 +876,6 @@ func (s *Scorch) loadFromBolt() error {
 				s.AddEligibleForRemoval(snapshotEpoch)
 				continue
 			}
-
 			snapshot := snapshots.Bucket(k)
 			if snapshot == nil {
 				log.Printf("snapshot key, but bucket missing %x, continuing", k)
diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go
index 77ce94044..afe2878f0 100644
--- a/index/scorch/scorch.go
+++ b/index/scorch/scorch.go
@@ -22,7 +22,6 @@ import (
 	"sync"
 	"sync/atomic"
 	"time"
-	"unsafe"
 
 	"github.com/RoaringBitmap/roaring/v2"
 	"github.com/blevesearch/bleve/v2/registry"
@@ -92,10 +91,26 @@ type Scorch struct {
 	spatialPlugin index.SpatialAnalyzerPlugin
 }
 
+// trainer interface is used for training an index that has the concept
+// of "learning". Naturally, a vector index is one such thing that would
+// implement this interface. There can be multiple implementations of the
+// training itself even for the same index type.
+//
+// this component is not supposed to interact with the other master routines
+// of scorch and will be used only for training the index before the actual data
+// ingestion starts. The routine should also be released once the
+// training is marked as complete - which can be done using the BoltTrainCompleteKey
+// key and a bool value. However, the struct is still maintained so that the pointer
+// to the instance can be used in the later stages of the index lifecycle.
 type trainer interface {
+	// ephemeral
 	trainLoop()
+	// for the training state and the ingestion of the samples
 	train(batch *index.Batch) error
+
+	// to load the metadata from the bolt under the BoltTrainerKey
 	loadTrainedData(*bolt.Bucket) error
+	// to fetch the internal data from the component
 	getInternal(key []byte) ([]byte, error)
 }
@@ -565,10 +580,6 @@ func (s *Scorch) getInternal(key []byte) ([]byte, error) {
 	return nil, nil
 }
 
-func boolToByte(b bool) byte {
-	return *(*byte)(unsafe.Pointer(&b))
-}
-
 func (s *Scorch) Train(batch *index.Batch) error {
 	return s.trainer.train(batch)
 }
diff --git a/index/scorch/train_noop.go b/index/scorch/train_noop.go
index 1f2a51bf8..d82b342c6 100644
--- a/index/scorch/train_noop.go
+++ b/index/scorch/train_noop.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2026 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 //go:build !vectors
 // +build !vectors
 
diff --git a/index/scorch/train.go b/index/scorch/train_vector.go
similarity index 82%
rename from index/scorch/train.go
rename to index/scorch/train_vector.go
index 4e31a3be4..74cc6b4ed 100644
--- a/index/scorch/train.go
+++ b/index/scorch/train_vector.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Couchbase, Inc.
+// Copyright (c) 2026 Couchbase, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
 
 	"github.com/RoaringBitmap/roaring/v2"
 	"github.com/blevesearch/bleve/v2/util"
@@ -47,7 +48,9 @@ func initTrainer(s *Scorch) *vectorTrainer {
 type vectorTrainer struct {
 	parent *Scorch
 
-	// not a real searchable segment
+	m sync.Mutex
+	// not a searchable segment in the sense that it won't return
+	// the data vectors. can return centroid vectors
 	centroidIndex *SegmentSnapshot
 	trainCh       chan *trainRequest
 }
@@ -68,10 +71,8 @@ func (t *vectorTrainer) trainLoop() {
 		t.parent.asyncTasks.Done()
 	}()
 	// initialize stuff
-	t.parent.segmentConfig["getCentroidIndexCallback"] = t.getCentroidIndex
-	var totalSamplesProcessed int
-	filename := index.CentroidIndexFileName
-	path := filepath.Join(t.parent.path, filename)
+	t.parent.segmentConfig[index.CentroidIndexCallback] = t.getCentroidIndex
+	path := filepath.Join(t.parent.path, index.CentroidIndexFileName)
 	for {
 		select {
 		case <-t.parent.closeCh:
@@ -83,44 +84,41 @@ func (t *vectorTrainer) trainLoop() {
 				case segment.UnpersistedSegment:
 					err := persistToDirectory(seg, nil, path)
 					if err != nil {
-						// todo: clean up this error handling code
 						trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err)
 						close(trainReq.ackCh)
 						return
 					}
 				default:
-					fmt.Errorf("segment is not an unpersisted segment")
-					close(t.parent.closeCh)
-					return
 				}
 			} else {
 				// merge the new segment with the existing one, no need to persist?
 				// persist in a tmp file and then rename - is that a fair strategy?
-				t.parent.segmentConfig["training"] = true
+				t.parent.segmentConfig[index.TrainingKey] = true
 				_, _, err := t.parent.segPlugin.MergeUsing([]segment.Segment{t.centroidIndex.segment, sampleSeg},
-					[]*roaring.Bitmap{nil, nil}, filepath.Join(t.parent.path, filename+".tmp"), t.parent.closeCh, nil, t.parent.segmentConfig)
+					[]*roaring.Bitmap{nil, nil}, path+".tmp", t.parent.closeCh, nil, t.parent.segmentConfig)
 				if err != nil {
 					trainReq.ackCh <- fmt.Errorf("error merging centroid index: %v", err)
 					close(trainReq.ackCh)
 				}
 				// reset the training flag once completed
-				t.parent.segmentConfig["training"] = false
+				t.parent.segmentConfig[index.TrainingKey] = false
 
 				// close the existing centroid segment - it's supposed to be gc'd at this point
 				t.centroidIndex.segment.Close()
-				err = moveFile(filepath.Join(t.parent.path, filename+".tmp"), filepath.Join(t.parent.path, filename))
+				err = moveFile(path+".tmp", path)
 				if err != nil {
 					trainReq.ackCh <- fmt.Errorf("error renaming centroid index: %v", err)
 					close(trainReq.ackCh)
 				}
 			}
-			totalSamplesProcessed += trainReq.vecCount
 			// a bolt transaction is necessary for the failover-recovery scenario and also serves as a checkpoint
 			// where we can be sure that the centroid index is available for the indexing operations downstream
 			//
 			// note: when the scale increases massively especially with real world dimensions of 1536+, this API
 			// will have to be refactored to persist in a more resource efficient way. so having this bolt related
 			// code will help in tracking the progress a lot better and avoid any redundant data streaming operations.
+ // + // todo: rethink the frequency of bolt writes tx, err := t.parent.rootBolt.Begin(true) if err != nil { trainReq.ackCh <- fmt.Errorf("error starting bolt transaction: %v", err) @@ -147,7 +145,7 @@ func (t *vectorTrainer) trainLoop() { return } - err = trainerBucket.Put(util.BoltPathKey, []byte(filename)) + err = trainerBucket.Put(util.BoltPathKey, []byte(index.CentroidIndexFileName)) if err != nil { trainReq.ackCh <- fmt.Errorf("error updating centroid bucket: %v", err) close(trainReq.ackCh) @@ -163,26 +161,29 @@ func (t *vectorTrainer) trainLoop() { err = t.parent.rootBolt.Sync() if err != nil { - trainReq.ackCh <- fmt.Errorf("error committing bolt transaction: %v", err) + trainReq.ackCh <- fmt.Errorf("error on bolt sync: %v", err) close(trainReq.ackCh) return } // update the centroid index pointer - centroidIndex, err := t.parent.segPlugin.OpenUsing(filepath.Join(t.parent.path, index.CentroidIndexFileName), t.parent.segmentConfig) + centroidIndex, err := t.parent.segPlugin.OpenUsing(path, t.parent.segmentConfig) if err != nil { trainReq.ackCh <- fmt.Errorf("error opening centroid index: %v", err) close(trainReq.ackCh) return } + t.m.Lock() t.centroidIndex = &SegmentSnapshot{ segment: centroidIndex, } + t.m.Unlock() close(trainReq.ackCh) } } } +// loads the metadata specific to the centroid index from boltdb func (t *vectorTrainer) loadTrainedData(bucket *bolt.Bucket) error { if bucket == nil { return nil @@ -191,8 +192,8 @@ func (t *vectorTrainer) loadTrainedData(bucket *bolt.Bucket) error { if err != nil { return err } - t.parent.rootLock.Lock() - defer t.parent.rootLock.Unlock() + t.m.Lock() + defer t.m.Unlock() t.centroidIndex = segmentSnapshot return nil } @@ -215,10 +216,10 @@ func (t *vectorTrainer) train(batch *index.Batch) error { } // just builds a new vector index out of the train data provided - // it'll be an IVF index so the centroids are computed at this stage and - // this template will be used in the indexing down the line to index - // the data vectors. s.segmentConfig will mark this as a training phase - // and zap will handle it accordingly. + // this is not necessarily the final train data since this is submitted + // as a request to the trainer component to be merged. once the training + // is complete, the template will be used for other operations down the line + // like merge and search. // // note: this might index text data too, how to handle this? s.segmentConfig? 
// todo: updates/deletes -> data drift detection @@ -236,7 +237,7 @@ func (t *vectorTrainer) train(batch *index.Batch) error { t.trainCh <- trainReq err = <-trainReq.ackCh if err != nil { - return err + return fmt.Errorf("train_vector: train() err'd out with: %w", err) } return err @@ -256,7 +257,7 @@ func (t *vectorTrainer) getCentroidIndex(field string) (*faiss.IndexImpl, error) // return the coarse quantizer of the centroid index belonging to the field centroidIndexSegment, ok := t.centroidIndex.segment.(segment.CentroidIndexSegment) if !ok { - return nil, fmt.Errorf("segment is not a centroid index segment", t.centroidIndex.segment != nil) + return nil, fmt.Errorf("segment is not a centroid index segment") } coarseQuantizer, err := centroidIndexSegment.GetCoarseQuantizer(field) diff --git a/index_alias_impl.go b/index_alias_impl.go index 16f20ac45..2839752e2 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -106,8 +106,18 @@ func (i *indexAliasImpl) IndexSynonym(id string, collection string, definition * func (i *indexAliasImpl) Train(batch *Batch) error { i.mutex.RLock() defer i.mutex.RUnlock() + if !i.open { + return ErrorIndexClosed + } - // TODO: implement this + err := i.isAliasToSingleIndex() + if err != nil { + return err + } + + if vi, ok := i.indexes[0].(VectorIndex); ok { + return vi.Train(batch) + } return fmt.Errorf("not a vector index") }
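
A few worked sketches of the mechanisms in this series, for context. The trainer handshake is a request/ack pattern: train() submits a trainRequest and blocks on its ackCh until trainLoop has durably published the new centroid index, which it does by writing to a temp file and renaming it into place (the same atomicity moveFile relies on). Below is a minimal, self-contained sketch of that pattern; the names (trainReq, payload, this trainerLoop signature) are stand-ins, not the actual types from this series.

    package main

    import (
    	"fmt"
    	"os"
    )

    // trainReq mirrors the shape of this series' trainRequest: the submitter
    // blocks on ackCh until the loop has durably published the result.
    type trainReq struct {
    	payload []byte
    	ackCh   chan error
    }

    // trainerLoop is a simplified stand-in for vectorTrainer.trainLoop: write
    // the new state to a temp file, rename it over the live file, then ack.
    func trainerLoop(reqs <-chan trainReq, closeCh <-chan struct{}, path string) {
    	for {
    		select {
    		case <-closeCh:
    			return
    		case req := <-reqs:
    			tmp := path + ".tmp"
    			err := os.WriteFile(tmp, req.payload, 0o644)
    			if err == nil {
    				err = os.Rename(tmp, path) // the atomic publish step
    			}
    			req.ackCh <- err
    			close(req.ackCh)
    		}
    	}
    }

    func main() {
    	reqs := make(chan trainReq)
    	closeCh := make(chan struct{})
    	go trainerLoop(reqs, closeCh, "centroid.index")

    	req := trainReq{payload: []byte("centroids"), ackCh: make(chan error)}
    	reqs <- req
    	if err := <-req.ackCh; err != nil { // train() looks synchronous to callers
    		fmt.Println("train failed:", err)
    	}
    	close(closeCh)
    }

Because the rename replaces the file within a single filesystem, a crash leaves either the old centroid index or the fully written new one, never a half-written file.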
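The bolt checkpoint that trainLoop writes after each sample reduces to: a trainer sub-bucket nested under the snapshots bucket, holding the path of the centroid index file, followed by Commit and an fsync via Sync. A sketch of that sequence follows; the single-byte keys mirror the util/keys.go values in this series ('s' snapshots, 't' trainer, 'p' path), while the function and file names are invented for illustration.

    package main

    import (
    	"log"

    	bolt "go.etcd.io/bbolt"
    )

    // checkpoint mirrors trainLoop's bolt usage: nested buckets, a path key,
    // Commit, then Sync so the checkpoint survives a crash.
    func checkpoint(db *bolt.DB, filename string) error {
    	tx, err := db.Begin(true)
    	if err != nil {
    		return err
    	}
    	// Rollback after a successful Commit just returns ErrTxClosed.
    	defer func() { _ = tx.Rollback() }()

    	snapshots, err := tx.CreateBucketIfNotExists([]byte{'s'})
    	if err != nil {
    		return err
    	}
    	trainer, err := snapshots.CreateBucketIfNotExists([]byte{'t'})
    	if err != nil {
    		return err
    	}
    	if err = trainer.Put([]byte{'p'}, []byte(filename)); err != nil {
    		return err
    	}
    	if err = tx.Commit(); err != nil {
    		return err
    	}
    	return db.Sync()
    }

    func main() {
    	db, err := bolt.Open("trainer.bolt", 0o600, nil)
    	if err != nil {
    		log.Fatal(err)
    	}
    	defer db.Close()
    	if err := checkpoint(db, "centroid.index"); err != nil {
    		log.Fatal(err)
    	}
    }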
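Training completeness is reported through the internal-data path: vectorTrainer.getInternal answers util.BoltTrainCompleteKey with the %t-formatted string "true" or "false", depending on whether the centroid index exists yet. Assuming a handle whose GetInternal is routed through to the trainer (an assumption about how this series plumbs Scorch.getInternal), a caller-side gate might look like the following compilable snippet.

    package scorchexample

    import (
    	"github.com/blevesearch/bleve/v2/util"
    )

    // internalReader is satisfied by anything exposing GetInternal, e.g. a
    // bleve index handle; whether the key actually reaches the trainer
    // depends on the internal-key plumbing described above.
    type internalReader interface {
    	GetInternal(key []byte) ([]byte, error)
    }

    func trainingComplete(r internalReader) (bool, error) {
    	v, err := r.GetInternal(util.BoltTrainCompleteKey)
    	if err != nil {
    		return false, err
    	}
    	// vectorTrainer.getInternal formats the flag with %t, so the value
    	// is the literal string "true" once the centroid index exists.
    	return string(v) == "true", nil
    }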