@@ -267,12 +267,12 @@ export class ProjectOperator implements Operator {
267267 const batch = await this . upstream . next ( ) ;
268268 if ( ! batch ) return null ;
269269
270- for ( const row of batch ) {
271- for ( const key of Object . keys ( row ) ) {
272- if ( ! this . keep . has ( key ) ) delete row [ key ] ;
273- }
274- }
275- return batch ;
270+ // Create new projected row objects — no delete, no V8 hidden class deopt
271+ return batch . map ( row => {
272+ const out : Row = { } ;
273+ for ( const k of this . keep ) if ( k in row ) out [ k ] = row [ k ] ;
274+ return out ;
275+ } ) ;
276276 }
277277
278278 async close ( ) : Promise < void > {
@@ -298,19 +298,18 @@ export class AggregateOperator implements Operator {
298298 if ( this . consumed ) return null ;
299299 this . consumed = true ;
300300
301- // Stream all batches through partial aggregation
302- const partials : PartialAgg [ ] = [ ] ;
301+ // Incrementally merge partials — O(groups) memory, not O(batches × groups)
302+ let merged : PartialAgg | null = null ;
303303 while ( true ) {
304304 const batch = await this . upstream . next ( ) ;
305305 if ( ! batch ) break ;
306- partials . push ( computePartialAgg ( batch , this . query ) ) ;
306+ const partial = computePartialAgg ( batch , this . query ) ;
307+ merged = merged ? mergePartialAggs ( [ merged , partial ] ) : partial ;
307308 }
308309
309- if ( partials . length === 0 ) {
310+ if ( ! merged ) {
310311 return finalizePartialAgg ( { states : [ ] } , this . query ) ;
311312 }
312-
313- const merged = partials . length === 1 ? partials [ 0 ] : mergePartialAggs ( partials ) ;
314313 return finalizePartialAgg ( merged , this . query ) ;
315314 }
316315
@@ -673,89 +672,144 @@ export class HashJoinOperator implements Operator {
/**
 * Consumes the right (build) input and prepares it for probing.
 *
 * Outcomes:
 *  - Right side is empty: `hashMap` is set to an empty map.
 *  - Right side stays within `memoryBudget` (or no spill store is configured):
 *    `hashMap` is built in memory, keyed by `toJoinKey(row[rightKey])`.
 *  - First batch alone is larger than half the budget: rows are partitioned
 *    straight to spill storage without being accumulated first.
 *  - Budget is exceeded while accumulating: everything read so far is
 *    partitioned, then the rest of the input is streamed into partitions.
 *
 * The method is a no-op when a hash map or partitions already exist.
 */
private async buildOrPartition(): Promise<void> {
  if (this.hashMap || this.partitionCount > 0) return;

  // Sample the first batch to decide between the in-memory and partitioned paths.
  const firstBatch = await this.right.next();
  if (!firstBatch) {
    // Empty right side: build an empty hash map.
    this.hashMap = new Map<string, Row[]>();
    return;
  }

  let batchSizeBytes = 0;
  for (const row of firstBatch) batchSizeBytes += estimateRowSize(row);

  // Heuristic: a first batch above half the budget signals a large right side.
  const goPartitioned = this.spill && batchSizeBytes > this.memoryBudget / 2;

  if (goPartitioned) {
    // Proactive partition path: skip accumulation that would be re-spilled anyway.
    this.buildExceeded = true;
    this.buildSizeBytes = batchSizeBytes;
    // Size the partitions as if the whole input were 4x the first batch.
    const estimatedTotal = batchSizeBytes * 4;
    this.partitionCount = Math.max(4, Math.ceil(estimatedTotal / (this.memoryBudget / 2)));

    await this.partitionRightRows(firstBatch);
    await this.consumeRemainingRight();
    return;
  }

  // Optimistic in-memory path: keep reading while the running size fits the budget.
  const inMemoryRows: Row[] = [...firstBatch];
  this.buildSizeBytes = batchSizeBytes;
  let exceeds = false;

  while (!exceeds) {
    const batch = await this.right.next();
    if (!batch) break;
    // Always take the whole batch: it has already been pulled from upstream,
    // so stopping part-way would drop its remaining rows from the join.
    for (const row of batch) {
      this.buildSizeBytes += estimateRowSize(row);
      inMemoryRows.push(row);
    }
    // Without a spill store there is nowhere to partition to, so never flag overflow.
    exceeds = Boolean(this.spill) && this.buildSizeBytes > this.memoryBudget;
  }

  if (!exceeds) {
    // Fits in memory: build the hash map directly.
    this.hashMap = new Map<string, Row[]>();
    for (const row of inMemoryRows) {
      const key = this.toJoinKey(row[this.rightKey]);
      const bucket = this.hashMap.get(key);
      if (bucket) bucket.push(row);
      else this.hashMap.set(key, [row]);
    }
    return;
  }

  // Budget exceeded mid-stream: partition what was read, sized from the bytes seen so far.
  this.buildExceeded = true;
  this.partitionCount = Math.max(4, Math.ceil(this.buildSizeBytes / (this.memoryBudget / 2)));
  await this.partitionRightRows(inMemoryRows);
  inMemoryRows.length = 0; // release the accumulated rows

  // Stream the rest of the right side into partitions.
  await this.consumeRemainingRight();
}
713748
714- // Spill right-side rows we already consumed, flushing per-bucket when full
/**
 * Hash-partitions `rows` on the right join key and writes every partition to
 * spill storage.
 *
 * Each partition buffers rows up to `memoryBudget / partitionCount` estimated
 * bytes; a partition whose buffer would overflow is written out before the next
 * row is added. All buffers are written out before returning. The spill id of
 * each written run is appended to `rightPartitionIds[partition]`, joined by "|".
 *
 * @param rows Right-side rows to distribute.
 */
private async partitionRightRows(rows: Row[]): Promise<void> {
  const count = this.partitionCount;
  const perBucketLimit = Math.floor(this.memoryBudget / count);

  const pending: Row[][] = [];
  const pendingBytes: number[] = [];
  for (let i = 0; i < count; i++) {
    pending.push([]);
    pendingBytes.push(0);
  }

  // Only initialise the id table once so ids from earlier runs are kept.
  if (!this.rightPartitionIds.length) {
    this.rightPartitionIds = new Array(count).fill("");
  }

  // Writes one partition's buffered rows as a spill run and resets its buffer.
  const spillBucket = async (index: number): Promise<void> => {
    const bucketRows = pending[index];
    if (bucketRows.length === 0) return;
    const runId = await this.spill!.writeRun(bucketRows);
    const existing = this.rightPartitionIds[index];
    this.rightPartitionIds[index] = existing ? existing + "|" + runId : runId;
    pending[index] = [];
    pendingBytes[index] = 0;
  };

  for (const row of rows) {
    const index = this.hashPartition(row[this.rightKey], count);
    const bytes = estimateRowSize(row);
    const wouldOverflow = pendingBytes[index] + bytes > perBucketLimit;
    // A non-empty buffer is written out first; an empty one accepts the row even if oversized.
    if (wouldOverflow && pending[index].length > 0) {
      await spillBucket(index);
    }
    pending[index].push(row);
    pendingBytes[index] += bytes;
  }

  for (let index = 0; index < count; index++) await spillBucket(index);
}
780+
781+ /** Consume remaining right-side batches into partitions. */
782+ private async consumeRemainingRight ( ) : Promise < void > {
783+ const bucketBudget = Math . floor ( this . memoryBudget / this . partitionCount ) ;
784+ const rightBuckets : Row [ ] [ ] = Array . from ( { length : this . partitionCount } , ( ) => [ ] ) ;
785+ const rightBucketBytes : number [ ] = new Array ( this . partitionCount ) . fill ( 0 ) ;
786+
787+ const flushBucket = async ( bi : number ) : Promise < void > => {
788+ if ( rightBuckets [ bi ] . length === 0 ) return ;
789+ const spillId = await this . spill ! . writeRun ( rightBuckets [ bi ] ) ;
790+ this . rightPartitionIds [ bi ] = this . rightPartitionIds [ bi ]
791+ ? this . rightPartitionIds [ bi ] + "|" + spillId
792+ : spillId ;
793+ rightBuckets [ bi ] = [ ] ;
794+ rightBucketBytes [ bi ] = 0 ;
795+ } ;
796+
743797 while ( true ) {
744798 const batch = await this . right . next ( ) ;
745799 if ( ! batch ) break ;
746800 for ( const row of batch ) {
747801 const bi = this . hashPartition ( row [ this . rightKey ] , this . partitionCount ) ;
748802 const rowSize = estimateRowSize ( row ) ;
803+ this . buildSizeBytes += rowSize ;
749804 if ( rightBucketBytes [ bi ] + rowSize > bucketBudget && rightBuckets [ bi ] . length > 0 ) {
750- await flushRightBucket ( bi ) ;
805+ await flushBucket ( bi ) ;
751806 }
752807 rightBuckets [ bi ] . push ( row ) ;
753808 rightBucketBytes [ bi ] += rowSize ;
754809 }
755810 }
756811
757- // Flush remaining right buckets
758- for ( let bi = 0 ; bi < this . partitionCount ; bi ++ ) await flushRightBucket ( bi ) ;
812+ for ( let bi = 0 ; bi < this . partitionCount ; bi ++ ) await flushBucket ( bi ) ;
759813
760814 // Consume and partition left side with same bounded approach
761815 const leftBuckets : Row [ ] [ ] = Array . from ( { length : this . partitionCount } , ( ) => [ ] ) ;
0 commit comments