Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import { SearchQuotientNode, PathInputProperties } from "./search-quotient-node.
import { TokenSplitMap } from "./context-tokenization.js";
import { LegacyQuotientSpur } from "./legacy-quotient-spur.js";
import { LegacyQuotientRoot } from "./legacy-quotient-root.js";
import { generateSubsetId } from './tokenization-subsets.js';

import Distribution = LexicalModelTypes.Distribution;
import LexicalModel = LexicalModelTypes.LexicalModel;
Expand Down Expand Up @@ -113,7 +114,8 @@ export class ContextToken {
start: 0,
transitionId: undefined
},
bestProbFromSet: BASE_PROBABILITY
bestProbFromSet: BASE_PROBABILITY,
subsetId: generateSubsetId()
};
searchModule = new LegacyQuotientSpur(searchModule, [{sample: transform, p: BASE_PROBABILITY}], inputMetadata);
});
Expand Down Expand Up @@ -313,11 +315,11 @@ export class ContextToken {
constructingToken = new ContextToken(lexicalModel);
backupToken = new ContextToken(constructingToken);
constructingToken.addInput({
...priorSourceInput,
segment: {
...priorSourceInput.segment,
start: priorSourceInput.segment.start + extraCharsAdded
},
bestProbFromSet: priorSourceInput.bestProbFromSet
}
}, tailDistribution);

const lenToCommit = lenBeforeLastApply + extraCharsAdded;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,8 @@ export class ContextTokenization {
transitionId: sourceInput.id,
start: appliedLength
},
bestProbFromSet: bestProbFromSet
bestProbFromSet: bestProbFromSet,
subsetId: pendingTokenization.inputSubsetId
}, distribution);
appliedLength += KMWString.length(distribution[0].sample.insert);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,17 @@ export interface PathInputProperties {
* input is included within the SearchSpace's correction space.
*/
bestProbFromSet: number;

/**
* A unique identifier noting membership in a specific set of input possibilities with
* sufficiently similar properties that all correspond to the same "input segment".
*
* This tends to serve as an identifying factor for tokenized input distributions,
* indicating the distributions were all sourced from the same original input event.
*
* @see PendingTokenization.inputSubsetId
*/
subsetId: number;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import { LexicalModelTypes } from '@keymanapp/common-types';

import { EDIT_DISTANCE_COST_SCALE, SearchNode, SearchResult } from './distance-modeler.js';
import { generateSpaceSeed, PathResult, SearchQuotientNode, PathInputProperties } from './search-quotient-node.js';
import { generateSubsetId } from './tokenization-subsets.js';

import Distribution = LexicalModelTypes.Distribution;
import ProbabilityMass = LexicalModelTypes.ProbabilityMass;
Expand Down Expand Up @@ -74,7 +75,8 @@ export abstract class SearchQuotientSpur implements SearchQuotientNode {
transitionId: keystroke.sample.id,
start: 0
},
bestProbFromSet: keystroke.p
bestProbFromSet: keystroke.p,
subsetId: generateSubsetId()
}
};
const inputSrc = inputSource as PathInputProperties;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,38 @@ import { ContextTokenization, TokenizationEdgeAlignment, TokenizationTransitionE
import Distribution = LexicalModelTypes.Distribution;
import Transform = LexicalModelTypes.Transform;

/** Monotonically-increasing seed backing `generateSubsetId`; module-private. */
let SUBSET_ID_SEED = 0;

/**
 * Generates a unique numeric ID denoting membership in a specific subset of
 * input possibilities. Each call returns the next value in a monotonically
 * increasing sequence, so no two calls during the module's lifetime ever
 * return the same ID.
 *
 * @returns A unique, monotonically increasing subset identifier.
 */
export function generateSubsetId(): number {
  return SUBSET_ID_SEED++;
}

/**
 * Accumulates a set of compatible incoming inputs that all resolve to the same
 * final tokenization, together with the shared alignment data and the unique
 * subset ID identifying those inputs as members of the same input subset.
 */
export interface PendingTokenization {
  /**
   * The edge window corresponding to the common tokenization for the subset's inputs.
   */
  alignment: TokenizationEdgeAlignment,

  /**
   * A set of incoming keystrokes with compatible effects when applied.
   *
   * If passed to `subsetByInterval`, the transforms should result in a single subset.
   */
  inputs: Distribution<Map<number, Transform>>

  /**
   * A unique identifier associated with this PendingTokenization and its
   * transforms within `SearchSpace`s. This ID assists with detecting when
   * split transforms are re-merged during SearchSpace merges. Only
   * input-sources with matching subset ID come from the same subset, and thus
   * only they should be candidates for re-merging a previous split.
   *
   * The subset ID does not necessarily match the transition ID; in fact, there
   * may be a one-to-many relationship between transition ID and
   * `inputSubsetId`. Note that the original transition ID may be found within
   * each `Transform` value entry found within the `.inputs` map if desired.
   */
  inputSubsetId: number;
}

/**
Expand Down Expand Up @@ -134,7 +155,7 @@ export function precomputationSubsetKeyer(tokenizationEdits: TokenizationTransit
// in this tokenization, but that doesn't matter here - we want to imply the
// represented keystroke range.
const boundaryEdgeIndex = editBoundary.tokenIndex - edgeWindow.sliceIndex;
const boundaryComponent = `B${editBoundary.tokenIndex}=${editBoundary.sourceRangeKey}`;
const boundaryComponent = `B${editBoundary.tokenIndex}=${editBoundary.sourceRangeKey}`; // source range is part of it

components.push(boundaryComponent);

Expand All @@ -155,7 +176,7 @@ export function precomputationSubsetKeyer(tokenizationEdits: TokenizationTransit
for(const {0: relativeIndex, 1: transform} of tokenizedTransform.entries()) {
const insertLen = KMWString.length(transform.insert);
if(relativeIndex > 0) {
// The true boundary lie before the insert if the value is non-zero;
// The true boundary lies before the insert if the value is non-zero;
// don't differentiate here!
boundaryTextLen = 0;
}
Expand Down Expand Up @@ -188,16 +209,29 @@ export class TokenizationSubsetBuilder {
const key = this.keyer(precomputation);

// Should file the object and its transform data appropriately.
//
// Maps any number of Tokenizations and their incoming alignment data to a common key
// for final tokenization forms.
const entry: TokenizationSubset = this._subsets.get(key) ?? {
pendingSet: new Map(),
key: key
}
const forTokenization = entry.pendingSet.get(tokenization) ?? {

// Finds any previously-accumulated data corresponding to both the incoming and
// target final tokenization form, creating an empty entry if none yet exists.
const forTokenization: PendingTokenization = entry.pendingSet.get(tokenization) ?? {
alignment: precomputation.alignment,
inputs: []
inputs: [],
inputSubsetId: generateSubsetId()
};

// Adds the incoming tokenized transform data for the pairing...
forTokenization.inputs.push({sample: precomputation.tokenizedTransform, p});
// and ensures that the pairing's data-accumulator is in the map.
entry.pendingSet.set(tokenization, forTokenization);

// Also ensures that the target tokenization's data (accumulating the pairings)
// is made available within the top-level map.
this._subsets.set(key, entry);
}

Expand Down
Loading