From fd45aa428186090a1a48b57b3bfa78832d061886 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Tue, 4 Nov 2025 10:51:18 -0600 Subject: [PATCH] refactor(web): adds unique identifier to transform-tokenization subsets This addition will allow us to clearly and cleanly indicate transforms that are two (or more) halves of the same original whole. It is notably more selective than just the original transition ID and is better suited for indicating split-transform cases. Build-bot: skip build:web Test-bot: skip --- .../src/main/correction/context-token.ts | 8 +- .../main/correction/context-tokenization.ts | 3 +- .../main/correction/search-quotient-node.ts | 11 ++ .../main/correction/search-quotient-spur.ts | 4 +- .../main/correction/tokenization-subsets.ts | 42 +++++- .../context/context-token.tests.ts | 134 +++++++++++++----- .../context/context-tokenization.tests.ts | 47 ++++-- .../context/tokenization-subsets.tests.ts | 25 ++-- .../search-quotient-spur.tests.ts | 20 ++- 9 files changed, 222 insertions(+), 72 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts index 5e1cf528681..1e6d32cf2ad 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts @@ -15,6 +15,7 @@ import { SearchQuotientNode, PathInputProperties } from "./search-quotient-node. import { TokenSplitMap } from "./context-tokenization.js"; import { LegacyQuotientSpur } from "./legacy-quotient-spur.js"; import { LegacyQuotientRoot } from "./legacy-quotient-root.js"; +import { generateSubsetId } from './tokenization-subsets.js'; import Distribution = LexicalModelTypes.Distribution; import LexicalModel = LexicalModelTypes.LexicalModel; @@ -113,7 +114,8 @@ export class ContextToken { start: 0, transitionId: undefined }, - bestProbFromSet: BASE_PROBABILITY + bestProbFromSet: BASE_PROBABILITY, + subsetId: generateSubsetId() }; searchModule = new LegacyQuotientSpur(searchModule, [{sample: transform, p: BASE_PROBABILITY}], inputMetadata); }); @@ -313,11 +315,11 @@ export class ContextToken { constructingToken = new ContextToken(lexicalModel); backupToken = new ContextToken(constructingToken); constructingToken.addInput({ + ...priorSourceInput, segment: { ...priorSourceInput.segment, start: priorSourceInput.segment.start + extraCharsAdded - }, - bestProbFromSet: priorSourceInput.bestProbFromSet + } }, tailDistribution); const lenToCommit = lenBeforeLastApply + extraCharsAdded; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index 20c95b1ecc2..2cdb7d7020b 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -591,7 +591,8 @@ export class ContextTokenization { transitionId: sourceInput.id, start: appliedLength }, - bestProbFromSet: bestProbFromSet + bestProbFromSet: bestProbFromSet, + subsetId: pendingTokenization.inputSubsetId }, distribution); appliedLength += KMWString.length(distribution[0].sample.insert); diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts 
b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts index cf6b2198d58..9df43e9e9a6 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts @@ -78,6 +78,17 @@ export interface PathInputProperties { * input is included within the SearchSpace's correction space. */ bestProbFromSet: number; + + /** + * A unique identifier noting membership in a specific set of input possibilities whose + * properties are sufficiently similar that they all correspond to the same "input segment". + * + * This tends to serve as an identifying factor for tokenized input distributions, + * indicating the distributions were all sourced from the same original input event. + * + * @see PendingTokenization.inputSubsetId + */ + subsetId: number; } /** diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts index 1dccb58c543..dd36c13490d 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts @@ -13,6 +13,7 @@ import { LexicalModelTypes } from '@keymanapp/common-types'; import { EDIT_DISTANCE_COST_SCALE, SearchNode, SearchResult } from './distance-modeler.js'; import { generateSpaceSeed, PathResult, SearchQuotientNode, PathInputProperties } from './search-quotient-node.js'; +import { generateSubsetId } from './tokenization-subsets.js'; import Distribution = LexicalModelTypes.Distribution; import ProbabilityMass = LexicalModelTypes.ProbabilityMass; @@ -74,7 +75,8 @@ export abstract class SearchQuotientSpur implements SearchQuotientNode { transitionId: keystroke.sample.id, start: 0 }, - bestProbFromSet: keystroke.p + bestProbFromSet: keystroke.p, + subsetId: generateSubsetId() } }; const inputSrc = inputSource as PathInputProperties; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts index d7cab30e463..27c505f3539 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts @@ -7,17 +7,38 @@ import { ContextTokenization, TokenizationEdgeAlignment, TokenizationTransitionE import Distribution = LexicalModelTypes.Distribution; import Transform = LexicalModelTypes.Transform; +let SUBSET_ID_SEED = 0; + +export function generateSubsetId() { + return SUBSET_ID_SEED++; +} + export interface PendingTokenization { /** * The edge window corresponding to the common tokenization for the subset's inputs */ alignment: TokenizationEdgeAlignment, + /** * A set of incoming keystrokes with compatible effects when applied. * * If passed to `subsetByInterval`, the transforms should result in a single subset. */ inputs: Distribution<Map<number, Transform>> + + /** + * A unique identifier associated with this PendingTokenization and its + * transforms within `SearchSpace`s. This ID assists with detecting when + * split transforms are re-merged during SearchSpace merges. Only + * input-sources with matching subset IDs come from the same subset, and thus + * only they should be candidates for re-merging a previous split.
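+ *
+ * For example (values illustrative only): when a single transform such as
+ * `{ insert: "can't", id: 1 }` is tokenized into "can" + "'" + "t" pieces, the
+ * resulting `PendingTokenization` mints one ID that all three pieces share; an
+ * alternate tokenization subset built from that same transform - still
+ * transition ID 1 - is assigned a separate ID by `generateSubsetId()`.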
+ * + * The subset ID does not necessarily match the transition ID; in fact, there + * may be a one-to-many relationship between transition ID and + * `inputSubsetId`. Note that the original transition ID may be found within + * each `Transform` value entry in the `.inputs` map if desired. + */ + inputSubsetId: number; } /** @@ -134,7 +155,7 @@ export function precomputationSubsetKeyer(tokenizationEdits: TokenizationTransit // in this tokenization, but that doesn't matter here - we want to imply the // represented keystroke range. const boundaryEdgeIndex = editBoundary.tokenIndex - edgeWindow.sliceIndex; - const boundaryComponent = `B${editBoundary.tokenIndex}=${editBoundary.sourceRangeKey}`; + const boundaryComponent = `B${editBoundary.tokenIndex}=${editBoundary.sourceRangeKey}`; // the source range forms part of the subset key components.push(boundaryComponent); @@ -155,7 +176,7 @@ export function precomputationSubsetKeyer(tokenizationEdits: TokenizationTransit for(const {0: relativeIndex, 1: transform} of tokenizedTransform.entries()) { const insertLen = KMWString.length(transform.insert); if(relativeIndex > 0) { - // The true boundary lie before the insert if the value is non-zero; + // The true boundary lies before the insert if the value is non-zero; don't differentiate here! boundaryTextLen = 0; } @@ -188,16 +209,29 @@ export class TokenizationSubsetBuilder { const key = this.keyer(precomputation); // Should file the object and its transform data appropriately. + // + // Maps any number of Tokenizations and their incoming alignment data to a common key + // for final tokenization forms. const entry: TokenizationSubset = this._subsets.get(key) ?? { pendingSet: new Map(), key: key } + + // Finds any previously-accumulated data corresponding to both the incoming tokenization + // and the target final tokenization form, creating an empty entry if none yet exists. const forTokenization: PendingTokenization = entry.pendingSet.get(tokenization) ?? { alignment: precomputation.alignment, - inputs: [] + inputs: [], + inputSubsetId: generateSubsetId() }; + + // Adds the incoming tokenized transform data for the pairing... forTokenization.inputs.push({sample: precomputation.tokenizedTransform, p}); + // and ensures that the pairing's data-accumulator is in the map. entry.pendingSet.set(tokenization, forTokenization); + + // Also ensures that the target tokenization's data (accumulating the pairings) + // is made available within the top-level map.
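+ //
+ // (Illustrative resulting shape: `_subsets` maps each subset key to
+ // { key, pendingSet }, where `pendingSet` maps a candidate tokenization to
+ // its accumulated { alignment, inputs, inputSubsetId } record.)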
this._subsets.set(key, entry); } diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts index 49e17f81c97..e242b79c546 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts @@ -14,7 +14,7 @@ import { default as defaultBreaker } from '@keymanapp/models-wordbreakers'; import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { ContextToken, correction, getBestMatches, models, preprocessInputSources, quotientPathHasInputs, SearchQuotientSpur } from '@keymanapp/lm-worker/test-index'; +import { ContextToken, correction, generateSubsetId, getBestMatches, models, preprocessInputSources, quotientPathHasInputs, SearchQuotientSpur } from '@keymanapp/lm-worker/test-index'; import Distribution = LexicalModelTypes.Distribution; import ExecutionTimer = correction.ExecutionTimer; @@ -123,6 +123,7 @@ describe('ContextToken', function() { it("merges three tokens from single previously-split transforms", () => { const srcTransform = { insert: "can't", deleteLeft: 0, deleteRight: 0, id: 1 }; + const srcSubsetId = generateSubsetId(); const token1 = new ContextToken(plainModel); const token2 = new ContextToken(plainModel); @@ -134,7 +135,8 @@ describe('ContextToken', function() { transitionId: srcTransform.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetId }, [{sample: {insert: 'can', deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]); token2.addInput({ @@ -143,7 +145,8 @@ describe('ContextToken', function() { transitionId: srcTransform.id, start: 3 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetId }, [{sample: {insert: "'", deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]); token3.addInput({ @@ -152,7 +155,8 @@ describe('ContextToken', function() { transitionId: srcTransform.id, start: 4 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetId }, [{sample: {insert: 't', deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]); const merged = ContextToken.merge([token1, token2, token3], plainModel); @@ -162,7 +166,8 @@ describe('ContextToken', function() { trueTransform: srcTransform, transitionId: srcTransform.id, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId: srcSubsetId } ]); assert.equal(merged.searchModule.inputCount, 1); assert.deepEqual((merged.searchModule as SearchQuotientSpur).lastInput, [{sample: srcTransform, p: 1}]); @@ -177,6 +182,12 @@ describe('ContextToken', function() { { insert: "our", deleteLeft: 0, deleteRight: 0, id: 3 }, { insert: "grapes", deleteLeft: 0, deleteRight: 0, id: 4 } ]; + const srcSubsetIds = [ + generateSubsetId(), + generateSubsetId(), + generateSubsetId(), + generateSubsetId() + ]; // apples const token1 = new ContextToken(plainModel); @@ -194,7 +205,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[0].id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[0] }, [{sample: srcTransforms[0], p: 1}]); token1.addInput({ segment: { @@ -202,7 +214,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[1].id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[1] }, [{sample: {insert: 's', deleteLeft: 0, deleteRight: 0, id: 2}, 
p: 1}]); token2.addInput({ @@ -211,7 +224,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[1].id, start: 1 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[1] }, [{sample: {insert: "and", deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); token3.addInput({ @@ -220,7 +234,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[1].id, start: 4 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[1] }, [{sample: {insert: 's', deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); token3.addInput({ segment: { @@ -228,7 +243,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[2].id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[2] }, [{sample: srcTransforms[2], p: 1}]); token4.addInput({ @@ -237,7 +253,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[3].id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[3] }, [{sample: srcTransforms[3], p: 1}]); const merged = ContextToken.merge(tokensToMerge, plainModel); @@ -247,7 +264,8 @@ describe('ContextToken', function() { trueTransform: t, transitionId: t.id, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId: srcSubsetIds[i] }))); assert.isTrue(quotientPathHasInputs( merged.searchModule, @@ -264,6 +282,12 @@ describe('ContextToken', function() { { insert: toMathematicalSMP("our"), deleteLeft: 0, deleteRight: 0, id: 3 }, { insert: toMathematicalSMP("grapes"), deleteLeft: 0, deleteRight: 0, id: 4 } ]; + const srcSubsetIds = [ + generateSubsetId(), + generateSubsetId(), + generateSubsetId(), + generateSubsetId() + ]; // apples const token1 = new ContextToken(plainModel); @@ -281,7 +305,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[0].id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[0] }, [{sample: srcTransforms[0], p: 1}]); token1.addInput({ segment: { @@ -289,7 +314,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[1].id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[1] }, [{sample: {insert: toMathematicalSMP('s'), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); token2.addInput({ @@ -298,7 +324,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[1].id, start: 1 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[1] }, [{sample: {insert: toMathematicalSMP("and"), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); token3.addInput({ @@ -307,7 +334,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[1].id, start: 4 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[1] }, [{sample: {insert: toMathematicalSMP('s'), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); token3.addInput({ segment: { @@ -315,7 +343,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[2].id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[2] }, [{sample: srcTransforms[2], p: 1}]); token4.addInput({ @@ -324,7 +353,8 @@ describe('ContextToken', function() { transitionId: srcTransforms[3].id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: srcSubsetIds[3] }, [{sample: srcTransforms[3], p: 1}]); const merged = ContextToken.merge(tokensToMerge, plainModel); @@ -334,7 +364,8 @@ describe('ContextToken', function() { trueTransform: t, transitionId: t.id, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId: srcSubsetIds[i] }))); 
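+ // The three pieces of srcTransforms[1] above all shared srcSubsetIds[1];
+ // matching subset IDs are what mark inputs as re-merge candidates, which is
+ // why the merge collapses them back into one segment per original transform.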
assert.isTrue(quotientPathHasInputs( merged.searchModule, @@ -372,7 +403,8 @@ describe('ContextToken', function() { trueTransform: keystrokeDistributions[i][0].sample, transitionId: keystrokeDistributions[i][0].sample.id, start: 0 - }, bestProbFromSet: .75 + }, bestProbFromSet: .75, + subsetId: generateSubsetId() }, keystrokeDistributions[i]); }; @@ -406,6 +438,7 @@ describe('ContextToken', function() { ] ]; const splitTextArray = ['big', 'large', 'transform']; + const subsetId = generateSubsetId(); const tokenToSplit = new ContextToken(plainModel); for(let i = 0; i < keystrokeDistributions.length; i++) { @@ -415,7 +448,8 @@ describe('ContextToken', function() { transitionId: keystrokeDistributions[i][0].sample.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId }, keystrokeDistributions[i]); }; @@ -449,7 +483,8 @@ describe('ContextToken', function() { transitionId: keystrokeDistributions[0][0].sample.id, start: i }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId }))); for(let i = 0; i < resultsOfSplit.length; i++) { @@ -472,6 +507,11 @@ describe('ContextToken', function() { ] ]; const splitTextArray = ['large', 'long', 'transforms']; + const subsetIds = [ + generateSubsetId(), + generateSubsetId(), + generateSubsetId() + ]; const tokenToSplit = new ContextToken(plainModel); for(let i = 0; i < keystrokeDistributions.length; i++) { @@ -481,7 +521,8 @@ describe('ContextToken', function() { transitionId: keystrokeDistributions[i][0].sample.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[i] }, keystrokeDistributions[i]); }; @@ -511,14 +552,16 @@ describe('ContextToken', function() { transitionId: keystrokeDistributions[0][0].sample.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[0] }, { segment: { trueTransform: keystrokeDistributions[1][0].sample, transitionId: keystrokeDistributions[1][0].sample.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[1] }, ]); assert.deepEqual(resultsOfSplit[1].inputSegments, [ @@ -528,14 +571,16 @@ describe('ContextToken', function() { transitionId: keystrokeDistributions[1][0].sample.id, start: 'arge'.length }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[1] }, { segment: { trueTransform: keystrokeDistributions[2][0].sample, transitionId: keystrokeDistributions[2][0].sample.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[2] }, ]); assert.deepEqual(resultsOfSplit[2].inputSegments, [ @@ -545,7 +590,8 @@ describe('ContextToken', function() { transitionId: keystrokeDistributions[2][0].sample.id, start: 'ng'.length, }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[2] } ]); @@ -612,6 +658,11 @@ describe('ContextToken', function() { ] ]; const splitTextArray = ['large', 'long', 'transforms'].map(t => toMathematicalSMP(t)); + const subsetIds = [ + generateSubsetId(), + generateSubsetId(), + generateSubsetId() + ]; const tokenToSplit = new ContextToken(plainModel); for(let i = 0; i < keystrokeDistributions.length; i++) { @@ -621,7 +672,8 @@ describe('ContextToken', function() { transitionId: keystrokeDistributions[i][0].sample.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[i] }, keystrokeDistributions[i]); }; @@ -650,14 +702,16 @@ describe('ContextToken', function() { transitionId: keystrokeDistributions[0][0].sample.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[0] }, { segment: { trueTransform: 
keystrokeDistributions[1][0].sample, transitionId: keystrokeDistributions[1][0].sample.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[1] }, ]); assert.deepEqual(resultsOfSplit[1].inputSegments, [{ @@ -666,14 +720,16 @@ describe('ContextToken', function() { transitionId: keystrokeDistributions[1][0].sample.id, start: 'arge'.length }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[1] }, { segment: { trueTransform: keystrokeDistributions[2][0].sample, transitionId: keystrokeDistributions[2][0].sample.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[2] } ]); assert.deepEqual(resultsOfSplit[2].inputSegments, [{ @@ -682,7 +738,8 @@ describe('ContextToken', function() { transitionId: keystrokeDistributions[2][0].sample.id, start: 'ng'.length }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[2] }]); assert.isTrue(quotientPathHasInputs( @@ -746,13 +803,20 @@ describe('preprocessInputSources', () => { { insert: 'ngtransforms', deleteLeft: 4, deleteRight: 0, id: 13 } ]; + const subsetIds = [ + generateSubsetId(), + generateSubsetId(), + generateSubsetId() + ]; + const results = preprocessInputSources(transforms.map((t, i) => ({ segment: { trueTransform: t, transitionId: t.id, start: 0 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId: subsetIds[i] }))); assert.equal(results.length, transforms.length); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts index 2a5dcd59614..3adf8585c48 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts @@ -24,6 +24,7 @@ import { EditOperation, EditTuple, ExtendedEditOperation, + generateSubsetId, models, PendingTokenization, SearchQuotientSpur, @@ -54,7 +55,8 @@ function toTransformToken(text: string, transformId?: number) { trueTransform: textAsTransform, transitionId: textAsTransform.id, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId: generateSubsetId() }, [ { sample: textAsTransform, p: 1 } ]); token.isWhitespace = isWhitespace; return token; @@ -120,7 +122,8 @@ describe('ContextTokenization', function() { const map = new Map(); map.set(0, emptyTransform); return map; - })(), p: 1}] + })(), p: 1}], + inputSubsetId: generateSubsetId() }; let tokenization = new ContextTokenization(tokens, transitionEdits, null /* dummy val */); @@ -152,7 +155,8 @@ describe('ContextTokenization', function() { const map = new Map(); map.set(0, emptyTransform); return map; - })(), p: 1}] + })(), p: 1}], + inputSubsetId: generateSubsetId() }; let baseTokenization = new ContextTokenization(tokens, transitionEdits, null /* dummy val */); @@ -206,7 +210,8 @@ describe('ContextTokenization', function() { }, removedTokenCount: 0 }, - inputs: [{ sample: inputTransformMap, p: 1 }] + inputs: [{ sample: inputTransformMap, p: 1 }], + inputSubsetId: generateSubsetId() }, plainModel, inputTransform, @@ -259,7 +264,8 @@ describe('ContextTokenization', function() { }, removedTokenCount: 2 }, - inputs: [{ sample: inputTransformMap, p: 1 }] + inputs: [{ sample: inputTransformMap, p: 1 }], + inputSubsetId: generateSubsetId() }, plainModel, inputTransform, @@ -296,7 +302,8 @@ describe('ContextTokenization', function() { }, removedTokenCount: 0 }, 
- inputs: [{ sample: inputTransformMap, p: 1 }] + inputs: [{ sample: inputTransformMap, p: 1 }], + inputSubsetId: generateSubsetId() }, plainModel, inputTransform, @@ -342,7 +349,8 @@ describe('ContextTokenization', function() { }, removedTokenCount: 0 }, - inputs: [{ sample: inputTransformMap, p: 1 }] + inputs: [{ sample: inputTransformMap, p: 1 }], + inputSubsetId: generateSubsetId() }, plainModel, inputTransform, @@ -398,7 +406,8 @@ describe('ContextTokenization', function() { }, removedTokenCount: 0 }, - inputs: [{ sample: inputTransformMap, p: 1 }] + inputs: [{ sample: inputTransformMap, p: 1 }], + inputSubsetId: generateSubsetId() }, plainModel, inputTransform, @@ -437,6 +446,7 @@ describe('ContextTokenization', function() { inputTransformMap.set( 0, { insert: 'day', deleteLeft: 6, id: 42 }); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); + const subsetId = generateSubsetId(); const tokenization = baseTokenization.evaluateTransition({ alignment: { merges: [], @@ -449,7 +459,8 @@ describe('ContextTokenization', function() { }, removedTokenCount: 0 }, - inputs: [{ sample: inputTransformMap, p: 1 }] + inputs: [{ sample: inputTransformMap, p: 1 }], + inputSubsetId: subsetId }, plainModel, inputTransform, @@ -469,7 +480,8 @@ describe('ContextTokenization', function() { trueTransform: inputTransform, transitionId: inputTransform.id, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId }); // The new tail tokens should not include anything from the original tail; @@ -479,7 +491,8 @@ describe('ContextTokenization', function() { trueTransform: inputTransform, transitionId: inputTransform.id, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId }]); assert.deepEqual(tokenization.tokens[tokenization.tokens.length-1].inputSegments, [{ segment: { @@ -487,7 +500,8 @@ describe('ContextTokenization', function() { transitionId: inputTransform.id, start: 1 }, - bestProbFromSet: 1 + bestProbFromSet: 1, + subsetId }]); const tailIndex = tokenization.tokens.length - 1; @@ -530,7 +544,8 @@ describe('ContextTokenization', function() { }, removedTokenCount: 0 }, - inputs: [{ sample: inputTransformMap, p: 1 }] + inputs: [{ sample: inputTransformMap, p: 1 }], + inputSubsetId: generateSubsetId() }, plainModel, { insert: ' ', deleteLeft: 0 }, @@ -593,7 +608,8 @@ describe('ContextTokenization', function() { }, removedTokenCount: 0 }, - inputs: [{ sample: inputTransformMap, p: 1 }] + inputs: [{ sample: inputTransformMap, p: 1 }], + inputSubsetId: generateSubsetId() }, plainModel, { insert: 't', deleteLeft: 0 }, @@ -659,7 +675,8 @@ describe('ContextTokenization', function() { }, removedTokenCount: 0 }, - inputs: [{ sample: inputTransformMap, p: 1 }] + inputs: [{ sample: inputTransformMap, p: 1 }], + inputSubsetId: generateSubsetId() }, plainModel, inputTransform, diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts index 880ad7782f3..073c780744e 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts @@ -20,6 +20,7 @@ import { buildEdgeWindow, ContextToken, ContextTokenization, + generateSubsetId, models, precomputationSubsetKeyer, TokenizationTransitionEdits, @@ -188,7 +189,8 @@ 
describe('precomputationSubsetKeyer', function() { id: 13 }, transitionId: 13, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId: generateSubsetId() }, [ {sample: {insert: 'te', deleteLeft: 0, id: 13}, p: 1} ]); @@ -223,7 +225,8 @@ describe('precomputationSubsetKeyer', function() { id: 13 }, transitionId: 13, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId: generateSubsetId() }, [ {sample: {insert: 't', deleteLeft: 0}, p: 1} ]); @@ -272,7 +275,8 @@ describe('precomputationSubsetKeyer', function() { id: 13 }, transitionId: 13, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId: generateSubsetId() }, [{sample: {insert: 'ts', deleteLeft: 0, id: 13}, p: 1} ]); return token; @@ -307,7 +311,8 @@ describe('precomputationSubsetKeyer', function() { id: 13 }, transitionId: 13, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId: generateSubsetId() }, [ {sample: {insert: 't', deleteLeft: 0, id: 13}, p: 1} ]); @@ -767,7 +772,8 @@ describe('TokenizationSubsetBuilder', function() { id: 13 }, transitionId: 13, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId: generateSubsetId() }, [ { sample: trueSourceTransform, p: .6 } ]); @@ -781,7 +787,8 @@ describe('TokenizationSubsetBuilder', function() { id: 13 }, transitionId: 13, start: 0 - }, bestProbFromSet: 1 + }, bestProbFromSet: 1, + subsetId: generateSubsetId() }, [ { sample: { insert: 's', deleteLeft: 0, id: 13 }, p: .4 } ]); @@ -822,7 +829,8 @@ describe('TokenizationSubsetBuilder', function() { id: 13 }, transitionId: 13, start: 0 - }, bestProbFromSet: .6 + }, bestProbFromSet: .6, + subsetId: generateSubsetId() }, [ { sample: trueSourceTransform, p: .6 } ]); @@ -835,7 +843,8 @@ describe('TokenizationSubsetBuilder', function() { deleteLeft: 1 }, transitionId: 13, start: 0 - }, bestProbFromSet: .6 + }, bestProbFromSet: .6, + subsetId: generateSubsetId() }, [ { sample: { insert: 'a', deleteLeft: 0, id: 13}, p: .4 } ]); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts index 5554e05e1ed..837d5e0cae3 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts @@ -83,7 +83,9 @@ describe('SearchQuotientSpur', () => { transitionId: leadEdgeDistribution[0].sample.id, start: 0 }, - bestProbFromSet: leadEdgeDistribution[0].p + bestProbFromSet: leadEdgeDistribution[0].p, + // Just write in the variable-value entry; the rest should match perfectly. + subsetId: extendedPath.inputSegments[0].subsetId } ]); @@ -138,14 +140,18 @@ describe('SearchQuotientSpur', () => { transitionId: leadEdgeDistribution[0].sample.id, start: 0 }, - bestProbFromSet: leadEdgeDistribution[0].p + bestProbFromSet: leadEdgeDistribution[0].p, + // Just write in the variable-value entry; the rest should match perfectly. + subsetId: length2Path.inputSegments[0].subsetId }, { segment: { trueTransform: tailEdgeDistribution[0].sample, transitionId: tailEdgeDistribution[0].sample.id, start: 0 }, - bestProbFromSet: tailEdgeDistribution[0].p + bestProbFromSet: tailEdgeDistribution[0].p, + // Just write in the variable-value entry; the rest should match perfectly. 
+ subsetId: length2Path.inputSegments[1].subsetId } ]); @@ -216,14 +222,18 @@ describe('SearchQuotientSpur', () => { transitionId: leadEdgeDistribution[0].sample.id, start: 0 }, - bestProbFromSet: leadEdgeDistribution[0].p + bestProbFromSet: leadEdgeDistribution[0].p, + // Just write in the variable-value entry; the rest should match perfectly. + subsetId: length2Path.inputSegments[0].subsetId }, { segment: { trueTransform: tailEdgeDistribution[0].sample, transitionId: tailEdgeDistribution[0].sample.id, start: 0 }, - bestProbFromSet: tailEdgeDistribution[0].p + bestProbFromSet: tailEdgeDistribution[0].p, + // Just write in the variable-value entry; the rest should match perfectly. + subsetId: length2Path.inputSegments[1].subsetId } ]);
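A minimal sketch of the invariant the new field enables, for illustration only - the `canRemergeSplitInputs` helper below is hypothetical and not part of this patch, while `PathInputProperties` and `generateSubsetId` are as declared above:

import { PathInputProperties } from './search-quotient-node.js';
import { generateSubsetId } from './tokenization-subsets.js';

// Split halves share both their transition ID and their subset ID, but only the
// subset ID is selective enough to gate re-merging: distinct tokenization
// subsets sourced from the same transition each mint a fresh ID.
function canRemergeSplitInputs(a: PathInputProperties, b: PathInputProperties): boolean {
  return a.subsetId === b.subsetId;
}

// generateSubsetId() is a plain monotonic counter, so IDs never repeat within a
// worker session:
const first = generateSubsetId();
const second = generateSubsetId(); // always distinct from `first`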