diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts
index 5e1cf528681..1e6d32cf2ad 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts
@@ -15,6 +15,7 @@ import { SearchQuotientNode, PathInputProperties } from "./search-quotient-node.
 import { TokenSplitMap } from "./context-tokenization.js";
 import { LegacyQuotientSpur } from "./legacy-quotient-spur.js";
 import { LegacyQuotientRoot } from "./legacy-quotient-root.js";
+import { generateSubsetId } from './tokenization-subsets.js';
 
 import Distribution = LexicalModelTypes.Distribution;
 import LexicalModel = LexicalModelTypes.LexicalModel;
@@ -113,7 +114,8 @@ export class ContextToken {
           start: 0,
           transitionId: undefined
         },
-        bestProbFromSet: BASE_PROBABILITY
+        bestProbFromSet: BASE_PROBABILITY,
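+        // These rebuilt inputs carry no recorded source event (note the
+        // undefined transitionId above), so each receives its own fresh subset ID.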
+        subsetId: generateSubsetId()
       };
       searchModule = new LegacyQuotientSpur(searchModule, [{sample: transform, p: BASE_PROBABILITY}], inputMetadata);
     });
@@ -313,11 +315,11 @@ export class ContextToken {
       constructingToken = new ContextToken(lexicalModel);
       backupToken = new ContextToken(constructingToken);
       constructingToken.addInput({
+        ...priorSourceInput,
         segment: {
           ...priorSourceInput.segment,
           start: priorSourceInput.segment.start + extraCharsAdded
-        },
-        bestProbFromSet: priorSourceInput.bestProbFromSet
+        }
       }, tailDistribution);
 
       const lenToCommit = lenBeforeLastApply + extraCharsAdded;
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
index 20c95b1ecc2..2cdb7d7020b 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
@@ -591,7 +591,8 @@ export class ContextTokenization {
           transitionId: sourceInput.id,
           start: appliedLength
         },
-        bestProbFromSet: bestProbFromSet
+        bestProbFromSet: bestProbFromSet,
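+        // Tag the segment with the ID of the subset that produced it, so that
+        // re-split pieces of the same input can be matched up again later.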
+        subsetId: pendingTokenization.inputSubsetId
       }, distribution);
 
       appliedLength += KMWString.length(distribution[0].sample.insert);
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts
index cf6b2198d58..9df43e9e9a6 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts
@@ -78,6 +78,17 @@ export interface PathInputProperties {
    * input is included within the SearchSpace's correction space.
    */
   bestProbFromSet: number;
+
+  /**
+   * A unique identifier noting membership in a specific set of input possibilities with
+   * sufficiently similar properties that all correspond to the same "input segment".
+   *
+   * This tends to serve as an identifying factor for tokenized input distributions,
+   * indicating the distributions were all sourced from the same original input event.
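+   *
+   * @example
+   * // Illustrative sketch only; headSegment and tailSegment are placeholder values.
+   * // Segments split from the same keystroke share a single ID:
+   * const splitId = generateSubsetId();
+   * const head = { segment: headSegment, bestProbFromSet: 1, subsetId: splitId };
+   * const tail = { segment: tailSegment, bestProbFromSet: 1, subsetId: splitId };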
+   *
+   * @see PendingTokenization.inputSubsetId
+   */
+  subsetId: number;
 }
 
 /**
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts
index 1dccb58c543..dd36c13490d 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts
@@ -13,6 +13,7 @@ import { LexicalModelTypes } from '@keymanapp/common-types';
 
 import { EDIT_DISTANCE_COST_SCALE, SearchNode, SearchResult } from './distance-modeler.js';
 import { generateSpaceSeed, PathResult, SearchQuotientNode, PathInputProperties } from './search-quotient-node.js';
+import { generateSubsetId } from './tokenization-subsets.js';
 
 import Distribution = LexicalModelTypes.Distribution;
 import ProbabilityMass = LexicalModelTypes.ProbabilityMass;
@@ -74,7 +75,8 @@ export abstract class SearchQuotientSpur implements SearchQuotientNode {
         transitionId: keystroke.sample.id,
         start: 0
       },
-      bestProbFromSet: keystroke.p
+      bestProbFromSet: keystroke.p,
+      subsetId: generateSubsetId()
     }
   };
   const inputSrc = inputSource as PathInputProperties;
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts
index d7cab30e463..27c505f3539 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts
@@ -7,17 +7,38 @@ import { ContextTokenization, TokenizationEdgeAlignment, TokenizationTransitionE
 
 import Distribution = LexicalModelTypes.Distribution;
 import Transform = LexicalModelTypes.Transform;
 
+let SUBSET_ID_SEED = 0;
+
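+// Returns a fresh, process-unique subset ID. The seed above is monotonic and
+// never reset, so no two calls within a worker session return the same value.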
+export function generateSubsetId() {
+  return SUBSET_ID_SEED++;
+}
+
 export interface PendingTokenization {
   /**
    * The edge window corresponding to the common tokenization for the subset's inputs
    */
   alignment: TokenizationEdgeAlignment,
+
   /**
    * A set of incoming keystrokes with compatible effects when applied.
    *
    * If passed to `subsetByInterval`, the transforms should result in a single subset.
    */
   inputs: Distribution<Map<number, Transform>>
+
+  /**
+   * A unique identifier associated with this PendingTokenization and its
+   * transforms within `SearchSpace`s. This ID assists with detecting when
+   * split transforms are re-merged during SearchSpace merges. Only input
+   * sources with matching subset IDs come from the same subset, and thus
+   * only they should be candidates for re-merging a previous split.
+   *
+   * The subset ID does not necessarily match the transition ID; in fact, there
+   * may be a one-to-many relationship between transition ID and
+   * `inputSubsetId`. Note that the original transition ID may be found within
+   * each `Transform` value entry of the `.inputs` map if desired.
+   */
+  inputSubsetId: number;
 }
 
 /**
@@ -134,7 +155,7 @@ export function precomputationSubsetKeyer(tokenizationEdits: TokenizationTransit
     // in this tokenization, but that doesn't matter here - we want to imply the
     // represented keystroke range.
     const boundaryEdgeIndex = editBoundary.tokenIndex - edgeWindow.sliceIndex;
-    const boundaryComponent = `B${editBoundary.tokenIndex}=${editBoundary.sourceRangeKey}`;
+    const boundaryComponent = `B${editBoundary.tokenIndex}=${editBoundary.sourceRangeKey}`; // the source range is part of the key
 
     components.push(boundaryComponent);
 
@@ -155,7 +176,7 @@ export function precomputationSubsetKeyer(tokenizationEdits: TokenizationTransit
     for(const {0: relativeIndex, 1: transform} of tokenizedTransform.entries()) {
       const insertLen = KMWString.length(transform.insert);
       if(relativeIndex > 0) {
-        // The true boundary lie before the insert if the value is non-zero;
+        // The true boundary lies before the insert if the value is non-zero;
         // don't differentiate here!
         boundaryTextLen = 0;
       }
@@ -188,16 +209,29 @@ export class TokenizationSubsetBuilder {
     const key = this.keyer(precomputation);
 
     // Should file the object and its transform data appropriately.
+    //
+    // Maps any number of Tokenizations and their incoming alignment data to a common key
+    // for final tokenization forms.
     const entry: TokenizationSubset = this._subsets.get(key) ?? {
      pendingSet: new Map(),
      key: key
    }
-    const forTokenization = entry.pendingSet.get(tokenization) ?? {
+
+    // Finds any previously-accumulated data corresponding to both the incoming and
+    // target final tokenization form, creating an empty entry if none yet exists.
+    const forTokenization: PendingTokenization = entry.pendingSet.get(tokenization) ?? {
       alignment: precomputation.alignment,
-      inputs: []
+      inputs: [],
+      inputSubsetId: generateSubsetId()
     };
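+    // Note: the ID above is minted only when the (key, tokenization) pairing is
+    // first created; every input accumulated into the same pairing shares it.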
+
+    // Adds the incoming tokenized transform data for the pairing...
     forTokenization.inputs.push({sample: precomputation.tokenizedTransform, p});
+    // and ensures that the pairing's data-accumulator is in the map.
     entry.pendingSet.set(tokenization, forTokenization);
+
+    // Also ensures that the target tokenization's data (accumulating the pairings)
+    // is made available within the top-level map.
     this._subsets.set(key, entry);
   }
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts
index 49e17f81c97..e242b79c546 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts
@@ -14,7 +14,7 @@ import { default as defaultBreaker } from '@keymanapp/models-wordbreakers';
 import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs';
 import { LexicalModelTypes } from '@keymanapp/common-types';
 
-import { ContextToken, correction, getBestMatches, models, preprocessInputSources, quotientPathHasInputs, SearchQuotientSpur } from '@keymanapp/lm-worker/test-index';
+import { ContextToken, correction, generateSubsetId, getBestMatches, models, preprocessInputSources, quotientPathHasInputs, SearchQuotientSpur } from '@keymanapp/lm-worker/test-index';
 
 import Distribution = LexicalModelTypes.Distribution;
 import ExecutionTimer = correction.ExecutionTimer;
@@ -123,6 +123,7 @@ describe('ContextToken', function() {
 
   it("merges three tokens from single previously-split transforms", () => {
     const srcTransform = { insert: "can't", deleteLeft: 0, deleteRight: 0, id: 1 };
+    const srcSubsetId = generateSubsetId();
 
     const token1 = new ContextToken(plainModel);
     const token2 = new ContextToken(plainModel);
@@ -134,7 +135,8 @@
         transitionId: srcTransform.id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetId
     }, [{sample: {insert: 'can', deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]);
 
     token2.addInput({
@@ -143,7 +145,8 @@
         transitionId: srcTransform.id,
         start: 3
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetId
     }, [{sample: {insert: "'", deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]);
 
     token3.addInput({
@@ -152,7 +155,8 @@
         transitionId: srcTransform.id,
         start: 4
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetId
     }, [{sample: {insert: 't', deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]);
 
     const merged = ContextToken.merge([token1, token2, token3], plainModel);
@@ -162,7 +166,8 @@
         trueTransform: srcTransform,
         transitionId: srcTransform.id,
         start: 0
-      }, bestProbFromSet: 1
+      }, bestProbFromSet: 1,
+      subsetId: srcSubsetId
     }
   ]);
     assert.equal(merged.searchModule.inputCount, 1);
     assert.deepEqual((merged.searchModule as SearchQuotientSpur).lastInput, [{sample: srcTransform, p: 1}]);
@@ -177,6 +182,12 @@
       { insert: "our", deleteLeft: 0, deleteRight: 0, id: 3 },
       { insert: "grapes", deleteLeft: 0, deleteRight: 0, id: 4 }
     ];
+    const srcSubsetIds = [
+      generateSubsetId(),
+      generateSubsetId(),
+      generateSubsetId(),
+      generateSubsetId()
+    ];
 
     // apples
     const token1 = new ContextToken(plainModel);
@@ -194,7 +205,8 @@
         transitionId: srcTransforms[0].id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[0]
     }, [{sample: srcTransforms[0], p: 1}]);
     token1.addInput({
       segment: {
@@ -202,7 +214,8 @@
         transitionId: srcTransforms[1].id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[1]
     }, [{sample: {insert: 's', deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]);
 
     token2.addInput({
@@ -211,7 +224,8 @@
         transitionId: srcTransforms[1].id,
         start: 1
      },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[1]
     }, [{sample: {insert: "and", deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]);
 
     token3.addInput({
@@ -220,7 +234,8 @@
         transitionId: srcTransforms[1].id,
         start: 4
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[1]
     }, [{sample: {insert: 's', deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]);
     token3.addInput({
       segment: {
@@ -228,7 +243,8 @@
         transitionId: srcTransforms[2].id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[2]
     }, [{sample: srcTransforms[2], p: 1}]);
 
     token4.addInput({
@@ -237,7 +253,8 @@
         transitionId: srcTransforms[3].id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[3]
     }, [{sample: srcTransforms[3], p: 1}]);
 
     const merged = ContextToken.merge(tokensToMerge, plainModel);
@@ -247,7 +264,8 @@
         trueTransform: t,
         transitionId: t.id,
         start: 0
-      }, bestProbFromSet: 1
+      }, bestProbFromSet: 1,
+      subsetId: srcSubsetIds[i]
     })));
     assert.isTrue(quotientPathHasInputs(
       merged.searchModule,
@@ -264,6 +282,12 @@
       { insert: toMathematicalSMP("our"), deleteLeft: 0, deleteRight: 0, id: 3 },
       { insert: toMathematicalSMP("grapes"), deleteLeft: 0, deleteRight: 0, id: 4 }
     ];
+    const srcSubsetIds = [
+      generateSubsetId(),
+      generateSubsetId(),
+      generateSubsetId(),
+      generateSubsetId()
+    ];
 
     // apples
     const token1 = new ContextToken(plainModel);
@@ -281,7 +305,8 @@
         transitionId: srcTransforms[0].id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[0]
     }, [{sample: srcTransforms[0], p: 1}]);
     token1.addInput({
       segment: {
@@ -289,7 +314,8 @@
         transitionId: srcTransforms[1].id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[1]
     }, [{sample: {insert: toMathematicalSMP('s'), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]);
 
     token2.addInput({
@@ -298,7 +324,8 @@
         transitionId: srcTransforms[1].id,
         start: 1
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[1]
     }, [{sample: {insert: toMathematicalSMP("and"), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]);
 
     token3.addInput({
@@ -307,7 +334,8 @@
         transitionId: srcTransforms[1].id,
         start: 4
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[1]
     }, [{sample: {insert: toMathematicalSMP('s'), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]);
     token3.addInput({
       segment: {
@@ -315,7 +343,8 @@
         transitionId: srcTransforms[2].id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[2]
     }, [{sample: srcTransforms[2], p: 1}]);
 
     token4.addInput({
@@ -324,7 +353,8 @@
         transitionId: srcTransforms[3].id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: srcSubsetIds[3]
     }, [{sample: srcTransforms[3], p: 1}]);
 
     const merged = ContextToken.merge(tokensToMerge, plainModel);
@@ -334,7 +364,8 @@
         trueTransform: t,
         transitionId: t.id,
         start: 0
-      }, bestProbFromSet: 1
+      }, bestProbFromSet: 1,
+      subsetId: srcSubsetIds[i]
     })));
     assert.isTrue(quotientPathHasInputs(
       merged.searchModule,
@@ -372,7 +403,8 @@
         trueTransform: keystrokeDistributions[i][0].sample,
         transitionId: keystrokeDistributions[i][0].sample.id,
         start: 0
-      }, bestProbFromSet: .75
+      }, bestProbFromSet: .75,
+      subsetId: generateSubsetId()
     }, keystrokeDistributions[i]);
   };
@@ -406,6 +438,7 @@
       ]
     ];
     const splitTextArray = ['big', 'large', 'transform'];
+    const subsetId = generateSubsetId();
 
     const tokenToSplit = new ContextToken(plainModel);
     for(let i = 0; i < keystrokeDistributions.length; i++) {
@@ -415,7 +448,8 @@
         transitionId: keystrokeDistributions[i][0].sample.id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId
     }, keystrokeDistributions[i]);
   };
@@ -449,7 +483,8 @@
         transitionId: keystrokeDistributions[0][0].sample.id,
         start: i
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId
     })));
 
     for(let i = 0; i < resultsOfSplit.length; i++) {
@@ -472,6 +507,11 @@
       ]
     ];
     const splitTextArray = ['large', 'long', 'transforms'];
+    const subsetIds = [
+      generateSubsetId(),
+      generateSubsetId(),
+      generateSubsetId()
+    ];
 
     const tokenToSplit = new ContextToken(plainModel);
     for(let i = 0; i < keystrokeDistributions.length; i++) {
@@ -481,7 +521,8 @@
         transitionId: keystrokeDistributions[i][0].sample.id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[i]
     }, keystrokeDistributions[i]);
   };
@@ -511,14 +552,16 @@
         transitionId: keystrokeDistributions[0][0].sample.id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[0]
     }, {
       segment: {
         trueTransform: keystrokeDistributions[1][0].sample,
         transitionId: keystrokeDistributions[1][0].sample.id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[1]
     },
   ]);
     assert.deepEqual(resultsOfSplit[1].inputSegments, [
@@ -528,14 +571,16 @@
         transitionId: keystrokeDistributions[1][0].sample.id,
         start: 'arge'.length
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[1]
     }, {
       segment: {
         trueTransform: keystrokeDistributions[2][0].sample,
         transitionId: keystrokeDistributions[2][0].sample.id,
         start: 0
      },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[2]
     },
   ]);
     assert.deepEqual(resultsOfSplit[2].inputSegments, [
@@ -545,7 +590,8 @@
         transitionId: keystrokeDistributions[2][0].sample.id,
         start: 'ng'.length,
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[2]
     }
   ]);
@@ -612,6 +658,11 @@
       ]
     ];
     const splitTextArray = ['large', 'long', 'transforms'].map(t => toMathematicalSMP(t));
+    const subsetIds = [
+      generateSubsetId(),
+      generateSubsetId(),
+      generateSubsetId()
+    ];
 
     const tokenToSplit = new ContextToken(plainModel);
     for(let i = 0; i < keystrokeDistributions.length; i++) {
@@ -621,7 +672,8 @@
         transitionId: keystrokeDistributions[i][0].sample.id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[i]
     }, keystrokeDistributions[i]);
   };
@@ -650,14 +702,16 @@
         transitionId: keystrokeDistributions[0][0].sample.id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[0]
     }, {
       segment: {
         trueTransform: keystrokeDistributions[1][0].sample,
         transitionId: keystrokeDistributions[1][0].sample.id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[1]
     },
   ]);
     assert.deepEqual(resultsOfSplit[1].inputSegments, [{
@@ -666,14 +720,16 @@
         transitionId: keystrokeDistributions[1][0].sample.id,
         start: 'arge'.length
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[1]
     }, {
       segment: {
         trueTransform: keystrokeDistributions[2][0].sample,
         transitionId: keystrokeDistributions[2][0].sample.id,
         start: 0
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[2]
     }
   ]);
     assert.deepEqual(resultsOfSplit[2].inputSegments, [{
@@ -682,7 +738,8 @@
        transitionId: keystrokeDistributions[2][0].sample.id,
        start: 'ng'.length
      },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId: subsetIds[2]
     }]);
 
     assert.isTrue(quotientPathHasInputs(
@@ -746,13 +803,20 @@ describe('preprocessInputSources', () => {
     { insert: 'ngtransforms', deleteLeft: 4, deleteRight: 0, id: 13 }
   ];
 
+  const subsetIds = [
+    generateSubsetId(),
+    generateSubsetId(),
+    generateSubsetId()
+  ];
+
   const results = preprocessInputSources(transforms.map((t, i) => ({
     segment: {
       trueTransform: t,
      transitionId: t.id,
      start: 0
    },
-    bestProbFromSet: 1
+    bestProbFromSet: 1,
+    subsetId: subsetIds[i]
   })));
 
   assert.equal(results.length, transforms.length);
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
index 2a5dcd59614..3adf8585c48 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
@@ -24,6 +24,7 @@ import {
   EditOperation,
   EditTuple,
   ExtendedEditOperation,
+  generateSubsetId,
   models,
   PendingTokenization,
   SearchQuotientSpur,
@@ -54,7 +55,8 @@ function toTransformToken(text: string, transformId?: number) {
       trueTransform: textAsTransform,
       transitionId: textAsTransform.id,
       start: 0
-    }, bestProbFromSet: 1
+    }, bestProbFromSet: 1,
+    subsetId: generateSubsetId()
   }, [ { sample: textAsTransform, p: 1 } ]);
   token.isWhitespace = isWhitespace;
   return token;
@@ -120,7 +122,8 @@ describe('ContextTokenization', function() {
         const map = new Map();
         map.set(0, emptyTransform);
         return map;
-      })(), p: 1}]
+      })(), p: 1}],
+      inputSubsetId: generateSubsetId()
     };
 
     let tokenization = new ContextTokenization(tokens, transitionEdits, null /* dummy val */);
@@ -152,7 +155,8 @@ describe('ContextTokenization', function() {
         const map = new Map();
         map.set(0, emptyTransform);
         return map;
-      })(), p: 1}]
+      })(), p: 1}],
+      inputSubsetId: generateSubsetId()
     };
 
     let baseTokenization = new ContextTokenization(tokens, transitionEdits, null /* dummy val */);
@@ -206,7 +210,8 @@ describe('ContextTokenization', function() {
         },
         removedTokenCount: 0
       },
-      inputs: [{ sample: inputTransformMap, p: 1 }]
+      inputs: [{ sample: inputTransformMap, p: 1 }],
+      inputSubsetId: generateSubsetId()
     },
     plainModel,
     inputTransform,
@@ -259,7 +264,8 @@ describe('ContextTokenization', function() {
         },
         removedTokenCount: 2
       },
-      inputs: [{ sample: inputTransformMap, p: 1 }]
+      inputs: [{ sample: inputTransformMap, p: 1 }],
+      inputSubsetId: generateSubsetId()
     },
     plainModel,
     inputTransform,
@@ -296,7 +302,8 @@ describe('ContextTokenization', function() {
         },
         removedTokenCount: 0
       },
-      inputs: [{ sample: inputTransformMap, p: 1 }]
+      inputs: [{ sample: inputTransformMap, p: 1 }],
+      inputSubsetId: generateSubsetId()
     },
     plainModel,
     inputTransform,
@@ -342,7 +349,8 @@
         },
         removedTokenCount: 0
       },
-      inputs: [{ sample: inputTransformMap, p: 1 }]
+      inputs: [{ sample: inputTransformMap, p: 1 }],
+      inputSubsetId: generateSubsetId()
     },
     plainModel,
     inputTransform,
@@ -398,7 +406,8 @@
         },
         removedTokenCount: 0
      },
-      inputs: [{ sample: inputTransformMap, p: 1 }]
+      inputs: [{ sample: inputTransformMap, p: 1 }],
+      inputSubsetId: generateSubsetId()
     },
     plainModel,
     inputTransform,
@@ -437,6 +446,7 @@
     inputTransformMap.set( 0, { insert: 'day', deleteLeft: 6, id: 42 });
 
     const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec);
+    const subsetId = generateSubsetId();
     const tokenization = baseTokenization.evaluateTransition({
       alignment: {
         merges: [],
@@ -449,7 +459,8 @@
         },
         removedTokenCount: 0
       },
-      inputs: [{ sample: inputTransformMap, p: 1 }]
+      inputs: [{ sample: inputTransformMap, p: 1 }],
+      inputSubsetId: subsetId
     },
     plainModel,
     inputTransform,
@@ -469,7 +480,8 @@
        trueTransform: inputTransform,
        transitionId: inputTransform.id,
        start: 0
-      }, bestProbFromSet: 1
+      }, bestProbFromSet: 1,
+      subsetId
     });
 
     // The new tail tokens should not include anything from the original tail;
@@ -479,7 +491,8 @@
       trueTransform: inputTransform,
       transitionId: inputTransform.id,
      start: 0
-    }, bestProbFromSet: 1
+    }, bestProbFromSet: 1,
+      subsetId
     }]);
     assert.deepEqual(tokenization.tokens[tokenization.tokens.length-1].inputSegments, [{
       segment: {
         trueTransform: inputTransform,
         transitionId: inputTransform.id,
         start: 1
       },
-      bestProbFromSet: 1
+      bestProbFromSet: 1,
+      subsetId
     }]);
 
     const tailIndex = tokenization.tokens.length - 1;
@@ -530,7 +544,8 @@
        },
        removedTokenCount: 0
      },
-      inputs: [{ sample: inputTransformMap, p: 1 }]
+      inputs: [{ sample: inputTransformMap, p: 1 }],
+      inputSubsetId: generateSubsetId()
     },
     plainModel,
     { insert: ' ', deleteLeft: 0 },
@@ -593,7 +608,8 @@
        },
        removedTokenCount: 0
      },
-      inputs: [{ sample: inputTransformMap, p: 1 }]
+      inputs: [{ sample: inputTransformMap, p: 1 }],
+      inputSubsetId: generateSubsetId()
     },
     plainModel,
     { insert: 't', deleteLeft: 0 },
@@ -659,7 +675,8 @@
        },
        removedTokenCount: 0
      },
-      inputs: [{ sample: inputTransformMap, p: 1 }]
+      inputs: [{ sample: inputTransformMap, p: 1 }],
+      inputSubsetId: generateSubsetId()
     },
     plainModel,
     inputTransform,
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts
index 880ad7782f3..073c780744e 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts
@@ -20,6 +20,7 @@ import {
   buildEdgeWindow,
   ContextToken,
   ContextTokenization,
+  generateSubsetId,
   models,
   precomputationSubsetKeyer,
   TokenizationTransitionEdits,
@@ -188,7 +189,8 @@ describe('precomputationSubsetKeyer', function() {
           id: 13
         },
         transitionId: 13,
         start: 0
-      }, bestProbFromSet: 1
+      }, bestProbFromSet: 1,
+      subsetId: generateSubsetId()
     }, [ {sample: {insert: 'te', deleteLeft: 0, id: 13}, p: 1} ]);
@@ -223,7 +225,8 @@ describe('precomputationSubsetKeyer', function() {
           id: 13
         },
         transitionId: 13,
         start: 0
-      }, bestProbFromSet: 1
+      }, bestProbFromSet: 1,
+      subsetId: generateSubsetId()
     }, [ {sample: {insert: 't', deleteLeft: 0}, p: 1} ]);
@@ -272,7 +275,8 @@ describe('precomputationSubsetKeyer', function() {
           id: 13
         },
         transitionId: 13,
         start: 0
-      }, bestProbFromSet: 1
+      }, bestProbFromSet: 1,
+      subsetId: generateSubsetId()
     }, [{sample: {insert: 'ts', deleteLeft: 0, id: 13}, p: 1} ]);
     return token;
@@ -307,7 +311,8 @@ describe('precomputationSubsetKeyer', function() {
           id: 13
         },
         transitionId: 13,
         start: 0
-      }, bestProbFromSet: 1
+      }, bestProbFromSet: 1,
+      subsetId: generateSubsetId()
     }, [ {sample: {insert: 't', deleteLeft: 0, id: 13}, p: 1} ]);
@@ -767,7 +772,8 @@ describe('TokenizationSubsetBuilder', function() {
           id: 13
         },
         transitionId: 13,
         start: 0
-      }, bestProbFromSet: 1
+      }, bestProbFromSet: 1,
+      subsetId: generateSubsetId()
     }, [ { sample: trueSourceTransform, p: .6 } ]);
@@ -781,7 +787,8 @@ describe('TokenizationSubsetBuilder', function() {
           id: 13
         },
         transitionId: 13,
         start: 0
-      }, bestProbFromSet: 1
+      }, bestProbFromSet: 1,
+      subsetId: generateSubsetId()
     }, [ { sample: { insert: 's', deleteLeft: 0, id: 13 }, p: .4 } ]);
@@ -822,7 +829,8 @@ describe('TokenizationSubsetBuilder', function() {
           id: 13
        },
        transitionId: 13,
        start: 0
-      }, bestProbFromSet: .6
+      }, bestProbFromSet: .6,
+      subsetId: generateSubsetId()
     }, [ { sample: trueSourceTransform, p: .6 } ]);
@@ -835,7 +843,8 @@ describe('TokenizationSubsetBuilder', function() {
          deleteLeft: 1
        },
        transitionId: 13,
        start: 0
-      }, bestProbFromSet: .6
+      }, bestProbFromSet: .6,
+      subsetId: generateSubsetId()
     }, [ { sample: { insert: 'a', deleteLeft: 0, id: 13}, p: .4 } ]);
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts
index 5554e05e1ed..837d5e0cae3 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts
@@ -83,7 +83,9 @@ describe('SearchQuotientSpur', () => {
         transitionId: leadEdgeDistribution[0].sample.id,
         start: 0
       },
-      bestProbFromSet: leadEdgeDistribution[0].p
+      bestProbFromSet: leadEdgeDistribution[0].p,
+      // Just write in the variable-value entry; the rest should match perfectly.
+      subsetId: extendedPath.inputSegments[0].subsetId
     }
   ]);
@@ -138,14 +140,18 @@
         transitionId: leadEdgeDistribution[0].sample.id,
         start: 0
       },
-      bestProbFromSet: leadEdgeDistribution[0].p
+      bestProbFromSet: leadEdgeDistribution[0].p,
+      // Just write in the variable-value entry; the rest should match perfectly.
+      subsetId: length2Path.inputSegments[0].subsetId
     }, {
       segment: {
         trueTransform: tailEdgeDistribution[0].sample,
         transitionId: tailEdgeDistribution[0].sample.id,
         start: 0
       },
-      bestProbFromSet: tailEdgeDistribution[0].p
+      bestProbFromSet: tailEdgeDistribution[0].p,
+      // Just write in the variable-value entry; the rest should match perfectly.
+      subsetId: length2Path.inputSegments[1].subsetId
     }
   ]);
@@ -216,14 +222,18 @@
         transitionId: leadEdgeDistribution[0].sample.id,
         start: 0
       },
-      bestProbFromSet: leadEdgeDistribution[0].p
+      bestProbFromSet: leadEdgeDistribution[0].p,
+      // Just write in the variable-value entry; the rest should match perfectly.
+      subsetId: length2Path.inputSegments[0].subsetId
     }, {
      segment: {
        trueTransform: tailEdgeDistribution[0].sample,
        transitionId: tailEdgeDistribution[0].sample.id,
        start: 0
      },
-      bestProbFromSet: tailEdgeDistribution[0].p
+      bestProbFromSet: tailEdgeDistribution[0].p,
+      // Just write in the variable-value entry; the rest should match perfectly.
+      subsetId: length2Path.inputSegments[1].subsetId
     }
   ]);