Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import { SearchQuotientNode, PathInputProperties } from "./search-quotient-node.
import { TokenSplitMap } from "./context-tokenization.js";
import { LegacyQuotientSpur } from "./legacy-quotient-spur.js";
import { LegacyQuotientRoot } from "./legacy-quotient-root.js";
import { generateSubsetId } from './tokenization-subsets.js';

import Distribution = LexicalModelTypes.Distribution;
import LexicalModel = LexicalModelTypes.LexicalModel;
Expand Down Expand Up @@ -113,7 +114,8 @@ export class ContextToken {
start: 0,
transitionId: undefined
},
bestProbFromSet: BASE_PROBABILITY
bestProbFromSet: BASE_PROBABILITY,
subsetId: generateSubsetId()
};
searchModule = new LegacyQuotientSpur(searchModule, [{sample: transform, p: BASE_PROBABILITY}], inputMetadata);
});
Expand Down Expand Up @@ -313,11 +315,11 @@ export class ContextToken {
constructingToken = new ContextToken(lexicalModel);
backupToken = new ContextToken(constructingToken);
constructingToken.addInput({
...priorSourceInput,
segment: {
...priorSourceInput.segment,
start: priorSourceInput.segment.start + extraCharsAdded
},
bestProbFromSet: priorSourceInput.bestProbFromSet
}
}, tailDistribution);

const lenToCommit = lenBeforeLastApply + extraCharsAdded;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,8 @@ export class ContextTokenization {
transitionId: sourceInput.id,
start: appliedLength
},
bestProbFromSet: bestProbFromSet
bestProbFromSet: bestProbFromSet,
subsetId: pendingTokenization.inputSubsetId
}, distribution);
appliedLength += KMWString.length(distribution[0].sample.insert);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,17 @@ export interface PathInputProperties {
* input is included within the SearchSpace's correction space.
*/
bestProbFromSet: number;

/**
* A unique identifier noting membership in a specific set of input possibilities with
* sufficiently similar properties that all correspond to the same "input segment".
*
* This tends to serve as an identifying factor for tokenized input distributions,
* indicating the distributions were all sourced from the same original input event.
*
* @see PendingTokenization.inputSubsetId
*/
subsetId: number;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import { LexicalModelTypes } from '@keymanapp/common-types';

import { EDIT_DISTANCE_COST_SCALE, SearchNode, SearchResult } from './distance-modeler.js';
import { generateSpaceSeed, PathResult, SearchQuotientNode, PathInputProperties } from './search-quotient-node.js';
import { generateSubsetId } from './tokenization-subsets.js';

import Distribution = LexicalModelTypes.Distribution;
import ProbabilityMass = LexicalModelTypes.ProbabilityMass;
Expand Down Expand Up @@ -74,7 +75,8 @@ export abstract class SearchQuotientSpur implements SearchQuotientNode {
transitionId: keystroke.sample.id,
start: 0
},
bestProbFromSet: keystroke.p
bestProbFromSet: keystroke.p,
subsetId: generateSubsetId()
}
};
const inputSrc = inputSource as PathInputProperties;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,38 @@ import { ContextTokenization, TokenizationEdgeAlignment, TokenizationTransitionE
import Distribution = LexicalModelTypes.Distribution;
import Transform = LexicalModelTypes.Transform;

/** Monotonically-increasing seed backing `generateSubsetId`; module-private. */
let SUBSET_ID_SEED = 0;

/**
 * Generates a unique numeric ID denoting membership in a specific subset of
 * input possibilities. Each call returns the next value in a monotonically
 * increasing sequence, so no two calls during the module's lifetime ever
 * return the same ID.
 *
 * @returns A unique, monotonically increasing subset identifier.
 */
export function generateSubsetId(): number {
  return SUBSET_ID_SEED++;
}

/**
 * Accumulates a set of compatible incoming inputs that all resolve to the same
 * final tokenization, together with the shared alignment data and the unique
 * subset ID identifying those inputs as members of the same input subset.
 */
export interface PendingTokenization {
  /**
   * The edge window corresponding to the common tokenization for the subset's inputs.
   */
  alignment: TokenizationEdgeAlignment,

  /**
   * A set of incoming keystrokes with compatible effects when applied.
   *
   * If passed to `subsetByInterval`, the transforms should result in a single subset.
   */
  inputs: Distribution<Map<number, Transform>>

  /**
   * A unique identifier associated with this PendingTokenization and its
   * transforms within `SearchSpace`s. This ID assists with detecting when
   * split transforms are re-merged during SearchSpace merges. Only
   * input-sources with matching subset ID come from the same subset, and thus
   * only they should be candidates for re-merging a previous split.
   *
   * The subset ID does not necessarily match the transition ID; in fact, there
   * may be a one-to-many relationship between transition ID and
   * `inputSubsetId`. Note that the original transition ID may be found within
   * each `Transform` value entry found within the `.inputs` map if desired.
   */
  inputSubsetId: number;
}

/**
Expand Down Expand Up @@ -134,7 +155,7 @@ export function precomputationSubsetKeyer(tokenizationEdits: TokenizationTransit
// in this tokenization, but that doesn't matter here - we want to imply the
// represented keystroke range.
const boundaryEdgeIndex = editBoundary.tokenIndex - edgeWindow.sliceIndex;
const boundaryComponent = `B${editBoundary.tokenIndex}=${editBoundary.sourceRangeKey}`;
const boundaryComponent = `B${editBoundary.tokenIndex}=${editBoundary.sourceRangeKey}`; // source range is part of it

components.push(boundaryComponent);

Expand All @@ -155,7 +176,7 @@ export function precomputationSubsetKeyer(tokenizationEdits: TokenizationTransit
for(const {0: relativeIndex, 1: transform} of tokenizedTransform.entries()) {
const insertLen = KMWString.length(transform.insert);
if(relativeIndex > 0) {
// The true boundary lie before the insert if the value is non-zero;
// The true boundary lies before the insert if the value is non-zero;
// don't differentiate here!
boundaryTextLen = 0;
}
Expand Down Expand Up @@ -188,16 +209,29 @@ export class TokenizationSubsetBuilder {
const key = this.keyer(precomputation);

// Should file the object and its transform data appropriately.
//
// Maps any number of Tokenizations and their incoming alignment data to a common key
// for final tokenization forms.
const entry: TokenizationSubset = this._subsets.get(key) ?? {
pendingSet: new Map(),
key: key
}
const forTokenization = entry.pendingSet.get(tokenization) ?? {

// Finds any previously-accumulated data corresponding to both the incoming and
// target final tokenization form, creating an empty entry if none yet exists.
const forTokenization: PendingTokenization = entry.pendingSet.get(tokenization) ?? {
alignment: precomputation.alignment,
inputs: []
inputs: [],
inputSubsetId: generateSubsetId()
};

// Adds the incoming tokenized transform data for the pairing...
forTokenization.inputs.push({sample: precomputation.tokenizedTransform, p});
// and ensures that the pairing's data-accumulator is in the map.
entry.pendingSet.set(tokenization, forTokenization);

// Also ensures that the target tokenization's data (accumulating the pairings)
// is made available within the top-level map.
this._subsets.set(key, entry);
}

Expand Down
Loading