diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts index 54977ff1b6f..a5285f754ba 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts @@ -288,9 +288,12 @@ export class SearchNode { // // Following previous insertions is fine, as is following a proper // match/substitution. + if(this.lastEdgeType == PathEdge.DELETION) { + return []; + } if(this.priorInput.length > 0) { const priorInput = this.priorInput[this.priorInput.length - 1].sample; - if(TransformUtils.isEmpty(priorInput) || this.lastEdgeType == PathEdge.DELETION) { + if(TransformUtils.isEmpty(priorInput)) { return []; } } @@ -311,25 +314,6 @@ export class SearchNode { return edges; } - /** - * This method is used while stepping through intermediate deletion 'edges' - * for transforms with multi-character insert strings. - * @returns - */ - private processDeletionSubset(): SearchNode { - // We're on easy street: all transforms are already essentially merged into - // a single mass here. - let calculation = this.calculation.addInputChar(SENTINEL_CODE_UNIT); - - // If on a 'delete' style path, we just build out the paths into a single - // merged path. - const node = new SearchNode(this); - node.calculation = calculation; - // no update to the match string or traversal to be found here... - node.partialEdge.subsetSubindex++; - return node; - } - /** * Finalizes the results of search nodes that represent the last step for * processing multi-character insert transforms. @@ -378,10 +362,6 @@ export class SearchNode { return [this.tryFinalize()]; } - if(!partialEdge.doSubsetMatching) { - return [this.processDeletionSubset().tryFinalize()]; - } - // After this, it's all substitution / matching. const traversal = this.currentTraversal; let nodesToReturn: SearchNode[] = []; @@ -472,20 +452,20 @@ export class SearchNode { } /** - * Called by `buildDeletionEdges` and `buildSubstitutionEdges` to construct - * intermediate TransformSubset-based nodes that extend the search path one - * step into the incoming input transforms in an efficiently-batched manner. + * Called by `buildSubstitutionEdges` to construct intermediate + * TransformSubset-based nodes that extend the search path one step into the + * incoming input transforms in an efficiently-batched manner. * * When an incoming character cannot match the next character for the node's - * represented lexicon prefix - be it due to not adding one (deletions) or - * due to not being the same character, all mismatching cases are merged into - * one, reducing the rate of expansion for the search graph. + * represented lexicon prefix - be it due to not adding one (deletions) or due + * to not being the same character, all mismatching cases are merged into one, + * reducing the rate of expansion for the search graph. * @param dist * @param isSubstitution * @param edgeId * @returns */ - private setupSubsetProcessing(dist: Distribution, isSubstitution: boolean, edgeId: number) { + private setupSubsetProcessing(dist: Distribution, edgeId: number) { if(this.hasPartialInput) { throw new Error("Invalid state: will not take new input while still processing Transform subset"); } @@ -509,16 +489,13 @@ export class SearchNode { continue; } - const node = new SearchNode(this, edgeId, isSubstitution ? PathEdge.SUBSTITUTION : PathEdge.DELETION); + const node = new SearchNode(this, edgeId, PathEdge.SUBSTITUTION); node.calculation = edgeCalc; node.partialEdge = { - doSubsetMatching: isSubstitution, + doSubsetMatching: true, subsetSubindex: 0, - transformSubset: isSubstitution ? insSubset : { - ...insSubset, - entries: [ { sample: { insert: SENTINEL_CODE_UNIT.repeat(ins), deleteLeft: dl }, p: insSubset.cumulativeMass}] - } - } + transformSubset: insSubset + }; // Get the traversal at the new end location. (Root is always at index 0.) node.matchedTraversals = this.matchedTraversals.slice(0, newMatchLength+1); @@ -539,7 +516,20 @@ export class SearchNode { * input keystroke. */ buildDeletionEdges(dist: Distribution, edgeId: number): SearchNode[] { - return this.setupSubsetProcessing(dist, false, edgeId); + const deletedSample = { + sample: { + insert: SENTINEL_CODE_UNIT, + deleteLeft: 0 + }, + p: dist.reduce((accum, curr) => curr.p + accum, 0) + }; + + const node = new SearchNode(this, edgeId, PathEdge.DELETION); + node.calculation = this.calculation.addInputChar(SENTINEL_CODE_UNIT); + // Mark that we've "processed" the input distribution, even if just by deleting it. + node.priorInput.push(deletedSample); + + return [node]; } /** @@ -556,7 +546,7 @@ export class SearchNode { // substitutions are _not_ adequately represented by one 'insertion' + one // 'deletion' step. Explicit substitution / match-oriented processing is // required. - return this.setupSubsetProcessing(dist, true, edgeId); + return this.setupSubsetProcessing(dist, edgeId); } /** diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts index ee37a0f7938..d87bd2146bc 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts @@ -164,11 +164,9 @@ export class SearchQuotientSpur implements SearchQuotientNode { let deletions = node.buildDeletionEdges(input, childSpace.spaceId); let substitutions = node.buildSubstitutionEdges(input, childSpace.spaceId); - const batch = deletions.concat(substitutions); - // Skip the queue for the first pass; there will ALWAYS be at least one pass, // and queue-enqueing does come with a cost. Avoid the unnecessary overhead. - return batch.flatMap(e => e.processSubsetEdge()); + return substitutions.flatMap(e => e.processSubsetEdge()).concat(deletions); }); childSpace.completedPaths = []; @@ -203,14 +201,13 @@ export class SearchQuotientSpur implements SearchQuotientNode { let deletionEdges: SearchNode[] = []; if(!substitutionsOnly) { - deletionEdges = currentNode.buildDeletionEdges(this.inputs, this.spaceId); + deletionEdges = currentNode.buildDeletionEdges(this.inputs, this.spaceId); } const substitutionEdges = currentNode.buildSubstitutionEdges(this.inputs, this.spaceId); - let batch = deletionEdges.concat(substitutionEdges); // Skip the queue for the first pass; there will ALWAYS be at least one pass, // and queue-enqueing does come with a cost - avoid unnecessary overhead here. - batch = batch.flatMap(e => e.processSubsetEdge()); + const batch = substitutionEdges.flatMap(e => e.processSubsetEdge()).concat(deletionEdges); this.selectionQueue.enqueueAll(batch); // We didn't reach an end-node, so we just end the iteration and continue the search. diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts index a3a7c8db016..5335e3fe9d1 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts @@ -408,7 +408,7 @@ describe('Correction Distance Modeler', () => { const edges = rootNode.buildDeletionEdges([{ sample: { insert: 'd', deleteLeft: 0 }, p: 1 - }], SEARCH_EDGE_SEED++).flatMap(n => n.processSubsetEdge()); + }], SEARCH_EDGE_SEED++); assert.isAbove(edges.length, 0); const firstChild = edges[0]; @@ -467,7 +467,7 @@ describe('Correction Distance Modeler', () => { }); }); - describe('buildDeletionEdges @ token root', () => { + it('buildDeletionEdges @ token root: makes single mass deletion batch', () => { const synthDistribution = [ // Transform, probability // insert 1, deleteLeft 0, p: 0.50 @@ -483,113 +483,28 @@ describe('Correction Distance Modeler', () => { {sample: {insert: 'tr', deleteLeft: 1}, p: 0.15}, // 4 distinct subsets. ]; + const probSum = synthDistribution.reduce((accum, cur) => accum + cur.p, 0); - it('step 1: batches deletion edge(s) for input transforms', () => { - const rootTraversal = testModel.traverseFromRoot(); - assert.isNotEmpty(rootTraversal); - - const rootSeed = SEARCH_EDGE_SEED++; - const rootNode = new correction.SearchNode(rootTraversal, rootSeed); - assert.equal(rootNode.calculation.getHeuristicFinalCost(), 0); - - const subsetSeed = SEARCH_EDGE_SEED++; - const subsetNodes = rootNode.buildDeletionEdges(synthDistribution, subsetSeed); - assert.equal(subsetNodes.length, 4); - subsetNodes.sort((a, b) => a.currentCost - b.currentCost); - const expectedCosts = [0.5, .25, 0.15, 0.1].map(x => -Math.log(x)) - // The known subs for the subsets defined above. - for(let i=0; i < expectedCosts.length; i++) { - assert.isTrue(subsetNodes[i].hasPartialInput); - // From root: the deleteLeft 1 entries have nothing to delete. - assert.equal((subsetNodes[i].currentTraversal as TrieTraversal).prefix, ''); - assert.equal(subsetNodes[i].spaceId, subsetSeed); - - // Allow a little value wiggle due to double-precision limitations. - assert.approximately(subsetNodes[i].inputSamplingCost, expectedCosts[i], 1e-8); - // No actual edit-tracking is done yet, so these should also match. - assert.approximately(subsetNodes[i].currentCost, expectedCosts[i], 1e-8); - } - }); - - it('step 2: first processing layer resolves zero + one char inserts', () => { - // From "step 1" above, assertions removed - const rootTraversal = testModel.traverseFromRoot(); - const rootSeed = SEARCH_EDGE_SEED++; - const rootNode = new correction.SearchNode(rootTraversal, rootSeed); - const subsetSeed = SEARCH_EDGE_SEED++; - const subsetNodes = rootNode.buildDeletionEdges(synthDistribution, subsetSeed); - subsetNodes.sort((a, b) => a.currentCost - b.currentCost); - - const processedNodes = subsetNodes.flatMap(n => n.processSubsetEdge()); - // All delete-oriented Transform subsets condense down to a single - // transform (and edge) each. - assert.equal(processedNodes.length, 4); - processedNodes.forEach((n) => assert.equal(n.spaceId, subsetSeed)); - - // Sorted index 0: 1 insert - should be processed already, in a single - // step. - assert.isFalse(processedNodes[0].hasPartialInput); - assert.equal(processedNodes[0].editCount, 1); - assert.equal(lastEntry(processedNodes[0].calculation.inputSequence), SENTINEL_CODE_UNIT); - assert.equal(processedNodes[0].inputSamplingCost, subsetNodes[0].inputSamplingCost); - assert.isAbove(processedNodes[0].currentCost, subsetNodes[0].currentCost); - - // Sorted index 3: 0 insert - should be processed already, in a single - // step. - assert.isFalse(processedNodes[3].hasPartialInput); - // No insert string => no sentinel char to delete. - assert.equal(processedNodes[3].editCount, 0); - assert.isUndefined(lastEntry(processedNodes[3].calculation.inputSequence)); - assert.equal(processedNodes[3].inputSamplingCost, subsetNodes[3].inputSamplingCost); - assert.equal(processedNodes[3].currentCost, subsetNodes[3].currentCost); - - // Sorted indices 2, 3: both had 2 inserts. - assert.isTrue(processedNodes[1].hasPartialInput); - assert.equal(processedNodes[1].editCount, 1); - assert.equal(lastEntry(processedNodes[1].calculation.inputSequence), SENTINEL_CODE_UNIT); - assert.equal(processedNodes[1].inputSamplingCost, subsetNodes[1].inputSamplingCost); - assert.isAbove(processedNodes[1].currentCost, subsetNodes[1].currentCost); - - assert.isTrue(processedNodes[2].hasPartialInput); - assert.equal(processedNodes[2].editCount, 1); - assert.equal(lastEntry(processedNodes[2].calculation.inputSequence), SENTINEL_CODE_UNIT); - assert.equal(processedNodes[2].inputSamplingCost, subsetNodes[2].inputSamplingCost); - assert.isAbove(processedNodes[2].currentCost, subsetNodes[2].currentCost); - }); + const rootTraversal = testModel.traverseFromRoot(); + assert.isNotEmpty(rootTraversal); - it('step 3: second processing layer resolves two char inserts', () => { - // From "steps 0, 1" above, assertions removed - const rootTraversal = testModel.traverseFromRoot(); - const rootSeed = SEARCH_EDGE_SEED++; - const rootNode = new correction.SearchNode(rootTraversal, rootSeed); - const subsetSeed = SEARCH_EDGE_SEED++; - const subsetNodes = rootNode.buildDeletionEdges(synthDistribution, subsetSeed); - subsetNodes.sort((a, b) => a.currentCost - b.currentCost); + const rootSeed = SEARCH_EDGE_SEED++; + const rootNode = new correction.SearchNode(rootTraversal, rootSeed); + assert.equal(rootNode.calculation.getHeuristicFinalCost(), 0); - // Two nodes were unprocessed at the end of the last step; we handle - // them now and filter the others out. We need the indices here to be - // aligned. - const step2Nodes = subsetNodes.flatMap(n => n.processSubsetEdge()).filter(n => n.hasPartialInput); - const processedNodes = step2Nodes.flatMap(n => n.processSubsetEdge()); - - // All delete-oriented Transform subsets condense down to a single - // transform (and edge) each. - assert.equal(processedNodes.length, 2); - // All nodes are now done processing. - processedNodes.forEach((processedNode, index) => { - assert.isFalse(processedNode.hasPartialInput); - assert.isFalse(processedNode.hasPartialInput); - assert.equal(processedNode.editCount, 2); - assert.equal(processedNode.spaceId, subsetSeed); - assert.equal(processedNode.calculation.inputSequence.length, 2); - assert.equal(lastEntry(processedNode.calculation.inputSequence), SENTINEL_CODE_UNIT); - assert.equal(processedNode.inputSamplingCost, step2Nodes[index].inputSamplingCost); - assert.isAbove(processedNode.currentCost, step2Nodes[index].currentCost); - }); - }); + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = rootNode.buildDeletionEdges(synthDistribution, subsetSeed); + assert.equal(subsetNodes.length, 1); + assert.isFalse(subsetNodes[0].hasPartialInput); + // From root: deleted inputs don't perform their deletes OR inserts past a single + // sentinel char. + assert.equal((subsetNodes[0].currentTraversal as TrieTraversal).prefix, ''); + assert.equal(subsetNodes[0].spaceId, subsetSeed); + assert.equal(subsetNodes[0].editCount, rootNode.editCount + 1); + assert.equal(subsetNodes[0].inputSamplingCost, rootNode.inputSamplingCost - Math.log(probSum)); }); - describe(`buildDeletionEdges starting @ 'te' prefix`, () => { + it(`buildDeletionEdges @ 'te' prefix: makes single mass deletion batch`, () => { // start prefix 'te' const synthDistribution = [ @@ -607,111 +522,21 @@ describe('Correction Distance Modeler', () => { {sample: {insert: 'al', deleteLeft: 1}, p: 0.15}, // tal(k) // 4 distinct subsets. ]; - - it('step 1: batches deletion edge(s) for input transforms', () => { - const teNode = fetchCommonTENode(); - assert.equal(teNode.calculation.getHeuristicFinalCost(), 0); - - const subsetSeed = SEARCH_EDGE_SEED++; - const subsetNodes = teNode.buildDeletionEdges(synthDistribution, subsetSeed); - assert.equal(subsetNodes.length, 4); - subsetNodes.sort((a, b) => a.currentCost - b.currentCost); - - - const expectedCosts = [0.5, .25, 0.15, 0.1].map(x => -Math.log(x) + teNode.currentCost); - // The known subs for the subsets defined above. - for(let i=0; i < expectedCosts.length; i++) { - assert.isTrue(subsetNodes[i].hasPartialInput); - // From a 'te' prefix, the deleteLeft 1 entries do have something to delete. - assert.equal((subsetNodes[i].currentTraversal as TrieTraversal).prefix, (i == 0 || i == 2) ? 'te' : 't'); - assert.notEqual(subsetNodes[i].spaceId, teNode.spaceId); - assert.equal(subsetNodes[i].spaceId, subsetSeed); - - // Allow a little value wiggle due to double-precision limitations. - assert.approximately(subsetNodes[i].inputSamplingCost, expectedCosts[i], 1e-8); - // No actual edit-tracking is done yet, so these should also match. - assert.approximately(subsetNodes[i].currentCost, expectedCosts[i], 1e-8); - } - }); - - it('step 2: first processing layer resolves zero + one char inserts', () => { - // From "step 1" above, assertions removed - const teNode = fetchCommonTENode(); - const subsetSeed = SEARCH_EDGE_SEED++; - const subsetNodes = teNode.buildDeletionEdges(synthDistribution, subsetSeed); - subsetNodes.sort((a, b) => a.currentCost - b.currentCost); - - const processedNodes = subsetNodes.flatMap(n => n.processSubsetEdge()); - // All delete-oriented Transform subsets condense down to a single - // transform (and edge) each. - assert.equal(processedNodes.length, 4); - processedNodes.forEach(n => assert.notEqual(n.spaceId, teNode.spaceId)); - processedNodes.forEach(n => assert.equal(n.spaceId, subsetSeed)); - - // Sorted index 0: 1 insert - should be processed already, in a single - // step. - assert.isFalse(processedNodes[0].hasPartialInput); - assert.equal(processedNodes[0].editCount, 1); - assert.equal(lastEntry(processedNodes[0].calculation.inputSequence), SENTINEL_CODE_UNIT); - assert.equal(processedNodes[0].inputSamplingCost, subsetNodes[0].inputSamplingCost); - assert.isAbove(processedNodes[0].currentCost, subsetNodes[0].currentCost); - - // Sorted index 3: 0 insert - should be processed already, in a single - // step. - assert.isFalse(processedNodes[3].hasPartialInput); - // No insert string => no sentinel char to delete. - assert.equal(processedNodes[3].editCount, 0); - // ... and a prior character exists. - assert.equal(lastEntry(processedNodes[3].calculation.inputSequence), 't'); - assert.equal(processedNodes[3].inputSamplingCost, subsetNodes[3].inputSamplingCost); - assert.equal(processedNodes[3].currentCost, subsetNodes[3].currentCost); - - // Sorted indices 2, 3: both had 2 inserts. - assert.isTrue(processedNodes[1].hasPartialInput); - assert.equal(processedNodes[1].editCount, 1); - assert.equal(lastEntry(processedNodes[1].calculation.inputSequence), SENTINEL_CODE_UNIT); - assert.equal(processedNodes[1].inputSamplingCost, subsetNodes[1].inputSamplingCost); - assert.isAbove(processedNodes[1].currentCost, subsetNodes[1].currentCost); - - assert.isTrue(processedNodes[2].hasPartialInput); - assert.equal(processedNodes[2].editCount, 1); - assert.equal(lastEntry(processedNodes[2].calculation.inputSequence), SENTINEL_CODE_UNIT); - assert.equal(processedNodes[2].inputSamplingCost, subsetNodes[2].inputSamplingCost); - assert.isAbove(processedNodes[2].currentCost, subsetNodes[2].currentCost); - }); - - it('step 3: second processing layer resolves two char inserts', () => { - const teNode = fetchCommonTENode(); - const subsetSeed = SEARCH_EDGE_SEED++; - const subsetNodes = teNode.buildDeletionEdges(synthDistribution, subsetSeed); - subsetNodes.sort((a, b) => a.currentCost - b.currentCost); - - // Two nodes were unprocessed at the end of the last step; we handle - // them now and filter the others out. We need the indices here to be - // aligned. - const step2Nodes = subsetNodes.flatMap(n => n.processSubsetEdge()).filter(n => n.hasPartialInput); - - function assertExpectedProperties(processedNodes: SearchNode[], baseNode: SearchNode, dl: number) { - // All delete-oriented Transform subsets condense down to a single - // transform (and edge) each. - assert.equal(processedNodes.length, 1); - // All nodes are now done processing. - processedNodes.forEach((processedNode) => { - assert.isFalse(processedNode.hasPartialInput); - assert.isFalse(processedNode.hasPartialInput); - assert.equal(processedNode.editCount, 2); - assert.notEqual(processedNode.spaceId, teNode.spaceId); - assert.equal(processedNode.spaceId, subsetSeed); - assert.equal(processedNode.calculation.inputSequence.length, 4 - dl); - assert.equal(lastEntry(processedNode.calculation.inputSequence), SENTINEL_CODE_UNIT); - assert.equal(processedNode.inputSamplingCost, baseNode.inputSamplingCost); - assert.isAbove(processedNode.currentCost, baseNode.currentCost); - }); - } - - assertExpectedProperties(step2Nodes[0].processSubsetEdge(), step2Nodes[0], 1); - assertExpectedProperties(step2Nodes[1].processSubsetEdge(), step2Nodes[1], 0); - }); + const probSum = synthDistribution.reduce((accum, cur) => accum + cur.p, 0); + + const teNode = fetchCommonTENode(); + assert.equal(teNode.calculation.getHeuristicFinalCost(), 0); + + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = teNode.buildDeletionEdges(synthDistribution, subsetSeed); + assert.equal(subsetNodes.length, 1); + assert.isFalse(subsetNodes[0].hasPartialInput); + // From root: deleted inputs don't perform their deletes OR inserts past a single + // sentinel char. + assert.equal((subsetNodes[0].currentTraversal as TrieTraversal).prefix, 'te'); + assert.equal(subsetNodes[0].spaceId, subsetSeed); + assert.equal(subsetNodes[0].editCount, teNode.editCount + 1); + assert.equal(subsetNodes[0].inputSamplingCost, teNode.inputSamplingCost - Math.log(probSum)); }); describe('buildSubstitutionEdges @ token root', () => {