Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 67 additions & 8 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ let config = {
]
};

/// stringify is used to save the index
const stringify = function(arr) {
let output = '[';
let separator = '';
Expand Down Expand Up @@ -55,6 +56,7 @@ const stringify = function(arr) {
return output + ']';
}

/// opposite of stringify, used when calling the Wade function using a string (which was created via stringify())
const parse = function(str) {
let arr = [];
let stack = [arr];
Expand Down Expand Up @@ -115,6 +117,7 @@ const getTerms = function(entry) {
return terms;
}

// preprocess a string before it is split into terms
const processEntry = function(entry) {
if(entry.length === 0) {
return entry;
Expand All @@ -129,6 +132,16 @@ const processEntry = function(entry) {
}
}

// used during search to add the information found in data to the results
// parameters:
// results, an array where the results are added to in the following form: {index, score}
// resultIndexes, maps documentIDs to their index in the results-array
// increment, a value that describes how much weight the current search-term has on the overall score
// data, a node-value from the index, which is an Array of the following form:
// [termOffset, relevance, documentId+]
// termOffset is used by the Trie structure and irrelevant for this step
// relevance is the relevance of the term
// documentId+ are one or more document ids. the same documentIds may appear multiple times for one term, thus increasing the score of this document for the given term
const update = function(results, resultIndexes, increment, data) {
const relevance = data[1];
for(let i = 2; i < data.length; i++) {
Expand All @@ -148,6 +161,10 @@ const update = function(results, resultIndexes, increment, data) {
}

const Wade = function(data) {
// search returns an Array of Objects with the keys "index" and "score"
// the results are NOT sorted
// prior to presenting the results, they should be sorted by score
// "index" is the documentID that was found
const search = function(query) {
const index = search.index;
const processed = processEntry(query);
Expand All @@ -162,6 +179,7 @@ const Wade = function(data) {
const exactTermsLength = termsLength - 1;
const increment = 1 / termsLength;

// process all but last terms in an exact-match fashion
exactOuter: for(let i = 0; i < exactTermsLength; i++) {
const term = terms[i];
const termLength = term.length - 1;
Expand All @@ -184,6 +202,7 @@ const Wade = function(data) {
}
}

// process the last term in a fuzzy fashion to allow for prefix-search during typing
const lastTerm = terms[exactTermsLength];
const lastTermLength = lastTerm.length - 1;
let node = index;
Expand All @@ -198,7 +217,8 @@ const Wade = function(data) {

node = node[lastTermIndex];
}


// once the last term's node is found, add all its children to the results
if(node !== undefined) {
let nodes = [node];
for(let i = 0; i < nodes.length; i++) {
Expand Down Expand Up @@ -233,13 +253,33 @@ const Wade = function(data) {

Wade.index = function(data) {
let dataLength = 0;

// ranges is used to fill the Trie object
// the Trie is like a tree of Dictionaries, where each key is a character in a string
// Instead of Dictionaries, Wade stores the Trie in a hierarchy of Arrays
// in order to be able to access the Array using [0], [1], etc, each Node stores the offset of the first character
// Character "a" has codepoint 97, so if the offset is -96 (1-97) then the first stored Character can be "a"
// if the range goes from g-h then the node's Array can be of size 2, the offset can be -102 and both "g" and "h" can be accessed
// as not all nodes in the Trie store subsequent characters, the Arrays are still typically densely populated
// ranges is a hierarchy of objects that store the keys .minimum and .maximum
let ranges = {};

// processed is a long array that contains information for each term.
// it is NOT an array of tuples, instead each term is stored in 3 successive indices with index
// i = documentID,
// i+1 = number of terms in document,
// i+2 = Array of chars of term (as Array of Integers)
// the array is used only in an initial step to process all terms in all documents
let processed = [];

for(let i = 0; i < data.length; i++) {
// entry is a preprocessed version of the input document (i.e. normalized in some way)
const entry = processEntry(data[i]);

if(entry.length !== 0) {
// Terms are the input document split into terms.
// Terms are in order of appearance and added as many times as they appear.
// Adding terms multiple times is done to adjust the relevance of this term during search
const terms = getTerms(entry);
const termsLength = terms.length;

Expand Down Expand Up @@ -290,17 +330,18 @@ Wade.index = function(data) {
processedTerm.push(lowByte);
}

processed.push(i);
processed.push(termsLength);
processed.push(processedTerm);
processed.push(i); // document ID
processed.push(termsLength); // number of terms in document
processed.push(processedTerm); // term's characters (as array of numbers)
}
}

dataLength++;
}
// now all terms are extracted from all documents

const indexMinimum = ranges.minimum;
const indexMaximum = ranges.maximum;
const indexMinimum = ranges.minimum; // lowest character value
const indexMaximum = ranges.maximum; // highest character value
let indexSize = 1;
let indexOffset;

Expand All @@ -309,10 +350,23 @@ Wade.index = function(data) {
indexOffset = 1 - indexMinimum;
}

// nodeDataSets is an Array of all node-values in the Trie
// the array is used to perform a final update on the weights
// node-values have the following form: [termOffset, 1/numberOfTermsInDocument, documentId]
// if more than one document-Id is stored, the content changes to:
// [termOffset, sum(1/numberOfTermsInDocument), documentId1, documentId2, ...]
// the node values are stored at index 0 in the Trie's nodes
let nodeDataSets = [];
// index is a Trie, it's n items wide, depending on the range-width of the current character
// using precomputed termRanges allows for directly allocating the Arrays in the correct size
// if the term-ranges are fully used, the Array is not sparsely populated; it's indexed from 0 to n
// if the term-ranges are NOT fully used, the Array has some unpopulated slots
// each node looks like this: [ node-value, subNodes]
// the node-value stores the start-index of the current term-ranges so that the Array may be accessed from indices 0-n instead of n-m. As the Arrays may be sparsely filled, this isn't always the case, but it can be the case
let index = new Array(indexSize);
index[0] = [indexOffset];

// iterate in 3-item steps
for(let i = 0; i < processed.length; i += 3) {
const dataIndex = processed[i];
const termsLength = processed[i + 1];
Expand All @@ -321,13 +375,15 @@ Wade.index = function(data) {
let node = index;
let termRanges = ranges;

// iterate through Trie, possibly creating new Nodes, until second-to-last character
for(let j = 0; j < processedTermLength; j++) {
const char = processedTerm[j];
const charIndex = char + node[0][0];
let termNode = node[charIndex];
let termNode = node[charIndex]; // get existing sub-Trie-node or nil
termRanges = termRanges[char];

if(termNode === undefined) {
// allocate new Trie node
const termMinimum = termRanges.minimum;
const termMaximum = termRanges.maximum;
termNode = node[charIndex] = new Array(termMaximum - termMinimum + 2);
Expand All @@ -336,7 +392,8 @@ Wade.index = function(data) {

node = termNode;
}


// now process the last character, inserting the Term's data into the node
const lastChar = processedTerm[processedTermLength];
const lastCharIndex = lastChar + node[0][0]
let lastTermNode = node[lastCharIndex];
Expand Down Expand Up @@ -369,6 +426,8 @@ Wade.index = function(data) {
}
}

// currently the node-values store the sum of 1/numberOfTermsInDocument at [1]
// update the weight with number of documents via 1.5 - (sum / numberOfDocuments)
for(let i = 0; i < nodeDataSets.length; i++) {
let nodeData = nodeDataSets[i];
nodeData[1] = 1.5 - (nodeData[1] / dataLength);
Expand Down