diff --git a/src/index.js b/src/index.js index 15d033e..563bc10 100644 --- a/src/index.js +++ b/src/index.js @@ -26,6 +26,7 @@ let config = { ] }; +/// stringify is used to save the index const stringify = function(arr) { let output = '['; let separator = ''; @@ -55,6 +56,7 @@ const stringify = function(arr) { return output + ']'; } +/// opposite of stringify, used when calling the Wade function using a string (which was created via stringify()) const parse = function(str) { let arr = []; let stack = [arr]; @@ -115,6 +117,7 @@ const getTerms = function(entry) { return terms; } +// preprocess a string before it is split into terms const processEntry = function(entry) { if(entry.length === 0) { return entry; @@ -129,6 +132,16 @@ const processEntry = function(entry) { } } +// used during search to add the information found in data to the results +// parameters: +// results, an array to which the results are added in the following form: {index, score} +// resultIndexes, maps documentIDs to their index in the results-array +// increment, a value that describes how much weight the current search-term has on the overall score +// data, a node-value from the index, which is an Array of the following form: +// [termOffset, relevance, documentId+] +// termOffset is used by the Trie structure and irrelevant for this step +// relevance is the relevance of the term +// documentId+ are one or more document ids. 
the same documentIds may appear multiple times for one term, thus increasing the score of this document for the given term const update = function(results, resultIndexes, increment, data) { const relevance = data[1]; for(let i = 2; i < data.length; i++) { @@ -148,6 +161,10 @@ const update = function(results, resultIndexes, increment, data) { } const Wade = function(data) { + // search returns an Array of Objects with the keys "index" and "score" + // the results are NOT sorted + // prior to presenting the results, they should be sorted by score + // "index" is the documentID that was found const search = function(query) { const index = search.index; const processed = processEntry(query); @@ -162,6 +179,7 @@ const Wade = function(data) { const exactTermsLength = termsLength - 1; const increment = 1 / termsLength; + // process all but last terms in an exact-match fashion exactOuter: for(let i = 0; i < exactTermsLength; i++) { const term = terms[i]; const termLength = term.length - 1; @@ -184,6 +202,7 @@ const Wade = function(data) { } } + // process the last term in a fuzzy fashion to allow for prefix-search during typing const lastTerm = terms[exactTermsLength]; const lastTermLength = lastTerm.length - 1; let node = index; @@ -198,7 +217,8 @@ const Wade = function(data) { node = node[lastTermIndex]; } - + + // once the last term's node is found, add all its children to the results if(node !== undefined) { let nodes = [node]; for(let i = 0; i < nodes.length; i++) { @@ -233,13 +253,33 @@ const Wade = function(data) { Wade.index = function(data) { let dataLength = 0; + + // ranges is used to fill the Trie object + // the Trie is like a tree of Dictionaries, where each key is a character in a string + // Instead of Dictionaries, Wade stores the Trie in a hierarchy of Arrays + // in order to be able to access the Array using [0], [1], etc, each Node stores the offset of the first character + // Character "a" has codepoint 97, so if the offset is -96 (1-97) then the first 
stored Character can be "a" + // if the range goes from g-h then the node's Array needs only 2 character slots (in addition to slot 0, which holds the node-value), the offset can be -102 and both "g" and "h" can be accessed + // although not all nodes in the Trie store subsequent characters, the Arrays are still typically densely populated + // ranges is a hierarchy of objects that store the keys .minimum and .maximum let ranges = {}; + + // processed is a long array that contains information for each term. + // it is NOT an array of tuples, instead each term is stored in 3 successive indices with index + // i = documentID, + // i+1 = number of terms in document, + // i+2 = Array of chars of term (as Array of Integers) + // the array is used only in an initial step to process all terms in all documents let processed = []; for(let i = 0; i < data.length; i++) { + // entry is a preprocessed version of the input document (i.e. normalized in some way) const entry = processEntry(data[i]); if(entry.length !== 0) { + // Terms are the input document split into terms. + // Terms are in order of appearance and added as many times as they appear. 
+ // Adding terms multiple times is done to adjust the relevance of this term during search const terms = getTerms(entry); const termsLength = terms.length; @@ -290,17 +330,18 @@ Wade.index = function(data) { processedTerm.push(lowByte); } - processed.push(i); - processed.push(termsLength); - processed.push(processedTerm); + processed.push(i); // document ID + processed.push(termsLength); // number of terms in document + processed.push(processedTerm); // term's characters (as array of numbers) } } dataLength++; } + // now all terms are extracted from all documents - const indexMinimum = ranges.minimum; - const indexMaximum = ranges.maximum; + const indexMinimum = ranges.minimum; // lowest character value + const indexMaximum = ranges.maximum; // highest character value let indexSize = 1; let indexOffset; @@ -309,10 +350,23 @@ Wade.index = function(data) { indexOffset = 1 - indexMinimum; } + // nodeDataSets is an Array of all node-values in the Trie + // the array is used to perform a final update on the weights + // node-values have the following form: [termOffset, 1/numberOfTermsInDocument, documentId] + // if more than one document ID is stored, the content changes to: + // [termOffset, sum(1/numberOfTermsInDocument), documentId1, documentId2, ...] + // the node values are stored at index 0 in the Trie's nodes let nodeDataSets = []; + // index is a Trie; it is n items wide, depending on the range-width of the current character + // using precomputed termRanges allows for directly allocating the Arrays in the correct size + // if the term-ranges are fully used, the Array is not sparsely populated; it is indexed from 0 to n + // if the term-ranges are NOT fully used, the Array has some unpopulated slots + // each node looks like this: [ node-value, subNodes] + // the node-value stores the start-index of the current term-ranges so that the Array may be accessed from indices 0-n instead of n-m. 
As the Arrays may be sparsely filled, this isn't always the case, but it can be the case let index = new Array(indexSize); index[0] = [indexOffset]; + // iterate in 3-item steps for(let i = 0; i < processed.length; i += 3) { const dataIndex = processed[i]; const termsLength = processed[i + 1]; @@ -321,13 +375,15 @@ Wade.index = function(data) { let node = index; let termRanges = ranges; + // iterate through Trie, possibly creating new Nodes, until second-to-last character for(let j = 0; j < processedTermLength; j++) { const char = processedTerm[j]; const charIndex = char + node[0][0]; - let termNode = node[charIndex]; + let termNode = node[charIndex]; // get existing sub-Trie-node or undefined termRanges = termRanges[char]; if(termNode === undefined) { + // allocate new Trie node const termMinimum = termRanges.minimum; const termMaximum = termRanges.maximum; termNode = node[charIndex] = new Array(termMaximum - termMinimum + 2); @@ -336,7 +392,8 @@ Wade.index = function(data) { node = termNode; } - + + // now process the last character, inserting the Term's data into the node const lastChar = processedTerm[processedTermLength]; const lastCharIndex = lastChar + node[0][0] let lastTermNode = node[lastCharIndex]; @@ -369,6 +426,8 @@ Wade.index = function(data) { } } + // currently the node-values store the sum of 1/numberOfTermsInDocument at [1] + // update the weight with number of documents via 1.5 - (sum / numberOfDocuments) for(let i = 0; i < nodeDataSets.length; i++) { let nodeData = nodeDataSets[i]; nodeData[1] = 1.5 - (nodeData[1] / dataLength);