Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 67 additions & 8 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ let config = {
]
};

/// stringify is used to save the index
const stringify = function(arr) {
let output = '[';
let separator = '';
Expand Down Expand Up @@ -55,6 +56,7 @@ const stringify = function(arr) {
return output + ']';
}

/// opposite of stringify, used when calling the Wade function using a string (which was created via stringify())
const parse = function(str) {
let arr = [];
let stack = [arr];
Expand Down Expand Up @@ -115,6 +117,7 @@ const getTerms = function(entry) {
return terms;
}

// preprocess a string before it is split into terms
const processEntry = function(entry) {
if(entry.length === 0) {
return entry;
Expand All @@ -129,6 +132,16 @@ const processEntry = function(entry) {
}
}

// used during search to add the information found in data to the results
// parameters:
// results, an array where the results are added to in the following form: {index, score}
// resultIndexes, maps documentIDs to their index in the results-array
// increment, a value that describes how much weight the current search-term has on the overall score
// data, a node-value from the index, which is an Array of the following form:
// [termOffset, relevance, documentId+]
// termOffset is used by the Trie structure and irrelevant for this step
// relevance is the relevance of the term
// documentId+ are one or more document ids. the same documentIds may appear multiple times for one term, thus increasing the score of this document for the given term
const update = function(results, resultIndexes, increment, data) {
const relevance = data[1];
for(let i = 2; i < data.length; i++) {
Expand All @@ -148,6 +161,10 @@ const update = function(results, resultIndexes, increment, data) {
}

const Wade = function(data) {
// search returns an Array of Objects with the keys "index" and "score"
// the results are NOT sorted
// prior to presenting the results, they should be sorted by score
// "index" is the documentID that was found
const search = function(query) {
const index = search.index;
const processed = processEntry(query);
Expand All @@ -162,6 +179,7 @@ const Wade = function(data) {
const exactTermsLength = termsLength - 1;
const increment = 1 / termsLength;

// process all but last terms in an exact-match fashion
exactOuter: for(let i = 0; i < exactTermsLength; i++) {
const term = terms[i];
const termLength = term.length - 1;
Expand All @@ -184,6 +202,7 @@ const Wade = function(data) {
}
}

// process the last term in a fuzzy fashion to allow for prefix-search during typing
const lastTerm = terms[exactTermsLength];
const lastTermLength = lastTerm.length - 1;
let node = index;
Expand All @@ -198,7 +217,8 @@ const Wade = function(data) {

node = node[lastTermIndex];
}


// once the last term's node is found, add all its children to the results
if(node !== undefined) {
let nodes = [node];
for(let i = 0; i < nodes.length; i++) {
Expand Down Expand Up @@ -233,13 +253,33 @@ const Wade = function(data) {

Wade.index = function(data) {
let dataLength = 0;

// ranges is used to fill the Trie object
// the Trie is like a tree of Dictionaries, where each key is a character in a string
// Instead of Dictionaries, Wade stores the Trie in a hierarchy of Arrays
// in order to be able to access the Array using [0], [1], etc, each Node stores the offset of the first character
// Character "a" has codepoint 97, so if the offset is -96 (1-97) then the first stored Character can be "a"
// if the range goes from g-h then the node's Array can be of size 2, the offset can be -102 and both "g" and "h" can be accessed
// as not all nodes in the Trie store subsequent characters, the Arrays are still typically densely populated
// ranges is a hierarchy of objects that store the keys .minimum and .maximum
let ranges = {};

// processed is a long array that contains information for each term.
// it is NOT an array of tuples, instead each term is stored in 3 successive indices with index
// i = documentID,
// i+1 = number of terms in document,
// i+2 = Array of chars of term (as Array of Integers)
// the array is used only in an initial step to process all terms in all documents
let processed = [];

for(let i = 0; i < data.length; i++) {
// entry is a preprocessed version of the input document (i.e. normalized in some way)
const entry = processEntry(data[i]);

if(entry.length !== 0) {
// Terms are the input document split into terms.
// Terms are in order of appearance and added as many times as they appear.
// Adding terms multiple times is done to adjust the relevance of this term during search
const terms = getTerms(entry);
const termsLength = terms.length;

Expand Down Expand Up @@ -290,17 +330,18 @@ Wade.index = function(data) {
processedTerm.push(lowByte);
}

processed.push(i);
processed.push(termsLength);
processed.push(processedTerm);
processed.push(i); // document ID
processed.push(termsLength); // number of terms in document
processed.push(processedTerm); // term's characters (as array of numbers)
}
}

dataLength++;
}
// now all terms are extracted from all documents

const indexMinimum = ranges.minimum;
const indexMaximum = ranges.maximum;
const indexMinimum = ranges.minimum; // lowest character value
const indexMaximum = ranges.maximum; // highest character value
let indexSize = 1;
let indexOffset;

Expand All @@ -309,10 +350,23 @@ Wade.index = function(data) {
indexOffset = 1 - indexMinimum;
}

// nodeDataSets is an Array of all node-values in the Trie
// the array is used to perform a final update on the weights
// node-values have the following form: [termOffset, 1/numberOfTermsInDocument, documentId]
// if more than one document-Id is stored, the content changes to:
// [termOffset, sum(1/numberOfTermsInDocument), documentId1, documentId2, ...]
// the node values are stored at index 0 in the Trie's nodes
let nodeDataSets = [];
// index is a Trie, it's n items wide, depending on the range-width of the current character
// using precomputed termRanges allows for directly allocating the Arrays in the correct size
// if the term-ranges are fully used, the Array is not sparsely populated; it's indexed from 0 to n
// if the term-ranges are NOT fully used, the Array has some unpopulated slots
// each node looks like this: [ node-value, subNodes]
// the node-value stores the start-index of the current term-ranges so that the Array may be accessed from indices 0-n instead of n-m. As the Arrays may be sparsely filled, this isn't always the case, but it can be the case
let index = new Array(indexSize);
index[0] = [indexOffset];

// iterate in 3-item steps
for(let i = 0; i < processed.length; i += 3) {
const dataIndex = processed[i];
const termsLength = processed[i + 1];
Expand All @@ -321,13 +375,15 @@ Wade.index = function(data) {
let node = index;
let termRanges = ranges;

// iterate through Trie, possibly creating new Nodes, until second-to-last character
for(let j = 0; j < processedTermLength; j++) {
const char = processedTerm[j];
const charIndex = char + node[0][0];
let termNode = node[charIndex];
let termNode = node[charIndex]; // get existing sub-Trie-node or nil
termRanges = termRanges[char];

if(termNode === undefined) {
// allocate new Trie node
const termMinimum = termRanges.minimum;
const termMaximum = termRanges.maximum;
termNode = node[charIndex] = new Array(termMaximum - termMinimum + 2);
Expand All @@ -336,7 +392,8 @@ Wade.index = function(data) {

node = termNode;
}


// now process the last character, inserting the Term's data into the node
const lastChar = processedTerm[processedTermLength];
const lastCharIndex = lastChar + node[0][0]
let lastTermNode = node[lastCharIndex];
Expand Down Expand Up @@ -369,6 +426,8 @@ Wade.index = function(data) {
}
}

// currently the node-values store the sum of 1/numberOfTermsInDocument at [1]
// update the weight with number of documents via 1.5 - (sum / numberOfDocuments)
for(let i = 0; i < nodeDataSets.length; i++) {
let nodeData = nodeDataSets[i];
nodeData[1] = 1.5 - (nodeData[1] / dataLength);
Expand Down