diff --git a/tlsh.ts b/tlsh.ts index 53fe8e3..6fb4bee 100644 --- a/tlsh.ts +++ b/tlsh.ts @@ -2,7 +2,7 @@ import { bit_pair_diff_table, v_table } from "./tables.ts"; const LOG1_5 = 0.4054651; const LOG1_3 = 0.26236426; -const LOG1_1 = 0.095310180; +const LOG1_1 = 0.09531018; const SALT = new Uint8Array([2, 3, 5, 7, 11, 13]); const CODE_SIZE = 32; const WINDOW_LENGTH = 5; @@ -12,624 +12,753 @@ const MIN_INPUT_SIZE = 50; const HASH_LEN = 70; class ChunkState { - buckets = new Uint8Array(NUM_BUCKETS).fill(0); - chunk = new Uint8Array(WINDOW_LENGTH).fill(0); - store = new Uint8Array(2).fill(0); - chunk3 = new Uint8Array(3).fill(0); - checksum = 0; - fileSize = 0; - - constructor(buf: Uint8Array) { - this.chunk = Uint8Array.from(buf.slice(0, 5)).reverse(); - - this.fileSize = WINDOW_LENGTH - 1; - - // loop through the buffer and calculate the pearson hash for - // the contents of each window as it moves over the buffer. - for (let i = WINDOW_LENGTH; i < buf.length; i++) { - this.chunk3[0] = this.chunk[0]; - this.chunk3[1] = this.chunk[1]; - this.chunk3[2] = this.checksum; - - this.checksum = this.pearsonHash(0, this.chunk3[0], this.chunk3[1], this.chunk3[2]); - - this.chunk3[2] = this.chunk[2]; - this.buckets[this.pearsonHash(SALT[0], this.chunk3[0], this.chunk3[1], this.chunk3[2])]++; - - this.chunk3[2] = this.chunk[3]; - this.buckets[this.pearsonHash(SALT[1], this.chunk3[0], this.chunk3[1], this.chunk3[2])]++; - - this.chunk3[1] = this.chunk[2]; - this.buckets[this.pearsonHash(SALT[2], this.chunk3[0], this.chunk3[1], this.chunk3[2])]++; - - this.chunk3[2] = this.chunk[4]; - this.buckets[this.pearsonHash(SALT[3], this.chunk3[0], this.chunk3[1], this.chunk3[2])]++; - - this.chunk3[1] = this.chunk[1]; - this.buckets[this.pearsonHash(SALT[4], this.chunk3[0], this.chunk3[1], this.chunk3[2])]++; - - this.chunk3[1] = this.chunk[3]; - this.buckets[this.pearsonHash(SALT[5], this.chunk3[0], this.chunk3[1], this.chunk3[2])]++; - - const temp1 = this.chunk[0]; - const temp2 = this.chunk[1]; - const temp3 = this.chunk[2]; - const temp4 = this.chunk[3]; - this.chunk[1] = temp1; - this.chunk[2] = temp2; - this.chunk[3] = temp3; - this.chunk[4] = temp4; + buckets = new Uint8Array(NUM_BUCKETS).fill(0); + chunk = new Uint8Array(WINDOW_LENGTH).fill(0); + store = new Uint8Array(2).fill(0); + chunk3 = new Uint8Array(3).fill(0); + checksum = 0; + fileSize = 0; + + constructor(buf: Uint8Array) { + this.chunk = Uint8Array.from(buf.slice(0, 5)).reverse(); + + this.fileSize = WINDOW_LENGTH - 1; + + // loop through the buffer and calculate the pearson hash for + // the contents of each window as it moves over the buffer. + for (let i = WINDOW_LENGTH; i < buf.length; i++) { + this.chunk3[0] = this.chunk[0]; + this.chunk3[1] = this.chunk[1]; + this.chunk3[2] = this.checksum; + + this.checksum = this.pearsonHash( + 0, + this.chunk3[0], + this.chunk3[1], + this.chunk3[2] + ); + + this.chunk3[2] = this.chunk[2]; + this.buckets[ + this.pearsonHash( + SALT[0], + this.chunk3[0], + this.chunk3[1], + this.chunk3[2] + ) + ]++; + + this.chunk3[2] = this.chunk[3]; + this.buckets[ + this.pearsonHash( + SALT[1], + this.chunk3[0], + this.chunk3[1], + this.chunk3[2] + ) + ]++; + + this.chunk3[1] = this.chunk[2]; + this.buckets[ + this.pearsonHash( + SALT[2], + this.chunk3[0], + this.chunk3[1], + this.chunk3[2] + ) + ]++; + + this.chunk3[2] = this.chunk[4]; + this.buckets[ + this.pearsonHash( + SALT[3], + this.chunk3[0], + this.chunk3[1], + this.chunk3[2] + ) + ]++; + + this.chunk3[1] = this.chunk[1]; + this.buckets[ + this.pearsonHash( + SALT[4], + this.chunk3[0], + this.chunk3[1], + this.chunk3[2] + ) + ]++; + + this.chunk3[1] = this.chunk[3]; + this.buckets[ + this.pearsonHash( + SALT[5], + this.chunk3[0], + this.chunk3[1], + this.chunk3[2] + ) + ]++; + + const temp1 = this.chunk[0]; + const temp2 = this.chunk[1]; + const temp3 = this.chunk[2]; + const temp4 = this.chunk[3]; + this.chunk[1] = temp1; + this.chunk[2] = temp2; + this.chunk[3] = temp3; + this.chunk[4] = temp4; + + this.chunk[0] = buf[i]; + this.fileSize++; + } + } + + private pearsonHash( + salt: number, + c1: number, + c2: number, + c3: number + ): number { + let h = 0; + h = v_table[h ^ salt]; + h = v_table[h ^ c1]; + h = v_table[h ^ c2]; + h = v_table[h ^ c3]; + return h; + } +} - this.chunk[0] = buf[i]; - this.fileSize++; - } +function parseHexStringToArrayOfNumbers(hexString: string): Array { + const result = []; + const len = hexString.length; + + // Iterate two characters at a time + for (let i = 0; i < len - 1; i += 2) { + const charCode1 = hexString.charCodeAt(i); + const charCode2 = hexString.charCodeAt(i + 1); + + let val1; + let val2; + + // Convert first hex char to its decimal value + if (charCode1 >= 48 && charCode1 <= 57) { + // '0' through '9' + val1 = charCode1 - 48; + } else if (charCode1 >= 65 && charCode1 <= 70) { + // 'A' through 'F' (uppercase) + val1 = charCode1 - 55; + } else if (charCode1 >= 97 && charCode1 <= 102) { + // 'a' through 'f' (lowercase) + val1 = charCode1 - 87; + } else { + // For invalid characters, we can decide how to handle it. + // For performance, we might just assume valid input or assign 0. + // For robustness, you might skip the pair or throw an error. + // Here, we'll assign 0 and continue. + val1 = 0; } - private pearsonHash(salt: number, c1: number, c2: number, c3: number): number { - let h = 0; - h = v_table[h ^ salt]; - h = v_table[h ^ c1]; - h = v_table[h ^ c2]; - h = v_table[h ^ c3]; - return h; + // Convert second hex char to its decimal value + if (charCode2 >= 48 && charCode2 <= 57) { + // '0' through '9' + val2 = charCode2 - 48; + } else if (charCode2 >= 65 && charCode2 <= 70) { + // 'A' through 'F' (uppercase) + val2 = charCode2 - 55; + } else if (charCode2 >= 97 && charCode2 <= 102) { + // 'a' through 'f' (lowercase) + val2 = charCode2 - 87; + } else { + val2 = 0; } -} -export class Tlsh { - checksum = 0; - lValue = 0; - qRatio = 0; - q1Ratio = 0; - q2Ratio = 0; - code: Uint8Array = new Uint8Array(CODE_SIZE).fill(0); - complete = false; - - /** - * creates a new TLSH hash object with the provided bytes - */ - constructor(buf?: Uint8Array) { - if (buf == undefined) { - return; - } + // Combine the two values using bitwise operations + result.push((val1 << 4) | val2); + } - if (buf.length < MIN_INPUT_SIZE) { - return; - } + return result; +} - const state = new ChunkState(buf); +export class Tlsh { + checksum = 0; + lValue = 0; + qRatio = 0; + q1Ratio = 0; + q2Ratio = 0; + code: Uint8Array = new Uint8Array(CODE_SIZE).fill(0); + complete = false; + + /** + * creates a new TLSH hash object with the provided bytes + */ + constructor(buf?: Uint8Array) { + if (buf == undefined) { + return; + } - this.checksum = state.checksum; - this.lValue = this.lValueCalc(buf.length); + if (buf.length < MIN_INPUT_SIZE) { + return; + } - const tempBuckets = new Uint8Array(NUM_BUCKETS).fill(0); - for (let i = 0; i < NUM_BUCKETS; i++) { - tempBuckets[i] = state.buckets[i]; - } + const state = new ChunkState(buf); - const [q1, q2, q3] = this.quartilePoints(tempBuckets); - if (q3 == 0) { - return; - } + this.checksum = state.checksum; + this.lValue = this.lValueCalc(buf.length); - this.q1Ratio = (q1 * 100 / q3) % 16; - this.q2Ratio = (q2 * 100 / q3) % 16; - this.qRatio = ((this.q1Ratio & 0xf) << 4) | (this.q2Ratio & 0xf); - - for (let i = 0; i < CODE_SIZE; i++) { - let h = 0; - for (let j = 0; j < 4; j++) { - const k = state.buckets[4 * i + j]; - if (q3 < k) { - h += 3 << (j * 2); - } else if (q2 < k) { - h += 2 << (j * 2); - } else if (q1 < k) { - h += 1 << (j * 2); - } - } - - this.code[(CODE_SIZE - 1) - i] = h; - } + const tempBuckets = new Uint8Array(NUM_BUCKETS).fill(0); + for (let i = 0; i < NUM_BUCKETS; i++) { + tempBuckets[i] = state.buckets[i]; + } - this.complete = true; + const [q1, q2, q3] = this.quartilePoints(tempBuckets); + if (q3 == 0) { + return; } - /** - * Returns a new TLSH object given a valid TLSH hash value - * @param str - TLSH hash value - * @returns the Tlsh object representation - */ - static from(str: string): Tlsh { - if (str.length != HASH_LEN && str.length != HASH_LEN + 2) { - throw new Error("invalid TLSH hash value"); + this.q1Ratio = ((q1 * 100) / q3) % 16; + this.q2Ratio = ((q2 * 100) / q3) % 16; + this.qRatio = ((this.q1Ratio & 0xf) << 4) | (this.q2Ratio & 0xf); + + for (let i = 0; i < CODE_SIZE; i++) { + let h = 0; + for (let j = 0; j < 4; j++) { + const k = state.buckets[4 * i + j]; + if (q3 < k) { + h += 3 << (j * 2); + } else if (q2 < k) { + h += 2 << (j * 2); + } else if (q1 < k) { + h += 1 << (j * 2); } + } - let raw = str.toUpperCase(); - if (raw.length == HASH_LEN + 2) { - if (raw[0] == "T" && raw[1] == "2") { - raw = raw.substring(2); - } else { - throw new Error("invalid TLSH hash value, non-supported version or T-type"); - } - } + this.code[CODE_SIZE - 1 - i] = h; + } - const bytes: Array = (raw.match(/.{1,2}/g) || []).map((x) => { - return parseInt(x, 16); - }); + this.complete = true; + } + + /** + * Returns a new TLSH object given a valid TLSH hash value + * @param str - TLSH hash value + * @returns the Tlsh object representation + */ + static from(str: string): Tlsh { + if (str.length != HASH_LEN && str.length != HASH_LEN + 2) { + throw new Error("invalid TLSH hash value"); + } - if (bytes.length != HASH_LEN / 2) { - throw new Error("unable to extract bytes of hash correctly"); - } + if (str.length == HASH_LEN + 2) { + if ((str[0] === "T" || str[0] === "t") && str[1] === "2") { + str = str.substring(2); + } else { + throw new Error( + "invalid TLSH hash value, non-supported version or T-type" + ); + } + } - const checksum = Tlsh.swapByte(bytes[0]); - const lValue = Tlsh.swapByte(bytes[1]); - const qRatio = Tlsh.swapByte(bytes[2]); + const bytes = parseHexStringToArrayOfNumbers(str); - const q1Ratio = (qRatio >> 4) & 0xf; - const q2Ratio = qRatio & 0xf; + if (bytes.length != HASH_LEN / 2) { + throw new Error("unable to extract bytes of hash correctly"); + } + const checksum = Tlsh.swapByte(bytes[0]); + const lValue = Tlsh.swapByte(bytes[1]); + const qRatio = Tlsh.swapByte(bytes[2]); - const code = new Uint8Array(CODE_SIZE).fill(0); - for (let i = 3; i < bytes.length; i++) { - code[i - 3] = bytes[i]; - } + const q1Ratio = (qRatio >> 4) & 0xf; + const q2Ratio = qRatio & 0xf; - const tlsh = new Tlsh(); - tlsh.checksum = checksum; - tlsh.lValue = lValue; - tlsh.qRatio = qRatio; - tlsh.q1Ratio = q1Ratio; - tlsh.q2Ratio = q2Ratio; - tlsh.code = code; - tlsh.complete = true; + const code = new Uint8Array(CODE_SIZE).fill(0); + for (let i = 3; i < bytes.length; i++) { + code[i - 3] = bytes[i]; + } - return tlsh; + const tlsh = new Tlsh(); + tlsh.checksum = checksum; + tlsh.lValue = lValue; + tlsh.qRatio = qRatio; + tlsh.q1Ratio = q1Ratio; + tlsh.q2Ratio = q2Ratio; + tlsh.code = code; + tlsh.complete = true; + + return tlsh; + } + + /** + * diff takes in a second Tlsh object and calculates the difference or distance between the two Tlsh objects. + * @param b second Tlsh object + * @returns number, the distance between the two Tlsh objects + */ + diff(b: Tlsh): number { + let diff = 0; + + const q1Diff = this.mod_diff(this.q1Ratio, b.q1Ratio, 16); + if (q1Diff <= 1) { + diff += q1Diff; + } else { + diff += (q1Diff - 1) * 12; } - /** - * diff takes in a second Tlsh object and calculates the difference or distance between the two Tlsh objects. - * @param b second Tlsh object - * @returns number, the distance between the two Tlsh objects - */ - diff(b: Tlsh): number { - let diff = 0; - - const q1Diff = this.mod_diff(this.q1Ratio, b.q1Ratio, 16); - if (q1Diff <= 1) { - diff += q1Diff; - } else { - diff += (q1Diff - 1) * 12; - } + const q2Diff = this.mod_diff(this.q2Ratio, b.q2Ratio, 16); + if (q2Diff <= 1) { + diff += q2Diff; + } else { + diff += (q2Diff - 1) * 12; + } - const q2Diff = this.mod_diff(this.q2Ratio, b.q2Ratio, 16); - if (q2Diff <= 1) { - diff += q2Diff; - } else { - diff += (q2Diff - 1) * 12; - } + if (this.checksum != b.checksum) { + diff++; + } - if (this.checksum != b.checksum) { - diff++; - } + diff += this.digest_distance(this.code, b.code); - diff += this.digest_distance(this.code, b.code); + return diff; + } - return diff; + /** + * toString converts the Tlsh object into its string representation + * @returns string + */ + toString(): string { + let buf = ""; + if (!this.complete) { + return "TNULL"; } - /** - * toString converts the Tlsh object into its string representation - * @returns string - */ - toString(): string { - let buf = ""; - if (!this.complete) { - return "TNULL"; - } - - buf = buf.concat( - Tlsh.swapByte(this.checksum).toString(16).padStart(2, "0"), - ); - buf = buf.concat( - Tlsh.swapByte(this.lValue).toString(16).padStart(2, "0"), - ); - buf = buf.concat( - Tlsh.swapByte(this.qRatio).toString(16).padStart(2, "0"), - ); + buf = buf.concat( + Tlsh.swapByte(this.checksum).toString(16).padStart(2, "0") + ); + buf = buf.concat(Tlsh.swapByte(this.lValue).toString(16).padStart(2, "0")); + buf = buf.concat(Tlsh.swapByte(this.qRatio).toString(16).padStart(2, "0")); - for (let i = 0; i < CODE_SIZE; i++) { - buf = buf.concat(this.code[i].toString(16).padStart(2, "0")); - } - - return buf.toUpperCase(); + for (let i = 0; i < CODE_SIZE; i++) { + buf = buf.concat(this.code[i].toString(16).padStart(2, "0")); } - private mod_diff(x: number, y: number, R: number): number { - let dl = 0; - let dr = 0; + return buf.toUpperCase(); + } - if (y > x) { - dl = y - x; - dr = x + R - y; - } else { - dl = x - y; - dr = y + R - x; - } + private mod_diff(x: number, y: number, R: number): number { + let dl = 0; + let dr = 0; - if (dl > dr) { - return dr; - } else { - return dl; - } + if (y > x) { + dl = y - x; + dr = x + R - y; + } else { + dl = x - y; + dr = y + R - x; } - private digest_distance(x: Uint8Array, y: Uint8Array): number { - let diff = 0; - for (let i = 0; i < CODE_SIZE; i++) { - diff += bit_pair_diff_table[x[i]][y[i]]; - } - - return diff; + if (dl > dr) { + return dr; + } else { + return dl; } + } - static swapByte(input: number): number { - let out = 0; - out = ((input & 0xf0) >> 4) & 0x0f; - out |= ((input & 0x0f) << 4) & 0xf0; - return out; + private digest_distance(x: Uint8Array, y: Uint8Array): number { + let diff = 0; + for (let i = 0; i < CODE_SIZE; i++) { + diff += bit_pair_diff_table[x[i]][y[i]]; } - private lValueCalc(len: number): number { - let l = 0; - - if (len <= 656) { - l = Math.floor(Math.log(len) / LOG1_5); - } else if (len <= 3199) { - l = Math.floor(Math.log(len) / LOG1_3 - 8.71777); - } else { - l = Math.floor(Math.log(len) / LOG1_1 - 62.5472); - } - - return l; + return diff; + } + + static swapByte(input: number): number { + let out = 0; + out = ((input & 0xf0) >> 4) & 0x0f; + out |= ((input & 0x0f) << 4) & 0xf0; + return out; + } + + private lValueCalc(len: number): number { + let l = 0; + + if (len <= 656) { + l = Math.floor(Math.log(len) / LOG1_5); + } else if (len <= 3199) { + l = Math.floor(Math.log(len) / LOG1_3 - 8.71777); + } else { + l = Math.floor(Math.log(len) / LOG1_1 - 62.5472); } - private quartilePoints(buckets: Uint8Array): Uint8Array { - let spl = 0; - let spr = 0; - let q1 = 0; - let q2 = 0; - let q3 = 0; - - const p1 = (EFF_BUCKETS / 4) - 1; - const p2 = (EFF_BUCKETS / 2) - 1; - const p3 = EFF_BUCKETS - (EFF_BUCKETS / 4) - 1; - const end = EFF_BUCKETS - 1; - const bucketsCopy = Uint8Array.from(buckets); - - const shortCutLeft = new Uint8Array(EFF_BUCKETS).fill(0); - const shortCutRight = new Uint8Array(EFF_BUCKETS).fill(0); - - let l = 0; - let r = 0; - - for (l = 0, r = end;;) { - const ret = this.partition(bucketsCopy, l, r); - - if (ret > p2) { - r = ret - 1; - shortCutRight[spr] = ret; - spr++; - } else if (ret < p2) { - l = ret + 1; - shortCutLeft[spl] = ret; - spl++; - } else { - q2 = bucketsCopy[p2]; - break; - } - } + return l; + } + + private quartilePoints(buckets: Uint8Array): Uint8Array { + let spl = 0; + let spr = 0; + let q1 = 0; + let q2 = 0; + let q3 = 0; + + const p1 = EFF_BUCKETS / 4 - 1; + const p2 = EFF_BUCKETS / 2 - 1; + const p3 = EFF_BUCKETS - EFF_BUCKETS / 4 - 1; + const end = EFF_BUCKETS - 1; + const bucketsCopy = Uint8Array.from(buckets); + + const shortCutLeft = new Uint8Array(EFF_BUCKETS).fill(0); + const shortCutRight = new Uint8Array(EFF_BUCKETS).fill(0); + + let l = 0; + let r = 0; + + for (l = 0, r = end; ; ) { + const ret = this.partition(bucketsCopy, l, r); + + if (ret > p2) { + r = ret - 1; + shortCutRight[spr] = ret; + spr++; + } else if (ret < p2) { + l = ret + 1; + shortCutLeft[spl] = ret; + spl++; + } else { + q2 = bucketsCopy[p2]; + break; + } + } - shortCutLeft[spl] = p2 - 1; - shortCutRight[spr] = p2 + 1; - - for (let i = 0, l = 0; i <= spl; i++) { - r = shortCutLeft[i]; - if (r > p1) { - while (true) { - const ret = this.partition(bucketsCopy, l, r); - if (ret > p1) { - r = ret - 1; - } else if (ret < p1) { - l = ret + 1; - } else { - q1 = bucketsCopy[p1]; - break; - } - } - break; - } else if (r < p1) { - l = r; - } else { - q1 = bucketsCopy[p1]; - break; - } + shortCutLeft[spl] = p2 - 1; + shortCutRight[spr] = p2 + 1; + + for (let i = 0, l = 0; i <= spl; i++) { + r = shortCutLeft[i]; + if (r > p1) { + while (true) { + const ret = this.partition(bucketsCopy, l, r); + if (ret > p1) { + r = ret - 1; + } else if (ret < p1) { + l = ret + 1; + } else { + q1 = bucketsCopy[p1]; + break; + } } + break; + } else if (r < p1) { + l = r; + } else { + q1 = bucketsCopy[p1]; + break; + } + } - for (let i = 0, r = end; i <= spr; i++) { - l = shortCutRight[i]; - if (l < p3) { - while (true) { - const ret = this.partition(bucketsCopy, l, r); - if (ret > p3) { - r = ret - 1; - } else if (ret < p3) { - l = ret + 1; - } else { - q3 = bucketsCopy[p3]; - break; - } - } - break; - } else if (l > p3) { - r = l; - } else { - q3 = bucketsCopy[p3]; - break; - } + for (let i = 0, r = end; i <= spr; i++) { + l = shortCutRight[i]; + if (l < p3) { + while (true) { + const ret = this.partition(bucketsCopy, l, r); + if (ret > p3) { + r = ret - 1; + } else if (ret < p3) { + l = ret + 1; + } else { + q3 = bucketsCopy[p3]; + break; + } } - return new Uint8Array([q1, q2, q3]); + break; + } else if (l > p3) { + r = l; + } else { + q3 = bucketsCopy[p3]; + break; + } } + return new Uint8Array([q1, q2, q3]); + } - private partition(buf: Uint8Array, left: number, right: number): number { - if (left == right) { - return left; - } + private partition(buf: Uint8Array, left: number, right: number): number { + if (left == right) { + return left; + } - if (left + 1 == right) { - if (buf[left] > buf[right]) { - const temp = buf[right]; - buf[right] = buf[left]; - buf[left] = temp; - } + if (left + 1 == right) { + if (buf[left] > buf[right]) { + const temp = buf[right]; + buf[right] = buf[left]; + buf[left] = temp; + } - return left; - } + return left; + } - let ret = left; - const pivot = (left + right) >> 1; - const val = buf[pivot]; + let ret = left; + const pivot = (left + right) >> 1; + const val = buf[pivot]; - buf[pivot] = buf[right]; - buf[right] = val; + buf[pivot] = buf[right]; + buf[right] = val; - for (let i = left; i < right; i++) { - if (buf[i] < val) { - const temp = buf[i]; - buf[i] = buf[ret]; - buf[ret] = temp; - ret++; - } - } + for (let i = left; i < right; i++) { + if (buf[i] < val) { + const temp = buf[i]; + buf[i] = buf[ret]; + buf[ret] = temp; + ret++; + } + } - buf[right] = buf[ret]; - buf[ret] = val; + buf[right] = buf[ret]; + buf[ret] = val; - return ret; - } + return ret; + } } export class TreeNode { - left: TreeNode | undefined; - right: TreeNode | undefined; - splitPoint: number = 0; - splitKey: Tlsh | undefined; - isLeaf: boolean = false; - items: Array = []; - - constructor( - left: TreeNode | undefined, - right: TreeNode | undefined, - splitPoint: number, - splitKey: Tlsh | undefined, - isLeaf: boolean, - items: Array, - ) { - this.left = left; - this.right = right; - this.splitPoint = splitPoint; - this.splitKey = splitKey; - this.isLeaf = isLeaf; - this.items = items; - } + left: TreeNode | undefined; + right: TreeNode | undefined; + splitPoint: number = 0; + splitKey: Tlsh | undefined; + isLeaf: boolean = false; + items: Array = []; + + constructor( + left: TreeNode | undefined, + right: TreeNode | undefined, + splitPoint: number, + splitKey: Tlsh | undefined, + isLeaf: boolean, + items: Array + ) { + this.left = left; + this.right = right; + this.splitPoint = splitPoint; + this.splitKey = splitKey; + this.isLeaf = isLeaf; + this.items = items; + } } type SplitResult = { - left: Array; - right: Array; - splitKey: Tlsh; - splitPoint: number; + left: Array; + right: Array; + splitKey: Tlsh; + splitPoint: number; }; export class TlshTree { - public size = 0; - public numLeafs = 0; - leafSize = 10; - node: TreeNode; - - constructor(hashes: Array, leafSize: number = 10) { - const dedup: Map = new Map(); - for (let i = 0; i < hashes.length; i++) { - if (hashes[i].length != HASH_LEN && hashes[i].length != HASH_LEN + 2) { - continue; - } - - if (dedup.has(hashes[i])) { - continue; - } else { - dedup.set(hashes[i], true); - } - } - - const tlshList = Array.from(dedup.keys()).map((x) => Tlsh.from(x)); - this.size = tlshList.length; - this.leafSize = leafSize; - this.node = this.build(tlshList, leafSize); + public size = 0; + public numLeafs = 0; + leafSize = 10; + node: TreeNode; + + constructor(hashes: Array, leafSize: number = 10) { + const dedup: Map = new Map(); + for (let i = 0; i < hashes.length; i++) { + if (hashes[i].length != HASH_LEN && hashes[i].length != HASH_LEN + 2) { + continue; + } + + if (dedup.has(hashes[i])) { + continue; + } else { + dedup.set(hashes[i], true); + } } - /** - * dump will create a pretty string representation of the entire tree - * @returns string - */ - dump(): string { - let out = ""; - const _dump = (space: string, node: TreeNode) => { - if (node.isLeaf) { - for (let i = 0; i < node.items.length; i++) { - out = out.concat(space + node.items[i].toString() + "\n"); - } - } else { - out = out.concat(space + "left\n"); - if (node.left == undefined) { - throw new Error("unexpected empty left node"); - } - _dump(space + ". ", node.left); - - out = out.concat(space + "right\n"); - if (node.right == undefined) { - throw new Error("unexpected empty right node"); - } - _dump(space + ". ", node.right); - } - }; - - _dump("", this.node); - - return out; - } - - /** - * isPresent will say if a hash is present in the tree within a - * certain distance. This will be slightly faster than - * searchTraverse because it stops once it finds one match. - * @param tlsh is the value to find nearest neighbors for - * @param distance is the threshold for what is a close neighbor - * @returns a boolean, true if the hash is present within the - * given distance - */ - isPresent(hash: string, distance: number): boolean { - const _traverse = (tlsh: Tlsh, distance: number, node: TreeNode): boolean => { - if (node.isLeaf) { - for (let i = 0; i < node.items.length; i++) { - if (tlsh.diff(node.items[i]) <= distance) { - return true; - } - } - - return false; - } - - if (node.splitKey == undefined || node.left == undefined || node.right == undefined) { - return false; - } - - if (tlsh.diff(node.splitKey) < node.splitPoint) { - return _traverse(tlsh, distance, node.left); - } else { - return _traverse(tlsh, distance, node.right); - } - }; - - const tlsh = Tlsh.from(hash); - return _traverse(tlsh, distance, this.node); - } + const tlshList = Array.from(dedup.keys()).map((x) => Tlsh.from(x)); + this.size = tlshList.length; + this.leafSize = leafSize; + this.node = this.build(tlshList, leafSize); + } + + /** + * dump will create a pretty string representation of the entire tree + * @returns string + */ + dump(): string { + let out = ""; + const _dump = (space: string, node: TreeNode) => { + if (node.isLeaf) { + for (let i = 0; i < node.items.length; i++) { + out = out.concat(space + node.items[i].toString() + "\n"); + } + } else { + out = out.concat(space + "left\n"); + if (node.left == undefined) { + throw new Error("unexpected empty left node"); + } + _dump(space + ". ", node.left); - /** - * search will find the nearest `distance` hashes from - * the input hash. The returned array is also sorted with the - * closest matches first. - * @param node is the current node in the tree - * @param tlsh is the value to find nearest neighbors for - * @param distance is the threshold for what is a close neighbor - * @returns a sorted list of hashes. - */ - search(hash: string, distance: number): Array { - const _search = (node: TreeNode, tlsh: Tlsh, distance: number): string[] => { - if (node.isLeaf) { - const out: [string, number][] = []; - for (let i = 0; i < node.items.length; i++) { - const diff = tlsh.diff(node.items[i]); - if (diff <= distance) { - out.push([node.items[i].toString(), diff]); - } - } - - out.sort((a, b) => a[1] - b[1]); - return out.map((xs) => xs[0]); - } - - if (node.splitKey == undefined || node.left == undefined || node.right == undefined) { - throw new Error("unexpected error traversing the tree, splitKey, left, or right is undefiend"); - } - - if (tlsh.diff(node.splitKey) < node.splitPoint) { - return _search(node.left, tlsh, distance); - } else { - return _search(node.right, tlsh, distance); - } - }; - - const tlsh = Tlsh.from(hash); - return _search(this.node, tlsh, distance); - } + out = out.concat(space + "right\n"); + if (node.right == undefined) { + throw new Error("unexpected empty right node"); + } + _dump(space + ". ", node.right); + } + }; + + _dump("", this.node); + + return out; + } + + /** + * isPresent will say if a hash is present in the tree within a + * certain distance. This will be slightly faster than + * searchTraverse because it stops once it finds one match. + * @param tlsh is the value to find nearest neighbors for + * @param distance is the threshold for what is a close neighbor + * @returns a boolean, true if the hash is present within the + * given distance + */ + isPresent(hash: string, distance: number): boolean { + const _traverse = ( + tlsh: Tlsh, + distance: number, + node: TreeNode + ): boolean => { + if (node.isLeaf) { + for (let i = 0; i < node.items.length; i++) { + if (tlsh.diff(node.items[i]) <= distance) { + return true; + } + } - /** - * build is the entry point for building the Tlsh tree - */ - private build(tlshList: Array, leafSize: number): TreeNode { - const splitResult = this.splitNodes(tlshList, leafSize); - if (splitResult == undefined) { - this.numLeafs++; - return new TreeNode(undefined, undefined, 0, undefined, true, tlshList); + return false; + } + + if ( + node.splitKey == undefined || + node.left == undefined || + node.right == undefined + ) { + return false; + } + + if (tlsh.diff(node.splitKey) < node.splitPoint) { + return _traverse(tlsh, distance, node.left); + } else { + return _traverse(tlsh, distance, node.right); + } + }; + + const tlsh = Tlsh.from(hash); + return _traverse(tlsh, distance, this.node); + } + + /** + * search will find the nearest `distance` hashes from + * the input hash. The returned array is also sorted with the + * closest matches first. + * @param node is the current node in the tree + * @param tlsh is the value to find nearest neighbors for + * @param distance is the threshold for what is a close neighbor + * @returns a sorted list of hashes. + */ + search(hash: string, distance: number): Array { + const _search = ( + node: TreeNode, + tlsh: Tlsh, + distance: number + ): string[] => { + if (node.isLeaf) { + const out: [string, number][] = []; + for (let i = 0; i < node.items.length; i++) { + const diff = tlsh.diff(node.items[i]); + if (diff <= distance) { + out.push([node.items[i].toString(), diff]); + } } - const left = this.build(splitResult.left, leafSize); - const right = this.build(splitResult.right, leafSize); + out.sort((a, b) => a[1] - b[1]); + return out.map((xs) => xs[0]); + } + + if ( + node.splitKey == undefined || + node.left == undefined || + node.right == undefined + ) { + throw new Error( + "unexpected error traversing the tree, splitKey, left, or right is undefiend" + ); + } + + if (tlsh.diff(node.splitKey) < node.splitPoint) { + return _search(node.left, tlsh, distance); + } else { + return _search(node.right, tlsh, distance); + } + }; + + const tlsh = Tlsh.from(hash); + return _search(this.node, tlsh, distance); + } + + /** + * build is the entry point for building the Tlsh tree + */ + private build(tlshList: Array, leafSize: number): TreeNode { + const splitResult = this.splitNodes(tlshList, leafSize); + if (splitResult == undefined) { + this.numLeafs++; + return new TreeNode(undefined, undefined, 0, undefined, true, tlshList); + } - return new TreeNode(left, right, splitResult.splitPoint, splitResult.splitKey, false, []); + const left = this.build(splitResult.left, leafSize); + const right = this.build(splitResult.right, leafSize); + + return new TreeNode( + left, + right, + splitResult.splitPoint, + splitResult.splitKey, + false, + [] + ); + } + + /** splitNodes will take an array of Tlsh and find an optimial split point such that each partition is at least + * `minSplitSize`. + */ + private splitNodes( + tlshList: Array, + leafSize: number + ): SplitResult | undefined { + if (tlshList.length <= leafSize) { + return undefined; } - /** splitNodes will take an array of Tlsh and find an optimial split point such that each partition is at least - * `minSplitSize`. - */ - private splitNodes(tlshList: Array, leafSize: number): SplitResult | undefined { - if (tlshList.length <= leafSize) { - return undefined; - } + let splitPoint = 10; + const jumpSize = 10; + const minSplitSize = tlshList.length * 0.3; - let splitPoint = 5; - const jumpSize = 5; - const minSplitSize = tlshList.length * 0.3; - - for (let i = 0; i < tlshList.length; i++) { - const splitKey = tlshList[i]; - const left = []; - const right = []; - - for (let j = 0; j < tlshList.length; j++) { - const diff = splitKey.diff(tlshList[j]); - if (diff <= splitPoint) { - left.push(tlshList[j]); - } else { - right.push(tlshList[j]); - } - } - - if (left.length > minSplitSize && right.length > minSplitSize) { - return { left, right, splitKey, splitPoint }; - } - - splitPoint += jumpSize; + for (let i = 0; i < tlshList.length; i++) { + const splitKey = tlshList[i]; + const left = []; + const right = []; + + for (let j = 0; j < tlshList.length; j++) { + const diff = splitKey.diff(tlshList[j]); + if (diff <= splitPoint) { + left.push(tlshList[j]); + } else { + right.push(tlshList[j]); } + } + if (splitPoint > 255) { return undefined; + } + + if (left.length > minSplitSize && right.length > minSplitSize) { + return { left, right, splitKey, splitPoint }; + } + + splitPoint += jumpSize; } + + return undefined; + } } diff --git a/tlsh_bench.ts b/tlsh_bench.ts new file mode 100644 index 0000000..b5319cb --- /dev/null +++ b/tlsh_bench.ts @@ -0,0 +1,30 @@ +import { TlshTree } from "./tlsh.ts"; + +function generateRandomTLSH(): string { + const hexChars = "0123456789ABCDEF"; + + const getRandomHex = (length: number): string => { + return Array.from( + { length }, + () => hexChars[Math.floor(Math.random() * 16)] + ).join(""); + }; + + const checksum = getRandomHex(6); // Checksum prefix + const lengthVal = getRandomHex(3); // Length + const qRatio = getRandomHex(3); // Q-ratio + const body = getRandomHex(58); // Main hash body + + return checksum + lengthVal + qRatio + body; +} + +function generateTLSHHashes(count: number = 10): string[] { + return Array.from({ length: count }, () => generateRandomTLSH()); +} + +const hashes = generateTLSHHashes(100000); +console.log(`Generated ${hashes.length} TLSH hashes.`); + +Deno.bench("benchmark tree creation with 100k hashes", () => { + const _tree = new TlshTree(hashes, 100); +});