From 34f1cfc84bb1455012201e8f9e348dcd4e3f36fa Mon Sep 17 00:00:00 2001 From: nicolaichuk Date: Thu, 14 Sep 2017 21:45:08 +0300 Subject: [PATCH 1/7] fix 128 bit SimHash add 256 and 512 bit SimHash --- .../SimHash/Comparator/GaussianComparator.php | 24 +++++++- lib/Tga/SimHash/Fingerprint.php | 29 ++-------- lib/Tga/SimHash/SimHash.php | 6 +- .../SimHash/Tokenizer/String128Tokenizer.php | 9 ++- .../SimHash/Tokenizer/String256Tokenizer.php | 58 +++++++++++++++++++ .../SimHash/Tokenizer/String512Tokenizer.php | 58 +++++++++++++++++++ 6 files changed, 157 insertions(+), 27 deletions(-) create mode 100644 lib/Tga/SimHash/Tokenizer/String256Tokenizer.php create mode 100644 lib/Tga/SimHash/Tokenizer/String512Tokenizer.php diff --git a/lib/Tga/SimHash/Comparator/GaussianComparator.php b/lib/Tga/SimHash/Comparator/GaussianComparator.php index baa8dad..da1fe4c 100644 --- a/lib/Tga/SimHash/Comparator/GaussianComparator.php +++ b/lib/Tga/SimHash/Comparator/GaussianComparator.php @@ -35,6 +35,28 @@ public function __construct($deviation = 4) $this->deviation = $deviation; } + /** + * Count Differences beetwin fingerprints. + * + * @param int $size + * @param Fingerprint $fp1 + * @param Fingerprint $fp2 + * @return int + */ + private static function countDifferences(int $size, Fingerprint $fp1, Fingerprint $fp2) { + $val1 = $fp1->getBinary(); + $val2 = $fp2->getBinary(); + + $ret = 0; + for ($i = 0; $i < $size; $i++) { + if ($val1[$i] != $val2[$i]) { + $ret++; + } + } + + return $ret; + } + /** * Compare the two fingerprints and return a similarity index between 0 and 1. * @@ -51,7 +73,7 @@ public function compare(Fingerprint $fp1, Fingerprint $fp2) )); } - $countDifferences = substr_count(decbin($fp1->getDecimal() ^ $fp2->getDecimal()), '1'); + $countDifferences = GaussianComparator::countDifferences($fp1->getSize(), $fp1, $fp2); return $this->computeSimilarityIndex($countDifferences); } diff --git a/lib/Tga/SimHash/Fingerprint.php b/lib/Tga/SimHash/Fingerprint.php index 8df9c20..a1f796b 100644 --- a/lib/Tga/SimHash/Fingerprint.php +++ b/lib/Tga/SimHash/Fingerprint.php @@ -24,19 +24,19 @@ class Fingerprint /** * @var float */ - protected $decimalValue; + protected $value; /** * Constructor * * @param int $size - * @param float $decimalValue + * @param string $value */ - public function __construct($size, $decimalValue) + public function __construct($size, $value) { $this->size = $size; - $this->decimalValue = (float) $decimalValue; + $this->value = $value; } /** @@ -55,16 +55,6 @@ public function getSize() return $this->size; } - /** - * Get the decimal value - * - * @return float - */ - public function getDecimal() - { - return $this->decimalValue; - } - /** * Get the binary value as a string * @@ -72,16 +62,7 @@ public function getDecimal() */ public function getBinary() { - return str_pad(decbin($this->decimalValue), $this->size, '0', STR_PAD_LEFT); + return $this->value; } - /** - * Get the hexadecimal value as a string - * - * @return string - */ - public function getHexa() - { - return dechex($this->decimalValue); - } } \ No newline at end of file diff --git a/lib/Tga/SimHash/SimHash.php b/lib/Tga/SimHash/SimHash.php index 3406812..909e8a4 100644 --- a/lib/Tga/SimHash/SimHash.php +++ b/lib/Tga/SimHash/SimHash.php @@ -22,6 +22,8 @@ class SimHash const SIMHASH_32 = 32; const SIMHASH_64 = 64; const SIMHASH_128 = 128; + const SIMHASH_256 = 256; + const SIMHASH_512 = 512; /** * @var Tokenizer\TokenizerInterface[] @@ -42,6 +44,8 @@ public function __construct() $this->tokenizers = [ new Tokenizer\String64Tokenizer(), new Tokenizer\String128Tokenizer(), + new Tokenizer\String256Tokenizer(), + new Tokenizer\String512Tokenizer(), new Tokenizer\String32Tokenizer() ]; @@ -104,7 +108,7 @@ public function hash($elements, $size = self::SIMHASH_64) } } - return new Fingerprint($size, bindec(implode('', $fingerprint))); + return new Fingerprint($size, implode('', $fingerprint)); } /** diff --git a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php index f374ee4..28ea7bd 100644 --- a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php @@ -18,13 +18,20 @@ */ class String128Tokenizer implements TokenizerInterface { + + protected static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); + protected static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); + /** * @param string $element * @return string */ public function tokenize($element) { - return str_pad(base_convert(md5($element), 16, 2), 128, '0', STR_PAD_LEFT); + $hash = md5($element); + $hash = str_replace(self::$search, self::$replace, $hash); + $hash = str_pad($hash, 512, '0', STR_PAD_LEFT); + return $hash; } /** diff --git a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php new file mode 100644 index 0000000..17ba559 --- /dev/null +++ b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php @@ -0,0 +1,58 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Tga\SimHash\Tokenizer; + +/** + * Tokenizer for strings that generate 256 bit tokens. + * + * @author Titouan Galopin + */ +class String128Tokenizer implements TokenizerInterface +{ + + protected static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); + protected static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); + + /** + * @param string $element + * @return string + */ + public function tokenize($element) + { + $hash = hash('sha256', $element); + $hash = str_replace(self::$search, self::$replace, $hash); + $hash = str_pad($hash, 256, '0', STR_PAD_LEFT); + return $hash; + } + + /** + * Does this tokenizer supports the given element + * + * @param string $element + * @return boolean + */ + public function supportsElement($element) + { + return is_string($element); + } + + /** + * Does this tokenizer return tokens of the given size + * + * @param int $size + * @return boolean + */ + public function supportsSize($size) + { + return $size === 256; + } +} \ No newline at end of file diff --git a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php new file mode 100644 index 0000000..27a846d --- /dev/null +++ b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php @@ -0,0 +1,58 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Tga\SimHash\Tokenizer; + +/** + * Tokenizer for strings that generate 512 bit tokens. + * + * @author Titouan Galopin + */ +class String128Tokenizer implements TokenizerInterface +{ + + protected static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); + protected static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); + + /** + * @param string $element + * @return string + */ + public function tokenize($element) + { + $hash = hash('sha512', $element); + $hash = str_replace(self::$search, self::$replace, $hash); + $hash = str_pad($hash, 512, '0', STR_PAD_LEFT); + return $hash; + } + + /** + * Does this tokenizer supports the given element + * + * @param string $element + * @return boolean + */ + public function supportsElement($element) + { + return is_string($element); + } + + /** + * Does this tokenizer return tokens of the given size + * + * @param int $size + * @return boolean + */ + public function supportsSize($size) + { + return $size === 512; + } +} \ No newline at end of file From 3c7017757941e31e71c383748c8681adbbcdaec9 Mon Sep 17 00:00:00 2001 From: nicolaichuk Date: Thu, 14 Sep 2017 22:00:43 +0300 Subject: [PATCH 2/7] small bug fix --- lib/Tga/SimHash/Tokenizer/String256Tokenizer.php | 2 +- lib/Tga/SimHash/Tokenizer/String512Tokenizer.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php index 17ba559..7d82856 100644 --- a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php @@ -16,7 +16,7 @@ * * @author Titouan Galopin */ -class String128Tokenizer implements TokenizerInterface +class String256Tokenizer implements TokenizerInterface { protected static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); diff --git a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php index 27a846d..b4724cd 100644 --- a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php @@ -16,7 +16,7 @@ * * @author Titouan Galopin */ -class String128Tokenizer implements TokenizerInterface +class String512Tokenizer implements TokenizerInterface { protected static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); From 2938ad5776c7f3be60dba8bd113cba8074e0ee8b Mon Sep 17 00:00:00 2001 From: nicolaichuk Date: Tue, 19 Sep 2017 22:53:24 +0300 Subject: [PATCH 3/7] * change comment * add an ending line to this file --- lib/Tga/SimHash/Comparator/ComparatorInterface.php | 2 +- lib/Tga/SimHash/Comparator/GaussianComparator.php | 4 ++-- lib/Tga/SimHash/Extractor/ExtractorInterface.php | 2 +- lib/Tga/SimHash/Extractor/HtmlExtractor.php | 2 +- lib/Tga/SimHash/Extractor/SimpleTextExtractor.php | 2 +- lib/Tga/SimHash/Fingerprint.php | 2 +- lib/Tga/SimHash/SimHash.php | 2 +- lib/Tga/SimHash/Tokenizer/String128Tokenizer.php | 2 +- lib/Tga/SimHash/Tokenizer/String256Tokenizer.php | 2 +- lib/Tga/SimHash/Tokenizer/String32Tokenizer.php | 2 +- lib/Tga/SimHash/Tokenizer/String512Tokenizer.php | 2 +- lib/Tga/SimHash/Tokenizer/String64Tokenizer.php | 2 +- lib/Tga/SimHash/Tokenizer/TokenizerInterface.php | 2 +- lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php | 2 +- lib/Tga/SimHash/Vectorizer/VectorizerInterface.php | 2 +- 15 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lib/Tga/SimHash/Comparator/ComparatorInterface.php b/lib/Tga/SimHash/Comparator/ComparatorInterface.php index c7233ba..e699aa5 100644 --- a/lib/Tga/SimHash/Comparator/ComparatorInterface.php +++ b/lib/Tga/SimHash/Comparator/ComparatorInterface.php @@ -28,4 +28,4 @@ interface ComparatorInterface * @return float */ public function compare(Fingerprint $fp1, Fingerprint $fp2); -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Comparator/GaussianComparator.php b/lib/Tga/SimHash/Comparator/GaussianComparator.php index da1fe4c..239f4cd 100644 --- a/lib/Tga/SimHash/Comparator/GaussianComparator.php +++ b/lib/Tga/SimHash/Comparator/GaussianComparator.php @@ -36,7 +36,7 @@ public function __construct($deviation = 4) } /** - * Count Differences beetwin fingerprints. + * Count differences between fingerprints. * * @param int $size * @param Fingerprint $fp1 @@ -104,4 +104,4 @@ protected function gaussianDensity($x) return $y; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Extractor/ExtractorInterface.php b/lib/Tga/SimHash/Extractor/ExtractorInterface.php index 273e00b..60df639 100644 --- a/lib/Tga/SimHash/Extractor/ExtractorInterface.php +++ b/lib/Tga/SimHash/Extractor/ExtractorInterface.php @@ -26,4 +26,4 @@ interface ExtractorInterface * @return array */ public function extract($input); -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Extractor/HtmlExtractor.php b/lib/Tga/SimHash/Extractor/HtmlExtractor.php index 1360782..ddadb63 100644 --- a/lib/Tga/SimHash/Extractor/HtmlExtractor.php +++ b/lib/Tga/SimHash/Extractor/HtmlExtractor.php @@ -94,4 +94,4 @@ protected function parseBody(\DOMDocument $document) return $node->nodeValue; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php b/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php index e13a8f7..c24a7ab 100644 --- a/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php +++ b/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php @@ -34,4 +34,4 @@ public function extract($text) return explode('-', $slugifiedText); } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Fingerprint.php b/lib/Tga/SimHash/Fingerprint.php index a1f796b..db3dba2 100644 --- a/lib/Tga/SimHash/Fingerprint.php +++ b/lib/Tga/SimHash/Fingerprint.php @@ -65,4 +65,4 @@ public function getBinary() return $this->value; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/SimHash.php b/lib/Tga/SimHash/SimHash.php index 909e8a4..6f13e5e 100644 --- a/lib/Tga/SimHash/SimHash.php +++ b/lib/Tga/SimHash/SimHash.php @@ -179,4 +179,4 @@ protected function findTokenizer($element, $size) return $tokenizer; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php index 28ea7bd..55cb738 100644 --- a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php @@ -55,4 +55,4 @@ public function supportsSize($size) { return $size === 128; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php index 7d82856..9bd42a7 100644 --- a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php @@ -55,4 +55,4 @@ public function supportsSize($size) { return $size === 256; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php index 4824939..1b6a24d 100644 --- a/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php @@ -48,4 +48,4 @@ public function supportsSize($size) { return $size === 32; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php index b4724cd..26527d2 100644 --- a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php @@ -55,4 +55,4 @@ public function supportsSize($size) { return $size === 512; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php index fbd95eb..b25c9b3 100644 --- a/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php @@ -108,4 +108,4 @@ private function buildTable() return $crc64tab; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php b/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php index 68dc0e1..942c276 100644 --- a/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php +++ b/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php @@ -41,4 +41,4 @@ public function supportsElement($element); * @return boolean */ public function supportsSize($size); -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php b/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php index e55d86f..045f229 100644 --- a/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php +++ b/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php @@ -61,4 +61,4 @@ protected function createWeightTokens($tokens) return $weightTokens; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php b/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php index 9e83ad7..117b902 100644 --- a/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php +++ b/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php @@ -24,4 +24,4 @@ interface VectorizerInterface * @return array */ public function vectorize(array $tokens, $size); -} \ No newline at end of file +} From 776d0d782a2fb8c056991302efb3f0d4f4eaa9d8 Mon Sep 17 00:00:00 2001 From: nicolaichuk Date: Tue, 19 Sep 2017 23:18:10 +0300 Subject: [PATCH 4/7] change protected to private --- lib/Tga/SimHash/Tokenizer/String128Tokenizer.php | 4 ++-- lib/Tga/SimHash/Tokenizer/String256Tokenizer.php | 4 ++-- lib/Tga/SimHash/Tokenizer/String512Tokenizer.php | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php index 55cb738..c0d574d 100644 --- a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php @@ -19,8 +19,8 @@ class String128Tokenizer implements TokenizerInterface { - protected static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); - protected static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); + private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); + private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); /** * @param string $element diff --git a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php index 9bd42a7..b1426e6 100644 --- a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php @@ -19,8 +19,8 @@ class String256Tokenizer implements TokenizerInterface { - protected static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); - protected static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); + private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); + private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); /** * @param string $element diff --git a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php index 26527d2..5b6058c 100644 --- a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php @@ -19,8 +19,8 @@ class String512Tokenizer implements TokenizerInterface { - protected static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); - protected static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); + private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); + private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); /** * @param string $element From 896912d250cf6099ca5fd7938bc667f470348b61 Mon Sep 17 00:00:00 2001 From: nicolaichuk Date: Thu, 21 Sep 2017 16:55:08 +0300 Subject: [PATCH 5/7] fix by code review https://github.com/tgalopin/SimHashPhp/pull/5 --- lib/Tga/SimHash/Fingerprint.php | 4 ++-- lib/Tga/SimHash/Tokenizer/String128Tokenizer.php | 1 - lib/Tga/SimHash/Tokenizer/String256Tokenizer.php | 1 - lib/Tga/SimHash/Tokenizer/String512Tokenizer.php | 1 - 4 files changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/Tga/SimHash/Fingerprint.php b/lib/Tga/SimHash/Fingerprint.php index db3dba2..d53f241 100644 --- a/lib/Tga/SimHash/Fingerprint.php +++ b/lib/Tga/SimHash/Fingerprint.php @@ -22,9 +22,9 @@ class Fingerprint protected $size; /** - * @var float + * @var string */ - protected $value; + private $value; /** diff --git a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php index c0d574d..bcf2628 100644 --- a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php @@ -18,7 +18,6 @@ */ class String128Tokenizer implements TokenizerInterface { - private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); diff --git a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php index b1426e6..6acf316 100644 --- a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php @@ -18,7 +18,6 @@ */ class String256Tokenizer implements TokenizerInterface { - private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); diff --git a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php index 5b6058c..c495e07 100644 --- a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php @@ -18,7 +18,6 @@ */ class String512Tokenizer implements TokenizerInterface { - private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); From 284477d311a8be9cf192918217d2a4eed74904e2 Mon Sep 17 00:00:00 2001 From: nicolaichuk Date: Tue, 21 Aug 2018 18:10:32 +0300 Subject: [PATCH 6/7] support php 5.3+ --- lib/Tga/SimHash/Comparator/GaussianComparator.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Tga/SimHash/Comparator/GaussianComparator.php b/lib/Tga/SimHash/Comparator/GaussianComparator.php index 239f4cd..632b69f 100644 --- a/lib/Tga/SimHash/Comparator/GaussianComparator.php +++ b/lib/Tga/SimHash/Comparator/GaussianComparator.php @@ -43,7 +43,7 @@ public function __construct($deviation = 4) * @param Fingerprint $fp2 * @return int */ - private static function countDifferences(int $size, Fingerprint $fp1, Fingerprint $fp2) { + private static function countDifferences($size, Fingerprint $fp1, Fingerprint $fp2) { $val1 = $fp1->getBinary(); $val2 = $fp2->getBinary(); From b263fc75168a9e11a33ab19ba549ded234e9f8e4 Mon Sep 17 00:00:00 2001 From: nicolaichuk Date: Tue, 21 Aug 2018 18:17:03 +0300 Subject: [PATCH 7/7] fix function call by code review @bbalet https://github.com/tgalopin/SimHashPhp/pull/5#discussion_r211200121 --- lib/Tga/SimHash/Tokenizer/String128Tokenizer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php index bcf2628..28586b0 100644 --- a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php @@ -29,7 +29,7 @@ public function tokenize($element) { $hash = md5($element); $hash = str_replace(self::$search, self::$replace, $hash); - $hash = str_pad($hash, 512, '0', STR_PAD_LEFT); + $hash = str_pad($hash, 128, '0', STR_PAD_LEFT); return $hash; }