diff --git a/lib/Tga/SimHash/Comparator/ComparatorInterface.php b/lib/Tga/SimHash/Comparator/ComparatorInterface.php index c7233ba..e699aa5 100644 --- a/lib/Tga/SimHash/Comparator/ComparatorInterface.php +++ b/lib/Tga/SimHash/Comparator/ComparatorInterface.php @@ -28,4 +28,4 @@ interface ComparatorInterface * @return float */ public function compare(Fingerprint $fp1, Fingerprint $fp2); -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Comparator/GaussianComparator.php b/lib/Tga/SimHash/Comparator/GaussianComparator.php index baa8dad..632b69f 100644 --- a/lib/Tga/SimHash/Comparator/GaussianComparator.php +++ b/lib/Tga/SimHash/Comparator/GaussianComparator.php @@ -35,6 +35,28 @@ public function __construct($deviation = 4) $this->deviation = $deviation; } + /** + * Count differences between fingerprints. + * + * @param int $size + * @param Fingerprint $fp1 + * @param Fingerprint $fp2 + * @return int + */ + private static function countDifferences($size, Fingerprint $fp1, Fingerprint $fp2) { + $val1 = $fp1->getBinary(); + $val2 = $fp2->getBinary(); + + $ret = 0; + for ($i = 0; $i < $size; $i++) { + if ($val1[$i] != $val2[$i]) { + $ret++; + } + } + + return $ret; + } + /** * Compare the two fingerprints and return a similarity index between 0 and 1. * @@ -51,7 +73,7 @@ public function compare(Fingerprint $fp1, Fingerprint $fp2) )); } - $countDifferences = substr_count(decbin($fp1->getDecimal() ^ $fp2->getDecimal()), '1'); + $countDifferences = GaussianComparator::countDifferences($fp1->getSize(), $fp1, $fp2); return $this->computeSimilarityIndex($countDifferences); } @@ -82,4 +104,4 @@ protected function gaussianDensity($x) return $y; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Extractor/ExtractorInterface.php b/lib/Tga/SimHash/Extractor/ExtractorInterface.php index 273e00b..60df639 100644 --- a/lib/Tga/SimHash/Extractor/ExtractorInterface.php +++ b/lib/Tga/SimHash/Extractor/ExtractorInterface.php @@ -26,4 +26,4 @@ interface ExtractorInterface * @return array */ public function extract($input); -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Extractor/HtmlExtractor.php b/lib/Tga/SimHash/Extractor/HtmlExtractor.php index 1360782..ddadb63 100644 --- a/lib/Tga/SimHash/Extractor/HtmlExtractor.php +++ b/lib/Tga/SimHash/Extractor/HtmlExtractor.php @@ -94,4 +94,4 @@ protected function parseBody(\DOMDocument $document) return $node->nodeValue; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php b/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php index e13a8f7..c24a7ab 100644 --- a/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php +++ b/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php @@ -34,4 +34,4 @@ public function extract($text) return explode('-', $slugifiedText); } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Fingerprint.php b/lib/Tga/SimHash/Fingerprint.php index 8df9c20..d53f241 100644 --- a/lib/Tga/SimHash/Fingerprint.php +++ b/lib/Tga/SimHash/Fingerprint.php @@ -22,21 +22,21 @@ class Fingerprint protected $size; /** - * @var float + * @var string */ - protected $decimalValue; + private $value; /** * Constructor * * @param int $size - * @param float $decimalValue + * @param string $value */ - public function __construct($size, $decimalValue) + public function __construct($size, $value) { $this->size = $size; - $this->decimalValue = (float) $decimalValue; + $this->value = $value; } /** @@ -55,16 +55,6 @@ public function getSize() return $this->size; } - /** - * Get the decimal value - * - * @return float - */ - public function getDecimal() - { - return $this->decimalValue; - } - /** * Get the binary value as a string * @@ -72,16 +62,7 @@ public function getDecimal() */ public function getBinary() { - return str_pad(decbin($this->decimalValue), $this->size, '0', STR_PAD_LEFT); + return $this->value; } - /** - * Get the hexadecimal value as a string - * - * @return string - */ - public function getHexa() - { - return dechex($this->decimalValue); - } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/SimHash.php b/lib/Tga/SimHash/SimHash.php index 3406812..6f13e5e 100644 --- a/lib/Tga/SimHash/SimHash.php +++ b/lib/Tga/SimHash/SimHash.php @@ -22,6 +22,8 @@ class SimHash const SIMHASH_32 = 32; const SIMHASH_64 = 64; const SIMHASH_128 = 128; + const SIMHASH_256 = 256; + const SIMHASH_512 = 512; /** * @var Tokenizer\TokenizerInterface[] @@ -42,6 +44,8 @@ public function __construct() $this->tokenizers = [ new Tokenizer\String64Tokenizer(), new Tokenizer\String128Tokenizer(), + new Tokenizer\String256Tokenizer(), + new Tokenizer\String512Tokenizer(), new Tokenizer\String32Tokenizer() ]; @@ -104,7 +108,7 @@ public function hash($elements, $size = self::SIMHASH_64) } } - return new Fingerprint($size, bindec(implode('', $fingerprint))); + return new Fingerprint($size, implode('', $fingerprint)); } /** @@ -175,4 +179,4 @@ protected function findTokenizer($element, $size) return $tokenizer; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php index f374ee4..28586b0 100644 --- a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php @@ -18,13 +18,19 @@ */ class String128Tokenizer implements TokenizerInterface { + private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); + private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); + /** * @param string $element * @return string */ public function tokenize($element) { - return str_pad(base_convert(md5($element), 16, 2), 128, '0', STR_PAD_LEFT); + $hash = md5($element); + $hash = str_replace(self::$search, self::$replace, $hash); + $hash = str_pad($hash, 128, '0', STR_PAD_LEFT); + return $hash; } /** @@ -48,4 +54,4 @@ public function supportsSize($size) { return $size === 128; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php new file mode 100644 index 0000000..6acf316 --- /dev/null +++ b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php @@ -0,0 +1,57 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Tga\SimHash\Tokenizer; + +/** + * Tokenizer for strings that generate 256 bit tokens. + * + * @author Titouan Galopin + */ +class String256Tokenizer implements TokenizerInterface +{ + private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); + private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); + + /** + * @param string $element + * @return string + */ + public function tokenize($element) + { + $hash = hash('sha256', $element); + $hash = str_replace(self::$search, self::$replace, $hash); + $hash = str_pad($hash, 256, '0', STR_PAD_LEFT); + return $hash; + } + + /** + * Does this tokenizer supports the given element + * + * @param string $element + * @return boolean + */ + public function supportsElement($element) + { + return is_string($element); + } + + /** + * Does this tokenizer return tokens of the given size + * + * @param int $size + * @return boolean + */ + public function supportsSize($size) + { + return $size === 256; + } +} diff --git a/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php index 4824939..1b6a24d 100644 --- a/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php @@ -48,4 +48,4 @@ public function supportsSize($size) { return $size === 32; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php new file mode 100644 index 0000000..c495e07 --- /dev/null +++ b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php @@ -0,0 +1,57 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Tga\SimHash\Tokenizer; + +/** + * Tokenizer for strings that generate 512 bit tokens. + * + * @author Titouan Galopin + */ +class String512Tokenizer implements TokenizerInterface +{ + private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); + private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111'); + + /** + * @param string $element + * @return string + */ + public function tokenize($element) + { + $hash = hash('sha512', $element); + $hash = str_replace(self::$search, self::$replace, $hash); + $hash = str_pad($hash, 512, '0', STR_PAD_LEFT); + return $hash; + } + + /** + * Does this tokenizer supports the given element + * + * @param string $element + * @return boolean + */ + public function supportsElement($element) + { + return is_string($element); + } + + /** + * Does this tokenizer return tokens of the given size + * + * @param int $size + * @return boolean + */ + public function supportsSize($size) + { + return $size === 512; + } +} diff --git a/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php index fbd95eb..b25c9b3 100644 --- a/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php +++ b/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php @@ -108,4 +108,4 @@ private function buildTable() return $crc64tab; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php b/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php index 68dc0e1..942c276 100644 --- a/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php +++ b/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php @@ -41,4 +41,4 @@ public function supportsElement($element); * @return boolean */ public function supportsSize($size); -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php b/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php index e55d86f..045f229 100644 --- a/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php +++ b/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php @@ -61,4 +61,4 @@ protected function createWeightTokens($tokens) return $weightTokens; } -} \ No newline at end of file +} diff --git a/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php b/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php index 9e83ad7..117b902 100644 --- a/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php +++ b/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php @@ -24,4 +24,4 @@ interface VectorizerInterface * @return array */ public function vectorize(array $tokens, $size); -} \ No newline at end of file +}