diff --git a/lib/Tga/SimHash/Comparator/ComparatorInterface.php b/lib/Tga/SimHash/Comparator/ComparatorInterface.php
index c7233ba..e699aa5 100644
--- a/lib/Tga/SimHash/Comparator/ComparatorInterface.php
+++ b/lib/Tga/SimHash/Comparator/ComparatorInterface.php
@@ -28,4 +28,4 @@ interface ComparatorInterface
* @return float
*/
public function compare(Fingerprint $fp1, Fingerprint $fp2);
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Comparator/GaussianComparator.php b/lib/Tga/SimHash/Comparator/GaussianComparator.php
index baa8dad..632b69f 100644
--- a/lib/Tga/SimHash/Comparator/GaussianComparator.php
+++ b/lib/Tga/SimHash/Comparator/GaussianComparator.php
@@ -35,6 +35,28 @@ public function __construct($deviation = 4)
$this->deviation = $deviation;
}
+    /**
+     * Count the positions at which the binary strings of two fingerprints differ.
+     *
+     * @param int         $size Number of leading positions to inspect
+     * @param Fingerprint $fp1  First fingerprint, read through getBinary()
+     * @param Fingerprint $fp2  Second fingerprint, read through getBinary()
+     * @return int Number of inspected positions holding different characters
+     */
+    private static function countDifferences($size, Fingerprint $fp1, Fingerprint $fp2) {
+        $left = $fp1->getBinary();
+        $right = $fp2->getBinary();
+
+        $distance = 0;
+        $position = 0;
+        while ($position < $size) {
+            $distance += ($left[$position] != $right[$position]) ? 1 : 0;
+            $position++;
+        }
+
+        return $distance;
+    }
+
/**
* Compare the two fingerprints and return a similarity index between 0 and 1.
*
@@ -51,7 +73,7 @@ public function compare(Fingerprint $fp1, Fingerprint $fp2)
));
}
- $countDifferences = substr_count(decbin($fp1->getDecimal() ^ $fp2->getDecimal()), '1');
+ $countDifferences = GaussianComparator::countDifferences($fp1->getSize(), $fp1, $fp2);
return $this->computeSimilarityIndex($countDifferences);
}
@@ -82,4 +104,4 @@ protected function gaussianDensity($x)
return $y;
}
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Extractor/ExtractorInterface.php b/lib/Tga/SimHash/Extractor/ExtractorInterface.php
index 273e00b..60df639 100644
--- a/lib/Tga/SimHash/Extractor/ExtractorInterface.php
+++ b/lib/Tga/SimHash/Extractor/ExtractorInterface.php
@@ -26,4 +26,4 @@ interface ExtractorInterface
* @return array
*/
public function extract($input);
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Extractor/HtmlExtractor.php b/lib/Tga/SimHash/Extractor/HtmlExtractor.php
index 1360782..ddadb63 100644
--- a/lib/Tga/SimHash/Extractor/HtmlExtractor.php
+++ b/lib/Tga/SimHash/Extractor/HtmlExtractor.php
@@ -94,4 +94,4 @@ protected function parseBody(\DOMDocument $document)
return $node->nodeValue;
}
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php b/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php
index e13a8f7..c24a7ab 100644
--- a/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php
+++ b/lib/Tga/SimHash/Extractor/SimpleTextExtractor.php
@@ -34,4 +34,4 @@ public function extract($text)
return explode('-', $slugifiedText);
}
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Fingerprint.php b/lib/Tga/SimHash/Fingerprint.php
index 8df9c20..d53f241 100644
--- a/lib/Tga/SimHash/Fingerprint.php
+++ b/lib/Tga/SimHash/Fingerprint.php
@@ -22,21 +22,21 @@ class Fingerprint
protected $size;
/**
- * @var float
+ * @var string
*/
- protected $decimalValue;
+ private $value;
/**
* Constructor
*
* @param int $size
- * @param float $decimalValue
+ * @param string $value
*/
- public function __construct($size, $decimalValue)
+ public function __construct($size, $value)
{
$this->size = $size;
- $this->decimalValue = (float) $decimalValue;
+ $this->value = $value;
}
/**
@@ -55,16 +55,6 @@ public function getSize()
return $this->size;
}
- /**
- * Get the decimal value
- *
- * @return float
- */
- public function getDecimal()
- {
- return $this->decimalValue;
- }
-
/**
* Get the binary value as a string
*
@@ -72,16 +62,7 @@ public function getDecimal()
*/
public function getBinary()
{
- return str_pad(decbin($this->decimalValue), $this->size, '0', STR_PAD_LEFT);
+ return $this->value;
}
- /**
- * Get the hexadecimal value as a string
- *
- * @return string
- */
- public function getHexa()
- {
- return dechex($this->decimalValue);
- }
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/SimHash.php b/lib/Tga/SimHash/SimHash.php
index 3406812..6f13e5e 100644
--- a/lib/Tga/SimHash/SimHash.php
+++ b/lib/Tga/SimHash/SimHash.php
@@ -22,6 +22,8 @@ class SimHash
const SIMHASH_32 = 32;
const SIMHASH_64 = 64;
const SIMHASH_128 = 128;
+ const SIMHASH_256 = 256;
+ const SIMHASH_512 = 512;
/**
* @var Tokenizer\TokenizerInterface[]
@@ -42,6 +44,8 @@ public function __construct()
$this->tokenizers = [
new Tokenizer\String64Tokenizer(),
new Tokenizer\String128Tokenizer(),
+ new Tokenizer\String256Tokenizer(),
+ new Tokenizer\String512Tokenizer(),
new Tokenizer\String32Tokenizer()
];
@@ -104,7 +108,7 @@ public function hash($elements, $size = self::SIMHASH_64)
}
}
- return new Fingerprint($size, bindec(implode('', $fingerprint)));
+ return new Fingerprint($size, implode('', $fingerprint));
}
/**
@@ -175,4 +179,4 @@ protected function findTokenizer($element, $size)
return $tokenizer;
}
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php
index f374ee4..28586b0 100644
--- a/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php
+++ b/lib/Tga/SimHash/Tokenizer/String128Tokenizer.php
@@ -18,13 +18,19 @@
*/
class String128Tokenizer implements TokenizerInterface
{
+ private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f');
+ private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111');
+
/**
* @param string $element
* @return string
*/
public function tokenize($element)
{
- return str_pad(base_convert(md5($element), 16, 2), 128, '0', STR_PAD_LEFT);
+ $hash = md5($element);
+ $hash = str_replace(self::$search, self::$replace, $hash);
+ $hash = str_pad($hash, 128, '0', STR_PAD_LEFT);
+ return $hash;
}
/**
@@ -48,4 +54,4 @@ public function supportsSize($size)
{
return $size === 128;
}
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php
new file mode 100644
index 0000000..6acf316
--- /dev/null
+++ b/lib/Tga/SimHash/Tokenizer/String256Tokenizer.php
@@ -0,0 +1,57 @@
+<?php
+
+/*
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Tga\SimHash\Tokenizer;
+
+/**
+ * String tokenizer that turns the SHA-256 digest of an element into a 256 bit token.
+ *
+ * @author Titouan Galopin
+ */
+class String256Tokenizer implements TokenizerInterface
+{
+    /** @var string[] Four-character binary expansion of each lowercase hexadecimal digit */
+    private static $hexToBits = array('0' => '0000', '1' => '0001', '2' => '0010', '3' => '0011', '4' => '0100', '5' => '0101', '6' => '0110', '7' => '0111', '8' => '1000', '9' => '1001', 'a' => '1010', 'b' => '1011', 'c' => '1100', 'd' => '1101', 'e' => '1110', 'f' => '1111');
+
+    /**
+     * @param string $element Element to tokenize
+     * @return string Token written as a string of '0' and '1' characters
+     */
+    public function tokenize($element)
+    {
+        // Single pass over the hexadecimal digest: every digit becomes its four bits.
+        $bits = strtr(hash('sha256', $element), self::$hexToBits);
+
+        return str_pad($bits, 256, '0', STR_PAD_LEFT);
+    }
+
+    /**
+     * Tell whether this tokenizer supports the given element (strings only).
+     *
+     * @param string $element
+     * @return boolean
+     */
+    public function supportsElement($element)
+    {
+        return is_string($element);
+    }
+
+    /**
+     * Tell whether this tokenizer returns tokens of the given size.
+     *
+     * @param int $size
+     * @return boolean
+     */
+    public function supportsSize($size)
+    {
+        return $size === 256;
+    }
+}
diff --git a/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php
index 4824939..1b6a24d 100644
--- a/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php
+++ b/lib/Tga/SimHash/Tokenizer/String32Tokenizer.php
@@ -48,4 +48,4 @@ public function supportsSize($size)
{
return $size === 32;
}
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php
new file mode 100644
index 0000000..c495e07
--- /dev/null
+++ b/lib/Tga/SimHash/Tokenizer/String512Tokenizer.php
@@ -0,0 +1,57 @@
+<?php
+
+/*
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Tga\SimHash\Tokenizer;
+
+/**
+ * String tokenizer that turns the SHA-512 digest of an element into a 512 bit token.
+ *
+ * @author Titouan Galopin
+ */
+class String512Tokenizer implements TokenizerInterface
+{
+    /** @var string[] Four-character binary expansion of each lowercase hexadecimal digit */
+    private static $hexToBits = array('0' => '0000', '1' => '0001', '2' => '0010', '3' => '0011', '4' => '0100', '5' => '0101', '6' => '0110', '7' => '0111', '8' => '1000', '9' => '1001', 'a' => '1010', 'b' => '1011', 'c' => '1100', 'd' => '1101', 'e' => '1110', 'f' => '1111');
+
+    /**
+     * @param string $element Element to tokenize
+     * @return string Token written as a string of '0' and '1' characters
+     */
+    public function tokenize($element)
+    {
+        // Single pass over the hexadecimal digest: every digit becomes its four bits.
+        $bits = strtr(hash('sha512', $element), self::$hexToBits);
+
+        return str_pad($bits, 512, '0', STR_PAD_LEFT);
+    }
+
+    /**
+     * Tell whether this tokenizer supports the given element (strings only).
+     *
+     * @param string $element
+     * @return boolean
+     */
+    public function supportsElement($element)
+    {
+        return is_string($element);
+    }
+
+    /**
+     * Tell whether this tokenizer returns tokens of the given size.
+     *
+     * @param int $size
+     * @return boolean
+     */
+    public function supportsSize($size)
+    {
+        return $size === 512;
+    }
+}
diff --git a/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php b/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php
index fbd95eb..b25c9b3 100644
--- a/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php
+++ b/lib/Tga/SimHash/Tokenizer/String64Tokenizer.php
@@ -108,4 +108,4 @@ private function buildTable()
return $crc64tab;
}
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php b/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php
index 68dc0e1..942c276 100644
--- a/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php
+++ b/lib/Tga/SimHash/Tokenizer/TokenizerInterface.php
@@ -41,4 +41,4 @@ public function supportsElement($element);
* @return boolean
*/
public function supportsSize($size);
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php b/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php
index e55d86f..045f229 100644
--- a/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php
+++ b/lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php
@@ -61,4 +61,4 @@ protected function createWeightTokens($tokens)
return $weightTokens;
}
-}
\ No newline at end of file
+}
diff --git a/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php b/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php
index 9e83ad7..117b902 100644
--- a/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php
+++ b/lib/Tga/SimHash/Vectorizer/VectorizerInterface.php
@@ -24,4 +24,4 @@ interface VectorizerInterface
* @return array
*/
public function vectorize(array $tokens, $size);
-}
\ No newline at end of file
+}