Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/Tga/SimHash/Comparator/ComparatorInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ interface ComparatorInterface
* @return float
*/
public function compare(Fingerprint $fp1, Fingerprint $fp2);
}
}
26 changes: 24 additions & 2 deletions lib/Tga/SimHash/Comparator/GaussianComparator.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,28 @@ public function __construct($deviation = 4)
$this->deviation = $deviation;
}

/**
 * Count the positions at which the binary strings of two fingerprints differ.
 *
 * Only the first $size characters of each binary string are examined.
 * NOTE(review): assumes both binary strings are at least $size characters
 * long; shorter strings would trigger string-offset notices - confirm that
 * callers always pass fingerprints of the same, matching size.
 *
 * @param int $size Number of leading characters to compare
 * @param Fingerprint $fp1
 * @param Fingerprint $fp2
 * @return int Number of positions whose characters differ
 */
private static function countDifferences($size, Fingerprint $fp1, Fingerprint $fp2)
{
    $val1 = $fp1->getBinary();
    $val2 = $fp2->getBinary();

    $ret = 0;
    for ($i = 0; $i < $size; $i++) {
        // Strict comparison: the operands are single characters, so no
        // type juggling should ever take part in deciding equality.
        if ($val1[$i] !== $val2[$i]) {
            $ret++;
        }
    }

    return $ret;
}

/**
* Compare the two fingerprints and return a similarity index between 0 and 1.
*
Expand All @@ -51,7 +73,7 @@ public function compare(Fingerprint $fp1, Fingerprint $fp2)
));
}

$countDifferences = substr_count(decbin($fp1->getDecimal() ^ $fp2->getDecimal()), '1');
$countDifferences = GaussianComparator::countDifferences($fp1->getSize(), $fp1, $fp2);

return $this->computeSimilarityIndex($countDifferences);
}
Expand Down Expand Up @@ -82,4 +104,4 @@ protected function gaussianDensity($x)

return $y;
}
}
}
2 changes: 1 addition & 1 deletion lib/Tga/SimHash/Extractor/ExtractorInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ interface ExtractorInterface
* @return array
*/
public function extract($input);
}
}
2 changes: 1 addition & 1 deletion lib/Tga/SimHash/Extractor/HtmlExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,4 @@ protected function parseBody(\DOMDocument $document)

return $node->nodeValue;
}
}
}
2 changes: 1 addition & 1 deletion lib/Tga/SimHash/Extractor/SimpleTextExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ public function extract($text)

return explode('-', $slugifiedText);
}
}
}
33 changes: 7 additions & 26 deletions lib/Tga/SimHash/Fingerprint.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,21 @@ class Fingerprint
protected $size;

/**
* @var float
* @var string
*/
protected $decimalValue;
private $value;


/**
* Constructor
*
* @param int $size
* @param float $decimalValue
* @param string $value
*/
public function __construct($size, $decimalValue)
public function __construct($size, $value)
{
$this->size = $size;
$this->decimalValue = (float) $decimalValue;
$this->value = $value;
}

/**
Expand All @@ -55,33 +55,14 @@ public function getSize()
return $this->size;
}

/**
* Get the decimal value
*
* @return float
*/
public function getDecimal()
{
return $this->decimalValue;
}

/**
* Get the binary value as a string
*
* @return string
*/
public function getBinary()
{
return str_pad(decbin($this->decimalValue), $this->size, '0', STR_PAD_LEFT);
return $this->value;
}

/**
* Get the hexadecimal value as a string
*
* @return string
*/
public function getHexa()
{
return dechex($this->decimalValue);
}
}
}
8 changes: 6 additions & 2 deletions lib/Tga/SimHash/SimHash.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class SimHash
const SIMHASH_32 = 32;
const SIMHASH_64 = 64;
const SIMHASH_128 = 128;
const SIMHASH_256 = 256;
const SIMHASH_512 = 512;

/**
* @var Tokenizer\TokenizerInterface[]
Expand All @@ -42,6 +44,8 @@ public function __construct()
$this->tokenizers = [
new Tokenizer\String64Tokenizer(),
new Tokenizer\String128Tokenizer(),
new Tokenizer\String256Tokenizer(),
new Tokenizer\String512Tokenizer(),
new Tokenizer\String32Tokenizer()
];

Expand Down Expand Up @@ -104,7 +108,7 @@ public function hash($elements, $size = self::SIMHASH_64)
}
}

return new Fingerprint($size, bindec(implode('', $fingerprint)));
return new Fingerprint($size, implode('', $fingerprint));
}

/**
Expand Down Expand Up @@ -175,4 +179,4 @@ protected function findTokenizer($element, $size)

return $tokenizer;
}
}
}
10 changes: 8 additions & 2 deletions lib/Tga/SimHash/Tokenizer/String128Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,19 @@
*/
class String128Tokenizer implements TokenizerInterface
{
private static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f');
private static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111');

/**
* @param string $element
* @return string
*/
public function tokenize($element)
{
return str_pad(base_convert(md5($element), 16, 2), 128, '0', STR_PAD_LEFT);
$hash = md5($element);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you need to split this in parts?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make debugging more comfortable.

$hash = str_replace(self::$search, self::$replace, $hash);
$hash = str_pad($hash, 128, '0', STR_PAD_LEFT);
return $hash;
}

/**
Expand All @@ -48,4 +54,4 @@ public function supportsSize($size)
{
return $size === 128;
}
}
}
57 changes: 57 additions & 0 deletions lib/Tga/SimHash/Tokenizer/String256Tokenizer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<?php

/*
* This file is part of the SimHashPhp package.
*
* (c) Titouan Galopin <http://titouangalopin.com/>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Tga\SimHash\Tokenizer;

/**
 * Tokenizer for strings that generates 256-bit tokens.
 *
 * A token is the SHA-256 digest of the element, rendered as a string of
 * 256 '0' / '1' characters.
 *
 * @author Titouan Galopin <http://titouangalopin.com/>
 */
class String256Tokenizer implements TokenizerInterface
{
    /**
     * Lowercase hexadecimal digit => its 4-character binary representation.
     *
     * @var array
     */
    private static $hexToBinary = [
        '0' => '0000',
        '1' => '0001',
        '2' => '0010',
        '3' => '0011',
        '4' => '0100',
        '5' => '0101',
        '6' => '0110',
        '7' => '0111',
        '8' => '1000',
        '9' => '1001',
        'a' => '1010',
        'b' => '1011',
        'c' => '1100',
        'd' => '1101',
        'e' => '1110',
        'f' => '1111',
    ];

    /**
     * Hash the element with SHA-256 and return the digest as a binary string.
     *
     * @param string $element
     * @return string 256 characters, each '0' or '1'
     */
    public function tokenize($element)
    {
        // hash() returns the digest as 64 lowercase hexadecimal digits.
        $hex = hash('sha256', $element);

        // strtr() translates every digit in a single pass and never rescans
        // its own output, so the result does not depend on the order of the
        // map entries (unlike str_replace() with parallel arrays, which
        // applies its replacements one after another).
        // 64 digits * 4 bits = 256 characters, so no padding is required.
        return strtr($hex, self::$hexToBinary);
    }

    /**
     * Does this tokenizer support the given element?
     *
     * @param string $element
     * @return boolean True when the element is a string
     */
    public function supportsElement($element)
    {
        return is_string($element);
    }

    /**
     * Does this tokenizer return tokens of the given size?
     *
     * @param int $size
     * @return boolean True only for the integer 256
     */
    public function supportsSize($size)
    {
        return $size === 256;
    }
}
2 changes: 1 addition & 1 deletion lib/Tga/SimHash/Tokenizer/String32Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ public function supportsSize($size)
{
return $size === 32;
}
}
}
57 changes: 57 additions & 0 deletions lib/Tga/SimHash/Tokenizer/String512Tokenizer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<?php

/*
* This file is part of the SimHashPhp package.
*
* (c) Titouan Galopin <http://titouangalopin.com/>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Tga\SimHash\Tokenizer;

/**
 * Tokenizer for strings that generates 512-bit tokens.
 *
 * A token is the SHA-512 digest of the element, rendered as a string of
 * 512 '0' / '1' characters.
 *
 * @author Titouan Galopin <http://titouangalopin.com/>
 */
class String512Tokenizer implements TokenizerInterface
{
    /**
     * Lowercase hexadecimal digit => its 4-character binary representation.
     *
     * @var array
     */
    private static $hexToBinary = [
        '0' => '0000',
        '1' => '0001',
        '2' => '0010',
        '3' => '0011',
        '4' => '0100',
        '5' => '0101',
        '6' => '0110',
        '7' => '0111',
        '8' => '1000',
        '9' => '1001',
        'a' => '1010',
        'b' => '1011',
        'c' => '1100',
        'd' => '1101',
        'e' => '1110',
        'f' => '1111',
    ];

    /**
     * Hash the element with SHA-512 and return the digest as a binary string.
     *
     * @param string $element
     * @return string 512 characters, each '0' or '1'
     */
    public function tokenize($element)
    {
        // hash() returns the digest as 128 lowercase hexadecimal digits.
        $hex = hash('sha512', $element);

        // strtr() translates every digit in a single pass and never rescans
        // its own output, so the result does not depend on the order of the
        // map entries (unlike str_replace() with parallel arrays, which
        // applies its replacements one after another).
        // 128 digits * 4 bits = 512 characters, so no padding is required.
        return strtr($hex, self::$hexToBinary);
    }

    /**
     * Does this tokenizer support the given element?
     *
     * @param string $element
     * @return boolean True when the element is a string
     */
    public function supportsElement($element)
    {
        return is_string($element);
    }

    /**
     * Does this tokenizer return tokens of the given size?
     *
     * @param int $size
     * @return boolean True only for the integer 512
     */
    public function supportsSize($size)
    {
        return $size === 512;
    }
}
2 changes: 1 addition & 1 deletion lib/Tga/SimHash/Tokenizer/String64Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,4 @@ private function buildTable()

return $crc64tab;
}
}
}
2 changes: 1 addition & 1 deletion lib/Tga/SimHash/Tokenizer/TokenizerInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,4 @@ public function supportsElement($element);
* @return boolean
*/
public function supportsSize($size);
}
}
2 changes: 1 addition & 1 deletion lib/Tga/SimHash/Vectorizer/DefaultVectorizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,4 @@ protected function createWeightTokens($tokens)

return $weightTokens;
}
}
}
2 changes: 1 addition & 1 deletion lib/Tga/SimHash/Vectorizer/VectorizerInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ interface VectorizerInterface
* @return array
*/
public function vectorize(array $tokens, $size);
}
}