diff --git a/finediff.php b/finediff.php
index 2e38b77..0fad956 100644
--- a/finediff.php
+++ b/finediff.php
@@ -31,8 +31,13 @@
* @link http://www.raymondhill.net/finediff/
* @version 0.6
* @license MIT License (http://www.opensource.org/licenses/mit-license.php)
+ *
+ * 10-Dec-2011 (Christoph Mewes):
+ * - added UTF-8 support, fixed strange usage of htmlentities
*/
+mb_internal_encoding('UTF-8'); // NOTE(review): sets the mbstring default encoding globally, so it also affects mb_* calls in any code that includes this file
+
/**
* Usage (simplest):
*
@@ -107,13 +112,13 @@ public function getFromLen() {
return 0;
}
public function getToLen() {
- return strlen($this->text);
+ return mb_strlen($this->text); // length of the inserted text in characters, not bytes
}
public function getText() {
return $this->text;
}
public function getOpcode() {
- $to_len = strlen($this->text);
+ $to_len = mb_strlen($this->text);
if ( $to_len === 1 ) {
return "i:{$this->text}";
}
@@ -130,7 +135,7 @@ public function getFromLen() {
return $this->fromLen;
}
public function getToLen() {
- return strlen($this->text);
+ return mb_strlen($this->text); // length of the replacement text in characters, not bytes
}
public function getText() {
return $this->text;
@@ -142,7 +147,7 @@ public function getOpcode() {
else {
$del_opcode = "d{$this->fromLen}";
}
- $to_len = strlen($this->text);
+ $to_len = mb_strlen($this->text);
if ( $to_len === 1 ) {
return "{$del_opcode}i:{$this->text}";
}
@@ -177,6 +182,8 @@ public function increase($size) {
* Collection of ops
*/
class FineDiffOps {
+ public $edits = array();
+
public function appendOpcode($opcode, $from, $from_offset, $from_len) {
if ( $opcode === 'c' ) {
- $edits[] = new FineDiffCopyOp($from_len);
+ $this->edits[] = new FineDiffCopyOp($from_len); // append to the public property; a bare $edits is a local that is discarded on return
@@ -185,10 +192,9 @@ public function appendOpcode($opcode, $from, $from_offset, $from_len) {
- $edits[] = new FineDiffDeleteOp($from_len);
+ $this->edits[] = new FineDiffDeleteOp($from_len); // same fix: record the delete op on the object
}
else /* if ( $opcode === 'i' ) */ {
- $edits[] = new FineDiffInsertOp(substr($from, $from_offset, $from_len));
+ $this->edits[] = new FineDiffInsertOp(mb_substr($from, $from_offset, $from_len)); // offset/length are character counts
}
}
- public $edits = array();
}
/**
@@ -299,14 +305,14 @@ public static function renderFromOpcodes($from, $opcodes, $callback) {
if ( !is_callable($callback) ) {
return;
}
- $opcodes_len = strlen($opcodes);
+ $opcodes_len = mb_strlen($opcodes);
$from_offset = $opcodes_offset = 0;
while ( $opcodes_offset < $opcodes_len ) {
- $opcode = substr($opcodes, $opcodes_offset, 1);
+ $opcode = mb_substr($opcodes, $opcodes_offset, 1);
$opcodes_offset++;
- $n = intval(substr($opcodes, $opcodes_offset));
+ $n = intval(mb_substr($opcodes, $opcodes_offset));
if ( $n ) {
- $opcodes_offset += strlen(strval($n));
+ $opcodes_offset += mb_strlen(strval($n));
}
else {
$n = 1;
@@ -394,7 +400,7 @@ private function _processGranularity($from_segment, $to_segment) {
// increase granularity
if ( $fragment_edit instanceof FineDiffReplaceOp && $has_next_stage ) {
$this->_processGranularity(
- substr($this->from_text, $this->from_offset, $fragment_edit->getFromLen()),
+ mb_substr($this->from_text, $this->from_offset, $fragment_edit->getFromLen()),
$fragment_edit->getText()
);
}
@@ -432,8 +438,8 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
$result = array();
// fragment-level diffing
- $from_text_len = strlen($from_text);
- $to_text_len = strlen($to_text);
+ $from_text_len = mb_strlen($from_text);
+ $to_text_len = mb_strlen($to_text);
$from_fragments = FineDiff::extractFragments($from_text, $delimiters);
$to_fragments = FineDiff::extractFragments($to_text, $delimiters);
@@ -454,7 +460,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
$result[$from_segment_start * 4] = new FineDiffDeleteOp($from_segment_length);
}
else if ( $to_segment_length ) {
- $result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(substr($to_text, $to_segment_start, $to_segment_length));
+ $result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(mb_substr($to_text, $to_segment_start, $to_segment_length));
}
continue;
}
@@ -468,7 +474,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
while ( $from_base_fragment_index < $from_segment_end ) {
$from_base_fragment = $from_fragments[$from_base_fragment_index];
- $from_base_fragment_length = strlen($from_base_fragment);
+ $from_base_fragment_length = mb_strlen($from_base_fragment);
// performance boost: cache array keys
if ( !isset($cached_array_keys_for_current_segment[$from_base_fragment]) ) {
if ( !isset($cached_array_keys[$from_base_fragment]) ) {
@@ -510,7 +516,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
if ( $from_fragments[$fragment_from_index] !== $to_fragments[$fragment_to_index] ) {
break;
}
- $fragment_length = strlen($from_fragments[$fragment_from_index]);
+ $fragment_length = mb_strlen($from_fragments[$fragment_from_index]);
$fragment_index_offset += $fragment_length;
}
if ( $fragment_index_offset > $best_copy_length ) {
@@ -519,7 +525,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
$best_to_start = $to_base_fragment_index;
}
}
- $from_base_fragment_index += strlen($from_base_fragment);
+ $from_base_fragment_index += mb_strlen($from_base_fragment);
// If match is larger than half segment size, no point trying to find better
// TODO: Really?
if ( $best_copy_length >= $from_segment_length / 2) {
@@ -538,7 +544,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
$jobs[] = array($best_from_start + $best_copy_length, $from_segment_end, $best_to_start + $best_copy_length, $to_segment_end);
}
else {
- $result[$from_segment_start * 4 ] = new FineDiffReplaceOp($from_segment_length, substr($to_text, $to_segment_start, $to_segment_length));
+ $result[$from_segment_start * 4 ] = new FineDiffReplaceOp($from_segment_length, mb_substr($to_text, $to_segment_start, $to_segment_length));
}
}
@@ -564,7 +570,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
*/
private static function doCharDiff($from_text, $to_text) {
$result = array();
- $jobs = array(array(0, strlen($from_text), 0, strlen($to_text)));
+ $jobs = array(array(0, mb_strlen($from_text), 0, mb_strlen($to_text)));
while ( $job = array_pop($jobs) ) {
// get the segments which must be diff'ed
list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job;
@@ -577,7 +583,7 @@ private static function doCharDiff($from_text, $to_text) {
$result[$from_segment_start * 4 + 0] = new FineDiffDeleteOp($from_segment_len);
}
else if ( $to_segment_len ) {
- $result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(substr($to_text, $to_segment_start, $to_segment_len));
+ $result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(mb_substr($to_text, $to_segment_start, $to_segment_len));
}
continue;
}
@@ -587,7 +593,7 @@ private static function doCharDiff($from_text, $to_text) {
$to_copy_start = $to_segment_start;
$to_copy_start_max = $to_segment_end - $copy_len;
while ( $to_copy_start <= $to_copy_start_max ) {
- $from_copy_start = strpos(substr($from_text, $from_segment_start, $from_segment_len), substr($to_text, $to_copy_start, $copy_len));
+ $from_copy_start = mb_strpos(mb_substr($from_text, $from_segment_start, $from_segment_len), mb_substr($to_text, $to_copy_start, $copy_len));
if ( $from_copy_start !== false ) {
$from_copy_start += $from_segment_start;
break 2;
@@ -603,7 +609,7 @@ private static function doCharDiff($from_text, $to_text) {
$from_copy_start = $from_segment_start;
$from_copy_start_max = $from_segment_end - $copy_len;
while ( $from_copy_start <= $from_copy_start_max ) {
- $to_copy_start = strpos(substr($to_text, $to_segment_start, $to_segment_len), substr($from_text, $from_copy_start, $copy_len));
+ $to_copy_start = mb_strpos(mb_substr($to_text, $to_segment_start, $to_segment_len), mb_substr($from_text, $from_copy_start, $copy_len));
if ( $to_copy_start !== false ) {
$to_copy_start += $to_segment_start;
break 2;
@@ -621,7 +627,7 @@ private static function doCharDiff($from_text, $to_text) {
}
// no match, so delete all, insert all
else {
- $result[$from_segment_start * 4] = new FineDiffReplaceOp($from_segment_len, substr($to_text, $to_segment_start, $to_segment_len));
+ $result[$from_segment_start * 4] = new FineDiffReplaceOp($from_segment_len, mb_substr($to_text, $to_segment_start, $to_segment_len));
}
}
ksort($result, SORT_NUMERIC);
@@ -641,19 +647,19 @@ private static function doCharDiff($from_text, $to_text) {
private static function extractFragments($text, $delimiters) {
// special case: split into characters
if ( empty($delimiters) ) {
- $chars = str_split($text, 1);
- $chars[strlen($text)] = '';
+ $chars = self::splitToChars($text);
+ $chars[] = '';
return $chars;
}
$fragments = array();
$start = $end = 0;
for (;;) {
- $end += strcspn($text, $delimiters, $end);
- $end += strspn($text, $delimiters, $end);
+ $end += self::mb_strcspn($text, $delimiters, $end);
+ $end += self::mb_strspn($text, $delimiters, $end);
if ( $end === $start ) {
break;
}
- $fragments[$start] = substr($text, $start, $end - $start);
+ $fragments[$start] = mb_substr($text, $start, $end - $start);
$start = $end;
}
$fragments[$start] = '';
@@ -665,24 +671,57 @@ private static function extractFragments($text, $delimiters) {
*/
private static function renderToTextFromOpcode($opcode, $from, $from_offset, $from_len) {
if ( $opcode === 'c' || $opcode === 'i' ) {
- echo substr($from, $from_offset, $from_len);
+ echo mb_substr($from, $from_offset, $from_len); // copy and insert ops emit text; offset and length are counted in characters
}
}
private static function renderDiffToHTMLFromOpcode($opcode, $from, $from_offset, $from_len) {
if ( $opcode === 'c' ) {
- echo htmlentities(htmlentities(substr($from, $from_offset, $from_len)));
+ echo htmlspecialchars(mb_substr($from, $from_offset, $from_len), ENT_QUOTES, 'UTF-8'); // same flags and charset in every branch
}
else if ( $opcode === 'd' ) {
- $deletion = substr($from, $from_offset, $from_len);
- if ( strcspn($deletion, " \n\r") === 0 ) {
+ $deletion = mb_substr($from, $from_offset, $from_len);
+ if ( strcspn($deletion, " \n\r") === 0 ) { // byte-wise strcspn is fine here: the mask is pure ASCII
$deletion = str_replace(array("\n","\r"), array('\n','\r'), $deletion);
}
- echo '', htmlentities(htmlentities($deletion)), '';
+ echo '', htmlspecialchars($deletion, ENT_QUOTES, 'UTF-8'), ''; // escape quotes too, matching the insert branch
}
else /* if ( $opcode === 'i' ) */ {
- echo '', htmlentities(htmlentities(substr($from, $from_offset, $from_len))), '';
+ echo '', htmlspecialchars(mb_substr($from, $from_offset, $from_len), ENT_QUOTES, 'UTF-8'), ''; // explicit charset, independent of the PHP default
+ }
}
+
+ private static function splitToChars($str) {
+ // '/./us' matches one UTF-8 character per match, newlines included.
+ preg_match_all('/./us', $str, $matches);
+ // No match (e.g. the empty string) means there are no characters: return an
+ // empty list rather than array(''), so callers never see '' as a character.
+ return empty($matches[0]) ? array() : $matches[0];
+ }
+
+ private static function mb_strcspn($str, $delimiters, $start) {
+ $dels = self::splitToChars($delimiters);
+ $min = mb_strlen($str); // default: no delimiter found, the span runs to the end of $str
+ // Find the earliest delimiter at or after $start; all offsets are in characters.
+ foreach ($dels as $del) {
+ if ($del === '') continue; // empty needle: mb_strpos warns (PHP < 8) or matches at $start (PHP >= 8)
+ $pos = mb_strpos($str, $del, $start);
+ if ($pos !== false && $pos < $min) $min = $pos;
+ }
+ return $min - $start; // number of non-delimiter characters starting at $start
}
+
+ private static function mb_strspn($str, $delimiters, $start) {
+ // Build a regex character class from the delimiters, each quoted so it is literal.
+ $class = '';
+ foreach (self::splitToChars($delimiters) as $del) {
+ if ($del === '') continue; // nothing to add for an empty entry
+ $class .= preg_quote($del, '/');
}
+ if ($class === '') return 0; // no delimiters: the span is empty, and '[]' would not be a valid class
+ // A class matches the same run as '(a|b|...)+' but avoids group repetition, which can hit PCRE limits on long runs.
+ $found = preg_match('/^[' . $class . ']+/us', mb_substr($str, $start), $match);
+ return $found ? mb_strlen($match[0]) : 0; // length of the leading delimiter run, in characters
+ }
+}