diff --git a/finediff.php b/finediff.php index 2e38b77..0fad956 100644 --- a/finediff.php +++ b/finediff.php @@ -31,8 +31,13 @@ * @link http://www.raymondhill.net/finediff/ * @version 0.6 * @license MIT License (http://www.opensource.org/licenses/mit-license.php) + * + * 10-Dec-2011 (Christoph Mewes): + * - added UTF-8 support, fixed strange usage of htmlentities */ +mb_internal_encoding('UTF-8'); + /** * Usage (simplest): * @@ -107,13 +112,13 @@ public function getFromLen() { return 0; } public function getToLen() { - return strlen($this->text); + return mb_strlen($this->text); } public function getText() { return $this->text; } public function getOpcode() { - $to_len = strlen($this->text); + $to_len = mb_strlen($this->text); if ( $to_len === 1 ) { return "i:{$this->text}"; } @@ -130,7 +135,7 @@ public function getFromLen() { return $this->fromLen; } public function getToLen() { - return strlen($this->text); + return mb_strlen($this->text); } public function getText() { return $this->text; @@ -142,7 +147,7 @@ public function getOpcode() { else { $del_opcode = "d{$this->fromLen}"; } - $to_len = strlen($this->text); + $to_len = mb_strlen($this->text); if ( $to_len === 1 ) { return "{$del_opcode}i:{$this->text}"; } @@ -177,6 +182,8 @@ public function increase($size) { * Collection of ops */ class FineDiffOps { + public $edits = array(); + public function appendOpcode($opcode, $from, $from_offset, $from_len) { if ( $opcode === 'c' ) { $edits[] = new FineDiffCopyOp($from_len); @@ -185,10 +192,9 @@ public function appendOpcode($opcode, $from, $from_offset, $from_len) { $edits[] = new FineDiffDeleteOp($from_len); } else /* if ( $opcode === 'i' ) */ { - $edits[] = new FineDiffInsertOp(substr($from, $from_offset, $from_len)); + $edits[] = new FineDiffInsertOp(mb_substr($from, $from_offset, $from_len)); } } - public $edits = array(); } /** @@ -299,14 +305,14 @@ public static function renderFromOpcodes($from, $opcodes, $callback) { if ( !is_callable($callback) ) { return; } - $opcodes_len = strlen($opcodes); + $opcodes_len = mb_strlen($opcodes); $from_offset = $opcodes_offset = 0; while ( $opcodes_offset < $opcodes_len ) { - $opcode = substr($opcodes, $opcodes_offset, 1); + $opcode = mb_substr($opcodes, $opcodes_offset, 1); $opcodes_offset++; - $n = intval(substr($opcodes, $opcodes_offset)); + $n = intval(mb_substr($opcodes, $opcodes_offset)); if ( $n ) { - $opcodes_offset += strlen(strval($n)); + $opcodes_offset += mb_strlen(strval($n)); } else { $n = 1; @@ -394,7 +400,7 @@ private function _processGranularity($from_segment, $to_segment) { // increase granularity if ( $fragment_edit instanceof FineDiffReplaceOp && $has_next_stage ) { $this->_processGranularity( - substr($this->from_text, $this->from_offset, $fragment_edit->getFromLen()), + mb_substr($this->from_text, $this->from_offset, $fragment_edit->getFromLen()), $fragment_edit->getText() ); } @@ -432,8 +438,8 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { $result = array(); // fragment-level diffing - $from_text_len = strlen($from_text); - $to_text_len = strlen($to_text); + $from_text_len = mb_strlen($from_text); + $to_text_len = mb_strlen($to_text); $from_fragments = FineDiff::extractFragments($from_text, $delimiters); $to_fragments = FineDiff::extractFragments($to_text, $delimiters); @@ -454,7 +460,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { $result[$from_segment_start * 4] = new FineDiffDeleteOp($from_segment_length); } else if ( $to_segment_length ) { - $result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(substr($to_text, $to_segment_start, $to_segment_length)); + $result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(mb_substr($to_text, $to_segment_start, $to_segment_length)); } continue; } @@ -468,7 +474,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { while ( $from_base_fragment_index < $from_segment_end ) { $from_base_fragment = $from_fragments[$from_base_fragment_index]; - $from_base_fragment_length = strlen($from_base_fragment); + $from_base_fragment_length = mb_strlen($from_base_fragment); // performance boost: cache array keys if ( !isset($cached_array_keys_for_current_segment[$from_base_fragment]) ) { if ( !isset($cached_array_keys[$from_base_fragment]) ) { @@ -510,7 +516,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { if ( $from_fragments[$fragment_from_index] !== $to_fragments[$fragment_to_index] ) { break; } - $fragment_length = strlen($from_fragments[$fragment_from_index]); + $fragment_length = mb_strlen($from_fragments[$fragment_from_index]); $fragment_index_offset += $fragment_length; } if ( $fragment_index_offset > $best_copy_length ) { @@ -519,7 +525,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { $best_to_start = $to_base_fragment_index; } } - $from_base_fragment_index += strlen($from_base_fragment); + $from_base_fragment_index += mb_strlen($from_base_fragment); // If match is larger than half segment size, no point trying to find better // TODO: Really? if ( $best_copy_length >= $from_segment_length / 2) { @@ -538,7 +544,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { $jobs[] = array($best_from_start + $best_copy_length, $from_segment_end, $best_to_start + $best_copy_length, $to_segment_end); } else { - $result[$from_segment_start * 4 ] = new FineDiffReplaceOp($from_segment_length, substr($to_text, $to_segment_start, $to_segment_length)); + $result[$from_segment_start * 4 ] = new FineDiffReplaceOp($from_segment_length, mb_substr($to_text, $to_segment_start, $to_segment_length)); } } @@ -564,7 +570,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { */ private static function doCharDiff($from_text, $to_text) { $result = array(); - $jobs = array(array(0, strlen($from_text), 0, strlen($to_text))); + $jobs = array(array(0, mb_strlen($from_text), 0, mb_strlen($to_text))); while ( $job = array_pop($jobs) ) { // get the segments which must be diff'ed list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job; @@ -577,7 +583,7 @@ private static function doCharDiff($from_text, $to_text) { $result[$from_segment_start * 4 + 0] = new FineDiffDeleteOp($from_segment_len); } else if ( $to_segment_len ) { - $result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(substr($to_text, $to_segment_start, $to_segment_len)); + $result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(mb_substr($to_text, $to_segment_start, $to_segment_len)); } continue; } @@ -587,7 +593,7 @@ private static function doCharDiff($from_text, $to_text) { $to_copy_start = $to_segment_start; $to_copy_start_max = $to_segment_end - $copy_len; while ( $to_copy_start <= $to_copy_start_max ) { - $from_copy_start = strpos(substr($from_text, $from_segment_start, $from_segment_len), substr($to_text, $to_copy_start, $copy_len)); + $from_copy_start = mb_strpos(mb_substr($from_text, $from_segment_start, $from_segment_len), mb_substr($to_text, $to_copy_start, $copy_len)); if ( $from_copy_start !== false ) { $from_copy_start += $from_segment_start; break 2; @@ -603,7 +609,7 @@ private static function doCharDiff($from_text, $to_text) { $from_copy_start = $from_segment_start; $from_copy_start_max = $from_segment_end - $copy_len; while ( $from_copy_start <= $from_copy_start_max ) { - $to_copy_start = strpos(substr($to_text, $to_segment_start, $to_segment_len), substr($from_text, $from_copy_start, $copy_len)); + $to_copy_start = mb_strpos(mb_substr($to_text, $to_segment_start, $to_segment_len), mb_substr($from_text, $from_copy_start, $copy_len)); if ( $to_copy_start !== false ) { $to_copy_start += $to_segment_start; break 2; @@ -621,7 +627,7 @@ private static function doCharDiff($from_text, $to_text) { } // no match, so delete all, insert all else { - $result[$from_segment_start * 4] = new FineDiffReplaceOp($from_segment_len, substr($to_text, $to_segment_start, $to_segment_len)); + $result[$from_segment_start * 4] = new FineDiffReplaceOp($from_segment_len, mb_substr($to_text, $to_segment_start, $to_segment_len)); } } ksort($result, SORT_NUMERIC); @@ -641,19 +647,19 @@ private static function doCharDiff($from_text, $to_text) { private static function extractFragments($text, $delimiters) { // special case: split into characters if ( empty($delimiters) ) { - $chars = str_split($text, 1); - $chars[strlen($text)] = ''; + $chars = self::splitToChars($text); + $chars[] = ''; return $chars; } $fragments = array(); $start = $end = 0; for (;;) { - $end += strcspn($text, $delimiters, $end); - $end += strspn($text, $delimiters, $end); + $end += self::mb_strcspn($text, $delimiters, $end); + $end += self::mb_strspn($text, $delimiters, $end); if ( $end === $start ) { break; } - $fragments[$start] = substr($text, $start, $end - $start); + $fragments[$start] = mb_substr($text, $start, $end - $start); $start = $end; } $fragments[$start] = ''; @@ -665,24 +671,57 @@ private static function extractFragments($text, $delimiters) { */ private static function renderToTextFromOpcode($opcode, $from, $from_offset, $from_len) { if ( $opcode === 'c' || $opcode === 'i' ) { - echo substr($from, $from_offset, $from_len); + echo mb_substr($from, $from_offset, $from_len); } } private static function renderDiffToHTMLFromOpcode($opcode, $from, $from_offset, $from_len) { if ( $opcode === 'c' ) { - echo htmlentities(htmlentities(substr($from, $from_offset, $from_len))); + echo htmlspecialchars(mb_substr($from, $from_offset, $from_len)); } else if ( $opcode === 'd' ) { - $deletion = substr($from, $from_offset, $from_len); - if ( strcspn($deletion, " \n\r") === 0 ) { + $deletion = mb_substr($from, $from_offset, $from_len); + if ( strcspn($deletion, " \n\r") === 0 ) { // no mb_ here is okay $deletion = str_replace(array("\n","\r"), array('\n','\r'), $deletion); } - echo '', htmlentities(htmlentities($deletion)), ''; + echo '', htmlspecialchars($deletion), ''; } else /* if ( $opcode === 'i' ) */ { - echo '', htmlentities(htmlentities(substr($from, $from_offset, $from_len))), ''; + echo '', htmlspecialchars(mb_substr($from, $from_offset, $from_len), ENT_QUOTES), ''; + } } + + private static function splitToChars($str) { + preg_match_all('/./us', $str, $matches); + $matches = $matches[0]; + + if (count($matches) === 0) return array(''); + return $matches; + } + + private static function mb_strcspn($str, $delimiters, $start) { + $dels = self::splitToChars($delimiters); + $min = mb_strlen($str); + + foreach ($dels as $del) { + $pos = mb_strpos($str, $del, $start); + if ($pos !== false && $pos < $min) $min = $pos; + } + + return $min - $start; } + + private static function mb_strspn($str, $delimiters, $start) { + $str = mb_substr($str, $start); + $dels = self::splitToChars($delimiters); + + foreach ($dels as $idx => $del) { + $dels[$idx] = preg_quote($del, '/'); } + $dels = implode('|', $dels); + + preg_match("/^($dels)+/us", $str, $match); + return $match ? mb_strlen($match[0]) : 0; + } +}