From 521c2fa47f6658f1c452f678fde0728620ae30e2 Mon Sep 17 00:00:00 2001 From: Christoph Mewes Date: Mon, 12 Dec 2011 01:18:09 +0100 Subject: [PATCH] added UTF-8 support, reformatted code --- finediff.php | 653 +++++++++++++++++++++++++++------------------------ 1 file changed, 347 insertions(+), 306 deletions(-) diff --git a/finediff.php b/finediff.php index f0d913d..d38c7f2 100644 --- a/finediff.php +++ b/finediff.php @@ -1,236 +1,250 @@ copy->insert -* command (swap) for when the inserted segment is exactly the same -* as the deleted one, and with only a copy operation in between. -* TODO: How often this case occurs? Is it worth it? Can only -* be done as a postprocessing method (->optimize()?) -*/ + * Persisted opcodes (string) are a sequence of atomic opcode. + * A single opcode can be one of the following: + * c | c{n} | d | d{n} | i:{c} | i{length}:{s} + * 'c' = copy one character from source + * 'c{n}' = copy n characters from source + * 'd' = skip one character from source + * 'd{n}' = skip n characters from source + * 'i:{c} = insert character 'c' + * 'i{n}:{s}' = insert string s, which is of length n + * + * Do not exist as of now, under consideration: + * 'm{n}:{o} = move n characters from source o characters ahead. + * It would be essentially a shortcut for a delete->copy->insert + * command (swap) for when the inserted segment is exactly the same + * as the deleted one, and with only a copy operation in between. + * TODO: How often this case occurs? Is it worth it? Can only + * be done as a postprocessing method (->optimize()?) + */ abstract class FineDiffOp { abstract public function getFromLen(); abstract public function getToLen(); abstract public function getOpcode(); - } +} class FineDiffDeleteOp extends FineDiffOp { public function __construct($len) { $this->fromLen = $len; - } + } + public function getFromLen() { return $this->fromLen; - } + } + public function getToLen() { return 0; - } + } + public function getOpcode() { if ( $this->fromLen === 1 ) { return 'd'; - } - return "d{$this->fromLen}"; } + return "d{$this->fromLen}"; } +} class FineDiffInsertOp extends FineDiffOp { public function __construct($text) { $this->text = $text; - } + } + public function getFromLen() { return 0; - } + } + public function getToLen() { - return strlen($this->text); - } + return mb_strlen($this->text); + } + public function getText() { return $this->text; - } + } + public function getOpcode() { - $to_len = strlen($this->text); + $to_len = mb_strlen($this->text); if ( $to_len === 1 ) { return "i:{$this->text}"; - } - return "i{$to_len}:{$this->text}"; } + return "i{$to_len}:{$this->text}"; } +} class FineDiffReplaceOp extends FineDiffOp { public function __construct($fromLen, $text) { $this->fromLen = $fromLen; $this->text = $text; - } + } + public function getFromLen() { return $this->fromLen; - } + } + public function getToLen() { - return strlen($this->text); - } + return mb_strlen($this->text); + } + public function getText() { return $this->text; - } + } + public function getOpcode() { if ( $this->fromLen === 1 ) { $del_opcode = 'd'; - } + } else { $del_opcode = "d{$this->fromLen}"; - } - $to_len = strlen($this->text); + } + $to_len = mb_strlen($this->text); if ( $to_len === 1 ) { return "{$del_opcode}i:{$this->text}"; - } - return "{$del_opcode}i{$to_len}:{$this->text}"; } + return "{$del_opcode}i{$to_len}:{$this->text}"; } +} class FineDiffCopyOp extends FineDiffOp { public function __construct($len) { $this->len = $len; - } + } + public function getFromLen() { return $this->len; - } + } + public function getToLen() { return $this->len; - } + } + public function getOpcode() { if ( $this->len === 1 ) { return 'c'; - } - return "c{$this->len}"; } + return "c{$this->len}"; + } + public function increase($size) { return $this->len += $size; - } } +} /** -* FineDiff ops -* -* Collection of ops -*/ + * FineDiff ops + * + * Collection of ops + */ class FineDiffOps { + public $edits = array(); + public function appendOpcode($opcode, $from, $from_offset, $from_len) { if ( $opcode === 'c' ) { $edits[] = new FineDiffCopyOp($from_len); - } + } else if ( $opcode === 'd' ) { $edits[] = new FineDiffDeleteOp($from_len); - } + } else /* if ( $opcode === 'i' ) */ { - $edits[] = new FineDiffInsertOp(substr($from, $from_offset, $from_len)); - } + $edits[] = new FineDiffInsertOp(mb_substr($from, $from_offset, $from_len)); } - public $edits = array(); } +} /** -* FineDiff class -* -* TODO: Document -* -*/ + * FineDiff class + * + * TODO: Document + * + */ class FineDiff { - - /**------------------------------------------------------------------------ - * - * Public section - * - */ - /** - * Constructor - * ... - * The $granularityStack allows FineDiff to be configurable so that - * a particular stack tailored to the specific content of a document can - * be passed. - */ + * Constructor + * ... + * The $granularityStack allows FineDiff to be configurable so that + * a particular stack tailored to the specific content of a document can + * be passed. + */ public function __construct($from_text = '', $to_text = '', $granularityStack = null) { // setup stack for generic text documents by default - $this->granularityStack = $granularityStack ? $granularityStack : FineDiff::$characterGranularity; + $this->granularityStack = $granularityStack ? $granularityStack : self::$characterGranularity; $this->edits = array(); $this->from_text = $from_text; $this->doDiff($from_text, $to_text); - } + } public function getOps() { return $this->edits; - } + } public function getOpcodes() { $opcodes = array(); foreach ( $this->edits as $edit ) { $opcodes[] = $edit->getOpcode(); - } - return implode('', $opcodes); } + return implode('', $opcodes); + } public function renderDiffToHTML() { $in_offset = 0; @@ -238,136 +252,130 @@ public function renderDiffToHTML() { foreach ( $this->edits as $edit ) { $n = $edit->getFromLen(); if ( $edit instanceof FineDiffCopyOp ) { - FineDiff::renderDiffToHTMLFromOpcode('c', $this->from_text, $in_offset, $n); - } + self::renderDiffToHTMLFromOpcode('c', $this->from_text, $in_offset, $n); + } else if ( $edit instanceof FineDiffDeleteOp ) { - FineDiff::renderDiffToHTMLFromOpcode('d', $this->from_text, $in_offset, $n); - } + self::renderDiffToHTMLFromOpcode('d', $this->from_text, $in_offset, $n); + } else if ( $edit instanceof FineDiffInsertOp ) { - FineDiff::renderDiffToHTMLFromOpcode('i', $edit->getText(), 0, $edit->getToLen()); - } + self::renderDiffToHTMLFromOpcode('i', $edit->getText(), 0, $edit->getToLen()); + } else /* if ( $edit instanceof FineDiffReplaceOp ) */ { - FineDiff::renderDiffToHTMLFromOpcode('d', $this->from_text, $in_offset, $n); - FineDiff::renderDiffToHTMLFromOpcode('i', $edit->getText(), 0, $edit->getToLen()); - } - $in_offset += $n; + self::renderDiffToHTMLFromOpcode('d', $this->from_text, $in_offset, $n); + self::renderDiffToHTMLFromOpcode('i', $edit->getText(), 0, $edit->getToLen()); } - return ob_get_clean(); + $in_offset += $n; } + return ob_get_clean(); + } /**------------------------------------------------------------------------ - * Return an opcodes string describing the diff between a "From" and a - * "To" string - */ + * Return an opcodes string describing the diff between a "From" and a + * "To" string + */ public static function getDiffOpcodes($from, $to, $granularities = null) { - $diff = new FineDiff($from, $to, $granularities); + $diff = new self($from, $to, $granularities); return $diff->getOpcodes(); - } + } /**------------------------------------------------------------------------ - * Return an iterable collection of diff ops from an opcodes string - */ + * Return an iterable collection of diff ops from an opcodes string + */ public static function getDiffOpsFromOpcodes($opcodes) { $diffops = new FineDiffOps(); - FineDiff::renderFromOpcodes(null, $opcodes, array($diffops,'appendOpcode')); + self::renderFromOpcodes(null, $opcodes, array($diffops,'appendOpcode')); return $diffops->edits; - } + } /**------------------------------------------------------------------------ - * Re-create the "To" string from the "From" string and an "Opcodes" string - */ + * Re-create the "To" string from the "From" string and an "Opcodes" string + */ public static function renderToTextFromOpcodes($from, $opcodes) { ob_start(); - FineDiff::renderFromOpcodes($from, $opcodes, array('FineDiff','renderToTextFromOpcode')); + self::renderFromOpcodes($from, $opcodes, array('FineDiff','renderToTextFromOpcode')); return ob_get_clean(); - } + } - /**------------------------------------------------------------------------ - * Render the diff to an HTML string - */ + /** + * Render the diff to an HTML string + */ public static function renderDiffToHTMLFromOpcodes($from, $opcodes) { ob_start(); - FineDiff::renderFromOpcodes($from, $opcodes, array('FineDiff','renderDiffToHTMLFromOpcode')); + self::renderFromOpcodes($from, $opcodes, array('FineDiff', 'renderDiffToHTMLFromOpcode')); return ob_get_clean(); - } + } - /**------------------------------------------------------------------------ - * Generic opcodes parser, user must supply callback for handling - * single opcode - */ + /** + * Generic opcodes parser, user must supply callback for handling + * single opcode + */ public static function renderFromOpcodes($from, $opcodes, $callback) { if ( !is_callable($callback) ) { return; - } - $opcodes_len = strlen($opcodes); + } + $opcodes_len = mb_strlen($opcodes); $from_offset = $opcodes_offset = 0; while ( $opcodes_offset < $opcodes_len ) { - $opcode = substr($opcodes, $opcodes_offset, 1); + $opcode = mb_substr($opcodes, $opcodes_offset, 1); $opcodes_offset++; - $n = intval(substr($opcodes, $opcodes_offset)); + $n = intval(mb_substr($opcodes, $opcodes_offset)); if ( $n ) { - $opcodes_offset += strlen(strval($n)); - } + $opcodes_offset += mb_strlen(strval($n)); + } else { $n = 1; - } + } if ( $opcode === 'c' ) { // copy n characters from source call_user_func($callback, 'c', $from, $from_offset, $n, ''); $from_offset += $n; - } + } else if ( $opcode === 'd' ) { // delete n characters from source call_user_func($callback, 'd', $from, $from_offset, $n, ''); $from_offset += $n; - } + } else /* if ( $opcode === 'i' ) */ { // insert n characters from opcodes call_user_func($callback, 'i', $opcodes, $opcodes_offset + 1, $n); $opcodes_offset += 1 + $n; - } } } + } /** - * Stock granularity stacks and delimiters - */ + * Stock granularity stacks and delimiters + */ const paragraphDelimiters = "\n\r"; public static $paragraphGranularity = array( FineDiff::paragraphDelimiters - ); + ); const sentenceDelimiters = ".\n\r"; public static $sentenceGranularity = array( FineDiff::paragraphDelimiters, FineDiff::sentenceDelimiters - ); + ); const wordDelimiters = " \t.\n\r"; public static $wordGranularity = array( FineDiff::paragraphDelimiters, FineDiff::sentenceDelimiters, FineDiff::wordDelimiters - ); + ); const characterDelimiters = ""; public static $characterGranularity = array( FineDiff::paragraphDelimiters, FineDiff::sentenceDelimiters, FineDiff::wordDelimiters, FineDiff::characterDelimiters - ); + ); public static $textStack = array( ".", " \t.\n\r", "" - ); - - /**------------------------------------------------------------------------ - * - * Private section - * - */ + ); /** - * Entry point to compute the diff. - */ + * Entry point to compute the diff. + */ private function doDiff($from_text, $to_text) { $this->last_edit = false; $this->stackpointer = 0; @@ -376,64 +384,64 @@ private function doDiff($from_text, $to_text) { // can't diff without at least one granularity specifier if ( empty($this->granularityStack) ) { return; - } - $this->_processGranularity($from_text, $to_text); } + $this->_processGranularity($from_text, $to_text); + } /** - * This is the recursive function which is responsible for - * handling/increasing granularity. - * - * Incrementally increasing the granularity is key to compute the - * overall diff in a very efficient way. - */ + * This is the recursive function which is responsible for + * handling/increasing granularity. + * + * Incrementally increasing the granularity is key to compute the + * overall diff in a very efficient way. + */ private function _processGranularity($from_segment, $to_segment) { $delimiters = $this->granularityStack[$this->stackpointer++]; $has_next_stage = $this->stackpointer < count($this->granularityStack); foreach ( FineDiff::doFragmentDiff($from_segment, $to_segment, $delimiters) as $fragment_edit ) { // increase granularity - if ( $fragment_edit instanceof fineDiffReplaceOp && $has_next_stage ) { + if ( $fragment_edit instanceof FineDiffReplaceOp && $has_next_stage ) { $this->_processGranularity( - substr($this->from_text, $this->from_offset, $fragment_edit->getFromLen()), + mb_substr($this->from_text, $this->from_offset, $fragment_edit->getFromLen()), $fragment_edit->getText() ); - } + } // fuse copy ops whenever possible - else if ( $fragment_edit instanceof fineDiffCopyOp && $this->last_edit instanceof fineDiffCopyOp ) { + else if ( $fragment_edit instanceof FineDiffCopyOp && $this->last_edit instanceof FineDiffCopyOp ) { $this->edits[count($this->edits)-1]->increase($fragment_edit->getFromLen()); $this->from_offset += $fragment_edit->getFromLen(); - } + } else { - /* $fragment_edit instanceof fineDiffCopyOp */ - /* $fragment_edit instanceof fineDiffDeleteOp */ - /* $fragment_edit instanceof fineDiffInsertOp */ + /* $fragment_edit instanceof FineDiffCopyOp */ + /* $fragment_edit instanceof FineDiffDeleteOp */ + /* $fragment_edit instanceof FineDiffInsertOp */ $this->edits[] = $this->last_edit = $fragment_edit; $this->from_offset += $fragment_edit->getFromLen(); - } } - $this->stackpointer--; } + $this->stackpointer--; + } /** - * This is the core algorithm which actually perform the diff itself, - * fragmenting the strings as per specified delimiters. - * - * This function is naturally recursive, however for performance purpose - * a local job queue is used instead of outright recursivity. - */ + * This is the core algorithm which actually perform the diff itself, + * fragmenting the strings as per specified delimiters. + * + * This function is naturally recursive, however for performance purpose + * a local job queue is used instead of outright recursivity. + */ private static function doFragmentDiff($from_text, $to_text, $delimiters) { // Empty delimiter means character-level diffing. // In such case, use code path optimized for character-level // diffing. if ( empty($delimiters) ) { return FineDiff::doCharDiff($from_text, $to_text); - } + } $result = array(); // fragment-level diffing - $from_text_len = strlen($from_text); - $to_text_len = strlen($to_text); + $from_text_len = mb_strlen($from_text); + $to_text_len = mb_strlen($to_text); $from_fragments = FineDiff::extractFragments($from_text, $delimiters); $to_fragments = FineDiff::extractFragments($to_text, $delimiters); @@ -451,13 +459,13 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { $to_segment_length = $to_segment_end - $to_segment_start; if ( !$from_segment_length || !$to_segment_length ) { if ( $from_segment_length ) { - $result[$from_segment_start * 4] = new fineDiffDeleteOp($from_segment_length); - } + $result[$from_segment_start * 4] = new FineDiffDeleteOp($from_segment_length); + } else if ( $to_segment_length ) { - $result[$from_segment_start * 4 + 1] = new fineDiffInsertOp(substr($to_text, $to_segment_start, $to_segment_length)); - } - continue; + $result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(mb_substr($to_text, $to_segment_start, $to_segment_length)); } + continue; + } // find longest copy operation for the current segments $best_copy_length = 0; @@ -468,15 +476,15 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { while ( $from_base_fragment_index < $from_segment_end ) { $from_base_fragment = $from_fragments[$from_base_fragment_index]; - $from_base_fragment_length = strlen($from_base_fragment); + $from_base_fragment_length = mb_strlen($from_base_fragment); // performance boost: cache array keys if ( !isset($cached_array_keys_for_current_segment[$from_base_fragment]) ) { if ( !isset($cached_array_keys[$from_base_fragment]) ) { $to_all_fragment_indices = $cached_array_keys[$from_base_fragment] = array_keys($to_fragments, $from_base_fragment, true); - } + } else { $to_all_fragment_indices = $cached_array_keys[$from_base_fragment]; - } + } // get only indices which falls within current segment if ( $to_segment_start > 0 || $to_segment_end < $to_text_len ) { $to_fragment_indices = array(); @@ -484,16 +492,16 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { if ( $to_fragment_index < $to_segment_start ) { continue; } if ( $to_fragment_index >= $to_segment_end ) { break; } $to_fragment_indices[] = $to_fragment_index; - } - $cached_array_keys_for_current_segment[$from_base_fragment] = $to_fragment_indices; } + $cached_array_keys_for_current_segment[$from_base_fragment] = $to_fragment_indices; + } else { $to_fragment_indices = $to_all_fragment_indices; - } } + } else { $to_fragment_indices = $cached_array_keys_for_current_segment[$from_base_fragment]; - } + } // iterate through collected indices foreach ( $to_fragment_indices as $to_base_fragment_index ) { $fragment_index_offset = $from_base_fragment_length; @@ -502,69 +510,69 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { $fragment_from_index = $from_base_fragment_index + $fragment_index_offset; if ( $fragment_from_index >= $from_segment_end ) { break; - } + } $fragment_to_index = $to_base_fragment_index + $fragment_index_offset; if ( $fragment_to_index >= $to_segment_end ) { break; - } + } if ( $from_fragments[$fragment_from_index] !== $to_fragments[$fragment_to_index] ) { break; - } - $fragment_length = strlen($from_fragments[$fragment_from_index]); - $fragment_index_offset += $fragment_length; } + $fragment_length = mb_strlen($from_fragments[$fragment_from_index]); + $fragment_index_offset += $fragment_length; + } if ( $fragment_index_offset > $best_copy_length ) { $best_copy_length = $fragment_index_offset; $best_from_start = $from_base_fragment_index; $best_to_start = $to_base_fragment_index; - } } - $from_base_fragment_index += strlen($from_base_fragment); + } + $from_base_fragment_index += mb_strlen($from_base_fragment); // If match is larger than half segment size, no point trying to find better // TODO: Really? if ( $best_copy_length >= $from_segment_length / 2) { break; - } + } // no point to keep looking if what is left is less than // current best match if ( $from_base_fragment_index + $best_copy_length >= $from_segment_end ) { break; - } } + } if ( $best_copy_length ) { $jobs[] = array($from_segment_start, $best_from_start, $to_segment_start, $best_to_start); - $result[$best_from_start * 4 + 2] = new fineDiffCopyOp($best_copy_length); + $result[$best_from_start * 4 + 2] = new FineDiffCopyOp($best_copy_length); $jobs[] = array($best_from_start + $best_copy_length, $from_segment_end, $best_to_start + $best_copy_length, $to_segment_end); - } + } else { - $result[$from_segment_start * 4 ] = new fineDiffReplaceOp($from_segment_length, substr($to_text, $to_segment_start, $to_segment_length)); - } + $result[$from_segment_start * 4 ] = new FineDiffReplaceOp($from_segment_length, mb_substr($to_text, $to_segment_start, $to_segment_length)); } + } ksort($result, SORT_NUMERIC); return array_values($result); - } + } /** - * Perform a character-level diff. - * - * The algorithm is quite similar to doFragmentDiff(), except that - * the code path is optimized for character-level diff -- strpos() is - * used to find out the longest common subequence of characters. - * - * We try to find a match using the longest possible subsequence, which - * is at most the length of the shortest of the two strings, then incrementally - * reduce the size until a match is found. - * - * I still need to study more the performance of this function. It - * appears that for long strings, the generic doFragmentDiff() is more - * performant. For word-sized strings, doCharDiff() is somewhat more - * performant. - */ + * Perform a character-level diff. + * + * The algorithm is quite similar to doFragmentDiff(), except that + * the code path is optimized for character-level diff -- strpos() is + * used to find out the longest common subequence of characters. + * + * We try to find a match using the longest possible subsequence, which + * is at most the length of the shortest of the two strings, then incrementally + * reduce the size until a match is found. + * + * I still need to study more the performance of this function. It + * appears that for long strings, the generic doFragmentDiff() is more + * performant. For word-sized strings, doCharDiff() is somewhat more + * performant. + */ private static function doCharDiff($from_text, $to_text) { $result = array(); - $jobs = array(array(0, strlen($from_text), 0, strlen($to_text))); + $jobs = array(array(0, mb_strlen($from_text), 0, mb_strlen($to_text))); while ( $job = array_pop($jobs) ) { // get the segments which must be diff'ed list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job; @@ -575,114 +583,147 @@ private static function doCharDiff($from_text, $to_text) { if ( !$from_segment_len || !$to_segment_len ) { if ( $from_segment_len ) { $result[$from_segment_start * 4 + 0] = new fineDiffDeleteOp($from_segment_len); - } + } else if ( $to_segment_len ) { - $result[$from_segment_start * 4 + 1] = new fineDiffInsertOp(substr($to_text, $to_segment_start, $to_segment_len)); - } - continue; + $result[$from_segment_start * 4 + 1] = new fineDiffInsertOp(mb_substr($to_text, $to_segment_start, $to_segment_len)); } + continue; + } if ( $from_segment_len >= $to_segment_len ) { $copy_len = $to_segment_len; while ( $copy_len ) { $to_copy_start = $to_segment_start; $to_copy_start_max = $to_segment_end - $copy_len; while ( $to_copy_start <= $to_copy_start_max ) { - $from_copy_start = strpos(substr($from_text, $from_segment_start, $from_segment_len), substr($to_text, $to_copy_start, $copy_len)); + $from_copy_start = mb_strpos(mb_substr($from_text, $from_segment_start, $from_segment_len), mb_substr($to_text, $to_copy_start, $copy_len)); if ( $from_copy_start !== false ) { $from_copy_start += $from_segment_start; break 2; - } - $to_copy_start++; } - $copy_len--; + $to_copy_start++; } + $copy_len--; } + } else { $copy_len = $from_segment_len; while ( $copy_len ) { $from_copy_start = $from_segment_start; $from_copy_start_max = $from_segment_end - $copy_len; while ( $from_copy_start <= $from_copy_start_max ) { - $to_copy_start = strpos(substr($to_text, $to_segment_start, $to_segment_len), substr($from_text, $from_copy_start, $copy_len)); + $to_copy_start = mb_strpos(mb_substr($to_text, $to_segment_start, $to_segment_len), mb_substr($from_text, $from_copy_start, $copy_len)); if ( $to_copy_start !== false ) { $to_copy_start += $to_segment_start; break 2; - } - $from_copy_start++; } - $copy_len--; + $from_copy_start++; } + $copy_len--; } + } // match found if ( $copy_len ) { $jobs[] = array($from_segment_start, $from_copy_start, $to_segment_start, $to_copy_start); $result[$from_copy_start * 4 + 2] = new FineDiffCopyOp($copy_len); $jobs[] = array($from_copy_start + $copy_len, $from_segment_end, $to_copy_start + $copy_len, $to_segment_end); - } + } // no match, so delete all, insert all else { - $result[$from_segment_start * 4] = new FineDiffReplaceOp($from_segment_len, substr($to_text, $to_segment_start, $to_segment_len)); - } + $result[$from_segment_start * 4] = new FineDiffReplaceOp($from_segment_len, mb_substr($to_text, $to_segment_start, $to_segment_len)); } + } ksort($result, SORT_NUMERIC); return array_values($result); - } + } /** - * Efficiently fragment the text into an array according to - * specified delimiters. - * No delimiters means fragment into single character. - * The array indices are the offset of the fragments into - * the input string. - * A sentinel empty fragment is always added at the end. - * Careful: No check is performed as to the validity of the - * delimiters. - */ + * Efficiently fragment the text into an array according to + * specified delimiters. + * No delimiters means fragment into single character. + * The array indices are the offset of the fragments into + * the input string. + * A sentinel empty fragment is always added at the end. + * Careful: No check is performed as to the validity of the + * delimiters. + */ private static function extractFragments($text, $delimiters) { // special case: split into characters if ( empty($delimiters) ) { - $chars = str_split($text, 1); - $chars[strlen($text)] = ''; + $chars = self::splitToChars($text); + $chars[] = ''; return $chars; - } + } $fragments = array(); $start = $end = 0; for (;;) { - $end += strcspn($text, $delimiters, $end); - $end += strspn($text, $delimiters, $end); + $end += self::mb_strcspn($text, $delimiters, $end); + $end += self::mb_strspn($text, $delimiters, $end); if ( $end === $start ) { break; - } - $fragments[$start] = substr($text, $start, $end - $start); - $start = $end; } + $fragments[$start] = mb_substr($text, $start, $end - $start); + $start = $end; + } $fragments[$start] = ''; return $fragments; - } + } /** - * Stock opcode renderers - */ + * Stock opcode renderers + */ private static function renderToTextFromOpcode($opcode, $from, $from_offset, $from_len) { if ( $opcode === 'c' || $opcode === 'i' ) { - echo substr($from, $from_offset, $from_len); - } + echo mb_substr($from, $from_offset, $from_len); } + } private static function renderDiffToHTMLFromOpcode($opcode, $from, $from_offset, $from_len) { if ( $opcode === 'c' ) { - echo htmlentities(htmlentities(substr($from, $from_offset, $from_len))); - } + echo htmlspecialchars(mb_substr($from, $from_offset, $from_len)); + } else if ( $opcode === 'd' ) { - $deletion = substr($from, $from_offset, $from_len); - if ( strcspn($deletion, " \n\r") === 0 ) { + $deletion = mb_substr($from, $from_offset, $from_len); + if ( strcspn($deletion, " \n\r") === 0 ) { // no mb_ here is okay $deletion = str_replace(array("\n","\r"), array('\n','\r'), $deletion); - } - echo '', htmlentities(htmlentities($deletion)), ''; } + echo '', htmlspecialchars($deletion), ''; + } else /* if ( $opcode === 'i' ) */ { - echo '', htmlentities(htmlentities(substr($from, $from_offset, $from_len))), ''; - } + echo '', htmlspecialchars(mb_substr($from, $from_offset, $from_len), ENT_QUOTES), ''; } } + private static function splitToChars($str) { + preg_match_all('/./us', $str, $matches); + $matches = $matches[0]; + + if (count($matches) === 0) return array(''); + return $matches; + } + + private static function mb_strcspn($str, $delimiters, $start) { + $dels = self::splitToChars($delimiters); + $min = mb_strlen($str); + + foreach ($dels as $del) { + $pos = mb_strpos($str, $del, $start); + if ($pos !== false && $pos < $min) $min = $pos; + } + + return $min - $start; + } + + private static function mb_strspn($str, $delimiters, $start) { + $str = mb_substr($str, $start); + $dels = self::splitToChars($delimiters); + + foreach ($dels as $idx => $del) { + $dels[$idx] = preg_quote($del, '/'); + } + + $dels = implode('|', $dels); + + preg_match("/^($dels)+/us", $str, $match); + return $match ? mb_strlen($match[0]) : 0; + } +}