From c6577403974f97a32c861dba9cb747e3c6496a9d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 20 Jan 2021 13:10:12 +0900 Subject: [PATCH 01/10] extract the method that detects the sentence offsets and add new tests --- .../lang/impl/PragmaticSentenceDetector.java | 26 ++-- .../impl/PragmaticSentenceDetectorTest.java | 124 ++++++++++++++++++ 2 files changed, 140 insertions(+), 10 deletions(-) create mode 100644 grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java diff --git a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java index 24525e58f7..d42e7bd143 100644 --- a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java +++ b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java @@ -65,15 +65,22 @@ public List detect(String text, Language lang) { //System.out.println(text); //System.out.println(ret.toString()); + List retList = (List) ret; + + List result = getSentenceOffsets(text, retList); + + return result; + } + + protected static List getSentenceOffsets(String text, List retList) { // build offset positions from the string chunks List result = new ArrayList<>(); int pos = 0; int previousEnd = 0; // indicate when the sentence as provided by the Pragmatic Segmented does not match the original string - // and we had to "massage" the string to identify/approximate offsets in the original string + // and we had to "massage" the string to identify/approximate offsets in the original string boolean recovered = false; - List retList = (List) ret; - for(int i=0; i detect(String text, Language lang) { LOGGER.warn("Extracted sentence does not match orginal text - " + chunk); // Unfortunately the pragmatic segmenter can modify the string when it gives back the array of sentences as string. 
- // it usually concerns removed white space, which then make it hard to locate exactly the offsets. + // it usually concerns removed white space, which then make it hard to locate exactly the offsets. // we take as first fallback the previous end of sentence and move it to the next non space character // next heuristics is to use the next sentence matching to re-synchronize to the original text @@ -93,11 +100,11 @@ public List detect(String text, Language lang) { // "The dissolved oxygen concentration in the sediment was measured in the lab with an OX-500 micro electrode (Unisense, Aarhus, Denmark) and was below detection limit (\0.01 mg l -1 )." // -> ["The dissolved oxygen concentration in the sediment was measured in the lab with an OX-500 micro electrode (Unisense, Aarhus, Denmark) and was below detection limit (((((((((\\0.01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 ).01 mg l -1 )."] // original full paragraph: Nonylphenol polluted sediment was collected in June 2005 from the Spanish Huerva River in Zaragoza (41°37 0 23 00 N, 0°54 0 28 00 W), which is a tributary of the Ebro River. At the moment of sampling, the river water had a temperature of 25.1°C, a redox potential of 525 mV and a pH of 7.82. The water contained 3.8 mg l -1 dissolved oxygen. The dissolved oxygen concentration in the sediment was measured in the lab with an OX-500 micro electrode (Unisense, Aarhus, Denmark) and was below detection limit (\0.01 mg l -1 ). The redox potential, temperature and pH were not determined in the sediment for practical reasons. Sediment was taken anaerobically with stainless steel cores, and transported on ice to the laboratory. Cores were opened in an anaerobic glove box with ±1% H 2 -gas and ±99% N 2 -gas to maintain anaerobic conditions, and the sediment was put in a glass jar. The glass jar was stored at 4°C in an anaerobic box that was flushed with N 2 -gas. 
The sediment contained a mixture of tNP isomers (20 mg kg -1 dry weight), but 4-n-NP was not present in the sediment. The chromatogram of the gas chromatography-mass spectrometry (GC-MS) of the mixture of tNP isomers present in the sediment was comparable to the chromatogram of the tNP technical mixture ordered from Merck. The individual branched isomers were not identified. The total organic carbon fraction of the sediment was 3.5% and contained mainly clay particles with a diameter size \ 32 lM. - // it's less frequent that white space removal, but can happen hundred of times when processing thousand PDF + // it's less frequent that white space removal, but can happen hundred of times when processing thousand PDF // -> note it might be related to jruby sharing of the string and encoding/escaping if (previousEnd != pos) { - // previous sentence was "recovered", which means we are unsure about its end offset + // previous sentence was "recovered", which means we are unsure about its end offset start = text.indexOf(chunk, previousEnd); if (start != -1) { // apparently the current sentence match a bit before the end offset of the previous sentence, which mean that @@ -108,7 +115,7 @@ public List detect(String text, Language lang) { while(newPreviousEnd >= 1 && text.charAt(newPreviousEnd-1) == ' ') { newPreviousEnd--; if (start - newPreviousEnd > 10) { - // this is a break to avoid going too far + // this is a break to avoid going too far newPreviousEnd = start; // but look back previous character to cover general case if (newPreviousEnd >= 1 && text.charAt(newPreviousEnd-1) == ' ') { @@ -128,7 +135,7 @@ public List detect(String text, Language lang) { while(text.charAt(start) == ' ') { start++; if (start - previousEnd > 10) { - // this is a break to avoid going too far + // this is a break to avoid going too far start = previousEnd+1; } } @@ -139,7 +146,7 @@ public List detect(String text, Language lang) { int end = start+chunk.length(); // in case the last sentence 
is modified - if (end > text.length() && i == retList.size()-1) + if (end > text.length() && i == retList.size()-1) end = text.length(); result.add(new OffsetPosition(start, end)); @@ -149,7 +156,6 @@ public List detect(String text, Language lang) { else previousEnd = pos; } - return result; } } diff --git a/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java b/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java new file mode 100644 index 0000000000..d5e3500ade --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java @@ -0,0 +1,124 @@ +package org.grobid.core.lang.impl; + +import org.grobid.core.utilities.OffsetPosition; +import org.junit.Before; +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.Matchers.hasSize; +import static org.junit.Assert.assertThat; + +public class PragmaticSentenceDetectorTest { + + @Before + public void setUp() { + } + + @Test + public void testGetSentenceSpans() { + String original_text = "This is the original text. Some spaces are going to be removed."; + List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + + assertThat(sentence_spans, hasSize(2)); + assertThat(sentence_spans.get(0).start, is(0)); + assertThat(sentence_spans.get(0).end, is(26)); + assertThat(sentence_spans.get(1).start, is(29)); + assertThat(sentence_spans.get(1).end, is(65)); + } + + @Test + public void testGetSentenceSpanMismatchFirstSentence() { + String original_text = "This is the original text. 
Some spaces are going to be removed."; + List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + + assertThat(sentence_spans, hasSize(2)); + assertThat(sentence_spans.get(0).start, is(0)); + assertThat(sentence_spans.get(0).end, is(28)); + assertThat(original_text.substring(sentence_spans.get(0).start, sentence_spans.get(0).end), is("This is the original text.")); + assertThat(sentence_spans.get(1).start, is(31)); + assertThat(sentence_spans.get(1).end, is(67)); + assertThat(original_text.substring(sentence_spans.get(1).start, sentence_spans.get(1).end), is("Some spaces are going to be removed.")); + } + + + @Test + public void testGetSentenceSpanMismatchSecondSentence() { + String original_text = "This is the original text. Some spaces are going to be removed."; + List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + + assertThat(sentence_spans, hasSize(2)); + assertThat(sentence_spans.get(0).start, is(0)); + assertThat(sentence_spans.get(0).end, is(26)); + assertThat(original_text.substring(sentence_spans.get(0).start, sentence_spans.get(0).end), is("This is the original text.")); + assertThat(sentence_spans.get(1).start, is(29)); + assertThat(sentence_spans.get(1).end, is(68)); + assertThat(original_text.substring(sentence_spans.get(1).start, sentence_spans.get(1).end), is("Some spaces are going to be removed.")); + } + + @Test + public void testGetSentenceSpanMismatchSecondSentence_sameSentence() { + String original_text = "This is the original text. 
This is the original text."; + List sentences = Arrays.asList("This is the original text.", "This is the original text."); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + + assertThat(sentence_spans, hasSize(2)); + assertThat(sentence_spans.get(0).start, is(0)); + assertThat(sentence_spans.get(0).end, is(26)); + assertThat(original_text.substring(sentence_spans.get(0).start, sentence_spans.get(0).end), is("This is the original text.")); + assertThat(sentence_spans.get(1).start, is(29)); + assertThat(sentence_spans.get(1).end, is(58)); + assertThat(original_text.substring(sentence_spans.get(1).start, sentence_spans.get(1).end), is("This is the original text.")); + } + + @Test + public void testGetSentenceSpanMismatchAllSentences() { + String original_text = "This is the original text. Some spaces are going to be removed."; + List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + + assertThat(sentence_spans, hasSize(2)); + assertThat(sentence_spans.get(0).start, is(0)); + assertThat(sentence_spans.get(0).end, is(29)); + assertThat(original_text.substring(sentence_spans.get(0).start, sentence_spans.get(0).end), is("This is the original text.")); + assertThat(sentence_spans.get(1).start, is(32)); + assertThat(sentence_spans.get(1).end, is(71)); + assertThat(original_text.substring(sentence_spans.get(1).start, sentence_spans.get(1).end), is("Some spaces are going to be removed.")); + } + + @Test + public void testGetSentenceSpanMismatch_realCase() { + String original_text = "Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs. 
Most of these runs show two stages of evolution -the first being a turbulent steady state and the second reflecting thermal instability that leads to multiphase condensation. The first stage occurs after an eddy turnover time scale for most of our runs. It depends on the amplitude of forcing, and thus on the parameter f turb (the fraction of turbulent heating). The second stage of evolution has much higher density fluctuations ( δρ rms / ρ ≥ 1). In this stage, the gas separates into hot and cold phases due to thermal instability. The multiphase gas formation time scale (t mp ) is very different for different parameter choices."; + List sentences = Arrays.asList("Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs."); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + + assertThat(sentence_spans, hasSize(1)); + assertThat(sentence_spans.get(0).start, is(0)); + assertThat(sentence_spans.get(0).end, is(143)); + assertThat(original_text.substring(sentence_spans.get(0).start, sentence_spans.get(0).end), is("Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs.")); + } + +// def test_find_in_text_mismatch_real_case(self): +// text = "Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs. Most of these runs show two stages of evolution -the first being a turbulent steady state and the second reflecting thermal instability that leads to multiphase condensation. The first stage occurs after an eddy turnover time scale for most of our runs. It depends on the amplitude of forcing, and thus on the parameter f turb (the fraction of turbulent heating). The second stage of evolution has much higher density fluctuations ( δρ rms / ρ ≥ 1). 
In this stage, the gas separates into hot and cold phases due to thermal instability. The multiphase gas formation time scale (t mp ) is very different for different parameter choices." +// sentence = "Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs." +// +// in_text, start = find_in_text(sentence, text) +// assert start == 0 +// assert in_text == 'Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs.' +// +// def test_find_in_text(self): +// sentence = 'This is the original text.' +// text = 'This is the original text. Some spaces are going to be removed.' +// +// in_text, start = find_in_text(sentence, text) +// +// assert start == 0 +// assert in_text == 'This is the original text.' + + + } \ No newline at end of file From 89cc7b9a7ad0a45250ecf4009ae2804a802291ff Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 27 Jan 2021 16:27:42 +0900 Subject: [PATCH 02/10] Implementing a more robust sentence detector when the sentences are (wrongly) modified from the sentence parser --- build.gradle | 2 +- .../lang/impl/PragmaticSentenceDetector.java | 176 +++++++++++++++--- .../impl/PragmaticSentenceDetectorTest.java | 67 ++++--- 3 files changed, 194 insertions(+), 51 deletions(-) diff --git a/build.gradle b/build.gradle index 24a3c749bf..b1c0b2a47f 100644 --- a/build.gradle +++ b/build.gradle @@ -248,7 +248,7 @@ project("grobid-core") { implementation 'black.ninia:jep:3.9.1' implementation 'org.apache.opennlp:opennlp-tools:1.9.1' implementation group: 'org.jruby', name: 'jruby-complete', version: '9.2.13.0' - + compile group: 'org.bitbucket.cowwoc', name: 'diff-match-patch', version: '1.2' shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1" } diff --git a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java 
b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java index d42e7bd143..867a51991f 100644 --- a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java +++ b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java @@ -1,39 +1,42 @@ package org.grobid.core.lang.impl; -import org.jruby.embed.PathType; -import org.jruby.embed.ScriptingContainer; -import org.jruby.embed.LocalContextScope; -import org.jruby.embed.LocalVariableBehavior; - -import org.grobid.core.lang.SentenceDetector; +import com.google.common.base.Joiner; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.bitbucket.cowwoc.diffmatchpatch.DiffMatchPatch; import org.grobid.core.lang.Language; -import org.grobid.core.utilities.OffsetPosition; +import org.grobid.core.lang.SentenceDetector; import org.grobid.core.utilities.GrobidProperties; - +import org.grobid.core.utilities.OffsetPosition; +import org.jruby.embed.LocalContextScope; +import org.jruby.embed.LocalVariableBehavior; +import org.jruby.embed.PathType; +import org.jruby.embed.ScriptingContainer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.File; import java.util.ArrayList; +import java.util.LinkedList; import java.util.List; -import java.io.*; +import java.util.stream.Collectors; /** * Implementation of sentence segmentation via the Pragmatic Segmenter - * */ public class PragmaticSentenceDetector implements SentenceDetector { - private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetector.class); + private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetector.class); private ScriptingContainer instance = null; public PragmaticSentenceDetector() { - String segmenterRbFile = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation" + - File.separator + "pragmatic_segmenter"+ File.separator + "segmenter.rb"; + String 
segmenterRbFile = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation" + + File.separator + "pragmatic_segmenter" + File.separator + "segmenter.rb"; String segmenterLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation"; /*String unicodeLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation" + File.separator + "pragmatic_segmenter" + File.separator + "gem" + File.separator + "gems" + File.separator + "unicode-0.4.4.4-java" + File.separator + "lib";*/ - String unicodeLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation" + + String unicodeLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation" + File.separator + "pragmatic_segmenter" + File.separator + "lib"; //System.out.println(vendorLoadPath); @@ -49,7 +52,7 @@ public PragmaticSentenceDetector() { @Override public List detect(String text) { - return detect(text, new Language(Language.EN)); + return detect(text, new Language(Language.EN)); } @Override @@ -72,6 +75,131 @@ public List detect(String text, Language lang) { return result; } + public static Pair findInText(String subString, String text) { + + LinkedList diffs = new DiffMatchPatch().diffMain(text, subString); + List list = new ArrayList<>(); + + // Transform to a char based sequence + diffs.stream().forEach(d -> { + String text_chunk = d.text; + DiffMatchPatch.Operation operation = d.operation; + String op = " "; + if (operation.equals(DiffMatchPatch.Operation.INSERT)) { + op = "+"; + } else if (operation.equals(DiffMatchPatch.Operation.DELETE)) { + op = "-"; + } + + for (int i = 0; i < text_chunk.toCharArray().length; i++) { + String sb = op + " " + text_chunk.toCharArray()[i]; + list.add(sb); + } + }); + + List list_cleaned = list.stream().filter(d -> d.charAt(0) != '+').collect(Collectors.toList()); +// System.out.println(list_cleaned); + + boolean inside = false; + List output = new 
ArrayList<>(); + for (int i = 0; i < list_cleaned.size(); i++) { + String item = list_cleaned.get(i); + if (item.charAt(0) == '-' && !inside) { + continue; + } else { + inside = true; + output.add(String.valueOf(text.charAt(i))); + } + } + + for (int i = output.size() - 1; i > -1; i--) { + String item = list_cleaned.get(i); + if (item.charAt(0) == '-' || item.charAt(0) == '+') { + output.remove(i); + } else { + break; + } + } + String adaptedSubString = Joiner.on("").join(output); + int start = text.indexOf(adaptedSubString); + + return Pair.of(adaptedSubString, start); + } + + + protected static List getSentenceSpans(String text, List retList) { + // build offset positions from the string chunks + List result = new ArrayList<>(); + + int previousEnd = -1; + int previousStart = -1; + + for (int i = 0; i < retList.size(); i++) { + String sentence = retList.get(i); + String sentenceClean = StringUtils.strip(sentence, "\n"); + + int start = -1; + int end = -1; + + if (previousEnd > -1) { + start = text.indexOf(sentenceClean, previousEnd); + } else { + text.indexOf(sentenceClean); + } + + + String outputStr = ""; + if (start == -1) { + if (previousEnd > -1) { + start = text.replace("\n", " ").indexOf(sentenceClean, previousEnd); + } else { + start = text.replace("\n", " ").indexOf(sentenceClean); + } + + if (start == -1) { + + String textAdapted = text; + + if (previousEnd > -1) { + textAdapted = text.substring(previousEnd); + Pair inText = findInText(sentenceClean, textAdapted); + start = inText.getRight(); + outputStr = inText.getLeft(); + start += previousEnd; + } else if (previousStart > -1) { + textAdapted = text.substring(previousStart); + Pair inText = findInText(sentenceClean, textAdapted); + start = inText.getRight(); + outputStr = inText.getLeft(); + start += previousEnd; + } else { + Pair inText = findInText(sentenceClean, textAdapted); + start = inText.getRight(); + outputStr = inText.getLeft(); + } + end = start + outputStr.length(); + if (start == -1) { + 
System.out.println("- The starting offset is -1. We have tried to recover it, but probably something is still wrong. Please check. "); + System.out.println(outputStr + " / " + textAdapted); + } + } else { + end = start + sentenceClean.length(); + } + } else { + end = start + sentenceClean.length(); + } + + if (start > -1) { + previousEnd = end; + } + + result.add(new OffsetPosition(start, end)); + } + + return result; + } + + @Deprecated protected static List getSentenceOffsets(String text, List retList) { // build offset positions from the string chunks List result = new ArrayList<>(); @@ -80,7 +208,7 @@ protected static List getSentenceOffsets(String text, List getSentenceOffsets(String text, List 0) { int newPreviousEnd = start; - while(newPreviousEnd >= 1 && text.charAt(newPreviousEnd-1) == ' ') { + while (newPreviousEnd >= 1 && text.charAt(newPreviousEnd - 1) == ' ') { newPreviousEnd--; if (start - newPreviousEnd > 10) { // this is a break to avoid going too far newPreviousEnd = start; // but look back previous character to cover general case - if (newPreviousEnd >= 1 && text.charAt(newPreviousEnd-1) == ' ') { + if (newPreviousEnd >= 1 && text.charAt(newPreviousEnd - 1) == ' ') { newPreviousEnd--; } } } - result.get(result.size()-1).end = newPreviousEnd; + result.get(result.size() - 1).end = newPreviousEnd; } } } @@ -132,25 +260,25 @@ protected static List getSentenceOffsets(String text, List 10) { // this is a break to avoid going too far - start = previousEnd+1; + start = previousEnd + 1; } } recovered = true; } } - int end = start+chunk.length(); + int end = start + chunk.length(); // in case the last sentence is modified - if (end > text.length() && i == retList.size()-1) + if (end > text.length() && i == retList.size() - 1) end = text.length(); result.add(new OffsetPosition(start, end)); - pos = start+chunk.length(); + pos = start + chunk.length(); if (recovered) previousEnd += 1; else diff --git 
a/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java b/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java index d5e3500ade..64e9ee6757 100644 --- a/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java @@ -1,5 +1,6 @@ package org.grobid.core.lang.impl; +import org.apache.commons.lang3.tuple.Pair; import org.grobid.core.utilities.OffsetPosition; import org.junit.Before; import org.junit.Test; @@ -21,7 +22,7 @@ public void setUp() { public void testGetSentenceSpans() { String original_text = "This is the original text. Some spaces are going to be removed."; List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); - List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); assertThat(sentence_spans, hasSize(2)); assertThat(sentence_spans.get(0).start, is(0)); @@ -34,7 +35,7 @@ public void testGetSentenceSpans() { public void testGetSentenceSpanMismatchFirstSentence() { String original_text = "This is the original text. Some spaces are going to be removed."; List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); - List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); assertThat(sentence_spans, hasSize(2)); assertThat(sentence_spans.get(0).start, is(0)); @@ -50,7 +51,7 @@ public void testGetSentenceSpanMismatchFirstSentence() { public void testGetSentenceSpanMismatchSecondSentence() { String original_text = "This is the original text. 
Some spaces are going to be removed."; List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); - List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); assertThat(sentence_spans, hasSize(2)); assertThat(sentence_spans.get(0).start, is(0)); @@ -65,7 +66,7 @@ public void testGetSentenceSpanMismatchSecondSentence() { public void testGetSentenceSpanMismatchSecondSentence_sameSentence() { String original_text = "This is the original text. This is the original text."; List sentences = Arrays.asList("This is the original text.", "This is the original text."); - List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); assertThat(sentence_spans, hasSize(2)); assertThat(sentence_spans.get(0).start, is(0)); @@ -80,7 +81,7 @@ public void testGetSentenceSpanMismatchSecondSentence_sameSentence() { public void testGetSentenceSpanMismatchAllSentences() { String original_text = "This is the original text. 
Some spaces are going to be removed."; List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); - List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); assertThat(sentence_spans, hasSize(2)); assertThat(sentence_spans.get(0).start, is(0)); @@ -88,14 +89,14 @@ public void testGetSentenceSpanMismatchAllSentences() { assertThat(original_text.substring(sentence_spans.get(0).start, sentence_spans.get(0).end), is("This is the original text.")); assertThat(sentence_spans.get(1).start, is(32)); assertThat(sentence_spans.get(1).end, is(71)); - assertThat(original_text.substring(sentence_spans.get(1).start, sentence_spans.get(1).end), is("Some spaces are going to be removed.")); + assertThat(original_text.substring(sentence_spans.get(1).start, sentence_spans.get(1).end), is("Some spaces are going to be removed.")); } @Test public void testGetSentenceSpanMismatch_realCase() { String original_text = "Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs. Most of these runs show two stages of evolution -the first being a turbulent steady state and the second reflecting thermal instability that leads to multiphase condensation. The first stage occurs after an eddy turnover time scale for most of our runs. It depends on the amplitude of forcing, and thus on the parameter f turb (the fraction of turbulent heating). The second stage of evolution has much higher density fluctuations ( δρ rms / ρ ≥ 1). In this stage, the gas separates into hot and cold phases due to thermal instability. 
The multiphase gas formation time scale (t mp ) is very different for different parameter choices."; List sentences = Arrays.asList("Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs."); - List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); assertThat(sentence_spans, hasSize(1)); assertThat(sentence_spans.get(0).start, is(0)); @@ -103,22 +104,36 @@ public void testGetSentenceSpanMismatch_realCase() { assertThat(original_text.substring(sentence_spans.get(0).start, sentence_spans.get(0).end), is("Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs.")); } -// def test_find_in_text_mismatch_real_case(self): -// text = "Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs. Most of these runs show two stages of evolution -the first being a turbulent steady state and the second reflecting thermal instability that leads to multiphase condensation. The first stage occurs after an eddy turnover time scale for most of our runs. It depends on the amplitude of forcing, and thus on the parameter f turb (the fraction of turbulent heating). The second stage of evolution has much higher density fluctuations ( δρ rms / ρ ≥ 1). In this stage, the gas separates into hot and cold phases due to thermal instability. The multiphase gas formation time scale (t mp ) is very different for different parameter choices." -// sentence = "Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs." 
-// -// in_text, start = find_in_text(sentence, text) -// assert start == 0 -// assert in_text == 'Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs.' -// -// def test_find_in_text(self): -// sentence = 'This is the original text.' -// text = 'This is the original text. Some spaces are going to be removed.' -// -// in_text, start = find_in_text(sentence, text) -// -// assert start == 0 -// assert in_text == 'This is the original text.' - - - } \ No newline at end of file + @Test + public void testFindInText() throws Exception { + String originalText = "This is the original text. Some spaces are going to be removed."; + List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); + Pair inText = PragmaticSentenceDetector.findInText(sentences.get(0), originalText); + + assertThat(inText.getRight(), is(0)); + assertThat(inText.getLeft(), is("This is the original text.")); + } + + + @Test + public void testFindInText_mismatchRealCase() throws Exception { + String originalText = "Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs. Most of these runs show two stages of evolution -the first being a turbulent steady state and the second reflecting thermal instability that leads to multiphase condensation. The first stage occurs after an eddy turnover time scale for most of our runs. It depends on the amplitude of forcing, and thus on the parameter f turb (the fraction of turbulent heating). The second stage of evolution has much higher density fluctuations ( δρ rms / ρ ≥ 1). In this stage, the gas separates into hot and cold phases due to thermal instability. 
The multiphase gas formation time scale (t mp ) is very different for different parameter choices."; + String sentence = "Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs."; + + Pair inText = PragmaticSentenceDetector.findInText(sentence, originalText); + + assertThat(inText.getRight(), is(0)); + assertThat(inText.getLeft(), is("Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs.")); + } + + @Test + public void testFindInText_errorCase() throws Exception { + String originalText = "In two species of toads and in Salamandra, which are among the most terrestrial of lissamphibians, SF were developed more prominently (i.e. the fibres were thicker and covered greater area of the section) than in species spending more time in water, such as the fire-bellied toads (compare Fig. 4b, c, g with a). Thus, one could suspect that this may be related to greater forces acting on the limbs during terrestrial locomotion. However, SF are very well developed in femora and humeri of the aquatic Chinese salamander Andrias davidianus (Canoville et al. 2018), as well as in the Triassic temnospondyl Metoposaurus krasiejowensis, which is interpreted as almost exclusively aquatic (nonetheless, this amphibian was probably able to burrow—this requires strong muscles, which would be consistent with the presence of well developed SF; Konietzko-Meier and Sander 2013). On the other hand, we did not observe well developed SF in P. fuscus, despite partially burrowing lifestyle of this amphibian. However, in closely related P. varaldii these fibres can readily be observed in at least some specimens (Guarino et al. 2011). Also, it should be noted that the presence of SF may be dependent on a number of physiological stimuli, at least in mammals. 
These include influence of hormones (such as estrogen), degree of physical activity, ageing or pathologies such as osteoporosis or osteoarthritis (Aaron 2012). Explaining the reasons of these differences in amphibians requires further studies."; + String sentence = "In two species of toads and in Salamandra, which are among the most terrestrial of lissamphibians, SF were developed more prominently (i.e. the fibres were thicker and covered greater area of the section) than in species spending more time in water, such as the fire-bellied toads (compare Fig. 4b, c, g with a)."; + + Pair inText = PragmaticSentenceDetector.findInText(sentence, originalText); + + assertThat(inText.getRight(), is(0)); + assertThat(inText.getLeft(), is("In two species of toads and in Salamandra, which are among the most terrestrial of lissamphibians, SF were developed more prominently (i.e. the fibres were thicker and covered greater area of the section) than in species spending more time in water, such as the fire-bellied toads (compare Fig. 4b, c, g with a).")); + } +} \ No newline at end of file From 3de86448a0fd0da26f931d0432462501d629e4b7 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 27 Jan 2021 16:41:24 +0900 Subject: [PATCH 03/10] minor corrections, using the logger --- .../grobid/core/lang/impl/PragmaticSentenceDetector.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java index 867a51991f..08b7ff2e2c 100644 --- a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java +++ b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java @@ -179,8 +179,8 @@ protected static List getSentenceSpans(String text, List } end = start + outputStr.length(); if (start == -1) { - System.out.println("- The starting offset is -1. 
We have tried to recover it, but probably something is still wrong. Please check. "); - System.out.println(outputStr + " / " + textAdapted); + LOGGER.warn("The starting offset is -1. We have tried to recover it, but probably something is still wrong. Please check. "); + LOGGER.warn(outputStr + " / " + textAdapted); } } else { end = start + sentenceClean.length(); @@ -188,6 +188,7 @@ protected static List getSentenceSpans(String text, List } else { end = start + sentenceClean.length(); } + previousStart = start; if (start > -1) { previousEnd = end; @@ -199,6 +200,7 @@ protected static List getSentenceSpans(String text, List return result; } + //Use getSentenceSpans @Deprecated protected static List getSentenceOffsets(String text, List retList) { // build offset positions from the string chunks From 393aed8b3e09332e52150570c3012a553ef01dc2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 27 Jan 2021 16:45:04 +0900 Subject: [PATCH 04/10] plugging the new code in --- .../grobid/core/lang/impl/PragmaticSentenceDetector.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java index 08b7ff2e2c..08424b1496 100644 --- a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java +++ b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java @@ -127,7 +127,7 @@ public static Pair findInText(String subString, String text) { } - protected static List getSentenceSpans(String text, List retList) { + protected static List getSentenceOffsets(String text, List retList) { // build offset positions from the string chunks List result = new ArrayList<>(); @@ -200,9 +200,9 @@ protected static List getSentenceSpans(String text, List return result; } - //Use getSentenceSpans + //Use getSentenceOffsets @Deprecated - protected static List 
getSentenceOffsets(String text, List retList) { + protected static List getSentenceOffsetsOld(String text, List retList) { // build offset positions from the string chunks List result = new ArrayList<>(); int pos = 0; From 1486795868c31b3cd2f7092e4df481dc01e0843c Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 11 May 2021 08:53:25 +0900 Subject: [PATCH 05/10] add more tests, some code improvements --- .../grobid/core/document/TEIFormatter.java | 8 +-- .../grobid/core/lang/SentenceDetector.java | 4 +- .../lang/impl/PragmaticSentenceDetector.java | 2 +- .../core/utilities/SentenceUtilities.java | 6 +- .../impl/PragmaticSentenceDetectorTest.java | 70 +++++++++++++++---- 5 files changed, 68 insertions(+), 22 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 890f11f985..2ae5ad0dbf 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1415,8 +1415,8 @@ public void segmentIntoSentences(Element curParagraph, List curPara // for readability in another conditional if (((Element) theNode).getLocalName().equals("ref")) { // map character offset of the node - mapRefNodes.put(new Integer(pos), theNode); - refPositions.add(new Integer(pos)); + mapRefNodes.put(pos, theNode); + refPositions.add(pos); String chunk = theNode.getValue(); forbiddenPositions.add(new OffsetPosition(pos, pos+chunk.length())); @@ -1513,12 +1513,12 @@ public void segmentIntoSentences(Element curParagraph, List curPara int sentenceLength = theSentences.get(i).end - pos; // check if we have a ref between pos and pos+sentenceLength for(int j=refIndex; j= pos+posInSentence && refPos <= pos+sentenceLength) { - Node valueNode = mapRefNodes.get(new Integer(refPos)); + Node valueNode = mapRefNodes.get(refPos); if (pos+posInSentence < refPos) 
sentenceElement.appendChild(text.substring(pos+posInSentence, refPos)); valueNode.detach(); diff --git a/grobid-core/src/main/java/org/grobid/core/lang/SentenceDetector.java b/grobid-core/src/main/java/org/grobid/core/lang/SentenceDetector.java index 4754807de2..d90d986bb7 100644 --- a/grobid-core/src/main/java/org/grobid/core/lang/SentenceDetector.java +++ b/grobid-core/src/main/java/org/grobid/core/lang/SentenceDetector.java @@ -14,7 +14,7 @@ public interface SentenceDetector { * @return a list of offset positions indicating start and end character * position of the recognized sentence in the text */ - public List detect(String text); + List detect(String text); /** @@ -24,5 +24,5 @@ public interface SentenceDetector { * @return a list of offset positions indicating start and end character * position of the recognized sentence in the text */ - public List detect(String text, Language lang); + List detect(String text, Language lang); } diff --git a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java index 08424b1496..c9c8a463fd 100644 --- a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java +++ b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java @@ -144,7 +144,7 @@ protected static List getSentenceOffsets(String text, List -1) { start = text.indexOf(sentenceClean, previousEnd); } else { - text.indexOf(sentenceClean); + start = text.indexOf(sentenceClean); } diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java index a424e5e808..459a067b0c 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java @@ -200,7 +200,7 @@ public List runSentenceDetection(String text, List 
runSentenceDetection(String text, List= text.length()) break; - if (this.toSkipTokenNoHyphen(nextToken.getText())) { + if (toSkipTokenNoHyphen(nextToken.getText())) { buffer += nextToken.getText().length(); continue; } - if (this.isValidSuperScriptNumericalReferenceMarker(nextToken)) { + if (isValidSuperScriptNumericalReferenceMarker(nextToken)) { pushedEnd += buffer + nextToken.getText().length(); buffer = 0; } else diff --git a/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java b/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java index 64e9ee6757..32cde0e799 100644 --- a/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorTest.java @@ -19,10 +19,10 @@ public void setUp() { } @Test - public void testGetSentenceSpans() { + public void testGetSentenceOffsets() { String original_text = "This is the original text. Some spaces are going to be removed."; List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); - List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); assertThat(sentence_spans, hasSize(2)); assertThat(sentence_spans.get(0).start, is(0)); @@ -32,10 +32,10 @@ public void testGetSentenceSpans() { } @Test - public void testGetSentenceSpanMismatchFirstSentence() { + public void testGetSentenceOffsetsMismatchFirstSentence() { String original_text = "This is the original text. 
Some spaces are going to be removed."; List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); - List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); assertThat(sentence_spans, hasSize(2)); assertThat(sentence_spans.get(0).start, is(0)); @@ -48,10 +48,10 @@ public void testGetSentenceSpanMismatchFirstSentence() { @Test - public void testGetSentenceSpanMismatchSecondSentence() { + public void testGetSentenceOffsetsMismatchSecondSentence() { String original_text = "This is the original text. Some spaces are going to be removed."; List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); - List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); assertThat(sentence_spans, hasSize(2)); assertThat(sentence_spans.get(0).start, is(0)); @@ -63,10 +63,10 @@ public void testGetSentenceSpanMismatchSecondSentence() { } @Test - public void testGetSentenceSpanMismatchSecondSentence_sameSentence() { + public void testGetSentenceOffsetsMismatchSecondSentence_sameSentence() { String original_text = "This is the original text. 
This is the original text."; List sentences = Arrays.asList("This is the original text.", "This is the original text."); - List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); assertThat(sentence_spans, hasSize(2)); assertThat(sentence_spans.get(0).start, is(0)); @@ -78,10 +78,10 @@ public void testGetSentenceSpanMismatchSecondSentence_sameSentence() { } @Test - public void testGetSentenceSpanMismatchAllSentences() { + public void testGetSentenceOffsetsMismatchAllSentences() { String original_text = "This is the original text. Some spaces are going to be removed."; List sentences = Arrays.asList("This is the original text.", "Some spaces are going to be removed."); - List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); assertThat(sentence_spans, hasSize(2)); assertThat(sentence_spans.get(0).start, is(0)); @@ -93,10 +93,10 @@ public void testGetSentenceSpanMismatchAllSentences() { } @Test - public void testGetSentenceSpanMismatch_realCase() { + public void testGetSentenceOffsetsMismatch_realCase() { String original_text = "Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs. Most of these runs show two stages of evolution -the first being a turbulent steady state and the second reflecting thermal instability that leads to multiphase condensation. The first stage occurs after an eddy turnover time scale for most of our runs. It depends on the amplitude of forcing, and thus on the parameter f turb (the fraction of turbulent heating). The second stage of evolution has much higher density fluctuations ( δρ rms / ρ ≥ 1). In this stage, the gas separates into hot and cold phases due to thermal instability. 
The multiphase gas formation time scale (t mp ) is very different for different parameter choices."; List sentences = Arrays.asList("Figure 5 shows the time evolution of the volumeaveraged rms density fluctuations (normalized to the mean density) in our thermal balance runs."); - List sentence_spans = PragmaticSentenceDetector.getSentenceSpans(original_text, sentences); + List sentence_spans = PragmaticSentenceDetector.getSentenceOffsets(original_text, sentences); assertThat(sentence_spans, hasSize(1)); assertThat(sentence_spans.get(0).start, is(0)); @@ -136,4 +136,50 @@ public void testFindInText_errorCase() throws Exception { assertThat(inText.getRight(), is(0)); assertThat(inText.getLeft(), is("In two species of toads and in Salamandra, which are among the most terrestrial of lissamphibians, SF were developed more prominently (i.e. the fibres were thicker and covered greater area of the section) than in species spending more time in water, such as the fire-bellied toads (compare Fig. 4b, c, g with a).")); } + + @Test + public void testGetSentenceOffsets_realcase_2() throws Exception { + + String originalText = "With the success of large-scale pre-training and multilingual modeling in Natural Language Processing (NLP), recent years have seen a proliferation of large, web-mined text datasets covering hundreds of languages. However, to date there has been no systematic analysis of the quality of these publicly available datasets, or whether the datasets actually contain content in the languages they claim to represent. In this work, we manually audit the quality of 205 languagespecific corpora released with five major public datasets (CCAligned, ParaCrawl, WikiMatrix, OSCAR, mC4), and audit the correctness of language codes in a sixth (JW300). We find that lower-resource corpora have systematic issues: at least 15 corpora are completely erroneous, and a significant fraction contains less than 50% sentences of acceptable quality. 
Similarly, we find 82 corpora that are mislabeled or use nonstandard/ambiguous language codes. We demonstrate that these issues are easy to detect even for non-speakers of the languages in question, and supplement the human judgements with automatic analyses. Inspired by our analysis, we recommend techniques to evaluate and improve multilingual corpora and discuss the risks that come with low-quality data releases."; + + List sentences = Arrays.asList( + "With the success of large-scale pre-training and multilingual modeling in Natural Language Processing (NLP), recent years have seen a proliferation of large, web-mined text datasets covering hundreds of languages.", + "However, to date there has been no systematic analysis of the quality of these publicly available datasets, or whether the datasets actually contain content in the languages they claim to represent.", + "In this work, we manually audit the quality of 205 languagespecific corpora released with five major public datasets (CCAligned, ParaCrawl, WikiMatrix, OSCAR, mC4), and audit the correctness of language codes in a sixth (JW300).", + "We find that lower-resource corpora have systematic issues: at least 15 corpora are completely erroneous, and a significant fraction contains less than 50% sentences of acceptable quality.", + "Similarly, we find 82 corpora that are mislabeled or use nonstandard/ambiguous language codes.", + "We demonstrate that these issues are easy to detect even for non-speakers of the languages in question, and supplement the human judgements with automatic analyses.", + "Inspired by our analysis, we recommend techniques to evaluate and improve multilingual corpora and discuss the risks that come with low-quality data releases." 
+ ); + List sentenceSpans = PragmaticSentenceDetector.getSentenceOffsets(originalText, sentences); + + assertThat(sentenceSpans, hasSize(7)); + for (int i = 0; i < sentenceSpans.size(); i++) { + assertThat(originalText.substring(sentenceSpans.get(i).start, sentenceSpans.get(i).end), is(sentences.get(i))); + } + + } + + @Test + public void testGetSentenceOffsets_realcase_3() throws Exception { + + String originalText = "CCAligned ) is a 119language 1 parallel dataset built off 68 snapshots of Common Crawl. Documents are aligned if they are in the same language according to FastText LangID (Joulin et al., 2016(Joulin et al., , 2017, and have the same URL but for a differing language code. These alignments are refined with cross-lingual LASER embeddings (Artetxe and Schwenk, 2019). For sentence-level data, they split on newlines and align with LASER, but perform no further filtering. Human annotators evaluated the quality of document alignments for six languages (de, zh, ar, ro, et, my) selected for their different scripts and amount of retrieved documents, reporting precision of over 90%. The quality of the extracted parallel sentences is evaluated in a machine translation (MT) task on six European (da, cr, sl, sk, lt, et) languages of the TED corpus (Qi et al., 2018) (Qi et al., 2018); WMT-5: cs, de, fi, lv, ro. 
POS/DEP-5: part-of-speech labeling and dependency parsing for bg, ca, da, fi, id."; + + List sentences = Arrays.asList( + "CCAligned ) is a 119language 1 parallel dataset built off 68 snapshots of Common Crawl.", + "Documents are aligned if they are in the same language according to FastText LangID (Joulin et al., 2016(Joulin et al., , 2017, and have the same URL but for a differing language code.", + "These alignments are refined with cross-lingual LASER embeddings (Artetxe and Schwenk, 2019).", + "For sentence-level data, they split on newlines and align with LASER, but perform no further filtering.", + "Human annotators evaluated the quality of document alignments for six languages (de, zh, ar, ro, et, my) selected for their different scripts and amount of retrieved documents, reporting precision of over 90%.", + "The quality of the extracted parallel sentences is evaluated in a machine translation (MT) task on six European (da, cr, sl, sk, lt, et) languages of the TED corpus (Qi et al., 2018) (Qi et al., 2018); WMT-5: cs, de, fi, lv, ro.", + "POS/DEP-5: part-of-speech labeling and dependency parsing for bg, ca, da, fi, id." 
+ ); + List sentenceSpans = PragmaticSentenceDetector.getSentenceOffsets(originalText, sentences); + + assertThat(sentenceSpans, hasSize(7)); + for (int i = 0; i < sentenceSpans.size(); i++) { + assertThat(originalText.substring(sentenceSpans.get(i).start, sentenceSpans.get(i).end), is(sentences.get(i))); + } + + } } \ No newline at end of file From 5ca94c4dd846073ed0ab2761c7f81e8ca9a2ab1b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 16 Aug 2021 10:01:08 +0900 Subject: [PATCH 06/10] minor renaming and adding test to reproduce issue 753 --- .../core/utilities/SentenceUtilities.java | 21 +++-- .../core/utilities/SentenceUtilitiesTest.java | 87 ++++++++++++++++++- 2 files changed, 98 insertions(+), 10 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java index 459a067b0c..c421d1a20c 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java @@ -130,6 +130,9 @@ public List runSentenceDetection(String text, List runSentenceDetection(String text, List forbidden, List textLayoutTokens, Language lang) { + + //String text2 = LayoutTokensUtil.toText(textLayoutTokens); + if (text == null) return null; try { @@ -143,24 +146,24 @@ public List runSentenceDetection(String text, List finalSentencePositions = new ArrayList<>(); int forbiddenIndex = 0; - for(int j=0; j < sentencePositions.size(); j++) { - OffsetPosition position = sentencePositions.get(j); + for(int sentencePositionId=0; sentencePositionId < sentencePositions.size(); sentencePositionId++) { + OffsetPosition sentencePosition = sentencePositions.get(sentencePositionId); for(int i=forbiddenIndex; i < forbidden.size(); i++) { OffsetPosition forbiddenPos = forbidden.get(i); - if (forbiddenPos.end < position.end) + if (forbiddenPos.end < sentencePosition.end) continue; - if 
(forbiddenPos.start > position.end) + if (forbiddenPos.start > sentencePosition.end) break; - while ( (forbiddenPos.start < position.end && position.end < forbiddenPos.end) ) { - if (j+1 < sentencePositions.size()) { - position.end = sentencePositions.get(j+1).end; - j++; + while ( (forbiddenPos.start < sentencePosition.end && sentencePosition.end < forbiddenPos.end) ) { + if (sentencePositionId+1 < sentencePositions.size()) { + sentencePosition.end = sentencePositions.get(sentencePositionId+1).end; + sentencePositionId++; forbiddenIndex = i; } else break; } } - finalSentencePositions.add(position); + finalSentencePositions.add(sentencePosition); } // as a heuristics for all implementations, because they clearly all fail for this case, we diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/SentenceUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/SentenceUtilitiesTest.java index 646edac2dd..de701255ff 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/SentenceUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/SentenceUtilitiesTest.java @@ -1,12 +1,15 @@ package org.grobid.core.utilities; import org.grobid.core.GrobidModels; +import org.grobid.core.analyzers.GrobidAnalyzer; import org.grobid.core.engines.DateParser; import org.grobid.core.lang.SentenceDetector; import org.grobid.core.lang.SentenceDetectorFactory; +import org.grobid.core.layout.LayoutToken; import org.grobid.core.lexicon.Lexicon; import org.grobid.core.main.LibraryLoader; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.powermock.api.easymock.PowerMock; @@ -41,7 +44,7 @@ public void setUp() { GrobidConfig.ModelParameters modelParameters = new GrobidConfig.ModelParameters(); modelParameters.name = "bao"; GrobidProperties.addModel(modelParameters); - + sentenceDetectorFactoryMock = createMock(SentenceDetectorFactory.class); sentenceDetectorMock = 
createMock(SentenceDetector.class); target = SentenceUtilities.getInstance(); @@ -199,6 +202,88 @@ public void testCorrectSegmentation_shouldCancelWrongSegmentation2() throws Exce assertThat(theSentences.size(), is(1)); } + @Test + @Ignore("reproduce issue #753") + public void testRealCaseDesynchronisation_shouldReturnCorrectSentences() throws Exception { + String text = "CCAligned ) is a 119language 1 parallel dataset built off 68 snapshots of Common Crawl. Documents are aligned if they are in the same language according to FastText LangID (Joulin et al., 2016(Joulin et al., , 2017, and have the same URL but for a differing language code. These alignments are refined with cross-lingual LASER embeddings (Artetxe and Schwenk, 2019). For sentence-level data, they split on newlines and align with LASER, but perform no further filtering. Human annotators evaluated the quality of document alignments for six languages (de, zh, ar, ro, et, my) selected for their different scripts and amount of retrieved documents, reporting precision of over 90%. The quality of the extracted parallel sentences is evaluated in a machine translation (MT) task on six European (da, cr, sl, sk, lt, et) languages of the TED corpus (Qi et al., 2018), where it compares favorably to systems built on crawled sentences from WikiMatrix and ParaCrawl (Qi et al., 2018); WMT-5: cs, de, fi, lv, ro. POS/DEP-5: part-of-speech labeling and dependency parsing for bg, ca, da, fi, id."; + + String textLayoutToken = "CCAligned (El-Kishky et al., 2020) is a 119-\n" + + "language 1 parallel dataset built off 68 snapshots \n" + + "of Common Crawl. Documents are aligned if they \n" + + "are in the same language according to FastText \n" + + "LangID (Joulin et al., 2016, 2017), and have the \n" + + "same URL but for a differing language code. These \n" + + "alignments are refined with cross-lingual LASER \n" + + "embeddings (Artetxe and Schwenk, 2019). 
For \n" + + "sentence-level data, they split on newlines and \n" + + "align with LASER, but perform no further filtering. \n" + + "Human annotators evaluated the quality of docu-\n" + + "ment alignments for six languages (de, zh, ar, ro, et, my) selected for their different scripts and \n" + + "amount of retrieved documents, reporting precision \n" + + "of over 90%. The quality of the extracted paral-\n" + + "lel sentences is evaluated in a machine translation \n" + + "(MT) task on six European (da, cr, sl, sk, lt, et) languages of the TED corpus(Qi et al., 2018), \n" + + "where it compares favorably to systems built on \n" + + "crawled sentences from WikiMatrix and ParaCrawl \n" + + "(Qi et al., 2018); WMT-5: cs, \n" + + "de, fi, lv, ro. POS/DEP-5: part-of-speech labeling and dependency parsing for bg, ca, da, fi, id. \n" + + "\n"; + + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(textLayoutToken); + + tokens.get(25).setSuperscript(true); + + List referencesSpans = Arrays.asList( + new OffsetPosition(172, 192), + new OffsetPosition(192, 214), + new OffsetPosition(338, 365), + new OffsetPosition(551, 575), + new OffsetPosition(793, 817), + new OffsetPosition(846, 863), + new OffsetPosition(963, 980) + ); + + List sentencesPositions = Arrays.asList( + new OffsetPosition(0, 87), + new OffsetPosition(88, 272), + new OffsetPosition(273, 366), + new OffsetPosition(367, 470), + new OffsetPosition(471, 680), + new OffsetPosition(681, 1008), + new OffsetPosition(1009, 1090) + ); + + expect(sentenceDetectorFactoryMock.getInstance()).andReturn(sentenceDetectorMock); + expect(sentenceDetectorMock.detect(text, null)).andReturn(sentencesPositions); + replay(sentenceDetectorFactoryMock, sentenceDetectorMock); + + List theSentences = SentenceUtilities.getInstance().runSentenceDetection(text, referencesSpans, tokens, null); + verify(sentenceDetectorFactoryMock, sentenceDetectorMock); + + assertThat(theSentences.size(), is(7)); + + 
assertThat(theSentences.get(0).start, is(0)); + assertThat(theSentences.get(0).end, is(87)); + + assertThat(theSentences.get(1).start, is(88)); + assertThat(theSentences.get(1).end, is(272)); + + assertThat(theSentences.get(2).start, is(273)); + assertThat(theSentences.get(2).end, is(366)); + + assertThat(theSentences.get(3).start, is(367)); + assertThat(theSentences.get(3).end, is(470)); + + assertThat(theSentences.get(4).start, is(471)); + assertThat(theSentences.get(4).end, is(680)); + + assertThat(theSentences.get(5).start, is(681)); + assertThat(theSentences.get(5).end, is(1008)); + + assertThat(theSentences.get(6).start, is(1009)); + assertThat(theSentences.get(6).end, is(1090)); + } + private List getPositions(String paragraph, List refs) { List positions = new ArrayList<>(); int previousRefEnd = 0; From 42e612c2aa57b9e0f22e93a52530b47c81b4db36 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 28 Jul 2022 17:22:04 +0900 Subject: [PATCH 07/10] moved the dependency-buggy Apache2 library for diff-match-path into grobid --- build.gradle | 1 - .../lang/impl/PragmaticSentenceDetector.java | 4 +- .../utilities/matching/DiffMatchPatch.java | 2471 +++++++++++++++++ .../matching/DiffMatchPatchTest.java | 988 +++++++ .../core/utilities/matching/Speedtest1.txt | 230 ++ .../core/utilities/matching/Speedtest2.txt | 188 ++ 6 files changed, 3879 insertions(+), 3 deletions(-) create mode 100644 grobid-core/src/main/java/org/grobid/core/utilities/matching/DiffMatchPatch.java create mode 100644 grobid-core/src/test/java/org/grobid/core/utilities/matching/DiffMatchPatchTest.java create mode 100644 grobid-core/src/test/java/org/grobid/core/utilities/matching/Speedtest1.txt create mode 100644 grobid-core/src/test/java/org/grobid/core/utilities/matching/Speedtest2.txt diff --git a/build.gradle b/build.gradle index c6520b0120..b4c1cb3862 100644 --- a/build.gradle +++ b/build.gradle @@ -252,7 +252,6 @@ project("grobid-core") { implementation 'black.ninia:jep:4.0.2' 
implementation 'org.apache.opennlp:opennlp-tools:1.9.1' implementation group: 'org.jruby', name: 'jruby-complete', version: '9.2.13.0' - compile group: 'org.bitbucket.cowwoc', name: 'diff-match-patch', version: '1.2' shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1" } diff --git a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java index c9c8a463fd..f0b8940e4f 100644 --- a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java +++ b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java @@ -3,11 +3,11 @@ import com.google.common.base.Joiner; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; -import org.bitbucket.cowwoc.diffmatchpatch.DiffMatchPatch; import org.grobid.core.lang.Language; import org.grobid.core.lang.SentenceDetector; import org.grobid.core.utilities.GrobidProperties; import org.grobid.core.utilities.OffsetPosition; +import org.grobid.core.utilities.matching.DiffMatchPatch; import org.jruby.embed.LocalContextScope; import org.jruby.embed.LocalVariableBehavior; import org.jruby.embed.PathType; @@ -77,7 +77,7 @@ public List detect(String text, Language lang) { public static Pair findInText(String subString, String text) { - LinkedList diffs = new DiffMatchPatch().diffMain(text, subString); + LinkedList diffs = new DiffMatchPatch().diff_main(text, subString); List list = new ArrayList<>(); // Transform to a char based sequence diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/matching/DiffMatchPatch.java b/grobid-core/src/main/java/org/grobid/core/utilities/matching/DiffMatchPatch.java new file mode 100644 index 0000000000..6a8254f0ec --- /dev/null +++ b/grobid-core/src/main/java/org/grobid/core/utilities/matching/DiffMatchPatch.java @@ -0,0 +1,2471 @@ +/* + * Diff Match and Patch + * Copyright 2018 The diff-match-patch 
Authors. + * https://github.com/google/diff-match-patch + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.grobid.core.utilities.matching; + +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/* + * Functions for diff, match and patch. + * Computes the difference between two texts to create a patch. + * Applies the patch onto another text, allowing for errors. + * + * @author fraser@google.com (Neil Fraser) + */ + +/** + * Class containing the diff, match and patch methods. + * Also contains the behaviour settings. + */ +public class DiffMatchPatch { + + // Defaults. + // Set these on your diff_match_patch instance to override the defaults. + + /** + * Number of seconds to map a diff before giving up (0 for infinity). + */ + public float Diff_Timeout = 1.0f; + /** + * Cost of an empty edit operation in terms of edit characters. + */ + public short Diff_EditCost = 4; + /** + * At what point is no match declared (0.0 = perfection, 1.0 = very loose). + */ + public float Match_Threshold = 0.5f; + /** + * How far to search for a match (0 = exact location, 1000+ = broad match). + * A match this many characters away from the expected location will add + * 1.0 to the score (0.0 is a perfect match). 
+ */ + public int Match_Distance = 1000; + /** + * When deleting a large block of text (over ~64 characters), how close do + * the contents have to be to match the expected contents. (0.0 = perfection, + * 1.0 = very loose). Note that Match_Threshold controls how closely the + * end points of a delete need to match. + */ + public float Patch_DeleteThreshold = 0.5f; + /** + * Chunk size for context length. + */ + public short Patch_Margin = 4; + + /** + * The number of bits in an int. + */ + private short Match_MaxBits = 32; + + /** + * Internal class for returning results from diff_linesToChars(). + * Other less paranoid languages just use a three-element array. + */ + protected static class LinesToCharsResult { + protected String chars1; + protected String chars2; + protected List lineArray; + + protected LinesToCharsResult(String chars1, String chars2, + List lineArray) { + this.chars1 = chars1; + this.chars2 = chars2; + this.lineArray = lineArray; + } + } + + + // DIFF FUNCTIONS + + + /** + * The data structure representing a diff is a Linked list of Diff objects: + * {Diff(Operation.DELETE, "Hello"), Diff(Operation.INSERT, "Goodbye"), + * Diff(Operation.EQUAL, " world.")} + * which means: delete "Hello", add "Goodbye" and keep " world." + */ + public enum Operation { + DELETE, INSERT, EQUAL + } + + /** + * Find the differences between two texts. + * Run a faster, slightly less optimal diff. + * This method allows the 'checklines' of diff_main() to be optional. + * Most of the time checklines is wanted, so default to true. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @return Linked List of Diff objects. + */ + public LinkedList diff_main(String text1, String text2) { + return diff_main(text1, text2, true); + } + + /** + * Find the differences between two texts. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param checklines Speedup flag. 
If false, then don't run a + * line-level diff first to identify the changed areas. + * If true, then run a faster slightly less optimal diff. + * @return Linked List of Diff objects. + */ + public LinkedList diff_main(String text1, String text2, + boolean checklines) { + // Set a deadline by which time the diff must be complete. + long deadline; + if (Diff_Timeout <= 0) { + deadline = Long.MAX_VALUE; + } else { + deadline = System.currentTimeMillis() + (long) (Diff_Timeout * 1000); + } + return diff_main(text1, text2, checklines, deadline); + } + + /** + * Find the differences between two texts. Simplifies the problem by + * stripping any common prefix or suffix off the texts before diffing. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param checklines Speedup flag. If false, then don't run a + * line-level diff first to identify the changed areas. + * If true, then run a faster slightly less optimal diff. + * @param deadline Time when the diff should be complete by. Used + * internally for recursive calls. Users should set DiffTimeout instead. + * @return Linked List of Diff objects. + */ + private LinkedList diff_main(String text1, String text2, + boolean checklines, long deadline) { + // Check for null inputs. + if (text1 == null || text2 == null) { + throw new IllegalArgumentException("Null inputs. (diff_main)"); + } + + // Check for equality (speedup). + LinkedList diffs; + if (text1.equals(text2)) { + diffs = new LinkedList(); + if (text1.length() != 0) { + diffs.add(new Diff(Operation.EQUAL, text1)); + } + return diffs; + } + + // Trim off common prefix (speedup). + int commonlength = diff_commonPrefix(text1, text2); + String commonprefix = text1.substring(0, commonlength); + text1 = text1.substring(commonlength); + text2 = text2.substring(commonlength); + + // Trim off common suffix (speedup). 
+ commonlength = diff_commonSuffix(text1, text2); + String commonsuffix = text1.substring(text1.length() - commonlength); + text1 = text1.substring(0, text1.length() - commonlength); + text2 = text2.substring(0, text2.length() - commonlength); + + // Compute the diff on the middle block. + diffs = diff_compute(text1, text2, checklines, deadline); + + // Restore the prefix and suffix. + if (commonprefix.length() != 0) { + diffs.addFirst(new Diff(Operation.EQUAL, commonprefix)); + } + if (commonsuffix.length() != 0) { + diffs.addLast(new Diff(Operation.EQUAL, commonsuffix)); + } + + diff_cleanupMerge(diffs); + return diffs; + } + + /** + * Find the differences between two texts. Assumes that the texts do not + * have any common prefix or suffix. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param checklines Speedup flag. If false, then don't run a + * line-level diff first to identify the changed areas. + * If true, then run a faster slightly less optimal diff. + * @param deadline Time when the diff should be complete by. + * @return Linked List of Diff objects. + */ + private LinkedList diff_compute(String text1, String text2, + boolean checklines, long deadline) { + LinkedList diffs = new LinkedList(); + + if (text1.length() == 0) { + // Just add some text (speedup). + diffs.add(new Diff(Operation.INSERT, text2)); + return diffs; + } + + if (text2.length() == 0) { + // Just delete some text (speedup). + diffs.add(new Diff(Operation.DELETE, text1)); + return diffs; + } + + String longtext = text1.length() > text2.length() ? text1 : text2; + String shorttext = text1.length() > text2.length() ? text2 : text1; + int i = longtext.indexOf(shorttext); + if (i != -1) { + // Shorter text is inside the longer text (speedup). + Operation op = (text1.length() > text2.length()) ? 
+ Operation.DELETE : Operation.INSERT; + diffs.add(new Diff(op, longtext.substring(0, i))); + diffs.add(new Diff(Operation.EQUAL, shorttext)); + diffs.add(new Diff(op, longtext.substring(i + shorttext.length()))); + return diffs; + } + + if (shorttext.length() == 1) { + // Single character string. + // After the previous speedup, the character can't be an equality. + diffs.add(new Diff(Operation.DELETE, text1)); + diffs.add(new Diff(Operation.INSERT, text2)); + return diffs; + } + + // Check to see if the problem can be split in two. + String[] hm = diff_halfMatch(text1, text2); + if (hm != null) { + // A half-match was found, sort out the return data. + String text1_a = hm[0]; + String text1_b = hm[1]; + String text2_a = hm[2]; + String text2_b = hm[3]; + String mid_common = hm[4]; + // Send both pairs off for separate processing. + LinkedList diffs_a = diff_main(text1_a, text2_a, + checklines, deadline); + LinkedList diffs_b = diff_main(text1_b, text2_b, + checklines, deadline); + // Merge the results. + diffs = diffs_a; + diffs.add(new Diff(Operation.EQUAL, mid_common)); + diffs.addAll(diffs_b); + return diffs; + } + + if (checklines && text1.length() > 100 && text2.length() > 100) { + return diff_lineMode(text1, text2, deadline); + } + + return diff_bisect(text1, text2, deadline); + } + + /** + * Do a quick line-level diff on both strings, then rediff the parts for + * greater accuracy. + * This speedup can produce non-minimal diffs. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param deadline Time when the diff should be complete by. + * @return Linked List of Diff objects. + */ + private LinkedList diff_lineMode(String text1, String text2, + long deadline) { + // Scan the text on a line-by-line basis first. 
+ LinesToCharsResult a = diff_linesToChars(text1, text2); + text1 = a.chars1; + text2 = a.chars2; + List linearray = a.lineArray; + + LinkedList diffs = diff_main(text1, text2, false, deadline); + + // Convert the diff back to original text. + diff_charsToLines(diffs, linearray); + // Eliminate freak matches (e.g. blank lines) + diff_cleanupSemantic(diffs); + + // Rediff any replacement blocks, this time character-by-character. + // Add a dummy entry at the end. + diffs.add(new Diff(Operation.EQUAL, "")); + int count_delete = 0; + int count_insert = 0; + String text_delete = ""; + String text_insert = ""; + ListIterator pointer = diffs.listIterator(); + Diff thisDiff = pointer.next(); + while (thisDiff != null) { + switch (thisDiff.operation) { + case INSERT: + count_insert++; + text_insert += thisDiff.text; + break; + case DELETE: + count_delete++; + text_delete += thisDiff.text; + break; + case EQUAL: + // Upon reaching an equality, check for prior redundancies. + if (count_delete >= 1 && count_insert >= 1) { + // Delete the offending records and add the merged ones. + pointer.previous(); + for (int j = 0; j < count_delete + count_insert; j++) { + pointer.previous(); + pointer.remove(); + } + for (Diff subDiff : diff_main(text_delete, text_insert, false, + deadline)) { + pointer.add(subDiff); + } + } + count_insert = 0; + count_delete = 0; + text_delete = ""; + text_insert = ""; + break; + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + diffs.removeLast(); // Remove the dummy entry at the end. + + return diffs; + } + + /** + * Find the 'middle snake' of a diff, split the problem in two + * and return the recursively constructed diff. + * See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param deadline Time at which to bail if not yet complete. + * @return LinkedList of Diff objects. 
+ */ + protected LinkedList diff_bisect(String text1, String text2, + long deadline) { + // Cache the text lengths to prevent multiple calls. + int text1_length = text1.length(); + int text2_length = text2.length(); + int max_d = (text1_length + text2_length + 1) / 2; + int v_offset = max_d; + int v_length = 2 * max_d; + int[] v1 = new int[v_length]; + int[] v2 = new int[v_length]; + for (int x = 0; x < v_length; x++) { + v1[x] = -1; + v2[x] = -1; + } + v1[v_offset + 1] = 0; + v2[v_offset + 1] = 0; + int delta = text1_length - text2_length; + // If the total number of characters is odd, then the front path will + // collide with the reverse path. + boolean front = (delta % 2 != 0); + // Offsets for start and end of k loop. + // Prevents mapping of space beyond the grid. + int k1start = 0; + int k1end = 0; + int k2start = 0; + int k2end = 0; + for (int d = 0; d < max_d; d++) { + // Bail out if deadline is reached. + if (System.currentTimeMillis() > deadline) { + break; + } + + // Walk the front path one step. + for (int k1 = -d + k1start; k1 <= d - k1end; k1 += 2) { + int k1_offset = v_offset + k1; + int x1; + if (k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1])) { + x1 = v1[k1_offset + 1]; + } else { + x1 = v1[k1_offset - 1] + 1; + } + int y1 = x1 - k1; + while (x1 < text1_length && y1 < text2_length + && text1.charAt(x1) == text2.charAt(y1)) { + x1++; + y1++; + } + v1[k1_offset] = x1; + if (x1 > text1_length) { + // Ran off the right of the graph. + k1end += 2; + } else if (y1 > text2_length) { + // Ran off the bottom of the graph. + k1start += 2; + } else if (front) { + int k2_offset = v_offset + delta - k1; + if (k2_offset >= 0 && k2_offset < v_length && v2[k2_offset] != -1) { + // Mirror x2 onto top-left coordinate system. + int x2 = text1_length - v2[k2_offset]; + if (x1 >= x2) { + // Overlap detected. + return diff_bisectSplit(text1, text2, x1, y1, deadline); + } + } + } + } + + // Walk the reverse path one step. 
+ for (int k2 = -d + k2start; k2 <= d - k2end; k2 += 2) { + int k2_offset = v_offset + k2; + int x2; + if (k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1])) { + x2 = v2[k2_offset + 1]; + } else { + x2 = v2[k2_offset - 1] + 1; + } + int y2 = x2 - k2; + while (x2 < text1_length && y2 < text2_length + && text1.charAt(text1_length - x2 - 1) + == text2.charAt(text2_length - y2 - 1)) { + x2++; + y2++; + } + v2[k2_offset] = x2; + if (x2 > text1_length) { + // Ran off the left of the graph. + k2end += 2; + } else if (y2 > text2_length) { + // Ran off the top of the graph. + k2start += 2; + } else if (!front) { + int k1_offset = v_offset + delta - k2; + if (k1_offset >= 0 && k1_offset < v_length && v1[k1_offset] != -1) { + int x1 = v1[k1_offset]; + int y1 = v_offset + x1 - k1_offset; + // Mirror x2 onto top-left coordinate system. + x2 = text1_length - x2; + if (x1 >= x2) { + // Overlap detected. + return diff_bisectSplit(text1, text2, x1, y1, deadline); + } + } + } + } + } + // Diff took too long and hit the deadline or + // number of diffs equals number of characters, no commonality at all. + LinkedList diffs = new LinkedList(); + diffs.add(new Diff(Operation.DELETE, text1)); + diffs.add(new Diff(Operation.INSERT, text2)); + return diffs; + } + + /** + * Given the location of the 'middle snake', split the diff in two parts + * and recurse. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param x Index of split point in text1. + * @param y Index of split point in text2. + * @param deadline Time at which to bail if not yet complete. + * @return LinkedList of Diff objects. + */ + private LinkedList diff_bisectSplit(String text1, String text2, + int x, int y, long deadline) { + String text1a = text1.substring(0, x); + String text2a = text2.substring(0, y); + String text1b = text1.substring(x); + String text2b = text2.substring(y); + + // Compute both diffs serially. 
+ LinkedList diffs = diff_main(text1a, text2a, false, deadline); + LinkedList diffsb = diff_main(text1b, text2b, false, deadline); + + diffs.addAll(diffsb); + return diffs; + } + + /** + * Split two texts into a list of strings. Reduce the texts to a string of + * hashes where each Unicode character represents one line. + * @param text1 First string. + * @param text2 Second string. + * @return An object containing the encoded text1, the encoded text2 and + * the List of unique strings. The zeroth element of the List of + * unique strings is intentionally blank. + */ + protected LinesToCharsResult diff_linesToChars(String text1, String text2) { + List lineArray = new ArrayList(); + Map lineHash = new HashMap(); + // e.g. linearray[4] == "Hello\n" + // e.g. linehash.get("Hello\n") == 4 + + // "\x00" is a valid character, but various debuggers don't like it. + // So we'll insert a junk entry to avoid generating a null character. + lineArray.add(""); + + // Allocate 2/3rds of the space for text1, the rest for text2. + String chars1 = diff_linesToCharsMunge(text1, lineArray, lineHash, 40000); + String chars2 = diff_linesToCharsMunge(text2, lineArray, lineHash, 65535); + return new LinesToCharsResult(chars1, chars2, lineArray); + } + + /** + * Split a text into a list of strings. Reduce the texts to a string of + * hashes where each Unicode character represents one line. + * @param text String to encode. + * @param lineArray List of unique strings. + * @param lineHash Map of strings to indices. + * @param maxLines Maximum length of lineArray. + * @return Encoded string. + */ + private String diff_linesToCharsMunge(String text, List lineArray, + Map lineHash, int maxLines) { + int lineStart = 0; + int lineEnd = -1; + String line; + StringBuilder chars = new StringBuilder(); + // Walk the text, pulling out a substring for each line. + // text.split('\n') would would temporarily double our memory footprint. 
+ // Modifying text would create many large strings to garbage collect. + while (lineEnd < text.length() - 1) { + lineEnd = text.indexOf('\n', lineStart); + if (lineEnd == -1) { + lineEnd = text.length() - 1; + } + line = text.substring(lineStart, lineEnd + 1); + + if (lineHash.containsKey(line)) { + chars.append(String.valueOf((char) (int) lineHash.get(line))); + } else { + if (lineArray.size() == maxLines) { + // Bail out at 65535 because + // String.valueOf((char) 65536).equals(String.valueOf(((char) 0))) + line = text.substring(lineStart); + lineEnd = text.length(); + } + lineArray.add(line); + lineHash.put(line, lineArray.size() - 1); + chars.append(String.valueOf((char) (lineArray.size() - 1))); + } + lineStart = lineEnd + 1; + } + return chars.toString(); + } + + /** + * Rehydrate the text in a diff from a string of line hashes to real lines of + * text. + * @param diffs List of Diff objects. + * @param lineArray List of unique strings. + */ + protected void diff_charsToLines(List diffs, + List lineArray) { + StringBuilder text; + for (Diff diff : diffs) { + text = new StringBuilder(); + for (int j = 0; j < diff.text.length(); j++) { + text.append(lineArray.get(diff.text.charAt(j))); + } + diff.text = text.toString(); + } + } + + /** + * Determine the common prefix of two strings + * @param text1 First string. + * @param text2 Second string. + * @return The number of characters common to the start of each string. + */ + public int diff_commonPrefix(String text1, String text2) { + // Performance analysis: https://neil.fraser.name/news/2007/10/09/ + int n = Math.min(text1.length(), text2.length()); + for (int i = 0; i < n; i++) { + if (text1.charAt(i) != text2.charAt(i)) { + return i; + } + } + return n; + } + + /** + * Determine the common suffix of two strings + * @param text1 First string. + * @param text2 Second string. + * @return The number of characters common to the end of each string. 
+ */ + public int diff_commonSuffix(String text1, String text2) { + // Performance analysis: https://neil.fraser.name/news/2007/10/09/ + int text1_length = text1.length(); + int text2_length = text2.length(); + int n = Math.min(text1_length, text2_length); + for (int i = 1; i <= n; i++) { + if (text1.charAt(text1_length - i) != text2.charAt(text2_length - i)) { + return i - 1; + } + } + return n; + } + + /** + * Determine if the suffix of one string is the prefix of another. + * @param text1 First string. + * @param text2 Second string. + * @return The number of characters common to the end of the first + * string and the start of the second string. + */ + protected int diff_commonOverlap(String text1, String text2) { + // Cache the text lengths to prevent multiple calls. + int text1_length = text1.length(); + int text2_length = text2.length(); + // Eliminate the null case. + if (text1_length == 0 || text2_length == 0) { + return 0; + } + // Truncate the longer string. + if (text1_length > text2_length) { + text1 = text1.substring(text1_length - text2_length); + } else if (text1_length < text2_length) { + text2 = text2.substring(0, text1_length); + } + int text_length = Math.min(text1_length, text2_length); + // Quick check for the worst case. + if (text1.equals(text2)) { + return text_length; + } + + // Start by looking for a single character match + // and increase length until no match is found. + // Performance analysis: https://neil.fraser.name/news/2010/11/04/ + int best = 0; + int length = 1; + while (true) { + String pattern = text1.substring(text_length - length); + int found = text2.indexOf(pattern); + if (found == -1) { + return best; + } + length += found; + if (found == 0 || text1.substring(text_length - length).equals( + text2.substring(0, length))) { + best = length; + length++; + } + } + } + + /** + * Do the two texts share a substring which is at least half the length of + * the longer text? + * This speedup can produce non-minimal diffs. 
+ * @param text1 First string. + * @param text2 Second string. + * @return Five element String array, containing the prefix of text1, the + * suffix of text1, the prefix of text2, the suffix of text2 and the + * common middle. Or null if there was no match. + */ + protected String[] diff_halfMatch(String text1, String text2) { + if (Diff_Timeout <= 0) { + // Don't risk returning a non-optimal diff if we have unlimited time. + return null; + } + String longtext = text1.length() > text2.length() ? text1 : text2; + String shorttext = text1.length() > text2.length() ? text2 : text1; + if (longtext.length() < 4 || shorttext.length() * 2 < longtext.length()) { + return null; // Pointless. + } + + // First check if the second quarter is the seed for a half-match. + String[] hm1 = diff_halfMatchI(longtext, shorttext, + (longtext.length() + 3) / 4); + // Check again based on the third quarter. + String[] hm2 = diff_halfMatchI(longtext, shorttext, + (longtext.length() + 1) / 2); + String[] hm; + if (hm1 == null && hm2 == null) { + return null; + } else if (hm2 == null) { + hm = hm1; + } else if (hm1 == null) { + hm = hm2; + } else { + // Both matched. Select the longest. + hm = hm1[4].length() > hm2[4].length() ? hm1 : hm2; + } + + // A half-match was found, sort out the return data. + if (text1.length() > text2.length()) { + return hm; + //return new String[]{hm[0], hm[1], hm[2], hm[3], hm[4]}; + } else { + return new String[]{hm[2], hm[3], hm[0], hm[1], hm[4]}; + } + } + + /** + * Does a substring of shorttext exist within longtext such that the + * substring is at least half the length of longtext? + * @param longtext Longer string. + * @param shorttext Shorter string. + * @param i Start index of quarter length substring within longtext. + * @return Five element String array, containing the prefix of longtext, the + * suffix of longtext, the prefix of shorttext, the suffix of shorttext + * and the common middle. Or null if there was no match. 
+ */ + private String[] diff_halfMatchI(String longtext, String shorttext, int i) { + // Start with a 1/4 length substring at position i as a seed. + String seed = longtext.substring(i, i + longtext.length() / 4); + int j = -1; + String best_common = ""; + String best_longtext_a = "", best_longtext_b = ""; + String best_shorttext_a = "", best_shorttext_b = ""; + while ((j = shorttext.indexOf(seed, j + 1)) != -1) { + int prefixLength = diff_commonPrefix(longtext.substring(i), + shorttext.substring(j)); + int suffixLength = diff_commonSuffix(longtext.substring(0, i), + shorttext.substring(0, j)); + if (best_common.length() < suffixLength + prefixLength) { + best_common = shorttext.substring(j - suffixLength, j) + + shorttext.substring(j, j + prefixLength); + best_longtext_a = longtext.substring(0, i - suffixLength); + best_longtext_b = longtext.substring(i + prefixLength); + best_shorttext_a = shorttext.substring(0, j - suffixLength); + best_shorttext_b = shorttext.substring(j + prefixLength); + } + } + if (best_common.length() * 2 >= longtext.length()) { + return new String[]{best_longtext_a, best_longtext_b, + best_shorttext_a, best_shorttext_b, best_common}; + } else { + return null; + } + } + + /** + * Reduce the number of edits by eliminating semantically trivial equalities. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupSemantic(LinkedList diffs) { + if (diffs.isEmpty()) { + return; + } + boolean changes = false; + Deque equalities = new ArrayDeque(); // Double-ended queue of qualities. + String lastEquality = null; // Always equal to equalities.peek().text + ListIterator pointer = diffs.listIterator(); + // Number of characters that changed prior to the equality. + int length_insertions1 = 0; + int length_deletions1 = 0; + // Number of characters that changed after the equality. 
+ int length_insertions2 = 0; + int length_deletions2 = 0; + Diff thisDiff = pointer.next(); + while (thisDiff != null) { + if (thisDiff.operation == Operation.EQUAL) { + // Equality found. + equalities.push(thisDiff); + length_insertions1 = length_insertions2; + length_deletions1 = length_deletions2; + length_insertions2 = 0; + length_deletions2 = 0; + lastEquality = thisDiff.text; + } else { + // An insertion or deletion. + if (thisDiff.operation == Operation.INSERT) { + length_insertions2 += thisDiff.text.length(); + } else { + length_deletions2 += thisDiff.text.length(); + } + // Eliminate an equality that is smaller or equal to the edits on both + // sides of it. + if (lastEquality != null && (lastEquality.length() + <= Math.max(length_insertions1, length_deletions1)) + && (lastEquality.length() + <= Math.max(length_insertions2, length_deletions2))) { + //System.out.println("Splitting: '" + lastEquality + "'"); + // Walk back to offending equality. + while (thisDiff != equalities.peek()) { + thisDiff = pointer.previous(); + } + pointer.next(); + + // Replace equality with a delete. + pointer.set(new Diff(Operation.DELETE, lastEquality)); + // Insert a corresponding an insert. + pointer.add(new Diff(Operation.INSERT, lastEquality)); + + equalities.pop(); // Throw away the equality we just deleted. + if (!equalities.isEmpty()) { + // Throw away the previous equality (it needs to be reevaluated). + equalities.pop(); + } + if (equalities.isEmpty()) { + // There are no previous equalities, walk back to the start. + while (pointer.hasPrevious()) { + pointer.previous(); + } + } else { + // There is a safe equality we can fall back to. + thisDiff = equalities.peek(); + while (thisDiff != pointer.previous()) { + // Intentionally empty loop. + } + } + + length_insertions1 = 0; // Reset the counters. + length_insertions2 = 0; + length_deletions1 = 0; + length_deletions2 = 0; + lastEquality = null; + changes = true; + } + } + thisDiff = pointer.hasNext() ? 
pointer.next() : null; + } + + // Normalize the diff. + if (changes) { + diff_cleanupMerge(diffs); + } + diff_cleanupSemanticLossless(diffs); + + // Find any overlaps between deletions and insertions. + // e.g: abcxxxxxxdef + // -> abcxxxdef + // e.g: xxxabcdefxxx + // -> defxxxabc + // Only extract an overlap if it is as big as the edit ahead or behind it. + pointer = diffs.listIterator(); + Diff prevDiff = null; + thisDiff = null; + if (pointer.hasNext()) { + prevDiff = pointer.next(); + if (pointer.hasNext()) { + thisDiff = pointer.next(); + } + } + while (thisDiff != null) { + if (prevDiff.operation == Operation.DELETE && + thisDiff.operation == Operation.INSERT) { + String deletion = prevDiff.text; + String insertion = thisDiff.text; + int overlap_length1 = this.diff_commonOverlap(deletion, insertion); + int overlap_length2 = this.diff_commonOverlap(insertion, deletion); + if (overlap_length1 >= overlap_length2) { + if (overlap_length1 >= deletion.length() / 2.0 || + overlap_length1 >= insertion.length() / 2.0) { + // Overlap found. Insert an equality and trim the surrounding edits. + pointer.previous(); + pointer.add(new Diff(Operation.EQUAL, + insertion.substring(0, overlap_length1))); + prevDiff.text = + deletion.substring(0, deletion.length() - overlap_length1); + thisDiff.text = insertion.substring(overlap_length1); + // pointer.add inserts the element before the cursor, so there is + // no need to step past the new element. + } + } else { + if (overlap_length2 >= deletion.length() / 2.0 || + overlap_length2 >= insertion.length() / 2.0) { + // Reverse overlap found. + // Insert an equality and swap and trim the surrounding edits. 
+ pointer.previous(); + pointer.add(new Diff(Operation.EQUAL, + deletion.substring(0, overlap_length2))); + prevDiff.operation = Operation.INSERT; + prevDiff.text = + insertion.substring(0, insertion.length() - overlap_length2); + thisDiff.operation = Operation.DELETE; + thisDiff.text = deletion.substring(overlap_length2); + // pointer.add inserts the element before the cursor, so there is + // no need to step past the new element. + } + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + prevDiff = thisDiff; + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + } + + /** + * Look for single edits surrounded on both sides by equalities + * which can be shifted sideways to align the edit to a word boundary. + * e.g: The cat came. -> The cat came. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupSemanticLossless(LinkedList diffs) { + String equality1, edit, equality2; + String commonString; + int commonOffset; + int score, bestScore; + String bestEquality1, bestEdit, bestEquality2; + // Create a new iterator at the start. + ListIterator pointer = diffs.listIterator(); + Diff prevDiff = pointer.hasNext() ? pointer.next() : null; + Diff thisDiff = pointer.hasNext() ? pointer.next() : null; + Diff nextDiff = pointer.hasNext() ? pointer.next() : null; + // Intentionally ignore the first and last element (don't need checking). + while (nextDiff != null) { + if (prevDiff.operation == Operation.EQUAL && + nextDiff.operation == Operation.EQUAL) { + // This is a single edit surrounded by equalities. + equality1 = prevDiff.text; + edit = thisDiff.text; + equality2 = nextDiff.text; + + // First, shift the edit as far left as possible. 
+ commonOffset = diff_commonSuffix(equality1, edit); + if (commonOffset != 0) { + commonString = edit.substring(edit.length() - commonOffset); + equality1 = equality1.substring(0, equality1.length() - commonOffset); + edit = commonString + edit.substring(0, edit.length() - commonOffset); + equality2 = commonString + equality2; + } + + // Second, step character by character right, looking for the best fit. + bestEquality1 = equality1; + bestEdit = edit; + bestEquality2 = equality2; + bestScore = diff_cleanupSemanticScore(equality1, edit) + + diff_cleanupSemanticScore(edit, equality2); + while (edit.length() != 0 && equality2.length() != 0 + && edit.charAt(0) == equality2.charAt(0)) { + equality1 += edit.charAt(0); + edit = edit.substring(1) + equality2.charAt(0); + equality2 = equality2.substring(1); + score = diff_cleanupSemanticScore(equality1, edit) + + diff_cleanupSemanticScore(edit, equality2); + // The >= encourages trailing rather than leading whitespace on edits. + if (score >= bestScore) { + bestScore = score; + bestEquality1 = equality1; + bestEdit = edit; + bestEquality2 = equality2; + } + } + + if (!prevDiff.text.equals(bestEquality1)) { + // We have an improvement, save it back to the diff. + if (bestEquality1.length() != 0) { + prevDiff.text = bestEquality1; + } else { + pointer.previous(); // Walk past nextDiff. + pointer.previous(); // Walk past thisDiff. + pointer.previous(); // Walk past prevDiff. + pointer.remove(); // Delete prevDiff. + pointer.next(); // Walk past thisDiff. + pointer.next(); // Walk past nextDiff. + } + thisDiff.text = bestEdit; + if (bestEquality2.length() != 0) { + nextDiff.text = bestEquality2; + } else { + pointer.remove(); // Delete nextDiff. + nextDiff = thisDiff; + thisDiff = prevDiff; + } + } + } + prevDiff = thisDiff; + thisDiff = nextDiff; + nextDiff = pointer.hasNext() ? 
pointer.next() : null; + } + } + + /** + * Given two strings, compute a score representing whether the internal + * boundary falls on logical boundaries. + * Scores range from 6 (best) to 0 (worst). + * @param one First string. + * @param two Second string. + * @return The score. + */ + private int diff_cleanupSemanticScore(String one, String two) { + if (one.length() == 0 || two.length() == 0) { + // Edges are the best. + return 6; + } + + // Each port of this function behaves slightly differently due to + // subtle differences in each language's definition of things like + // 'whitespace'. Since this function's purpose is largely cosmetic, + // the choice has been made to use each language's native features + // rather than force total conformity. + char char1 = one.charAt(one.length() - 1); + char char2 = two.charAt(0); + boolean nonAlphaNumeric1 = !Character.isLetterOrDigit(char1); + boolean nonAlphaNumeric2 = !Character.isLetterOrDigit(char2); + boolean whitespace1 = nonAlphaNumeric1 && Character.isWhitespace(char1); + boolean whitespace2 = nonAlphaNumeric2 && Character.isWhitespace(char2); + boolean lineBreak1 = whitespace1 + && Character.getType(char1) == Character.CONTROL; + boolean lineBreak2 = whitespace2 + && Character.getType(char2) == Character.CONTROL; + boolean blankLine1 = lineBreak1 && BLANKLINEEND.matcher(one).find(); + boolean blankLine2 = lineBreak2 && BLANKLINESTART.matcher(two).find(); + + if (blankLine1 || blankLine2) { + // Five points for blank lines. + return 5; + } else if (lineBreak1 || lineBreak2) { + // Four points for line breaks. + return 4; + } else if (nonAlphaNumeric1 && !whitespace1 && whitespace2) { + // Three points for end of sentences. + return 3; + } else if (whitespace1 || whitespace2) { + // Two points for whitespace. + return 2; + } else if (nonAlphaNumeric1 || nonAlphaNumeric2) { + // One point for non-alphanumeric. + return 1; + } + return 0; + } + + // Define some regex patterns for matching boundaries. 
+ private Pattern BLANKLINEEND + = Pattern.compile("\\n\\r?\\n\\Z", Pattern.DOTALL); + private Pattern BLANKLINESTART + = Pattern.compile("\\A\\r?\\n\\r?\\n", Pattern.DOTALL); + + /** + * Reduce the number of edits by eliminating operationally trivial equalities. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupEfficiency(LinkedList diffs) { + if (diffs.isEmpty()) { + return; + } + boolean changes = false; + Deque equalities = new ArrayDeque(); // Double-ended queue of equalities. + String lastEquality = null; // Always equal to equalities.peek().text + ListIterator pointer = diffs.listIterator(); + // Is there an insertion operation before the last equality. + boolean pre_ins = false; + // Is there a deletion operation before the last equality. + boolean pre_del = false; + // Is there an insertion operation after the last equality. + boolean post_ins = false; + // Is there a deletion operation after the last equality. + boolean post_del = false; + Diff thisDiff = pointer.next(); + Diff safeDiff = thisDiff; // The last Diff that is known to be unsplittable. + while (thisDiff != null) { + if (thisDiff.operation == Operation.EQUAL) { + // Equality found. + if (thisDiff.text.length() < Diff_EditCost && (post_ins || post_del)) { + // Candidate found. + equalities.push(thisDiff); + pre_ins = post_ins; + pre_del = post_del; + lastEquality = thisDiff.text; + } else { + // Not a candidate, and can never become one. + equalities.clear(); + lastEquality = null; + safeDiff = thisDiff; + } + post_ins = post_del = false; + } else { + // An insertion or deletion. + if (thisDiff.operation == Operation.DELETE) { + post_del = true; + } else { + post_ins = true; + } + /* + * Five types to be split: + * ABXYCD + * AXCD + * ABXC + * AXCD + * ABXC + */ + if (lastEquality != null + && ((pre_ins && pre_del && post_ins && post_del) + || ((lastEquality.length() < Diff_EditCost / 2) + && ((pre_ins ? 1 : 0) + (pre_del ? 1 : 0) + + (post_ins ? 1 : 0) + (post_del ? 
1 : 0)) == 3))) { + //System.out.println("Splitting: '" + lastEquality + "'"); + // Walk back to offending equality. + while (thisDiff != equalities.peek()) { + thisDiff = pointer.previous(); + } + pointer.next(); + + // Replace equality with a delete. + pointer.set(new Diff(Operation.DELETE, lastEquality)); + // Insert a corresponding an insert. + pointer.add(thisDiff = new Diff(Operation.INSERT, lastEquality)); + + equalities.pop(); // Throw away the equality we just deleted. + lastEquality = null; + if (pre_ins && pre_del) { + // No changes made which could affect previous entry, keep going. + post_ins = post_del = true; + equalities.clear(); + safeDiff = thisDiff; + } else { + if (!equalities.isEmpty()) { + // Throw away the previous equality (it needs to be reevaluated). + equalities.pop(); + } + if (equalities.isEmpty()) { + // There are no previous questionable equalities, + // walk back to the last known safe diff. + thisDiff = safeDiff; + } else { + // There is an equality we can fall back to. + thisDiff = equalities.peek(); + } + while (thisDiff != pointer.previous()) { + // Intentionally empty loop. + } + post_ins = post_del = false; + } + + changes = true; + } + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + + if (changes) { + diff_cleanupMerge(diffs); + } + } + + /** + * Reorder and merge like edit sections. Merge equalities. + * Any edit section can move as long as it doesn't cross an equality. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupMerge(LinkedList diffs) { + diffs.add(new Diff(Operation.EQUAL, "")); // Add a dummy entry at the end. 
+ ListIterator pointer = diffs.listIterator(); + int count_delete = 0; + int count_insert = 0; + String text_delete = ""; + String text_insert = ""; + Diff thisDiff = pointer.next(); + Diff prevEqual = null; + int commonlength; + while (thisDiff != null) { + switch (thisDiff.operation) { + case INSERT: + count_insert++; + text_insert += thisDiff.text; + prevEqual = null; + break; + case DELETE: + count_delete++; + text_delete += thisDiff.text; + prevEqual = null; + break; + case EQUAL: + if (count_delete + count_insert > 1) { + boolean both_types = count_delete != 0 && count_insert != 0; + // Delete the offending records. + pointer.previous(); // Reverse direction. + while (count_delete-- > 0) { + pointer.previous(); + pointer.remove(); + } + while (count_insert-- > 0) { + pointer.previous(); + pointer.remove(); + } + if (both_types) { + // Factor out any common prefixies. + commonlength = diff_commonPrefix(text_insert, text_delete); + if (commonlength != 0) { + if (pointer.hasPrevious()) { + thisDiff = pointer.previous(); + assert thisDiff.operation == Operation.EQUAL + : "Previous diff should have been an equality."; + thisDiff.text += text_insert.substring(0, commonlength); + pointer.next(); + } else { + pointer.add(new Diff(Operation.EQUAL, + text_insert.substring(0, commonlength))); + } + text_insert = text_insert.substring(commonlength); + text_delete = text_delete.substring(commonlength); + } + // Factor out any common suffixies. + commonlength = diff_commonSuffix(text_insert, text_delete); + if (commonlength != 0) { + thisDiff = pointer.next(); + thisDiff.text = text_insert.substring(text_insert.length() + - commonlength) + thisDiff.text; + text_insert = text_insert.substring(0, text_insert.length() + - commonlength); + text_delete = text_delete.substring(0, text_delete.length() + - commonlength); + pointer.previous(); + } + } + // Insert the merged records. 
          if (text_delete.length() != 0) {
            pointer.add(new Diff(Operation.DELETE, text_delete));
          }
          if (text_insert.length() != 0) {
            pointer.add(new Diff(Operation.INSERT, text_insert));
          }
          // Step forward to the equality.
          thisDiff = pointer.hasNext() ? pointer.next() : null;
        } else if (prevEqual != null) {
          // Merge this equality with the previous one.
          prevEqual.text += thisDiff.text;
          pointer.remove();
          thisDiff = pointer.previous();
          pointer.next();  // Forward direction
        }
        count_insert = 0;
        count_delete = 0;
        text_delete = "";
        text_insert = "";
        prevEqual = thisDiff;
        break;
      }
      thisDiff = pointer.hasNext() ? pointer.next() : null;
    }
    if (diffs.getLast().text.length() == 0) {
      diffs.removeLast();  // Remove the dummy entry at the end.
    }

    /*
     * Second pass: look for single edits surrounded on both sides by
     * equalities which can be shifted sideways to eliminate an equality.
     * e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
     */
    boolean changes = false;
    // Create a new iterator at the start.
    // (As opposed to walking the current one back.)
    pointer = diffs.listIterator();
    Diff prevDiff = pointer.hasNext() ? pointer.next() : null;
    thisDiff = pointer.hasNext() ? pointer.next() : null;
    Diff nextDiff = pointer.hasNext() ? pointer.next() : null;
    // Intentionally ignore the first and last element (don't need checking).
    while (nextDiff != null) {
      if (prevDiff.operation == Operation.EQUAL &&
          nextDiff.operation == Operation.EQUAL) {
        // This is a single edit surrounded by equalities.
        if (thisDiff.text.endsWith(prevDiff.text)) {
          // Shift the edit over the previous equality.
          thisDiff.text = prevDiff.text
              + thisDiff.text.substring(0, thisDiff.text.length()
                                           - prevDiff.text.length());
          nextDiff.text = prevDiff.text + nextDiff.text;
          pointer.previous(); // Walk past nextDiff.
          pointer.previous(); // Walk past thisDiff.
          pointer.previous(); // Walk past prevDiff.
          pointer.remove(); // Delete prevDiff.
+ pointer.next(); // Walk past thisDiff. + thisDiff = pointer.next(); // Walk past nextDiff. + nextDiff = pointer.hasNext() ? pointer.next() : null; + changes = true; + } else if (thisDiff.text.startsWith(nextDiff.text)) { + // Shift the edit over the next equality. + prevDiff.text += nextDiff.text; + thisDiff.text = thisDiff.text.substring(nextDiff.text.length()) + + nextDiff.text; + pointer.remove(); // Delete nextDiff. + nextDiff = pointer.hasNext() ? pointer.next() : null; + changes = true; + } + } + prevDiff = thisDiff; + thisDiff = nextDiff; + nextDiff = pointer.hasNext() ? pointer.next() : null; + } + // If shifts were made, the diff needs reordering and another shift sweep. + if (changes) { + diff_cleanupMerge(diffs); + } + } + + /** + * loc is a location in text1, compute and return the equivalent location in + * text2. + * e.g. "The cat" vs "The big cat", 1->1, 5->8 + * @param diffs List of Diff objects. + * @param loc Location within text1. + * @return Location within text2. + */ + public int diff_xIndex(List diffs, int loc) { + int chars1 = 0; + int chars2 = 0; + int last_chars1 = 0; + int last_chars2 = 0; + Diff lastDiff = null; + for (Diff aDiff : diffs) { + if (aDiff.operation != Operation.INSERT) { + // Equality or deletion. + chars1 += aDiff.text.length(); + } + if (aDiff.operation != Operation.DELETE) { + // Equality or insertion. + chars2 += aDiff.text.length(); + } + if (chars1 > loc) { + // Overshot the location. + lastDiff = aDiff; + break; + } + last_chars1 = chars1; + last_chars2 = chars2; + } + if (lastDiff != null && lastDiff.operation == Operation.DELETE) { + // The location was deleted. + return last_chars2; + } + // Add the remaining character length. + return last_chars2 + (loc - last_chars1); + } + + /** + * Convert a Diff list into a pretty HTML report. + * @param diffs List of Diff objects. + * @return HTML representation. 
+ */ + public String diff_prettyHtml(List diffs) { + StringBuilder html = new StringBuilder(); + for (Diff aDiff : diffs) { + String text = aDiff.text.replace("&", "&").replace("<", "<") + .replace(">", ">").replace("\n", "¶
"); + switch (aDiff.operation) { + case INSERT: + html.append("").append(text) + .append(""); + break; + case DELETE: + html.append("").append(text) + .append(""); + break; + case EQUAL: + html.append("").append(text).append(""); + break; + } + } + return html.toString(); + } + + /** + * Compute and return the source text (all equalities and deletions). + * @param diffs List of Diff objects. + * @return Source text. + */ + public String diff_text1(List diffs) { + StringBuilder text = new StringBuilder(); + for (Diff aDiff : diffs) { + if (aDiff.operation != Operation.INSERT) { + text.append(aDiff.text); + } + } + return text.toString(); + } + + /** + * Compute and return the destination text (all equalities and insertions). + * @param diffs List of Diff objects. + * @return Destination text. + */ + public String diff_text2(List diffs) { + StringBuilder text = new StringBuilder(); + for (Diff aDiff : diffs) { + if (aDiff.operation != Operation.DELETE) { + text.append(aDiff.text); + } + } + return text.toString(); + } + + /** + * Compute the Levenshtein distance; the number of inserted, deleted or + * substituted characters. + * @param diffs List of Diff objects. + * @return Number of changes. + */ + public int diff_levenshtein(List diffs) { + int levenshtein = 0; + int insertions = 0; + int deletions = 0; + for (Diff aDiff : diffs) { + switch (aDiff.operation) { + case INSERT: + insertions += aDiff.text.length(); + break; + case DELETE: + deletions += aDiff.text.length(); + break; + case EQUAL: + // A deletion and an insertion is one substitution. + levenshtein += Math.max(insertions, deletions); + insertions = 0; + deletions = 0; + break; + } + } + levenshtein += Math.max(insertions, deletions); + return levenshtein; + } + + /** + * Crush the diff into an encoded string which describes the operations + * required to transform text1 into text2. + * E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. + * Operations are tab-separated. 
Inserted text is escaped using %xx notation. + * @param diffs List of Diff objects. + * @return Delta text. + */ + public String diff_toDelta(List diffs) { + StringBuilder text = new StringBuilder(); + for (Diff aDiff : diffs) { + switch (aDiff.operation) { + case INSERT: + try { + text.append("+").append(URLEncoder.encode(aDiff.text, "UTF-8") + .replace('+', ' ')).append("\t"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. + throw new Error("This system does not support UTF-8.", e); + } + break; + case DELETE: + text.append("-").append(aDiff.text.length()).append("\t"); + break; + case EQUAL: + text.append("=").append(aDiff.text.length()).append("\t"); + break; + } + } + String delta = text.toString(); + if (delta.length() != 0) { + // Strip off trailing tab character. + delta = delta.substring(0, delta.length() - 1); + delta = unescapeForEncodeUriCompatability(delta); + } + return delta; + } + + /** + * Given the original text1, and an encoded string which describes the + * operations required to transform text1 into text2, compute the full diff. + * @param text1 Source string for the diff. + * @param delta Delta text. + * @return Array of Diff objects or null if invalid. + * @throws IllegalArgumentException If invalid input. + */ + public LinkedList diff_fromDelta(String text1, String delta) + throws IllegalArgumentException { + LinkedList diffs = new LinkedList(); + int pointer = 0; // Cursor in text1 + String[] tokens = delta.split("\t"); + for (String token : tokens) { + if (token.length() == 0) { + // Blank tokens are ok (from a trailing \t). + continue; + } + // Each token begins with a one character parameter which specifies the + // operation of this token (delete, insert, equality). 
      String param = token.substring(1);
      switch (token.charAt(0)) {
      case '+':
        // decode would change all "+" to " "
        param = param.replace("+", "%2B");
        try {
          param = URLDecoder.decode(param, "UTF-8");
        } catch (UnsupportedEncodingException e) {
          // Not likely on modern system.
          throw new Error("This system does not support UTF-8.", e);
        } catch (IllegalArgumentException e) {
          // Malformed URI sequence.
          throw new IllegalArgumentException(
              "Illegal escape in diff_fromDelta: " + param, e);
        }
        diffs.add(new Diff(Operation.INSERT, param));
        break;
      case '-':
        // Fall through.
      case '=':
        int n;
        try {
          n = Integer.parseInt(param);
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "Invalid number in diff_fromDelta: " + param, e);
        }
        if (n < 0) {
          throw new IllegalArgumentException(
              "Negative number in diff_fromDelta: " + param);
        }
        String text;
        try {
          text = text1.substring(pointer, pointer += n);
        } catch (StringIndexOutOfBoundsException e) {
          throw new IllegalArgumentException("Delta length (" + pointer
              + ") larger than source text length (" + text1.length()
              + ").", e);
        }
        if (token.charAt(0) == '=') {
          diffs.add(new Diff(Operation.EQUAL, text));
        } else {
          diffs.add(new Diff(Operation.DELETE, text));
        }
        break;
      default:
        // Anything else is an error.
        throw new IllegalArgumentException(
            "Invalid diff operation in diff_fromDelta: " + token.charAt(0));
      }
    }
    if (pointer != text1.length()) {
      throw new IllegalArgumentException("Delta length (" + pointer
          + ") smaller than source text length (" + text1.length() + ").");
    }
    return diffs;
  }


  //  MATCH FUNCTIONS


  /**
   * Locate the best instance of 'pattern' in 'text' near 'loc'.
   * Returns -1 if no match found.
   * @param text The text to search.
   * @param pattern The pattern to search for.
   * @param loc The location to search around.
   * @return Best match index or -1.
+ */ + public int match_main(String text, String pattern, int loc) { + // Check for null inputs. + if (text == null || pattern == null) { + throw new IllegalArgumentException("Null inputs. (match_main)"); + } + + loc = Math.max(0, Math.min(loc, text.length())); + if (text.equals(pattern)) { + // Shortcut (potentially not guaranteed by the algorithm) + return 0; + } else if (text.length() == 0) { + // Nothing to match. + return -1; + } else if (loc + pattern.length() <= text.length() + && text.substring(loc, loc + pattern.length()).equals(pattern)) { + // Perfect match at the perfect spot! (Includes case of null pattern) + return loc; + } else { + // Do a fuzzy compare. + return match_bitap(text, pattern, loc); + } + } + + /** + * Locate the best instance of 'pattern' in 'text' near 'loc' using the + * Bitap algorithm. Returns -1 if no match found. + * @param text The text to search. + * @param pattern The pattern to search for. + * @param loc The location to search around. + * @return Best match index or -1. + */ + protected int match_bitap(String text, String pattern, int loc) { + assert (Match_MaxBits == 0 || pattern.length() <= Match_MaxBits) + : "Pattern too long for this application."; + + // Initialise the alphabet. + Map s = match_alphabet(pattern); + + // Highest score beyond which we give up. + double score_threshold = Match_Threshold; + // Is there a nearby exact match? (speedup) + int best_loc = text.indexOf(pattern, loc); + if (best_loc != -1) { + score_threshold = Math.min(match_bitapScore(0, best_loc, loc, pattern), + score_threshold); + // What about in the other direction? (speedup) + best_loc = text.lastIndexOf(pattern, loc + pattern.length()); + if (best_loc != -1) { + score_threshold = Math.min(match_bitapScore(0, best_loc, loc, pattern), + score_threshold); + } + } + + // Initialise the bit arrays. 
    int matchmask = 1 << (pattern.length() - 1);
    best_loc = -1;

    int bin_min, bin_mid;
    int bin_max = pattern.length() + text.length();
    // Empty initialization added to appease Java compiler.
    int[] last_rd = new int[0];
    for (int d = 0; d < pattern.length(); d++) {
      // Scan for the best match; each iteration allows for one more error.
      // Run a binary search to determine how far from 'loc' we can stray at
      // this error level.
      bin_min = 0;
      bin_mid = bin_max;
      while (bin_min < bin_mid) {
        if (match_bitapScore(d, loc + bin_mid, loc, pattern)
            <= score_threshold) {
          bin_min = bin_mid;
        } else {
          bin_max = bin_mid;
        }
        bin_mid = (bin_max - bin_min) / 2 + bin_min;
      }
      // Use the result from this iteration as the maximum for the next.
      bin_max = bin_mid;
      int start = Math.max(1, loc - bin_mid + 1);
      int finish = Math.min(loc + bin_mid, text.length()) + pattern.length();

      int[] rd = new int[finish + 2];
      rd[finish + 1] = (1 << d) - 1;
      for (int j = finish; j >= start; j--) {
        int charMatch;
        if (text.length() <= j - 1 || !s.containsKey(text.charAt(j - 1))) {
          // Out of range.
          charMatch = 0;
        } else {
          charMatch = s.get(text.charAt(j - 1));
        }
        if (d == 0) {
          // First pass: exact match.
          rd[j] = ((rd[j + 1] << 1) | 1) & charMatch;
        } else {
          // Subsequent passes: fuzzy match.
          rd[j] = (((rd[j + 1] << 1) | 1) & charMatch)
              | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1];
        }
        if ((rd[j] & matchmask) != 0) {
          double score = match_bitapScore(d, j - 1, loc, pattern);
          // This match will almost certainly be better than any existing
          // match.  But check anyway.
          if (score <= score_threshold) {
            // Told you so.
            score_threshold = score;
            best_loc = j - 1;
            if (best_loc > loc) {
              // When passing loc, don't exceed our current distance from loc.
              start = Math.max(1, 2 * loc - best_loc);
            } else {
              // Already passed loc, downhill from here on in.
+ break; + } + } + } + } + if (match_bitapScore(d + 1, loc, loc, pattern) > score_threshold) { + // No hope for a (better) match at greater error levels. + break; + } + last_rd = rd; + } + return best_loc; + } + + /** + * Compute and return the score for a match with e errors and x location. + * @param e Number of errors in match. + * @param x Location of match. + * @param loc Expected location of match. + * @param pattern Pattern being sought. + * @return Overall score for match (0.0 = good, 1.0 = bad). + */ + private double match_bitapScore(int e, int x, int loc, String pattern) { + float accuracy = (float) e / pattern.length(); + int proximity = Math.abs(loc - x); + if (Match_Distance == 0) { + // Dodge divide by zero error. + return proximity == 0 ? accuracy : 1.0; + } + return accuracy + (proximity / (float) Match_Distance); + } + + /** + * Initialise the alphabet for the Bitap algorithm. + * @param pattern The text to encode. + * @return Hash of character locations. + */ + protected Map match_alphabet(String pattern) { + Map s = new HashMap(); + char[] char_pattern = pattern.toCharArray(); + for (char c : char_pattern) { + s.put(c, 0); + } + int i = 0; + for (char c : char_pattern) { + s.put(c, s.get(c) | (1 << (pattern.length() - i - 1))); + i++; + } + return s; + } + + + // PATCH FUNCTIONS + + + /** + * Increase the context until it is unique, + * but don't let the pattern expand beyond Match_MaxBits. + * @param patch The patch to grow. + * @param text Source text. + */ + protected void patch_addContext(Patch patch, String text) { + if (text.length() == 0) { + return; + } + String pattern = text.substring(patch.start2, patch.start2 + patch.length1); + int padding = 0; + + // Look for the first and last matches of pattern in text. If two different + // matches are found, increase the pattern length. 
+ while (text.indexOf(pattern) != text.lastIndexOf(pattern) + && pattern.length() < Match_MaxBits - Patch_Margin - Patch_Margin) { + padding += Patch_Margin; + pattern = text.substring(Math.max(0, patch.start2 - padding), + Math.min(text.length(), patch.start2 + patch.length1 + padding)); + } + // Add one chunk for good luck. + padding += Patch_Margin; + + // Add the prefix. + String prefix = text.substring(Math.max(0, patch.start2 - padding), + patch.start2); + if (prefix.length() != 0) { + patch.diffs.addFirst(new Diff(Operation.EQUAL, prefix)); + } + // Add the suffix. + String suffix = text.substring(patch.start2 + patch.length1, + Math.min(text.length(), patch.start2 + patch.length1 + padding)); + if (suffix.length() != 0) { + patch.diffs.addLast(new Diff(Operation.EQUAL, suffix)); + } + + // Roll back the start points. + patch.start1 -= prefix.length(); + patch.start2 -= prefix.length(); + // Extend the lengths. + patch.length1 += prefix.length() + suffix.length(); + patch.length2 += prefix.length() + suffix.length(); + } + + /** + * Compute a list of patches to turn text1 into text2. + * A set of diffs will be computed. + * @param text1 Old text. + * @param text2 New text. + * @return LinkedList of Patch objects. + */ + public LinkedList patch_make(String text1, String text2) { + if (text1 == null || text2 == null) { + throw new IllegalArgumentException("Null inputs. (patch_make)"); + } + // No diffs provided, compute our own. + LinkedList diffs = diff_main(text1, text2, true); + if (diffs.size() > 2) { + diff_cleanupSemantic(diffs); + diff_cleanupEfficiency(diffs); + } + return patch_make(text1, diffs); + } + + /** + * Compute a list of patches to turn text1 into text2. + * text1 will be derived from the provided diffs. + * @param diffs Array of Diff objects for text1 to text2. + * @return LinkedList of Patch objects. + */ + public LinkedList patch_make(LinkedList diffs) { + if (diffs == null) { + throw new IllegalArgumentException("Null inputs. 
(patch_make)"); + } + // No origin string provided, compute our own. + String text1 = diff_text1(diffs); + return patch_make(text1, diffs); + } + + /** + * Compute a list of patches to turn text1 into text2. + * text2 is ignored, diffs are the delta between text1 and text2. + * @param text1 Old text + * @param text2 Ignored. + * @param diffs Array of Diff objects for text1 to text2. + * @return LinkedList of Patch objects. + * @deprecated Prefer patch_make(String text1, LinkedList diffs). + */ + @Deprecated public LinkedList patch_make(String text1, String text2, + LinkedList diffs) { + return patch_make(text1, diffs); + } + + /** + * Compute a list of patches to turn text1 into text2. + * text2 is not provided, diffs are the delta between text1 and text2. + * @param text1 Old text. + * @param diffs Array of Diff objects for text1 to text2. + * @return LinkedList of Patch objects. + */ + public LinkedList patch_make(String text1, LinkedList diffs) { + if (text1 == null || diffs == null) { + throw new IllegalArgumentException("Null inputs. (patch_make)"); + } + + LinkedList patches = new LinkedList(); + if (diffs.isEmpty()) { + return patches; // Get rid of the null case. + } + Patch patch = new Patch(); + int char_count1 = 0; // Number of characters into the text1 string. + int char_count2 = 0; // Number of characters into the text2 string. + // Start with text1 (prepatch_text) and apply the diffs until we arrive at + // text2 (postpatch_text). We recreate the patches one by one to determine + // context info. + String prepatch_text = text1; + String postpatch_text = text1; + for (Diff aDiff : diffs) { + if (patch.diffs.isEmpty() && aDiff.operation != Operation.EQUAL) { + // A new patch starts here. 
        patch.start1 = char_count1;
        patch.start2 = char_count2;
      }

      switch (aDiff.operation) {
      case INSERT:
        patch.diffs.add(aDiff);
        patch.length2 += aDiff.text.length();
        postpatch_text = postpatch_text.substring(0, char_count2)
            + aDiff.text + postpatch_text.substring(char_count2);
        break;
      case DELETE:
        patch.length1 += aDiff.text.length();
        patch.diffs.add(aDiff);
        postpatch_text = postpatch_text.substring(0, char_count2)
            + postpatch_text.substring(char_count2 + aDiff.text.length());
        break;
      case EQUAL:
        if (aDiff.text.length() <= 2 * Patch_Margin
            && !patch.diffs.isEmpty() && aDiff != diffs.getLast()) {
          // Small equality inside a patch.
          patch.diffs.add(aDiff);
          patch.length1 += aDiff.text.length();
          patch.length2 += aDiff.text.length();
        }

        if (aDiff.text.length() >= 2 * Patch_Margin && !patch.diffs.isEmpty()) {
          // Time for a new patch.
          if (!patch.diffs.isEmpty()) {
            patch_addContext(patch, prepatch_text);
            patches.add(patch);
            patch = new Patch();
            // Unlike Unidiff, our patch lists have a rolling context.
            // https://github.com/google/diff-match-patch/wiki/Unidiff
            // Update prepatch text & pos to reflect the application of the
            // just completed patch.
            prepatch_text = postpatch_text;
            char_count1 = char_count2;
          }
        }
        break;
      }

      // Update the current character count.
      if (aDiff.operation != Operation.INSERT) {
        char_count1 += aDiff.text.length();
      }
      if (aDiff.operation != Operation.DELETE) {
        char_count2 += aDiff.text.length();
      }
    }
    // Pick up the leftover patch if not empty.
    if (!patch.diffs.isEmpty()) {
      patch_addContext(patch, prepatch_text);
      patches.add(patch);
    }

    return patches;
  }

  /**
   * Given an array of patches, return another array that is identical.
   * @param patches Array of Patch objects.
   * @return Array of Patch objects.
+ */ + public LinkedList patch_deepCopy(LinkedList patches) { + LinkedList patchesCopy = new LinkedList(); + for (Patch aPatch : patches) { + Patch patchCopy = new Patch(); + for (Diff aDiff : aPatch.diffs) { + Diff diffCopy = new Diff(aDiff.operation, aDiff.text); + patchCopy.diffs.add(diffCopy); + } + patchCopy.start1 = aPatch.start1; + patchCopy.start2 = aPatch.start2; + patchCopy.length1 = aPatch.length1; + patchCopy.length2 = aPatch.length2; + patchesCopy.add(patchCopy); + } + return patchesCopy; + } + + /** + * Merge a set of patches onto the text. Return a patched text, as well + * as an array of true/false values indicating which patches were applied. + * @param patches Array of Patch objects + * @param text Old text. + * @return Two element Object array, containing the new text and an array of + * boolean values. + */ + public Object[] patch_apply(LinkedList patches, String text) { + if (patches.isEmpty()) { + return new Object[]{text, new boolean[0]}; + } + + // Deep copy the patches so that no changes are made to originals. + patches = patch_deepCopy(patches); + + String nullPadding = patch_addPadding(patches); + text = nullPadding + text + nullPadding; + patch_splitMax(patches); + + int x = 0; + // delta keeps track of the offset between the expected and actual location + // of the previous patch. If there are patches expected at positions 10 and + // 20, but the first patch was found at 12, delta is 2 and the second patch + // has an effective expected position of 22. + int delta = 0; + boolean[] results = new boolean[patches.size()]; + for (Patch aPatch : patches) { + int expected_loc = aPatch.start2 + delta; + String text1 = diff_text1(aPatch.diffs); + int start_loc; + int end_loc = -1; + if (text1.length() > this.Match_MaxBits) { + // patch_splitMax will only provide an oversized pattern in the case of + // a monster delete. 
+ start_loc = match_main(text, + text1.substring(0, this.Match_MaxBits), expected_loc); + if (start_loc != -1) { + end_loc = match_main(text, + text1.substring(text1.length() - this.Match_MaxBits), + expected_loc + text1.length() - this.Match_MaxBits); + if (end_loc == -1 || start_loc >= end_loc) { + // Can't find valid trailing context. Drop this patch. + start_loc = -1; + } + } + } else { + start_loc = match_main(text, text1, expected_loc); + } + if (start_loc == -1) { + // No match found. :( + results[x] = false; + // Subtract the delta for this failed patch from subsequent patches. + delta -= aPatch.length2 - aPatch.length1; + } else { + // Found a match. :) + results[x] = true; + delta = start_loc - expected_loc; + String text2; + if (end_loc == -1) { + text2 = text.substring(start_loc, + Math.min(start_loc + text1.length(), text.length())); + } else { + text2 = text.substring(start_loc, + Math.min(end_loc + this.Match_MaxBits, text.length())); + } + if (text1.equals(text2)) { + // Perfect match, just shove the replacement text in. + text = text.substring(0, start_loc) + diff_text2(aPatch.diffs) + + text.substring(start_loc + text1.length()); + } else { + // Imperfect match. Run a diff to get a framework of equivalent + // indices. + LinkedList diffs = diff_main(text1, text2, false); + if (text1.length() > this.Match_MaxBits + && diff_levenshtein(diffs) / (float) text1.length() + > this.Patch_DeleteThreshold) { + // The end points match, but the content is unacceptably bad. 
+ results[x] = false; + } else { + diff_cleanupSemanticLossless(diffs); + int index1 = 0; + for (Diff aDiff : aPatch.diffs) { + if (aDiff.operation != Operation.EQUAL) { + int index2 = diff_xIndex(diffs, index1); + if (aDiff.operation == Operation.INSERT) { + // Insertion + text = text.substring(0, start_loc + index2) + aDiff.text + + text.substring(start_loc + index2); + } else if (aDiff.operation == Operation.DELETE) { + // Deletion + text = text.substring(0, start_loc + index2) + + text.substring(start_loc + diff_xIndex(diffs, + index1 + aDiff.text.length())); + } + } + if (aDiff.operation != Operation.DELETE) { + index1 += aDiff.text.length(); + } + } + } + } + } + x++; + } + // Strip the padding off. + text = text.substring(nullPadding.length(), text.length() + - nullPadding.length()); + return new Object[]{text, results}; + } + + /** + * Add some padding on text start and end so that edges can match something. + * Intended to be called only from within patch_apply. + * @param patches Array of Patch objects. + * @return The padding string added to each side. + */ + public String patch_addPadding(LinkedList patches) { + short paddingLength = this.Patch_Margin; + String nullPadding = ""; + for (short x = 1; x <= paddingLength; x++) { + nullPadding += String.valueOf((char) x); + } + + // Bump all the patches forward. + for (Patch aPatch : patches) { + aPatch.start1 += paddingLength; + aPatch.start2 += paddingLength; + } + + // Add some padding on start of first diff. + Patch patch = patches.getFirst(); + LinkedList diffs = patch.diffs; + if (diffs.isEmpty() || diffs.getFirst().operation != Operation.EQUAL) { + // Add nullPadding equality. + diffs.addFirst(new Diff(Operation.EQUAL, nullPadding)); + patch.start1 -= paddingLength; // Should be 0. + patch.start2 -= paddingLength; // Should be 0. + patch.length1 += paddingLength; + patch.length2 += paddingLength; + } else if (paddingLength > diffs.getFirst().text.length()) { + // Grow first equality. 
+ Diff firstDiff = diffs.getFirst(); + int extraLength = paddingLength - firstDiff.text.length(); + firstDiff.text = nullPadding.substring(firstDiff.text.length()) + + firstDiff.text; + patch.start1 -= extraLength; + patch.start2 -= extraLength; + patch.length1 += extraLength; + patch.length2 += extraLength; + } + + // Add some padding on end of last diff. + patch = patches.getLast(); + diffs = patch.diffs; + if (diffs.isEmpty() || diffs.getLast().operation != Operation.EQUAL) { + // Add nullPadding equality. + diffs.addLast(new Diff(Operation.EQUAL, nullPadding)); + patch.length1 += paddingLength; + patch.length2 += paddingLength; + } else if (paddingLength > diffs.getLast().text.length()) { + // Grow last equality. + Diff lastDiff = diffs.getLast(); + int extraLength = paddingLength - lastDiff.text.length(); + lastDiff.text += nullPadding.substring(0, extraLength); + patch.length1 += extraLength; + patch.length2 += extraLength; + } + + return nullPadding; + } + + /** + * Look through the patches and break up any which are longer than the + * maximum limit of the match algorithm. + * Intended to be called only from within patch_apply. + * @param patches LinkedList of Patch objects. + */ + public void patch_splitMax(LinkedList patches) { + short patch_size = Match_MaxBits; + String precontext, postcontext; + Patch patch; + int start1, start2; + boolean empty; + Operation diff_type; + String diff_text; + ListIterator pointer = patches.listIterator(); + Patch bigpatch = pointer.hasNext() ? pointer.next() : null; + while (bigpatch != null) { + if (bigpatch.length1 <= Match_MaxBits) { + bigpatch = pointer.hasNext() ? pointer.next() : null; + continue; + } + // Remove the big old patch. + pointer.remove(); + start1 = bigpatch.start1; + start2 = bigpatch.start2; + precontext = ""; + while (!bigpatch.diffs.isEmpty()) { + // Create one of several smaller patches. 
+ patch = new Patch(); + empty = true; + patch.start1 = start1 - precontext.length(); + patch.start2 = start2 - precontext.length(); + if (precontext.length() != 0) { + patch.length1 = patch.length2 = precontext.length(); + patch.diffs.add(new Diff(Operation.EQUAL, precontext)); + } + while (!bigpatch.diffs.isEmpty() + && patch.length1 < patch_size - Patch_Margin) { + diff_type = bigpatch.diffs.getFirst().operation; + diff_text = bigpatch.diffs.getFirst().text; + if (diff_type == Operation.INSERT) { + // Insertions are harmless. + patch.length2 += diff_text.length(); + start2 += diff_text.length(); + patch.diffs.addLast(bigpatch.diffs.removeFirst()); + empty = false; + } else if (diff_type == Operation.DELETE && patch.diffs.size() == 1 + && patch.diffs.getFirst().operation == Operation.EQUAL + && diff_text.length() > 2 * patch_size) { + // This is a large deletion. Let it pass in one chunk. + patch.length1 += diff_text.length(); + start1 += diff_text.length(); + empty = false; + patch.diffs.add(new Diff(diff_type, diff_text)); + bigpatch.diffs.removeFirst(); + } else { + // Deletion or equality. Only take as much as we can stomach. + diff_text = diff_text.substring(0, Math.min(diff_text.length(), + patch_size - patch.length1 - Patch_Margin)); + patch.length1 += diff_text.length(); + start1 += diff_text.length(); + if (diff_type == Operation.EQUAL) { + patch.length2 += diff_text.length(); + start2 += diff_text.length(); + } else { + empty = false; + } + patch.diffs.add(new Diff(diff_type, diff_text)); + if (diff_text.equals(bigpatch.diffs.getFirst().text)) { + bigpatch.diffs.removeFirst(); + } else { + bigpatch.diffs.getFirst().text = bigpatch.diffs.getFirst().text + .substring(diff_text.length()); + } + } + } + // Compute the head context for the next patch. + precontext = diff_text2(patch.diffs); + precontext = precontext.substring(Math.max(0, precontext.length() + - Patch_Margin)); + // Append the end context for this patch. 
+ if (diff_text1(bigpatch.diffs).length() > Patch_Margin) { + postcontext = diff_text1(bigpatch.diffs).substring(0, Patch_Margin); + } else { + postcontext = diff_text1(bigpatch.diffs); + } + if (postcontext.length() != 0) { + patch.length1 += postcontext.length(); + patch.length2 += postcontext.length(); + if (!patch.diffs.isEmpty() + && patch.diffs.getLast().operation == Operation.EQUAL) { + patch.diffs.getLast().text += postcontext; + } else { + patch.diffs.add(new Diff(Operation.EQUAL, postcontext)); + } + } + if (!empty) { + pointer.add(patch); + } + } + bigpatch = pointer.hasNext() ? pointer.next() : null; + } + } + + /** + * Take a list of patches and return a textual representation. + * @param patches List of Patch objects. + * @return Text representation of patches. + */ + public String patch_toText(List patches) { + StringBuilder text = new StringBuilder(); + for (Patch aPatch : patches) { + text.append(aPatch); + } + return text.toString(); + } + + /** + * Parse a textual representation of patches and return a List of Patch + * objects. + * @param textline Text representation of patches. + * @return List of Patch objects. + * @throws IllegalArgumentException If invalid input. 
+ */ + public List patch_fromText(String textline) + throws IllegalArgumentException { + List patches = new LinkedList(); + if (textline.length() == 0) { + return patches; + } + List textList = Arrays.asList(textline.split("\n")); + LinkedList text = new LinkedList(textList); + Patch patch; + Pattern patchHeader + = Pattern.compile("^@@ -(\\d+),?(\\d*) \\+(\\d+),?(\\d*) @@$"); + Matcher m; + char sign; + String line; + while (!text.isEmpty()) { + m = patchHeader.matcher(text.getFirst()); + if (!m.matches()) { + throw new IllegalArgumentException( + "Invalid patch string: " + text.getFirst()); + } + patch = new Patch(); + patches.add(patch); + patch.start1 = Integer.parseInt(m.group(1)); + if (m.group(2).length() == 0) { + patch.start1--; + patch.length1 = 1; + } else if (m.group(2).equals("0")) { + patch.length1 = 0; + } else { + patch.start1--; + patch.length1 = Integer.parseInt(m.group(2)); + } + + patch.start2 = Integer.parseInt(m.group(3)); + if (m.group(4).length() == 0) { + patch.start2--; + patch.length2 = 1; + } else if (m.group(4).equals("0")) { + patch.length2 = 0; + } else { + patch.start2--; + patch.length2 = Integer.parseInt(m.group(4)); + } + text.removeFirst(); + + while (!text.isEmpty()) { + try { + sign = text.getFirst().charAt(0); + } catch (IndexOutOfBoundsException e) { + // Blank line? Whatever. + text.removeFirst(); + continue; + } + line = text.getFirst().substring(1); + line = line.replace("+", "%2B"); // decode would change all "+" to " " + try { + line = URLDecoder.decode(line, "UTF-8"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. + throw new Error("This system does not support UTF-8.", e); + } catch (IllegalArgumentException e) { + // Malformed URI sequence. + throw new IllegalArgumentException( + "Illegal escape in patch_fromText: " + line, e); + } + if (sign == '-') { + // Deletion. + patch.diffs.add(new Diff(Operation.DELETE, line)); + } else if (sign == '+') { + // Insertion. 
+ patch.diffs.add(new Diff(Operation.INSERT, line)); + } else if (sign == ' ') { + // Minor equality. + patch.diffs.add(new Diff(Operation.EQUAL, line)); + } else if (sign == '@') { + // Start of next patch. + break; + } else { + // WTF? + throw new IllegalArgumentException( + "Invalid patch mode '" + sign + "' in: " + line); + } + text.removeFirst(); + } + } + return patches; + } + + + /** + * Class representing one diff operation. + */ + public static class Diff { + /** + * One of: INSERT, DELETE or EQUAL. + */ + public Operation operation; + /** + * The text associated with this diff operation. + */ + public String text; + + /** + * Constructor. Initializes the diff with the provided values. + * @param operation One of INSERT, DELETE or EQUAL. + * @param text The text being applied. + */ + public Diff(Operation operation, String text) { + // Construct a diff with the specified operation and text. + this.operation = operation; + this.text = text; + } + + /** + * Display a human-readable version of this Diff. + * @return text version. + */ + public String toString() { + String prettyText = this.text.replace('\n', '\u00b6'); + return "Diff(" + this.operation + ",\"" + prettyText + "\")"; + } + + /** + * Create a numeric hash value for a Diff. + * This function is not used by DMP. + * @return Hash value. + */ + @Override + public int hashCode() { + final int prime = 31; + int result = (operation == null) ? 0 : operation.hashCode(); + result += prime * ((text == null) ? 0 : text.hashCode()); + return result; + } + + /** + * Is this Diff equivalent to another Diff? + * @param obj Another Diff to compare against. + * @return true or false. 
+ */ + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + Diff other = (Diff) obj; + if (operation != other.operation) { + return false; + } + if (text == null) { + if (other.text != null) { + return false; + } + } else if (!text.equals(other.text)) { + return false; + } + return true; + } + } + + + /** + * Class representing one patch operation. + */ + public static class Patch { + public LinkedList diffs; + public int start1; + public int start2; + public int length1; + public int length2; + + /** + * Constructor. Initializes with an empty list of diffs. + */ + public Patch() { + this.diffs = new LinkedList(); + } + + /** + * Emulate GNU diff's format. + * Header: @@ -382,8 +481,9 @@ + * Indices are printed as 1-based, not 0-based. + * @return The GNU diff string. + */ + public String toString() { + String coords1, coords2; + if (this.length1 == 0) { + coords1 = this.start1 + ",0"; + } else if (this.length1 == 1) { + coords1 = Integer.toString(this.start1 + 1); + } else { + coords1 = (this.start1 + 1) + "," + this.length1; + } + if (this.length2 == 0) { + coords2 = this.start2 + ",0"; + } else if (this.length2 == 1) { + coords2 = Integer.toString(this.start2 + 1); + } else { + coords2 = (this.start2 + 1) + "," + this.length2; + } + StringBuilder text = new StringBuilder(); + text.append("@@ -").append(coords1).append(" +").append(coords2) + .append(" @@\n"); + // Escape the body of the patch with %xx notation. + for (Diff aDiff : this.diffs) { + switch (aDiff.operation) { + case INSERT: + text.append('+'); + break; + case DELETE: + text.append('-'); + break; + case EQUAL: + text.append(' '); + break; + } + try { + text.append(URLEncoder.encode(aDiff.text, "UTF-8").replace('+', ' ')) + .append("\n"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. 
+ throw new Error("This system does not support UTF-8.", e); + } + } + return unescapeForEncodeUriCompatability(text.toString()); + } + } + + /** + * Unescape selected chars for compatability with JavaScript's encodeURI. + * In speed critical applications this could be dropped since the + * receiving application will certainly decode these fine. + * Note that this function is case-sensitive. Thus "%3f" would not be + * unescaped. But this is ok because it is only called with the output of + * URLEncoder.encode which returns uppercase hex. + * + * Example: "%3F" -> "?", "%24" -> "$", etc. + * + * @param str The string to escape. + * @return The escaped string. + */ + private static String unescapeForEncodeUriCompatability(String str) { + return str.replace("%21", "!").replace("%7E", "~") + .replace("%27", "'").replace("%28", "(").replace("%29", ")") + .replace("%3B", ";").replace("%2F", "/").replace("%3F", "?") + .replace("%3A", ":").replace("%40", "@").replace("%26", "&") + .replace("%3D", "=").replace("%2B", "+").replace("%24", "$") + .replace("%2C", ",").replace("%23", "#"); + } +} diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/matching/DiffMatchPatchTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/matching/DiffMatchPatchTest.java new file mode 100644 index 0000000000..dfad9ba5cf --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/utilities/matching/DiffMatchPatchTest.java @@ -0,0 +1,988 @@ +/* + * Diff Match and Patch -- Test harness + * Copyright 2018 The diff-match-patch Authors. + * https://github.com/google/diff-match-patch + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Compile from diff-match-patch/java with: + * javac -d classes src/name/fraser/neil/plaintext/diff_match_patch.java tests/name/fraser/neil/plaintext/diff_match_patch_test.java + * Execute with: + * java -classpath classes name/fraser/neil/plaintext/diff_match_patch_test + */ + +package org.grobid.core.utilities.matching; + +import java.util.*; + +public class DiffMatchPatchTest { + + private static DiffMatchPatch dmp; + private static DiffMatchPatch.Operation DELETE = DiffMatchPatch.Operation.DELETE; + private static DiffMatchPatch.Operation EQUAL = DiffMatchPatch.Operation.EQUAL; + private static DiffMatchPatch.Operation INSERT = DiffMatchPatch.Operation.INSERT; + + + // DIFF TEST FUNCTIONS + + + public static void testDiffCommonPrefix() { + // Detect any common prefix. + assertEquals("diff_commonPrefix: Null case.", 0, dmp.diff_commonPrefix("abc", "xyz")); + + assertEquals("diff_commonPrefix: Non-null case.", 4, dmp.diff_commonPrefix("1234abcdef", "1234xyz")); + + assertEquals("diff_commonPrefix: Whole case.", 4, dmp.diff_commonPrefix("1234", "1234xyz")); + } + + public static void testDiffCommonSuffix() { + // Detect any common suffix. + assertEquals("diff_commonSuffix: Null case.", 0, dmp.diff_commonSuffix("abc", "xyz")); + + assertEquals("diff_commonSuffix: Non-null case.", 4, dmp.diff_commonSuffix("abcdef1234", "xyz1234")); + + assertEquals("diff_commonSuffix: Whole case.", 4, dmp.diff_commonSuffix("1234", "xyz1234")); + } + + public static void testDiffCommonOverlap() { + // Detect any suffix/prefix overlap. 
+ assertEquals("diff_commonOverlap: Null case.", 0, dmp.diff_commonOverlap("", "abcd")); + + assertEquals("diff_commonOverlap: Whole case.", 3, dmp.diff_commonOverlap("abc", "abcd")); + + assertEquals("diff_commonOverlap: No overlap.", 0, dmp.diff_commonOverlap("123456", "abcd")); + + assertEquals("diff_commonOverlap: Overlap.", 3, dmp.diff_commonOverlap("123456xxx", "xxxabcd")); + + // Some overly clever languages (C#) may treat ligatures as equal to their + // component letters. E.g. U+FB01 == 'fi' + assertEquals("diff_commonOverlap: Unicode.", 0, dmp.diff_commonOverlap("fi", "\ufb01i")); + } + + public static void testDiffHalfmatch() { + // Detect a halfmatch. + dmp.Diff_Timeout = 1; + assertNull("diff_halfMatch: No match #1.", dmp.diff_halfMatch("1234567890", "abcdef")); + + assertNull("diff_halfMatch: No match #2.", dmp.diff_halfMatch("12345", "23")); + + assertArrayEquals("diff_halfMatch: Single Match #1.", new String[]{"12", "90", "a", "z", "345678"}, dmp.diff_halfMatch("1234567890", "a345678z")); + + assertArrayEquals("diff_halfMatch: Single Match #2.", new String[]{"a", "z", "12", "90", "345678"}, dmp.diff_halfMatch("a345678z", "1234567890")); + + assertArrayEquals("diff_halfMatch: Single Match #3.", new String[]{"abc", "z", "1234", "0", "56789"}, dmp.diff_halfMatch("abc56789z", "1234567890")); + + assertArrayEquals("diff_halfMatch: Single Match #4.", new String[]{"a", "xyz", "1", "7890", "23456"}, dmp.diff_halfMatch("a23456xyz", "1234567890")); + + assertArrayEquals("diff_halfMatch: Multiple Matches #1.", new String[]{"12123", "123121", "a", "z", "1234123451234"}, dmp.diff_halfMatch("121231234123451234123121", "a1234123451234z")); + + assertArrayEquals("diff_halfMatch: Multiple Matches #2.", new String[]{"", "-=-=-=-=-=", "x", "", "x-=-=-=-=-=-=-="}, dmp.diff_halfMatch("x-=-=-=-=-=-=-=-=-=-=-=-=", "xx-=-=-=-=-=-=-=")); + + assertArrayEquals("diff_halfMatch: Multiple Matches #3.", new String[]{"-=-=-=-=-=", "", "", "y", "-=-=-=-=-=-=-=y"}, 
dmp.diff_halfMatch("-=-=-=-=-=-=-=-=-=-=-=-=y", "-=-=-=-=-=-=-=yy")); + + // Optimal diff would be -q+x=H-i+e=lloHe+Hu=llo-Hew+y not -qHillo+x=HelloHe-w+Hulloy + assertArrayEquals("diff_halfMatch: Non-optimal halfmatch.", new String[]{"qHillo", "w", "x", "Hulloy", "HelloHe"}, dmp.diff_halfMatch("qHilloHelloHew", "xHelloHeHulloy")); + + dmp.Diff_Timeout = 0; + assertNull("diff_halfMatch: Optimal no halfmatch.", dmp.diff_halfMatch("qHilloHelloHew", "xHelloHeHulloy")); + } + + public static void testDiffLinesToChars() { + // Convert lines down to characters. + ArrayList tmpVector = new ArrayList(); + tmpVector.add(""); + tmpVector.add("alpha\n"); + tmpVector.add("beta\n"); + assertLinesToCharsResultEquals("diff_linesToChars: Shared lines.", new DiffMatchPatch.LinesToCharsResult("\u0001\u0002\u0001", "\u0002\u0001\u0002", tmpVector), dmp.diff_linesToChars("alpha\nbeta\nalpha\n", "beta\nalpha\nbeta\n")); + + tmpVector.clear(); + tmpVector.add(""); + tmpVector.add("alpha\r\n"); + tmpVector.add("beta\r\n"); + tmpVector.add("\r\n"); + assertLinesToCharsResultEquals("diff_linesToChars: Empty string and blank lines.", new DiffMatchPatch.LinesToCharsResult("", "\u0001\u0002\u0003\u0003", tmpVector), dmp.diff_linesToChars("", "alpha\r\nbeta\r\n\r\n\r\n")); + + tmpVector.clear(); + tmpVector.add(""); + tmpVector.add("a"); + tmpVector.add("b"); + assertLinesToCharsResultEquals("diff_linesToChars: No linebreaks.", new DiffMatchPatch.LinesToCharsResult("\u0001", "\u0002", tmpVector), dmp.diff_linesToChars("a", "b")); + + // More than 256 to reveal any 8-bit limitations. 
+ int n = 300; + tmpVector.clear(); + StringBuilder lineList = new StringBuilder(); + StringBuilder charList = new StringBuilder(); + for (int i = 1; i < n + 1; i++) { + tmpVector.add(i + "\n"); + lineList.append(i + "\n"); + charList.append(String.valueOf((char) i)); + } + assertEquals("Test initialization fail #1.", n, tmpVector.size()); + String lines = lineList.toString(); + String chars = charList.toString(); + assertEquals("Test initialization fail #2.", n, chars.length()); + tmpVector.add(0, ""); + assertLinesToCharsResultEquals("diff_linesToChars: More than 256.", new DiffMatchPatch.LinesToCharsResult(chars, "", tmpVector), dmp.diff_linesToChars(lines, "")); + } + + public static void testDiffCharsToLines() { + // First check that Diff equality works. + assertTrue("diff_charsToLines: Equality #1.", new DiffMatchPatch.Diff(EQUAL, "a").equals(new DiffMatchPatch.Diff(EQUAL, "a"))); + + assertEquals("diff_charsToLines: Equality #2.", new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(EQUAL, "a")); + + // Convert chars up to lines. + LinkedList diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "\u0001\u0002\u0001"), new DiffMatchPatch.Diff(INSERT, "\u0002\u0001\u0002")); + ArrayList tmpVector = new ArrayList(); + tmpVector.add(""); + tmpVector.add("alpha\n"); + tmpVector.add("beta\n"); + dmp.diff_charsToLines(diffs, tmpVector); + assertEquals("diff_charsToLines: Shared lines.", diffList(new DiffMatchPatch.Diff(EQUAL, "alpha\nbeta\nalpha\n"), new DiffMatchPatch.Diff(INSERT, "beta\nalpha\nbeta\n")), diffs); + + // More than 256 to reveal any 8-bit limitations. 
+ int n = 300; + tmpVector.clear(); + StringBuilder lineList = new StringBuilder(); + StringBuilder charList = new StringBuilder(); + for (int i = 1; i < n + 1; i++) { + tmpVector.add(i + "\n"); + lineList.append(i + "\n"); + charList.append(String.valueOf((char) i)); + } + assertEquals("Test initialization fail #3.", n, tmpVector.size()); + String lines = lineList.toString(); + String chars = charList.toString(); + assertEquals("Test initialization fail #4.", n, chars.length()); + tmpVector.add(0, ""); + diffs = diffList(new DiffMatchPatch.Diff(DELETE, chars)); + dmp.diff_charsToLines(diffs, tmpVector); + assertEquals("diff_charsToLines: More than 256.", diffList(new DiffMatchPatch.Diff(DELETE, lines)), diffs); + + // More than 65536 to verify any 16-bit limitation. + lineList = new StringBuilder(); + for (int i = 0; i < 66000; i++) { + lineList.append(i + "\n"); + } + chars = lineList.toString(); + DiffMatchPatch.LinesToCharsResult results = dmp.diff_linesToChars(chars, ""); + diffs = diffList(new DiffMatchPatch.Diff(INSERT, results.chars1)); + dmp.diff_charsToLines(diffs, results.lineArray); + assertEquals("diff_charsToLines: More than 65536.", chars, diffs.getFirst().text); + } + + public static void testDiffCleanupMerge() { + // Cleanup a messy diff. 
+ LinkedList diffs = diffList(); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Null case.", diffList(), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "b"), new DiffMatchPatch.Diff(INSERT, "c")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: No change case.", diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "b"), new DiffMatchPatch.Diff(INSERT, "c")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(EQUAL, "b"), new DiffMatchPatch.Diff(EQUAL, "c")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Merge equalities.", diffList(new DiffMatchPatch.Diff(EQUAL, "abc")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(DELETE, "b"), new DiffMatchPatch.Diff(DELETE, "c")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Merge deletions.", diffList(new DiffMatchPatch.Diff(DELETE, "abc")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(INSERT, "a"), new DiffMatchPatch.Diff(INSERT, "b"), new DiffMatchPatch.Diff(INSERT, "c")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Merge insertions.", diffList(new DiffMatchPatch.Diff(INSERT, "abc")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(INSERT, "b"), new DiffMatchPatch.Diff(DELETE, "c"), new DiffMatchPatch.Diff(INSERT, "d"), new DiffMatchPatch.Diff(EQUAL, "e"), new DiffMatchPatch.Diff(EQUAL, "f")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Merge interweave.", diffList(new DiffMatchPatch.Diff(DELETE, "ac"), new DiffMatchPatch.Diff(INSERT, "bd"), new DiffMatchPatch.Diff(EQUAL, "ef")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(INSERT, "abc"), new DiffMatchPatch.Diff(DELETE, "dc")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: 
Prefix and suffix detection.", diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "d"), new DiffMatchPatch.Diff(INSERT, "b"), new DiffMatchPatch.Diff(EQUAL, "c")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "x"), new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(INSERT, "abc"), new DiffMatchPatch.Diff(DELETE, "dc"), new DiffMatchPatch.Diff(EQUAL, "y")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Prefix and suffix detection with equalities.", diffList(new DiffMatchPatch.Diff(EQUAL, "xa"), new DiffMatchPatch.Diff(DELETE, "d"), new DiffMatchPatch.Diff(INSERT, "b"), new DiffMatchPatch.Diff(EQUAL, "cy")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(INSERT, "ba"), new DiffMatchPatch.Diff(EQUAL, "c")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Slide edit left.", diffList(new DiffMatchPatch.Diff(INSERT, "ab"), new DiffMatchPatch.Diff(EQUAL, "ac")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "c"), new DiffMatchPatch.Diff(INSERT, "ab"), new DiffMatchPatch.Diff(EQUAL, "a")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Slide edit right.", diffList(new DiffMatchPatch.Diff(EQUAL, "ca"), new DiffMatchPatch.Diff(INSERT, "ba")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "b"), new DiffMatchPatch.Diff(EQUAL, "c"), new DiffMatchPatch.Diff(DELETE, "ac"), new DiffMatchPatch.Diff(EQUAL, "x")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Slide edit left recursive.", diffList(new DiffMatchPatch.Diff(DELETE, "abc"), new DiffMatchPatch.Diff(EQUAL, "acx")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "x"), new DiffMatchPatch.Diff(DELETE, "ca"), new DiffMatchPatch.Diff(EQUAL, "c"), new DiffMatchPatch.Diff(DELETE, "b"), new DiffMatchPatch.Diff(EQUAL, "a")); + dmp.diff_cleanupMerge(diffs); + 
assertEquals("diff_cleanupMerge: Slide edit right recursive.", diffList(new DiffMatchPatch.Diff(EQUAL, "xca"), new DiffMatchPatch.Diff(DELETE, "cba")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "b"), new DiffMatchPatch.Diff(INSERT, "ab"), new DiffMatchPatch.Diff(EQUAL, "c")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Empty merge.", diffList(new DiffMatchPatch.Diff(INSERT, "a"), new DiffMatchPatch.Diff(EQUAL, "bc")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, ""), new DiffMatchPatch.Diff(INSERT, "a"), new DiffMatchPatch.Diff(EQUAL, "b")); + dmp.diff_cleanupMerge(diffs); + assertEquals("diff_cleanupMerge: Empty equality.", diffList(new DiffMatchPatch.Diff(INSERT, "a"), new DiffMatchPatch.Diff(EQUAL, "b")), diffs); + } + + public static void testDiffCleanupSemanticLossless() { + // Slide diffs to match logical boundaries. + LinkedList diffs = diffList(); + dmp.diff_cleanupSemanticLossless(diffs); + assertEquals("diff_cleanupSemanticLossless: Null case.", diffList(), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "AAA\r\n\r\nBBB"), new DiffMatchPatch.Diff(INSERT, "\r\nDDD\r\n\r\nBBB"), new DiffMatchPatch.Diff(EQUAL, "\r\nEEE")); + dmp.diff_cleanupSemanticLossless(diffs); + assertEquals("diff_cleanupSemanticLossless: Blank lines.", diffList(new DiffMatchPatch.Diff(EQUAL, "AAA\r\n\r\n"), new DiffMatchPatch.Diff(INSERT, "BBB\r\nDDD\r\n\r\n"), new DiffMatchPatch.Diff(EQUAL, "BBB\r\nEEE")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "AAA\r\nBBB"), new DiffMatchPatch.Diff(INSERT, " DDD\r\nBBB"), new DiffMatchPatch.Diff(EQUAL, " EEE")); + dmp.diff_cleanupSemanticLossless(diffs); + assertEquals("diff_cleanupSemanticLossless: Line boundaries.", diffList(new DiffMatchPatch.Diff(EQUAL, "AAA\r\n"), new DiffMatchPatch.Diff(INSERT, "BBB DDD\r\n"), new DiffMatchPatch.Diff(EQUAL, "BBB EEE")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "The c"), new DiffMatchPatch.Diff(INSERT, 
"ow and the c"), new DiffMatchPatch.Diff(EQUAL, "at.")); + dmp.diff_cleanupSemanticLossless(diffs); + assertEquals("diff_cleanupSemanticLossless: Word boundaries.", diffList(new DiffMatchPatch.Diff(EQUAL, "The "), new DiffMatchPatch.Diff(INSERT, "cow and the "), new DiffMatchPatch.Diff(EQUAL, "cat.")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "The-c"), new DiffMatchPatch.Diff(INSERT, "ow-and-the-c"), new DiffMatchPatch.Diff(EQUAL, "at.")); + dmp.diff_cleanupSemanticLossless(diffs); + assertEquals("diff_cleanupSemanticLossless: Alphanumeric boundaries.", diffList(new DiffMatchPatch.Diff(EQUAL, "The-"), new DiffMatchPatch.Diff(INSERT, "cow-and-the-"), new DiffMatchPatch.Diff(EQUAL, "cat.")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(EQUAL, "ax")); + dmp.diff_cleanupSemanticLossless(diffs); + assertEquals("diff_cleanupSemanticLossless: Hitting the start.", diffList(new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(EQUAL, "aax")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "xa"), new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(EQUAL, "a")); + dmp.diff_cleanupSemanticLossless(diffs); + assertEquals("diff_cleanupSemanticLossless: Hitting the end.", diffList(new DiffMatchPatch.Diff(EQUAL, "xaa"), new DiffMatchPatch.Diff(DELETE, "a")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "The xxx. The "), new DiffMatchPatch.Diff(INSERT, "zzz. The "), new DiffMatchPatch.Diff(EQUAL, "yyy.")); + dmp.diff_cleanupSemanticLossless(diffs); + assertEquals("diff_cleanupSemanticLossless: Sentence boundaries.", diffList(new DiffMatchPatch.Diff(EQUAL, "The xxx."), new DiffMatchPatch.Diff(INSERT, " The zzz."), new DiffMatchPatch.Diff(EQUAL, " The yyy.")), diffs); + } + + public static void testDiffCleanupSemantic() { + // Cleanup semantically trivial equalities. 
+ LinkedList diffs = diffList(); + dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: Null case.", diffList(), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "ab"), new DiffMatchPatch.Diff(INSERT, "cd"), new DiffMatchPatch.Diff(EQUAL, "12"), new DiffMatchPatch.Diff(DELETE, "e")); + dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: No elimination #1.", diffList(new DiffMatchPatch.Diff(DELETE, "ab"), new DiffMatchPatch.Diff(INSERT, "cd"), new DiffMatchPatch.Diff(EQUAL, "12"), new DiffMatchPatch.Diff(DELETE, "e")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "abc"), new DiffMatchPatch.Diff(INSERT, "ABC"), new DiffMatchPatch.Diff(EQUAL, "1234"), new DiffMatchPatch.Diff(DELETE, "wxyz")); + dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: No elimination #2.", diffList(new DiffMatchPatch.Diff(DELETE, "abc"), new DiffMatchPatch.Diff(INSERT, "ABC"), new DiffMatchPatch.Diff(EQUAL, "1234"), new DiffMatchPatch.Diff(DELETE, "wxyz")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(EQUAL, "b"), new DiffMatchPatch.Diff(DELETE, "c")); + dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: Simple elimination.", diffList(new DiffMatchPatch.Diff(DELETE, "abc"), new DiffMatchPatch.Diff(INSERT, "b")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "ab"), new DiffMatchPatch.Diff(EQUAL, "cd"), new DiffMatchPatch.Diff(DELETE, "e"), new DiffMatchPatch.Diff(EQUAL, "f"), new DiffMatchPatch.Diff(INSERT, "g")); + dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: Backpass elimination.", diffList(new DiffMatchPatch.Diff(DELETE, "abcdef"), new DiffMatchPatch.Diff(INSERT, "cdfg")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(INSERT, "1"), new DiffMatchPatch.Diff(EQUAL, "A"), new DiffMatchPatch.Diff(DELETE, "B"), new DiffMatchPatch.Diff(INSERT, "2"), new DiffMatchPatch.Diff(EQUAL, "_"), new 
DiffMatchPatch.Diff(INSERT, "1"), new DiffMatchPatch.Diff(EQUAL, "A"), new DiffMatchPatch.Diff(DELETE, "B"), new DiffMatchPatch.Diff(INSERT, "2")); + dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: Multiple elimination.", diffList(new DiffMatchPatch.Diff(DELETE, "AB_AB"), new DiffMatchPatch.Diff(INSERT, "1A2_1A2")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "The c"), new DiffMatchPatch.Diff(DELETE, "ow and the c"), new DiffMatchPatch.Diff(EQUAL, "at.")); + dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: Word boundaries.", diffList(new DiffMatchPatch.Diff(EQUAL, "The "), new DiffMatchPatch.Diff(DELETE, "cow and the "), new DiffMatchPatch.Diff(EQUAL, "cat.")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "abcxx"), new DiffMatchPatch.Diff(INSERT, "xxdef")); + dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: No overlap elimination.", diffList(new DiffMatchPatch.Diff(DELETE, "abcxx"), new DiffMatchPatch.Diff(INSERT, "xxdef")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "abcxxx"), new DiffMatchPatch.Diff(INSERT, "xxxdef")); + dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: Overlap elimination.", diffList(new DiffMatchPatch.Diff(DELETE, "abc"), new DiffMatchPatch.Diff(EQUAL, "xxx"), new DiffMatchPatch.Diff(INSERT, "def")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "xxxabc"), new DiffMatchPatch.Diff(INSERT, "defxxx")); + dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: Reverse overlap elimination.", diffList(new DiffMatchPatch.Diff(INSERT, "def"), new DiffMatchPatch.Diff(EQUAL, "xxx"), new DiffMatchPatch.Diff(DELETE, "abc")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "abcd1212"), new DiffMatchPatch.Diff(INSERT, "1212efghi"), new DiffMatchPatch.Diff(EQUAL, "----"), new DiffMatchPatch.Diff(DELETE, "A3"), new DiffMatchPatch.Diff(INSERT, "3BC")); + 
dmp.diff_cleanupSemantic(diffs); + assertEquals("diff_cleanupSemantic: Two overlap eliminations.", diffList(new DiffMatchPatch.Diff(DELETE, "abcd"), new DiffMatchPatch.Diff(EQUAL, "1212"), new DiffMatchPatch.Diff(INSERT, "efghi"), new DiffMatchPatch.Diff(EQUAL, "----"), new DiffMatchPatch.Diff(DELETE, "A"), new DiffMatchPatch.Diff(EQUAL, "3"), new DiffMatchPatch.Diff(INSERT, "BC")), diffs); + } + + public static void testDiffCleanupEfficiency() { + // Cleanup operationally trivial equalities. + dmp.Diff_EditCost = 4; + LinkedList diffs = diffList(); + dmp.diff_cleanupEfficiency(diffs); + assertEquals("diff_cleanupEfficiency: Null case.", diffList(), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "ab"), new DiffMatchPatch.Diff(INSERT, "12"), new DiffMatchPatch.Diff(EQUAL, "wxyz"), new DiffMatchPatch.Diff(DELETE, "cd"), new DiffMatchPatch.Diff(INSERT, "34")); + dmp.diff_cleanupEfficiency(diffs); + assertEquals("diff_cleanupEfficiency: No elimination.", diffList(new DiffMatchPatch.Diff(DELETE, "ab"), new DiffMatchPatch.Diff(INSERT, "12"), new DiffMatchPatch.Diff(EQUAL, "wxyz"), new DiffMatchPatch.Diff(DELETE, "cd"), new DiffMatchPatch.Diff(INSERT, "34")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "ab"), new DiffMatchPatch.Diff(INSERT, "12"), new DiffMatchPatch.Diff(EQUAL, "xyz"), new DiffMatchPatch.Diff(DELETE, "cd"), new DiffMatchPatch.Diff(INSERT, "34")); + dmp.diff_cleanupEfficiency(diffs); + assertEquals("diff_cleanupEfficiency: Four-edit elimination.", diffList(new DiffMatchPatch.Diff(DELETE, "abxyzcd"), new DiffMatchPatch.Diff(INSERT, "12xyz34")), diffs); + + diffs = diffList(new DiffMatchPatch.Diff(INSERT, "12"), new DiffMatchPatch.Diff(EQUAL, "x"), new DiffMatchPatch.Diff(DELETE, "cd"), new DiffMatchPatch.Diff(INSERT, "34")); + dmp.diff_cleanupEfficiency(diffs); + assertEquals("diff_cleanupEfficiency: Three-edit elimination.", diffList(new DiffMatchPatch.Diff(DELETE, "xcd"), new DiffMatchPatch.Diff(INSERT, "12x34")), diffs); + 
+ diffs = diffList(new DiffMatchPatch.Diff(DELETE, "ab"), new DiffMatchPatch.Diff(INSERT, "12"), new DiffMatchPatch.Diff(EQUAL, "xy"), new DiffMatchPatch.Diff(INSERT, "34"), new DiffMatchPatch.Diff(EQUAL, "z"), new DiffMatchPatch.Diff(DELETE, "cd"), new DiffMatchPatch.Diff(INSERT, "56")); + dmp.diff_cleanupEfficiency(diffs); + assertEquals("diff_cleanupEfficiency: Backpass elimination.", diffList(new DiffMatchPatch.Diff(DELETE, "abxyzcd"), new DiffMatchPatch.Diff(INSERT, "12xy34z56")), diffs); + + dmp.Diff_EditCost = 5; + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "ab"), new DiffMatchPatch.Diff(INSERT, "12"), new DiffMatchPatch.Diff(EQUAL, "wxyz"), new DiffMatchPatch.Diff(DELETE, "cd"), new DiffMatchPatch.Diff(INSERT, "34")); + dmp.diff_cleanupEfficiency(diffs); + assertEquals("diff_cleanupEfficiency: High cost elimination.", diffList(new DiffMatchPatch.Diff(DELETE, "abwxyzcd"), new DiffMatchPatch.Diff(INSERT, "12wxyz34")), diffs); + dmp.Diff_EditCost = 4; + } + + public static void testDiffPrettyHtml() { + // Pretty print. + LinkedList diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "a\n"), new DiffMatchPatch.Diff(DELETE, "b"), new DiffMatchPatch.Diff(INSERT, "c&d")); + assertEquals("diff_prettyHtml:", "
<B>b</B>c&d", dmp.diff_prettyHtml(diffs)); + } + + public static void testDiffText() { + // Compute the source and destination texts. + LinkedList diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "jump"), new DiffMatchPatch.Diff(DELETE, "s"), new DiffMatchPatch.Diff(INSERT, "ed"), new DiffMatchPatch.Diff(EQUAL, " over "), new DiffMatchPatch.Diff(DELETE, "the"), new DiffMatchPatch.Diff(INSERT, "a"), new DiffMatchPatch.Diff(EQUAL, " lazy")); + assertEquals("diff_text1:", "jumps over the lazy", dmp.diff_text1(diffs)); + assertEquals("diff_text2:", "jumped over a lazy", dmp.diff_text2(diffs)); + } + + public static void testDiffDelta() { + // Convert a diff into delta string. + LinkedList diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "jump"), new DiffMatchPatch.Diff(DELETE, "s"), new DiffMatchPatch.Diff(INSERT, "ed"), new DiffMatchPatch.Diff(EQUAL, " over "), new DiffMatchPatch.Diff(DELETE, "the"), new DiffMatchPatch.Diff(INSERT, "a"), new DiffMatchPatch.Diff(EQUAL, " lazy"), new DiffMatchPatch.Diff(INSERT, "old dog")); + String text1 = dmp.diff_text1(diffs); + assertEquals("diff_text1: Base text.", "jumps over the lazy", text1); + + String delta = dmp.diff_toDelta(diffs); + assertEquals("diff_toDelta:", "=4\t-1\t+ed\t=6\t-3\t+a\t=5\t+old dog", delta); + + // Convert delta string into a diff. + assertEquals("diff_fromDelta: Normal.", diffs, dmp.diff_fromDelta(text1, delta)); + + // Generates error (19 < 20). + try { + dmp.diff_fromDelta(text1 + "x", delta); + fail("diff_fromDelta: Too long."); + } catch (IllegalArgumentException ex) { + // Exception expected. + } + + // Generates error (19 > 18). + try { + dmp.diff_fromDelta(text1.substring(1), delta); + fail("diff_fromDelta: Too short."); + } catch (IllegalArgumentException ex) { + // Exception expected. + } + + // Generates error (%c3%xy invalid Unicode). + try { + dmp.diff_fromDelta("", "+%c3%xy"); + fail("diff_fromDelta: Invalid character."); + } catch (IllegalArgumentException ex) { + // Exception expected. 
+ } + + // Test deltas with special characters. + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "\u0680 \000 \t %"), new DiffMatchPatch.Diff(DELETE, "\u0681 \001 \n ^"), new DiffMatchPatch.Diff(INSERT, "\u0682 \002 \\ |")); + text1 = dmp.diff_text1(diffs); + assertEquals("diff_text1: Unicode text.", "\u0680 \000 \t %\u0681 \001 \n ^", text1); + + delta = dmp.diff_toDelta(diffs); + assertEquals("diff_toDelta: Unicode.", "=7\t-7\t+%DA%82 %02 %5C %7C", delta); + + assertEquals("diff_fromDelta: Unicode.", diffs, dmp.diff_fromDelta(text1, delta)); + + // Verify pool of unchanged characters. + diffs = diffList(new DiffMatchPatch.Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")); + String text2 = dmp.diff_text2(diffs); + assertEquals("diff_text2: Unchanged characters.", "A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ", text2); + + delta = dmp.diff_toDelta(diffs); + assertEquals("diff_toDelta: Unchanged characters.", "+A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ", delta); + + // Convert delta string into a diff. + assertEquals("diff_fromDelta: Unchanged characters.", diffs, dmp.diff_fromDelta("", delta)); + + // 160 kb string. + String a = "abcdefghij"; + for (int i = 0; i < 14; i++) { + a += a; + } + diffs = diffList(new DiffMatchPatch.Diff(INSERT, a)); + delta = dmp.diff_toDelta(diffs); + assertEquals("diff_toDelta: 160kb string.", "+" + a, delta); + + // Convert delta string into a diff. + assertEquals("diff_fromDelta: 160kb string.", diffs, dmp.diff_fromDelta("", delta)); + } + + public static void testDiffXIndex() { + // Translate a location in text1 to text2. 
+ LinkedList diffs = diffList(new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(INSERT, "1234"), new DiffMatchPatch.Diff(EQUAL, "xyz")); + assertEquals("diff_xIndex: Translation on equality.", 5, dmp.diff_xIndex(diffs, 2)); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "1234"), new DiffMatchPatch.Diff(EQUAL, "xyz")); + assertEquals("diff_xIndex: Translation on deletion.", 1, dmp.diff_xIndex(diffs, 3)); + } + + public static void testDiffLevenshtein() { + LinkedList diffs = diffList(new DiffMatchPatch.Diff(DELETE, "abc"), new DiffMatchPatch.Diff(INSERT, "1234"), new DiffMatchPatch.Diff(EQUAL, "xyz")); + assertEquals("diff_levenshtein: Levenshtein with trailing equality.", 4, dmp.diff_levenshtein(diffs)); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "xyz"), new DiffMatchPatch.Diff(DELETE, "abc"), new DiffMatchPatch.Diff(INSERT, "1234")); + assertEquals("diff_levenshtein: Levenshtein with leading equality.", 4, dmp.diff_levenshtein(diffs)); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "abc"), new DiffMatchPatch.Diff(EQUAL, "xyz"), new DiffMatchPatch.Diff(INSERT, "1234")); + assertEquals("diff_levenshtein: Levenshtein with middle equality.", 7, dmp.diff_levenshtein(diffs)); + } + + public static void testDiffBisect() { + // Normal. + String a = "cat"; + String b = "map"; + // Since the resulting diff hasn't been normalized, it would be ok if + // the insertion and deletion pairs are swapped. + // If the order changes, tweak this test as required. + LinkedList diffs = diffList(new DiffMatchPatch.Diff(DELETE, "c"), new DiffMatchPatch.Diff(INSERT, "m"), new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "t"), new DiffMatchPatch.Diff(INSERT, "p")); + assertEquals("diff_bisect: Normal.", diffs, dmp.diff_bisect(a, b, Long.MAX_VALUE)); + + // Timeout. 
+ diffs = diffList(new DiffMatchPatch.Diff(DELETE, "cat"), new DiffMatchPatch.Diff(INSERT, "map")); + assertEquals("diff_bisect: Timeout.", diffs, dmp.diff_bisect(a, b, 0)); + } + + public static void testDiffMain() { + // Perform a trivial diff. + LinkedList diffs = diffList(); + assertEquals("diff_main: Null case.", diffs, dmp.diff_main("", "", false)); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "abc")); + assertEquals("diff_main: Equality.", diffs, dmp.diff_main("abc", "abc", false)); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "ab"), new DiffMatchPatch.Diff(INSERT, "123"), new DiffMatchPatch.Diff(EQUAL, "c")); + assertEquals("diff_main: Simple insertion.", diffs, dmp.diff_main("abc", "ab123c", false)); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "123"), new DiffMatchPatch.Diff(EQUAL, "bc")); + assertEquals("diff_main: Simple deletion.", diffs, dmp.diff_main("a123bc", "abc", false)); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(INSERT, "123"), new DiffMatchPatch.Diff(EQUAL, "b"), new DiffMatchPatch.Diff(INSERT, "456"), new DiffMatchPatch.Diff(EQUAL, "c")); + assertEquals("diff_main: Two insertions.", diffs, dmp.diff_main("abc", "a123b456c", false)); + + diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "123"), new DiffMatchPatch.Diff(EQUAL, "b"), new DiffMatchPatch.Diff(DELETE, "456"), new DiffMatchPatch.Diff(EQUAL, "c")); + assertEquals("diff_main: Two deletions.", diffs, dmp.diff_main("a123b456c", "abc", false)); + + // Perform a real diff. + // Switch off the timeout. 
+ dmp.Diff_Timeout = 0; + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(INSERT, "b")); + assertEquals("diff_main: Simple case #1.", diffs, dmp.diff_main("a", "b", false)); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "Apple"), new DiffMatchPatch.Diff(INSERT, "Banana"), new DiffMatchPatch.Diff(EQUAL, "s are a"), new DiffMatchPatch.Diff(INSERT, "lso"), new DiffMatchPatch.Diff(EQUAL, " fruit.")); + assertEquals("diff_main: Simple case #2.", diffs, dmp.diff_main("Apples are a fruit.", "Bananas are also fruit.", false)); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "a"), new DiffMatchPatch.Diff(INSERT, "\u0680"), new DiffMatchPatch.Diff(EQUAL, "x"), new DiffMatchPatch.Diff(DELETE, "\t"), new DiffMatchPatch.Diff(INSERT, "\000")); + assertEquals("diff_main: Simple case #3.", diffs, dmp.diff_main("ax\t", "\u0680x\000", false)); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "1"), new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "y"), new DiffMatchPatch.Diff(EQUAL, "b"), new DiffMatchPatch.Diff(DELETE, "2"), new DiffMatchPatch.Diff(INSERT, "xab")); + assertEquals("diff_main: Overlap #1.", diffs, dmp.diff_main("1ayb2", "abxab", false)); + + diffs = diffList(new DiffMatchPatch.Diff(INSERT, "xaxcx"), new DiffMatchPatch.Diff(EQUAL, "abc"), new DiffMatchPatch.Diff(DELETE, "y")); + assertEquals("diff_main: Overlap #2.", diffs, dmp.diff_main("abcy", "xaxcxabc", false)); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "ABCD"), new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(DELETE, "="), new DiffMatchPatch.Diff(INSERT, "-"), new DiffMatchPatch.Diff(EQUAL, "bcd"), new DiffMatchPatch.Diff(DELETE, "="), new DiffMatchPatch.Diff(INSERT, "-"), new DiffMatchPatch.Diff(EQUAL, "efghijklmnopqrs"), new DiffMatchPatch.Diff(DELETE, "EFGHIJKLMNOefg")); + assertEquals("diff_main: Overlap #3.", diffs, dmp.diff_main("ABCDa=bcd=efghijklmnopqrsEFGHIJKLMNOefg", "a-bcd-efghijklmnopqrs", false)); + + diffs = 
diffList(new DiffMatchPatch.Diff(INSERT, " "), new DiffMatchPatch.Diff(EQUAL, "a"), new DiffMatchPatch.Diff(INSERT, "nd"), new DiffMatchPatch.Diff(EQUAL, " [[Pennsylvania]]"), new DiffMatchPatch.Diff(DELETE, " and [[New")); + assertEquals("diff_main: Large equality.", diffs, dmp.diff_main("a [[Pennsylvania]] and [[New", " and [[Pennsylvania]]", false)); + + dmp.Diff_Timeout = 0.1f; // 100ms + String a = "`Twas brillig, and the slithy toves\nDid gyre and gimble in the wabe:\nAll mimsy were the borogoves,\nAnd the mome raths outgrabe.\n"; + String b = "I am the very model of a modern major general,\nI've information vegetable, animal, and mineral,\nI know the kings of England, and I quote the fights historical,\nFrom Marathon to Waterloo, in order categorical.\n"; + // Increase the text lengths by 1024 times to ensure a timeout. + for (int i = 0; i < 10; i++) { + a += a; + b += b; + } + long startTime = System.currentTimeMillis(); + dmp.diff_main(a, b); + long endTime = System.currentTimeMillis(); + // Test that we took at least the timeout period. + assertTrue("diff_main: Timeout min.", dmp.Diff_Timeout * 1000 <= endTime - startTime); + // Test that we didn't take forever (be forgiving). + // Theoretically this test could fail very occasionally if the + // OS task swaps or locks up for a second at the wrong moment. + assertTrue("diff_main: Timeout max.", dmp.Diff_Timeout * 1000 * 2 > endTime - startTime); + dmp.Diff_Timeout = 0; + + // Test the linemode speedup. + // Must be long to pass the 100 char cutoff. 
+ a = "1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n"; + b = "abcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\n"; + assertEquals("diff_main: Simple line-mode.", dmp.diff_main(a, b, true), dmp.diff_main(a, b, false)); + + a = "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"; + b = "abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij"; + assertEquals("diff_main: Single line-mode.", dmp.diff_main(a, b, true), dmp.diff_main(a, b, false)); + + a = "1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n"; + b = "abcdefghij\n1234567890\n1234567890\n1234567890\nabcdefghij\n1234567890\n1234567890\n1234567890\nabcdefghij\n1234567890\n1234567890\n1234567890\nabcdefghij\n"; + String[] texts_linemode = diff_rebuildtexts(dmp.diff_main(a, b, true)); + String[] texts_textmode = diff_rebuildtexts(dmp.diff_main(a, b, false)); + assertArrayEquals("diff_main: Overlap line-mode.", texts_textmode, texts_linemode); + + // Test null inputs. + try { + dmp.diff_main(null, null); + fail("diff_main: Null inputs."); + } catch (IllegalArgumentException ex) { + // Error expected. + } + } + + + // MATCH TEST FUNCTIONS + + + public static void testMatchAlphabet() { + // Initialise the bitmasks for Bitap. 
+ Map bitmask; + bitmask = new HashMap(); + bitmask.put('a', 4); bitmask.put('b', 2); bitmask.put('c', 1); + assertEquals("match_alphabet: Unique.", bitmask, dmp.match_alphabet("abc")); + + bitmask = new HashMap(); + bitmask.put('a', 37); bitmask.put('b', 18); bitmask.put('c', 8); + assertEquals("match_alphabet: Duplicates.", bitmask, dmp.match_alphabet("abcaba")); + } + + public static void testMatchBitap() { + // Bitap algorithm. + dmp.Match_Distance = 100; + dmp.Match_Threshold = 0.5f; + assertEquals("match_bitap: Exact match #1.", 5, dmp.match_bitap("abcdefghijk", "fgh", 5)); + + assertEquals("match_bitap: Exact match #2.", 5, dmp.match_bitap("abcdefghijk", "fgh", 0)); + + assertEquals("match_bitap: Fuzzy match #1.", 4, dmp.match_bitap("abcdefghijk", "efxhi", 0)); + + assertEquals("match_bitap: Fuzzy match #2.", 2, dmp.match_bitap("abcdefghijk", "cdefxyhijk", 5)); + + assertEquals("match_bitap: Fuzzy match #3.", -1, dmp.match_bitap("abcdefghijk", "bxy", 1)); + + assertEquals("match_bitap: Overflow.", 2, dmp.match_bitap("123456789xx0", "3456789x0", 2)); + + assertEquals("match_bitap: Before start match.", 0, dmp.match_bitap("abcdef", "xxabc", 4)); + + assertEquals("match_bitap: Beyond end match.", 3, dmp.match_bitap("abcdef", "defyy", 4)); + + assertEquals("match_bitap: Oversized pattern.", 0, dmp.match_bitap("abcdef", "xabcdefy", 0)); + + dmp.Match_Threshold = 0.4f; + assertEquals("match_bitap: Threshold #1.", 4, dmp.match_bitap("abcdefghijk", "efxyhi", 1)); + + dmp.Match_Threshold = 0.3f; + assertEquals("match_bitap: Threshold #2.", -1, dmp.match_bitap("abcdefghijk", "efxyhi", 1)); + + dmp.Match_Threshold = 0.0f; + assertEquals("match_bitap: Threshold #3.", 1, dmp.match_bitap("abcdefghijk", "bcdef", 1)); + + dmp.Match_Threshold = 0.5f; + assertEquals("match_bitap: Multiple select #1.", 0, dmp.match_bitap("abcdexyzabcde", "abccde", 3)); + + assertEquals("match_bitap: Multiple select #2.", 8, dmp.match_bitap("abcdexyzabcde", "abccde", 5)); + + dmp.Match_Distance 
= 10; // Strict location. + assertEquals("match_bitap: Distance test #1.", -1, dmp.match_bitap("abcdefghijklmnopqrstuvwxyz", "abcdefg", 24)); + + assertEquals("match_bitap: Distance test #2.", 0, dmp.match_bitap("abcdefghijklmnopqrstuvwxyz", "abcdxxefg", 1)); + + dmp.Match_Distance = 1000; // Loose location. + assertEquals("match_bitap: Distance test #3.", 0, dmp.match_bitap("abcdefghijklmnopqrstuvwxyz", "abcdefg", 24)); + } + + public static void testMatchMain() { + // Full match. + assertEquals("match_main: Equality.", 0, dmp.match_main("abcdef", "abcdef", 1000)); + + assertEquals("match_main: Null text.", -1, dmp.match_main("", "abcdef", 1)); + + assertEquals("match_main: Null pattern.", 3, dmp.match_main("abcdef", "", 3)); + + assertEquals("match_main: Exact match.", 3, dmp.match_main("abcdef", "de", 3)); + + assertEquals("match_main: Beyond end match.", 3, dmp.match_main("abcdef", "defy", 4)); + + assertEquals("match_main: Oversized pattern.", 0, dmp.match_main("abcdef", "abcdefy", 0)); + + dmp.Match_Threshold = 0.7f; + assertEquals("match_main: Complex match.", 4, dmp.match_main("I am the very model of a modern major general.", " that berry ", 5)); + dmp.Match_Threshold = 0.5f; + + // Test null inputs. + try { + dmp.match_main(null, null, 0); + fail("match_main: Null inputs."); + } catch (IllegalArgumentException ex) { + // Error expected. + } + } + + + // PATCH TEST FUNCTIONS + + + public static void testPatchObj() { + // Patch Object. 
+ DiffMatchPatch.Patch p = new DiffMatchPatch.Patch(); + p.start1 = 20; + p.start2 = 21; + p.length1 = 18; + p.length2 = 17; + p.diffs = diffList(new DiffMatchPatch.Diff(EQUAL, "jump"), new DiffMatchPatch.Diff(DELETE, "s"), new DiffMatchPatch.Diff(INSERT, "ed"), new DiffMatchPatch.Diff(EQUAL, " over "), new DiffMatchPatch.Diff(DELETE, "the"), new DiffMatchPatch.Diff(INSERT, "a"), new DiffMatchPatch.Diff(EQUAL, "\nlaz")); + String strp = "@@ -21,18 +22,17 @@\n jump\n-s\n+ed\n over \n-the\n+a\n %0Alaz\n"; + assertEquals("Patch: toString.", strp, p.toString()); + } + + public static void testPatchFromText() { + assertTrue("patch_fromText: #0.", dmp.patch_fromText("").isEmpty()); + + String strp = "@@ -21,18 +22,17 @@\n jump\n-s\n+ed\n over \n-the\n+a\n %0Alaz\n"; + assertEquals("patch_fromText: #1.", strp, dmp.patch_fromText(strp).get(0).toString()); + + assertEquals("patch_fromText: #2.", "@@ -1 +1 @@\n-a\n+b\n", dmp.patch_fromText("@@ -1 +1 @@\n-a\n+b\n").get(0).toString()); + + assertEquals("patch_fromText: #3.", "@@ -1,3 +0,0 @@\n-abc\n", dmp.patch_fromText("@@ -1,3 +0,0 @@\n-abc\n").get(0).toString()); + + assertEquals("patch_fromText: #4.", "@@ -0,0 +1,3 @@\n+abc\n", dmp.patch_fromText("@@ -0,0 +1,3 @@\n+abc\n").get(0).toString()); + + // Generates error. + try { + dmp.patch_fromText("Bad\nPatch\n"); + fail("patch_fromText: #5."); + } catch (IllegalArgumentException ex) { + // Exception expected. 
+ } + } + + public static void testPatchToText() { + String strp = "@@ -21,18 +22,17 @@\n jump\n-s\n+ed\n over \n-the\n+a\n laz\n"; + List patches; + patches = dmp.patch_fromText(strp); + assertEquals("patch_toText: Single.", strp, dmp.patch_toText(patches)); + + strp = "@@ -1,9 +1,9 @@\n-f\n+F\n oo+fooba\n@@ -7,9 +7,9 @@\n obar\n-,\n+.\n tes\n"; + patches = dmp.patch_fromText(strp); + assertEquals("patch_toText: Dual.", strp, dmp.patch_toText(patches)); + } + + public static void testPatchAddContext() { + dmp.Patch_Margin = 4; + DiffMatchPatch.Patch p; + p = dmp.patch_fromText("@@ -21,4 +21,10 @@\n-jump\n+somersault\n").get(0); + dmp.patch_addContext(p, "The quick brown fox jumps over the lazy dog."); + assertEquals("patch_addContext: Simple case.", "@@ -17,12 +17,18 @@\n fox \n-jump\n+somersault\n s ov\n", p.toString()); + + p = dmp.patch_fromText("@@ -21,4 +21,10 @@\n-jump\n+somersault\n").get(0); + dmp.patch_addContext(p, "The quick brown fox jumps."); + assertEquals("patch_addContext: Not enough trailing context.", "@@ -17,10 +17,16 @@\n fox \n-jump\n+somersault\n s.\n", p.toString()); + + p = dmp.patch_fromText("@@ -3 +3,2 @@\n-e\n+at\n").get(0); + dmp.patch_addContext(p, "The quick brown fox jumps."); + assertEquals("patch_addContext: Not enough leading context.", "@@ -1,7 +1,8 @@\n Th\n-e\n+at\n qui\n", p.toString()); + + p = dmp.patch_fromText("@@ -3 +3,2 @@\n-e\n+at\n").get(0); + dmp.patch_addContext(p, "The quick brown fox jumps. The quick brown fox crashes."); + assertEquals("patch_addContext: Ambiguity.", "@@ -1,27 +1,28 @@\n Th\n-e\n+at\n quick brown fox jumps. 
\n", p.toString()); + } + + @SuppressWarnings("deprecation") + public static void testPatchMake() { + LinkedList patches; + patches = dmp.patch_make("", ""); + assertEquals("patch_make: Null case.", "", dmp.patch_toText(patches)); + + String text1 = "The quick brown fox jumps over the lazy dog."; + String text2 = "That quick brown fox jumped over a lazy dog."; + String expectedPatch = "@@ -1,8 +1,7 @@\n Th\n-at\n+e\n qui\n@@ -21,17 +21,18 @@\n jump\n-ed\n+s\n over \n-a\n+the\n laz\n"; + // The second patch must be "-21,17 +21,18", not "-22,17 +21,18" due to rolling context. + patches = dmp.patch_make(text2, text1); + assertEquals("patch_make: Text2+Text1 inputs.", expectedPatch, dmp.patch_toText(patches)); + + expectedPatch = "@@ -1,11 +1,12 @@\n Th\n-e\n+at\n quick b\n@@ -22,18 +22,17 @@\n jump\n-s\n+ed\n over \n-the\n+a\n laz\n"; + patches = dmp.patch_make(text1, text2); + assertEquals("patch_make: Text1+Text2 inputs.", expectedPatch, dmp.patch_toText(patches)); + + LinkedList diffs = dmp.diff_main(text1, text2, false); + patches = dmp.patch_make(diffs); + assertEquals("patch_make: Diff input.", expectedPatch, dmp.patch_toText(patches)); + + patches = dmp.patch_make(text1, diffs); + assertEquals("patch_make: Text1+Diff inputs.", expectedPatch, dmp.patch_toText(patches)); + + patches = dmp.patch_make(text1, text2, diffs); + assertEquals("patch_make: Text1+Text2+Diff inputs (deprecated).", expectedPatch, dmp.patch_toText(patches)); + + patches = dmp.patch_make("`1234567890-=[]\\;',./", "~!@#$%^&*()_+{}|:\"<>?"); + assertEquals("patch_toText: Character encoding.", "@@ -1,21 +1,21 @@\n-%601234567890-=%5B%5D%5C;',./\n+~!@#$%25%5E&*()_+%7B%7D%7C:%22%3C%3E?\n", dmp.patch_toText(patches)); + + diffs = diffList(new DiffMatchPatch.Diff(DELETE, "`1234567890-=[]\\;',./"), new DiffMatchPatch.Diff(INSERT, "~!@#$%^&*()_+{}|:\"<>?")); + assertEquals("patch_fromText: Character decoding.", diffs, dmp.patch_fromText("@@ -1,21 +1,21 
@@\n-%601234567890-=%5B%5D%5C;',./\n+~!@#$%25%5E&*()_+%7B%7D%7C:%22%3C%3E?\n").get(0).diffs); + + text1 = ""; + for (int x = 0; x < 100; x++) { + text1 += "abcdef"; + } + text2 = text1 + "123"; + expectedPatch = "@@ -573,28 +573,31 @@\n cdefabcdefabcdefabcdefabcdef\n+123\n"; + patches = dmp.patch_make(text1, text2); + assertEquals("patch_make: Long string with repeats.", expectedPatch, dmp.patch_toText(patches)); + + // Test null inputs. + try { + dmp.patch_make(null); + fail("patch_make: Null inputs."); + } catch (IllegalArgumentException ex) { + // Error expected. + } + } + + public static void testPatchSplitMax() { + // Assumes that Match_MaxBits is 32. + LinkedList patches; + patches = dmp.patch_make("abcdefghijklmnopqrstuvwxyz01234567890", "XabXcdXefXghXijXklXmnXopXqrXstXuvXwxXyzX01X23X45X67X89X0"); + dmp.patch_splitMax(patches); + assertEquals("patch_splitMax: #1.", "@@ -1,32 +1,46 @@\n+X\n ab\n+X\n cd\n+X\n ef\n+X\n gh\n+X\n ij\n+X\n kl\n+X\n mn\n+X\n op\n+X\n qr\n+X\n st\n+X\n uv\n+X\n wx\n+X\n yz\n+X\n 012345\n@@ -25,13 +39,18 @@\n zX01\n+X\n 23\n+X\n 45\n+X\n 67\n+X\n 89\n+X\n 0\n", dmp.patch_toText(patches)); + + patches = dmp.patch_make("abcdef1234567890123456789012345678901234567890123456789012345678901234567890uvwxyz", "abcdefuvwxyz"); + String oldToText = dmp.patch_toText(patches); + dmp.patch_splitMax(patches); + assertEquals("patch_splitMax: #2.", oldToText, dmp.patch_toText(patches)); + + patches = dmp.patch_make("1234567890123456789012345678901234567890123456789012345678901234567890", "abc"); + dmp.patch_splitMax(patches); + assertEquals("patch_splitMax: #3.", "@@ -1,32 +1,4 @@\n-1234567890123456789012345678\n 9012\n@@ -29,32 +1,4 @@\n-9012345678901234567890123456\n 7890\n@@ -57,14 +1,3 @@\n-78901234567890\n+abc\n", dmp.patch_toText(patches)); + + patches = dmp.patch_make("abcdefghij , h : 0 , t : 1 abcdefghij , h : 0 , t : 1 abcdefghij , h : 0 , t : 1", "abcdefghij , h : 1 , t : 1 abcdefghij , h : 1 , t : 1 abcdefghij , h : 0 , t : 1"); + 
dmp.patch_splitMax(patches); + assertEquals("patch_splitMax: #4.", "@@ -2,32 +2,32 @@\n bcdefghij , h : \n-0\n+1\n , t : 1 abcdef\n@@ -29,32 +29,32 @@\n bcdefghij , h : \n-0\n+1\n , t : 1 abcdef\n", dmp.patch_toText(patches)); + } + + public static void testPatchAddPadding() { + LinkedList patches; + patches = dmp.patch_make("", "test"); + assertEquals("patch_addPadding: Both edges full.", "@@ -0,0 +1,4 @@\n+test\n", dmp.patch_toText(patches)); + dmp.patch_addPadding(patches); + assertEquals("patch_addPadding: Both edges full.", "@@ -1,8 +1,12 @@\n %01%02%03%04\n+test\n %01%02%03%04\n", dmp.patch_toText(patches)); + + patches = dmp.patch_make("XY", "XtestY"); + assertEquals("patch_addPadding: Both edges partial.", "@@ -1,2 +1,6 @@\n X\n+test\n Y\n", dmp.patch_toText(patches)); + dmp.patch_addPadding(patches); + assertEquals("patch_addPadding: Both edges partial.", "@@ -2,8 +2,12 @@\n %02%03%04X\n+test\n Y%01%02%03\n", dmp.patch_toText(patches)); + + patches = dmp.patch_make("XXXXYYYY", "XXXXtestYYYY"); + assertEquals("patch_addPadding: Both edges none.", "@@ -1,8 +1,12 @@\n XXXX\n+test\n YYYY\n", dmp.patch_toText(patches)); + dmp.patch_addPadding(patches); + assertEquals("patch_addPadding: Both edges none.", "@@ -5,8 +5,12 @@\n XXXX\n+test\n YYYY\n", dmp.patch_toText(patches)); + } + + public static void testPatchApply() { + dmp.Match_Distance = 1000; + dmp.Match_Threshold = 0.5f; + dmp.Patch_DeleteThreshold = 0.5f; + LinkedList patches; + patches = dmp.patch_make("", ""); + Object[] results = dmp.patch_apply(patches, "Hello world."); + boolean[] boolArray = (boolean[]) results[1]; + String resultStr = results[0] + "\t" + boolArray.length; + assertEquals("patch_apply: Null case.", "Hello world.\t0", resultStr); + + patches = dmp.patch_make("The quick brown fox jumps over the lazy dog.", "That quick brown fox jumped over a lazy dog."); + results = dmp.patch_apply(patches, "The quick brown fox jumps over the lazy dog."); + boolArray = (boolean[]) results[1]; + 
resultStr = results[0] + "\t" + boolArray[0] + "\t" + boolArray[1]; + assertEquals("patch_apply: Exact match.", "That quick brown fox jumped over a lazy dog.\ttrue\ttrue", resultStr); + + results = dmp.patch_apply(patches, "The quick red rabbit jumps over the tired tiger."); + boolArray = (boolean[]) results[1]; + resultStr = results[0] + "\t" + boolArray[0] + "\t" + boolArray[1]; + assertEquals("patch_apply: Partial match.", "That quick red rabbit jumped over a tired tiger.\ttrue\ttrue", resultStr); + + results = dmp.patch_apply(patches, "I am the very model of a modern major general."); + boolArray = (boolean[]) results[1]; + resultStr = results[0] + "\t" + boolArray[0] + "\t" + boolArray[1]; + assertEquals("patch_apply: Failed match.", "I am the very model of a modern major general.\tfalse\tfalse", resultStr); + + patches = dmp.patch_make("x1234567890123456789012345678901234567890123456789012345678901234567890y", "xabcy"); + results = dmp.patch_apply(patches, "x123456789012345678901234567890-----++++++++++-----123456789012345678901234567890y"); + boolArray = (boolean[]) results[1]; + resultStr = results[0] + "\t" + boolArray[0] + "\t" + boolArray[1]; + assertEquals("patch_apply: Big delete, small change.", "xabcy\ttrue\ttrue", resultStr); + + patches = dmp.patch_make("x1234567890123456789012345678901234567890123456789012345678901234567890y", "xabcy"); + results = dmp.patch_apply(patches, "x12345678901234567890---------------++++++++++---------------12345678901234567890y"); + boolArray = (boolean[]) results[1]; + resultStr = results[0] + "\t" + boolArray[0] + "\t" + boolArray[1]; + assertEquals("patch_apply: Big delete, big change 1.", "xabc12345678901234567890---------------++++++++++---------------12345678901234567890y\tfalse\ttrue", resultStr); + + dmp.Patch_DeleteThreshold = 0.6f; + patches = dmp.patch_make("x1234567890123456789012345678901234567890123456789012345678901234567890y", "xabcy"); + results = dmp.patch_apply(patches, 
"x12345678901234567890---------------++++++++++---------------12345678901234567890y"); + boolArray = (boolean[]) results[1]; + resultStr = results[0] + "\t" + boolArray[0] + "\t" + boolArray[1]; + assertEquals("patch_apply: Big delete, big change 2.", "xabcy\ttrue\ttrue", resultStr); + dmp.Patch_DeleteThreshold = 0.5f; + + // Compensate for failed patch. + dmp.Match_Threshold = 0.0f; + dmp.Match_Distance = 0; + patches = dmp.patch_make("abcdefghijklmnopqrstuvwxyz--------------------1234567890", "abcXXXXXXXXXXdefghijklmnopqrstuvwxyz--------------------1234567YYYYYYYYYY890"); + results = dmp.patch_apply(patches, "ABCDEFGHIJKLMNOPQRSTUVWXYZ--------------------1234567890"); + boolArray = (boolean[]) results[1]; + resultStr = results[0] + "\t" + boolArray[0] + "\t" + boolArray[1]; + assertEquals("patch_apply: Compensate for failed patch.", "ABCDEFGHIJKLMNOPQRSTUVWXYZ--------------------1234567YYYYYYYYYY890\tfalse\ttrue", resultStr); + dmp.Match_Threshold = 0.5f; + dmp.Match_Distance = 1000; + + patches = dmp.patch_make("", "test"); + String patchStr = dmp.patch_toText(patches); + dmp.patch_apply(patches, ""); + assertEquals("patch_apply: No side effects.", patchStr, dmp.patch_toText(patches)); + + patches = dmp.patch_make("The quick brown fox jumps over the lazy dog.", "Woof"); + patchStr = dmp.patch_toText(patches); + dmp.patch_apply(patches, "The quick brown fox jumps over the lazy dog."); + assertEquals("patch_apply: No side effects with major delete.", patchStr, dmp.patch_toText(patches)); + + patches = dmp.patch_make("", "test"); + results = dmp.patch_apply(patches, ""); + boolArray = (boolean[]) results[1]; + resultStr = results[0] + "\t" + boolArray[0]; + assertEquals("patch_apply: Edge exact match.", "test\ttrue", resultStr); + + patches = dmp.patch_make("XY", "XtestY"); + results = dmp.patch_apply(patches, "XY"); + boolArray = (boolean[]) results[1]; + resultStr = results[0] + "\t" + boolArray[0]; + assertEquals("patch_apply: Near edge exact match.", 
"XtestY\ttrue", resultStr); + + patches = dmp.patch_make("y", "y123"); + results = dmp.patch_apply(patches, "x"); + boolArray = (boolean[]) results[1]; + resultStr = results[0] + "\t" + boolArray[0]; + assertEquals("patch_apply: Edge partial match.", "x123\ttrue", resultStr); + } + + private static void assertEquals(String error_msg, Object a, Object b) { + if (!a.toString().equals(b.toString())) { + throw new Error("assertEquals fail:\n Expected: " + a + "\n Actual: " + b + + "\n" + error_msg); + } + } + + private static void assertTrue(String error_msg, boolean a) { + if (!a) { + throw new Error("assertTrue fail: " + error_msg); + } + } + + private static void assertNull(String error_msg, Object n) { + if (n != null) { + throw new Error("assertNull fail: " + error_msg); + } + } + + private static void fail(String error_msg) { + throw new Error("Fail: " + error_msg); + } + + private static void assertArrayEquals(String error_msg, Object[] a, Object[] b) { + List list_a = Arrays.asList(a); + List list_b = Arrays.asList(b); + assertEquals(error_msg, list_a, list_b); + } + + private static void assertLinesToCharsResultEquals(String error_msg, + DiffMatchPatch.LinesToCharsResult a, DiffMatchPatch.LinesToCharsResult b) { + assertEquals(error_msg, a.chars1, b.chars1); + assertEquals(error_msg, a.chars2, b.chars2); + assertEquals(error_msg, a.lineArray, b.lineArray); + } + + // Construct the two texts which made up the diff originally. + private static String[] diff_rebuildtexts(LinkedList diffs) { + String[] text = {"", ""}; + for (DiffMatchPatch.Diff myDiff : diffs) { + if (myDiff.operation != DiffMatchPatch.Operation.INSERT) { + text[0] += myDiff.text; + } + if (myDiff.operation != DiffMatchPatch.Operation.DELETE) { + text[1] += myDiff.text; + } + } + return text; + } + + // Private function for quickly building lists of diffs. + private static LinkedList diffList(DiffMatchPatch.Diff... 
diffs) { + return new LinkedList(Arrays.asList(diffs)); + } + + public static void main(String args[]) { + dmp = new DiffMatchPatch(); + + testDiffCommonPrefix(); + testDiffCommonSuffix(); + testDiffCommonOverlap(); + testDiffHalfmatch(); + testDiffLinesToChars(); + testDiffCharsToLines(); + testDiffCleanupMerge(); + testDiffCleanupSemanticLossless(); + testDiffCleanupSemantic(); + testDiffCleanupEfficiency(); + testDiffPrettyHtml(); + testDiffText(); + testDiffDelta(); + testDiffXIndex(); + testDiffLevenshtein(); + testDiffBisect(); + testDiffMain(); + + testMatchAlphabet(); + testMatchBitap(); + testMatchMain(); + + testPatchObj(); + testPatchFromText(); + testPatchToText(); + testPatchAddContext(); + testPatchMake(); + testPatchSplitMax(); + testPatchAddPadding(); + testPatchApply(); + + System.out.println("All tests passed."); + } +} diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/matching/Speedtest1.txt b/grobid-core/src/test/java/org/grobid/core/utilities/matching/Speedtest1.txt new file mode 100644 index 0000000000..54b438fd79 --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/utilities/matching/Speedtest1.txt @@ -0,0 +1,230 @@ +This is a '''list of newspapers published by [[Journal Register Company]]'''. + +The company owns daily and weekly newspapers, other print media properties and newspaper-affiliated local Websites in the [[U.S.]] states of [[Connecticut]], [[Michigan]], [[New York]], [[Ohio]] and [[Pennsylvania]], organized in six geographic "clusters":[http://www.journalregister.com/newspapers.html Journal Register Company: Our Newspapers], accessed February 10, 2008. + +== Capital-Saratoga == +Three dailies, associated weeklies and [[pennysaver]]s in greater [[Albany, New York]]; also [http://www.capitalcentral.com capitalcentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com]. 
+ +* ''The Oneida Daily Dispatch'' {{WS|oneidadispatch.com}} of [[Oneida, New York]] +* ''[[The Record (Troy)|The Record]]'' {{WS|troyrecord.com}} of [[Troy, New York]] +* ''[[The Saratogian]]'' {{WS|saratogian.com}} of [[Saratoga Springs, New York]] +* Weeklies: +** ''Community News'' {{WS|cnweekly.com}} weekly of [[Clifton Park, New York]] +** ''Rome Observer'' of [[Rome, New York]] +** ''Life & Times of Utica'' of [[Utica, New York]] + +== Connecticut == +Five dailies, associated weeklies and [[pennysaver]]s in the state of [[Connecticut]]; also [http://www.ctcentral.com CTcentral.com], [http://www.ctcarsandtrucks.com CTCarsAndTrucks.com] and [http://www.jobsinct.com JobsInCT.com]. + +* ''The Middletown Press'' {{WS|middletownpress.com}} of [[Middletown, Connecticut|Middletown]] +* ''[[New Haven Register]]'' {{WS|newhavenregister.com}} of [[New Haven, Connecticut|New Haven]] +* ''The Register Citizen'' {{WS|registercitizen.com}} of [[Torrington, Connecticut|Torrington]] + +* [[New Haven Register#Competitors|Elm City Newspapers]] {{WS|ctcentral.com}} +** ''The Advertiser'' of [[East Haven, Connecticut|East Haven]] +** ''Hamden Chronicle'' of [[Hamden, Connecticut|Hamden]] +** ''Milford Weekly'' of [[Milford, Connecticut|Milford]] +** ''The Orange Bulletin'' of [[Orange, Connecticut|Orange]] +** ''The Post'' of [[North Haven, Connecticut|North Haven]] +** ''Shelton Weekly'' of [[Shelton, Connecticut|Shelton]] +** ''The Stratford Bard'' of [[Stratford, Connecticut|Stratford]] +** ''Wallingford Voice'' of [[Wallingford, Connecticut|Wallingford]] +** ''West Haven News'' of [[West Haven, Connecticut|West Haven]] +* Housatonic Publications +** ''The New Milford Times'' {{WS|newmilfordtimes.com}} of [[New Milford, Connecticut|New Milford]] +** ''The Brookfield Journal'' of [[Brookfield, Connecticut|Brookfield]] +** ''The Kent Good Times Dispatch'' of [[Kent, Connecticut|Kent]] +** ''The Bethel Beacon'' of [[Bethel, Connecticut|Bethel]] +** ''The Litchfield Enquirer'' of 
[[Litchfield, Connecticut|Litchfield]] +** ''Litchfield County Times'' of [[Litchfield, Connecticut|Litchfield]] +* Imprint Newspapers {{WS|imprintnewspapers.com}} +** ''West Hartford News'' of [[West Hartford, Connecticut|West Hartford]] +** ''Windsor Journal'' of [[Windsor, Connecticut|Windsor]] +** ''Windsor Locks Journal'' of [[Windsor Locks, Connecticut|Windsor Locks]] +** ''Avon Post'' of [[Avon, Connecticut|Avon]] +** ''Farmington Post'' of [[Farmington, Connecticut|Farmington]] +** ''Simsbury Post'' of [[Simsbury, Connecticut|Simsbury]] +** ''Tri-Town Post'' of [[Burlington, Connecticut|Burlington]], [[Canton, Connecticut|Canton]] and [[Harwinton, Connecticut|Harwinton]] +* Minuteman Publications +** ''[[Fairfield Minuteman]]'' of [[Fairfield, Connecticut|Fairfield]] +** ''The Westport Minuteman'' {{WS|westportminuteman.com}} of [[Westport, Connecticut|Westport]] +* Shoreline Newspapers weeklies: +** ''Branford Review'' of [[Branford, Connecticut|Branford]] +** ''Clinton Recorder'' of [[Clinton, Connecticut|Clinton]] +** ''The Dolphin'' of [[Naval Submarine Base New London]] in [[New London, Connecticut|New London]] +** ''Main Street News'' {{WS|ctmainstreetnews.com}} of [[Essex, Connecticut|Essex]] +** ''Pictorial Gazette'' of [[Old Saybrook, Connecticut|Old Saybrook]] +** ''Regional Express'' of [[Colchester, Connecticut|Colchester]] +** ''Regional Standard'' of [[Colchester, Connecticut|Colchester]] +** ''Shoreline Times'' {{WS|shorelinetimes.com}} of [[Guilford, Connecticut|Guilford]] +** ''Shore View East'' of [[Madison, Connecticut|Madison]] +** ''Shore View West'' of [[Guilford, Connecticut|Guilford]] +* Other weeklies: +** ''Registro'' {{WS|registroct.com}} of [[New Haven, Connecticut|New Haven]] +** ''Thomaston Express'' {{WS|thomastownexpress.com}} of [[Thomaston, Connecticut|Thomaston]] +** ''Foothills Traders'' {{WS|foothillstrader.com}} of Torrington, Bristol, Canton + +== Michigan == +Four dailies, associated weeklies and [[pennysaver]]s in 
the state of [[Michigan]]; also [http://www.micentralhomes.com MIcentralhomes.com] and [http://www.micentralautos.com MIcentralautos.com] +* ''[[Oakland Press]]'' {{WS|theoaklandpress.com}} of [[Oakland, Michigan|Oakland]] +* ''Daily Tribune'' {{WS|dailytribune.com}} of [[Royal Oak, Michigan|Royal Oak]] +* ''Macomb Daily'' {{WS|macombdaily.com}} of [[Mt. Clemens, Michigan|Mt. Clemens]] +* ''[[Morning Sun]]'' {{WS|themorningsun.com}} of [[Mount Pleasant, Michigan|Mount Pleasant]] +* Heritage Newspapers {{WS|heritage.com}} +** ''Belleville View'' +** ''Ile Camera'' +** ''Monroe Guardian'' +** ''Ypsilanti Courier'' +** ''News-Herald'' +** ''Press & Guide'' +** ''Chelsea Standard & Dexter Leader'' +** ''Manchester Enterprise'' +** ''Milan News-Leader'' +** ''Saline Reporter'' +* Independent Newspapers {{WS|sourcenewspapers.com}} +** ''Advisor'' +** ''Source'' +* Morning Star {{WS|morningstarpublishing.com}} +** ''Alma Reminder'' +** ''Alpena Star'' +** ''Antrim County News'' +** ''Carson City Reminder'' +** ''The Leader & Kalkaskian'' +** ''Ogemaw/Oscoda County Star'' +** ''Petoskey/Charlevoix Star'' +** ''Presque Isle Star'' +** ''Preview Community Weekly'' +** ''Roscommon County Star'' +** ''St. Johns Reminder'' +** ''Straits Area Star'' +** ''The (Edmore) Advertiser'' +* Voice Newspapers {{WS|voicenews.com}} +** ''Armada Times'' +** ''Bay Voice'' +** ''Blue Water Voice'' +** ''Downriver Voice'' +** ''Macomb Township Voice'' +** ''North Macomb Voice'' +** ''Weekend Voice'' +** ''Suburban Lifestyles'' {{WS|suburbanlifestyles.com}} + +== Mid-Hudson == +One daily, associated magazines in the [[Hudson River Valley]] of [[New York]]; also [http://www.midhudsoncentral.com MidHudsonCentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com]. 
+ +* ''[[Daily Freeman]]'' {{WS|dailyfreeman.com}} of [[Kingston, New York]] + +== Ohio == +Two dailies, associated magazines and three shared Websites, all in the state of [[Ohio]]: [http://www.allaroundcleveland.com AllAroundCleveland.com], [http://www.allaroundclevelandcars.com AllAroundClevelandCars.com] and [http://www.allaroundclevelandjobs.com AllAroundClevelandJobs.com]. + +* ''[[The News-Herald (Ohio)|The News-Herald]]'' {{WS|news-herald.com}} of [[Willoughby, Ohio|Willoughby]] +* ''[[The Morning Journal]]'' {{WS|morningjournal.com}} of [[Lorain, Ohio|Lorain]] + +== Philadelphia area == +Seven dailies and associated weeklies and magazines in [[Pennsylvania]] and [[New Jersey]], and associated Websites: [http://www.allaroundphilly.com AllAroundPhilly.com], [http://www.jobsinnj.com JobsInNJ.com], [http://www.jobsinpa.com JobsInPA.com], and [http://www.phillycarsearch.com PhillyCarSearch.com]. + +* ''The Daily Local'' {{WS|dailylocal.com}} of [[West Chester, Pennsylvania|West Chester]] +* ''[[Delaware County Daily and Sunday Times]] {{WS|delcotimes.com}} of Primos +* ''[[The Mercury (Pennsylvania)|The Mercury]]'' {{WS|pottstownmercury.com}} of [[Pottstown, Pennsylvania|Pottstown]] +* ''The Phoenix'' {{WS|phoenixvillenews.com}} of [[Phoenixville, Pennsylvania|Phoenixville]] +* ''[[The Reporter (Lansdale)|The Reporter]]'' {{WS|thereporteronline.com}} of [[Lansdale, Pennsylvania|Lansdale]] +* ''The Times Herald'' {{WS|timesherald.com}} of [[Norristown, Pennsylvania|Norristown]] +* ''[[The Trentonian]]'' {{WS|trentonian.com}} of [[Trenton, New Jersey]] + +* Weeklies +** ''El Latino Expreso'' of [[Trenton, New Jersey]] +** ''La Voz'' of [[Norristown, Pennsylvania]] +** ''The Village News'' of [[Downingtown, Pennsylvania]] +** ''The Times Record'' of [[Kennett Square, Pennsylvania]] +** ''The Tri-County Record'' {{WS|tricountyrecord.com}} of [[Morgantown, Pennsylvania]] +** ''News of Delaware County'' {{WS|newsofdelawarecounty.com}}of [[Havertown, Pennsylvania]] 
+** ''Main Line Times'' {{WS|mainlinetimes.com}}of [[Ardmore, Pennsylvania]] +** ''Penny Pincher'' of [[Pottstown, Pennsylvania]] +** ''Town Talk'' {{WS|towntalknews.com}} of [[Ridley, Pennsylvania]] +* Chesapeake Publishing {{WS|pa8newsgroup.com}} +** ''Solanco Sun Ledger'' of [[Quarryville, Pennsylvania]] +** ''Columbia Ledger'' of [[Columbia, Pennsylvania]] +** ''Coatesville Ledger'' of [[Downingtown, Pennsylvania]] +** ''Parkesburg Post Ledger'' of [[Quarryville, Pennsylvania]] +** ''Downingtown Ledger'' of [[Downingtown, Pennsylvania]] +** ''The Kennett Paper'' of [[Kennett Square, Pennsylvania]] +** ''Avon Grove Sun'' of [[West Grove, Pennsylvania]] +** ''Oxford Tribune'' of [[Oxford, Pennsylvania]] +** ''Elizabethtown Chronicle'' of [[Elizabethtown, Pennsylvania]] +** ''Donegal Ledger'' of [[Donegal, Pennsylvania]] +** ''Chadds Ford Post'' of [[Chadds Ford, Pennsylvania]] +** ''The Central Record'' of [[Medford, New Jersey]] +** ''Maple Shade Progress'' of [[Maple Shade, New Jersey]] +* Intercounty Newspapers {{WS|buckslocalnews.com}} +** ''The Review'' of Roxborough, Pennsylvania +** ''The Recorder'' of [[Conshohocken, Pennsylvania]] +** ''The Leader'' of [[Mount Airy, Pennsylvania|Mount Airy]] and West Oak Lake, Pennsylvania +** ''The Pennington Post'' of [[Pennington, New Jersey]] +** ''The Bristol Pilot'' of [[Bristol, Pennsylvania]] +** ''Yardley News'' of [[Yardley, Pennsylvania]] +** ''New Hope Gazette'' of [[New Hope, Pennsylvania]] +** ''Doylestown Patriot'' of [[Doylestown, Pennsylvania]] +** ''Newtown Advance'' of [[Newtown, Pennsylvania]] +** ''The Plain Dealer'' of [[Williamstown, New Jersey]] +** ''News Report'' of [[Sewell, New Jersey]] +** ''Record Breeze'' of [[Berlin, New Jersey]] +** ''Newsweekly'' of [[Moorestown, New Jersey]] +** ''Haddon Herald'' of [[Haddonfield, New Jersey]] +** ''New Egypt Press'' of [[New Egypt, New Jersey]] +** ''Community News'' of [[Pemberton, New Jersey]] +** ''Plymouth Meeting Journal'' of [[Plymouth Meeting, 
Pennsylvania]] +** ''Lafayette Hill Journal'' of [[Lafayette Hill, Pennsylvania]] +* Montgomery Newspapers {{WS|montgomerynews.com}} +** ''Ambler Gazette'' of [[Ambler, Pennsylvania]] +** ''Central Bucks Life'' of [[Bucks County, Pennsylvania]] +** ''The Colonial'' of [[Plymouth Meeting, Pennsylvania]] +** ''Glenside News'' of [[Glenside, Pennsylvania]] +** ''The Globe'' of [[Lower Moreland Township, Pennsylvania]] +** ''Main Line Life'' of [[Ardmore, Pennsylvania]] +** ''Montgomery Life'' of [[Fort Washington, Pennsylvania]] +** ''North Penn Life'' of [[Lansdale, Pennsylvania]] +** ''Perkasie News Herald'' of [[Perkasie, Pennsylvania]] +** ''Public Spirit'' of [[Hatboro, Pennsylvania]] +** ''Souderton Independent'' of [[Souderton, Pennsylvania]] +** ''Springfield Sun'' of [[Springfield, Pennsylvania]] +** ''Spring-Ford Reporter'' of [[Royersford, Pennsylvania]] +** ''Times Chronicle'' of [[Jenkintown, Pennsylvania]] +** ''Valley Item'' of [[Perkiomenville, Pennsylvania]] +** ''Willow Grove Guide'' of [[Willow Grove, Pennsylvania]] +* News Gleaner Publications (closed December 2008) {{WS|newsgleaner.com}} +** ''Life Newspapers'' of [[Philadelphia, Pennsylvania]] +* Suburban Publications +** ''The Suburban & Wayne Times'' {{WS|waynesuburban.com}} of [[Wayne, Pennsylvania]] +** ''The Suburban Advertiser'' of [[Exton, Pennsylvania]] +** ''The King of Prussia Courier'' of [[King of Prussia, Pennsylvania]] +* Press Newspapers {{WS|countypressonline.com}} +** ''County Press'' of [[Newtown Square, Pennsylvania]] +** ''Garnet Valley Press'' of [[Glen Mills, Pennsylvania]] +** ''Haverford Press'' of [[Newtown Square, Pennsylvania]] (closed January 2009) +** ''Hometown Press'' of [[Glen Mills, Pennsylvania]] (closed January 2009) +** ''Media Press'' of [[Newtown Square, Pennsylvania]] (closed January 2009) +** ''Springfield Press'' of [[Springfield, Pennsylvania]] +* Berks-Mont Newspapers {{WS|berksmontnews.com}} +** ''The Boyertown Area Times'' of [[Boyertown, 
Pennsylvania]] +** ''The Kutztown Area Patriot'' of [[Kutztown, Pennsylvania]] +** ''The Hamburg Area Item'' of [[Hamburg, Pennsylvania]] +** ''The Southern Berks News'' of [[Exeter Township, Berks County, Pennsylvania]] +** ''The Free Press'' of [[Quakertown, Pennsylvania]] +** ''The Saucon News'' of [[Quakertown, Pennsylvania]] +** ''Westside Weekly'' of [[Reading, Pennsylvania]] + +* Magazines +** ''Bucks Co. Town & Country Living'' +** ''Chester Co. Town & Country Living'' +** ''Montomgery Co. Town & Country Living'' +** ''Garden State Town & Country Living'' +** ''Montgomery Homes'' +** ''Philadelphia Golfer'' +** ''Parents Express'' +** ''Art Matters'' + +{{JRC}} + +==References== + + +[[Category:Journal Register publications|*]] diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/matching/Speedtest2.txt b/grobid-core/src/test/java/org/grobid/core/utilities/matching/Speedtest2.txt new file mode 100644 index 0000000000..8f25a80fff --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/utilities/matching/Speedtest2.txt @@ -0,0 +1,188 @@ +This is a '''list of newspapers published by [[Journal Register Company]]'''. + +The company owns daily and weekly newspapers, other print media properties and newspaper-affiliated local Websites in the [[U.S.]] states of [[Connecticut]], [[Michigan]], [[New York]], [[Ohio]], [[Pennsylvania]] and [[New Jersey]], organized in six geographic "clusters":[http://www.journalregister.com/publications.html Journal Register Company: Our Publications], accessed April 21, 2010. + +== Capital-Saratoga == +Three dailies, associated weeklies and [[pennysaver]]s in greater [[Albany, New York]]; also [http://www.capitalcentral.com capitalcentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com]. 
+ +* ''The Oneida Daily Dispatch'' {{WS|oneidadispatch.com}} of [[Oneida, New York]] +* ''[[The Record (Troy)|The Record]]'' {{WS|troyrecord.com}} of [[Troy, New York]] +* ''[[The Saratogian]]'' {{WS|saratogian.com}} of [[Saratoga Springs, New York]] +* Weeklies: +** ''Community News'' {{WS|cnweekly.com}} weekly of [[Clifton Park, New York]] +** ''Rome Observer'' {{WS|romeobserver.com}} of [[Rome, New York]] +** ''WG Life '' {{WS|saratogian.com/wglife/}} of [[Wilton, New York]] +** ''Ballston Spa Life '' {{WS|saratogian.com/bspalife}} of [[Ballston Spa, New York]] +** ''Greenbush Life'' {{WS|troyrecord.com/greenbush}} of [[Troy, New York]] +** ''Latham Life'' {{WS|troyrecord.com/latham}} of [[Latham, New York]] +** ''River Life'' {{WS|troyrecord.com/river}} of [[Troy, New York]] + +== Connecticut == +Three dailies, associated weeklies and [[pennysaver]]s in the state of [[Connecticut]]; also [http://www.ctcentral.com CTcentral.com], [http://www.ctcarsandtrucks.com CTCarsAndTrucks.com] and [http://www.jobsinct.com JobsInCT.com]. 
+ +* ''The Middletown Press'' {{WS|middletownpress.com}} of [[Middletown, Connecticut|Middletown]] +* ''[[New Haven Register]]'' {{WS|newhavenregister.com}} of [[New Haven, Connecticut|New Haven]] +* ''The Register Citizen'' {{WS|registercitizen.com}} of [[Torrington, Connecticut|Torrington]] + +* Housatonic Publications +** ''The Housatonic Times'' {{WS|housatonictimes.com}} of [[New Milford, Connecticut|New Milford]] +** ''Litchfield County Times'' {{WS|countytimes.com}} of [[Litchfield, Connecticut|Litchfield]] + +* Minuteman Publications +** ''[[Fairfield Minuteman]]'' {{WS|fairfieldminuteman.com}}of [[Fairfield, Connecticut|Fairfield]] +** ''The Westport Minuteman'' {{WS|westportminuteman.com}} of [[Westport, Connecticut|Westport]] + +* Shoreline Newspapers +** ''The Dolphin'' {{WS|dolphin-news.com}} of [[Naval Submarine Base New London]] in [[New London, Connecticut|New London]] +** ''Shoreline Times'' {{WS|shorelinetimes.com}} of [[Guilford, Connecticut|Guilford]] + +* Foothills Media Group {{WS|foothillsmediagroup.com}} +** ''Thomaston Express'' {{WS|thomastonexpress.com}} of [[Thomaston, Connecticut|Thomaston]] +** ''Good News About Torrington'' {{WS|goodnewsabouttorrington.com}} of [[Torrington, Connecticut|Torrington]] +** ''Granby News'' {{WS|foothillsmediagroup.com/granby}} of [[Granby, Connecticut|Granby]] +** ''Canton News'' {{WS|foothillsmediagroup.com/canton}} of [[Canton, Connecticut|Canton]] +** ''Avon News'' {{WS|foothillsmediagroup.com/avon}} of [[Avon, Connecticut|Avon]] +** ''Simsbury News'' {{WS|foothillsmediagroup.com/simsbury}} of [[Simsbury, Connecticut|Simsbury]] +** ''Litchfield News'' {{WS|foothillsmediagroup.com/litchfield}} of [[Litchfield, Connecticut|Litchfield]] +** ''Foothills Trader'' {{WS|foothillstrader.com}} of Torrington, Bristol, Canton + +* Other weeklies +** ''The Milford-Orange Bulletin'' {{WS|ctbulletin.com}} of [[Orange, Connecticut|Orange]] +** ''The Post-Chronicle'' {{WS|ctpostchronicle.com}} of [[North Haven, 
Connecticut|North Haven]] +** ''West Hartford News'' {{WS|westhartfordnews.com}} of [[West Hartford, Connecticut|West Hartford]] + +* Magazines +** ''The Connecticut Bride'' {{WS|connecticutmag.com}} +** ''Connecticut Magazine'' {{WS|theconnecticutbride.com}} +** ''Passport Magazine'' {{WS|passport-mag.com}} + +== Michigan == +Four dailies, associated weeklies and [[pennysaver]]s in the state of [[Michigan]]; also [http://www.micentralhomes.com MIcentralhomes.com] and [http://www.micentralautos.com MIcentralautos.com] +* ''[[Oakland Press]]'' {{WS|theoaklandpress.com}} of [[Oakland, Michigan|Oakland]] +* ''Daily Tribune'' {{WS|dailytribune.com}} of [[Royal Oak, Michigan|Royal Oak]] +* ''Macomb Daily'' {{WS|macombdaily.com}} of [[Mt. Clemens, Michigan|Mt. Clemens]] +* ''[[Morning Sun]]'' {{WS|themorningsun.com}} of [[Mount Pleasant, Michigan|Mount Pleasant]] + +* Heritage Newspapers {{WS|heritage.com}} +** ''Belleville View'' {{WS|bellevilleview.com}} +** ''Ile Camera'' {{WS|thenewsherald.com/ile_camera}} +** ''Monroe Guardian'' {{WS|monreguardian.com}} +** ''Ypsilanti Courier'' {{WS|ypsilanticourier.com}} +** ''News-Herald'' {{WS|thenewsherald.com}} +** ''Press & Guide'' {{WS|pressandguide.com}} +** ''Chelsea Standard & Dexter Leader'' {{WS|chelseastandard.com}} +** ''Manchester Enterprise'' {{WS|manchesterguardian.com}} +** ''Milan News-Leader'' {{WS|milannews.com}} +** ''Saline Reporter'' {{WS|salinereporter.com}} +* Independent Newspapers +** ''Advisor'' {{WS|sourcenewspapers.com}} +** ''Source'' {{WS|sourcenewspapers.com}} +* Morning Star {{WS|morningstarpublishing.com}} +** ''The Leader & Kalkaskian'' {{WS|leaderandkalkaskian.com}} +** ''Grand Traverse Insider'' {{WS|grandtraverseinsider.com}} +** ''Alma Reminder'' +** ''Alpena Star'' +** ''Ogemaw/Oscoda County Star'' +** ''Presque Isle Star'' +** ''St. 
Johns Reminder'' + +* Voice Newspapers {{WS|voicenews.com}} +** ''Armada Times'' +** ''Bay Voice'' +** ''Blue Water Voice'' +** ''Downriver Voice'' +** ''Macomb Township Voice'' +** ''North Macomb Voice'' +** ''Weekend Voice'' + +== Mid-Hudson == +One daily, associated magazines in the [[Hudson River Valley]] of [[New York]]; also [http://www.midhudsoncentral.com MidHudsonCentral.com] and [http://www.jobsinnewyork.com JobsInNewYork.com]. + +* ''[[Daily Freeman]]'' {{WS|dailyfreeman.com}} of [[Kingston, New York]] +* ''Las Noticias'' {{WS|lasnoticiasny.com}} of [[Kingston, New York]] + +== Ohio == +Two dailies, associated magazines and three shared Websites, all in the state of [[Ohio]]: [http://www.allaroundcleveland.com AllAroundCleveland.com], [http://www.allaroundclevelandcars.com AllAroundClevelandCars.com] and [http://www.allaroundclevelandjobs.com AllAroundClevelandJobs.com]. + +* ''[[The News-Herald (Ohio)|The News-Herald]]'' {{WS|news-herald.com}} of [[Willoughby, Ohio|Willoughby]] +* ''[[The Morning Journal]]'' {{WS|morningjournal.com}} of [[Lorain, Ohio|Lorain]] +* ''El Latino Expreso'' {{WS|lorainlatino.com}} of [[Lorain, Ohio|Lorain]] + +== Philadelphia area == +Seven dailies and associated weeklies and magazines in [[Pennsylvania]] and [[New Jersey]], and associated Websites: [http://www.allaroundphilly.com AllAroundPhilly.com], [http://www.jobsinnj.com JobsInNJ.com], [http://www.jobsinpa.com JobsInPA.com], and [http://www.phillycarsearch.com PhillyCarSearch.com]. 
+ +* ''[[The Daily Local News]]'' {{WS|dailylocal.com}} of [[West Chester, Pennsylvania|West Chester]] +* ''[[Delaware County Daily and Sunday Times]] {{WS|delcotimes.com}} of Primos [[Upper Darby Township, Pennsylvania]] +* ''[[The Mercury (Pennsylvania)|The Mercury]]'' {{WS|pottstownmercury.com}} of [[Pottstown, Pennsylvania|Pottstown]] +* ''[[The Reporter (Lansdale)|The Reporter]]'' {{WS|thereporteronline.com}} of [[Lansdale, Pennsylvania|Lansdale]] +* ''The Times Herald'' {{WS|timesherald.com}} of [[Norristown, Pennsylvania|Norristown]] +* ''[[The Trentonian]]'' {{WS|trentonian.com}} of [[Trenton, New Jersey]] + +* Weeklies +* ''The Phoenix'' {{WS|phoenixvillenews.com}} of [[Phoenixville, Pennsylvania]] +** ''El Latino Expreso'' {{WS|njexpreso.com}} of [[Trenton, New Jersey]] +** ''La Voz'' {{WS|lavozpa.com}} of [[Norristown, Pennsylvania]] +** ''The Tri County Record'' {{WS|tricountyrecord.com}} of [[Morgantown, Pennsylvania]] +** ''Penny Pincher'' {{WS|pennypincherpa.com}}of [[Pottstown, Pennsylvania]] + +* Chesapeake Publishing {{WS|southernchestercountyweeklies.com}} +** ''The Kennett Paper'' {{WS|kennettpaper.com}} of [[Kennett Square, Pennsylvania]] +** ''Avon Grove Sun'' {{WS|avongrovesun.com}} of [[West Grove, Pennsylvania]] +** ''The Central Record'' {{WS|medfordcentralrecord.com}} of [[Medford, New Jersey]] +** ''Maple Shade Progress'' {{WS|mapleshadeprogress.com}} of [[Maple Shade, New Jersey]] + +* Intercounty Newspapers {{WS|buckslocalnews.com}} {{WS|southjerseylocalnews.com}} +** ''The Pennington Post'' {{WS|penningtonpost.com}} of [[Pennington, New Jersey]] +** ''The Bristol Pilot'' {{WS|bristolpilot.com}} of [[Bristol, Pennsylvania]] +** ''Yardley News'' {{WS|yardleynews.com}} of [[Yardley, Pennsylvania]] +** ''Advance of Bucks County'' {{WS|advanceofbucks.com}} of [[Newtown, Pennsylvania]] +** ''Record Breeze'' {{WS|recordbreeze.com}} of [[Berlin, New Jersey]] +** ''Community News'' {{WS|sjcommunitynews.com}} of [[Pemberton, New Jersey]] + +* 
Montgomery Newspapers {{WS|montgomerynews.com}} +** ''Ambler Gazette'' {{WS|amblergazette.com}} of [[Ambler, Pennsylvania]] +** ''The Colonial'' {{WS|colonialnews.com}} of [[Plymouth Meeting, Pennsylvania]] +** ''Glenside News'' {{WS|glensidenews.com}} of [[Glenside, Pennsylvania]] +** ''The Globe'' {{WS|globenewspaper.com}} of [[Lower Moreland Township, Pennsylvania]] +** ''Montgomery Life'' {{WS|montgomerylife.com}} of [[Fort Washington, Pennsylvania]] +** ''North Penn Life'' {{WS|northpennlife.com}} of [[Lansdale, Pennsylvania]] +** ''Perkasie News Herald'' {{WS|perkasienewsherald.com}} of [[Perkasie, Pennsylvania]] +** ''Public Spirit'' {{WS|thepublicspirit.com}} of [[Hatboro, Pennsylvania]] +** ''Souderton Independent'' {{WS|soudertonindependent.com}} of [[Souderton, Pennsylvania]] +** ''Springfield Sun'' {{WS|springfieldsun.com}} of [[Springfield, Pennsylvania]] +** ''Spring-Ford Reporter'' {{WS|springfordreporter.com}} of [[Royersford, Pennsylvania]] +** ''Times Chronicle'' {{WS|thetimeschronicle.com}} of [[Jenkintown, Pennsylvania]] +** ''Valley Item'' {{WS|valleyitem.com}} of [[Perkiomenville, Pennsylvania]] +** ''Willow Grove Guide'' {{WS|willowgroveguide.com}} of [[Willow Grove, Pennsylvania]] +** ''The Review'' {{WS|roxreview.com}} of [[Roxborough, Philadelphia, Pennsylvania]] + +* Main Line Media News {{WS|mainlinemedianews.com}} +** ''Main Line Times'' {{WS|mainlinetimes.com}} of [[Ardmore, Pennsylvania]] +** ''Main Line Life'' {{WS|mainlinelife.com}} of [[Ardmore, Pennsylvania]] +** ''The King of Prussia Courier'' {{WS|kingofprussiacourier.com}} of [[King of Prussia, Pennsylvania]] + +* Delaware County News Network {{WS|delconewsnetwork.com}} +** ''News of Delaware County'' {{WS|newsofdelawarecounty.com}} of [[Havertown, Pennsylvania]] +** ''County Press'' {{WS|countypressonline.com}} of [[Newtown Square, Pennsylvania]] +** ''Garnet Valley Press'' {{WS|countypressonline.com}} of [[Glen Mills, Pennsylvania]] +** ''Springfield Press'' 
{{WS|countypressonline.com}} of [[Springfield, Pennsylvania]] +** ''Town Talk'' {{WS|towntalknews.com}} of [[Ridley, Pennsylvania]] + +* Berks-Mont Newspapers {{WS|berksmontnews.com}} +** ''The Boyertown Area Times'' {{WS|berksmontnews.com/boyertown_area_times}} of [[Boyertown, Pennsylvania]] +** ''The Kutztown Area Patriot'' {{WS|berksmontnews.com/kutztown_area_patriot}} of [[Kutztown, Pennsylvania]] +** ''The Hamburg Area Item'' {{WS|berksmontnews.com/hamburg_area_item}} of [[Hamburg, Pennsylvania]] +** ''The Southern Berks News'' {{WS|berksmontnews.com/southern_berks_news}} of [[Exeter Township, Berks County, Pennsylvania]] +** ''Community Connection'' {{WS|berksmontnews.com/community_connection}} of [[Boyertown, Pennsylvania]] + +* Magazines +** ''Bucks Co. Town & Country Living'' {{WS|buckscountymagazine.com}} +** ''Parents Express'' {{WS|parents-express.com}} +** ''Real Men, Rednecks'' {{WS|realmenredneck.com}} + +{{JRC}} + +==References== + + +[[Category:Journal Register publications|*]] From a5775233f00af48f8cc2ba79cccb5892d3bc699b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Jul 2022 09:26:11 +0900 Subject: [PATCH 08/10] checking for references by examining also whether it falls in the forbidden intervals --- .../grobid/core/utilities/SentenceUtilities.java | 15 ++++++++++++--- .../core/utilities/SentenceUtilitiesTest.java | 1 - 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java index c421d1a20c..f1c5accc33 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java @@ -197,15 +197,18 @@ public List runSentenceDetection(String text, List runSentenceDetection(String text, List runSentenceDetection(String text, List forbidden) { + return forbidden + .stream().anyMatch(o -> 
currentOffset >= o.start && currentOffset < o.end); + } + /** * Return true if the token should be skipped when considering sentence content. */ @@ -309,7 +318,7 @@ private static boolean toSkipTokenNoHyphen(String tok) { /** - * Return true if the token is a valid numerical reference markers ([0-9,())\-\]\[) in supercript. + * Return true if the token is a valid numerical reference markers ([0-9,())\-\]\[) in superscript. */ private static boolean isValidSuperScriptNumericalReferenceMarker(LayoutToken token) { diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/SentenceUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/SentenceUtilitiesTest.java index de701255ff..398b5c6ece 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/SentenceUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/SentenceUtilitiesTest.java @@ -203,7 +203,6 @@ public void testCorrectSegmentation_shouldCancelWrongSegmentation2() throws Exce } @Test - @Ignore("reproduce issue #753") public void testRealCaseDesynchronisation_shouldReturnCorrectSentences() throws Exception { String text = "CCAligned ) is a 119language 1 parallel dataset built off 68 snapshots of Common Crawl. Documents are aligned if they are in the same language according to FastText LangID (Joulin et al., 2016(Joulin et al., , 2017, and have the same URL but for a differing language code. These alignments are refined with cross-lingual LASER embeddings (Artetxe and Schwenk, 2019). For sentence-level data, they split on newlines and align with LASER, but perform no further filtering. Human annotators evaluated the quality of document alignments for six languages (de, zh, ar, ro, et, my) selected for their different scripts and amount of retrieved documents, reporting precision of over 90%. 
The quality of the extracted parallel sentences is evaluated in a machine translation (MT) task on six European (da, cr, sl, sk, lt, et) languages of the TED corpus (Qi et al., 2018), where it compares favorably to systems built on crawled sentences from WikiMatrix and ParaCrawl (Qi et al., 2018); WMT-5: cs, de, fi, lv, ro. POS/DEP-5: part-of-speech labeling and dependency parsing for bg, ca, da, fi, id."; From 34034a60bf536823eef28f2848946ec4f01141e7 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Jul 2022 10:19:11 +0900 Subject: [PATCH 09/10] revert layout modifications --- .../lang/impl/PragmaticSentenceDetector.java | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java index f0b8940e4f..855dee8039 100644 --- a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java +++ b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java @@ -23,15 +23,16 @@ /** * Implementation of sentence segmentation via the Pragmatic Segmenter + * */ public class PragmaticSentenceDetector implements SentenceDetector { - private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetector.class); + private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetector.class); private ScriptingContainer instance = null; public PragmaticSentenceDetector() { String segmenterRbFile = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation" + - File.separator + "pragmatic_segmenter" + File.separator + "segmenter.rb"; + File.separator + "pragmatic_segmenter"+ File.separator + "segmenter.rb"; String segmenterLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "sentence-segmentation"; /*String unicodeLoadPath = GrobidProperties.getGrobidHomePath() + 
File.separator + "sentence-segmentation" + File.separator + "pragmatic_segmenter" + File.separator + "gem" + File.separator + "gems" + @@ -210,7 +211,7 @@ protected static List getSentenceOffsetsOld(String text, List getSentenceOffsetsOld(String text, List 0) { int newPreviousEnd = start; - while (newPreviousEnd >= 1 && text.charAt(newPreviousEnd - 1) == ' ') { + while(newPreviousEnd >= 1 && text.charAt(newPreviousEnd-1) == ' ') { newPreviousEnd--; if (start - newPreviousEnd > 10) { // this is a break to avoid going too far newPreviousEnd = start; // but look back previous character to cover general case - if (newPreviousEnd >= 1 && text.charAt(newPreviousEnd - 1) == ' ') { + if (newPreviousEnd >= 1 && text.charAt(newPreviousEnd-1) == ' ') { newPreviousEnd--; } } } - result.get(result.size() - 1).end = newPreviousEnd; + result.get(result.size()-1).end = newPreviousEnd; } } } @@ -262,30 +263,31 @@ protected static List getSentenceOffsetsOld(String text, List 10) { // this is a break to avoid going too far - start = previousEnd + 1; + start = previousEnd+1; } } recovered = true; } } - int end = start + chunk.length(); + int end = start+chunk.length(); // in case the last sentence is modified - if (end > text.length() && i == retList.size() - 1) + if (end > text.length() && i == retList.size()-1) end = text.length(); result.add(new OffsetPosition(start, end)); - pos = start + chunk.length(); + pos = start+chunk.length(); if (recovered) previousEnd += 1; else previousEnd = pos; } + return result; } } From c92bc1e883b93f8e15849c09bceff24da36504f6 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 29 Jul 2022 10:54:05 +0900 Subject: [PATCH 10/10] implement the same safe-guard that we had for the python segmenter to avoid matching a very long string --- .../core/lang/impl/PragmaticSentenceDetector.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java 
b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java index 855dee8039..339bae513d 100644 --- a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java +++ b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java @@ -143,7 +143,9 @@ protected static List getSentenceOffsets(String text, List -1) { - start = text.indexOf(sentenceClean, previousEnd); + String subString = StringUtils.substring(text, previousEnd, previousEnd + 2 * sentenceClean.length()); + int relativeIndexOf = subString.indexOf(sentenceClean); + start = relativeIndexOf > -1 ? relativeIndexOf + previousEnd : relativeIndexOf; } else { start = text.indexOf(sentenceClean); } @@ -152,7 +154,9 @@ protected static List getSentenceOffsets(String text, List -1) { - start = text.replace("\n", " ").indexOf(sentenceClean, previousEnd); + String subString = StringUtils.substring(text, previousEnd, previousEnd + 2 * sentenceClean.length()); + int relativeIndexOf = subString.replace("\n", " ").indexOf(sentenceClean); + start = relativeIndexOf > -1 ? relativeIndexOf + previousEnd : relativeIndexOf; } else { start = text.replace("\n", " ").indexOf(sentenceClean); } @@ -162,13 +166,13 @@ protected static List getSentenceOffsets(String text, List -1) { - textAdapted = text.substring(previousEnd); + textAdapted = StringUtils.substring(text, previousEnd, previousEnd + 2 * sentenceClean.length()); Pair inText = findInText(sentenceClean, textAdapted); start = inText.getRight(); outputStr = inText.getLeft(); start += previousEnd; } else if (previousStart > -1) { - textAdapted = text.substring(previousStart); + textAdapted = StringUtils.substring(text, previousStart, previousStart + 2 * sentenceClean.length()); Pair inText = findInText(sentenceClean, textAdapted); start = inText.getRight(); outputStr = inText.getLeft();