diff --git a/grobid-core/src/main/java/org/grobid/core/document/Document.java b/grobid-core/src/main/java/org/grobid/core/document/Document.java index db02e25aa0..94958d752f 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/Document.java +++ b/grobid-core/src/main/java/org/grobid/core/document/Document.java @@ -1569,7 +1569,7 @@ public void assignGraphicObjectsToFigures() { if (realCaptionTokens != null && !realCaptionTokens.isEmpty()) { f.setLayoutTokens(realCaptionTokens); f.setTextArea(BoundingBoxCalculator.calculate(realCaptionTokens)); - f.setCaption(new StringBuilder(LayoutTokensUtil.toText(TextUtilities.dehyphenize(realCaptionTokens)))); + f.setCaption(new StringBuilder(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(realCaptionTokens)))); pageFigures.add(f); } } diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index d5f197ad47..bd7ee0013a 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1280,7 +1280,7 @@ private StringBuilder toTEITextPiece(StringBuilder buffer, curParagraph.appendChild(clusterContent); } else if (MARKER_LABELS.contains(clusterLabel)) { List refTokens = cluster.concatTokens(); - refTokens = TextUtilities.dehyphenize(refTokens); + refTokens = LayoutTokensUtil.dehyphenize(refTokens); String chunkRefString = LayoutTokensUtil.toText(refTokens); Element parent = curParagraph != null ? 
curParagraph : curDiv; diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/LayoutTokensUtil.java b/grobid-core/src/main/java/org/grobid/core/utilities/LayoutTokensUtil.java index b794bc3911..fb34c5796e 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/LayoutTokensUtil.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/LayoutTokensUtil.java @@ -13,6 +13,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -54,7 +55,7 @@ public static String normalizeText(List tokens) { } public static String normalizeDehyphenizeText(List tokens) { - return StringUtils.normalizeSpace(LayoutTokensUtil.toText(TextUtilities.dehyphenize(tokens)).replace("\n", " ")); + return StringUtils.normalizeSpace(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(tokens)).replace("\n", " ")); } public static String toText(List tokens) { @@ -65,7 +66,7 @@ public static boolean noCoords(LayoutToken t) { return t.getPage() == -1 || t.getWidth() <= 0; } - + public static boolean spaceyToken(String tok) { /*return (tok.equals(" ") || tok.equals("\u00A0")
@@ -175,34 +176,124 @@ public static String getCoordsStringForOneBox(List toks) {
     }

+    /**
+     * Removes the hyphens that split a word across a line break and returns the resulting tokens.
+     * <p>
+     * For each "-" token, {@link #doesRequireDehypenisation(List, int)} decides whether it is dropped.
+     * When it is dropped, the spaces already copied to the output right before it and the spaces and
+     * line breaks following it are dropped too. When it is kept, only the line breaks directly
+     * following it are dropped. All the other tokens are copied to the output unchanged.
+     *
+     * @param tokens the tokens to process; this list is only read
+     * @return a new list with the retained tokens
+     */
     public static List dehyphenize(List tokens) {
-        PeekingIterator it = Iterators.peekingIterator(tokens.iterator());
-        List result = new ArrayList<>();
-        boolean normalized = false;
-
-        LayoutToken prev = null;
-        while (it.hasNext()) {
-            LayoutToken cur = it.next();
-            //the current token is dash, next is new line, and previous one is some sort of word
-            if (cur.isNewLineAfter() && cur.getText().equals("-") && (prev != null) && (!prev.getText().trim().isEmpty())) {
-                it.next();
-                if (it.hasNext()) {
-                    LayoutToken next = it.next();
-                    if (next.getText().equals("conjugated") || prev.getText().equals("anti")) {
-                        result.add(cur);
+        List output = new ArrayList<>();
+
+        for (int i = 0; i < tokens.size(); i++) {
+            LayoutToken currentToken = tokens.get(i);
+            // the current token is a dash: check what is around it
+            if (currentToken.getText().equals("-")) {
+                if (doesRequireDehypenisation(tokens, i)) {
+                    // Remove the spaces before the hyphen that have already been written to the output
+                    int z = output.size() - 1;
+                    while (z >= 0 && output.get(z).getText().equals(" ")) {
+                        output.remove(z);
+                        z--;
+                    }
+
+                    // Skip the spaces and line breaks after the removed hyphen (the bounds check guards both lookups)
+                    int j = i + 1;
+                    while (j < tokens.size() && (tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n"))) {
+                        j++;
+                    }
+                    i = j - 1;
+                } else {
+                    output.add(currentToken);
+
+                    // The hyphen is kept: only the line breaks directly after it are skipped
+                    int j = i + 1;
+                    while (j < tokens.size() && tokens.get(j).getText().equals("\n")) {
+                        j++;
                     }
-                    result.add(next);
-                    normalized = true;
+                    i = j - 1;
+
                 }
             } else {
-                result.add(cur);
+                output.add(currentToken);
+            }
+        }
+        return output;
+    }
+
+    /**
+     * Checks whether the hyphen at position {@code i} splits a word and therefore has to be removed.
+     * <p>
+     * The spaces and line breaks after the hyphen are skipped to reach the next token. A line break is
+     * detected either from a "\n" token or from the Y coordinates of the tokens. The hyphen is kept
+     * unless the next token is made only of lowercase letters. If the hyphen is the first token, or if
+     * the next token has a greater Y coordinate than the hyphen, that is enough to remove it; otherwise
+     * the token reached by walking back over the spaces and line breaks before the hyphen is checked too.
+     * <p>
+     * TODO: What to do in case a punctuation is found?
+     *
+     * @param tokens the tokens containing the hyphen
+     * @param i      the index of the hyphen in {@code tokens}
+     * @return {@code true} if the hyphen has to be removed, {@code false} if it has to be kept
+     */
+    protected static boolean doesRequireDehypenisation(List tokens, int i) {
+        boolean forward = false;
+        boolean backward = false;
+
+        int j = i + 1;
+        int breakLine = 0;
+
+        double coordinateY = tokens.get(i).getY();
+
+        // Skip the spaces and line breaks after the hyphen, counting the "\n" tokens
+        while (j < tokens.size() && (tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n"))) {
+            if (tokens.get(j).getText().equals("\n")) {
+                breakLine++;
+            }
+            j++;
+        }
+
+        if (breakLine == 0) {
+            // check if there is a line break using the coordinates; if not, no dehyphenisation
+            if (j < tokens.size() && tokens.get(j).getY() == coordinateY) {
+                return false;
+            }
+        }
+
+        if (j < tokens.size()) {
+            forward = StringUtils.isAllLowerCase(tokens.get(j).getText());
+            if (forward) {
+                // If there is nothing before the hyphen, but it looks like a forward hyphenation, let's trust it
+                if (i < 1) {
+                    return forward;
+                }
+
+                // If the Y coordinate has increased, there is a line break
+                if (tokens.get(j).getY() > coordinateY) {
+                    return forward;
+                }
+
+                // Check backward
+                int z = i - 1;
+                while (z > 0 && (tokens.get(z).getText().equals(" ") || tokens.get(z).getText().equals("\n"))) {
+                    z--;
+                }
+
+                if (StringUtils.isAlpha(tokens.get(z).getText())) {
+                    if (tokens.get(z).getY() < coordinateY) {
+                        backward = true;
+                    } else if (coordinateY == -1 && breakLine > 0) {
+                        backward = true;
+                    }
+                }
             }
-            prev = cur;
         }
-        /*if (normalized) {
-            System.out.println("NORMALIZED: " + sb.toString());
-        }*/
-        return result;
+        return backward;
     }

     public static List subListByOffset(List token, int startIncluded) {
diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index
315f94957e..c3a4242de5 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -101,122 +101,16 @@ private static int getLastPunctuationCharacter(String section) { return res; } + /** @deprecated use {@link LayoutTokensUtil#dehyphenize(List)} instead. */ + @Deprecated public static List dehyphenize(List tokens) { - List output = new ArrayList<>(); - - for (int i = 0; i < tokens.size(); i++) { - LayoutToken currentToken = tokens.get(i); - //the current token is dash checking what's around - if (currentToken.getText().equals("-")) { - if (doesRequireDehypenisation(tokens, i)) { - //Cleanup eventual additional spaces before the hypen that have been already written to the output - int z = output.size() - 1; - while (z >= 0 && output.get(z).getText().equals(" ")) { - String tokenString = output.get(z).getText(); - - if (tokenString.equals(" ")) { - output.remove(z); - } - z--; - } - - - List breakLines = new ArrayList<>(); - List spaces = new ArrayList<>(); - - int j = i + 1; - while (j < tokens.size() && tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n")) { - String tokenString = tokens.get(j).getText(); - - if (tokenString.equals("\n")) { - breakLines.add(j); - } - if (tokenString.equals(" ")) { - spaces.add(j); - } - j++; - } - i += breakLines.size() + spaces.size(); - } else { - output.add(currentToken); - - List breakLines = new ArrayList<>(); - List spaces = new ArrayList<>(); - - int j = i + 1; - while (j < tokens.size() && tokens.get(j).getText().equals("\n")) { - String tokenString = tokens.get(j).getText(); - - if (tokenString.equals("\n")) { - breakLines.add(j); - } - j++; - } - i += breakLines.size() + spaces.size(); - - } - } else { - output.add(currentToken); - } - } - return output; + return LayoutTokensUtil.dehyphenize(tokens); } - /** - * Check if the current token (place i), or the hypen, needs to be removed or not. - *

- * It will check the tokens before and after. It will get to the next "non space" tokens and verify - * that it's a plain word. If it's not it's keeping the hypen. - *

- * TODO: add the check on the bounding box of the next token to see whether there is really a break line. - * TODO: What to do in case of a punctuation is found? - */ + /** @deprecated use {@link LayoutTokensUtil#doesRequireDehypenisation(List, int)} instead. */ + @Deprecated protected static boolean doesRequireDehypenisation(List tokens, int i) { - boolean forward = false; - boolean backward = false; - - int j = i + 1; - int breakLine = 0; - while (j < tokens.size() && (tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n"))) { - String tokenString = tokens.get(j).getText(); - - if (tokenString.equals("\n")) { - breakLine++; - } - j++; - } - - if (breakLine == 0) { - return false; - } - - Pattern onlyLowercaseLetters = Pattern.compile("[a-z]+"); - - if (j < tokens.size()) { - Matcher matcher = onlyLowercaseLetters.matcher(tokens.get(j).getText()); - if (matcher.find()) { - forward = true; - } - - if (forward) { - if(i < 1) { - //If nothing before the hypen, but it looks like a forward hypenisation, let's trust it - return forward; - } - - int z = i - 1; - while (z > 0 && tokens.get(z).getText().equals(" ")) { - z--; - } - - Matcher backwardMatcher = Pattern.compile("^[A-Za-z]+$").matcher(tokens.get(z).getText()); - if (backwardMatcher.find()) { - backward = true; - } - } - } - - return backward; + return LayoutTokensUtil.doesRequireDehypenisation(tokens, i); } public static String dehyphenize(String text) { @@ -224,7 +118,7 @@ public static String dehyphenize(String text) { final List layoutTokens = analyser.tokenizeWithLayoutToken(text); - return LayoutTokensUtil.toText(dehyphenize(layoutTokens)); + return LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(layoutTokens)); } public static String getLastToken(String section) { @@ -268,7 +162,7 @@ public static String getFirstToken(String section) { * @return Returns the dehyphenized string. *

* Deprecated method, not needed anymore since the @newline are preserved thanks to the LayoutTokens - * Use dehypenize + * @deprecated use {@link #dehyphenize(String)} or {@link LayoutTokensUtil#dehyphenize(List)} instead */ @Deprecated public static String dehyphenizeHard(String text) { diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/matching/ReferenceMarkerMatcher.java b/grobid-core/src/main/java/org/grobid/core/utilities/matching/ReferenceMarkerMatcher.java index 3bbc9b0753..a74a1e684e 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/matching/ReferenceMarkerMatcher.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/matching/ReferenceMarkerMatcher.java @@ -131,7 +131,7 @@ public Object apply(BibDataSet bibDataSet) { public List match(List refTokens) throws EntityMatcherException { cntManager.i(ReferenceMarkerMatcherCounters.INPUT_REF_STRINGS_CNT); - String text = LayoutTokensUtil.toText(TextUtilities.dehyphenize(LayoutTokensUtil.enrichWithNewLineInfo(refTokens))); + String text = LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(LayoutTokensUtil.enrichWithNewLineInfo(refTokens))); if (isAuthorCitationStyle(text)) { cntManager.i(ReferenceMarkerMatcherCounters.STYLE_AUTHORS); @@ -329,12 +329,12 @@ private static List>> splitAuthors(List ys : LayoutTokensUtil.split(splitTokens, AND_WORD_PATTERN, true)) { - result.add(new Pair<>(LayoutTokensUtil.toText(TextUtilities.dehyphenize(ys)), ys)); + result.add(new Pair<>(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(ys)), ys)); } } else if (matchCount > 1) { List> yearSplit = LayoutTokensUtil.split(splitTokens, YEAR_PATTERN, true, false); if (yearSplit.isEmpty()) { - result.add(new Pair<>(LayoutTokensUtil.toText(TextUtilities.dehyphenize(splitTokens)), splitTokens)); + result.add(new Pair<>(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(splitTokens)), splitTokens)); } else { if (matchCount(splitTokens, AUTHOR_NAME_PATTERN) == 1) { // cases like Grafton et al. 
1995, 1998; @@ -346,24 +346,24 @@ private static List>> splitAuthors(List firstYearSplitItem; firstYearSplitItem = yearSplit.get(0); - result.add(new Pair<>(LayoutTokensUtil.toText(TextUtilities.dehyphenize(firstYearSplitItem)), firstYearSplitItem)); + result.add(new Pair<>(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(firstYearSplitItem)), firstYearSplitItem)); List excludedYearToks = firstYearSplitItem.subList(0, firstYearSplitItem.size() - 1); - String authorName = LayoutTokensUtil.toText(TextUtilities.dehyphenize(excludedYearToks)); + String authorName = LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(excludedYearToks)); for (int i = 1; i < yearSplit.size(); i++) { List toksI = yearSplit.get(i); - result.add(new Pair<>(authorName + " " + LayoutTokensUtil.toText(TextUtilities.dehyphenize(toksI)), toksI.subList(toksI.size() - 1, toksI.size()))); + result.add(new Pair<>(authorName + " " + LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(toksI)), toksI.subList(toksI.size() - 1, toksI.size()))); } } else { // case when two authors still appear for (List item : yearSplit) { - result.add(new Pair<>(LayoutTokensUtil.toText(TextUtilities.dehyphenize(item)), item)); + result.add(new Pair<>(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(item)), item)); } } } } else { - result.add(new Pair<>(LayoutTokensUtil.toText(TextUtilities.dehyphenize(splitTokens)), splitTokens)); + result.add(new Pair<>(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(splitTokens)), splitTokens)); } } return result; diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/LayoutTokensUtilIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/LayoutTokensUtilIntegrationTest.java new file mode 100644 index 0000000000..94331b0e0b --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/utilities/LayoutTokensUtilIntegrationTest.java @@ -0,0 +1,44 @@ +package org.grobid.core.utilities; + +import org.grobid.core.document.Document; +import 
org.grobid.core.document.DocumentSource; +import org.grobid.core.engines.Engine; +import org.grobid.core.engines.config.GrobidAnalysisConfig; +import org.grobid.core.main.LibraryLoader; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; + +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.assertThat; + +public class LayoutTokensUtilIntegrationTest { + + @BeforeClass + public static void setUp() throws Exception { + LibraryLoader.load(); + GrobidProperties.getInstance(); + } + + @Test + public void testDoesRequireDehyphenization2() throws Exception { + + DocumentSource documentSource = DocumentSource.fromPdf(new File("src/test/resources/org/grobid/core/utilities/dehypenisation1.pdf")); + Document result = Engine.getEngine(false).getParsers().getSegmentationParser().processing(documentSource, GrobidAnalysisConfig.defaultInstance()); + + assertThat(LayoutTokensUtil.doesRequireDehypenisation(result.getTokenizations(), 7), is(true)); + + } + + @Test + public void testDoesRequireDehyphenization() throws Exception { + + DocumentSource documentSource = DocumentSource.fromPdf(new File("src/test/resources/org/grobid/core/utilities/dehypenisation2.pdf")); + Document result = Engine.getEngine(false).getParsers().getSegmentationParser().processing(documentSource, GrobidAnalysisConfig.defaultInstance()); + + assertThat(LayoutTokensUtil.doesRequireDehypenisation(result.getTokenizations(), 7), is(true)); + + } + +} \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/LayoutTokensUtilTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/LayoutTokensUtilTest.java index 5ba07cd48c..a54c944e61 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/LayoutTokensUtilTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/LayoutTokensUtilTest.java @@ -6,6 +6,7 @@ import org.junit.Test; import java.util.List; +import java.util.stream.IntStream; import static 
org.hamcrest.CoreMatchers.is; import static org.hamcrest.Matchers.hasSize; @@ -13,6 +14,95 @@ public class LayoutTokensUtilTest { + /** + * We fake the new line in the layout token coordinates + */ + @Test + public void testDoesRequireDehyphenization_shouldReturnTrue() throws Exception { + String input = "The study of iron-based supercondu- \n" + + "ctors superconductivity in the iron-pnictide LaFeAsO 1-x F x has been expanding and has \n"; + + List layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + assertThat(LayoutTokensUtil.doesRequireDehypenisation(layoutTokens, 11), is(true)); + } + + @Test + public void testDoesRequireDehyphenization2_shouldReturnTrue() throws Exception { + String input = "The study of iron-based supercondu - \n" + + "ctors superconductivity in the iron-pnictide LaFeAsO 1-x F x has been expanding and has \n"; + + List layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + assertThat(LayoutTokensUtil.doesRequireDehypenisation(layoutTokens, 12), is(true)); + } + + @Test + public void testDoesRequireDehyphenization_composedWords_shouldReturnFalse() throws Exception { + String input = "The study of iron-based supercondu - \n" + + "ctors superconductivity in the iron-pnictide LaFeAsO 1-x F x has been expanding and has \n"; + + List layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + assertThat(LayoutTokensUtil.doesRequireDehypenisation(layoutTokens, 7), is(false)); + assertThat(LayoutTokensUtil.doesRequireDehypenisation(layoutTokens, 24), is(false)); + } + + @Test + public void testDoesRequireDehyphenization2_composedWords_shouldReturnFalse() throws Exception { + String input = "The study of iron- based supercondu - \n" + + "ctors superconductivity in the iron-pnictide LaFeAsO 1-x F x has been expanding and has \n"; + + List layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + 
assertThat(LayoutTokensUtil.doesRequireDehypenisation(layoutTokens, 7), is(false)); + } + + @Test + public void testDoesRequireDehyphenization3_composedWords_shouldReturnFalse() throws Exception { + String input = "The study of iron - based supercondu - \n" + + "ctors superconductivity in the iron-pnictide LaFeAsO 1-x F x has been expanding and has \n"; + + List layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + assertThat(LayoutTokensUtil.doesRequireDehypenisation(layoutTokens, 8), is(false)); + } + + @Test + public void testDoesRequireDehyphenization_usingCoordinates_shouldReturnTrue() throws Exception { + String input = "The study of iron-based supercondu - " + + "ctors superconductivity in the iron-pnictide LaFeAsO 1-x F x has been expanding and has \n"; + + List layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + IntStream.range(0, 15).forEach(i -> layoutTokens.get(i).setY(10)); + IntStream.range(15, layoutTokens.size()).forEach(i -> layoutTokens.get(i).setY(30)); + + assertThat(LayoutTokensUtil.doesRequireDehypenisation(layoutTokens, 12), is(true)); + } + +// @Test +// public void testDoesRequireDehyphenization_withoutNewLine() throws Exception { +// String input = "The study of iron-based supercondu - " + +// "ctors superconductivity in the iron-pnictide LaFeAsO 1-x F x has been expanding and has \n"; +// +// List layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); +// +// IntStream.range(0, 15).forEach(i -> layoutTokens.get(i).setY(10)); +// IntStream.range(15, layoutTokens.size()).forEach(i -> layoutTokens.get(i).setY(30)); +// +// assertThat(LayoutTokensUtil.doesRequireDehypenisation(layoutTokens, 12), is(true)); +// } + + + @Test + public void testDoesRequireDehyphenization_hypenAtEndOfString_shouldReturnFalse() throws Exception { + String input = "The study of iron-based supercondu-"; + + List layoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + 
assertThat(LayoutTokensUtil.doesRequireDehypenisation(layoutTokens, 11), is(false)); + } @Test public void testSubList() throws Exception { diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java index d6e3c3dd21..ac69dd4915 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java @@ -285,6 +285,18 @@ public void testDoesRequireDehypenisation_usualWord_shouldReturnFalse() { assertThat(TextUtilities.doesRequireDehypenisation(tokens, 9), is(false)); } + @Test + public void testDoesRequireDehypenisation_usualWordWithSpace_shouldReturnFalse() { + List tokens = GrobidDefaultAnalyzer.getInstance().tokenizeWithLayoutToken("This is a sample open- source text"); + assertThat(TextUtilities.doesRequireDehypenisation(tokens, 9), is(false)); + } + + @Test + public void testDoesRequireDehypenisation_usualWordWith2Space_shouldReturnFalse() { + List tokens = GrobidDefaultAnalyzer.getInstance().tokenizeWithLayoutToken("This is a sample open - source text"); + assertThat(TextUtilities.doesRequireDehypenisation(tokens, 9), is(false)); + } + @Test public void testDoesRequireDehypenisation_sequence_shouldReturnFalse() { List tokens = GrobidDefaultAnalyzer.getInstance().tokenizeWithLayoutToken("This is a sample ABC123-3434 text"); @@ -369,8 +381,6 @@ public void testDoesRequireDehypenisation_falseFriend3_shouldReturnTrue() { assertThat(TextUtilities.doesRequireDehypenisation(tokens, 19), is(false)); } - - @Test public void testIsAllUpperCaseOrDigitOrDot() throws Exception { assertThat(TextUtilities.isAllUpperCaseOrDigitOrDot("this"), is(false)); diff --git a/grobid-core/src/test/resources/org/grobid/core/utilities/dehypenisation1.pdf b/grobid-core/src/test/resources/org/grobid/core/utilities/dehypenisation1.pdf new file mode 100644 index 0000000000..79d7606832 Binary 
files /dev/null and b/grobid-core/src/test/resources/org/grobid/core/utilities/dehypenisation1.pdf differ diff --git a/grobid-core/src/test/resources/org/grobid/core/utilities/dehypenisation2.pdf b/grobid-core/src/test/resources/org/grobid/core/utilities/dehypenisation2.pdf new file mode 100644 index 0000000000..60860c544e Binary files /dev/null and b/grobid-core/src/test/resources/org/grobid/core/utilities/dehypenisation2.pdf differ