Merge pull request #498 from kermitt2/improved-dehypenisation

Improved dehypenisation Former-commit-id: 472324a
kermitt2 · Sep 28, 2019 · 1adc351 · 1adc351
2 parents 8d5afd2 + 794297e
commit 1adc351
Show file tree

Hide file tree

Showing 10 changed files with 289 additions and 150 deletions.
diff --git a/grobid-core/src/main/java/org/grobid/core/document/Document.java b/grobid-core/src/main/java/org/grobid/core/document/Document.java
@@ -1569,7 +1569,7 @@ public void assignGraphicObjectsToFigures() {
                 if (realCaptionTokens != null && !realCaptionTokens.isEmpty()) {
                     f.setLayoutTokens(realCaptionTokens);
                     f.setTextArea(BoundingBoxCalculator.calculate(realCaptionTokens));
-                    f.setCaption(new StringBuilder(LayoutTokensUtil.toText(TextUtilities.dehyphenize(realCaptionTokens))));
+                    f.setCaption(new StringBuilder(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(realCaptionTokens))));
                     pageFigures.add(f);
                 }
             }

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1280,7 +1280,7 @@ private StringBuilder toTEITextPiece(StringBuilder buffer,
                 curParagraph.appendChild(clusterContent);
             } else if (MARKER_LABELS.contains(clusterLabel)) {
                 List<LayoutToken> refTokens = cluster.concatTokens();
-                refTokens = TextUtilities.dehyphenize(refTokens);
+                refTokens = LayoutTokensUtil.dehyphenize(refTokens);
                 String chunkRefString = LayoutTokensUtil.toText(refTokens);
 
                 Element parent = curParagraph != null ? curParagraph : curDiv;

diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/LayoutTokensUtil.java b/grobid-core/src/main/java/org/grobid/core/utilities/LayoutTokensUtil.java
@@ -13,6 +13,7 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
@@ -54,7 +55,7 @@ public static String normalizeText(List<LayoutToken> tokens) {
     }
 
     public static String normalizeDehyphenizeText(List<LayoutToken> tokens) {
-        return StringUtils.normalizeSpace(LayoutTokensUtil.toText(TextUtilities.dehyphenize(tokens)).replace("\n", " "));
+        return StringUtils.normalizeSpace(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(tokens)).replace("\n", " "));
     }
 
     public static String toText(List<LayoutToken> tokens) {
@@ -65,7 +66,7 @@ public static boolean noCoords(LayoutToken t) {
         return t.getPage() == -1 || t.getWidth() <= 0;
     }
 
-    
+
     public static boolean spaceyToken(String tok) {
         /*return (tok.equals(" ")
                 || tok.equals("\u00A0")
@@ -175,34 +176,134 @@ public static String getCoordsStringForOneBox(List<LayoutToken> toks) {
     }
 
     public static List<LayoutToken> dehyphenize(List<LayoutToken> tokens) {
-        PeekingIterator<LayoutToken> it = Iterators.peekingIterator(tokens.iterator());
-        List<LayoutToken> result = new ArrayList<>();
-        boolean normalized = false;
-
-        LayoutToken prev = null;
-        while (it.hasNext()) {
-            LayoutToken cur = it.next();
-            //the current token is dash, next is new line, and previous one is some sort of word
-            if (cur.isNewLineAfter() && cur.getText().equals("-") && (prev != null) && (!prev.getText().trim().isEmpty())) {
-                it.next();
-                if (it.hasNext()) {
-                    LayoutToken next = it.next();
-                    if (next.getText().equals("conjugated") || prev.getText().equals("anti")) {
-                        result.add(cur);
+        // Removes the hyphens that split a word across a line break.
+        //
+        // tokens: the layout tokens to process; this list is only read, never modified.
+        // Returns a new list with the same token instances minus, for each hyphen that
+        // doesRequireDehypenisation() reports as a word break, the hyphen itself, the
+        // spaces written just before it and the spaces / line breaks just after it.
+        // A hyphen that is kept still loses the line breaks that directly follow it.
+
+        List<LayoutToken> output = new ArrayList<>();
+
+        for (int i = 0; i < tokens.size(); i++) {
+            LayoutToken currentToken = tokens.get(i);
+
+            // only a hyphen needs a look at its neighbours, any other token is copied as is
+            if (currentToken.getText().equals("-")) {
+                if (doesRequireDehypenisation(tokens, i)) {
+                    // the hyphen is not copied; also remove the spaces preceding it that
+                    // have already been written to the output
+                    int z = output.size() - 1;
+                    while (z >= 0 && output.get(z).getText().equals(" ")) {
+                        output.remove(z);
+                        z--;
+                    }
+
+                    // skip the spaces and line breaks after the hyphen, so that the two
+                    // halves of the word end up adjacent in the output
+                    int j = i + 1;
+                    // The bounds check has to guard both comparisons. Written without the
+                    // inner parentheses, "a && b || c" is parsed as "(a && b) || c" and
+                    // tokens.get(j) would be called with j == tokens.size() as soon as
+                    // the scan reaches the end of the list.
+                    while (j < tokens.size()
+                            && (tokens.get(j).getText().equals(" ")
+                            || tokens.get(j).getText().equals("\n"))) {
+                        j++;
+                    }
+
+                    // continue with the token at j: i is set to the last skipped token
+                    // and the loop increment then moves past it
+                    i = j - 1;
+                } else {
+                    // a real hyphen: keep it in the output
+                    output.add(currentToken);
+
+                    // only the line breaks directly after a kept hyphen are skipped,
+                    // a space is copied as usual by the following iterations
+                    int j = i + 1;
+                    while (j < tokens.size() && tokens.get(j).getText().equals("\n")) {
+                        j++;
                     }
-                    result.add(next);
-                    normalized = true;
+                    i = j - 1;
+
                 }
             } else {
-                result.add(cur);
+                output.add(currentToken);
+            }
+        }
+        return output;
+    }
+
+    /**
+     * Check if the hyphen at position i is an end-of-line word break that needs to be removed.
+     * <p>
+     * The method looks for the first token after the hyphen that is neither a space nor a line
+     * break. It returns false when there is no such token, when that token is not accepted by
+     * StringUtils.isAllLowerCase(), or when nothing signals a line break between the two (no
+     * "\n" token and the same Y coordinate). It returns true when the token is accepted and
+     * the hyphen is the very first token, or the token has a greater Y than the hyphen.
+     * Otherwise the closest non-blank token before the hyphen decides: the answer is true only
+     * if StringUtils.isAlpha() accepts it and either it has a smaller Y than the hyphen, or the
+     * hyphen has Y == -1 (presumably "no coordinates", to be confirmed) and a "\n" follows it.
+     * <p>
+     * TODO: What to do in case of a punctuation is found?
+     *
+     * @param tokens the token sequence the hyphen belongs to
+     * @param i      the index of the hyphen in tokens
+     * @return true if the hyphen has to be dropped, false if it has to be kept
+     */
+    protected static boolean doesRequireDehypenisation(List<LayoutToken> tokens, int i) {
+        boolean backward = false;
+
+        int j = i + 1;
+        int breakLine = 0;
+        double coordinateY = tokens.get(i).getY();
+
+        // Move j to the first token after the hyphen that is not a blank and count the explicit
+        // line breaks on the way. Only " " and "\n" tokens are visited by this loop, so a line
+        // change that shows in the coordinates alone is looked for below, on token j.
+        while (j < tokens.size() && (tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n"))) {
+            if (tokens.get(j).getText().equals("\n")) {
+                breakLine++;
+            }
+            j++;
+        }
+
+        // without a "\n" token, the coordinates tell whether there is a break-line:
+        // if the next token is on the same line, no dehyphenisation
+        if (breakLine == 0 && j < tokens.size() && tokens.get(j).getY() == coordinateY) {
+            return false;
+        }
+
+        if (j < tokens.size()) {
+            if (StringUtils.isAllLowerCase(tokens.get(j).getText())) {
+                //If nothing before the hyphen, but it looks like a forward hyphenisation, let's trust it
+                if (i < 1) {
+                    return true;
+                }
+
+                //I check if the coordinates have changed, this means there is a newline
+                if (tokens.get(j).getY() > coordinateY) {
+                    return true;
+                }
+
+                // Check backward: go to the closest token before the hyphen that is not a blank
+                int z = i - 1;
+                while (z > 0 && (tokens.get(z).getText().equals(" ") || tokens.get(z).getText().equals("\n"))) {
+                    z--;
+                }
+
+                if (StringUtils.isAlpha(tokens.get(z).getText())) {
+                    if (tokens.get(z).getY() < coordinateY || (coordinateY == -1 && breakLine > 0)) {
+                        backward = true;
+                    }
+                }
             }
-            prev = cur;
         }
 
-        /*if (normalized) {
-            System.out.println("NORMALIZED: " + sb.toString());
-        }*/
-        return result;
+        return backward;
     }
 
     public static List<LayoutToken> subListByOffset(List<LayoutToken> token, int startIncluded) {

diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
@@ -101,130 +101,24 @@ private static int getLastPunctuationCharacter(String section) {
         return res;
     }
 
+    /** @deprecated moved to LayoutTokensUtil, use {@link LayoutTokensUtil#dehyphenize(List)} instead **/
+    @Deprecated
     public static List<LayoutToken> dehyphenize(List<LayoutToken> tokens) {
-        List<LayoutToken> output = new ArrayList<>();
-
-        for (int i = 0; i < tokens.size(); i++) {
-            LayoutToken currentToken = tokens.get(i);
-            //the current token is dash checking what's around
-            if (currentToken.getText().equals("-")) {
-                if (doesRequireDehypenisation(tokens, i)) {
-                    //Cleanup eventual additional spaces before the hypen that have been already written to the output
-                    int z = output.size() - 1;
-                    while (z >= 0 && output.get(z).getText().equals(" ")) {
-                        String tokenString = output.get(z).getText();
-
-                        if (tokenString.equals(" ")) {
-                            output.remove(z);
-                        }
-                        z--;
-                    }
-
-
-                    List<Integer> breakLines = new ArrayList<>();
-                    List<Integer> spaces = new ArrayList<>();
-
-                    int j = i + 1;
-                    while (j < tokens.size() && tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n")) {
-                        String tokenString = tokens.get(j).getText();
-
-                        if (tokenString.equals("\n")) {
-                            breakLines.add(j);
-                        }
-                        if (tokenString.equals(" ")) {
-                            spaces.add(j);
-                        }
-                        j++;
-                    }
-                    i += breakLines.size() + spaces.size();
-                } else {
-                    output.add(currentToken);
-
-                    List<Integer> breakLines = new ArrayList<>();
-                    List<Integer> spaces = new ArrayList<>();
-
-                    int j = i + 1;
-                    while (j < tokens.size() && tokens.get(j).getText().equals("\n")) {
-                        String tokenString = tokens.get(j).getText();
-
-                        if (tokenString.equals("\n")) {
-                            breakLines.add(j);
-                        }
-                        j++;
-                    }
-                    i += breakLines.size() + spaces.size();
-
-                }
-            } else {
-                output.add(currentToken);
-            }
-        }
-        return output;
+        return LayoutTokensUtil.dehyphenize(tokens);
     }
 
-    /**
-     * Check if the current token (place i), or the hypen, needs to be removed or not.
-     * <p>
-     * It will check the tokens before and after. It will get to the next "non space" tokens and verify
-     * that it's a plain word. If it's not it's keeping the hypen.
-     * <p>
-     * TODO: add the check on the bounding box of the next token to see whether there is really a break line.
-     * TODO: What to do in case of a punctuation is found?
-     */
+    /** @deprecated moved to LayoutTokensUtil, use {@link LayoutTokensUtil#doesRequireDehypenisation(List, int)} instead **/
+    @Deprecated
     protected static boolean doesRequireDehypenisation(List<LayoutToken> tokens, int i) {
-        boolean forward = false;
-        boolean backward = false;
-
-        int j = i + 1;
-        int breakLine = 0;
-        while (j < tokens.size() && (tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n"))) {
-            String tokenString = tokens.get(j).getText();
-
-            if (tokenString.equals("\n")) {
-                breakLine++;
-            }
-            j++;
-        }
-
-        if (breakLine == 0) {
-            return false;
-        }
-
-        Pattern onlyLowercaseLetters = Pattern.compile("[a-z]+");
-
-        if (j < tokens.size()) {
-            Matcher matcher = onlyLowercaseLetters.matcher(tokens.get(j).getText());
-            if (matcher.find()) {
-                forward = true;
-            }
-
-            if (forward) {
-                if(i < 1) {
-                    //If nothing before the hypen, but it looks like a forward hypenisation, let's trust it 
-                    return forward;
-                }
-
-                int z = i - 1;
-                while (z > 0 && tokens.get(z).getText().equals(" ")) {
-                    z--;
-                }
-
-                Matcher backwardMatcher = Pattern.compile("^[A-Za-z]+$").matcher(tokens.get(z).getText());
-                if (backwardMatcher.find()) {
-                    backward = true;
-                }
-            }
-        }
-
-        return backward;
+        return LayoutTokensUtil.doesRequireDehypenisation(tokens, i);
     }
 
     public static String dehyphenize(String text) {
         GrobidAnalyzer analyser = GrobidAnalyzer.getInstance();
 
         final List<LayoutToken> layoutTokens = analyser.tokenizeWithLayoutToken(text);
 
-        return LayoutTokensUtil.toText(dehyphenize(layoutTokens));
+        return LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(layoutTokens));
     }
 
     public static String getLastToken(String section) {
@@ -268,7 +162,7 @@ public static String getFirstToken(String section) {
      * @return Returns the dehyphenized string.
      * <p>
      * Deprecated method, not needed anymore since the @newline are preserved thanks to the LayoutTokens
-     * Use dehypenize
+     * Use {@link LayoutTokensUtil#dehyphenize(List)} instead.
      */
     @Deprecated
     public static String dehyphenizeHard(String text) {