Skip to content

Commit

Permalink
Merge pull request #498 from kermitt2/improved-dehypenisation
Browse files Browse the repository at this point in the history
Improved dehypenisation

Former-commit-id: 472324a
  • Loading branch information
kermitt2 authored Sep 28, 2019
2 parents 8d5afd2 + 794297e commit 1adc351
Show file tree
Hide file tree
Showing 10 changed files with 289 additions and 150 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1569,7 +1569,7 @@ public void assignGraphicObjectsToFigures() {
if (realCaptionTokens != null && !realCaptionTokens.isEmpty()) {
f.setLayoutTokens(realCaptionTokens);
f.setTextArea(BoundingBoxCalculator.calculate(realCaptionTokens));
f.setCaption(new StringBuilder(LayoutTokensUtil.toText(TextUtilities.dehyphenize(realCaptionTokens))));
f.setCaption(new StringBuilder(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(realCaptionTokens))));
pageFigures.add(f);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1280,7 +1280,7 @@ private StringBuilder toTEITextPiece(StringBuilder buffer,
curParagraph.appendChild(clusterContent);
} else if (MARKER_LABELS.contains(clusterLabel)) {
List<LayoutToken> refTokens = cluster.concatTokens();
refTokens = TextUtilities.dehyphenize(refTokens);
refTokens = LayoutTokensUtil.dehyphenize(refTokens);
String chunkRefString = LayoutTokensUtil.toText(refTokens);

Element parent = curParagraph != null ? curParagraph : curDiv;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -54,7 +55,7 @@ public static String normalizeText(List<LayoutToken> tokens) {
}

/**
* Dehyphenizes the given tokens, renders them to text with {@code LayoutTokensUtil.toText},
* replaces every "\n" with a single space and passes the result to
* {@code StringUtils.normalizeSpace}.
*
* @param tokens the layout tokens to convert
* @return the value returned by {@code StringUtils.normalizeSpace} for the dehyphenized text
*/
public static String normalizeDehyphenizeText(List<LayoutToken> tokens) {
// NOTE(review): two return statements appear because this is a diff hunk rendered without +/- markers;
// the TextUtilities call looks like the removed line and the LayoutTokensUtil call the added one —
// confirm against the raw diff.
return StringUtils.normalizeSpace(LayoutTokensUtil.toText(TextUtilities.dehyphenize(tokens)).replace("\n", " "));
return StringUtils.normalizeSpace(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(tokens)).replace("\n", " "));
}

public static String toText(List<LayoutToken> tokens) {
Expand All @@ -65,7 +66,7 @@ public static boolean noCoords(LayoutToken t) {
return t.getPage() == -1 || t.getWidth() <= 0;
}


public static boolean spaceyToken(String tok) {
/*return (tok.equals(" ")
|| tok.equals("\u00A0")
Expand Down Expand Up @@ -175,34 +176,134 @@ public static String getCoordsStringForOneBox(List<LayoutToken> toks) {
}

/**
* Builds a new token list in which line-break hyphens are removed.
* <p>
* Index-based implementation (the loop writing to {@code output}): for every "-" token,
* {@code doesRequireDehypenisation(tokens, i)} decides what happens.
* When it returns true, the hyphen is dropped, the " " tokens already written at the end of the output
* are removed, and the " " and "\n" tokens that directly follow the hyphen are skipped.
* When it returns false, the hyphen is kept and only the "\n" tokens that directly follow it are skipped.
* All other tokens are copied unchanged. The input list itself is not modified.
* <p>
* NOTE(review): this hunk is rendered without +/- markers. The statements using {@code it}, {@code result},
* {@code prev}, {@code cur}, {@code next} and {@code normalized} appear to be the removed iterator-based
* implementation interleaved with the added one — confirm against the raw diff.
*
* @param tokens the layout tokens to process
* @return a new list containing the dehyphenized token sequence
*/
public static List<LayoutToken> dehyphenize(List<LayoutToken> tokens) {
PeekingIterator<LayoutToken> it = Iterators.peekingIterator(tokens.iterator());
List<LayoutToken> result = new ArrayList<>();
boolean normalized = false;

LayoutToken prev = null;
while (it.hasNext()) {
LayoutToken cur = it.next();
//the current token is dash, next is new line, and previous one is some sort of word
if (cur.isNewLineAfter() && cur.getText().equals("-") && (prev != null) && (!prev.getText().trim().isEmpty())) {
it.next();
if (it.hasNext()) {
LayoutToken next = it.next();
if (next.getText().equals("conjugated") || prev.getText().equals("anti")) {
result.add(cur);
List<LayoutToken> output = new ArrayList<>();

for (int i = 0; i < tokens.size(); i++) {
LayoutToken currentToken = tokens.get(i);
// the current token is a dash: check what is around it
if (currentToken.getText().equals("-")) {
if (doesRequireDehypenisation(tokens, i)) {
// Clean up any spaces before the hyphen that have already been written to the output
int z = output.size() - 1;
while (z >= 0 && output.get(z).getText().equals(" ")) {
String tokenString = output.get(z).getText();

// NOTE(review): always true here, the loop condition already guarantees the token is " "
if (tokenString.equals(" ")) {
output.remove(z);
}
z--;
}


// Positions of the "\n" and " " tokens following the hyphen; only the list sizes are used below
List<Integer> breakLines = new ArrayList<>();
List<Integer> spaces = new ArrayList<>();

int j = i + 1;
// NOTE(review): && binds tighter than ||, so this parses as
// (j < tokens.size() && text is " ") || text is "\n".
// The bounds guard therefore does not protect the second tokens.get(j): if j reached tokens.size()
// it would throw IndexOutOfBoundsException. It looks latent rather than live, because
// doesRequireDehypenisation only returns true when a non-space, non-newline token follows,
// so the scan stops earlier. The || pair should still be parenthesised, as in doesRequireDehypenisation.
while (j < tokens.size() && tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n")) {
String tokenString = tokens.get(j).getText();

if (tokenString.equals("\n")) {
breakLines.add(j);
}
if (tokenString.equals(" ")) {
spaces.add(j);
}
j++;
}
// Skip the whitespace tokens counted above; the hyphen itself is never added to the output
i += breakLines.size() + spaces.size();
} else {
// Hyphen is kept as-is
output.add(currentToken);

List<Integer> breakLines = new ArrayList<>();
// NOTE(review): never filled in this branch, so it contributes 0 to the increment below
List<Integer> spaces = new ArrayList<>();

int j = i + 1;
while (j < tokens.size() && tokens.get(j).getText().equals("\n")) {
String tokenString = tokens.get(j).getText();

if (tokenString.equals("\n")) {
breakLines.add(j);
}
j++;
}
result.add(next);
normalized = true;
// Skip only the "\n" tokens directly after the kept hyphen
i += breakLines.size() + spaces.size();

}
} else {
result.add(cur);
output.add(currentToken);
}
}
return output;
}

/**
* Checks whether the hyphen token at position {@code i} should be removed (dehyphenized) or kept.
* <p>
* The method first skips the " " and "\n" tokens after the hyphen, counting the "\n" tokens.
* If no "\n" was seen and the next token has the same Y coordinate as the hyphen, it returns false.
* Otherwise the next token must be all lower-case according to {@code StringUtils.isAllLowerCase};
* if it is not, or if there is no next token, the result is false.
* <p>
* With a lower-case next token the result is true when the hyphen is the first token,
* or when the next token's Y is greater than the hyphen's Y.
* Failing that, the closest preceding token that is not " " or "\n" must satisfy
* {@code StringUtils.isAlpha}, and either its Y is lower than the hyphen's Y,
* or the hyphen's Y equals -1 and at least one "\n" followed the hyphen.
* <p>
* TODO: What to do when a punctuation mark is found?
* <p>
* NOTE(review): this hunk is rendered without +/- markers; the statements {@code prev = cur;},
* the commented-out "NORMALIZED" print and {@code return result;} near the end appear to be removed
* lines of the previous implementation — confirm against the raw diff.
*
* @param tokens the full token list
* @param i      index of the hyphen token inside {@code tokens}
* @return true when the hyphen should be removed, false when it should be kept
*/
protected static boolean doesRequireDehypenisation(List<LayoutToken> tokens, int i) {
boolean forward = false;
boolean backward = false;

int j = i + 1;
int breakLine = 0;
// NOTE(review): incremented below but never read in the visible code
int spacesAfter = 0;

double coordinateY = tokens.get(i).getY();

// Move j to the first token after the hyphen that is neither " " nor "\n"
while (j < tokens.size() && (tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n"))) {
if (tokens.get(j).getText().equals("\n")) {
breakLine++;
} else if (tokens.get(j).getText().equals(" ")) {
spacesAfter++;
// NOTE(review): unreachable — the loop condition only admits " " and "\n", both handled above
} else if (tokens.get(j).getY() > coordinateY) {
breakLine++;
}
j++;
}

if (breakLine == 0) {
// check if there is a line break using coordinates; if not, no dehyphenisation
// NOTE(review): exact equality on doubles — presumably tokens on one line share an identical Y; confirm
if (j < tokens.size() && tokens.get(j).getY() == coordinateY) {
return false;
}
}

//tokens.stream().collect(groupingBy(LayoutToken::getY)).keySet()

if (j < tokens.size()) {
forward = StringUtils.isAllLowerCase(tokens.get(j).getText());
if (forward) {
// If nothing precedes the hyphen but it looks like a forward hyphenation, trust it
if (i < 1) {
return forward;
}

// If the coordinates have changed, there is a new line
if (tokens.get(j).getY() > coordinateY) {
return forward;
}

// Check backward: find the closest preceding token that is not " " or "\n" (stops at index 0)
int z = i - 1;
while (z > 0 && (tokens.get(z).getText().equals(" ") || tokens.get(z).getText().equals("\n"))) {
z--;
}

if (StringUtils.isAlpha(tokens.get(z).getText())) {
if (tokens.get(z).getY() < coordinateY) {
backward = true;
// NOTE(review): presumably -1 is the "no coordinates" value for Y — confirm against LayoutToken
} else if(coordinateY == -1 && breakLine > 0) {
backward = true;
}
}
}
prev = cur;
}

/*if (normalized) {
System.out.println("NORMALIZED: " + sb.toString());
}*/
return result;
return backward;
}

public static List<LayoutToken> subListByOffset(List<LayoutToken> token, int startIncluded) {
Expand Down
122 changes: 8 additions & 114 deletions grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,130 +101,24 @@ private static int getLastPunctuationCharacter(String section) {
return res;
}

/**
* Dehyphenizes the given layout tokens.
* <p>
* NOTE(review): this hunk is rendered without +/- markers. The loop building {@code output} appears to be
* the removed body, and the single delegation to {@code LayoutTokensUtil.dehyphenize(tokens)} the added
* replacement — confirm against the raw diff.
*
* @param tokens the layout tokens to process
* @return the dehyphenized token list
* @deprecated use {@link LayoutTokensUtil#dehyphenize(List)} instead
*/
@Deprecated
public static List<LayoutToken> dehyphenize(List<LayoutToken> tokens) {
List<LayoutToken> output = new ArrayList<>();

for (int i = 0; i < tokens.size(); i++) {
LayoutToken currentToken = tokens.get(i);
// the current token is a dash: check what is around it
if (currentToken.getText().equals("-")) {
if (doesRequireDehypenisation(tokens, i)) {
// Clean up any spaces before the hyphen that have already been written to the output
int z = output.size() - 1;
while (z >= 0 && output.get(z).getText().equals(" ")) {
String tokenString = output.get(z).getText();

if (tokenString.equals(" ")) {
output.remove(z);
}
z--;
}


List<Integer> breakLines = new ArrayList<>();
List<Integer> spaces = new ArrayList<>();

int j = i + 1;
// NOTE(review): && binds tighter than ||, so the bounds guard does not cover the "\n" comparison;
// tokens.get(j) would throw IndexOutOfBoundsException if j reached tokens.size()
while (j < tokens.size() && tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n")) {
String tokenString = tokens.get(j).getText();

if (tokenString.equals("\n")) {
breakLines.add(j);
}
if (tokenString.equals(" ")) {
spaces.add(j);
}
j++;
}
i += breakLines.size() + spaces.size();
} else {
output.add(currentToken);

List<Integer> breakLines = new ArrayList<>();
List<Integer> spaces = new ArrayList<>();

int j = i + 1;
while (j < tokens.size() && tokens.get(j).getText().equals("\n")) {
String tokenString = tokens.get(j).getText();

if (tokenString.equals("\n")) {
breakLines.add(j);
}
j++;
}
i += breakLines.size() + spaces.size();

}
} else {
output.add(currentToken);
}
}
return output;
return LayoutTokensUtil.dehyphenize(tokens);
}

/**
* Check if the current token (place i), i.e. the hyphen, needs to be removed or not.
* <p>
* It will check the tokens before and after. It will get to the next "non space" token and verify
* that it's a plain word. If it's not, the hyphen is kept.
* <p>
* TODO: add the check on the bounding box of the next token to see whether there is really a line break.
* TODO: What to do when a punctuation mark is found?
*/
/**
* NOTE(review): this hunk is rendered without +/- markers. The Javadoc above and the regex-based body
* appear to be removed lines, and the single delegation to
* {@code LayoutTokensUtil.doesRequireDehypenisation(tokens, i)} the added replacement — confirm against
* the raw diff.
*
* @param tokens the full token list
* @param i      index of the hyphen token inside {@code tokens}
* @return true when the hyphen should be removed, false when it should be kept
* @deprecated use {@code LayoutTokensUtil.doesRequireDehypenisation(List<LayoutToken> tokens, int i)} instead
*/
@Deprecated
protected static boolean doesRequireDehypenisation(List<LayoutToken> tokens, int i) {
boolean forward = false;
boolean backward = false;

int j = i + 1;
int breakLine = 0;
// Skip the " " and "\n" tokens after the hyphen, counting the "\n" tokens
while (j < tokens.size() && (tokens.get(j).getText().equals(" ") || tokens.get(j).getText().equals("\n"))) {
String tokenString = tokens.get(j).getText();

if (tokenString.equals("\n")) {
breakLine++;
}
j++;
}

// Without an explicit "\n" token after the hyphen, the hyphen is kept
if (breakLine == 0) {
return false;
}

// NOTE(review): used with find() below, so it accepts any token that contains a lower-case letter,
// not only tokens made exclusively of lower-case letters
Pattern onlyLowercaseLetters = Pattern.compile("[a-z]+");

if (j < tokens.size()) {
Matcher matcher = onlyLowercaseLetters.matcher(tokens.get(j).getText());
if (matcher.find()) {
forward = true;
}

if (forward) {
if(i < 1) {
// If nothing precedes the hyphen but it looks like a forward hyphenation, trust it
return forward;
}

// Find the closest preceding token that is not " " (stops at index 0)
int z = i - 1;
while (z > 0 && tokens.get(z).getText().equals(" ")) {
z--;
}

// The preceding token must consist only of ASCII letters
Matcher backwardMatcher = Pattern.compile("^[A-Za-z]+$").matcher(tokens.get(z).getText());
if (backwardMatcher.find()) {
backward = true;
}
}
}

return backward;
return LayoutTokensUtil.doesRequireDehypenisation(tokens, i);
}

/**
* Tokenizes {@code text} into layout tokens with {@code GrobidAnalyzer.tokenizeWithLayoutToken},
* dehyphenizes the tokens and converts them back to a string with {@code LayoutTokensUtil.toText}.
*
* @param text the text to dehyphenize
* @return the text rebuilt from the dehyphenized tokens
*/
public static String dehyphenize(String text) {
GrobidAnalyzer analyser = GrobidAnalyzer.getInstance();

final List<LayoutToken> layoutTokens = analyser.tokenizeWithLayoutToken(text);

// NOTE(review): two return statements appear because this is a diff hunk rendered without +/- markers;
// the unqualified dehyphenize call looks like the removed line and the LayoutTokensUtil call the added
// one — confirm against the raw diff.
return LayoutTokensUtil.toText(dehyphenize(layoutTokens));
return LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(layoutTokens));
}

public static String getLastToken(String section) {
Expand Down Expand Up @@ -268,7 +162,7 @@ public static String getFirstToken(String section) {
* @return Returns the dehyphenized string.
* <p>
* Deprecated method, not needed anymore since the @newline are preserved thanks to the LayoutTokens
* Use dehyphenize
* @deprecated use LayoutTokensUtil.dehyphenize() instead
*/
@Deprecated
public static String dehyphenizeHard(String text) {
Expand Down
Loading

0 comments on commit 1adc351

Please sign in to comment.