Skip to content

Commit

Permalink
Stop using the deprecated dehyphenization methods in grobid core
Browse files (browse the repository at this point in the history)
  • Loading branch information
kermitt2 committed Sep 28, 2019
1 parent 5f4af22 commit 883b3cb
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1569,7 +1569,7 @@ public void assignGraphicObjectsToFigures() {
if (realCaptionTokens != null && !realCaptionTokens.isEmpty()) {
f.setLayoutTokens(realCaptionTokens);
f.setTextArea(BoundingBoxCalculator.calculate(realCaptionTokens));
f.setCaption(new StringBuilder(LayoutTokensUtil.toText(TextUtilities.dehyphenize(realCaptionTokens))));
f.setCaption(new StringBuilder(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(realCaptionTokens))));
pageFigures.add(f);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1280,7 +1280,7 @@ private StringBuilder toTEITextPiece(StringBuilder buffer,
curParagraph.appendChild(clusterContent);
} else if (MARKER_LABELS.contains(clusterLabel)) {
List<LayoutToken> refTokens = cluster.concatTokens();
refTokens = TextUtilities.dehyphenize(refTokens);
refTokens = LayoutTokensUtil.dehyphenize(refTokens);
String chunkRefString = LayoutTokensUtil.toText(refTokens);

Element parent = curParagraph != null ? curParagraph : curDiv;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public static String normalizeText(List<LayoutToken> tokens) {
}

public static String normalizeDehyphenizeText(List<LayoutToken> tokens) {
return StringUtils.normalizeSpace(LayoutTokensUtil.toText(TextUtilities.dehyphenize(tokens)).replace("\n", " "));
return StringUtils.normalizeSpace(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(tokens)).replace("\n", " "));
}

public static String toText(List<LayoutToken> tokens) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ public static String dehyphenize(String text) {

final List<LayoutToken> layoutTokens = analyser.tokenizeWithLayoutToken(text);

return LayoutTokensUtil.toText(dehyphenize(layoutTokens));
return LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(layoutTokens));
}

public static String getLastToken(String section) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ public Object apply(BibDataSet bibDataSet) {

public List<MatchResult> match(List<LayoutToken> refTokens) throws EntityMatcherException {
cntManager.i(ReferenceMarkerMatcherCounters.INPUT_REF_STRINGS_CNT);
String text = LayoutTokensUtil.toText(TextUtilities.dehyphenize(LayoutTokensUtil.enrichWithNewLineInfo(refTokens)));
String text = LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(LayoutTokensUtil.enrichWithNewLineInfo(refTokens)));

if (isAuthorCitationStyle(text)) {
cntManager.i(ReferenceMarkerMatcherCounters.STYLE_AUTHORS);
Expand Down Expand Up @@ -329,12 +329,12 @@ private static List<Pair<String, List<LayoutToken>>> splitAuthors(List<LayoutTok
int matchCount = matchCount(text, YEAR_PATTERN_WITH_LOOK_AROUND);
if (matchCount == 2 && text.contains(" and ")) {
for (List<LayoutToken> ys : LayoutTokensUtil.split(splitTokens, AND_WORD_PATTERN, true)) {
result.add(new Pair<>(LayoutTokensUtil.toText(TextUtilities.dehyphenize(ys)), ys));
result.add(new Pair<>(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(ys)), ys));
}
} else if (matchCount > 1) {
List<List<LayoutToken>> yearSplit = LayoutTokensUtil.split(splitTokens, YEAR_PATTERN, true, false);
if (yearSplit.isEmpty()) {
result.add(new Pair<>(LayoutTokensUtil.toText(TextUtilities.dehyphenize(splitTokens)), splitTokens));
result.add(new Pair<>(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(splitTokens)), splitTokens));
} else {
if (matchCount(splitTokens, AUTHOR_NAME_PATTERN) == 1) {
// cases like Grafton et al. 1995, 1998;
Expand All @@ -346,24 +346,24 @@ private static List<Pair<String, List<LayoutToken>>> splitAuthors(List<LayoutTok

List<LayoutToken> firstYearSplitItem;
firstYearSplitItem = yearSplit.get(0);
result.add(new Pair<>(LayoutTokensUtil.toText(TextUtilities.dehyphenize(firstYearSplitItem)), firstYearSplitItem));
result.add(new Pair<>(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(firstYearSplitItem)), firstYearSplitItem));

List<LayoutToken> excludedYearToks = firstYearSplitItem.subList(0, firstYearSplitItem.size() - 1);
String authorName = LayoutTokensUtil.toText(TextUtilities.dehyphenize(excludedYearToks));
String authorName = LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(excludedYearToks));

for (int i = 1; i < yearSplit.size(); i++) {
List<LayoutToken> toksI = yearSplit.get(i);
result.add(new Pair<>(authorName + " " + LayoutTokensUtil.toText(TextUtilities.dehyphenize(toksI)), toksI.subList(toksI.size() - 1, toksI.size())));
result.add(new Pair<>(authorName + " " + LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(toksI)), toksI.subList(toksI.size() - 1, toksI.size())));
}
} else {
// case when two authors still appear
for (List<LayoutToken> item : yearSplit) {
result.add(new Pair<>(LayoutTokensUtil.toText(TextUtilities.dehyphenize(item)), item));
result.add(new Pair<>(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(item)), item));
}
}
}
} else {
result.add(new Pair<>(LayoutTokensUtil.toText(TextUtilities.dehyphenize(splitTokens)), splitTokens));
result.add(new Pair<>(LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(splitTokens)), splitTokens));
}
}
return result;
Expand Down

0 comments on commit 883b3cb

Please sign in to comment.