Skip to content

Commit

Permalink
Merge pull request #1242 from kermitt2/bugfix/language-null
Browse files Browse the repository at this point in the history
Avoid running aground when detecting the language
  • Loading branch information
lfoppiano authored Feb 4, 2025
2 parents 0f41699 + 7383b1f commit 99423c8
Showing 1 changed file with 9 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,15 @@ public String processingHeaderSection(

contentSample.append(stringSample);
}
// If we still have too little text, the document may not be a scientific article,
// so make one more attempt using the full header.
if (contentSample.length() < 200) {
String stringSample = Document.getTokenizationParts(doc.getDocumentPart(SegmentationLabels.HEADER), tokenizations)
.stream().map(LayoutToken::toString)
.collect(Collectors.joining(" "));

contentSample.append(stringSample);
}
}
Language langu = languageUtilities.runLanguageId(contentSample.toString());
if (langu != null) {
Expand Down

0 comments on commit 99423c8

Please sign in to comment.