Skip to content

Commit

Permalink
Merge pull request #1235 from kermitt2/bugfix/avoid-empty-sentences
Browse files (browse the repository at this point in the history)
Bugfix/avoid empty sentences
  • Loading branch information
lfoppiano authored Jan 23, 2025
2 parents c0e6525 + 3bc567e commit 0f41699
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 25 deletions.
35 changes: 18 additions & 17 deletions grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ public List<BoundingBox> getCoordinates() {

List<BoundingBox> theBoxes = null;
// non graphic elements
if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
//theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
BoundingBox oneBox = BoundingBoxCalculator.calculateOneBox(layoutTokens, true);
List<BoundingBox> result = new ArrayList<BoundingBox>();
Expand All @@ -291,7 +291,7 @@ public List<BoundingBox> getCoordinates() {
// here we bound all figure graphics in one single box (given that we can have a hundred graphics
// in a single figure)
BoundingBox theGraphicsBox = null;
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
if (theGraphicsBox == null) {
theGraphicsBox = graphicObject.getBoundingBox();
Expand All @@ -307,8 +307,8 @@ public List<BoundingBox> getCoordinates() {
theBoxes.add(theGraphicsBox);
}

List<BoundingBox> result = new ArrayList<BoundingBox>();
if (theBoxes != null && theBoxes.size() > 0) {
List<BoundingBox> result = new ArrayList<>();
if (CollectionUtils.isNotEmpty(theBoxes)) {
BoundingBox oneBox = BoundingBoxCalculator.calculateOneBox(layoutTokens, true);
List<BoundingBox> mergedBox = VectorGraphicBoxCalculator.mergeBoxes(theBoxes);
result.addAll(mergedBox);
Expand All @@ -329,6 +329,7 @@ public boolean isCompleteForTEI() {

public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (!isCompleteForTEI()) {
LOGGER.warn("Found a figure that is badly formatted but it should have been spotted before. We ignore it now.");
return null;
}
Element figureElement = XmlBuilderUtils.teiElement("figure");
Expand All @@ -339,18 +340,18 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (config.isGenerateTeiCoordinates("figure")) {
List<BoundingBox> theBoxes = null;
// non graphic elements
if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
}

// if (getBitmapGraphicObjects() != null && !getBitmapGraphicObjects().isEmpty()) {
// -> note: this was restricted to the bitmap objects only... the bounding box calculation
// with vector graphics might need some double check

// here we bound all figure graphics in one single box (given that we can have hundred graphics
// here we bound all figure graphics in one single box (given that we can have a hundred graphics
// in a single figure)
BoundingBox theGraphicsBox = null;
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
if (theGraphicsBox == null) {
theGraphicsBox = graphicObject.getBoundingBox();
Expand All @@ -366,24 +367,24 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
theBoxes.add(theGraphicsBox);
}

if (theBoxes != null && theBoxes.size() > 0) {
if (CollectionUtils.isNotEmpty(theBoxes)) {
String coords = Joiner.on(";").join(theBoxes);
XmlBuilderUtils.addCoords(figureElement, coords);
}
}
if (header != null) {

if (StringUtils.isNotBlank(header)) {
Element head = XmlBuilderUtils.teiElement("head",
LayoutTokensUtil.normalizeText(header.toString()));
figureElement.appendChild(head);

}
if (label != null) {

if (StringUtils.isNotBlank(label)) {
Element labelEl = XmlBuilderUtils.teiElement("label",
LayoutTokensUtil.normalizeText(label.toString()));
figureElement.appendChild(labelEl);
}
if (caption != null) {

if (StringUtils.isNotBlank(caption)) {
Element desc = XmlBuilderUtils.teiElement("figDesc");
if (config.isGenerateTeiIds()) {
String divID = KeyGen.getKey().substring(0, 7);
Expand All @@ -392,12 +393,12 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

// if the segment has been parsed with the full text model we further extract the clusters
// to get the bibliographical references
if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();

MarkerType citationMarkerType = null;
if (markerTypes != null && markerTypes.size()>0) {
if (CollectionUtils.isNotEmpty(markerTypes)) {
citationMarkerType = markerTypes.get(0);
}

Expand Down Expand Up @@ -435,7 +436,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
// LayoutTokensUtil.normalizeText(caption.toString()));
}

if (desc != null && config.isWithSentenceSegmentation()) {
if (config.isWithSentenceSegmentation()) {
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the figure caption, for that we need to introduce
Expand All @@ -453,7 +454,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

figureElement.appendChild(desc);
}
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
Element go = XmlBuilderUtils.teiElement("graphic");
String uri = graphicObject.getURI();
Expand Down
17 changes: 10 additions & 7 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.grobid.core.data;

import org.apache.commons.collections4.CollectionUtils;
import org.grobid.core.GrobidModels;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.table.Cell;
Expand Down Expand Up @@ -69,6 +70,7 @@ public boolean isCompleteForTEI() {
@Override
public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (!isCompleteForTEI()) {
LOGGER.warn("Found a table that is badly formatted but it should have been spotted before. We ignore it now.");
return null;
}

Expand Down Expand Up @@ -98,7 +100,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}*/

Element desc = null;
if (caption != null) {
if (StringUtils.isNotBlank(caption)) {
// if the segment has been parsed with the full text model we further extract the clusters
// to get the bibliographical references

Expand All @@ -111,16 +113,17 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();

MarkerType citationMarkerType = null;
if (CollectionUtils.isNotEmpty(markerTypes)) {
citationMarkerType = markerTypes.get(0);
}

for (TaggingTokenCluster cluster : clusters) {
if (cluster == null) {
continue;
}

MarkerType citationMarkerType = null;
if (markerTypes != null && markerTypes.size()>0) {
citationMarkerType = markerTypes.get(0);
}

TaggingLabel clusterLabel = cluster.getTaggingLabel();
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
Expand All @@ -144,7 +147,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
desc.appendChild(textNode(clusterContent));
}

if (desc != null && config.isWithSentenceSegmentation()) {
if (config.isWithSentenceSegmentation()) {
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the table caption, for that we need to introduce
Expand Down
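2 changes: 1 addition & 1 deletion (file path not captured in this extract; the hunk below modifies segmentIntoSentences, presumably in the TEIFormatter class)
Original file line number Diff line number Diff line change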
Expand Up @@ -1887,7 +1887,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara

// in xom, the following gives all the text under the element, for the whole subtree
String text = curParagraph.getValue();
if (StringUtils.isEmpty(text))
if (StringUtils.isBlank(text))
return;

// identify ref nodes, ref spans and ref positions
Expand Down

0 comments on commit 0f41699

Please sign in to comment.