Merge pull request #1235 from kermitt2/bugfix/avoid-empty-sentences

Bugfix/avoid empty sentences
kermitt2 · Jan 23, 2025 · 0f41699
2 parents c0e6525 + 3bc567e
commit 0f41699
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 25 deletions.
diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java
@@ -276,7 +276,7 @@ public List<BoundingBox> getCoordinates() {
 
         List<BoundingBox> theBoxes = null;
         // non graphic elements
-        if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
+        if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
             //theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
             BoundingBox oneBox = BoundingBoxCalculator.calculateOneBox(layoutTokens, true);
             List<BoundingBox> result = new ArrayList<BoundingBox>();
@@ -291,7 +291,7 @@ public List<BoundingBox> getCoordinates() {
         // here we bound all figure graphics in one single box (given that we can have hundred graphics
         // in a single figure)
         BoundingBox theGraphicsBox = null;
-        if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
+        if (CollectionUtils.isNotEmpty(graphicObjects)) {
             for (GraphicObject graphicObject : graphicObjects) {
                 if (theGraphicsBox == null) {
                     theGraphicsBox = graphicObject.getBoundingBox();
@@ -307,8 +307,8 @@ public List<BoundingBox> getCoordinates() {
             theBoxes.add(theGraphicsBox);
         }
 
-        List<BoundingBox> result = new ArrayList<BoundingBox>();
-        if (theBoxes != null && theBoxes.size() > 0) {
+        List<BoundingBox> result = new ArrayList<>();
+        if (CollectionUtils.isNotEmpty(theBoxes)) {
             BoundingBox oneBox = BoundingBoxCalculator.calculateOneBox(layoutTokens, true);
             List<BoundingBox> mergedBox = VectorGraphicBoxCalculator.mergeBoxes(theBoxes);
             result.addAll(mergedBox);
@@ -329,6 +329,7 @@ public boolean isCompleteForTEI() {
 
     public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
         if (!isCompleteForTEI()) {
+            LOGGER.warn("Found a figure that is badly formatted but it should have been spotted before. We ignore it now.");
             return null;
         }
         Element figureElement = XmlBuilderUtils.teiElement("figure");
@@ -339,18 +340,18 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
         if (config.isGenerateTeiCoordinates("figure")) {
             List<BoundingBox> theBoxes = null;
             // non graphic elements
-            if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
+            if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
                 theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
             }
 
             // if (getBitmapGraphicObjects() != null && !getBitmapGraphicObjects().isEmpty()) {
             // -> note: this was restricted to the bitmap objects only... the bounding box calculation
             // with vector graphics might need some double check
 
-            // here we bound all figure graphics in one single box (given that we can have hundred graphics
+            // here we bound all figure graphics in one single box (given that we can have a hundred graphics
             // in a single figure)
             BoundingBox theGraphicsBox = null;
-            if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
+            if (CollectionUtils.isNotEmpty(graphicObjects)) {
                 for (GraphicObject graphicObject : graphicObjects) {
                     if (theGraphicsBox == null) {
                         theGraphicsBox = graphicObject.getBoundingBox();
@@ -366,24 +367,24 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                 theBoxes.add(theGraphicsBox);
             }
 
-            if (theBoxes != null && theBoxes.size() > 0) {
+            if (CollectionUtils.isNotEmpty(theBoxes)) {
                 String coords = Joiner.on(";").join(theBoxes);
                 XmlBuilderUtils.addCoords(figureElement, coords);
             }
         }
-        if (header != null) {
+
+        if (StringUtils.isNotBlank(header)) {
             Element head = XmlBuilderUtils.teiElement("head",
                     LayoutTokensUtil.normalizeText(header.toString()));
             figureElement.appendChild(head);
-
         }
-        if (label != null) {
+
+        if (StringUtils.isNotBlank(label)) {
             Element labelEl = XmlBuilderUtils.teiElement("label",
                 LayoutTokensUtil.normalizeText(label.toString()));
             figureElement.appendChild(labelEl);
         }
-        if (caption != null) {
-
+        if (StringUtils.isNotBlank(caption)) {
             Element desc = XmlBuilderUtils.teiElement("figDesc");
             if (config.isGenerateTeiIds()) {
                 String divID = KeyGen.getKey().substring(0, 7);
@@ -392,12 +393,12 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 
             // if the segment has been parsed with the full text model we further extract the clusters
             // to get the bibliographical references
-            if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
+            if (StringUtils.isNotBlank(labeledCaption)) {
                 TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
                 List<TaggingTokenCluster> clusters = clusteror.cluster();
 
                 MarkerType citationMarkerType = null;
-                if (markerTypes != null && markerTypes.size()>0) {
+                if (CollectionUtils.isNotEmpty(markerTypes)) {
                     citationMarkerType = markerTypes.get(0);
                 }
 
@@ -435,7 +436,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                 //    LayoutTokensUtil.normalizeText(caption.toString()));
             }
 
-            if (desc != null && config.isWithSentenceSegmentation()) {
+            if (config.isWithSentenceSegmentation()) {
                 formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
 
                 // we need a sentence segmentation of the figure caption, for that we need to introduce 
@@ -453,7 +454,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 
             figureElement.appendChild(desc);
         }
-        if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
+        if (CollectionUtils.isNotEmpty(graphicObjects)) {
             for (GraphicObject graphicObject : graphicObjects) {
                 Element go = XmlBuilderUtils.teiElement("graphic");
                 String uri = graphicObject.getURI();

diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -1,5 +1,6 @@
 package org.grobid.core.data;
 
+import org.apache.commons.collections4.CollectionUtils;
 import org.grobid.core.GrobidModels;
 import org.apache.commons.lang3.StringUtils;
 import org.grobid.core.data.table.Cell;
@@ -69,6 +70,7 @@ public boolean isCompleteForTEI() {
 	@Override
     public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
 		if (!isCompleteForTEI()) {
+            LOGGER.warn("Found a table that is badly formatted but it should have been spotted before. We ignore it now.");
 			return null;
 		}
 
@@ -98,7 +100,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 		}*/
 
         Element desc = null;
-        if (caption != null) {
+        if (StringUtils.isNotBlank(caption)) {
             // if the segment has been parsed with the full text model we further extract the clusters
             // to get the bibliographical references
 
@@ -111,16 +113,17 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
             if (StringUtils.isNotBlank(labeledCaption)) {
                 TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
                 List<TaggingTokenCluster> clusters = clusteror.cluster();                
+
+                MarkerType citationMarkerType = null;
+                if (CollectionUtils.isNotEmpty(markerTypes)) {
+                    citationMarkerType = markerTypes.get(0);
+                }
+
                 for (TaggingTokenCluster cluster : clusters) {
                     if (cluster == null) {
                         continue;
                     }
 
-                    MarkerType citationMarkerType = null;
-                    if (markerTypes != null && markerTypes.size()>0) {
-                        citationMarkerType = markerTypes.get(0);
-                    }
-
                     TaggingLabel clusterLabel = cluster.getTaggingLabel();
                     //String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
                     String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
@@ -144,7 +147,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                         desc.appendChild(textNode(clusterContent));
                     }
 
-                    if (desc != null && config.isWithSentenceSegmentation()) {
+                    if (config.isWithSentenceSegmentation()) {
                         formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
 
                         // we need a sentence segmentation of the table caption, for that we need to introduce 

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1887,7 +1887,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
 
         // in xom, the following gives all the text under the element, for the whole subtree
         String text = curParagraph.getValue();
-        if (StringUtils.isEmpty(text))
+        if (StringUtils.isBlank(text))
             return;
 
         // identify ref nodes, ref spans and ref positions