Merge branch 'master' into feature/update-lingua

# Conflicts:
#	build.gradle
kermitt2 · Jan 28, 2025 · 183ccf3 · 183ccf3
2 parents fe475f0 + 0f41699
commit 183ccf3
Show file tree

Hide file tree

Showing 7 changed files with 36 additions and 31 deletions.
diff --git a/build.gradle b/build.gradle
@@ -103,7 +103,8 @@ subprojects {
         // treating them separately, these jars will be flattened into grobid-core.jar on installing,
         // to avoid missing dependencies from the projects that include grobid-core (see 'jar' task in grobid-core)
         localLibs = ['crfpp-1.0.2.jar',
-                     'wipo-analysers-0.0.2.jar',
+                     'langdetect-1.1-20120112.jar',
+                     'grobid-lucene-analysers-0.0.1.jar',
                      'imageio-pnm-1.0.jar',
                      'wapiti-1.5.0.jar']
     }
@@ -131,7 +132,7 @@ subprojects {
         implementation "com.cybozu.labs:langdetect:1.1-20120112"
         implementation "com.rockymadden.stringmetric:stringmetric-core_2.11:0.27.4"
         implementation "commons-pool:commons-pool:1.6"
-        implementation "commons-io:commons-io:2.5"
+        implementation "commons-io:commons-io:2.14.0"
         implementation "org.apache.commons:commons-lang3:3.6"
         implementation "org.apache.commons:commons-collections4:4.1"
         implementation 'org.apache.commons:commons-text:1.11.0'
@@ -245,7 +246,7 @@ project("grobid-core") {
         api "xerces:xercesImpl:2.12.0"
         api "net.arnx:jsonic:1.3.10"
         api "net.sf.saxon:Saxon-HE:9.6.0-9"
-        api "xom:xom:1.3.2"
+        api "xom:xom:1.3.9"
         api 'javax.xml.bind:jaxb-api:2.3.0'
 
         implementation "joda-time:joda-time:2.9.9"

diff --git a/grobid-core/localLibs/wipo-analysers-0.0.2.jar b/grobid-core/localLibs/grobid-lucene-analysers-0.0.1.jar
diff --git a/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidAnalyzer.java b/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidAnalyzer.java
@@ -5,8 +5,8 @@
 
 import org.grobid.core.utilities.UnicodeUtil;
 import org.grobid.core.utilities.LayoutTokensUtil;
-import org.wipo.nlp.textboundaries.ReTokenizer;
-import org.wipo.nlp.textboundaries.ReTokenizerFactory;
+import org.grobid.nlp.textboundaries.ReTokenizer;
+import org.grobid.nlp.textboundaries.ReTokenizerFactory;
 
 import java.util.List;
 import java.util.ArrayList;

diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java
@@ -276,7 +276,7 @@ public List<BoundingBox> getCoordinates() {
 
         List<BoundingBox> theBoxes = null;
         // non graphic elements
-        if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
+        if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
             //theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
             BoundingBox oneBox = BoundingBoxCalculator.calculateOneBox(layoutTokens, true);
             List<BoundingBox> result = new ArrayList<BoundingBox>();
@@ -291,7 +291,7 @@ public List<BoundingBox> getCoordinates() {
         // here we bound all figure graphics in one single box (given that we can have hundred graphics
         // in a single figure)
         BoundingBox theGraphicsBox = null;
-        if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
+        if (CollectionUtils.isNotEmpty(graphicObjects)) {
             for (GraphicObject graphicObject : graphicObjects) {
                 if (theGraphicsBox == null) {
                     theGraphicsBox = graphicObject.getBoundingBox();
@@ -307,8 +307,8 @@ public List<BoundingBox> getCoordinates() {
             theBoxes.add(theGraphicsBox);
         }
 
-        List<BoundingBox> result = new ArrayList<BoundingBox>();
-        if (theBoxes != null && theBoxes.size() > 0) {
+        List<BoundingBox> result = new ArrayList<>();
+        if (CollectionUtils.isNotEmpty(theBoxes)) {
             BoundingBox oneBox = BoundingBoxCalculator.calculateOneBox(layoutTokens, true);
             List<BoundingBox> mergedBox = VectorGraphicBoxCalculator.mergeBoxes(theBoxes);
             result.addAll(mergedBox);
@@ -329,6 +329,7 @@ public boolean isCompleteForTEI() {
 
     public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
         if (!isCompleteForTEI()) {
+            LOGGER.warn("Found a figure that is badly formatted but it should have been spotted before. We ignore it now.");
             return null;
         }
         Element figureElement = XmlBuilderUtils.teiElement("figure");
@@ -339,18 +340,18 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
         if (config.isGenerateTeiCoordinates("figure")) {
             List<BoundingBox> theBoxes = null;
             // non graphic elements
-            if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
+            if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
                 theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
             }
 
             // if (getBitmapGraphicObjects() != null && !getBitmapGraphicObjects().isEmpty()) {
             // -> note: this was restricted to the bitmap objects only... the bounding box calculation
             // with vector graphics might need some double check
 
-            // here we bound all figure graphics in one single box (given that we can have hundred graphics
+            // here we bound all figure graphics in one single box (given that we can have a hundred graphics
             // in a single figure)
             BoundingBox theGraphicsBox = null;
-            if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
+            if (CollectionUtils.isNotEmpty(graphicObjects)) {
                 for (GraphicObject graphicObject : graphicObjects) {
                     if (theGraphicsBox == null) {
                         theGraphicsBox = graphicObject.getBoundingBox();
@@ -366,24 +367,24 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                 theBoxes.add(theGraphicsBox);
             }
 
-            if (theBoxes != null && theBoxes.size() > 0) {
+            if (CollectionUtils.isNotEmpty(theBoxes)) {
                 String coords = Joiner.on(";").join(theBoxes);
                 XmlBuilderUtils.addCoords(figureElement, coords);
             }
         }
-        if (header != null) {
+
+        if (StringUtils.isNotBlank(header)) {
             Element head = XmlBuilderUtils.teiElement("head",
                     LayoutTokensUtil.normalizeText(header.toString()));
             figureElement.appendChild(head);
-
         }
-        if (label != null) {
+
+        if (StringUtils.isNotBlank(label)) {
             Element labelEl = XmlBuilderUtils.teiElement("label",
                 LayoutTokensUtil.normalizeText(label.toString()));
             figureElement.appendChild(labelEl);
         }
-        if (caption != null) {
-
+        if (StringUtils.isNotBlank(caption)) {
             Element desc = XmlBuilderUtils.teiElement("figDesc");
             if (config.isGenerateTeiIds()) {
                 String divID = KeyGen.getKey().substring(0, 7);
@@ -392,12 +393,12 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 
             // if the segment has been parsed with the full text model we further extract the clusters
             // to get the bibliographical references
-            if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
+            if (StringUtils.isNotBlank(labeledCaption)) {
                 TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
                 List<TaggingTokenCluster> clusters = clusteror.cluster();
 
                 MarkerType citationMarkerType = null;
-                if (markerTypes != null && markerTypes.size()>0) {
+                if (CollectionUtils.isNotEmpty(markerTypes)) {
                     citationMarkerType = markerTypes.get(0);
                 }
 
@@ -435,7 +436,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                 //    LayoutTokensUtil.normalizeText(caption.toString()));
             }
 
-            if (desc != null && config.isWithSentenceSegmentation()) {
+            if (config.isWithSentenceSegmentation()) {
                 formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
 
                 // we need a sentence segmentation of the figure caption, for that we need to introduce 
@@ -453,7 +454,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 
             figureElement.appendChild(desc);
         }
-        if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
+        if (CollectionUtils.isNotEmpty(graphicObjects)) {
             for (GraphicObject graphicObject : graphicObjects) {
                 Element go = XmlBuilderUtils.teiElement("graphic");
                 String uri = graphicObject.getURI();

diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -1,5 +1,6 @@
 package org.grobid.core.data;
 
+import org.apache.commons.collections4.CollectionUtils;
 import org.grobid.core.GrobidModels;
 import org.apache.commons.lang3.StringUtils;
 import org.grobid.core.data.table.Cell;
@@ -69,6 +70,7 @@ public boolean isCompleteForTEI() {
 	@Override
     public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
 		if (!isCompleteForTEI()) {
+            LOGGER.warn("Found a table that is badly formatted but it should have been spotted before. We ignore it now.");
 			return null;
 		}
 
@@ -98,7 +100,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
 		}*/
 
         Element desc = null;
-        if (caption != null) {
+        if (StringUtils.isNotBlank(caption)) {
             // if the segment has been parsed with the full text model we further extract the clusters
             // to get the bibliographical references
 
@@ -111,16 +113,17 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
             if (StringUtils.isNotBlank(labeledCaption)) {
                 TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
                 List<TaggingTokenCluster> clusters = clusteror.cluster();                
+
+                MarkerType citationMarkerType = null;
+                if (CollectionUtils.isNotEmpty(markerTypes)) {
+                    citationMarkerType = markerTypes.get(0);
+                }
+
                 for (TaggingTokenCluster cluster : clusters) {
                     if (cluster == null) {
                         continue;
                     }
 
-                    MarkerType citationMarkerType = null;
-                    if (markerTypes != null && markerTypes.size()>0) {
-                        citationMarkerType = markerTypes.get(0);
-                    }
-
                     TaggingLabel clusterLabel = cluster.getTaggingLabel();
                     //String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
                     String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
@@ -144,7 +147,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                         desc.appendChild(textNode(clusterContent));
                     }
 
-                    if (desc != null && config.isWithSentenceSegmentation()) {
+                    if (config.isWithSentenceSegmentation()) {
                         formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
 
                         // we need a sentence segmentation of the table caption, for that we need to introduce 

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1887,7 +1887,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
 
         // in xom, the following gives all the text under the element, for the whole subtree
         String text = curParagraph.getValue();
-        if (StringUtils.isEmpty(text))
+        if (StringUtils.isBlank(text))
             return;
 
         // identify ref nodes, ref spans and ref positions

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/tagging/GenericTaggerUtils.java b/grobid-core/src/main/java/org/grobid/core/engines/tagging/GenericTaggerUtils.java
@@ -6,7 +6,7 @@
 import org.apache.commons.lang3.tuple.Pair;
 import org.grobid.core.engines.label.TaggingLabels;
 import org.grobid.core.utilities.Triple;
-import org.wipo.analyzers.wipokr.utils.StringUtil;
+import org.grobid.analyzers.grobidkr.utils.StringUtil;
 
 import java.util.ArrayList;
 import java.util.List;