Skip to content

Commit

Permalink
Merge branch 'master' into feature/update-lingua
Browse files Browse the repository at this point in the history
# Conflicts:
#	build.gradle
  • Loading branch information
lfoppiano committed Jan 28, 2025
2 parents fe475f0 + 0f41699 commit 183ccf3
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 31 deletions.
7 changes: 4 additions & 3 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ subprojects {
// treating them separately, these jars will be flattened into grobid-core.jar on installing,
// to avoid missing dependencies from the projects that include grobid-core (see 'jar' task in grobid-core)
localLibs = ['crfpp-1.0.2.jar',
'wipo-analysers-0.0.2.jar',
'langdetect-1.1-20120112.jar',
'grobid-lucene-analysers-0.0.1.jar',
'imageio-pnm-1.0.jar',
'wapiti-1.5.0.jar']
}
Expand Down Expand Up @@ -131,7 +132,7 @@ subprojects {
implementation "com.cybozu.labs:langdetect:1.1-20120112"
implementation "com.rockymadden.stringmetric:stringmetric-core_2.11:0.27.4"
implementation "commons-pool:commons-pool:1.6"
implementation "commons-io:commons-io:2.5"
implementation "commons-io:commons-io:2.14.0"
implementation "org.apache.commons:commons-lang3:3.6"
implementation "org.apache.commons:commons-collections4:4.1"
implementation 'org.apache.commons:commons-text:1.11.0'
Expand Down Expand Up @@ -245,7 +246,7 @@ project("grobid-core") {
api "xerces:xercesImpl:2.12.0"
api "net.arnx:jsonic:1.3.10"
api "net.sf.saxon:Saxon-HE:9.6.0-9"
api "xom:xom:1.3.2"
api "xom:xom:1.3.9"
api 'javax.xml.bind:jaxb-api:2.3.0'

implementation "joda-time:joda-time:2.9.9"
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

import org.grobid.core.utilities.UnicodeUtil;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.wipo.nlp.textboundaries.ReTokenizer;
import org.wipo.nlp.textboundaries.ReTokenizerFactory;
import org.grobid.nlp.textboundaries.ReTokenizer;
import org.grobid.nlp.textboundaries.ReTokenizerFactory;

import java.util.List;
import java.util.ArrayList;
Expand Down
35 changes: 18 additions & 17 deletions grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ public List<BoundingBox> getCoordinates() {

List<BoundingBox> theBoxes = null;
// non graphic elements
if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
//theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
BoundingBox oneBox = BoundingBoxCalculator.calculateOneBox(layoutTokens, true);
List<BoundingBox> result = new ArrayList<BoundingBox>();
Expand All @@ -291,7 +291,7 @@ public List<BoundingBox> getCoordinates() {
// here we bound all figure graphics in one single box (given that we can have a hundred graphics
// in a single figure)
BoundingBox theGraphicsBox = null;
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
if (theGraphicsBox == null) {
theGraphicsBox = graphicObject.getBoundingBox();
Expand All @@ -307,8 +307,8 @@ public List<BoundingBox> getCoordinates() {
theBoxes.add(theGraphicsBox);
}

List<BoundingBox> result = new ArrayList<BoundingBox>();
if (theBoxes != null && theBoxes.size() > 0) {
List<BoundingBox> result = new ArrayList<>();
if (CollectionUtils.isNotEmpty(theBoxes)) {
BoundingBox oneBox = BoundingBoxCalculator.calculateOneBox(layoutTokens, true);
List<BoundingBox> mergedBox = VectorGraphicBoxCalculator.mergeBoxes(theBoxes);
result.addAll(mergedBox);
Expand All @@ -329,6 +329,7 @@ public boolean isCompleteForTEI() {

public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (!isCompleteForTEI()) {
LOGGER.warn("Found a figure that is badly formatted but it should have been spotted before. We ignore it now.");
return null;
}
Element figureElement = XmlBuilderUtils.teiElement("figure");
Expand All @@ -339,18 +340,18 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (config.isGenerateTeiCoordinates("figure")) {
List<BoundingBox> theBoxes = null;
// non graphic elements
if (getLayoutTokens() != null && getLayoutTokens().size() > 0) {
if (CollectionUtils.isNotEmpty(getLayoutTokens())) {
theBoxes = BoundingBoxCalculator.calculate(getLayoutTokens());
}

// if (getBitmapGraphicObjects() != null && !getBitmapGraphicObjects().isEmpty()) {
// -> note: this was restricted to the bitmap objects only... the bounding box calculation
// with vector graphics might need some double check

// here we bound all figure graphics in one single box (given that we can have hundred graphics
// here we bound all figure graphics in one single box (given that we can have a hundred graphics
// in a single figure)
BoundingBox theGraphicsBox = null;
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
if (theGraphicsBox == null) {
theGraphicsBox = graphicObject.getBoundingBox();
Expand All @@ -366,24 +367,24 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
theBoxes.add(theGraphicsBox);
}

if (theBoxes != null && theBoxes.size() > 0) {
if (CollectionUtils.isNotEmpty(theBoxes)) {
String coords = Joiner.on(";").join(theBoxes);
XmlBuilderUtils.addCoords(figureElement, coords);
}
}
if (header != null) {

if (StringUtils.isNotBlank(header)) {
Element head = XmlBuilderUtils.teiElement("head",
LayoutTokensUtil.normalizeText(header.toString()));
figureElement.appendChild(head);

}
if (label != null) {

if (StringUtils.isNotBlank(label)) {
Element labelEl = XmlBuilderUtils.teiElement("label",
LayoutTokensUtil.normalizeText(label.toString()));
figureElement.appendChild(labelEl);
}
if (caption != null) {

if (StringUtils.isNotBlank(caption)) {
Element desc = XmlBuilderUtils.teiElement("figDesc");
if (config.isGenerateTeiIds()) {
String divID = KeyGen.getKey().substring(0, 7);
Expand All @@ -392,12 +393,12 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

// if the segment has been parsed with the full text model we further extract the clusters
// to get the bibliographical references
if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();

MarkerType citationMarkerType = null;
if (markerTypes != null && markerTypes.size()>0) {
if (CollectionUtils.isNotEmpty(markerTypes)) {
citationMarkerType = markerTypes.get(0);
}

Expand Down Expand Up @@ -435,7 +436,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
// LayoutTokensUtil.normalizeText(caption.toString()));
}

if (desc != null && config.isWithSentenceSegmentation()) {
if (config.isWithSentenceSegmentation()) {
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the figure caption, for that we need to introduce
Expand All @@ -453,7 +454,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

figureElement.appendChild(desc);
}
if ((graphicObjects != null) && (graphicObjects.size() > 0)) {
if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
Element go = XmlBuilderUtils.teiElement("graphic");
String uri = graphicObject.getURI();
Expand Down
17 changes: 10 additions & 7 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.grobid.core.data;

import org.apache.commons.collections4.CollectionUtils;
import org.grobid.core.GrobidModels;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.table.Cell;
Expand Down Expand Up @@ -69,6 +70,7 @@ public boolean isCompleteForTEI() {
@Override
public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (!isCompleteForTEI()) {
LOGGER.warn("Found a table that is badly formatted but it should have been spotted before. We ignore it now.");
return null;
}

Expand Down Expand Up @@ -98,7 +100,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}*/

Element desc = null;
if (caption != null) {
if (StringUtils.isNotBlank(caption)) {
// if the segment has been parsed with the full text model we further extract the clusters
// to get the bibliographical references

Expand All @@ -111,16 +113,17 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();

MarkerType citationMarkerType = null;
if (CollectionUtils.isNotEmpty(markerTypes)) {
citationMarkerType = markerTypes.get(0);
}

for (TaggingTokenCluster cluster : clusters) {
if (cluster == null) {
continue;
}

MarkerType citationMarkerType = null;
if (markerTypes != null && markerTypes.size()>0) {
citationMarkerType = markerTypes.get(0);
}

TaggingLabel clusterLabel = cluster.getTaggingLabel();
//String clusterContent = LayoutTokensUtil.normalizeText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
Expand All @@ -144,7 +147,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
desc.appendChild(textNode(clusterContent));
}

if (desc != null && config.isWithSentenceSegmentation()) {
if (config.isWithSentenceSegmentation()) {
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());

// we need a sentence segmentation of the table caption, for that we need to introduce
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1887,7 +1887,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara

// in xom, the following gives all the text under the element, for the whole subtree
String text = curParagraph.getValue();
if (StringUtils.isEmpty(text))
if (StringUtils.isBlank(text))
return;

// identify ref nodes, ref spans and ref positions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import org.apache.commons.lang3.tuple.Pair;
import org.grobid.core.engines.label.TaggingLabels;
import org.grobid.core.utilities.Triple;
import org.wipo.analyzers.wipokr.utils.StringUtil;
import org.grobid.analyzers.grobidkr.utils.StringUtil;

import java.util.ArrayList;
import java.util.List;
Expand Down

0 comments on commit 183ccf3

Please sign in to comment.