Skip to content

Commit

Permalink
Merge pull request #1238 from kermitt2/bugfix/table-notes
Browse files Browse the repository at this point in the history
Fix the way table notes are serialized to the XML
  • Loading branch information
lfoppiano authored Feb 8, 2025
2 parents d72ef0e + f73413a commit 3702e4d
Showing 1 changed file with 13 additions and 14 deletions.
27 changes: 13 additions & 14 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import nu.xom.Element;
import nu.xom.Node;

import static org.grobid.core.document.TEIFormatter.isNewParagraph;
import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
import static org.grobid.core.document.xml.XmlBuilderUtils.textNode;
Expand Down Expand Up @@ -185,6 +186,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

if (StringUtils.isNotBlank(labeledNote)) {
Element p = teiElement("p");
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledNote, noteLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();
for (TaggingTokenCluster cluster : clusters) {
Expand All @@ -193,7 +195,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

MarkerType citationMarkerType = null;
if (markerTypes != null && markerTypes.size()>0) {
if (CollectionUtils.isNotEmpty(markerTypes)) {
citationMarkerType = markerTypes.get(0);
}

Expand All @@ -210,30 +212,27 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
citationMarkerType);
if (refNodes != null) {
for (Node n : refNodes) {
noteNode.appendChild(n);
p.appendChild(n);
}
}
} catch(Exception e) {
LOGGER.warn("Problem when serializing TEI fragment for table note", e);
}
} else {
noteNode.appendChild(textNode(clusterContent));
if (p.getChildCount() > 0 && isNewParagraph(clusterLabel, p)) {
noteNode.appendChild(p);
p = teiElement("p");
}
p.appendChild(textNode(clusterContent));
}

if (noteNode != null && config.isWithSentenceSegmentation()) {
if (config.isWithSentenceSegmentation()) {
// we need a sentence segmentation of the table note
formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
}

// enclose note content in a <p> element
if (noteNode != null) {
noteNode.setLocalName("p");

Element tabNote = XmlBuilderUtils.teiElement("note");
tabNote.appendChild(noteNode);

noteNode = tabNote;
}
}
if (p.getChildCount() > 0) {
noteNode.appendChild(p);
}
} else {
noteNode = XmlBuilderUtils.teiElement("note", LayoutTokensUtil.normalizeText(note.toString()).trim());
Expand Down

0 comments on commit 3702e4d

Please sign in to comment.