Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collect "other" text on request #1212

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1287,7 +1287,11 @@ public void setInstitution(String inst) {
}

/**
 * Adds note text to this item, accumulating across calls instead of overwriting.
 *
 * The input is whitespace-normalized first. If no note is stored yet (null, empty or
 * blank), the normalized text becomes the note. Otherwise the normalized text is
 * appended to the existing note, separated by a single space.
 *
 * @param not raw note text; may be null or blank, in which case an already stored
 *            note is left untouched
 */
public void setNote(String not) {
    // normalizeSpace returns null for null input, so guard before concatenating
    String normalized = StringUtils.normalizeSpace(not);
    if (StringUtils.isBlank(this.note)) {
        // nothing stored yet: take the new value as-is
        note = normalized;
    } else if (StringUtils.isNotBlank(normalized)) {
        // keep what was collected so far and append only real content
        note += " " + normalized;
    }
}

public void setAffiliation(String a) {
Expand Down
23 changes: 23 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ public boolean apply(GraphicObject graphicObject) {
private List<BoundingBox> textArea;
private List<LayoutToken> layoutTokens;

private List<List<LayoutToken>> discardedPiecesTokens = new ArrayList<>();

// coordinates
private int page = -1;
private double y = 0.0;
Expand Down Expand Up @@ -454,6 +456,16 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form

figureElement.appendChild(desc);
}

if (CollectionUtils.isNotEmpty(discardedPiecesTokens)) {
for (List<LayoutToken> discardedPieceTokens : discardedPiecesTokens) {
Element note = XmlBuilderUtils.teiElement("note");
note.addAttribute(new Attribute("type", "other"));
note.appendChild(LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(discardedPieceTokens)).trim());
figureElement.appendChild(note);
}
}

if (CollectionUtils.isNotEmpty(graphicObjects)) {
for (GraphicObject graphicObject : graphicObjects) {
Element go = XmlBuilderUtils.teiElement("graphic");
Expand Down Expand Up @@ -574,4 +586,15 @@ public void setUri(URI uri) {
this.uri = uri;
}

/**
 * Returns the pieces of layout tokens recorded as discarded for this figure.
 * Each inner list is one piece; toTEI serializes every piece as a separate
 * {@code note} element with {@code type="other"} under the figure element.
 *
 * @return the stored list itself (not a copy); this is whatever was last assigned
 *         through the setter, so it can be null if null was assigned
 */
public List<List<LayoutToken>> getDiscardedPiecesTokens() {
return discardedPiecesTokens;
}

/**
 * Replaces the whole collection of discarded token pieces of this figure.
 *
 * @param discardedPiecesTokens the new list of pieces; it is stored by reference
 *        without copying, and no null check is performed here
 */
public void setDiscardedPiecesTokens(List<List<LayoutToken>> discardedPiecesTokens) {
this.discardedPiecesTokens = discardedPiecesTokens;
}

/**
 * Appends one piece of discarded layout tokens to this figure.
 *
 * The list is re-created on demand when it is currently null (which can happen
 * after null was passed to setDiscardedPiecesTokens), so this method does not
 * throw a NullPointerException in that situation.
 *
 * @param pieceToken the tokens of the discarded piece; stored by reference
 */
public void addDiscardedPieceTokens(List<LayoutToken> pieceToken) {
    if (this.discardedPiecesTokens == null) {
        this.discardedPiecesTokens = new ArrayList<>();
    }
    this.discardedPiecesTokens.add(pieceToken);
}
}
25 changes: 24 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ public class Table extends Figure {
private List<LayoutToken> noteLayoutTokens = null;
private String labeledNote = null;

private List<List<LayoutToken>> discardedPiecesTokens = new ArrayList<>();


public void setGoodTable(boolean goodTable) {
this.goodTable = goodTable;
Expand Down Expand Up @@ -254,8 +256,18 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
tableElement.appendChild(desc);
tableElement.appendChild(contentEl);

if (noteNode != null)
if (noteNode != null) {
tableElement.appendChild(noteNode);
}

if (CollectionUtils.isNotEmpty(discardedPiecesTokens)) {
for (List<LayoutToken> discardedPieceTokens : discardedPiecesTokens) {
Element note = XmlBuilderUtils.teiElement("note");
note.addAttribute(new Attribute("type", "other"));
note.appendChild(LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(discardedPieceTokens)).trim());
tableElement.appendChild(note);
}
}

return tableElement.toXML();
}
Expand Down Expand Up @@ -436,4 +448,15 @@ public String getTeiId() {
return "tab_" + this.id;
}

/**
 * Returns the pieces of layout tokens recorded as discarded for this table.
 * Each inner list is one piece; toTEI serializes every piece as a separate
 * {@code note} element with {@code type="other"} under the table element.
 *
 * NOTE(review): Table extends Figure, and Figure declares a field and accessors
 * with the same names; this class reads its own separate list, not the one
 * declared in Figure.
 *
 * @return the stored list itself (not a copy); this is whatever was last assigned
 *         through the setter, so it can be null if null was assigned
 */
public List<List<LayoutToken>> getDiscardedPiecesTokens() {
return discardedPiecesTokens;
}

/**
 * Replaces the whole collection of discarded token pieces of this table.
 *
 * @param discardedPiecesTokens the new list of pieces; it is stored by reference
 *        without copying, and no null check is performed here
 */
public void setDiscardedPiecesTokens(List<List<LayoutToken>> discardedPiecesTokens) {
this.discardedPiecesTokens = discardedPiecesTokens;
}

/**
 * Appends one piece of discarded layout tokens to this table.
 *
 * The list is re-created on demand when it is currently null (which can happen
 * after null was passed to setDiscardedPiecesTokens), so this method does not
 * throw a NullPointerException in that situation.
 *
 * @param pieceToken the tokens of the discarded piece; stored by reference
 */
public void addDiscardedPieceTokens(List<LayoutToken> pieceToken) {
    if (this.discardedPiecesTokens == null) {
        this.discardedPiecesTokens = new ArrayList<>();
    }
    this.discardedPiecesTokens.add(pieceToken);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;

import org.grobid.core.data.BibDataSet;
Expand Down Expand Up @@ -163,7 +164,7 @@ static public Document generalResultSegmentation(Document doc, String labeledRes
Block block = docBlocks.get(blockIndex);
List<LayoutToken> tokens = block.getTokens();
String localText = block.getText();
if ( (tokens == null) || (localText == null) || (localText.trim().length() == 0) ) {
if ( (tokens == null) || StringUtils.isBlank(localText) ) {
blockIndex++;
indexLine = 0;
if (blockIndex < docBlocks.size()) {
Expand Down Expand Up @@ -274,7 +275,7 @@ static public Document generalResultSegmentation(Document doc, String labeledRes

if (blockIndex == docBlocks.size()) {
// the last labelled piece has still to be added
if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
if (!StringUtils.equals(curPlainLabel, lastPlainLabel) && lastPlainLabel != null) {
if ( (pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos()) &&
(pointerA.getTokenDocPos() != -1) ) {
labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
Expand Down
Loading