diff --git a/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java b/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java
index 95798c36c9..1e97e411a1 100755
--- a/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java
@@ -1,21 +1,22 @@
 package org.grobid.core.engines;
 
-import org.chasen.crfpp.Tagger;
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.grobid.core.GrobidModel;
 import org.grobid.core.GrobidModels;
 import org.grobid.core.data.Affiliation;
+import org.grobid.core.engines.label.TaggingLabel;
+import org.grobid.core.engines.label.TaggingLabels;
 import org.grobid.core.exceptions.GrobidException;
 import org.grobid.core.features.FeaturesVectorAffiliationAddress;
 import org.grobid.core.layout.LayoutToken;
 import org.grobid.core.lexicon.Lexicon;
+import org.grobid.core.tokenization.TaggingTokenCluster;
+import org.grobid.core.tokenization.TaggingTokenClusteror;
+import org.grobid.core.utilities.LayoutTokensUtil;
 import org.grobid.core.utilities.OffsetPosition;
 import org.grobid.core.utilities.TextUtilities;
 import org.grobid.core.utilities.UnicodeUtil;
-import org.grobid.core.utilities.LayoutTokensUtil;
-import org.grobid.core.engines.tagging.GenericTaggerUtils;
-import org.grobid.core.tokenization.TaggingTokenCluster;
-import org.grobid.core.tokenization.TaggingTokenClusteror;
-import org.grobid.core.engines.label.TaggingLabel;
-import org.grobid.core.engines.label.TaggingLabels;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -24,8 +25,12 @@ public class AffiliationAddressParser extends AbstractParser {
 
     public Lexicon lexicon = Lexicon.getInstance();
 
+    protected AffiliationAddressParser(GrobidModel model) {
+        super(model);
+    }
+
     public AffiliationAddressParser() {
-        super(GrobidModels.AFFILIATION_ADDRESS);
+        this(GrobidModels.AFFILIATION_ADDRESS);
     }
 
     public List<Affiliation> processing(String input) {
@@ -78,22 +83,26 @@ protected static List<String> getAffiliationBlocks(List<LayoutToken> tokenizations) {
         return affiliationBlocks;
     }
 
+    /**
+     * Separate affiliation blocks when they appear to be in separate sets of offsets.
+     */
    protected static List<String> getAffiliationBlocksFromSegments(List<List<LayoutToken>> tokenizations) {
-        ArrayList<String> affiliationBlocks = new ArrayList<String>();
+        ArrayList<String> affiliationBlocks = new ArrayList<>();
        int end = 0;
        for(List<LayoutToken> tokenizationSegment : tokenizations) {
-            if (tokenizationSegment == null || tokenizationSegment.size() == 0)
+            if (CollectionUtils.isEmpty(tokenizationSegment))
                continue;
 
            // if we have an offset shift, we introduce a segmentation of the affiliation block
            LayoutToken startToken = tokenizationSegment.get(0);
            int start = startToken.getOffset();
-            if (start-end > 2)
+            if (start-end > 2 && end > 0)
                affiliationBlocks.add("\n");
 
            for(LayoutToken tok : tokenizationSegment) {
-                if (tok.getText().length() == 0)
+                if (StringUtils.isEmpty(tok.getText())) {
                    continue;
+                }
 
                if (!tok.getText().equals(" ")) {
                    if (tok.getText().equals("\n")) {
@@ -123,11 +132,11 @@ public List<Affiliation> processingLayoutTokens(List<List<LayoutToken>> tokenizations) {
 
        //System.out.println(affiliationBlocks.toString());
 
-        List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
-        List<List<OffsetPosition>> countriesPositions = new ArrayList<List<OffsetPosition>>();
+        List<List<OffsetPosition>> placesPositions = new ArrayList<>();
+        List<List<OffsetPosition>> countriesPositions = new ArrayList<>();
        placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizationsAffiliation));
        countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizationsAffiliation));
-        List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>();
+        List<List<LayoutToken>> allTokens = new ArrayList<>();
        allTokens.add(tokenizationsAffiliation);
 
        String affiliationSequenceWithFeatures = FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions);
diff --git a/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java b/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java
index 3f4b2d657d..c686a9cb97 100644
--- a/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java
@@ -1,33 +1,28 @@
 package org.grobid.core.engines;
 
-import org.junit.Before;
-import org.junit.BeforeClass;
-import org.junit.AfterClass;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import static org.hamcrest.Matchers.hasSize;
-import static org.hamcrest.Matchers.nullValue;
-import static org.junit.Assert.assertThat;
-import static org.hamcrest.CoreMatchers.is;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
 import com.google.common.base.Joiner;
-
+import org.grobid.core.GrobidModels;
 import org.grobid.core.analyzers.GrobidAnalyzer;
 import org.grobid.core.data.Affiliation;
 import org.grobid.core.factory.GrobidFactory;
 import org.grobid.core.features.FeaturesVectorAffiliationAddress;
 import org.grobid.core.layout.LayoutToken;
-import org.grobid.core.main.LibraryLoader;
 import org.grobid.core.utilities.GrobidProperties;
-import org.grobid.core.utilities.OffsetPosition;
 import org.grobid.core.utilities.LayoutTokensUtil;
+import org.grobid.core.utilities.OffsetPosition;
+import org.junit.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.Matchers.*;
+import static org.junit.Assert.assertThat;
 
 public class AffiliationAddressParserTest {
 
@@ -43,13 +38,13 @@ public class AffiliationAddressParserTest {
 
    @Before
    public void setUp() throws Exception {
-        this.target = new AffiliationAddressParser();
+        this.target = new AffiliationAddressParser(GrobidModels.DUMMY);
        this.analyzer = GrobidAnalyzer.getInstance();
    }
 
    @BeforeClass
    public static void init() {
-        LibraryLoader.load();
+//        LibraryLoader.load();
        GrobidProperties.getInstance();
    }
 
@@ -257,4 +252,109 @@ public void shouldExtractMultipleAffiliations() throws Exception {
            is("University of Madness")
        );
    }
+
+    @Test
+    @Ignore("This test is used to show the failing input data")
+    public void testResultExtractionLayoutTokensFromDLOutput() throws Exception {
+        String result = "\n" +
+            "\n" +
+            "Department\tdepartment\tD\tDe\tDep\tDepa\tt\tnt\tent\tment\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\tI-\n" +
+            "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t1\t0\tNOPUNCT\txx\t\t\n" +
+            "Radiation\tradiation\tR\tRa\tRad\tRadi\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" +
+            "Oncology\toncology\tO\tOn\tOnc\tOnco\ty\tgy\togy\tlogy\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" +
+            ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\t\n" +
+            "San\tsan\tS\tSa\tSan\tSan\tn\tan\tSan\tSan\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXxx\t\tI-\n" +
+            "Camillo\tcamillo\tC\tCa\tCam\tCami\to\tlo\tllo\tillo\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" +
+            "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t\t\n" +
+            "Forlanini\tforlanini\tF\tFo\tFor\tForl\ti\tni\tini\tnini\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" +
+            "Hospital\thospital\tH\tHo\tHos\tHosp\tl\tal\ttal\tital\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" +
+            ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\t\n" +
+            "Circonvallazione\tcirconvallazione\tC\tCi\tCir\tCirc\te\tne\tone\tione\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\tI-\n" +
+            "Gianicolense\tgianicolense\tG\tGi\tGia\tGian\te\tse\tnse\tense\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" +
+            ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\t\n" +
+            "87\t87\t8\t87\t87\t87\t7\t87\t87\t87\tLINESTART\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdd\t\tI-\n" +
+            "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t\t\n" +
+            "00152\t00152\t0\t00\t001\t0015\t2\t52\t152\t0152\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdddd\t\t\n" +
+            ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\t\n" +
+            "Rome\trome\tR\tRo\tRom\tRome\te\tme\tome\tRome\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\t0\t1\t0\tNOPUNCT\tXxxx\t\tI-\n" +
+            ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t1\t0\tCOMMA\t,\t\t\n" +
+            "Italy\titaly\tI\tIt\tIta\tItal\ty\tly\taly\ttaly\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t1\tNOPUNCT\tXxxx\t\tI-\n" +
+            ";\t;\t;\t;\t;\t;\t;\t;\t;\t;\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tPUNCT\t;\t\t\n";
+
+        List<LayoutToken> tokenizations = Arrays.stream(result.split("\n"))
+            .map(row -> new LayoutToken(row.split("\t")[0]))
+            .collect(Collectors.toList());
+
+        assertThat(target.resultExtractionLayoutTokens(result, tokenizations), hasSize(greaterThan(0)));
+    }
+
+
+    @Test
+    public void testResultExtractionLayoutTokensFromCRFOutput() throws Exception {
"MD\tmd\tM\tMD\tMD\tMD\tD\tMD\tMD\tMD\tLINESTART\tALLCAPS\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXX\t\tI-\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\tI-\n" + + "Department\tdepartment\tD\tDe\tDep\tDepa\tt\tnt\tent\tment\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\tI-\n" + + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t1\t0\tNOPUNCT\txx\t\t\n" + + "Radiation\tradiation\tR\tRa\tRad\tRadi\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + "Oncology\toncology\tO\tOn\tOnc\tOnco\ty\tgy\togy\tlogy\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\tI-\n" + + "San\tsan\tS\tSa\tSan\tSan\tn\tan\tSan\tSan\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXxx\t\tI-\n" + + "Camillo\tcamillo\tC\tCa\tCam\tCami\to\tlo\tllo\tillo\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t\t\n" + + "Forlanini\tforlanini\tF\tFo\tFor\tForl\ti\tni\tini\tnini\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + "Hospital\thospital\tH\tHo\tHos\tHosp\tl\tal\ttal\tital\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\tI-\n" + + "Circonvallazione\tcirconvallazione\tC\tCi\tCir\tCirc\te\tne\tone\tione\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\tI-\n" + + "Gianicolense\tgianicolense\tG\tGi\tGia\tGian\te\tse\tnse\tense\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\tI-\n" + + "87\t87\t8\t87\t87\t87\t7\t87\t87\t87\tLINESTART\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdd\t\tI-\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t\t\n" + + "00152\t00152\t0\t00\t001\t0015\t2\t52\t152\t0152\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdddd\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\tI-\n" + + "Rome\trome\tR\tRo\tRom\tRome\te\tme\tome\tRome\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\t0\t1\t0\tNOPUNCT\tXxxx\t\tI-\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t1\t0\tCOMMA\t,\t\tI-\n" + + "Italy\titaly\tI\tIt\tIta\tItal\ty\tly\taly\ttaly\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t1\tNOPUNCT\tXxxx\t\tI-\n" + + ";\t;\t;\t;\t;\t;\t;\t;\t;\t;\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tPUNCT\t;\t\t"; + + List tokenizations = Arrays.stream(result.split("\n")) + .map(row -> new LayoutToken(row.split("\t")[0])) + .collect(Collectors.toList()); + + assertThat(target.resultExtractionLayoutTokens(result, tokenizations), hasSize(greaterThan(0))); + } + + @Test + public void testGetAffiliationBlocksFromSegments_1() throws Exception { + String block1 = "Department of science, University of Science, University of Madness"; + List tokBlock1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block1); + tokBlock1.stream().forEach(t -> t.setOffset(t.getOffset() + 100)); + + String block2 = "Department of mental health, University of happyness, Italy"; + List tokBlock2 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block2); + tokBlock2.stream().forEach(t -> t.setOffset(t.getOffset() + 500)); + + List 
+        List<String> affiliationBlocksFromSegments = AffiliationAddressParser.getAffiliationBlocksFromSegments(Arrays.asList(tokBlock1, tokBlock2));
+
+        assertThat(affiliationBlocksFromSegments, hasSize(22));
+        assertThat(affiliationBlocksFromSegments.get(0), is(not(startsWith("\n"))));
+        assertThat(affiliationBlocksFromSegments.get(11), is("\n"));
+    }
+
+    @Test
+    public void testGetAffiliationBlocksFromSegments_2() throws Exception {
+        String block1 = "Department of science, University of Science, University of Madness";
+        List<LayoutToken> tokBlock1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block1);
+        tokBlock1.stream().forEach(t -> t.setOffset(t.getOffset() + 100));
+
+        String block2 = "Department of mental health, University of happyness, Italy";
+        List<LayoutToken> tokBlock2 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block2);
+        tokBlock2.stream().forEach(t -> t.setOffset(t.getOffset() + 100 + tokBlock1.size()));
+
+        List<String> affiliationBlocksFromSegments = AffiliationAddressParser.getAffiliationBlocksFromSegments(Arrays.asList(tokBlock1, tokBlock2));
+
+        assertThat(affiliationBlocksFromSegments, hasSize(21));
+        assertThat(affiliationBlocksFromSegments.get(0), is(not(startsWith("\n"))));
+        assertThat(affiliationBlocksFromSegments.get(11), is(not("@newline")));
+
+    }
 }
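
Note for reviewers: below is a minimal, hypothetical usage sketch (not part of the patch) of the offset-gap segmentation that getAffiliationBlocksFromSegments now applies and that testGetAffiliationBlocksFromSegments_1 exercises. The class name and main method are illustrative only; it assumes the code lives in the org.grobid.core.engines package (the method is protected static) and that GrobidProperties can be initialized as in the test's @BeforeClass.

package org.grobid.core.engines;

import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.GrobidProperties;

import java.util.Arrays;
import java.util.List;

// Hypothetical demo class, not part of the patch.
public class AffiliationSegmentationSketch {

    public static void main(String[] args) {
        // Same initialization as the test's @BeforeClass.
        GrobidProperties.getInstance();

        // Two affiliation segments whose layout offsets are far apart.
        List<LayoutToken> first = GrobidAnalyzer.getInstance()
                .tokenizeWithLayoutToken("Department of science, University of Science");
        first.forEach(t -> t.setOffset(t.getOffset() + 100));

        List<LayoutToken> second = GrobidAnalyzer.getInstance()
                .tokenizeWithLayoutToken("Department of mental health, Italy");
        second.forEach(t -> t.setOffset(t.getOffset() + 500));

        // With the patch, a "\n" separator is emitted only between segments whose
        // offset gap exceeds 2 (start - end > 2) and only after the first segment
        // (end > 0), so no spurious leading "\n" is produced anymore.
        List<String> blocks = AffiliationAddressParser
                .getAffiliationBlocksFromSegments(Arrays.asList(first, second));
        System.out.println(blocks);
    }
}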