diff --git a/pom.xml b/pom.xml index 27a03e73..281fae7a 100644 --- a/pom.xml +++ b/pom.xml @@ -262,7 +262,7 @@ org.apache.pdfbox pdfbox - 2.0.28 + 3.0.0 diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java index 3a6773a9..1abb7e77 100644 --- a/src/main/java/technology/tabula/CommandLineApp.java +++ b/src/main/java/technology/tabula/CommandLineApp.java @@ -158,7 +158,7 @@ public void extractFileInto(File pdfFile, File outputFile) throws ParseException private void extractFile(File pdfFile, Appendable outFile) throws ParseException { PDDocument pdfDocument = null; try { - pdfDocument = this.password == null ? PDDocument.load(pdfFile) : PDDocument.load(pdfFile, this.password); + pdfDocument = this.password == null ? org.apache.pdfbox.Loader.loadPDF(pdfFile) : org.apache.pdfbox.Loader.loadPDF(pdfFile, this.password); PageIterator pageIterator = getPageIterator(pdfDocument); List tables = new ArrayList<>(); diff --git a/src/main/java/technology/tabula/debug/Debug.java b/src/main/java/technology/tabula/debug/Debug.java index 91609045..02f860df 100644 --- a/src/main/java/technology/tabula/debug/Debug.java +++ b/src/main/java/technology/tabula/debug/Debug.java @@ -215,7 +215,7 @@ public static void renderPage(String pdfPath, String outPath, int pageNumber, Re boolean drawColumns, boolean drawCharacters, boolean drawArea, boolean drawCells, boolean drawUnprocessedRulings, boolean drawProjectionProfile, boolean drawClippingPaths, boolean drawDetectedTables) throws IOException { - PDDocument document = PDDocument.load(new File(pdfPath)); + PDDocument document = org.apache.pdfbox.Loader.loadPDF(new File(pdfPath)); ObjectExtractor oe = new ObjectExtractor(document); @@ -349,7 +349,7 @@ public static void main(String[] args) throws IOException { if (pages == null) { // user specified all pages - PDDocument document = PDDocument.load(pdfFile); + PDDocument document = org.apache.pdfbox.Loader.loadPDF(pdfFile); int numPages = document.getNumberOfPages(); pages = new ArrayList<>(numPages); diff --git a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java index fb43622a..c3bab2be 100644 --- a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java +++ b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java @@ -804,8 +804,7 @@ private List getVerticalRulings(BufferedImage image) { private PDDocument removeText(PDPage page) throws IOException { PDFStreamParser parser = new PDFStreamParser(page); - parser.parse(); - List tokens = parser.getTokens(); + List tokens = parser.parse(); List newTokens = new ArrayList<>(); for (Object token : tokens) { if (token instanceof Operator) { diff --git a/src/test/java/technology/tabula/TestCell.java b/src/test/java/technology/tabula/TestCell.java index de1b8cb8..adbffe1d 100644 --- a/src/test/java/technology/tabula/TestCell.java +++ b/src/test/java/technology/tabula/TestCell.java @@ -30,8 +30,7 @@ public void testIsPlaceholder() { public void testGetTextElements() { Cell cell = new Cell(0, 0, 0, 0); assertTrue(cell.getTextElements().isEmpty()); - - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, UtilsForTesting.HELVETICA_BOLD, 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); List tList = new ArrayList<>(); tList.add(tChunk); diff --git a/src/test/java/technology/tabula/TestLine.java b/src/test/java/technology/tabula/TestLine.java index 90df0e31..649b1924 100644 --- a/src/test/java/technology/tabula/TestLine.java +++ b/src/test/java/technology/tabula/TestLine.java @@ -6,6 +6,7 @@ import java.util.List; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Test; public class TestLine { @@ -14,7 +15,7 @@ public class TestLine { public void testSetTextElements() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); List tList = new ArrayList<>(); tList.add(tChunk); @@ -28,7 +29,7 @@ public void testSetTextElements() { public void testAddTextChunkIntTextChunk() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(3, tChunk); @@ -39,7 +40,7 @@ public void testAddTextChunkIntTextChunk() { public void testLessThanAddTextChunkIntTextChunk() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(0, tChunk); line.addTextChunk(0, tChunk); @@ -51,7 +52,7 @@ public void testLessThanAddTextChunkIntTextChunk() { public void testErrorAddTextChunkIntTextChunk() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(-1, tChunk); } @@ -60,7 +61,7 @@ public void testErrorAddTextChunkIntTextChunk() { public void testToString() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(0, tChunk); line.addTextChunk(0, tChunk); diff --git a/src/test/java/technology/tabula/TestObjectExtractor.java b/src/test/java/technology/tabula/TestObjectExtractor.java index 9db7ad18..dcdc7573 100644 --- a/src/test/java/technology/tabula/TestObjectExtractor.java +++ b/src/test/java/technology/tabula/TestObjectExtractor.java @@ -12,16 +12,9 @@ public class TestObjectExtractor { - /*@Test(expected=IOException.class) - public void testWrongPasswordRaisesException() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf")); - ObjectExtractor oe = new ObjectExtractor(pdf_document, "wrongpass"); - oe.extract().next(); - }*/ - @Test(expected = IOException.class) public void testEmptyOnEncryptedFileRaisesException() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf")); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/encrypted.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { oe.extract().next(); } @@ -29,7 +22,7 @@ public void testEmptyOnEncryptedFileRaisesException() throws IOException { @Test public void testCanReadPDFWithOwnerEncryption() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); int i = 0; @@ -44,7 +37,7 @@ public void testCanReadPDFWithOwnerEncryption() throws IOException { @Test public void testGoodPassword() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword"); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword"); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { List pages = new ArrayList<>(); PageIterator pi = oe.extract(); @@ -58,7 +51,7 @@ public void testGoodPassword() throws IOException { @Test public void testTextExtractionDoesNotRaise() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/rotated_page.pdf")); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/rotated_page.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); @@ -70,7 +63,7 @@ public void testTextExtractionDoesNotRaise() throws IOException { @Test public void testShouldDetectRulings() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf")); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); @@ -85,7 +78,7 @@ public void testShouldDetectRulings() throws IOException { @Test public void testDontThrowNPEInShfill() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/labor.pdf")); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/labor.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); @@ -101,7 +94,7 @@ public void testDontThrowNPEInShfill() throws IOException { @Test public void testExtractOnePage() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); assertEquals(2, pdf_document.getNumberOfPages()); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { @@ -114,7 +107,7 @@ public void testExtractOnePage() throws IOException { @Test(expected = IndexOutOfBoundsException.class) public void testExtractWrongPageNumber() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); assertEquals(2, pdf_document.getNumberOfPages()); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { @@ -124,7 +117,7 @@ public void testExtractWrongPageNumber() throws IOException { @Test public void testTextElementsContainedInPage() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf")); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { Page page = oe.extractPage(1); @@ -137,7 +130,7 @@ public void testTextElementsContainedInPage() throws IOException { } @Test public void testDoNotNPEInPointComparator() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/npe_issue_206.pdf")); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/npe_issue_206.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { Page p = oe.extractPage(1); diff --git a/src/test/java/technology/tabula/TestProjectionProfile.java b/src/test/java/technology/tabula/TestProjectionProfile.java index e7af882f..738ca3dc 100644 --- a/src/test/java/technology/tabula/TestProjectionProfile.java +++ b/src/test/java/technology/tabula/TestProjectionProfile.java @@ -21,8 +21,8 @@ public void setUpProjectionProfile() { PDPage pdPage = new PDPage(); PDDocument pdDocument = new PDDocument(); - TextElement textElement = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "test", 1f); - TextElement textElement2 = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "test", 1f); + TextElement textElement = new TextElement(5f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "test", 1f); + TextElement textElement2 = new TextElement(5f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "test", 1f); List textList = new ArrayList<>(); textList.add(textElement); textList.add(textElement2); diff --git a/src/test/java/technology/tabula/TestTableDetection.java b/src/test/java/technology/tabula/TestTableDetection.java index 6e58f6a4..d1247811 100644 --- a/src/test/java/technology/tabula/TestTableDetection.java +++ b/src/test/java/technology/tabula/TestTableDetection.java @@ -162,7 +162,7 @@ public void testDetectionOfTables() throws Exception { NodeList tables = regionDocument.getElementsByTagName("table"); // tabula extractors - PDDocument pdfDocument = PDDocument.load(this.pdf); + PDDocument pdfDocument = org.apache.pdfbox.Loader.loadPDF(this.pdf); ObjectExtractor extractor = new ObjectExtractor(pdfDocument); // parse expected tables from the ground truth dataset diff --git a/src/test/java/technology/tabula/TestTextElement.java b/src/test/java/technology/tabula/TestTextElement.java index feaaa5e6..bdb06484 100644 --- a/src/test/java/technology/tabula/TestTextElement.java +++ b/src/test/java/technology/tabula/TestTextElement.java @@ -4,6 +4,7 @@ import java.util.List; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Assert; import org.junit.Test; @@ -13,7 +14,7 @@ public class TestTextElement { @Test public void createTextElement() { - TextElement textElement = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f); + TextElement textElement = new TextElement(5f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f); Assert.assertNotNull(textElement); Assert.assertEquals("A", textElement.getText()); @@ -22,7 +23,7 @@ public void createTextElement() { Assert.assertEquals(5f, textElement.getTop(), 0); Assert.assertEquals(10f, textElement.getWidth(), 0); Assert.assertEquals(20f, textElement.getHeight(), 0); - Assert.assertEquals(PDType1Font.HELVETICA, textElement.getFont()); + Assert.assertEquals(UtilsForTesting.HELVETICA, textElement.getFont()); Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); Assert.assertEquals(0f, textElement.getDirection(), 0); @@ -32,7 +33,7 @@ public void createTextElement() { @Test public void createTextElementWithDirection() { - TextElement textElement = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f); + TextElement textElement = new TextElement(5f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f); Assert.assertNotNull(textElement); Assert.assertEquals("A", textElement.getText()); @@ -41,7 +42,7 @@ public void createTextElementWithDirection() { Assert.assertEquals(5f, textElement.getTop(), 0); Assert.assertEquals(10f, textElement.getWidth(), 0); Assert.assertEquals(20f, textElement.getHeight(), 0); - Assert.assertEquals(PDType1Font.HELVETICA, textElement.getFont()); + Assert.assertEquals(UtilsForTesting.HELVETICA, textElement.getFont()); Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); Assert.assertEquals(6f, textElement.getDirection(), 0); @@ -52,18 +53,18 @@ public void createTextElementWithDirection() { public void mergeFourElementsIntoFourWords() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(20f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(40f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(60f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); + elements.add(new TextElement(0f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); + elements.add(new TextElement(20f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "B", 1f, 6f)); + elements.add(new TextElement(40f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "C", 1f, 6f)); + elements.add(new TextElement(60f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - expectedWords.add(new TextChunk(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(20f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(40f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(60f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(0f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(20f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "B", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(40f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "C", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(60f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "D", 1f, 6f))); Assert.assertEquals(expectedWords, words); @@ -73,18 +74,18 @@ public void mergeFourElementsIntoFourWords() { public void mergeFourElementsIntoOneWord() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); + elements.add(new TextElement(0f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "B", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - textChunk.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - textChunk.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); + TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 25f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "B", 1f, 6f)); + textChunk.add(new TextElement(0f, 35f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "C", 1f, 6f)); + textChunk.add(new TextElement(0f, 45f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "D", 1f, 6f)); expectedWords.add(textChunk); Assert.assertEquals(expectedWords, words); @@ -99,10 +100,10 @@ public void mergeElementsShouldBeIdempotent() { */ List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); + elements.add(new TextElement(0f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "B", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List words2 = TextElement.mergeWords(elements); @@ -113,20 +114,20 @@ public void mergeElementsShouldBeIdempotent() { public void mergeElementsWithSkippingRules() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 17f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(0.001f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, " ", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.TIMES_ROMAN, 10f, "D", 1f, 6f)); + elements.add(new TextElement(0f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 17f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "B", 1f, 6f)); + elements.add(new TextElement(0.001f, 25f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, " ", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, UtilsForTesting.TIMES_ROMAN, 10f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - textChunk.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - textChunk.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.TIMES_ROMAN, 10f, "D", 1f, 6f)); + TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 25f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "B", 1f, 6f)); + textChunk.add(new TextElement(0f, 35f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "C", 1f, 6f)); + textChunk.add(new TextElement(0f, 45f, 10f, 20f, UtilsForTesting.TIMES_ROMAN, 10f, "D", 1f, 6f)); expectedWords.add(textChunk); Assert.assertEquals(expectedWords, words); @@ -137,30 +138,30 @@ public void mergeElementsWithSkippingRules() { public void mergeTenElementsIntoTwoWords() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - elements.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - elements.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - elements.add(new TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 60f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - elements.add(new TextElement(0f, 70f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - elements.add(new TextElement(0f, 80f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - elements.add(new TextElement(0f, 90f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - elements.add(new TextElement(0f, 100f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); + elements.add(new TextElement(0f, 0f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "H", 1f, 6f)); + elements.add(new TextElement(0f, 10f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "O", 1f, 6f)); + elements.add(new TextElement(0f, 20f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "L", 1f, 6f)); + elements.add(new TextElement(0f, 30f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 60f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "M", 1f, 6f)); + elements.add(new TextElement(0f, 70f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "U", 1f, 6f)); + elements.add(new TextElement(0f, 80f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "N", 1f, 6f)); + elements.add(new TextElement(0f, 90f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "D", 1f, 6f)); + elements.add(new TextElement(0f, 100f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "O", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - textChunk.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - textChunk.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10.5f, 20f, PDType1Font.HELVETICA, 1f, " ", 1f)); //Check why width=10.5? + TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "H", 1f, 6f)); + textChunk.add(new TextElement(0f, 10f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "O", 1f, 6f)); + textChunk.add(new TextElement(0f, 20f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "L", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10.5f, 20f, UtilsForTesting.HELVETICA, 1f, " ", 1f)); //Check why width=10.5? expectedWords.add(textChunk); - TextChunk textChunk2 = new TextChunk(new TextElement(0f, 60f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - textChunk2.add(new TextElement(0f, 70f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - textChunk2.add(new TextElement(0f, 80f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - textChunk2.add(new TextElement(0f, 90f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - textChunk2.add(new TextElement(0f, 100f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); + TextChunk textChunk2 = new TextChunk(new TextElement(0f, 60f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "M", 1f, 6f)); + textChunk2.add(new TextElement(0f, 70f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "U", 1f, 6f)); + textChunk2.add(new TextElement(0f, 80f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "N", 1f, 6f)); + textChunk2.add(new TextElement(0f, 90f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "D", 1f, 6f)); + textChunk2.add(new TextElement(0f, 100f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "O", 1f, 6f)); expectedWords.add(textChunk2); Assert.assertEquals(2, words.size()); @@ -172,29 +173,29 @@ public void mergeTenElementsIntoTwoWords() { public void mergeTenElementsIntoTwoLines() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - elements.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - elements.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - elements.add(new TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(20f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - elements.add(new TextElement(20f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - elements.add(new TextElement(20f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - elements.add(new TextElement(20f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - elements.add(new TextElement(20f, 40f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); + elements.add(new TextElement(0f, 0f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "H", 1f, 6f)); + elements.add(new TextElement(0f, 10f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "O", 1f, 6f)); + elements.add(new TextElement(0f, 20f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "L", 1f, 6f)); + elements.add(new TextElement(0f, 30f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); + elements.add(new TextElement(20f, 0f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "M", 1f, 6f)); + elements.add(new TextElement(20f, 10f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "U", 1f, 6f)); + elements.add(new TextElement(20f, 20f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "N", 1f, 6f)); + elements.add(new TextElement(20f, 30f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "D", 1f, 6f)); + elements.add(new TextElement(20f, 40f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "O", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - textChunk.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - textChunk.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); + TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "H", 1f, 6f)); + textChunk.add(new TextElement(0f, 10f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "O", 1f, 6f)); + textChunk.add(new TextElement(0f, 20f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "L", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "A", 1f, 6f)); expectedWords.add(textChunk); - TextChunk textChunk2 = new TextChunk(new TextElement(20f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - textChunk2.add(new TextElement(20f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - textChunk2.add(new TextElement(20f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - textChunk2.add(new TextElement(20f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - textChunk2.add(new TextElement(20f, 40f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); + TextChunk textChunk2 = new TextChunk(new TextElement(20f, 0f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "M", 1f, 6f)); + textChunk2.add(new TextElement(20f, 10f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "U", 1f, 6f)); + textChunk2.add(new TextElement(20f, 20f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "N", 1f, 6f)); + textChunk2.add(new TextElement(20f, 30f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "D", 1f, 6f)); + textChunk2.add(new TextElement(20f, 40f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "O", 1f, 6f)); expectedWords.add(textChunk2); Assert.assertEquals(2, words.size()); diff --git a/src/test/java/technology/tabula/TestUtils.java b/src/test/java/technology/tabula/TestUtils.java index e68411df..6dccc758 100644 --- a/src/test/java/technology/tabula/TestUtils.java +++ b/src/test/java/technology/tabula/TestUtils.java @@ -122,7 +122,7 @@ public void testQuickSortLongList() { @Test public void testJPEG2000DoesNotRaise() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/jpeg2000.pdf")); + PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/jpeg2000.pdf")); PDPage page = pdf_document.getPage(0); Utils.pageConvertToImage(pdf_document, page, 360, ImageType.RGB); } diff --git a/src/test/java/technology/tabula/UtilsForTesting.java b/src/test/java/technology/tabula/UtilsForTesting.java index 3ee8efde..c2a47aea 100644 --- a/src/test/java/technology/tabula/UtilsForTesting.java +++ b/src/test/java/technology/tabula/UtilsForTesting.java @@ -8,10 +8,16 @@ import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVPrinter; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Assert; public class UtilsForTesting { + public static PDType1Font HELVETICA = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + public static PDType1Font HELVETICA_BOLD = new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD); + public static PDType1Font TIMES_ROMAN = new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN); + public static Page getAreaFromFirstPage(String path, float top, float left, float bottom, float right) throws IOException { return getAreaFromPage(path, 1, top, left, bottom, right); } @@ -23,8 +29,8 @@ public static Page getAreaFromPage(String path, int page, float top, float left, public static Page getPage(String path, int pageNumber) throws IOException { ObjectExtractor oe = null; try { - PDDocument document = PDDocument - .load(new File(path)); + PDDocument document = org.apache.pdfbox.Loader + .loadPDF(new File(path)); oe = new ObjectExtractor(document); Page page = oe.extract(pageNumber); return page;