Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Migrate to PDFBox 3.0.0 #414

Closed
[author] wants to merge 2 commits into [base branch] from [head branch]
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.28</version>
<version>3.0.0</version>
</dependency>

<dependency>
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/technology/tabula/CommandLineApp.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ public void extractFileInto(File pdfFile, File outputFile) throws ParseException
private void extractFile(File pdfFile, Appendable outFile) throws ParseException {
PDDocument pdfDocument = null;
try {
pdfDocument = this.password == null ? PDDocument.load(pdfFile) : PDDocument.load(pdfFile, this.password);
pdfDocument = this.password == null ? org.apache.pdfbox.Loader.loadPDF(pdfFile) : org.apache.pdfbox.Loader.loadPDF(pdfFile, this.password);
PageIterator pageIterator = getPageIterator(pdfDocument);
List<Table> tables = new ArrayList<>();

Expand Down
4 changes: 2 additions & 2 deletions src/main/java/technology/tabula/debug/Debug.java
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ public static void renderPage(String pdfPath, String outPath, int pageNumber, Re
boolean drawColumns, boolean drawCharacters, boolean drawArea, boolean drawCells,
boolean drawUnprocessedRulings, boolean drawProjectionProfile, boolean drawClippingPaths,
boolean drawDetectedTables) throws IOException {
PDDocument document = PDDocument.load(new File(pdfPath));
PDDocument document = org.apache.pdfbox.Loader.loadPDF(new File(pdfPath));

ObjectExtractor oe = new ObjectExtractor(document);

Expand Down Expand Up @@ -349,7 +349,7 @@ public static void main(String[] args) throws IOException {

if (pages == null) {
// user specified all pages
PDDocument document = PDDocument.load(pdfFile);
PDDocument document = org.apache.pdfbox.Loader.loadPDF(pdfFile);

int numPages = document.getNumberOfPages();
pages = new ArrayList<>(numPages);
Expand Down
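3 changes: 1 addition & 2 deletions src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java
Original file line number Diff line number Diff line change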
Expand Up @@ -804,8 +804,7 @@ private List<Ruling> getVerticalRulings(BufferedImage image) {
private PDDocument removeText(PDPage page) throws IOException {

PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List<Object> tokens = parser.getTokens();
List<Object> tokens = parser.parse();
List<Object> newTokens = new ArrayList<>();
for (Object token : tokens) {
if (token instanceof Operator) {
Expand Down
3 changes: 1 addition & 2 deletions src/test/java/technology/tabula/TestCell.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ public void testIsPlaceholder() {
public void testGetTextElements() {
Cell cell = new Cell(0, 0, 0, 0);
assertTrue(cell.getTextElements().isEmpty());

TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5);
TextElement tElement = new TextElement(0, 0, 0, 0, UtilsForTesting.HELVETICA_BOLD, 10, "test", 5);
TextChunk tChunk = new TextChunk(tElement);
List<TextChunk> tList = new ArrayList<>();
tList.add(tChunk);
Expand Down
11 changes: 6 additions & 5 deletions src/test/java/technology/tabula/TestLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import java.util.List;

import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.junit.Test;

public class TestLine {
Expand All @@ -14,7 +15,7 @@ public class TestLine {
public void testSetTextElements() {
Line line = new Line();

TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5);
TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10, "test", 5);
TextChunk tChunk = new TextChunk(tElement);
List<TextChunk> tList = new ArrayList<>();
tList.add(tChunk);
Expand All @@ -28,7 +29,7 @@ public void testSetTextElements() {
public void testAddTextChunkIntTextChunk() {
Line line = new Line();

TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5);
TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10, "test", 5);
TextChunk tChunk = new TextChunk(tElement);
line.addTextChunk(3, tChunk);

Expand All @@ -39,7 +40,7 @@ public void testAddTextChunkIntTextChunk() {
public void testLessThanAddTextChunkIntTextChunk() {
Line line = new Line();

TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5);
TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10, "test", 5);
TextChunk tChunk = new TextChunk(tElement);
line.addTextChunk(0, tChunk);
line.addTextChunk(0, tChunk);
Expand All @@ -51,7 +52,7 @@ public void testLessThanAddTextChunkIntTextChunk() {
public void testErrorAddTextChunkIntTextChunk() {
Line line = new Line();

TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5);
TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10, "test", 5);
TextChunk tChunk = new TextChunk(tElement);
line.addTextChunk(-1, tChunk);
}
Expand All @@ -60,7 +61,7 @@ public void testErrorAddTextChunkIntTextChunk() {
public void testToString() {
Line line = new Line();

TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5);
TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10, "test", 5);
TextChunk tChunk = new TextChunk(tElement);
line.addTextChunk(0, tChunk);
line.addTextChunk(0, tChunk);
Expand Down
27 changes: 10 additions & 17 deletions src/test/java/technology/tabula/TestObjectExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,17 @@

public class TestObjectExtractor {

/*@Test(expected=IOException.class)
public void testWrongPasswordRaisesException() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"));
ObjectExtractor oe = new ObjectExtractor(pdf_document, "wrongpass");
oe.extract().next();
}*/

@Test(expected = IOException.class)
public void testEmptyOnEncryptedFileRaisesException() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/encrypted.pdf"));
try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
oe.extract().next();
}
}

@Test
public void testCanReadPDFWithOwnerEncryption() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
PageIterator pi = oe.extract();
int i = 0;
Expand All @@ -44,7 +37,7 @@ public void testCanReadPDFWithOwnerEncryption() throws IOException {

@Test
public void testGoodPassword() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword");
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword");
try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
List<Page> pages = new ArrayList<>();
PageIterator pi = oe.extract();
Expand All @@ -58,7 +51,7 @@ public void testGoodPassword() throws IOException {

@Test
public void testTextExtractionDoesNotRaise() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/rotated_page.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/rotated_page.pdf"));
try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
PageIterator pi = oe.extract();

Expand All @@ -70,7 +63,7 @@ public void testTextExtractionDoesNotRaise() throws IOException {

@Test
public void testShouldDetectRulings() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf"));
try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
PageIterator pi = oe.extract();

Expand All @@ -85,7 +78,7 @@ public void testShouldDetectRulings() throws IOException {

@Test
public void testDontThrowNPEInShfill() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/labor.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/labor.pdf"));

try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
PageIterator pi = oe.extract();
Expand All @@ -101,7 +94,7 @@ public void testDontThrowNPEInShfill() throws IOException {

@Test
public void testExtractOnePage() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
assertEquals(2, pdf_document.getNumberOfPages());

try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
Expand All @@ -114,7 +107,7 @@ public void testExtractOnePage() throws IOException {

@Test(expected = IndexOutOfBoundsException.class)
public void testExtractWrongPageNumber() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
assertEquals(2, pdf_document.getNumberOfPages());

try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
Expand All @@ -124,7 +117,7 @@ public void testExtractWrongPageNumber() throws IOException {

@Test
public void testTextElementsContainedInPage() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf"));

try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
Page page = oe.extractPage(1);
Expand All @@ -137,7 +130,7 @@ public void testTextElementsContainedInPage() throws IOException {
}

@Test public void testDoNotNPEInPointComparator() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/npe_issue_206.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/npe_issue_206.pdf"));

try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
Page p = oe.extractPage(1);
Expand Down
4 changes: 2 additions & 2 deletions src/test/java/technology/tabula/TestProjectionProfile.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ public void setUpProjectionProfile() {
PDPage pdPage = new PDPage();
PDDocument pdDocument = new PDDocument();

TextElement textElement = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "test", 1f);
TextElement textElement2 = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "test", 1f);
TextElement textElement = new TextElement(5f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "test", 1f);
TextElement textElement2 = new TextElement(5f, 15f, 10f, 20f, UtilsForTesting.HELVETICA, 1f, "test", 1f);
List<TextElement> textList = new ArrayList<>();
textList.add(textElement);
textList.add(textElement2);
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/technology/tabula/TestTableDetection.java
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ public void testDetectionOfTables() throws Exception {
NodeList tables = regionDocument.getElementsByTagName("table");

// tabula extractors
PDDocument pdfDocument = PDDocument.load(this.pdf);
PDDocument pdfDocument = org.apache.pdfbox.Loader.loadPDF(this.pdf);
ObjectExtractor extractor = new ObjectExtractor(pdfDocument);

// parse expected tables from the ground truth dataset
Expand Down
Loading