Skip to content

Commit

Permalink
feat: Use jpype instead of subprocess
Browse files Browse the repository at this point in the history
  • Loading branch information
chezou committed Aug 27, 2023
1 parent b24e3bd commit 17e8dd2
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 144 deletions.
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ dependencies = [
"pandas >= 0.25.3",
"numpy",
"distro",
"jpype1",
]
dynamic = ["version"]

Expand Down Expand Up @@ -75,3 +76,7 @@ exclude = [

[tool.mypy]
ignore_missing_imports = true

[tool.pytest.ini_options]
# Disable faulthandler plugin on Windows to prevent spurious console noise
addopts = "-p no:faulthandler"
95 changes: 38 additions & 57 deletions tabula/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,18 @@
import os
import platform
import shlex
import subprocess
from collections import defaultdict
from copy import deepcopy
from dataclasses import asdict
from logging import getLogger
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import jpype
import jpype.imports
import numpy as np
import pandas as pd

from .errors import CSVParseError, JavaNotFoundError
from .errors import CSVParseError
from .file_util import localize_file
from .template import load_template
from .util import FileLikeObj, TabulaOption
Expand All @@ -55,12 +56,29 @@ def _jar_path() -> str:
return os.environ.get("TABULA_JAR", DEFAULT_CONFIG["JAR_PATH"])


def _call_tabula_java(options: TabulaOption, path: Optional[str] = None) -> str:
    """Invoke tabula-java's ``CommandLineApp`` in-process and return its output.

    The tabula-py options are turned into a tabula-java argument list, parsed
    with Apache Commons CLI, and handed to ``CommandLineApp.extractTables``.

    Args:
        options: tabula-py options; ``build_option_list()`` supplies the
            command line arguments.
        path: Optional input path. When truthy it is placed at the front of
            the argument list.

    Returns:
        The contents of the ``StringBuilder`` given to ``CommandLineApp``,
        converted to a Python ``str``.
    """
    # Java packages are imported inside the function rather than at module
    # level. NOTE(review): presumably they only resolve through
    # ``jpype.imports`` once the JVM is running with the tabula jar on the
    # class path -- confirm against the caller.
    from java.lang import StringBuilder
    from org.apache.commons.cli import DefaultParser
    from technology import tabula

    cli_args = options.build_option_list()
    if path:
        cli_args.insert(0, path)

    output_buffer = StringBuilder()
    command_line = DefaultParser().parse(
        tabula.CommandLineApp.buildOptions(), cli_args
    )
    app = tabula.CommandLineApp(output_buffer, command_line)
    app.extractTables(command_line)

    # Wrap in str() so callers receive a Python string.
    return str(output_buffer.toString())


def _run(
java_options: List[str],
options: TabulaOption,
path: Optional[str] = None,
encoding: str = "utf-8",
) -> bytes:
) -> str:
"""Call tabula-java with the given lists of Java options and tabula-py
options, as well as an optional path to pass to tabula-java as a regular
argument and an optional encoding to use for any required output sent to
Expand All @@ -69,37 +87,24 @@ def _run(
tabula-py options are translated into tabula-java options, see
:func:`build_options` for more information.
"""
# Workaround to enforce the silent option. See:
# https://github.com/tabulapdf/tabula-java/issues/231#issuecomment-397281157
if options.silent:
java_options.extend(
(
"-Dorg.slf4j.simpleLogger.defaultLogLevel=off",
"-Dorg.apache.commons.logging.Log"
"=org.apache.commons.logging.impl.NoOpLog",

if not jpype.isJVMStarted():
jpype.addClassPath(_jar_path())

# Workaround to enforce the silent option. See:
# https://github.com/tabulapdf/tabula-java/issues/231#issuecomment-397281157
if options.silent:
java_options.extend(
(
"-Dorg.slf4j.simpleLogger.defaultLogLevel=off",
"-Dorg.apache.commons.logging.Log"
"=org.apache.commons.logging.impl.NoOpLog",
)
)
)

args = ["java"] + java_options + ["-jar", _jar_path()] + options.build_option_list()
if path:
args.append(path)
jpype.startJVM(*java_options, convertStrings=False)

try:
result = subprocess.run(
args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stdin=subprocess.DEVNULL,
check=True,
)
if result.stderr:
logger.warning(f"Got stderr: {result.stderr.decode(encoding)}")
return result.stdout
except FileNotFoundError:
raise JavaNotFoundError(JAVA_NOT_FOUND_ERROR)
except subprocess.CalledProcessError as e:
logger.error(f"Error from tabula-java:\n{e.stderr.decode(encoding)}\n")
raise
return _call_tabula_java(options, path)


def read_pdf(
Expand Down Expand Up @@ -242,12 +247,6 @@ def read_pdf(
tabula.errors.CSVParseError:
If pandas CSV parsing failed.
tabula.errors.JavaNotFoundError:
If java is not installed or found.
subprocess.CalledProcessError:
If tabula-java execution failed.
Examples:
Expand Down Expand Up @@ -437,7 +436,7 @@ def read_pdf(
_pandas_options = deepcopy(pandas_options)
fmt = tabula_options.format
if fmt == "JSON":
raw_json: List[Any] = json.loads(output.decode(encoding))
raw_json: List[Any] = json.loads(output)
if multiple_tables:
return _extract_from(raw_json, _pandas_options)
else:
Expand All @@ -447,7 +446,7 @@ def read_pdf(
_pandas_options["encoding"] = _pandas_options.get("encoding", encoding)

try:
return [pd.read_csv(io.BytesIO(output), **_pandas_options)]
return [pd.read_csv(io.StringIO(output), **_pandas_options)]
except pd.errors.ParserError as e:
message = "Error failed to create DataFrame with different column tables.\n"
message += (
Expand Down Expand Up @@ -579,12 +578,6 @@ def read_pdf_with_template(
tabula.errors.CSVParseError:
If pandas CSV parsing failed.
tabula.errors.JavaNotFoundError:
If java is not installed or found.
subprocess.CalledProcessError:
If tabula-java execution failed.
Examples:
Expand Down Expand Up @@ -798,12 +791,6 @@ def convert_into(
ValueError:
If output_format is unknown format, or if downloaded remote file size is 0.
tabula.errors.JavaNotFoundError:
If java is not installed or found.
subprocess.CalledProcessError:
If tabula-java execution failed.
"""

if output_path is None or len(output_path) == 0:
Expand Down Expand Up @@ -935,12 +922,6 @@ def convert_into_by_batch(
Raises:
ValueError:
If input_dir doesn't exist.
tabula.errors.JavaNotFoundError:
If java is not installed or found.
subprocess.CalledProcessError:
If tabula-java execution failed.
"""

if input_dir is None or not os.path.isdir(input_dir):
Expand Down
87 changes: 0 additions & 87 deletions tests/test_read_pdf_table.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
import filecmp
import json
import os
import platform
import shutil
import subprocess
import tempfile
import unittest
import uuid
from unittest.mock import patch

import pandas as pd # type: ignore

Expand Down Expand Up @@ -287,33 +284,6 @@ def test_read_pdf_with_binary_template(self):
self.assertEqual(len(dfs), 4)
self.assertTrue(dfs[0].equals(pd.read_csv(self.expected_csv1)))

@patch("subprocess.run")
@patch("tabula.io._jar_path")
def test_read_pdf_with_jar_path(self, jar_func, mock_fun):
jar_func.return_value = "/tmp/tabula-java.jar"

tabula.read_pdf(self.pdf_path, encoding="utf-8")

target_args = ["java"]
if platform.system() == "Darwin":
target_args += ["-Djava.awt.headless=true"]
target_args += [
"-Dfile.encoding=UTF8",
"-jar",
"/tmp/tabula-java.jar",
"--guess",
"--format",
"JSON",
"tests/resources/data.pdf",
]
subp_args = {
"stdout": subprocess.PIPE,
"stderr": subprocess.PIPE,
"stdin": subprocess.DEVNULL,
"check": True,
}
mock_fun.assert_called_with(target_args, **subp_args)

def test_read_pdf_with_dtype_string(self):
pdf_path = "tests/resources/data_dtype.pdf"
expected_csv = "tests/resources/data_dtype_expected.csv"
Expand Down Expand Up @@ -361,63 +331,6 @@ def test_read_pdf_with_dtype_string(self):
dfs_template[0].equals(pd.read_csv(template_expected_csv, **pandas_options))
)

@patch("subprocess.run")
@patch("tabula.io._jar_path")
def test_read_pdf_with_silent_false(self, jar_func, mock_fun):
jar_func.return_value = "/tmp/tabula-java.jar"

tabula.read_pdf(self.pdf_path, encoding="utf-8", silent=False)

target_args = ["java"]
if platform.system() == "Darwin":
target_args += ["-Djava.awt.headless=true"]
target_args += [
"-Dfile.encoding=UTF8",
"-jar",
"/tmp/tabula-java.jar",
"--guess",
"--format",
"JSON",
"tests/resources/data.pdf",
]
subp_args = {
"stdout": subprocess.PIPE,
"stderr": subprocess.PIPE,
"stdin": subprocess.DEVNULL,
"check": True,
}
mock_fun.assert_called_with(target_args, **subp_args)

@patch("subprocess.run")
@patch("tabula.io._jar_path")
def test_read_pdf_with_silent_true(self, jar_func, mock_fun):
jar_func.return_value = "/tmp/tabula-java.jar"

tabula.read_pdf(self.pdf_path, encoding="utf-8", silent=True)

target_args = ["java"]
if platform.system() == "Darwin":
target_args += ["-Djava.awt.headless=true"]
target_args += [
"-Dfile.encoding=UTF8",
"-Dorg.slf4j.simpleLogger.defaultLogLevel=off",
"-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog",
"-jar",
"/tmp/tabula-java.jar",
"--guess",
"--format",
"JSON",
"--silent",
"tests/resources/data.pdf",
]
subp_args = {
"stdout": subprocess.PIPE,
"stderr": subprocess.PIPE,
"stdin": subprocess.DEVNULL,
"check": True,
}
mock_fun.assert_called_with(target_args, **subp_args)


if __name__ == "__main__":
unittest.main()

0 comments on commit 17e8dd2

Please sign in to comment.