From 7769f1f7d7feb0cd053f9901322ce9dbbafdb78b Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Sat, 26 Aug 2023 17:16:03 -0700 Subject: [PATCH] feat: Use jpype instead of subprocess --- pyproject.toml | 5 ++ tabula/io.py | 95 +++++++++++++++--------------------- tests/test_read_pdf_table.py | 87 --------------------------------- 3 files changed, 43 insertions(+), 144 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0d22d83..b4732f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "pandas >= 0.25.3", "numpy", "distro", + "jpype1", ] dynamic = ["version"] @@ -75,3 +76,7 @@ exclude = [ [tool.mypy] ignore_missing_imports = true + +[tool.pytest.ini_options] +# Disable faulthandler plugin on Windows to prevent spurious console noise +addopts = "-p no:faulthandler" diff --git a/tabula/io.py b/tabula/io.py index 2e63b17..706fd64 100644 --- a/tabula/io.py +++ b/tabula/io.py @@ -23,17 +23,18 @@ import os import platform import shlex -import subprocess from collections import defaultdict from copy import deepcopy from dataclasses import asdict from logging import getLogger from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +import jpype +import jpype.imports import numpy as np import pandas as pd -from .errors import CSVParseError, JavaNotFoundError +from .errors import CSVParseError from .file_util import localize_file from .template import load_template from .util import FileLikeObj, TabulaOption @@ -55,12 +56,29 @@ def _jar_path() -> str: return os.environ.get("TABULA_JAR", DEFAULT_CONFIG["JAR_PATH"]) +def _call_tabula_java(options: TabulaOption, path: Optional[str] = None) -> str: + from java.lang import StringBuilder + from org.apache.commons.cli import DefaultParser + from technology import tabula + + sb = StringBuilder() + parser = DefaultParser() + + args = options.build_option_list() + if path: + args.insert(0, path) + + cmd = parser.parse(tabula.CommandLineApp.buildOptions(), args) + tabula.CommandLineApp(sb, cmd).extractTables(cmd) + return str(sb.toString()) + + def _run( java_options: List[str], options: TabulaOption, path: Optional[str] = None, encoding: str = "utf-8", -) -> bytes: +) -> str: """Call tabula-java with the given lists of Java options and tabula-py options, as well as an optional path to pass to tabula-java as a regular argument and an optional encoding to use for any required output sent to @@ -69,37 +87,24 @@ def _run( tabula-py options are translated into tabula-java options, see :func:`build_options` for more information. """ - # Workaround to enforce the silent option. See: - # https://github.com/tabulapdf/tabula-java/issues/231#issuecomment-397281157 - if options.silent: - java_options.extend( - ( - "-Dorg.slf4j.simpleLogger.defaultLogLevel=off", - "-Dorg.apache.commons.logging.Log" - "=org.apache.commons.logging.impl.NoOpLog", + + if not jpype.isJVMStarted(): + jpype.addClassPath(_jar_path()) + + # Workaround to enforce the silent option. See: + # https://github.com/tabulapdf/tabula-java/issues/231#issuecomment-397281157 + if options.silent: + java_options.extend( + ( + "-Dorg.slf4j.simpleLogger.defaultLogLevel=off", + "-Dorg.apache.commons.logging.Log" + "=org.apache.commons.logging.impl.NoOpLog", + ) ) - ) - args = ["java"] + java_options + ["-jar", _jar_path()] + options.build_option_list() - if path: - args.append(path) + jpype.startJVM(*java_options, convertStrings=False) - try: - result = subprocess.run( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - stdin=subprocess.DEVNULL, - check=True, - ) - if result.stderr: - logger.warning(f"Got stderr: {result.stderr.decode(encoding)}") - return result.stdout - except FileNotFoundError: - raise JavaNotFoundError(JAVA_NOT_FOUND_ERROR) - except subprocess.CalledProcessError as e: - logger.error(f"Error from tabula-java:\n{e.stderr.decode(encoding)}\n") - raise + return _call_tabula_java(options, path) def read_pdf( @@ -242,12 +247,6 @@ def read_pdf( tabula.errors.CSVParseError: If pandas CSV parsing failed. - tabula.errors.JavaNotFoundError: - If java is not installed or found. - - subprocess.CalledProcessError: - If tabula-java execution failed. - Examples: @@ -437,7 +436,7 @@ def read_pdf( _pandas_options = deepcopy(pandas_options) fmt = tabula_options.format if fmt == "JSON": - raw_json: List[Any] = json.loads(output.decode(encoding)) + raw_json: List[Any] = json.loads(output) if multiple_tables: return _extract_from(raw_json, _pandas_options) else: @@ -447,7 +446,7 @@ def read_pdf( _pandas_options["encoding"] = _pandas_options.get("encoding", encoding) try: - return [pd.read_csv(io.BytesIO(output), **_pandas_options)] + return [pd.read_csv(io.StringIO(output), **_pandas_options)] except pd.errors.ParserError as e: message = "Error failed to create DataFrame with different column tables.\n" message += ( @@ -579,12 +578,6 @@ def read_pdf_with_template( tabula.errors.CSVParseError: If pandas CSV parsing failed. - tabula.errors.JavaNotFoundError: - If java is not installed or found. - - subprocess.CalledProcessError: - If tabula-java execution failed. - Examples: @@ -798,12 +791,6 @@ def convert_into( ValueError: If output_format is unknown format, or if downloaded remote file size is 0. - - tabula.errors.JavaNotFoundError: - If java is not installed or found. - - subprocess.CalledProcessError: - If tabula-java execution failed. """ if output_path is None or len(output_path) == 0: @@ -935,12 +922,6 @@ def convert_into_by_batch( Raises: ValueError: If input_dir doesn't exist. - - tabula.errors.JavaNotFoundError: - If java is not installed or found. - - subprocess.CalledProcessError: - If tabula-java execution failed. """ if input_dir is None or not os.path.isdir(input_dir): diff --git a/tests/test_read_pdf_table.py b/tests/test_read_pdf_table.py index c28ced4..5e575ec 100644 --- a/tests/test_read_pdf_table.py +++ b/tests/test_read_pdf_table.py @@ -1,13 +1,10 @@ import filecmp import json import os -import platform import shutil -import subprocess import tempfile import unittest import uuid -from unittest.mock import patch import pandas as pd # type: ignore @@ -287,33 +284,6 @@ def test_read_pdf_with_binary_template(self): self.assertEqual(len(dfs), 4) self.assertTrue(dfs[0].equals(pd.read_csv(self.expected_csv1))) - @patch("subprocess.run") - @patch("tabula.io._jar_path") - def test_read_pdf_with_jar_path(self, jar_func, mock_fun): - jar_func.return_value = "/tmp/tabula-java.jar" - - tabula.read_pdf(self.pdf_path, encoding="utf-8") - - target_args = ["java"] - if platform.system() == "Darwin": - target_args += ["-Djava.awt.headless=true"] - target_args += [ - "-Dfile.encoding=UTF8", - "-jar", - "/tmp/tabula-java.jar", - "--guess", - "--format", - "JSON", - "tests/resources/data.pdf", - ] - subp_args = { - "stdout": subprocess.PIPE, - "stderr": subprocess.PIPE, - "stdin": subprocess.DEVNULL, - "check": True, - } - mock_fun.assert_called_with(target_args, **subp_args) - def test_read_pdf_with_dtype_string(self): pdf_path = "tests/resources/data_dtype.pdf" expected_csv = "tests/resources/data_dtype_expected.csv" @@ -361,63 +331,6 @@ def test_read_pdf_with_dtype_string(self): dfs_template[0].equals(pd.read_csv(template_expected_csv, **pandas_options)) ) - @patch("subprocess.run") - @patch("tabula.io._jar_path") - def test_read_pdf_with_silent_false(self, jar_func, mock_fun): - jar_func.return_value = "/tmp/tabula-java.jar" - - tabula.read_pdf(self.pdf_path, encoding="utf-8", silent=False) - - target_args = ["java"] - if platform.system() == "Darwin": - target_args += ["-Djava.awt.headless=true"] - target_args += [ - "-Dfile.encoding=UTF8", - "-jar", - "/tmp/tabula-java.jar", - "--guess", - "--format", - "JSON", - "tests/resources/data.pdf", - ] - subp_args = { - "stdout": subprocess.PIPE, - "stderr": subprocess.PIPE, - "stdin": subprocess.DEVNULL, - "check": True, - } - mock_fun.assert_called_with(target_args, **subp_args) - - @patch("subprocess.run") - @patch("tabula.io._jar_path") - def test_read_pdf_with_silent_true(self, jar_func, mock_fun): - jar_func.return_value = "/tmp/tabula-java.jar" - - tabula.read_pdf(self.pdf_path, encoding="utf-8", silent=True) - - target_args = ["java"] - if platform.system() == "Darwin": - target_args += ["-Djava.awt.headless=true"] - target_args += [ - "-Dfile.encoding=UTF8", - "-Dorg.slf4j.simpleLogger.defaultLogLevel=off", - "-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog", - "-jar", - "/tmp/tabula-java.jar", - "--guess", - "--format", - "JSON", - "--silent", - "tests/resources/data.pdf", - ] - subp_args = { - "stdout": subprocess.PIPE, - "stderr": subprocess.PIPE, - "stdin": subprocess.DEVNULL, - "check": True, - } - mock_fun.assert_called_with(target_args, **subp_args) - if __name__ == "__main__": unittest.main()