diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 60ee746f..c5c4f709 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -20,11 +20,8 @@ jobs: git-crypt unlock /tmp/git-crypt-key rm /tmp/git-crypt-key - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Build image uses: docker/build-push-action@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6bd336e7..5d88d7dd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,7 +47,7 @@ repos: hooks: - id: test name: test - entry: poetry run task test + entry: poetry run task short_test language: system pass_filenames: false types: [python] diff --git a/Dockerfile b/Dockerfile index 2c482c44..8744f498 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,7 +34,7 @@ COPY monopoly ./monopoly COPY tests ./tests RUN poetry install -CMD ["python", "-m", "poetry", "run", "task", "test"] +CMD ["python", "-m", "poetry", "run", "task", "full_test"] FROM base AS runtime diff --git a/monopoly/examples/__init__.py b/monopoly/examples/__init__.py new file mode 100644 index 00000000..e562e3e1 --- /dev/null +++ b/monopoly/examples/__init__.py @@ -0,0 +1,3 @@ +from .example_bank import MonopolyBank + +__all__ = ["MonopolyBank"] diff --git a/monopoly/examples/example_bank.py b/monopoly/examples/example_bank.py new file mode 100644 index 00000000..32c46c7d --- /dev/null +++ b/monopoly/examples/example_bank.py @@ -0,0 +1,20 @@ +from monopoly.banks.base import BankBase +from monopoly.config import StatementConfig +from monopoly.constants import AccountType, BankNames + + +# fmt: off +class MonopolyBank(BankBase): + """Dummy class to help with reading the example PDF statement""" + statement_config = StatementConfig( + bank_name=BankNames.EXAMPLE, + account_type=AccountType.CREDIT, + transaction_pattern=( + r"(?P<date>\d+/\d+)\s*" + 
r"(?P<description>.*?)\s*" + r"(?P<amount>[\d.,]+)$" + ), + transaction_date_format=r"%d/%m", + statement_date_pattern=r"\d{2}\-\d{2}\-\d{4}", + statement_date_format=r"%d-%m-%Y", + ) diff --git a/monopoly/examples/multiple_statements.py b/monopoly/examples/multiple_statements.py index e30c3be9..fe82d0fc 100644 --- a/monopoly/examples/multiple_statements.py +++ b/monopoly/examples/multiple_statements.py @@ -12,7 +12,8 @@ def ocbc_example(): bank = Ocbc( file_path=file_path, ) - statement = bank.extract() + pages = bank.get_pages() + statement = bank.extract(pages) transformed_df = bank.transform(statement) bank.load(transformed_df, statement) diff --git a/monopoly/examples/single_statement.py b/monopoly/examples/single_statement.py index ff72bb50..53fd9c7b 100644 --- a/monopoly/examples/single_statement.py +++ b/monopoly/examples/single_statement.py @@ -1,23 +1,4 @@ -from monopoly.banks.base import BankBase -from monopoly.config import StatementConfig -from monopoly.constants import AccountType, BankNames - - -# fmt: off -class MonopolyBank(BankBase): - """Dummy class to help with reading the example PDF statement""" - statement_config = StatementConfig( - bank_name=BankNames.EXAMPLE, - account_type=AccountType.CREDIT, - transaction_pattern=( - r"(?P<date>\d+/\d+)\s*" - r"(?P<description>.*?)\s*" - r"(?P<amount>[\d.,]+)$" - ), - transaction_date_format=r"%d/%m", - statement_date_pattern=r"\d{2}\-\d{2}\-\d{4}", - statement_date_format=r"%d-%m-%Y", - ) +from monopoly.examples import MonopolyBank def example(): @@ -30,7 +11,8 @@ def example(): # This runs Tesseract on the PDF and # extracts transactions as raw text - statement = bank.extract() + pages = bank.get_pages() + statement = bank.extract(pages) # Dates are converted into an ISO 8601 date format transformed_df = bank.transform(statement) diff --git a/monopoly/main.py b/monopoly/main.py index 4d19196f..cd230435 100644 --- a/monopoly/main.py +++ b/monopoly/main.py @@ -38,7 +38,8 @@ def process_bank_statement(message: Message, banks: dict): with 
message.save(attachment) as file: processor: StatementProcessor = bank_class(file_path=file) - statement = processor.extract() + pages = processor.get_pages() + statement = processor.extract(pages) transformed_df = processor.transform(statement) processor.load(transformed_df, statement, upload_to_cloud=True) diff --git a/monopoly/pdf.py b/monopoly/pdf.py index 719bbdef..2166d3b1 100644 --- a/monopoly/pdf.py +++ b/monopoly/pdf.py @@ -42,24 +42,29 @@ def __init__(self, file_path: str, config: PdfConfig = None): self.static_string = config.static_string self.remove_vertical_text = True - def open(self): + def open(self, password_override: str = None): + """ + Opens a PDF document. Accepts a manual password override, + if the user does not want to set passwords in the .env file. + """ logger.info("Opening pdf from path %s", self.file_path) document = fitz.Document(self.file_path) + password = self.password or password_override if not document.is_encrypted: return document - if self.password and not self.brute_force_mask: - document.authenticate(self.password) + if password: + document.authenticate(password) if document.is_encrypted: - raise ValueError("Wrong password - document is encrypted") + raise ValueError("Wrong password - unable to open document") return document # This attempts to unlock statements based on a common password, # followed by the last few digits of a card - if document.is_encrypted and self.brute_force_mask: + if not password and self.brute_force_mask and self.static_string: logger.info("Unlocking PDF using a string prefix with mask") password = self.unlock_pdf( pdf_file_path=self.file_path, diff --git a/monopoly/processor.py b/monopoly/processor.py index 57b03470..419c31f3 100644 --- a/monopoly/processor.py +++ b/monopoly/processor.py @@ -1,5 +1,4 @@ import logging -from dataclasses import dataclass from datetime import datetime from typing import Optional @@ -7,23 +6,25 @@ from monopoly.config import PdfConfig, StatementConfig, settings from 
monopoly.constants import StatementFields -from monopoly.pdf import PdfParser +from monopoly.pdf import PdfPage, PdfParser from monopoly.statement import Statement from monopoly.storage import upload_to_cloud_storage, write_to_csv logger = logging.getLogger(__name__) -@dataclass -class StatementProcessor: - statement_config: StatementConfig - file_path: str - pdf_config: Optional[PdfConfig] = None - transform_dates: bool = True +class StatementProcessor(PdfParser): + def __init__( + self, statement_config, file_path, pdf_config=None, transform_dates=True + ): + self.statement_config: StatementConfig = statement_config + self.file_path: str = file_path + self.pdf_config: Optional[PdfConfig] = pdf_config + self.transform_dates: bool = transform_dates + + super().__init__(file_path=self.file_path, config=pdf_config) - def extract(self) -> Statement: - parser = PdfParser(self.file_path, self.pdf_config) - pages = parser.get_pages() + def extract(self, pages: list[PdfPage]) -> Statement: statement = Statement(pages, self.statement_config) if not statement.transactions: diff --git a/pyproject.toml b/pyproject.toml index 6f3cd1b0..1903df66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,8 @@ pylint-pydantic = "^0.3.0" [tool.taskipy.tasks] format = "isort . && black ." 
lint = "flake8 monopoly && pylint monopoly" -test = "pytest -n auto" +short_test = "pytest --ignore tests/integration/banks" +full_test = "pytest -n auto" -ci = "poetry run task format && poetry run task lint && poetry run task test" +ci = "poetry run task format && poetry run task lint && poetry run task full_test" [tool.pylint] diff --git a/tests/integration/banks/citibank/test_citibank_extract.py b/tests/integration/banks/citibank/test_citibank_extract.py index 360af79c..657f4802 100644 --- a/tests/integration/banks/citibank/test_citibank_extract.py +++ b/tests/integration/banks/citibank/test_citibank_extract.py @@ -6,7 +6,8 @@ def test_citibank_extract_unprotected_pdf(citibank: Citibank): - raw_df = citibank.extract().df + pages = citibank.get_pages() + raw_df = citibank.extract(pages).df expected_df = pd.read_csv("tests/integration/fixtures/citibank/expected.csv") assert_frame_equal(raw_df, expected_df) diff --git a/tests/integration/banks/hsbc/test_hsbc_extract.py b/tests/integration/banks/hsbc/test_hsbc_extract.py index 25e0d29a..8a461266 100644 --- a/tests/integration/banks/hsbc/test_hsbc_extract.py +++ b/tests/integration/banks/hsbc/test_hsbc_extract.py @@ -6,7 +6,8 @@ def test_hsbc_extract_unprotected_pdf(hsbc: Hsbc): - raw_df = hsbc.extract().df + pages = hsbc.get_pages() + raw_df = hsbc.extract(pages).df expected_df = pd.read_csv("tests/integration/fixtures/hsbc/expected.csv") assert_frame_equal(raw_df, expected_df) diff --git a/tests/integration/banks/ocbc/test_ocbc_extract.py b/tests/integration/banks/ocbc/test_ocbc_extract.py index acf3be83..bbfe3c07 100644 --- a/tests/integration/banks/ocbc/test_ocbc_extract.py +++ b/tests/integration/banks/ocbc/test_ocbc_extract.py @@ -6,7 +6,8 @@ def test_ocbc_extract_unprotected_pdf(ocbc: Ocbc): - raw_df = ocbc.extract().df + pages = ocbc.get_pages() + raw_df = ocbc.extract(pages).df expected_df = pd.read_csv("tests/integration/fixtures/ocbc/expected.csv") diff --git a/tests/integration/test_parser.py b/tests/integration/test_parser.py index 7c61470a..ada643b6 100644 --- 
a/tests/integration/test_parser.py +++ b/tests/integration/test_parser.py @@ -1,5 +1,6 @@ -import pytest +from pytest import raises +from monopoly.banks import Hsbc from monopoly.pdf import PdfParser @@ -14,7 +15,7 @@ def test_wrong_password_raises_error(parser: PdfParser): parser.file_path = "tests/integration/fixtures/protected.pdf" parser.password = "wrong_pw" - with pytest.raises(ValueError, match="document is encrypted"): + with raises(ValueError, match="Wrong password"): parser.open() @@ -30,7 +31,7 @@ def test_get_pages_invalid_returns_error(parser: PdfParser): parser.file_path = "tests/integration/fixtures/4_pages_blank.pdf" parser.page_range = slice(99, -99) - with pytest.raises(ValueError, match="bad page number"): + with raises(ValueError, match="bad page number"): parser.get_pages() @@ -42,3 +43,17 @@ def test_pdf_unlock(parser: PdfParser): ) assert password == "foobar123" + + +def test_override_password(hsbc: Hsbc): + hsbc = Hsbc("tests/integration/fixtures/protected.pdf") + + document = hsbc.open(password_override="foobar123") + assert not document.is_encrypted + + +def test_error_raised_if_override_is_wrong(hsbc: Hsbc): + hsbc = Hsbc("tests/integration/fixtures/protected.pdf") + + with raises(ValueError, match="Wrong password"): + hsbc.open(password_override="wrongpw")