Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(banks): allow for manual password override #25

Merged
merged 4 commits into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,8 @@ jobs:
git-crypt unlock /tmp/git-crypt-key
rm /tmp/git-crypt-key

- name: Set up QEMU
uses: docker/setup-qemu-action@v2

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
uses: docker/setup-buildx-action@v3

- name: Build image
uses: docker/build-push-action@v4
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ repos:
hooks:
- id: test
name: test
entry: poetry run task test
entry: poetry run task short_test
language: system
pass_filenames: false
types: [python]
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ COPY monopoly ./monopoly
COPY tests ./tests
RUN poetry install

CMD ["python", "-m", "poetry", "run", "task", "test"]
CMD ["python", "-m", "poetry", "run", "task", "full_test"]

FROM base AS runtime

Expand Down
3 changes: 3 additions & 0 deletions monopoly/examples/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .example_bank import MonopolyBank

__all__ = ["MonopolyBank"]
20 changes: 20 additions & 0 deletions monopoly/examples/example_bank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from monopoly.banks.base import BankBase
from monopoly.config import StatementConfig
from monopoly.constants import AccountType, BankNames


# fmt: off
class MonopolyBank(BankBase):
    """
    Placeholder bank whose only job is to parse the example PDF statement.

    All behaviour comes from ``BankBase``; this class only supplies the
    statement configuration (regex and date formats) for the example file.
    """
    statement_config = StatementConfig(
        bank_name=BankNames.EXAMPLE,
        account_type=AccountType.CREDIT,
        # Named groups, in order: `date` (digits/digits), a lazily matched
        # `description`, and an `amount` of digits, dots and commas anchored
        # at the end of the line.
        transaction_pattern=r"(?P<date>\d+/\d+)\s*(?P<description>.*?)\s*(?P<amount>[\d.,]+)$",
        transaction_date_format=r"%d/%m",
        statement_date_pattern=r"\d{2}\-\d{2}\-\d{4}",
        statement_date_format=r"%d-%m-%Y",
    )
3 changes: 2 additions & 1 deletion monopoly/examples/multiple_statements.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ def ocbc_example():
bank = Ocbc(
file_path=file_path,
)
statement = bank.extract()
pages = bank.get_pages()
statement = bank.extract(pages)
transformed_df = bank.transform(statement)
bank.load(transformed_df, statement)

Expand Down
24 changes: 3 additions & 21 deletions monopoly/examples/single_statement.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,4 @@
from monopoly.banks.base import BankBase
from monopoly.config import StatementConfig
from monopoly.constants import AccountType, BankNames


# fmt: off
class MonopolyBank(BankBase):
"""Dummy class to help with reading the example PDF statement"""
statement_config = StatementConfig(
bank_name=BankNames.EXAMPLE,
account_type=AccountType.CREDIT,
transaction_pattern=(
r"(?P<date>\d+/\d+)\s*"
r"(?P<description>.*?)\s*"
r"(?P<amount>[\d.,]+)$"
),
transaction_date_format=r"%d/%m",
statement_date_pattern=r"\d{2}\-\d{2}\-\d{4}",
statement_date_format=r"%d-%m-%Y",
)
from monopoly.examples import MonopolyBank


def example():
Expand All @@ -30,7 +11,8 @@ def example():

# This runs Tesseract on the PDF and
# extracts transactions as raw text
statement = bank.extract()
pages = bank.get_pages()
statement = bank.extract(pages)

# Dates are converted into an ISO 8601 date format
transformed_df = bank.transform(statement)
Expand Down
3 changes: 2 additions & 1 deletion monopoly/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def process_bank_statement(message: Message, banks: dict):

with message.save(attachment) as file:
processor: StatementProcessor = bank_class(file_path=file)
statement = processor.extract()
pages = processor.get_pages()
statement = processor.extract(pages)
transformed_df = processor.transform(statement)
processor.load(transformed_df, statement, upload_to_cloud=True)

Expand Down
15 changes: 10 additions & 5 deletions monopoly/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,29 @@ def __init__(self, file_path: str, config: PdfConfig = None):
self.static_string = config.static_string
self.remove_vertical_text = True

def open(self):
def open(self, password_override: str = None):
"""
Opens a PDF document. Accepts a manual password override,
if the user does not want to set passwords in the .env file.
"""
logger.info("Opening pdf from path %s", self.file_path)
document = fitz.Document(self.file_path)
password = self.password or password_override

if not document.is_encrypted:
return document

if self.password and not self.brute_force_mask:
document.authenticate(self.password)
if password:
document.authenticate(password)

if document.is_encrypted:
raise ValueError("Wrong password - document is encrypted")
raise ValueError("Wrong password - unable to open document")

return document

# This attempts to unlock statements based on a common password,
# followed by the last few digits of a card
if document.is_encrypted and self.brute_force_mask:
if not password and self.brute_force_mask and self.static_string:
logger.info("Unlocking PDF using a string prefix with mask")
password = self.unlock_pdf(
pdf_file_path=self.file_path,
Expand Down
23 changes: 12 additions & 11 deletions monopoly/processor.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,30 @@
import logging
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

from pandas import DataFrame

from monopoly.config import PdfConfig, StatementConfig, settings
from monopoly.constants import StatementFields
from monopoly.pdf import PdfParser
from monopoly.pdf import PdfPage, PdfParser
from monopoly.statement import Statement
from monopoly.storage import upload_to_cloud_storage, write_to_csv

logger = logging.getLogger(__name__)


@dataclass
class StatementProcessor:
statement_config: StatementConfig
file_path: str
pdf_config: Optional[PdfConfig] = None
transform_dates: bool = True
class StatementProcessor(PdfParser):
def __init__(
self, statement_config, file_path, pdf_config=None, transform_dates=True
):
self.statement_config: StatementConfig = statement_config
self.file_path: str = file_path
self.pdf_config: Optional[PdfConfig] = pdf_config
self.transform_dates: bool = transform_dates

super().__init__(file_path=self.file_path, config=pdf_config)

def extract(self) -> Statement:
parser = PdfParser(self.file_path, self.pdf_config)
pages = parser.get_pages()
def extract(self, pages: list[PdfPage]) -> Statement:
statement = Statement(pages, self.statement_config)

if not statement.transactions:
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ pylint-pydantic = "^0.3.0"
[tool.taskipy.tasks]
format = "isort . && black ."
lint = "flake8 monopoly && pylint monopoly"
test = "pytest -n auto"
short_test = "pytest --ignore tests/integration/banks"
full_test = "pytest -n auto"
# The former "test" task was split into short_test / full_test; CI runs the full suite.
ci = "poetry run task format && poetry run task lint && poetry run task full_test"

[tool.pylint]
Expand Down
3 changes: 2 additions & 1 deletion tests/integration/banks/citibank/test_citibank_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@


def test_citibank_extract_unprotected_pdf(citibank: Citibank):
raw_df = citibank.extract().df
pages = citibank.get_pages()
raw_df = citibank.extract(pages).df
expected_df = pd.read_csv("tests/integration/fixtures/citibank/expected.csv")

assert_frame_equal(raw_df, expected_df)
Expand Down
3 changes: 2 additions & 1 deletion tests/integration/banks/hsbc/test_hsbc_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@


def test_hsbc_extract_unprotected_pdf(hsbc: Hsbc):
raw_df = hsbc.extract().df
pages = hsbc.get_pages()
raw_df = hsbc.extract(pages).df
expected_df = pd.read_csv("tests/integration/fixtures/hsbc/expected.csv")

assert_frame_equal(raw_df, expected_df)
Expand Down
3 changes: 2 additions & 1 deletion tests/integration/banks/ocbc/test_ocbc_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@


def test_ocbc_extract_unprotected_pdf(ocbc: Ocbc):
raw_df = ocbc.extract().df
pages = ocbc.get_pages()
raw_df = ocbc.extract(pages).df

expected_df = pd.read_csv("tests/integration/fixtures/ocbc/expected.csv")

Expand Down
21 changes: 18 additions & 3 deletions tests/integration/test_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from pytest import raises

from monopoly.banks import Hsbc
from monopoly.pdf import PdfParser


Expand All @@ -14,7 +15,7 @@ def test_wrong_password_raises_error(parser: PdfParser):
parser.file_path = "tests/integration/fixtures/protected.pdf"
parser.password = "wrong_pw"

with pytest.raises(ValueError, match="document is encrypted"):
with raises(ValueError, match="Wrong password"):
parser.open()


Expand All @@ -30,7 +31,7 @@ def test_get_pages_invalid_returns_error(parser: PdfParser):
parser.file_path = "tests/integration/fixtures/4_pages_blank.pdf"
parser.page_range = slice(99, -99)

with pytest.raises(ValueError, match="bad page number"):
with raises(ValueError, match="bad page number"):
parser.get_pages()


Expand All @@ -42,3 +43,17 @@ def test_pdf_unlock(parser: PdfParser):
)

assert password == "foobar123"


def test_override_password(hsbc: Hsbc):
    """Opening the protected fixture with the right override yields a decrypted document."""
    # Build a fresh instance for the protected fixture instead of rebinding
    # the name of the injected `hsbc` fixture.
    protected_bank = Hsbc("tests/integration/fixtures/protected.pdf")

    opened_document = protected_bank.open(password_override="foobar123")
    assert not opened_document.is_encrypted


def test_error_raised_if_override_is_wrong(hsbc: Hsbc):
    """Opening the protected fixture with a wrong override raises ``ValueError``."""
    # Build a fresh instance for the protected fixture instead of rebinding
    # the name of the injected `hsbc` fixture.
    protected_bank = Hsbc("tests/integration/fixtures/protected.pdf")

    with raises(ValueError, match="Wrong password"):
        protected_bank.open(password_override="wrongpw")