benjamin-awd · benjamin-awd · Sep 5, 2024 · Sep 1, 2024 · Sep 1, 2024 · Sep 1, 2024
@@ -15,7 +15,7 @@ jobs:
       - name: Install pdftotext
         uses: daaku/gh-action-apt-install@v4
         with:
-          packages: build-essential libpoppler-cpp-dev pkg-config
+          packages: build-essential libpoppler-cpp-dev pkg-config ocrmypdf
 
       - name: Setup Python & Poetry
         uses: ./.github/actions/setup-python-poetry

@@ -23,7 +23,7 @@ jobs:
       - name: Install pdftotext
         uses: daaku/gh-action-apt-install@v4
         with:
-          packages: build-essential libpoppler-cpp-dev pkg-config
+          packages: build-essential libpoppler-cpp-dev pkg-config ocrmypdf
 
       - name: Setup Python & Poetry
         uses: ./.github/actions/setup-python-poetry

@@ -15,7 +15,7 @@ jobs:
       - name: Install pdftotext
         uses: daaku/gh-action-apt-install@v4
         with:
-          packages: build-essential libpoppler-cpp-dev pkg-config
+          packages: build-essential libpoppler-cpp-dev pkg-config ocrmypdf
 
       - name: Setup Python & Poetry
         uses: ./.github/actions/setup-python-poetry

diff --git a/README.md b/README.md
@@ -26,13 +26,13 @@ Monopoly is a pip-installable Python package on [PyPI](https://pypi.org/project/
 Since Monopoly uses `pdftotext`, you'll need to install additional dependencies:
 
 ```sh
-apt-get install build-essential libpoppler-cpp-dev pkg-config
+apt-get install build-essential libpoppler-cpp-dev pkg-config ocrmypdf
 ```
 
 or
 
 ```sh
-brew install gcc@11 pkg-config poppler
+brew install gcc@11 pkg-config poppler ocrmypdf
 ```
 
 Then install with pipx:
@@ -72,7 +72,7 @@ python3 src/monopoly/examples/single_statement.py
 ## Features
 - Parses PDFs using predefined configuration classes per bank.
 - Handles locked PDFs with credentials passed via environment variables.
-- Supports a variety of date/number formats and determines if a transaction is debit or credit.
+- Supports adding OCR for image-based bank statements.
 - Provides a generic parser that can be used without any predefined configuration (caveat emptor).
 - Includes a safety check (enabled by default) that validates totals for debit or credit statements.
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,7 +26,7 @@ tabulate = "^0.9.0"
 pydantic = "^2.5.2"
 dateparser = "^1.2.0"
 strenum = "^0.4.15"
-
+ocrmypdf = { version = "^16.5.0", optional = true }
 
 [tool.poetry.group.dev.dependencies]
 black = ">=23.7,<25.0"
@@ -43,9 +43,11 @@ types-tabulate = "^0.9.0.20240106"
 pytest-xdist = "^3.6.1"
 flake8 = "^7.0.0"
 ruff = ">=0.4.7,<0.7.0"
+git-cliff = "^2.3.0"
 
+[tool.poetry.extras]
+ocr = ["ocrmypdf"]
 
-git-cliff = "^2.3.0"
 [tool.taskipy.tasks]
 format = "isort . && black ."
 lint = "flake8 src && pylint src && ruff check src"
@@ -85,7 +87,9 @@ disable_error_code = [
 
 [[tool.mypy.overrides]]
 module = [
-    "fitz",
+    "pymupdf",
+    "ocrmypdf",
+    "ocrmypdf.exceptions",
     "pdftotext",
     "pdf2john",
 ]

diff --git a/src/monopoly/banks/__init__.py b/src/monopoly/banks/__init__.py
@@ -1,16 +1,17 @@
 import logging
 from typing import Type
 
-from ..examples.example_bank import ExampleBank
 from .base import BankBase
 from .citibank import Citibank
 from .dbs import Dbs
+from .detector import BankDetector
+from .example_bank import ExampleBank
 from .hsbc import Hsbc
 from .maybank import Maybank
 from .ocbc import Ocbc
 from .standard_chartered import StandardChartered
 
-banks: list[Type[BankBase]] = [
+banks: list[Type["BankBase"]] = [
     Citibank,
     Dbs,
     ExampleBank,
@@ -21,3 +22,5 @@
 ]
 
 logger = logging.getLogger(__name__)
+
+__all__ = ["BankDetector", "BankBase", *[bank.__name__ for bank in banks]]
diff --git a/src/monopoly/banks/base.py b/src/monopoly/banks/base.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Any
 
 from monopoly.config import PdfConfig, StatementConfig
 
@@ -15,6 +16,7 @@ class BankBase:
 
     statement_configs: list[StatementConfig]
     pdf_config: PdfConfig = PdfConfig()
+    identifiers: list[list[Any]]
 
     def __init_subclass__(cls, **kwargs) -> None:
         if not hasattr(cls, "statement_configs"):

diff --git a/src/monopoly/bank_detector.py → src/monopoly/banks/detector.py b/src/monopoly/bank_detector.py → src/monopoly/banks/detector.py
@@ -1,12 +1,14 @@
 import logging
 from dataclasses import Field, fields
 from functools import cached_property
-from typing import Any, Type
+from typing import TYPE_CHECKING, Any, Type
 
-from monopoly.banks import BankBase, banks
-from monopoly.identifiers import Identifier, MetadataIdentifier, TextIdentifier
+from monopoly.identifiers import Identifier, TextIdentifier
 from monopoly.pdf import PdfDocument
 
+if TYPE_CHECKING:
+    from .base import BankBase
+
 logger = logging.getLogger(__name__)
 
 
@@ -20,20 +22,22 @@ def metadata_items(self) -> list[Any]:
         Retrieves encryption and metadata identifiers from a bank statement PDF
         """
         identifiers: list[Identifier] = []
-        if metadata := self.document.open().metadata:
-            metadata_identifier = MetadataIdentifier(**metadata)
+        if metadata_identifier := self.document.metadata_identifier:
             identifiers.append(metadata_identifier)
 
         if not identifiers:
             raise ValueError("Could not get identifier")
 
         return identifiers
 
-    def detect_bank(self) -> Type[BankBase] | None:
+    def detect_bank(self, banks: list[Type["BankBase"]]) -> Type["BankBase"] | None:
         """
         Reads the encryption metadata or actual metadata (if the PDF is not encrypted),
         and checks for a bank based on unique identifiers.
         """
+        if not banks:
+            banks = []
+
         logger.debug("Found PDF properties: %s", self.metadata_items)
 
         for bank in banks:
@@ -43,7 +47,7 @@ def detect_bank(self) -> Type[BankBase] | None:
 
     def is_bank_identified(
         self,
-        bank: Type[BankBase],
+        bank: Type["BankBase"],
     ) -> bool:
         """
         Checks if a bank is identified based on a list of metadata items.

diff --git a/src/monopoly/examples/example_bank.py → src/monopoly/banks/example_bank.py b/src/monopoly/examples/example_bank.py → src/monopoly/banks/example_bank.py
@@ -1,10 +1,11 @@
 from re import compile as regex
 
-from monopoly.banks.base import BankBase
 from monopoly.config import StatementConfig
 from monopoly.constants import EntryType, InternalBankNames, SharedPatterns
 from monopoly.identifiers import TextIdentifier
 
+from .base import BankBase
+
 
 class ExampleBank(BankBase):
     """Dummy class to help with reading the example PDF statement"""

diff --git a/src/monopoly/banks/hsbc/hsbc.py b/src/monopoly/banks/hsbc/hsbc.py
@@ -26,19 +26,26 @@ class Hsbc(BankBase):
         multiline_transactions=True,
     )
 
+    email_statement_identifier = [
+        MetadataIdentifier(
+            title="PRJ_BEAGLE_ST_CNS_SGH_APP_Orchid",
+            author="Registered to: HSBCGLOB",
+            creator="OpenText Exstream",
+        ),
+        TextIdentifier("HSBC"),
+    ]
+
+    web_and_mobile_statement_identifier = [
+        MetadataIdentifier(
+            format="PDF 1.7", producer="OpenText Output Transformation Engine"
+        )
+    ]
+
     pdf_config = PdfConfig(
-        page_bbox=(0, 0, 379, 842),
+        page_bbox=(0, 0, 379, 840),
+        ocr_identifiers=web_and_mobile_statement_identifier,
     )
 
-    identifiers = [
-        [
-            MetadataIdentifier(
-                title="PRJ_BEAGLE_ST_CNS_SGH_APP_Orchid",
-                author="Registered to: HSBCGLOB",
-                creator="OpenText Exstream",
-            ),
-            TextIdentifier("HSBC"),
-        ],
-    ]
+    identifiers = [email_statement_identifier, web_and_mobile_statement_identifier]
 
     statement_configs = [credit_config]
diff --git a/src/monopoly/cli.py b/src/monopoly/cli.py
@@ -1,10 +1,10 @@
 import traceback
 from concurrent.futures import ProcessPoolExecutor
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Collection, Iterable, Optional, TypedDict
 
 import click
-from pydantic.dataclasses import Field, dataclass
 from tabulate import tabulate
 from tqdm import tqdm
 
@@ -41,7 +41,7 @@ class Result:
 
     source_file_name: str
     target_file_name: Optional[str] = None
-    error_info: dict[str, str] = Field(default_factory=dict)
+    error_info: dict[str, str] = field(default_factory=dict)
 
 
 @dataclass
@@ -123,10 +123,19 @@ def process_statement(
         information about the processed statement. If an error occurs during processing,
         returns a Result object with error information.
     """
-    from monopoly.pipeline import Pipeline  # pylint: disable=import-outside-toplevel
+    # pylint: disable=import-outside-toplevel, too-many-locals
+    from monopoly.banks import BankDetector, banks
+    from monopoly.generic import GenericBank
+    from monopoly.pdf import PdfDocument, PdfParser
+    from monopoly.pipeline import Pipeline
 
     try:
-        pipeline = Pipeline(file)
+        document = PdfDocument(file)
+        analyzer = BankDetector(document)
+        bank = analyzer.detect_bank(banks) or GenericBank
+        parser = PdfParser(bank, document)
+        pipeline = Pipeline(parser)
+
         statement = pipeline.extract(safety_check=safety_check)
         transactions = pipeline.transform(statement)
 

diff --git a/src/monopoly/config.py b/src/monopoly/config.py
@@ -1,10 +1,9 @@
-from dataclasses import field
+from dataclasses import dataclass, field
 from typing import Optional, Pattern
 
-from pydantic.dataclasses import dataclass
-
 from monopoly.constants import BankNames, EntryType, InternalBankNames
 from monopoly.enums import RegexEnum
+from monopoly.identifiers import MetadataIdentifier
 
 
 @dataclass
@@ -67,7 +66,9 @@ class PdfConfig:
     - `page_bbox`: A tuple representing the bounding box range for every
     page. This is used to avoid weirdness like vertical text, and other
     PDF artifacts that may affect parsing.
+    - `ocr_identifiers`: Applies OCR to PDFs that match the given metadata identifiers.
     """
 
     page_range: tuple[Optional[int], Optional[int]] = (None, None)
     page_bbox: Optional[tuple[float, float, float, float]] = None
+    ocr_identifiers: Optional[list[MetadataIdentifier]] = None
diff --git a/src/monopoly/constants/date.py b/src/monopoly/constants/date.py
@@ -14,20 +14,21 @@
 class DateFormats(StrEnum):
     """Holds a case-insensitive list of common ISO 8601 date formats"""
 
-    D = r"(?i:1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31)"
-    DD = r"(?i:01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31)"
-    M = r"(?i:1|2|3|4|5|6|7|8|9|10|11|12)"
-    MM = r"(?i:01|02|03|04|05|06|07|08|09|10|11|12)"
+    D = r"(1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31)"
+    DD = r"(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31)"
+    M = r"(1|2|3|4|5|6|7|8|9|10|11|12)"
+    MM = r"(01|02|03|04|05|06|07|08|09|10|11|12)"
     MMM = r"(?i:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"
     MMMM = r"(?i:January|February|March|April|May|June|July|August|September|October|November|December)"
-    YY = r"(?i:[2-5][0-9]\b)"
-    YYYY = r"(?i:20\d{2}\b)"
+    YY = r"([2-5][0-9]\b)"
+    YYYY = r"(20\d{2}\b)"
 
 
 class ISO8601(RegexEnum):
     DD_MM = rf"\b({DateFormats.DD}[\/\-\s]{DateFormats.MM})"
     DD_MM_YY = rf"\b({DateFormats.DD}[\/\-\s]{DateFormats.MM}[\/\-\s]{DateFormats.YY})"
     DD_MMM = rf"\b({DateFormats.DD}[-\s]{DateFormats.MMM})"
+    DD_MMM_RELAXED = DD_MMM.replace(r"[-\s]", r"(?:[-\s]|)")
     DD_MMM_YY = rf"\b({DateFormats.DD}[-\s]{DateFormats.MMM}[-\s]{DateFormats.YY})"
     DD_MMM_YYYY = (
         rf"\b({DateFormats.DD}[-\s]{DateFormats.MMM}[,\s]{{1,2}}{DateFormats.YYYY})"

diff --git a/src/monopoly/constants/statement.py b/src/monopoly/constants/statement.py
@@ -103,8 +103,8 @@ class CreditTransactionPatterns(RegexEnum):
         + SharedPatterns.AMOUNT_EXTENDED
     )
     HSBC = (
-        rf"(?P<posting_date>{ISO8601.DD_MMM})\s+"
-        rf"(?P<transaction_date>{ISO8601.DD_MMM})\s+"
+        rf"(?P<posting_date>{ISO8601.DD_MMM_RELAXED})\s+"
+        rf"(?P<transaction_date>{ISO8601.DD_MMM_RELAXED})\s+"
         + SharedPatterns.DESCRIPTION
         + SharedPatterns.AMOUNT_EXTENDED
     )

diff --git a/src/monopoly/examples/__init__.py b/src/monopoly/examples/__init__.py
diff --git a/src/monopoly/examples/single_statement.py b/src/monopoly/examples/single_statement.py
@@ -1,3 +1,5 @@
+from monopoly.banks import ExampleBank
+from monopoly.pdf import PdfDocument, PdfParser
 from monopoly.pipeline import Pipeline
 
 
@@ -6,13 +8,11 @@ def example():
     a single bank statement
 
     You can pass in the bank class if you want to specify a specific bank,
-    or ignore the bank argument and let the Pipeline try to automatically
-    detect the bank.
+    or use the BankDetector class to try to detect the bank automatically.
     """
-    pipeline = Pipeline(
-        file_path="src/monopoly/examples/example_statement.pdf",
-        # bank=ExampleBank
-    )
+    document = PdfDocument(file_path="src/monopoly/examples/example_statement.pdf")
+    parser = PdfParser(ExampleBank, document)
+    pipeline = Pipeline(parser)
 
     # This runs pdftotext on the PDF and
     # extracts transactions as raw text
@@ -22,12 +22,15 @@ def example():
     transactions = pipeline.transform(statement)
 
     # Parsed transactions written to a CSV file in the "examples" directory
-    pipeline.load(
+    file_path = pipeline.load(
         transactions=transactions,
         statement=statement,
         output_directory="src/monopoly/examples",
     )
 
+    with open(file_path, encoding="utf8") as file:
+        print(file.read()[0:248])
+
 
 if __name__ == "__main__":
     example()
diff --git a/src/monopoly/generic/__init__.py b/src/monopoly/generic/__init__.py
@@ -1,8 +1,4 @@
 from .generic import DateMatch, DatePatternAnalyzer
-from .generic_handler import GenericStatementHandler
+from .handler import GenericBank, GenericStatementHandler
 
-__all__ = [
-    "DatePatternAnalyzer",
-    "DateMatch",
-    "GenericStatementHandler",
-]
+__all__ = ["DatePatternAnalyzer", "DateMatch", "GenericStatementHandler", "GenericBank"]