From 0ef4f9b282e91809e7ce1b4cd809fe0ebc2112fa Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 21 Jan 2025 20:58:18 -0500 Subject: [PATCH 01/10] feat: allow returning indirect object references from workers --- playa/__init__.py | 11 +++----- playa/document.py | 9 ++++--- playa/pdftypes.py | 2 +- playa/worker.py | 60 +++++++++++++++++++++++++++++++++--------- tests/test_parallel.py | 24 ++++++++++++++++- 5 files changed, 79 insertions(+), 27 deletions(-) diff --git a/playa/__init__.py b/playa/__init__.py index 0df60be..c463653 100644 --- a/playa/__init__.py +++ b/playa/__init__.py @@ -17,10 +17,9 @@ from concurrent.futures import ProcessPoolExecutor from os import PathLike from multiprocessing.context import BaseContext -from pathlib import Path from typing import Union -from playa.worker import _set_document +from playa.worker import _init_worker, _add_boss from playa.document import Document, LayoutDict, schema as schema # noqa: F401 from playa.page import DeviceSpace from playa._version import __version__ # noqa: F401 @@ -28,11 +27,6 @@ fieldnames = LayoutDict.__annotations__.keys() -def _init_worker(path: Path, password: str = "", space: DeviceSpace = "screen") -> None: - fp = builtins.open(path, "rb") - _set_document(Document(fp, password=password, space=space, init_worker=True)) - - def open( path: Union[PathLike, str], *, @@ -63,6 +57,7 @@ def open( max_workers=max_workers, mp_context=mp_context, initializer=_init_worker, # type: ignore[arg-type] - initargs=(path, password, space), # type: ignore[arg-type] + initargs=(id(pdf), path, password, space), # type: ignore[arg-type] ) + _add_boss(pdf) return pdf diff --git a/playa/document.py b/playa/document.py index 2c1c10b..c71cdba 100644 --- a/playa/document.py +++ b/playa/document.py @@ -85,7 +85,7 @@ nunpack, ) from playa.structtree import StructTree -from playa.worker import _set_document, _ref_document, _deref_document, _deref_page +from playa.worker import _set_document, _ref_document, 
_deref_document, _deref_page, in_worker log = logging.getLogger(__name__) @@ -830,12 +830,13 @@ def __init__( fp: BinaryIO, password: str = "", space: DeviceSpace = "screen", - init_worker: bool = False, + _boss_id: int = 0, ) -> None: - if init_worker: + if _boss_id: # Set this **right away** because it is needed to get # indirect object references right. - _set_document(self) + _set_document(self, _boss_id) + assert in_worker() self.xrefs: List[XRef] = [] self.space = space self.info = [] diff --git a/playa/pdftypes.py b/playa/pdftypes.py index 56e938e..bf4c8e3 100644 --- a/playa/pdftypes.py +++ b/playa/pdftypes.py @@ -21,7 +21,7 @@ from playa.lzw import lzwdecode from playa.runlength import rldecode from playa.utils import apply_png_predictor, apply_tiff_predictor -from playa.worker import DocumentRef, _deref_document +from playa.worker import DocumentRef, _deref_document, in_worker logger = logging.getLogger(__name__) diff --git a/playa/worker.py b/playa/worker.py index 47424d9..c458dd1 100644 --- a/playa/worker.py +++ b/playa/worker.py @@ -1,21 +1,25 @@ """Worker subprocess related functions and data.""" import weakref +from pathlib import Path from typing import Union, TYPE_CHECKING if TYPE_CHECKING: - from playa.document import Document + from playa.document import Document, DeviceSpace from playa.page import Page # Type signature of document reference -DocumentRef = Union[weakref.ReferenceType["Document"], str] +DocumentRef = Union[weakref.ReferenceType["Document"], int] # Type signature of page reference PageRef = Union[weakref.ReferenceType["Page"], int] # A global PDF object used in worker processes __pdf: Union["Document", None] = None -# Flag used to signal that we should look at the global document -GLOBAL_DOC = "[citation needed]" +# Registry of documents which have workers +__bosses: weakref.WeakValueDictionary[int, "Document"] = weakref.WeakValueDictionary() +# Numeric id of the document in the boss process (will show up instead +# of weak 
references when serialized, gets looked up in _bosses) +GLOBAL_DOC: int = 0 def in_worker() -> bool: @@ -23,9 +27,29 @@ def in_worker() -> bool: return __pdf is not None -def _set_document(doc: "Document") -> None: - global __pdf +def _init_worker( + boss: int, path: Path, password: str = "", space: "DeviceSpace" = "screen" +) -> None: + from playa.document import Document + + global __pdf, GLOBAL_DOC + fp = open(path, "rb") + __pdf = Document(fp, password=password, space=space, _boss_id=boss) + GLOBAL_DOC = boss + + +def _add_boss(doc: "Document") -> None: + """Call this in the parent process.""" + global __bosses + assert not in_worker() + __bosses[id(doc)] = doc + + +def _set_document(doc: "Document", boss: int) -> None: + """Call this in the worker process.""" + global __pdf, GLOBAL_DOC __pdf = doc + GLOBAL_DOC = boss def _get_document() -> Union["Document", None]: @@ -34,16 +58,26 @@ def _get_document() -> Union["Document", None]: def _ref_document(doc: "Document") -> DocumentRef: - return weakref.ref(doc) if __pdf is None else GLOBAL_DOC + if in_worker(): + global GLOBAL_DOC + assert GLOBAL_DOC != 0 + return GLOBAL_DOC + else: + return weakref.ref(doc) def _deref_document(ref: DocumentRef) -> "Document": - doc = __pdf - if isinstance(ref, weakref.ReferenceType): + if in_worker(): + return __pdf + if isinstance(ref, int): + if ref not in __bosses: + raise RuntimeError(f"Unknown or deleted document with ID {ref}!") + return __bosses[ref] + else: doc = ref() - if doc is None: - raise RuntimeError("Document no longer exists (or never existed)!") - return doc + if doc is None: + raise RuntimeError("Document no longer exists (or never existed)!") + return doc def _ref_page(page: "Page") -> PageRef: @@ -58,5 +92,5 @@ def _deref_page(ref: PageRef) -> "Page": else: page = ref() if page is None: - raise RuntimeError("Page no longer exists!") + raise RuntimeError(f"Page {ref} no longer exists (or never existed)!") return page diff --git a/tests/test_parallel.py 
b/tests/test_parallel.py index 7669d42..0cc178c 100644 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -5,11 +5,13 @@ import playa import playa.document from playa.page import Page +from playa.worker import in_worker, _get_document from tests.data import TESTDIR, CONTRIB def has_one_true_pdf() -> int: - doc = playa.worker._get_document() + assert in_worker() + doc = _get_document() assert doc is not None assert doc.space == "default" return len(doc.pages) @@ -28,6 +30,22 @@ def test_open_parallel(): assert future.result() == 1 +def get_resources(page: Page) -> dict: + assert in_worker() + return page.resources + + +def test_parallel_references(): + with playa.open( + TESTDIR / "pdf_structure.pdf", space="default", max_workers=2 + ) as pdf: + resources, = list(pdf.pages.map(get_resources)) + desc = resources["Font"].resolve() # should succeed! + assert "F1" in desc # should exist! + assert "F2" in desc + assert desc["F1"].resolve()["LastChar"] == 17 + + def get_text(page: Page) -> str: return " ".join(x.chars for x in page.texts) @@ -39,3 +57,7 @@ def test_map_parallel(): with playa.open(CONTRIB / "PSC_Station.pdf", space="default") as pdf: texts = list(pdf.pages.map(get_text)) assert texts == parallel_texts + + +if __name__ == '__main__': + test_parallel_references() From de465fe03916aecb9556b0b8390fa94cdc8536fb Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 21 Jan 2025 21:40:27 -0500 Subject: [PATCH 02/10] fix: make page references safe too --- playa/document.py | 11 +++++++++-- playa/pdftypes.py | 2 +- playa/worker.py | 18 ++++++------------ tests/test_parallel.py | 4 ++-- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/playa/document.py b/playa/document.py index c71cdba..0187c81 100644 --- a/playa/document.py +++ b/playa/document.py @@ -85,7 +85,13 @@ nunpack, ) from playa.structtree import StructTree -from playa.worker import _set_document, _ref_document, _deref_document, _deref_page, in_worker +from playa.worker 
import ( + _set_document, + _ref_document, + _deref_document, + _get_document, + in_worker, +) log = logging.getLogger(__name__) @@ -1388,7 +1394,8 @@ def _read_xref_from( def call_page(func: Callable[[Page], Any], idx: int) -> Any: """Call a function on a page in a worker process.""" - return func(_deref_page(idx)) + doc = _get_document() + return func(doc.pages[idx]) class PageList: diff --git a/playa/pdftypes.py b/playa/pdftypes.py index bf4c8e3..56e938e 100644 --- a/playa/pdftypes.py +++ b/playa/pdftypes.py @@ -21,7 +21,7 @@ from playa.lzw import lzwdecode from playa.runlength import rldecode from playa.utils import apply_png_predictor, apply_tiff_predictor -from playa.worker import DocumentRef, _deref_document, in_worker +from playa.worker import DocumentRef, _deref_document logger = logging.getLogger(__name__) diff --git a/playa/worker.py b/playa/worker.py index c458dd1..c39095d 100644 --- a/playa/worker.py +++ b/playa/worker.py @@ -2,7 +2,7 @@ import weakref from pathlib import Path -from typing import Union, TYPE_CHECKING +from typing import Tuple, Union, TYPE_CHECKING if TYPE_CHECKING: from playa.document import Document, DeviceSpace @@ -11,7 +11,7 @@ # Type signature of document reference DocumentRef = Union[weakref.ReferenceType["Document"], int] # Type signature of page reference -PageRef = Union[weakref.ReferenceType["Page"], int] +PageRef = Tuple[DocumentRef, int] # A global PDF object used in worker processes __pdf: Union["Document", None] = None @@ -81,16 +81,10 @@ def _deref_document(ref: DocumentRef) -> "Document": def _ref_page(page: "Page") -> PageRef: - return weakref.ref(page) if __pdf is None else page.page_idx + return page.doc, page.page_idx def _deref_page(ref: PageRef) -> "Page": - if isinstance(ref, int): - if __pdf is None: - raise RuntimeError("Not in a worker process, cannot retrieve document!") - return __pdf.pages[ref] - else: - page = ref() - if page is None: - raise RuntimeError(f"Page {ref} no longer exists (or never existed)!") 
- return page + docref, idx = ref + doc = _deref_document(docref) + return doc.pages[idx] diff --git a/tests/test_parallel.py b/tests/test_parallel.py index 0cc178c..7f2c7e2 100644 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -39,7 +39,7 @@ def test_parallel_references(): with playa.open( TESTDIR / "pdf_structure.pdf", space="default", max_workers=2 ) as pdf: - resources, = list(pdf.pages.map(get_resources)) + (resources,) = list(pdf.pages.map(get_resources)) desc = resources["Font"].resolve() # should succeed! assert "F1" in desc # should exist! assert "F2" in desc @@ -59,5 +59,5 @@ def test_map_parallel(): assert texts == parallel_texts -if __name__ == '__main__': +if __name__ == "__main__": test_parallel_references() From 303baf310929ea599c7a6bfefdc2dd07c8f34a6e Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 21 Jan 2025 22:08:21 -0500 Subject: [PATCH 03/10] fix: fix page references --- playa/worker.py | 2 +- tests/test_parallel.py | 30 ++++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/playa/worker.py b/playa/worker.py index c39095d..a645c0d 100644 --- a/playa/worker.py +++ b/playa/worker.py @@ -81,7 +81,7 @@ def _deref_document(ref: DocumentRef) -> "Document": def _ref_page(page: "Page") -> PageRef: - return page.doc, page.page_idx + return _ref_document(page.doc), page.page_idx def _deref_page(ref: PageRef) -> "Page": diff --git a/tests/test_parallel.py b/tests/test_parallel.py index 7f2c7e2..42147ad 100644 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -1,10 +1,13 @@ """Test parallel analysis.""" +import operator +from typing import List + import pytest import playa import playa.document -from playa.page import Page +from playa.page import Page, XObjectObject from playa.worker import in_worker, _get_document from tests.data import TESTDIR, CONTRIB @@ -30,22 +33,33 @@ def test_open_parallel(): assert future.result() == 1 -def get_resources(page: Page) -> dict: - assert 
in_worker() - return page.resources - - def test_parallel_references(): with playa.open( TESTDIR / "pdf_structure.pdf", space="default", max_workers=2 ) as pdf: - (resources,) = list(pdf.pages.map(get_resources)) + (resources,) = pdf.pages.map(operator.attrgetter("resources")) desc = resources["Font"].resolve() # should succeed! assert "F1" in desc # should exist! assert "F2" in desc assert desc["F1"].resolve()["LastChar"] == 17 +def get_xobjs(page: Page) -> List[XObjectObject]: + return list(page.xobjects) + + +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") +def test_parallel_xobjects(): + # Verify that page references (used in XObjects) also work + with playa.open(CONTRIB / "basicapi.pdf", space="default", max_workers=2) as pdf: + for page in pdf.pages: + for xobj in page.xobjects: + assert xobj.page.page_idx == page.page_idx + for idx, xobjs in enumerate(pdf.pages.map(get_xobjs)): + for xobj in xobjs: + assert xobj.page.page_idx == idx + + def get_text(page: Page) -> str: return " ".join(x.chars for x in page.texts) @@ -60,4 +74,4 @@ def test_map_parallel(): if __name__ == "__main__": - test_parallel_references() + test_parallel_xobjects() From ae255c0a9c50f5b147864af384e18c8d767f4f7a Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 21 Jan 2025 22:33:16 -0500 Subject: [PATCH 04/10] fix: correct errors found by mypy --- playa/document.py | 2 ++ playa/worker.py | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/playa/document.py b/playa/document.py index 0187c81..26f5140 100644 --- a/playa/document.py +++ b/playa/document.py @@ -1395,6 +1395,8 @@ def _read_xref_from( def call_page(func: Callable[[Page], Any], idx: int) -> Any: """Call a function on a page in a worker process.""" doc = _get_document() + if doc is None: + raise RuntimeError("Document no longer exists (or never existed)!") return func(doc.pages[idx]) diff --git a/playa/worker.py b/playa/worker.py index a645c0d..93ba395 
100644 --- a/playa/worker.py +++ b/playa/worker.py @@ -68,20 +68,20 @@ def _ref_document(doc: "Document") -> DocumentRef: def _deref_document(ref: DocumentRef) -> "Document": if in_worker(): - return __pdf - if isinstance(ref, int): + doc = __pdf + elif isinstance(ref, int): if ref not in __bosses: raise RuntimeError(f"Unknown or deleted document with ID {ref}!") - return __bosses[ref] + doc = __bosses[ref] else: doc = ref() - if doc is None: - raise RuntimeError("Document no longer exists (or never existed)!") - return doc + if doc is None: + raise RuntimeError("Document no longer exists (or never existed)!") + return doc def _ref_page(page: "Page") -> PageRef: - return _ref_document(page.doc), page.page_idx + return page.docref, page.page_idx def _deref_page(ref: PageRef) -> "Page": From a10eb485227e00494bbe70ff71d93300afe91526 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 22 Jan 2025 06:20:30 -0500 Subject: [PATCH 05/10] refactor: use _deref_page --- playa/document.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/playa/document.py b/playa/document.py index 26f5140..df39b01 100644 --- a/playa/document.py +++ b/playa/document.py @@ -89,8 +89,9 @@ _set_document, _ref_document, _deref_document, - _get_document, + _deref_page, in_worker, + PageRef, ) log = logging.getLogger(__name__) @@ -1392,12 +1393,9 @@ def _read_xref_from( self._read_xref_from(pos + self.offset, xrefs) -def call_page(func: Callable[[Page], Any], idx: int) -> Any: +def call_page(func: Callable[[Page], Any], pageref: PageRef) -> Any: """Call a function on a page in a worker process.""" - doc = _get_document() - if doc is None: - raise RuntimeError("Document no longer exists (or never existed)!") - return func(doc.pages[idx]) + return func(_deref_page(pageref)) class PageList: @@ -1454,7 +1452,9 @@ def map(self, func: Callable[[Page], Any]) -> Iterator: doc = _deref_document(self.docref) if doc._pool is not None: return doc._pool.map( - call_page, 
itertools.repeat(func), (page.page_idx for page in self) + call_page, + itertools.repeat(func), + ((id(doc), page.page_idx) for page in self), ) else: return (func(page) for page in self) From 4a80945d5bbd32d77edf6d540a3fc656ed245273 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 22 Jan 2025 06:37:03 -0500 Subject: [PATCH 06/10] feat: eliminate direct use of weakrefs entirely --- playa/__init__.py | 3 +-- playa/worker.py | 16 +++++----------- tests/test_open.py | 1 - tests/test_pdftypes.py | 13 ++++++------- 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/playa/__init__.py b/playa/__init__.py index c463653..a62d62b 100644 --- a/playa/__init__.py +++ b/playa/__init__.py @@ -19,7 +19,7 @@ from multiprocessing.context import BaseContext from typing import Union -from playa.worker import _init_worker, _add_boss +from playa.worker import _init_worker from playa.document import Document, LayoutDict, schema as schema # noqa: F401 from playa.page import DeviceSpace from playa._version import __version__ # noqa: F401 @@ -59,5 +59,4 @@ def open( initializer=_init_worker, # type: ignore[arg-type] initargs=(id(pdf), path, password, space), # type: ignore[arg-type] ) - _add_boss(pdf) return pdf diff --git a/playa/worker.py b/playa/worker.py index 93ba395..06ac60c 100644 --- a/playa/worker.py +++ b/playa/worker.py @@ -38,13 +38,6 @@ def _init_worker( GLOBAL_DOC = boss -def _add_boss(doc: "Document") -> None: - """Call this in the parent process.""" - global __bosses - assert not in_worker() - __bosses[id(doc)] = doc - - def _set_document(doc: "Document", boss: int) -> None: """Call this in the worker process.""" global __pdf, GLOBAL_DOC @@ -63,18 +56,19 @@ def _ref_document(doc: "Document") -> DocumentRef: assert GLOBAL_DOC != 0 return GLOBAL_DOC else: - return weakref.ref(doc) + docid = id(doc) + if docid not in __bosses: + __bosses[docid] = doc + return docid def _deref_document(ref: DocumentRef) -> "Document": if in_worker(): doc = __pdf - elif 
isinstance(ref, int): + else: if ref not in __bosses: raise RuntimeError(f"Unknown or deleted document with ID {ref}!") doc = __bosses[ref] - else: - doc = ref() if doc is None: raise RuntimeError("Document no longer exists (or never existed)!") return doc diff --git a/tests/test_open.py b/tests/test_open.py index 5361393..c986b92 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -136,7 +136,6 @@ def test_weakrefs() -> None: with playa.open(TESTDIR / "simple5.pdf") as doc: ref = doc.catalog["Pages"] del doc - assert ref.doc() is None with pytest.raises(RuntimeError): _ = ref.resolve() diff --git a/tests/test_pdftypes.py b/tests/test_pdftypes.py index 488426f..db95b14 100644 --- a/tests/test_pdftypes.py +++ b/tests/test_pdftypes.py @@ -5,8 +5,7 @@ from playa.data_structures import NameTree, NumberTree from playa.runlength import rldecode from playa.pdftypes import ObjRef, resolve1, resolve_all - -import weakref +from playa.worker import _ref_document NUMTREE1 = { "Kids": [ @@ -94,15 +93,15 @@ class MockDoc(dict): pass mockdoc = MockDoc({42: "hello"}) - mockdoc[41] = ObjRef(weakref.ref(mockdoc), 42) - mockdoc[40] = ObjRef(weakref.ref(mockdoc), 41) + mockdoc[41] = ObjRef(_ref_document(mockdoc), 42) + mockdoc[40] = ObjRef(_ref_document(mockdoc), 41) assert mockdoc[41].resolve() == "hello" assert resolve1(mockdoc[41]) == "hello" assert mockdoc[40].resolve() == mockdoc[41] assert resolve_all(mockdoc[40]) == "hello" mockdoc[39] = [mockdoc[40], mockdoc[41]] assert resolve_all(mockdoc[39]) == ["hello", "hello"] - mockdoc[38] = ["hello", ObjRef(weakref.ref(mockdoc), 38)] + mockdoc[38] = ["hello", ObjRef(_ref_document(mockdoc), 38)] # This resolves the *list*, not the indirect object, so its second # element will get expanded once into a new list. ouf = resolve_all(mockdoc[38]) @@ -113,8 +112,8 @@ class MockDoc(dict): assert fou[1] is mockdoc[38] # Likewise here, we have to dig a bit to see the circular # reference. 
Your best option is not to use resolve_all ;-) - mockdoc[30] = ["hello", ObjRef(weakref.ref(mockdoc), 31)] - mockdoc[31] = ["hello", ObjRef(weakref.ref(mockdoc), 30)] + mockdoc[30] = ["hello", ObjRef(_ref_document(mockdoc), 31)] + mockdoc[31] = ["hello", ObjRef(_ref_document(mockdoc), 30)] bof = resolve_all(mockdoc[30]) assert bof[1][1][1] is mockdoc[31] fob = resolve_all(mockdoc[30][1]) From 86ca051dd7fdeffa6c7bf135c34a9b1d5d268a7f Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 22 Jan 2025 06:51:53 -0500 Subject: [PATCH 07/10] fix: make pagelist[slice] return a PageList for easier mapping --- README.md | 32 ++++++++++++++++++++++++++++++++ playa/document.py | 21 ++++++++++++++++----- tests/test_parallel.py | 4 ++++ 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4bfb3ec..3ab7212 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,38 @@ possible piece of information, PLAYA gives you some options here. Wherever possible this information can be computed lazily, but this involves some more work on the user's part. +## Using multiple CPUs + +You may be wondering, what does "Parallel and Lazy" really mean? +PLAYA allows you to take advantage of multiple CPUs, which can greatly +speed up some operations on large documents. This parallelism +currently operates at the page level since this is the most logical +way to split up a PDF. To enable it, pass the `max_workers` argument +to `playa.open` with the number of cores you wish to use (you can also +explicitly pass `None` to use the maximum): + +```python +with playa.open(path, max_workers=4) as pdf: + ... 
+``` + +Now, you can apply a function across the pages of the PDF in parallel +using the `map` method of `pdf.pages`, for example: + +```python +def get_page_size(page: Page) -> Tuple[int, int]: + return page.width, page.height + +page_sizes = pdf.pages.map(get_page_size) +``` + +You could also just do this for certain pages by subscripting +`pdf.pages`: + +```python +some_page_sizes = pdf.pages[2:5].map(get_page_size) +``` + ## Dictionary-based API There used to be a "dictionary-based" API here. You can now find it diff --git a/playa/document.py b/playa/document.py index df39b01..c39f04e 100644 --- a/playa/document.py +++ b/playa/document.py @@ -1401,14 +1401,23 @@ def call_page(func: Callable[[Page], Any], pageref: PageRef) -> Any: class PageList: """List of pages indexable by 0-based index or string label.""" - def __init__(self, doc: Document): + def __init__( + self, doc: Document, pages: Union[Iterable[Page], None] = None + ) -> None: self.docref = _ref_document(doc) + if pages is not None: + self._pages = pages + self._labels: Dict[str, Page] = {page.label: page for page in pages} + else: + self._init_pages(doc) + + def _init_pages(self, doc: Document) -> None: try: - page_labels: Iterable[Optional[str]] = doc.page_labels + page_labels: Iterable[Union[str, None]] = doc.page_labels except (KeyError, ValueError): page_labels = (str(idx) for idx in itertools.count(1)) self._pages = [] - self._labels: Dict[str, Page] = {} + self._labels = {} try: page_objects = list(doc._get_page_objects()) except (KeyError, IndexError, TypeError): @@ -1431,10 +1440,12 @@ def __iter__(self) -> Iterator[Page]: return iter(self._pages) def __getitem__(self, key: Union[int, str]) -> Page: - if isinstance(key, int) or isinstance(key, slice): + if isinstance(key, int): return self._pages[key] + elif isinstance(key, slice): + return PageList(_deref_document(self.docref), self._pages[key]) elif isinstance(key, tuple): - return [self[k] for k in key] + return 
PageList(_deref_document(self.docref), [self[k] for k in key]) else: return self._labels[key] diff --git a/tests/test_parallel.py b/tests/test_parallel.py index 42147ad..2385ae2 100644 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -71,6 +71,10 @@ def test_map_parallel(): with playa.open(CONTRIB / "PSC_Station.pdf", space="default") as pdf: texts = list(pdf.pages.map(get_text)) assert texts == parallel_texts + with playa.open(CONTRIB / "PSC_Station.pdf", space="default", max_workers=2) as pdf: + parallel_texts = list(pdf.pages[3:8].map(get_text)) + print(parallel_texts) + assert parallel_texts != texts if __name__ == "__main__": From 6a31e4357371dc800d3c05421b535fb49b8d11be Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 22 Jan 2025 06:54:51 -0500 Subject: [PATCH 08/10] fix: types --- playa/document.py | 6 ++++-- playa/worker.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/playa/document.py b/playa/document.py index c39f04e..86845a0 100644 --- a/playa/document.py +++ b/playa/document.py @@ -1406,8 +1406,10 @@ def __init__( ) -> None: self.docref = _ref_document(doc) if pages is not None: - self._pages = pages - self._labels: Dict[str, Page] = {page.label: page for page in pages} + self._pages = list(pages) + self._labels: Dict[str, Page] = { + page.label: page for page in pages if page.label is not None + } else: self._init_pages(doc) diff --git a/playa/worker.py b/playa/worker.py index 06ac60c..e0a5912 100644 --- a/playa/worker.py +++ b/playa/worker.py @@ -9,7 +9,7 @@ from playa.page import Page # Type signature of document reference -DocumentRef = Union[weakref.ReferenceType["Document"], int] +DocumentRef = int # Type signature of page reference PageRef = Tuple[DocumentRef, int] From 31703b338e6a7312444ce0963992a792782f4a34 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 22 Jan 2025 07:02:34 -0500 Subject: [PATCH 09/10] docs: document parallel stuff --- README.md | 19 +++++++++++++++++++ 1 file 
changed, 19 insertions(+) diff --git a/README.md b/README.md index 3ab7212..7aa41cc 100644 --- a/README.md +++ b/README.md @@ -189,6 +189,25 @@ You could also just do this for certain pages by subscripting some_page_sizes = pdf.pages[2:5].map(get_page_size) ``` +There are some limitations to this, because it uses `multiprocessing`. +The function you pass to `map` must be serializable by `pickle`, which +in practice means that an inner function or lambda generally doesn't +work. You can get around this in a very Java-like way by passing a +callable object that encapsulates the necessary state. If you wish to +avoid traumatising readers of your code, then use `functools.partial` +instead: + +```python +pdf.pages.map(partial(myfunc, arg1=value1, arg2=value2)) +``` + +Any value returned by your function must also be serializable. +There is a bit of magic that enables this to work for PDF objects +containing indirect object references, so you should be able to, for +instance, get the `dests` or `annots` from every page without any +trouble. But if you have your own complex objects that you return you +may encounter problems (or slowness). + ## Dictionary-based API There used to be a "dictionary-based" API here. 
You can now find it From a22e93441d4d1f32db94f709843831ce90c0f102 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 22 Jan 2025 08:15:15 -0500 Subject: [PATCH 10/10] feat: allow lists as indexes too --- playa/document.py | 4 ++-- tests/test_document.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/playa/document.py b/playa/document.py index 86845a0..6faba3f 100644 --- a/playa/document.py +++ b/playa/document.py @@ -1446,8 +1446,8 @@ def __getitem__(self, key: Union[int, str]) -> Page: return self._pages[key] elif isinstance(key, slice): return PageList(_deref_document(self.docref), self._pages[key]) - elif isinstance(key, tuple): - return PageList(_deref_document(self.docref), [self[k] for k in key]) + elif isinstance(key, (tuple, list)): + return PageList(_deref_document(self.docref), (self[k] for k in key)) else: return self._labels[key] diff --git a/tests/test_document.py b/tests/test_document.py index b2822c3..f6ecb4b 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -113,6 +113,8 @@ def test_pages(): assert [p.label for p in twopages] == ["3", "4"] threepages = doc.pages["2", 2, 3] assert [p.label for p in threepages] == ["2", "3", "4"] + threepages = doc.pages[["2", 2, 3]] + assert [p.label for p in threepages] == ["2", "3", "4"] @pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")