From 620198d38f5fcf948602870b87ca1f5e553af188 Mon Sep 17 00:00:00 2001 From: blue-hope Date: Thu, 26 Oct 2023 00:10:07 +0900 Subject: [PATCH 1/7] feat: Add page metadata on PDFMinerLoader --- .../langchain/document_loaders/parsers/pdf.py | 10 ++++++---- .../integration_tests/document_loaders/test_pdf.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 74373ba9b6d3b..fd116be90f459 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -133,20 +133,22 @@ def __init__(self, extract_images: bool = False): def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Lazily parse the blob.""" + from pdfminer.pdfpage import PDFPage if not self.extract_images: from pdfminer.high_level import extract_text with blob.as_bytes_io() as pdf_file_obj: - text = extract_text(pdf_file_obj) - metadata = {"source": blob.source} - yield Document(page_content=text, metadata=metadata) + pages = PDFPage.get_pages(pdf_file_obj) + for i, page in enumerate(pages): + text = extract_text(pdf_file_obj, page_numbers=[i]) + metadata = {"source": blob.source, "page": str(i)} + yield Document(page_content=text, metadata=metadata) else: import io from pdfminer.converter import PDFPageAggregator, TextConverter from pdfminer.layout import LAParams from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager - from pdfminer.pdfpage import PDFPage text_io = io.StringIO() with blob.as_bytes_io() as pdf_file_obj: diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py index aea75c11fda00..a0b461c4c6bce 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py @@ -54,7 +54,7 @@ def 
test_pdfminer_loader() -> None: loader = PDFMinerLoader(str(file_path)) docs = loader.load() - assert len(docs) == 1 + assert len(docs) == 16 def test_pdfminer_pdf_as_html_loader() -> None: From d5e91c7a6af515599ba896848e8917f682d3c6e4 Mon Sep 17 00:00:00 2001 From: blue-hope Date: Thu, 26 Oct 2023 00:23:38 +0900 Subject: [PATCH 2/7] fix: Fix lint --- libs/langchain/langchain/document_loaders/parsers/pdf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index fd116be90f459..0425a12e97943 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -134,6 +134,7 @@ def __init__(self, extract_images: bool = False): def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Lazily parse the blob.""" from pdfminer.pdfpage import PDFPage + if not self.extract_images: from pdfminer.high_level import extract_text From 55275d0fab5efd312c2ea50a7d4e588f44d94f06 Mon Sep 17 00:00:00 2001 From: blue-hope Date: Thu, 26 Oct 2023 01:03:05 +0900 Subject: [PATCH 3/7] fix: Fix minor --- libs/langchain/langchain/document_loaders/parsers/pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 0425a12e97943..09e4756e25e55 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -140,7 +140,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: with blob.as_bytes_io() as pdf_file_obj: pages = PDFPage.get_pages(pdf_file_obj) - for i, page in enumerate(pages): + for i, _ in enumerate(pages): text = extract_text(pdf_file_obj, page_numbers=[i]) metadata = {"source": blob.source, "page": str(i)} yield Document(page_content=text, metadata=metadata) From 
54fc08421e5bfe084a20159eb8dba93aff8e61f5 Mon Sep 17 00:00:00 2001 From: blue-hope Date: Thu, 26 Oct 2023 13:31:31 +0900 Subject: [PATCH 4/7] feat: Add feature flag load_per_pages --- .../langchain/document_loaders/parsers/pdf.py | 20 +++++++++++++------ .../langchain/document_loaders/pdf.py | 5 ++++- .../document_loaders/test_pdf.py | 13 ++++++++++++ 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 09e4756e25e55..f540f4df1612e 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -128,21 +128,28 @@ def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str: class PDFMinerParser(BaseBlobParser): """Parse `PDF` using `PDFMiner`.""" - def __init__(self, extract_images: bool = False): + def __init__(self, extract_images: bool = False, load_per_pages: bool = False): self.extract_images = extract_images + self.load_per_pages = load_per_pages def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Lazily parse the blob.""" - from pdfminer.pdfpage import PDFPage if not self.extract_images: from pdfminer.high_level import extract_text with blob.as_bytes_io() as pdf_file_obj: - pages = PDFPage.get_pages(pdf_file_obj) - for i, _ in enumerate(pages): - text = extract_text(pdf_file_obj, page_numbers=[i]) - metadata = {"source": blob.source, "page": str(i)} + if self.load_per_pages: + from pdfminer.pdfpage import PDFPage + + pages = PDFPage.get_pages(pdf_file_obj) + for i, _ in enumerate(pages): + text = extract_text(pdf_file_obj, page_numbers=[i]) + metadata = {"source": blob.source, "page": str(i)} + yield Document(page_content=text, metadata=metadata) + else: + text = extract_text(pdf_file_obj) + metadata = {"source": blob.source} yield Document(page_content=text, metadata=metadata) else: import io @@ -150,6 +157,7 @@ def lazy_parse(self, blob: 
Blob) -> Iterator[Document]: from pdfminer.converter import PDFPageAggregator, TextConverter from pdfminer.layout import LAParams from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager + from pdfminer.pdfpage import PDFPage text_io = io.StringIO() with blob.as_bytes_io() as pdf_file_obj: diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index 7463eb0dec4dc..e6f7d4ad9f52f 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -251,6 +251,7 @@ def __init__( *, headers: Optional[Dict] = None, extract_images: bool = False, + load_per_pages: bool = False, ) -> None: """Initialize with file path.""" try: @@ -262,7 +263,9 @@ def __init__( ) super().__init__(file_path, headers=headers) - self.parser = PDFMinerParser(extract_images=extract_images) + self.parser = PDFMinerParser( + extract_images=extract_images, load_per_pages=load_per_pages + ) def load(self) -> List[Document]: """Eagerly load the content.""" diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py index a0b461c4c6bce..ad195a310e651 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py @@ -53,6 +53,19 @@ def test_pdfminer_loader() -> None: file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" loader = PDFMinerLoader(str(file_path)) + docs = loader.load() + assert len(docs) == 1 + + """Test PDFMiner loader. 
(load per pages)""" + file_path = Path(__file__).parent.parent / "examples/hello.pdf" + loader = PDFMinerLoader(str(file_path), load_per_pages=True) + docs = loader.load() + + assert len(docs) == 1 + + file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" + loader = PDFMinerLoader(str(file_path), load_per_pages=True) + docs = loader.load() assert len(docs) == 16 From 2a6c469b1b9be9a0aa177243c7a52494202648bc Mon Sep 17 00:00:00 2001 From: blue-hope Date: Tue, 31 Oct 2023 17:46:15 +0900 Subject: [PATCH 5/7] feat: Rename args name and add doc --- .../langchain/document_loaders/parsers/pdf.py | 21 ++++++++++++------- .../langchain/document_loaders/pdf.py | 12 ++++++++--- .../document_loaders/test_pdf.py | 4 ++-- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index f540f4df1612e..2f1bb4d854fdf 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -128,9 +128,16 @@ def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str: class PDFMinerParser(BaseBlobParser): """Parse `PDF` using `PDFMiner`.""" - def __init__(self, extract_images: bool = False, load_per_pages: bool = False): + def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True): + """Initialize a parser based on PDFMiner. + + Args: + extract_images: Whether to extract images from PDF. + concatenate_pages: If True, concatenate all PDF pages into a single + document. Otherwise, return one document per page. 
+ """ self.extract_images = extract_images - self.load_per_pages = load_per_pages + self.concatenate_pages = concatenate_pages def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Lazily parse the blob.""" @@ -139,7 +146,11 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: from pdfminer.high_level import extract_text with blob.as_bytes_io() as pdf_file_obj: - if self.load_per_pages: + if self.concatenate_pages: + text = extract_text(pdf_file_obj) + metadata = {"source": blob.source} + yield Document(page_content=text, metadata=metadata) + else: from pdfminer.pdfpage import PDFPage pages = PDFPage.get_pages(pdf_file_obj) @@ -147,10 +158,6 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: text = extract_text(pdf_file_obj, page_numbers=[i]) metadata = {"source": blob.source, "page": str(i)} yield Document(page_content=text, metadata=metadata) - else: - text = extract_text(pdf_file_obj) - metadata = {"source": blob.source} - yield Document(page_content=text, metadata=metadata) else: import io diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index e6f7d4ad9f52f..910075efdffd4 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -251,9 +251,15 @@ def __init__( *, headers: Optional[Dict] = None, extract_images: bool = False, - load_per_pages: bool = False, + concatenate_pages: bool = True, ) -> None: - """Initialize with file path.""" + """Initialize with file path. + + Args: + extract_images: Whether to extract images from PDF. + concatenate_pages: If True, concatenate all PDF pages into a single + document. Otherwise, return one document per page. 
+ """ try: from pdfminer.high_level import extract_text # noqa:F401 except ImportError: @@ -264,7 +270,7 @@ def __init__( super().__init__(file_path, headers=headers) self.parser = PDFMinerParser( - extract_images=extract_images, load_per_pages=load_per_pages + extract_images=extract_images, concatenate_pages=concatenate_pages ) def load(self) -> List[Document]: diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py index ad195a310e651..48e7ec1d9b730 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py @@ -58,13 +58,13 @@ def test_pdfminer_loader() -> None: """Test PDFMiner loader. (load per pages)""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PDFMinerLoader(str(file_path), load_per_pages=True) + loader = PDFMinerLoader(str(file_path), concatenate_pages=False) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PDFMinerLoader(str(file_path), load_per_pages=True) + loader = PDFMinerLoader(str(file_path), concatenate_pages=False) docs = loader.load() assert len(docs) == 16 From 02e694f6060af2f5357728740da8c666746e3acb Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 1 Nov 2023 10:19:52 -0400 Subject: [PATCH 6/7] Update libs/langchain/tests/integration_tests/document_loaders/test_pdf.py --- .../tests/integration_tests/document_loaders/test_pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py index 48e7ec1d9b730..966a917ae74cd 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py @@ -56,7 +56,7 @@ def 
test_pdfminer_loader() -> None: docs = loader.load() assert len(docs) == 1 - """Test PDFMiner loader. (load per pages)""" + # Verify that concatenating pages parameter works file_path = Path(__file__).parent.parent / "examples/hello.pdf" loader = PDFMinerLoader(str(file_path), concatenate_pages=False) docs = loader.load() From fcc38a597b2f5bb06c7093b3bc69d55983072195 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 1 Nov 2023 10:19:57 -0400 Subject: [PATCH 7/7] Update libs/langchain/tests/integration_tests/document_loaders/test_pdf.py --- .../tests/integration_tests/document_loaders/test_pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py index 966a917ae74cd..526970ff7fda2 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py @@ -58,7 +58,7 @@ def test_pdfminer_loader() -> None: # Verify that concatenating pages parameter works file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PDFMinerLoader(str(file_path), concatenate_pages=False) + loader = PDFMinerLoader(str(file_path), concatenate_pages=True) docs = loader.load() assert len(docs) == 1