From ab4e43c984461b817e3e45ea75e9b47ce83a180d Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 16 Jan 2025 15:47:43 +0100 Subject: [PATCH] adding test for PyPDF to extract passages so that they are detected by DocumentSplitter --- haystack/components/converters/pypdf.py | 19 ++++++++-------- .../converters/test_pypdf_to_document.py | 22 +++++++++++++++++++ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index 334ef097d7..15bbcc1fec 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -158,17 +158,16 @@ def from_dict(cls, data): def _default_convert(self, reader: "PdfReader") -> str: texts = [] for page in reader.pages: - texts.append( - page.extract_text( - orientations=self.plain_mode_orientations, - extraction_mode=self.extraction_mode.value, - space_width=self.plain_mode_space_width, - layout_mode_space_vertically=self.layout_mode_space_vertically, - layout_mode_scale_weight=self.layout_mode_scale_weight, - layout_mode_strip_rotated=self.layout_mode_strip_rotated, - layout_mode_font_height_weight=self.layout_mode_font_height_weight, - ) + extracted_text = page.extract_text( + orientations=self.plain_mode_orientations, + extraction_mode=self.extraction_mode.value, + space_width=self.plain_mode_space_width, + layout_mode_space_vertically=self.layout_mode_space_vertically, + layout_mode_scale_weight=self.layout_mode_scale_weight, + layout_mode_strip_rotated=self.layout_mode_strip_rotated, + layout_mode_font_height_weight=self.layout_mode_font_height_weight, ) + texts.append(extracted_text) text = "\f".join(texts) return text diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py index 916bb771ee..6306f0659e 100644 --- a/test/components/converters/test_pypdf_to_document.py +++ b/test/components/converters/test_pypdf_to_document.py @@ -8,6 +8,7 @@ 
from haystack import Document, default_from_dict, default_to_dict from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode +from haystack.components.preprocessors import DocumentSplitter from haystack.dataclasses import ByteStream @@ -213,3 +214,24 @@ def test_run_empty_document(self, caplog, test_files_path): # Check that meta is used when the returned document is initialized and thus when doc id is generated assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf" assert output["documents"][0].id != Document(content="").id + + def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path): + converter = PyPDFToDocument(extraction_mode=PyPDFExtractionMode.LAYOUT) + sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"] + pdf_doc = converter.run(sources=sources) + splitter = DocumentSplitter(split_length=1, split_by="passage") + docs = splitter.run(pdf_doc["documents"]) + + assert len(docs["documents"]) == 51 + + expected = ( + "A wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively\n" + "edited and managed by its own audience directly using a web browser. A typical wiki\ncontains " + "multiple pages for the subjects or scope of the project and may be either open\nto the public or " + "limited to use within an organization for maintaining its internal knowledge\nbase. Wikis are " + "enabled by wiki software, otherwise known as wiki engines. A wiki engine,\nbeing a form of a " + "content management system, differs from other web-based systems\nsuch as blog software, in that " + "the content is created without any defined owner or leader,\nand wikis have little inherent " + "structure, allowing structure to emerge according to the\nneeds of the users.[1]\n\n" + ) + assert docs["documents"][2].content == expected