deepset-ai · davidsbatista · Jan 17, 2025 · Jan 16, 2025
@@ -158,17 +158,16 @@ def from_dict(cls, data):
     def _default_convert(self, reader: "PdfReader") -> str:
         texts = []
         for page in reader.pages:
-            texts.append(
-                page.extract_text(
-                    orientations=self.plain_mode_orientations,
-                    extraction_mode=self.extraction_mode.value,
-                    space_width=self.plain_mode_space_width,
-                    layout_mode_space_vertically=self.layout_mode_space_vertically,
-                    layout_mode_scale_weight=self.layout_mode_scale_weight,
-                    layout_mode_strip_rotated=self.layout_mode_strip_rotated,
-                    layout_mode_font_height_weight=self.layout_mode_font_height_weight,
-                )
+            extracted_text = page.extract_text(  # per-page extraction using this instance's settings
+                orientations=self.plain_mode_orientations,
+                extraction_mode=self.extraction_mode.value,
+                space_width=self.plain_mode_space_width,
+                layout_mode_space_vertically=self.layout_mode_space_vertically,
+                layout_mode_scale_weight=self.layout_mode_scale_weight,
+                layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+                layout_mode_font_height_weight=self.layout_mode_font_height_weight,
             )
+            texts.append(extracted_text)  # one entry per page; joined with "\f" below
         text = "\f".join(texts)
         return text
 

@@ -8,6 +8,7 @@
 
 from haystack import Document, default_from_dict, default_to_dict
 from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
+from haystack.components.preprocessors import DocumentSplitter
 from haystack.dataclasses import ByteStream
 
 
@@ -213,3 +214,24 @@ def test_run_empty_document(self, caplog, test_files_path):
             # Check that meta is used when the returned document is initialized and thus when doc id is generated
             assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
             assert output["documents"][0].id != Document(content="").id
+
+    def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
+        converter = PyPDFToDocument(extraction_mode=PyPDFExtractionMode.LAYOUT)
+        sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"]
+        pdf_doc = converter.run(sources=sources)
+        splitter = DocumentSplitter(split_length=1, split_by="passage")  # presumably one passage per chunk — confirm
+        docs = splitter.run(pdf_doc["documents"])
+
+        assert len(docs["documents"]) == 51  # chunk count expected for sample_pdf_2.pdf
+
+        expected = (
+            "A wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively\n"
+            "edited and managed by its own audience directly using a web browser. A typical wiki\ncontains "
+            "multiple pages for the subjects or scope of the project and may be either open\nto the public or "
+            "limited to use within an organization for maintaining its internal knowledge\nbase. Wikis are "
+            "enabled by wiki software, otherwise known as wiki engines. A wiki engine,\nbeing a form of a "
+            "content management system, diﬀers from other web-based systems\nsuch as blog software, in that "
+            "the content is created without any deﬁned owner or leader,\nand wikis have little inherent "
+            "structure, allowing structure to emerge according to the\nneeds of the users.[1]\n\n"  # NOTE(review): trailing "\n\n" looks like the retained passage delimiter — confirm
+        )
+        assert docs["documents"][2].content == expected  # compares the third chunk (index 2)