Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: test that PyPDF can extract passages so that they are detected by DocumentSplitter #8739

Merged
merged 1 commit into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 9 additions & 10 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,17 +158,16 @@ def from_dict(cls, data):
def _default_convert(self, reader: "PdfReader") -> str:
    """
    Extract the text of every page of a PDF and join the pages into one string.

    Each page is extracted with the extraction settings stored on this instance
    (extraction mode, plain-mode orientations and space width, and the
    layout-mode options).

    :param reader: Reader object whose `pages` are converted in order.
    :returns: The extracted text of all pages, joined by a form feed character.
        An empty string is returned when the reader has no pages.
    """
    texts = []
    for page in reader.pages:
        # Keep the page text in a named variable before collecting it, so the
        # extraction call and the accumulation step stay separate.
        extracted_text = page.extract_text(
            orientations=self.plain_mode_orientations,
            extraction_mode=self.extraction_mode.value,
            space_width=self.plain_mode_space_width,
            layout_mode_space_vertically=self.layout_mode_space_vertically,
            layout_mode_scale_weight=self.layout_mode_scale_weight,
            layout_mode_strip_rotated=self.layout_mode_strip_rotated,
            layout_mode_font_height_weight=self.layout_mode_font_height_weight,
        )
        texts.append(extracted_text)
    # A form feed between pages marks where one page ends and the next begins.
    return "\f".join(texts)

Expand Down
22 changes: 22 additions & 0 deletions test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from haystack import Document, default_from_dict, default_to_dict
from haystack.components.converters.pypdf import PyPDFToDocument, PyPDFExtractionMode
from haystack.components.preprocessors import DocumentSplitter
from haystack.dataclasses import ByteStream


Expand Down Expand Up @@ -213,3 +214,24 @@ def test_run_empty_document(self, caplog, test_files_path):
# Check that meta is used when the returned document is initialized and thus when doc id is generated
assert output["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
assert output["documents"][0].id != Document(content="").id

def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
    """
    Check that text extracted in layout mode can be split into passages.

    Converts `sample_pdf_2.pdf` with `PyPDFExtractionMode.LAYOUT`, splits the
    resulting documents with `DocumentSplitter(split_by="passage")`, and
    verifies both the number of passages and the content of the third one.
    """
    converter = PyPDFToDocument(extraction_mode=PyPDFExtractionMode.LAYOUT)
    sources = [test_files_path / "pdf" / "sample_pdf_2.pdf"]
    pdf_doc = converter.run(sources=sources)
    splitter = DocumentSplitter(split_length=1, split_by="passage")
    docs = splitter.run(pdf_doc["documents"])

    assert len(docs["documents"]) == 51

    # The expected passage keeps the single newlines inside the paragraph and
    # ends with the blank line ("\n\n") that closes it.
    expected = (
        "A wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively\n"
        "edited and managed by its own audience directly using a web browser. A typical wiki\ncontains "
        "multiple pages for the subjects or scope of the project and may be either open\nto the public or "
        "limited to use within an organization for maintaining its internal knowledge\nbase. Wikis are "
        "enabled by wiki software, otherwise known as wiki engines. A wiki engine,\nbeing a form of a "
        "content management system, differs from other web-based systems\nsuch as blog software, in that "
        "the content is created without any defined owner or leader,\nand wikis have little inherent "
        "structure, allowing structure to emerge according to the\nneeds of the users.[1]\n\n"
    )
    assert docs["documents"][2].content == expected
Loading