PDFMinerLoader is taking way too much time #24394
The drastic difference in loading times between the two PDF files when using `PDFMinerLoader` comes down to how PDFMiner processes each page, not to raw file size.
Here is the relevant part of the loader:

```python
class PDFMinerLoader(BasePDFLoader):
    """Load `PDF` files using `PDFMiner`."""

    def __init__(
        self,
        file_path: str,
        *,
        headers: Optional[Dict] = None,
        extract_images: bool = False,
        concatenate_pages: bool = True,
    ) -> None:
        """Initialize with file path.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into a single
                document. Otherwise, return one document per page.
        """
        try:
            from pdfminer.high_level import extract_text  # noqa: F401
        except ImportError:
            raise ImportError(
                "`pdfminer` package not found, please install it with "
                "`pip install pdfminer.six`"
            )

        super().__init__(file_path, headers=headers)
        self.parser = PDFMinerParser(
            extract_images=extract_images, concatenate_pages=concatenate_pages
        )
```
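For context, a minimal usage sketch (the file name here is a placeholder): with `concatenate_pages=True` the loader makes a single pass over the document, while `False` routes through the per-page branch shown below.

```python
from langchain_community.document_loaders import PDFMinerLoader

# One Document for the whole file (the default)...
docs = PDFMinerLoader("example.pdf", concatenate_pages=True).load()

# ...versus one Document per page.
per_page = PDFMinerLoader("example.pdf", concatenate_pages=False).load()
print(len(docs), len(per_page))
```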
"""Parse `PDF` using `PDFMiner`."""
def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
"""Initialize a parser based on PDFMiner.
Args:
extract_images: Whether to extract images from PDF.
concatenate_pages: If True, concatenate all PDF pages into one a single
document. Otherwise, return one document per page.
"""
self.extract_images = extract_images
self.concatenate_pages = concatenate_pages
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
if not self.extract_images:
try:
from pdfminer.high_level import extract_text
except ImportError:
raise ImportError(
"`pdfminer` package not found, please install it with "
"`pip install pdfminer.six`"
)
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
if self.concatenate_pages:
text = extract_text(pdf_file_obj)
metadata = {"source": blob.source} # type: ignore[attr-defined]
yield Document(page_content=text, metadata=metadata)
else:
from pdfminer.pdfpage import PDFPage
pages = PDFPage.get_pages(pdf_file_obj)
for i, _ in enumerate(pages):
text = extract_text(pdf_file_obj, page_numbers=[i])
metadata = {"source": blob.source, "page": str(i)} # type: ignore[attr-defined]
yield Document(page_content=text, metadata=metadata)
else:
import io
from pdfminer.converter import PDFPageAggregator, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
text_io = io.StringIO()
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
pages = PDFPage.get_pages(pdf_file_obj)
rsrcmgr = PDFResourceManager()
device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
for i, page in enumerate(pages):
interpreter_for_text.process_page(page)
interpreter_for_image.process_page(page)
content = text_io.getvalue() + self._extract_images_from_page(
device_for_image.get_result()
)
text_io.truncate(0)
text_io.seek(0)
metadata = {"source": blob.source, "page": str(i)} # type: ignore[attr-defined]
yield Document(page_content=content, metadata=metadata) In summary, the larger file size, higher page count, and the resource-intensive nature of PDFMiner contribute to the longer loading times for the 19 MB PDF file [1][2]. |
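If `concatenate_pages=False` is in play, the per-page branch above re-parses the file once per page. A rough timing sketch makes the two code paths easy to compare; the path is hypothetical and absolute times depend entirely on page content:

```python
import time

from pdfminer.high_level import extract_text
from pdfminer.pdfpage import PDFPage

path = "example.pdf"  # hypothetical local file

start = time.perf_counter()
extract_text(path)  # single pass, as with concatenate_pages=True
print("one pass:", time.perf_counter() - start)

with open(path, "rb") as fh:
    n_pages = sum(1 for _ in PDFPage.get_pages(fh))

start = time.perf_counter()
for i in range(n_pages):
    extract_text(path, page_numbers=[i])  # re-parsed each call, as with concatenate_pages=False
print("per page:", time.perf_counter() - start)
```

If the slow file goes through this path, or simply has unusually dense layouts, that alone can account for a 40-minute load despite the smaller size and lower page count.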
Description
I'm trying to load a 19 MB file with 613 pages, and it takes 40 minutes; another file of 62 MB with 900+ pages takes only 45 seconds. I'm not sure why this is happening.
Can someone tell me the reason?
System Info
```toml
langchain = "^0.1.20"
langchain-community = "^0.0.38"
langchain-core = ">=0.1.52,<0.2.0"
"pdfminer.six" = "^20231228"
```