From 9bd6e9df365e966938979511237c035a02fb4fa9 Mon Sep 17 00:00:00 2001 From: 123-fake-st <34491334+123-fake-st@users.noreply.github.com> Date: Wed, 29 Nov 2023 14:07:46 -0600 Subject: [PATCH] update pdf document loaders' metadata source to url for online pdf (#13274) - **Description:** Update 5 pdf document loaders in `langchain.document_loaders.pdf`, to store a url in the metadata (instead of a temporary, local file path) if the user provides a web path to a pdf: `PyPDFium2Loader`, `PDFMinerLoader`, `PDFMinerPDFasHTMLLoader`, `PyMuPDFLoader`, and `PDFPlumberLoader` were updated. - The updates follow the approach used to update `PyPDFLoader` for the same behavior in #12092 - The `PyMuPDFLoader` changes required additional work in updating `langchain.document_loaders.parsers.pdf.PyMuPDFParser` to be able to process either an `io.BufferedReader` (from local pdf) or `io.BytesIO` (from online pdf) - The `PDFMinerPDFasHTMLLoader` change used a simpler approach since the metadata is assigned by the loader and not the parser - **Issue:** Fixes #7034 - **Dependencies:** None ```python # PyPDFium2Loader example: # old behavior >>> from langchain.document_loaders import PyPDFium2Loader >>> loader = PyPDFium2Loader('https://arxiv.org/pdf/1706.03762.pdf') >>> docs = loader.load() >>> docs[0].metadata {'source': '/var/folders/7z/d5dt407n673drh1f5cm8spj40000gn/T/tmpm5oqa92f/tmp.pdf', 'page': 0} # new behavior >>> from langchain.document_loaders import PyPDFium2Loader >>> loader = PyPDFium2Loader('https://arxiv.org/pdf/1706.03762.pdf') >>> docs = loader.load() >>> docs[0].metadata {'source': 'https://arxiv.org/pdf/1706.03762.pdf', 'page': 0} ``` --- .../langchain/document_loaders/parsers/pdf.py | 5 +++- .../langchain/document_loaders/pdf.py | 24 +++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 52d47e1f2fff0..1e16bf7fce542 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -235,7 +235,10 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: import fitz with blob.as_bytes_io() as file_path: - doc = fitz.open(file_path) # open document + if blob.data is None: + doc = fitz.open(file_path) + else: + doc = fitz.open(stream=file_path, filetype="pdf") yield from [ Document( diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index ceb7d292957d8..6b5e1dbef95d9 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -194,7 +194,10 @@ def lazy_load( self, ) -> Iterator[Document]: """Lazy load given path as pages.""" - blob = Blob.from_path(self.file_path) + if self.web_path: + blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + else: + blob = Blob.from_path(self.file_path) yield from self.parser.parse(blob) @@ -284,7 +287,10 @@ def lazy_load( self, ) -> Iterator[Document]: """Lazily load documents.""" - blob = Blob.from_path(self.file_path) + if self.web_path: + blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + else: + blob = Blob.from_path(self.file_path) yield from self.parser.parse(blob) @@ -318,7 +324,9 @@ def load(self) -> List[Document]: laparams=LAParams(), output_type="html", ) - metadata = {"source": self.file_path} + metadata = { + "source": self.file_path if self.web_path is None else self.web_path + } return [Document(page_content=output_string.getvalue(), metadata=metadata)] @@ -357,7 +365,10 @@ def load(self, **kwargs: Any) -> List[Document]: parser = PyMuPDFParser( text_kwargs=text_kwargs, extract_images=self.extract_images ) - blob = Blob.from_path(self.file_path) + if self.web_path: + blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + else: + blob = Blob.from_path(self.file_path) return parser.parse(blob) @@ -523,7 +534,10 @@ def load(self) -> List[Document]: dedupe=self.dedupe, extract_images=self.extract_images, ) - blob = Blob.from_path(self.file_path) + if self.web_path: + blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + else: + blob = Blob.from_path(self.file_path) return parser.parse(blob)