diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 52d47e1f2fff0..1e16bf7fce542 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -235,7 +235,10 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: import fitz with blob.as_bytes_io() as file_path: - doc = fitz.open(file_path) # open document + if blob.data is None: + doc = fitz.open(file_path) + else: + doc = fitz.open(stream=file_path, filetype="pdf") yield from [ Document( diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index ceb7d292957d8..6b5e1dbef95d9 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -194,7 +194,10 @@ def lazy_load( self, ) -> Iterator[Document]: """Lazy load given path as pages.""" - blob = Blob.from_path(self.file_path) + if self.web_path: + blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + else: + blob = Blob.from_path(self.file_path) yield from self.parser.parse(blob) @@ -284,7 +287,10 @@ def lazy_load( self, ) -> Iterator[Document]: """Lazily load documents.""" - blob = Blob.from_path(self.file_path) + if self.web_path: + blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + else: + blob = Blob.from_path(self.file_path) yield from self.parser.parse(blob) @@ -318,7 +324,9 @@ def load(self) -> List[Document]: laparams=LAParams(), output_type="html", ) - metadata = {"source": self.file_path} + metadata = { + "source": self.file_path if self.web_path is None else self.web_path + } return [Document(page_content=output_string.getvalue(), metadata=metadata)] @@ -357,7 +365,10 @@ def load(self, **kwargs: Any) -> List[Document]: parser = PyMuPDFParser( text_kwargs=text_kwargs, extract_images=self.extract_images ) - blob = Blob.from_path(self.file_path) + if self.web_path: + blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + else: + blob = Blob.from_path(self.file_path) return parser.parse(blob) @@ -523,7 +534,10 @@ def load(self) -> List[Document]: dedupe=self.dedupe, extract_images=self.extract_images, ) - blob = Blob.from_path(self.file_path) + if self.web_path: + blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + else: + blob = Blob.from_path(self.file_path) return parser.parse(blob)