Skip to content

Commit

Permalink
update pdf document loaders' metadata source to url for online pdf (#13274)
Browse files Browse the repository at this point in the history

- **Description:** Update 5 pdf document loaders in
`langchain.document_loaders.pdf`, to store a url in the metadata
(instead of a temporary, local file path) if the user provides a web
path to a pdf: `PyPDFium2Loader`, `PDFMinerLoader`,
`PDFMinerPDFasHTMLLoader`, `PyMuPDFLoader`, and `PDFPlumberLoader` were
updated.
- The updates follow the approach used to update `PyPDFLoader` for the
same behavior in #12092
- The `PyMuPDFLoader` changes required additional work in updating
`langchain.document_loaders.parsers.pdf.PyMuPDFParser` to be able to
process either an `io.BufferedReader` (from local pdf) or `io.BytesIO`
(from online pdf)
- The `PDFMinerPDFasHTMLLoader` change used a simpler approach since the
metadata is assigned by the loader and not the parser
  - **Issue:** Fixes #7034
  - **Dependencies:** None


```python
# PyPDFium2Loader example:
# old behavior
>>> from langchain.document_loaders import PyPDFium2Loader
>>> loader = PyPDFium2Loader('https://arxiv.org/pdf/1706.03762.pdf')
>>> docs = loader.load()
>>> docs[0].metadata
{'source': '/var/folders/7z/d5dt407n673drh1f5cm8spj40000gn/T/tmpm5oqa92f/tmp.pdf', 'page': 0}

# new behavior
>>> from langchain.document_loaders import PyPDFium2Loader
>>> loader = PyPDFium2Loader('https://arxiv.org/pdf/1706.03762.pdf')
>>> docs = loader.load()
>>> docs[0].metadata
{'source': 'https://arxiv.org/pdf/1706.03762.pdf', 'page': 0}
```
  • Loading branch information
123-fake-st authored Nov 29, 2023
1 parent 6f64cb5 commit 9bd6e9d
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 6 deletions.
5 changes: 4 additions & 1 deletion libs/langchain/langchain/document_loaders/parsers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,10 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
import fitz

with blob.as_bytes_io() as file_path:
doc = fitz.open(file_path) # open document
if blob.data is None:
doc = fitz.open(file_path)
else:
doc = fitz.open(stream=file_path, filetype="pdf")

yield from [
Document(
Expand Down
24 changes: 19 additions & 5 deletions libs/langchain/langchain/document_loaders/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,10 @@ def lazy_load(
self,
) -> Iterator[Document]:
"""Lazy load given path as pages."""
blob = Blob.from_path(self.file_path)
if self.web_path:
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
else:
blob = Blob.from_path(self.file_path)
yield from self.parser.parse(blob)


Expand Down Expand Up @@ -284,7 +287,10 @@ def lazy_load(
self,
) -> Iterator[Document]:
"""Lazily load documents."""
blob = Blob.from_path(self.file_path)
if self.web_path:
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
else:
blob = Blob.from_path(self.file_path)
yield from self.parser.parse(blob)


Expand Down Expand Up @@ -318,7 +324,9 @@ def load(self) -> List[Document]:
laparams=LAParams(),
output_type="html",
)
metadata = {"source": self.file_path}
metadata = {
"source": self.file_path if self.web_path is None else self.web_path
}
return [Document(page_content=output_string.getvalue(), metadata=metadata)]


Expand Down Expand Up @@ -357,7 +365,10 @@ def load(self, **kwargs: Any) -> List[Document]:
parser = PyMuPDFParser(
text_kwargs=text_kwargs, extract_images=self.extract_images
)
blob = Blob.from_path(self.file_path)
if self.web_path:
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
else:
blob = Blob.from_path(self.file_path)
return parser.parse(blob)


Expand Down Expand Up @@ -523,7 +534,10 @@ def load(self) -> List[Document]:
dedupe=self.dedupe,
extract_images=self.extract_images,
)
blob = Blob.from_path(self.file_path)
if self.web_path:
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
else:
blob = Blob.from_path(self.file_path)
return parser.parse(blob)


Expand Down

0 comments on commit 9bd6e9d

Please sign in to comment.