Skip to content

Commit

Permalink
Add option for pdf loader to create one document per page (#361)
Browse files Browse the repository at this point in the history
* Add option for pdf loader to create one document per page

* Update test

* Add to docs, update arg
  • Loading branch information
nfcampos authored Mar 28, 2023
1 parent 314f970 commit 6718dec
Show file tree
Hide file tree
Showing 8 changed files with 191 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ hide_table_of_contents: true

# PDF files

This example goes over how to load data from PDF files. One document will be created for each PDF file.
This example goes over how to load data from PDF files. By default, one document will be created for each page in the PDF file. You can change this behavior by setting the `splitPages` option to `false`.

# Setup

```bash npm2yarn
npm install pdf-parse
npm install pdfjs-dist
```

# Usage
# Usage, one document per page

```typescript
import { PDFLoader } from "langchain/document_loaders";
Expand All @@ -21,3 +21,15 @@ const loader = new PDFLoader("src/document_loaders/example_data/example.pdf");

const docs = await loader.load();
```

# Usage, one document per file

```typescript
import { PDFLoader } from "langchain/document_loaders";

const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", {
splitPages: false,
});

const docs = await loader.load();
```
2 changes: 1 addition & 1 deletion docs/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,4 @@
"engines": {
"node": ">=18"
}
}
}
8 changes: 4 additions & 4 deletions langchain/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
"husky": "^8.0.3",
"jest": "^29.5.0",
"lint-staged": "^13.1.1",
"pdf-parse": "^1.1.1",
"pdfjs-dist": "^3.4.120",
"prettier": "^2.8.3",
"puppeteer": "^19.7.2",
"redis": "^4.6.4",
Expand All @@ -128,7 +128,7 @@
"cohere-ai": "^5.0.2",
"d3-dsv": "^3.0.1",
"hnswlib-node": "^1.4.2",
"pdf-parse": "^1.1.1",
"pdfjs-dist": "^3.4.120",
"puppeteer": "^19.7.2",
"redis": "^4.6.4",
"serpapi": "^1.1.1",
Expand Down Expand Up @@ -163,7 +163,7 @@
"hnswlib-node": {
"optional": true
},
"pdf-parse": {
"pdfjs-dist": {
"optional": true
},
"puppeteer": {
Expand Down Expand Up @@ -302,4 +302,4 @@
"import": "./retrievers.js"
}
}
}
}
68 changes: 58 additions & 10 deletions langchain/src/document_loaders/pdf.ts
Original file line number Diff line number Diff line change
@@ -1,22 +1,70 @@
import type { TextItem } from "pdfjs-dist/types/src/display/api.js";
import { Document } from "../document.js";
import { BufferLoader } from "./buffer.js";

export class PDFLoader extends BufferLoader {
private splitPages: boolean;

constructor(filePathOrBlob: string | Blob, { splitPages = true } = {}) {
super(filePathOrBlob);
this.splitPages = splitPages;
}

public async parse(
raw: Buffer,
metadata: Document["metadata"]
): Promise<Document[]> {
const { pdf } = await PDFLoaderImports();
const parsed = await pdf(raw);
const { getDocument, version } = await PDFLoaderImports();
const pdf = await getDocument({
data: new Uint8Array(raw.buffer),
useWorkerFetch: false,
isEvalSupported: false,
useSystemFonts: true,
}).promise;
const meta = await pdf.getMetadata().catch(() => null);

const documents: Document[] = [];

for (let i = 1; i <= pdf.numPages; i += 1) {
const page = await pdf.getPage(i);
const content = await page.getTextContent();
const text = content.items
.map((item) => (item as TextItem).str)
.join("\n");

documents.push(
new Document({
pageContent: text,
metadata: {
...metadata,
pdf: {
version,
info: meta?.info,
metadata: meta?.metadata,
totalPages: pdf.numPages,
},
loc: {
pageNumber: i,
},
},
})
);
}

if (this.splitPages) {
return documents;
}

return [
new Document({
pageContent: parsed.text,
pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
metadata: {
...metadata,
pdf: {
info: parsed.info,
metadata: parsed.metadata,
numpages: parsed.numpages,
version,
info: meta?.info,
metadata: meta?.metadata,
totalPages: pdf.numPages,
},
},
}),
Expand All @@ -26,13 +74,13 @@ export class PDFLoader extends BufferLoader {

async function PDFLoaderImports() {
try {
// the main entrypoint has some debug code that we don't want to import
const { default: pdf } = await import("pdf-parse/lib/pdf-parse.js");
return { pdf };
const { default: mod } = await import("pdfjs-dist");
const { getDocument, version } = mod;
return { getDocument, version };
} catch (e) {
console.error(e);
throw new Error(
"Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`."
"Failed to load pdfjs-dist. Please install it with eg. `npm install pdfjs-dist`."
);
}
}
6 changes: 4 additions & 2 deletions langchain/src/document_loaders/tests/directory.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ test("Test Directory loader", async () => {
UnknownHandling.Ignore
);
const docs = await loader.load();
expect(docs.length).toBe(76);
expect(docs.length).toBe(90);
expect(docs.map((d) => d.metadata.source).sort()).toEqual([
// PDF
path.resolve(directoryPath, "1706.03762.pdf"),
...Array.from({ length: 15 }, (_) =>
path.resolve(directoryPath, "1706.03762.pdf")
),
// CSV
...Array.from({ length: 32 }, (_) =>
path.resolve(
Expand Down
16 changes: 14 additions & 2 deletions langchain/src/document_loaders/tests/pdf-blob.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,30 @@ test("Test PDF loader from blob", async () => {
);
const docs = await loader.load();

expect(docs.length).toBe(1);
expect(docs.length).toBe(15);
expect(docs[0].pageContent).toContain("Attention Is All You Need");
expect(docs[0].metadata).toMatchInlineSnapshot(`
{
"blobType": "application/pdf",
"loc": {
"pageNumber": 1,
},
"pdf": {
"info": {
"Author": "",
"CreationDate": "D:20171207010315Z",
"Creator": "LaTeX with hyperref package",
"Custom": {
"PTEX.Fullbanner": "This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2",
},
"EncryptFilterName": null,
"IsAcroFormPresent": false,
"IsCollectionPresent": false,
"IsLinearized": false,
"IsSignaturesPresent": false,
"IsXFAPresent": false,
"Keywords": "",
"Language": null,
"ModDate": "D:20171207010315Z",
"PDFFormatVersion": "1.5",
"Producer": "pdfTeX-1.40.17",
Expand All @@ -39,7 +50,8 @@ test("Test PDF loader from blob", async () => {
},
},
"metadata": null,
"numpages": 15,
"totalPages": 15,
"version": "3.4.120",
},
"source": "blob",
}
Expand Down
12 changes: 12 additions & 0 deletions langchain/src/document_loaders/tests/pdf.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@ test("Test PDF loader from file", async () => {
const loader = new PDFLoader(filePath);
const docs = await loader.load();

expect(docs.length).toBe(15);
expect(docs[0].pageContent).toContain("Attention Is All You Need");
});

test("Test PDF loader from file to single document", async () => {
const filePath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
"./example_data/1706.03762.pdf"
);
const loader = new PDFLoader(filePath, { splitPages: false });
const docs = await loader.load();

expect(docs.length).toBe(1);
expect(docs[0].pageContent).toContain("Attention Is All You Need");
});
Loading

1 comment on commit 6718dec

@vercel
Copy link

@vercel vercel bot commented on 6718dec Mar 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.