-
Notifications
You must be signed in to change notification settings - Fork 52
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add custom metadata to uploaded files #93
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,248 @@ | ||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */ | ||
/* eslint-disable @typescript-eslint/no-explicit-any */ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This was added to test a bad configuration. I know we should trust TS, but in the file we have many |
||
|
||
import { describe, test, expect } from "bun:test"; | ||
import { FileDataLoader } from "./file-loader"; | ||
import type { DatasWithFileSource } from "./database"; | ||
|
||
describe("FileDataLoader Integration Tests", () => { | ||
describe("PDF Loading", () => { | ||
// Integration check for the "pdf" type: loads the PDF fixture with
// chunkSize 1000 / chunkOverlap 200 and verifies that the first document has
// string `data` and `id`, and metadata containing both the user-supplied
// fields and the source/timestamp/paragraphNumber fields.
test("should load and transform Wizard of Oz PDF", async () => {
  const customMetadata = {
    book: "The Wonderful Wizard of Oz",
    type: "classic literature",
  };
  const pdfConfig: DatasWithFileSource = {
    type: "pdf",
    fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
    options: { metadata: customMetadata },
  };

  const pdfLoader = new FileDataLoader(pdfConfig);
  const load = await pdfLoader.loadFile({});
  const documents = await load({ chunkSize: 1000, chunkOverlap: 200 });

  expect(documents.length).toBeGreaterThan(0);

  const expectedMetadata = expect.objectContaining({
    ...customMetadata,
    source: expect.stringContaining("the_wonderful_wizard_of_oz.pdf"),
    timestamp: expect.any(String),
    paragraphNumber: expect.any(Number),
  });
  expect(documents[0]).toEqual({
    data: expect.any(String),
    id: expect.any(String),
    metadata: expectedMetadata,
  });

  // The joined text of all chunks must mention the protagonist.
  const joinedText = documents.map((entry) => entry.data).join(" ");
  expect(joinedText).toContain("Dorothy");
});
}); | ||
|
||
describe("CSV Loading", () => { | ||
// Integration check for the "csv" type: loads the CSV fixture with default
// load arguments and verifies that the first document carries the
// user-supplied metadata and that every document has non-empty string data.
test("should load and transform user info CSV", async () => {
  const customMetadata = {
    dataType: "user_info",
    version: "1.0",
  };
  const csvConfig: DatasWithFileSource = {
    type: "csv",
    fileSource: "./data/list_of_user_info.csv",
    options: { metadata: customMetadata },
  };

  const csvLoader = new FileDataLoader(csvConfig);
  const load = await csvLoader.loadFile({});
  const rows = await load({});

  expect(rows.length).toBeGreaterThan(0);
  expect(rows[0]).toEqual({
    data: expect.any(String),
    id: expect.any(String),
    metadata: expect.objectContaining({ ...customMetadata }),
  });

  for (const row of rows) {
    expect(row.data).toBeTruthy();
    expect(typeof row.data).toBe("string");
  }
});
}); | ||
|
||
describe("Text File Loading", () => { | ||
// Integration check for the "text-file" type: loads the summary fixture with
// an explicit chunk size and overlap, verifies the user-supplied metadata on
// the first document, and bounds every chunk's length by
// chunkSize + chunkOverlap.
test("should load and transform Wizard of Oz summary text", async () => {
  const chunkSize = 500;
  const chunkOverlap = 50;
  const maxChunkLength = chunkSize + chunkOverlap;

  const customMetadata = {
    contentType: "summary",
    subject: "The Wonderful Wizard of Oz",
  };
  const textConfig: DatasWithFileSource = {
    type: "text-file",
    fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
    options: { metadata: customMetadata },
  };

  const textLoader = new FileDataLoader(textConfig);
  const load = await textLoader.loadFile({});
  const chunks = await load({ chunkSize, chunkOverlap });

  expect(chunks.length).toBeGreaterThan(0);
  expect(chunks[0]).toEqual({
    data: expect.any(String),
    id: expect.any(String),
    metadata: expect.objectContaining({ ...customMetadata }),
  });

  for (const chunk of chunks) {
    expect(chunk.data.length).toBeLessThanOrEqual(maxChunkLength);
  }
});
}); | ||
|
||
describe("HTML Loading", () => { | ||
// Integration check for the "html" type: loads the HTML fixture with default
// load arguments, verifies the user-supplied metadata on the first document,
// and asserts that its text contains no "<html>", "<body>", or "<".
test("should load and transform Wizard of Oz summary HTML", async () => {
  const customMetadata = {
    format: "html",
    subject: "The Wonderful Wizard of Oz Summary",
  };
  const htmlConfig: DatasWithFileSource = {
    type: "html",
    source: "./data/the_wonderful_wizard_of_oz_summary.html",
    options: { metadata: customMetadata },
  };

  const htmlLoader = new FileDataLoader(htmlConfig);
  const load = await htmlLoader.loadFile({});
  const documents = await load({});

  expect(documents.length).toBeGreaterThan(0);
  expect(documents[0]).toEqual({
    data: expect.any(String),
    id: expect.any(String),
    metadata: expect.objectContaining({ ...customMetadata }),
  });

  const firstText = documents[0].data;
  for (const markup of ["<html>", "<body>", "<"]) {
    expect(firstText).not.toContain(markup);
  }
});
}); | ||
|
||
describe("Multiple File Types", () => { | ||
// Loads the PDF, text, and HTML fixtures in parallel with one shared metadata
// object and verifies that the first document of each result matches that
// metadata and that each format's joined text mentions "Dorothy".
test("should handle loading different formats with consistent metadata", async () => {
  const commonMetadata = {
    project: "Wizard of Oz Analysis",
    timestamp: new Date().toISOString(),
  };
  const sharedOptions = { metadata: commonMetadata };

  const configs: DatasWithFileSource[] = [
    {
      type: "pdf",
      fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
      options: sharedOptions,
    },
    {
      type: "text-file",
      fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
      options: sharedOptions,
    },
    {
      type: "html",
      source: "./data/the_wonderful_wizard_of_oz_summary.html",
      options: sharedOptions,
    },
  ];

  const loadAll = async (config: DatasWithFileSource) => {
    const load = await new FileDataLoader(config).loadFile({});
    return load({});
  };
  const results = await Promise.all(configs.map((config) => loadAll(config)));

  for (const documents of results) {
    expect(documents.length).toBeGreaterThan(0);
    expect(documents[0].metadata).toMatchObject(commonMetadata);
  }

  // Results are in the same order as `configs`: pdf, text-file, html.
  const [pdfContent, txtContent, htmlContent] = results.map((documents) =>
    documents.map((entry) => entry.data).join(" ")
  );

  expect(pdfContent).toContain("Dorothy");
  expect(txtContent).toContain("Dorothy");
  expect(htmlContent).toContain("Dorothy");
});
}); | ||
|
||
describe("FileDataLoader Error Handling", () => { | ||
describe("Missing Files", () => { | ||
// Verifies that `loadFile` rejects with an error matching /no such file/i when
// the configured PDF path does not exist.
test("should handle non-existent files", async () => {
  const config: DatasWithFileSource = {
    type: "pdf",
    fileSource: "./data/does_not_exist.pdf",
  };

  const loader = new FileDataLoader(config);
  // The `.rejects` matcher is asynchronous; it must be awaited, otherwise the
  // test can complete before the assertion has been evaluated.
  await expect(loader.loadFile({})).rejects.toThrow(/no such file/i);
});
}); | ||
|
||
describe("Invalid Configurations", () => { | ||
// Verifies that `loadFile` rejects with an error matching
// /unsupported data type/i when `type` is not one of the supported values.
// The `as any` cast is deliberate: it forces a configuration that the
// compiler would otherwise refuse.
test("should error with invalid file type", async () => {
  const config: DatasWithFileSource = {
    type: "invalid" as any,
    fileSource: "./data/some_file.txt",
  };

  const loader = new FileDataLoader(config);
  // The `.rejects` matcher is asynchronous; it must be awaited, otherwise the
  // test can complete before the assertion has been evaluated.
  await expect(loader.loadFile({})).rejects.toThrow(/unsupported data type/i);
});
|
||
// Verifies that `loadFile` rejects when the configuration has a `processor`
// with empty `options` and no `type`. The whole object is cast through `any`
// because this shape is intentionally invalid.
test("should error with missing required options for processors", async () => {
  const config: DatasWithFileSource = {
    fileSource: "test.doc",
    processor: {
      options: {},
    },
  } as any;

  const loader = new FileDataLoader(config);
  // The `.rejects` matcher is asynchronous; it must be awaited, otherwise the
  // test can complete before the assertion has been evaluated.
  await expect(loader.loadFile({})).rejects.toThrow();
});
|
||
// Verifies that `loadFile` rejects when `fileSource` is an empty string.
test("should error with invalid file path", async () => {
  const config: DatasWithFileSource = {
    type: "pdf",
    fileSource: "",
  };

  const loader = new FileDataLoader(config);
  // The `.rejects` matcher is asynchronous; it must be awaited, otherwise the
  // test can complete before the assertion has been evaluated.
  await expect(loader.loadFile({})).rejects.toThrow();
});
}); | ||
}); | ||
}); |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -131,27 +131,29 @@ export class FileDataLoader { | |
case "pdf": { | ||
const splitter = new RecursiveCharacterTextSplitter(args); | ||
const splittedDocuments = await splitter.splitDocuments(documents); | ||
|
||
return mapDocumentsIntoInsertPayload(splittedDocuments, (metadata: any, index: number) => ({ | ||
source: metadata.source, | ||
timestamp: new Date().toISOString(), | ||
paragraphNumber: index + 1, | ||
pageNumber: metadata.loc?.pageNumber || undefined, | ||
author: metadata.pdf?.info?.Author || undefined, | ||
title: metadata.pdf?.info?.Title || undefined, | ||
totalPages: metadata.pdf?.totalPages || undefined, | ||
language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined, | ||
})); | ||
return this.mapDocumentsIntoInsertPayload( | ||
splittedDocuments, | ||
(metadata: any, index: number) => ({ | ||
source: metadata.source, | ||
timestamp: new Date().toISOString(), | ||
paragraphNumber: index + 1, | ||
pageNumber: metadata.loc?.pageNumber || undefined, | ||
author: metadata.pdf?.info?.Author || undefined, | ||
title: metadata.pdf?.info?.Title || undefined, | ||
totalPages: metadata.pdf?.totalPages || undefined, | ||
language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined, | ||
}) | ||
); | ||
} | ||
|
||
case "csv": { | ||
return mapDocumentsIntoInsertPayload(documents); | ||
return this.mapDocumentsIntoInsertPayload(documents); | ||
} | ||
|
||
case "text-file": { | ||
const splitter = new RecursiveCharacterTextSplitter(args); | ||
const splittedDocuments = await splitter.splitDocuments(documents); | ||
return mapDocumentsIntoInsertPayload(splittedDocuments); | ||
return this.mapDocumentsIntoInsertPayload(splittedDocuments); | ||
} | ||
|
||
case "html": { | ||
|
@@ -162,7 +164,7 @@ export class FileDataLoader { | |
|
||
const newDocuments = await sequence.invoke(documents); | ||
|
||
return mapDocumentsIntoInsertPayload(newDocuments); | ||
return this.mapDocumentsIntoInsertPayload(newDocuments); | ||
} | ||
|
||
// Processors will be handled here. E.g. "unstructured", "llama-parse" | ||
|
@@ -182,17 +184,20 @@ export class FileDataLoader { | |
throw new Error(`Unsupported data type: ${this.config.type}`); | ||
} | ||
} | ||
} | ||
|
||
function mapDocumentsIntoInsertPayload( | ||
splittedDocuments: Document[], | ||
metadataMapper?: (metadata: any, index: number) => Record<string, any> | ||
) { | ||
return splittedDocuments.map((document, index) => ({ | ||
data: document.pageContent, | ||
id: nanoid(), | ||
...(metadataMapper ? { metadata: metadataMapper(document.metadata, index) } : {}), | ||
})); | ||
} | ||
private mapDocumentsIntoInsertPayload( | ||
splittedDocuments: Document[], | ||
metadataMapper?: (metadata: any, index: number) => Record<string, any> | ||
) { | ||
return splittedDocuments.map((document, index) => ({ | ||
data: document.pageContent, | ||
id: nanoid(), | ||
metadata: { | ||
...(metadataMapper ? metadataMapper(document.metadata, index) : {}), | ||
...this.config.options?.metadata, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This was the biggest change here: I'm spreading the existing metadata and then the metadata provided by the user into one object, so file sources accept custom metadata the same way raw texts do. |
||
}, | ||
})); | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In order to use
data: expect.any(String),
instead add the entire data as a string. Same for ids, it will change on every run.