Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add custom metadata to uploaded files #93

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
248 changes: 248 additions & 0 deletions src/file-loader.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is needed in order to use `data: expect.any(String)` instead of adding the entire data as a string. The same goes for the ids, since they change on every run.

/* eslint-disable @typescript-eslint/no-explicit-any */
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was added to test a bad configuration. I know we should trust TS, and the file already has many `any`s, but it was only used for the last 2 tests; it was mostly there just to add more branch coverage.


import { describe, test, expect } from "bun:test";
import { FileDataLoader } from "./file-loader";
import type { DatasWithFileSource } from "./database";

describe("FileDataLoader Integration Tests", () => {
// PDF: load the fixture, split it with explicit chunk settings, and check that
// user-supplied metadata appears next to the loader-generated fields.
describe("PDF Loading", () => {
  test("should load and transform Wizard of Oz PDF", async () => {
    const userMetadata = {
      book: "The Wonderful Wizard of Oz",
      type: "classic literature",
    };
    const pdfConfig: DatasWithFileSource = {
      type: "pdf",
      fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
      options: { metadata: userMetadata },
    };

    const load = await new FileDataLoader(pdfConfig).loadFile({});
    const chunks = await load({ chunkSize: 1000, chunkOverlap: 200 });

    expect(chunks.length).toBeGreaterThan(0);

    // ids and chunk text differ between runs, so only their types are pinned.
    const expectedFirstChunk = {
      data: expect.any(String),
      id: expect.any(String),
      metadata: expect.objectContaining({
        ...userMetadata,
        source: expect.stringContaining("the_wonderful_wizard_of_oz.pdf"),
        timestamp: expect.any(String),
        paragraphNumber: expect.any(Number),
      }),
    };
    expect(chunks[0]).toEqual(expectedFirstChunk);

    // The protagonist's name must show up somewhere in the extracted text.
    const fullText = chunks.map((chunk) => chunk.data).join(" ");
    expect(fullText).toContain("Dorothy");
  });
});

// CSV: load the fixture with default arguments and check that every produced
// document carries non-empty string data plus the user-supplied metadata.
describe("CSV Loading", () => {
  test("should load and transform user info CSV", async () => {
    const userMetadata = {
      dataType: "user_info",
      version: "1.0",
    };
    const csvConfig: DatasWithFileSource = {
      type: "csv",
      fileSource: "./data/list_of_user_info.csv",
      options: { metadata: userMetadata },
    };

    const load = await new FileDataLoader(csvConfig).loadFile({});
    const rows = await load({});

    expect(rows.length).toBeGreaterThan(0);
    expect(rows[0]).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expect.objectContaining({ ...userMetadata }),
    });

    // Every document, not just the first, must hold non-empty string data.
    for (const row of rows) {
      const payload = row.data;
      expect(payload).toBeTruthy();
      expect(typeof payload).toBe("string");
    }
  });
});

// Plain text: load the summary fixture with explicit chunk settings and check
// both the attached metadata and the upper bound on chunk length.
describe("Text File Loading", () => {
  test("should load and transform Wizard of Oz summary text", async () => {
    const chunkSize = 500;
    const chunkOverlap = 50;
    // Loosest bound asserted on any chunk: one full chunk plus its overlap.
    const maxChunkLength = chunkSize + chunkOverlap;

    const userMetadata = {
      contentType: "summary",
      subject: "The Wonderful Wizard of Oz",
    };
    const textConfig: DatasWithFileSource = {
      type: "text-file",
      fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
      options: { metadata: userMetadata },
    };

    const load = await new FileDataLoader(textConfig).loadFile({});
    const chunks = await load({ chunkSize, chunkOverlap });

    expect(chunks.length).toBeGreaterThan(0);
    expect(chunks[0]).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expect.objectContaining({ ...userMetadata }),
    });

    for (const chunk of chunks) {
      expect(chunk.data.length).toBeLessThanOrEqual(maxChunkLength);
    }
  });
});

// HTML: load the summary fixture and check that the first document carries the
// user-supplied metadata and contains no leftover markup.
describe("HTML Loading", () => {
  test("should load and transform Wizard of Oz summary HTML", async () => {
    const userMetadata = {
      format: "html",
      subject: "The Wonderful Wizard of Oz Summary",
    };
    // This config uses `source` rather than `fileSource`, as in the original test.
    const htmlConfig: DatasWithFileSource = {
      type: "html",
      source: "./data/the_wonderful_wizard_of_oz_summary.html",
      options: { metadata: userMetadata },
    };

    const load = await new FileDataLoader(htmlConfig).loadFile({});
    const documents = await load({});

    expect(documents.length).toBeGreaterThan(0);
    expect(documents[0]).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expect.objectContaining({ ...userMetadata }),
    });

    // No tag fragments may survive in the extracted text.
    const firstText = documents[0].data;
    for (const markup of ["<html>", "<body>", "<"]) {
      expect(firstText).not.toContain(markup);
    }
  });
});

// Cross-format: the same user metadata is attached to PDF, text and HTML
// sources, and each format must yield text that mentions "Dorothy".
describe("Multiple File Types", () => {
  test("should handle loading different formats with consistent metadata", async () => {
    const commonMetadata = {
      project: "Wizard of Oz Analysis",
      timestamp: new Date().toISOString(),
    };
    const sharedOptions = { metadata: commonMetadata };

    const configs: DatasWithFileSource[] = [
      {
        type: "pdf",
        fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
        options: sharedOptions,
      },
      {
        type: "text-file",
        fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
        options: sharedOptions,
      },
      {
        type: "html",
        source: "./data/the_wonderful_wizard_of_oz_summary.html",
        options: sharedOptions,
      },
    ];

    // Builds a loader for one config and runs it with default arguments.
    const loadDocuments = async (config: DatasWithFileSource) => {
      const load = await new FileDataLoader(config).loadFile({});
      return load({});
    };

    // All three sources are loaded concurrently; result order follows `configs`.
    const loaded = await Promise.all(configs.map(loadDocuments));

    for (const documents of loaded) {
      expect(documents.length).toBeGreaterThan(0);
      expect(documents[0].metadata).toMatchObject(commonMetadata);
    }

    // Joined text per format, in pdf / text / html order.
    const joinedTexts = loaded.map((documents) =>
      documents.map((entry) => entry.data).join(" ")
    );
    for (const text of joinedTexts) {
      expect(text).toContain("Dorothy");
    }
  });
});

// Error paths of FileDataLoader.loadFile.
//
// Every test here is async and awaits its `.rejects` assertion. The assertion
// itself returns a promise; if it is not awaited (or returned), the test
// callback finishes before the rejection is inspected and the test can pass
// without verifying anything.
describe("FileDataLoader Error Handling", () => {
  describe("Missing Files", () => {
    test("should handle non-existent files", async () => {
      const config: DatasWithFileSource = {
        type: "pdf",
        fileSource: "./data/does_not_exist.pdf",
      };

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow(/no such file/i);
    });
  });

  describe("Invalid Configurations", () => {
    test("should error with invalid file type", async () => {
      const config: DatasWithFileSource = {
        // Deliberately bypasses the type checker to exercise the runtime guard.
        type: "invalid" as any,
        fileSource: "./data/some_file.txt",
      };

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow(/unsupported data type/i);
    });

    test("should error with missing required options for processors", async () => {
      // Deliberately malformed: no `type`, and a processor with empty options.
      const config: DatasWithFileSource = {
        fileSource: "test.doc",
        processor: {
          options: {},
        },
      } as any;

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow();
    });

    test("should error with invalid file path", async () => {
      const config: DatasWithFileSource = {
        type: "pdf",
        fileSource: "",
      };

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow();
    });
  });
});
});
53 changes: 29 additions & 24 deletions src/file-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -131,27 +131,29 @@ export class FileDataLoader {
case "pdf": {
const splitter = new RecursiveCharacterTextSplitter(args);
const splittedDocuments = await splitter.splitDocuments(documents);

return mapDocumentsIntoInsertPayload(splittedDocuments, (metadata: any, index: number) => ({
source: metadata.source,
timestamp: new Date().toISOString(),
paragraphNumber: index + 1,
pageNumber: metadata.loc?.pageNumber || undefined,
author: metadata.pdf?.info?.Author || undefined,
title: metadata.pdf?.info?.Title || undefined,
totalPages: metadata.pdf?.totalPages || undefined,
language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
}));
return this.mapDocumentsIntoInsertPayload(
splittedDocuments,
(metadata: any, index: number) => ({
source: metadata.source,
timestamp: new Date().toISOString(),
paragraphNumber: index + 1,
pageNumber: metadata.loc?.pageNumber || undefined,
author: metadata.pdf?.info?.Author || undefined,
title: metadata.pdf?.info?.Title || undefined,
totalPages: metadata.pdf?.totalPages || undefined,
language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
})
);
}

case "csv": {
return mapDocumentsIntoInsertPayload(documents);
return this.mapDocumentsIntoInsertPayload(documents);
}

case "text-file": {
const splitter = new RecursiveCharacterTextSplitter(args);
const splittedDocuments = await splitter.splitDocuments(documents);
return mapDocumentsIntoInsertPayload(splittedDocuments);
return this.mapDocumentsIntoInsertPayload(splittedDocuments);
}

case "html": {
Expand All @@ -162,7 +164,7 @@ export class FileDataLoader {

const newDocuments = await sequence.invoke(documents);

return mapDocumentsIntoInsertPayload(newDocuments);
return this.mapDocumentsIntoInsertPayload(newDocuments);
}

// Processors will be handled here. E.g. "unstructured", "llama-parse"
Expand All @@ -182,17 +184,20 @@ export class FileDataLoader {
throw new Error(`Unsupported data type: ${this.config.type}`);
}
}
}

function mapDocumentsIntoInsertPayload(
splittedDocuments: Document[],
metadataMapper?: (metadata: any, index: number) => Record<string, any>
) {
return splittedDocuments.map((document, index) => ({
data: document.pageContent,
id: nanoid(),
...(metadataMapper ? { metadata: metadataMapper(document.metadata, index) } : {}),
}));
}
private mapDocumentsIntoInsertPayload(
splittedDocuments: Document[],
metadataMapper?: (metadata: any, index: number) => Record<string, any>
) {
return splittedDocuments.map((document, index) => ({
data: document.pageContent,
id: nanoid(),
metadata: {
...(metadataMapper ? metadataMapper(document.metadata, index) : {}),
...this.config.options?.metadata,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was the biggest change here.

I'm spreading the user-provided metadata into the current metadata, so that custom metadata is accepted for files the same way it already is for raw text.

},
}));
}
}

Expand Down
Loading