Skip to content

Commit

Permalink
Merge pull request #1146 from mendableai/feat/wait-validation
Browse files Browse the repository at this point in the history
[FIR-796] feat(api/types): Add action and wait time validation for scrape requests
  • Loading branch information
ftonato authored Feb 10, 2025
2 parents 2b7b740 + 290dd03 commit 3608256
Show file tree
Hide file tree
Showing 4 changed files with 333 additions and 10 deletions.
54 changes: 46 additions & 8 deletions apps/api/src/controllers/v1/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,25 @@ export const extractOptions = z

export type ExtractOptions = z.infer<typeof extractOptions>;

/**
 * Upper bound, in seconds, on the combined wait time of a request
 * (`waitFor` plus all `wait` actions). It is multiplied by 1000 before being
 * compared against millisecond totals.
 */
const ACTIONS_MAX_WAIT_TIME = 60;
/** Maximum number of entries accepted in an `actions` array. */
const MAX_ACTIONS = 15;
/**
 * Sums the wait time requested by a scrape request, in milliseconds.
 *
 * Only actions whose `type` is `"wait"` contribute:
 * - a wait with a truthy `milliseconds` adds that value;
 * - otherwise, a wait with a truthy `selector` is counted as a flat 1000 ms;
 * - any other action adds nothing.
 *
 * @param actions - Actions to inspect; defaults to an empty list.
 * @param waitFor - Base wait in milliseconds added on top of the action total; defaults to 0.
 * @returns `waitFor` plus the accumulated wait time of the actions.
 */
function calculateTotalWaitTime(actions: any[] = [], waitFor: number = 0): number {
  let actionWaitMs = 0;

  for (const step of actions) {
    // Non-wait actions never contribute to the total.
    if (step.type !== "wait") {
      continue;
    }

    if (step.milliseconds) {
      actionWaitMs = actionWaitMs + step.milliseconds;
    } else if (step.selector) {
      // Waiting on a selector has no explicit duration; count it as 1 second.
      actionWaitMs = actionWaitMs + 1000;
    }
  }

  return waitFor + actionWaitMs;
}

export const actionsSchema = z.array(
z.union([
z
Expand Down Expand Up @@ -113,9 +132,19 @@ export const actionsSchema = z.array(
script: z.string(),
}),
]),
).refine(
(actions) => actions.length <= MAX_ACTIONS,
{
message: `Number of actions cannot exceed ${MAX_ACTIONS}`,
},
).refine(
(actions) => calculateTotalWaitTime(actions) <= ACTIONS_MAX_WAIT_TIME * 1000,
{
message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
},
);

export const scrapeOptions = z
const baseScrapeOptions = z
.object({
formats: z
.enum([
Expand All @@ -140,7 +169,7 @@ export const scrapeOptions = z
excludeTags: z.string().array().optional(),
onlyMainContent: z.boolean().default(true),
timeout: z.number().int().positive().finite().safe().optional(),
waitFor: z.number().int().nonnegative().finite().safe().max(30000).default(0),
waitFor: z.number().int().nonnegative().finite().safe().max(60000).default(0),
// To be deprecated in favor of jsonOptions
extract: extractOptions.optional(),
// New
Expand Down Expand Up @@ -191,7 +220,17 @@ export const scrapeOptions = z
})
.strict(strictMessage);

export type ScrapeOptions = z.infer<typeof scrapeOptions>;
/**
 * Scrape options with a cross-field check layered on top of `baseScrapeOptions`:
 * when `actions` is present, `waitFor` plus the wait time of those actions must
 * not exceed `ACTIONS_MAX_WAIT_TIME` seconds. Objects without `actions` pass
 * this check unconditionally.
 */
export const scrapeOptions = baseScrapeOptions.refine(
  (opts) =>
    !opts.actions ||
    calculateTotalWaitTime(opts.actions, opts.waitFor) <= ACTIONS_MAX_WAIT_TIME * 1000,
  {
    message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
  },
);

/**
 * Static type of scrape options, inferred from `baseScrapeOptions` (the object
 * schema itself, not the refined `scrapeOptions` wrapper).
 */
export type ScrapeOptions = z.infer<typeof baseScrapeOptions>;

import Ajv from "ajv";

Expand Down Expand Up @@ -246,7 +285,7 @@ export type ExtractV1Options = z.infer<typeof extractV1Options>;
export const extractRequestSchema = extractV1Options;
export type ExtractRequest = z.infer<typeof extractRequestSchema>;

export const scrapeRequestSchema = scrapeOptions
export const scrapeRequestSchema = baseScrapeOptions
.omit({ timeout: true })
.extend({
url,
Expand Down Expand Up @@ -325,7 +364,7 @@ export const webhookSchema = z.preprocess(
.strict(strictMessage),
);

export const batchScrapeRequestSchema = scrapeOptions
export const batchScrapeRequestSchema = baseScrapeOptions
.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
Expand All @@ -349,7 +388,7 @@ export const batchScrapeRequestSchema = scrapeOptions
},
);

export const batchScrapeRequestSchemaNoURLValidation = scrapeOptions
export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
.extend({
urls: z.string().array(),
origin: z.string().optional().default("api"),
Expand Down Expand Up @@ -876,8 +915,7 @@ export const searchRequestSchema = z
location: z.string().optional(),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000),
scrapeOptions: scrapeOptions
.extend({
scrapeOptions: baseScrapeOptions.extend({
formats: z
.array(
z.enum([
Expand Down
7 changes: 5 additions & 2 deletions apps/test-suite/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
"test:suite": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false",
"test:load": "artillery run --output ./load-test-results/test-run-report.json load-test.yml",
"test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts",
"test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts"
"test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts",
"test:schema-validation": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/schema-validation.test.ts"
},
"author": "",
"license": "ISC",
Expand All @@ -22,9 +23,11 @@
"ts-jest": "^29.1.2"
},
"devDependencies": {
"@jest/globals": "^29.7.0",
"@types/jest": "^29.5.12",
"@types/supertest": "^6.0.2",
"artillery": "^2.0.19",
"typescript": "^5.4.5"
"typescript": "^5.4.5",
"zod": "^3.24.1"
}
}
11 changes: 11 additions & 0 deletions apps/test-suite/pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3608256

Please sign in to comment.