Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: much more reliable pagination #331

Merged
merged 15 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 125 additions & 8 deletions maxun-core/src/interpret.ts
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,9 @@ export default class Interpreter extends EventEmitter {
let previousHeight = 0;
// track unique items per page to avoid re-scraping
let scrapedItems: Set<string> = new Set<string>();
let visitedUrls: string[] = [];

let availableSelectors = config.pagination.selector.split(',');

while (true) {
switch (config.pagination.type) {
Expand Down Expand Up @@ -575,8 +578,9 @@ export default class Interpreter extends EventEmitter {
previousHeight = currentTopHeight;
break;
case 'clickNext':
console.log("Page URL:", page.url());
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);

// console.log("Page results:", pageResults);

// Filter out already scraped items
Expand All @@ -588,37 +592,149 @@ export default class Interpreter extends EventEmitter {
});

allResults = allResults.concat(newResults);
console.log("Results so far:", allResults.length);

if (config.limit && allResults.length >= config.limit) {
return allResults.slice(0, config.limit);
}

const nextButton = await page.$(config.pagination.selector);
let checkButton = null;
let workingSelector = null;

for (let i = 0; i < availableSelectors.length; i++) {
const selector = availableSelectors[i];
try {
// Wait for selector with a short timeout
checkButton = await page.waitForSelector(selector, { state: 'attached' });
if (checkButton) {
workingSelector = selector;
break;
}
} catch (error) {
console.log(`Selector failed: ${selector}`);
}
}

if (!workingSelector) {
return allResults;
}

// const nextButton = await page.$(config.pagination.selector);
const nextButton = await page.$(workingSelector);
if (!nextButton) {
return allResults; // No more pages to scrape
}
await Promise.all([
nextButton.dispatchEvent('click'),
page.waitForNavigation({ waitUntil: 'networkidle' })
]);

const selectorIndex = availableSelectors.indexOf(workingSelector!);
availableSelectors = availableSelectors.slice(selectorIndex);

// await Promise.all([
// nextButton.dispatchEvent('click'),
// page.waitForNavigation({ waitUntil: 'networkidle' })
// ]);

const previousUrl = page.url();
visitedUrls.push(previousUrl);

try {
// Try both click methods simultaneously
await Promise.race([
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.click()
]),
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.dispatchEvent('click')
])
]);
} catch (error) {
// Verify if navigation actually succeeded
const currentUrl = page.url();
if (currentUrl === previousUrl) {
console.log("Previous URL same as current URL. Navigation failed.");
}
}

const currentUrl = page.url();
if (visitedUrls.includes(currentUrl)) {
console.log(`Detected navigation to a previously visited URL: ${currentUrl}`);

// Extract the current page number from the URL
const match = currentUrl.match(/\d+/);
if (match) {
const currentNumber = match[0];
// Use visitedUrls.length + 1 as the next page number
const nextNumber = visitedUrls.length + 1;

// Create new URL by replacing the current number with the next number
const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());

console.log(`Navigating to constructed URL: ${nextUrl}`);

// Navigate to the next page
await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle' }),
page.goto(nextUrl)
]);
}
}

Comment on lines +622 to +682
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Handle race conditions in navigation logic.

The navigation logic has potential race conditions and could benefit from more robust error handling.

Issues to address:

  1. Race condition between click methods
  2. Lack of proper error handling for navigation failures
  3. Potential infinite loop in URL construction

Apply this fix:

-try {
-  await Promise.race([
-    Promise.all([
-      page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
-      nextButton.click()
-    ]),
-    Promise.all([
-      page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
-      nextButton.dispatchEvent('click')
-    ])
-  ]);
-} catch (error) {
-  const currentUrl = page.url();
-  if (currentUrl === previousUrl) {
-    console.log("Previous URL same as current URL. Navigation failed.");
-  }
-}
+const MAX_NAVIGATION_ATTEMPTS = 3;
+let navigationAttempt = 0;
+
+while (navigationAttempt < MAX_NAVIGATION_ATTEMPTS) {
+  try {
+    await Promise.race([
+      page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 })
+        .catch(() => null), // Prevent rejection from breaking the race
+      nextButton.click()
+        .catch(() => nextButton.dispatchEvent('click'))
+    ]);
+    
+    const currentUrl = page.url();
+    if (currentUrl !== previousUrl) {
+      break; // Navigation successful
+    }
+    
+    navigationAttempt++;
+    await page.waitForTimeout(1000 * navigationAttempt); // Exponential backoff
+  } catch (error) {
+    console.error(`Navigation attempt ${navigationAttempt + 1} failed:`, error);
+    navigationAttempt++;
+  }
+}
+
+if (navigationAttempt === MAX_NAVIGATION_ATTEMPTS) {
+  console.error('Max navigation attempts reached. Stopping pagination.');
+  return allResults;
+}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// const nextButton = await page.$(config.pagination.selector);
const nextButton = await page.$(workingSelector);
if (!nextButton) {
return allResults; // No more pages to scrape
}
await Promise.all([
nextButton.dispatchEvent('click'),
page.waitForNavigation({ waitUntil: 'networkidle' })
]);
const selectorIndex = availableSelectors.indexOf(workingSelector!);
availableSelectors = availableSelectors.slice(selectorIndex);
// await Promise.all([
// nextButton.dispatchEvent('click'),
// page.waitForNavigation({ waitUntil: 'networkidle' })
// ]);
const previousUrl = page.url();
visitedUrls.push(previousUrl);
try {
// Try both click methods simultaneously
await Promise.race([
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.click()
]),
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.dispatchEvent('click')
])
]);
} catch (error) {
// Verify if navigation actually succeeded
const currentUrl = page.url();
if (currentUrl === previousUrl) {
console.log("Previous URL same as current URL. Navigation failed.");
}
}
const currentUrl = page.url();
if (visitedUrls.includes(currentUrl)) {
console.log(`Detected navigation to a previously visited URL: ${currentUrl}`);
// Extract the current page number from the URL
const match = currentUrl.match(/\d+/);
if (match) {
const currentNumber = match[0];
// Use visitedUrls.length + 1 as the next page number
const nextNumber = visitedUrls.length + 1;
// Create new URL by replacing the current number with the next number
const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());
console.log(`Navigating to constructed URL: ${nextUrl}`);
// Navigate to the next page
await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle' }),
page.goto(nextUrl)
]);
}
}
// const nextButton = await page.$(config.pagination.selector);
const nextButton = await page.$(workingSelector);
if (!nextButton) {
return allResults; // No more pages to scrape
}
const selectorIndex = availableSelectors.indexOf(workingSelector!);
availableSelectors = availableSelectors.slice(selectorIndex);
// await Promise.all([
// nextButton.dispatchEvent('click'),
// page.waitForNavigation({ waitUntil: 'networkidle' })
// ]);
const previousUrl = page.url();
visitedUrls.push(previousUrl);
const MAX_NAVIGATION_ATTEMPTS = 3;
let navigationAttempt = 0;
while (navigationAttempt < MAX_NAVIGATION_ATTEMPTS) {
try {
await Promise.race([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 })
.catch(() => null), // Prevent rejection from breaking the race
nextButton.click()
.catch(() => nextButton.dispatchEvent('click'))
]);
const currentUrl = page.url();
if (currentUrl !== previousUrl) {
break; // Navigation successful
}
navigationAttempt++;
await page.waitForTimeout(1000 * navigationAttempt); // Exponential backoff
} catch (error) {
console.error(`Navigation attempt ${navigationAttempt + 1} failed:`, error);
navigationAttempt++;
}
}
if (navigationAttempt === MAX_NAVIGATION_ATTEMPTS) {
console.error('Max navigation attempts reached. Stopping pagination.');
return allResults;
}
const currentUrl = page.url();
if (visitedUrls.includes(currentUrl)) {
console.log(`Detected navigation to a previously visited URL: ${currentUrl}`);
// Extract the current page number from the URL
const match = currentUrl.match(/\d+/);
if (match) {
const currentNumber = match[0];
// Use visitedUrls.length + 1 as the next page number
const nextNumber = visitedUrls.length + 1;
// Create new URL by replacing the current number with the next number
const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());
console.log(`Navigating to constructed URL: ${nextUrl}`);
// Navigate to the next page
await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle' }),
page.goto(nextUrl)
]);
}
}
🧰 Tools
🪛 Biome (1.9.4)

[error] 623-623: Other switch clauses can erroneously access this declaration.
Wrap the declaration in a block to restrict its access to the switch clause.

The declaration is defined in this switch clause:

Unsafe fix: Wrap the declaration in a block.

(lint/correctness/noSwitchDeclarations)


[error] 628-628: Other switch clauses can erroneously access this declaration.
Wrap the declaration in a block to restrict its access to the switch clause.

The declaration is defined in this switch clause:

Unsafe fix: Wrap the declaration in a block.

(lint/correctness/noSwitchDeclarations)


[error] 636-636: Other switch clauses can erroneously access this declaration.
Wrap the declaration in a block to restrict its access to the switch clause.

The declaration is defined in this switch clause:

Unsafe fix: Wrap the declaration in a block.

(lint/correctness/noSwitchDeclarations)


[error] 659-659: Other switch clauses can erroneously access this declaration.
Wrap the declaration in a block to restrict its access to the switch clause.

The declaration is defined in this switch clause:

Unsafe fix: Wrap the declaration in a block.

(lint/correctness/noSwitchDeclarations)

// Give the page a moment to stabilize after navigation
await page.waitForTimeout(1000);
break;
case 'clickLoadMore':
while (true) {
const loadMoreButton = await page.$(config.pagination.selector);
let checkButton = null;
let workingSelector = null;

for (let i = 0; i < availableSelectors.length; i++) {
const selector = availableSelectors[i];
try {
// Wait for selector with a short timeout
checkButton = await page.waitForSelector(selector, { state: 'attached' });
if (checkButton) {
workingSelector = selector;
break;
}
} catch (error) {
console.log(`Selector failed: ${selector}`);
}
}

if (!workingSelector) {
// No more working selectors available, so scrape the remaining items
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}

const loadMoreButton = await page.$(workingSelector);
if (!loadMoreButton) {
// No more "Load More" button, so scrape the remaining items
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}

const selectorIndex = availableSelectors.indexOf(workingSelector!);
availableSelectors = availableSelectors.slice(selectorIndex);

// Click the 'Load More' button to load additional items
await loadMoreButton.dispatchEvent('click');
// await loadMoreButton.dispatchEvent('click');
try {
await Promise.race([
loadMoreButton.click(),
loadMoreButton.dispatchEvent('click')
]);
} catch (error) {
console.log('Both click attempts failed');
}
await page.waitForTimeout(2000); // Wait for new items to load
// After clicking 'Load More', scroll down to load more items
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await page.waitForTimeout(2000);

// Check if more items are available
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) {
Expand All @@ -628,6 +744,7 @@ export default class Interpreter extends EventEmitter {
return allResults;
}
previousHeight = currentHeight;

if (config.limit && allResults.length >= config.limit) {
// If limit is set and reached, return the limited results
allResults = allResults.slice(0, config.limit);
Expand Down
24 changes: 24 additions & 0 deletions server/src/workflow-management/classes/Generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ export class WorkflowGenerator {

private listSelector: string = '';

private paginationMode: boolean = false;

/**
* The public constructor of the WorkflowGenerator.
* Takes socket for communication as a parameter and registers some important events on it.
Expand Down Expand Up @@ -120,6 +122,9 @@ export class WorkflowGenerator {
this.socket.on('listSelector', (data: { selector: string }) => {
this.listSelector = data.selector;
})
this.socket.on('setPaginationMode', (data: { pagination: boolean }) => {
this.paginationMode = data.pagination;
})
}

/**
Expand Down Expand Up @@ -702,6 +707,25 @@ export class WorkflowGenerator {
const selectorBasedOnCustomAction = (this.getList === true)
? await getNonUniqueSelectors(page, coordinates, this.listSelector)
: await getSelectors(page, coordinates);

if (this.paginationMode && selectorBasedOnCustomAction) {
// Chain selectors in specific priority order
const selectors = selectorBasedOnCustomAction;
const selectorChain = [
selectors?.iframeSelector?.full,
selectors?.shadowSelector?.full,
selectors?.testIdSelector,
selectors?.id,
selectors?.hrefSelector,
selectors?.accessibilitySelector,
selectors?.attrSelector,
selectors?.generalSelector
]
.filter(selector => selector !== null && selector !== undefined)
.join(',');

return selectorChain;
}

const bestSelector = getBestSelectorForAction(
{
Expand Down
1 change: 1 addition & 0 deletions src/components/browser/BrowserWindow.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ export const BrowserWindow = () => {
setPaginationSelector(highlighterData.selector);
notify(`info`, t('browser_window.attribute_modal.notifications.pagination_select_success'));
addListStep(listSelector!, fields, currentListId || 0, { type: paginationType, selector: highlighterData.selector });
socket?.emit('setPaginationMode', { pagination: false });
}
return;
}
Expand Down
1 change: 1 addition & 0 deletions src/context/browserActions.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ export const ActionProvider = ({ children }: { children: ReactNode }) => {
setPaginationMode(true);
setCaptureStage('pagination');
socket?.emit('setGetList', { getList: false });
socket?.emit('setPaginationMode', { pagination: true });
};

const stopPaginationMode = () => setPaginationMode(false);
Expand Down