From 85a911f360a89d790db68a8a3cabad673367fb22 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Thu, 13 Feb 2025 10:47:52 -0500
Subject: [PATCH] chore(glob): fix compile [#267]

---
 Cargo.lock                        |  12 +-
 examples/url_glob.rs              |   2 +-
 spider/Cargo.toml                 |   2 +-
 spider/src/configuration.rs       |   4 +-
 spider/src/features/chrome.rs     |   3 +-
 spider/src/website.rs             | 604 +++++++++++++++++-------------
 spider_chrome/Cargo.toml          |   2 +-
 spider_cli/Cargo.toml             |   2 +-
 spider_transformations/Cargo.toml |   2 +-
 spider_utils/Cargo.toml           |   2 +-
 spider_worker/Cargo.toml          |   2 +-
 11 files changed, 370 insertions(+), 267 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 79eb498e16..7583069a63 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5588,7 +5588,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "2.27.61"
+version = "2.27.62"
 dependencies = [
  "ahash",
  "aho-corasick",
@@ -5653,7 +5653,7 @@ dependencies = [
 
 [[package]]
 name = "spider_chrome"
-version = "2.27.61"
+version = "2.27.62"
 dependencies = [
  "adblock",
  "aho-corasick",
@@ -5744,7 +5744,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "2.27.61"
+version = "2.27.62"
 dependencies = [
  "clap",
  "env_logger",
@@ -5787,7 +5787,7 @@ dependencies = [
 
 [[package]]
 name = "spider_transformations"
-version = "2.27.61"
+version = "2.27.62"
 dependencies = [
  "aho-corasick",
  "fast_html2md",
@@ -5810,7 +5810,7 @@ dependencies = [
 
 [[package]]
 name = "spider_utils"
-version = "2.27.61"
+version = "2.27.62"
 dependencies = [
  "hashbrown 0.15.2",
  "indexmap 1.9.3",
@@ -5827,7 +5827,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "2.27.61"
+version = "2.27.62"
 dependencies = [
  "env_logger",
  "lazy_static",
diff --git a/examples/url_glob.rs b/examples/url_glob.rs
index e9abe26efe..0d33e161aa 100644
--- a/examples/url_glob.rs
+++ b/examples/url_glob.rs
@@ -1,4 +1,4 @@
-//! `cargo run --example url_glob --features glob`
+//! cargo run --example url_glob --features glob
 extern crate spider;
 
 use spider::tokio;
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 55cfd320c7..ad09a0ab04 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "2.27.61"
+version = "2.27.62"
 authors = [
     "j-mendez "
 ]
diff --git a/spider/src/configuration.rs b/spider/src/configuration.rs
index aa8f1092cb..43dd3b59ee 100644
--- a/spider/src/configuration.rs
+++ b/spider/src/configuration.rs
@@ -365,7 +365,7 @@ impl Configuration {
             delay: 0,
             depth: 25,
             redirect_limit: Box::new(7),
-            request_timeout: Some(Box::new(Duration::from_secs(15))),
+            request_timeout: Some(Box::new(Duration::from_secs(60))),
             only_html: true,
             ..Default::default()
         }
@@ -378,7 +378,7 @@ impl Configuration {
             delay: 0,
             depth: 25,
             redirect_limit: Box::new(7),
-            request_timeout: Some(Box::new(Duration::from_secs(15))),
+            request_timeout: Some(Box::new(Duration::from_secs(60))),
             chrome_intercept: RequestInterceptConfiguration::new(cfg!(
                 feature = "chrome_intercept"
             )),
diff --git a/spider/src/features/chrome.rs b/spider/src/features/chrome.rs
index aeaa65c11d..6d2345e1b9 100644
--- a/spider/src/features/chrome.rs
+++ b/spider/src/features/chrome.rs
@@ -468,7 +468,8 @@ pub(crate) async fn attempt_navigation(
     viewport: &Option,
 ) -> Result {
     let mut cdp_params = CreateTargetParams::new(url);
-    cdp_params.background = Some(browser_context_id.is_some());
+
+    cdp_params.background = Some(browser_context_id.is_some()); // not supported headless-shell
     cdp_params.browser_context_id.clone_from(browser_context_id);
     cdp_params.url = url.into();
     cdp_params.for_tab = Some(false);
diff --git a/spider/src/website.rs b/spider/src/website.rs
index 68a19ba576..ca6ba1faab 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -1670,6 +1670,7 @@ impl Website {
     }
 
     /// Expand links for crawl.
+    #[cfg(not(feature = "glob"))]
     async fn _crawl_establish(
         &mut self,
         client: &Client,
@@ -1831,7 +1832,11 @@ impl Website {
     }
 
     /// Expand links for crawl.
-    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
+    #[cfg(all(
+        not(feature = "decentralized"),
+        feature = "chrome",
+        not(feature = "glob")
+    ))]
     async fn crawl_establish(
         &mut self,
         client: &Client,
@@ -1943,216 +1948,6 @@ impl Website {
         }
     }
 
-    /// fetch the page with chrome
-    #[cfg(all(
-        not(feature = "glob"),
-        not(feature = "decentralized"),
-        feature = "smart"
-    ))]
-    async fn render_chrome_page(
-        config: &Configuration,
-        client: &Client,
-        browser: &Arc,
-        context_id: &Option,
-        page: &mut Page,
-        url: &str,
-    ) {
-        if let Ok(chrome_page) = crate::features::chrome::attempt_navigation(
-            "about:blank",
-            &browser,
-            &config.request_timeout,
-            &context_id,
-            &config.viewport,
-        )
-        .await
-        {
-            crate::features::chrome::setup_chrome_events(&chrome_page, &config).await;
-            let intercept_handle = crate::features::chrome::setup_chrome_interception_base(
-                &chrome_page,
-                config.chrome_intercept.enabled,
-                &config.auth_challenge_response,
-                config.chrome_intercept.block_visuals,
-                &url,
-            )
-            .await;
-
-            let next_page = Page::new(
-                &url,
-                &client,
-                &chrome_page,
-                &config.wait_for,
-                &config.screenshot,
-                false, // we use the initial about:blank page.
-                &config.openai_config,
-                &config.execution_scripts,
-                &config.automation_scripts,
-                &config.viewport,
-                &config.request_timeout,
-            )
-            .await;
-
-            page.clone_from(&next_page);
-
-            if let Some(h) = intercept_handle {
-                let abort_handle = h.abort_handle();
-                if let Err(elasped) =
-                    tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
-                {
-                    log::warn!("Handler timeout exceeded {elasped}");
-                    abort_handle.abort();
-                }
-            }
-        }
-    }
-
-    /// Expand links for crawl.
-    #[cfg(all(
-        not(feature = "glob"),
-        not(feature = "decentralized"),
-        feature = "smart"
-    ))]
-    async fn crawl_establish_smart(
-        &mut self,
-        client: &Client,
-        mut base: &mut RelativeSelectors,
-        _: bool,
-        browser: &Arc,
-        context_id: &Option,
-    ) -> HashSet {
-        let links: HashSet = if self
-            .is_allowed_default(&self.get_base_link())
-            .eq(&ProcessLinkStatus::Allowed)
-        {
-            let url = self.url.inner();
-
-            let mut page = Page::new_page(&url, &client).await;
-
-            let mut retry_count = self.configuration.retry;
-
-            while page.should_retry && retry_count > 0 {
-                retry_count -= 1;
-                if let Some(timeout) = page.get_timeout() {
-                    tokio::time::sleep(timeout).await;
-                }
-                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
-                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
-                        if retry_count.is_power_of_two() {
-                            Website::render_chrome_page(
-                                &self.configuration,
-                                client,
-                                browser,
-                                context_id,
-                                &mut page,
-                                url,
-                            )
-                            .await;
-                        } else {
-                            let next_page = Page::new_page(url, &client).await;
-                            page.clone_from(&next_page);
-                        };
-                    })
-                    .await
-                    {
-                        log::warn!("backoff timeout {elasped}");
-                    }
-                } else {
-                    if retry_count.is_power_of_two() {
-                        Website::render_chrome_page(
-                            &self.configuration,
-                            client,
-                            browser,
-                            context_id,
-                            &mut page,
-                            url,
-                        )
-                        .await
-                    } else {
-                        page.clone_from(&Page::new_page(url, &client).await);
-                    }
-                }
-            }
-
-            let page_links: HashSet = page
-                .smart_links(
-                    &base,
-                    &browser,
-                    &self.configuration,
-                    &context_id,
-                    &self.domain_parsed,
-                )
-                .await;
-
-            if let Some(ref domain) = page.final_redirect_destination {
-                let prior_domain = self.domain_parsed.take();
-                crate::utils::modify_selectors(
-                    &prior_domain,
-                    domain,
-                    &mut self.domain_parsed,
-                    &mut self.url,
-                    &mut base,
-                    AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld),
-                );
-            }
-
-            emit_log(&self.url.inner());
-
-            if let Some(sid) = page.signature {
-                self.insert_signature(sid).await;
-            }
-
-            self.insert_link(match self.on_link_find_callback {
-                Some(cb) => {
-                    let c = cb(*self.url.clone(), None);
-
-                    c.0
-                }
-                _ => *self.url.clone(),
-            })
-            .await;
-
-            let links = if !page_links.is_empty() {
-                page_links
-            } else {
-                self.status = CrawlStatus::Empty;
-                Default::default()
-            };
-
-            self.initial_status_code = page.status_code;
-
-            if page.status_code == reqwest::StatusCode::FORBIDDEN {
-                self.status = CrawlStatus::Blocked;
-            } else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
-                self.status = CrawlStatus::RateLimited;
-            } else if page.status_code.is_server_error() {
-                self.status = CrawlStatus::ServerError;
-            }
-
-            if self.configuration.return_page_links {
-                page.page_links = if links.is_empty() {
-                    None
-                } else {
-                    Some(Box::new(links.clone()))
-                };
-            }
-
-            if let Some(cb) = self.on_should_crawl_callback {
-                if !cb(&page) {
-                    page.blocked_crawl = true;
-                    channel_send_page(&self.channel, page, &self.channel_guard);
-                    return Default::default();
-                }
-            }
-
-            channel_send_page(&self.channel, page, &self.channel_guard);
-
-            links
-        } else {
-            HashSet::new()
-        };
-
-        links
-    }
-
     /// Expand links for crawl.
     #[cfg(all(not(feature = "glob"), feature = "decentralized"))]
     async fn crawl_establish(
         &mut self,
         client: &Client,
@@ -2284,7 +2079,7 @@ impl Website {
     async fn crawl_establish(
         &mut self,
         client: &Client,
-        base: &mut (CompactString, smallvec::SmallVec<[CompactString; 2]>),
+        base: &mut RelativeSelectors,
         _: bool,
         page: &chromiumoxide::Page,
     ) -> HashSet {
@@ -2307,6 +2102,13 @@ impl Website {
             &client,
             &page,
             &self.configuration.wait_for,
+            &self.configuration.screenshot,
+            false, // we use the initial about:blank page.
+            &self.configuration.openai_config,
+            &self.configuration.execution_scripts,
+            &self.configuration.automation_scripts,
+            &self.configuration.viewport,
+            &self.configuration.request_timeout,
         )
         .await;
         let u = page.get_url();
@@ -2325,14 +2127,14 @@ impl Website {
         if self.configuration.return_page_links {
             page.page_links = Some(Default::default());
-            let next_links = HashSet::from(page.links(&base).await);
+            let next_links = HashSet::from(page.links(&base, &self.domain_parsed).await);
 
             channel_send_page(&self.channel, page.clone(), &self.channel_guard);
 
             links.extend(next_links);
         } else {
             channel_send_page(&self.channel, page.clone(), &self.channel_guard);
 
-            let next_links = HashSet::from(page.links(&base).await);
+            let next_links = HashSet::from(page.links(&base, &self.domain_parsed).await);
 
             links.extend(next_links);
         }
@@ -2342,15 +2144,11 @@ impl Website {
     }
 
     /// Expand links for crawl.
-    #[cfg(all(
-        feature = "glob",
-        not(feature = "chrome"),
-        not(feature = "decentralized")
-    ))]
-    async fn crawl_establish(
+    #[cfg(feature = "glob")]
+    async fn _crawl_establish(
         &mut self,
         client: &Client,
-        base: &mut (CompactString, smallvec::SmallVec<[CompactString; 2]>),
+        base: &mut RelativeSelectors,
         _: bool,
     ) -> HashSet {
         let mut links: HashSet = HashSet::new();
         let expanded = self.get_expanded_links(&domain_name.as_str());
         self.configuration.configure_allowlist();
 
-        for link in expanded {
-            let allowed = self.is_allowed(&link);
+        for url in expanded {
+            if self
+                .is_allowed_default(url.inner())
+                .eq(&ProcessLinkStatus::Allowed)
+            {
+                let mut links_ssg = links.clone();
+                let mut links_pages = if self.configuration.return_page_links {
+                    Some(links.clone())
+                } else {
+                    None
+                };
+                let mut page_links_settings =
+                    PageLinkBuildSettings::new(true, self.configuration.full_resources);
 
-            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
-                break;
+                page_links_settings.subdomains = self.configuration.subdomains;
+                page_links_settings.tld = self.configuration.tld;
+                page_links_settings.normalize = self.configuration.normalize;
+
+                let mut domain_parsed = self.domain_parsed.take();
+
+                let mut page = Page::new_page_streaming(
+                    &url,
+                    client,
+                    false,
+                    base,
+                    &self.configuration.external_domains_caseless,
+                    &page_links_settings,
+                    &mut links,
+                    Some(&mut links_ssg),
+                    &mut domain_parsed, // original domain
+                    &mut self.domain_parsed,
+                    &mut links_pages,
+                )
+                .await;
+
+                if self.domain_parsed.is_none() {
+                    if let Some(mut domain_parsed) = domain_parsed.take() {
+                        convert_abs_url(&mut domain_parsed);
+                        self.domain_parsed.replace(domain_parsed);
+                    }
+                }
+
+                let mut retry_count = self.configuration.retry;
+                let domains_caseless = &self.configuration.external_domains_caseless;
+
+                while page.should_retry && retry_count > 0 {
+                    retry_count -= 1;
+                    if let Some(timeout) = page.get_timeout() {
+                        tokio::time::sleep(timeout).await;
+                    }
+
+                    if page.status_code == StatusCode::GATEWAY_TIMEOUT {
+                        let mut domain_parsed_clone = self.domain_parsed.clone();
+
+                        if let Err(elasped) =
+                            tokio::time::timeout(BACKOFF_MAX_DURATION, async {
+                                page.clone_from(
+                                    &Page::new_page_streaming(
+                                        &url,
+                                        client,
+                                        false,
+                                        base,
+                                        domains_caseless,
+                                        &page_links_settings,
+                                        &mut links,
+                                        Some(&mut links_ssg),
+                                        &mut domain_parsed,
+                                        &mut domain_parsed_clone,
+                                        &mut links_pages,
+                                    )
+                                    .await,
+                                );
+                            })
+                            .await
+                        {
+                            log::info!("backoff gateway timeout exceeded {elasped}");
+                        }
+
+                        self.domain_parsed = domain_parsed_clone;
+                    } else {
+                        page.clone_from(
+                            &Page::new_page_streaming(
+                                &url,
+                                client,
+                                false,
+                                base,
+                                &self.configuration.external_domains_caseless,
+                                &page_links_settings,
+                                &mut links,
+                                Some(&mut links_ssg),
+                                &mut domain_parsed,
+                                &mut self.domain_parsed,
+                                &mut links_pages,
+                            )
+                            .await,
+                        );
+                    }
+                }
+
+                emit_log(&url);
+
+                if let Some(signature) = page.signature {
+                    if !self.is_signature_allowed(signature).await {
+                        return Default::default();
+                    }
+                    self.insert_signature(signature).await;
+                }
+
+                self.insert_link(match self.on_link_find_callback {
+                    Some(cb) => {
+                        let c = cb(*self.url.clone(), None);
+                        c.0
+                    }
+                    _ => *self.url.clone(),
+                })
+                .await;
+
+                if page.is_empty() {
+                    self.status = CrawlStatus::Empty;
+                }
+
+                if self.configuration.return_page_links {
+                    page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
+                }
+
+                links.extend(links_ssg);
+
+                self.initial_status_code = page.status_code;
+
+                if page.status_code == reqwest::StatusCode::FORBIDDEN && links.is_empty() {
+                    self.status = CrawlStatus::Blocked;
+                } else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
+                    self.status = CrawlStatus::RateLimited;
+                } else if page.status_code.is_server_error() {
+                    self.status = CrawlStatus::ServerError;
+                }
+
+                if let Some(cb) = self.on_should_crawl_callback {
+                    if !cb(&page) {
+                        page.blocked_crawl = true;
+                        channel_send_page(&self.channel, page, &self.channel_guard);
+                        return Default::default();
+                    }
+                }
+
+                channel_send_page(&self.channel, page, &self.channel_guard);
             }
+        }
-            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
-                continue;
+        links
+    }
+
+    /// Expand links for crawl.
+    #[cfg(all(
+        not(feature = "glob"),
+        not(feature = "decentralized"),
+        feature = "smart"
+    ))]
+    async fn crawl_establish_smart(
+        &mut self,
+        client: &Client,
+        mut base: &mut RelativeSelectors,
+        _: bool,
+        browser: &Arc,
+        context_id: &Option,
+    ) -> HashSet {
+        let links: HashSet = if self
+            .is_allowed_default(&self.get_base_link())
+            .eq(&ProcessLinkStatus::Allowed)
+        {
+            let url = self.url.inner();
+
+            let mut page = Page::new_page(&url, &client).await;
+
+            let mut retry_count = self.configuration.retry;
+
+            while page.should_retry && retry_count > 0 {
+                retry_count -= 1;
+                if let Some(timeout) = page.get_timeout() {
+                    tokio::time::sleep(timeout).await;
+                }
+                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
+                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
+                        if retry_count.is_power_of_two() {
+                            Website::render_chrome_page(
+                                &self.configuration,
+                                client,
+                                browser,
+                                context_id,
+                                &mut page,
+                                url,
+                            )
+                            .await;
+                        } else {
+                            let next_page = Page::new_page(url, &client).await;
+                            page.clone_from(&next_page);
+                        };
+                    })
+                    .await
+                    {
+                        log::warn!("backoff timeout {elasped}");
+                    }
+                } else {
+                    if retry_count.is_power_of_two() {
+                        Website::render_chrome_page(
+                            &self.configuration,
+                            client,
+                            browser,
+                            context_id,
+                            &mut page,
+                            url,
+                        )
+                        .await
+                    } else {
+                        page.clone_from(&Page::new_page(url, &client).await);
+                    }
+                }
             }
-            let mut page = Page::new(&link.inner(), &client).await;
+            let page_links: HashSet = page
+                .smart_links(
+                    &base,
+                    &browser,
+                    &self.configuration,
+                    &context_id,
+                    &self.domain_parsed,
+                )
+                .await;
 
             if let Some(ref domain) = page.final_redirect_destination {
-                let domain: Box = CaseInsensitiveString::new(&domain).into();
                 let prior_domain = self.domain_parsed.take();
-                self.domain_parsed = parse_absolute_url(&domain);
-                self.url = domain;
-                let s = self.setup_selectors();
-                base.0 = s.0;
-                base.1 = s.1;
-                if let Some(pd) = prior_domain {
-                    if let Some(domain_name) = pd.host_str() {
-                        base.2 = domain_name.into();
-                    }
-                }
+                crate::utils::modify_selectors(
+                    &prior_domain,
+                    domain,
+                    &mut self.domain_parsed,
+                    &mut self.url,
+                    &mut base,
+                    AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld),
+                );
             }
 
-            let u = page.get_url().into();
-            let link_result = match self.on_link_find_callback {
-                Some(cb) => cb(u, None),
-                _ => (u, None),
-            };
+            emit_log(&self.url.inner());
 
             if let Some(sid) = page.signature {
                 self.insert_signature(sid).await;
             }
 
-            self.insert_link(link_result.0).await;
+            self.insert_link(match self.on_link_find_callback {
+                Some(cb) => {
+                    let c = cb(*self.url.clone(), None);
 
-            if !page.is_empty() {
-                if self.configuration.return_page_links {
-                    page.page_links = Some(Default::default());
+                    c.0
                 }
-                let page_links = HashSet::from(page.links(&base).await);
+                _ => *self.url.clone(),
+            })
+            .await;
 
-                links.extend(page_links);
+            let links = if !page_links.is_empty() {
+                page_links
             } else {
                 self.status = CrawlStatus::Empty;
+                Default::default()
             };
 
+            self.initial_status_code = page.status_code;
+
+            if page.status_code == reqwest::StatusCode::FORBIDDEN {
+                self.status = CrawlStatus::Blocked;
+            } else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
+                self.status = CrawlStatus::RateLimited;
+            } else if page.status_code.is_server_error() {
+                self.status = CrawlStatus::ServerError;
+            }
+
+            if self.configuration.return_page_links {
+                page.page_links = if links.is_empty() {
+                    None
+                } else {
+                    Some(Box::new(links.clone()))
+                };
+            }
+
+            if let Some(cb) = self.on_should_crawl_callback {
+                if !cb(&page) {
+                    page.blocked_crawl = true;
+                    channel_send_page(&self.channel, page, &self.channel_guard);
+                    return Default::default();
+                }
+            }
+
             channel_send_page(&self.channel, page, &self.channel_guard);
-        }
+
+            links
+        } else {
+            HashSet::new()
+        };
 
         links
     }
 
+    /// fetch the page with chrome
+    #[cfg(all(
+        not(feature = "glob"),
+        not(feature = "decentralized"),
+        feature = "smart"
+    ))]
+    async fn render_chrome_page(
+        config: &Configuration,
+        client: &Client,
+        browser: &Arc,
+        context_id: &Option,
+        page: &mut Page,
+        url: &str,
+    ) {
+        if let Ok(chrome_page) = crate::features::chrome::attempt_navigation(
+            "about:blank",
+            &browser,
+            &config.request_timeout,
+            &context_id,
+            &config.viewport,
+        )
+        .await
+        {
+            crate::features::chrome::setup_chrome_events(&chrome_page, &config).await;
+            let intercept_handle = crate::features::chrome::setup_chrome_interception_base(
+                &chrome_page,
+                config.chrome_intercept.enabled,
+                &config.auth_challenge_response,
+                config.chrome_intercept.block_visuals,
+                &url,
+            )
+            .await;
+
+            let next_page = Page::new(
+                &url,
+                &client,
+                &chrome_page,
+                &config.wait_for,
+                &config.screenshot,
+                false, // we use the initial about:blank page.
+                &config.openai_config,
+                &config.execution_scripts,
+                &config.automation_scripts,
+                &config.viewport,
+                &config.request_timeout,
+            )
+            .await;
+
+            page.clone_from(&next_page);
+
+            if let Some(h) = intercept_handle {
+                let abort_handle = h.abort_handle();
+                if let Err(elasped) =
+                    tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
+                {
+                    log::warn!("Handler timeout exceeded {elasped}");
+                    abort_handle.abort();
+                }
+            }
+        }
+    }
+
     /// Set the crawl status depending on crawl state. The crawl that only changes if the state is Start or Active.
     fn set_crawl_status(&mut self) {
         if self.status == CrawlStatus::Start || self.status == CrawlStatus::Active {
diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml
index 2f4d6a9ec2..f2b4ff94b0 100644
--- a/spider_chrome/Cargo.toml
+++ b/spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_chrome"
-version = "2.27.61"
+version = "2.27.62"
 rust-version = "1.70"
 authors = [
     "j-mendez "
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index 71b7e93707..e47a28e94d 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "2.27.61"
+version = "2.27.62"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml
index 3deaa5ce04..0ebbf72a3d 100644
--- a/spider_transformations/Cargo.toml
+++ b/spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_transformations"
-version = "2.27.61"
+version = "2.27.62"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml
index ff856a5d10..e6699e77e4 100644
--- a/spider_utils/Cargo.toml
+++ b/spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_utils"
-version = "2.27.61"
+version = "2.27.62"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index de0de7ccb0..910f0c6766 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "2.27.61"
+version = "2.27.62"
 authors = [
     "j-mendez "
 ]
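
For reference, a minimal sketch of how the glob-gated `_crawl_establish` path above is typically exercised from user code. This is not part of the patch: it assumes the spider crate's public `Website` API (`Website::new`, `crawl`, `get_links`) and uses an illustrative brace-pattern start URL and site chosen only as a placeholder. Run the shipped example with `cargo run --example url_glob --features glob`.

    extern crate spider;

    use spider::tokio;
    use spider::website::Website;

    #[tokio::main]
    async fn main() {
        // With the `glob` feature enabled, a brace pattern like the one below is
        // expanded into multiple start URLs before the crawl begins
        // (pattern and domain are illustrative, not taken from this patch).
        let mut website = Website::new("https://choosealicense.com/licenses/{mit,apache-2.0}/");

        // Crawl every expanded seed URL and anything discovered from them.
        website.crawl().await;

        // Print the links collected across all expanded seeds.
        for link in website.get_links() {
            println!("- {:?}", link.as_ref());
        }
    }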