refactor/bugfix: Better unwrap checks/usage for stability #136

Merged 7 commits on Jul 27, 2022
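The change follows one theme across all five files: a bare `unwrap()` either becomes an `expect(...)` whose message names the invariant that broke, or the value is handled with `if let` / a fallback so bad input degrades instead of panicking. A minimal std-only sketch of the three patterns (the function and values below are illustrative, not from the codebase):

```rust
fn parse_port(raw: &str) -> u16 {
    // `if let` handles the failure path explicitly instead of unwrapping.
    if let Ok(port) = raw.parse::<u16>() {
        return port;
    }
    // Fall back to a default rather than panicking on bad input.
    8080
}

fn main() {
    // `expect` still panics, but the message says which invariant broke.
    let default: u16 = "8080".parse().expect("default port must be numeric");
    assert_eq!(parse_port("not-a-port"), default);
    assert_eq!(parse_port("9000"), 9000);
}
```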
crates/spyglass/src/api/mod.rs (2 changes: 1 addition & 1 deletion)
@@ -78,7 +78,7 @@ pub fn start_api_ipc(state: &AppState) -> anyhow::Result<Server, ()> {
let server = ServerBuilder::new(io)
.start(&endpoint)
.map_err(|_| log::warn!("Couldn't open socket"))
.unwrap();
.expect("Unable to open ipc socket");

log::info!("Started IPC server at {}", endpoint);
Ok(server)
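For context on the `map_err` + `expect` pairing above: since `start_api_ipc` already returns `anyhow::Result`, an alternative (not part of this diff) is to attach the message and propagate the error with `?` via `anyhow::Context`. A sketch assuming the `anyhow` and `log` crates the project already uses, with a hypothetical stand-in for the socket call:

```rust
use anyhow::{Context, Result};

// Stand-in for ServerBuilder::start — any fallible call works the same way.
fn open_socket(endpoint: &str) -> std::io::Result<String> {
    Ok(format!("listening on {endpoint}"))
}

fn start_ipc(endpoint: &str) -> Result<String> {
    // context() attaches the message and hands the error to the caller
    // instead of panicking inside the server setup path.
    let server = open_socket(endpoint).context("Unable to open ipc socket")?;
    log::info!("Started IPC server at {}", endpoint);
    Ok(server)
}
```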
crates/spyglass/src/api/route.rs (117 changes: 67 additions & 50 deletions)
@@ -25,28 +25,33 @@ use super::response;
pub async fn add_queue(state: AppState, queue_item: request::QueueItemParam) -> Result<String> {
let db = &state.db;

let parsed = Url::parse(&queue_item.url).unwrap();
let new_task = crawl_queue::ActiveModel {
domain: Set(parsed.host_str().unwrap().to_string()),
url: Set(queue_item.url.to_owned()),
crawl_type: Set(crawl_queue::CrawlType::Normal),
..Default::default()
};
if let Ok(parsed) = Url::parse(&queue_item.url) {
let new_task = crawl_queue::ActiveModel {
domain: Set(parsed.host_str().expect("Invalid host str").to_string()),
url: Set(queue_item.url.to_owned()),
crawl_type: Set(crawl_queue::CrawlType::Normal),
..Default::default()
};

match new_task.insert(db).await {
Ok(_) => Ok("ok".to_string()),
Err(err) => Err(Error {
code: ErrorCode::InternalError,
message: err.to_string(),
data: None,
}),
return match new_task.insert(db).await {
Ok(_) => Ok("ok".to_string()),
Err(err) => Err(Error {
code: ErrorCode::InternalError,
message: err.to_string(),
data: None,
}),
};
}

Ok("ok".to_string())
}

async fn _get_current_status(state: AppState) -> jsonrpc_core::Result<AppStatus> {
// Grab crawler status
let app_state = &state.app_state;
let paused_status = app_state.get("paused").unwrap();
let paused_status = app_state
.entry("paused".to_string())
.or_insert("false".to_string());
let is_paused = *paused_status == *"true";

// Grab details about index
@@ -80,7 +85,7 @@ pub async fn crawl_stats(state: AppState) -> jsonrpc_core::Result<CrawlStats> {
}

let mut by_domain = HashMap::new();
let queue_stats = queue_stats.unwrap();
let queue_stats = queue_stats.expect("Invalid queue_stats");
for stat in queue_stats {
let entry = by_domain
.entry(stat.domain)
@@ -93,7 +98,7 @@ pub async fn crawl_stats(state: AppState) -> jsonrpc_core::Result<CrawlStats> {
}
}

let indexed_stats = indexed_stats.unwrap();
let indexed_stats = indexed_stats.expect("Invalid indexed_stats");
for stat in indexed_stats {
let entry = by_domain
.entry(stat.domain)
@@ -262,24 +267,34 @@ pub async fn search(state: AppState, search_req: request::SearchParam) -> Result

let mut results: Vec<SearchResult> = Vec::new();
for (score, doc_addr) in docs {
let retrieved = searcher.doc(doc_addr).unwrap();

let doc_id = retrieved.get_first(fields.id).unwrap();
let domain = retrieved.get_first(fields.domain).unwrap();
let title = retrieved.get_first(fields.title).unwrap();
let description = retrieved.get_first(fields.description).unwrap();
let url = retrieved.get_first(fields.url).unwrap();

let result = SearchResult {
doc_id: doc_id.as_text().unwrap().to_string(),
domain: domain.as_text().unwrap().to_string(),
title: title.as_text().unwrap().to_string(),
description: description.as_text().unwrap().to_string(),
url: url.as_text().unwrap().to_string(),
score,
};

results.push(result);
if let Ok(retrieved) = searcher.doc(doc_addr) {
let doc_id = retrieved
.get_first(fields.id)
.expect("Missing doc_id in schema");
let domain = retrieved
.get_first(fields.domain)
.expect("Missing domain in schema");
let title = retrieved
.get_first(fields.title)
.expect("Missing title in schema");
let description = retrieved
.get_first(fields.description)
.expect("Missing description in schema");
let url = retrieved
.get_first(fields.url)
.expect("Missing url in schema");

let result = SearchResult {
doc_id: doc_id.as_text().unwrap_or_default().to_string(),
domain: domain.as_text().unwrap_or_default().to_string(),
title: title.as_text().unwrap_or_default().to_string(),
description: description.as_text().unwrap_or_default().to_string(),
url: url.as_text().unwrap_or_default().to_string(),
score,
};

results.push(result);
}
}

let meta = SearchMeta {
@@ -305,30 +320,32 @@ pub async fn search_lenses(
.all(&state.db)
.await;

if let Err(err) = query_results {
log::error!("Unable to search lenses: {:?}", err);
return Err(jsonrpc_core::Error::new(ErrorCode::InternalError));
}
match query_results {
Ok(query_results) => {
for lens in query_results {
results.push(LensResult {
author: lens.author,
title: lens.name,
description: lens.description.unwrap_or_else(|| "".to_string()),
..Default::default()
});
}

let query_results = query_results.unwrap();
for lens in query_results {
results.push(LensResult {
author: lens.author,
title: lens.name,
description: lens.description.unwrap_or_else(|| "".to_string()),
..Default::default()
});
Ok(SearchLensesResp { results })
}
Err(err) => {
log::error!("Unable to search lenses: {:?}", err);
Err(jsonrpc_core::Error::new(ErrorCode::InternalError))
}
}

Ok(SearchLensesResp { results })
}

#[instrument(skip(state))]
pub async fn toggle_pause(state: AppState) -> jsonrpc_core::Result<AppStatus> {
// Scope so that the app_state mutex is correctly released.
{
let app_state = &state.app_state;
let mut paused_status = app_state.get_mut("paused").unwrap();
let mut paused_status = app_state.entry("paused".into()).or_insert("false".into());

let current_status = paused_status.to_string() == "true";
let updated_status = !current_status;
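The `search_lenses` rewrite above replaces an `if let Err` check followed by a later `unwrap()` with a single `match`, so the success and failure paths are handled in one place. A small stand-alone sketch of that shape, where `fetch_names` is a hypothetical stand-in for the database query:

```rust
fn fetch_names() -> Result<Vec<String>, String> {
    Ok(vec!["lens-a".into(), "lens-b".into()])
}

fn list_names() -> Result<Vec<String>, String> {
    // One match covers both arms; there is no later unwrap on a value
    // that was only checked for Err further up.
    match fetch_names() {
        Ok(names) => Ok(names.into_iter().map(|n| n.to_uppercase()).collect()),
        Err(err) => {
            eprintln!("Unable to search lenses: {:?}", err);
            Err(err)
        }
    }
}

fn main() {
    assert_eq!(list_names().unwrap(), vec!["LENS-A", "LENS-B"]);
}
```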
crates/spyglass/src/crawler/mod.rs (38 changes: 22 additions & 16 deletions)
@@ -63,7 +63,9 @@ fn normalize_href(url: &str, href: &str) -> Option<String> {
// Force HTTPS, crawler will fallback to HTTP if necessary.
if let Ok(url) = Url::parse(href) {
let mut url = url;
url.set_scheme("https").unwrap();
if url.scheme() == "http" {
url.set_scheme("https").expect("Unable to set HTTPS scheme");
}
return Some(url.to_string());
}
} else {
@@ -106,24 +108,24 @@ fn determine_canonical(original: &Url, extracted: &Url) -> String {
}

// Only allow overrides on the same root domain.
let origin_dn = parse_domain_name(origin_dn.unwrap());
let extracted_dn = parse_domain_name(extracted_dn.unwrap());
let origin_dn = parse_domain_name(origin_dn.expect("origin_dn should not be None"));
let extracted_dn = parse_domain_name(extracted_dn.expect("extracted_dn should not be None"));

if origin_dn.is_err() || extracted_dn.is_err() {
return original.to_string();
}

let origin_dn = origin_dn.unwrap();
let origin_dn = origin_dn.expect("origin_dn invalid");
let extracted_dn = extracted_dn.expect("extracted_dn invalid");

// Special case for bootstrapper.
if origin_dn.root().is_some() && origin_dn.root().unwrap() == "archive.org" {
return extracted.to_string();
if let Some(root) = origin_dn.root() {
if root == "archive.org" || Some(root) == extracted_dn.root() {
return extracted.to_string();
}
}

if origin_dn.root() == extracted_dn.unwrap().root() {
extracted.to_string()
} else {
original.to_string()
}
original.to_string()
}

impl Crawler {
@@ -208,16 +210,20 @@ impl Crawler {
db: &DatabaseConnection,
id: i64,
) -> anyhow::Result<Option<CrawlResult>, anyhow::Error> {
let crawl = crawl_queue::Entity::find_by_id(id).one(db).await?.unwrap();
let crawl = crawl_queue::Entity::find_by_id(id).one(db).await?;
if crawl.is_none() {
return Ok(None);
}

let crawl = crawl.expect("Invalid crawl model");
// Modify bootstrapped URLs to pull from the Internet Archive
let fetch_url = if crawl.crawl_type == crawl_queue::CrawlType::Bootstrap {
create_archive_url(&crawl.url)
} else {
crawl.url.clone()
};

let url = Url::parse(&fetch_url).expect("Invalid URL");
let url = Url::parse(&fetch_url).expect("Invalid fetch URL");

// Break apart domain + path of the URL
let domain = url.host_str().expect("Invalid URL");
@@ -238,7 +244,7 @@ impl Crawler {
// Check for robots.txt of this domain
// When looking at bootstrapped tasks, check the original URL
if crawl.crawl_type == crawl_queue::CrawlType::Bootstrap {
let og_url = Url::parse(&crawl.url).unwrap();
let og_url = Url::parse(&crawl.url).expect("Invalid crawl URL");
if !check_resource_rules(db, &self.client, &og_url).await? {
return Ok(None);
}
@@ -257,8 +263,8 @@ impl Crawler {
// Check to see if a canonical URL was found, if not use the original
// bootstrapped URL
if crawl.crawl_type == crawl_queue::CrawlType::Bootstrap {
let parsed = Url::parse(&result.url).unwrap();
let domain = parsed.host_str().unwrap();
let parsed = Url::parse(&result.url).expect("Invalid result URL");
let domain = parsed.host_str().expect("Invalid result URL host");
if domain == "web.archive.org" {
result.url = crawl.url.clone();
}
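The crawl-fetch change above swaps `.one(db).await?.unwrap()` for an explicit `is_none()` early return, so a missing queue row yields `Ok(None)` instead of a panic. A sketch of the same early-return style applied to URL host extraction, assuming only the `url` and `anyhow` crates the crawler already depends on:

```rust
use url::Url;

// Returns Ok(None) when the URL has no host (e.g. "data:" URLs),
// instead of unwrapping host_str() and panicking.
fn host_of(raw: &str) -> anyhow::Result<Option<String>> {
    let parsed = Url::parse(raw)?; // parse errors propagate to the caller
    match parsed.host_str() {
        Some(host) => Ok(Some(host.to_string())),
        None => Ok(None),
    }
}
```

A caller can then decide whether a host-less URL is an error or simply something to skip, which mirrors how `fetch` returns `Ok(None)` for an unknown crawl id.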
crates/spyglass/src/crawler/robots.rs (57 changes: 31 additions & 26 deletions)
@@ -41,7 +41,7 @@ pub fn filter_set(rules: &[ParsedRule], allow: bool) -> RegexSet {
.map(|x| x.regex.clone())
.collect();

RegexSet::new(&rules).unwrap()
RegexSet::new(&rules).expect("Invalid regex rules")
}

/// Parse a robots.txt file and return a vector of parsed rules
@@ -81,7 +81,8 @@ pub fn parse(domain: &str, txt: &str) -> Vec<ParsedRule> {
} else if regex.is_none() && prefix.starts_with("disallow") {
rules.push(ParsedRule {
domain: domain.to_string(),
regex: regex_for_robots("/", WildcardType::Regex).unwrap(),
regex: regex_for_robots("/", WildcardType::Regex)
.expect("Invalid robots regex"),
allow_crawl: true,
});
}
@@ -100,7 +101,7 @@ pub async fn check_resource_rules(
client: &HTTPClient,
url: &Url,
) -> anyhow::Result<bool> {
let domain = url.host_str().unwrap();
let domain = url.host_str().unwrap_or_default();
let path = url[url::Position::BeforePath..].to_string();

let rules = resource_rule::Entity::find()
@@ -119,29 +120,29 @@
Ok(res) => {
match res.status() {
StatusCode::OK => {
let body = res.text().await.unwrap();

let parsed_rules = parse(domain, &body);
// No rules? Treat as an allow all
if parsed_rules.is_empty() {
let new_rule = resource_rule::ActiveModel {
domain: Set(domain.to_owned()),
rule: Set("/".to_owned()),
no_index: Set(false),
allow_crawl: Set(true),
..Default::default()
};
new_rule.insert(db).await?;
} else {
for rule in parsed_rules.iter() {
if let Ok(body) = res.text().await {
let parsed_rules = parse(domain, &body);
// No rules? Treat as an allow all
if parsed_rules.is_empty() {
let new_rule = resource_rule::ActiveModel {
domain: Set(rule.domain.to_owned()),
rule: Set(rule.regex.to_owned()),
domain: Set(domain.to_owned()),
rule: Set("/".to_owned()),
no_index: Set(false),
allow_crawl: Set(rule.allow_crawl),
allow_crawl: Set(true),
..Default::default()
};
new_rule.insert(db).await?;
} else {
for rule in parsed_rules.iter() {
let new_rule = resource_rule::ActiveModel {
domain: Set(rule.domain.to_owned()),
rule: Set(rule.regex.to_owned()),
no_index: Set(false),
allow_crawl: Set(rule.allow_crawl),
..Default::default()
};
new_rule.insert(db).await?;
}
}
}
}
@@ -185,11 +186,15 @@ pub async fn check_resource_rules(
if !headers.contains_key(http::header::CONTENT_TYPE) {
return Ok(false);
} else {
let value = headers.get(http::header::CONTENT_TYPE).unwrap();
let value = value.to_str().unwrap();
if !value.to_string().contains(&"text/html") {
log::info!("Unable to crawl: content-type =/= text/html");
return Ok(false);
let value = headers
.get(http::header::CONTENT_TYPE)
.and_then(|header| header.to_str().ok());

if let Some(value) = value {
if !value.to_string().contains(&"text/html") {
log::info!("Unable to crawl: content-type =/= text/html");
return Ok(false);
}
}
}
}
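The content-type check above now goes through `and_then(|header| header.to_str().ok())`, so a missing or non-UTF-8 header no longer panics. The same Option-chaining shape, sketched with a plain `HashMap` standing in for the response header map:

```rust
use std::collections::HashMap;

// HashMap stands in for the HTTP header map; a missing or odd
// content-type simply yields false rather than a panic.
fn is_html(headers: &HashMap<String, String>) -> bool {
    headers
        .get("content-type")
        .map(|value| value.contains("text/html"))
        .unwrap_or(false)
}

fn main() {
    let mut headers = HashMap::new();
    assert!(!is_html(&headers));
    headers.insert("content-type".into(), "text/html; charset=utf-8".into());
    assert!(is_html(&headers));
}
```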
crates/spyglass/src/plugin/exports.rs (13 changes: 10 additions & 3 deletions)
@@ -85,11 +85,18 @@ pub(crate) fn plugin_cmd(env: &PluginEnv) {
if let Ok(conn) = Connection::open(path) {
let stmt = conn.prepare(&query);
if let Ok(mut stmt) = stmt {
let results =
stmt.query_map([], |row| Ok(row.get::<usize, String>(0).unwrap()));
let results = stmt.query_map([], |row| {
Ok(row.get::<usize, String>(0).unwrap_or_default())
});

if let Ok(results) = results {
let collected = results.map(|x| x.unwrap()).collect::<Vec<String>>();
let collected: Vec<String> = results
.map(|x| x.unwrap_or_default())
.collect::<Vec<String>>()
.into_iter()
.filter(|x| !x.is_empty())
.collect();

if let Err(e) = wasi_write(&env.wasi_env, &collected) {
log::error!("{}", e);
}
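The row-collection change above defaults failed rows to an empty string and then filters the empties out in a second pass. An equivalent single-pass alternative (not what this diff does) is `filter_map` with `Result::ok`; a std-only sketch with hypothetical row values standing in for the SQLite query results:

```rust
// Skip failed rows in one pass instead of defaulting to "" and
// filtering the empty strings out afterwards.
fn collect_rows(rows: Vec<Result<String, String>>) -> Vec<String> {
    rows.into_iter().filter_map(|row| row.ok()).collect()
}

fn main() {
    let rows = vec![Ok("a".to_string()), Err("bad row".to_string()), Ok("b".to_string())];
    assert_eq!(collect_rows(rows), vec!["a", "b"]);
}
```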