refactor/bugfix: Better unwrap checks/usage for stability #136

Merged 7 commits on Jul 27, 2022
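The change follows one theme across all five files: a bare `unwrap()` either becomes an `expect(...)` whose message names the invariant that broke, or the value is handled with `if let` / a fallback so bad input degrades instead of panicking. A minimal std-only sketch of the three patterns (the function and values below are illustrative, not from the codebase):

```rust
fn parse_port(raw: &str) -> u16 {
    // `if let` handles the failure path explicitly instead of unwrapping.
    if let Ok(port) = raw.parse::<u16>() {
        return port;
    }
    // Fall back to a default rather than panicking on bad input.
    8080
}

fn main() {
    // `expect` still panics, but the message says which invariant broke.
    let default: u16 = "8080".parse().expect("default port must be numeric");
    assert_eq!(parse_port("not-a-port"), default);
    assert_eq!(parse_port("9000"), 9000);
}
```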
crates/spyglass/src/api/mod.rs (2 changes: 1 addition & 1 deletion)
@@ -78,7 +78,7 @@ pub fn start_api_ipc(state: &AppState) -> anyhow::Result<Server, ()> {
let server = ServerBuilder::new(io)
.start(&endpoint)
.map_err(|_| log::warn!("Couldn't open socket"))
.unwrap();
.expect("Unable to open ipc socket");

log::info!("Started IPC server at {}", endpoint);
Ok(server)
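For context on the `map_err` + `expect` pairing above: since `start_api_ipc` already returns `anyhow::Result`, an alternative (not part of this diff) is to attach the message and propagate the error with `?` via `anyhow::Context`. A sketch assuming the `anyhow` and `log` crates the project already uses, with a hypothetical stand-in for the socket call:

```rust
use anyhow::{Context, Result};

// Stand-in for ServerBuilder::start — any fallible call works the same way.
fn open_socket(endpoint: &str) -> std::io::Result<String> {
    Ok(format!("listening on {endpoint}"))
}

fn start_ipc(endpoint: &str) -> Result<String> {
    // context() attaches the message and hands the error to the caller
    // instead of panicking inside the server setup path.
    let server = open_socket(endpoint).context("Unable to open ipc socket")?;
    log::info!("Started IPC server at {}", endpoint);
    Ok(server)
}
```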
crates/spyglass/src/api/route.rs (117 changes: 67 additions & 50 deletions)
@@ -25,28 +25,33 @@ use super::response;
pub async fn add_queue(state: AppState, queue_item: request::QueueItemParam) -> Result<String> {
let db = &state.db;

let parsed = Url::parse(&queue_item.url).unwrap();
let new_task = crawl_queue::ActiveModel {
domain: Set(parsed.host_str().unwrap().to_string()),
url: Set(queue_item.url.to_owned()),
crawl_type: Set(crawl_queue::CrawlType::Normal),
..Default::default()
};
if let Ok(parsed) = Url::parse(&queue_item.url) {
let new_task = crawl_queue::ActiveModel {
domain: Set(parsed.host_str().expect("Invalid host str").to_string()),
url: Set(queue_item.url.to_owned()),
crawl_type: Set(crawl_queue::CrawlType::Normal),
..Default::default()
};

match new_task.insert(db).await {
Ok(_) => Ok("ok".to_string()),
Err(err) => Err(Error {
code: ErrorCode::InternalError,
message: err.to_string(),
data: None,
}),
return match new_task.insert(db).await {
Ok(_) => Ok("ok".to_string()),
Err(err) => Err(Error {
code: ErrorCode::InternalError,
message: err.to_string(),
data: None,
}),
};
}

Ok("ok".to_string())
}

async fn _get_current_status(state: AppState) -> jsonrpc_core::Result<AppStatus> {
// Grab crawler status
let app_state = &state.app_state;
let paused_status = app_state.get("paused").unwrap();
let paused_status = app_state
.entry("paused".to_string())
.or_insert("false".to_string());
let is_paused = *paused_status == *"true";

// Grab details about index
@@ -80,7 +85,7 @@ pub async fn crawl_stats(state: AppState) -> jsonrpc_core::Result<CrawlStats> {
}

let mut by_domain = HashMap::new();
let queue_stats = queue_stats.unwrap();
let queue_stats = queue_stats.expect("Invalid queue_stats");
for stat in queue_stats {
let entry = by_domain
.entry(stat.domain)
@@ -93,7 +98,7 @@ pub async fn crawl_stats(state: AppState) -> jsonrpc_core::Result<CrawlStats> {
}
}

let indexed_stats = indexed_stats.unwrap();
let indexed_stats = indexed_stats.expect("Invalid indexed_stats");
for stat in indexed_stats {
let entry = by_domain
.entry(stat.domain)
@@ -262,24 +267,34 @@ pub async fn search(state: AppState, search_req: request::SearchParam) -> Result

let mut results: Vec<SearchResult> = Vec::new();
for (score, doc_addr) in docs {
let retrieved = searcher.doc(doc_addr).unwrap();

let doc_id = retrieved.get_first(fields.id).unwrap();
let domain = retrieved.get_first(fields.domain).unwrap();
let title = retrieved.get_first(fields.title).unwrap();
let description = retrieved.get_first(fields.description).unwrap();
let url = retrieved.get_first(fields.url).unwrap();

let result = SearchResult {
doc_id: doc_id.as_text().unwrap().to_string(),
domain: domain.as_text().unwrap().to_string(),
title: title.as_text().unwrap().to_string(),
description: description.as_text().unwrap().to_string(),
url: url.as_text().unwrap().to_string(),
score,
};

results.push(result);
if let Ok(retrieved) = searcher.doc(doc_addr) {
let doc_id = retrieved
.get_first(fields.id)
.expect("Missing doc_id in schema");
let domain = retrieved
.get_first(fields.domain)
.expect("Missing domain in schema");
let title = retrieved
.get_first(fields.title)
.expect("Missing title in schema");
let description = retrieved
.get_first(fields.description)
.expect("Missing description in schema");
let url = retrieved
.get_first(fields.url)
.expect("Missing url in schema");

let result = SearchResult {
doc_id: doc_id.as_text().unwrap_or_default().to_string(),
domain: domain.as_text().unwrap_or_default().to_string(),
title: title.as_text().unwrap_or_default().to_string(),
description: description.as_text().unwrap_or_default().to_string(),
url: url.as_text().unwrap_or_default().to_string(),
score,
};

results.push(result);
}
}

let meta = SearchMeta {
@@ -305,30 +320,32 @@ pub async fn search_lenses(
.all(&state.db)
.await;

if let Err(err) = query_results {
log::error!("Unable to search lenses: {:?}", err);
return Err(jsonrpc_core::Error::new(ErrorCode::InternalError));
}
match query_results {
Ok(query_results) => {
for lens in query_results {
results.push(LensResult {
author: lens.author,
title: lens.name,
description: lens.description.unwrap_or_else(|| "".to_string()),
..Default::default()
});
}

let query_results = query_results.unwrap();
for lens in query_results {
results.push(LensResult {
author: lens.author,
title: lens.name,
description: lens.description.unwrap_or_else(|| "".to_string()),
..Default::default()
});
Ok(SearchLensesResp { results })
}
Err(err) => {
log::error!("Unable to search lenses: {:?}", err);
Err(jsonrpc_core::Error::new(ErrorCode::InternalError))
}
}

Ok(SearchLensesResp { results })
}

#[instrument(skip(state))]
pub async fn toggle_pause(state: AppState) -> jsonrpc_core::Result<AppStatus> {
// Scope so that the app_state mutex is correctly released.
{
let app_state = &state.app_state;
let mut paused_status = app_state.get_mut("paused").unwrap();
let mut paused_status = app_state.entry("paused".into()).or_insert("false".into());

let current_status = paused_status.to_string() == "true";
let updated_status = !current_status;
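The `search_lenses` rewrite above replaces an `if let Err` check followed by a later `unwrap()` with a single `match`, so the success and failure paths are handled in one place. A small stand-alone sketch of that shape, where `fetch_names` is a hypothetical stand-in for the database query:

```rust
fn fetch_names() -> Result<Vec<String>, String> {
    Ok(vec!["lens-a".into(), "lens-b".into()])
}

fn list_names() -> Result<Vec<String>, String> {
    // One match covers both arms; there is no later unwrap on a value
    // that was only checked for Err further up.
    match fetch_names() {
        Ok(names) => Ok(names.into_iter().map(|n| n.to_uppercase()).collect()),
        Err(err) => {
            eprintln!("Unable to search lenses: {:?}", err);
            Err(err)
        }
    }
}

fn main() {
    assert_eq!(list_names().unwrap(), vec!["LENS-A", "LENS-B"]);
}
```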
crates/spyglass/src/crawler/mod.rs (38 changes: 22 additions & 16 deletions)
@@ -63,7 +63,9 @@ fn normalize_href(url: &str, href: &str) -> Option<String> {
// Force HTTPS, crawler will fallback to HTTP if necessary.
if let Ok(url) = Url::parse(href) {
let mut url = url;
url.set_scheme("https").unwrap();
if url.scheme() == "http" {
url.set_scheme("https").expect("Unable to set HTTPS scheme");
}
return Some(url.to_string());
}
} else {
@@ -106,24 +108,24 @@ fn determine_canonical(original: &Url, extracted: &Url) -> String {
}

// Only allow overrides on the same root domain.
let origin_dn = parse_domain_name(origin_dn.unwrap());
let extracted_dn = parse_domain_name(extracted_dn.unwrap());
let origin_dn = parse_domain_name(origin_dn.expect("origin_dn should not be None"));
let extracted_dn = parse_domain_name(extracted_dn.expect("extracted_dn should not be None"));

if origin_dn.is_err() || extracted_dn.is_err() {
return original.to_string();
}

let origin_dn = origin_dn.unwrap();
let origin_dn = origin_dn.expect("origin_dn invalid");
let extracted_dn = extracted_dn.expect("extracted_dn invalid");

// Special case for bootstrapper.
if origin_dn.root().is_some() && origin_dn.root().unwrap() == "archive.org" {
return extracted.to_string();
if let Some(root) = origin_dn.root() {
if root == "archive.org" || Some(root) == extracted_dn.root() {
return extracted.to_string();
}
}

if origin_dn.root() == extracted_dn.unwrap().root() {
extracted.to_string()
} else {
original.to_string()
}
original.to_string()
}

impl Crawler {
@@ -208,16 +210,20 @@ impl Crawler {
db: &DatabaseConnection,
id: i64,
) -> anyhow::Result<Option<CrawlResult>, anyhow::Error> {
let crawl = crawl_queue::Entity::find_by_id(id).one(db).await?.unwrap();
let crawl = crawl_queue::Entity::find_by_id(id).one(db).await?;
if crawl.is_none() {
return Ok(None);
}

let crawl = crawl.expect("Invalid crawl model");
// Modify bootstrapped URLs to pull from the Internet Archive
let fetch_url = if crawl.crawl_type == crawl_queue::CrawlType::Bootstrap {
create_archive_url(&crawl.url)
} else {
crawl.url.clone()
};

let url = Url::parse(&fetch_url).expect("Invalid URL");
let url = Url::parse(&fetch_url).expect("Invalid fetch URL");

// Break apart domain + path of the URL
let domain = url.host_str().expect("Invalid URL");
@@ -238,7 +244,7 @@ impl Crawler {
// Check for robots.txt of this domain
// When looking at bootstrapped tasks, check the original URL
if crawl.crawl_type == crawl_queue::CrawlType::Bootstrap {
let og_url = Url::parse(&crawl.url).unwrap();
let og_url = Url::parse(&crawl.url).expect("Invalid crawl URL");
if !check_resource_rules(db, &self.client, &og_url).await? {
return Ok(None);
}
@@ -257,8 +263,8 @@ impl Crawler {
// Check to see if a canonical URL was found, if not use the original
// bootstrapped URL
if crawl.crawl_type == crawl_queue::CrawlType::Bootstrap {
let parsed = Url::parse(&result.url).unwrap();
let domain = parsed.host_str().unwrap();
let parsed = Url::parse(&result.url).expect("Invalid result URL");
let domain = parsed.host_str().expect("Invalid result URL host");
if domain == "web.archive.org" {
result.url = crawl.url.clone();
}
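The crawl-fetch change above swaps `.one(db).await?.unwrap()` for an explicit `is_none()` early return, so a missing queue row yields `Ok(None)` instead of a panic. A sketch of the same early-return style applied to URL host extraction, assuming only the `url` and `anyhow` crates the crawler already depends on:

```rust
use url::Url;

// Returns Ok(None) when the URL has no host (e.g. "data:" URLs),
// instead of unwrapping host_str() and panicking.
fn host_of(raw: &str) -> anyhow::Result<Option<String>> {
    let parsed = Url::parse(raw)?; // parse errors propagate to the caller
    match parsed.host_str() {
        Some(host) => Ok(Some(host.to_string())),
        None => Ok(None),
    }
}
```

A caller can then decide whether a host-less URL is an error or simply something to skip, which mirrors how `fetch` returns `Ok(None)` for an unknown crawl id.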
crates/spyglass/src/crawler/robots.rs (57 changes: 31 additions & 26 deletions)
@@ -41,7 +41,7 @@ pub fn filter_set(rules: &[ParsedRule], allow: bool) -> RegexSet {
.map(|x| x.regex.clone())
.collect();

RegexSet::new(&rules).unwrap()
RegexSet::new(&rules).expect("Invalid regex rules")
}

/// Parse a robots.txt file and return a vector of parsed rules
@@ -81,7 +81,8 @@ pub fn parse(domain: &str, txt: &str) -> Vec<ParsedRule> {
} else if regex.is_none() && prefix.starts_with("disallow") {
rules.push(ParsedRule {
domain: domain.to_string(),
regex: regex_for_robots("/", WildcardType::Regex).unwrap(),
regex: regex_for_robots("/", WildcardType::Regex)
.expect("Invalid robots regex"),
allow_crawl: true,
});
}
@@ -100,7 +101,7 @@ pub async fn check_resource_rules(
client: &HTTPClient,
url: &Url,
) -> anyhow::Result<bool> {
let domain = url.host_str().unwrap();
let domain = url.host_str().unwrap_or_default();
let path = url[url::Position::BeforePath..].to_string();

let rules = resource_rule::Entity::find()
@@ -119,29 +120,29 @@
Ok(res) => {
match res.status() {
StatusCode::OK => {
let body = res.text().await.unwrap();

let parsed_rules = parse(domain, &body);
// No rules? Treat as an allow all
if parsed_rules.is_empty() {
let new_rule = resource_rule::ActiveModel {
domain: Set(domain.to_owned()),
rule: Set("/".to_owned()),
no_index: Set(false),
allow_crawl: Set(true),
..Default::default()
};
new_rule.insert(db).await?;
} else {
for rule in parsed_rules.iter() {
if let Ok(body) = res.text().await {
let parsed_rules = parse(domain, &body);
// No rules? Treat as an allow all
if parsed_rules.is_empty() {
let new_rule = resource_rule::ActiveModel {
domain: Set(rule.domain.to_owned()),
rule: Set(rule.regex.to_owned()),
domain: Set(domain.to_owned()),
rule: Set("/".to_owned()),
no_index: Set(false),
allow_crawl: Set(rule.allow_crawl),
allow_crawl: Set(true),
..Default::default()
};
new_rule.insert(db).await?;
} else {
for rule in parsed_rules.iter() {
let new_rule = resource_rule::ActiveModel {
domain: Set(rule.domain.to_owned()),
rule: Set(rule.regex.to_owned()),
no_index: Set(false),
allow_crawl: Set(rule.allow_crawl),
..Default::default()
};
new_rule.insert(db).await?;
}
}
}
}
@@ -185,11 +186,15 @@ pub async fn check_resource_rules(
if !headers.contains_key(http::header::CONTENT_TYPE) {
return Ok(false);
} else {
let value = headers.get(http::header::CONTENT_TYPE).unwrap();
let value = value.to_str().unwrap();
if !value.to_string().contains(&"text/html") {
log::info!("Unable to crawl: content-type =/= text/html");
return Ok(false);
let value = headers
.get(http::header::CONTENT_TYPE)
.and_then(|header| header.to_str().ok());

if let Some(value) = value {
if !value.to_string().contains(&"text/html") {
log::info!("Unable to crawl: content-type =/= text/html");
return Ok(false);
}
}
}
}
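The content-type check above now goes through `and_then(|header| header.to_str().ok())`, so a missing or non-UTF-8 header no longer panics. The same Option-chaining shape, sketched with a plain `HashMap` standing in for the response header map:

```rust
use std::collections::HashMap;

// HashMap stands in for the HTTP header map; a missing or odd
// content-type simply yields false rather than a panic.
fn is_html(headers: &HashMap<String, String>) -> bool {
    headers
        .get("content-type")
        .map(|value| value.contains("text/html"))
        .unwrap_or(false)
}

fn main() {
    let mut headers = HashMap::new();
    assert!(!is_html(&headers));
    headers.insert("content-type".into(), "text/html; charset=utf-8".into());
    assert!(is_html(&headers));
}
```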
crates/spyglass/src/plugin/exports.rs (13 changes: 10 additions & 3 deletions)
@@ -85,11 +85,18 @@ pub(crate) fn plugin_cmd(env: &PluginEnv) {
if let Ok(conn) = Connection::open(path) {
let stmt = conn.prepare(&query);
if let Ok(mut stmt) = stmt {
let results =
stmt.query_map([], |row| Ok(row.get::<usize, String>(0).unwrap()));
let results = stmt.query_map([], |row| {
Ok(row.get::<usize, String>(0).unwrap_or_default())
});

if let Ok(results) = results {
let collected = results.map(|x| x.unwrap()).collect::<Vec<String>>();
let collected: Vec<String> = results
.map(|x| x.unwrap_or_default())
.collect::<Vec<String>>()
.into_iter()
.filter(|x| !x.is_empty())
.collect();

if let Err(e) = wasi_write(&env.wasi_env, &collected) {
log::error!("{}", e);
}
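The row-collection change above defaults failed rows to an empty string and then filters the empties out in a second pass. An equivalent single-pass alternative (not what this diff does) is `filter_map` with `Result::ok`; a std-only sketch with hypothetical row values standing in for the SQLite query results:

```rust
// Skip failed rows in one pass instead of defaulting to "" and
// filtering the empty strings out afterwards.
fn collect_rows(rows: Vec<Result<String, String>>) -> Vec<String> {
    rows.into_iter().filter_map(|row| row.ok()).collect()
}

fn main() {
    let rows = vec![Ok("a".to_string()), Err("bad row".to_string()), Ok("b".to_string())];
    assert_eq!(collect_rows(rows), vec!["a", "b"]);
}
```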