bugfix: gracefully handle empty docs/tags when attempting to apply tags (#304)

* bugfix: gracefully handle empty docs/tags when attempting to apply tags

* bugfix: gracefully handle regex creation failures

* fix test
a5huynh committed Feb 8, 2023
1 parent 91e8f37 commit aa251eb
Showing 6 changed files with 122 additions and 71 deletions.
1 change: 1 addition & 0 deletions Cargo.lock


1 change: 1 addition & 0 deletions crates/entities/Cargo.toml
@@ -17,6 +17,7 @@ shared = { path = "../shared" }
strum = "0.24"
strum_macros = "0.24"
tantivy = "0.18"
thiserror = "1.0.38"
tokio = { version = "1", features = ["full"] }
url = "2.2"

36 changes: 24 additions & 12 deletions crates/entities/src/models/crawl_queue.rs
@@ -1,5 +1,3 @@
use std::collections::HashSet;

use regex::RegexSet;
use sea_orm::entity::prelude::*;
use sea_orm::sea_query::{OnConflict, Query, SqliteQueryBuilder};
@@ -8,6 +6,8 @@ use sea_orm::{
QueryOrder, QueryTrait, Set, Statement,
};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use thiserror::Error;
use url::Url;

use super::crawl_tag;
@@ -19,6 +19,14 @@ use shared::regex::{regex_for_domain, regex_for_prefix};

const MAX_RETRIES: u8 = 5;

#[derive(Debug, Error)]
pub enum EnqueueError {
#[error("Database error: {0}")]
DbError(#[from] sea_orm::DbErr),
#[error("other enqueue error: {0}")]
Other(#[from] anyhow::Error),
}

#[derive(Debug, Clone, PartialEq, EnumIter, DeriveActiveEnum, Serialize, Deserialize, Eq)]
#[sea_orm(rs_type = "String", db_type = "String(None)")]
pub enum TaskErrorType {
@@ -431,7 +439,7 @@ fn filter_urls(
settings: &UserSettings,
overrides: &EnqueueSettings,
urls: &[String],
) -> Vec<String> {
) -> anyhow::Result<Vec<String>> {
let mut allow_list: Vec<String> = Vec::new();
let mut skip_list: Vec<String> = Vec::new();
let mut restrict_list: Vec<String> = Vec::new();
@@ -447,12 +455,13 @@
restrict_list.extend(ruleset.restrict_list);
}

let allow_list = RegexSet::new(allow_list).expect("Unable to create allow list");
let skip_list = RegexSet::new(skip_list).expect("Unable to create skip list");
let restrict_list = RegexSet::new(restrict_list).expect("Unable to create restrict list");
let allow_list = RegexSet::new(allow_list)?;
let skip_list = RegexSet::new(skip_list)?;
let restrict_list = RegexSet::new(restrict_list)?;

// Ignore invalid URLs
urls.iter()
let res = urls
.iter()
.filter_map(|url| {
if let Ok(mut parsed) = Url::parse(url) {
// Check that we can handle this scheme
@@ -496,7 +505,9 @@

None
})
.collect::<Vec<String>>()
.collect::<Vec<String>>();

Ok(res)
}

pub async fn enqueue_local_files(
@@ -558,9 +569,9 @@ pub async fn enqueue_all(
settings: &UserSettings,
overrides: &EnqueueSettings,
pipeline: Option<String>,
) -> anyhow::Result<(), sea_orm::DbErr> {
) -> anyhow::Result<(), EnqueueError> {
// Filter URLs
let urls = filter_urls(lenses, settings, overrides, urls);
let urls = filter_urls(lenses, settings, overrides, urls).unwrap_or_default();

// Ignore urls already indexed
let mut is_indexed: HashSet<String> = HashSet::with_capacity(urls.len());
@@ -586,7 +597,7 @@
if let Ok(parsed) = Url::parse(&url) {
let domain = match parsed.scheme() {
"file" => "localhost",
_ => parsed.host_str().expect("Invalid URL host"),
_ => parsed.host_str()?,
};

result = Some(ActiveModel {
@@ -1191,7 +1202,8 @@
"https://www.reddit.com/submit?title=The%20Epic%20of%20Humanity&url=https://bahaiworld.bahai.org/library/the-epic-of-humanity".into()
];

let mut filtered = filter_urls(&[lens], &settings, &overrides, &to_enqueue);
let mut filtered = filter_urls(&[lens], &settings, &overrides, &to_enqueue)
.expect("Unable to filter urls");
assert_eq!(filtered.len(), 1);
assert_eq!(
filtered.pop(),
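The changes above replace the `expect` calls around `RegexSet::new` with `?`, so a malformed lens rule no longer panics the crawl queue, and `enqueue_all` now returns the new `EnqueueError`. A minimal, self-contained sketch of the same pattern follows; the `build_filters` and `enqueue` names are hypothetical stand-ins, not part of the commit:

```rust
use regex::RegexSet;
use thiserror::Error;

// Hypothetical stand-in mirroring the EnqueueError added in this commit.
#[derive(Debug, Error)]
pub enum EnqueueError {
    #[error("other enqueue error: {0}")]
    Other(#[from] anyhow::Error),
}

// Like the new filter_urls: an invalid pattern surfaces as Err(...)
// instead of panicking via `expect`.
fn build_filters(patterns: Vec<String>) -> anyhow::Result<RegexSet> {
    Ok(RegexSet::new(patterns)?)
}

// Like the new enqueue_all: `?` lifts the anyhow::Error into
// EnqueueError through the #[from] conversion.
fn enqueue(patterns: Vec<String>) -> Result<(), EnqueueError> {
    let _filters = build_filters(patterns)?;
    Ok(())
}

fn main() {
    // "(" is not a valid regex, so the rules are rejected gracefully.
    assert!(enqueue(vec!["^https://".into(), "(".into()]).is_err());
}
```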
13 changes: 10 additions & 3 deletions crates/entities/src/models/indexed_document.rs
@@ -151,7 +156,12 @@ pub async fn insert_tags_for_docs<C: ConnectionTrait>(
db: &C,
docs: &[Model],
tags: &[i64],
) -> Result<InsertResult<document_tag::ActiveModel>, DbErr> {
) -> Result<(), DbErr> {
// Nothing to do if we have no docs or tags
if docs.is_empty() || tags.is_empty() {
return Ok(());
}

// Remove dupes before adding
let tags: HashSet<i64> = HashSet::from_iter(tags.iter().cloned());
let doc_ids: Vec<i64> = docs.iter().map(|m| m.id).collect();
@@ -187,7 +192,9 @@
.to_owned(),
)
.exec(db)
.await
.await?;

Ok(())
}

/// Inserts an entry into the tag table for each document and
@@ -196,7 +203,7 @@ pub async fn insert_tags_many<C: ConnectionTrait>(
db: &C,
docs: &[Model],
tags: &[TagPair],
) -> Result<InsertResult<document_tag::ActiveModel>, DbErr> {
) -> Result<(), DbErr> {
let mut tag_ids: Vec<i64> = Vec::new();
for (label, value) in tags.iter() {
match get_or_create(db, label.to_owned(), value).await {
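With this change, `insert_tags_for_docs` and `insert_tags_many` return `Result<(), DbErr>` instead of an `InsertResult`, which lets them bail out early when there is nothing to insert rather than issuing a bulk INSERT with zero rows. A dependency-free sketch of the guard; the plain-slice signature and `String` error are illustrative, not sea-orm's API:

```rust
use std::collections::HashSet;

// Illustrative guard: plain slices stand in for the sea-orm models
// and connection used by the real insert_tags_for_docs.
fn insert_tags_for_docs(doc_ids: &[i64], tags: &[i64]) -> Result<(), String> {
    // Nothing to do if we have no docs or tags; bail before building
    // a bulk INSERT with zero rows.
    if doc_ids.is_empty() || tags.is_empty() {
        return Ok(());
    }

    // Remove dupes before adding, as the real function does.
    let tags: HashSet<i64> = tags.iter().cloned().collect();
    for doc_id in doc_ids {
        for tag_id in &tags {
            println!("would insert (doc_id: {doc_id}, tag_id: {tag_id})");
        }
    }
    Ok(())
}

fn main() {
    // Empty inputs now succeed as a no-op instead of erroring.
    assert!(insert_tags_for_docs(&[], &[1, 2]).is_ok());
    assert!(insert_tags_for_docs(&[10], &[1, 1, 2]).is_ok());
}
```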
102 changes: 46 additions & 56 deletions crates/spyglass/src/api/route.rs
@@ -7,15 +7,14 @@ use entities::models::{
bootstrap_queue, connection::get_all_connections, crawl_queue, fetch_history, indexed_document,
lens, tag,
};
use entities::schema::{DocFields, SearchDocument};
use entities::sea_orm;
use entities::sea_orm::{prelude::*, sea_query, sea_query::Expr, QueryOrder, Set};
use entities::sea_orm::{FromQueryResult, JoinType, QuerySelect};
use jsonrpsee::core::Error;
use libspyglass::connection::{self, credentials, handle_authorize_connection};
use libspyglass::filesystem;
use libspyglass::plugin::PluginCommand;
use libspyglass::search::Searcher;
use libspyglass::search::{document_to_struct, Searcher};
use libspyglass::state::AppState;
use libspyglass::task::{AppPause, CleanupTask, ManagerCommand};
use shared::config::Config;
@@ -406,8 +405,6 @@ pub async fn search(
.await;

let start = SystemTime::now();
let fields = DocFields::as_fields();

let index = &state.index;
let searcher = index.reader.searcher();

@@ -427,59 +424,52 @@

let mut results: Vec<SearchResult> = Vec::new();
let mut missing: Vec<(String, String)> = Vec::new();

let terms = search_req.query.split_whitespace().collect::<HashSet<_>>();
for (score, doc_addr) in docs {
if let Ok(retrieved) = searcher.doc(doc_addr) {
let doc_id = retrieved
.get_first(fields.id)
.expect("Missing doc_id in schema");
let domain = retrieved
.get_first(fields.domain)
.expect("Missing domain in schema");
let title = retrieved
.get_first(fields.title)
.expect("Missing title in schema");
let description = retrieved
.get_first(fields.description)
.expect("Missing description in schema");
let url = retrieved
.get_first(fields.url)
.expect("Missing url in schema");

log::debug!("Got id with url {:?} {:?}", doc_id, url);
if let Some(doc_id) = doc_id.as_text() {
let indexed = indexed_document::Entity::find()
.filter(indexed_document::Column::DocId.eq(doc_id))
.one(&state.db)
.await;

let crawl_uri = url.as_text().unwrap_or_default().to_string();
match indexed {
Ok(Some(indexed)) => {
let tags = indexed
.find_related(tag::Entity)
.all(&state.db)
.await
.unwrap_or_default()
.iter()
.map(|tag| (tag.label.as_ref().to_string(), tag.value.clone()))
.collect::<Vec<(String, String)>>();

let result = SearchResult {
doc_id: doc_id.to_string(),
domain: domain.as_text().unwrap_or_default().to_string(),
title: title.as_text().unwrap_or_default().to_string(),
crawl_uri: crawl_uri.clone(),
description: description.as_text().unwrap_or_default().to_string(),
url: indexed.open_url.unwrap_or(crawl_uri),
tags,
score,
};

results.push(result);
}
_ => {
missing.push((doc_id.to_owned(), crawl_uri.to_owned()));
}
if let Ok(Ok(doc)) = searcher.doc(doc_addr).map(|doc| document_to_struct(&doc)) {
log::debug!("Got id with url {} {}", doc.doc_id, doc.url);
let indexed = indexed_document::Entity::find()
.filter(indexed_document::Column::DocId.eq(doc.doc_id.clone()))
.one(&state.db)
.await;

let crawl_uri = doc.url;
match indexed {
Ok(Some(indexed)) => {
let tags = indexed
.find_related(tag::Entity)
.all(&state.db)
.await
.unwrap_or_default()
.iter()
.map(|tag| (tag.label.as_ref().to_string(), tag.value.clone()))
.collect::<Vec<(String, String)>>();

let matched_indices = doc
.content
.split_whitespace()
.enumerate()
.filter(|(_, w)| terms.contains(w))
.map(|(idx, _)| idx)
.collect::<Vec<_>>();

dbg!(matched_indices);
let result = SearchResult {
doc_id: doc.doc_id.clone(),
domain: doc.domain,
title: doc.title,
crawl_uri: crawl_uri.clone(),
description: doc.description,
url: indexed.open_url.unwrap_or(crawl_uri),
tags,
score,
};

results.push(result);
}
_ => {
missing.push((doc.doc_id.to_owned(), crawl_uri.to_owned()));
}
}
}
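The search route now funnels each tantivy hit through `document_to_struct` (added in `search/mod.rs` below) instead of unwrapping every field with `expect`, and it records which whitespace-separated words of the content match the query terms. A standalone sketch of that matching step, using exact word equality as in the code above; the free function is illustrative only:

```rust
use std::collections::HashSet;

// Standalone sketch of the term-matching logic added to the search
// route: collect the word offsets in `content` that match query terms.
fn matched_indices(content: &str, query: &str) -> Vec<usize> {
    let terms = query.split_whitespace().collect::<HashSet<_>>();
    content
        .split_whitespace()
        .enumerate()
        .filter(|(_, w)| terms.contains(w))
        .map(|(idx, _)| idx)
        .collect()
}

fn main() {
    let content = "rust is a systems programming language";
    // Words 0 ("rust") and 3 ("systems") match the query.
    assert_eq!(matched_indices(content, "rust systems"), vec![0, 3]);
}
```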
40 changes: 40 additions & 0 deletions crates/spyglass/src/search/mod.rs
@@ -4,6 +4,7 @@ use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::time::Instant;

use anyhow::anyhow;
use migration::{Expr, Func};
use tantivy::collector::TopDocs;
use tantivy::directory::MmapDirectory;
@@ -320,6 +321,45 @@ async fn get_tag_checks(db: &DatabaseConnection, search: &str) -> Option<Vec<i64>>
None
}

pub struct RetrievedDocument {
pub doc_id: String,
pub domain: String,
pub title: String,
pub description: String,
pub content: String,
pub url: String,
}

fn field_to_string(doc: &Document, field: Field) -> String {
doc.get_first(field)
.map(|x| x.as_text().unwrap_or_default())
.map(|x| x.to_string())
.unwrap_or_default()
}

pub fn document_to_struct(doc: &Document) -> anyhow::Result<RetrievedDocument> {
let fields = DocFields::as_fields();
let doc_id = field_to_string(doc, fields.id);
if doc_id.is_empty() {
return Err(anyhow!("Invalid doc_id"));
}

let domain = field_to_string(doc, fields.domain);
let title = field_to_string(doc, fields.title);
let description = field_to_string(doc, fields.description);
let url = field_to_string(doc, fields.url);
let content = field_to_string(doc, fields.content);

Ok(RetrievedDocument {
doc_id,
domain,
title,
description,
content,
url,
})
}

#[cfg(test)]
mod test {
use crate::search::{DocumentUpdate, IndexPath, Searcher};
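In the new helpers above, `field_to_string` collapses a missing or non-text tantivy value to an empty string, and `document_to_struct` rejects only a document whose `doc_id` ends up empty. A dependency-free sketch of that defaulting rule; plain `Option<&str>` and the `doc_to_pair` name are stand-ins for tantivy's field values, not the real API:

```rust
// Plain Option<&str> stands in for a tantivy field's as_text() here.
fn field_to_string(value: Option<&str>) -> String {
    value.unwrap_or_default().to_string()
}

// Mirrors document_to_struct: any field may be absent except doc_id,
// which must be non-empty for the document to be usable.
fn doc_to_pair(
    doc_id: Option<&str>,
    title: Option<&str>,
) -> Result<(String, String), String> {
    let doc_id = field_to_string(doc_id);
    if doc_id.is_empty() {
        return Err("Invalid doc_id".to_string());
    }
    Ok((doc_id, field_to_string(title)))
}

fn main() {
    // A missing title is tolerated; a missing id is not.
    assert!(doc_to_pair(Some("abc123"), None).is_ok());
    assert!(doc_to_pair(None, Some("hello")).is_err());
}
```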
