diff --git a/crates/spyglass/src/api/handler/search.rs b/crates/spyglass/src/api/handler/search.rs
index 3551df207..e46602656 100644
--- a/crates/spyglass/src/api/handler/search.rs
+++ b/crates/spyglass/src/api/handler/search.rs
@@ -15,6 +15,11 @@ use std::collections::HashSet;
 use std::time::SystemTime;
 use tracing::instrument;
 
+/// Max number of tokens we'll look at for matches before stopping.
+const MAX_HIGHLIGHT_SCAN: usize = 10_000;
+/// Max number of matches we need to generate a decent preview.
+const MAX_HIGHLIGHT_MATCHES: usize = 5;
+
 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
 struct WordRange {
     start: usize,
@@ -66,19 +71,22 @@ fn generate_highlight_preview(index: &Searcher, query: &str, content: &str) -> S
         .map(|s| s.to_string())
         .collect::<HashSet<String>>();
 
-    let matched_indices = content
-        .split_whitespace()
-        .enumerate()
-        .filter(|(_, w)| {
-            let normalized = tokenizer
-                .token_stream(w)
-                .next()
-                .map(|t| t.text.clone())
-                .unwrap_or_else(|| w.to_string());
-            terms.contains(&normalized)
-        })
-        .map(|(idx, _)| idx)
-        .collect::<Vec<_>>();
+    // Bound the work on very large documents: tokenize at most MAX_HIGHLIGHT_SCAN
+    // words and stop as soon as MAX_HIGHLIGHT_MATCHES matches have been collected.
+    let mut matched_indices = Vec::with_capacity(MAX_HIGHLIGHT_MATCHES);
+    for (idx, w) in content.split_whitespace().take(MAX_HIGHLIGHT_SCAN).enumerate() {
+        let normalized = tokenizer
+            .token_stream(w)
+            .next()
+            .map(|t| t.text.clone())
+            .unwrap_or_else(|| w.to_string());
+        if terms.contains(&normalized) {
+            matched_indices.push(idx);
+            if matched_indices.len() >= MAX_HIGHLIGHT_MATCHES {
+                break;
+            }
+        }
+    }
 
     // Create word ranges from the indices
     let mut ranges: Vec<WordRange> = Vec::new();