From db2c6c3eb9ba00321d443ecc9590ce3e06d54bdd Mon Sep 17 00:00:00 2001
From: Mikkel Denker
Date: Fri, 4 Oct 2024 13:15:20 +0200
Subject: [PATCH] increase compaction interval

merging the segments right now takes a write lock for the entirety of
the operation. this causes all searches to time out whenever the live
index compacts its segments. we should actually be able to split up the
merge operation to create the merged segment on a read lock and only
take a write lock when switching and cleaning the old segments for the
new one. increasing the compaction interval is only a temporary fix
---
 crates/core/src/inverted_index/indexing.rs |  2 +-
 crates/core/src/live_index/index.rs        | 57 +++++++++++++++-------
 crates/core/src/live_index/mod.rs          |  6 +--
 3 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/crates/core/src/inverted_index/indexing.rs b/crates/core/src/inverted_index/indexing.rs
index fa7c7fe5..9a4a62c2 100644
--- a/crates/core/src/inverted_index/indexing.rs
+++ b/crates/core/src/inverted_index/indexing.rs
@@ -153,7 +153,7 @@ impl InvertedIndex {
         Ok(())
     }
 
-    #[allow(clippy::missing_panics_doc)] // cannot panic as writer is prepared
+    #[allow(clippy::missing_panics_doc)] // should not panic as writer is prepared
     pub fn merge_segments_by_id(&mut self, segments: &[SegmentId]) -> Result<Option<SegmentId>> {
         self.prepare_writer()?;
 
diff --git a/crates/core/src/live_index/index.rs b/crates/core/src/live_index/index.rs
index f812ff96..5058afd5 100644
--- a/crates/core/src/live_index/index.rs
+++ b/crates/core/src/live_index/index.rs
@@ -142,19 +142,13 @@ impl InnerIndex {
             .unwrap();
 
         self.sync_meta_with_index();
+        self.re_open();
     }
 
     pub fn compact_segments_by_date(&mut self) {
-        let mut segments_by_date: HashMap<NaiveDate, Vec<Segment>> = HashMap::new();
-
-        for segment in self.meta.segments.clone() {
-            segments_by_date
-                .entry(segment.created.date_naive())
-                .or_default()
-                .push(segment);
-        }
+        let segments_to_compact = self.prepare_segments_for_compaction();
 
-        for (_, segments) in segments_by_date {
+        for (_, segments) in segments_to_compact {
             if segments.len() <= 1 {
                 continue;
             }
@@ -162,15 +156,14 @@ impl InnerIndex {
             let segment_ids: Vec<SegmentId> = segments.iter().map(|s| s.id).collect();
             let newest_creation_date = segments.iter().map(|s| s.created).max().unwrap();
 
-            if let Ok(Some(new_segment_id)) =
-                self.index.inverted_index.merge_segments_by_id(&segment_ids)
-            {
-                // Update meta with the new segment, using the newest creation date
-                self.meta.segments.retain(|s| !segment_ids.contains(&s.id));
-                self.meta.segments.push(Segment {
-                    id: new_segment_id,
-                    created: newest_creation_date,
-                });
+            let merge_result = self.index.inverted_index.merge_segments_by_id(&segment_ids);
+
+            if let Ok(Some(new_segment_id)) = merge_result {
+                self.update_meta_after_compaction(
+                    segment_ids,
+                    new_segment_id,
+                    newest_creation_date,
+                );
             }
         }
 
@@ -178,6 +171,34 @@ impl InnerIndex {
         self.re_open();
     }
 
+    fn prepare_segments_for_compaction(&self) -> HashMap<NaiveDate, Vec<Segment>> {
+        let mut segments_by_date: HashMap<NaiveDate, Vec<Segment>> = HashMap::new();
+
+        for segment in self.meta.segments.clone() {
+            segments_by_date
+                .entry(segment.created.date_naive())
+                .or_default()
+                .push(segment);
+        }
+
+        segments_by_date
+    }
+
+    fn update_meta_after_compaction(
+        &mut self,
+        old_segment_ids: Vec<SegmentId>,
+        new_segment_id: SegmentId,
+        newest_creation_date: DateTime<Utc>,
+    ) {
+        self.meta
+            .segments
+            .retain(|s| !old_segment_ids.contains(&s.id));
+        self.meta.segments.push(Segment {
+            id: new_segment_id,
+            created: newest_creation_date,
+        });
+    }
+
     fn re_open(&mut self) {
         self.index.inverted_index.re_open().unwrap();
         self.index.prepare_writer().unwrap();
diff --git a/crates/core/src/live_index/mod.rs b/crates/core/src/live_index/mod.rs
index afc5b732..60448f7d 100644
--- a/crates/core/src/live_index/mod.rs
+++ b/crates/core/src/live_index/mod.rs
@@ -25,8 +25,8 @@ mod index_manager;
 pub use self::crawler::Crawler;
 
 const TTL: Duration = Duration::from_secs(60 * 60 * 24 * 60); // 60 days
-const PRUNE_INTERVAL: Duration = Duration::from_secs(60 * 60); // 1 hour
-const COMPACT_INTERVAL: Duration = Duration::from_secs(60 * 60); // 1 hour
-const AUTO_COMMIT_INTERVAL: Duration = Duration::from_secs(60 * 5); // 5 minutes
+const PRUNE_INTERVAL: Duration = Duration::from_secs(6 * 60 * 60); // 6 hours
+const COMPACT_INTERVAL: Duration = Duration::from_secs(6 * 60 * 60); // 6 hours
+const AUTO_COMMIT_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes
 const EVENT_LOOP_INTERVAL: Duration = Duration::from_secs(5);
 const BATCH_SIZE: usize = 512;