Skip to content

Commit

Permalink
refactor(storage): improve inverted index match phrase query (#16547)
Browse files (browse the repository at this point in the history)
* refactor(storage): improve inverted index match phrase query

* fix typos

* fix machete

* fix machete
  • Loading branch information
b41sh committed Sep 30, 2024
1 parent 0ed932b commit abd8266
Show file tree
Hide file tree
Showing 17 changed files with 825 additions and 771 deletions.
23 changes: 11 additions & 12 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ openai_api_rust = { git = "https://github.com/datafuse-extras/openai-api", rev =
orc-rust = { git = "https://github.com/datafuse-extras/datafusion-orc", rev = "03372b97" }
recursive = { git = "https://github.com/datafuse-extras/recursive.git", rev = "6af35a1" }
sled = { git = "https://github.com/datafuse-extras/sled", tag = "v0.34.7-datafuse.1" }
tantivy = { git = "https://github.com/datafuse-extras/tantivy", rev = "37aeac0" }
tantivy-common = { git = "https://github.com/datafuse-extras/tantivy", rev = "37aeac0", package = "tantivy-common" }
tantivy-jieba = { git = "https://github.com/datafuse-extras/tantivy-jieba", rev = "124a8fc" }
tantivy = { git = "https://github.com/datafuse-extras/tantivy", rev = "7502370" }
tantivy-common = { git = "https://github.com/datafuse-extras/tantivy", rev = "7502370", package = "tantivy-common" }
tantivy-jieba = { git = "https://github.com/datafuse-extras/tantivy-jieba", rev = "0e300e9" }
xorfilter-rs = { git = "https://github.com/datafuse-extras/xorfilter", tag = "databend-alpha.4" }
4 changes: 3 additions & 1 deletion src/common/arrow/src/arrow/array/list/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,9 @@ impl<O: Offset, M: MutableArray> MutableListArray<O, M> {
pub fn try_push_valid(&mut self) -> Result<()> {
let total_length = self.values.len();
let offset = self.offsets.last().to_usize();
let length = total_length.checked_sub(offset).ok_or(Error::Overflow)?;
let length = total_length
.checked_sub(offset)
.ok_or_else(|| Error::Overflow)?;

self.offsets.try_push_usize(length)?;
if let Some(validity) = &mut self.validity {
Expand Down
22 changes: 15 additions & 7 deletions src/common/arrow/src/arrow/offset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,9 @@ impl<O: Offset> Offsets<O> {
pub fn try_push(&mut self, length: O) -> Result<(), Error> {
let old_length = self.last();
assert!(length >= O::zero());
let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?;
let new_length = old_length
.checked_add(&length)
.ok_or_else(|| Error::Overflow)?;
self.0.push(new_length);
Ok(())
}
Expand All @@ -140,10 +142,12 @@ impl<O: Offset> Offsets<O> {
/// * checks that this length does not overflow
#[inline]
pub fn try_push_usize(&mut self, length: usize) -> Result<(), Error> {
let length = O::from_usize(length).ok_or(Error::Overflow)?;
let length = O::from_usize(length).ok_or_else(|| Error::Overflow)?;

let old_length = self.last();
let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?;
let new_length = old_length
.checked_add(&length)
.ok_or_else(|| Error::Overflow)?;
self.0.push(new_length);
Ok(())
}
Expand Down Expand Up @@ -267,8 +271,8 @@ impl<O: Offset> Offsets<O> {

let last_offset = original_offset
.checked_add(total_length)
.ok_or(Error::Overflow)?;
O::from_usize(last_offset).ok_or(Error::Overflow)?;
.ok_or_else(|| Error::Overflow)?;
O::from_usize(last_offset).ok_or_else(|| Error::Overflow)?;
Ok(())
}

Expand All @@ -279,7 +283,9 @@ impl<O: Offset> Offsets<O> {
let mut length = *self.last();
let other_length = *other.last();
// check if the operation would overflow
length.checked_add(&other_length).ok_or(Error::Overflow)?;
length
.checked_add(&other_length)
.ok_or_else(|| Error::Overflow)?;

let lengths = other.as_slice().windows(2).map(|w| w[1] - w[0]);
let offsets = lengths.map(|new_length| {
Expand All @@ -306,7 +312,9 @@ impl<O: Offset> Offsets<O> {
let other_length = other.last().expect("Length to be non-zero");
let mut length = *self.last();
// check if the operation would overflow
length.checked_add(other_length).ok_or(Error::Overflow)?;
length
.checked_add(other_length)
.ok_or_else(|| Error::Overflow)?;

let lengths = other.windows(2).map(|w| w[1] - w[0]);
let offsets = lengths.map(|new_length| {
Expand Down
4 changes: 1 addition & 3 deletions src/query/ee/tests/it/inverted_index/index_refresh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> {
&index_version,
);

let field_nums = query_fields.len();
let has_score = true;
let need_position = false;
let mut field_ids = HashSet::new();
Expand Down Expand Up @@ -177,15 +176,14 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> {
let matched_rows = index_reader
.clone()
.do_filter(
field_nums,
need_position,
has_score,
query.box_clone(),
&field_ids,
&index_record,
&fuzziness,
tokenizer_manager,
block_meta.row_count as u32,
block_meta.row_count,
&index_loc,
)
.await?;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ impl Transform for TransformAddInternalColumns {

fn transform(&mut self, mut block: DataBlock) -> Result<DataBlock> {
if let Some(meta) = block.take_meta() {
let internal_column_meta =
InternalColumnMeta::downcast_from(meta).ok_or(ErrorCode::Internal("It's a bug"))?;
let internal_column_meta = InternalColumnMeta::downcast_from(meta)
.ok_or_else(|| ErrorCode::Internal("It's a bug"))?;
let num_rows = block.num_rows();
for internal_column in self.internal_columns.values() {
let column =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ impl FlightSqlServiceImpl {
pub(super) fn get_user_password(metadata: &MetadataMap) -> Result<(String, String), String> {
let basic = "Basic ";
let authorization = Self::get_header_value(metadata, "authorization")
.ok_or("authorization not parsable".to_string())?;
.ok_or_else(|| "authorization not parsable".to_string())?;

if !authorization.starts_with(basic) {
return Err(format!("Auth type not implemented: {authorization}"));
Expand Down
10 changes: 5 additions & 5 deletions src/query/service/src/servers/http/v1/session/refresh_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ pub async fn refresh_handler(
let mgr = ClientSessionManager::instance();
match &ctx.credential {
Credential::Jwt { .. } => {
let session_id =
req.session_id
.ok_or(HttpErrorCode::bad_request(ErrorCode::BadArguments(
"JWT session should provide session_id when refresh session",
)))?;
let session_id = req.session_id.ok_or_else(|| {
HttpErrorCode::bad_request(ErrorCode::BadArguments(
"JWT session should provide session_id when refresh session",
))
})?;
mgr.refresh_in_memory_states(&session_id);

let tenant = ctx.session.get_current_tenant();
Expand Down
2 changes: 1 addition & 1 deletion src/query/sql/src/planner/planner_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ impl TableRefVisitor {

let func_name = func.name.name.to_lowercase();
// If the function is not suitable for caching, we should not cache the plan
if !is_cacheable_function(&func_name) || func_name == "score" {
if !is_cacheable_function(&func_name) {
self.cache_miss = true;
}
}
Expand Down
3 changes: 1 addition & 2 deletions src/query/storages/common/index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ ignored = ["xorfilter-rs", "match-template"]
[dependencies]
anyerror = { workspace = true }
cbordata = { version = "0.6.0" }
crc32fast = "1.3.2"
databend-common-arrow = { workspace = true }
databend-common-ast = { workspace = true }
databend-common-exception = { workspace = true }
Expand All @@ -29,8 +28,8 @@ levenshtein_automata = "0.2.1"
log = { workspace = true }
match-template = { workspace = true }
parquet = { workspace = true }
roaring = "0.10.1"
serde = { workspace = true }
serde_json = { workspace = true }
tantivy = { workspace = true }
tantivy-common = { workspace = true }
tantivy-fst = "0.5"
Expand Down
Loading

0 comments on commit abd8266

Please sign in to comment.