From 64b25bc37afb82a8997e106e0ec1a4c1439f2117 Mon Sep 17 00:00:00 2001 From: Daniel Martinez Maqueda Date: Tue, 26 Jul 2022 14:13:45 +0200 Subject: [PATCH 1/5] Fix escaped like wildcards Added a new function that replaces the like wildcards '%' and '_' for the regex counterparts before executing them. It also takes into account that the wildcards can be escaped, in that case, it does remove the escape characters and leaves the wildcards so that they are matched against the raw character. This is implemented iterating over all the characters of the pattern to figure out when it needs to be transformed or not. --- arrow/src/compute/kernels/comparison.rs | 75 +++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 6 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 7733ce67a76e..9237ee7ae5f6 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -169,7 +169,7 @@ where let re = if let Some(ref regex) = map.get(pat) { regex } else { - let re_pattern = escape(pat).replace('%', ".*").replace('_', "."); + let re_pattern = replace_like_wildcards(pat)?; let re = op(&re_pattern)?; map.insert(pat, re); map.get(pat).unwrap() @@ -248,7 +248,9 @@ pub fn like_utf8_scalar( bit_util::set_bit(bool_slice, i); } } - } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use starts_with let starts_with = &right[..right.len() - 1]; @@ -266,7 +268,7 @@ pub fn like_utf8_scalar( } } } else { - let re_pattern = escape(right).replace('%', ".*").replace('_', "."); + let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from LIKE pattern: {}", @@ -296,6 +298,41 @@ pub fn like_utf8_scalar( Ok(BooleanArray::from(data)) } +fn replace_like_wildcards(text: &str) -> Result { + let text = escape(text); + let mut result = String::new(); + let mut preceding_backslash_chars = String::new(); + for c in text.chars() { + if c == '\\' { + preceding_backslash_chars.push(c); + } else if is_like_pattern(c) { + if preceding_backslash_chars.is_empty() { + // An unescaped like wildcard. Replaced by regex pattern + if c == '%' { + result.push_str(".*"); + } else { + result.push('.'); + } + } else { + // Escaped like wildcard. Remove the last two backslash + if preceding_backslash_chars.len() > 2 { + result.push_str(&preceding_backslash_chars[0..preceding_backslash_chars.len() - 2]); + } + result.push(c); + } + preceding_backslash_chars = String::new(); + } else { + // No like wildcard found. Append unchanged + if !preceding_backslash_chars.is_empty() { + result.push_str(&preceding_backslash_chars); + preceding_backslash_chars = String::new(); + } + result.push(c); + } + } + Ok(result) +} + /// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / /// [`LargeStringArray`]. /// @@ -342,7 +379,7 @@ pub fn nlike_utf8_scalar( result.append(!left.value(i).ends_with(&right[1..])); } } else { - let re_pattern = escape(right).replace('%', ".*").replace('_', "."); + let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from LIKE pattern: {}", @@ -423,7 +460,7 @@ pub fn ilike_utf8_scalar( ); } } else { - let re_pattern = escape(right).replace('%', ".*").replace('_', "."); + let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from ILIKE pattern: {}", @@ -506,7 +543,7 @@ pub fn nilike_utf8_scalar( ); } } else { - let re_pattern = escape(right).replace('%', ".*").replace('_', "."); + let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from ILIKE pattern: {}", @@ -3740,6 +3777,32 @@ mod tests { vec![false, true, false, false] ); + test_utf8_scalar!( + test_utf8_scalar_like_escape, + vec!["a%", "a\\x"], + "a\\%", + like_utf8_scalar, + vec![true, false] + ); + + test_utf8!( + test_utf8_scalar_ilike_regex, + vec!["%%%"], + vec![r#"\%_\%"#], + ilike_utf8, + vec![true] + ); + + #[test] + fn test_replace_like_wildcards() { + let a_eq = "\\%_%\\_\\\\%.\\."; + let expected = String::from("%..*_\\\\%\\.\\\\\\."); + assert_eq!( + replace_like_wildcards(a_eq).unwrap(), + expected + ); + } + test_utf8!( test_utf8_array_eq, vec!["arrow", "arrow", "arrow", "arrow"], From 46f80b5436b070a564e84310a3d4c2c3472daea8 Mon Sep 17 00:00:00 2001 From: Daniel Martinez Maqueda Date: Mon, 1 Aug 2022 16:30:28 +0200 Subject: [PATCH 2/5] Rewrite logic with peek after PR feedback --- arrow/Cargo.toml | 1 + arrow/src/compute/kernels/comparison.rs | 79 ++++++++++++++++--------- 2 files changed, 51 insertions(+), 29 deletions(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index d0a7c73ae3d5..dcecdb6743ff 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -49,6 +49,7 @@ half = { version = "2.0", default-features = false } hashbrown = { version = "0.12", default-features = false } csv_crate = { version = "1.1", default-features = false, optional = true, package="csv" } regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } +regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } lazy_static = { version = "1.4", default-features = false } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4", default-features = false, features = ["clock"] } diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 9237ee7ae5f6..14c1d61b0514 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -35,7 +35,7 @@ use crate::datatypes::{ }; use crate::error::{ArrowError, Result}; use crate::util::bit_util; -use regex::{escape, Regex}; +use regex::Regex; use std::collections::HashMap; /// Helper function to perform boolean lambda function on values from two array accessors, this @@ -299,34 +299,37 @@ pub fn like_utf8_scalar( } fn replace_like_wildcards(text: &str) -> Result { - let text = escape(text); let mut result = String::new(); - let mut preceding_backslash_chars = String::new(); - for c in text.chars() { + let text = String::from(text); + let mut chars_iter = text.chars().peekable(); + while let Some(c) = chars_iter.next() { if c == '\\' { - preceding_backslash_chars.push(c); - } else if is_like_pattern(c) { - if preceding_backslash_chars.is_empty() { - // An unescaped like wildcard. Replaced by regex pattern - if c == '%' { - result.push_str(".*"); - } else { - result.push('.'); + let next = chars_iter.peek().copied(); + match next { + Some(next) => { + if is_like_pattern(next) { + result.push(next); + // Skipping the next char as it is already appended + chars_iter.next(); + } else { + result.push('\\'); + result.push('\\'); + } } - } else { - // Escaped like wildcard. Remove the last two backslash - if preceding_backslash_chars.len() > 2 { - result.push_str(&preceding_backslash_chars[0..preceding_backslash_chars.len() - 2]); + None => { + result.push('\\'); + result.push('\\'); + return Ok(result); } - result.push(c); } - preceding_backslash_chars = String::new(); + } else if regex_syntax::is_meta_character(c) { + result.push('\\'); + result.push(c); + } else if c == '%' { + result.push_str(".*"); + } else if c == '_' { + result.push('.'); } else { - // No like wildcard found. Append unchanged - if !preceding_backslash_chars.is_empty() { - result.push_str(&preceding_backslash_chars); - preceding_backslash_chars = String::new(); - } result.push(c); } } @@ -3795,12 +3798,30 @@ mod tests { #[test] fn test_replace_like_wildcards() { - let a_eq = "\\%_%\\_\\\\%.\\."; - let expected = String::from("%..*_\\\\%\\.\\\\\\."); - assert_eq!( - replace_like_wildcards(a_eq).unwrap(), - expected - ); + let a_eq = "_%"; + let expected = String::from("..*"); + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_leave_like_meta_chars() { + let a_eq = "\\%\\_"; + let expected = String::from("%_"); + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_with_multiple_escape_chars() { + let a_eq = "\\\\%"; + let expected = String::from("\\\\%"); + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_escape_regex_meta_char() { + let a_eq = "."; + let expected = String::from("\\."); + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); } test_utf8!( From c50a822b8d155e182c06faee6168d4035cf9ccf6 Mon Sep 17 00:00:00 2001 From: Daniel Martinez Maqueda Date: Tue, 2 Aug 2022 15:27:47 +0200 Subject: [PATCH 3/5] Simplifly logic --- arrow/src/compute/kernels/comparison.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 14c1d61b0514..b7ef92c3b6ff 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -304,23 +304,18 @@ fn replace_like_wildcards(text: &str) -> Result { let mut chars_iter = text.chars().peekable(); while let Some(c) = chars_iter.next() { if c == '\\' { - let next = chars_iter.peek().copied(); + let next = chars_iter.peek(); match next { - Some(next) => { - if is_like_pattern(next) { - result.push(next); - // Skipping the next char as it is already appended - chars_iter.next(); - } else { - result.push('\\'); - result.push('\\'); - } + Some(next) if is_like_pattern(*next) => { + result.push(*next); + // Skipping the next char as it is already appended + chars_iter.next(); } - None => { + _ => { result.push('\\'); result.push('\\'); - return Ok(result); } + } } else if regex_syntax::is_meta_character(c) { result.push('\\'); From 9f3e8ac2105f9fb48fb187990f81ffeaa3f7b960 Mon Sep 17 00:00:00 2001 From: Daniel Martinez Maqueda Date: Wed, 3 Aug 2022 12:01:12 +0200 Subject: [PATCH 4/5] Add documentation and refactor string creation in tests --- arrow/src/compute/kernels/comparison.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index b7ef92c3b6ff..3869d5b71b45 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -298,10 +298,15 @@ pub fn like_utf8_scalar( Ok(BooleanArray::from(data)) } -fn replace_like_wildcards(text: &str) -> Result { +/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: +/// +/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` +/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` +/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%` +fn replace_like_wildcards(pattern: &str) -> Result { let mut result = String::new(); - let text = String::from(text); - let mut chars_iter = text.chars().peekable(); + let pattern = String::from(pattern); + let mut chars_iter = pattern.chars().peekable(); while let Some(c) = chars_iter.next() { if c == '\\' { let next = chars_iter.peek(); @@ -3794,28 +3799,28 @@ mod tests { #[test] fn test_replace_like_wildcards() { let a_eq = "_%"; - let expected = String::from("..*"); + let expected = "..*"; assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); } #[test] fn test_replace_like_wildcards_leave_like_meta_chars() { let a_eq = "\\%\\_"; - let expected = String::from("%_"); + let expected = "%_"; assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); } #[test] fn test_replace_like_wildcards_with_multiple_escape_chars() { let a_eq = "\\\\%"; - let expected = String::from("\\\\%"); + let expected = "\\\\%"; assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); } #[test] fn test_replace_like_wildcards_escape_regex_meta_char() { let a_eq = "."; - let expected = String::from("\\."); + let expected = "\\."; assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); } From f3dd39077d61d4d3b4cb9ef9390b3342d3b9f822 Mon Sep 17 00:00:00 2001 From: Daniel Martinez Maqueda Date: Wed, 3 Aug 2022 15:25:17 +0200 Subject: [PATCH 5/5] Add small fix and cargo fmt --- arrow/src/compute/kernels/comparison.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 3869d5b71b45..e4187ef87155 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -320,7 +320,6 @@ fn replace_like_wildcards(pattern: &str) -> Result { result.push('\\'); result.push('\\'); } - } } else if regex_syntax::is_meta_character(c) { result.push('\\'); @@ -370,7 +369,9 @@ pub fn nlike_utf8_scalar( for i in 0..left.len() { result.append(left.value(i) != right); } - } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use ends_with for i in 0..left.len() { @@ -443,7 +444,9 @@ pub fn ilike_utf8_scalar( for i in 0..left.len() { result.append(left.value(i) == right); } - } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use ends_with for i in 0..left.len() { @@ -524,7 +527,9 @@ pub fn nilike_utf8_scalar( for i in 0..left.len() { result.append(left.value(i) != right); } - } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use ends_with for i in 0..left.len() {