From 895a614ec6ab8cef3c199526c7f93db29b3b0fb3 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 13 Dec 2024 10:31:04 +0100 Subject: [PATCH] feat: raise warning with some patterns that have many 2-byte atoms. (#264) Regular expressions like `/[A-Za-z]/{N,}` with N>=2, can be very slow because they produce a large number of 2-bytes atoms. Raise warnings on those cases. --- lib/src/compiler/mod.rs | 31 +++++++++++++------ .../compiler/tests/testdata/warnings/35.in | 6 ++++ .../compiler/tests/testdata/warnings/35.out | 6 ++++ .../tests/testdata/warnings/no_warnings.in | 7 +++++ 4 files changed, 40 insertions(+), 10 deletions(-) create mode 100644 lib/src/compiler/tests/testdata/warnings/35.in create mode 100644 lib/src/compiler/tests/testdata/warnings/35.out diff --git a/lib/src/compiler/mod.rs b/lib/src/compiler/mod.rs index ff56b3f76..b29114a3a 100644 --- a/lib/src/compiler/mod.rs +++ b/lib/src/compiler/mod.rs @@ -18,7 +18,7 @@ use std::{fmt, iter}; use bincode::Options; use bitmask::bitmask; use bstr::{BStr, ByteSlice}; -use itertools::izip; +use itertools::{izip, Itertools, MinMaxResult}; #[cfg(feature = "logging")] use log::*; use regex_syntax::hir; @@ -2124,7 +2124,7 @@ impl<'a> Compiler<'a> { false, ); - let mut atoms = result.map_err(|err| match err { + let re_atoms = result.map_err(|err| match err { re::Error::TooLarge => InvalidRegexp::build( &self.report_builder, "regexp is too large".to_string(), @@ -2143,13 +2143,24 @@ impl<'a> Compiler<'a> { )); } - let mut slow_pattern = false; - - for atom in atoms.iter_mut() { - if atom.atom.len() < 2 { - slow_pattern = true; - } - } + let slow_pattern = + match re_atoms.iter().map(|re_atom| re_atom.atom.len()).minmax() { + // No atoms, slow pattern. + MinMaxResult::NoElements => true, + // Only one atom shorter than 2 bytes, slow pattern. + MinMaxResult::OneElement(len) if len < 2 => true, + // More than one atom, but all shorter than 2 bytes. + MinMaxResult::MinMax(_, max) if max < 2 => true, + // More than 2700 atoms, all with exactly 2 bytes. + // Why 2700?. The larger the number of atoms the higher the + // odds of finding one of them in the data, which slows down + // the scan. The regex [A-Za-z]{N,} (with N>=2) produces + // (26+26)^2 = 2704 atoms. So, 2700 is large enough, but + // produces a warning with the aforementioned regex. + MinMaxResult::MinMax(2, 2) if re_atoms.len() > 2700 => true, + // In all other cases the pattern is not slow. + _ => false, + }; if slow_pattern { if self.error_on_slow_pattern { @@ -2167,7 +2178,7 @@ impl<'a> Compiler<'a> { } } - Ok((atoms, is_fast_regexp)) + Ok((re_atoms, is_fast_regexp)) } fn c_literal_chain_head( diff --git a/lib/src/compiler/tests/testdata/warnings/35.in b/lib/src/compiler/tests/testdata/warnings/35.in new file mode 100644 index 000000000..cdccf6fdb --- /dev/null +++ b/lib/src/compiler/tests/testdata/warnings/35.in @@ -0,0 +1,6 @@ +rule test{ + strings: + $a = /[A-Za-z]{2,}/ + condition: + $a +} \ No newline at end of file diff --git a/lib/src/compiler/tests/testdata/warnings/35.out b/lib/src/compiler/tests/testdata/warnings/35.out new file mode 100644 index 000000000..6069ddfe7 --- /dev/null +++ b/lib/src/compiler/tests/testdata/warnings/35.out @@ -0,0 +1,6 @@ +warning[slow_pattern]: slow pattern + --> line:3:5 + | +3 | $a = /[A-Za-z]{2,}/ + | ------------------- this pattern may slow down the scan + | diff --git a/lib/src/compiler/tests/testdata/warnings/no_warnings.in b/lib/src/compiler/tests/testdata/warnings/no_warnings.in index d39eb7fac..318e3f257 100644 --- a/lib/src/compiler/tests/testdata/warnings/no_warnings.in +++ b/lib/src/compiler/tests/testdata/warnings/no_warnings.in @@ -42,4 +42,11 @@ rule test_6 { $a = "foo" condition: all of ($a*, $a*) at 0 +} + +rule test_7 { + strings: + $a = /[A-Fa-f0-9]{2,}/ + condition: + $a } \ No newline at end of file