Skip to content

Commit

Permalink
feat: raise warning with some patterns that have many 2-byte atoms. (#…
Browse files Browse the repository at this point in the history
…264)

Regular expressions like `/[A-Za-z]/{N,}` with N>=2, can be very slow because they produce a large number of 2-bytes atoms. Raise warnings on those cases.
  • Loading branch information
plusvic authored Dec 13, 2024
1 parent 888c77e commit 895a614
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 10 deletions.
31 changes: 21 additions & 10 deletions lib/src/compiler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use std::{fmt, iter};
use bincode::Options;
use bitmask::bitmask;
use bstr::{BStr, ByteSlice};
use itertools::izip;
use itertools::{izip, Itertools, MinMaxResult};
#[cfg(feature = "logging")]
use log::*;
use regex_syntax::hir;
Expand Down Expand Up @@ -2124,7 +2124,7 @@ impl<'a> Compiler<'a> {
false,
);

let mut atoms = result.map_err(|err| match err {
let re_atoms = result.map_err(|err| match err {
re::Error::TooLarge => InvalidRegexp::build(
&self.report_builder,
"regexp is too large".to_string(),
Expand All @@ -2143,13 +2143,24 @@ impl<'a> Compiler<'a> {
));
}

let mut slow_pattern = false;

for atom in atoms.iter_mut() {
if atom.atom.len() < 2 {
slow_pattern = true;
}
}
let slow_pattern =
match re_atoms.iter().map(|re_atom| re_atom.atom.len()).minmax() {
// No atoms, slow pattern.
MinMaxResult::NoElements => true,
// Only one atom shorter than 2 bytes, slow pattern.
MinMaxResult::OneElement(len) if len < 2 => true,
// More than one atom, but all shorter than 2 bytes.
MinMaxResult::MinMax(_, max) if max < 2 => true,
// More than 2700 atoms, all with exactly 2 bytes.
// Why 2700?. The larger the number of atoms the higher the
// odds of finding one of them in the data, which slows down
// the scan. The regex [A-Za-z]{N,} (with N>=2) produces
// (26+26)^2 = 2704 atoms. So, 2700 is large enough, but
// produces a warning with the aforementioned regex.
MinMaxResult::MinMax(2, 2) if re_atoms.len() > 2700 => true,
// In all other cases the pattern is not slow.
_ => false,
};

if slow_pattern {
if self.error_on_slow_pattern {
Expand All @@ -2167,7 +2178,7 @@ impl<'a> Compiler<'a> {
}
}

Ok((atoms, is_fast_regexp))
Ok((re_atoms, is_fast_regexp))
}

fn c_literal_chain_head(
Expand Down
6 changes: 6 additions & 0 deletions lib/src/compiler/tests/testdata/warnings/35.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
rule test{
strings:
$a = /[A-Za-z]{2,}/
condition:
$a
}
6 changes: 6 additions & 0 deletions lib/src/compiler/tests/testdata/warnings/35.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
warning[slow_pattern]: slow pattern
--> line:3:5
|
3 | $a = /[A-Za-z]{2,}/
| ------------------- this pattern may slow down the scan
|
7 changes: 7 additions & 0 deletions lib/src/compiler/tests/testdata/warnings/no_warnings.in
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,11 @@ rule test_6 {
$a = "foo"
condition:
all of ($a*, $a*) at 0
}

rule test_7 {
strings:
$a = /[A-Fa-f0-9]{2,}/
condition:
$a
}

0 comments on commit 895a614

Please sign in to comment.