Skip to content

Commit 4c84ed8

Browse files
authored
add switch to change regex engine from Rust to Python (#983)
1 parent b8d3b95 commit 4c84ed8

File tree

3 files changed

+123
-20
lines changed

3 files changed

+123
-20
lines changed

python/pydantic_core/core_schema.py

+10
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ class CoreConfig(TypedDict, total=False):
6969
validation_error_cause: Whether to add user-python excs to the __cause__ of a ValidationError.
7070
Requires exceptiongroup backport pre Python 3.11.
7171
coerce_numbers_to_str: Whether to enable coercion of any `Number` type to `str` (not applicable in `strict` mode).
72+
regex_engine: The regex engine to use for regex pattern validation. Default is 'rust-regex'. See `StringSchema`.
7273
"""
7374

7475
title: str
@@ -752,6 +753,7 @@ class StringSchema(TypedDict, total=False):
752753
strip_whitespace: bool
753754
to_lower: bool
754755
to_upper: bool
756+
regex_engine: Literal['rust-regex', 'python-re'] # default: 'rust-regex'
755757
strict: bool
756758
ref: str
757759
metadata: Any
@@ -766,6 +768,7 @@ def str_schema(
766768
strip_whitespace: bool | None = None,
767769
to_lower: bool | None = None,
768770
to_upper: bool | None = None,
771+
regex_engine: Literal['rust-regex', 'python-re'] | None = None,
769772
strict: bool | None = None,
770773
ref: str | None = None,
771774
metadata: Any = None,
@@ -789,6 +792,12 @@ def str_schema(
789792
strip_whitespace: Whether to strip whitespace from the value
790793
to_lower: Whether to convert the value to lowercase
791794
to_upper: Whether to convert the value to uppercase
795+
regex_engine: The regex engine to use for pattern validation. Default is 'rust-regex'.
796+
- `rust-regex` uses the [`regex`](https://docs.rs/regex) Rust
797+
crate, which is non-backtracking and therefore more ReDoS
798+
resistant, but does not support all regex features.
799+
- `python-re` uses the [`re`](https://docs.python.org/3/library/re.html) module,
800+
which supports all regex features, but may be slower.
792801
strict: Whether the value should be a string or a value that can be converted to a string
793802
ref: optional unique identifier of the schema, used to reference the schema in other places
794803
metadata: Any other information you want to include with the schema, not used by pydantic-core
@@ -802,6 +811,7 @@ def str_schema(
802811
strip_whitespace=strip_whitespace,
803812
to_lower=to_lower,
804813
to_upper=to_upper,
814+
regex_engine=regex_engine,
805815
strict=strict,
806816
ref=ref,
807817
metadata=metadata,

src/validators/string.rs

+55-7
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ impl Validator for StrValidator {
7272
#[derive(Debug, Clone, Default)]
7373
pub struct StrConstrainedValidator {
7474
strict: bool,
75-
pattern: Option<Regex>,
75+
pattern: Option<Pattern>,
7676
max_length: Option<usize>,
7777
min_length: Option<usize>,
7878
strip_whitespace: bool,
@@ -126,10 +126,10 @@ impl Validator for StrConstrainedValidator {
126126
}
127127

128128
if let Some(pattern) = &self.pattern {
129-
if !pattern.is_match(str) {
129+
if !pattern.is_match(py, str)? {
130130
return Err(ValError::new(
131131
ErrorType::StringPatternMismatch {
132-
pattern: pattern.to_string(),
132+
pattern: pattern.pattern.clone(),
133133
context: None,
134134
},
135135
input,
@@ -170,10 +170,16 @@ impl Validator for StrConstrainedValidator {
170170
impl StrConstrainedValidator {
171171
fn build(schema: &PyDict, config: Option<&PyDict>) -> PyResult<Self> {
172172
let py = schema.py();
173-
let pattern = match schema.get_as(intern!(py, "pattern"))? {
174-
Some(s) => Some(Regex::new(s).map_err(|e| py_schema_error_type!("{}", e))?),
175-
None => None,
176-
};
173+
174+
let pattern = schema
175+
.get_as(intern!(py, "pattern"))?
176+
.map(|s| {
177+
let regex_engine =
178+
schema_or_config(schema, config, intern!(py, "regex_engine"), intern!(py, "regex_engine"))?
179+
.unwrap_or(RegexEngine::RUST_REGEX);
180+
Pattern::compile(py, s, regex_engine)
181+
})
182+
.transpose()?;
177183
let min_length: Option<usize> =
178184
schema_or_config(schema, config, intern!(py, "min_length"), intern!(py, "str_min_length"))?;
179185
let max_length: Option<usize> =
@@ -219,3 +225,45 @@ impl StrConstrainedValidator {
219225
|| self.to_upper
220226
}
221227
}
228+
229+
#[derive(Debug, Clone)]
230+
struct Pattern {
231+
pattern: String,
232+
engine: RegexEngine,
233+
}
234+
235+
#[derive(Debug, Clone)]
236+
enum RegexEngine {
237+
RustRegex(Regex),
238+
PythonRe(PyObject),
239+
}
240+
241+
impl RegexEngine {
242+
const RUST_REGEX: &str = "rust-regex";
243+
const PYTHON_RE: &str = "python-re";
244+
}
245+
246+
impl Pattern {
247+
fn compile(py: Python<'_>, pattern: String, engine: &str) -> PyResult<Self> {
248+
let engine = match engine {
249+
RegexEngine::RUST_REGEX => {
250+
RegexEngine::RustRegex(Regex::new(&pattern).map_err(|e| py_schema_error_type!("{}", e))?)
251+
}
252+
RegexEngine::PYTHON_RE => {
253+
let re_compile = py.import(intern!(py, "re"))?.getattr(intern!(py, "compile"))?;
254+
RegexEngine::PythonRe(re_compile.call1((&pattern,))?.into())
255+
}
256+
_ => return Err(py_schema_error_type!("Invalid regex engine: {}", engine)),
257+
};
258+
Ok(Self { pattern, engine })
259+
}
260+
261+
fn is_match(&self, py: Python<'_>, target: &str) -> PyResult<bool> {
262+
match &self.engine {
263+
RegexEngine::RustRegex(regex) => Ok(regex.is_match(target)),
264+
RegexEngine::PythonRe(py_regex) => {
265+
Ok(!py_regex.call_method1(py, intern!(py, "match"), (target,))?.is_none(py))
266+
}
267+
}
268+
}
269+
}

tests/validators/test_string.py

+58-13
Original file line numberDiff line numberDiff line change
@@ -167,26 +167,34 @@ def test_str_constrained_config():
167167
v.validate_python('test long')
168168

169169

170-
def test_invalid_regex():
170+
@pytest.mark.parametrize('engine', [None, 'rust-regex', 'python-re'])
171+
def test_invalid_regex(engine):
171172
# TODO uncomment and fix once #150 is done
172173
# with pytest.raises(SchemaError) as exc_info:
173174
# SchemaValidator({'type': 'str', 'pattern': 123})
174175
# assert exc_info.value.args[0] == (
175176
# 'Error building "str" validator:\n TypeError: \'int\' object cannot be converted to \'PyString\''
176177
# )
177178
with pytest.raises(SchemaError) as exc_info:
178-
SchemaValidator({'type': 'str', 'pattern': '(abc'})
179-
assert exc_info.value.args[0] == (
180-
'Error building "str" validator:\n'
181-
' SchemaError: regex parse error:\n'
182-
' (abc\n'
183-
' ^\n'
184-
'error: unclosed group'
185-
)
186-
187-
188-
def test_regex_error():
189-
v = SchemaValidator({'type': 'str', 'pattern': '11'})
179+
SchemaValidator(core_schema.str_schema(pattern='(abc', regex_engine=engine))
180+
181+
if engine is None or engine == 'rust-regex':
182+
assert exc_info.value.args[0] == (
183+
'Error building "str" validator:\n'
184+
' SchemaError: regex parse error:\n'
185+
' (abc\n'
186+
' ^\n'
187+
'error: unclosed group'
188+
)
189+
elif engine == 'python-re':
190+
assert exc_info.value.args[0] == (
191+
'Error building "str" validator:\n error: missing ), unterminated subpattern at position 0'
192+
)
193+
194+
195+
@pytest.mark.parametrize('engine', [None, 'rust-regex', 'python-re'])
196+
def test_regex_error(engine):
197+
v = SchemaValidator(core_schema.str_schema(pattern='11', regex_engine=engine))
190198
with pytest.raises(ValidationError) as exc_info:
191199
v.validate_python('12')
192200
assert exc_info.value.errors(include_url=False) == [
@@ -297,3 +305,40 @@ def test_coerce_numbers_to_str_from_json(number: str, expected_str: str) -> None
297305

298306
v = SchemaValidator(core_schema.str_schema(), config)
299307
assert v.validate_json(number) == expected_str
308+
309+
310+
@pytest.mark.parametrize('mode', (None, 'schema', 'config'))
311+
def test_backtracking_regex_rust_unsupported(mode) -> None:
312+
pattern = r'r(#*)".*?"\1'
313+
314+
with pytest.raises(SchemaError) as exc_info:
315+
if mode is None:
316+
# rust-regex is the default
317+
SchemaValidator(core_schema.str_schema(pattern=pattern))
318+
elif mode == 'schema':
319+
SchemaValidator(core_schema.str_schema(pattern=pattern, regex_engine='rust-regex'))
320+
elif mode == 'config':
321+
SchemaValidator(core_schema.str_schema(pattern=pattern), core_schema.CoreConfig(regex_engine='rust-regex'))
322+
323+
assert exc_info.value.args[0] == (
324+
'Error building \"str\" validator:\n'
325+
' SchemaError: regex parse error:\n'
326+
' r(#*)\".*?\"\\1\n'
327+
' ^^\n'
328+
'error: backreferences are not supported'
329+
)
330+
331+
332+
@pytest.mark.parametrize('mode', ('schema', 'config'))
333+
def test_backtracking_regex_python(mode) -> None:
334+
pattern = r'r(#*)".*?"\1'
335+
336+
if mode == 'schema':
337+
v = SchemaValidator(core_schema.str_schema(pattern=pattern, regex_engine='python-re'))
338+
elif mode == 'config':
339+
v = SchemaValidator(core_schema.str_schema(pattern=pattern), core_schema.CoreConfig(regex_engine='python-re'))
340+
assert v.validate_python('r""') == 'r""'
341+
assert v.validate_python('r#""#') == 'r#""#'
342+
with pytest.raises(ValidationError):
343+
# not a valid match for the pattern
344+
v.validate_python('r#"#')

0 commit comments

Comments
 (0)