Skip to content

Commit

Permalink
feat(stdlib): Add replace_with function (#636)
Browse files Browse the repository at this point in the history
* feat(stdlib): Add replace_with function

This is similar to `replace`, but takes a closure to compute the replacement from the match and
capture groups, instead of taking a replacement string.

Fixes: #628

* Pull request feedback

* enhancement(replace_with): Pass object instead of array to closure

This allows us to expose the named capture groups with names.

* Add named capture groups directly to capture object
  • Loading branch information
tmccombs authored Jan 24, 2024
1 parent 41a9b11 commit 309725b
Show file tree
Hide file tree
Showing 10 changed files with 300 additions and 0 deletions.
2 changes: 2 additions & 0 deletions changelog.d/628.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Add a `replace_with` function that is similar to `replace` but takes a closure instead of a
replacement string.
3 changes: 3 additions & 0 deletions lib/tests/tests/functions/replace_with/capture_groups.vrl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# result: F bar F F fo F G

replace_with("foo bar faa fee fo fum gum", r'([fg])\w\w') -> |m| { upcase(string!(m.captures[0])) }
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# result:
# function call error for "replace_with" at (1:73): Capture group cannot be named "string" or "captures"

replace_with("captain bold", r'cap(?P<captures>\w*)') -> |_m| { "test" }
7 changes: 7 additions & 0 deletions lib/tests/tests/functions/replace_with/error_in_closure.vrl
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# result:
# function call error for "replace_with" at (1:105): function call error for "assert" at (59:92): failed to parse

replace_with("this is a test", r'(?i)test') -> |_m| {
assert!(false, "failed to parse")
"TEST"
}
3 changes: 3 additions & 0 deletions lib/tests/tests/functions/replace_with/neg_count.vrl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# result: "fOO bAr cAt dOg"

replace_with("foo bar cat dog", r'[oa]*', count: -32) -> |m| { upcase(m.string) }
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# result:
# function call error for "replace_with" at (1:64): Capture group cannot be named "string" or "captures"

replace_with("a test", r'"(?P<string>.*)"') -> |m| { m.string }
15 changes: 15 additions & 0 deletions lib/tests/tests/functions/replace_with/wrong_type.vrl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# result:
# error[E122]: type mismatch in closure return type
# ┌─ :2:34
# │
# 2 │ replace_with("", r'test') -> |m| { to_int!(m.string) }
# │ ^^^^^^^^^^^^^^^^^^^^^
# │ │
# │ block returns invalid value type
# │ received: integer
# │ expected: string
# │
# = see language documentation at https://vrl.dev
# = try your code in the VRL REPL, learn more at https://vrl.dev/examples

replace_with("", r'test') -> |m| { to_int!(m.string) }
3 changes: 3 additions & 0 deletions lib/tests/tests/functions/replace_with/zero_count.vrl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#result: "foo bar"

replace_with("foo bar", r'[oa]', count: 0) -> |m| { upcase(m.string) }
3 changes: 3 additions & 0 deletions src/stdlib/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ cfg_if::cfg_if! {
mod redact;
mod remove;
mod replace;
mod replace_with;
mod reverse_dns;
mod round;
mod seahash;
Expand Down Expand Up @@ -325,6 +326,7 @@ cfg_if::cfg_if! {
pub use redact::Redact;
pub use remove::Remove;
pub use replace::Replace;
pub use replace_with::ReplaceWith;
pub use reverse_dns::ReverseDns;
pub use round::Round;
pub use set::Set;
Expand Down Expand Up @@ -493,6 +495,7 @@ pub fn all() -> Vec<Box<dyn Function>> {
Box::new(Redact),
Box::new(Remove),
Box::new(Replace),
Box::new(ReplaceWith),
Box::new(ReverseDns),
Box::new(Round),
Box::new(Seahash),
Expand Down
256 changes: 256 additions & 0 deletions src/stdlib/replace_with.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
use std::collections::BTreeMap;

use regex::{CaptureMatches, CaptureNames, Captures, Regex};

use crate::compiler::prelude::*;

/// Replaces matches of `pattern` inside `value` with whatever the closure behind `runner`
/// returns for each match.
///
/// `count` selects how many matches are rewritten:
/// * positive: at most that many matches, left to right;
/// * negative: every match;
/// * zero: nothing is replaced and `value` is returned unchanged.
///
/// # Errors
///
/// Propagates the error from `try_bytes_utf8_lossy` on `value`, from `try_integer` on
/// `count`, and any error returned by `make_replacement`.
fn replace_with<T>(
    value: Value,
    pattern: &Regex,
    count: Value,
    ctx: &mut Context,
    runner: closure::Runner<T>,
) -> Resolved
where
    T: Fn(&mut Context) -> Result<Value, ExpressionError>,
{
    let haystack = value.try_bytes_utf8_lossy()?;
    let count = match count.try_integer()? {
        // Saturate instead of casting with `as`: on targets where `usize` is narrower than
        // `i64`, a plain cast would truncate a large limit into a small one (or into 0,
        // which `make_replacement` reads as "no limit").
        i if i > 0 => <usize as std::convert::TryFrom<i64>>::try_from(i).unwrap_or(usize::MAX),
        // A negative count means "replace everything"; `make_replacement` encodes that as 0.
        i if i < 0 => 0,
        // this is when i == 0
        _ => return Ok(value),
    };
    let captures = pattern.captures_iter(&haystack);
    make_replacement(
        captures,
        &haystack,
        count,
        pattern.capture_names(),
        ctx,
        runner,
    )
}

/// Assembles the output string by walking the matches in `caps` and splicing in, for each
/// one, the string the closure behind `runner` produces.
///
/// A `count` of `0` means "no limit"; any other value caps the number of matches handled.
/// Text between matches, and after the last handled match, is copied from `haystack` as is.
///
/// # Errors
///
/// Propagates any error from running the closure and from converting its result with
/// `try_bytes_utf8_lossy`.
fn make_replacement<T>(
    caps: CaptureMatches,
    haystack: &str,
    count: usize,
    capture_names: CaptureNames,
    ctx: &mut Context,
    runner: closure::Runner<T>,
) -> Resolved
where
    T: Fn(&mut Context) -> Result<Value, ExpressionError>,
{
    // possible optimization: peek at first capture, if none return the original value.
    let max_matches = match count {
        0 => usize::MAX,
        n => n,
    };
    let mut output = String::with_capacity(haystack.len());
    // Byte offset in `haystack` just past the previously handled match.
    let mut cursor = 0;

    // The matches are walked by hand, modelled on `Regex::replacen`, so that a failure
    // inside the closure can be propagated with `?`.
    for found in caps.take(max_matches) {
        // Group 0 is the whole match and is always present, so this cannot fail.
        let whole = found.get(0).unwrap();

        let mut closure_arg = captures_to_value(&found, capture_names.clone());
        runner.map_value(ctx, &mut closure_arg)?;
        let substitute = closure_arg.try_bytes_utf8_lossy()?;

        output.push_str(&haystack[cursor..whole.start()]);
        output.push_str(&substitute);
        cursor = whole.end();
    }

    // Whatever follows the last handled match is kept verbatim.
    output.push_str(&haystack[cursor..]);
    Ok(output.into())
}

/// Key under which the full match text is stored in the object handed to the closure.
const STRING_NAME: &str = "string";
/// Key under which the array of positional capture groups is stored in that object.
const CAPTURES_NAME: &str = "captures";

/// Converts one regex match into the object value passed to the closure.
///
/// The resulting object holds:
/// * `"string"`: the text of the whole match;
/// * `"captures"`: an array with one entry per capture group (the whole match excluded),
///   where a group that did not participate in the match is `null`;
/// * one additional key per named capture group, carrying the same value as its
///   positional entry.
fn captures_to_value(captures: &Captures, capture_names: CaptureNames) -> Value {
    let whole_match = captures.get(0).unwrap().as_str();

    let mut fields: ObjectMap = BTreeMap::new();
    fields.insert(STRING_NAME.into(), whole_match.into());

    // `captures.len()` counts the whole match as well, hence the `- 1`.
    let mut positional: Vec<Value> = Vec::with_capacity(captures.len() - 1);

    // Index 0 is the whole match, which is already stored above, so start at group 1.
    for (group_idx, maybe_name) in capture_names.enumerate().skip(1) {
        let text = captures
            .get(group_idx)
            .map_or(Value::Null, |group| group.as_str().into());
        if let Some(group_name) = maybe_name {
            fields.insert(group_name.into(), text.clone());
        }
        positional.push(text);
    }

    fields.insert(CAPTURES_NAME.into(), positional.into());

    fields.into()
}

/// The `replace_with` function: like `replace`, but the replacement for each match is
/// computed by a closure instead of being a fixed string.
#[derive(Clone, Copy, Debug)]
pub struct ReplaceWith;

impl Function for ReplaceWith {
fn identifier(&self) -> &'static str {
"replace_with"
}

fn parameters(&self) -> &'static [Parameter] {
&[
Parameter {
keyword: "value",
kind: kind::BYTES,
required: true,
},
Parameter {
keyword: "pattern",
kind: kind::REGEX,
required: true,
},
Parameter {
keyword: "count",
kind: kind::INTEGER,
required: false,
},
]
}

fn examples(&self) -> &'static [Example] {
&[
Example {
title: "double replacement",
source: r#"replace_with("foobar", r'o|a') -> |m| { m.string + m.string }"#,
result: Ok("foooobaar"),
},
Example {
title: "replace count",
source: r#"replace_with("foobar", r'o|a', count: 1) -> |m| { m.string + m.string }"#,
result: Ok("fooobar"),
},
Example {
title: "replace with capture group",
source: r#"replace_with("foo123bar", r'foo(\d+)bar') -> |m| { x = m.captures[0]; "x={{x}}" }"#,
result: Ok(r#"x=123"#),
},
Example {
title: "process capture group",
source: r#"replace_with(s'Got message: {"msg": "b"}', r'message: (\{.*\})') -> |m| { to_string!(parse_json!(m.captures[0]).msg) }"#,
result: Ok("Got b"),
},
Example {
title: "Optional capture group",
source: r#"replace_with("foobar", r'bar( of gold)?') -> |m| { if m.captures[1] == null { "baz" } else { "rich" } }"#,
result: Ok("foobaz"),
},
Example {
title: "Named capture group",
source: r#"replace_with("foo123bar", r'foo(?P<num>\d+)bar') -> |m| { x = to_int!(m.num); to_string(x+ 1) }"#, //to_string(to_int!(m.named.num) + 1) }"#,
result: Ok("\"124\""),
},
]
}

fn compile(
&self,
_state: &state::TypeState,
_ctx: &mut FunctionCompileContext,
arguments: ArgumentList,
) -> Compiled {
let value = arguments.required("value");
let pattern = arguments.required("pattern");
let count = arguments.optional("count").unwrap_or(expr!(-1));

let closure = arguments.required_closure()?;

Ok(ReplaceWithFn {
value,
pattern,
count,
closure,
}
.as_expr())
}

fn closure(&self) -> Option<closure::Definition> {
use closure::{Definition, Input, Output, Variable, VariableKind};

let match_type = Collection::from_parts(
BTreeMap::from([
(STRING_NAME.into(), Kind::bytes()),
(
CAPTURES_NAME.into(),
Kind::array(Collection::from_unknown(Kind::bytes().or_null())),
),
]),
Kind::bytes().or_null(),
);

Some(Definition {
inputs: vec![Input {
parameter_keyword: "value",
kind: Kind::bytes(),
variables: vec![
Variable {
kind: VariableKind::Exact(Kind::object(match_type)),
},
],
output: Output::Kind(Kind::bytes()),
example: Example {
title: "replace with hash",
source: r#"replace_with("received email from a@example.com", pattern: r'\w+@\w+\.\w+') -> |match| { sha2(match.string) }"#,
result: Ok("received email from 896bdca840c9304a5d0bdbeacc4ef359e3093f80c9777c9967e31ba0ff99ed58"),
},
}],
is_iterator: false,
})
}
}

/// Compiled form of a `replace_with` call, holding the unevaluated argument expressions
/// and the closure.
#[derive(Debug, Clone)]
struct ReplaceWithFn {
    /// Expression for the `value` argument (the string to search).
    value: Box<dyn Expression>,
    /// Expression for the `pattern` argument (the regex to match).
    pattern: Box<dyn Expression>,
    /// Expression for the `count` argument; `compile` substitutes `-1` when it is omitted.
    count: Box<dyn Expression>,
    /// Closure run once per match to produce the replacement.
    closure: FunctionClosure,
}

impl FunctionExpression for ReplaceWithFn {
    /// Evaluates the arguments, validates the pattern and performs the replacement.
    ///
    /// Evaluation order is `value`, `pattern`, capture-name validation, then `count`.
    ///
    /// # Errors
    ///
    /// Fails when an argument expression fails, when the pattern does not resolve to a
    /// regex, when a capture group is named `string` or `captures` (those keys are reserved
    /// in the object passed to the closure), or when `replace_with` itself fails.
    fn resolve(&self, ctx: &mut Context) -> ExpressionResult<Value> {
        let value = self.value.resolve(ctx)?;

        let resolved_pattern = self.pattern.resolve(ctx)?;
        let regex = match resolved_pattern.as_regex() {
            Some(regex) => regex,
            None => return Err(ExpressionError::from("failed to resolve regex")),
        };

        // Named groups become keys of the closure argument, so they must not collide with
        // the two built-in keys.
        let uses_reserved_name = regex
            .capture_names()
            .flatten()
            .any(|name| name == STRING_NAME || name == CAPTURES_NAME);
        if uses_reserved_name {
            return Err(ExpressionError::from(
                r#"Capture group cannot be named "string" or "captures""#,
            ));
        }

        let count = self.count.resolve(ctx)?;

        let variables = &self.closure.variables;
        let block = &self.closure.block;
        let runner = closure::Runner::new(variables, |ctx| block.resolve(ctx));

        replace_with(value, regex, count, ctx, runner)
    }

    /// The function is declared to return bytes and to be infallible.
    fn type_def(&self, _: &state::TypeState) -> TypeDef {
        TypeDef::bytes().infallible()
    }
}

0 comments on commit 309725b

Please sign in to comment.