feat(rome_js_analyze): noControlCharactersInRegex (#4656)

Co-authored-by: Emanuele Stoppa <my.burning@gmail.com>
rome · Jul 15, 2023 · 5510f5f · 5510f5f
1 parent 8234b5f
commit 5510f5f
Show file tree

Hide file tree

Showing 16 changed files with 1,098 additions and 65 deletions.
diff --git a/crates/rome_diagnostics_categories/src/categories.rs b/crates/rome_diagnostics_categories/src/categories.rs
@@ -81,6 +81,7 @@ define_categories! {
     "lint/nursery/noConfusingArrow": "https://docs.rome.tools/lint/rules/noConfusingArrow",
     "lint/nursery/noConsoleLog": "https://docs.rome.tools/lint/rules/noConsoleLog",
     "lint/nursery/noConstantCondition": "https://docs.rome.tools/lint/rules/noConstantCondition",
+    "lint/nursery/noControlCharactersInRegex": "https://docs.rome.tools/lint/rules/noControlCharactersInRegex",
     "lint/nursery/noDuplicateJsonKeys": "https://docs.rome.tools/lint/rules/noDuplicateJsonKeys",
     "lint/nursery/noDuplicateJsxProps": "https://docs.rome.tools/lint/rules/noDuplicateJsxProps",
     "lint/nursery/noExcessiveComplexity": "https://docs.rome.tools/lint/rules/noExcessiveComplexity",

diff --git a/crates/rome_js_analyze/src/analyzers/nursery.rs b/crates/rome_js_analyze/src/analyzers/nursery.rs
diff --git a/crates/rome_js_analyze/src/analyzers/nursery/no_control_characters_in_regex.rs b/crates/rome_js_analyze/src/analyzers/nursery/no_control_characters_in_regex.rs
@@ -0,0 +1,274 @@
+use crate::utils::escape_string;
+use rome_analyze::{context::RuleContext, declare_rule, Ast, Rule, RuleDiagnostic};
+use rome_console::markup;
+use rome_js_syntax::{
+    AnyJsExpression, JsCallArguments, JsCallExpression, JsNewExpression, JsRegexLiteralExpression,
+    JsStringLiteralExpression,
+};
+use rome_rowan::{declare_node_union, AstNode, AstSeparatedList};
+use std::{iter::Peekable, str::Chars};
+
+declare_rule! {
+    /// Prevents from having control characters and some escape sequences that match control characters in regular expressions.
+    ///
+    /// Control characters are hidden special characters that are numbered from 0 to 31 in the ASCII system.
+    /// They're not commonly used in JavaScript text. So, if you see them in a pattern (called a regular expression), it's probably a mistake.
+    ///
+    /// The following elements of regular expression patterns are considered possible errors in typing and are therefore disallowed by this rule:
+    ///
+    /// - Hexadecimal character escapes from `\x00` to `\x1F`
+    /// - Unicode character escapes from `\u0000` to `\u001F`
+    /// - Unicode code point escapes from `\u{0}` to `\u{1F}`
+    /// - Unescaped raw characters from U+0000 to U+001F
+    ///
+    /// Control escapes such as `\t` and `\n` are allowed by this rule.
+    ///
+    /// Source: https://eslint.org/docs/latest/rules/no-control-regex
+    ///
+    /// ## Examples
+    ///
+    /// ### Invalid
+    /// ```js,expect_diagnostic
+    ///  var pattern1 = /\x00/;
+    /// ```
+    /// ```js,expect_diagnostic
+    ///  var pattern2 = /\x0C/;
+    /// ```
+    /// ```js,expect_diagnostic
+    ///  var pattern3 = /\x1F/;
+    /// ```
+    /// ```js,expect_diagnostic
+    ///  var pattern4 = /\u000C/;
+    /// ```
+    /// ```js,expect_diagnostic
+    ///  var pattern5 = /\u{C}/u;
+    /// ```
+    /// ```js,expect_diagnostic
+    ///  var pattern7 = new RegExp("\x0C");
+    /// ```
+    /// ```js,expect_diagnostic
+    ///  var pattern7 = new RegExp("\\x0C");
+    /// ```
+    ///
+    /// ### Valid
+    /// ```js
+    /// var pattern1 = /\x20/;
+    /// var pattern2 = /\u0020/;
+    /// var pattern3 = /\u{20}/u;
+    /// var pattern4 = /\t/;
+    /// var pattern5 = /\n/;
+    /// var pattern6 = new RegExp("\x20");
+    /// ```
+    ///
+    pub(crate) NoControlCharactersInRegex {
+        version: "next",
+        name: "noControlCharactersInRegex",
+        recommended: true,
+    }
+}
+
+declare_node_union! {
+    pub(crate) RegexExpressionLike = JsNewExpression | JsCallExpression | JsRegexLiteralExpression
+}
+
+fn decode_hex_character_to_code_point(iter: &mut Peekable<Chars>) -> Option<(String, i64)> {
+    let first = iter.next()?;
+    let second = iter.next()?;
+    let digits = format!("{first}{second}");
+    let code_point = i64::from_str_radix(&digits, 16).ok()?;
+    Some((digits, code_point))
+}
+
+fn decode_unicode_escape_to_code_point(iter: &mut Peekable<Chars>) -> Option<(String, i64)> {
+    let mut digits = String::new();
+    // Loop 4 times as unicode escape sequence has exactly 4 hexadecimal digits
+    for _ in 0..4 {
+        if let Some(&c) = iter.peek() {
+            match c {
+                '0'..='9' | 'a'..='f' | 'A'..='F' => digits.push(iter.next()?),
+                _ => continue,
+            }
+        }
+    }
+    let code_point = i64::from_str_radix(digits.as_str(), 16).ok()?;
+    Some((digits, code_point))
+}
+
+fn decode_escaped_code_point_to_code_point(iter: &mut Peekable<Chars>) -> Option<(String, i64)> {
+    let mut digits = String::new();
+    if iter.peek() == Some(&'{') {
+        iter.next();
+        while let Some(&c) = iter.peek() {
+            if c == '}' {
+                iter.next();
+                let code_point = i64::from_str_radix(&digits, 16).ok()?;
+                return Some((format!("{{{}}}", digits), code_point));
+            } else {
+                digits.push(iter.next()?);
+            }
+        }
+    }
+    None
+}
+
+fn add_control_character_to_vec(
+    prefix: &str,
+    iter: &mut Peekable<Chars>,
+    control_characters: &mut Vec<String>,
+    decode: fn(&mut Peekable<Chars>) -> Option<(String, i64)>,
+) {
+    if let Some((s, code_point)) = decode(iter) {
+        // ASCII control characters are represented by code points from 0 to 31
+        if (0..=31).contains(&code_point) {
+            control_characters.push(format!("{prefix}{s}"));
+        }
+    }
+}
+
+/// Collecting control characters for regex. The following characters in regular expression patterns are considered as control characters:
+/// - Hexadecimal character escapes from `\x00` to `\x1F`.
+/// - Unicode character escapes from `\u0000` to `\u001F`.
+/// - Unicode code point escapes range from `\u{0}` to `\u{1F}`.
+///     - The Unicode flag must be set as true in order for these Unicode code point escapes to work: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode.
+/// - Unescaped raw characters from U+0000 to U+001F.
+fn collect_control_characters(pattern: String, flags: Option<String>) -> Option<Vec<String>> {
+    let mut control_characters: Vec<String> = Vec::new();
+    let is_unicode_flag_set = flags.unwrap_or_default().contains('u');
+    let mut iter = pattern.chars().peekable();
+
+    while let Some(c) = iter.next() {
+        match c {
+            '\\' => match iter.next() {
+                Some('x') => add_control_character_to_vec(
+                    "\\x",
+                    &mut iter,
+                    &mut control_characters,
+                    decode_hex_character_to_code_point,
+                ),
+                Some('u') if is_unicode_flag_set => add_control_character_to_vec(
+                    "\\u",
+                    &mut iter,
+                    &mut control_characters,
+                    decode_escaped_code_point_to_code_point,
+                ),
+                Some('u') => add_control_character_to_vec(
+                    "\\u",
+                    &mut iter,
+                    &mut control_characters,
+                    decode_unicode_escape_to_code_point,
+                ),
+                Some('\\') => continue,
+                _ => break,
+            },
+            _ => continue,
+        }
+    }
+
+    if !control_characters.is_empty() {
+        Some(control_characters)
+    } else {
+        None
+    }
+}
+
+fn collect_control_characters_from_expression(
+    callee: &AnyJsExpression,
+    js_call_arguments: &JsCallArguments,
+) -> Option<Vec<String>> {
+    let js_identifier = match callee {
+        AnyJsExpression::JsIdentifierExpression(js_identifier) => js_identifier,
+        _ => return None,
+    };
+
+    if js_identifier.name().ok()?.has_name("RegExp") {
+        let mut args = js_call_arguments.args().iter();
+        let raw_pattern = args
+            .next()
+            .and_then(|arg| arg.ok())
+            .and_then(|arg| JsStringLiteralExpression::cast_ref(arg.syntax()))
+            .and_then(|js_string_literal| js_string_literal.inner_string_text().ok())?
+            .to_string();
+
+        let pattern = escape_string(&raw_pattern).unwrap_or(raw_pattern);
+
+        let regexp_flags = args
+            .next()
+            .and_then(|arg| arg.ok())
+            .and_then(|arg| JsStringLiteralExpression::cast_ref(arg.syntax()))
+            .map(|js_string_literal| js_string_literal.text());
+
+        return collect_control_characters(pattern, regexp_flags);
+    }
+    None
+}
+
+impl Rule for NoControlCharactersInRegex {
+    type Query = Ast<RegexExpressionLike>;
+    type State = Vec<String>;
+    type Signals = Option<Self::State>;
+    type Options = ();
+
+    fn run(ctx: &RuleContext<Self>) -> Self::Signals {
+        let node = ctx.query();
+        match node {
+            RegexExpressionLike::JsNewExpression(js_new_expression) => {
+                collect_control_characters_from_expression(
+                    &js_new_expression.callee().ok()?,
+                    &js_new_expression.arguments()?,
+                )
+            }
+            RegexExpressionLike::JsCallExpression(js_call_expression) => {
+                collect_control_characters_from_expression(
+                    &js_call_expression.callee().ok()?,
+                    &js_call_expression.arguments().ok()?,
+                )
+            }
+            RegexExpressionLike::JsRegexLiteralExpression(js_regex_literal_expression) => {
+                collect_control_characters(
+                    js_regex_literal_expression.pattern().ok()?,
+                    js_regex_literal_expression.flags().ok(),
+                )
+            }
+        }
+    }
+
+    fn diagnostic(ctx: &RuleContext<Self>, state: &Self::State) -> Option<RuleDiagnostic> {
+        Some(RuleDiagnostic::new(
+            rule_category!(),
+            ctx.query().range(),
+            markup! {
+                "Unexpected control character(s) in regular expression: "<Emphasis>{state.join(", ")}</Emphasis>""
+            },
+        ).note(
+            markup! {
+                "Control characters are unusual and potentially incorrect inputs, so they are disallowed."
+            }
+        ))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_collect_control_characters() {
+        assert_eq!(
+            collect_control_characters(String::from("\\x00\\x0F\\u0010\\u001F"), None),
+            Some(vec![
+                String::from("\\x00"),
+                String::from("\\x0F"),
+                String::from("\\u0010"),
+                String::from("\\u001F")
+            ])
+        );
+        assert_eq!(
+            collect_control_characters(String::from("\\u{0}\\u{1F}"), Some(String::from("u"))),
+            Some(vec![String::from("\\u{0}"), String::from("\\u{1F}")])
+        );
+        assert_eq!(
+            collect_control_characters(String::from("\\x20\\u0020\\u{20}\\t\\n"), None),
+            None
+        );
+    }
+}
diff --git a/crates/rome_js_analyze/tests/specs/nursery/noControlCharactersInRegex/invalid.js b/crates/rome_js_analyze/tests/specs/nursery/noControlCharactersInRegex/invalid.js
@@ -0,0 +1,25 @@
+var regex = RegExp("\\x1f");
+var regex = RegExp("\\u{1111}*\\x1F", "u");
+var regex = new RegExp("\\x1f\\x1e");
+var regex = new RegExp("\\x1fFOO\\x00");
+var regex = new RegExp("FOO\\x1fFOO\\x1f");
+var regex = new RegExp("\\x1f");
+var regex = new RegExp("\\u001F", flags);
+var regex = new RegExp("\\u{1111}*\\x1F", "u");
+var regex = new RegExp("\\u{1F}", "u");
+var regex = new RegExp("\\u{1F}", "gui");
+var regex = new RegExp("\\x0C");
+var regex = new RegExp("\x0C");
+var regex = /\x00/;
+var regex = /\x0C/;
+var regex = /\x1F/;
+var regex = /\u000C/;
+var regex = /\u{C}/u;
+var regex = /\\\x1f\\x1e/;
+var regex = /\\\x1fFOO\\x00/;
+var regex = /FOO\\\x1fFOO\\x1f/;
+var regex = /(?<a>\\x1f)/;
+var regex = /(?<\u{1d49c}>.)\x1f/;
+var regex = /\u{1111}*\x1F/u;
+var regex = /\u{1F}/u;
+var regex = /\u{1F}/gui;