rust-lang · llogiq · Aug 28, 2015 · Aug 28, 2015 · Aug 30, 2015 · Aug 29, 2015
diff --git a/Cargo.toml b/Cargo.toml
@@ -16,6 +16,9 @@ keywords = ["clippy", "lint", "plugin"]
 name = "clippy"
 plugin = true
 
+[dependencies]
+unicode-normalization = "*"
+
 [dev-dependencies]
 compiletest_rs = "*"
 regex = "*"

diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 A collection of lints that give helpful tips to newbies and catch oversights.
 
 ##Lints
-There are 53 lints included in this crate:
+There are 54 lints included in this crate:
 
 name                                                                                                 | default | meaning
 -----------------------------------------------------------------------------------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@@ -56,6 +56,7 @@ name
 [string_to_string](https://github.com/Manishearth/rust-clippy/wiki#string_to_string)                 | warn    | calling `String.to_string()` which is a no-op
 [toplevel_ref_arg](https://github.com/Manishearth/rust-clippy/wiki#toplevel_ref_arg)                 | warn    | a function argument is declared `ref` (i.e. `fn foo(ref x: u8)`, but not `fn foo((ref x, ref y): (u8, u8))`)
 [type_complexity](https://github.com/Manishearth/rust-clippy/wiki#type_complexity)                   | warn    | usage of very complex types; recommends factoring out parts into `type` definitions
+[unicode_not_nfc](https://github.com/Manishearth/rust-clippy/wiki#unicode_not_nfc)                   | allow   | using a unicode literal not in NFC normal form (see http://www.unicode.org/reports/tr15/ for further information)
 [unit_cmp](https://github.com/Manishearth/rust-clippy/wiki#unit_cmp)                                 | warn    | comparing unit values (which is always `true` or `false`, respectively)
 [unused_collect](https://github.com/Manishearth/rust-clippy/wiki#unused_collect)                     | warn    | `collect()`ing an iterator without using the result; this is usually better written as a for loop
 [while_let_loop](https://github.com/Manishearth/rust-clippy/wiki#while_let_loop)                     | warn    | `loop { if let { ... } else break }` can be written as a `while let` loop

diff --git a/src/lib.rs b/src/lib.rs
@@ -90,6 +90,7 @@ pub fn plugin_registrar(reg: &mut Registry) {
         types::CAST_PRECISION_LOSS,
         types::CAST_SIGN_LOSS,
         unicode::NON_ASCII_LITERAL,
+        unicode::UNICODE_NOT_NFC,
     ]);
 
     reg.register_lint_group("clippy", vec![

diff --git a/src/unicode.rs b/src/unicode.rs
@@ -1,6 +1,11 @@
+extern crate unicode_normalization;
+
+use std::fmt::Write;
 use rustc::lint::*;
 use syntax::ast::*;
-use syntax::codemap::{BytePos, Span};
+use syntax::codemap::{Pos, BytePos, Span};
+use self::unicode_normalization::char::canonical_combining_class;
+use self::unicode_normalization::UnicodeNormalization;
 
 use utils::span_lint;
 
@@ -9,13 +14,16 @@ declare_lint!{ pub ZERO_WIDTH_SPACE, Deny,
 declare_lint!{ pub NON_ASCII_LITERAL, Allow,
                "using any literal non-ASCII chars in a string literal; suggests \
                 using the \\u escape instead" }
+declare_lint!{ pub UNICODE_NOT_NFC, Allow, // default level is Allow; enable with e.g. #[deny(unicode_not_nfc)]
+               "using a unicode literal not in NFC normal form (see \
+               http://www.unicode.org/reports/tr15/ for further information)" }
 
 #[derive(Copy, Clone)]
 pub struct Unicode;
 
 impl LintPass for Unicode {
     fn get_lints(&self) -> LintArray {
-        lint_array!(ZERO_WIDTH_SPACE, NON_ASCII_LITERAL)
+        lint_array!(ZERO_WIDTH_SPACE, NON_ASCII_LITERAL, UNICODE_NOT_NFC)
     }
 
     fn check_expr(&mut self, cx: &Context, expr: &Expr) {
@@ -27,23 +35,132 @@ impl LintPass for Unicode {
     }
 }
 
+fn pos(base: BytePos, i: usize) -> BytePos { // offsets `base` by index `i` into the literal
+    match i { 0 => base, n => base + Pos::from_usize(n + 1) } // NOTE(review): 0 is unshifted, others get +1 — confirm intent
+}
+
+/// Emits `msg` under `lint` on the sub-span of `span` running from byte
+/// `index` to `end_index` of the literal (to `span.hi` if `end_index` is `None`).
+fn str_pos_lint(cx: &Context, lint: &'static Lint, span: Span, index: usize,
+        end_index: Option<usize>, msg: &str) {
+    span_lint(cx, lint,
+        Span {
+            lo: pos(span.lo, index),
+            hi: end_index.map_or(span.hi, |i| pos(span.lo, i)),
+            expn_id: span.expn_id,
+        },
+        msg);
+}
+
+
+/// Closes a pending range: when `from` holds a start index it is taken out
+/// (leaving `None`) and `(start, til)` is appended to `v`; otherwise no-op.
+fn push_start(from: &mut Option<usize>, til: Option<usize>,
+        v: &mut Vec<(usize, Option<usize>)>) {
+    v.extend(from.take().map(|start| (start, til)));
+}
+
+/// Flushes a range still open at `from` (recorded as open-ended) and then
+/// reports the collected `ranges` under `lint`: nothing for zero ranges, one
+/// message on the range's own sub-span for exactly one, and otherwise a
+/// single message on the whole `span` that lists every range.
+///
+/// `multi_fun` turns an offending slice of `string` into its suggested
+/// replacement; `prefix` names the kind of range in the message text.
+fn push_last_and_report<F>(cx: &Context, string: &str, span: Span,
+        mut from: Option<usize>, mut ranges: Vec<(usize, Option<usize>)>,
+        lint: &'static Lint, prefix: &str, multi_fun: F)
+where F: Fn(&str) -> String, {
+    push_start(&mut from, None, &mut ranges);
+    let count = ranges.len();
+    if count == 1 {
+        let (lo, hi) = ranges[0];
+        // A `None` end stands for "up to the end of `string`".
+        let fix = multi_fun(&string[lo .. hi.unwrap_or(string.len())]);
+        str_pos_lint(cx, lint, span, lo, hi, &format!(
+            "{} range detected. Consider using `{}`",
+            prefix, &fix));
+    } else if count > 1 {
+        // Several ranges: one combined message on the whole literal span.
+        let mut repls = String::new();
+        for (lo, hi) in ranges {
+            let fix = multi_fun(&string[lo .. hi.unwrap_or(string.len())]);
+            // The `fmt::Result` of appending to a `String` is ignored.
+            let _ = match hi {
+                Some(u) => write!(&mut repls, "\n{}..{} => {}", lo, u, &fix),
+                None => write!(&mut repls, "\n{}.. => {}", lo, &fix),
+            };
+        }
+        span_lint(cx, lint, span, &format!(
+            "{} {} ranges detected. Consider the following replacements:{}",
+            count, prefix, &repls));
+    }
+}
+
+/// Scans a string literal for zero-width spaces, non-ASCII characters and
+/// sequences that are not in NFC, collecting each kind as byte ranges of
+/// `string` and handing every kind to `push_last_and_report` exactly once.
 fn check_str(cx: &Context, string: &str, span: Span) {
+    let mut zero_width_ranges = vec![];
+    let mut non_ascii_ranges = vec![];
+    let mut non_nfc_ranges = vec![];
+    let (mut zero_width_start, mut non_ascii_start, mut non_nfc_start) = (None, None, None);
+    let mut last_base_char = None;
     for (i, c) in string.char_indices() {
         if c == '\u{200B}' {
-            str_pos_lint(cx, ZERO_WIDTH_SPACE, span, i,
-                         "zero-width space detected. Consider using `\\u{200B}`");
+            zero_width_start = zero_width_start.or(Some(i));
+        } else {
+            push_start(&mut zero_width_start, Some(i), &mut zero_width_ranges);
         }
         if c as u32 > 0x7F {
-            str_pos_lint(cx, NON_ASCII_LITERAL, span, i, &format!(
-                "literal non-ASCII character detected. Consider using `\\u{{{:X}}}`", c as u32));
+            non_ascii_start = non_ascii_start.or(Some(i));
+        } else {
+            push_start(&mut non_ascii_start, Some(i), &mut non_ascii_ranges);
         }
+        if canonical_combining_class(c) == 0 { // not a combining char
+            if let Some(l) = last_base_char {
+                let seq = &string[l..i];
+                if seq.nfc().zip(seq.chars()).any(|(a, b)| a != b) {
+                    non_nfc_start = non_nfc_start.or(last_base_char);
+                } else {
+                    push_start(&mut non_nfc_start, Some(i), &mut non_nfc_ranges);
+                }
+            }
+            last_base_char = Some(i);
+        }
+    }
+    // The loop only checks a sequence once the next base char is reached, so
+    // the final sequence must be checked here or a non-NFC tail goes unreported.
+    if let Some(l) = last_base_char {
+        let tail = &string[l..];
+        let tail_differs = tail.nfc().zip(tail.chars()).any(|(a, b)| a != b);
+        if tail_differs { non_nfc_start = non_nfc_start.or(Some(l)); }
+    }
+    push_last_and_report(cx, string, span, zero_width_start, zero_width_ranges,
+        ZERO_WIDTH_SPACE, "zero-width space", zero_width_replacement);
+    push_last_and_report(cx, string, span, non_ascii_start, non_ascii_ranges,
+        NON_ASCII_LITERAL, "non-ascii literal", non_ascii_replacement);
+    if cx.current_level(NON_ASCII_LITERAL) == Level::Allow {
+        push_last_and_report(cx, string, span, non_nfc_start, non_nfc_ranges,
+            UNICODE_NOT_NFC, "non-NFC unicode", non_nfc_replacement);
+    } else {
+        push_last_and_report(cx, string, span, non_nfc_start, non_nfc_ranges,
+            UNICODE_NOT_NFC, "non-NFC unicode", non_nfc_ascii_replacement);
     }
 }
 
-#[allow(cast_possible_truncation)]
-fn str_pos_lint(cx: &Context, lint: &'static Lint, span: Span, index: usize, msg: &str) {
-    span_lint(cx, lint, Span { lo: span.lo + BytePos((1 + index) as u32),
-                               hi: span.lo + BytePos((1 + index) as u32),
-                               expn_id: span.expn_id }, msg);
+fn zero_width_replacement(string: &str) -> String { // one `\u{200B}` escape per char of `string`
+    ::std::iter::repeat("\\u{200B}").take(string.chars().count()).collect()
+}
+
+fn non_ascii_replacement(string: &str) -> String { // each char of `string` as its `\u{..}` escape
+    string.chars().flat_map(|ch| ch.escape_unicode()).collect()
+}
+
+fn non_nfc_replacement(string: &str) -> String { // collects the `nfc()` iterator over `string`
+    string.nfc().collect::<String>()
+}
 
+fn non_nfc_ascii_replacement(string: &str) -> String { // `nfc()` output, every char `\u{..}`-escaped
+    string.nfc().flat_map(|ch| ch.escape_unicode()).collect()
 }
diff --git a/tests/compile-fail/unicode.rs b/tests/compile-fail/unicode.rs
@@ -3,19 +3,17 @@
 
 #[deny(zero_width_space)]
 fn zero() {
-    print!("Here >< is a ZWS, and another");
-               //~^ ERROR zero-width space detected. Consider using `\u{200B}`
-                            //~^^ ERROR zero-width space detected. Consider using `\u{200B}`
+    print!("Here >< is a ZWS, and another"); //~ ERROR 2 zero-width space ranges detected.
 }
 
-//#[deny(unicode_canon)]
+#[deny(unicode_not_nfc)]
 fn canon() {
-    print!("̀ah?"); //not yet ~ERROR non-canonical unicode sequence detected. Consider using à
+    print!("̀àh?"); //~ERROR non-NFC unicode range detected. Consider using `àh`
 }
 
 #[deny(non_ascii_literal)]
 fn uni() {
-    print!("Üben!"); //~ERROR literal non-ASCII character detected. Consider using `\u{DC}`
+    print!("Üben!"); //~ERROR non-ascii literal range detected. Consider using `\u{dc}`
 }
 
 fn main() {