From 7f63c7cf4c2938c31de3d63fc769706f0d87cb54 Mon Sep 17 00:00:00 2001 From: Ravi Shankar Date: Sun, 15 Nov 2015 02:37:49 +0530 Subject: [PATCH] Detect confusing unicode characters and show the alternative --- src/libsyntax/diagnostic.rs | 4 + src/libsyntax/parse/lexer/mod.rs | 4 +- src/libsyntax/parse/lexer/unicode_chars.rs | 186 +++++++++++++++++++++ src/test/parse-fail/unicode-chars.rs | 18 ++ 4 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 src/libsyntax/parse/lexer/unicode_chars.rs create mode 100644 src/test/parse-fail/unicode-chars.rs diff --git a/src/libsyntax/diagnostic.rs b/src/libsyntax/diagnostic.rs index f1fd8be472830..870dea02212d1 100644 --- a/src/libsyntax/diagnostic.rs +++ b/src/libsyntax/diagnostic.rs @@ -174,6 +174,10 @@ impl SpanHandler { self.handler.emit(Some((&self.cm, sp)), msg, Bug); panic!(ExplicitBug); } + pub fn span_bug_no_panic(&self, sp: Span, msg: &str) { + self.handler.emit(Some((&self.cm, sp)), msg, Bug); + self.handler.bump_err_count(); + } pub fn span_unimpl(&self, sp: Span, msg: &str) -> ! 
{ self.span_bug(sp, &format!("unimplemented {}", msg)); } diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index e1d8a4d8c5423..cb2181a083177 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -26,6 +26,7 @@ use std::rc::Rc; pub use ext::tt::transcribe::{TtReader, new_tt_reader, new_tt_reader_with_doc_flag}; pub mod comments; +mod unicode_chars; pub trait Reader { fn is_eof(&self) -> bool; @@ -1224,7 +1225,8 @@ impl<'a> StringReader<'a> { c => { let last_bpos = self.last_pos; let bpos = self.pos; - panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c)); + unicode_chars::check_for_substitution(&self, c); + panic!(self.fatal_span_char(last_bpos, bpos, "unknown start of token", c)) } } } diff --git a/src/libsyntax/parse/lexer/unicode_chars.rs b/src/libsyntax/parse/lexer/unicode_chars.rs new file mode 100644 index 0000000000000..dbec1a8851cfe --- /dev/null +++ b/src/libsyntax/parse/lexer/unicode_chars.rs @@ -0,0 +1,186 @@ +// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms.
+ +// Characters and their corresponding confusables were collected from +// http://www.unicode.org/Public/security/revision-06/confusables.txt + +use codemap::mk_sp as make_span; +use super::StringReader; + +const UNICODE_ARRAY: &'static [(char, &'static str, char)] = &[ + ('ߺ', "Nko Lajanyalan", '_'), + ('﹍', "Dashed Low Line", '_'), + ('﹎', "Centreline Low Line", '_'), + ('﹏', "Wavy Low Line", '_'), + ('‐', "Hyphen", '-'), + ('‑', "Non-Breaking Hyphen", '-'), + ('‒', "Figure Dash", '-'), + ('–', "En Dash", '-'), + ('﹘', "Small Em Dash", '-'), + ('⁃', "Hyphen Bullet", '-'), + ('˗', "Modifier Letter Minus Sign", '-'), + ('−', "Minus Sign", '-'), + ('٫', "Arabic Decimal Separator", ','), + ('‚', "Single Low-9 Quotation Mark", ','), + ('ꓹ', "Lisu Letter Tone Na Po", ','), + (';', "Greek Question Mark", ';'), + ('ः', "Devanagari Sign Visarga", ':'), + ('ઃ', "Gujarati Sign Visarga", ':'), + (':', "Fullwidth Colon", ':'), + ('։', "Armenian Full Stop", ':'), + ('܃', "Syriac Supralinear Colon", ':'), + ('܄', "Syriac Sublinear Colon", ':'), + ('︰', "Presentation Form For Vertical Two Dot Leader", ':'), + ('᠃', "Mongolian Full Stop", ':'), + ('᠉', "Mongolian Manchu Full Stop", ':'), + ('⁚', "Two Dot Punctuation", ':'), + ('׃', "Hebrew Punctuation Sof Pasuq", ':'), + ('˸', "Modifier Letter Raised Colon", ':'), + ('꞉', "Modifier Letter Colon", ':'), + ('∶', "Ratio", ':'), + ('ː', "Modifier Letter Triangular Colon", ':'), + ('ꓽ', "Lisu Letter Tone Mya Jeu", ':'), + ('!', "Fullwidth Exclamation Mark", '!'), + ('ǃ', "Latin Letter Retroflex Click", '!'), + ('ʔ', "Latin Letter Glottal Stop", '?'), + ('ॽ', "Devanagari Letter Glottal Stop", '?'), + ('Ꭾ', "Cherokee Letter He", '?'), + ('𝅭', "Musical Symbol Combining Augmentation Dot", '.'), + ('․', "One Dot Leader", '.'), + ('۔', "Arabic Full Stop", '.'), + ('܁', "Syriac Supralinear Full Stop", '.'), + ('܂', "Syriac Sublinear Full Stop", '.'), + ('꘎', "Vai Full Stop", '.'), + ('𐩐', "Kharoshthi Punctuation Dot", '.'), + ('٠', 
"Arabic-Indic Digit Zero", '.'), + ('۰', "Extended Arabic-Indic Digit Zero", '.'), + ('ꓸ', "Lisu Letter Tone Mya Ti", '.'), + ('՝', "Armenian Comma", '\''), + (''', "Fullwidth Apostrophe", '\''), + ('‘', "Left Single Quotation Mark", '\''), + ('’', "Right Single Quotation Mark", '\''), + ('‛', "Single High-Reversed-9 Quotation Mark", '\''), + ('′', "Prime", '\''), + ('‵', "Reversed Prime", '\''), + ('՚', "Armenian Apostrophe", '\''), + ('׳', "Hebrew Punctuation Geresh", '\''), + ('`', "Greek Varia", '\''), + ('`', "Fullwidth Grave Accent", '\''), + ('΄', "Greek Tonos", '\''), + ('´', "Greek Oxia", '\''), + ('᾽', "Greek Koronis", '\''), + ('᾿', "Greek Psili", '\''), + ('῾', "Greek Dasia", '\''), + ('ʹ', "Modifier Letter Prime", '\''), + ('ʹ', "Greek Numeral Sign", '\''), + ('ˊ', "Modifier Letter Acute Accent", '\''), + ('ˋ', "Modifier Letter Grave Accent", '\''), + ('˴', "Modifier Letter Middle Grave Accent", '\''), + ('ʻ', "Modifier Letter Turned Comma", '\''), + ('ʽ', "Modifier Letter Reversed Comma", '\''), + ('ʼ', "Modifier Letter Apostrophe", '\''), + ('ʾ', "Modifier Letter Right Half Ring", '\''), + ('ꞌ', "Latin Small Letter Saltillo", '\''), + ('י', "Hebrew Letter Yod", '\''), + ('ߴ', "Nko High Tone Apostrophe", '\''), + ('ߵ', "Nko Low Tone Apostrophe", '\''), + ('[', "Fullwidth Left Square Bracket", '('), + ('❨', "Medium Left Parenthesis Ornament", '('), + ('❲', "Light Left Tortoise Shell Bracket Ornament", '('), + ('〔', "Left Tortoise Shell Bracket", '('), + ('﴾', "Ornate Left Parenthesis", '('), + (']', "Fullwidth Right Square Bracket", ')'), + ('❩', "Medium Right Parenthesis Ornament", ')'), + ('❳', "Light Right Tortoise Shell Bracket Ornament", ')'), + ('〕', "Right Tortoise Shell Bracket", ')'), + ('﴿', "Ornate Right Parenthesis", ')'), + ('❴', "Medium Left Curly Bracket Ornament", '{'), + ('❵', "Medium Right Curly Bracket Ornament", '}'), + ('⁎', "Low Asterisk", '*'), + ('٭', "Arabic Five Pointed Star", '*'), + ('∗', "Asterisk Operator", '*'), + ('᜵', 
"Philippine Single Punctuation", '/'), + ('⁁', "Caret Insertion Point", '/'), + ('∕', "Division Slash", '/'), + ('⁄', "Fraction Slash", '/'), + ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'), + ('⟋', "Mathematical Rising Diagonal", '/'), + ('⧸', "Big Solidus", '/'), + ('㇓', "Cjk Stroke Sp", '/'), + ('〳', "Vertical Kana Repeat Mark Upper Half", '/'), + ('丿', "Cjk Unified Ideograph-4E3F", '/'), + ('⼃', "Kangxi Radical Slash", '/'), + ('\', "Fullwidth Reverse Solidus", '\\'), + ('﹨', "Small Reverse Solidus", '\\'), + ('∖', "Set Minus", '\\'), + ('⟍', "Mathematical Falling Diagonal", '\\'), + ('⧵', "Reverse Solidus Operator", '\\'), + ('⧹', "Big Reverse Solidus", '\\'), + ('㇔', "Cjk Stroke D", '\\'), + ('丶', "Cjk Unified Ideograph-4E36", '\\'), + ('⼂', "Kangxi Radical Dot", '\\'), + ('ꝸ', "Latin Small Letter Um", '&'), + ('﬩', "Hebrew Letter Alternative Plus Sign", '+'), + ('‹', "Single Left-Pointing Angle Quotation Mark", '<'), + ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'), + ('˂', "Modifier Letter Left Arrowhead", '<'), + ('꓿', "Lisu Punctuation Full Stop", '='), + ('›', "Single Right-Pointing Angle Quotation Mark", '>'), + ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'), + ('˃', "Modifier Letter Right Arrowhead", '>'), + ('Ⲻ', "Coptic Capital Letter Dialect-P Ni", '-'), + ('Ɂ', "Latin Capital Letter Glottal Stop", '?'), + ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'), ]; + +const ASCII_ARRAY: &'static [(char, &'static str)] = &[ + ('_', "Underscore"), + ('-', "Minus/Hyphen"), + (',', "Comma"), + (';', "Semicolon"), + (':', "Colon"), + ('!', "Exclamation Mark"), + ('?', "Question Mark"), + ('.', "Period"), + ('\'', "Single Quote"), + ('(', "Left Parenthesis"), + (')', "Right Parenthesis"), + ('{', "Left Curly Brace"), + ('}', "Right Curly Brace"), + ('*', "Asterisk"), + ('/', "Slash"), + ('\\', "Backslash"), + ('&', "Ampersand"), + ('+', "Plus Sign"), + ('<', "Less-Than Sign"), + ('=', "Equals Sign"), + 
('>', "Greater-Than Sign"), ];
+
+/// Looks `ch` up among the known confusable characters and, if it is one, attaches a help
+/// note to the span between `reader.last_pos` and `reader.pos` naming the ASCII look-alike.
+/// A table entry whose ASCII target has no name in `ASCII_ARRAY` is reported as a
+/// non-panicking bug; the caller still raises the "unknown start of token" error itself.
+pub fn check_for_substitution(reader: &StringReader, ch: char) {
+    if let Some(&(_, u_name, ascii_char)) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
+        let span = make_span(reader.last_pos, reader.pos);
+        match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
+            Some(&(ascii_char, ascii_name)) => {
+                let msg =
+                    format!("unicode character '{}' ({}) looks much like '{}' ({}), but it's not",
+                            ch, u_name, ascii_char, ascii_name);
+                reader.help_span(span, &msg);
+            },
+            None => {
+                // The two tables are out of sync; report it without aborting the lexer here.
+                let msg = format!("substitution character not found for '{}'", ch);
+                reader.span_diagnostic.span_bug_no_panic(span, &msg);
+            }
+        }
+    }
+}
diff --git a/src/test/parse-fail/unicode-chars.rs b/src/test/parse-fail/unicode-chars.rs
new file mode 100644
index 0000000000000..adfaf62b5d3cc
--- /dev/null
+++ b/src/test/parse-fail/unicode-chars.rs
@@ -0,0 +1,18 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// compile-flags: -Z parse-only
+// ignore-tidy-linelength
+
+fn main() {
+    let y = 0;
+    //~^ ERROR unknown start of token: \u{37e}
+    //~^^ HELP unicode character ';' (Greek Question Mark) looks much like ';' (Semicolon), but it's not
+}