From afcf259b3ae7f34e913e9b26e7ca28b06c2271c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Emilio=20Cobos=20=C3=81lvarez?=
Date: Tue, 17 Jan 2017 12:33:21 +0100
Subject: [PATCH 1/5] Use less UTF-8 logic when not needed.

---
 src/tokenizer.rs | 153 ++++++++++++++++++++++++-----------------------
 1 file changed, 77 insertions(+), 76 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3fac405e..faef7e1d 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -380,8 +380,8 @@ impl<'a> Tokenizer<'a> {
     }
 
     #[inline]
-    fn starts_with(&self, needle: &str) -> bool {
-        self.input[self.position..].starts_with(needle)
+    fn starts_with(&self, needle: &[u8]) -> bool {
+        self.input.as_bytes()[self.position..].starts_with(needle)
     }
 }
 
@@ -405,88 +405,88 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Option<Token<'a>> {
     if tokenizer.is_eof() { return None }
-    let c = tokenizer.next_char();
+    let c = tokenizer.next_byte_unchecked();
     let token = match c {
-        '\t' | '\n' | ' ' | '\r' | '\x0C' => {
+        b'\t' | b'\n' | b' ' | b'\r' | b'\x0C' => {
             let start_position = tokenizer.position();
             tokenizer.advance(1);
             while !tokenizer.is_eof() {
-                match tokenizer.next_char() {
-                    ' ' | '\t' | '\n' | '\r' | '\x0C' => tokenizer.advance(1),
+                match tokenizer.next_byte_unchecked() {
+                    b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => tokenizer.advance(1),
                     _ => break,
                 }
             }
             WhiteSpace(tokenizer.slice_from(start_position))
         },
-        '"' => consume_string(tokenizer, false),
-        '#' => {
+        b'"' => consume_string(tokenizer, false),
+        b'#' => {
             tokenizer.advance(1);
             if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
-            else if !tokenizer.is_eof() && match tokenizer.next_char() {
-                'a'...'z' | 'A'...'Z' | '0'...'9' | '-' | '_' => true,
-                '\\' => !tokenizer.has_newline_at(1),
-                _ => c > '\x7F',  // Non-ASCII
+            else if !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
+                b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'-' | b'_' => true,
+                b'\\' => !tokenizer.has_newline_at(1),
+                _ => !c.is_ascii(),
             } { Hash(consume_name(tokenizer)) }
-            else { Delim(c) }
+            else { Delim('#') }
         },
-        '$' => {
-            if tokenizer.starts_with("$=") { tokenizer.advance(2); SuffixMatch }
-            else { tokenizer.advance(1); Delim(c) }
+        b'$' => {
+            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
+            else { tokenizer.advance(1); Delim('$') }
         },
-        '\'' => consume_string(tokenizer, true),
-        '(' => { tokenizer.advance(1); ParenthesisBlock },
-        ')' => { tokenizer.advance(1); CloseParenthesis },
-        '*' => {
-            if tokenizer.starts_with("*=") { tokenizer.advance(2); SubstringMatch }
-            else { tokenizer.advance(1); Delim(c) }
+        b'\'' => consume_string(tokenizer, true),
+        b'(' => { tokenizer.advance(1); ParenthesisBlock },
+        b')' => { tokenizer.advance(1); CloseParenthesis },
+        b'*' => {
+            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
+            else { tokenizer.advance(1); Delim('*') }
         },
-        '+' => {
+        b'+' => {
             if (
                 tokenizer.has_at_least(1)
-                && matches!(tokenizer.char_at(1), '0'...'9')
+                && matches!(tokenizer.byte_at(1), b'0'...b'9')
             ) || (
                 tokenizer.has_at_least(2)
-                && tokenizer.char_at(1) == '.'
-                && matches!(tokenizer.char_at(2), '0'...'9')
+                && tokenizer.byte_at(1) == b'.'
+                && matches!(tokenizer.byte_at(2), b'0'...b'9')
             ) {
                 consume_numeric(tokenizer)
             } else {
                 tokenizer.advance(1);
-                Delim(c)
+                Delim('+')
             }
         },
-        ',' => { tokenizer.advance(1); Comma },
-        '-' => {
+        b',' => { tokenizer.advance(1); Comma },
+        b'-' => {
             if (
                 tokenizer.has_at_least(1)
-                && matches!(tokenizer.char_at(1), '0'...'9')
+                && matches!(tokenizer.byte_at(1), b'0'...b'9')
             ) || (
                 tokenizer.has_at_least(2)
-                && tokenizer.char_at(1) == '.'
-                && matches!(tokenizer.char_at(2), '0'...'9')
+                && tokenizer.byte_at(1) == b'.'
+                && matches!(tokenizer.byte_at(2), b'0'...b'9')
             ) {
                 consume_numeric(tokenizer)
-            } else if tokenizer.starts_with("-->") {
+            } else if tokenizer.starts_with(b"-->") {
                 tokenizer.advance(3);
                 CDC
             } else if is_ident_start(tokenizer) {
                 consume_ident_like(tokenizer)
             } else {
                 tokenizer.advance(1);
-                Delim(c)
+                Delim('-')
             }
         },
-        '.' => {
+        b'.' => {
             if tokenizer.has_at_least(1)
-                && matches!(tokenizer.char_at(1), '0'...'9'
+                && matches!(tokenizer.byte_at(1), b'0'...b'9'
             ) {
                 consume_numeric(tokenizer)
             } else {
                 tokenizer.advance(1);
-                Delim(c)
+                Delim('.')
             }
         }
-        '/' if tokenizer.starts_with("/*") => {
+        b'/' if tokenizer.starts_with(b"/*") => {
             tokenizer.advance(2); // consume "/*"
             let start_position = tokenizer.position();
             let content;
@@ -503,58 +503,59 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Option<Token<'a>> {
             }
             Comment(content)
         }
-        '0'...'9' => consume_numeric(tokenizer),
-        ':' => { tokenizer.advance(1); Colon },
-        ';' => { tokenizer.advance(1); Semicolon },
-        '<' => {
-            if tokenizer.starts_with("
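
A minimal sketch of why this transformation is sound (not taken from the patch;
the `Scanner` type and `skip_whitespace` function below are hypothetical names
for illustration, not cssparser API). The patch swaps char-based lookahead
(next_char, char_at, starts_with on &str) for byte-based lookahead. That is
safe for ASCII-only dispatch because of a UTF-8 invariant: every byte of a
multi-byte sequence has its high bit set, so an ASCII byte value such as b' '
or b'"' can never occur inside a non-ASCII character, and byte-wise advances
always stop on a char boundary.

    // Minimal standalone sketch, assuming only std: byte-wise ASCII scanning
    // of a &str without decoding full UTF-8 code points.
    struct Scanner<'a> {
        input: &'a str,
        position: usize, // byte offset, advanced only past ASCII bytes
    }

    impl<'a> Scanner<'a> {
        fn is_eof(&self) -> bool {
            self.position >= self.input.len()
        }

        // Peek one byte without decoding a whole UTF-8 code point.
        fn next_byte(&self) -> u8 {
            self.input.as_bytes()[self.position]
        }

        // Skip ASCII whitespace byte by byte. Every matched value is < 0x80,
        // so we can never stop in the middle of a multi-byte character.
        fn skip_whitespace(&mut self) {
            while !self.is_eof() {
                match self.next_byte() {
                    b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => self.position += 1,
                    _ => break,
                }
            }
        }
    }

    fn main() {
        // U+00A0 (no-break space) encodes as 0xC2 0xA0 in UTF-8.
        let mut s = Scanner { input: " \t\u{a0}x", position: 0 };
        s.skip_whitespace();
        // The scan stops at 0xC2, the first byte of U+00A0: non-ASCII bytes
        // never match an ASCII pattern, so `position` is a char boundary.
        assert_eq!(s.position, 2);
        assert_eq!(&s.input[s.position..], "\u{a0}x");
    }

The same reasoning lets next_byte_unchecked and byte_at in the patch replace
their char-decoding counterparts without changing where the tokenizer can stop.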