diff --git a/.travis.yml b/.travis.yml
index 94f98322..2f0758b3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,6 +9,9 @@ script:
   - cargo test --verbose
   - cargo doc --verbose
   - cargo test --features heapsize
+  - cargo test --features dummy_match_byte
+  - if [ "$TRAVIS_RUST_VERSION" == "nightly" ]; then cargo test --features bench; fi
+  - if [ "$TRAVIS_RUST_VERSION" == "nightly" ]; then cargo test --features "bench dummy_match_byte"; fi
 
 notifications:
   webhooks: http://build.servo.org:54856/travis
diff --git a/Cargo.toml b/Cargo.toml
index 414003dc..f46859b4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,6 +10,7 @@ repository = "https://github.com/servo/rust-cssparser"
 readme = "README.md"
 keywords = ["css", "syntax", "parser"]
 license = "MPL-2.0"
+build = "build.rs"
 
 
 [dev-dependencies]
@@ -22,7 +23,12 @@ heapsize = {version = ">=0.1.1, <0.4.0", optional = true}
 matches = "0.1"
 serde = {version = ">=0.6.6, <0.9", optional = true}
 
+[build-dependencies]
+syn = { version = "0.10.6", features = ["full", "visit"]}
+quote = "0.3"
+
 [features]
 serde-serialization = [ "serde" ]
 heap_size = [ "heapsize" ]
 bench = []
+dummy_match_byte = []
diff --git a/build.rs b/build.rs
new file mode 100644
index 00000000..84e36be5
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,40 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#[macro_use] extern crate quote;
+extern crate syn;
+
+use std::env;
+use std::path::Path;
+
+
+#[cfg(feature = "dummy_match_byte")]
+mod codegen {
+    use std::path::Path;
+    pub fn main(_: &Path) {}
+}
+
+#[cfg(not(feature = "dummy_match_byte"))]
+#[path = "src/macros/mod.rs"]
+mod macros;
+
+#[cfg(not(feature = "dummy_match_byte"))]
+mod codegen {
+    use macros;
+    use std::env;
+    use std::path::Path;
+
+    pub fn main(tokenizer_rs: &Path) {
+        macros::match_byte::expand(tokenizer_rs,
+                                   &Path::new(&env::var("OUT_DIR").unwrap()).join("tokenizer.rs"));
+
+    }
+}
+
+fn main() {
+    let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
+    let tokenizer_rs = Path::new(&manifest_dir).join("src/tokenizer.rs");
+    codegen::main(&tokenizer_rs);
+    println!("cargo:rerun-if-changed={}", tokenizer_rs.display());
+}
diff --git a/src/lib.rs b/src/lib.rs
index 0681bcde..58859e88 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -137,7 +137,25 @@ macro_rules! match_ignore_ascii_case {
 }
 
 mod rules_and_declarations;
+
+#[cfg(feature = "dummy_match_byte")]
+macro_rules! match_byte {
+    ($value:expr, $($rest:tt)* ) => {
+        match $value {
+            $(
+                $rest
+            )+
+        }
+    };
+}
+
+#[cfg(feature = "dummy_match_byte")]
 mod tokenizer;
+
+#[cfg(not(feature = "dummy_match_byte"))]
+mod tokenizer {
+    include!(concat!(env!("OUT_DIR"), "/tokenizer.rs"));
+}
 mod parser;
 mod from_bytes;
 mod color;
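To make the two tokenizer paths above concrete: by default `build.rs` pre-expands `src/tokenizer.rs` into `$OUT_DIR/tokenizer.rs`, while the `dummy_match_byte` feature keeps the source as-is and lets the `match_byte!` macro in `src/lib.rs` degenerate into a plain `match`. A minimal self-contained sketch of that fallback expansion follows; the `classify` function and its arm bodies are invented for illustration and do not appear in the patch.

    // What a caller writes with the dummy_match_byte feature enabled:
    fn classify(b: u8) -> &'static str {
        match_byte! { b,
            b'a'...b'z' | b'A'...b'Z' => { "letter" },
            b'0'...b'9' => { "digit" },
            _ => { "other" },
        }
    }

    // What the dummy macro expands it to: an ordinary match on the byte.
    fn classify_expanded(b: u8) -> &'static str {
        match b {
            b'a'...b'z' | b'A'...b'Z' => { "letter" },
            b'0'...b'9' => { "digit" },
            _ => { "other" },
        }
    }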
diff --git a/src/macros/match_byte.rs b/src/macros/match_byte.rs
new file mode 100644
index 00000000..79519d01
--- /dev/null
+++ b/src/macros/match_byte.rs
@@ -0,0 +1,271 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+use quote::{ToTokens, Tokens};
+use std::fs::File;
+use std::io::{Read, Write};
+use std::path::Path;
+use std::vec;
+use std::iter;
+use syn;
+
+pub fn expand(from: &Path, to: &Path) {
+    let mut source = String::new();
+    File::open(from).unwrap().read_to_string(&mut source).unwrap();
+    let tts = syn::parse_token_trees(&source).expect("Parsing rules.rs module");
+    let mut tokens = Tokens::new();
+    tokens.append_all(expand_tts(tts));
+
+    let code = tokens.to_string().replace("{ ", "{\n").replace(" }", "\n}");
+    File::create(to).unwrap().write_all(code.as_bytes()).unwrap();
+}
+
+fn expand_tts(tts: Vec<syn::TokenTree>) -> Vec<syn::TokenTree> {
+    use syn::*;
+    let mut expanded = Vec::new();
+    let mut tts = tts.into_iter();
+    while let Some(tt) = tts.next() {
+        match tt {
+            TokenTree::Token(Token::Ident(ident)) => {
+                if ident != "match_byte" {
+                    expanded.push(TokenTree::Token(Token::Ident(ident)));
+                    continue;
+                }
+
+                match tts.next() {
+                    Some(TokenTree::Token(Token::Not)) => {},
+                    other => {
+                        expanded.push(TokenTree::Token(Token::Ident(ident)));
+                        if let Some(other) = other {
+                            expanded.push(other);
+                        }
+                        continue;
+                    }
+                }
+
+                let tts = match tts.next() {
+                    Some(TokenTree::Delimited(Delimited { tts, .. })) => tts,
+                    other => {
+                        expanded.push(TokenTree::Token(Token::Ident(ident)));
+                        expanded.push(TokenTree::Token(Token::Not));
+                        if let Some(other) = other {
+                            expanded.push(other);
+                        }
+                        continue;
+                    }
+                };
+
+                let (to_be_matched, table, cases, wildcard_binding) = parse_match_bytes_macro(tts);
+                let expr = expand_match_bytes_macro(to_be_matched,
+                                                    &table,
+                                                    cases,
+                                                    wildcard_binding);
+
+                let tts = syn::parse_token_trees(&expr)
+                    .expect("parsing macro expansion as token trees");
+                expanded.extend(expand_tts(tts));
+            }
+            TokenTree::Delimited(Delimited { delim, tts }) => {
+                expanded.push(TokenTree::Delimited(Delimited {
+                    delim: delim,
+                    tts: expand_tts(tts),
+                }))
+            }
+            other => expanded.push(other),
+        }
+    }
+    expanded
+}
+
+/// Parses a token tree corresponding to the `match_byte` macro.
+///
+/// ## Example
+///
+/// ```rust
+/// match_byte! { tokenizer.next_byte_unchecked(),
+///     b'a'..b'z' => { ... }
+///     b'0'..b'9' => { ... }
+///     b'\n' | b'\\' => { ... }
+///     foo => { ... }
+/// }
+/// ```
+///
+/// Returns:
+///  * The token tree that contains the expression to be matched (in this case
+///    `tokenizer.next_byte_unchecked()`).
+///
+///  * The table with the different cases per byte; each entry in the table
+///    contains a non-zero integer representing a different arm of the
+///    match expression.
+///
+///  * The list of cases containing the expansion of the arms of the match
+///    expression.
+///
+///  * An optional identifier to which the wildcard pattern is matched (`foo` in
+///    this case).
+///
+fn parse_match_bytes_macro(tts: Vec<syn::TokenTree>) -> (Vec<syn::TokenTree>, [u8; 256], Vec<Case>, Option<syn::Ident>) {
+    let mut tts = tts.into_iter();
+
+    // Grab the thing we're matching, until we find a comma.
+    let mut left_hand_side = vec![];
+    loop {
+        match tts.next() {
+            Some(syn::TokenTree::Token(syn::Token::Comma)) => break,
+            Some(other) => left_hand_side.push(other),
+            None => panic!("Expected not to run out of tokens looking for a comma"),
+        }
+    }
+
+    let mut cases = vec![];
+    let mut table = [0u8; 256];
+
+    let mut tts = tts.peekable();
+    let mut case_id: u8 = 1;
+    let mut binding = None;
+    while tts.len() > 0 {
+        cases.push(parse_case(&mut tts, &mut table, &mut binding, case_id));
+
+        // Allow an optional comma between cases.
+        match tts.peek() {
+            Some(&syn::TokenTree::Token(syn::Token::Comma)) => {
+                tts.next();
+            },
+            _ => {},
+        }
+
+        case_id += 1;
+    }
+
+    (left_hand_side, table, cases, binding)
+}
+
+#[derive(Debug)]
+struct Case(Vec<syn::TokenTree>);
+
+/// Parses a single pattern => expression, and returns the case, filling in the
+/// table with the case id for every byte that matched.
+///
+/// The `binding` parameter is the identifier that is used by the wildcard
+/// pattern.
+fn parse_case(tts: &mut iter::Peekable<vec::IntoIter<syn::TokenTree>>,
+              table: &mut [u8; 256],
+              binding: &mut Option<syn::Ident>,
+              case_id: u8)
+              -> Case {
+    // The last byte checked, as part of this pattern, to properly detect
+    // ranges.
+    let mut last_byte: Option<u8> = None;
+
+    // Loop through the pattern, filling the table with the bytes it matches.
+    loop {
+        match tts.next() {
+            Some(syn::TokenTree::Token(syn::Token::Literal(syn::Lit::Byte(byte)))) => {
+                table[byte as usize] = case_id;
+                last_byte = Some(byte);
+            }
+            Some(syn::TokenTree::Token(syn::Token::BinOp(syn::BinOpToken::Or))) => {
+                last_byte = None; // This pattern is over.
+            },
+            Some(syn::TokenTree::Token(syn::Token::DotDotDot)) => {
+                assert!(last_byte.is_some(), "Expected closed range!");
+                match tts.next() {
+                    Some(syn::TokenTree::Token(syn::Token::Literal(syn::Lit::Byte(byte)))) => {
+                        for b in last_byte.take().unwrap()..byte {
+                            if table[b as usize] == 0 {
+                                table[b as usize] = case_id;
+                            }
+                        }
+                        if table[byte as usize] == 0 {
+                            table[byte as usize] = case_id;
+                        }
+                    }
+                    other => panic!("Expected closed range, got: {:?}", other),
+                }
+            },
+            Some(syn::TokenTree::Token(syn::Token::FatArrow)) => break,
+            Some(syn::TokenTree::Token(syn::Token::Ident(ident))) => {
+                assert_eq!(last_byte, None, "I don't support ranges with identifiers!");
+                assert_eq!(*binding, None);
+                for mut byte in table.iter_mut() {
+                    if *byte == 0 {
+                        *byte = case_id;
+                    }
+                }
+                *binding = Some(ident)
+            }
+            Some(syn::TokenTree::Token(syn::Token::Underscore)) => {
+                assert_eq!(last_byte, None);
+                for mut byte in table.iter_mut() {
+                    if *byte == 0 {
+                        *byte = case_id;
+                    }
+                }
+            },
+            other => panic!("Expected literal byte, got: {:?}", other),
+        }
+    }
+
+    match tts.next() {
+        Some(syn::TokenTree::Delimited(syn::Delimited { delim: syn::DelimToken::Brace, tts })) => {
+            Case(tts)
+        }
+        other => panic!("Expected case with braces after fat arrow, got: {:?}", other),
+    }
+}
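To illustrate the table-filling logic in `parse_case` above (an invented example, not code from the patch): a macro whose first arm is `b'0'...b'9' => { ... }` (case 1) and whose second arm is a wildcard (case 2) ends up with a table equivalent to the one built by this sketch.

    // Rough equivalent of what parse_case computes for those two arms.
    let mut table = [0u8; 256];
    for b in b'0'..=b'9' {
        // The b'0'...b'9' range pattern claims its bytes for case 1.
        table[b as usize] = 1;
    }
    for entry in table.iter_mut() {
        // The wildcard arm (case 2) only fills slots no earlier case claimed.
        if *entry == 0 {
            *entry = 2;
        }
    }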
+fn expand_match_bytes_macro(to_be_matched: Vec<syn::TokenTree>,
+                            table: &[u8; 256],
+                            cases: Vec<Case>,
+                            binding: Option<syn::Ident>)
+                            -> String {
+    use std::fmt::Write;
+
+    assert!(!to_be_matched.is_empty());
+    assert!(table.iter().all(|b| *b != 0), "Incomplete pattern? Bogus code!");
+
+    // We build the expression with text since it's easier.
+    let mut expr = "{\n".to_owned();
+    expr.push_str("enum Case {\n");
+    for (i, _) in cases.iter().enumerate() {
+        write!(&mut expr, "Case{} = {},", i + 1, i + 1).unwrap();
+    }
+    expr.push_str("}\n"); // enum Case
+
+    expr.push_str("static __CASES: [Case; 256] = [");
+    for byte in table.iter() {
+        write!(&mut expr, "Case::Case{}, ", *byte).unwrap();
+    }
+    expr.push_str("];\n");
+
+    let mut tokens = Tokens::new();
+    let to_be_matched = syn::Delimited {
+        delim: if binding.is_some() { syn::DelimToken::Brace } else { syn::DelimToken::Paren },
+        tts: to_be_matched
+    };
+    to_be_matched.to_tokens(&mut tokens);
+
+    if let Some(ref binding) = binding {
+        write!(&mut expr, "let {} = {};\n", binding.to_string(), tokens.as_str()).unwrap();
+    }
+
+    write!(&mut expr, "match __CASES[{} as usize] {{", match binding {
+        Some(binding) => binding.to_string(),
+        None => tokens.to_string(),
+    }).unwrap();
+
+    for (i, case) in cases.into_iter().enumerate() {
+        let mut case_tokens = Tokens::new();
+        let case = syn::Delimited {
+            delim: syn::DelimToken::Brace,
+            tts: case.0
+        };
+        case.to_tokens(&mut case_tokens);
+        write!(&mut expr, "Case::Case{} => {},\n", i + 1, case_tokens.as_str()).unwrap();
+    }
+    expr.push_str("}\n"); // match
+
+    expr.push_str("}\n"); // top
+
+    expr
+}
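For orientation: the string that `expand_match_bytes_macro` assembles declares an `enum Case`, a 256-entry `static __CASES` table mapping every byte value to a case, an optional `let` for the wildcard binding, and a `match` over `__CASES[byte as usize]` whose arms are the original case bodies. The following is a small self-contained sketch of the same dispatch idea, not the literal generated output; it builds the table at runtime for brevity, whereas the generated code uses a static array and the real arm bodies.

    fn classify(b: u8) -> &'static str {
        #[derive(Clone, Copy)]
        enum Case { Case1, Case2 }
        // Case2 plays the role of the wildcard arm; Case1 covers b'0'...b'9'.
        let mut cases = [Case::Case2; 256];
        for byte in b'0'..=b'9' {
            cases[byte as usize] = Case::Case1;
        }
        // Dispatch on the per-byte case instead of on the byte patterns directly.
        match cases[b as usize] {
            Case::Case1 => "digit",
            Case::Case2 => "other",
        }
    }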
diff --git a/src/macros/mod.rs b/src/macros/mod.rs
new file mode 100644
index 00000000..6799e549
--- /dev/null
+++ b/src/macros/mod.rs
@@ -0,0 +1,5 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+pub mod match_byte;
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3fac405e..ec9ef803 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -262,7 +262,7 @@ impl<'a> Tokenizer<'a> {
 
     #[inline]
     pub fn next(&mut self) -> Result<Token<'a>, ()> {
-        next_token(self).ok_or(())
+        next_token(self)
     }
 
     #[inline]
@@ -359,17 +359,14 @@ impl<'a> Tokenizer<'a> {
     }
 
     #[inline]
-    fn next_char(&self) -> char { self.char_at(0) }
-
-    #[inline]
-    fn char_at(&self, offset: usize) -> char {
-        self.input[self.position + offset..].chars().next().unwrap()
+    fn next_char(&self) -> char {
+        self.input[self.position..].chars().next().unwrap()
     }
 
     #[inline]
     fn has_newline_at(&self, offset: usize) -> bool {
         self.position + offset < self.input.len() &&
-        matches!(self.char_at(offset), '\n' | '\r' | '\x0C')
+        matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
     }
 
     #[inline]
@@ -380,8 +377,8 @@ impl<'a> Tokenizer<'a> {
     }
 
     #[inline]
-    fn starts_with(&self, needle: &str) -> bool {
-        self.input[self.position..].starts_with(needle)
+    fn starts_with(&self, needle: &[u8]) -> bool {
+        self.input.as_bytes()[self.position..].starts_with(needle)
     }
 }
 
@@ -401,164 +398,169 @@ pub struct SourceLocation {
 }
 
 
-fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Option<Token<'a>> {
+fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     if tokenizer.is_eof() {
-        return None
+        return Err(())
     }
-    let c = tokenizer.next_char();
-    let token = match c {
-        '\t' | '\n' | ' ' | '\r' | '\x0C' => {
+    let b = tokenizer.next_byte_unchecked();
+    let token = match_byte! { b,
+        b'\t' | b'\n' | b' ' | b'\r' | b'\x0C' => {
             let start_position = tokenizer.position();
             tokenizer.advance(1);
             while !tokenizer.is_eof() {
-                match tokenizer.next_char() {
-                    ' ' | '\t' | '\n' | '\r' | '\x0C' => tokenizer.advance(1),
+                match tokenizer.next_byte_unchecked() {
+                    b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => tokenizer.advance(1),
                     _ => break,
                 }
             }
             WhiteSpace(tokenizer.slice_from(start_position))
         },
-        '"' => consume_string(tokenizer, false),
-        '#' => {
+        b'"' => { consume_string(tokenizer, false) },
+        b'#' => {
             tokenizer.advance(1);
             if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
-            else if !tokenizer.is_eof() && match tokenizer.next_char() {
-                'a'...'z' | 'A'...'Z' | '0'...'9' | '-' | '_' => true,
-                '\\' => !tokenizer.has_newline_at(1),
-                _ => c > '\x7F',  // Non-ASCII
+            else if !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
+                b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'-' | b'_' => true,
+                b'\\' => !tokenizer.has_newline_at(1),
+                _ => !b.is_ascii(),
             } { Hash(consume_name(tokenizer)) }
-            else { Delim(c) }
+            else { Delim('#') }
         },
-        '$' => {
-            if tokenizer.starts_with("$=") { tokenizer.advance(2); SuffixMatch }
-            else { tokenizer.advance(1); Delim(c) }
+        b'$' => {
+            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
+            else { tokenizer.advance(1); Delim('$') }
         },
-        '\'' => consume_string(tokenizer, true),
-        '(' => { tokenizer.advance(1); ParenthesisBlock },
-        ')' => { tokenizer.advance(1); CloseParenthesis },
-        '*' => {
-            if tokenizer.starts_with("*=") { tokenizer.advance(2); SubstringMatch }
-            else { tokenizer.advance(1); Delim(c) }
+        b'\'' => { consume_string(tokenizer, true) },
+        b'(' => { tokenizer.advance(1); ParenthesisBlock },
+        b')' => { tokenizer.advance(1); CloseParenthesis },
+        b'*' => {
+            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
+            else { tokenizer.advance(1); Delim('*') }
        },
-        '+' => {
+        b'+' => {
            if (
                tokenizer.has_at_least(1)
-                && matches!(tokenizer.char_at(1), '0'...'9')
+                && matches!(tokenizer.byte_at(1), b'0'...b'9')
            ) || (
                tokenizer.has_at_least(2)
-                && tokenizer.char_at(1) == '.'
-                && matches!(tokenizer.char_at(2), '0'...'9')
+                && tokenizer.byte_at(1) == b'.'
+                && matches!(tokenizer.byte_at(2), b'0'...b'9')
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
-                Delim(c)
+                Delim('+')
            }
        },
-        ',' => { tokenizer.advance(1); Comma },
-        '-' => {
+        b',' => { tokenizer.advance(1); Comma },
+        b'-' => {
            if (
                tokenizer.has_at_least(1)
-                && matches!(tokenizer.char_at(1), '0'...'9')
+                && matches!(tokenizer.byte_at(1), b'0'...b'9')
            ) || (
                tokenizer.has_at_least(2)
-                && tokenizer.char_at(1) == '.'
-                && matches!(tokenizer.char_at(2), '0'...'9')
+                && tokenizer.byte_at(1) == b'.'
+                && matches!(tokenizer.byte_at(2), b'0'...b'9')
            ) {
                consume_numeric(tokenizer)
-            } else if tokenizer.starts_with("-->") {
+            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
-                Delim(c)
+                Delim('-')
            }
        },
-        '.' => {
+        b'.' => {
            if tokenizer.has_at_least(1)
-            && matches!(tokenizer.char_at(1), '0'...'9'
+            && matches!(tokenizer.byte_at(1), b'0'...b'9'
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
-                Delim(c)
+                Delim('.')
            }
        }
-        '/' if tokenizer.starts_with("/*") => {
-            tokenizer.advance(2); // consume "/*"
-            let start_position = tokenizer.position();
-            let content;
-            match tokenizer.input[tokenizer.position..].find("*/") {
-                Some(offset) => {
-                    tokenizer.advance(offset);
-                    content = tokenizer.slice_from(start_position);
-                    tokenizer.advance(2);
-                }
-                None => {
-                    tokenizer.position = tokenizer.input.len();
-                    content = tokenizer.slice_from(start_position);
+        b'/' => {
+            if tokenizer.starts_with(b"/*") {
+                tokenizer.advance(2); // consume "/*"
+                let start_position = tokenizer.position();
+                let content;
+                match tokenizer.input[tokenizer.position..].find("*/") {
+                    Some(offset) => {
+                        tokenizer.advance(offset);
+                        content = tokenizer.slice_from(start_position);
+                        tokenizer.advance(2);
+                    }
+                    None => {
+                        tokenizer.position = tokenizer.input.len();
+                        content = tokenizer.slice_from(start_position);
+                    }
                }
+                Comment(content)
+            } else {
+                tokenizer.advance(1);
+                Delim('/')
            }
-            Comment(content)
        }
-        '0'...'9' => consume_numeric(tokenizer),
-        ':' => { tokenizer.advance(1); Colon },
-        ';' => { tokenizer.advance(1); Semicolon },
-        '<' => {
-            if tokenizer.starts_with("