Auto merge of #112 - servo:perf, r=SimonSapin
Some perf tweaks to the tokenizer.

This makes parsing quite a bit faster, by stripping out UTF-8 logic and using lookup tables instead of branching everywhere.

We may be able to tweak it a bit more (sometimes the table may be overkill? I don't know).

I've written the table macro so you can skip it easily if you want.
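For reference, the idea is roughly this (a hand-written sketch with made-up names — `build_table`, `CASES`, `dispatch` — not the code the macro actually generates): every `match_byte!` becomes a 256-entry table mapping each byte to the index of its match arm, plus a single `match` on that index.

```rust
// Sketch only: all names here are hypothetical. The real macro builds
// the table at build time from the byte patterns of each arm.
const fn build_table() -> [u8; 256] {
    let mut t = [3u8; 256]; // 3 = the wildcard arm
    let mut b = b'a';
    while b <= b'z' {
        t[b as usize] = 1; // 1 = the "letter" arm
        b += 1;
    }
    t[b'\n' as usize] = 2; // 2 = the "newline" arm
    t
}

static CASES: [u8; 256] = build_table();

fn dispatch(byte: u8) -> &'static str {
    match CASES[byte as usize] {
        1 => "letter",
        2 => "newline",
        _ => "anything else",
    }
}
```

So classifying a byte costs one table load and one jump, instead of a chain of comparisons.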

In any case, benchmark results:

Before:

> test tests::big_stylesheet ... bench:  10,392,017 ns/iter (+/- 1,954,644)
> test tests::unquoted_url   ... bench:     261,854 ns/iter (+/- 53,335)

After:

> test tests::big_stylesheet ... bench:   8,638,215 ns/iter (+/- 381,980)
> test tests::unquoted_url   ... bench:     211,863 ns/iter (+/- 73,418)

Which is quite good if you ask me.

bors-servo authored Jan 21, 2017
2 parents 664111f + 21f8573 commit 7c8a9a6
Showing 7 changed files with 538 additions and 162 deletions.
3 changes: 3 additions & 0 deletions .travis.yml
@@ -9,6 +9,9 @@ script:
- cargo test --verbose
- cargo doc --verbose
- cargo test --features heapsize
- cargo test --features dummy_match_byte
- if [ "$TRAVIS_RUST_VERSION" == "nightly" ]; then cargo test --features bench; fi
- if [ "$TRAVIS_RUST_VERSION" == "nightly" ]; then cargo test --features "bench dummy_match_byte"; fi

notifications:
webhooks: http://build.servo.org:54856/travis
6 changes: 6 additions & 0 deletions Cargo.toml
@@ -10,6 +10,7 @@ repository = "https://github.com/servo/rust-cssparser"
readme = "README.md"
keywords = ["css", "syntax", "parser"]
license = "MPL-2.0"
build = "build.rs"


[dev-dependencies]
@@ -22,7 +23,12 @@ heapsize = {version = ">=0.1.1, <0.4.0", optional = true}
matches = "0.1"
serde = {version = ">=0.6.6, <0.9", optional = true}

[build-dependencies]
syn = { version = "0.10.6", features = ["full", "visit"]}
quote = "0.3"

[features]
serde-serialization = [ "serde" ]
heap_size = [ "heapsize" ]
bench = []
dummy_match_byte = []
40 changes: 40 additions & 0 deletions build.rs
@@ -0,0 +1,40 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#[macro_use] extern crate quote;
extern crate syn;

use std::env;
use std::path::Path;


#[cfg(feature = "dummy_match_byte")]
mod codegen {
    use std::path::Path;
    pub fn main(_: &Path) {}
}

#[cfg(not(feature = "dummy_match_byte"))]
#[path = "src/macros/mod.rs"]
mod macros;

#[cfg(not(feature = "dummy_match_byte"))]
mod codegen {
    use macros;
    use std::env;
    use std::path::Path;

    pub fn main(tokenizer_rs: &Path) {
        macros::match_byte::expand(tokenizer_rs,
                                   &Path::new(&env::var("OUT_DIR").unwrap()).join("tokenizer.rs"));
    }
}

fn main() {
    let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
    let tokenizer_rs = Path::new(&manifest_dir).join("src/tokenizer.rs");
    codegen::main(&tokenizer_rs);
    println!("cargo:rerun-if-changed={}", tokenizer_rs.display());
}
18 changes: 18 additions & 0 deletions src/lib.rs
@@ -137,7 +137,25 @@ macro_rules! match_ignore_ascii_case {
}

mod rules_and_declarations;

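// With the `dummy_match_byte` feature, `match_byte!` simply forwards its
// arms to a plain `match`, skipping the build-time table generation.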
#[cfg(feature = "dummy_match_byte")]
macro_rules! match_byte {
    ($value:expr, $($rest:tt)* ) => {
        match $value {
            $(
                $rest
            )+
        }
    };
}

#[cfg(feature = "dummy_match_byte")]
mod tokenizer;

#[cfg(not(feature = "dummy_match_byte"))]
mod tokenizer {
    include!(concat!(env!("OUT_DIR"), "/tokenizer.rs"));
}
mod parser;
mod from_bytes;
mod color;
271 changes: 271 additions & 0 deletions src/macros/match_byte.rs
@@ -0,0 +1,271 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

use quote::{ToTokens, Tokens};
use std::fs::File;
use std::io::{Read, Write};
use std::path::Path;
use std::vec;
use std::iter;
use syn;

pub fn expand(from: &Path, to: &Path) {
    let mut source = String::new();
    File::open(from).unwrap().read_to_string(&mut source).unwrap();
    let tts = syn::parse_token_trees(&source).expect("Parsing tokenizer.rs module");
    let mut tokens = Tokens::new();
    tokens.append_all(expand_tts(tts));

    let code = tokens.to_string().replace("{ ", "{\n").replace(" }", "\n}");
    File::create(to).unwrap().write_all(code.as_bytes()).unwrap();
}

fn expand_tts(tts: Vec<syn::TokenTree>) -> Vec<syn::TokenTree> {
    use syn::*;
    let mut expanded = Vec::new();
    let mut tts = tts.into_iter();
    while let Some(tt) = tts.next() {
        match tt {
            TokenTree::Token(Token::Ident(ident)) => {
                if ident != "match_byte" {
                    expanded.push(TokenTree::Token(Token::Ident(ident)));
                    continue;
                }

                match tts.next() {
                    Some(TokenTree::Token(Token::Not)) => {},
                    other => {
                        expanded.push(TokenTree::Token(Token::Ident(ident)));
                        if let Some(other) = other {
                            expanded.push(other);
                        }
                        continue;
                    }
                }

                let tts = match tts.next() {
                    Some(TokenTree::Delimited(Delimited { tts, .. })) => tts,
                    other => {
                        expanded.push(TokenTree::Token(Token::Ident(ident)));
                        expanded.push(TokenTree::Token(Token::Not));
                        if let Some(other) = other {
                            expanded.push(other);
                        }
                        continue;
                    }
                };

                let (to_be_matched, table, cases, wildcard_binding) = parse_match_bytes_macro(tts);
                let expr = expand_match_bytes_macro(to_be_matched,
                                                    &table,
                                                    cases,
                                                    wildcard_binding);

                let tts = syn::parse_token_trees(&expr)
                    .expect("parsing macro expansion as token trees");
                expanded.extend(expand_tts(tts));
            }
            TokenTree::Delimited(Delimited { delim, tts }) => {
                expanded.push(TokenTree::Delimited(Delimited {
                    delim: delim,
                    tts: expand_tts(tts),
                }))
            }
            other => expanded.push(other),
        }
    }
    expanded
}

/// Parses a token tree corresponding to the `match_byte` macro.
///
/// ## Example
///
/// ```rust
/// match_byte! { tokenizer.next_byte_unchecked(),
///     b'a'...b'z' => { ... }
///     b'0'...b'9' => { ... }
///     b'\n' | b'\\' => { ... }
///     foo => { ... }
/// }
/// ```
///
/// Returns:
/// * The token tree that contains the expression to be matched (in this case
///   `tokenizer.next_byte_unchecked()`).
///
/// * The table with the different cases per byte; each entry in the table
///   contains a non-zero integer identifying one of the arms of the match
///   expression.
///
/// * The list of cases containing the expansion of the arms of the match
///   expression.
///
/// * An optional identifier to which the wildcard pattern is bound (`foo` in
///   this case).
///
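/// For the example above, the table would map `b'a'...b'z'` to 1,
/// `b'0'...b'9'` to 2, `b'\n'` and `b'\\'` to 3, and every remaining byte
/// to 4 (the wildcard arm).
///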
fn parse_match_bytes_macro(tts: Vec<syn::TokenTree>) -> (Vec<syn::TokenTree>, [u8; 256], Vec<Case>, Option<syn::Ident>) {
    let mut tts = tts.into_iter();

    // Grab the thing we're matching, until we find a comma.
    let mut left_hand_side = vec![];
    loop {
        match tts.next() {
            Some(syn::TokenTree::Token(syn::Token::Comma)) => break,
            Some(other) => left_hand_side.push(other),
            None => panic!("Expected not to run out of tokens looking for a comma"),
        }
    }

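    // Each entry of `table` ends up holding the 1-based index of the match
    // arm that the corresponding byte dispatches to; zero means "no arm yet".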
    let mut cases = vec![];
    let mut table = [0u8; 256];

    let mut tts = tts.peekable();
    let mut case_id: u8 = 1;
    let mut binding = None;
    while tts.len() > 0 {
        cases.push(parse_case(&mut tts, &mut table, &mut binding, case_id));

        // Allow an optional comma between cases.
        if let Some(&syn::TokenTree::Token(syn::Token::Comma)) = tts.peek() {
            tts.next();
        }

        case_id += 1;
    }

    (left_hand_side, table, cases, binding)
}

#[derive(Debug)]
struct Case(Vec<syn::TokenTree>);

/// Parses a single pattern => expression, and returns the case, filling in the
/// table with the case id for every byte that matched.
///
/// The `binding` parameter is the identifier that is used by the wildcard
/// pattern.
fn parse_case(tts: &mut iter::Peekable<vec::IntoIter<syn::TokenTree>>,
              table: &mut [u8; 256],
              binding: &mut Option<syn::Ident>,
              case_id: u8)
              -> Case {
    // The last byte checked, as part of this pattern, to properly detect
    // ranges.
    let mut last_byte: Option<u8> = None;

    // Loop through the pattern, filling the table with the bytes it covers.
    loop {
        match tts.next() {
            Some(syn::TokenTree::Token(syn::Token::Literal(syn::Lit::Byte(byte)))) => {
                table[byte as usize] = case_id;
                last_byte = Some(byte);
            }
            Some(syn::TokenTree::Token(syn::Token::BinOp(syn::BinOpToken::Or))) => {
                last_byte = None; // This pattern is over.
            },
            Some(syn::TokenTree::Token(syn::Token::DotDotDot)) => {
                assert!(last_byte.is_some(), "Expected closed range!");
                match tts.next() {
                    Some(syn::TokenTree::Token(syn::Token::Literal(syn::Lit::Byte(byte)))) => {
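                        // `...` patterns are inclusive, but the `..` range
                        // below is exclusive, so the end byte is written
                        // separately after the loop.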
                        for b in last_byte.take().unwrap()..byte {
                            if table[b as usize] == 0 {
                                table[b as usize] = case_id;
                            }
                        }
                        if table[byte as usize] == 0 {
                            table[byte as usize] = case_id;
                        }
                    }
                    other => panic!("Expected closed range, got: {:?}", other),
                }
            },
            Some(syn::TokenTree::Token(syn::Token::FatArrow)) => break,
            Some(syn::TokenTree::Token(syn::Token::Ident(ident))) => {
                assert_eq!(last_byte, None, "I don't support ranges with identifiers!");
                assert_eq!(*binding, None);
                for byte in table.iter_mut() {
                    if *byte == 0 {
                        *byte = case_id;
                    }
                }
                *binding = Some(ident)
            }
            Some(syn::TokenTree::Token(syn::Token::Underscore)) => {
                assert_eq!(last_byte, None);
                for byte in table.iter_mut() {
                    if *byte == 0 {
                        *byte = case_id;
                    }
                }
            },
            other => panic!("Expected literal byte, got: {:?}", other),
        }
    }

    match tts.next() {
        Some(syn::TokenTree::Delimited(syn::Delimited { delim: syn::DelimToken::Brace, tts })) => {
            Case(tts)
        }
        other => panic!("Expected case with braces after fat arrow, got: {:?}", other),
    }
}

fn expand_match_bytes_macro(to_be_matched: Vec<syn::TokenTree>,
                            table: &[u8; 256],
                            cases: Vec<Case>,
                            binding: Option<syn::Ident>)
                            -> String {
    use std::fmt::Write;

    assert!(!to_be_matched.is_empty());
    assert!(table.iter().all(|b| *b != 0), "Incomplete pattern? Bogus code!");

    // We build the expression with text since it's easier.
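    //
    // The result has roughly this shape:
    //
    //     {
    //         enum Case { Case1 = 1, Case2 = 2, /* ... */ }
    //         static __CASES: [Case; 256] = [/* one Case per byte */];
    //         match __CASES[byte as usize] { Case::Case1 => { /* arm 1 */ }, /* ... */ }
    //     }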
    let mut expr = "{\n".to_owned();
    expr.push_str("enum Case {\n");
    for (i, _) in cases.iter().enumerate() {
        write!(&mut expr, "Case{} = {},", i + 1, i + 1).unwrap();
    }
    expr.push_str("}\n"); // enum Case

    expr.push_str("static __CASES: [Case; 256] = [");
    for byte in table.iter() {
        write!(&mut expr, "Case::Case{}, ", *byte).unwrap();
    }
    expr.push_str("];\n");

    let mut tokens = Tokens::new();
    let to_be_matched = syn::Delimited {
        delim: if binding.is_some() { syn::DelimToken::Brace } else { syn::DelimToken::Paren },
        tts: to_be_matched
    };
    to_be_matched.to_tokens(&mut tokens);

    if let Some(ref binding) = binding {
        write!(&mut expr, "let {} = {};\n", binding.to_string(), tokens.as_str()).unwrap();
    }

    write!(&mut expr, "match __CASES[{} as usize] {{", match binding {
        Some(binding) => binding.to_string(),
        None => tokens.to_string(),
    }).unwrap();

    for (i, case) in cases.into_iter().enumerate() {
        let mut case_tokens = Tokens::new();
        let case = syn::Delimited {
            delim: syn::DelimToken::Brace,
            tts: case.0
        };
        case.to_tokens(&mut case_tokens);
        write!(&mut expr, "Case::Case{} => {},\n", i + 1, case_tokens.as_str()).unwrap();
    }
    expr.push_str("}\n"); // match

    expr.push_str("}\n"); // top

    expr
}
5 changes: 5 additions & 0 deletions src/macros/mod.rs
@@ -0,0 +1,5 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

pub mod match_byte;