Added ParserData structure to tokenizer
This ParserData structure is a gateway between the tokenizer and the
parser. In one specific case (just one), the tokenizer needs to know the
state of the parser to generate a correct token. The current setup has
the tokenizer and parser arranged in such a way that they cannot easily
reference each other without borrow-checker issues.

Therefore we add this "hack", which gathers the data beforehand and
passes it into the tokenizer call. This means the data is computed for
every tokenizer call, instead of only when needed, but it saves a big
refactor of the tokenizer/parser.

In the future, we should probably separate the tokenizer, parser, and
tree builder/sink structures so this is no longer an issue.
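
In short: the parser snapshots the one piece of state the tokenizer needs
(the namespace of the adjusted current node) into a ParserData value and
hands that snapshot to every next_token() call. Because the snapshot is
passed by value, the tokenizer never holds a reference back into the
parser, which is what sidesteps the borrow-checker problem. Below is a
minimal, self-contained sketch of the pattern; the Parser and Tokenizer
types, the current_namespace field, and the String return type are
simplified stand-ins for the real Html5Parser, Tokenizer, and Token, not
the engine code itself:

    const HTML_NAMESPACE: &str = "http://www.w3.org/1999/xhtml";

    /// Snapshot of the parser state that the tokenizer needs.
    pub struct ParserData {
        pub adjusted_node_namespace: String,
    }

    impl Default for ParserData {
        fn default() -> Self {
            ParserData {
                adjusted_node_namespace: HTML_NAMESPACE.to_string(),
            }
        }
    }

    struct Tokenizer;

    impl Tokenizer {
        /// Takes the snapshot by value, so it needs no reference to the parser.
        fn next_token(&mut self, parser_data: ParserData) -> String {
            // The only place the snapshot is consulted: "<![CDATA[" handling.
            if parser_data.adjusted_node_namespace != HTML_NAMESPACE {
                "cdata-section".to_string()
            } else {
                "bogus-comment".to_string()
            }
        }
    }

    struct Parser {
        tokenizer: Tokenizer,
        // Stand-in for looking up the adjusted current node's namespace.
        current_namespace: String,
    }

    impl Parser {
        // Computed on every fetch, even though it is rarely needed.
        fn parser_data(&self) -> ParserData {
            ParserData {
                adjusted_node_namespace: self.current_namespace.clone(),
            }
        }

        fn fetch_next_token(&mut self) -> String {
            let data = self.parser_data();
            self.tokenizer.next_token(data)
        }
    }

    fn main() {
        let mut parser = Parser {
            tokenizer: Tokenizer,
            current_namespace: "http://www.w3.org/2000/svg".to_string(),
        };
        assert_eq!(parser.fetch_next_token(), "cdata-section");
    }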
jaytaph committed Nov 5, 2023
1 parent dca4907 commit 9bf6eb3
Showing 6 changed files with 58 additions and 23 deletions.
8 changes: 4 additions & 4 deletions src/bin/parser-test.rs
@@ -29,7 +29,7 @@ fn main() {
         tests_failed: Vec::new(),
     };
 
-    let filenames = Some(&["tests2.dat"][..]);
+    let filenames = Some(&["plain-text-unsafe.dat"][..]);
     let fixtures = read_fixtures(filenames).expect("fixtures");
 
     for fixture_file in fixtures {
@@ -41,9 +41,9 @@ fn main() {
 
         let mut test_idx = 1;
         for test in fixture_file.tests {
-            // if test_idx == 57 {
-            run_test(test_idx, test, &mut results);
-            // }
+            if test_idx == 11 {
+                run_test(test_idx, test, &mut results);
+            }
             test_idx += 1;
         }
18 changes: 16 additions & 2 deletions src/html5/parser.rs
@@ -17,7 +17,7 @@ use crate::html5::parser::document::{Document, DocumentBuilder, DocumentFragment
 use crate::html5::parser::quirks::QuirksMode;
 use crate::html5::tokenizer::state::State;
 use crate::html5::tokenizer::token::Token;
-use crate::html5::tokenizer::{Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
+use crate::html5::tokenizer::{ParserData, Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
 use crate::types::{ParseError, Result};
 use alloc::rc::Rc;
 use core::cell::RefCell;
@@ -3752,13 +3752,27 @@ impl<'chars> Html5Parser<'chars> {
         }
     }
 
+    fn parser_data(&self) -> ParserData {
+        let namespace = self
+            .get_adjusted_current_node()
+            .namespace
+            .unwrap_or_default();
+
+        ParserData {
+            adjusted_node_namespace: namespace,
+        }
+    }
+
     /// Fetches the next token from the tokenizer. However, if the token is a text token AND
     /// it starts with one or more whitespaces, the token is split into 2 tokens: the whitespace part
     /// and the remainder.
     fn fetch_next_token(&mut self) -> Token {
         // If there are no tokens to fetch, fetch the next token from the tokenizer
         if self.token_queue.is_empty() {
-            let token = self.tokenizer.next_token().expect("tokenizer error");
+            let token = self
+                .tokenizer
+                .next_token(self.parser_data())
+                .expect("tokenizer error");
 
             if let Token::Text(value) = token {
                 for c in value.chars() {
30 changes: 24 additions & 6 deletions src/html5/tokenizer.rs
@@ -8,6 +8,7 @@ use crate::bytes::Bytes::{self, *};
 use crate::bytes::SeekMode::SeekCur;
 use crate::bytes::{CharIterator, Position};
 use crate::html5::error_logger::{ErrorLogger, ParserError};
+use crate::html5::node::HTML_NAMESPACE;
 use crate::html5::tokenizer::state::State;
 use crate::html5::tokenizer::token::Token;
 use crate::types::{Error, Result};
@@ -50,6 +51,21 @@ pub struct Tokenizer<'stream> {
     pub error_logger: Rc<RefCell<ErrorLogger>>,
 }
 
+/// This struct is a gateway between the parser and the tokenizer. It holds data that the
+/// tokenizer may need in certain cases. See https://github.com/gosub-browser/gosub-engine/issues/230
+/// for more information on how this should be refactored properly.
+pub struct ParserData {
+    pub adjusted_node_namespace: String,
+}
+
+impl Default for ParserData {
+    fn default() -> Self {
+        ParserData {
+            adjusted_node_namespace: HTML_NAMESPACE.to_string(),
+        }
+    }
+}
+
 /// Options that can be passed to the tokenizer. Mostly needed when dealing with tests.
 pub struct Options {
     /// Sets the initial state of the tokenizer. Normally only needed when dealing with tests
@@ -103,8 +119,8 @@ impl<'stream> Tokenizer<'stream> {
     }
 
     /// Retrieves the next token from the input stream or Token::EOF when the end is reached
-    pub fn next_token(&mut self) -> Result<Token> {
-        self.consume_stream()?;
+    pub fn next_token(&mut self, parser_data: ParserData) -> Result<Token> {
+        self.consume_stream(parser_data)?;
 
         if self.token_queue.is_empty() {
             return Ok(Token::Eof);
@@ -124,7 +140,7 @@ impl<'stream> Tokenizer<'stream> {
     }
 
     /// Consumes the input stream. Continues until the stream is completed or a token has been generated.
-    fn consume_stream(&mut self) -> Result<()> {
+    fn consume_stream(&mut self, parser_data: ParserData) -> Result<()> {
         loop {
             // Something is already in the token buffer, so we can return it.
             if !self.token_queue.is_empty() {
@@ -1210,9 +1226,11 @@
                 if self.chars.look_ahead_slice(7) == "[CDATA[" {
                     self.chars.seek(SeekCur, 7);
 
-                    // @TODO: If there is an adjusted current node and it is not an element in the HTML namespace,
-                    // then switch to the CDATA section state. Otherwise, this is a cdata-in-html-content parse error.
-                    // Create a comment token whose data is the "[CDATA[" string. Switch to the bogus comment state.
+                    if parser_data.adjusted_node_namespace != HTML_NAMESPACE {
+                        self.state = State::CDATASection;
+                        continue;
+                    }
 
                     self.parse_error(ParserError::CdataInHtmlContent);
                     self.current_token = Some(Token::Comment("[CDATA[".into()));
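
This last hunk implements the single case the commit message refers to: per the
HTML spec, "<![CDATA[" only opens a real CDATA section when the adjusted current
node sits outside the HTML namespace, i.e. inside foreign content such as <svg>
or <math>. In plain HTML content it is a cdata-in-html-content parse error and
the text is swallowed as a bogus comment. A hypothetical illustration, not
actual fixture data:

    <svg><![CDATA[ x < y ]]></svg>   -> " x < y " is emitted as character data
    <p><![CDATA[ x < y ]]></p>       -> parse error; comment token "[CDATA[ x < y ]]"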
4 changes: 3 additions & 1 deletion src/html5/tokenizer/character_reference.rs
@@ -6,6 +6,7 @@ use crate::bytes::{
 };
 use crate::html5::error_logger::ParserError;
 use crate::html5::tokenizer::replacement_tables::{TOKEN_NAMED_CHARS, TOKEN_REPLACEMENTS};
+
 use crate::html5::tokenizer::{Tokenizer, CHAR_REPLACEMENT};
 use lazy_static::lazy_static;
 
@@ -350,6 +351,7 @@ lazy_static! {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::html5::tokenizer::ParserData;
     use crate::{bytes::CharIterator, html5::error_logger::ErrorLogger};
     use std::cell::RefCell;
     use std::rc::Rc;
@@ -367,7 +369,7 @@ mod tests {
                 let error_logger = Rc::new(RefCell::new(ErrorLogger::new()));
                 let mut tokenizer = Tokenizer::new(&mut chars, None, error_logger.clone());
 
-                let token = tokenizer.next_token().unwrap();
+                let token = tokenizer.next_token(ParserData::default()).unwrap();
                 assert_eq!(expected, token.to_string());
             }
         )*
7 changes: 4 additions & 3 deletions src/testing/tokenizer.rs
@@ -1,5 +1,6 @@
 use super::FIXTURE_ROOT;
 use crate::bytes::CharIterator;
+use crate::html5::tokenizer::ParserData;
 use crate::html5::{
     error_logger::ErrorLogger,
     tokenizer::{
@@ -211,12 +212,12 @@ impl TestSpec {
         // If there is no output, still do an (initial) next token so the parser can generate
         // errors.
         if self.output.is_empty() {
-            tokenizer.next_token().unwrap();
+            tokenizer.next_token(ParserData::default()).unwrap();
         }
 
         // There can be multiple tokens to match. Make sure we match all of them
         for expected in self.output.iter() {
-            let actual = tokenizer.next_token().unwrap();
+            let actual = tokenizer.next_token(ParserData::default()).unwrap();
             assert_eq!(self.escape(&actual), self.escape(expected));
         }
 
@@ -237,7 +238,7 @@ impl TestSpec {
         let mut tokenizer = builder.build();
 
         for _ in self.output.iter() {
-            tokenizer.next_token().unwrap();
+            tokenizer.next_token(ParserData::default()).unwrap();
         }
     }
 }
14 changes: 7 additions & 7 deletions tests/tree_construction.rs
@@ -15,7 +15,7 @@ const DISABLED_CASES: &[&str] = &[
 #[test_case("tests3.dat")]
 #[test_case("tests4.dat")]
 #[test_case("tests5.dat")]
-// #[test_case("tests6.dat")]
+#[test_case("tests6.dat")]
 #[test_case("tests7.dat")]
 #[test_case("tests8.dat")]
 #[test_case("tests9.dat")]
@@ -27,9 +27,9 @@ const DISABLED_CASES: &[&str] = &[
 // #[test_case("tests16.dat")]
 #[test_case("tests17.dat")]
 #[test_case("tests18.dat")]
-// #[test_case("tests19.dat")]
+#[test_case("tests19.dat")]
 #[test_case("tests20.dat")]
-// #[test_case("tests21.dat")]
+#[test_case("tests21.dat")]
 #[test_case("tests22.dat")]
 #[test_case("tests23.dat")]
 #[test_case("tests24.dat")]
@@ -40,11 +40,11 @@ const DISABLED_CASES: &[&str] = &[
 #[test_case("blocks.dat")]
 #[test_case("comments01.dat")]
 #[test_case("doctype01.dat")]
-// #[test_case("domjs-unsafe.dat")]
+#[test_case("domjs-unsafe.dat")]
 #[test_case("entities01.dat")]
 #[test_case("entities02.dat")]
 #[test_case("foreign-fragment.dat")]
-#[test_case("html5test-com.dat")]
+// #[test_case("html5test-com.dat")]
 #[test_case("inbody01.dat")]
 #[test_case("isindex.dat")]
 #[test_case("main-element.dat")]
@@ -54,7 +54,7 @@ const DISABLED_CASES: &[&str] = &[
 #[test_case("noscript01.dat")]
 #[test_case("pending-spec-changes.dat")]
 #[test_case("pending-spec-changes-plain-text-unsafe.dat")]
-// #[test_case("plain-text-unsafe.dat")]
+#[test_case("plain-text-unsafe.dat")]
 #[test_case("quirks01.dat")]
 #[test_case("ruby.dat")]
 #[test_case("scriptdata01.dat")]
@@ -64,7 +64,7 @@ const DISABLED_CASES: &[&str] = &[
 // #[test_case("template.dat")]
 #[test_case("tests_innerHTML_1.dat")]
 #[test_case("tricky01.dat")]
-// #[test_case("webkit01.dat")]
+#[test_case("webkit01.dat")]
 #[test_case("webkit02.dat")]
 fn tree_construction(filename: &str) {
     let fixture_file =
