From 9bf6eb34eebccae88cd999ed953a447e155cd5cf Mon Sep 17 00:00:00 2001
From: Joshua Thijssen
Date: Sun, 5 Nov 2023 12:10:56 +0100
Subject: [PATCH] Added ParserData structure to tokenizer

This ParserData structure is a gateway between the tokenizer and the
parser. In one specific case (the CDATA lookahead), the tokenizer needs
to know the state of the parser in order to generate the correct token.
The current setup structures the tokenizer and parser in such a way that
they cannot easily reference each other without borrow-checker issues.
Therefore we add this "hack", which gathers the data beforehand and
passes it to the tokenizer. This means the data is computed on every
tokenizer call instead of only when needed, but it saves a big refactor
of the tokenizer/parser.

In the future, we should probably separate the tokenizer, parser, and
tree builder/sink structures so this is no longer an issue.
---
 src/bin/parser-test.rs                     |  8 +++---
 src/html5/parser.rs                        | 18 +++++++++++--
 src/html5/tokenizer.rs                     | 30 +++++++++++++++++-----
 src/html5/tokenizer/character_reference.rs |  4 ++-
 src/testing/tokenizer.rs                   |  7 ++---
 tests/tree_construction.rs                 | 14 +++++-----
 6 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/src/bin/parser-test.rs b/src/bin/parser-test.rs
index af63a9ea0..5e5f78da3 100755
--- a/src/bin/parser-test.rs
+++ b/src/bin/parser-test.rs
@@ -29,7 +29,7 @@ fn main() {
         tests_failed: Vec::new(),
     };

-    let filenames = Some(&["tests2.dat"][..]);
+    let filenames = Some(&["plain-text-unsafe.dat"][..]);
     let fixtures = read_fixtures(filenames).expect("fixtures");

     for fixture_file in fixtures {
@@ -41,9 +41,9 @@ fn main() {

         let mut test_idx = 1;
         for test in fixture_file.tests {
-            // if test_idx == 57 {
-            run_test(test_idx, test, &mut results);
-            // }
+            if test_idx == 11 {
+                run_test(test_idx, test, &mut results);
+            }
             test_idx += 1;
         }

diff --git a/src/html5/parser.rs b/src/html5/parser.rs
index 3dc516f1f..43a3606e8 100644
--- a/src/html5/parser.rs
+++ b/src/html5/parser.rs
@@ -17,7 +17,7 @@ use crate::html5::parser::document::{Document, DocumentBuilder, DocumentFragment
 use crate::html5::parser::quirks::QuirksMode;
 use crate::html5::tokenizer::state::State;
 use crate::html5::tokenizer::token::Token;
-use crate::html5::tokenizer::{Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
+use crate::html5::tokenizer::{ParserData, Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
 use crate::types::{ParseError, Result};
 use alloc::rc::Rc;
 use core::cell::RefCell;
@@ -3752,13 +3752,27 @@ impl<'chars> Html5Parser<'chars> {
         }
     }

+    fn parser_data(&self) -> ParserData {
+        let namespace = self
+            .get_adjusted_current_node()
+            .namespace
+            .unwrap_or_default();
+
+        ParserData {
+            adjusted_node_namespace: namespace,
+        }
+    }
+
     /// Fetches the next token from the tokenizer. However, if the token is a text token AND
     /// it starts with one or more whitespaces, the token is split into 2 tokens: the whitespace part
     /// and the remainder.
     fn fetch_next_token(&mut self) -> Token {
         // If there are no tokens to fetch, fetch the next token from the tokenizer
         if self.token_queue.is_empty() {
-            let token = self.tokenizer.next_token().expect("tokenizer error");
+            let token = self
+                .tokenizer
+                .next_token(self.parser_data())
+                .expect("tokenizer error");

             if let Token::Text(value) = token {
                 for c in value.chars() {
diff --git a/src/html5/tokenizer.rs b/src/html5/tokenizer.rs
index cea1f5ee5..931370f9e 100644
--- a/src/html5/tokenizer.rs
+++ b/src/html5/tokenizer.rs
@@ -8,6 +8,7 @@ use crate::bytes::Bytes::{self, *};
 use crate::bytes::SeekMode::SeekCur;
 use crate::bytes::{CharIterator, Position};
 use crate::html5::error_logger::{ErrorLogger, ParserError};
+use crate::html5::node::HTML_NAMESPACE;
 use crate::html5::tokenizer::state::State;
 use crate::html5::tokenizer::token::Token;
 use crate::types::{Error, Result};
@@ -50,6 +51,21 @@ pub struct Tokenizer<'stream> {
     pub error_logger: Rc<RefCell<ErrorLogger>>,
 }

+/// This struct is a gateway between the parser and the tokenizer. It holds data that may be needed
+/// by the tokenizer in certain cases. See https://github.com/gosub-browser/gosub-engine/issues/230 for
+/// more information on how we should refactor this properly.
+pub struct ParserData {
+    pub adjusted_node_namespace: String,
+}
+
+impl Default for ParserData {
+    fn default() -> Self {
+        ParserData {
+            adjusted_node_namespace: HTML_NAMESPACE.to_string(),
+        }
+    }
+}
+
 /// Options that can be passed to the tokenizer. Mostly needed when dealing with tests.
 pub struct Options {
     /// Sets the initial state of the tokenizer. Normally only needed when dealing with tests
@@ -103,8 +119,8 @@ impl<'stream> Tokenizer<'stream> {
     }

     /// Retrieves the next token from the input stream or Token::EOF when the end is reached
-    pub fn next_token(&mut self) -> Result<Token> {
-        self.consume_stream()?;
+    pub fn next_token(&mut self, parser_data: ParserData) -> Result<Token> {
+        self.consume_stream(parser_data)?;

         if self.token_queue.is_empty() {
             return Ok(Token::Eof);
@@ -124,7 +140,7 @@ impl<'stream> Tokenizer<'stream> {
     }

     /// Consumes the input stream. Continues until the stream is completed or a token has been generated.
-    fn consume_stream(&mut self) -> Result<()> {
+    fn consume_stream(&mut self, parser_data: ParserData) -> Result<()> {
         loop {
             // Something is already in the token buffer, so we can return it.
             if !self.token_queue.is_empty() {
@@ -1210,9 +1226,11 @@ impl<'stream> Tokenizer<'stream> {
                     if self.chars.look_ahead_slice(7) == "[CDATA[" {
                         self.chars.seek(SeekCur, 7);

-                        // @TODO: If there is an adjusted current node and it is not an element in the HTML namespace,
-                        // then switch to the CDATA section state. Otherwise, this is a cdata-in-html-content parse error.
-                        // Create a comment token whose data is the "[CDATA[" string. Switch to the bogus comment state.
+ if parser_data.adjusted_node_namespace != HTML_NAMESPACE { + self.state = State::CDATASection; + continue; + } + self.parse_error(ParserError::CdataInHtmlContent); self.current_token = Some(Token::Comment("[CDATA[".into())); diff --git a/src/html5/tokenizer/character_reference.rs b/src/html5/tokenizer/character_reference.rs index 4eb3854a1..5d2796cb0 100644 --- a/src/html5/tokenizer/character_reference.rs +++ b/src/html5/tokenizer/character_reference.rs @@ -6,6 +6,7 @@ use crate::bytes::{ }; use crate::html5::error_logger::ParserError; use crate::html5::tokenizer::replacement_tables::{TOKEN_NAMED_CHARS, TOKEN_REPLACEMENTS}; + use crate::html5::tokenizer::{Tokenizer, CHAR_REPLACEMENT}; use lazy_static::lazy_static; @@ -350,6 +351,7 @@ lazy_static! { #[cfg(test)] mod tests { use super::*; + use crate::html5::tokenizer::ParserData; use crate::{bytes::CharIterator, html5::error_logger::ErrorLogger}; use std::cell::RefCell; use std::rc::Rc; @@ -367,7 +369,7 @@ mod tests { let error_logger = Rc::new(RefCell::new(ErrorLogger::new())); let mut tokenizer = Tokenizer::new(&mut chars, None, error_logger.clone()); - let token = tokenizer.next_token().unwrap(); + let token = tokenizer.next_token(ParserData::default()).unwrap(); assert_eq!(expected, token.to_string()); } )* diff --git a/src/testing/tokenizer.rs b/src/testing/tokenizer.rs index 252930828..557c0ab75 100644 --- a/src/testing/tokenizer.rs +++ b/src/testing/tokenizer.rs @@ -1,5 +1,6 @@ use super::FIXTURE_ROOT; use crate::bytes::CharIterator; +use crate::html5::tokenizer::ParserData; use crate::html5::{ error_logger::ErrorLogger, tokenizer::{ @@ -211,12 +212,12 @@ impl TestSpec { // If there is no output, still do an (initial) next token so the parser can generate // errors. if self.output.is_empty() { - tokenizer.next_token().unwrap(); + tokenizer.next_token(ParserData::default()).unwrap(); } // There can be multiple tokens to match. 
Make sure we match all of them for expected in self.output.iter() { - let actual = tokenizer.next_token().unwrap(); + let actual = tokenizer.next_token(ParserData::default()).unwrap(); assert_eq!(self.escape(&actual), self.escape(expected)); } @@ -237,7 +238,7 @@ impl TestSpec { let mut tokenizer = builder.build(); for _ in self.output.iter() { - tokenizer.next_token().unwrap(); + tokenizer.next_token(ParserData::default()).unwrap(); } } } diff --git a/tests/tree_construction.rs b/tests/tree_construction.rs index 1e0774008..24fdcfc91 100644 --- a/tests/tree_construction.rs +++ b/tests/tree_construction.rs @@ -15,7 +15,7 @@ const DISABLED_CASES: &[&str] = &[ #[test_case("tests3.dat")] #[test_case("tests4.dat")] #[test_case("tests5.dat")] -// #[test_case("tests6.dat")] +#[test_case("tests6.dat")] #[test_case("tests7.dat")] #[test_case("tests8.dat")] #[test_case("tests9.dat")] @@ -27,9 +27,9 @@ const DISABLED_CASES: &[&str] = &[ // #[test_case("tests16.dat")] #[test_case("tests17.dat")] #[test_case("tests18.dat")] -// #[test_case("tests19.dat")] +#[test_case("tests19.dat")] #[test_case("tests20.dat")] -// #[test_case("tests21.dat")] +#[test_case("tests21.dat")] #[test_case("tests22.dat")] #[test_case("tests23.dat")] #[test_case("tests24.dat")] @@ -40,11 +40,11 @@ const DISABLED_CASES: &[&str] = &[ #[test_case("blocks.dat")] #[test_case("comments01.dat")] #[test_case("doctype01.dat")] -// #[test_case("domjs-unsafe.dat")] +#[test_case("domjs-unsafe.dat")] #[test_case("entities01.dat")] #[test_case("entities02.dat")] #[test_case("foreign-fragment.dat")] -#[test_case("html5test-com.dat")] +// #[test_case("html5test-com.dat")] #[test_case("inbody01.dat")] #[test_case("isindex.dat")] #[test_case("main-element.dat")] @@ -54,7 +54,7 @@ const DISABLED_CASES: &[&str] = &[ #[test_case("noscript01.dat")] #[test_case("pending-spec-changes.dat")] #[test_case("pending-spec-changes-plain-text-unsafe.dat")] -// #[test_case("plain-text-unsafe.dat")] +#[test_case("plain-text-unsafe.dat")] #[test_case("quirks01.dat")] #[test_case("ruby.dat")] #[test_case("scriptdata01.dat")] @@ -64,7 +64,7 @@ const DISABLED_CASES: &[&str] = &[ // #[test_case("template.dat")] #[test_case("tests_innerHTML_1.dat")] #[test_case("tricky01.dat")] -// #[test_case("webkit01.dat")] +#[test_case("webkit01.dat")] #[test_case("webkit02.dat")] fn tree_construction(filename: &str) { let fixture_file =
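
For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of the gateway idea this patch introduces. All names below (Parser, parse_step, current_namespace, the Token variants) are simplified stand-ins, not the engine's real types: the point is only that the parser snapshots the state the tokenizer needs into a plain value and passes it by value into every next_token() call, so neither struct ever holds a reference to the other and the borrow checker stays satisfied.

const HTML_NAMESPACE: &str = "http://www.w3.org/1999/xhtml";

/// Snapshot of the parser state the tokenizer may need (mirrors ParserData above).
struct ParserData {
    adjusted_node_namespace: String,
}

#[derive(Debug)]
enum Token {
    Cdata,
    BogusComment,
    Eof,
}

struct Tokenizer {
    exhausted: bool,
}

impl Tokenizer {
    /// No back-reference to the parser: the needed state arrives by value,
    /// so the borrow checker never sees the two structs referencing each other.
    fn next_token(&mut self, parser_data: ParserData) -> Token {
        if self.exhausted {
            return Token::Eof;
        }
        self.exhausted = true;
        // Mirrors the CDATA decision in the patch: a CDATA section is only
        // valid when the adjusted current node is in a foreign (non-HTML)
        // namespace; in HTML content it is a parse error / bogus comment.
        if parser_data.adjusted_node_namespace != HTML_NAMESPACE {
            Token::Cdata
        } else {
            Token::BogusComment
        }
    }
}

struct Parser {
    tokenizer: Tokenizer,
    // Stand-in for consulting the adjusted current node on the open-elements stack.
    current_namespace: String,
}

impl Parser {
    /// Rebuilt before *every* next_token() call, exactly as the commit
    /// message describes: slightly wasteful, but it avoids a large refactor.
    fn parser_data(&self) -> ParserData {
        ParserData {
            adjusted_node_namespace: self.current_namespace.clone(),
        }
    }

    fn parse_step(&mut self) -> Token {
        let data = self.parser_data();
        self.tokenizer.next_token(data)
    }
}

fn main() {
    // Pretend the parser is inside an <svg> element (foreign content),
    // so a "<![CDATA[" lookahead should start a real CDATA section.
    let mut parser = Parser {
        tokenizer: Tokenizer { exhausted: false },
        current_namespace: "http://www.w3.org/2000/svg".to_string(),
    };
    assert!(matches!(parser.parse_step(), Token::Cdata));
    assert!(matches!(parser.parse_step(), Token::Eof));
}

The trade-off the commit message accepts is visible in parse_step(): the snapshot is rebuilt on every call even though only the CDATA path ever consumes it. Separating the tokenizer, parser, and tree builder/sink, as proposed in issue 230, would instead let the tokenizer query this state only when it actually hits a "<![CDATA[" lookahead.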