diff --git a/src/bin/parser-test.rs b/src/bin/parser-test.rs index af63a9ea0..5e5f78da3 100755 --- a/src/bin/parser-test.rs +++ b/src/bin/parser-test.rs @@ -29,7 +29,7 @@ fn main() { tests_failed: Vec::new(), }; - let filenames = Some(&["tests2.dat"][..]); + let filenames = Some(&["plain-text-unsafe.dat"][..]); let fixtures = read_fixtures(filenames).expect("fixtures"); for fixture_file in fixtures { @@ -41,9 +41,9 @@ fn main() { let mut test_idx = 1; for test in fixture_file.tests { - // if test_idx == 57 { - run_test(test_idx, test, &mut results); - // } + if test_idx == 11 { + run_test(test_idx, test, &mut results); + } test_idx += 1; } diff --git a/src/html5/parser.rs b/src/html5/parser.rs index 3dc516f1f..43a3606e8 100644 --- a/src/html5/parser.rs +++ b/src/html5/parser.rs @@ -17,7 +17,7 @@ use crate::html5::parser::document::{Document, DocumentBuilder, DocumentFragment use crate::html5::parser::quirks::QuirksMode; use crate::html5::tokenizer::state::State; use crate::html5::tokenizer::token::Token; -use crate::html5::tokenizer::{Tokenizer, CHAR_NUL, CHAR_REPLACEMENT}; +use crate::html5::tokenizer::{ParserData, Tokenizer, CHAR_NUL, CHAR_REPLACEMENT}; use crate::types::{ParseError, Result}; use alloc::rc::Rc; use core::cell::RefCell; @@ -3752,13 +3752,27 @@ impl<'chars> Html5Parser<'chars> { } } + fn parser_data(&self) -> ParserData { + let namespace = self + .get_adjusted_current_node() + .namespace + .unwrap_or_default(); + + ParserData { + adjusted_node_namespace: namespace, + } + } + /// Fetches the next token from the tokenizer. However, if the token is a text token AND /// it starts with one or more whitespaces, the token is split into 2 tokens: the whitespace part /// and the remainder. fn fetch_next_token(&mut self) -> Token { // If there are no tokens to fetch, fetch the next token from the tokenizer if self.token_queue.is_empty() { - let token = self.tokenizer.next_token().expect("tokenizer error"); + let token = self + .tokenizer + .next_token(self.parser_data()) + .expect("tokenizer error"); if let Token::Text(value) = token { for c in value.chars() { diff --git a/src/html5/tokenizer.rs b/src/html5/tokenizer.rs index cea1f5ee5..931370f9e 100644 --- a/src/html5/tokenizer.rs +++ b/src/html5/tokenizer.rs @@ -8,6 +8,7 @@ use crate::bytes::Bytes::{self, *}; use crate::bytes::SeekMode::SeekCur; use crate::bytes::{CharIterator, Position}; use crate::html5::error_logger::{ErrorLogger, ParserError}; +use crate::html5::node::HTML_NAMESPACE; use crate::html5::tokenizer::state::State; use crate::html5::tokenizer::token::Token; use crate::types::{Error, Result}; @@ -50,6 +51,21 @@ pub struct Tokenizer<'stream> { pub error_logger: Rc>, } +/// This struct is a gateway between the parser and the tokenizer. It holds data that can be needed +/// by the tokenizer in certain cases. See https://github.com/gosub-browser/gosub-engine/issues/230 for +/// more information and how we should refactor this properly. +pub struct ParserData { + pub adjusted_node_namespace: String, +} + +impl Default for ParserData { + fn default() -> Self { + ParserData { + adjusted_node_namespace: HTML_NAMESPACE.to_string(), + } + } +} + /// Options that can be passed to the tokenizer. Mostly needed when dealing with tests. pub struct Options { /// Sets the initial state of the tokenizer. Normally only needed when dealing with tests @@ -103,8 +119,8 @@ impl<'stream> Tokenizer<'stream> { } /// Retrieves the next token from the input stream or Token::EOF when the end is reached - pub fn next_token(&mut self) -> Result { - self.consume_stream()?; + pub fn next_token(&mut self, parser_data: ParserData) -> Result { + self.consume_stream(parser_data)?; if self.token_queue.is_empty() { return Ok(Token::Eof); @@ -124,7 +140,7 @@ impl<'stream> Tokenizer<'stream> { } /// Consumes the input stream. Continues until the stream is completed or a token has been generated. - fn consume_stream(&mut self) -> Result<()> { + fn consume_stream(&mut self, parser_data: ParserData) -> Result<()> { loop { // Something is already in the token buffer, so we can return it. if !self.token_queue.is_empty() { @@ -1210,9 +1226,11 @@ impl<'stream> Tokenizer<'stream> { if self.chars.look_ahead_slice(7) == "[CDATA[" { self.chars.seek(SeekCur, 7); - // @TODO: If there is an adjusted current node and it is not an element in the HTML namespace, - // then switch to the CDATA section state. Otherwise, this is a cdata-in-html-content parse error. - // Create a comment token whose data is the "[CDATA[" string. Switch to the bogus comment state. + if parser_data.adjusted_node_namespace != HTML_NAMESPACE { + self.state = State::CDATASection; + continue; + } + self.parse_error(ParserError::CdataInHtmlContent); self.current_token = Some(Token::Comment("[CDATA[".into())); diff --git a/src/html5/tokenizer/character_reference.rs b/src/html5/tokenizer/character_reference.rs index 4eb3854a1..5d2796cb0 100644 --- a/src/html5/tokenizer/character_reference.rs +++ b/src/html5/tokenizer/character_reference.rs @@ -6,6 +6,7 @@ use crate::bytes::{ }; use crate::html5::error_logger::ParserError; use crate::html5::tokenizer::replacement_tables::{TOKEN_NAMED_CHARS, TOKEN_REPLACEMENTS}; + use crate::html5::tokenizer::{Tokenizer, CHAR_REPLACEMENT}; use lazy_static::lazy_static; @@ -350,6 +351,7 @@ lazy_static! { #[cfg(test)] mod tests { use super::*; + use crate::html5::tokenizer::ParserData; use crate::{bytes::CharIterator, html5::error_logger::ErrorLogger}; use std::cell::RefCell; use std::rc::Rc; @@ -367,7 +369,7 @@ mod tests { let error_logger = Rc::new(RefCell::new(ErrorLogger::new())); let mut tokenizer = Tokenizer::new(&mut chars, None, error_logger.clone()); - let token = tokenizer.next_token().unwrap(); + let token = tokenizer.next_token(ParserData::default()).unwrap(); assert_eq!(expected, token.to_string()); } )* diff --git a/src/testing/tokenizer.rs b/src/testing/tokenizer.rs index 252930828..557c0ab75 100644 --- a/src/testing/tokenizer.rs +++ b/src/testing/tokenizer.rs @@ -1,5 +1,6 @@ use super::FIXTURE_ROOT; use crate::bytes::CharIterator; +use crate::html5::tokenizer::ParserData; use crate::html5::{ error_logger::ErrorLogger, tokenizer::{ @@ -211,12 +212,12 @@ impl TestSpec { // If there is no output, still do an (initial) next token so the parser can generate // errors. if self.output.is_empty() { - tokenizer.next_token().unwrap(); + tokenizer.next_token(ParserData::default()).unwrap(); } // There can be multiple tokens to match. Make sure we match all of them for expected in self.output.iter() { - let actual = tokenizer.next_token().unwrap(); + let actual = tokenizer.next_token(ParserData::default()).unwrap(); assert_eq!(self.escape(&actual), self.escape(expected)); } @@ -237,7 +238,7 @@ impl TestSpec { let mut tokenizer = builder.build(); for _ in self.output.iter() { - tokenizer.next_token().unwrap(); + tokenizer.next_token(ParserData::default()).unwrap(); } } } diff --git a/tests/tree_construction.rs b/tests/tree_construction.rs index 1e0774008..24fdcfc91 100644 --- a/tests/tree_construction.rs +++ b/tests/tree_construction.rs @@ -15,7 +15,7 @@ const DISABLED_CASES: &[&str] = &[ #[test_case("tests3.dat")] #[test_case("tests4.dat")] #[test_case("tests5.dat")] -// #[test_case("tests6.dat")] +#[test_case("tests6.dat")] #[test_case("tests7.dat")] #[test_case("tests8.dat")] #[test_case("tests9.dat")] @@ -27,9 +27,9 @@ const DISABLED_CASES: &[&str] = &[ // #[test_case("tests16.dat")] #[test_case("tests17.dat")] #[test_case("tests18.dat")] -// #[test_case("tests19.dat")] +#[test_case("tests19.dat")] #[test_case("tests20.dat")] -// #[test_case("tests21.dat")] +#[test_case("tests21.dat")] #[test_case("tests22.dat")] #[test_case("tests23.dat")] #[test_case("tests24.dat")] @@ -40,11 +40,11 @@ const DISABLED_CASES: &[&str] = &[ #[test_case("blocks.dat")] #[test_case("comments01.dat")] #[test_case("doctype01.dat")] -// #[test_case("domjs-unsafe.dat")] +#[test_case("domjs-unsafe.dat")] #[test_case("entities01.dat")] #[test_case("entities02.dat")] #[test_case("foreign-fragment.dat")] -#[test_case("html5test-com.dat")] +// #[test_case("html5test-com.dat")] #[test_case("inbody01.dat")] #[test_case("isindex.dat")] #[test_case("main-element.dat")] @@ -54,7 +54,7 @@ const DISABLED_CASES: &[&str] = &[ #[test_case("noscript01.dat")] #[test_case("pending-spec-changes.dat")] #[test_case("pending-spec-changes-plain-text-unsafe.dat")] -// #[test_case("plain-text-unsafe.dat")] +#[test_case("plain-text-unsafe.dat")] #[test_case("quirks01.dat")] #[test_case("ruby.dat")] #[test_case("scriptdata01.dat")] @@ -64,7 +64,7 @@ const DISABLED_CASES: &[&str] = &[ // #[test_case("template.dat")] #[test_case("tests_innerHTML_1.dat")] #[test_case("tricky01.dat")] -// #[test_case("webkit01.dat")] +#[test_case("webkit01.dat")] #[test_case("webkit02.dat")] fn tree_construction(filename: &str) { let fixture_file =