Added ParserData structure to tokenizer
This ParserData structure is a gateway between the tokenizer and the
parser. In one specific case (just one), the tokenizer needs to know the
state of the parser to generate a correct token. The current setup has
the tokenizer and parser arranged in such a way that they cannot easily
reference each other without borrow-checker issues.

Therefore we add this "hack", which gathers the data beforehand and
passes it into the tokenizer call. This means the data is computed for
every tokenizer call, instead of only when needed, but it saves a big
refactor of the tokenizer/parser.

In the future, we should probably separate the tokenizer, parser, and
tree builder/sink structures so this is no longer an issue.
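
In short: the parser snapshots the one piece of state the tokenizer needs
(the namespace of the adjusted current node) into a ParserData value and
hands that snapshot to every next_token() call. Because the snapshot is
passed by value, the tokenizer never holds a reference back into the
parser, which is what sidesteps the borrow-checker problem. Below is a
minimal, self-contained sketch of the pattern; the Parser and Tokenizer
types, the current_namespace field, and the String return type are
simplified stand-ins for the real Html5Parser, Tokenizer, and Token, not
the engine code itself:

    const HTML_NAMESPACE: &str = "http://www.w3.org/1999/xhtml";

    /// Snapshot of the parser state that the tokenizer needs.
    pub struct ParserData {
        pub adjusted_node_namespace: String,
    }

    impl Default for ParserData {
        fn default() -> Self {
            ParserData {
                adjusted_node_namespace: HTML_NAMESPACE.to_string(),
            }
        }
    }

    struct Tokenizer;

    impl Tokenizer {
        /// Takes the snapshot by value, so it needs no reference to the parser.
        fn next_token(&mut self, parser_data: ParserData) -> String {
            // The only place the snapshot is consulted: "<![CDATA[" handling.
            if parser_data.adjusted_node_namespace != HTML_NAMESPACE {
                "cdata-section".to_string()
            } else {
                "bogus-comment".to_string()
            }
        }
    }

    struct Parser {
        tokenizer: Tokenizer,
        // Stand-in for looking up the adjusted current node's namespace.
        current_namespace: String,
    }

    impl Parser {
        // Computed on every fetch, even though it is rarely needed.
        fn parser_data(&self) -> ParserData {
            ParserData {
                adjusted_node_namespace: self.current_namespace.clone(),
            }
        }

        fn fetch_next_token(&mut self) -> String {
            let data = self.parser_data();
            self.tokenizer.next_token(data)
        }
    }

    fn main() {
        let mut parser = Parser {
            tokenizer: Tokenizer,
            current_namespace: "http://www.w3.org/2000/svg".to_string(),
        };
        assert_eq!(parser.fetch_next_token(), "cdata-section");
    }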
jaytaph committed Nov 5, 2023
1 parent dca4907 commit 9bf6eb3
Showing 6 changed files with 58 additions and 23 deletions.
8 changes: 4 additions & 4 deletions src/bin/parser-test.rs
@@ -29,7 +29,7 @@ fn main() {
         tests_failed: Vec::new(),
     };
 
-    let filenames = Some(&["tests2.dat"][..]);
+    let filenames = Some(&["plain-text-unsafe.dat"][..]);
     let fixtures = read_fixtures(filenames).expect("fixtures");
 
     for fixture_file in fixtures {
@@ -41,9 +41,9 @@ fn main() {
 
         let mut test_idx = 1;
         for test in fixture_file.tests {
-            // if test_idx == 57 {
-            run_test(test_idx, test, &mut results);
-            // }
+            if test_idx == 11 {
+                run_test(test_idx, test, &mut results);
+            }
             test_idx += 1;
         }
18 changes: 16 additions & 2 deletions src/html5/parser.rs
@@ -17,7 +17,7 @@ use crate::html5::parser::document::{Document, DocumentBuilder, DocumentFragment
 use crate::html5::parser::quirks::QuirksMode;
 use crate::html5::tokenizer::state::State;
 use crate::html5::tokenizer::token::Token;
-use crate::html5::tokenizer::{Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
+use crate::html5::tokenizer::{ParserData, Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
 use crate::types::{ParseError, Result};
 use alloc::rc::Rc;
 use core::cell::RefCell;
@@ -3752,13 +3752,27 @@ impl<'chars> Html5Parser<'chars> {
         }
     }
 
+    fn parser_data(&self) -> ParserData {
+        let namespace = self
+            .get_adjusted_current_node()
+            .namespace
+            .unwrap_or_default();
+
+        ParserData {
+            adjusted_node_namespace: namespace,
+        }
+    }
+
     /// Fetches the next token from the tokenizer. However, if the token is a text token AND
     /// it starts with one or more whitespaces, the token is split into 2 tokens: the whitespace part
     /// and the remainder.
     fn fetch_next_token(&mut self) -> Token {
         // If there are no tokens to fetch, fetch the next token from the tokenizer
         if self.token_queue.is_empty() {
-            let token = self.tokenizer.next_token().expect("tokenizer error");
+            let token = self
+                .tokenizer
+                .next_token(self.parser_data())
+                .expect("tokenizer error");
 
             if let Token::Text(value) = token {
                 for c in value.chars() {
30 changes: 24 additions & 6 deletions src/html5/tokenizer.rs
@@ -8,6 +8,7 @@ use crate::bytes::Bytes::{self, *};
 use crate::bytes::SeekMode::SeekCur;
 use crate::bytes::{CharIterator, Position};
 use crate::html5::error_logger::{ErrorLogger, ParserError};
+use crate::html5::node::HTML_NAMESPACE;
 use crate::html5::tokenizer::state::State;
 use crate::html5::tokenizer::token::Token;
 use crate::types::{Error, Result};
@@ -50,6 +51,21 @@ pub struct Tokenizer<'stream> {
     pub error_logger: Rc<RefCell<ErrorLogger>>,
 }
 
+/// This struct is a gateway between the parser and the tokenizer. It holds data that the
+/// tokenizer may need in certain cases. See https://github.com/gosub-browser/gosub-engine/issues/230
+/// for more information on how this should be refactored properly.
+pub struct ParserData {
+    pub adjusted_node_namespace: String,
+}
+
+impl Default for ParserData {
+    fn default() -> Self {
+        ParserData {
+            adjusted_node_namespace: HTML_NAMESPACE.to_string(),
+        }
+    }
+}
+
 /// Options that can be passed to the tokenizer. Mostly needed when dealing with tests.
 pub struct Options {
     /// Sets the initial state of the tokenizer. Normally only needed when dealing with tests
@@ -103,8 +119,8 @@ impl<'stream> Tokenizer<'stream> {
     }
 
     /// Retrieves the next token from the input stream or Token::EOF when the end is reached
-    pub fn next_token(&mut self) -> Result<Token> {
-        self.consume_stream()?;
+    pub fn next_token(&mut self, parser_data: ParserData) -> Result<Token> {
+        self.consume_stream(parser_data)?;
 
         if self.token_queue.is_empty() {
             return Ok(Token::Eof);
@@ -124,7 +140,7 @@ impl<'stream> Tokenizer<'stream> {
     }
 
     /// Consumes the input stream. Continues until the stream is completed or a token has been generated.
-    fn consume_stream(&mut self) -> Result<()> {
+    fn consume_stream(&mut self, parser_data: ParserData) -> Result<()> {
         loop {
             // Something is already in the token buffer, so we can return it.
             if !self.token_queue.is_empty() {
@@ -1210,9 +1226,11 @@
                 if self.chars.look_ahead_slice(7) == "[CDATA[" {
                     self.chars.seek(SeekCur, 7);
 
-                    // @TODO: If there is an adjusted current node and it is not an element in the HTML namespace,
-                    // then switch to the CDATA section state. Otherwise, this is a cdata-in-html-content parse error.
-                    // Create a comment token whose data is the "[CDATA[" string. Switch to the bogus comment state.
+                    if parser_data.adjusted_node_namespace != HTML_NAMESPACE {
+                        self.state = State::CDATASection;
+                        continue;
+                    }
 
                     self.parse_error(ParserError::CdataInHtmlContent);
                     self.current_token = Some(Token::Comment("[CDATA[".into()));
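
This last hunk implements the single case the commit message refers to: per the
HTML spec, "<![CDATA[" only opens a real CDATA section when the adjusted current
node sits outside the HTML namespace, i.e. inside foreign content such as <svg>
or <math>. In plain HTML content it is a cdata-in-html-content parse error and
the text is swallowed as a bogus comment. A hypothetical illustration, not
actual fixture data:

    <svg><![CDATA[ x < y ]]></svg>   -> " x < y " is emitted as character data
    <p><![CDATA[ x < y ]]></p>       -> parse error; comment token "[CDATA[ x < y ]]"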
4 changes: 3 additions & 1 deletion src/html5/tokenizer/character_reference.rs
@@ -6,6 +6,7 @@ use crate::bytes::{
 };
 use crate::html5::error_logger::ParserError;
 use crate::html5::tokenizer::replacement_tables::{TOKEN_NAMED_CHARS, TOKEN_REPLACEMENTS};
+
 use crate::html5::tokenizer::{Tokenizer, CHAR_REPLACEMENT};
 use lazy_static::lazy_static;
 
@@ -350,6 +351,7 @@ lazy_static! {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::html5::tokenizer::ParserData;
     use crate::{bytes::CharIterator, html5::error_logger::ErrorLogger};
     use std::cell::RefCell;
     use std::rc::Rc;
@@ -367,7 +369,7 @@ mod tests {
                 let error_logger = Rc::new(RefCell::new(ErrorLogger::new()));
                 let mut tokenizer = Tokenizer::new(&mut chars, None, error_logger.clone());
 
-                let token = tokenizer.next_token().unwrap();
+                let token = tokenizer.next_token(ParserData::default()).unwrap();
                 assert_eq!(expected, token.to_string());
             }
         )*
7 changes: 4 additions & 3 deletions src/testing/tokenizer.rs
@@ -1,5 +1,6 @@
 use super::FIXTURE_ROOT;
 use crate::bytes::CharIterator;
+use crate::html5::tokenizer::ParserData;
 use crate::html5::{
     error_logger::ErrorLogger,
     tokenizer::{
@@ -211,12 +212,12 @@ impl TestSpec {
         // If there is no output, still do an (initial) next token so the parser can generate
         // errors.
         if self.output.is_empty() {
-            tokenizer.next_token().unwrap();
+            tokenizer.next_token(ParserData::default()).unwrap();
         }
 
         // There can be multiple tokens to match. Make sure we match all of them
         for expected in self.output.iter() {
-            let actual = tokenizer.next_token().unwrap();
+            let actual = tokenizer.next_token(ParserData::default()).unwrap();
             assert_eq!(self.escape(&actual), self.escape(expected));
         }
 
@@ -237,7 +238,7 @@ impl TestSpec {
         let mut tokenizer = builder.build();
 
         for _ in self.output.iter() {
-            tokenizer.next_token().unwrap();
+            tokenizer.next_token(ParserData::default()).unwrap();
         }
     }
 }
14 changes: 7 additions & 7 deletions tests/tree_construction.rs
@@ -15,7 +15,7 @@ const DISABLED_CASES: &[&str] = &[
 #[test_case("tests3.dat")]
 #[test_case("tests4.dat")]
 #[test_case("tests5.dat")]
-// #[test_case("tests6.dat")]
+#[test_case("tests6.dat")]
 #[test_case("tests7.dat")]
 #[test_case("tests8.dat")]
 #[test_case("tests9.dat")]
@@ -27,9 +27,9 @@ const DISABLED_CASES: &[&str] = &[
 // #[test_case("tests16.dat")]
 #[test_case("tests17.dat")]
 #[test_case("tests18.dat")]
-// #[test_case("tests19.dat")]
+#[test_case("tests19.dat")]
 #[test_case("tests20.dat")]
-// #[test_case("tests21.dat")]
+#[test_case("tests21.dat")]
 #[test_case("tests22.dat")]
 #[test_case("tests23.dat")]
 #[test_case("tests24.dat")]
@@ -40,11 +40,11 @@ const DISABLED_CASES: &[&str] = &[
 #[test_case("blocks.dat")]
 #[test_case("comments01.dat")]
 #[test_case("doctype01.dat")]
-// #[test_case("domjs-unsafe.dat")]
+#[test_case("domjs-unsafe.dat")]
 #[test_case("entities01.dat")]
 #[test_case("entities02.dat")]
 #[test_case("foreign-fragment.dat")]
-#[test_case("html5test-com.dat")]
+// #[test_case("html5test-com.dat")]
 #[test_case("inbody01.dat")]
 #[test_case("isindex.dat")]
 #[test_case("main-element.dat")]
@@ -54,7 +54,7 @@ const DISABLED_CASES: &[&str] = &[
 #[test_case("noscript01.dat")]
 #[test_case("pending-spec-changes.dat")]
 #[test_case("pending-spec-changes-plain-text-unsafe.dat")]
-// #[test_case("plain-text-unsafe.dat")]
+#[test_case("plain-text-unsafe.dat")]
 #[test_case("quirks01.dat")]
 #[test_case("ruby.dat")]
 #[test_case("scriptdata01.dat")]
@@ -64,7 +64,7 @@ const DISABLED_CASES: &[&str] = &[
 // #[test_case("template.dat")]
 #[test_case("tests_innerHTML_1.dat")]
 #[test_case("tricky01.dat")]
-// #[test_case("webkit01.dat")]
+#[test_case("webkit01.dat")]
 #[test_case("webkit02.dat")]
 fn tree_construction(filename: &str) {
     let fixture_file =
