Skip to content

Commit

Permalink
fix: parse <noscript> elements without panicking
Browse files Browse the repository at this point in the history
  • Loading branch information
max-heller committed Jan 17, 2025
1 parent c5e880c commit b7c44d0
Show file tree
Hide file tree
Showing 4 changed files with 205 additions and 49 deletions.
30 changes: 30 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2145,6 +2145,36 @@ mainfontfallback = [
"#);
}

/// Regression test: a chapter whose Markdown is wrapped in a `<noscript>` element must build
/// with the pandoc configuration instead of panicking.
///
/// The snapshot pins the expected result: the opening and closing `<noscript>` tags are emitted
/// as raw HTML blocks, and the `## No scripting enabled` heading between them is rendered as a
/// level-2 header.
#[test]
fn noscript_element() {
// Chapter content: a Markdown heading wrapped in <noscript> tags, separated by blank lines.
let output = MDBook::init()
.config(Config::pandoc())
.chapter(Chapter::new(
"",
"<noscript>\n\n## No scripting enabled\n\n</noscript>",
"chapter.md",
))
.build();
// Inline snapshot covering both the log output and the generated pandoc IR.
insta::assert_snapshot!(output, @r#"
├─ log output
│ INFO mdbook::book: Running the pandoc backend
│ INFO mdbook_pandoc::pandoc::renderer: Running pandoc
│ INFO mdbook_pandoc::pandoc::renderer: Wrote output to book/markdown/pandoc-ir
├─ markdown/pandoc-ir
│ [ RawBlock (Format "html") "<noscript>"
│ , Plain [ Str "\n" ]
│ , Header
│ 2
│ ( "book__markdown__src__chapter.md__no-scripting-enabled"
│ , [ "unnumbered" , "unlisted" ]
│ , []
│ )
│ [ Str "No scripting enabled" ]
│ , RawBlock (Format "html") "</noscript>"
│ ]
"#);
}

/// Lazily computed path to the `books` directory under the crate's `CARGO_MANIFEST_DIR`.
static BOOKS: Lazy<PathBuf> = Lazy::new(|| Path::new(env!("CARGO_MANIFEST_DIR")).join("books"));

#[test]
Expand Down
84 changes: 52 additions & 32 deletions src/preprocess.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ use std::{
str,
};

use anyhow::{anyhow, Context as _};
use anyhow::{anyhow, Context};
use ego_tree::NodeId;
use html5ever::{expanded_name, local_name, namespace_url, ns, tendril::format_tendril, LocalName};
use html5ever::{expanded_name, local_name, namespace_url, ns, tendril::format_tendril};
use log::log;
use mdbook::{
book::{BookItems, Chapter},
Expand Down Expand Up @@ -649,6 +649,11 @@ impl<'book> Preprocess<'book> {
chapter: &'book Chapter,
out: impl io::Write,
) -> anyhow::Result<()> {
if log::log_enabled!(log::Level::Trace) {
log::debug!("Preprocessing '{}':\n{}", chapter.name, chapter.content);
} else {
log::debug!("Preprocessing '{}'", chapter.name);
}
let preprocessed = PreprocessChapter::new(&mut self.preprocessor, chapter, self.part_num);
preprocessed.preprocess(out)
}
Expand Down Expand Up @@ -837,7 +842,10 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> {
fn preprocess(mut self, writer: impl io::Write) -> anyhow::Result<()> {
let mut tree = TreeBuilder::new();
while let Some((event, range)) = self.parser.next() {
self.preprocess_event(event, range, &mut tree);
self.preprocess_event(event, range.clone(), &mut tree)
.with_context(|| {
format!("failed to preprocess '{}'", &self.chapter.content[range])
})?;
}
let events = tree.finish();

Expand All @@ -849,21 +857,19 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> {
event: Event<'book>,
range: Range<usize>,
tree: &mut TreeBuilder<'book>,
) {
) -> anyhow::Result<()> {
match event {
Event::Start(tag) => {
let push_element =
|this: &mut Self, tree: &mut TreeBuilder<'book>, element: MdElement<'book>| {
let node = tree.create_element(element);
this.stack.push(node);
node
};
let push_html_element =
|this: &mut Self, tree: &mut TreeBuilder<'book>, name: LocalName| {
let node = tree.create_html_element(name);
this.stack.push(node);
node
};
let push_element = |this: &mut Self, tree: &mut TreeBuilder<'book>, element| {
let node = tree.create_element(element)?;
this.stack.push(node);
Ok::<_, anyhow::Error>(node)
};
let push_html_element = |this: &mut Self, tree: &mut TreeBuilder<'book>, name| {
let node = tree.create_html_element(name)?;
this.stack.push(node);
Ok(node)
};
match tag {
Tag::List(start_number) => {
self.preprocessor.ctx.cur_list_depth += 1;
Expand All @@ -875,9 +881,9 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> {
}
Tag::Item => push_element(self, tree, MdElement::Item),
Tag::FootnoteDefinition(label) => {
let node = push_element(self, tree, MdElement::FootnoteDefinition);
let node = push_element(self, tree, MdElement::FootnoteDefinition)?;
tree.footnote(label, node);
node
Ok(node)
}
Tag::Table(alignment) => push_element(
self,
Expand Down Expand Up @@ -964,21 +970,20 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> {
id,
},
),
Tag::HtmlBlock => return,
Tag::HtmlBlock => return Ok(()),
Tag::MetadataBlock(_) => {
log::warn!("Ignoring metadata block");
for (event, _) in &mut self.parser {
if let Event::End(TagEnd::MetadataBlock(_)) = event {
break;
}
}
// False positive
#[allow(clippy::needless_return)]
return;
return Ok(());
}
};
}?;
Ok(())
}
Event::End(TagEnd::HtmlBlock | TagEnd::MetadataBlock(_)) => {}
Event::End(TagEnd::HtmlBlock | TagEnd::MetadataBlock(_)) => Ok(()),
Event::End(end) => {
let node = self
.stack
Expand Down Expand Up @@ -1006,26 +1011,41 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> {
if let Some(html) = html {
tree.process_html(html);
}
Ok(())
}
Event::Html(html) | Event::InlineHtml(html) => {
tree.process_html(html.as_ref().into());
Ok(())
}
Event::Html(html) | Event::InlineHtml(html) => tree.process_html(html.as_ref().into()),
Event::Text(text) => {
tree.create_element(MdElement::Text(text));
tree.create_element(MdElement::Text(text))?;
tree.process_html("</span>".into());
Ok(())
}
Event::Code(code) => {
tree.create_element(MdElement::InlineCode(code));
tree.create_element(MdElement::InlineCode(code))?;
tree.process_html("</code>".into());
Ok(())
}
Event::FootnoteReference(label) => {
tree.create_element(MdElement::FootnoteReference(label));
tree.create_element(MdElement::FootnoteReference(label))?;
Ok(())
}
Event::SoftBreak => {
tree.create_element(MdElement::SoftBreak);
tree.create_element(MdElement::SoftBreak)?;
Ok(())
}
Event::HardBreak => {
tree.process_html("<br>".into());
Ok(())
}
Event::Rule => {
tree.process_html("<hr>".into());
Ok(())
}
Event::HardBreak => tree.process_html("<br>".into()),
Event::Rule => tree.process_html("<hr>".into()),
Event::TaskListMarker(checked) => {
tree.create_element(MdElement::TaskListMarker(checked));
tree.create_element(MdElement::TaskListMarker(checked))?;
Ok(())
}
}
}
Expand Down
46 changes: 29 additions & 17 deletions src/preprocess/tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ use std::{
};

use aho_corasick::AhoCorasick;
use anyhow::Context;
use ego_tree::{NodeId, NodeRef};
use html5ever::{
expanded_name, local_name, namespace_url, ns,
serialize::Serializer,
tendril::{fmt::UTF8, format_tendril, Tendril, TendrilSink},
tendril::{fmt::UTF8, format_tendril, StrTendril, Tendril, TendrilSink},
LocalName,
};
use pulldown_cmark::{CowStr, LinkType};
Expand Down Expand Up @@ -52,32 +53,43 @@ impl Tree<'_> {

impl<'book> TreeBuilder<'book> {
pub fn new() -> Self {
let html_parser = html5ever::driver::parse_fragment(
HtmlTreeSink::new(),
html5ever::ParseOpts::default(),
html::name!(html "body"),
Vec::new(),
);
let html_parser = {
let mut opts = html5ever::ParseOpts::default();
// If this is enabled (the default) then the contents of <noscript> elements get parsed
// as text, which doesn't play nice with the assumptions that the tree builder makes
// about the creation of new elements when HTML tags are parsed.
opts.tree_builder.scripting_enabled = false;
html5ever::driver::parse_fragment(
HtmlTreeSink::new(),
opts,
html::name!(html "body"),
Vec::new(),
)
};
Self {
html: html_parser,
footnotes: Default::default(),
}
}

pub fn create_element(&mut self, element: MdElement<'book>) -> NodeId {
self.html
.process(format_tendril!("<{}>", element.name().local));
fn create_element_inner(&mut self, html: StrTendril) -> anyhow::Result<NodeId> {
self.html.process(html.clone());
let sink = &self.html.tokenizer.sink.sink;
let id = sink.most_recently_created_element.take().unwrap();
let mut tree = sink.tree.borrow_mut();
sink.most_recently_created_element.take().with_context(|| {
format!("parsing HTML {html} did not result in the creation of a new element")
})
}

pub fn create_element(&mut self, element: MdElement<'book>) -> anyhow::Result<NodeId> {
let tag = format_tendril!("<{}>", element.name().local);
let id = self.create_element_inner(tag)?;
let mut tree = self.html.tokenizer.sink.sink.tree.borrow_mut();
*tree.tree.get_mut(id).unwrap().value() = Node::Element(Element::Markdown(element));
id
Ok(id)
}

pub fn create_html_element(&mut self, name: LocalName) -> NodeId {
self.html.process(format_tendril!("<{}>", name));
let sink = &self.html.tokenizer.sink.sink;
sink.most_recently_created_element.take().unwrap()
pub fn create_html_element(&mut self, name: LocalName) -> anyhow::Result<NodeId> {
self.create_element_inner(format_tendril!("<{}>", name))
}

pub fn process_html(&mut self, html: Tendril<UTF8>) {
Expand Down
Loading

0 comments on commit b7c44d0

Please sign in to comment.