diff --git a/Cargo.lock b/Cargo.lock index 7b0200f..0351f8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -287,17 +287,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "derive_more" -version = "0.99.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "digest" version = "0.10.7" @@ -420,15 +409,6 @@ dependencies = [ "new_debug_unreachable", ] -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - [[package]] name = "generic-array" version = "0.14.7" @@ -439,15 +419,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "getopts" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" -dependencies = [ - "unicode-width", -] - [[package]] name = "getrandom" version = "0.2.15" @@ -818,6 +789,7 @@ dependencies = [ "ego-tree", "env_logger", "html5ever", + "indexmap", "insta", "log", "mdbook", @@ -826,7 +798,6 @@ dependencies = [ "pulldown-cmark", "regex", "replace_with", - "scraper", "semver", "serde", "serde_yaml", @@ -1276,40 +1247,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "scraper" -version = "0.22.0" -source = "git+https://github.com/rust-scraper/scraper?rev=1896e4f2c57438b5d42aade714d2b8f3019e983f#1896e4f2c57438b5d42aade714d2b8f3019e983f" -dependencies = [ - "cssparser", - "ego-tree", - "getopts", - "html5ever", - "indexmap", - "precomputed-hash", - "selectors", - "tendril", -] - -[[package]] -name = "selectors" -version = "0.26.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8" -dependencies = [ - "bitflags", - "cssparser", - "derive_more", - "fxhash", - "log", - "new_debug_unreachable", - "phf", - "phf_codegen", - "precomputed-hash", - "servo_arc", - "smallvec", -] - [[package]] name = "semver" version = "1.0.24" @@ -1370,15 +1307,6 @@ dependencies = [ "unsafe-libyaml", ] -[[package]] -name = "servo_arc" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a" -dependencies = [ - "stable_deref_trait", -] - [[package]] name = "sha2" version = "0.10.8" @@ -1705,12 +1633,6 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" -[[package]] -name = "unicode-width" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" - [[package]] name = "unsafe-libyaml" version = "0.2.11" diff --git a/Cargo.toml b/Cargo.toml index 8e5d43c..423d284 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,13 +19,13 @@ anyhow = "1.0.47" cssparser = "0.34.0" env_logger = "0.11.0" html5ever = "0.29.0" +indexmap = "2.7.0" log = "0.4.0" mdbook = { version = "0.4.35", default-features = false } normpath = "1.0.0" once_cell = "1.0.0" pulldown-cmark = { version = "0.10.0", default-features = false } regex = "1.5.5" -scraper = { git = "https://github.com/rust-scraper/scraper", rev = "1896e4f2c57438b5d42aade714d2b8f3019e983f", features = ["deterministic"] } ego-tree = "0.10.0" replace_with = "0.1.7" semver = "1.0.0" diff --git a/src/html.rs b/src/html.rs index 054390f..791e7eb 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1,42 +1,24 @@ -use std::cell::Cell; +use crate::preprocess; -use html5ever::{interface::TreeSink, 
namespace_url, ns, LocalName, QualName}; +pub type Parser<'book> = html5ever::Parser>; -pub type NodeId = ::Handle; -pub type Parser = html5ever::Parser; - -pub fn name(local: LocalName) -> QualName { - QualName::new(None, ns!(), local) -} - -/// Determines the HTML element to which child events should be appended based on the state of the parser. -pub fn most_recently_created_open_element(parser: &Parser) -> NodeId { - struct Tracer<'a> { - html: &'a scraper::Html, - prev: Cell>, - next: Cell>, - } - - impl html5ever::interface::Tracer for Tracer<'_> { - type Handle = NodeId; - - fn trace_handle(&self, handle: &Self::Handle) { - if let Some(node) = self.html.tree.get(*handle) { - if node.value().is_element() { - self.prev.swap(&self.next); - self.next.set(Some(*handle)) - } - } +#[macro_export] +macro_rules! html_name { + (html $name:tt) => {{ + use html5ever::namespace_url; + html5ever::QualName { + prefix: None, + ns: html5ever::ns!(html), + local: html5ever::local_name!($name), } - } - - let sink = &parser.tokenizer.sink; - let html = sink.sink.0.borrow(); - let tracer = Tracer { - html: &html, - prev: Default::default(), - next: Default::default(), - }; - sink.trace_handles(&tracer); - tracer.prev.into_inner().unwrap() + }}; + ($name:tt) => {{ + use html5ever::namespace_url; + html5ever::QualName { + prefix: None, + ns: html5ever::ns!(), + local: html5ever::local_name!($name), + } + }}; } +pub use crate::html_name as name; diff --git a/src/lib.rs b/src/lib.rs index 9a7b78b..8d557d8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1242,6 +1242,44 @@ some text here "##); } + #[test] + fn implicitly_closed_tags() { + let book = MDBook::init() + .config(Config::latex()) + .chapter(Chapter::new( + "", + r#" +- before +- [Box](#foo) +- after + +# Foo + "#, + "chapter.md", + )) + .build(); + insta::assert_snapshot!(book, @r##" + ├─ log output + │ INFO mdbook::book: Running the pandoc backend + │ INFO mdbook_pandoc::pandoc::renderer: Running pandoc + │ INFO 
mdbook_pandoc::pandoc::renderer: Wrote output to book/latex/output.tex + ├─ latex/output.tex + │ \begin{itemize} + │ \tightlist + │ \item + │ before + │ \item + │ \hyperref[book__latex__src__chapter.md__foo]{Box} + │ \item + │ after + │ \end{itemize} + │ + │ \chapter{Foo}\label{book__latex__src__chapter.md__foo} + ├─ latex/src/chapter.md + │ [BulletList [[Plain [Str "before"]], [Plain [Link ("", [], []) [Str "Box", RawInline (Format "html") "", RawInline (Format "html") ""] ("#foo", "")]], [Plain [Str "after"]]], Header 1 ("foo", [], []) [Str "Foo"]] + "##); + } + #[test] fn rust_reference_regression_nested_elements() { let book = MDBook::init() @@ -1411,11 +1449,9 @@ some text here text -

more **markdown** -

outside divs @@ -1447,14 +1483,8 @@ outside divs │ , Para [ Str "text" ] │ ] │ , RawBlock (Format "html") "" - │ , Plain [ Str "\n" , RawInline (Format "html") "

" ] - │ , Div - │ ( "" , [] , [] ) - │ [ Plain [ Str "\n" ] - │ , Para [ Str "more " , Strong [ Str "markdown" ] ] - │ ] - │ , RawBlock (Format "html") "

" │ , Plain [ Str "\n" ] + │ , Para [ Str "more " , Strong [ Str "markdown" ] ] │ ] │ , RawBlock (Format "html") "" │ , Plain [ Str "\n" ] diff --git a/src/pandoc/native.rs b/src/pandoc/native.rs index 225b53f..9878443 100644 --- a/src/pandoc/native.rs +++ b/src/pandoc/native.rs @@ -5,10 +5,10 @@ use std::{ use anyhow::anyhow; use escape::Escape; -use html5ever::{local_name, serialize::HtmlSerializer}; +use html5ever::serialize::HtmlSerializer; use pulldown_cmark::CowStr; -use crate::{html, preprocess::PreprocessChapter}; +use crate::preprocess::{self, PreprocessChapter}; use super::OutputFormat; @@ -68,22 +68,22 @@ where } } -impl Attributes for &scraper::node::Attributes { +impl Attributes for &preprocess::tree::Attributes { fn id(&self) -> Option<&str> { - self.get(&html::name(local_name!("id"))).map(|s| s.as_ref()) + self.id.as_deref() } fn classes(&self) -> impl Iterator { - self.get(&html::name(local_name!("class"))) - .filter(|s| !s.is_empty()) + (!self.classes.is_empty()) + .then_some(self.classes.split_ascii_whitespace()) .into_iter() - .flat_map(|s| s.split(' ')) + .flatten() } fn attrs(&self) -> impl Iterator { - self.iter() - .filter(|(k, _)| !matches!(k.local, local_name!("id") | local_name!("class"))) - .map(|(k, v)| (k.local.as_ref(), v.as_ref())) + self.rest + .iter() + .map(|(name, value)| (name.local.as_ref(), value.as_ref())) } } @@ -797,18 +797,17 @@ impl<'a, 'book, 'p, W: io::Write> SerializeBlock<'a, 'book, 'p, W> { /// Table, with attributes, caption, optional short caption, column alignments and widths /// (required), table head, table bodies, and table foot - pub fn serialize_table( + pub fn serialize_table( self, - state: &mut S, attrs: impl Attributes, cols: impl IntoIterator)>, header: ( impl Attributes, - impl FnOnce(&mut S, &mut SerializeRows<'_, 'book, 'p, W>) -> anyhow::Result<()>, + impl FnOnce(&mut SerializeRows<'_, 'book, 'p, W>) -> anyhow::Result<()>, ), body: ( impl Attributes, - impl FnOnce(&mut S, &mut SerializeRows<'_, 'book, 
'p, W>) -> anyhow::Result<()>, + impl FnOnce(&mut SerializeRows<'_, 'book, 'p, W>) -> anyhow::Result<()>, ), ) -> anyhow::Result<()> { write!(self.serializer.unescaped(), "Table ")?; @@ -846,7 +845,7 @@ impl<'a, 'book, 'p, W: io::Write> SerializeBlock<'a, 'book, 'p, W> { self.serializer.write_attributes(attrs)?; write!(self.serializer.unescaped(), " ")?; let mut serializer = SerializeList::new(self.serializer, Row)?; - rows(state, &mut serializer)?; + rows(&mut serializer)?; serializer.finish()?; write!(self.serializer.unescaped(), ")")?; } @@ -858,7 +857,7 @@ impl<'a, 'book, 'p, W: io::Write> SerializeBlock<'a, 'book, 'p, W> { self.serializer.write_attributes(attrs)?; write!(self.serializer.unescaped(), " (RowHeadColumns 0) [] ")?; let mut serializer = SerializeList::new(self.serializer, Row)?; - rows(state, &mut serializer)?; + rows(&mut serializer)?; serializer.finish()?; write!(self.serializer.unescaped(), ")]")?; } diff --git a/src/preprocess.rs b/src/preprocess.rs index 6a6a824..48834b1 100644 --- a/src/preprocess.rs +++ b/src/preprocess.rs @@ -15,6 +15,8 @@ use std::{ }; use anyhow::{anyhow, Context as _}; +use ego_tree::NodeId; +use html5ever::{expanded_name, local_name, namespace_url, ns, tendril::format_tendril, LocalName}; use log::log; use mdbook::{ book::{BookItems, Chapter}, @@ -22,14 +24,14 @@ use mdbook::{ }; use normpath::PathExt; use once_cell::sync::Lazy; -use pulldown_cmark::{CowStr, Event, HeadingLevel, LinkType}; +use pulldown_cmark::{CowStr, Event, HeadingLevel, LinkType, Tag, TagEnd}; use regex::Regex; use walkdir::WalkDir; use crate::pandoc::{self, native::ColWidth, OutputFormat, RenderContext}; -mod tree; -use tree::TreeBuilder; +pub mod tree; +use tree::{Element, MdElement, Node, QualNameExt, TreeBuilder}; pub struct Preprocessor<'book> { pub(crate) ctx: RenderContext<'book>, @@ -662,10 +664,10 @@ pub struct PreprocessChapter<'book, 'preprocessor> { chapter: &'book Chapter, part_num: usize, parser: Parser<'book>, - matching_tags: Vec, + 
stack: Vec, encountered_h1: bool, - tables: VecDeque<&'book str>, identifiers: HashMap, + in_table_head: bool, } struct Parser<'book> { @@ -738,11 +740,11 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { preprocessor, chapter, parser: Parser::new(&chapter.content), - matching_tags: Default::default(), + stack: Vec::new(), encountered_h1: false, - tables: Default::default(), identifiers: Default::default(), part_num, + in_table_head: false, } } @@ -799,10 +801,6 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { Some((level, classes)) } - pub fn pop_table(&mut self) -> Option<&'book str> { - self.tables.pop_front() - } - pub fn column_widths<'table>( &self, table: &'table str, @@ -845,28 +843,52 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { range: Range, tree: &mut TreeBuilder<'book>, ) { - use pulldown_cmark::{Event, Tag, TagEnd}; - - let event = match event { + match event { Event::Start(tag) => { - let tag = match tag { + let push_element = + |this: &mut Self, tree: &mut TreeBuilder<'book>, element: MdElement<'book>| { + let node = tree.create_element(element); + this.stack.push(node); + node + }; + let push_html_element = + |this: &mut Self, tree: &mut TreeBuilder<'book>, name: LocalName| { + let node = tree.create_html_element(name); + this.stack.push(node); + node + }; + match tag { Tag::List(start_number) => { self.preprocessor.ctx.cur_list_depth += 1; self.preprocessor.ctx.max_list_depth = cmp::max( self.preprocessor.ctx.max_list_depth, self.preprocessor.ctx.cur_list_depth, ); - Tag::List(start_number) + push_element(self, tree, 
MdElement::List(start_number)) } + Tag::Item => push_element(self, tree, MdElement::Item), Tag::FootnoteDefinition(label) => { - let bookmark = tree.bookmark(); - tree.footnote(label.clone(), bookmark); - Tag::FootnoteDefinition(label) + let node = push_element(self, tree, MdElement::FootnoteDefinition); + tree.footnote(label, node); + node + } + Tag::Table(alignment) => push_element( + self, + tree, + MdElement::Table { + alignment, + source: &self.chapter.content[range], + }, + ), + Tag::TableHead => { + self.in_table_head = true; + push_html_element(self, tree, local_name!("thead")) } - Tag::Table(alignment) => { - self.tables.push_back(&self.chapter.content[range]); - Tag::Table(alignment) + Tag::TableRow => push_html_element(self, tree, local_name!("tr")), + Tag::TableCell if self.in_table_head => { + push_html_element(self, tree, local_name!("th")) } + Tag::TableCell => push_html_element(self, tree, local_name!("td")), Tag::Heading { level, id, @@ -889,20 +911,22 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { id.into() } }); - self.update_heading(level, classes) - .map(|(level, classes)| Tag::Heading { + let element = self + .update_heading(level, classes) + .map(|(level, classes)| MdElement::Heading { level, id, classes, attrs, }) - .unwrap_or(Tag::Paragraph) + .unwrap_or(MdElement::Paragraph); + push_element(self, tree, element) } Tag::Link { link_type, dest_url, title, - id, + id: _, } => { let dest_url = self.preprocessor.normalize_link_or_leave_as_is( self.chapter, @@ -910,32 +934,93 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { dest_url, LinkContext::Link, ); - Tag::Link { + push_element(self, tree, MdElement::Link { dest_url, title }) + } + Tag::Paragraph => push_element(self, tree, MdElement::Paragraph), + Tag::BlockQuote => push_element(self, tree, MdElement::BlockQuote), + Tag::CodeBlock(kind) => push_element(self, tree, MdElement::CodeBlock(kind)), + Tag::Emphasis => push_element(self, tree, 
MdElement::Emphasis), + Tag::Strong => push_element(self, tree, MdElement::Strong), + Tag::Strikethrough => push_element(self, tree, MdElement::Strikethrough), + Tag::Image { + link_type, + dest_url, + title, + id, + } => push_element( + self, + tree, + MdElement::Image { link_type, dest_url, title, id, + }, + ), + Tag::HtmlBlock => return, + Tag::MetadataBlock(_) => { + log::warn!("Ignoring metadata block"); + for (event, _) in &mut self.parser { + if let Event::End(TagEnd::MetadataBlock(_)) = event { + break; + } } + // False positive + #[allow(clippy::needless_return)] + return; } - tag => tag, }; - self.matching_tags.push(tag.to_end()); - Event::Start(tag) } - Event::End(_) => { - let end = self.matching_tags.pop().unwrap(); - if let TagEnd::List(_) = &end { - self.preprocessor.ctx.cur_list_depth -= 1; + Event::End(TagEnd::HtmlBlock | TagEnd::MetadataBlock(_)) => {} + Event::End(end) => { + let node = self + .stack + .pop() + .unwrap_or_else(|| panic!("unmatched {end:?}")); + let html = { + let tree = tree.html.tokenizer.sink.sink.tree.borrow(); + let Node::Element(element) = tree.tree.get(node).unwrap().value() else { + unreachable!() + }; + match element { + Element::Markdown(MdElement::List(_)) => { + self.preprocessor.ctx.cur_list_depth -= 1 + } + Element::Html(element) + if element.name.expanded() == expanded_name!(html "thead") => + { + self.in_table_head = false + } + _ => {} + } + let name = element.name(); + (!name.is_void_element()).then(|| format_tendril!("", name.local)) + }; + if let Some(html) = html { + tree.process_html(html); } - Event::End(end) } - Event::Html(html) | Event::InlineHtml(html) => { - tree.process_html(html.as_ref().into()); - return; + Event::Html(html) | Event::InlineHtml(html) => tree.process_html(html.as_ref().into()), + Event::Text(text) => { + tree.create_element(MdElement::Text(text)); + tree.process_html("".into()); } - event => event, - }; - tree.generate_event(event); + Event::Code(code) => { + 
tree.create_element(MdElement::InlineCode(code)); + tree.process_html("".into()); + } + Event::FootnoteReference(label) => { + tree.create_element(MdElement::FootnoteReference(label)); + } + Event::SoftBreak => { + tree.create_element(MdElement::SoftBreak); + } + Event::HardBreak => tree.process_html("
".into()), + Event::Rule => tree.process_html("
".into()), + Event::TaskListMarker(checked) => { + tree.create_element(MdElement::TaskListMarker(checked)); + } + } } pub fn resolve_image_url<'url>( @@ -976,7 +1061,7 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { impl<'book> ChapterAnchors<'book> { /// Searches for tags in the provided chapter with identifiers that can be used as link anchors. fn new(chapter: &'book Chapter) -> anyhow::Result { - use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd}; + use pulldown_cmark::{Options, Parser}; let mut parser = Parser::new_ext(&chapter.content, Options::ENABLE_HEADING_ATTRIBUTES); let beginning = 'beginning: { let heading_id = loop { diff --git a/src/preprocess/tree.rs b/src/preprocess/tree.rs index fe425a2..8ad7101 100644 --- a/src/preprocess/tree.rs +++ b/src/preprocess/tree.rs @@ -1,308 +1,158 @@ use std::{ - collections::{BTreeMap, HashMap}, + borrow::Cow, + collections::HashMap, fmt, io::{self, Write}, - iter, slice, }; use aho_corasick::AhoCorasick; -use ego_tree::NodeRef; +use ego_tree::{NodeId, NodeRef}; use html5ever::{ - local_name, namespace_url, + expanded_name, local_name, namespace_url, ns, serialize::Serializer, - tendril::{fmt::UTF8, Tendril, TendrilSink}, - LocalName, QualName, + tendril::{fmt::UTF8, format_tendril, Tendril, TendrilSink}, + LocalName, }; -use pulldown_cmark::{CodeBlockKind, CowStr, Event as MdEvent, LinkType}; -use scraper::{node::Element, Node}; +use pulldown_cmark::{CodeBlockKind, CowStr, LinkType}; use crate::{html, latex, pandoc, preprocess::UnresolvableRemoteImage}; -pub struct TreeBuilder<'book> { - html: html::Parser, - md: BTreeMap>>, - parent: html::NodeId, - child: Option, - event_node_name: QualName, - footnotes: HashMap, Bookmark>, -} +mod node; +pub use node::{Attributes, Element, MdElement, Node, QualNameExt}; -pub struct Emitter<'book> { - html: scraper::Html, - md: BTreeMap>>, - event_node_name: QualName, - footnotes: HashMap, Bookmark>, -} +mod sink; +pub use sink::HtmlTreeSink; -pub enum 
Event<'a> { - Markdown(&'a MdEvent<'a>), - Html(NodeRef<'a, Node>), +#[derive(Debug)] +pub struct Tree<'book> { + errors: Vec>, + pub tree: ego_tree::Tree>, } -pub struct Bookmark { - node: html::NodeId, - offset: usize, +pub struct TreeBuilder<'book> { + pub html: html::Parser<'book>, + footnotes: HashMap, NodeId>, } -impl<'book> Emitter<'book> { - fn events<'a>(&'a self, node: NodeRef<'a, Node>) -> impl Iterator> + 'a - where - 'book: 'a, - { - enum Iter<'a> { - Md(slice::Iter<'a, MdEvent<'a>>), - Html(iter::Once>), - } - impl<'a> Iterator for Iter<'a> { - type Item = Event<'a>; +pub struct Emitter<'book> { + tree: Tree<'book>, + footnotes: HashMap, NodeId>, +} - fn next(&mut self) -> Option { - match self { - Self::Md(events) => events.next().map(Event::Markdown), - Self::Html(events) => events.next().map(Event::Html), - } - } - } - match node.value() { - Node::Element(element) if element.name == self.event_node_name => { - debug_assert!(!node.has_children()); - let events = self.md[&node.id()].iter(); - Iter::Md(events) - } - _ => Iter::Html(iter::once(node)), +impl Tree<'_> { + pub fn new() -> Self { + Self { + errors: Vec::new(), + tree: ego_tree::Tree::new(Node::Document), } } - - fn children<'a>(&'a self, node: NodeRef<'a, Node>) -> impl Iterator> + 'a - where - 'book: 'a, - { - node.children().flat_map(move |child| self.events(child)) - } - - fn load_bookmark(&self, bookmark: &Bookmark) -> impl Iterator> + '_ { - let Bookmark { node, offset } = bookmark; - let node = (self.html.tree.get(*node)).expect("bookmark should point to a valid node"); - self.events(node).skip(*offset).chain( - node.next_siblings() - .flat_map(|sibling| self.events(sibling)), - ) - } } impl<'book> TreeBuilder<'book> { pub fn new() -> Self { let html_parser = html5ever::driver::parse_fragment( - scraper::HtmlTreeSink::new(scraper::Html::new_fragment()), + HtmlTreeSink::new(), html5ever::ParseOpts::default(), - html5ever::QualName::new(None, html5ever::ns!(html), 
html5ever::local_name!("body")), + html::name!(html "body"), Vec::new(), ); - let parent = html::most_recently_created_open_element(&html_parser); Self { - parent, - child: None, - md: Default::default(), html: html_parser, - event_node_name: html::name(LocalName::from("mdbook-pandoc")), footnotes: Default::default(), } } - pub fn process_html(&mut self, html: Tendril) { - self.html.process(html); - self.parent = html::most_recently_created_open_element(&self.html); - self.child = None; + pub fn create_element(&mut self, element: MdElement<'book>) -> NodeId { + self.html + .process(format_tendril!("<{}>", element.name().local)); + let sink = &self.html.tokenizer.sink.sink; + let id = sink.most_recently_created_element.take().unwrap(); + let mut tree = sink.tree.borrow_mut(); + *tree.tree.get_mut(id).unwrap().value() = Node::Element(Element::Markdown(element)); + id } - fn events(&mut self) -> (html::NodeId, &mut Vec>) { - let child = *self.child.get_or_insert_with(|| { - let mut html = self.html.tokenizer.sink.sink.0.borrow_mut(); - let mut parent = html.tree.get_mut(self.parent).unwrap(); - let child = parent.append(Node::Element(Element::new( - self.event_node_name.clone(), - Vec::new(), - ))); - child.id() - }); - (child, self.md.entry(child).or_default()) + pub fn create_html_element(&mut self, name: LocalName) -> NodeId { + self.html.process(format_tendril!("<{}>", name)); + let sink = &self.html.tokenizer.sink.sink; + sink.most_recently_created_element.take().unwrap() } - pub fn bookmark(&mut self) -> Bookmark { - let (node, events) = self.events(); - Bookmark { - node, - offset: events.len(), - } - } - - pub fn generate_event(&mut self, event: MdEvent<'book>) { - let (_, events) = self.events(); - events.push(event); + pub fn process_html(&mut self, html: Tendril) { + self.html.process(html); + let sink = &self.html.tokenizer.sink.sink; + sink.most_recently_created_element.take(); } - pub fn footnote(&mut self, label: CowStr<'book>, bookmark: Bookmark) { - 
self.footnotes.insert(label, bookmark); + pub fn footnote(&mut self, label: CowStr<'book>, node: NodeId) { + self.footnotes.insert(label, node); } pub fn finish(self) -> Emitter<'book> { Emitter { - html: self.html.finish(), - md: self.md, - event_node_name: self.event_node_name, + tree: self.html.finish(), footnotes: self.footnotes, } } } impl<'book> Emitter<'book> { - pub fn serialize_events<'event>( + pub fn serialize_children<'event>( &self, - mut events: impl Iterator>, + node: NodeRef<'_, Node>, serializer: &mut pandoc::native::SerializeNested<'_, '_, 'book, '_, impl io::Write>, ) -> anyhow::Result<()> where 'book: 'event, { - while let Some(event) = events.next() { - self.serialize_event(event, &mut events, serializer)?; + for node in node.children() { + self.serialize_node(node, serializer)?; } Ok(()) } - pub fn serialize_event<'event>( - &self, - event: Event<'event>, - siblings: &mut impl Iterator>, - serializer: &mut pandoc::native::SerializeNested<'_, '_, 'book, '_, impl io::Write>, - ) -> anyhow::Result<()> - where - 'book: 'event, - { - match event { - Event::Html(node) => self.serialize_node(node, serializer), - Event::Markdown(event) => self.serialize_md_event(event, siblings, serializer), - } - } - - pub fn serialize_children<'event>( + pub fn serialize_node( &self, - tag: &pulldown_cmark::Tag<'event>, - siblings: &mut impl Iterator>, + node: NodeRef<'_, Node>, serializer: &mut pandoc::native::SerializeNested<'_, '_, 'book, '_, impl io::Write>, - ) -> anyhow::Result<()> - where - 'book: 'event, - { - let end = tag.to_end(); - while let Some(event) = siblings.next() { - match event { - Event::Markdown(MdEvent::End(tag)) if *tag == end => break, - _ => self.serialize_event(event, siblings, serializer)?, - } - } - Ok(()) - } - - pub fn skip_children<'event>( - tag: &pulldown_cmark::Tag<'event>, - siblings: &mut impl Iterator>, ) -> anyhow::Result<()> { - let end = tag.to_end(); - while let Some(event) = siblings.next() { - match event { - 
Event::Markdown(MdEvent::End(tag)) if *tag == end => break, - Event::Markdown(MdEvent::Start(tag)) => Self::skip_children(tag, siblings)?, - _ => {} + match node.value() { + Node::Document => unreachable!(), + Node::HtmlComment(comment) => { + serializer.serialize_raw_html(|serializer| serializer.write_comment(comment)) } - } - Ok(()) - } - - pub fn serialize_nested_children<'event>( - &self, - tag: &pulldown_cmark::Tag<'event>, - mut child: impl FnMut(&pulldown_cmark::Tag<'event>) -> bool, - siblings: &mut impl Iterator>, - serializer: &mut pandoc::native::SerializeList< - '_, - 'book, - '_, - impl io::Write, - pandoc::native::List, - >, - ) -> anyhow::Result<()> - where - 'book: 'event, - { - let end = tag.to_end(); - while let Some(event) = siblings.next() { - match event { - Event::Markdown(MdEvent::End(tag)) if *tag == end => break, - Event::Markdown(MdEvent::Start(tag)) if child(tag) => { - let mut blocks = serializer.serialize_element()??; - blocks.serialize_nested(|serializer| { - self.serialize_children(tag, siblings, serializer) - })?; - blocks.finish()?; + Node::HtmlText(text) => { + if matches!( + serializer.preprocessor().preprocessor.ctx.output, + pandoc::OutputFormat::HtmlLike + ) { + serializer.serialize_raw_html(|serializer| serializer.write_text(text)) + } else { + serializer.serialize_inlines(|inlines| { + inlines.serialize_element()?.serialize_str(text) + }) } - _ => anyhow::bail!("expected start of {tag:?} child, got {event:?}"), - } - } - Ok(()) - } - - pub fn serialize_md_event<'event>( - &self, - event: &MdEvent<'event>, - siblings: &mut impl Iterator>, - serializer: &mut pandoc::native::SerializeNested<'_, '_, 'book, '_, impl io::Write>, - ) -> anyhow::Result<()> - where - 'book: 'event, - { - use pulldown_cmark::{Tag, TagEnd}; - match event { - // HTML has already been parsed and stripped from the markdown events - html @ (MdEvent::Html(_) | MdEvent::InlineHtml(_)) => { - log::error!("HTML should have been filtered out of markdown events: 
{html:?}"); - Ok(()) } - MdEvent::Text(s) => serializer - .serialize_inlines(|inlines| inlines.serialize_element()?.serialize_str(s)), - MdEvent::Code(s) => serializer - .serialize_inlines(|inlines| inlines.serialize_element()?.serialize_code((), s)), - MdEvent::SoftBreak => serializer - .serialize_inlines(|inlines| inlines.serialize_element()?.serialize_soft_break()), - MdEvent::HardBreak => serializer - .serialize_inlines(|inlines| inlines.serialize_element()?.serialize_line_break()), - MdEvent::Rule => serializer - .blocks()? - .serialize_element()? - .serialize_horizontal_rule(), - MdEvent::TaskListMarker(checked) => serializer.serialize_inlines(|inlines| { - inlines - .serialize_element()? - .serialize_str_unescaped(if *checked { "\\9746" } else { "\\9744" })?; - inlines.serialize_element()?.serialize_space() - }), - MdEvent::End(TagEnd::HtmlBlock) => Ok(()), - MdEvent::End(end) => { - anyhow::bail!("end tag should have been handled by a recursive call: {end:?}") - } - MdEvent::Start(tag) => match tag { - Tag::HtmlBlock => Ok(()), - Tag::Paragraph => { + Node::Element(Element::Markdown(element)) => match element { + MdElement::Paragraph => { serializer .blocks()? .serialize_element()? 
- .serialize_para(|inlines| { - inlines.serialize_nested(|serializer| { - self.serialize_children(tag, siblings, serializer) + .serialize_para(|serializer| { + serializer.serialize_nested(|serializer| { + for node in node.children() { + self.serialize_node(node, serializer)?; + } + Ok(()) }) }) } - Tag::Heading { + MdElement::Text(text) => serializer + .serialize_inlines(|inlines| inlines.serialize_element()?.serialize_str(text)), + MdElement::SoftBreak => serializer.serialize_inlines(|inlines| { + inlines.serialize_element()?.serialize_soft_break() + }), + MdElement::Heading { level, id, classes, @@ -312,19 +162,206 @@ impl<'book> Emitter<'book> { (id.as_deref(), classes, attrs), |inlines| { inlines.serialize_nested(|serializer| { - self.serialize_children(tag, siblings, serializer) + for node in node.children() { + self.serialize_node(node, serializer)?; + } + Ok(()) }) }, ), - Tag::BlockQuote => serializer + MdElement::List(None) => serializer + .blocks()? + .serialize_element()? + .serialize_bullet_list(|items| { + for child in node.children() { + let mut item = items.serialize_element()??; + item.serialize_nested(|item| { + for node in child.children() { + self.serialize_node(node, item)?; + } + Ok(()) + })?; + item.finish()?; + } + Ok(()) + }), + MdElement::List(Some(first)) => serializer + .blocks()? + .serialize_element()? + .serialize_ordered_list(*first, |items| { + for child in node.children() { + let mut item = items.serialize_element()??; + item.serialize_nested(|item| { + for node in child.children() { + self.serialize_node(node, item)?; + } + Ok(()) + })?; + item.finish()?; + } + Ok(()) + }), + MdElement::Item => self.serialize_children(node, serializer), + MdElement::TaskListMarker(checked) => serializer.serialize_inlines(|inlines| { + inlines + .serialize_element()? 
+ .serialize_str_unescaped(if *checked { "\\9746" } else { "\\9744" })?; + inlines.serialize_element()?.serialize_space() + }), + MdElement::Link { dest_url, title } => serializer.serialize_inlines(|inlines| { + inlines.serialize_element()?.serialize_link( + (None, &[], &[]), + |alt| alt.serialize_nested(|alt| self.serialize_children(node, alt)), + dest_url, + title, + ) + }), + MdElement::Table { alignment, source } => { + let preprocessor = serializer.preprocessor(); + let column_widths = preprocessor.column_widths(source); + let mut children = node.children(); + let (head, body) = (children.next().unwrap(), children.next().unwrap()); + debug_assert!(children.next().is_none()); + + let thead = match head.value() { + Node::Element(Element::Html(element)) + if element.name.expanded() == expanded_name!(html "thead") => + { + element + } + event => anyhow::bail!("expected table head, got {event:?}"), + }; + let tbody = match body.value() { + Node::Element(Element::Html(element)) + if element.name.expanded() == expanded_name!(html "tbody") => + { + element + } + event => anyhow::bail!("expected table body, got {event:?}"), + }; + + serializer.blocks()?.serialize_element()?.serialize_table( + (), + (alignment.iter().copied().map(Into::into)).zip(column_widths), + (&thead.attrs, |serializer| { + for row in head.children() { + match row.value() { + Node::Element(Element::Html(element)) if element.name.expanded() == expanded_name!(html "tr") => { + serializer.serialize_element()?.serialize_row(&element.attrs, |cells| { + for cell in row.children() { + match cell.value() { + Node::Element(Element::Html(element)) if element.name.expanded() == expanded_name!(html "th") => { + // Emit exactly one cell per th element; all of its children belong to that single cell. + cells.serialize_element()?.serialize_cell( + &element.attrs, + |blocks| { + blocks.serialize_nested(|serializer| { + for node in cell.children() { + self.serialize_node(node, serializer)?; + } + Ok(()) + }) + }, + )?; + } + event => { + anyhow::bail!("expected table cell, got {event:?}") + } + } + } + 
Ok(()) + })? + } + event => anyhow::bail!("expected table row, got {event:?}"), + } + } + Ok(()) + }), + (&tbody.attrs, |serializer| { + for row in body.children() { + match row.value() { + Node::Element(Element::Html(element)) + if element.name.expanded() == expanded_name!(html "tr") => + { + serializer.serialize_element()?.serialize_row( + &element.attrs, + |cells| { + for cell in row.children() { + match cell.value() { + Node::Element(Element::Html(element)) + if element.name.expanded() + == expanded_name!(html "td") => + { + cells + .serialize_element()? + .serialize_cell(&element.attrs, |blocks| { + blocks.serialize_nested( + |serializer| { + for node in + cell.children() + { + self.serialize_node( + node, serializer, + )?; + } + Ok(()) + }, + ) + })? + } + event => { + anyhow::bail!( + "expected table data (), got {event:?}" + ) + } + } + } + Ok(()) + }, + )? + } + event => anyhow::bail!("expected table row, got {event:?}"), + } + } + Ok(()) + }), + ) + } + MdElement::FootnoteDefinition => Ok(()), + MdElement::FootnoteReference(label) => match self.footnotes.get(label) { + None => { + log::warn!("Undefined footnote: {label}"); + Ok(()) + } + Some(definition) => serializer.serialize_inlines(|serializer| { + serializer + .serialize_element()? + .serialize_note(|serializer| { + serializer.serialize_nested(|serializer| { + for node in self.tree.tree.get(*definition).unwrap().children() + { + self.serialize_node(node, serializer)?; + } + Ok(()) + }) + }) + }), + }, + MdElement::BlockQuote => serializer .blocks()? .serialize_element()? 
.serialize_block_quote(|blocks| { blocks.serialize_nested(|serializer| { - self.serialize_children(tag, siblings, serializer) + for node in node.children() { + self.serialize_node(node, serializer)?; + } + Ok(()) }) }), - Tag::CodeBlock(kind) => { + MdElement::InlineCode(s) => serializer.serialize_inlines(|inlines| { + inlines.serialize_element()?.serialize_code((), s) + }), + MdElement::CodeBlock(kind) => { // MdBook supports custom attributes in code block info strings. // Attributes are separated by a comma, space, or tab from the language name. // See https://rust-lang.github.io/mdBook/format/mdbook.html#rust-code-block-attributes @@ -334,44 +371,57 @@ impl<'book> Emitter<'book> { CodeBlockKind::Indented => "", CodeBlockKind::Fenced(info_string) => info_string, }; - let mut parts = - info_string.split([',', ' ', '\t']).map(|part| part.trim()); + let mut parts = info_string.split([',', ' ', '\t']).map(|part| part.trim()); (parts.next(), parts) }; // https://rust-lang.github.io/mdBook/format/mdbook.html?highlight=hide#hiding-code-lines - let hide_lines = !serializer.preprocessor().preprocessor.ctx.code.show_hidden_lines; - let hidden_line_prefix = hide_lines.then(|| { - let hidelines_override = - attributes.find_map(|attr| attr.strip_prefix("hidelines=")); - hidelines_override.or_else(|| { - let lang = language?; - // Respect [output.html.code.hidelines] - let html = serializer.preprocessor().preprocessor.ctx.html; - html.and_then(|html| Some(html.code.hidelines.get(lang)?.as_str())) - .or((lang == "rust").then_some("#")) + let hide_lines = !serializer + .preprocessor() + .preprocessor + .ctx + .code + .show_hidden_lines; + let hidden_line_prefix = hide_lines + .then(|| { + let hidelines_override = + attributes.find_map(|attr| attr.strip_prefix("hidelines=")); + hidelines_override.or_else(|| { + let lang = language?; + // Respect [output.html.code.hidelines] + let html = serializer.preprocessor().preprocessor.ctx.html; + html.and_then(|html| 
Some(html.code.hidelines.get(lang)?.as_str())) + .or((lang == "rust").then_some("#")) + }) }) - }).flatten(); + .flatten(); - let texts = iter::from_fn(|| match siblings.next() { - Some(Event::Markdown(MdEvent::Text(text))) => Some(text), - Some(Event::Markdown(MdEvent::End(TagEnd::CodeBlock))) => None, - event => panic!("Code blocks should contain only literal text, but encountered {event:?}"), + let texts = node.children().map(|node| { + match node.value() { + Node::Element(Element::Markdown(MdElement::Text(text))) => text, + event => panic!("Code blocks should contain only literal text, but encountered {event:?}"), + } }); - let lines = texts.flat_map(|text| text.lines()).filter(|line| { - hidden_line_prefix.map_or(true, |prefix| !line.trim_start().starts_with(prefix)) - }).collect::>(); + let lines = texts + .flat_map(|text| text.lines()) + .filter(|line| { + hidden_line_prefix + .map_or(true, |prefix| !line.trim_start().starts_with(prefix)) + }) + .collect::>(); // Pandoc+fvextra only wraps long lines in code blocks with info strings // so fall back to "text" let language = language.unwrap_or("text"); - if let pandoc::OutputFormat::Latex { .. } = serializer.preprocessor().preprocessor.ctx.output { + if let pandoc::OutputFormat::Latex { .. 
} = + serializer.preprocessor().preprocessor.ctx.output + { const CODE_BLOCK_LINE_LENGTH_LIMIT: usize = 1000; - let overly_long_line = lines.iter().any(|line| { - line.len() > CODE_BLOCK_LINE_LENGTH_LIMIT - }); + let overly_long_line = lines + .iter() + .any(|line| line.len() > CODE_BLOCK_LINE_LENGTH_LIMIT); if overly_long_line { let lines = { let patterns = &[r"\", "{", "}", "$", "_", "^", "&", "]"]; @@ -386,18 +436,21 @@ impl<'book> Emitter<'book> { r"{{]}}", ]; let ac = AhoCorasick::new(patterns).unwrap(); - lines.into_iter().map(move |line| { - ac.replace_all(line, replace_with) - }) + lines + .into_iter() + .map(move |line| ac.replace_all(line, replace_with)) }; - return serializer.blocks()?.serialize_element()?.serialize_raw_block("latex", |raw| { - for line in lines { - raw.serialize_code(r"\texttt{{")?; - raw.serialize_code(&line)?; - raw.serialize_code(r"}}\\")?; - } - Ok(()) - }) + return serializer + .blocks()? + .serialize_element()? + .serialize_raw_block("latex", |raw| { + for line in lines { + raw.serialize_code(r"\texttt{{")?; + raw.serialize_code(&line)?; + raw.serialize_code(r"}}\\")?; + } + Ok(()) + }); } } @@ -413,222 +466,76 @@ impl<'book> Emitter<'book> { Ok(()) }) } - Tag::List(None) => serializer - .blocks()? - .serialize_element()? - .serialize_bullet_list(|items| { - self.serialize_nested_children( - tag, - |tag| matches!(tag, Tag::Item), - siblings, - items, - ) - }), - Tag::List(Some(first)) => serializer - .blocks()? - .serialize_element()? 
- .serialize_ordered_list(*first, |items| { - self.serialize_nested_children( - tag, - |tag| matches!(tag, Tag::Item), - siblings, - items, - ) - }), - Tag::Item => anyhow::bail!("list items should have been processed already"), - Tag::FootnoteDefinition(_) => Self::skip_children(tag, siblings), - Tag::Table(alignment) => { - let preprocessor = serializer.preprocessor(); - let table = preprocessor.pop_table().unwrap(); - let column_widths = preprocessor.column_widths(table); - serializer.blocks()?.serialize_element()?.serialize_table( - siblings, - (), - (alignment.iter().copied().map(Into::into)).zip(column_widths), - ((), |siblings, header| match siblings.next() { - Some(Event::Markdown(MdEvent::Start(Tag::TableHead))) => { - header.serialize_element()?.serialize_row((), |cells| loop { - match siblings.next() { - Some(Event::Markdown(MdEvent::End(TagEnd::TableHead))) => { - break Ok(()) - } - Some(Event::Markdown(MdEvent::Start( - cell @ Tag::TableCell, - ))) => cells.serialize_element()?.serialize_cell( - (), - |blocks| { - blocks.serialize_nested(|serializer| { - self.serialize_children( - cell, siblings, serializer, - ) - }) - }, - )?, - event => anyhow::bail!("expected table cell, got {event:?}"), - } - }) - } - event => anyhow::bail!("expected table head, got {event:?}"), - }), - ((), |siblings, body| loop { - match siblings.next() { - Some(Event::Markdown(MdEvent::End(TagEnd::Table))) => break Ok(()), - Some(Event::Markdown(MdEvent::Start(Tag::TableRow))) => { - body.serialize_element()?.serialize_row((), |cells| loop { - match siblings.next() { - Some(Event::Markdown(MdEvent::End( - TagEnd::TableRow, - ))) => break Ok(()), - Some(Event::Markdown(MdEvent::Start( - cell @ Tag::TableCell, - ))) => cells.serialize_element()?.serialize_cell( - (), - |blocks| { - blocks.serialize_nested(|serializer| { - self.serialize_children( - cell, siblings, serializer, - ) - }) - }, - )?, - event => anyhow::bail!("expected table cell, got {event:?}"), - } - })? 
- } - event => anyhow::bail!("expected table row, got {event:?}"), - } - }), - ) - } - Tag::TableHead | Tag::TableRow | Tag::TableCell => anyhow::bail!("table contents should have been processed already"), - Tag::Emphasis => serializer.serialize_inlines(|inlines| { + MdElement::Emphasis => serializer.serialize_inlines(|inlines| { inlines.serialize_element()?.serialize_emph(|inlines| { inlines.serialize_nested(|serializer| { - self.serialize_children(tag, siblings, serializer) + self.serialize_children(node, serializer) }) }) }), - Tag::Strong => serializer.serialize_inlines(|inlines| { + MdElement::Strong => serializer.serialize_inlines(|inlines| { inlines.serialize_element()?.serialize_strong(|inlines| { inlines.serialize_nested(|serializer| { - self.serialize_children(tag, siblings, serializer) + self.serialize_children(node, serializer) }) }) }), - Tag::Strikethrough => serializer.serialize_inlines(|inlines| { + MdElement::Strikethrough => serializer.serialize_inlines(|inlines| { inlines.serialize_element()?.serialize_strikeout(|inlines| { inlines.serialize_nested(|serializer| { - self.serialize_children(tag, siblings, serializer) + self.serialize_children(node, serializer) }) }) }), - Tag::Link { - link_type: _, - dest_url, - title, - id: _, - } => serializer.serialize_inlines(|inlines| { - inlines.serialize_element()?.serialize_link( - (None, &[], &[]), - |alt| { - alt.serialize_nested(|alt| self.serialize_children(tag, siblings, alt)) - }, - dest_url, - title, - ) - }), - Tag::Image { + MdElement::Image { link_type, dest_url, title, id, - } => { - serializer.serialize_inlines(|inlines| { - match inlines.serializer.preprocessor.resolve_image_url(dest_url.as_ref().into(), *link_type) { - Err(UnresolvableRemoteImage) => { - inlines.serialize_nested(|inlines| self.serialize_children(tag, siblings, inlines)) - }, - Ok(dest_url) => { - inlines.serialize_element()?.serialize_image( - (Some(id.as_ref()), &[], &[]), - |alt| alt.serialize_nested(|alt| 
self.serialize_children(tag, siblings, alt)), - &dest_url, - title, - ) - } - } - }) - }, - Tag::MetadataBlock(_kind) => { - log::warn!("Ignoring metadata block"); - Ok(()) - } - }, - MdEvent::FootnoteReference(label) => match self.footnotes.get(label) { - None => { - log::warn!("Undefined footnote reference: {label}"); - Ok(()) - } - Some(bookmark) => serializer.serialize_inlines(|serializer| { - serializer - .serialize_element()? - .serialize_note(|serializer| { - serializer.serialize_nested(|serializer| { - let mut events = self.load_bookmark(bookmark); - match events.next() { - Some(Event::Markdown(MdEvent::Start(tag @ Tag::FootnoteDefinition(l)))) => { - debug_assert_eq!(l, label); - self.serialize_children(tag, &mut events, serializer) - } - event => { - log::warn!("Failed to look up footnote definition: found {event:?} instead"); - Ok(()) - } - } - }) - }) + } => serializer.serialize_inlines(|inlines| { + match inlines + .serializer + .preprocessor + .resolve_image_url(dest_url.as_ref().into(), *link_type) + { + Err(UnresolvableRemoteImage) => inlines + .serialize_nested(|inlines| self.serialize_children(node, inlines)), + Ok(dest_url) => inlines.serialize_element()?.serialize_image( + (Some(id.as_ref()), &[], &[]), + |alt| alt.serialize_nested(|alt| self.serialize_children(node, alt)), + &dest_url, + title, + ), + } }), }, - } - } - - pub fn serialize_node( - &self, - node: NodeRef<'_, Node>, - serializer: &mut pandoc::native::SerializeNested<'_, '_, 'book, '_, impl io::Write>, - ) -> anyhow::Result<()> { - match node.value() { - Node::Document | Node::Fragment | Node::Doctype(_) | Node::ProcessingInstruction(_) => { - Ok(()) - } - Node::Comment(comment) => { - serializer.serialize_raw_html(|serializer| serializer.write_comment(comment)) - } - Node::Text(text) => { - if matches!( - serializer.preprocessor().preprocessor.ctx.output, - pandoc::OutputFormat::HtmlLike - ) { - serializer.serialize_raw_html(|serializer| serializer.write_text(text)) - } else { - 
serializer.serialize_inlines(|inlines| { - inlines.serialize_element()?.serialize_str(text) - }) - } - } - Node::Element(element) => { - debug_assert_ne!(element.name, self.event_node_name); + Node::Element(Element::Html(element)) => { match element.name.local { + local_name!("thead") + | local_name!("th") + | local_name!("tr") + | local_name!("td") => return self.serialize_children(node, serializer), + local_name!("br") => { + return serializer.serialize_inlines(|inlines| { + inlines.serialize_element()?.serialize_line_break() + }) + } + local_name!("hr") => { + return serializer + .blocks()? + .serialize_element()? + .serialize_horizontal_rule() + } local_name!("a") => { - let [href, title] = [local_name!("href"), local_name!("title")] - .map(|attr| element.attrs.get(&html::name(attr))); + let [href, title] = [html::name!("href"), html::name!("title")] + .map(|attr| element.attrs.rest.get(&attr)); return serializer.serialize_inlines(|inlines| { if let Some(href) = href { inlines.serialize_element()?.serialize_link( &element.attrs, |alt| { alt.serialize_nested(|alt| { - self.serialize_events(self.children(node), alt) + self.serialize_children(node, alt) }) }, href, @@ -639,7 +546,7 @@ impl<'book> Emitter<'book> { &element.attrs, |inlines| { inlines.serialize_nested(|serializer| { - self.serialize_events(self.children(node), serializer) + self.serialize_children(node, serializer) }) }, ) @@ -652,7 +559,7 @@ impl<'book> Emitter<'book> { .serialize_element()? 
.serialize_span(&element.attrs, |inlines| { inlines.serialize_nested(|serializer| { - self.serialize_events(self.children(node), serializer) + self.serialize_children(node, serializer) }) }) }) @@ -662,7 +569,7 @@ impl<'book> Emitter<'book> { &element.attrs, |blocks| { blocks.serialize_nested(|serializer| { - self.serialize_events(self.children(node), serializer) + self.serialize_children(node, serializer) }) }, ); @@ -670,8 +577,8 @@ impl<'book> Emitter<'book> { local_name!("img") => { let mut attrs = element.attrs.clone(); let [src, alt, title] = - [local_name!("src"), local_name!("alt"), local_name!("title")] - .map(|attr| attrs.swap_remove(&html::name(attr))); + [html::name!("src"), html::name!("alt"), html::name!("title")] + .map(|attr| attrs.rest.swap_remove(&attr)); let Some(src) = src else { return Ok(()) }; return match serializer .preprocessor() @@ -699,31 +606,23 @@ impl<'book> Emitter<'book> { }; } local_name!("i") => { - let mut attrs = element.attrs.iter(); - match attrs.next() { - Some((attr, val)) - if matches!(attr.local, local_name!("class")) - && attrs.next().is_none() => - { - if let Some(icon) = val.strip_prefix("fa fa-") { - let ctx = &mut serializer.preprocessor().preprocessor.ctx; - if let pandoc::OutputFormat::Latex { packages } = - &mut ctx.output - { - if !node.has_children() { - packages.need(latex::Package::FontAwesome); - return serializer.serialize_inlines(|inlines| { - inlines - .serialize_element()? - .serialize_raw_inline("latex", |raw| { - write!(raw, r"\faicon{{{icon}}}") - }) - }); - } + let Attributes { id, classes, rest } = &element.attrs; + if id.is_none() && rest.is_empty() { + if let Some(icon) = classes.strip_prefix("fa fa-") { + let ctx = &mut serializer.preprocessor().preprocessor.ctx; + if let pandoc::OutputFormat::Latex { packages } = &mut ctx.output { + if !node.has_children() { + packages.need(latex::Package::FontAwesome); + return serializer.serialize_inlines(|inlines| { + inlines + .serialize_element()? 
+ .serialize_raw_inline("latex", |raw| { + write!(raw, r"\faicon{{{icon}}}") + }) + }); } } } - _ => {} } } _ => {} @@ -742,30 +641,32 @@ impl<'book> Emitter<'book> { serializer.preprocessor().preprocessor.ctx.output, pandoc::OutputFormat::HtmlLike )) - .then(|| element.attrs.get(&html::name(local_name!("id")))) + .then_some(element.attrs.id.as_ref()) .flatten() .map(|s| s.as_ref()); - let attrs = (id, &[], &[]); - match serializer.blocks() { - Ok(serializer) => { - serializer - .serialize_element()? - .serialize_div(attrs, |serializer| { - serializer.serialize_nested(|serializer| { - self.serialize_events(self.children(node), serializer) + if node.has_children() || id.is_some() { + let attrs = (id, &[], &[]); + match serializer.blocks() { + Ok(serializer) => { + serializer + .serialize_element()? + .serialize_div(attrs, |serializer| { + serializer.serialize_nested(|serializer| { + self.serialize_children(node, serializer) + }) }) - }) - } - Err(_) => serializer.serialize_inlines(|serializer| { - serializer - .serialize_element()? - .serialize_span(attrs, |serializer| { - serializer.serialize_nested(|serializer| { - self.serialize_events(self.children(node), serializer) + } + Err(_) => serializer.serialize_inlines(|serializer| { + serializer + .serialize_element()? 
+ .serialize_span(attrs, |serializer| { + serializer.serialize_nested(|serializer| { + self.serialize_children(node, serializer) + }) }) - }) - }), - }?; + }), + }?; + } serializer .serialize_raw_html(|serializer| serializer.end_elem(element.name.clone())) } @@ -788,61 +689,46 @@ impl<'book> Emitter<'book> { } } - serializer.serialize_nested(|serializer| { - self.serialize_events(self.children(*self.html.root_element()), serializer) - }) - } -} - -impl fmt::Debug for Event<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Markdown(event) => write!(f, "{event:?}"), - Self::Html(event) => event.value().fmt(f), - } + let root = self.tree.tree.root().first_child().unwrap(); + serializer.serialize_nested(|serializer| self.serialize_children(root, serializer)) } } struct DebugChildren<'event> { tree: &'event Emitter<'event>, - parent: NodeRef<'event, Node>, + parent: NodeRef<'event, Node<'event>>, } -struct DebugEventAndDescendants<'event> { +struct DebugNodeAndDescendants<'event> { tree: &'event Emitter<'event>, - event: Event<'event>, + node: NodeRef<'event, Node<'event>>, } impl fmt::Debug for DebugChildren<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut f = f.debug_list(); - for event in self.tree.children(self.parent) { - f.entry(&DebugEventAndDescendants { + for child in self.parent.children() { + f.entry(&DebugNodeAndDescendants { tree: self.tree, - event, + node: child, }); } f.finish() } } -impl fmt::Debug for DebugEventAndDescendants<'_> { +impl fmt::Debug for DebugNodeAndDescendants<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.event.fmt(f)?; - match self.event { - Event::Markdown(_) => Ok(()), - Event::Html(node) => { - if node.has_children() { - write!(f, " => ")?; - DebugChildren { - tree: self.tree, - parent: node, - } - .fmt(f)?; - } - Ok(()) + self.node.value().fmt(f)?; + if self.node.has_children() { + write!(f, " => ")?; + DebugChildren { + tree: self.tree, + 
parent: self.node, } + .fmt(f)?; } + Ok(()) } } @@ -850,7 +736,7 @@ impl fmt::Debug for Emitter<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { DebugChildren { tree: self, - parent: *self.html.root_element(), + parent: self.tree.tree.root(), } .fmt(f) } diff --git a/src/preprocess/tree/node.rs b/src/preprocess/tree/node.rs new file mode 100644 index 0000000..89b5b27 --- /dev/null +++ b/src/preprocess/tree/node.rs @@ -0,0 +1,289 @@ +use std::fmt; + +use html5ever::{local_name, namespace_url, ns, tendril::StrTendril, Attribute, QualName}; +use indexmap::IndexMap; +use pulldown_cmark::{Alignment, CodeBlockKind, CowStr, HeadingLevel, LinkType}; + +use crate::html; + +/// A node in the tree. +pub enum Node<'book> { + /// The document root. + Document, + + /// An HTML comment. + HtmlComment(StrTendril), + + /// Text in raw HTML. + HtmlText(StrTendril), + + /// An element. + Element(Element<'book>), +} + +#[derive(Clone)] +pub struct Attributes { + pub id: Option, + pub classes: StrTendril, + pub rest: IndexMap, +} + +pub enum Element<'book> { + Html(HtmlElement), + Markdown(MdElement<'book>), +} + +/// An HTML element. +pub struct HtmlElement { + /// The element name. + pub name: QualName, + /// The element attributes. 
+ pub attrs: Attributes, +} + +#[derive(Debug)] +pub enum MdElement<'a> { + Paragraph, + Text(CowStr<'a>), + SoftBreak, + Heading { + level: HeadingLevel, + id: Option>, + classes: Vec>, + attrs: Vec<(CowStr<'a>, Option>)>, + }, + BlockQuote, + InlineCode(CowStr<'a>), + CodeBlock(CodeBlockKind<'a>), + List(Option), + Item, + TaskListMarker(bool), + FootnoteDefinition, + FootnoteReference(CowStr<'a>), + Table { + alignment: Vec, + source: &'a str, + }, + Emphasis, + Strong, + Strikethrough, + Link { + dest_url: CowStr<'a>, + title: CowStr<'a>, + }, + Image { + link_type: LinkType, + dest_url: CowStr<'a>, + title: CowStr<'a>, + id: CowStr<'a>, + }, +} + +pub trait QualNameExt { + /// Is this the name of a [void element](https://developer.mozilla.org/en-US/docs/Glossary/Void_element)? + fn is_void_element(&self) -> bool; +} + +impl QualNameExt for QualName { + fn is_void_element(&self) -> bool { + self.ns == ns!(html) + && matches!( + self.local, + local_name!("area") + | local_name!("base") + | local_name!("basefont") + | local_name!("bgsound") + | local_name!("br") + | local_name!("col") + | local_name!("embed") + | local_name!("frame") + | local_name!("hr") + | local_name!("img") + | local_name!("input") + | local_name!("keygen") + | local_name!("link") + | local_name!("meta") + | local_name!("param") + | local_name!("source") + | local_name!("track") + | local_name!("wbr") + ) + } +} + +impl Element<'_> { + pub fn name(&self) -> &QualName { + match self { + Self::Html(element) => &element.name, + Self::Markdown(element) => element.name(), + } + } +} + +impl MdElement<'_> { + pub fn name(&self) -> &QualName { + match self { + MdElement::Paragraph => { + const P: &QualName = &html::name!(html "p"); + P + } + MdElement::Text(_) => { + const SPAN: &QualName = &html::name!(html "span"); + SPAN + } + MdElement::SoftBreak => { + const BR: &QualName = &html::name!(html "br"); + BR + } + MdElement::List(None) => { + const UL: &QualName = &html::name!(html "ul"); + UL + } + 
MdElement::List(Some(_)) => { + const OL: &QualName = &html::name!(html "ol"); + OL + } + MdElement::Item => { + const LI: &QualName = &html::name!(html "li"); + LI + } + MdElement::Table { .. } => { + const TABLE: &QualName = &html::name!(html "table"); + TABLE + } + MdElement::Link { .. } => { + const A: &QualName = &html::name!(html "a"); + A + } + MdElement::FootnoteDefinition => { + // Pretend footnote definitions are s to fit them + // into the HTML parser's view of the world. + const SPAN: &QualName = &html::name!(html "span"); + SPAN + } + MdElement::FootnoteReference(_) => { + const SUP: &QualName = &html::name!(html "sup"); + SUP + } + MdElement::Heading { level, .. } => { + const H1: &QualName = &html::name!(html "h1"); + const H2: &QualName = &html::name!(html "h2"); + const H3: &QualName = &html::name!(html "h3"); + const H4: &QualName = &html::name!(html "h4"); + const H5: &QualName = &html::name!(html "h5"); + const H6: &QualName = &html::name!(html "h6"); + match level { + HeadingLevel::H1 => H1, + HeadingLevel::H2 => H2, + HeadingLevel::H3 => H3, + HeadingLevel::H4 => H4, + HeadingLevel::H5 => H5, + HeadingLevel::H6 => H6, + } + } + MdElement::BlockQuote => { + const BLOCKQUOTE: &QualName = &html::name!(html "blockquote"); + BLOCKQUOTE + } + MdElement::CodeBlock(_) => { + const PRE: &QualName = &html::name!(html "pre"); + PRE + } + MdElement::Emphasis => { + const EM: &QualName = &html::name!(html "em"); + EM + } + MdElement::Strong => { + const STRONG: &QualName = &html::name!(html "strong"); + STRONG + } + MdElement::Strikethrough => { + const S: &QualName = &html::name!(html "s"); + S + } + MdElement::Image { .. } => { + // is a void element in HTML (can have no children), + // but in Markdown the "alt text" *can* contain children. + // Therefore, we pretend images are s so the parser + // lets us add children. 
+ const SPAN: &QualName = &html::name!(html "span"); + SPAN + } + MdElement::InlineCode(_) => { + const CODE: &QualName = &html::name!(html "code"); + CODE + } + MdElement::TaskListMarker(_) => { + const INPUT: &QualName = &html::name!(html "input"); + INPUT + } + } + } +} + +impl HtmlElement { + pub fn new(name: QualName, attributes: Vec) -> Self { + let mut attrs = Attributes { + id: None, + classes: StrTendril::new(), + rest: IndexMap::with_capacity(attributes.len()), + }; + for attr in attributes { + match attr.name.local { + local_name!("id") => { + attrs.id = Some(attr.value); + } + local_name!("class") => { + attrs.classes = attr.value; + } + _ => { + attrs.rest.insert(attr.name, attr.value); + } + } + } + HtmlElement { name, attrs } + } +} + +impl Attributes { + pub fn iter(&self) -> impl Iterator { + const ID: &QualName = &html::name!("id"); + const CLASS: &QualName = &html::name!("class"); + (self.id.as_ref().map(|id| (ID, id)).into_iter()) + .chain((!self.classes.is_empty()).then_some((CLASS, &self.classes))) + .chain(&self.rest) + } +} + +impl fmt::Debug for Node<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Node::Document => write!(f, "Document"), + Node::HtmlComment(comment) => write!(f, ""), + Node::HtmlText(text) => write!(f, "Text({text})"), + Node::Element(element) => write!(f, "{element:?}"), + } + } +} + +impl fmt::Debug for Element<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Element::Html(element) => write!(f, "{element:?}"), + Element::Markdown(element) => write!(f, "{element:?}"), + } + } +} + +impl fmt::Debug for HtmlElement { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "<{}", self.name.local)?; + if !self.attrs.classes.is_empty() { + write!(f, r#" class="{}""#, self.attrs.classes)?; + } + for (name, value) in &self.attrs.rest { + write!(f, r#" {}="{value}""#, name.local)?; + } + write!(f, ">") + } +} diff --git 
a/src/preprocess/tree/sink.rs b/src/preprocess/tree/sink.rs new file mode 100644 index 0000000..943bfb8 --- /dev/null +++ b/src/preprocess/tree/sink.rs @@ -0,0 +1,212 @@ +use ego_tree::NodeId; +use html5ever::{ + local_name, + tendril::{format_tendril, StrTendril}, + tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}, + Attribute, QualName, +}; +use std::{ + borrow::Cow, + cell::{Cell, Ref, RefCell}, +}; + +use super::{ + node::{Element, HtmlElement, Node}, + Tree, +}; + +#[derive(Debug)] +pub struct HtmlTreeSink<'book> { + pub tree: RefCell>, + pub most_recently_created_element: Cell>, +} + +impl HtmlTreeSink<'_> { + pub fn new() -> Self { + Self { + tree: RefCell::new(Tree::new()), + most_recently_created_element: Cell::new(None), + } + } +} + +impl<'book> TreeSink for HtmlTreeSink<'book> { + type Handle = NodeId; + type Output = Tree<'book>; + type ElemName<'a> + = Ref<'a, QualName> + where + Self: 'a; + + fn finish(self) -> Tree<'book> { + self.tree.into_inner() + } + + fn parse_error(&self, msg: Cow<'static, str>) { + self.tree.borrow_mut().errors.push(msg); + } + + fn get_document(&self) -> Self::Handle { + self.tree.borrow().tree.root().id() + } + + fn elem_name<'a>(&'a self, target: &Self::Handle) -> Ref<'a, QualName> { + Ref::map(self.tree.borrow(), |this| { + let node = this.tree.get(*target).unwrap().value(); + match node { + Node::Element(element) => element.name(), + _ => unreachable!(), + } + }) + } + + fn create_element( + &self, + name: QualName, + attrs: Vec, + _flags: ElementFlags, + ) -> Self::Handle { + let mut this = self.tree.borrow_mut(); + let node = this + .tree + .orphan(Node::Element(Element::Html(HtmlElement::new(name, attrs)))); + let id = node.id(); + self.most_recently_created_element.set(Some(id)); + id + } + + fn create_comment(&self, comment: StrTendril) -> Self::Handle { + let mut this = self.tree.borrow_mut(); + this.tree.orphan(Node::HtmlComment(comment)).id() + } + + fn create_pi(&self, target: StrTendril, data: 
StrTendril) -> Self::Handle { + let mut this = self.tree.borrow_mut(); + // https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction + // says processing instructions are considered comments in HTML + let comment = format_tendril!(""); + this.tree.orphan(Node::HtmlComment(comment)).id() + } + + fn append(&self, parent: &Self::Handle, child: NodeOrText) { + let mut this = self.tree.borrow_mut(); + let mut parent = this.tree.get_mut(*parent).unwrap(); + + match child { + NodeOrText::AppendNode(id) => { + parent.append_id(id); + } + NodeOrText::AppendText(text) => { + if let Some(mut child) = parent.last_child() { + if let Node::HtmlText(t) = child.value() { + t.push_tendril(&text); + return; + } + } + parent.append(Node::HtmlText(text)); + } + } + } + + fn append_based_on_parent_node( + &self, + element: &Self::Handle, + prev_element: &Self::Handle, + child: NodeOrText, + ) { + let has_parent = { + let this = self.tree.borrow(); + let element = this.tree.get(*element).unwrap(); + element.parent().is_some() + }; + + if has_parent { + self.append_before_sibling(element, child) + } else { + self.append(prev_element, child) + } + } + + fn append_doctype_to_document( + &self, + _name: StrTendril, + _public_id: StrTendril, + _system_id: StrTendril, + ) { + } + + fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle { + let this = self.tree.borrow(); + let template = this.tree.get(*target).unwrap(); + template.first_child().unwrap().id() + } + + fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool { + x == y + } + + fn set_quirks_mode(&self, _mode: QuirksMode) {} + + fn append_before_sibling(&self, sibling: &Self::Handle, new_node: NodeOrText) { + let mut this = self.tree.borrow_mut(); + let mut sibling = this.tree.get_mut(*sibling).unwrap(); + + match new_node { + NodeOrText::AppendNode(id) => { + sibling.insert_id_before(id); + } + NodeOrText::AppendText(text) => { + if let Some(mut prev) = sibling.prev_sibling() { + if let 
Node::HtmlText(t) = prev.value() { + t.push_tendril(&text); + return; + } + } + sibling.insert_before(Node::HtmlText(text)); + } + } + } + + fn add_attrs_if_missing(&self, target: &Self::Handle, attributes: Vec) { + let mut this = self.tree.borrow_mut(); + let mut node = this.tree.get_mut(*target).unwrap(); + let Node::Element(element) = node.value() else { + unreachable!() + }; + match element { + Element::Markdown(_) => {} + Element::Html(element) => { + let attrs = &mut element.attrs; + for attr in attributes { + match attr.name.local { + local_name!("id") => { + if attrs.id.is_none() { + attrs.id = Some(attr.value); + } + } + local_name!("class") => { + if attrs.classes.is_empty() { + attrs.classes = attr.value; + } + } + _ => { + attrs.rest.entry(attr.name).or_insert(attr.value); + } + } + } + } + } + } + + fn remove_from_parent(&self, target: &Self::Handle) { + let mut this = self.tree.borrow_mut(); + let mut node = this.tree.get_mut(*target).unwrap(); + node.detach(); + } + + fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) { + let mut this = self.tree.borrow_mut(); + let mut new_parent = this.tree.get_mut(*new_parent).unwrap(); + new_parent.reparent_from_id_append(*node); + } +}