diff --git a/Cargo.lock b/Cargo.lock index 5631576..caeb488 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -924,6 +924,7 @@ dependencies = [ "mdbook", "polib", "pulldown-cmark", + "pulldown-cmark-to-cmark", "semver", "serde_json", "tempfile", @@ -1226,6 +1227,15 @@ dependencies = [ "unicase", ] +[[package]] +name = "pulldown-cmark-to-cmark" +version = "10.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0194e6e1966c23cc5fd988714f85b18d548d773e81965413555d96569931833d" +dependencies = [ + "pulldown-cmark", +] + [[package]] name = "quote" version = "1.0.26" diff --git a/Cargo.toml b/Cargo.toml index 09d8660..2804dc2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ anyhow = "1.0.68" mdbook = "0.4.25" polib = "0.2.0" pulldown-cmark = { version = "0.9.2", default-features = false } +pulldown-cmark-to-cmark = "10.0.4" semver = "1.0.16" serde_json = "1.0.91" diff --git a/src/bin/mdbook-gettext.rs b/src/bin/mdbook-gettext.rs index 3f838b4..dd0d8b8 100644 --- a/src/bin/mdbook-gettext.rs +++ b/src/bin/mdbook-gettext.rs @@ -28,40 +28,47 @@ use anyhow::{anyhow, Context}; use mdbook::book::Book; use mdbook::preprocess::{CmdPreprocessor, PreprocessorContext}; use mdbook::BookItem; -use mdbook_i18n_helpers::extract_msgs; +use mdbook_i18n_helpers::{extract_events, group_events, reconstruct_markdown, Group}; use polib::catalog::Catalog; use polib::po_file; use semver::{Version, VersionReq}; use std::{io, process}; fn translate(text: &str, catalog: &Catalog) -> String { - let mut consumed = 0; // bytes of text consumed so far - let mut output = String::with_capacity(text.len()); + let mut translated_events = Vec::new(); + let events = extract_events(text, None); + let mut state = None; - for msg in extract_msgs(text) { - let span = msg.span(); - - // Copy over any bytes of text that precede this message. - if consumed < span.start { - output.push_str(&text[consumed..span.start]); + for group in group_events(&events) { + match group { + Group::Translate(events) => { + // Reconstruct the message. + let (msgid, new_state) = reconstruct_markdown(events, state.clone()); + let translated = catalog + .find_message(None, &msgid, None) + .filter(|msg| !msg.flags().is_fuzzy()) + .and_then(|msg| msg.msgstr().ok()) + .filter(|msgstr| !msgstr.is_empty()); + // Generate new events or reuse old events. + match translated { + Some(msgstr) => translated_events.extend(extract_events(msgstr, state)), + None => translated_events.extend_from_slice(events), + } + // Advance the state. + state = Some(new_state); + } + Group::Skip(events) => { + // Copy the events unchanged to the output. + translated_events.extend_from_slice(events); + // Advance the state. + let (_, new_state) = reconstruct_markdown(events, state); + state = Some(new_state); + } } - - // Insert the translated text - let msg_text = msg.text(text); - let translated = catalog - .find_message(None, msg_text, None) - .filter(|msg| !msg.flags().is_fuzzy()) - .and_then(|msg| msg.msgstr().ok()) - .filter(|msgstr| !msgstr.is_empty()) - .unwrap_or(msg_text); - output.push_str(translated); - consumed = span.end; } - // Handle any text left over after the last message. - let suffix = &text[consumed..]; - output.push_str(suffix); - output + let (translated, _) = reconstruct_markdown(&translated_events, None); + translated } fn translate_book(ctx: &PreprocessorContext, mut book: Book) -> anyhow::Result { @@ -160,19 +167,22 @@ mod tests { #[test] fn test_translate_single_paragraph() { let catalog = create_catalog(&[("foo bar", "FOO BAR")]); - assert_eq!(translate("foo bar\n", &catalog), "FOO BAR\n"); + // The output is normalized so the newline disappears. + assert_eq!(translate("foo bar\n", &catalog), "FOO BAR"); } #[test] fn test_translate_paragraph_with_leading_newlines() { let catalog = create_catalog(&[("foo bar", "FOO BAR")]); - assert_eq!(translate("\n\n\nfoo bar\n", &catalog), "\n\n\nFOO BAR\n"); + // The output is normalized so the newlines disappear. + assert_eq!(translate("\n\n\nfoo bar\n", &catalog), "FOO BAR"); } #[test] fn test_translate_paragraph_with_trailing_newlines() { let catalog = create_catalog(&[("foo bar", "FOO BAR")]); - assert_eq!(translate("foo bar\n\n\n", &catalog), "FOO BAR\n\n\n"); + // The output is normalized so the newlines disappear. + assert_eq!(translate("foo bar\n\n\n", &catalog), "FOO BAR"); } #[test] @@ -191,7 +201,7 @@ mod tests { \n\ FOO BAR\n\ \n\ - last paragraph\n" + last paragraph" ); } @@ -214,33 +224,52 @@ mod tests { PARAGRAPH", ), ]); - // Paragraph separation is kept intact while translating. + // Paragraph separation is normalized when translating. assert_eq!( translate( - "\n\ - first\n\ + "first\n\ paragraph\n\ \n\ \n\ - \n\ last\n\ - paragraph\n\ - \n\ - \n", + paragraph\n", &catalog ), - "\n\ - FIRST\n\ + "FIRST\n\ TRANSLATED\n\ PARAGRAPH\n\ \n\ - \n\ - \n\ LAST\n\ TRANSLATED\n\ - PARAGRAPH\n\ + PARAGRAPH" + ); + } + + #[test] + fn test_translate_code_block() { + let catalog = create_catalog(&[( + "fn foo() {\n\n let x = 10;\n\n}\n", + "fn FOO() {\n\n let X = 10;\n\n}\n", + )]); + assert_eq!( + translate( + "Text before.\n\ + \n\ + \n\ + ```rust,editable\n\ + fn foo() {\n\n let x = 10;\n\n}\n\ + ```\n\ + \n\ + Text after.\n", + &catalog + ), + "Text before.\n\ + \n\ + ```rust,editable\n\ + fn FOO() {\n\n let X = 10;\n\n}\n\ + ```\n\ \n\ - \n" + Text after.", ); } } diff --git a/src/bin/mdbook-xgettext.rs b/src/bin/mdbook-xgettext.rs index ba09bf5..0b8d35d 100644 --- a/src/bin/mdbook-xgettext.rs +++ b/src/bin/mdbook-xgettext.rs @@ -22,7 +22,7 @@ use anyhow::{anyhow, Context}; use mdbook::renderer::RenderContext; use mdbook::BookItem; -use mdbook_i18n_helpers::extract_msgs; +use mdbook_i18n_helpers::extract_messages; use polib::catalog::Catalog; use polib::message::Message; use polib::metadata::CatalogMetadata; @@ -88,9 +88,9 @@ fn create_catalog(ctx: &RenderContext) -> anyhow::Result { Some(path) => ctx.config.book.src.join(path), None => continue, }; - for msg in extract_msgs(&chapter.content) { - let source = format!("{}:{}", path.display(), msg.line_number()); - add_message(&mut catalog, msg.text(&chapter.content), &source); + for (lineno, msgid) in extract_messages(&chapter.content) { + let source = format!("{}:{}", path.display(), lineno); + add_message(&mut catalog, &msgid, &source); } } } @@ -214,7 +214,7 @@ mod tests { .collect::>(), &[ "The Foo Chapter", - "# How to Foo", + "How to Foo", "The first paragraph about Foo.\n\ Still the first paragraph." ] diff --git a/src/lib.rs b/src/lib.rs index 573e9ae..d972ab0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,256 +25,323 @@ use mdbook::utils::new_cmark_parser; use pulldown_cmark::{Event, Tag}; -use std::ops::Range; +use pulldown_cmark_to_cmark::{cmark_resume_with_options, Options, State}; -/// A translatable message. -#[derive(PartialEq, Debug)] -pub struct Message { - /// Line number where this message begins. - line: usize, - - /// Span of the input text containing this message. - span: Range, -} - -impl Message { - fn new(line: usize, span: Range) -> Self { - Self { line, span } - } - - /// Get the text of this message, as a slice of the document from - /// which it was generated. - pub fn text<'doc>(&self, document: &'doc str) -> &'doc str { - &document[self.span.clone()] - } - - /// Get the line number at which this message begins. - pub fn line_number(&self) -> usize { - self.line - } - - /// Get the span of the source document from which this message is - /// drawn. - pub fn span(&self) -> Range { - self.span.clone() - } - - /// Extend this message to the given offset. - fn extend(&mut self, to_end: usize) { - self.span.end = to_end; - } - - /// Trim trailing newlines from this message. - fn trim_right(&mut self, document: &str) { - let trimmed_len = document[self.span.clone()].trim_end_matches('\n').len(); - self.span.end = self.span.start + trimmed_len; +/// Extract Markdown events from `text`. +/// +/// The `state` can be used to give the parsing context. In +/// particular, if a code block has started, the text should be parsed +/// without interpreting special Markdown characters. +/// +/// The events are labeled with the line number where they start in +/// the document. +/// +/// # Examples +/// +/// ``` +/// use mdbook_i18n_helpers::extract_events; +/// use pulldown_cmark::{Event, Tag}; +/// +/// assert_eq!( +/// extract_events("Hello,\nworld!", None), +/// vec![ +/// (1, Event::Start(Tag::Paragraph)), +/// (1, Event::Text("Hello,".into())), +/// (1, Event::SoftBreak), +/// (2, Event::Text("world!".into())), +/// (1, Event::End(Tag::Paragraph)), +/// ] +/// ); +/// ``` +pub fn extract_events<'a>(text: &'a str, state: Option>) -> Vec<(usize, Event<'a>)> { + // Offsets of each newline in the input, used to calculate line + // numbers from byte offsets. + let offsets = text + .match_indices('\n') + .map(|(offset, _)| offset) + .collect::>(); + + match state { + // If we're in a code block, we disable the normal parsing and + // return lines of text. This matches the behavior of the + // parser in this case. + Some(state) if state.is_in_code_block => text + .split_inclusive('\n') + .enumerate() + .map(|(idx, line)| (idx + 1, Event::Text(line.into()))) + .collect(), + // Otherwise, we parse the text line normally. + _ => new_cmark_parser(text, false) + .into_offset_iter() + .map(|(event, range)| { + let lineno = offsets.partition_point(|&o| o < range.start) + 1; + (lineno, event) + }) + .collect(), } } -/// Accumulator for translatable messages based on input from the -/// Markdown parser. -struct MsgAccumulator<'a> { - /// The input document. - document: &'a str, - - /// Offsets of each newline in the input, used to calculate line - /// numbers from byte offsets. - offsets: Vec, - - /// The resulting messages, as ranges of the input document. - msgs: Vec, - - /// Current nesting depth of Start/End events. - depth: usize, - - /// If set, skip until the nesting depth returns to this level. - skip_until_depth: Option, - - /// Can the last message still be appended to? If this is `true` - /// then `self.msgs` has at least one element. - message_open: bool, +/// Markdown events grouped by type. +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum Group<'a> { + /// Markdown events which should be translated. + /// + /// This includes `[Text("foo")]` as well as sequences with text + /// such as `[Start(Emphasis), Text("foo") End(Emphasis)]`. + Translate(&'a [(usize, Event<'a>)]), + + /// Markdown events which should be skipped when translating. + /// + /// This includes structural events such as `Start(Heading(H1, + /// None, vec![]))`. + Skip(&'a [(usize, Event<'a>)]), } -impl<'a> MsgAccumulator<'a> { - fn new(document: &'a str) -> Self { - Self { - document, - offsets: document - .match_indices('\n') - .map(|(offset, _)| offset) - .collect(), - msgs: vec![], - depth: 0, - skip_until_depth: None, - message_open: false, - } - } - - /// Mark the current message as finished. - fn finish_message(&mut self) { - self.message_open = false; - } - - /// Add a new text message, or extend an existing one. - fn push_message(&mut self, span: Range) { - // try to combine with an existing message. - if self.message_open { - if let Some(last) = self.msgs.last_mut() { - last.extend(span.end); - return; +/// Group Markdown events into translatable and skipped events. +/// +/// This function will partition the input events into groups of +/// events which should be translated or skipped. Concatenating the +/// events in each group will give you back the original events. +/// +/// # Examples +/// +/// ``` +/// use mdbook_i18n_helpers::{extract_events, group_events, Group}; +/// use pulldown_cmark::{Event, Tag}; +/// +/// let events = extract_events("This is a _paragraph_ of text.", None); +/// assert_eq!( +/// events, +/// vec![ +/// (1, Event::Start(Tag::Paragraph)), +/// (1, Event::Text("This is a ".into())), +/// (1, Event::Start(Tag::Emphasis)), +/// (1, Event::Text("paragraph".into())), +/// (1, Event::End(Tag::Emphasis)), +/// (1, Event::Text(" of text.".into())), +/// (1, Event::End(Tag::Paragraph)), +/// ], +/// ); +/// +/// let groups = group_events(&events); +/// assert_eq!( +/// groups, +/// vec![ +/// Group::Skip(&[ +/// (1, Event::Start(Tag::Paragraph)), +/// ]), +/// Group::Translate(&[ +/// (1, Event::Text("This is a ".into())), +/// (1, Event::Start(Tag::Emphasis)), +/// (1, Event::Text("paragraph".into())), +/// (1, Event::End(Tag::Emphasis)), +/// (1, Event::Text(" of text.".into())), +/// ]), +/// Group::Skip(&[ +/// (1, Event::End(Tag::Paragraph)), +/// ]), +/// ] +/// ); +/// ``` +pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec> { + let mut groups = Vec::new(); + + enum State { + Translate(usize), + Skip(usize), + } + let mut state = State::Skip(0); + + for (idx, (_, event)) in events.iter().enumerate() { + match event { + Event::Start( + Tag::Emphasis | Tag::Strong | Tag::Strikethrough | Tag::Link(..) | Tag::Image(..), + ) + | Event::End( + Tag::Emphasis | Tag::Strong | Tag::Strikethrough | Tag::Link(..) | Tag::Image(..), + ) + | Event::Text(_) + | Event::Code(_) + | Event::FootnoteReference(_) + | Event::SoftBreak + | Event::HardBreak => { + // If we're currently skipping, then a new + // translatable group starts here. + if let State::Skip(start) = state { + groups.push(Group::Skip(&events[start..idx])); + state = State::Translate(idx); + } + } + _ => { + // If we're currently translating, then a new + // skippable group starts here. + if let State::Translate(start) = state { + groups.push(Group::Translate(&events[start..idx])); + state = State::Skip(idx); + } } } - - self.msgs - .push(Message::new(self.line_number(span.start), span)); - self.message_open = true; } - /// Calculate the line number for the given offset. - fn line_number(&self, offset: usize) -> usize { - self.offsets.partition_point(|&o| o < offset) + 1 + match state { + State::Translate(start) => groups.push(Group::Translate(&events[start..])), + State::Skip(start) => groups.push(Group::Skip(&events[start..])), } - /// Push a new Markdown event into the accumulator. - fn push_event(&mut self, evt: Event<'a>, span: Range) { - #[cfg(test)] - println!("{evt:?} -- {:?}", &self.document[span.start..span.end]); - - // Track the nesting depth. - match evt { - Event::Start(_) => self.depth += 1, - Event::End(_) => self.depth -= 1, - _ => {} - } + groups +} - // Handle skip_until_depth, including skipping the End event - // that returned to the desired level. - if let Some(depth) = self.skip_until_depth { - if self.depth <= depth { - self.skip_until_depth = None; - } - return; - } +/// Render a slice of Markdown events back to Markdown. +/// +/// # Examples +/// +/// ``` +/// use mdbook_i18n_helpers::{extract_events, reconstruct_markdown}; +/// use pulldown_cmark::{Event, Tag}; +/// +/// let group = extract_events("Hello *world!*", None); +/// let (reconstructed, _) = reconstruct_markdown(&group, None); +/// assert_eq!(reconstructed, "Hello _world!_"); +/// ``` +/// +/// Notice how this will normalize the Markdown to use `_` for +/// emphasis and `**` for strong emphasis. The style is chosen to +/// match the [Google developer documentation style +/// guide](https://developers.google.com/style/text-formatting). +pub fn reconstruct_markdown( + group: &[(usize, Event)], + state: Option>, +) -> (String, State<'static>) { + let events = group.iter().map(|(_, event)| event); + let mut markdown = String::new(); + let options = Options { + code_block_token_count: 3, + list_token: '-', + emphasis_token: '_', + strong_token: "**", + ..Options::default() + }; + // Advance the true state, but throw away the rendered Markdown + // since it can contain unwanted padding. + let new_state = cmark_resume_with_options( + events.clone(), + String::new(), + state.clone(), + options.clone(), + ) + .unwrap(); + + // Block quotes and lists add padding to the state. This is + // reflected in the rendered Markdown. We want to capture the + // Markdown without the padding to remove the effect of these + // structural elements. + let state_without_padding = state.map(|state| State { + padding: Vec::new(), + ..state + }); + cmark_resume_with_options(events, &mut markdown, state_without_padding, options).unwrap(); + (markdown, new_state) +} - match evt { - // Consider "inline" tags to be just part of the text. - Event::Start(Tag::Emphasis | Tag::Strong | Tag::Strikethrough | Tag::Link(..)) => { - self.push_message(span) - } - Event::End(Tag::Emphasis | Tag::Strong | Tag::Strikethrough | Tag::Link(..)) => { - self.push_message(span) +/// Extract translatable strings from `document`. +/// +/// # Examples +/// +/// Structural markup like headings and lists are removed from the +/// messages: +/// +/// ``` +/// use mdbook_i18n_helpers::extract_messages; +/// +/// assert_eq!( +/// extract_messages("# A heading"), +/// vec![(1, "A heading".into())], +/// ); +/// assert_eq!( +/// extract_messages( +/// "1. First item\n\ +/// 2. Second item\n" +/// ), +/// vec![ +/// (1, "First item".into()), +/// (2, "Second item".into()), +/// ], +/// ); +/// ``` +/// +/// Indentation due to structural elements like block quotes and lists +/// is ignored: +/// +/// ``` +/// use mdbook_i18n_helpers::extract_messages; +/// +/// let messages = extract_messages( +/// "> * Hello, this is a\n\ +/// > list in a quote.\n\ +/// >\n\ +/// > This is the second\n\ +/// > paragraph.\n" +/// ); +/// assert_eq!( +/// messages, +/// vec![ +/// (1, "Hello, this is a\nlist in a quote.".into()), +/// (4, "This is the second\nparagraph.".into()), +/// ], +/// ); +/// ``` +pub fn extract_messages(document: &str) -> Vec<(usize, String)> { + let events = extract_events(document, None); + let mut messages = Vec::new(); + let mut state = None; + for group in group_events(&events) { + match group { + Group::Translate(events) => { + if let Some((lineno, _)) = events.first() { + let (text, new_state) = reconstruct_markdown(events, state); + messages.push((*lineno, text)); + state = Some(new_state); + } } - - // We want to translate everything: text, code (from - // backticks, `..`), or HTML. - Event::Text(_) | Event::Code(_) | Event::Html(_) => self.push_message(span), - - // For many event types we just take the entire text from - // Start to End, which is already encompassed in the event - // span. - Event::Start( - Tag::CodeBlock(_) - | Tag::Heading(..) - | Tag::List(..) - | Tag::BlockQuote - | Tag::Table(..), - ) => { - self.finish_message(); - self.push_message(span); - self.finish_message(); - // Skip until we get to a nesting depth outside of this Start event. - self.skip_until_depth = Some(self.depth - 1); + Group::Skip(events) => { + let (_, new_state) = reconstruct_markdown(events, state); + state = Some(new_state); } - - // For any other Start or End events, finish the current - // message but do not begin a new one. - Event::Start(_) | Event::End(_) => self.finish_message(), - - _ => {} } } - /// Get the resulting list of messages. - fn into_msgs(mut self) -> Vec { - let parser = new_cmark_parser(self.document, false); - for (evt, span) in parser.into_offset_iter() { - self.push_event(evt, span); - } - for msg in &mut self.msgs { - msg.trim_right(self.document); - } - self.msgs - } -} - -/// Extract translatable messages from the Markdown text. -/// -/// Returns a vector of (line number, text), where line numbers begin -/// at 1. -pub fn extract_msgs(document: &str) -> Vec { - MsgAccumulator::new(document).into_msgs() + messages } #[cfg(test)] mod tests { use super::*; - #[test] - fn offset_to_line_empty() { - assert_eq!(MsgAccumulator::new("").line_number(0), 1); - } - - #[test] - fn offset_to_line_multiline() { - let input = "abc\ndef\nghi"; - let acc = MsgAccumulator::new(input); - let line_nums: Vec<_> = input - .chars() - .enumerate() - .map(|(idx, ch)| (acc.line_number(idx), ch)) - .collect(); - - assert_eq!( - line_nums, - vec![ - (1, 'a'), - (1, 'b'), - (1, 'c'), - (1, '\n'), - (2, 'd'), - (2, 'e'), - (2, 'f'), - (2, '\n'), - (3, 'g'), - (3, 'h'), - (3, 'i'), - ] - ); - } - /// Extract messages in `document`, assert they match `expected`. #[track_caller] - fn assert_extract_msgs(document: &str, expected: Vec<(usize, &str)>) { - let lineno_texts = extract_msgs(document) - .iter() - .map(|msg| (msg.line_number(), msg.text(document))) - .collect::>(); - assert_eq!(lineno_texts, expected); + fn assert_extract_messages(document: &str, expected: Vec<(usize, &str)>) { + assert_eq!( + extract_messages(document) + .iter() + .map(|(lineno, msg)| (*lineno, &msg[..])) + .collect::>(), + expected, + ) } #[test] - fn extract_msgs_empty() { - assert_extract_msgs("", vec![]); + fn extract_messages_empty() { + assert_extract_messages("", vec![]); } #[test] - fn extract_msgs_single_line() { - assert_extract_msgs("This is a paragraph.", vec![(1, "This is a paragraph.")]); + fn extract_messages_single_line() { + assert_extract_messages("This is a paragraph.", vec![(1, "This is a paragraph.")]); } #[test] - fn extract_msgs_simple() { - assert_extract_msgs( + fn extract_messages_simple() { + assert_extract_messages( "This is\n\ the first\n\ paragraph.🦀\n\ @@ -288,8 +355,8 @@ mod tests { } #[test] - fn extract_msgs_leading_newlines() { - assert_extract_msgs( + fn extract_messages_leading_newlines() { + assert_extract_messages( "\n\ \n\ \n\ @@ -300,8 +367,8 @@ mod tests { } #[test] - fn extract_msgs_trailing_newlines() { - assert_extract_msgs( + fn extract_messages_trailing_newlines() { + assert_extract_messages( "This is\n\ a paragraph.\n\ \n\ @@ -311,32 +378,38 @@ mod tests { } #[test] - fn extract_msgs_styled_text() { - assert_extract_msgs( - "**This** ~~message~~ _has_ `code` *style*\n", - vec![(1, "**This** ~~message~~ _has_ `code` *style*")], + fn extract_messages_styled_text() { + // The parser normalizes "*emphasis*" to "_emphasis_" and + // "__strong emphasis__" to "**strong emphasis**". + assert_extract_messages( + "**This** __~~message~~__ _has_ `code` *style*\n", + vec![(1, "**This** **~~message~~** _has_ `code` _style_")], ); } #[test] - fn extract_msgs_inline_html() { - assert_extract_msgs( + fn extract_messages_inline_html() { + // HTML tags are skipped, but text inside is extracted: + assert_extract_messages( "Hi ", - vec![(1, "Hi ")], + vec![ + (1, "Hi "), // + (1, "alert('there');"), + ], ); } #[test] - fn extract_msgs_links() { - assert_extract_msgs( + fn extract_messages_links() { + assert_extract_messages( "See [this page](https://example.com) for more info.", vec![(1, "See [this page](https://example.com) for more info.")], ); } #[test] - fn extract_msgs_links_footer() { - assert_extract_msgs( + fn extract_messages_reference_links() { + assert_extract_messages( r#" * [Brazilian Portuguese][pt-BR] and * [Korean][ko] @@ -344,16 +417,32 @@ mod tests { [pt-BR]: https://google.github.io/comprehensive-rust/pt-BR/ [ko]: https://google.github.io/comprehensive-rust/ko/ "#, - // The parser does not include the referenced links in the - // events it produces. This is probably OK: links would - // not have been translated, anyway. - vec![(2, "* [Brazilian Portuguese][pt-BR] and\n* [Korean][ko]")], + // The parser expands reference links on the fly. + vec![ + (2, "[Brazilian Portuguese](https://google.github.io/comprehensive-rust/pt-BR/) and"), + (3, "[Korean](https://google.github.io/comprehensive-rust/ko/)"), + ] + ); + } + + #[test] + fn extract_messages_footnotes() { + assert_extract_messages( + " +The document[^1] text. + +[^1]: The footnote text. +", + vec![ + (2, "The document[^1] text."), // + (4, "The footnote text."), + ], ); } #[test] - fn extract_msgs_block_quote() { - assert_extract_msgs( + fn extract_messages_block_quote() { + assert_extract_messages( r#"One of my favorite quotes is: > Don't believe everything you read on the Internet. @@ -364,113 +453,156 @@ mod tests { "#, vec![ (1, "One of my favorite quotes is:"), - (3, "> Don't believe everything you read on the Internet.\n>\n> I didn't say this second part, but I needed a paragraph for testing."), - (7, "--Abraham Lincoln"), - ] + (3, "Don't believe everything you read on the Internet."), + ( + 5, + "I didn't say this second part, but I needed a paragraph for testing.", + ), + (7, "\\--Abraham Lincoln"), + ], ); } #[test] - fn extract_msgs_table() { - let table = r#"| Module Type | Description -|-------------------|------------------------------------------------------------------------ -| `rust_binary` | Produces a Rust binary. -| `rust_library` | Produces a Rust library, and provides both `rlib` and `dylib` variants."#; - let input = format!("Hey, a table\n\n{table}\n\nFooter.\n"); - // tables are included as part of the text. - assert_extract_msgs( + fn extract_messages_table() { + let input = "\ + | Module Type | Description\n\ + |-------------------|-------------------------\n\ + | `rust_binary` | Produces a Rust binary.\n\ + | `rust_library` | Produces a Rust library.\n\ + "; + assert_extract_messages( &input, - vec![(1, "Hey, a table"), (3, table), (8, "Footer.")], + vec![ + (1, "Module Type"), + (1, "Description"), + (3, "`rust_binary`"), + (3, "Produces a Rust binary."), + (4, "`rust_library`"), + (4, "Produces a Rust library."), + ], ); } #[test] - fn extract_msgs_code_block() { - assert_extract_msgs( + fn extract_messages_code_block() { + assert_extract_messages( "Preamble\n```rust\nfn hello() {\n some_code()\n\n todo!()\n}\n```\nPostamble", vec![ (1, "Preamble"), - ( - 2, - "```rust\nfn hello() {\n some_code()\n\n todo!()\n}\n```", - ), + (3, "fn hello() {\n some_code()\n\n todo!()\n}\n"), (9, "Postamble"), ], ); } #[test] - fn extract_msgs_details() { - // This isn't great, because the parser treats any data - // following a tag as also HTML, but works well enough when - // `
` has blank lines before and after. - assert_extract_msgs( - "Preamble\n
\nSome Details\n
\n\nPostamble", + fn extract_messages_quoted_code_block() { + assert_extract_messages( + "\ + > Preamble\n\ + > ```rust\n\ + > fn hello() {\n\ + > some_code()\n\ + >\n\ + > todo!()\n\ + > }\n\ + > ```\n\ + > Postamble", vec![ (1, "Preamble"), - (2, "
\nSome Details\n
"), + (3, "fn hello() {\n some_code()\n\n todo!()\n}\n"), + (9, "Postamble"), + ], + ); + } + + #[test] + fn extract_messages_details() { + // This isn't great: we lose text following a HTML tag: + assert_extract_messages( + "Preamble\n\ +
\n\ + Some Details\n\ +
\n\ + \n\ + Postamble", + vec![ + (1, "Preamble"), // + // Missing "Some Details" (6, "Postamble"), ], ); - assert_extract_msgs( - "Preamble\n\n
\n\nSome Details\n\n
\n\nPostamble", + // It works well enough when `
` has blank lines + // before and after. + assert_extract_messages( + "Preamble\n\ + \n\ +
\n\ + \n\ + Some Details\n\ + \n\ +
\n\ + \n\ + Postamble", vec![ - (1, "Preamble"), - (3, "
"), + (1, "Preamble"), // (5, "Some Details"), - (7, "
"), (9, "Postamble"), ], ); } #[test] - fn extract_msgs_list() { - assert_extract_msgs( + fn extract_messages_list() { + assert_extract_messages( "Some text\n * List item 1🦀\n * List item 2\n\nMore text", vec![ - (1, "Some text"), - (2, " * List item 1🦀\n * List item 2"), + (1, "Some text"), // + (2, "List item 1🦀"), + (3, "List item 2"), (5, "More text"), ], ); } #[test] - fn extract_msgs_multilevel_list() { - assert_extract_msgs("Some text\n * List item 1\n * List item 2\n * Sublist 1\n * Sublist 2\n\nMore text", + fn extract_messages_multilevel_list() { + assert_extract_messages( + "Some text\n * List item 1\n * List item 2\n * Sublist 1\n * Sublist 2\n\nMore text", vec![ - (1, "Some text"), - (2, " * List item 1\n * List item 2\n * Sublist 1\n * Sublist 2"), - (7, "More text") - ] + (1, "Some text"), // + (2, "List item 1"), + (3, "List item 2"), + (4, "Sublist 1"), + (5, "Sublist 2"), + (7, "More text"), + ], ); } #[test] - fn extract_msgs_list_with_paras() { - assert_extract_msgs( + fn extract_messages_list_with_paragraphs() { + assert_extract_messages( r#"* Item 1. * Item 2, two lines. * Sub 1. * Sub 2. - - More paragraph. - -Top level. "#, vec![ - (1, "* Item 1.\n* Item 2,\n two lines.\n\n * Sub 1.\n * Sub 2.\n\n More paragraph."), - (10, "Top level."), - ] + (1, "Item 1."), + (2, "Item 2,\ntwo lines."), + (5, "Sub 1."), + (6, "Sub 2."), + ], ); } #[test] - fn extract_msgs_headings() { - assert_extract_msgs( + fn extract_messages_headings() { + assert_extract_messages( r#"Some text # Headline News🦀 @@ -481,18 +613,19 @@ Top level. "#, vec![ (1, "Some text"), - (2, "# Headline News🦀"), - (4, "* A\n* List"), - (7, "## Subheading"), + (2, "Headline News🦀"), + (4, "A"), + (5, "List"), + (7, "Subheading"), ], ); } #[test] - fn extract_msgs_code_followed_by_details() { + fn extract_messages_code_followed_by_details() { // This is a regression test for an error that would // incorrectly combine CodeBlock and HTML. - assert_extract_msgs( + assert_extract_messages( r#"```bob BOB ``` @@ -504,10 +637,8 @@ BOB
"#, vec![ - (1, "```bob\nBOB\n```"), - (5, "
"), - (7, "* Blah blah"), - (9, "
"), + (2, "BOB\n"), // + (7, "Blah blah"), ], ); }