Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[lexical-markdown] Fix: normalize markdown in $convertFromMarkdownString to comply with CommonMark spec (2nd try) #6629

Merged
merged 21 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 71 additions & 7 deletions packages/lexical-markdown/src/MarkdownTransformers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,18 @@ export type TextMatchTransformer = Readonly<{
type: 'text-match';
}>;

const ORDERED_LIST_REGEX = /^(\s*)(\d{1,})\.\s/;
const UNORDERED_LIST_REGEX = /^(\s*)[-*+]\s/;
const CHECK_LIST_REGEX = /^(\s*)(?:-\s)?\s?(\[(\s|x)?\])\s/i;
const HEADING_REGEX = /^(#{1,6})\s/;
const QUOTE_REGEX = /^>\s/;
const CODE_START_REGEX = /^[ \t]*```(\w+)?/;
const CODE_END_REGEX = /[ \t]*```$/;
const CODE_SINGLE_LINE_REGEX =
/^[ \t]*```[^`]+(?:(?:`{1,2}|`{4,})[^`]+)*```(?:[^`]|$)/;
const TABLE_ROW_REG_EXP = /^(?:\|)(.+)(?:\|)\s?$/;
const TABLE_ROW_DIVIDER_REG_EXP = /^(\| ?:?-*:? ?)+\|\s?$/;

const createBlockNode = (
createNode: (match: Array<string>) => ElementNode,
): ElementTransformer['replace'] => {
Expand Down Expand Up @@ -266,7 +278,7 @@ export const HEADING: ElementTransformer = {
const level = Number(node.getTag().slice(1));
return '#'.repeat(level) + ' ' + exportChildren(node);
},
regExp: /^(#{1,6})\s/,
regExp: HEADING_REGEX,
replace: createBlockNode((match) => {
const tag = ('h' + match[1].length) as HeadingTagType;
return $createHeadingNode(tag);
Expand All @@ -288,7 +300,7 @@ export const QUOTE: ElementTransformer = {
}
return output.join('\n');
},
regExp: /^>\s/,
regExp: QUOTE_REGEX,
replace: (parentNode, children, _match, isImport) => {
if (isImport) {
const previousNode = parentNode.getPreviousSibling();
Expand Down Expand Up @@ -328,9 +340,9 @@ export const CODE: MultilineElementTransformer = {
},
regExpEnd: {
optional: true,
regExp: /[ \t]*```$/,
regExp: CODE_END_REGEX,
},
regExpStart: /^[ \t]*```(\w+)?/,
regExpStart: CODE_START_REGEX,
replace: (
rootNode,
children,
Expand Down Expand Up @@ -399,7 +411,7 @@ export const UNORDERED_LIST: ElementTransformer = {
export: (node, exportChildren) => {
return $isListNode(node) ? listExport(node, exportChildren, 0) : null;
},
regExp: /^(\s*)[-*+]\s/,
regExp: UNORDERED_LIST_REGEX,
replace: listReplace('bullet'),
type: 'element',
};
Expand All @@ -409,7 +421,7 @@ export const CHECK_LIST: ElementTransformer = {
export: (node, exportChildren) => {
return $isListNode(node) ? listExport(node, exportChildren, 0) : null;
},
regExp: /^(\s*)(?:-\s)?\s?(\[(\s|x)?\])\s/i,
regExp: CHECK_LIST_REGEX,
replace: listReplace('check'),
type: 'element',
};
Expand All @@ -419,7 +431,7 @@ export const ORDERED_LIST: ElementTransformer = {
export: (node, exportChildren) => {
return $isListNode(node) ? listExport(node, exportChildren, 0) : null;
},
regExp: /^(\s*)(\d{1,})\.\s/,
regExp: ORDERED_LIST_REGEX,
replace: listReplace('number'),
type: 'element',
};
Expand Down Expand Up @@ -519,3 +531,55 @@ export const LINK: TextMatchTransformer = {
trigger: ')',
type: 'text-match',
};

export function normalizeMarkdown(input: string): string {
const lines = input.split('\n');
let inCodeBlock = false;
const sanitizedLines: string[] = [];

for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const lastLine = sanitizedLines[sanitizedLines.length - 1];

// Code blocks of ```single line``` don't toggle the inCodeBlock flag
if (CODE_SINGLE_LINE_REGEX.test(line)) {
sanitizedLines.push(line);
continue;
}

// Detect the start or end of a code block
if (CODE_START_REGEX.test(line) || CODE_END_REGEX.test(line)) {
inCodeBlock = !inCodeBlock;
sanitizedLines.push(line);
continue;
}

// If we are inside a code block, keep the line unchanged
if (inCodeBlock) {
sanitizedLines.push(line);
continue;
}

// In markdown the concept of "empty paragraphs" does not exist.
// Blocks must be separated by an empty line. Non-empty adjacent lines must be merged.
if (
line === '' ||
lastLine === '' ||
!lastLine ||
HEADING_REGEX.test(lastLine) ||
HEADING_REGEX.test(line) ||
QUOTE_REGEX.test(line) ||
ORDERED_LIST_REGEX.test(line) ||
UNORDERED_LIST_REGEX.test(line) ||
CHECK_LIST_REGEX.test(line) ||
TABLE_ROW_REG_EXP.test(line) ||
TABLE_ROW_DIVIDER_REG_EXP.test(line)
) {
sanitizedLines.push(line);
} else {
sanitizedLines[sanitizedLines.length - 1] = lastLine + line;
}
}

return sanitizedLines.join('\n');
}
110 changes: 103 additions & 7 deletions packages/lexical-markdown/src/__tests__/unit/LexicalMarkdown.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ import {
Transformer,
TRANSFORMERS,
} from '../..';
import {MultilineElementTransformer} from '../../MarkdownTransformers';
import {
MultilineElementTransformer,
normalizeMarkdown,
} from '../../MarkdownTransformers';

// Matches html within a mdx file
const MDX_HTML_TRANSFORMER: MultilineElementTransformer = {
Expand Down Expand Up @@ -92,19 +95,36 @@ describe('Markdown', () => {
html: '<h6><span style="white-space: pre-wrap;">Hello world</span></h6>',
md: '###### Hello world',
},
{
// Multiline paragraphs: https://spec.commonmark.org/dingus/?text=Hello%0Aworld%0A!
html: '<p><span style="white-space: pre-wrap;">Helloworld!</span></p>',
md: ['Hello', 'world', '!'].join('\n'),
skipExport: true,
},
{
// Multiline paragraphs
html: '<p><span style="white-space: pre-wrap;">Hello</span><br><span style="white-space: pre-wrap;">world</span><br><span style="white-space: pre-wrap;">!</span></p>',
// TO-DO: It would be nice to support also hard line breaks (<br>) as \ or double spaces
// See https://spec.commonmark.org/0.31.2/#hard-line-breaks.
// Example: '<p><span style="white-space: pre-wrap;">Hello\\\nworld\\\n!</span></p>',
html: '<p><span style="white-space: pre-wrap;">Hello<br>world<br>!</span></p>',
md: ['Hello', 'world', '!'].join('\n'),
skipImport: true,
},
{
html: '<blockquote><span style="white-space: pre-wrap;">Hello</span><br><span style="white-space: pre-wrap;">world!</span></blockquote>',
md: '> Hello\n> world!',
},
// TO-DO: <br> should be preserved
// {
// html: '<ul><li value="1"><span style="white-space: pre-wrap;">Hello</span></li><li value="2"><span style="white-space: pre-wrap;">world<br>!<br>!</span></li></ul>',
// md: '- Hello\n- world<br>!<br>!',
// skipImport: true,
// },
{
// Multiline list items
html: '<ul><li value="1"><span style="white-space: pre-wrap;">Hello</span></li><li value="2"><span style="white-space: pre-wrap;">world</span><br><span style="white-space: pre-wrap;">!</span><br><span style="white-space: pre-wrap;">!</span></li></ul>',
// Multiline list items: https://spec.commonmark.org/dingus/?text=-%20Hello%0A-%20world%0A!%0A!
html: '<ul><li value="1"><span style="white-space: pre-wrap;">Hello</span></li><li value="2"><span style="white-space: pre-wrap;">world!!</span></li></ul>',
md: '- Hello\n- world\n!\n!',
skipExport: true,
},
{
html: '<ul><li value="1"><span style="white-space: pre-wrap;">Hello</span></li><li value="2"><span style="white-space: pre-wrap;">world</span></li></ul>',
Expand Down Expand Up @@ -274,8 +294,8 @@ describe('Markdown', () => {
skipExport: true,
},
{
// Import only: multiline quote will be prefixed with ">" on each line during export
html: '<blockquote><span style="white-space: pre-wrap;">Hello</span><br><span style="white-space: pre-wrap;">world</span><br><span style="white-space: pre-wrap;">!</span></blockquote>',
// https://spec.commonmark.org/dingus/?text=%3E%20Hello%0Aworld%0A!
html: '<blockquote><span style="white-space: pre-wrap;">Helloworld!</span></blockquote>',
md: '> Hello\nworld\n!',
skipExport: true,
},
Expand All @@ -298,8 +318,9 @@ describe('Markdown', () => {
},
{
customTransformers: [MDX_HTML_TRANSFORMER],
html: '<p><span style="white-space: pre-wrap;">Some HTML in mdx:</span></p><pre spellcheck="false" data-language="MyComponent"><span style="white-space: pre-wrap;">From HTML: Line 1\nSome Text</span></pre>',
html: '<p><span style="white-space: pre-wrap;">Some HTML in mdx:</span></p><pre spellcheck="false" data-language="MyComponent"><span style="white-space: pre-wrap;">From HTML: Line 1Some Text</span></pre>',
md: 'Some HTML in mdx:\n\n<MyComponent>Line 1\nSome Text</MyComponent>',
skipExport: true,
},
];

Expand Down Expand Up @@ -407,3 +428,78 @@ describe('Markdown', () => {
});
}
});

describe('normalizeMarkdown', () => {
it('should combine lines separated by a single \n unless they are in a codeblock', () => {
const markdown = `
A1
A2

A3

\`\`\`md
B1
B2

B3
\`\`\`

C1
C2

C3

\`\`\`js
D1
D2

D3
\`\`\`

\`\`\`single line code\`\`\`

E1
E2

E3
`;
expect(normalizeMarkdown(markdown)).toBe(`
A1A2

A3

\`\`\`md
B1
B2

B3
\`\`\`

C1C2

C3

\`\`\`js
D1
D2

D3
\`\`\`

\`\`\`single line code\`\`\`

E1E2

E3
`);
});

it('tables', () => {
const markdown = `
| a | b |
| --- | --- |
| c | d |
`;
expect(normalizeMarkdown(markdown)).toBe(markdown);
});
});
4 changes: 3 additions & 1 deletion packages/lexical-markdown/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import {
ITALIC_STAR,
ITALIC_UNDERSCORE,
LINK,
normalizeMarkdown,
ORDERED_LIST,
QUOTE,
STRIKETHROUGH,
Expand Down Expand Up @@ -82,11 +83,12 @@ function $convertFromMarkdownString(
node?: ElementNode,
shouldPreserveNewLines = false,
): void {
const sanitizedMarkdown = normalizeMarkdown(markdown);
Copy link
Contributor

@potatowagon potatowagon Sep 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we add a condition here, to only normalizeMarkdown if shouldPreserveNewLines = false?

internally we are using the shouldPreserveNewLines = true feature and normalizeMarkdown is causing eg.

hello
hello

to convert to
hellohello

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should add some test coverage for this to prevent such regressions in the future

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added in d58c159

etrepum marked this conversation as resolved.
Show resolved Hide resolved
const importMarkdown = createMarkdownImport(
transformers,
shouldPreserveNewLines,
);
return importMarkdown(markdown, node);
return importMarkdown(sanitizedMarkdown, node);
}

/**
Expand Down
11 changes: 4 additions & 7 deletions packages/lexical-playground/__tests__/e2e/Markdown.spec.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -1310,7 +1310,6 @@ const IMPORTED_MARKDOWN_HTML = html`
bold italic strikethrough
</strong>
<span data-lexical-text="true">text,</span>
<br />
<strong
class="PlaygroundEditorTheme__textBold PlaygroundEditorTheme__textItalic PlaygroundEditorTheme__textStrikethrough"
data-lexical-text="true">
Expand Down Expand Up @@ -1408,9 +1407,7 @@ const IMPORTED_MARKDOWN_HTML = html`
dir="ltr">
<span data-lexical-text="true">Blockquotes text goes here</span>
<br />
<span data-lexical-text="true">And second</span>
<br />
<span data-lexical-text="true">line after</span>
<span data-lexical-text="true">And secondline after</span>
</blockquote>
<blockquote
class="PlaygroundEditorTheme__quote PlaygroundEditorTheme__ltr"
Expand Down Expand Up @@ -1488,9 +1485,9 @@ const IMPORTED_MARKDOWN_HTML = html`
class="PlaygroundEditorTheme__listItem PlaygroundEditorTheme__ltr"
dir="ltr"
value="1">
<span data-lexical-text="true">And can be nested</span>
<br />
<span data-lexical-text="true">and multiline as well</span>
<span data-lexical-text="true">
And can be nested and multiline as well
</span>
</li>
</ol>
</li>
Expand Down
Loading