Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Follow GFM spec on EM and STRONG delimiters #1686

Merged
merged 26 commits into from
Jul 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
40493bb
Follow GFM spec on Left-flanking-delimiter-runs
calculuschild May 21, 2020
4e2ec90
Now passes several more tests
calculuschild May 29, 2020
283ab9c
Deleted an extra line while removing comments
calculuschild May 29, 2020
c38ee23
Fix Pedantic
calculuschild May 30, 2020
7c6551e
Properly handle reflinks that should be escaped
calculuschild Jun 12, 2020
bc17ded
Lint
calculuschild Jun 12, 2020
ea203cf
Lint 2
calculuschild Jun 12, 2020
556070b
Updated rules for underscore em
calculuschild Jun 12, 2020
4cbba07
Moved logic into Tokenizer. No longer injecting Reflinks
calculuschild Jun 17, 2020
335a660
Added fixes to Strong
calculuschild Jun 17, 2020
e926e0c
Lint...
calculuschild Jun 17, 2020
c60c9ba
Remove extra tests accidentally left in
calculuschild Jun 17, 2020
54218fe
Remove straggling "shouldfail: false"
calculuschild Jun 17, 2020
2a45677
Remove redundant regex symbols
calculuschild Jun 18, 2020
d233fd5
mask reflinks
UziTech Jun 20, 2020
56b6f5e
Merge pull request #1 from UziTech/mask-reflinks
calculuschild Jun 30, 2020
4db32dc
Links are masked only once per inline string
calculuschild Jun 30, 2020
4e7902e
Gaaaah lint
calculuschild Jun 30, 2020
bd4f8c4
Fix unrestricted "any character" for REDOS
calculuschild Jul 2, 2020
211b9f9
Removed Lookbehinds
calculuschild Jul 8, 2020
cc778ad
Removed redundancy in "startEM" check
calculuschild Jul 8, 2020
226bbe7
Lint
calculuschild Jul 8, 2020
1fb141d
Make strEnd const
calculuschild Jul 9, 2020
ad720c1
Make emEnd const
calculuschild Jul 9, 2020
e27e6f9
Sorted strong and em into sub-objects
calculuschild Jul 9, 2020
6b729ed
Merge branch 'EmphasisFixes' of https://github.com/calculuschild/mark…
calculuschild Jul 9, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions src/Lexer.js
Original file line number Diff line number Diff line change
Expand Up @@ -319,9 +319,29 @@ module.exports = class Lexer {
/**
* Lexing/Compiling
*/
inlineTokens(src, tokens = [], inLink = false, inRawBlock = false) {
inlineTokens(src, tokens = [], inLink = false, inRawBlock = false, prevChar = '') {
let token;

// String with links masked to avoid interference with em and strong
let maskedSrc = src;
let match;

// Mask out reflinks
if (this.tokens.links) {
const links = Object.keys(this.tokens.links);
if (links.length > 0) {
while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) != null) {
if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) {
maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex);
}
}
}
}
// Mask out other blocks
while ((match = this.tokenizer.rules.inline.blockSkip.exec(maskedSrc)) != null) {
maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
}

while (src) {
// escape
if (token = this.tokenizer.escape(src)) {
Expand Down Expand Up @@ -360,15 +380,15 @@ module.exports = class Lexer {
}

// strong
if (token = this.tokenizer.strong(src)) {
if (token = this.tokenizer.strong(src, maskedSrc, prevChar)) {
src = src.substring(token.raw.length);
token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
tokens.push(token);
continue;
}

// em
if (token = this.tokenizer.em(src)) {
if (token = this.tokenizer.em(src, maskedSrc, prevChar)) {
src = src.substring(token.raw.length);
token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
tokens.push(token);
Expand Down Expand Up @@ -414,6 +434,7 @@ module.exports = class Lexer {
// text
if (token = this.tokenizer.inlineText(src, inRawBlock, smartypants)) {
src = src.substring(token.raw.length);
prevChar = token.raw.slice(-1);
tokens.push(token);
continue;
}
Expand Down
56 changes: 40 additions & 16 deletions src/Tokenizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -489,25 +489,49 @@ module.exports = class Tokenizer {
}
}

strong(src) {
const cap = this.rules.inline.strong.exec(src);
if (cap) {
return {
type: 'strong',
raw: cap[0],
text: cap[4] || cap[3] || cap[2] || cap[1]
};
strong(src, maskedSrc, prevChar = '') {
let match = this.rules.inline.strong.start.exec(src);

if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) {
maskedSrc = maskedSrc.slice(-1 * src.length);
const endReg = match[0] === '**' ? this.rules.inline.strong.endAst : this.rules.inline.strong.endUnd;

endReg.lastIndex = 0;

let cap;
while ((match = endReg.exec(maskedSrc)) != null) {
cap = this.rules.inline.strong.middle.exec(maskedSrc.slice(0, match.index + 3));
if (cap) {
return {
type: 'strong',
raw: src.slice(0, cap[0].length),
text: src.slice(2, cap[0].length - 2)
};
}
}
}
}

em(src) {
const cap = this.rules.inline.em.exec(src);
if (cap) {
return {
type: 'em',
raw: cap[0],
text: cap[6] || cap[5] || cap[4] || cap[3] || cap[2] || cap[1]
};
em(src, maskedSrc, prevChar = '') {
let match = this.rules.inline.em.start.exec(src);

if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) {
maskedSrc = maskedSrc.slice(-1 * src.length);
const endReg = match[0] === '*' ? this.rules.inline.em.endAst : this.rules.inline.em.endUnd;

endReg.lastIndex = 0;

let cap;
while ((match = endReg.exec(maskedSrc)) != null) {
cap = this.rules.inline.em.middle.exec(maskedSrc.slice(0, match.index + 2));
if (cap) {
return {
type: 'em',
raw: src.slice(0, cap[0].length),
text: src.slice(1, cap[0].length - 1)
};
}
}
}
}

Expand Down
88 changes: 79 additions & 9 deletions src/rules.js
Original file line number Diff line number Diff line change
Expand Up @@ -168,19 +168,74 @@ const inline = {
link: /^!?\[(label)\]\(\s*(href)(?:\s+(title))?\s*\)/,
reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/,
nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/,
strong: /^__([^\s_])__(?!_)|^\*\*([^\s*])\*\*(?!\*)|^__([^\s][\s\S]*?[^\s])__(?!_)|^\*\*([^\s][\s\S]*?[^\s])\*\*(?!\*)/,
em: /^_([^\s_])_(?!_)|^_([^\s_<][\s\S]*?[^\s_])_(?!_|[^\s,punctuation])|^_([^\s_<][\s\S]*?[^\s])_(?!_|[^\s,punctuation])|^\*([^\s*<\[])\*(?!\*)|^\*([^\s<"][\s\S]*?[^\s\[\*])\*(?![\]`punctuation])|^\*([^\s*"<\[][\s\S]*[^\s])\*(?!\*)/,
reflinkSearch: 'reflink|nolink(?!\\()',
strong: {
start: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])|__/, // (1) returns if starts w/ punctuation
middle: /^\*\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*\*$|^__(?![\s])((?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?)__$/,
endAst: /[^punctuation\s]\*\*(?!\*)|[punctuation]\*\*(?!\*)(?:(?=[punctuation\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline)
endUnd: /[^\s]__(?!_)(?:(?=[punctuation\s])|$)/ // last char can't be a space, and final _ must preceed punct or \s (or endline)
},
em: {
start: /^(?:(\*(?=[punctuation]))|\*)(?![*\s])|_/, // (1) returns if starts w/ punctuation
middle: /^\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*$|^_(?![_\s])(?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?_$/,
endAst: /[^punctuation\s]\*(?!\*)|[punctuation]\*(?!\*)(?:(?=[punctuation\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline)
endUnd: /[^\s]_(?!_)(?:(?=[punctuation\s])|$)/ // last char can't be a space, and final _ must preceed punct or \s (or endline)
},
code: /^(`+)([^`]|[^`][\s\S]*?[^`])\1(?!`)/,
br: /^( {2,}|\\)\n(?!\s*$)/,
del: noopTest,
text: /^(`+|[^`])(?:[\s\S]*?(?:(?=[\\<!\[`*]|\b_|$)|[^ ](?= {2,}\n))|(?= {2,}\n))/
text: /^(`+|[^`])(?:[\s\S]*?(?:(?=[\\<!\[`*]|\b_|$)|[^ ](?= {2,}\n))|(?= {2,}\n))/,
punctuation: /^([\s*punctuation])/
};

// list of punctuation marks from common mark spec
// without ` and ] to workaround Rule 17 (inline code blocks/links)
// without , to work around example 393
inline._punctuation = '!"#$%&\'()*+\\-./:;<=>?@\\[^_{|}~';
inline.em = edit(inline.em).replace(/punctuation/g, inline._punctuation).getRegex();
// without * and _ to workaround cases with double emphasis
inline._punctuation = '!"#$%&\'()+\\-.,/:;<=>?@\\[\\]`^{|}~';
inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex();

// Sequences em/strong matching should skip over: [title](link), `code`, <html>
inline._blockSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>';

// Complete __strong__ / **strong** runs, substituted for the `overlapSkip`
// placeholder in the em/strong `middle` rules.
// The asterisk branch must use the negated class [^*]. The previous form
// escaped the brackets (\[^\*\]), which compiled to a literal '[' followed by
// a start-of-input anchor and therefore could never match a `**...**` run.
inline._overlapSkip = '__[^_]*?__|\\*\\*[^\\*]*?\\*\\*';

// Build the final em sub-rules by substituting the textual placeholders
// (`punctuation`, `overlapSkip`) that appear inside the raw patterns above.

// Opening delimiter rule.
inline.em.start = edit(inline.em.start)
.replace(/punctuation/g, inline._punctuation)
.getRegex();

// Whole-run validation rule; the only em rule that contains `overlapSkip`.
inline.em.middle = edit(inline.em.middle)
.replace(/punctuation/g, inline._punctuation)
.replace(/overlapSkip/g, inline._overlapSkip)
.getRegex();

// Closing-delimiter rules. 'g' is passed as edit's second argument
// (presumably the regex flags - confirm against edit's definition); the
// tokenizer walks these with repeated exec() calls after resetting lastIndex.
inline.em.endAst = edit(inline.em.endAst, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.em.endUnd = edit(inline.em.endUnd, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();

// Build the final strong sub-rules by substituting the textual placeholders
// (`punctuation`, `overlapSkip`) that appear inside the raw patterns above.

// Opening delimiter rule.
inline.strong.start = edit(inline.strong.start)
  .replace(/punctuation/g, inline._punctuation)
  .getRegex();

// Whole-run validation rule. Its raw pattern contains the `overlapSkip`
// placeholder (not `blockSkip`), so that is the name that has to be
// substituted here, mirroring inline.em.middle. Substituting `blockSkip`
// left the literal word "overlapSkip" inside the compiled expression.
inline.strong.middle = edit(inline.strong.middle)
  .replace(/punctuation/g, inline._punctuation)
  .replace(/overlapSkip/g, inline._overlapSkip)
  .getRegex();

// Closing-delimiter rules. 'g' is passed as edit's second argument
// (presumably the regex flags - confirm against edit's definition); the
// tokenizer walks these with repeated exec() calls after resetting lastIndex.
inline.strong.endAst = edit(inline.strong.endAst, 'g')
  .replace(/punctuation/g, inline._punctuation)
  .getRegex();

inline.strong.endUnd = edit(inline.strong.endUnd, 'g')
  .replace(/punctuation/g, inline._punctuation)
  .getRegex();

// Standalone regex built from the _blockSkip source string, with 'g' passed
// as edit's second argument. The lexer runs it in an exec() loop to mask
// [title](link), `code` and <html> spans before em/strong matching.
inline.blockSkip = edit(inline._blockSkip, 'g')
.getRegex();

// Standalone regex built from the _overlapSkip source string, also with 'g'.
inline.overlapSkip = edit(inline._overlapSkip, 'g')
.getRegex();

inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;

Expand Down Expand Up @@ -212,6 +267,11 @@ inline.reflink = edit(inline.reflink)
.replace('label', inline._label)
.getRegex();

// Expand the 'reflink|nolink(?!\\()' template into a search regex by
// substituting the reflink and nolink rules for their placeholder names.
// 'g' is passed as edit's second argument; the lexer runs this rule in an
// exec() loop to find reference-style links it should mask.
inline.reflinkSearch = edit(inline.reflinkSearch, 'g')
.replace('reflink', inline.reflink)
.replace('nolink', inline.nolink)
.getRegex();

/**
* Normal Inline Grammar
*/
Expand All @@ -223,8 +283,18 @@ inline.normal = merge({}, inline);
*/

inline.pedantic = merge({}, inline.normal, {
strong: /^__(?=\S)([\s\S]*?\S)__(?!_)|^\*\*(?=\S)([\s\S]*?\S)\*\*(?!\*)/,
em: /^_(?=\S)([\s\S]*?\S)_(?!_)|^\*(?=\S)([\s\S]*?\S)\*(?!\*)/,
strong: {
start: /^__|\*\*/,
middle: /^__(?=\S)([\s\S]*?\S)__(?!_)|^\*\*(?=\S)([\s\S]*?\S)\*\*(?!\*)/,
endAst: /\*\*(?!\*)/g,
endUnd: /__(?!_)/g
},
em: {
start: /^_|\*/,
middle: /^()\*(?=\S)([\s\S]*?\S)\*(?!\*)|^_(?=\S)([\s\S]*?\S)_(?!_)/,
endAst: /\*(?!\*)/g,
endUnd: /_(?!_)/g
},
link: edit(/^!?\[(label)\]\((.*?)\)/)
.replace('label', inline._label)
.getRegex(),
Expand Down
Loading