Skip to content

Commit

Permalink
fix: Total rework of Emphasis/Strong (#1864)
Browse files Browse the repository at this point in the history
BREAKING CHANGE: `em` and `strong` tokenizers have been merged into one `emStrong` tokenizer
  • Loading branch information
calculuschild committed Feb 7, 2021
1 parent f848e77 commit 7293251
Show file tree
Hide file tree
Showing 14 changed files with 280 additions and 372 deletions.
164 changes: 73 additions & 91 deletions lib/marked.esm.js
Expand Up @@ -839,46 +839,61 @@ var Tokenizer_1 = class Tokenizer {
}
}

strong(src, maskedSrc, prevChar = '') {
let match = this.rules.inline.strong.start.exec(src);
emStrong(src, maskedSrc, prevChar = '') {
let match = this.rules.inline.emStrong.lDelim.exec(src);
if (!match) return;

if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) {
maskedSrc = maskedSrc.slice(-1 * src.length);
const endReg = match[0] === '**' ? this.rules.inline.strong.endAst : this.rules.inline.strong.endUnd;
if (match[3] && prevChar.match(/[\p{L}\p{N}]/u)) return; // _ can't be between two alphanumerics. \p{L}\p{N} includes non-english alphabet/numbers as well

const nextChar = match[1] || match[2] || '';

if (!nextChar || (nextChar && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) {
const lLength = match[0].length - 1;
let rDelim, rLength, delimTotal = lLength, midDelimTotal = 0;

const endReg = match[0][0] === '*' ? this.rules.inline.emStrong.rDelimAst : this.rules.inline.emStrong.rDelimUnd;
endReg.lastIndex = 0;

let cap;
maskedSrc = maskedSrc.slice(-1 * src.length + lLength); // Bump maskedSrc to same section of string as src (move to lexer?)

while ((match = endReg.exec(maskedSrc)) != null) {
cap = this.rules.inline.strong.middle.exec(maskedSrc.slice(0, match.index + 3));
if (cap) {
return {
type: 'strong',
raw: src.slice(0, cap[0].length),
text: src.slice(2, cap[0].length - 2)
};
rDelim = match[1] || match[2] || match[3] || match[4] || match[5] || match[6];

if (!rDelim) continue; // matched the first alternative in rules.js (skip the * in __abc*abc__)

rLength = rDelim.length;

if (match[3] || match[4]) { // found another Left Delim
delimTotal += rLength;
continue;
} else if (match[5] || match[6]) { // either Left or Right Delim
if (lLength % 3 && !((lLength + rLength) % 3)) {
midDelimTotal += rLength;
continue; // CommonMark Emphasis Rules 9-10
}
}
}
}
}

em(src, maskedSrc, prevChar = '') {
let match = this.rules.inline.em.start.exec(src);
delimTotal -= rLength;

if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) {
maskedSrc = maskedSrc.slice(-1 * src.length);
const endReg = match[0] === '*' ? this.rules.inline.em.endAst : this.rules.inline.em.endUnd;
if (delimTotal > 0) continue; // Haven't found enough closing delimiters

endReg.lastIndex = 0;
// If this is the last rDelimiter, remove extra characters. *a*** -> *a*
if (delimTotal + midDelimTotal - rLength <= 0 && !maskedSrc.slice(endReg.lastIndex).match(endReg)) {
rLength = Math.min(rLength, rLength + delimTotal + midDelimTotal);
}

let cap;
while ((match = endReg.exec(maskedSrc)) != null) {
cap = this.rules.inline.em.middle.exec(maskedSrc.slice(0, match.index + 2));
if (cap) {
if (Math.min(lLength, rLength) % 2) {
return {
type: 'em',
raw: src.slice(0, cap[0].length),
text: src.slice(1, cap[0].length - 1)
raw: src.slice(0, lLength + match.index + rLength + 1),
text: src.slice(1, lLength + match.index + rLength)
};
}
if (Math.min(lLength, rLength) % 2 === 0) {
return {
type: 'strong',
raw: src.slice(0, lLength + match.index + rLength + 1),
text: src.slice(2, lLength + match.index + rLength - 1)
};
}
}
Expand Down Expand Up @@ -1182,74 +1197,41 @@ const inline = {
reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/,
nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/,
reflinkSearch: 'reflink|nolink(?!\\()',
strong: {
start: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])|__/, // (1) returns if starts w/ punctuation
middle: /^\*\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*\*$|^__(?![\s])((?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?)__$/,
endAst: /[^punctuation\s]\*\*(?!\*)|[punctuation]\*\*(?!\*)(?:(?=[punctuation_\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline)
endUnd: /[^\s]__(?!_)(?:(?=[punctuation*\s])|$)/ // last char can't be a space, and final _ must preceed punct or \s (or endline)
},
em: {
start: /^(?:(\*(?=[punctuation]))|\*)(?![*\s])|_/, // (1) returns if starts w/ punctuation
middle: /^\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*$|^_(?![_\s])(?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?_$/,
endAst: /[^punctuation\s]\*(?!\*)|[punctuation]\*(?!\*)(?:(?=[punctuation_\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline)
endUnd: /[^\s]_(?!_)(?:(?=[punctuation*\s])|$)/ // last char can't be a space, and final _ must preceed punct or \s (or endline)
emStrong: {
lDelim: /^(?:\*+(?:([punct_])|[^\s*]))|^_+(?:([punct*])|([^\s_]))/,
// (1) and (2) can only be a Right Delimiter. (3) and (4) can only be Left. (5) and (6) can be either Left or Right.
// () Skip other delimiter (1) #*** (2) a***#, a*** (3) #***a, ***a (4) ***# (5) #***# (6) a***a
rDelimAst: /\_\_[^_]*?\*[^_]*?\_\_|[punct_](\*+)(?=[\s]|$)|[^punct*_\s](\*+)(?=[punct_\s]|$)|[punct_\s](\*+)(?=[^punct*_\s])|[\s](\*+)(?=[punct_])|[punct_](\*+)(?=[punct_])|[^punct*_\s](\*+)(?=[^punct*_\s])/,
rDelimUnd: /\*\*[^*]*?\_[^*]*?\*\*|[punct*](\_+)(?=[\s]|$)|[^punct*_\s](\_+)(?=[punct*\s]|$)|[punct*\s](\_+)(?=[^punct*_\s])|[\s](\_+)(?=[punct*])|[punct*](\_+)(?=[punct*])/ // ^- Not allowed for _
},
code: /^(`+)([^`]|[^`][\s\S]*?[^`])\1(?!`)/,
br: /^( {2,}|\\)\n(?!\s*$)/,
del: noopTest$1,
text: /^(`+|[^`])(?:(?= {2,}\n)|[\s\S]*?(?:(?=[\\<!\[`*]|\b_|$)|[^ ](?= {2,}\n)))/,
punctuation: /^([\s*punctuation])/
text: /^(`+|[^`])(?:(?= {2,}\n)|[\s\S]*?(?:(?=[\\<!\[`*_]|\b_|$)|[^ ](?= {2,}\n)))/,
punctuation: /^([\spunctuation])/
};

// list of punctuation marks from common mark spec
// without * and _ to workaround cases with double emphasis
// list of punctuation marks from CommonMark spec
// without * and _ to handle the different emphasis markers * and _
inline._punctuation = '!"#$%&\'()+\\-.,/:;<=>?@\\[\\]`^{|}~';
inline.punctuation = edit$1(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex();

// sequences em should skip over [title](link), `code`, <html>
inline._blockSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>';
inline._overlapSkip = '__[^_]*?__|\\*\\*\\[^\\*\\]*?\\*\\*';
inline.blockSkip = /\[[^\]]*?\]\([^\)]*?\)|`[^`]*?`|<[^>]*?>/g;
inline.escapedEmSt = /\\\*|\\_/g;

inline._comment = edit$1(block._comment).replace('(?:-->|$)', '-->').getRegex();

inline.em.start = edit$1(inline.em.start)
.replace(/punctuation/g, inline._punctuation)
inline.emStrong.lDelim = edit$1(inline.emStrong.lDelim)
.replace(/punct/g, inline._punctuation)
.getRegex();

inline.em.middle = edit$1(inline.em.middle)
.replace(/punctuation/g, inline._punctuation)
.replace(/overlapSkip/g, inline._overlapSkip)
inline.emStrong.rDelimAst = edit$1(inline.emStrong.rDelimAst, 'g')
.replace(/punct/g, inline._punctuation)
.getRegex();

inline.em.endAst = edit$1(inline.em.endAst, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.em.endUnd = edit$1(inline.em.endUnd, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.strong.start = edit$1(inline.strong.start)
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.strong.middle = edit$1(inline.strong.middle)
.replace(/punctuation/g, inline._punctuation)
.replace(/overlapSkip/g, inline._overlapSkip)
.getRegex();

inline.strong.endAst = edit$1(inline.strong.endAst, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.strong.endUnd = edit$1(inline.strong.endUnd, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();

inline.blockSkip = edit$1(inline._blockSkip, 'g')
.getRegex();

inline.overlapSkip = edit$1(inline._overlapSkip, 'g')
inline.emStrong.rDelimUnd = edit$1(inline.emStrong.rDelimUnd, 'g')
.replace(/punct/g, inline._punctuation)
.getRegex();

inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;
Expand Down Expand Up @@ -1328,7 +1310,7 @@ inline.gfm = merge$1({}, inline.normal, {
url: /^((?:ftp|https?):\/\/|www\.)(?:[a-zA-Z0-9\-]+\.?)+[^\s<]*|^email/,
_backpedal: /(?:[^?!.,:;*_~()&]+|\([^)]*\)|&(?![a-zA-Z0-9]+;$)|[?!.,:;*_~)]+(?!$))+/,
del: /^(~~?)(?=[^\s~])([\s\S]*?[^\s~])\1(?=[^~]|$)/,
text: /^([`~]+|[^`~])(?:(?= {2,}\n)|[\s\S]*?(?:(?=[\\<!\[`*~]|\b_|https?:\/\/|ftp:\/\/|www\.|$)|[^ ](?= {2,}\n)|[^a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-](?=[a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-]+@))|(?=[a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-]+@))/
text: /^([`~]+|[^`~])(?:(?= {2,}\n)|[\s\S]*?(?:(?=[\\<!\[`*~_]|\b_|https?:\/\/|ftp:\/\/|www\.|$)|[^ ](?= {2,}\n)|[^a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-](?=[a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-]+@))|(?=[a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-]+@))/
});

inline.gfm.url = edit$1(inline.gfm.url, 'i')
Expand Down Expand Up @@ -1704,11 +1686,17 @@ var Lexer_1 = class Lexer {
maskedSrc = maskedSrc.slice(0, match.index) + '[' + repeatString$1('a', match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
}

// Mask out escaped em & strong delimiters
while ((match = this.tokenizer.rules.inline.escapedEmSt.exec(maskedSrc)) != null) {
maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(this.tokenizer.rules.inline.escapedEmSt.lastIndex);
}

while (src) {
if (!keepPrevChar) {
prevChar = '';
}
keepPrevChar = false;

// escape
if (token = this.tokenizer.escape(src)) {
src = src.substring(token.raw.length);
Expand Down Expand Up @@ -1757,16 +1745,8 @@ var Lexer_1 = class Lexer {
continue;
}

// strong
if (token = this.tokenizer.strong(src, maskedSrc, prevChar)) {
src = src.substring(token.raw.length);
token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
tokens.push(token);
continue;
}

// em
if (token = this.tokenizer.em(src, maskedSrc, prevChar)) {
// em & strong
if (token = this.tokenizer.emStrong(src, maskedSrc, prevChar)) {
src = src.substring(token.raw.length);
token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
tokens.push(token);
Expand Down Expand Up @@ -1812,7 +1792,9 @@ var Lexer_1 = class Lexer {
// text
if (token = this.tokenizer.inlineText(src, inRawBlock, smartypants)) {
src = src.substring(token.raw.length);
prevChar = token.raw.slice(-1);
if (token.raw.slice(-1) !== '_') { // Track prevChar before string of ____ started
prevChar = token.raw.slice(-1);
}
keepPrevChar = true;
lastToken = tokens[tokens.length - 1];
if (lastToken && lastToken.type === 'text') {
Expand Down

1 comment on commit 7293251

@vercel
Copy link

@vercel vercel bot commented on 7293251 Feb 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.