Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Follow GFM spec on EM and STRONG delimiters #1686

Merged
merged 26 commits into from Jul 13, 2020
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
40493bb
Follow GFM spec on Left-flanking-delimiter-runs
calculuschild May 21, 2020
4e2ec90
Now passes several more tests
calculuschild May 29, 2020
283ab9c
Deleted an extra line while removing comments
calculuschild May 29, 2020
c38ee23
Fix Pedantic
calculuschild May 30, 2020
7c6551e
Properly handle reflinks that should be escaped
calculuschild Jun 12, 2020
bc17ded
Lint
calculuschild Jun 12, 2020
ea203cf
Lint 2
calculuschild Jun 12, 2020
556070b
Updated rules for underscore em
calculuschild Jun 12, 2020
4cbba07
Moved logic into Tokenizer. No longer injecting Reflinks
calculuschild Jun 17, 2020
335a660
Added fixes to Strong
calculuschild Jun 17, 2020
e926e0c
Lint...
calculuschild Jun 17, 2020
c60c9ba
Remove extra tests accidentally left in
calculuschild Jun 17, 2020
54218fe
Remove straggling "shouldfail: false"
calculuschild Jun 17, 2020
2a45677
Remove redundant regex symbols
calculuschild Jun 18, 2020
d233fd5
mask reflinks
UziTech Jun 20, 2020
56b6f5e
Merge pull request #1 from UziTech/mask-reflinks
calculuschild Jun 30, 2020
4db32dc
Links are masked only once per inline string
calculuschild Jun 30, 2020
4e7902e
Gaaaah lint
calculuschild Jun 30, 2020
bd4f8c4
Fix unrestricted "any character" for REDOS
calculuschild Jul 2, 2020
211b9f9
Removed Lookbehinds
calculuschild Jul 8, 2020
cc778ad
Removed redundancy in "startEM" check
calculuschild Jul 8, 2020
226bbe7
Lint
calculuschild Jul 8, 2020
1fb141d
Make strEnd const
calculuschild Jul 9, 2020
ad720c1
Make emEnd const
calculuschild Jul 9, 2020
e27e6f9
Sorted strong and em into sub-objects
calculuschild Jul 9, 2020
6b729ed
Merge branch 'EmphasisFixes' of https://github.com/calculuschild/mark…
calculuschild Jul 9, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
27 changes: 24 additions & 3 deletions src/Lexer.js
Expand Up @@ -319,9 +319,29 @@ module.exports = class Lexer {
/**
* Lexing/Compiling
*/
inlineTokens(src, tokens = [], inLink = false, inRawBlock = false) {
inlineTokens(src, tokens = [], inLink = false, inRawBlock = false, prevChar = '') {
let token;

// String with links masked to avoid interference with em and strong
let maskedSrc = src;
let match;

// Mask out reflinks
if (this.tokens.links) {
const links = Object.keys(this.tokens.links);
if (links.length > 0) {
while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) != null) {
if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) {
maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex);
}
}
}
}
// Mask out other blocks
while ((match = this.tokenizer.rules.inline.blockSkip.exec(maskedSrc)) != null) {
maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
}

while (src) {
// escape
if (token = this.tokenizer.escape(src)) {
Expand Down Expand Up @@ -360,15 +380,15 @@ module.exports = class Lexer {
}

// strong
if (token = this.tokenizer.strong(src)) {
if (token = this.tokenizer.strong(src, maskedSrc, prevChar)) {
src = src.substring(token.raw.length);
token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
tokens.push(token);
continue;
}

// em
if (token = this.tokenizer.em(src)) {
if (token = this.tokenizer.em(src, maskedSrc, prevChar)) {
src = src.substring(token.raw.length);
token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
tokens.push(token);
Expand Down Expand Up @@ -414,6 +434,7 @@ module.exports = class Lexer {
// text
if (token = this.tokenizer.inlineText(src, inRawBlock, smartypants)) {
src = src.substring(token.raw.length);
prevChar = token.raw.slice(-1);
tokens.push(token);
continue;
}
Expand Down
60 changes: 44 additions & 16 deletions src/Tokenizer.js
Expand Up @@ -489,25 +489,53 @@ module.exports = class Tokenizer {
}
}

strong(src) {
const cap = this.rules.inline.strong.exec(src);
if (cap) {
return {
type: 'strong',
raw: cap[0],
text: cap[4] || cap[3] || cap[2] || cap[1]
};
strong(src, maskedSrc, prevChar = '') {
let match = this.rules.inline.strStart.exec(src);

if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) {
maskedSrc = maskedSrc.slice(-1 * src.length);
let strEnd;

if (match[0] === '**') { strEnd = this.rules.inline.strEndAst; } else { strEnd = this.rules.inline.strEndUnd; }
calculuschild marked this conversation as resolved.
Show resolved Hide resolved

strEnd.lastIndex = 0;

let cap;
while ((match = strEnd.exec(maskedSrc)) != null) {
cap = this.rules.inline.strong.exec(maskedSrc.slice(0, match.index + 3));
if (cap) {
return {
type: 'strong',
raw: src.slice(0, cap[0].length),
text: src.slice(2, cap[0].length - 2)
};
}
}
}
}

em(src) {
const cap = this.rules.inline.em.exec(src);
if (cap) {
return {
type: 'em',
raw: cap[0],
text: cap[6] || cap[5] || cap[4] || cap[3] || cap[2] || cap[1]
};
em(src, maskedSrc, prevChar = '') {
let match = this.rules.inline.emStart.exec(src);

if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) {
maskedSrc = maskedSrc.slice(-1 * src.length);
let emEnd;

if (match[0] === '*') { emEnd = this.rules.inline.emEndAst; } else { emEnd = this.rules.inline.emEndUnd; }
calculuschild marked this conversation as resolved.
Show resolved Hide resolved

emEnd.lastIndex = 0;

let cap;
while ((match = emEnd.exec(maskedSrc)) != null) {
cap = this.rules.inline.em.exec(maskedSrc.slice(0, match.index + 2));
if (cap) {
return {
type: 'em',
raw: src.slice(0, cap[0].length),
text: src.slice(1, cap[0].length - 1)
};
}
}
}
}

Expand Down
73 changes: 65 additions & 8 deletions src/rules.js
Expand Up @@ -168,19 +168,71 @@ const inline = {
link: /^!?\[(label)\]\(\s*(href)(?:\s+(title))?\s*\)/,
reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/,
nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/,
strong: /^__([^\s_])__(?!_)|^\*\*([^\s*])\*\*(?!\*)|^__([^\s][\s\S]*?[^\s])__(?!_)|^\*\*([^\s][\s\S]*?[^\s])\*\*(?!\*)/,
em: /^_([^\s_])_(?!_)|^_([^\s_<][\s\S]*?[^\s_])_(?!_|[^\s,punctuation])|^_([^\s_<][\s\S]*?[^\s])_(?!_|[^\s,punctuation])|^\*([^\s*<\[])\*(?!\*)|^\*([^\s<"][\s\S]*?[^\s\[\*])\*(?![\]`punctuation])|^\*([^\s*"<\[][\s\S]*[^\s])\*(?!\*)/,
reflinkSearch: 'reflink|nolink(?!\\()',
strStart: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])|__/, // (1) returns if starts w/ punctuation
strEndAst: /[^punctuation\s]\*\*(?!\*)|[punctuation]\*\*(?!\*)(?:(?=[punctuation\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline)
strEndUnd: /[^\s]__(?!_)(?:(?=[punctuation\s])|$)/, // last char can't be a space, and final _ must precede punct or \s (or endline)
strong: /^\*\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*\*$|^__(?![\s])((?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?)__$/,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it would be better to put all of the regexes together like

strong: {
  start: ...
  middle: ...
  endAst: ...
  endUnd: ...
}

just to make it a little more readable. I'm not sure what implications that might have in the rest of the code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. That makes sense.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. I had to redo the pedantic section as well to get names matching but behavior should be the same.

emStart: /^(?:(\*(?=[punctuation]))|\*)(?![*\s])|_/, // (1) returns if starts w/ punctuation
emEndAst: /[^punctuation\s]\*(?!\*)|[punctuation]\*(?!\*)(?:(?=[punctuation\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline)
emEndUnd: /[^\s]_(?!_)(?:(?=[punctuation\s])|$)/, // last char can't be a space, and final _ must precede punct or \s (or endline)
// ⬐ skip overlapping Strong ⬐repeat logic for inner *'s (must be in pairs)| Underscores ⬐ skip overlapping Strong ⬐repeat logic for inner _'s (must be in pairs)⬎
em: /^\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*$|^_(?![_\s])(?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?_$/,
code: /^(`+)([^`]|[^`][\s\S]*?[^`])\1(?!`)/,
br: /^( {2,}|\\)\n(?!\s*$)/,
del: noopTest,
text: /^(`+|[^`])(?:[\s\S]*?(?:(?=[\\<!\[`*]|\b_|$)|[^ ](?= {2,}\n))|(?= {2,}\n))/
text: /^(`+|[^`])(?:[\s\S]*?(?:(?=[\\<!\[`*]|\b_|$)|[^ ](?= {2,}\n))|(?= {2,}\n))/,
punctuation: /^([\s*punctuation])/
};

// list of punctuation marks from common mark spec
// without ` and ] to workaround Rule 17 (inline code blocks/links)
// without , to work around example 393
// NOTE(review): this first list is reassigned a few lines below, so it only
// feeds the inline.em build on the next line. This listing appears to come
// from a diff view, so these lines are likely the pre-change version - confirm.
inline._punctuation = '!"#$%&\'()*+\\-./:;<=>?@\\[^_{|}~';
inline.em = edit(inline.em).replace(/punctuation/g, inline._punctuation).getRegex();
// without * and _ to workaround cases with double emphasis
// (this second list, unlike the first, does contain , ` and ])
inline._punctuation = '!"#$%&\'()+\\-.,/:;<=>?@\\[\\]`^{|}~';
// Fill the `punctuation` placeholder of the inline.punctuation rule with the list above.
inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex();

// sequences em should skip over [title](link), `code`, <html>
inline._blockSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>';
// Pattern fragment for a complete __...__ or **...** run, substituted for the
// `overlapSkip` placeholder in the em/strong rule sources.
// FIX: the asterisk branch escaped its brackets ('\\[^\\*\\]'), which made it
// match the literal text "[^*" instead of "any character except *". It now
// mirrors the underscore branch with a real negated character class.
inline._overlapSkip = '__[^_]*?__|\\*\\*[^\\*]*?\\*\\*';

// Build the final em/strong regexes by filling in the named placeholders
// (`punctuation`, `overlapSkip`) that appear in the rule sources.
// Where 'g' is passed as edit's second argument it is presumably the regex
// flags: the tokenizer resets lastIndex on the *End* rules and walks them
// with exec(), and the lexer reads blockSkip.lastIndex.

inline.em = edit(inline.em)
  .replace(/punctuation/g, inline._punctuation)
  .replace(/overlapSkip/g, inline._overlapSkip)
  .getRegex();

inline.emStart = edit(inline.emStart)
  .replace(/punctuation/g, inline._punctuation)
  .getRegex();

inline.emEndAst = edit(inline.emEndAst, 'g')
  .replace(/punctuation/g, inline._punctuation)
  .getRegex();

inline.emEndUnd = edit(inline.emEndUnd, 'g')
  .replace(/punctuation/g, inline._punctuation)
  .getRegex();

inline.blockSkip = edit(inline._blockSkip, 'g')
  .getRegex();

inline.overlapSkip = edit(inline._overlapSkip, 'g')
  .getRegex();

// FIX: the strong rule source is written with `overlapSkip` placeholders (it
// has no `blockSkip` placeholder), so `overlapSkip` is the name that must be
// substituted here, exactly as for inline.em. Substituting `blockSkip` left
// the placeholder word itself inside the pattern.
inline.strong = edit(inline.strong)
  .replace(/punctuation/g, inline._punctuation)
  .replace(/overlapSkip/g, inline._overlapSkip)
  .getRegex();

inline.strStart = edit(inline.strStart)
  .replace(/punctuation/g, inline._punctuation)
  .getRegex();

inline.strEndAst = edit(inline.strEndAst, 'g')
  .replace(/punctuation/g, inline._punctuation)
  .getRegex();

inline.strEndUnd = edit(inline.strEndUnd, 'g')
  .replace(/punctuation/g, inline._punctuation)
  .getRegex();

inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;

Expand Down Expand Up @@ -212,6 +264,11 @@ inline.reflink = edit(inline.reflink)
.replace('label', inline._label)
.getRegex();

// Assemble the pattern used to search for reference-style links: the
// `reflink` and `nolink` placeholders of the 'reflink|nolink(?!\\()' template
// are filled with the corresponding inline rules. 'g' is passed as edit's
// second argument (presumably regex flags - the lexer iterates this pattern
// with exec() and reads its lastIndex; confirm against edit's definition).
inline.reflinkSearch = edit(inline.reflinkSearch, 'g')
.replace('reflink', inline.reflink)
.replace('nolink', inline.nolink)
.getRegex();

/**
* Normal Inline Grammar
*/
Expand All @@ -224,7 +281,7 @@ inline.normal = merge({}, inline);

inline.pedantic = merge({}, inline.normal, {
strong: /^__(?=\S)([\s\S]*?\S)__(?!_)|^\*\*(?=\S)([\s\S]*?\S)\*\*(?!\*)/,
em: /^_(?=\S)([\s\S]*?\S)_(?!_)|^\*(?=\S)([\s\S]*?\S)\*(?!\*)/,
em: /^()\*(?=\S)([\s\S]*?\S)\*(?!\*)|^_(?=\S)([\s\S]*?\S)_(?!_)/,
link: edit(/^!?\[(label)\]\((.*?)\)/)
.replace('label', inline._label)
.getRegex(),
Expand Down