From 40493bbc2803f01929af069e75753f31adeedd12 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Thu, 21 May 2020 12:56:31 -0400 Subject: [PATCH 01/24] Follow GFM spec on Left-flanking-delimiter-runs --- src/rules.js | 4 +++- test/specs/commonmark/commonmark.0.29.json | 6 ++---- test/specs/gfm/commonmark.0.29.json | 6 ++---- test/specs/new/em_left_square_bracket.html | 1 + test/specs/new/em_left_square_bracket.md | 1 + 5 files changed, 9 insertions(+), 9 deletions(-) create mode 100644 test/specs/new/em_left_square_bracket.html create mode 100644 test/specs/new/em_left_square_bracket.md diff --git a/src/rules.js b/src/rules.js index b34f56ff8f..615133ed3a 100644 --- a/src/rules.js +++ b/src/rules.js @@ -169,7 +169,9 @@ const inline = { reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, strong: /^__([^\s_])__(?!_)|^\*\*([^\s*])\*\*(?!\*)|^__([^\s][\s\S]*?[^\s])__(?!_)|^\*\*([^\s][\s\S]*?[^\s])\*\*(?!\*)/, - em: /^_([^\s_])_(?!_)|^_([^\s_<][\s\S]*?[^\s_])_(?!_|[^\s,punctuation])|^_([^\s_<][\s\S]*?[^\s])_(?!_|[^\s,punctuation])|^\*([^\s*<\[])\*(?!\*)|^\*([^\s<"][\s\S]*?[^\s\[\*])\*(?![\]`punctuation])|^\*([^\s*"<\[][\s\S]*[^\s])\*(?!\*)/, + //em: /^_([^\s_])_(?!_)|^_([^\s_<][\s\S]*?[^\s_])_(?!_|[^\s,punctuation])|^_([^\s_<][\s\S]*?[^\s])_(?!_|[^\s,punctuation])|^\*([^\s*<\[])\*(?!\*)|^\*([\s\S]*?[^\s\[\*])\*(?![\]`punctuation])|^\*([^\s*"<\[][\s\S]*[^\s])\*(?!\*)/, + em: /^_([^\s_])_(?!_)|^_([^\s_<][\s\S]*?[^\s_])_(?!_|[^\s,punctuation])|^_([^\s_<][\s\S]*?[^\s])_(?!_|[^\s,punctuation])|^\*([^\s*<\[])\*(?!\*)|^(?:\*(?![\s`punctuation])|(?<=[\s`punctuation])\*(?=[`punctuation]))([\s\S]*?[^\s\[\*])\*(?![\]`punctuation])|^\*([^\s*"<\[][\s\S]*[^\s])\*(?!\*)/, + //em: /^_([^\s_])_(?!_)|^_([^\s_<][\s\S]*?[^\s_])_(?!_|[^\s,punctuation])|^_([^\s_<][\s\S]*?[^\s])_(?!_|[^\s,punctuation])|(?:^\*(?![\s\[`punctuation])|^(?<=[\s\[`punctuation])\*(?=[\[`punctuation]))([\s\S]*?)(?:(?[named link some-url-

diff --git a/test/specs/new/em_left_square_bracket.md b/test/specs/new/em_left_square_bracket.md new file mode 100644 index 0000000000..36289e4d8b --- /dev/null +++ b/test/specs/new/em_left_square_bracket.md @@ -0,0 +1 @@ +*[named link some-url-* From 4e2ec90374bc79591d20d2bef655b59ce960ad37 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Fri, 29 May 2020 16:25:46 -0400 Subject: [PATCH 02/24] Now passes several more tests Added a check for the previous character to the *em* Tokenizer. Needed to pass any tests where the em block starts with a punctuation character (e.g. commonmark example 368) --- src/Lexer.js | 9 +++++---- src/Tokenizer.js | 16 +++++++++------- src/rules.js | 10 +++++----- test/specs/commonmark/commonmark.0.29.json | 8 ++++---- test/specs/gfm/commonmark.0.29.json | 8 ++++---- test/specs/new/em_2char.html | 4 +--- test/specs/new/em_2char.md | 2 -- test/specs/new/em_left_square_bracket.html | 1 - test/specs/new/em_left_square_bracket.md | 1 - test/specs/what/em_left_square_bracket000.md | 1 + 10 files changed, 29 insertions(+), 31 deletions(-) delete mode 100644 test/specs/new/em_left_square_bracket.html delete mode 100644 test/specs/new/em_left_square_bracket.md create mode 100644 test/specs/what/em_left_square_bracket000.md diff --git a/src/Lexer.js b/src/Lexer.js index b961706ee7..519ef88758 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -267,7 +267,7 @@ module.exports = class Lexer { case 'text': case 'heading': { token.tokens = []; - this.inlineTokens(token.text, token.tokens); + this.inlineTokens(token.text, token.tokens, undefined, undefined, ); break; } case 'table': { @@ -319,7 +319,7 @@ module.exports = class Lexer { /** * Lexing/Compiling */ - inlineTokens(src, tokens = [], inLink = false, inRawBlock = false) { + inlineTokens(src, tokens = [], inLink = false, inRawBlock = false, prevChar = '') { let token; while (src) { @@ -360,7 +360,7 @@ module.exports = class Lexer { } // strong - if (token = this.tokenizer.strong(src)) { + if (token = this.tokenizer.strong(src, prevChar)) { src = src.substring(token.raw.length); token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock); tokens.push(token); @@ -368,7 +368,7 @@ module.exports = class Lexer { } // em - if (token = this.tokenizer.em(src)) { + if (token = this.tokenizer.em(src, prevChar)) { src = src.substring(token.raw.length); token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock); tokens.push(token); @@ -414,6 +414,7 @@ module.exports = class Lexer { // text if (token = this.tokenizer.inlineText(src, inRawBlock, smartypants)) { src = src.substring(token.raw.length); + prevChar = token.raw.slice(-1); tokens.push(token); continue; } diff --git a/src/Tokenizer.js b/src/Tokenizer.js index afe252fb41..77f48f91ff 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -489,7 +489,7 @@ module.exports = class Tokenizer { } } - strong(src) { + strong(src, prevChar='') { const cap = this.rules.inline.strong.exec(src); if (cap) { return { @@ -500,14 +500,16 @@ module.exports = class Tokenizer { } } - em(src) { + em(src, prevChar='') { const cap = this.rules.inline.em.exec(src); if (cap) { - return { - type: 'em', - raw: cap[0], - text: cap[6] || cap[5] || cap[4] || cap[3] || cap[2] || cap[1] - }; + if(!cap[1] || (cap[1] && (prevChar == '' || this.rules.inline.punctuation.exec(prevChar)))) { + return { + type: 'em', + raw: cap[0], + text: cap[8] || cap[7] || cap[6] || cap[5] || cap[4] || cap[3] || cap[2] + }; + } } } diff --git a/src/rules.js b/src/rules.js index 615133ed3a..0191f704ea 100644 --- a/src/rules.js +++ b/src/rules.js @@ -169,9 +169,9 @@ const inline = { reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, strong: /^__([^\s_])__(?!_)|^\*\*([^\s*])\*\*(?!\*)|^__([^\s][\s\S]*?[^\s])__(?!_)|^\*\*([^\s][\s\S]*?[^\s])\*\*(?!\*)/, - //em: /^_([^\s_])_(?!_)|^_([^\s_<][\s\S]*?[^\s_])_(?!_|[^\s,punctuation])|^_([^\s_<][\s\S]*?[^\s])_(?!_|[^\s,punctuation])|^\*([^\s*<\[])\*(?!\*)|^\*([\s\S]*?[^\s\[\*])\*(?![\]`punctuation])|^\*([^\s*"<\[][\s\S]*[^\s])\*(?!\*)/, - em: /^_([^\s_])_(?!_)|^_([^\s_<][\s\S]*?[^\s_])_(?!_|[^\s,punctuation])|^_([^\s_<][\s\S]*?[^\s])_(?!_|[^\s,punctuation])|^\*([^\s*<\[])\*(?!\*)|^(?:\*(?![\s`punctuation])|(?<=[\s`punctuation])\*(?=[`punctuation]))([\s\S]*?[^\s\[\*])\*(?![\]`punctuation])|^\*([^\s*"<\[][\s\S]*[^\s])\*(?!\*)/, - //em: /^_([^\s_])_(?!_)|^_([^\s_<][\s\S]*?[^\s_])_(?!_|[^\s,punctuation])|^_([^\s_<][\s\S]*?[^\s])_(?!_|[^\s,punctuation])|(?:^\*(?![\s\[`punctuation])|^(?<=[\s\[`punctuation])\*(?=[\[`punctuation]))([\s\S]*?)(?:(? (1) returns if starts w/ punctuation | (2) ⬐ Check groups [],``,<> to escape ⬐ escape if needed ⬐ repeated logic in case inner *'s exist inner *'s must occur in pairs⬎ Either: ⬐ last char can't be punct OR ⬐ final * must also be followed by punct (or endline) | (3) Underscores | (4) Underscores | (5) Underscores + em: /^(?:(\*(?=[`\]punctuation]))|\*)(?![\*\s])((?:(?:(?!\[.*?\]|`.*?`|<.*?>)(?:[^\*]|[\\\s]\*)|\[.*?\]|`.*?`|<.*?>)|(?:(?:(?!\[.*?\]|`.*?`|<.*?>)(?:[^\*]|[\\\s]\*)|\[.*?\]|`.*?`|<.*?>)*?(??@\\[^_{|}~'; +inline._punctuation = '!"#$%&\'()+\\-./:;<=>?@\\[^_{|}~'; inline.em = edit(inline.em).replace(/punctuation/g, inline._punctuation).getRegex(); inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g; diff --git a/test/specs/commonmark/commonmark.0.29.json b/test/specs/commonmark/commonmark.0.29.json index d6bda41299..0ee5930a87 100644 --- a/test/specs/commonmark/commonmark.0.29.json +++ b/test/specs/commonmark/commonmark.0.29.json @@ -2766,7 +2766,7 @@ "start_line": 6003, "end_line": 6007, "section": "Code spans", - "shouldFail": true + "shouldFail": false }, { "markdown": "[not a `link](/foo`)\n", @@ -2976,7 +2976,7 @@ "start_line": 6455, "end_line": 6459, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "*(*foo*)*\n", @@ -2985,7 +2985,7 @@ "start_line": 6465, "end_line": 6469, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "*foo*bar\n", @@ -3394,7 +3394,7 @@ "start_line": 6928, "end_line": 6932, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "*foo [*bar*](/url)*\n", diff --git a/test/specs/gfm/commonmark.0.29.json b/test/specs/gfm/commonmark.0.29.json index 03eeba9bf8..edb3d9d616 100644 --- a/test/specs/gfm/commonmark.0.29.json +++ b/test/specs/gfm/commonmark.0.29.json @@ -2766,7 +2766,7 @@ "start_line": 6003, "end_line": 6007, "section": "Code spans", - "shouldFail": true + "shouldFail": false }, { "markdown": "[not a `link](/foo`)\n", @@ -2976,7 +2976,7 @@ "start_line": 6455, "end_line": 6459, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail":false }, { "markdown": "*(*foo*)*\n", @@ -2985,7 +2985,7 @@ "start_line": 6465, "end_line": 6469, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "*foo*bar\n", @@ -3394,7 +3394,7 @@ "start_line": 6928, "end_line": 6932, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "*foo [*bar*](/url)*\n", diff --git a/test/specs/new/em_2char.html b/test/specs/new/em_2char.html index eb49036249..c28eb492d8 100644 --- a/test/specs/new/em_2char.html +++ b/test/specs/new/em_2char.html @@ -22,8 +22,6 @@

1_

-

1*

-

It’s leviOHsa, not levioSAH.

-

__ test test

\ No newline at end of file +

__ test test

diff --git a/test/specs/new/em_2char.md b/test/specs/new/em_2char.md index da34739179..37c2e03b25 100644 --- a/test/specs/new/em_2char.md +++ b/test/specs/new/em_2char.md @@ -22,8 +22,6 @@ _ 123_ _1__ -*1** - It’s levi*OH*sa, not levio*SAH.* __ test [test](https://test.com/_) diff --git a/test/specs/new/em_left_square_bracket.html b/test/specs/new/em_left_square_bracket.html deleted file mode 100644 index f2419b3773..0000000000 --- a/test/specs/new/em_left_square_bracket.html +++ /dev/null @@ -1 +0,0 @@ -

[named link some-url-

diff --git a/test/specs/new/em_left_square_bracket.md b/test/specs/new/em_left_square_bracket.md deleted file mode 100644 index 36289e4d8b..0000000000 --- a/test/specs/new/em_left_square_bracket.md +++ /dev/null @@ -1 +0,0 @@ -*[named link some-url-* diff --git a/test/specs/what/em_left_square_bracket000.md b/test/specs/what/em_left_square_bracket000.md new file mode 100644 index 0000000000..7bc24b7d2e --- /dev/null +++ b/test/specs/what/em_left_square_bracket000.md @@ -0,0 +1 @@ +foo *_* From 283ab9cf8aadb896881ede2c7c77e8d328cbfd9e Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Fri, 29 May 2020 16:33:49 -0400 Subject: [PATCH 03/24] Deleted an extra line while removing comments --- src/rules.js | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/rules.js b/src/rules.js index 0191f704ea..6bbbfc4b97 100644 --- a/src/rules.js +++ b/src/rules.js @@ -175,13 +175,16 @@ const inline = { code: /^(`+)([^`]|[^`][\s\S]*?[^`])\1(?!`)/, br: /^( {2,}|\\)\n(?!\s*$)/, del: noopTest, - text: /^(`+|[^`])(?:[\s\S]*?(?:(?=[\\?@\\[^_{|}~'; +inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex(); + inline.em = edit(inline.em).replace(/punctuation/g, inline._punctuation).getRegex(); inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g; From c38ee23f45aab44f55687b44c81ddcab0f25e714 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Fri, 29 May 2020 20:53:58 -0400 Subject: [PATCH 04/24] Fix Pedantic --- src/rules.js | 2 +- test/specs/run-spec.js | 2 ++ test/specs/what/em_left_square_bracket.html | 4 ++++ test/specs/what/em_left_square_bracket.md | 10 ++++++++++ test/specs/what/em_left_square_bracket0.html | 1 + test/specs/what/em_left_square_bracket0.md | 1 + test/specs/what/em_left_square_bracket00.html | 1 + test/specs/what/em_left_square_bracket00.md | 1 + test/specs/what/em_left_square_bracket000.html | 1 + test/specs/what/em_left_square_bracket0000.html | 1 + test/specs/what/em_left_square_bracket0000.md | 1 + test/specs/whats/strong_and_em_together.html | 7 +++++++ test/specs/whats/strong_and_em_together.md | 7 +++++++ 13 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 test/specs/what/em_left_square_bracket.html create mode 100644 test/specs/what/em_left_square_bracket.md create mode 100644 test/specs/what/em_left_square_bracket0.html create mode 100644 test/specs/what/em_left_square_bracket0.md create mode 100644 test/specs/what/em_left_square_bracket00.html create mode 100644 test/specs/what/em_left_square_bracket00.md create mode 100644 test/specs/what/em_left_square_bracket000.html create mode 100644 test/specs/what/em_left_square_bracket0000.html create mode 100644 test/specs/what/em_left_square_bracket0000.md create mode 100644 test/specs/whats/strong_and_em_together.html create mode 100644 test/specs/whats/strong_and_em_together.md diff --git a/src/rules.js b/src/rules.js index 6bbbfc4b97..be90fcfcb6 100644 --- a/src/rules.js +++ b/src/rules.js @@ -229,7 +229,7 @@ inline.normal = merge({}, inline); inline.pedantic = merge({}, inline.normal, { strong: /^__(?=\S)([\s\S]*?\S)__(?!_)|^\*\*(?=\S)([\s\S]*?\S)\*\*(?!\*)/, - em: /^_(?=\S)([\s\S]*?\S)_(?!_)|^\*(?=\S)([\s\S]*?\S)\*(?!\*)/, + em: /^()\*(?=\S)([\s\S]*?\S)\*(?!\*)|^_(?=\S)([\s\S]*?\S)_(?!_)/, link: edit(/^!?\[(label)\]\((.*?)\)/) .replace('label', inline._label) .getRegex(), diff --git a/test/specs/run-spec.js b/test/specs/run-spec.js index 5fb5651b6d..debf91ef2d 100644 --- a/test/specs/run-spec.js +++ b/test/specs/run-spec.js @@ -53,3 +53,5 @@ runSpecs('Original', './original', false, { gfm: false, pedantic: true }); runSpecs('New', './new'); runSpecs('ReDOS', './redos'); runSpecs('Security', './security', false, { silent: true }); // silent - do not show deprecation warning + +//runSpecs('Whats', './whats'); diff --git a/test/specs/what/em_left_square_bracket.html b/test/specs/what/em_left_square_bracket.html new file mode 100644 index 0000000000..7132caf872 --- /dev/null +++ b/test/specs/what/em_left_square_bracket.html @@ -0,0 +1,4 @@ +

[[punctuation, asterisk, punctuation should work

+

[space, asterisk, punctuation should work

+

pnon-punctuation, asterisk, non-punctuation should work

+

p*[non-punctuation, asterisk, punctuation should NOT work*

diff --git a/test/specs/what/em_left_square_bracket.md b/test/specs/what/em_left_square_bracket.md new file mode 100644 index 0000000000..4f6d7a70de --- /dev/null +++ b/test/specs/what/em_left_square_bracket.md @@ -0,0 +1,10 @@ +[*[punctuation, asterisk, punctuation should work* + + + *[space, asterisk, punctuation should work* + + +p*non-punctuation, asterisk, non-punctuation should work* + + +p*[non-punctuation, asterisk, punctuation should NOT work* diff --git a/test/specs/what/em_left_square_bracket0.html b/test/specs/what/em_left_square_bracket0.html new file mode 100644 index 0000000000..8f88af1862 --- /dev/null +++ b/test/specs/what/em_left_square_bracket0.html @@ -0,0 +1 @@ +

foo bar

diff --git a/test/specs/what/em_left_square_bracket0.md b/test/specs/what/em_left_square_bracket0.md new file mode 100644 index 0000000000..300738b412 --- /dev/null +++ b/test/specs/what/em_left_square_bracket0.md @@ -0,0 +1 @@ +*foo *bar** diff --git a/test/specs/what/em_left_square_bracket00.html b/test/specs/what/em_left_square_bracket00.html new file mode 100644 index 0000000000..67ef9766e2 --- /dev/null +++ b/test/specs/what/em_left_square_bracket00.html @@ -0,0 +1 @@ +

foo bar baz

diff --git a/test/specs/what/em_left_square_bracket00.md b/test/specs/what/em_left_square_bracket00.md new file mode 100644 index 0000000000..bc4e9514af --- /dev/null +++ b/test/specs/what/em_left_square_bracket00.md @@ -0,0 +1 @@ +*foo **bar** baz* diff --git a/test/specs/what/em_left_square_bracket000.html b/test/specs/what/em_left_square_bracket000.html new file mode 100644 index 0000000000..2a71b393e9 --- /dev/null +++ b/test/specs/what/em_left_square_bracket000.html @@ -0,0 +1 @@ +

foo _

diff --git a/test/specs/what/em_left_square_bracket0000.html b/test/specs/what/em_left_square_bracket0000.html new file mode 100644 index 0000000000..cd620e6acf --- /dev/null +++ b/test/specs/what/em_left_square_bracket0000.html @@ -0,0 +1 @@ +

(foo)

diff --git a/test/specs/what/em_left_square_bracket0000.md b/test/specs/what/em_left_square_bracket0000.md new file mode 100644 index 0000000000..261a3189a0 --- /dev/null +++ b/test/specs/what/em_left_square_bracket0000.md @@ -0,0 +1 @@ +*(**foo**)* diff --git a/test/specs/whats/strong_and_em_together.html b/test/specs/whats/strong_and_em_together.html new file mode 100644 index 0000000000..71ec78c709 --- /dev/null +++ b/test/specs/whats/strong_and_em_together.html @@ -0,0 +1,7 @@ +

This is strong and em.

+ +

So is this word.

+ +

This is strong and em.

+ +

So is this word.

diff --git a/test/specs/whats/strong_and_em_together.md b/test/specs/whats/strong_and_em_together.md new file mode 100644 index 0000000000..95ee690dbe --- /dev/null +++ b/test/specs/whats/strong_and_em_together.md @@ -0,0 +1,7 @@ +***This is strong and em.*** + +So is ***this*** word. + +___This is strong and em.___ + +So is ___this___ word. From 7c6551e5422172ebfda6b0acf9d900fb92aaa595 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Fri, 12 Jun 2020 15:30:25 -0400 Subject: [PATCH 05/24] Properly handle reflinks that should be escaped Modifies the em rule after the block tokens are generated to detect known reflinks and skip over them so they don't get mistakenly italicized. --- src/Lexer.js | 9 ++++++++- src/rules.js | 12 +++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/Lexer.js b/src/Lexer.js index 519ef88758..dc476dfe1f 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -1,6 +1,7 @@ const Tokenizer = require('./Tokenizer.js'); const { defaults } = require('./defaults.js'); const { block, inline } = require('./rules.js'); +const { edit } = require('./helpers.js'); /** * smartypants text replacement @@ -102,6 +103,12 @@ module.exports = class Lexer { this.blockTokens(src, this.tokens, true); + //Insert known reflinks into em rules to properly skip over them + var rep = Object.keys(this.tokens.links).join('|').replace(/\*/g,'\\*'); + this.tokenizer.rules.inline.em = edit(inline.em) + .replace(/reflink/g, rep) + .getRegex(); + this.inline(this.tokens); return this.tokens; @@ -368,7 +375,7 @@ module.exports = class Lexer { } // em - if (token = this.tokenizer.em(src, prevChar)) { + if (token = this.tokenizer.em(src, prevChar, this.tokens.links)) { src = src.substring(token.raw.length); token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock); tokens.push(token); diff --git a/src/rules.js b/src/rules.js index be90fcfcb6..0c9c5c6742 100644 --- a/src/rules.js +++ b/src/rules.js @@ -169,8 +169,8 @@ const inline = { reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, strong: /^__([^\s_])__(?!_)|^\*\*([^\s*])\*\*(?!\*)|^__([^\s][\s\S]*?[^\s])__(?!_)|^\*\*([^\s][\s\S]*?[^\s])\*\*(?!\*)/, - //==> (1) returns if starts w/ punctuation | (2) ⬐ Check groups [],``,<> to escape ⬐ escape if needed ⬐ repeated logic in case inner *'s exist inner *'s must occur in pairs⬎ Either: ⬐ last char can't be punct OR ⬐ final * must also be followed by punct (or endline) | (3) Underscores | (4) Underscores | (5) Underscores - em: /^(?:(\*(?=[`\]punctuation]))|\*)(?![\*\s])((?:(?:(?!\[.*?\]|`.*?`|<.*?>)(?:[^\*]|[\\\s]\*)|\[.*?\]|`.*?`|<.*?>)|(?:(?:(?!\[.*?\]|`.*?`|<.*?>)(?:[^\*]|[\\\s]\*)|\[.*?\]|`.*?`|<.*?>)*?(? (1) returns if starts w/ punctuation | (2) ⬐Check groups to skip over ⬐ skip if needed ⬐repeat logic for inner *'s (must be in pairs)⬎ ⬐last char can't be punct OR ⬐final * must also be followed by punct (or endline) | (3) Underscores | (4) Underscores | (5) Underscores + em: /^(?:(\*(?=[`\]punctuation]))|\*)(?![\*\s])((?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)*?(??@\\[^_{|}~'; inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex(); -inline.em = edit(inline.em).replace(/punctuation/g, inline._punctuation).getRegex(); +// sequences em should skip over [reflink], [title][reflink], [title](link), `code`, +inline._emSkip = '\\[reflink\\]|\\[.*?\\]\\[reflink\\]|\\[.*?\\]\\(.*?\\)|`.*?`|<.*?>'; + +inline.em = edit(inline.em) + .replace(/punctuation/g, inline._punctuation) + .replace(/emSkip/g, inline._emSkip) + .getRegex(); inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g; From bc17deddb65a6baa3c7a566143df70481d1c9103 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Fri, 12 Jun 2020 15:37:18 -0400 Subject: [PATCH 06/24] Lint --- src/Lexer.js | 6 +++--- src/Tokenizer.js | 6 +++--- src/rules.js | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/Lexer.js b/src/Lexer.js index dc476dfe1f..b275ed5861 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -103,8 +103,8 @@ module.exports = class Lexer { this.blockTokens(src, this.tokens, true); - //Insert known reflinks into em rules to properly skip over them - var rep = Object.keys(this.tokens.links).join('|').replace(/\*/g,'\\*'); + // Insert known reflinks into em rules to properly skip over them + const rep = Object.keys(this.tokens.links).join('|').replace(/\*/g, '\\*'); this.tokenizer.rules.inline.em = edit(inline.em) .replace(/reflink/g, rep) .getRegex(); @@ -274,7 +274,7 @@ module.exports = class Lexer { case 'text': case 'heading': { token.tokens = []; - this.inlineTokens(token.text, token.tokens, undefined, undefined, ); + this.inlineTokens(token.text, token.tokens, undefined, undefined); break; } case 'table': { diff --git a/src/Tokenizer.js b/src/Tokenizer.js index 77f48f91ff..de71e130ed 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -489,7 +489,7 @@ module.exports = class Tokenizer { } } - strong(src, prevChar='') { + strong(src, prevChar = '') { const cap = this.rules.inline.strong.exec(src); if (cap) { return { @@ -500,10 +500,10 @@ module.exports = class Tokenizer { } } - em(src, prevChar='') { + em(src, prevChar = '') { const cap = this.rules.inline.em.exec(src); if (cap) { - if(!cap[1] || (cap[1] && (prevChar == '' || this.rules.inline.punctuation.exec(prevChar)))) { + if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) { return { type: 'em', raw: cap[0], diff --git a/src/rules.js b/src/rules.js index 0c9c5c6742..8000e751ea 100644 --- a/src/rules.js +++ b/src/rules.js @@ -169,9 +169,8 @@ const inline = { reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, strong: /^__([^\s_])__(?!_)|^\*\*([^\s*])\*\*(?!\*)|^__([^\s][\s\S]*?[^\s])__(?!_)|^\*\*([^\s][\s\S]*?[^\s])\*\*(?!\*)/, - //==> (1) returns if starts w/ punctuation | (2) ⬐Check groups to skip over ⬐ skip if needed ⬐repeat logic for inner *'s (must be in pairs)⬎ ⬐last char can't be punct OR ⬐final * must also be followed by punct (or endline) | (3) Underscores | (4) Underscores | (5) Underscores + // (1) returns if starts w/ punctuation | (2) ⬐Check groups to skip over ⬐ skip if needed ⬐repeat logic for inner *'s (must be in pairs)⬎ ⬐last char can't be punct OR ⬐final * must also be followed by punct (or endline) | (3) Underscores | (4) Underscores | (5) Underscores em: /^(?:(\*(?=[`\]punctuation]))|\*)(?![\*\s])((?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)*?(? Date: Fri, 12 Jun 2020 15:40:13 -0400 Subject: [PATCH 07/24] Lint 2 --- test/specs/run-spec.js | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/specs/run-spec.js b/test/specs/run-spec.js index debf91ef2d..5fb5651b6d 100644 --- a/test/specs/run-spec.js +++ b/test/specs/run-spec.js @@ -53,5 +53,3 @@ runSpecs('Original', './original', false, { gfm: false, pedantic: true }); runSpecs('New', './new'); runSpecs('ReDOS', './redos'); runSpecs('Security', './security', false, { silent: true }); // silent - do not show deprecation warning - -//runSpecs('Whats', './whats'); From 556070bffc361c440d091fd94714859196058aff Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Fri, 12 Jun 2020 16:29:25 -0400 Subject: [PATCH 08/24] Updated rules for underscore em Now fixes three more cases --- src/Tokenizer.js | 2 +- src/rules.js | 9 ++++----- test/specs/commonmark/commonmark.0.29.json | 6 +++--- test/specs/gfm/commonmark.0.29.json | 6 +++--- test/specs/new/em_2char.html | 2 -- test/specs/new/em_2char.md | 2 -- 6 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/Tokenizer.js b/src/Tokenizer.js index de71e130ed..0c20d029ae 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -507,7 +507,7 @@ module.exports = class Tokenizer { return { type: 'em', raw: cap[0], - text: cap[8] || cap[7] || cap[6] || cap[5] || cap[4] || cap[3] || cap[2] + text: cap[3] || cap[2] }; } } diff --git a/src/rules.js b/src/rules.js index 8000e751ea..fc3a5e2646 100644 --- a/src/rules.js +++ b/src/rules.js @@ -169,8 +169,8 @@ const inline = { reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, strong: /^__([^\s_])__(?!_)|^\*\*([^\s*])\*\*(?!\*)|^__([^\s][\s\S]*?[^\s])__(?!_)|^\*\*([^\s][\s\S]*?[^\s])\*\*(?!\*)/, - // (1) returns if starts w/ punctuation | (2) ⬐Check groups to skip over ⬐ skip if needed ⬐repeat logic for inner *'s (must be in pairs)⬎ ⬐last char can't be punct OR ⬐final * must also be followed by punct (or endline) | (3) Underscores | (4) Underscores | (5) Underscores - em: /^(?:(\*(?=[`\]punctuation]))|\*)(?![\*\s])((?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)*?(??@\\[^_{|}~'; +// without * and _ to workaround cases with double emphasis +inline._punctuation = '!"#$%&\'()+\\-.,/:;<=>?@\\[\\]`^{|}~'; inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex(); // sequences em should skip over [reflink], [title][reflink], [title](link), `code`, diff --git a/test/specs/commonmark/commonmark.0.29.json b/test/specs/commonmark/commonmark.0.29.json index 0ee5930a87..276477b61e 100644 --- a/test/specs/commonmark/commonmark.0.29.json +++ b/test/specs/commonmark/commonmark.0.29.json @@ -3010,7 +3010,7 @@ "start_line": 6497, "end_line": 6501, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "_(_foo_)_\n", @@ -3019,7 +3019,7 @@ "start_line": 6506, "end_line": 6510, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "_foo_bar\n", @@ -3301,7 +3301,7 @@ "start_line": 6824, "end_line": 6828, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "__foo_ bar_\n", diff --git a/test/specs/gfm/commonmark.0.29.json b/test/specs/gfm/commonmark.0.29.json index edb3d9d616..7422d714fe 100644 --- a/test/specs/gfm/commonmark.0.29.json +++ b/test/specs/gfm/commonmark.0.29.json @@ -3010,7 +3010,7 @@ "start_line": 6497, "end_line": 6501, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "_(_foo_)_\n", @@ -3019,7 +3019,7 @@ "start_line": 6506, "end_line": 6510, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "_foo_bar\n", @@ -3301,7 +3301,7 @@ "start_line": 6824, "end_line": 6828, "section": "Emphasis and strong emphasis", - "shouldFail": true + "shouldFail": false }, { "markdown": "__foo_ bar_\n", diff --git a/test/specs/new/em_2char.html b/test/specs/new/em_2char.html index c28eb492d8..9ffe69be1c 100644 --- a/test/specs/new/em_2char.html +++ b/test/specs/new/em_2char.html @@ -20,8 +20,6 @@

_ 123_

-

1_

-

It’s leviOHsa, not levioSAH.

__ test test

diff --git a/test/specs/new/em_2char.md b/test/specs/new/em_2char.md index 37c2e03b25..a1f5cf18b6 100644 --- a/test/specs/new/em_2char.md +++ b/test/specs/new/em_2char.md @@ -20,8 +20,6 @@ _123 _ _ 123_ -_1__ - It’s levi*OH*sa, not levio*SAH.* __ test [test](https://test.com/_) From 4cbba074d6e7b8e6e829dc846b18982e6bda181b Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Wed, 17 Jun 2020 00:41:06 -0400 Subject: [PATCH 09/24] Moved logic into Tokenizer. No longer injecting Reflinks Reflinks are replaced with the string 'aaaa' of equal length to hide any asterisks or underscores that might have been inside, since these should not be included. Then, the resulting string is matched to the em regex. The positions of the captures are then used on the original string. --- src/Lexer.js | 9 +----- src/Tokenizer.js | 34 +++++++++++++++++----- src/rules.js | 5 ++-- test/specs/commonmark/commonmark.0.29.json | 21 +++++-------- test/specs/gfm/commonmark.0.29.json | 18 ++++-------- test/specs/new/em_and_reflinks.html | 4 +++ test/specs/new/em_and_reflinks.md | 11 +++++++ 7 files changed, 58 insertions(+), 44 deletions(-) create mode 100644 test/specs/new/em_and_reflinks.html create mode 100644 test/specs/new/em_and_reflinks.md diff --git a/src/Lexer.js b/src/Lexer.js index b275ed5861..ebe15cb170 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -1,7 +1,6 @@ const Tokenizer = require('./Tokenizer.js'); const { defaults } = require('./defaults.js'); const { block, inline } = require('./rules.js'); -const { edit } = require('./helpers.js'); /** * smartypants text replacement @@ -103,12 +102,6 @@ module.exports = class Lexer { this.blockTokens(src, this.tokens, true); - // Insert known reflinks into em rules to properly skip over them - const rep = Object.keys(this.tokens.links).join('|').replace(/\*/g, '\\*'); - this.tokenizer.rules.inline.em = edit(inline.em) - .replace(/reflink/g, rep) - .getRegex(); - this.inline(this.tokens); return this.tokens; @@ -274,7 +267,7 @@ module.exports = class Lexer { case 'text': case 'heading': { token.tokens = []; - this.inlineTokens(token.text, token.tokens, undefined, undefined); + this.inlineTokens(token.text, token.tokens); break; } case 'table': { diff --git a/src/Tokenizer.js b/src/Tokenizer.js index 0c20d029ae..1b3acc90a2 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -500,15 +500,33 @@ module.exports = class Tokenizer { } } - em(src, prevChar = '') { - const cap = this.rules.inline.em.exec(src); + em(src, prevChar = '', links) { + let cap = this.rules.inline.preEm.exec(src); + if (cap) { - if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) { - return { - type: 'em', - raw: cap[0], - text: cap[3] || cap[2] - }; + let text = src; + + if (links) { + links = Object.keys(links); + const reg = /(?:\[.*?\]\[.*?\])|(?:\[.*?\](?!\())/g; + let match; + while ((match = reg.exec(text)) != null) { + if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) { + text = text.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + text.slice(reg.lastIndex); + } + } + } + + cap = this.rules.inline.em.exec(text); + + if (cap) { + if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) { + return { + type: 'em', + raw: src.slice(0, cap[0].length), + text: src.slice(1, cap[0].length - 1) + }; + } } } } diff --git a/src/rules.js b/src/rules.js index fc3a5e2646..99dba29e8a 100644 --- a/src/rules.js +++ b/src/rules.js @@ -169,6 +169,7 @@ const inline = { reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, strong: /^__([^\s_])__(?!_)|^\*\*([^\s*])\*\*(?!\*)|^__([^\s][\s\S]*?[^\s])__(?!_)|^\*\*([^\s][\s\S]*?[^\s])\*\*(?!\*)/, + preEm: /^[\*_]/, // (1) returns if starts w/ punctuation | (2) ⬐Check groups to skip over ⬐ skip if needed ⬐repeat logic for inner *'s (must be in pairs)⬎ ⬐last char can't be punct OR ⬐final * must also be followed by punct (or endline) | (3) Underscores ⬐Check groups to skip over ⬐ skip if needed ⬐repeat logic for inner _'s (must be in pairs)⬎ ⬐last char can't be a space, and final _ must be followed by punct (or endline) em: /^(?:(\*(?=[`\]punctuation]))|\*)(?![\*\s])((?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)*?(??@\\[\\]`^{|}~'; inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex(); -// sequences em should skip over [reflink], [title][reflink], [title](link), `code`, -inline._emSkip = '\\[reflink\\]|\\[.*?\\]\\[reflink\\]|\\[.*?\\]\\(.*?\\)|`.*?`|<.*?>'; +// sequences em should skip over [title](link), `code`, +inline._emSkip = '\\[.*?\\]\\(.*?\\)|`.*?`|<.*?>'; inline.em = edit(inline.em) .replace(/punctuation/g, inline._punctuation) diff --git a/test/specs/commonmark/commonmark.0.29.json b/test/specs/commonmark/commonmark.0.29.json index 276477b61e..0be40a1b73 100644 --- a/test/specs/commonmark/commonmark.0.29.json +++ b/test/specs/commonmark/commonmark.0.29.json @@ -2765,8 +2765,7 @@ "example": 341, "start_line": 6003, "end_line": 6007, - "section": "Code spans", - "shouldFail": false + "section": "Code spans" }, { "markdown": "[not a `link](/foo`)\n", @@ -2975,8 +2974,7 @@ "example": 367, "start_line": 6455, "end_line": 6459, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "*(*foo*)*\n", @@ -2984,8 +2982,7 @@ "example": 368, "start_line": 6465, "end_line": 6469, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "*foo*bar\n", @@ -3009,8 +3006,7 @@ "example": 371, "start_line": 6497, "end_line": 6501, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "_(_foo_)_\n", @@ -3018,8 +3014,7 @@ "example": 372, "start_line": 6506, "end_line": 6510, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "_foo_bar\n", @@ -3300,8 +3295,7 @@ "example": 406, "start_line": 6824, "end_line": 6828, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "__foo_ bar_\n", @@ -3393,8 +3387,7 @@ "example": 417, "start_line": 6928, "end_line": 6932, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "*foo [*bar*](/url)*\n", diff --git a/test/specs/gfm/commonmark.0.29.json b/test/specs/gfm/commonmark.0.29.json index 7422d714fe..273226e91a 100644 --- a/test/specs/gfm/commonmark.0.29.json +++ b/test/specs/gfm/commonmark.0.29.json @@ -2765,8 +2765,7 @@ "example": 341, "start_line": 6003, "end_line": 6007, - "section": "Code spans", - "shouldFail": false + "section": "Code spans" }, { "markdown": "[not a `link](/foo`)\n", @@ -2984,8 +2983,7 @@ "example": 368, "start_line": 6465, "end_line": 6469, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "*foo*bar\n", @@ -3009,8 +3007,7 @@ "example": 371, "start_line": 6497, "end_line": 6501, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "_(_foo_)_\n", @@ -3018,8 +3015,7 @@ "example": 372, "start_line": 6506, "end_line": 6510, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "_foo_bar\n", @@ -3300,8 +3296,7 @@ "example": 406, "start_line": 6824, "end_line": 6828, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "__foo_ bar_\n", @@ -3393,8 +3388,7 @@ "example": 417, "start_line": 6928, "end_line": 6932, - "section": "Emphasis and strong emphasis", - "shouldFail": false + "section": "Emphasis and strong emphasis" }, { "markdown": "*foo [*bar*](/url)*\n", diff --git a/test/specs/new/em_and_reflinks.html b/test/specs/new/em_and_reflinks.html new file mode 100644 index 0000000000..45953c61e3 --- /dev/null +++ b/test/specs/new/em_and_reflinks.html @@ -0,0 +1,4 @@ +

Helloreflink*topguys!

+

Hello [notreflink] guys*!

+

Hello [notareflink] guys!

+

Helloreflink*bottomguys!

diff --git a/test/specs/new/em_and_reflinks.md b/test/specs/new/em_and_reflinks.md new file mode 100644 index 0000000000..19ddd9df3c --- /dev/null +++ b/test/specs/new/em_and_reflinks.md @@ -0,0 +1,11 @@ +[reflink*top]: theaddress + +*Hello [reflink*top] guys*! + +*Hello [not*reflink] guys*! + +*Hello [not*a*reflink] guys*! + +*Hello [reflink*bottom] guys*! + +[reflink*bottom]: theaddress From 335a6601d4a2d35c5957e5908de7d9cd1f86512c Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Wed, 17 Jun 2020 11:06:25 -0400 Subject: [PATCH 10/24] Added fixes to Strong Fixes examples 391, 397, 399, 400, 401, 431, 443, 475, 476, 479, and 480 --- src/Lexer.js | 2 +- src/Tokenizer.js | 33 +++++++++++++++----- src/rules.js | 10 ++++-- test/specs/commonmark/commonmark.0.29.json | 36 ++++++++-------------- test/specs/gfm/commonmark.0.29.json | 36 ++++++++-------------- 5 files changed, 59 insertions(+), 58 deletions(-) diff --git a/src/Lexer.js b/src/Lexer.js index ebe15cb170..4bb2750dc2 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -360,7 +360,7 @@ module.exports = class Lexer { } // strong - if (token = this.tokenizer.strong(src, prevChar)) { + if (token = this.tokenizer.strong(src, prevChar, this.tokens.links)) { src = src.substring(token.raw.length); token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock); tokens.push(token); diff --git a/src/Tokenizer.js b/src/Tokenizer.js index 1b3acc90a2..2c13adf19f 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -489,14 +489,33 @@ module.exports = class Tokenizer { } } - strong(src, prevChar = '') { - const cap = this.rules.inline.strong.exec(src); + strong(src, prevChar = '', links) { + let cap = this.rules.inline.preStrong.exec(src); + if (cap) { - return { - type: 'strong', - raw: cap[0], - text: cap[4] || cap[3] || cap[2] || cap[1] - }; + let text = src; + if (links) { + links = Object.keys(links); + const reg = /(?:\[.*?\]\[.*?\])|(?:\[.*?\](?!\())/g; + let match; + while ((match = reg.exec(text)) != null) { + if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) { + text = text.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + text.slice(reg.lastIndex); + } + } + } + + cap = this.rules.inline.strong.exec(text); + + if (cap) { + if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) { + return { + type: 'strong', + raw: src.slice(0, cap[0].length), + text: src.slice(2, cap[0].length - 2) + }; + } + } } } diff --git a/src/rules.js b/src/rules.js index 99dba29e8a..ab2f963817 100644 --- a/src/rules.js +++ b/src/rules.js @@ -168,7 +168,8 @@ const inline = { link: /^!?\[(label)\]\(\s*(href)(?:\s+(title))?\s*\)/, reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, - strong: /^__([^\s_])__(?!_)|^\*\*([^\s*])\*\*(?!\*)|^__([^\s][\s\S]*?[^\s])__(?!_)|^\*\*([^\s][\s\S]*?[^\s])\*\*(?!\*)/, + preStrong: /^(?:\*\*)|(?:__)/, + strong: /^(?:(\*\*(?=[`\]\*punctuation]))|\*\*)(?![\s])((?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)*?(??@\[\]\\^_`{|}~])/g; inline._scheme = /[a-zA-Z][a-zA-Z0-9+.-]{1,31}/; diff --git a/test/specs/commonmark/commonmark.0.29.json b/test/specs/commonmark/commonmark.0.29.json index 0be40a1b73..8f24dcacb8 100644 --- a/test/specs/commonmark/commonmark.0.29.json +++ b/test/specs/commonmark/commonmark.0.29.json @@ -3070,8 +3070,7 @@ "example": 379, "start_line": 6570, "end_line": 6574, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "foo**bar**\n", @@ -3170,8 +3169,7 @@ "example": 391, "start_line": 6685, "end_line": 6689, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "*(**foo**)*\n", @@ -3219,8 +3217,7 @@ "example": 397, "start_line": 6742, "end_line": 6746, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "_(__foo__)_\n", @@ -3236,8 +3233,7 @@ "example": 399, "start_line": 6761, "end_line": 6765, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "__пристаням__стремятся\n", @@ -3245,8 +3241,7 @@ "example": 400, "start_line": 6768, "end_line": 6772, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "__foo__bar__baz__\n", @@ -3254,8 +3249,7 @@ "example": 401, "start_line": 6775, "end_line": 6779, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "__(bar)__.\n", @@ -3501,8 +3495,7 @@ "example": 431, "start_line": 7041, "end_line": 7047, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "**foo [*bar*](/url)**\n", @@ -3599,8 +3592,7 @@ "example": 443, "start_line": 7136, "end_line": 7140, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "****foo*\n", @@ -3869,8 +3861,7 @@ "example": 475, "start_line": 7385, "end_line": 7389, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "__\n", @@ -3878,8 +3869,7 @@ "example": 476, "start_line": 7392, "end_line": 7396, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "*a `*`*\n", @@ -3903,8 +3893,7 @@ "example": 479, "start_line": 7413, "end_line": 7417, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "__a\n", @@ -3912,8 +3901,7 @@ "example": 480, "start_line": 7420, "end_line": 7424, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "[link](/uri \"title\")\n", diff --git a/test/specs/gfm/commonmark.0.29.json b/test/specs/gfm/commonmark.0.29.json index 273226e91a..2200dde2cb 100644 --- a/test/specs/gfm/commonmark.0.29.json +++ b/test/specs/gfm/commonmark.0.29.json @@ -3071,8 +3071,7 @@ "example": 379, "start_line": 6570, "end_line": 6574, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "foo**bar**\n", @@ -3171,8 +3170,7 @@ "example": 391, "start_line": 6685, "end_line": 6689, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "*(**foo**)*\n", @@ -3220,8 +3218,7 @@ "example": 397, "start_line": 6742, "end_line": 6746, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "_(__foo__)_\n", @@ -3237,8 +3234,7 @@ "example": 399, "start_line": 6761, "end_line": 6765, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "__пристаням__стремятся\n", @@ -3246,8 +3242,7 @@ "example": 400, "start_line": 6768, "end_line": 6772, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "__foo__bar__baz__\n", @@ -3255,8 +3250,7 @@ "example": 401, "start_line": 6775, "end_line": 6779, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "__(bar)__.\n", @@ -3502,8 +3496,7 @@ "example": 431, "start_line": 7041, "end_line": 7047, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "**foo [*bar*](/url)**\n", @@ -3600,8 +3593,7 @@ "example": 443, "start_line": 7136, "end_line": 7140, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "****foo*\n", @@ -3870,8 +3862,7 @@ "example": 475, "start_line": 7385, "end_line": 7389, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "__\n", @@ -3879,8 +3870,7 @@ "example": 476, "start_line": 7392, "end_line": 7396, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "*a `*`*\n", @@ -3904,8 +3894,7 @@ "example": 479, "start_line": 7413, "end_line": 7417, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "__a\n", @@ -3913,8 +3902,7 @@ "example": 480, "start_line": 7420, "end_line": 7424, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "[link](/uri \"title\")\n", From e926e0cbfeafe532327795a6e75584757c5c235e Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Wed, 17 Jun 2020 11:08:11 -0400 Subject: [PATCH 11/24] Lint... --- src/Tokenizer.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Tokenizer.js b/src/Tokenizer.js index 2c13adf19f..d0c86d3855 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -504,7 +504,7 @@ module.exports = class Tokenizer { } } } - + cap = this.rules.inline.strong.exec(text); if (cap) { From c60c9ba4faf13335554b418529966b037fe42327 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Wed, 17 Jun 2020 16:09:42 -0400 Subject: [PATCH 12/24] Remove extra tests accidentally left in --- test/specs/what/em_left_square_bracket.html | 4 ---- test/specs/what/em_left_square_bracket.md | 10 ---------- test/specs/what/em_left_square_bracket0.html | 1 - test/specs/what/em_left_square_bracket0.md | 1 - test/specs/what/em_left_square_bracket00.html | 1 - test/specs/what/em_left_square_bracket00.md | 1 - test/specs/what/em_left_square_bracket000.html | 1 - test/specs/what/em_left_square_bracket000.md | 1 - test/specs/what/em_left_square_bracket0000.html | 1 - test/specs/what/em_left_square_bracket0000.md | 1 - test/specs/whats/strong_and_em_together.html | 7 ------- test/specs/whats/strong_and_em_together.md | 7 ------- 12 files changed, 36 deletions(-) delete mode 100644 test/specs/what/em_left_square_bracket.html delete mode 100644 test/specs/what/em_left_square_bracket.md delete mode 100644 test/specs/what/em_left_square_bracket0.html delete mode 100644 test/specs/what/em_left_square_bracket0.md delete mode 100644 test/specs/what/em_left_square_bracket00.html delete mode 100644 test/specs/what/em_left_square_bracket00.md delete mode 100644 test/specs/what/em_left_square_bracket000.html delete mode 100644 test/specs/what/em_left_square_bracket000.md delete mode 100644 test/specs/what/em_left_square_bracket0000.html delete mode 100644 test/specs/what/em_left_square_bracket0000.md delete mode 100644 test/specs/whats/strong_and_em_together.html delete mode 100644 test/specs/whats/strong_and_em_together.md diff --git a/test/specs/what/em_left_square_bracket.html b/test/specs/what/em_left_square_bracket.html deleted file mode 100644 index 7132caf872..0000000000 --- a/test/specs/what/em_left_square_bracket.html +++ /dev/null @@ -1,4 +0,0 @@ -

[[punctuation, asterisk, punctuation should work

-

[space, asterisk, punctuation should work

-

pnon-punctuation, asterisk, non-punctuation should work

-

p*[non-punctuation, asterisk, punctuation should NOT work*

diff --git a/test/specs/what/em_left_square_bracket.md b/test/specs/what/em_left_square_bracket.md deleted file mode 100644 index 4f6d7a70de..0000000000 --- a/test/specs/what/em_left_square_bracket.md +++ /dev/null @@ -1,10 +0,0 @@ -[*[punctuation, asterisk, punctuation should work* - - - *[space, asterisk, punctuation should work* - - -p*non-punctuation, asterisk, non-punctuation should work* - - -p*[non-punctuation, asterisk, punctuation should NOT work* diff --git a/test/specs/what/em_left_square_bracket0.html b/test/specs/what/em_left_square_bracket0.html deleted file mode 100644 index 8f88af1862..0000000000 --- a/test/specs/what/em_left_square_bracket0.html +++ /dev/null @@ -1 +0,0 @@ -

foo bar

diff --git a/test/specs/what/em_left_square_bracket0.md b/test/specs/what/em_left_square_bracket0.md deleted file mode 100644 index 300738b412..0000000000 --- a/test/specs/what/em_left_square_bracket0.md +++ /dev/null @@ -1 +0,0 @@ -*foo *bar** diff --git a/test/specs/what/em_left_square_bracket00.html b/test/specs/what/em_left_square_bracket00.html deleted file mode 100644 index 67ef9766e2..0000000000 --- a/test/specs/what/em_left_square_bracket00.html +++ /dev/null @@ -1 +0,0 @@ -

foo bar baz

diff --git a/test/specs/what/em_left_square_bracket00.md b/test/specs/what/em_left_square_bracket00.md deleted file mode 100644 index bc4e9514af..0000000000 --- a/test/specs/what/em_left_square_bracket00.md +++ /dev/null @@ -1 +0,0 @@ -*foo **bar** baz* diff --git a/test/specs/what/em_left_square_bracket000.html b/test/specs/what/em_left_square_bracket000.html deleted file mode 100644 index 2a71b393e9..0000000000 --- a/test/specs/what/em_left_square_bracket000.html +++ /dev/null @@ -1 +0,0 @@ -

foo _

diff --git a/test/specs/what/em_left_square_bracket000.md b/test/specs/what/em_left_square_bracket000.md deleted file mode 100644 index 7bc24b7d2e..0000000000 --- a/test/specs/what/em_left_square_bracket000.md +++ /dev/null @@ -1 +0,0 @@ -foo *_* diff --git a/test/specs/what/em_left_square_bracket0000.html b/test/specs/what/em_left_square_bracket0000.html deleted file mode 100644 index cd620e6acf..0000000000 --- a/test/specs/what/em_left_square_bracket0000.html +++ /dev/null @@ -1 +0,0 @@ -

(foo)

diff --git a/test/specs/what/em_left_square_bracket0000.md b/test/specs/what/em_left_square_bracket0000.md deleted file mode 100644 index 261a3189a0..0000000000 --- a/test/specs/what/em_left_square_bracket0000.md +++ /dev/null @@ -1 +0,0 @@ -*(**foo**)* diff --git a/test/specs/whats/strong_and_em_together.html b/test/specs/whats/strong_and_em_together.html deleted file mode 100644 index 71ec78c709..0000000000 --- a/test/specs/whats/strong_and_em_together.html +++ /dev/null @@ -1,7 +0,0 @@ -

This is strong and em.

- -

So is this word.

- -

This is strong and em.

- -

So is this word.

diff --git a/test/specs/whats/strong_and_em_together.md b/test/specs/whats/strong_and_em_together.md deleted file mode 100644 index 95ee690dbe..0000000000 --- a/test/specs/whats/strong_and_em_together.md +++ /dev/null @@ -1,7 +0,0 @@ -***This is strong and em.*** - -So is ***this*** word. - -___This is strong and em.___ - -So is ___this___ word. From 54218fe1644cf216a014fa15efa51e4ab1ccf056 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Wed, 17 Jun 2020 17:09:18 -0400 Subject: [PATCH 13/24] Remove straggling "shouldfail: false" --- test/specs/gfm/commonmark.0.29.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/specs/gfm/commonmark.0.29.json b/test/specs/gfm/commonmark.0.29.json index 2200dde2cb..8b53f95a0d 100644 --- a/test/specs/gfm/commonmark.0.29.json +++ b/test/specs/gfm/commonmark.0.29.json @@ -2974,8 +2974,7 @@ "example": 367, "start_line": 6455, "end_line": 6459, - "section": "Emphasis and strong emphasis", - "shouldFail":false + "section": "Emphasis and strong emphasis" }, { "markdown": "*(*foo*)*\n", From 2a45677bf054b818bee4a8261f6c3cc93f5639c5 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Thu, 18 Jun 2020 10:00:27 -0400 Subject: [PATCH 14/24] Remove redundant regex symbols Found a few more cases of redundant symbols in addition to fixes suggested during review. --- src/rules.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/rules.js b/src/rules.js index ab2f963817..3b081227cf 100644 --- a/src/rules.js +++ b/src/rules.js @@ -168,16 +168,16 @@ const inline = { link: /^!?\[(label)\]\(\s*(href)(?:\s+(title))?\s*\)/, reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, - preStrong: /^(?:\*\*)|(?:__)/, - strong: /^(?:(\*\*(?=[`\]\*punctuation]))|\*\*)(?![\s])((?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)*?(? Date: Sat, 20 Jun 2020 10:25:48 -0500 Subject: [PATCH 15/24] mask reflinks --- src/Tokenizer.js | 40 +++++++++++++++++----------------------- src/rules.js | 6 ++++++ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/Tokenizer.js b/src/Tokenizer.js index d0c86d3855..3b678e124f 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -58,6 +58,21 @@ function indentCodeCompensation(raw, text) { .join('\n'); } +function maskReflinks(text, links) { + if (links) { + links = Object.keys(links).filter(l => l.match(/[*_]/)); + if (links.length > 0) { + let match; + while ((match = this.rules.inline.reflinkSearch.exec(text)) != null) { + if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) { + text = text.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + text.slice(this.rules.inline.reflinkSearch.lastIndex); + } + } + } + } + return text; +} + /** * Tokenizer */ @@ -493,17 +508,7 @@ module.exports = class Tokenizer { let cap = this.rules.inline.preStrong.exec(src); if (cap) { - let text = src; - if (links) { - links = Object.keys(links); - const reg = /(?:\[.*?\]\[.*?\])|(?:\[.*?\](?!\())/g; - let match; - while ((match = reg.exec(text)) != null) { - if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) { - text = text.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + text.slice(reg.lastIndex); - } - } - } + const text = maskReflinks(src, links); cap = this.rules.inline.strong.exec(text); @@ -523,18 +528,7 @@ module.exports = class Tokenizer { let cap = this.rules.inline.preEm.exec(src); if (cap) { - let text = src; - - if (links) { - links = Object.keys(links); - const reg = /(?:\[.*?\]\[.*?\])|(?:\[.*?\](?!\())/g; - let match; - while ((match = reg.exec(text)) != null) { - if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) { - text = text.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + text.slice(reg.lastIndex); - } - } - } + const text = maskReflinks(src, links); cap = this.rules.inline.em.exec(text); diff --git a/src/rules.js b/src/rules.js index 3b081227cf..19b420e97f 100644 --- a/src/rules.js +++ b/src/rules.js @@ -168,6 +168,7 @@ const inline = { link: /^!?\[(label)\]\(\s*(href)(?:\s+(title))?\s*\)/, reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, + reflinkSearch: 'reflink|nolink(?!\\()', preStrong: /^(?:\*\*|__)/, strong: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])((?:(?:(?!emSkip)(?:[^*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^*]|[\\\s]\*)|emSkip)*?(? Date: Tue, 30 Jun 2020 17:49:11 -0400 Subject: [PATCH 16/24] Links are masked only once per inline string --- src/Lexer.js | 18 ++++++++++++++++-- src/Tokenizer.js | 29 ++++++----------------------- test/specs/new/em_and_reflinks.html | 1 + test/specs/new/em_and_reflinks.md | 4 ++++ 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/Lexer.js b/src/Lexer.js index 4bb2750dc2..587fb9b03c 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -322,6 +322,20 @@ module.exports = class Lexer { inlineTokens(src, tokens = [], inLink = false, inRawBlock = false, prevChar = '') { let token; + // String with links masked to avoid interference with em and strong + let maskedSrc = src; + if (this.tokens.links) { + const links = Object.keys(this.tokens.links); + if (links.length > 0) { + let match; + while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) != null) { + if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) { + maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex); + } + } + } + } + while (src) { // escape if (token = this.tokenizer.escape(src)) { @@ -360,7 +374,7 @@ module.exports = class Lexer { } // strong - if (token = this.tokenizer.strong(src, prevChar, this.tokens.links)) { + if (token = this.tokenizer.strong(src, maskedSrc, prevChar)) { src = src.substring(token.raw.length); token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock); tokens.push(token); @@ -368,7 +382,7 @@ module.exports = class Lexer { } // em - if (token = this.tokenizer.em(src, prevChar, this.tokens.links)) { + if (token = this.tokenizer.em(src, maskedSrc, prevChar)) { src = src.substring(token.raw.length); token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock); tokens.push(token); diff --git a/src/Tokenizer.js b/src/Tokenizer.js index 3b678e124f..419b8bb711 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -58,21 +58,6 @@ function indentCodeCompensation(raw, text) { .join('\n'); } -function maskReflinks(text, links) { - if (links) { - links = Object.keys(links).filter(l => l.match(/[*_]/)); - if (links.length > 0) { - let match; - while ((match = this.rules.inline.reflinkSearch.exec(text)) != null) { - if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) { - text = text.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + text.slice(this.rules.inline.reflinkSearch.lastIndex); - } - } - } - } - return text; -} - /** * Tokenizer */ @@ -504,13 +489,12 @@ module.exports = class Tokenizer { } } - strong(src, prevChar = '', links) { + strong(src, maskedSrc, prevChar = '') { let cap = this.rules.inline.preStrong.exec(src); if (cap) { - const text = maskReflinks(src, links); - - cap = this.rules.inline.strong.exec(text); + maskedSrc = maskedSrc.slice(-1*src.length); + cap = this.rules.inline.strong.exec(maskedSrc); if (cap) { if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) { @@ -524,13 +508,12 @@ module.exports = class Tokenizer { } } - em(src, prevChar = '', links) { + em(src, maskedSrc, prevChar = '') { let cap = this.rules.inline.preEm.exec(src); if (cap) { - const text = maskReflinks(src, links); - - cap = this.rules.inline.em.exec(text); + maskedSrc = maskedSrc.slice(-1*src.length); + cap = this.rules.inline.em.exec(maskedSrc); if (cap) { if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) { diff --git a/test/specs/new/em_and_reflinks.html b/test/specs/new/em_and_reflinks.html index 45953c61e3..32eb3d41aa 100644 --- a/test/specs/new/em_and_reflinks.html +++ b/test/specs/new/em_and_reflinks.html @@ -2,3 +2,4 @@

Hello [notreflink] guys*!

Hello [notareflink] guys!

Helloreflink*bottomguys!

+

Helloreflinknoemguys!

diff --git a/test/specs/new/em_and_reflinks.md b/test/specs/new/em_and_reflinks.md index 19ddd9df3c..09c9b66bb3 100644 --- a/test/specs/new/em_and_reflinks.md +++ b/test/specs/new/em_and_reflinks.md @@ -8,4 +8,8 @@ *Hello [reflink*bottom] guys*! +*Hello [reflinknoem] guys*! + [reflink*bottom]: theaddress + +[reflinknoem]: theaddress From 4e7902ec11639ef02bea37f8d50b8416b8ce31e5 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Tue, 30 Jun 2020 17:50:19 -0400 Subject: [PATCH 17/24] Gaaaah lint --- src/Tokenizer.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Tokenizer.js b/src/Tokenizer.js index 419b8bb711..410255acc7 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -493,7 +493,7 @@ module.exports = class Tokenizer { let cap = this.rules.inline.preStrong.exec(src); if (cap) { - maskedSrc = maskedSrc.slice(-1*src.length); + maskedSrc = maskedSrc.slice(-1 * src.length); cap = this.rules.inline.strong.exec(maskedSrc); if (cap) { @@ -512,7 +512,7 @@ module.exports = class Tokenizer { let cap = this.rules.inline.preEm.exec(src); if (cap) { - maskedSrc = maskedSrc.slice(-1*src.length); + maskedSrc = maskedSrc.slice(-1 * src.length); cap = this.rules.inline.em.exec(maskedSrc); if (cap) { From bd4f8c464befad2b304d51e33e89e567326e62e0 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Thu, 2 Jul 2020 11:59:31 -0400 Subject: [PATCH 18/24] Fix unrestricted "any character" for REDOS And remove redundant unused capture group. --- src/rules.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rules.js b/src/rules.js index 19b420e97f..6c9269ef6f 100644 --- a/src/rules.js +++ b/src/rules.js @@ -173,7 +173,7 @@ const inline = { strong: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])((?:(?:(?!emSkip)(?:[^*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^*]|[\\\s]\*)|emSkip)*?(??@\\[\\]`^{|}~'; inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex(); // sequences em should skip over [title](link), `code`, -inline._emSkip = '\\[.*?\\]\\(.*?\\)|`.*?`|<.*?>'; +inline._emSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>'; inline.em = edit(inline.em) .replace(/punctuation/g, inline._punctuation) From 211b9f9a201df6846c3943a403064ab9d13ac146 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Wed, 8 Jul 2020 16:00:12 -0400 Subject: [PATCH 19/24] Removed Lookbehinds Beginning and End delimiters for EM and Strong must be searched in a separate regex to work without lookbehinds. This invalidates the regex that skips over blocks (code, html, etc.) that take precedence over EM or Strong blocks. Getting around this means we must now mask not only reflinks, but all enclosed blocks which were previously just skipped over in the Regex. Add one check for overlapping Strong block when testing EM, now passes Commonmark 390 and 471 --- src/Lexer.js | 8 ++++- src/Tokenizer.js | 40 ++++++++++++++++++---- src/rules.js | 38 +++++++++++++++++--- test/specs/commonmark/commonmark.0.29.json | 6 ++-- test/specs/gfm/commonmark.0.29.json | 6 ++-- 5 files changed, 78 insertions(+), 20 deletions(-) diff --git a/src/Lexer.js b/src/Lexer.js index 587fb9b03c..d04a4e6f74 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -324,10 +324,12 @@ module.exports = class Lexer { // String with links masked to avoid interference with em and strong let maskedSrc = src; + let match; + + // Mask out reflinks if (this.tokens.links) { const links = Object.keys(this.tokens.links); if (links.length > 0) { - let match; while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) != null) { if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) { maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex); @@ -335,6 +337,10 @@ module.exports = class Lexer { } } } + // Mask out other blocks + while ((match = this.tokenizer.rules.inline.emSkip.exec(maskedSrc)) != null) { + maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.emSkip.lastIndex); + } while (src) { // escape diff --git a/src/Tokenizer.js b/src/Tokenizer.js index 410255acc7..9d7ca9cce5 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -490,11 +490,25 @@ module.exports = class Tokenizer { } strong(src, maskedSrc, prevChar = '') { - let cap = this.rules.inline.preStrong.exec(src); + let match = this.rules.inline.strStart.exec(src); - if (cap) { + if (match) { maskedSrc = maskedSrc.slice(-1 * src.length); - cap = this.rules.inline.strong.exec(maskedSrc); + let strEnd; + + if(match[0] == "**") + strEnd = this.rules.inline.strEndAst; + else + strEnd = this.rules.inline.strEndUnd; + + strEnd.lastIndex = 0; + + let cap; + while ((match = strEnd.exec(maskedSrc)) != null) { + cap = this.rules.inline.strong.exec(maskedSrc.slice(0,match.index+3)); + if (cap) + break; + } if (cap) { if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) { @@ -509,11 +523,25 @@ module.exports = class Tokenizer { } em(src, maskedSrc, prevChar = '') { - let cap = this.rules.inline.preEm.exec(src); + let match = this.rules.inline.emStart.exec(src); - if (cap) { + if (match) { maskedSrc = maskedSrc.slice(-1 * src.length); - cap = this.rules.inline.em.exec(maskedSrc); + let emEnd; + + if(match[0] == "*") + emEnd = this.rules.inline.emEndAst; + else + emEnd = this.rules.inline.emEndUnd; + + emEnd.lastIndex = 0; + + let cap; + while ((match = emEnd.exec(maskedSrc)) != null) { + cap = this.rules.inline.em.exec(maskedSrc.slice(0,match.index+2)); + if (cap) + break; + } if (cap) { if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) { diff --git a/src/rules.js b/src/rules.js index 6c9269ef6f..cd4de697de 100644 --- a/src/rules.js +++ b/src/rules.js @@ -169,11 +169,15 @@ const inline = { reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, reflinkSearch: 'reflink|nolink(?!\\()', - preStrong: /^(?:\*\*|__)/, - strong: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])((?:(?:(?!emSkip)(?:[^*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^*]|[\\\s]\*)|emSkip)*?(? inline._emSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>'; +inline._strSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>'; +inline._evSkip = '__[^_]*?__'; inline.em = edit(inline.em) .replace(/punctuation/g, inline._punctuation) - .replace(/emSkip/g, inline._emSkip) + .replace(/evSkip/g, inline._evSkip) + .getRegex(); + +inline.emEndAst = edit(inline.emEndAst, 'g') + .replace(/punctuation/g, inline._punctuation) + .getRegex(); + +inline.emEndUnd = edit(inline.emEndUnd, 'g') + .replace(/punctuation/g, inline._punctuation) + .getRegex(); + +inline.emSkip = edit(inline._emSkip, 'g') + .getRegex(); + +inline.evSkip = edit(inline._evSkip, 'g') .getRegex(); inline.strong = edit(inline.strong) @@ -199,6 +219,14 @@ inline.strong = edit(inline.strong) .replace(/emSkip/g, inline._emSkip) .getRegex(); +inline.strEndAst = edit(inline.strEndAst, 'g') + .replace(/punctuation/g, inline._punctuation) + .getRegex(); + +inline.strEndUnd = edit(inline.strEndUnd, 'g') + .replace(/punctuation/g, inline._punctuation) + .getRegex(); + inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g; inline._scheme = /[a-zA-Z][a-zA-Z0-9+.-]{1,31}/; diff --git a/test/specs/commonmark/commonmark.0.29.json b/test/specs/commonmark/commonmark.0.29.json index 8f24dcacb8..b49e4ac4a0 100644 --- a/test/specs/commonmark/commonmark.0.29.json +++ b/test/specs/commonmark/commonmark.0.29.json @@ -3160,8 +3160,7 @@ "example": 390, "start_line": 6672, "end_line": 6676, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "**(**foo)\n", @@ -3828,8 +3827,7 @@ "example": 471, "start_line": 7355, "end_line": 7359, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "*[bar*](/url)\n", diff --git a/test/specs/gfm/commonmark.0.29.json b/test/specs/gfm/commonmark.0.29.json index 8b53f95a0d..192186f19f 100644 --- a/test/specs/gfm/commonmark.0.29.json +++ b/test/specs/gfm/commonmark.0.29.json @@ -3160,8 +3160,7 @@ "example": 390, "start_line": 6672, "end_line": 6676, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "**(**foo)\n", @@ -3828,8 +3827,7 @@ "example": 471, "start_line": 7355, "end_line": 7359, - "section": "Emphasis and strong emphasis", - "shouldFail": true + "section": "Emphasis and strong emphasis" }, { "markdown": "*[bar*](/url)\n", From cc778ade42ec052f0b28315551d67a57b4681944 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Wed, 8 Jul 2020 16:58:58 -0400 Subject: [PATCH 20/24] Removed redundancy in "startEM" check --- src/Lexer.js | 4 ++-- src/Tokenizer.js | 30 +++++++++++++----------------- src/rules.js | 39 +++++++++++++++++++++++---------------- 3 files changed, 38 insertions(+), 35 deletions(-) diff --git a/src/Lexer.js b/src/Lexer.js index d04a4e6f74..bef990b5bc 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -338,8 +338,8 @@ module.exports = class Lexer { } } // Mask out other blocks - while ((match = this.tokenizer.rules.inline.emSkip.exec(maskedSrc)) != null) { - maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.emSkip.lastIndex); + while ((match = this.tokenizer.rules.inline.blockSkip.exec(maskedSrc)) != null) { + maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex); } while (src) { diff --git a/src/Tokenizer.js b/src/Tokenizer.js index 9d7ca9cce5..452a04bf22 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -491,8 +491,8 @@ module.exports = class Tokenizer { strong(src, maskedSrc, prevChar = '') { let match = this.rules.inline.strStart.exec(src); - - if (match) { + + if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) { maskedSrc = maskedSrc.slice(-1 * src.length); let strEnd; @@ -511,13 +511,11 @@ module.exports = class Tokenizer { } if (cap) { - if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) { - return { - type: 'strong', - raw: src.slice(0, cap[0].length), - text: src.slice(2, cap[0].length - 2) - }; - } + return { + type: 'strong', + raw: src.slice(0, cap[0].length), + text: src.slice(2, cap[0].length - 2) + }; } } } @@ -525,7 +523,7 @@ module.exports = class Tokenizer { em(src, maskedSrc, prevChar = '') { let match = this.rules.inline.emStart.exec(src); - if (match) { + if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) { maskedSrc = maskedSrc.slice(-1 * src.length); let emEnd; @@ -544,13 +542,11 @@ module.exports = class Tokenizer { } if (cap) { - if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) { - return { - type: 'em', - raw: src.slice(0, cap[0].length), - text: src.slice(1, cap[0].length - 1) - }; - } + return { + type: 'em', + raw: src.slice(0, cap[0].length), + text: src.slice(1, cap[0].length - 1) + }; } } } diff --git a/src/rules.js b/src/rules.js index cd4de697de..f79b74c1e4 100644 --- a/src/rules.js +++ b/src/rules.js @@ -169,15 +169,15 @@ const inline = { reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, reflinkSearch: 'reflink|nolink(?!\\()', - strStart: /^\*\*|__/, - strEndAst: /[^punctuation\s]\*\*(?!\*)|[punctuation]\*\*(?!\*)(?:(?=[punctuation\s]|$))/, - strEndUnd: /[^\s]__(?!_)(?:(?=[punctuation\s])|$)/, - strong: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])((?:(?:(?!evSkip)(?:[^*]|\\\*)|evSkip)|(?:(?:(?!evSkip)(?:[^*]|\\\*)|evSkip)*?(??@\\[\\]`^{|}~'; inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex(); // sequences em should skip over [title](link), `code`, -inline._emSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>'; -inline._strSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>'; -inline._evSkip = '__[^_]*?__'; +inline._blockSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>'; +inline._overlapSkip = '__[^_]*?__|\\*\\*\\[^\\*\\]*?\\*\\*'; inline.em = edit(inline.em) .replace(/punctuation/g, inline._punctuation) - .replace(/evSkip/g, inline._evSkip) + .replace(/overlapSkip/g, inline._overlapSkip) .getRegex(); +inline.emStart = edit(inline.emStart) + .replace(/punctuation/g, inline._punctuation) + .getRegex(); + inline.emEndAst = edit(inline.emEndAst, 'g') .replace(/punctuation/g, inline._punctuation) .getRegex(); @@ -208,17 +211,21 @@ inline.emEndUnd = edit(inline.emEndUnd, 'g') .replace(/punctuation/g, inline._punctuation) .getRegex(); -inline.emSkip = edit(inline._emSkip, 'g') +inline.blockSkip = edit(inline._blockSkip, 'g') .getRegex(); -inline.evSkip = edit(inline._evSkip, 'g') +inline.overlapSkip = edit(inline._overlapSkip, 'g') .getRegex(); inline.strong = edit(inline.strong) .replace(/punctuation/g, inline._punctuation) - .replace(/emSkip/g, inline._emSkip) + .replace(/blockSkip/g, inline._blockSkip) .getRegex(); +inline.strStart = edit(inline.strStart) + .replace(/punctuation/g, inline._punctuation) + .getRegex(); + inline.strEndAst = edit(inline.strEndAst, 'g') .replace(/punctuation/g, inline._punctuation) .getRegex(); From 226bbe70b70dc325232be5606b152a62a3f09487 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Wed, 8 Jul 2020 17:01:42 -0400 Subject: [PATCH 21/24] Lint --- src/Tokenizer.js | 50 ++++++++++++++++++------------------------------ src/rules.js | 10 +++++----- 2 files changed, 24 insertions(+), 36 deletions(-) diff --git a/src/Tokenizer.js b/src/Tokenizer.js index 452a04bf22..e331c8baf9 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -491,31 +491,25 @@ module.exports = class Tokenizer { strong(src, maskedSrc, prevChar = '') { let match = this.rules.inline.strStart.exec(src); - + if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) { maskedSrc = maskedSrc.slice(-1 * src.length); let strEnd; - if(match[0] == "**") - strEnd = this.rules.inline.strEndAst; - else - strEnd = this.rules.inline.strEndUnd; + if (match[0] === '**') { strEnd = this.rules.inline.strEndAst; } else { strEnd = this.rules.inline.strEndUnd; } strEnd.lastIndex = 0; let cap; while ((match = strEnd.exec(maskedSrc)) != null) { - cap = this.rules.inline.strong.exec(maskedSrc.slice(0,match.index+3)); - if (cap) - break; - } - - if (cap) { - return { - type: 'strong', - raw: src.slice(0, cap[0].length), - text: src.slice(2, cap[0].length - 2) - }; + cap = this.rules.inline.strong.exec(maskedSrc.slice(0, match.index + 3)); + if (cap) { + return { + type: 'strong', + raw: src.slice(0, cap[0].length), + text: src.slice(2, cap[0].length - 2) + }; + } } } } @@ -527,26 +521,20 @@ module.exports = class Tokenizer { maskedSrc = maskedSrc.slice(-1 * src.length); let emEnd; - if(match[0] == "*") - emEnd = this.rules.inline.emEndAst; - else - emEnd = this.rules.inline.emEndUnd; + if (match[0] === '*') { emEnd = this.rules.inline.emEndAst; } else { emEnd = this.rules.inline.emEndUnd; } emEnd.lastIndex = 0; let cap; while ((match = emEnd.exec(maskedSrc)) != null) { - cap = this.rules.inline.em.exec(maskedSrc.slice(0,match.index+2)); - if (cap) - break; - } - - if (cap) { - return { - type: 'em', - raw: src.slice(0, cap[0].length), - text: src.slice(1, cap[0].length - 1) - }; + cap = this.rules.inline.em.exec(maskedSrc.slice(0, match.index + 2)); + if (cap) { + return { + type: 'em', + raw: src.slice(0, cap[0].length), + text: src.slice(1, cap[0].length - 1) + }; + } } } } diff --git a/src/rules.js b/src/rules.js index f79b74c1e4..63b99a8dec 100644 --- a/src/rules.js +++ b/src/rules.js @@ -176,7 +176,7 @@ const inline = { emStart: /^(?:(\*(?=[punctuation]))|\*)(?![*\s])|_/, // (1) returns if starts w/ punctuation emEndAst: /[^punctuation\s]\*(?!\*)|[punctuation]\*(?!\*)(?:(?=[punctuation\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline) emEndUnd: /[^\s]_(?!_)(?:(?=[punctuation\s])|$)/, // last char can't be a space, and final _ must preceed punct or \s (or endline) - // ⬐ skip overlapping Strong ⬐repeat logic for inner *'s (must be in pairs)| Underscores ⬐ skip overlapping Strong ⬐repeat logic for inner _'s (must be in pairs)⬎ + // ⬐ skip overlapping Strong ⬐repeat logic for inner *'s (must be in pairs)| Underscores ⬐ skip overlapping Strong ⬐repeat logic for inner _'s (must be in pairs)⬎ em: /^\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*$|^_(?![_\s])(?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?_$/, code: /^(`+)([^`]|[^`][\s\S]*?[^`])\1(?!`)/, br: /^( {2,}|\\)\n(?!\s*$)/, @@ -200,8 +200,8 @@ inline.em = edit(inline.em) .getRegex(); inline.emStart = edit(inline.emStart) - .replace(/punctuation/g, inline._punctuation) - .getRegex(); + .replace(/punctuation/g, inline._punctuation) + .getRegex(); inline.emEndAst = edit(inline.emEndAst, 'g') .replace(/punctuation/g, inline._punctuation) @@ -223,8 +223,8 @@ inline.strong = edit(inline.strong) .getRegex(); inline.strStart = edit(inline.strStart) - .replace(/punctuation/g, inline._punctuation) - .getRegex(); + .replace(/punctuation/g, inline._punctuation) + .getRegex(); inline.strEndAst = edit(inline.strEndAst, 'g') .replace(/punctuation/g, inline._punctuation) From 1fb141d2755d9a6081fbc608d207ad894a42258a Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Thu, 9 Jul 2020 10:53:48 -0400 Subject: [PATCH 22/24] Make strEnd const Co-authored-by: Tony Brix --- src/Tokenizer.js | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Tokenizer.js b/src/Tokenizer.js index e331c8baf9..9c2e974875 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -494,9 +494,7 @@ module.exports = class Tokenizer { if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) { maskedSrc = maskedSrc.slice(-1 * src.length); - let strEnd; - - if (match[0] === '**') { strEnd = this.rules.inline.strEndAst; } else { strEnd = this.rules.inline.strEndUnd; } + const strEnd = match[0] === '**' ? this.rules.inline.strEndAst : this.rules.inline.strEndUnd; strEnd.lastIndex = 0; From ad720c1cba4e5cb884785f4d4550e7fadb8d3be1 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Thu, 9 Jul 2020 10:54:08 -0400 Subject: [PATCH 23/24] Make emEnd const Co-authored-by: Tony Brix --- src/Tokenizer.js | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Tokenizer.js b/src/Tokenizer.js index 9c2e974875..80d9f58398 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -517,9 +517,7 @@ module.exports = class Tokenizer { if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) { maskedSrc = maskedSrc.slice(-1 * src.length); - let emEnd; - - if (match[0] === '*') { emEnd = this.rules.inline.emEndAst; } else { emEnd = this.rules.inline.emEndUnd; } + const emEnd = match[0] === '*' ? this.rules.inline.emEndAst : this.rules.inline.emEndUnd; emEnd.lastIndex = 0; From e27e6f960f0b5a052e6fde496a7109a5acaf9e27 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Thu, 9 Jul 2020 19:35:22 -0400 Subject: [PATCH 24/24] Sorted strong and em into sub-objects --- src/Tokenizer.js | 24 +++++++++--------- src/rules.js | 63 +++++++++++++++++++++++++++++------------------- 2 files changed, 50 insertions(+), 37 deletions(-) diff --git a/src/Tokenizer.js b/src/Tokenizer.js index e331c8baf9..c7d22c8869 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -490,19 +490,19 @@ module.exports = class Tokenizer { } strong(src, maskedSrc, prevChar = '') { - let match = this.rules.inline.strStart.exec(src); + let match = this.rules.inline.strong.start.exec(src); if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) { maskedSrc = maskedSrc.slice(-1 * src.length); - let strEnd; + let endReg; - if (match[0] === '**') { strEnd = this.rules.inline.strEndAst; } else { strEnd = this.rules.inline.strEndUnd; } + if (match[0] === '**') { endReg = this.rules.inline.strong.endAst; } else { endReg = this.rules.inline.strong.endUnd; } - strEnd.lastIndex = 0; + endReg.lastIndex = 0; let cap; - while ((match = strEnd.exec(maskedSrc)) != null) { - cap = this.rules.inline.strong.exec(maskedSrc.slice(0, match.index + 3)); + while ((match = endReg.exec(maskedSrc)) != null) { + cap = this.rules.inline.strong.middle.exec(maskedSrc.slice(0, match.index + 3)); if (cap) { return { type: 'strong', @@ -515,19 +515,19 @@ module.exports = class Tokenizer { } em(src, maskedSrc, prevChar = '') { - let match = this.rules.inline.emStart.exec(src); + let match = this.rules.inline.em.start.exec(src); if (match && (!match[1] || (match[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar))))) { maskedSrc = maskedSrc.slice(-1 * src.length); - let emEnd; + let endReg; - if (match[0] === '*') { emEnd = this.rules.inline.emEndAst; } else { emEnd = this.rules.inline.emEndUnd; } + if (match[0] === '*') { endReg = this.rules.inline.em.endAst; } else { endReg = this.rules.inline.em.endUnd; } - emEnd.lastIndex = 0; + endReg.lastIndex = 0; let cap; - while ((match = emEnd.exec(maskedSrc)) != null) { - cap = this.rules.inline.em.exec(maskedSrc.slice(0, match.index + 2)); + while ((match = endReg.exec(maskedSrc)) != null) { + cap = this.rules.inline.em.middle.exec(maskedSrc.slice(0, match.index + 2)); if (cap) { return { type: 'em', diff --git a/src/rules.js b/src/rules.js index 63b99a8dec..d4a67278b5 100644 --- a/src/rules.js +++ b/src/rules.js @@ -169,15 +169,18 @@ const inline = { reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/, nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/, reflinkSearch: 'reflink|nolink(?!\\()', - strStart: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])|__/, // (1) returns if starts w/ punctuation - strEndAst: /[^punctuation\s]\*\*(?!\*)|[punctuation]\*\*(?!\*)(?:(?=[punctuation\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline) - strEndUnd: /[^\s]__(?!_)(?:(?=[punctuation\s])|$)/, // last char can't be a space, and final _ must preceed punct or \s (or endline) - strong: /^\*\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*\*$|^__(?![\s])((?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?)__$/, - emStart: /^(?:(\*(?=[punctuation]))|\*)(?![*\s])|_/, // (1) returns if starts w/ punctuation - emEndAst: /[^punctuation\s]\*(?!\*)|[punctuation]\*(?!\*)(?:(?=[punctuation\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline) - emEndUnd: /[^\s]_(?!_)(?:(?=[punctuation\s])|$)/, // last char can't be a space, and final _ must preceed punct or \s (or endline) - // ⬐ skip overlapping Strong ⬐repeat logic for inner *'s (must be in pairs)| Underscores ⬐ skip overlapping Strong ⬐repeat logic for inner _'s (must be in pairs)⬎ - em: /^\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*$|^_(?![_\s])(?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?_$/, + strong: { + start: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])|__/, // (1) returns if starts w/ punctuation + middle: /^\*\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*\*$|^__(?![\s])((?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?)__$/, + endAst: /[^punctuation\s]\*\*(?!\*)|[punctuation]\*\*(?!\*)(?:(?=[punctuation\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline) + endUnd: /[^\s]__(?!_)(?:(?=[punctuation\s])|$)/ // last char can't be a space, and final _ must preceed punct or \s (or endline) + }, + em: { + start: /^(?:(\*(?=[punctuation]))|\*)(?![*\s])|_/, // (1) returns if starts w/ punctuation + middle: /^\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*$|^_(?![_\s])(?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?_$/, + endAst: /[^punctuation\s]\*(?!\*)|[punctuation]\*(?!\*)(?:(?=[punctuation\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline) + endUnd: /[^\s]_(?!_)(?:(?=[punctuation\s])|$)/ // last char can't be a space, and final _ must preceed punct or \s (or endline) + }, code: /^(`+)([^`]|[^`][\s\S]*?[^`])\1(?!`)/, br: /^( {2,}|\\)\n(?!\s*$)/, del: noopTest, @@ -194,44 +197,44 @@ inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._pu inline._blockSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>'; inline._overlapSkip = '__[^_]*?__|\\*\\*\\[^\\*\\]*?\\*\\*'; -inline.em = edit(inline.em) +inline.em.start = edit(inline.em.start) .replace(/punctuation/g, inline._punctuation) - .replace(/overlapSkip/g, inline._overlapSkip) .getRegex(); -inline.emStart = edit(inline.emStart) +inline.em.middle = edit(inline.em.middle) .replace(/punctuation/g, inline._punctuation) + .replace(/overlapSkip/g, inline._overlapSkip) .getRegex(); -inline.emEndAst = edit(inline.emEndAst, 'g') +inline.em.endAst = edit(inline.em.endAst, 'g') .replace(/punctuation/g, inline._punctuation) .getRegex(); -inline.emEndUnd = edit(inline.emEndUnd, 'g') +inline.em.endUnd = edit(inline.em.endUnd, 'g') .replace(/punctuation/g, inline._punctuation) .getRegex(); -inline.blockSkip = edit(inline._blockSkip, 'g') - .getRegex(); - -inline.overlapSkip = edit(inline._overlapSkip, 'g') +inline.strong.start = edit(inline.strong.start) + .replace(/punctuation/g, inline._punctuation) .getRegex(); -inline.strong = edit(inline.strong) +inline.strong.middle = edit(inline.strong.middle) .replace(/punctuation/g, inline._punctuation) .replace(/blockSkip/g, inline._blockSkip) .getRegex(); -inline.strStart = edit(inline.strStart) +inline.strong.endAst = edit(inline.strong.endAst, 'g') .replace(/punctuation/g, inline._punctuation) .getRegex(); -inline.strEndAst = edit(inline.strEndAst, 'g') +inline.strong.endUnd = edit(inline.strong.endUnd, 'g') .replace(/punctuation/g, inline._punctuation) .getRegex(); -inline.strEndUnd = edit(inline.strEndUnd, 'g') - .replace(/punctuation/g, inline._punctuation) +inline.blockSkip = edit(inline._blockSkip, 'g') + .getRegex(); + +inline.overlapSkip = edit(inline._overlapSkip, 'g') .getRegex(); inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g; @@ -280,8 +283,18 @@ inline.normal = merge({}, inline); */ inline.pedantic = merge({}, inline.normal, { - strong: /^__(?=\S)([\s\S]*?\S)__(?!_)|^\*\*(?=\S)([\s\S]*?\S)\*\*(?!\*)/, - em: /^()\*(?=\S)([\s\S]*?\S)\*(?!\*)|^_(?=\S)([\s\S]*?\S)_(?!_)/, + strong: { + start: /^__|\*\*/, + middle: /^__(?=\S)([\s\S]*?\S)__(?!_)|^\*\*(?=\S)([\s\S]*?\S)\*\*(?!\*)/, + endAst: /\*\*(?!\*)/g, + endUnd: /__(?!_)/g + }, + em: { + start: /^_|\*/, + middle: /^()\*(?=\S)([\s\S]*?\S)\*(?!\*)|^_(?=\S)([\s\S]*?\S)_(?!_)/, + endAst: /\*(?!\*)/g, + endUnd: /_(?!_)/g + }, link: edit(/^!?\[(label)\]\((.*?)\)/) .replace('label', inline._label) .getRegex(),