Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Follow GFM spec on EM and STRONG delimiters #1686

Merged
merged 26 commits into from Jul 13, 2020
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
40493bb
Follow GFM spec on Left-flanking-delimiter-runs
calculuschild May 21, 2020
4e2ec90
Now passes several more tests
calculuschild May 29, 2020
283ab9c
Deleted an extra line while removing comments
calculuschild May 29, 2020
c38ee23
Fix Pedantic
calculuschild May 30, 2020
7c6551e
Properly handle reflinks that should be escaped
calculuschild Jun 12, 2020
bc17ded
Lint
calculuschild Jun 12, 2020
ea203cf
Lint 2
calculuschild Jun 12, 2020
556070b
Updated rules for underscore em
calculuschild Jun 12, 2020
4cbba07
Moved logic into Tokenizer. No longer injecting Reflinks
calculuschild Jun 17, 2020
335a660
Added fixes to Strong
calculuschild Jun 17, 2020
e926e0c
Lint...
calculuschild Jun 17, 2020
c60c9ba
Remove extra tests accidentally left in
calculuschild Jun 17, 2020
54218fe
Remove straggling "shouldfail: false"
calculuschild Jun 17, 2020
2a45677
Remove redundant regex symbols
calculuschild Jun 18, 2020
d233fd5
mask reflinks
UziTech Jun 20, 2020
56b6f5e
Merge pull request #1 from UziTech/mask-reflinks
calculuschild Jun 30, 2020
4db32dc
Links are masked only once per inline string
calculuschild Jun 30, 2020
4e7902e
Gaaaah lint
calculuschild Jun 30, 2020
bd4f8c4
Fix unrestricted "any character" for REDOS
calculuschild Jul 2, 2020
211b9f9
Removed Lookbehinds
calculuschild Jul 8, 2020
cc778ad
Removed redundancy in "startEM" check
calculuschild Jul 8, 2020
226bbe7
Lint
calculuschild Jul 8, 2020
1fb141d
Make strEnd const
calculuschild Jul 9, 2020
ad720c1
Make emEnd const
calculuschild Jul 9, 2020
e27e6f9
Sorted strong and em into sub-objects
calculuschild Jul 9, 2020
6b729ed
Merge branch 'EmphasisFixes' of https://github.com/calculuschild/mark…
calculuschild Jul 9, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 12 additions & 4 deletions src/Lexer.js
@@ -1,6 +1,7 @@
const Tokenizer = require('./Tokenizer.js');
const { defaults } = require('./defaults.js');
const { block, inline } = require('./rules.js');
const { edit } = require('./helpers.js');

/**
* smartypants text replacement
Expand Down Expand Up @@ -102,6 +103,12 @@ module.exports = class Lexer {

this.blockTokens(src, this.tokens, true);

// Insert known reflinks into em rules to properly skip over them
const rep = Object.keys(this.tokens.links).join('|').replace(/\*/g, '\\*');
calculuschild marked this conversation as resolved.
Show resolved Hide resolved
this.tokenizer.rules.inline.em = edit(inline.em)
.replace(/reflink/g, rep)
.getRegex();

this.inline(this.tokens);

return this.tokens;
Expand Down Expand Up @@ -267,7 +274,7 @@ module.exports = class Lexer {
case 'text':
case 'heading': {
token.tokens = [];
this.inlineTokens(token.text, token.tokens);
this.inlineTokens(token.text, token.tokens, undefined, undefined);
calculuschild marked this conversation as resolved.
Show resolved Hide resolved
break;
}
case 'table': {
Expand Down Expand Up @@ -319,7 +326,7 @@ module.exports = class Lexer {
/**
* Lexing/Compiling
*/
inlineTokens(src, tokens = [], inLink = false, inRawBlock = false) {
inlineTokens(src, tokens = [], inLink = false, inRawBlock = false, prevChar = '') {
let token;

while (src) {
Expand Down Expand Up @@ -360,15 +367,15 @@ module.exports = class Lexer {
}

// strong
if (token = this.tokenizer.strong(src)) {
if (token = this.tokenizer.strong(src, prevChar)) {
src = src.substring(token.raw.length);
token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
tokens.push(token);
continue;
}

// em
if (token = this.tokenizer.em(src)) {
if (token = this.tokenizer.em(src, prevChar, this.tokens.links)) {
src = src.substring(token.raw.length);
token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
tokens.push(token);
Expand Down Expand Up @@ -414,6 +421,7 @@ module.exports = class Lexer {
// text
if (token = this.tokenizer.inlineText(src, inRawBlock, smartypants)) {
src = src.substring(token.raw.length);
prevChar = token.raw.slice(-1);
tokens.push(token);
continue;
}
Expand Down
16 changes: 9 additions & 7 deletions src/Tokenizer.js
Expand Up @@ -489,7 +489,7 @@ module.exports = class Tokenizer {
}
}

strong(src) {
strong(src, prevChar = '') {
const cap = this.rules.inline.strong.exec(src);
if (cap) {
return {
Expand All @@ -500,14 +500,16 @@ module.exports = class Tokenizer {
}
}

em(src) {
em(src, prevChar = '') {
const cap = this.rules.inline.em.exec(src);
if (cap) {
return {
type: 'em',
raw: cap[0],
text: cap[6] || cap[5] || cap[4] || cap[3] || cap[2] || cap[1]
};
if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) {
return {
type: 'em',
raw: cap[0],
text: cap[3] || cap[2]
};
}
}
}

Expand Down
23 changes: 16 additions & 7 deletions src/rules.js
Expand Up @@ -169,18 +169,27 @@ const inline = {
reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/,
nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/,
strong: /^__([^\s_])__(?!_)|^\*\*([^\s*])\*\*(?!\*)|^__([^\s][\s\S]*?[^\s])__(?!_)|^\*\*([^\s][\s\S]*?[^\s])\*\*(?!\*)/,
em: /^_([^\s_])_(?!_)|^_([^\s_<][\s\S]*?[^\s_])_(?!_|[^\s,punctuation])|^_([^\s_<][\s\S]*?[^\s])_(?!_|[^\s,punctuation])|^\*([^\s*<\[])\*(?!\*)|^\*([^\s<"][\s\S]*?[^\s\[\*])\*(?![\]`punctuation])|^\*([^\s*"<\[][\s\S]*[^\s])\*(?!\*)/,
// (1) returns if starts w/ punctuation | (2) ⬐Check groups to skip over ⬐ skip if needed ⬐repeat logic for inner *'s (must be in pairs)⬎ ⬐last char can't be punct OR ⬐final * must also be followed by punct (or endline) | (3) Underscores ⬐Check groups to skip over ⬐ skip if needed ⬐repeat logic for inner _'s (must be in pairs)⬎ ⬐last char can't be a space, and final _ must be followed by punct (or endline)
em: /^(?:(\*(?=[`\]punctuation]))|\*)(?![\*\s])((?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^\*]|[\\\s]\*)|emSkip)*?(?<!\\)\*){2})*?)(?:(?<![`\s\]punctuation])\*(?!\*)|(?<=[`\]punctuation])\*(?!\*)(?:(?=[`\s\]punctuation]|$)))|^_(?![_\s])((?:(?:(?!emSkip)(?:[^_]|[\\\s]_)|emSkip)|(?:(?:(?!emSkip)(?:[^_]|[\\\s]_)|emSkip)*?(?<!\\)_){2})*?)(?:(?<![\s])_(?!_)(?:(?=[`\s\]punctuation])|$))/,
code: /^(`+)([^`]|[^`][\s\S]*?[^`])\1(?!`)/,
br: /^( {2,}|\\)\n(?!\s*$)/,
del: noopTest,
text: /^(`+|[^`])(?:[\s\S]*?(?:(?=[\\<!\[`*]|\b_|$)|[^ ](?= {2,}\n))|(?= {2,}\n))/
text: /^(`+|[^`])(?:[\s\S]*?(?:(?=[\\<!\[`*]|\b_|$)|[^ ](?= {2,}\n))|(?= {2,}\n))/,
punctuation: /^(['\s\]punctuation])/
};

// list of punctuation marks from common mark spec
// without ` and ] to workaround Rule 17 (inline code blocks/links)
// without , to work around example 393
inline._punctuation = '!"#$%&\'()*+\\-./:;<=>?@\\[^_{|}~';
inline.em = edit(inline.em).replace(/punctuation/g, inline._punctuation).getRegex();
// without * and _ to workaround cases with double emphasis
inline._punctuation = '!"#$%&\'()+\\-.,/:;<=>?@\\[\\]`^{|}~';
inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex();

// sequences em should skip over [reflink], [title][reflink], [title](link), `code`, <html>
inline._emSkip = '\\[reflink\\]|\\[.*?\\]\\[reflink\\]|\\[.*?\\]\\(.*?\\)|`.*?`|<.*?>';

inline.em = edit(inline.em)
.replace(/punctuation/g, inline._punctuation)
.replace(/emSkip/g, inline._emSkip)
.getRegex();

inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;

Expand Down Expand Up @@ -224,7 +233,7 @@ inline.normal = merge({}, inline);

inline.pedantic = merge({}, inline.normal, {
strong: /^__(?=\S)([\s\S]*?\S)__(?!_)|^\*\*(?=\S)([\s\S]*?\S)\*\*(?!\*)/,
em: /^_(?=\S)([\s\S]*?\S)_(?!_)|^\*(?=\S)([\s\S]*?\S)\*(?!\*)/,
em: /^()\*(?=\S)([\s\S]*?\S)\*(?!\*)|^_(?=\S)([\s\S]*?\S)_(?!_)/,
link: edit(/^!?\[(label)\]\((.*?)\)/)
.replace('label', inline._label)
.getRegex(),
Expand Down
20 changes: 9 additions & 11 deletions test/specs/commonmark/commonmark.0.29.json
Expand Up @@ -2766,7 +2766,7 @@
"start_line": 6003,
"end_line": 6007,
"section": "Code spans",
"shouldFail": true
"shouldFail": false
calculuschild marked this conversation as resolved.
Show resolved Hide resolved
},
{
"markdown": "[not a `link](/foo`)\n",
Expand Down Expand Up @@ -2976,7 +2976,7 @@
"start_line": 6455,
"end_line": 6459,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "*(*foo*)*\n",
Expand All @@ -2985,7 +2985,7 @@
"start_line": 6465,
"end_line": 6469,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "*foo*bar\n",
Expand All @@ -3010,7 +3010,7 @@
"start_line": 6497,
"end_line": 6501,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "_(_foo_)_\n",
Expand All @@ -3019,7 +3019,7 @@
"start_line": 6506,
"end_line": 6510,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "_foo_bar\n",
Expand Down Expand Up @@ -3301,7 +3301,7 @@
"start_line": 6824,
"end_line": 6828,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "__foo_ bar_\n",
Expand Down Expand Up @@ -3394,7 +3394,7 @@
"start_line": 6928,
"end_line": 6932,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "*foo [*bar*](/url)*\n",
Expand Down Expand Up @@ -3589,8 +3589,7 @@
"example": 441,
"start_line": 7122,
"end_line": 7126,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"section": "Emphasis and strong emphasis"
},
{
"markdown": "*foo**\n",
Expand All @@ -3616,8 +3615,7 @@
"example": 444,
"start_line": 7143,
"end_line": 7147,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"section": "Emphasis and strong emphasis"
},
{
"markdown": "**foo***\n",
Expand Down
20 changes: 9 additions & 11 deletions test/specs/gfm/commonmark.0.29.json
Expand Up @@ -2766,7 +2766,7 @@
"start_line": 6003,
"end_line": 6007,
"section": "Code spans",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "[not a `link](/foo`)\n",
Expand Down Expand Up @@ -2976,7 +2976,7 @@
"start_line": 6455,
"end_line": 6459,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail":false
calculuschild marked this conversation as resolved.
Show resolved Hide resolved
},
{
"markdown": "*(*foo*)*\n",
Expand All @@ -2985,7 +2985,7 @@
"start_line": 6465,
"end_line": 6469,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "*foo*bar\n",
Expand All @@ -3010,7 +3010,7 @@
"start_line": 6497,
"end_line": 6501,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "_(_foo_)_\n",
Expand All @@ -3019,7 +3019,7 @@
"start_line": 6506,
"end_line": 6510,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "_foo_bar\n",
Expand Down Expand Up @@ -3301,7 +3301,7 @@
"start_line": 6824,
"end_line": 6828,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "__foo_ bar_\n",
Expand Down Expand Up @@ -3394,7 +3394,7 @@
"start_line": 6928,
"end_line": 6932,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"shouldFail": false
},
{
"markdown": "*foo [*bar*](/url)*\n",
Expand Down Expand Up @@ -3589,8 +3589,7 @@
"example": 441,
"start_line": 7122,
"end_line": 7126,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"section": "Emphasis and strong emphasis"
},
{
"markdown": "*foo**\n",
Expand All @@ -3616,8 +3615,7 @@
"example": 444,
"start_line": 7143,
"end_line": 7147,
"section": "Emphasis and strong emphasis",
"shouldFail": true
"section": "Emphasis and strong emphasis"
},
{
"markdown": "**foo***\n",
Expand Down
6 changes: 1 addition & 5 deletions test/specs/new/em_2char.html
Expand Up @@ -20,10 +20,6 @@

<p>_ 123_</p>

<p><em>1_</em></p>

<p><em>1*</em></p>

<p>It’s levi<em>OH</em>sa, not levio<em>SAH.</em></p>

<p>__ test <a href="https://test.com/_">test</a></p>
<p>__ test <a href="https://test.com/_">test</a></p>
4 changes: 0 additions & 4 deletions test/specs/new/em_2char.md
Expand Up @@ -20,10 +20,6 @@ _123 _

_ 123_

_1__

*1**
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure you want to remove these tests?

It looks like @UziTech added these in PR #1181.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They do not confirm to the commonmark spec, so trying to get them to pass was counterproductive.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can update the output of these two tests to match commonmark's output:

<p><em>1</em>_</p>
<p><em>1</em>*</p>

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could... but this PR does not parse it correctly either. This is essentially Commonmark Example 442 in the test files, which we currently do not pass.

Since it wasn't correct in the earlier version, I feel it would be OK to let this one slide for now (this PR isn't breaking 442 or causing any regressions on things that worked correctly), and work on fixing that test in a future development. Getting this one obscure test to work that wasn't even working before involves some obnoxious workarounds and I don't know if it's worth focusing on it for this PR. Once this is stable I plan to go over more failing test cases and I can start tackling that one.

I hope that makes sense where I'm coming from.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep that makes sense. I think this corner case will probably affect very few people so it can be addressed in a different PR 👍


It’s levi*OH*sa, not levio*SAH.*

__ test [test](https://test.com/_)
4 changes: 4 additions & 0 deletions test/specs/what/em_left_square_bracket.html
@@ -0,0 +1,4 @@
<p>[<em>[punctuation, asterisk, punctuation should work</em></p>
<p><em>[space, asterisk, punctuation should work</em></p>
<p>p<em>non-punctuation, asterisk, non-punctuation should work</em></p>
<p>p*[non-punctuation, asterisk, punctuation should NOT work*</p>
10 changes: 10 additions & 0 deletions test/specs/what/em_left_square_bracket.md
@@ -0,0 +1,10 @@
[*[punctuation, asterisk, punctuation should work*


*[space, asterisk, punctuation should work*


p*non-punctuation, asterisk, non-punctuation should work*


p*[non-punctuation, asterisk, punctuation should NOT work*
1 change: 1 addition & 0 deletions test/specs/what/em_left_square_bracket0.html
@@ -0,0 +1 @@
<p><em>foo <em>bar</em></em></p>
1 change: 1 addition & 0 deletions test/specs/what/em_left_square_bracket0.md
@@ -0,0 +1 @@
*foo *bar**
1 change: 1 addition & 0 deletions test/specs/what/em_left_square_bracket00.html
@@ -0,0 +1 @@
<p><em>foo <strong>bar</strong> baz</em></p>
1 change: 1 addition & 0 deletions test/specs/what/em_left_square_bracket00.md
@@ -0,0 +1 @@
*foo **bar** baz*
1 change: 1 addition & 0 deletions test/specs/what/em_left_square_bracket000.html
@@ -0,0 +1 @@
<p>foo <em>_</em></p>
1 change: 1 addition & 0 deletions test/specs/what/em_left_square_bracket000.md
@@ -0,0 +1 @@
foo *_*
1 change: 1 addition & 0 deletions test/specs/what/em_left_square_bracket0000.html
@@ -0,0 +1 @@
<p><em>(<strong>foo</strong>)</em></p>
1 change: 1 addition & 0 deletions test/specs/what/em_left_square_bracket0000.md
@@ -0,0 +1 @@
*(**foo**)*
7 changes: 7 additions & 0 deletions test/specs/whats/strong_and_em_together.html
@@ -0,0 +1,7 @@
<p><strong><em>This is strong and em.</em></strong></p>

<p>So is <strong><em>this</em></strong> word.</p>

<p><strong><em>This is strong and em.</em></strong></p>

<p>So is <strong><em>this</em></strong> word.</p>
7 changes: 7 additions & 0 deletions test/specs/whats/strong_and_em_together.md
@@ -0,0 +1,7 @@
***This is strong and em.***

So is ***this*** word.

___This is strong and em.___

So is ___this___ word.