From e0005d8232a08827f5e99b8b35b09728b2b07503 Mon Sep 17 00:00:00 2001 From: Trevor Buckner Date: Wed, 1 Dec 2021 22:18:07 -0500 Subject: [PATCH] fix: speed up parsing long lists (#2302) Co-authored-by: Tony Brix --- lib/marked.cjs | 79 +++++++++++++--------------- lib/marked.esm.js | 74 ++++++++++++-------------- lib/marked.umd.js | 79 +++++++++++++--------------- package-lock.json | 5 ++ src/Tokenizer.js | 74 ++++++++++++-------------- test/specs/bug/adjacent_lists.html | 1 - test/specs/bug/adjacent_lists.md | 1 - test/specs/redos/quadratic_lists.cjs | 4 ++ 8 files changed, 151 insertions(+), 166 deletions(-) delete mode 100644 test/specs/bug/adjacent_lists.html delete mode 100644 test/specs/bug/adjacent_lists.md create mode 100644 test/specs/redos/quadratic_lists.cjs diff --git a/lib/marked.cjs b/lib/marked.cjs index 6e85fce027..97cbbe4b9f 100644 --- a/lib/marked.cjs +++ b/lib/marked.cjs @@ -533,7 +533,7 @@ var Tokenizer = /*#__PURE__*/function () { var cap = this.rules.block.list.exec(src); if (cap) { - var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, lines, itemContents; + var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, nextLine, rawLine, itemContents; var bull = cap[1].trim(); var isordered = bull.length > 1; var list = { @@ -551,83 +551,77 @@ var Tokenizer = /*#__PURE__*/function () { } // Get next list item - var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))"); // Get each top-level item + var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*)?(?:\\n|$))"); // Check if current bullet point can start a new List Item while (src) { - if (this.rules.block.hr.test(src)) { - // End list if we encounter an HR (possibly move into itemRegex?) + if (!(cap = itemRegex.exec(src))) { break; } - if (!(cap = itemRegex.exec(src))) { + if (this.rules.block.hr.test(src)) { + // End list if bullet was actually HR (possibly move into itemRegex?) 
break; } - lines = cap[2].split('\n'); + raw = cap[0]; + src = src.substring(raw.length); + line = cap[2].split('\n', 1)[0]; + nextLine = src.split('\n', 1)[0]; if (this.options.pedantic) { indent = 2; - itemContents = lines[0].trimLeft(); + itemContents = line.trimLeft(); } else { indent = cap[2].search(/[^ ]/); // Find first non-space char - indent = cap[1].length + (indent > 4 ? 1 : indent); // intented code blocks after 4 spaces; indent is always 1 + indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent - itemContents = lines[0].slice(indent - cap[1].length); + itemContents = line.slice(indent); + indent += cap[1].length; } blankLine = false; - raw = cap[0]; - if (!lines[0] && /^ *$/.test(lines[1])) { - // items begin with at most one blank line - raw = cap[1] + lines.slice(0, 2).join('\n') + '\n'; + if (!line && /^ *$/.test(nextLine)) { + // Items begin with at most one blank line + raw += nextLine + '\n'; + src = src.substring(nextLine.length + 1); list.loose = true; - lines = []; } - var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])"); + var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])"); // Check if following lines should be included in List Item - for (i = 1; i < lines.length; i++) { - line = lines[i]; + while (src && !list.loose) { + rawLine = src.split('\n', 1)[0]; + line = rawLine; // Re-align to follow commonmark nesting rules if (this.options.pedantic) { - // Re-align to follow commonmark nesting rules line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, ' '); } // End list item if found start of new bullet if (nextBulletRegex.test(line)) { - raw = cap[1] + lines.slice(0, i).join('\n') + '\n'; break; - } // Until we encounter a blank line, item contents do not need indentation - - - if (!blankLine) { - if (!line.trim()) { - // Check if current line is empty - blankLine = true; - } // Dedent if possible - - - if 
(line.search(/[^ ]/) >= indent) { - itemContents += '\n' + line.slice(indent); - } else { - itemContents += '\n' + line; - } - - continue; - } // Dedent this line - + } if (line.search(/[^ ]/) >= indent || !line.trim()) { + // Dedent if possible itemContents += '\n' + line.slice(indent); - continue; + } else if (!blankLine) { + // Until blank line, item doesn't need indentation + itemContents += '\n' + line; } else { - // Line was not properly indented; end of this item - raw = cap[1] + lines.slice(0, i).join('\n') + '\n'; + // Otherwise, improper indentation ends this item break; } + + if (!blankLine && !line.trim()) { + // Check if current line is blank + blankLine = true; + } + + raw += rawLine + '\n'; + src = src.substring(rawLine.length + 1); } if (!list.loose) { @@ -658,7 +652,6 @@ var Tokenizer = /*#__PURE__*/function () { text: itemContents }); list.raw += raw; - src = src.slice(raw.length); } // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic @@ -671,7 +664,7 @@ var Tokenizer = /*#__PURE__*/function () { this.lexer.state.top = false; list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []); - if (list.items[i].tokens.some(function (t) { + if (!list.loose && list.items[i].tokens.some(function (t) { return t.type === 'space'; })) { list.loose = true; diff --git a/lib/marked.esm.js b/lib/marked.esm.js index 3bc9ebbe3b..c8fac1cf4e 100644 --- a/lib/marked.esm.js +++ b/lib/marked.esm.js @@ -452,7 +452,7 @@ class Tokenizer { let cap = this.rules.block.list.exec(src); if (cap) { let raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, - line, lines, itemContents; + line, nextLine, rawLine, itemContents; let bull = cap[1].trim(); const isordered = bull.length > 1; @@ -473,76 +473,73 @@ class Tokenizer { } // Get next list item - const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))`); + const itemRegex = new 
RegExp(`^( {0,3}${bull})((?: [^\\n]*)?(?:\\n|$))`); - // Get each top-level item + // Check if current bullet point can start a new List Item while (src) { - if (this.rules.block.hr.test(src)) { // End list if we encounter an HR (possibly move into itemRegex?) + if (!(cap = itemRegex.exec(src))) { break; } - if (!(cap = itemRegex.exec(src))) { + if (this.rules.block.hr.test(src)) { // End list if bullet was actually HR (possibly move into itemRegex?) break; } - lines = cap[2].split('\n'); + raw = cap[0]; + src = src.substring(raw.length); + + line = cap[2].split('\n', 1)[0]; + nextLine = src.split('\n', 1)[0]; if (this.options.pedantic) { indent = 2; - itemContents = lines[0].trimLeft(); + itemContents = line.trimLeft(); } else { indent = cap[2].search(/[^ ]/); // Find first non-space char - indent = cap[1].length + (indent > 4 ? 1 : indent); // intented code blocks after 4 spaces; indent is always 1 - itemContents = lines[0].slice(indent - cap[1].length); + indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent + itemContents = line.slice(indent); + indent += cap[1].length; } blankLine = false; - raw = cap[0]; - if (!lines[0] && /^ *$/.test(lines[1])) { // items begin with at most one blank line - raw = cap[1] + lines.slice(0, 2).join('\n') + '\n'; + if (!line && /^ *$/.test(nextLine)) { // Items begin with at most one blank line + raw += nextLine + '\n'; + src = src.substring(nextLine.length + 1); list.loose = true; - lines = []; } const nextBulletRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])`); - for (i = 1; i < lines.length; i++) { - line = lines[i]; + // Check if following lines should be included in List Item + while (src && !list.loose) { + rawLine = src.split('\n', 1)[0]; + line = rawLine; - if (this.options.pedantic) { // Re-align to follow commonmark nesting rules + // Re-align to follow commonmark nesting rules + if (this.options.pedantic) { line = line.replace(/^ {1,4}(?=( 
{4})*[^ ])/g, ' '); } // End list item if found start of new bullet if (nextBulletRegex.test(line)) { - raw = cap[1] + lines.slice(0, i).join('\n') + '\n'; break; } - // Until we encounter a blank line, item contents do not need indentation - if (!blankLine) { - if (!line.trim()) { // Check if current line is empty - blankLine = true; - } - - // Dedent if possible - if (line.search(/[^ ]/) >= indent) { - itemContents += '\n' + line.slice(indent); - } else { - itemContents += '\n' + line; - } - continue; - } - - // Dedent this line - if (line.search(/[^ ]/) >= indent || !line.trim()) { + if (line.search(/[^ ]/) >= indent || !line.trim()) { // Dedent if possible itemContents += '\n' + line.slice(indent); - continue; - } else { // Line was not properly indented; end of this item - raw = cap[1] + lines.slice(0, i).join('\n') + '\n'; + } else if (!blankLine) { // Until blank line, item doesn't need indentation + itemContents += '\n' + line; + } else { // Otherwise, improper indentation ends this item break; } + + if (!blankLine && !line.trim()) { // Check if current line is blank + blankLine = true; + } + + raw += rawLine + '\n'; + src = src.substring(rawLine.length + 1); } if (!list.loose) { @@ -573,7 +570,6 @@ class Tokenizer { }); list.raw += raw; - src = src.slice(raw.length); } // Do not consume newlines at end of final item. 
Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic @@ -587,7 +583,7 @@ class Tokenizer { for (i = 0; i < l; i++) { this.lexer.state.top = false; list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []); - if (list.items[i].tokens.some(t => t.type === 'space')) { + if (!list.loose && list.items[i].tokens.some(t => t.type === 'space')) { list.loose = true; list.items[i].loose = true; } diff --git a/lib/marked.umd.js b/lib/marked.umd.js index cf69f9106d..25e84d3d21 100644 --- a/lib/marked.umd.js +++ b/lib/marked.umd.js @@ -535,7 +535,7 @@ var cap = this.rules.block.list.exec(src); if (cap) { - var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, lines, itemContents; + var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, nextLine, rawLine, itemContents; var bull = cap[1].trim(); var isordered = bull.length > 1; var list = { @@ -553,83 +553,77 @@ } // Get next list item - var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))"); // Get each top-level item + var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*)?(?:\\n|$))"); // Check if current bullet point can start a new List Item while (src) { - if (this.rules.block.hr.test(src)) { - // End list if we encounter an HR (possibly move into itemRegex?) + if (!(cap = itemRegex.exec(src))) { break; } - if (!(cap = itemRegex.exec(src))) { + if (this.rules.block.hr.test(src)) { + // End list if bullet was actually HR (possibly move into itemRegex?) break; } - lines = cap[2].split('\n'); + raw = cap[0]; + src = src.substring(raw.length); + line = cap[2].split('\n', 1)[0]; + nextLine = src.split('\n', 1)[0]; if (this.options.pedantic) { indent = 2; - itemContents = lines[0].trimLeft(); + itemContents = line.trimLeft(); } else { indent = cap[2].search(/[^ ]/); // Find first non-space char - indent = cap[1].length + (indent > 4 ? 
1 : indent); // intented code blocks after 4 spaces; indent is always 1 + indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent - itemContents = lines[0].slice(indent - cap[1].length); + itemContents = line.slice(indent); + indent += cap[1].length; } blankLine = false; - raw = cap[0]; - if (!lines[0] && /^ *$/.test(lines[1])) { - // items begin with at most one blank line - raw = cap[1] + lines.slice(0, 2).join('\n') + '\n'; + if (!line && /^ *$/.test(nextLine)) { + // Items begin with at most one blank line + raw += nextLine + '\n'; + src = src.substring(nextLine.length + 1); list.loose = true; - lines = []; } - var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])"); + var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])"); // Check if following lines should be included in List Item - for (i = 1; i < lines.length; i++) { - line = lines[i]; + while (src && !list.loose) { + rawLine = src.split('\n', 1)[0]; + line = rawLine; // Re-align to follow commonmark nesting rules if (this.options.pedantic) { - // Re-align to follow commonmark nesting rules line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, ' '); } // End list item if found start of new bullet if (nextBulletRegex.test(line)) { - raw = cap[1] + lines.slice(0, i).join('\n') + '\n'; break; - } // Until we encounter a blank line, item contents do not need indentation - - - if (!blankLine) { - if (!line.trim()) { - // Check if current line is empty - blankLine = true; - } // Dedent if possible - - - if (line.search(/[^ ]/) >= indent) { - itemContents += '\n' + line.slice(indent); - } else { - itemContents += '\n' + line; - } - - continue; - } // Dedent this line - + } if (line.search(/[^ ]/) >= indent || !line.trim()) { + // Dedent if possible itemContents += '\n' + line.slice(indent); - continue; + } else if (!blankLine) { + // Until blank line, item doesn't need indentation + itemContents 
+= '\n' + line; } else { - // Line was not properly indented; end of this item - raw = cap[1] + lines.slice(0, i).join('\n') + '\n'; + // Otherwise, improper indentation ends this item break; } + + if (!blankLine && !line.trim()) { + // Check if current line is blank + blankLine = true; + } + + raw += rawLine + '\n'; + src = src.substring(rawLine.length + 1); } if (!list.loose) { @@ -660,7 +654,6 @@ text: itemContents }); list.raw += raw; - src = src.slice(raw.length); } // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic @@ -673,7 +666,7 @@ this.lexer.state.top = false; list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []); - if (list.items[i].tokens.some(function (t) { + if (!list.loose && list.items[i].tokens.some(function (t) { return t.type === 'space'; })) { list.loose = true; diff --git a/package-lock.json b/package-lock.json index 14aa125098..6d38122072 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8304,6 +8304,11 @@ "safer-buffer": "^2.0.2", "tweetnacl": "~0.14.0" }, + "bin": { + "sshpk-conv": "bin/sshpk-conv", + "sshpk-sign": "bin/sshpk-sign", + "sshpk-verify": "bin/sshpk-verify" + }, "engines": { "node": ">=0.10.0" } diff --git a/src/Tokenizer.js b/src/Tokenizer.js index fe91349c84..a957883f28 100644 --- a/src/Tokenizer.js +++ b/src/Tokenizer.js @@ -169,7 +169,7 @@ export class Tokenizer { let cap = this.rules.block.list.exec(src); if (cap) { let raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, - line, lines, itemContents; + line, nextLine, rawLine, itemContents; let bull = cap[1].trim(); const isordered = bull.length > 1; @@ -190,76 +190,73 @@ export class Tokenizer { } // Get next list item - const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))`); + const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*)?(?:\\n|$))`); - // Get each top-level item + // Check if current 
bullet point can start a new List Item while (src) { - if (this.rules.block.hr.test(src)) { // End list if we encounter an HR (possibly move into itemRegex?) + if (!(cap = itemRegex.exec(src))) { break; } - if (!(cap = itemRegex.exec(src))) { + if (this.rules.block.hr.test(src)) { // End list if bullet was actually HR (possibly move into itemRegex?) break; } - lines = cap[2].split('\n'); + raw = cap[0]; + src = src.substring(raw.length); + + line = cap[2].split('\n', 1)[0]; + nextLine = src.split('\n', 1)[0]; if (this.options.pedantic) { indent = 2; - itemContents = lines[0].trimLeft(); + itemContents = line.trimLeft(); } else { indent = cap[2].search(/[^ ]/); // Find first non-space char - indent = cap[1].length + (indent > 4 ? 1 : indent); // intented code blocks after 4 spaces; indent is always 1 - itemContents = lines[0].slice(indent - cap[1].length); + indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent + itemContents = line.slice(indent); + indent += cap[1].length; } blankLine = false; - raw = cap[0]; - if (!lines[0] && /^ *$/.test(lines[1])) { // items begin with at most one blank line - raw = cap[1] + lines.slice(0, 2).join('\n') + '\n'; + if (!line && /^ *$/.test(nextLine)) { // Items begin with at most one blank line + raw += nextLine + '\n'; + src = src.substring(nextLine.length + 1); list.loose = true; - lines = []; } const nextBulletRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])`); - for (i = 1; i < lines.length; i++) { - line = lines[i]; + // Check if following lines should be included in List Item + while (src && !list.loose) { + rawLine = src.split('\n', 1)[0]; + line = rawLine; - if (this.options.pedantic) { // Re-align to follow commonmark nesting rules + // Re-align to follow commonmark nesting rules + if (this.options.pedantic) { line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, ' '); } // End list item if found start of new bullet if (nextBulletRegex.test(line)) { - raw = 
cap[1] + lines.slice(0, i).join('\n') + '\n'; break; } - // Until we encounter a blank line, item contents do not need indentation - if (!blankLine) { - if (!line.trim()) { // Check if current line is empty - blankLine = true; - } - - // Dedent if possible - if (line.search(/[^ ]/) >= indent) { - itemContents += '\n' + line.slice(indent); - } else { - itemContents += '\n' + line; - } - continue; - } - - // Dedent this line - if (line.search(/[^ ]/) >= indent || !line.trim()) { + if (line.search(/[^ ]/) >= indent || !line.trim()) { // Dedent if possible itemContents += '\n' + line.slice(indent); - continue; - } else { // Line was not properly indented; end of this item - raw = cap[1] + lines.slice(0, i).join('\n') + '\n'; + } else if (!blankLine) { // Until blank line, item doesn't need indentation + itemContents += '\n' + line; + } else { // Otherwise, improper indentation ends this item break; } + + if (!blankLine && !line.trim()) { // Check if current line is blank + blankLine = true; + } + + raw += rawLine + '\n'; + src = src.substring(rawLine.length + 1); } if (!list.loose) { @@ -290,7 +287,6 @@ export class Tokenizer { }); list.raw += raw; - src = src.slice(raw.length); } // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic @@ -304,7 +300,7 @@ export class Tokenizer { for (i = 0; i < l; i++) { this.lexer.state.top = false; list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []); - if (list.items[i].tokens.some(t => t.type === 'space')) { + if (!list.loose && list.items[i].tokens.some(t => t.type === 'space')) { list.loose = true; list.items[i].loose = true; } diff --git a/test/specs/bug/adjacent_lists.html b/test/specs/bug/adjacent_lists.html deleted file mode 100644 index 2d64bd5963..0000000000 --- a/test/specs/bug/adjacent_lists.html +++ /dev/null @@ -1 +0,0 @@ -

<p><em>foo <strong>bar *baz bim</strong> bam</em></p>

diff --git a/test/specs/bug/adjacent_lists.md b/test/specs/bug/adjacent_lists.md deleted file mode 100644 index fdf80c0adb..0000000000 --- a/test/specs/bug/adjacent_lists.md +++ /dev/null @@ -1 +0,0 @@ -*foo __bar *baz bim__ bam* diff --git a/test/specs/redos/quadratic_lists.cjs b/test/specs/redos/quadratic_lists.cjs new file mode 100644 index 0000000000..8d230ca5af --- /dev/null +++ b/test/specs/redos/quadratic_lists.cjs @@ -0,0 +1,4 @@ +module.exports = { + markdown: '- a\n'.repeat(10000), + html: `<ul>${'<li>a</li>'.repeat(10000)}</ul>` +};