From e0005d8232a08827f5e99b8b35b09728b2b07503 Mon Sep 17 00:00:00 2001
From: Trevor Buckner <calculuschild@gmail.com>
Date: Wed, 1 Dec 2021 22:18:07 -0500
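Subject: [PATCH] fix: speed up parsing long lists (#2302)

The previous itemRegex matched the bullet line plus every remaining
line of the source, and the tokenizer then split that whole capture
into lines for each list item. On a list with many items this work
was repeated per item.

itemRegex now matches only the first line of an item. The lines that
follow are read from src one at a time and consumed as they are
accepted into the item, so scanning stops at the next bullet or at
an improperly indented line after a blank line.

Adds test/specs/redos/quadratic_lists.cjs (10000 one-line items) and
removes the test/specs/bug/adjacent_lists fixtures.

Co-authored-by: Tony Brix <tony@brix.ninja>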
---
 lib/marked.cjs                       | 79 +++++++++++++---------------
 lib/marked.esm.js                    | 74 ++++++++++++--------------
 lib/marked.umd.js                    | 79 +++++++++++++---------------
 package-lock.json                    |  5 ++
 src/Tokenizer.js                     | 74 ++++++++++++--------------
 test/specs/bug/adjacent_lists.html   |  1 -
 test/specs/bug/adjacent_lists.md     |  1 -
 test/specs/redos/quadratic_lists.cjs |  4 ++
 8 files changed, 151 insertions(+), 166 deletions(-)
 delete mode 100644 test/specs/bug/adjacent_lists.html
 delete mode 100644 test/specs/bug/adjacent_lists.md
 create mode 100644 test/specs/redos/quadratic_lists.cjs

diff --git a/lib/marked.cjs b/lib/marked.cjs
index 6e85fce027..97cbbe4b9f 100644
--- a/lib/marked.cjs
+++ b/lib/marked.cjs
@@ -533,7 +533,7 @@ var Tokenizer = /*#__PURE__*/function () {
     var cap = this.rules.block.list.exec(src);
 
     if (cap) {
-      var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, lines, itemContents;
+      var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, nextLine, rawLine, itemContents;
       var bull = cap[1].trim();
       var isordered = bull.length > 1;
       var list = {
@@ -551,83 +551,77 @@ var Tokenizer = /*#__PURE__*/function () {
       } // Get next list item
 
 
-      var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))"); // Get each top-level item
+      var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*)?(?:\\n|$))"); // Check if current bullet point can start a new List Item
 
       while (src) {
-        if (this.rules.block.hr.test(src)) {
-          // End list if we encounter an HR (possibly move into itemRegex?)
+        if (!(cap = itemRegex.exec(src))) {
           break;
         }
 
-        if (!(cap = itemRegex.exec(src))) {
+        if (this.rules.block.hr.test(src)) {
+          // End list if bullet was actually HR (possibly move into itemRegex?)
           break;
         }
 
-        lines = cap[2].split('\n');
+        raw = cap[0];
+        src = src.substring(raw.length);
+        line = cap[2].split('\n', 1)[0];
+        nextLine = src.split('\n', 1)[0];
 
         if (this.options.pedantic) {
           indent = 2;
-          itemContents = lines[0].trimLeft();
+          itemContents = line.trimLeft();
         } else {
           indent = cap[2].search(/[^ ]/); // Find first non-space char
 
-          indent = cap[1].length + (indent > 4 ? 1 : indent); // intented code blocks after 4 spaces; indent is always 1
+          indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent
 
-          itemContents = lines[0].slice(indent - cap[1].length);
+          itemContents = line.slice(indent);
+          indent += cap[1].length;
         }
 
         blankLine = false;
-        raw = cap[0];
 
-        if (!lines[0] && /^ *$/.test(lines[1])) {
-          // items begin with at most one blank line
-          raw = cap[1] + lines.slice(0, 2).join('\n') + '\n';
+        if (!line && /^ *$/.test(nextLine)) {
+          // Items begin with at most one blank line
+          raw += nextLine + '\n';
+          src = src.substring(nextLine.length + 1);
           list.loose = true;
-          lines = [];
         }
 
-        var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])");
+        var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])"); // Check if following lines should be included in List Item
 
-        for (i = 1; i < lines.length; i++) {
-          line = lines[i];
+        while (src && !list.loose) {
+          rawLine = src.split('\n', 1)[0];
+          line = rawLine; // Re-align to follow commonmark nesting rules
 
           if (this.options.pedantic) {
-            // Re-align to follow commonmark nesting rules
             line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, '  ');
           } // End list item if found start of new bullet
 
 
           if (nextBulletRegex.test(line)) {
-            raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
             break;
-          } // Until we encounter a blank line, item contents do not need indentation
-
-
-          if (!blankLine) {
-            if (!line.trim()) {
-              // Check if current line is empty
-              blankLine = true;
-            } // Dedent if possible
-
-
-            if (line.search(/[^ ]/) >= indent) {
-              itemContents += '\n' + line.slice(indent);
-            } else {
-              itemContents += '\n' + line;
-            }
-
-            continue;
-          } // Dedent this line
-
+          }
 
           if (line.search(/[^ ]/) >= indent || !line.trim()) {
+            // Dedent if possible
             itemContents += '\n' + line.slice(indent);
-            continue;
+          } else if (!blankLine) {
+            // Until blank line, item doesn't need indentation
+            itemContents += '\n' + line;
           } else {
-            // Line was not properly indented; end of this item
-            raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
+            // Otherwise, improper indentation ends this item
             break;
           }
+
+          if (!blankLine && !line.trim()) {
+            // Check if current line is blank
+            blankLine = true;
+          }
+
+          raw += rawLine + '\n';
+          src = src.substring(rawLine.length + 1);
         }
 
         if (!list.loose) {
@@ -658,7 +652,6 @@ var Tokenizer = /*#__PURE__*/function () {
           text: itemContents
         });
         list.raw += raw;
-        src = src.slice(raw.length);
       } // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic
 
 
@@ -671,7 +664,7 @@ var Tokenizer = /*#__PURE__*/function () {
         this.lexer.state.top = false;
         list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);
 
-        if (list.items[i].tokens.some(function (t) {
+        if (!list.loose && list.items[i].tokens.some(function (t) {
           return t.type === 'space';
         })) {
           list.loose = true;
diff --git a/lib/marked.esm.js b/lib/marked.esm.js
index 3bc9ebbe3b..c8fac1cf4e 100644
--- a/lib/marked.esm.js
+++ b/lib/marked.esm.js
@@ -452,7 +452,7 @@ class Tokenizer {
     let cap = this.rules.block.list.exec(src);
     if (cap) {
       let raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine,
-        line, lines, itemContents;
+        line, nextLine, rawLine, itemContents;
 
       let bull = cap[1].trim();
       const isordered = bull.length > 1;
@@ -473,76 +473,73 @@ class Tokenizer {
       }
 
       // Get next list item
-      const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))`);
+      const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*)?(?:\\n|$))`);
 
-      // Get each top-level item
+      // Check if current bullet point can start a new List Item
       while (src) {
-        if (this.rules.block.hr.test(src)) { // End list if we encounter an HR (possibly move into itemRegex?)
+        if (!(cap = itemRegex.exec(src))) {
           break;
         }
 
-        if (!(cap = itemRegex.exec(src))) {
+        if (this.rules.block.hr.test(src)) { // End list if bullet was actually HR (possibly move into itemRegex?)
           break;
         }
 
-        lines = cap[2].split('\n');
+        raw = cap[0];
+        src = src.substring(raw.length);
+
+        line = cap[2].split('\n', 1)[0];
+        nextLine = src.split('\n', 1)[0];
 
         if (this.options.pedantic) {
           indent = 2;
-          itemContents = lines[0].trimLeft();
+          itemContents = line.trimLeft();
         } else {
           indent = cap[2].search(/[^ ]/); // Find first non-space char
-          indent = cap[1].length + (indent > 4 ? 1 : indent); // intented code blocks after 4 spaces; indent is always 1
-          itemContents = lines[0].slice(indent - cap[1].length);
+          indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent
+          itemContents = line.slice(indent);
+          indent += cap[1].length;
         }
 
         blankLine = false;
-        raw = cap[0];
 
-        if (!lines[0] && /^ *$/.test(lines[1])) { // items begin with at most one blank line
-          raw = cap[1] + lines.slice(0, 2).join('\n') + '\n';
+        if (!line && /^ *$/.test(nextLine)) { // Items begin with at most one blank line
+          raw += nextLine + '\n';
+          src = src.substring(nextLine.length + 1);
           list.loose = true;
-          lines = [];
         }
 
         const nextBulletRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])`);
 
-        for (i = 1; i < lines.length; i++) {
-          line = lines[i];
+        // Check if following lines should be included in List Item
+        while (src && !list.loose) {
+          rawLine = src.split('\n', 1)[0];
+          line = rawLine;
 
-          if (this.options.pedantic) { // Re-align to follow commonmark nesting rules
+          // Re-align to follow commonmark nesting rules
+          if (this.options.pedantic) {
             line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, '  ');
           }
 
           // End list item if found start of new bullet
           if (nextBulletRegex.test(line)) {
-            raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
             break;
           }
 
-          // Until we encounter a blank line, item contents do not need indentation
-          if (!blankLine) {
-            if (!line.trim()) { // Check if current line is empty
-              blankLine = true;
-            }
-
-            // Dedent if possible
-            if (line.search(/[^ ]/) >= indent) {
-              itemContents += '\n' + line.slice(indent);
-            } else {
-              itemContents += '\n' + line;
-            }
-            continue;
-          }
-
-          // Dedent this line
-          if (line.search(/[^ ]/) >= indent || !line.trim()) {
+          if (line.search(/[^ ]/) >= indent || !line.trim()) { // Dedent if possible
             itemContents += '\n' + line.slice(indent);
-            continue;
-          } else { // Line was not properly indented; end of this item
-            raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
+          } else if (!blankLine) { // Until blank line, item doesn't need indentation
+            itemContents += '\n' + line;
+          } else { // Otherwise, improper indentation ends this item
             break;
           }
+
+          if (!blankLine && !line.trim()) { // Check if current line is blank
+            blankLine = true;
+          }
+
+          raw += rawLine + '\n';
+          src = src.substring(rawLine.length + 1);
         }
 
         if (!list.loose) {
@@ -573,7 +570,6 @@ class Tokenizer {
         });
 
         list.raw += raw;
-        src = src.slice(raw.length);
       }
 
       // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic
@@ -587,7 +583,7 @@ class Tokenizer {
       for (i = 0; i < l; i++) {
         this.lexer.state.top = false;
         list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);
-        if (list.items[i].tokens.some(t => t.type === 'space')) {
+        if (!list.loose && list.items[i].tokens.some(t => t.type === 'space')) {
           list.loose = true;
           list.items[i].loose = true;
         }
diff --git a/lib/marked.umd.js b/lib/marked.umd.js
index cf69f9106d..25e84d3d21 100644
--- a/lib/marked.umd.js
+++ b/lib/marked.umd.js
@@ -535,7 +535,7 @@
       var cap = this.rules.block.list.exec(src);
 
       if (cap) {
-        var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, lines, itemContents;
+        var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, nextLine, rawLine, itemContents;
         var bull = cap[1].trim();
         var isordered = bull.length > 1;
         var list = {
@@ -553,83 +553,77 @@
         } // Get next list item
 
 
-        var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))"); // Get each top-level item
+        var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*)?(?:\\n|$))"); // Check if current bullet point can start a new List Item
 
         while (src) {
-          if (this.rules.block.hr.test(src)) {
-            // End list if we encounter an HR (possibly move into itemRegex?)
+          if (!(cap = itemRegex.exec(src))) {
             break;
           }
 
-          if (!(cap = itemRegex.exec(src))) {
+          if (this.rules.block.hr.test(src)) {
+            // End list if bullet was actually HR (possibly move into itemRegex?)
             break;
           }
 
-          lines = cap[2].split('\n');
+          raw = cap[0];
+          src = src.substring(raw.length);
+          line = cap[2].split('\n', 1)[0];
+          nextLine = src.split('\n', 1)[0];
 
           if (this.options.pedantic) {
             indent = 2;
-            itemContents = lines[0].trimLeft();
+            itemContents = line.trimLeft();
           } else {
             indent = cap[2].search(/[^ ]/); // Find first non-space char
 
-            indent = cap[1].length + (indent > 4 ? 1 : indent); // intented code blocks after 4 spaces; indent is always 1
+            indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent
 
-            itemContents = lines[0].slice(indent - cap[1].length);
+            itemContents = line.slice(indent);
+            indent += cap[1].length;
           }
 
           blankLine = false;
-          raw = cap[0];
 
-          if (!lines[0] && /^ *$/.test(lines[1])) {
-            // items begin with at most one blank line
-            raw = cap[1] + lines.slice(0, 2).join('\n') + '\n';
+          if (!line && /^ *$/.test(nextLine)) {
+            // Items begin with at most one blank line
+            raw += nextLine + '\n';
+            src = src.substring(nextLine.length + 1);
             list.loose = true;
-            lines = [];
           }
 
-          var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])");
+          var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])"); // Check if following lines should be included in List Item
 
-          for (i = 1; i < lines.length; i++) {
-            line = lines[i];
+          while (src && !list.loose) {
+            rawLine = src.split('\n', 1)[0];
+            line = rawLine; // Re-align to follow commonmark nesting rules
 
             if (this.options.pedantic) {
-              // Re-align to follow commonmark nesting rules
               line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, '  ');
             } // End list item if found start of new bullet
 
 
             if (nextBulletRegex.test(line)) {
-              raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
               break;
-            } // Until we encounter a blank line, item contents do not need indentation
-
-
-            if (!blankLine) {
-              if (!line.trim()) {
-                // Check if current line is empty
-                blankLine = true;
-              } // Dedent if possible
-
-
-              if (line.search(/[^ ]/) >= indent) {
-                itemContents += '\n' + line.slice(indent);
-              } else {
-                itemContents += '\n' + line;
-              }
-
-              continue;
-            } // Dedent this line
-
+            }
 
             if (line.search(/[^ ]/) >= indent || !line.trim()) {
+              // Dedent if possible
               itemContents += '\n' + line.slice(indent);
-              continue;
+            } else if (!blankLine) {
+              // Until blank line, item doesn't need indentation
+              itemContents += '\n' + line;
             } else {
-              // Line was not properly indented; end of this item
-              raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
+              // Otherwise, improper indentation ends this item
               break;
             }
+
+            if (!blankLine && !line.trim()) {
+              // Check if current line is blank
+              blankLine = true;
+            }
+
+            raw += rawLine + '\n';
+            src = src.substring(rawLine.length + 1);
           }
 
           if (!list.loose) {
@@ -660,7 +654,6 @@
             text: itemContents
           });
           list.raw += raw;
-          src = src.slice(raw.length);
         } // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic
 
 
@@ -673,7 +666,7 @@
           this.lexer.state.top = false;
           list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);
 
-          if (list.items[i].tokens.some(function (t) {
+          if (!list.loose && list.items[i].tokens.some(function (t) {
             return t.type === 'space';
           })) {
             list.loose = true;
diff --git a/package-lock.json b/package-lock.json
index 14aa125098..6d38122072 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -8304,6 +8304,11 @@
         "safer-buffer": "^2.0.2",
         "tweetnacl": "~0.14.0"
       },
+      "bin": {
+        "sshpk-conv": "bin/sshpk-conv",
+        "sshpk-sign": "bin/sshpk-sign",
+        "sshpk-verify": "bin/sshpk-verify"
+      },
       "engines": {
         "node": ">=0.10.0"
       }
diff --git a/src/Tokenizer.js b/src/Tokenizer.js
index fe91349c84..a957883f28 100644
--- a/src/Tokenizer.js
+++ b/src/Tokenizer.js
@@ -169,7 +169,7 @@ export class Tokenizer {
     let cap = this.rules.block.list.exec(src);
     if (cap) {
       let raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine,
-        line, lines, itemContents;
+        line, nextLine, rawLine, itemContents;
 
       let bull = cap[1].trim();
       const isordered = bull.length > 1;
@@ -190,76 +190,73 @@ export class Tokenizer {
       }
 
       // Get next list item
-      const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))`);
+      const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*)?(?:\\n|$))`);
 
-      // Get each top-level item
+      // Check if current bullet point can start a new List Item
       while (src) {
-        if (this.rules.block.hr.test(src)) { // End list if we encounter an HR (possibly move into itemRegex?)
+        if (!(cap = itemRegex.exec(src))) {
           break;
         }
 
-        if (!(cap = itemRegex.exec(src))) {
+        if (this.rules.block.hr.test(src)) { // End list if bullet was actually HR (possibly move into itemRegex?)
           break;
         }
 
-        lines = cap[2].split('\n');
+        raw = cap[0];
+        src = src.substring(raw.length);
+
+        line = cap[2].split('\n', 1)[0];
+        nextLine = src.split('\n', 1)[0];
 
         if (this.options.pedantic) {
           indent = 2;
-          itemContents = lines[0].trimLeft();
+          itemContents = line.trimLeft();
         } else {
           indent = cap[2].search(/[^ ]/); // Find first non-space char
-          indent = cap[1].length + (indent > 4 ? 1 : indent); // intented code blocks after 4 spaces; indent is always 1
-          itemContents = lines[0].slice(indent - cap[1].length);
+          indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent
+          itemContents = line.slice(indent);
+          indent += cap[1].length;
         }
 
         blankLine = false;
-        raw = cap[0];
 
-        if (!lines[0] && /^ *$/.test(lines[1])) { // items begin with at most one blank line
-          raw = cap[1] + lines.slice(0, 2).join('\n') + '\n';
+        if (!line && /^ *$/.test(nextLine)) { // Items begin with at most one blank line
+          raw += nextLine + '\n';
+          src = src.substring(nextLine.length + 1);
           list.loose = true;
-          lines = [];
         }
 
         const nextBulletRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])`);
 
-        for (i = 1; i < lines.length; i++) {
-          line = lines[i];
+        // Check if following lines should be included in List Item
+        while (src && !list.loose) {
+          rawLine = src.split('\n', 1)[0];
+          line = rawLine;
 
-          if (this.options.pedantic) { // Re-align to follow commonmark nesting rules
+          // Re-align to follow commonmark nesting rules
+          if (this.options.pedantic) {
             line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, '  ');
           }
 
           // End list item if found start of new bullet
           if (nextBulletRegex.test(line)) {
-            raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
             break;
           }
 
-          // Until we encounter a blank line, item contents do not need indentation
-          if (!blankLine) {
-            if (!line.trim()) { // Check if current line is empty
-              blankLine = true;
-            }
-
-            // Dedent if possible
-            if (line.search(/[^ ]/) >= indent) {
-              itemContents += '\n' + line.slice(indent);
-            } else {
-              itemContents += '\n' + line;
-            }
-            continue;
-          }
-
-          // Dedent this line
-          if (line.search(/[^ ]/) >= indent || !line.trim()) {
+          if (line.search(/[^ ]/) >= indent || !line.trim()) { // Dedent if possible
             itemContents += '\n' + line.slice(indent);
-            continue;
-          } else { // Line was not properly indented; end of this item
-            raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
+          } else if (!blankLine) { // Until blank line, item doesn't need indentation
+            itemContents += '\n' + line;
+          } else { // Otherwise, improper indentation ends this item
             break;
           }
+
+          if (!blankLine && !line.trim()) { // Check if current line is blank
+            blankLine = true;
+          }
+
+          raw += rawLine + '\n';
+          src = src.substring(rawLine.length + 1);
         }
 
         if (!list.loose) {
@@ -290,7 +287,6 @@ export class Tokenizer {
         });
 
         list.raw += raw;
-        src = src.slice(raw.length);
       }
 
       // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic
@@ -304,7 +300,7 @@ export class Tokenizer {
       for (i = 0; i < l; i++) {
         this.lexer.state.top = false;
         list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);
-        if (list.items[i].tokens.some(t => t.type === 'space')) {
+        if (!list.loose && list.items[i].tokens.some(t => t.type === 'space')) {
           list.loose = true;
           list.items[i].loose = true;
         }
diff --git a/test/specs/bug/adjacent_lists.html b/test/specs/bug/adjacent_lists.html
deleted file mode 100644
index 2d64bd5963..0000000000
--- a/test/specs/bug/adjacent_lists.html
+++ /dev/null
@@ -1 +0,0 @@
-<p><em>foo <strong>bar *baz bim</strong> bam</em></p>
diff --git a/test/specs/bug/adjacent_lists.md b/test/specs/bug/adjacent_lists.md
deleted file mode 100644
index fdf80c0adb..0000000000
--- a/test/specs/bug/adjacent_lists.md
+++ /dev/null
@@ -1 +0,0 @@
-*foo __bar *baz bim__ bam*
diff --git a/test/specs/redos/quadratic_lists.cjs b/test/specs/redos/quadratic_lists.cjs
new file mode 100644
index 0000000000..8d230ca5af
--- /dev/null
+++ b/test/specs/redos/quadratic_lists.cjs
@@ -0,0 +1,4 @@
+module.exports = {
+  markdown: '- a\n'.repeat(10000),
+  html: `<ul>${'<li>a</li>'.repeat(10000)}</ul>`
+};