Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: speed up parsing long lists #2302

Merged
merged 10 commits on Dec 2, 2021
46 changes: 26 additions & 20 deletions lib/marked.cjs
Expand Up @@ -533,7 +533,7 @@ var Tokenizer = /*#__PURE__*/function () {
var cap = this.rules.block.list.exec(src);

if (cap) {
var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, lines, itemContents;
var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, itemContents;
var bull = cap[1].trim();
var isordered = bull.length > 1;
var list = {
Expand All @@ -551,7 +551,7 @@ var Tokenizer = /*#__PURE__*/function () {
} // Get next list item


var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))"); // Get each top-level item
var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*)?(?:\\n|$))"); // Get each top-level item

while (src) {
if (this.rules.block.hr.test(src)) {
Expand All @@ -561,35 +561,41 @@ var Tokenizer = /*#__PURE__*/function () {

if (!(cap = itemRegex.exec(src))) {
break;
}
} //lines = cap[2].split('\n');
calculuschild marked this conversation as resolved.
Show resolved Hide resolved


lines = cap[2].split('\n');
raw = cap[0];
line = cap[2].split('\n', 1)[0];
src = src.substring(raw.length);
var nextLine = src.split('\n', 1)[0];

if (this.options.pedantic) {
indent = 2;
itemContents = lines[0].trimLeft();
itemContents = line.trimLeft();
} else {
indent = cap[2].search(/[^ ]/); // Find first non-space char

indent = cap[1].length + (indent > 4 ? 1 : indent); // indented code blocks after 4 spaces; indent is always 1
indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent

itemContents = lines[0].slice(indent - cap[1].length);
itemContents = line.slice(indent);
indent += cap[1].length;
}

blankLine = false;
raw = cap[0];

if (!lines[0] && /^ *$/.test(lines[1])) {
if (!line && /^ *$/.test(nextLine)) {
// items begin with at most one blank line
raw = cap[1] + lines.slice(0, 2).join('\n') + '\n';
raw += nextLine + '\n';
src = src.substring(nextLine.length + 1);
list.loose = true;
lines = [];
}

var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])");
var rawLine = void 0;

for (i = 1; i < lines.length; i++) {
line = lines[i];
while (src && !list.loose) {
rawLine = src.split('\n', 1)[0];
line = rawLine;

if (this.options.pedantic) {
// Re-align to follow commonmark nesting rules
Expand All @@ -598,7 +604,6 @@ var Tokenizer = /*#__PURE__*/function () {


if (nextBulletRegex.test(line)) {
raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
break;
} // Until we encounter a blank line, item contents do not need indentation

Expand All @@ -616,16 +621,19 @@ var Tokenizer = /*#__PURE__*/function () {
itemContents += '\n' + line;
}

raw += rawLine + '\n';
src = src.substring(rawLine.length + 1);
continue;
} // Dedent this line


if (line.search(/[^ ]/) >= indent || !line.trim()) {
itemContents += '\n' + line.slice(indent);
raw += rawLine + '\n';
src = src.substring(rawLine.length + 1);
continue;
} else {
// Line was not properly indented; end of this item
raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
break;
}
}
Expand Down Expand Up @@ -657,8 +665,7 @@ var Tokenizer = /*#__PURE__*/function () {
loose: false,
text: itemContents
});
list.raw += raw;
src = src.slice(raw.length);
list.raw += raw; //src = src.slice(raw.length);
calculuschild marked this conversation as resolved.
Show resolved Hide resolved
} // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic


Expand All @@ -671,11 +678,10 @@ var Tokenizer = /*#__PURE__*/function () {
this.lexer.state.top = false;
list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);

if (list.items[i].tokens.some(function (t) {
if (!list.loose && list.items[i].tokens.some(function (t) {
return t.type === 'space';
})) {
list.loose = true;
list.items[i].loose = true;
list.loose = true; //list.items[i].loose = true;
}
}

Expand Down
44 changes: 27 additions & 17 deletions lib/marked.esm.js
Expand Up @@ -452,7 +452,7 @@ class Tokenizer {
let cap = this.rules.block.list.exec(src);
if (cap) {
let raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine,
line, lines, itemContents;
line, itemContents;

let bull = cap[1].trim();
const isordered = bull.length > 1;
Expand All @@ -473,7 +473,7 @@ class Tokenizer {
}

// Get next list item
const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))`);
const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*)?(?:\\n|$))`);

// Get each top-level item
while (src) {
Expand All @@ -485,38 +485,45 @@ class Tokenizer {
break;
}

lines = cap[2].split('\n');
//lines = cap[2].split('\n');
raw = cap[0];
line = cap[2].split('\n',1)[0];
src = src.substring(raw.length);
let nextLine = src.split('\n',1)[0];

if (this.options.pedantic) {
indent = 2;
itemContents = lines[0].trimLeft();
itemContents = line.trimLeft();
} else {
indent = cap[2].search(/[^ ]/); // Find first non-space char
indent = cap[1].length + (indent > 4 ? 1 : indent); // indented code blocks after 4 spaces; indent is always 1
itemContents = lines[0].slice(indent - cap[1].length);
indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent
itemContents = line.slice(indent);
indent += cap[1].length;
}

blankLine = false;
raw = cap[0];

if (!lines[0] && /^ *$/.test(lines[1])) { // items begin with at most one blank line
raw = cap[1] + lines.slice(0, 2).join('\n') + '\n';

if (!line && /^ *$/.test(nextLine)) { // items begin with at most one blank line
raw += nextLine + '\n';
src = src.substring(nextLine.length + 1);
list.loose = true;
lines = [];
}

const nextBulletRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])`);

for (i = 1; i < lines.length; i++) {
line = lines[i];
let rawLine;

while(src && !list.loose) {
rawLine = src.split('\n',1)[0];
line = rawLine;

if (this.options.pedantic) { // Re-align to follow commonmark nesting rules
line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, ' ');
}

// End list item if found start of new bullet
if (nextBulletRegex.test(line)) {
raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
break;
}

Expand All @@ -532,15 +539,18 @@ class Tokenizer {
} else {
itemContents += '\n' + line;
}
raw += rawLine + '\n';
src = src.substring(rawLine.length + 1);
continue;
}

// Dedent this line
if (line.search(/[^ ]/) >= indent || !line.trim()) {
itemContents += '\n' + line.slice(indent);
raw += rawLine + '\n';
src = src.substring(rawLine.length + 1);
continue;
} else { // Line was not properly indented; end of this item
raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
break;
}
}
Expand Down Expand Up @@ -573,7 +583,7 @@ class Tokenizer {
});

list.raw += raw;
src = src.slice(raw.length);
//src = src.slice(raw.length);
}

// Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic
Expand All @@ -587,9 +597,9 @@ class Tokenizer {
for (i = 0; i < l; i++) {
this.lexer.state.top = false;
list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);
if (list.items[i].tokens.some(t => t.type === 'space')) {
if (!list.loose && list.items[i].tokens.some(t => t.type === 'space')) {
list.loose = true;
list.items[i].loose = true;
//list.items[i].loose = true;
}
}

Expand Down
46 changes: 26 additions & 20 deletions lib/marked.umd.js
Expand Up @@ -535,7 +535,7 @@
var cap = this.rules.block.list.exec(src);

if (cap) {
var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, lines, itemContents;
var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, itemContents;
var bull = cap[1].trim();
var isordered = bull.length > 1;
var list = {
Expand All @@ -553,7 +553,7 @@
} // Get next list item


var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))"); // Get each top-level item
var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*)?(?:\\n|$))"); // Get each top-level item

while (src) {
if (this.rules.block.hr.test(src)) {
Expand All @@ -563,35 +563,41 @@

if (!(cap = itemRegex.exec(src))) {
break;
}
} //lines = cap[2].split('\n');


lines = cap[2].split('\n');
raw = cap[0];
line = cap[2].split('\n', 1)[0];
src = src.substring(raw.length);
var nextLine = src.split('\n', 1)[0];

if (this.options.pedantic) {
indent = 2;
itemContents = lines[0].trimLeft();
itemContents = line.trimLeft();
} else {
indent = cap[2].search(/[^ ]/); // Find first non-space char

indent = cap[1].length + (indent > 4 ? 1 : indent); // indented code blocks after 4 spaces; indent is always 1
indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent

itemContents = lines[0].slice(indent - cap[1].length);
itemContents = line.slice(indent);
indent += cap[1].length;
}

blankLine = false;
raw = cap[0];

if (!lines[0] && /^ *$/.test(lines[1])) {
if (!line && /^ *$/.test(nextLine)) {
// items begin with at most one blank line
raw = cap[1] + lines.slice(0, 2).join('\n') + '\n';
raw += nextLine + '\n';
src = src.substring(nextLine.length + 1);
list.loose = true;
lines = [];
}

var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])");
var rawLine = void 0;

for (i = 1; i < lines.length; i++) {
line = lines[i];
while (src && !list.loose) {
rawLine = src.split('\n', 1)[0];
line = rawLine;

if (this.options.pedantic) {
// Re-align to follow commonmark nesting rules
Expand All @@ -600,7 +606,6 @@


if (nextBulletRegex.test(line)) {
raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
break;
} // Until we encounter a blank line, item contents do not need indentation

Expand All @@ -618,16 +623,19 @@
itemContents += '\n' + line;
}

raw += rawLine + '\n';
src = src.substring(rawLine.length + 1);
continue;
} // Dedent this line


if (line.search(/[^ ]/) >= indent || !line.trim()) {
itemContents += '\n' + line.slice(indent);
raw += rawLine + '\n';
src = src.substring(rawLine.length + 1);
continue;
} else {
// Line was not properly indented; end of this item
raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
break;
}
}
Expand Down Expand Up @@ -659,8 +667,7 @@
loose: false,
text: itemContents
});
list.raw += raw;
src = src.slice(raw.length);
list.raw += raw; //src = src.slice(raw.length);
} // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic


Expand All @@ -673,11 +680,10 @@
this.lexer.state.top = false;
list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);

if (list.items[i].tokens.some(function (t) {
if (!list.loose && list.items[i].tokens.some(function (t) {
return t.type === 'space';
})) {
list.loose = true;
list.items[i].loose = true;
list.loose = true; //list.items[i].loose = true;
}
}

Expand Down
6 changes: 6 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.