Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: speed up parsing long lists #2302

Merged
merged 10 commits into from Dec 2, 2021
82 changes: 37 additions & 45 deletions lib/marked.cjs
Expand Up @@ -533,7 +533,7 @@ var Tokenizer = /*#__PURE__*/function () {
var cap = this.rules.block.list.exec(src);

if (cap) {
var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, lines, itemContents;
var raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine, line, nextLine, rawLine, itemContents;
var bull = cap[1].trim();
var isordered = bull.length > 1;
var list = {
Expand All @@ -551,83 +551,77 @@ var Tokenizer = /*#__PURE__*/function () {
} // Get next list item


var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))"); // Get each top-level item
var itemRegex = new RegExp("^( {0,3}" + bull + ")((?: [^\\n]*)?(?:\\n|$))"); // Check if current bullet point can start a new List Item

while (src) {
if (this.rules.block.hr.test(src)) {
// End list if we encounter an HR (possibly move into itemRegex?)
if (!(cap = itemRegex.exec(src))) {
break;
}

if (!(cap = itemRegex.exec(src))) {
if (this.rules.block.hr.test(src)) {
// End list if bullet was actually HR (possibly move into itemRegex?)
break;
}

lines = cap[2].split('\n');
raw = cap[0];
src = src.substring(raw.length);
line = cap[2].split('\n', 1)[0];
nextLine = src.split('\n', 1)[0];

if (this.options.pedantic) {
indent = 2;
itemContents = lines[0].trimLeft();
itemContents = line.trimLeft();
} else {
indent = cap[2].search(/[^ ]/); // Find first non-space char

indent = cap[1].length + (indent > 4 ? 1 : indent); // indented code blocks after 4 spaces; indent is always 1
indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent

itemContents = lines[0].slice(indent - cap[1].length);
itemContents = line.slice(indent);
indent += cap[1].length;
}

blankLine = false;
raw = cap[0];

if (!lines[0] && /^ *$/.test(lines[1])) {
// items begin with at most one blank line
raw = cap[1] + lines.slice(0, 2).join('\n') + '\n';
if (!line && /^ *$/.test(nextLine)) {
// Items begin with at most one blank line
raw += nextLine + '\n';
src = src.substring(nextLine.length + 1);
list.loose = true;
lines = [];
}

var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])");
var nextBulletRegex = new RegExp("^ {0," + Math.min(3, indent - 1) + "}(?:[*+-]|\\d{1,9}[.)])"); // Check if following lines should be included in List Item

for (i = 1; i < lines.length; i++) {
line = lines[i];
while (src && !list.loose) {
rawLine = src.split('\n', 1)[0];
line = rawLine; // Re-align to follow commonmark nesting rules

if (this.options.pedantic) {
// Re-align to follow commonmark nesting rules
line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, ' ');
} // End list item if found start of new bullet


if (nextBulletRegex.test(line)) {
raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
break;
} // Until we encounter a blank line, item contents do not need indentation


if (!blankLine) {
if (!line.trim()) {
// Check if current line is empty
blankLine = true;
} // Dedent if possible


if (line.search(/[^ ]/) >= indent) {
itemContents += '\n' + line.slice(indent);
} else {
itemContents += '\n' + line;
}

continue;
} // Dedent this line

}

if (line.search(/[^ ]/) >= indent || !line.trim()) {
// Dedent if possible
itemContents += '\n' + line.slice(indent);
continue;
} else if (!blankLine) {
// Until blank line, item doesn't need indentation
itemContents += '\n' + line;
} else {
// Line was not properly indented; end of this item
raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
// Otherwise, improper indentation ends this item
break;
}

if (!blankLine && !line.trim()) {
// Check if current line is blank
blankLine = true;
}

raw += rawLine + '\n';
src = src.substring(rawLine.length + 1);
}

if (!list.loose) {
Expand Down Expand Up @@ -658,7 +652,6 @@ var Tokenizer = /*#__PURE__*/function () {
text: itemContents
});
list.raw += raw;
src = src.slice(raw.length);
} // Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic


Expand All @@ -671,11 +664,10 @@ var Tokenizer = /*#__PURE__*/function () {
this.lexer.state.top = false;
list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);

if (list.items[i].tokens.some(function (t) {
if (!list.loose && list.items[i].tokens.some(function (t) {
return t.type === 'space';
})) {
list.loose = true;
list.items[i].loose = true;
list.loose = true; // list.items[i].loose = true;
calculuschild marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand Down
76 changes: 36 additions & 40 deletions lib/marked.esm.js
Expand Up @@ -452,7 +452,7 @@ class Tokenizer {
let cap = this.rules.block.list.exec(src);
if (cap) {
let raw, istask, ischecked, indent, i, blankLine, endsWithBlankLine,
line, lines, itemContents;
line, nextLine, rawLine, itemContents;

let bull = cap[1].trim();
const isordered = bull.length > 1;
Expand All @@ -473,76 +473,73 @@ class Tokenizer {
}

// Get next list item
const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*| *)(?:\\n[^\\n]*)*(?:\\n|$))`);
const itemRegex = new RegExp(`^( {0,3}${bull})((?: [^\\n]*)?(?:\\n|$))`);

// Get each top-level item
// Check if current bullet point can start a new List Item
while (src) {
if (this.rules.block.hr.test(src)) { // End list if we encounter an HR (possibly move into itemRegex?)
if (!(cap = itemRegex.exec(src))) {
break;
}

if (!(cap = itemRegex.exec(src))) {
if (this.rules.block.hr.test(src)) { // End list if bullet was actually HR (possibly move into itemRegex?)
break;
}

lines = cap[2].split('\n');
raw = cap[0];
src = src.substring(raw.length);

line = cap[2].split('\n', 1)[0];
nextLine = src.split('\n', 1)[0];

if (this.options.pedantic) {
indent = 2;
itemContents = lines[0].trimLeft();
itemContents = line.trimLeft();
} else {
indent = cap[2].search(/[^ ]/); // Find first non-space char
indent = cap[1].length + (indent > 4 ? 1 : indent); // indented code blocks after 4 spaces; indent is always 1
itemContents = lines[0].slice(indent - cap[1].length);
indent = indent > 4 ? 1 : indent; // Treat indented code blocks (> 4 spaces) as having only 1 indent
itemContents = line.slice(indent);
indent += cap[1].length;
}

blankLine = false;
raw = cap[0];

if (!lines[0] && /^ *$/.test(lines[1])) { // items begin with at most one blank line
raw = cap[1] + lines.slice(0, 2).join('\n') + '\n';
if (!line && /^ *$/.test(nextLine)) { // Items begin with at most one blank line
raw += nextLine + '\n';
src = src.substring(nextLine.length + 1);
list.loose = true;
lines = [];
}

const nextBulletRegex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])`);

for (i = 1; i < lines.length; i++) {
line = lines[i];
// Check if following lines should be included in List Item
while (src && !list.loose) {
rawLine = src.split('\n', 1)[0];
line = rawLine;

if (this.options.pedantic) { // Re-align to follow commonmark nesting rules
// Re-align to follow commonmark nesting rules
if (this.options.pedantic) {
line = line.replace(/^ {1,4}(?=( {4})*[^ ])/g, ' ');
}

// End list item if found start of new bullet
if (nextBulletRegex.test(line)) {
raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
break;
}

// Until we encounter a blank line, item contents do not need indentation
if (!blankLine) {
if (!line.trim()) { // Check if current line is empty
blankLine = true;
}

// Dedent if possible
if (line.search(/[^ ]/) >= indent) {
itemContents += '\n' + line.slice(indent);
} else {
itemContents += '\n' + line;
}
continue;
}

// Dedent this line
if (line.search(/[^ ]/) >= indent || !line.trim()) {
if (line.search(/[^ ]/) >= indent || !line.trim()) { // Dedent if possible
itemContents += '\n' + line.slice(indent);
continue;
} else { // Line was not properly indented; end of this item
raw = cap[1] + lines.slice(0, i).join('\n') + '\n';
} else if (!blankLine) { // Until blank line, item doesn't need indentation
itemContents += '\n' + line;
} else { // Otherwise, improper indentation ends this item
break;
}

if (!blankLine && !line.trim()) { // Check if current line is blank
blankLine = true;
}

raw += rawLine + '\n';
src = src.substring(rawLine.length + 1);
}

if (!list.loose) {
Expand Down Expand Up @@ -573,7 +570,6 @@ class Tokenizer {
});

list.raw += raw;
src = src.slice(raw.length);
}

// Do not consume newlines at end of final item. Alternatively, make itemRegex *start* with any newlines to simplify/speed up endsWithBlankLine logic
Expand All @@ -587,9 +583,9 @@ class Tokenizer {
for (i = 0; i < l; i++) {
this.lexer.state.top = false;
list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);
if (list.items[i].tokens.some(t => t.type === 'space')) {
if (!list.loose && list.items[i].tokens.some(t => t.type === 'space')) {
list.loose = true;
list.items[i].loose = true;
// list.items[i].loose = true;
calculuschild marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand Down