From 3cfa915f05a361a9a39824eeafa284f972d8e81e Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Tue, 25 Feb 2020 20:19:31 -0500 Subject: [PATCH 01/15] nicer more modular solution --- src/highlight.js | 7 +++++++ src/lib/mode_compiler.js | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/src/highlight.js b/src/highlight.js index 2b97470e3a..3236b3d969 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -214,6 +214,12 @@ const HLJS = function(hljs) { var lexeme = match[0]; var new_mode = match.rule; + // __abortIf is consider private API + if (new_mode.__abortIf && new_mode.__abortIf(match)) { + mode_buffer += lexeme; + return lexeme.length; + } + if (new_mode && new_mode.endSameAsBegin) { new_mode.endRe = regex.escape( lexeme ); } @@ -358,6 +364,7 @@ const HLJS = function(hljs) { while (true) { top.terminators.lastIndex = index; match = top.terminators.exec(codeToHighlight); + // console.log("match", match[0], match.rule && match.rule.begin) if (!match) break; let beforeMatch = codeToHighlight.substring(index, match.index); diff --git a/src/lib/mode_compiler.js b/src/lib/mode_compiler.js index d468022a28..24386e416f 100644 --- a/src/lib/mode_compiler.js +++ b/src/lib/mode_compiler.js @@ -70,6 +70,14 @@ export function compileLanguage(language) { return matcher; } + function abortIf_preceedingOrTrailingDot(match) { + let before = match.input[match.index-1]; + let after = match.input[match.index + match[0].length]; + if (before === "." 
|| after === ".") { + return true; + } + } + function compileMode(mode, parent) { if (mode.compiled) return; @@ -89,6 +97,7 @@ export function compileLanguage(language) { // doesn't allow spaces in keywords anyways and we still check for the boundary // first mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?=\\b|\\s)'; + mode.__abortIf = abortIf_preceedingOrTrailingDot; } if (!mode.begin) mode.begin = /\B|\b/; From ce4b7a23e9f3233303dfa87e7746789e2b83bb2b Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Tue, 25 Feb 2020 23:12:02 -0500 Subject: [PATCH 02/15] explain abort vs ignore vs skip --- src/highlight.js | 1 - src/lib/mode_compiler.js | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/highlight.js b/src/highlight.js index 3236b3d969..cd61a7bb34 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -214,7 +214,6 @@ const HLJS = function(hljs) { var lexeme = match[0]; var new_mode = match.rule; - // __abortIf is consider private API if (new_mode.__abortIf && new_mode.__abortIf(match)) { mode_buffer += lexeme; return lexeme.length; diff --git a/src/lib/mode_compiler.js b/src/lib/mode_compiler.js index 24386e416f..85df007665 100644 --- a/src/lib/mode_compiler.js +++ b/src/lib/mode_compiler.js @@ -70,7 +70,9 @@ export function compileLanguage(language) { return matcher; } - function abortIf_preceedingOrTrailingDot(match) { + // HACK: Abort vs ignore is technically broken. (See note below) + // TODO: We need negative look-behind support to do this properly + function hasPrecedingOrTrailingDot(match) { let before = match.input[match.index-1]; let after = match.input[match.index + match[0].length]; if (before === "." 
|| after === ".") { @@ -78,11 +80,44 @@ export function compileLanguage(language) { } } + /** skip vs abort vs ignore + * + * @skip - The mode is still entered and exited normally (and contains rules apply), + * but all content is held and added to the parent buffer rather than being + * output when the mode ends. Mostly used with `sublanguage` to build up + * a single large buffer that can be parsed by sublanguage. + * + * - The mode begins and ends normally. + * - Content matched is added to the parent mode buffer. + * - The parser cursor is moved forward normally. + * + * @abort - A hack placeholder until we have ignore. Aborts the mode (as if it + * never matched) but DOES NOT continue to match subsequent `contains` + * modes. Abort is bad/suboptimal because it can result in modes + * farther down not getting applied because an earlier rule eats the + * content but then aborts. + * + * - The mode does not begin. + * - Content matched by `begin` is added to the mode buffer. + * - The parser cursor is moved forward accordingly. + * + * @ignore - Ignores the mode (as if it never matched) and continues to match any + * subsequent `contains` modes. Ignore isn't technically possible with + * the current parser implementation. + * + * - The mode does not begin. + * - Content matched by `begin` is ignored. + * - The parser cursor is not moved forward. 
+ */ + function compileMode(mode, parent) { if (mode.compiled) return; mode.compiled = true; + // __abortIf is considered private API, internal use only + mode.__abortIf = null; + mode.keywords = mode.keywords || mode.beginKeywords; if (mode.keywords) mode.keywords = compileKeywords(mode.keywords, language.case_insensitive); @@ -97,7 +132,7 @@ export function compileLanguage(language) { // doesn't allow spaces in keywords anyways and we still check for the boundary // first mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?=\\b|\\s)'; - mode.__abortIf = abortIf_preceedingOrTrailingDot; + mode.__abortIf = hasPrecedingOrTrailingDot; } if (!mode.begin) mode.begin = /\B|\b/; From 461ce6dbe2563acd537975baa232988009ba9580 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 14:21:23 -0500 Subject: [PATCH 03/15] MultiMatcher gets its own class --- src/lib/mode_compiler.js | 78 ++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/src/lib/mode_compiler.js b/src/lib/mode_compiler.js index 85df007665..ef264adcbc 100644 --- a/src/lib/mode_compiler.js +++ b/src/lib/mode_compiler.js @@ -15,59 +15,57 @@ export function compileLanguage(language) { ); } - function buildModeRegex(mode) { - - var matchIndexes = {}; - var matcherRe; - var regexes = []; - var matcher = {}; - var matchAt = 1; - - function addRule(rule, re) { - matchIndexes[matchAt] = rule; - regexes.push([rule, re]); - matchAt += regex.countMatchGroups(re) + 1; + class MultiMatcher { + constructor() { + this.matchIndexes = {}; + this.regexes = []; + this.matchAt = 1; } - mode.contains.forEach(term => addRule(term, term.begin)) - - if (mode.terminator_end) - addRule("end", mode.terminator_end); - if (mode.illegal) - addRule("illegal", mode.illegal); - - var terminators = regexes.map(el => el[1]); - matcherRe = langRe(regex.join(terminators, '|'), true); + addRule(re, opts) { + this.matchIndexes[this.matchAt] = opts; + 
this.regexes.push([opts, re]); + this.matchAt += regex.countMatchGroups(re) + 1; + } - matcher.lastIndex = 0; - matcher.exec = function(s) { - var rule; + compile() { + let terminators = this.regexes.map(el => el[1]); + this.matcherRe = langRe(regex.join(terminators, '|'), true); + this.lastIndex = 0; + } - if( regexes.length === 0) return null; + exec(s) { + var matchData; + if (this.regexes.length === 0) return null; - matcherRe.lastIndex = matcher.lastIndex; - var match = matcherRe.exec(s); + this.matcherRe.lastIndex = this.lastIndex; + let match = this.matcherRe.exec(s); if (!match) { return null; } for(var i = 0; i mm.addRule(term.begin, {rule: term, type: "begin" })) + + if (mode.terminator_end) + mm.addRule(mode.terminator_end, {type: "end"} ); + if (mode.illegal) + mm.addRule(mode.illegal, {type: "illegal"} ); - return matcher; + mm.compile(); + return mm; } // HACK: Abort vs ignore is technically broken. (See note below) From 9a97ae7a6fb46ae7ff29584e66c94bd169217227 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 14:55:35 -0500 Subject: [PATCH 04/15] deeper down the rabbit hole --- src/lib/mode_compiler.js | 47 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/src/lib/mode_compiler.js b/src/lib/mode_compiler.js index ef264adcbc..be7c0d48ae 100644 --- a/src/lib/mode_compiler.js +++ b/src/lib/mode_compiler.js @@ -15,14 +15,16 @@ export function compileLanguage(language) { ); } - class MultiMatcher { + class MultiRegex { constructor() { this.matchIndexes = {}; this.regexes = []; this.matchAt = 1; + this.position = 0; } addRule(re, opts) { + opts.position = this.position++; this.matchIndexes[this.matchAt] = opts; this.regexes.push([opts, re]); this.matchAt += regex.countMatchGroups(re) + 1; @@ -53,9 +55,49 @@ export function compileLanguage(language) { } } + class ContinuableMultiRegex { + constructor() { + this.rules = []; + this.multiRegexes = []; + this.count = 0; + + this.lastIndex = 0; 
+ this.startAt = 0; + } + + getMatcher(index) { + if (this.multiRegexes[index]) return this.multiRegexes[index]; + + let matcher = new MultiRegex(); + this.rules.slice(index).forEach(([re, opts])=> matcher.addRule(re,opts)) + matcher.compile(); + this.multiRegexes[index] = matcher; + return matcher; + } + + addRule(re, opts) { + this.rules.push([re, opts]); + if (opts.type==="begin") this.count++; + } + + exec(s) { + let m = this.getMatcher(this.startAt); + m.lastIndex = this.lastIndex; + let result = m.exec(s); + if (result) { + this.startAt += result.position + 1; + if (this.startAt === this.count) // wrap-around + this.startAt = 0; + } + + this.startAt = 0; + return result; + } + } + function buildModeRegex(mode) { - let mm = new MultiMatcher(); + let mm = new ContinuableMultiRegex(); mode.contains.forEach(term => mm.addRule(term.begin, {rule: term, type: "begin" })) @@ -64,7 +106,6 @@ export function compileLanguage(language) { if (mode.illegal) mm.addRule(mode.illegal, {type: "illegal"} ); - mm.compile(); return mm; } From 36c72fec210e323677dd399b3828707fe51c376f Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 16:19:19 -0500 Subject: [PATCH 05/15] document --- src/lib/mode_compiler.js | 51 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/src/lib/mode_compiler.js b/src/lib/mode_compiler.js index be7c0d48ae..95104fd1be 100644 --- a/src/lib/mode_compiler.js +++ b/src/lib/mode_compiler.js @@ -15,6 +15,19 @@ export function compileLanguage(language) { ); } + /** + Stores multiple regular expressions and allows you to quickly search for + them all in a string simultaneously - returning the first match. It does + this by creating a huge (a|b|c) regex - each individual item wrapped with () + and joined by `|` - using match groups to track position. 
When a match is + found, checking which position in the array has content allows us to figure + out which of the original regexes / match groups triggered the match. + + The match object itself (the result of `Regex.exec`) is returned but also + enhanced by merging in any meta-data that was registered with the regex. + This is how we keep track of which mode matched, and what type of rule + (`illegal`, `begin`, `end`, etc). + */ class MultiRegex { constructor() { this.matchIndexes = {}; @@ -55,7 +68,38 @@ export function compileLanguage(language) { } } - class ContinuableMultiRegex { + /* + Created to solve the key deficiency with MultiRegex - there is no way to + test for multiple matches at a single location. Why would we need to do + that? In the future a more dynamic engine will allow certain matches to be + ignored. An example: if we matched say the 3rd regex in a large group but + decided to ignore it - we'd need to start testing again at the 4th + regex... but MultiRegex itself gives us no real way to do that. + + So what this class does is create MultiRegexes on the fly for whatever search + position they are needed. + + NOTE: These additional MultiRegex objects are created dynamically. For most + grammars most of the time we will never actually need anything more than the + first MultiRegex - so this shouldn't have too much overhead. + + Say this is our search group, and we match regex3, but wish to ignore it. + + regex1 | regex2 | regex3 | regex4 | regex5 ' ie, startAt = 0 + + What we need is a new MultiRegex that only includes the remaining + possibilities: + + regex4 | regex5 ' ie, startAt = 3 + + This class wraps all that complexity up in a simple API... `startAt` decides + where in the array of expressions to start doing the matching. It + auto-increments, so if a match is found at position 2, then startAt will be + set to 3. If the end is reached startAt will return to 0. + + MOST of the time the parser will be setting startAt manually to 0. 
+ */ + class ResumableMultiRegex { constructor() { this.rules = []; this.multiRegexes = []; @@ -90,14 +134,14 @@ export function compileLanguage(language) { this.startAt = 0; } - this.startAt = 0; + // this.startAt = 0; return result; } } function buildModeRegex(mode) { - let mm = new ContinuableMultiRegex(); + let mm = new ResumableMultiRegex(); mode.contains.forEach(term => mm.addRule(term.begin, {rule: term, type: "begin" })) @@ -109,7 +153,6 @@ export function compileLanguage(language) { return mm; } - // HACK: Abort vs ignore is technically broken. (See note below) // TODO: We need negative look-behind support to do this properly function hasPrecedingOrTrailingDot(match) { let before = match.input[match.index-1]; From 09103044c137daef36500d985d481b13f822dbee Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 16:27:45 -0500 Subject: [PATCH 06/15] tweaks --- src/highlight.js | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/highlight.js b/src/highlight.js index cd61a7bb34..09d55f2860 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -207,18 +207,35 @@ const HLJS = function(hljs) { emitter.openNode(mode.className) } top = Object.create(mode, {parent: {value: top}}); + top.terminators.startAt = 0; } + function doIgnore(lexeme) { + if (top.terminators.startAt === 0) { + // no more regexs to potentially match here, so we move the cursor forward one + // space + mode_buffer += lexeme[0]; + return 1; + } else { + // no need to move the cursor, we still have additional regexes to try and + // match at this very spot + return 0; + } + } function doBeginMatch(match) { var lexeme = match[0]; var new_mode = match.rule; if (new_mode.__abortIf && new_mode.__abortIf(match)) { - mode_buffer += lexeme; - return lexeme.length; + return doIgnore(lexeme); + // mode_buffer += lexeme; + // return lexeme.length; } + // we are not ignoring + top.terminators.startAt = 0; + if (new_mode && new_mode.endSameAsBegin) { 
new_mode.endRe = regex.escape( lexeme ); } @@ -242,6 +259,7 @@ const HLJS = function(hljs) { var lexeme = match[0]; var matchPlusRemainder = codeToHighlight.substr(match.index); var end_mode = endOfMode(top, matchPlusRemainder); + top.terminators.startAt = 0; if (!end_mode) { return; } var origin = top; @@ -297,6 +315,8 @@ const HLJS = function(hljs) { return 0; } + + // we've found a 0 width match and we're stuck, so we need to advance // this happens when we have badly behaved rules that have optional matchers to the degree that // sometimes they can end up matching nothing at all @@ -360,6 +380,7 @@ const HLJS = function(hljs) { var match, processedCount, index = 0; try { + top.terminators.startAt = 0; while (true) { top.terminators.lastIndex = index; match = top.terminators.exec(codeToHighlight); From 7f5a0872af685d84a3ec0ab23cb7bb79a0adb335 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 17:27:36 -0500 Subject: [PATCH 07/15] rename api --- src/highlight.js | 10 +++++----- src/lib/mode_compiler.js | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/highlight.js b/src/highlight.js index 09d55f2860..cbac94c45e 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -227,13 +227,13 @@ const HLJS = function(hljs) { var lexeme = match[0]; var new_mode = match.rule; - if (new_mode.__abortIf && new_mode.__abortIf(match)) { - return doIgnore(lexeme); - // mode_buffer += lexeme; - // return lexeme.length; + if (new_mode.__onBegin) { + let res = new_mode.__onBegin(match) || {}; + if (res.ignoreMatch) + return doIgnore(lexeme); } - // we are not ignoring + // we are not ignoring, so next match should start with first regex in stack top.terminators.startAt = 0; if (new_mode && new_mode.endSameAsBegin) { diff --git a/src/lib/mode_compiler.js b/src/lib/mode_compiler.js index 95104fd1be..fcdde55847 100644 --- a/src/lib/mode_compiler.js +++ b/src/lib/mode_compiler.js @@ -154,11 +154,11 @@ export function 
compileLanguage(language) { } // TODO: We need negative look-behind support to do this properly - function hasPrecedingOrTrailingDot(match) { + function skipIfhasPrecedingOrTrailingDot(match) { let before = match.input[match.index-1]; let after = match.input[match.index + match[0].length]; if (before === "." || after === ".") { - return true; + return {ignoreMatch: true }; } } @@ -197,8 +197,8 @@ export function compileLanguage(language) { return; mode.compiled = true; - // __abortIf is considered private API, internal use only - mode.__abortIf = null; + // __onBegin is considered private API, internal use only + mode.__onBegin = null; mode.keywords = mode.keywords || mode.beginKeywords; if (mode.keywords) @@ -214,7 +214,7 @@ export function compileLanguage(language) { // doesn't allow spaces in keywords anyways and we still check for the boundary // first mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?=\\b|\\s)'; - mode.__abortIf = hasPrecedingOrTrailingDot; + mode.__onBegin = skipIfhasPrecedingOrTrailingDot; } if (!mode.begin) mode.begin = /\B|\b/; From c1bdbd587aa1c2674cafc4076d6040c82c6cb9e5 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 17:40:52 -0500 Subject: [PATCH 08/15] add test --- test/api/beginKeywords.js | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 test/api/beginKeywords.js diff --git a/test/api/beginKeywords.js b/test/api/beginKeywords.js new file mode 100644 index 0000000000..53bf85b796 --- /dev/null +++ b/test/api/beginKeywords.js @@ -0,0 +1,51 @@ +'use strict'; + +const hljs = require('../../build'); + +let grammar = function() { + return { + contains: [ + { beginKeywords: "class" } + ] + } +} + +let grammarWithFollowupRule = function() { + return { + contains: [ + { beginKeywords: "class" }, + { begin: "class", className: "found" } + ] + } +} + +describe('beginKeywords', () => { + before( () => { + hljs.registerLanguage("test", grammar); + 
hljs.registerLanguage("has-followup", grammarWithFollowupRule); + }); + + it("should allow subsequence matches to still succeed", () => { + let content = "A.class = self"; + let res = hljs.highlight("has-followup", content); + res.value.should.equal('A.class = self'); + }); + + it("should ignore a preceeding .", () => { + let content = "A.class = self"; + let res = hljs.highlight("test", content); + res.value.should.equal('A.class = self'); + }); + + it("should ignore a trailing .", () => { + let content = "class.config = true"; + let res = hljs.highlight("test", content); + res.value.should.equal('class.config = true'); + }); + + it('should detect keywords', () => { + let content = "I have a class yes I do."; + let res = hljs.highlight("test", content); + res.value.should.equal('I have a class yes I do.'); + }); +}); From 8d3656c3b63cbf0fede37e8fedce2cf379fabaf0 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 17:53:38 -0500 Subject: [PATCH 09/15] add docs --- docs/reference.rst | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/reference.rst b/docs/reference.rst index 91aff26029..e8e913d008 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -89,21 +89,25 @@ Used instead of ``begin`` for modes starting with keywords to avoid needless rep :: { - begin: '\\b(extends|implements) ', - keywords: 'extends implements' + begin: '\\b(class|interface) ', + keywords: 'class interface' } -… becomes: +… can often be shortened to: :: { - beginKeywords: 'extends implements' + beginKeywords: 'class interface' } Unlike the :ref:`keywords ` attribute, this one allows only a simple list of space separated keywords. If you do need additional features of ``keywords`` or you just need more keywords for this mode you may include ``keywords`` along with ``beginKeywords``. +Note: ``beginKeywords`` also checks for a ``.`` before or after the keywords and will fail to match if one is found. 
+This is to avoid false positives for method calls or property accesses. + +Ex. ``class A { ... }`` would match while ``A.class == B.class`` would not. .. _endsWithParent: From 034057dd995b7e113ae58ae1cf57327d2aed3c12 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 18:06:15 -0500 Subject: [PATCH 10/15] simplify code --- src/lib/mode_compiler.js | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/lib/mode_compiler.js b/src/lib/mode_compiler.js index fcdde55847..2916099550 100644 --- a/src/lib/mode_compiler.js +++ b/src/lib/mode_compiler.js @@ -44,25 +44,22 @@ export function compileLanguage(language) { } compile() { + if (this.regexes.length === 0) { + // avoids the need to check length every time exec is called + this.exec = () => null; + } let terminators = this.regexes.map(el => el[1]); this.matcherRe = langRe(regex.join(terminators, '|'), true); this.lastIndex = 0; } exec(s) { - var matchData; - if (this.regexes.length === 0) return null; - this.matcherRe.lastIndex = this.lastIndex; let match = this.matcherRe.exec(s); if (!match) { return null; } - for(var i = 0; i i>0 && el!=undefined); + let matchData = this.matchIndexes[i]; return Object.assign(match, matchData); } From 0f71edd8134c1399f207deb688da96467199d216 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 18:35:03 -0500 Subject: [PATCH 11/15] update docs --- docs/reference.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference.rst b/docs/reference.rst index e8e913d008..8d5b0b80e6 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -89,7 +89,7 @@ Used instead of ``begin`` for modes starting with keywords to avoid needless rep :: { - begin: '\\b(class|interface) ', + begin: '\\b(class|interface)\\b', keywords: 'class interface' } From e5bfaf20448763245b4965998b9fec9cd33eca3d Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 19:01:51 -0500 Subject: [PATCH 12/15] reset regex index when 
stuck --- src/highlight.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/highlight.js b/src/highlight.js index cbac94c45e..0a4eda023b 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -322,6 +322,7 @@ const HLJS = function(hljs) { // sometimes they can end up matching nothing at all // Ref: https://github.com/highlightjs/highlight.js/issues/2140 if (lastMatch.type=="begin" && match.type=="end" && lastMatch.index == match.index && lexeme === "") { + top.terminators.startAt = 0; // spit the "skipped" character that our regex choked on back into the output sequence mode_buffer += codeToHighlight.slice(match.index, match.index + 1); if (!SAFE_MODE) { From e09c9902f448a02eb3f0c116b6706b1c53cc748f Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 27 Feb 2020 19:19:05 -0500 Subject: [PATCH 13/15] DRY --- src/highlight.js | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/highlight.js b/src/highlight.js index 0a4eda023b..b0da8f6159 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -207,7 +207,6 @@ const HLJS = function(hljs) { emitter.openNode(mode.className) } top = Object.create(mode, {parent: {value: top}}); - top.terminators.startAt = 0; } function doIgnore(lexeme) { @@ -219,6 +218,7 @@ const HLJS = function(hljs) { } else { // no need to move the cursor, we still have additional regexes to try and // match at this very spot + findNextRegexMatch = true; return 0; } } @@ -233,9 +233,6 @@ const HLJS = function(hljs) { return doIgnore(lexeme); } - // we are not ignoring, so next match should start with first regex in stack - top.terminators.startAt = 0; - if (new_mode && new_mode.endSameAsBegin) { new_mode.endRe = regex.escape( lexeme ); } @@ -259,7 +256,6 @@ const HLJS = function(hljs) { var lexeme = match[0]; var matchPlusRemainder = codeToHighlight.substr(match.index); var end_mode = endOfMode(top, matchPlusRemainder); - top.terminators.startAt = 0; if (!end_mode) { return; } var origin = top; @@ -322,7 +318,6 
@@ const HLJS = function(hljs) { // sometimes they can end up matching nothing at all // Ref: https://github.com/highlightjs/highlight.js/issues/2140 if (lastMatch.type=="begin" && match.type=="end" && lastMatch.index == match.index && lexeme === "") { - top.terminators.startAt = 0; // spit the "skipped" character that our regex choked on back into the output sequence mode_buffer += codeToHighlight.slice(match.index, match.index + 1); if (!SAFE_MODE) { @@ -381,9 +376,16 @@ const HLJS = function(hljs) { var match, processedCount, index = 0; try { + var findNextRegexMatch = false; top.terminators.startAt = 0; + while (true) { top.terminators.lastIndex = index; + if (findNextRegexMatch) { + findNextRegexMatch = false; + } else { + top.terminators.startAt = 0; + } match = top.terminators.exec(codeToHighlight); // console.log("match", match[0], match.rule && match.rule.begin) if (!match) From e1370794c01fc009bab3b33b9c321727704f3d31 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Sun, 1 Mar 2020 21:15:47 -0500 Subject: [PATCH 14/15] try to improve clarity with naming --- src/highlight.js | 20 +++++++++++--------- src/lib/mode_compiler.js | 18 +++++++++++------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/highlight.js b/src/highlight.js index b0da8f6159..24fb7d3395 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -210,7 +210,7 @@ const HLJS = function(hljs) { } function doIgnore(lexeme) { - if (top.terminators.startAt === 0) { + if (top.matcher.regexIndex === 0) { // no more regexs to potentially match here, so we move the cursor forward one // space mode_buffer += lexeme[0]; @@ -218,7 +218,7 @@ const HLJS = function(hljs) { } else { // no need to move the cursor, we still have additional regexes to try and // match at this very spot - findNextRegexMatch = true; + continueScanAtSamePosition = true; return 0; } } @@ -376,17 +376,19 @@ const HLJS = function(hljs) { var match, processedCount, index = 0; try { - var findNextRegexMatch = false; 
- top.terminators.startAt = 0; + var continueScanAtSamePosition = false; + top.matcher.considerAll(); while (true) { - top.terminators.lastIndex = index; - if (findNextRegexMatch) { - findNextRegexMatch = false; + if (continueScanAtSamePosition) { + continueScanAtSamePosition = false; + // only regexes not matched previously will now be + // considered for a potential match } else { - top.terminators.startAt = 0; + top.matcher.lastIndex = index; + top.matcher.considerAll(); } - match = top.terminators.exec(codeToHighlight); + match = top.matcher.exec(codeToHighlight); // console.log("match", match[0], match.rule && match.rule.begin) if (!match) break; diff --git a/src/lib/mode_compiler.js b/src/lib/mode_compiler.js index 2916099550..10fd000483 100644 --- a/src/lib/mode_compiler.js +++ b/src/lib/mode_compiler.js @@ -103,7 +103,7 @@ export function compileLanguage(language) { this.count = 0; this.lastIndex = 0; - this.startAt = 0; + this.regexIndex = 0; } getMatcher(index) { @@ -116,22 +116,26 @@ export function compileLanguage(language) { return matcher; } + considerAll() { + this.regexIndex = 0; + } + addRule(re, opts) { this.rules.push([re, opts]); if (opts.type==="begin") this.count++; } exec(s) { - let m = this.getMatcher(this.startAt); + let m = this.getMatcher(this.regexIndex); m.lastIndex = this.lastIndex; let result = m.exec(s); if (result) { - this.startAt += result.position + 1; - if (this.startAt === this.count) // wrap-around - this.startAt = 0; + this.regexIndex += result.position + 1; + if (this.regexIndex === this.count) // wrap-around + this.regexIndex = 0; } - // this.startAt = 0; + // this.regexIndex = 0; return result; } } @@ -242,7 +246,7 @@ export function compileLanguage(language) { compileMode(mode.starts, parent); } - mode.terminators = buildModeRegex(mode); + mode.matcher = buildModeRegex(mode); } // self is not valid at the top-level From a24226769a0de6c80e87919332536a488fb1a717 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Mon, 2 Mar 
2020 16:04:14 -0500 Subject: [PATCH 15/15] update changelog --- CHANGES.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 07cb83922a..a1bf0402fa 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,7 +2,7 @@ New languages: -- (php-template) Explicit language to detect PHP templates (vs xml) [Josh Goebel][] +- add(php-template) Explicit language to detect PHP templates (vs xml) [Josh Goebel][] - enh(python) Added `python-repl` for Python REPL sessions New themes: @@ -11,12 +11,13 @@ New themes: Parser Engine Changes: -- add `before:highlight` plugin API callback (#2395) [Josh Goebel][] -- add `after:highlight` plugin API callback (#2395) [Josh Goebel][] -- split out parse tree generation and HTML rendering concerns (#2404) [Josh Goebel][] -- every language can have a `name` attribute now (#2400) [Josh Goebel][] -- improve regular expression detect (less false-positives) (#2380) [Josh Goebel][] -- make `noHighlightRe` and `languagePrefixRe` configurable (#2374) [Josh Goebel][] +- (bug) Fix `beginKeywords` to ignore . matches (#2434) [Josh Goebel][] +- (enh) add `before:highlight` plugin API callback (#2395) [Josh Goebel][] +- (enh) add `after:highlight` plugin API callback (#2395) [Josh Goebel][] +- (enh) split out parse tree generation and HTML rendering concerns (#2404) [Josh Goebel][] +- (enh) every language can have a `name` attribute now (#2400) [Josh Goebel][] +- (enh) improve regular expression detect (less false-positives) (#2380) [Josh Goebel][] +- (enh) make `noHighlightRe` and `languagePrefixRe` configurable (#2374) [Josh Goebel][] Language Improvements: