From 8c248fd7107c0e98b8cbef14220e4ce91c8f5637 Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Mon, 2 Mar 2020 16:32:55 -0500 Subject: [PATCH] (bug) Fix `beginKeywords` to ignore . matches (#2434) - split out complex multi-regex matching into discrete classes - new classes allow multiple regex matches at the same index position so if a one mode match is found unsuitable another mode matchcan be attempted - add private ` __onBegin` hook for mode matching (to allow rejecting `.` matches for `beginKeywords`) --- CHANGES.md | 15 +-- docs/reference.rst | 12 ++- src/highlight.js | 36 ++++++- src/lib/mode_compiler.js | 209 ++++++++++++++++++++++++++++++-------- test/api/beginKeywords.js | 51 ++++++++++ 5 files changed, 269 insertions(+), 54 deletions(-) create mode 100644 test/api/beginKeywords.js diff --git a/CHANGES.md b/CHANGES.md index 07cb83922a..a1bf0402fa 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,7 +2,7 @@ New languages: -- (php-template) Explicit language to detect PHP templates (vs xml) [Josh Goebel][] +- add(php-template) Explicit language to detect PHP templates (vs xml) [Josh Goebel][] - enh(python) Added `python-repl` for Python REPL sessions New themes: @@ -11,12 +11,13 @@ New themes: Parser Engine Changes: -- add `before:highlight` plugin API callback (#2395) [Josh Goebel][] -- add `after:highlight` plugin API callback (#2395) [Josh Goebel][] -- split out parse tree generation and HTML rendering concerns (#2404) [Josh Goebel][] -- every language can have a `name` attribute now (#2400) [Josh Goebel][] -- improve regular expression detect (less false-positives) (#2380) [Josh Goebel][] -- make `noHighlightRe` and `languagePrefixRe` configurable (#2374) [Josh Goebel][] +- (bug) Fix `beginKeywords` to ignore . matches (#2434) [Josh Goebel][] +- (enh) add `before:highlight` plugin API callback (#2395) [Josh Goebel][] +- (enh) add `after:highlight` plugin API callback (#2395) [Josh Goebel][] +- (enh) split out parse tree generation and HTML rendering concerns (#2404) [Josh Goebel][] +- (enh) every language can have a `name` attribute now (#2400) [Josh Goebel][] +- (enh) improve regular expression detect (less false-positives) (#2380) [Josh Goebel][] +- (enh) make `noHighlightRe` and `languagePrefixRe` configurable (#2374) [Josh Goebel][] Language Improvements: diff --git a/docs/reference.rst b/docs/reference.rst index 91aff26029..8d5b0b80e6 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -89,21 +89,25 @@ Used instead of ``begin`` for modes starting with keywords to avoid needless rep :: { - begin: '\\b(extends|implements) ', - keywords: 'extends implements' + begin: '\\b(class|interface)\\b', + keywords: 'class interface' } -… becomes: +… can often be shortened to: :: { - beginKeywords: 'extends implements' + beginKeywords: 'class interface' } Unlike the :ref:`keywords ` attribute, this one allows only a simple list of space separated keywords. If you do need additional features of ``keywords`` or you just need more keywords for this mode you may include ``keywords`` along with ``beginKeywords``. +Note: ``beginKeywords`` also checks for a ``.`` before or after the keywords and will fail to match if one is found. +This is to avoid false positives for method calls or property accesses. + +Ex. ``class A { ... }`` would match while ``A.class == B.class`` would not. .. _endsWithParent: diff --git a/src/highlight.js b/src/highlight.js index 2b97470e3a..24fb7d3395 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -209,11 +209,30 @@ const HLJS = function(hljs) { top = Object.create(mode, {parent: {value: top}}); } + function doIgnore(lexeme) { + if (top.matcher.regexIndex === 0) { + // no more regexs to potentially match here, so we move the cursor forward one + // space + mode_buffer += lexeme[0]; + return 1; + } else { + // no need to move the cursor, we still have additional regexes to try and + // match at this very spot + continueScanAtSamePosition = true; + return 0; + } + } function doBeginMatch(match) { var lexeme = match[0]; var new_mode = match.rule; + if (new_mode.__onBegin) { + let res = new_mode.__onBegin(match) || {}; + if (res.ignoreMatch) + return doIgnore(lexeme); + } + if (new_mode && new_mode.endSameAsBegin) { new_mode.endRe = regex.escape( lexeme ); } @@ -292,6 +311,8 @@ const HLJS = function(hljs) { return 0; } + + // we've found a 0 width match and we're stuck, so we need to advance // this happens when we have badly behaved rules that have optional matchers to the degree that // sometimes they can end up matching nothing at all @@ -355,9 +376,20 @@ const HLJS = function(hljs) { var match, processedCount, index = 0; try { + var continueScanAtSamePosition = false; + top.matcher.considerAll(); + while (true) { - top.terminators.lastIndex = index; - match = top.terminators.exec(codeToHighlight); + if (continueScanAtSamePosition) { + continueScanAtSamePosition = false; + // only regexes not matched previously will now be + // considered for a potential match + } else { + top.matcher.lastIndex = index; + top.matcher.considerAll(); + } + match = top.matcher.exec(codeToHighlight); + // console.log("match", match[0], match.rule && match.rule.begin) if (!match) break; let beforeMatch = codeToHighlight.substring(index, match.index); diff --git a/src/lib/mode_compiler.js b/src/lib/mode_compiler.js index d468022a28..10fd000483 100644 --- a/src/lib/mode_compiler.js +++ b/src/lib/mode_compiler.js @@ -15,66 +15,192 @@ export function compileLanguage(language) { ); } - function buildModeRegex(mode) { + /** + Stores multiple regular expressions and allows you to quickly search for + them all in a string simultaneously - returning the first match. It does + this by creating a huge (a|b|c) regex - each individual item wrapped with () + and joined by `|` - using match groups to track position. When a match is + found checking which position in the array has content allows us to figure + out which of the original regexes / match groups triggered the match. + + The match object itself (the result of `Regex.exec`) is returned but also + enhanced by merging in any meta-data that was registered with the regex. + This is how we keep track of which mode matched, and what type of rule + (`illegal`, `begin`, end, etc). + */ + class MultiRegex { + constructor() { + this.matchIndexes = {}; + this.regexes = []; + this.matchAt = 1; + this.position = 0; + } - var matchIndexes = {}; - var matcherRe; - var regexes = []; - var matcher = {}; - var matchAt = 1; + addRule(re, opts) { + opts.position = this.position++; + this.matchIndexes[this.matchAt] = opts; + this.regexes.push([opts, re]); + this.matchAt += regex.countMatchGroups(re) + 1; + } - function addRule(rule, re) { - matchIndexes[matchAt] = rule; - regexes.push([rule, re]); - matchAt += regex.countMatchGroups(re) + 1; + compile() { + if (this.regexes.length === 0) { + // avoids the need to check length every time exec is called + this.exec = () => null; + } + let terminators = this.regexes.map(el => el[1]); + this.matcherRe = langRe(regex.join(terminators, '|'), true); + this.lastIndex = 0; } - mode.contains.forEach(term => addRule(term, term.begin)) + exec(s) { + this.matcherRe.lastIndex = this.lastIndex; + let match = this.matcherRe.exec(s); + if (!match) { return null; } - if (mode.terminator_end) - addRule("end", mode.terminator_end); - if (mode.illegal) - addRule("illegal", mode.illegal); + let i = match.findIndex((el, i) => i>0 && el!=undefined); + let matchData = this.matchIndexes[i]; - var terminators = regexes.map(el => el[1]); - matcherRe = langRe(regex.join(terminators, '|'), true); + return Object.assign(match, matchData); + } + } - matcher.lastIndex = 0; - matcher.exec = function(s) { - var rule; + /* + Created to solve the key deficiently with MultiRegex - there is no way to + test for multiple matches at a single location. Why would we need to do + that? In the future a more dynamic engine will allow certain matches to be + ignored. An example: if we matched say the 3rd regex in a large group but + decided to ignore it - we'd need to started testing again at the 4th + regex... but MultiRegex itself gives us no real way to do that. - if( regexes.length === 0) return null; + So what this class creates MultiRegexs on the fly for whatever search + position they are needed. - matcherRe.lastIndex = matcher.lastIndex; - var match = matcherRe.exec(s); - if (!match) { return null; } + NOTE: These additional MultiRegex objects are created dynamically. For most + grammars most of the time we will never actually need anything more than the + first MultiRegex - so this shouldn't have too much overhead. - for(var i = 0; i matcher.addRule(re,opts)) + matcher.compile(); + this.multiRegexes[index] = matcher; + return matcher; + } + + considerAll() { + this.regexIndex = 0; + } + + addRule(re, opts) { + this.rules.push([re, opts]); + if (opts.type==="begin") this.count++; + } - // illegal or end match - if (typeof rule === "string") { - match.type = rule; - match.extra = [mode.illegal, mode.terminator_end]; - } else { - match.type = "begin"; - match.rule = rule; + exec(s) { + let m = this.getMatcher(this.regexIndex); + m.lastIndex = this.lastIndex; + let result = m.exec(s); + if (result) { + this.regexIndex += result.position + 1; + if (this.regexIndex === this.count) // wrap-around + this.regexIndex = 0; } - return match; - }; - return matcher; + // this.regexIndex = 0; + return result; + } + } + + function buildModeRegex(mode) { + + let mm = new ResumableMultiRegex(); + + mode.contains.forEach(term => mm.addRule(term.begin, {rule: term, type: "begin" })) + + if (mode.terminator_end) + mm.addRule(mode.terminator_end, {type: "end"} ); + if (mode.illegal) + mm.addRule(mode.illegal, {type: "illegal"} ); + + return mm; } + // TODO: We need negative look-behind support to do this properly + function skipIfhasPrecedingOrTrailingDot(match) { + let before = match.input[match.index-1]; + let after = match.input[match.index + match[0].length]; + if (before === "." || after === ".") { + return {ignoreMatch: true }; + } + } + + /** skip vs abort vs ignore + * + * @skip - The mode is still entered and exited normally (and contains rules apply), + * but all content is held and added to the parent buffer rather than being + * output when the mode ends. Mostly used with `sublanguage` to build up + * a single large buffer than can be parsed by sublanguage. + * + * - The mode begin ands ends normally. + * - Content matched is added to the parent mode buffer. + * - The parser cursor is moved forward normally. + * + * @abort - A hack placeholder until we have ignore. Aborts the mode (as if it + * never matched) but DOES NOT continue to match subsequent `contains` + * modes. Abort is bad/suboptimal because it can result in modes + * farther down not getting applied because an earlier rule eats the + * content but then aborts. + * + * - The mode does not begin. + * - Content matched by `begin` is added to the mode buffer. + * - The parser cursor is moved forward accordingly. + * + * @ignore - Ignores the mode (as if it never matched) and continues to match any + * subsequent `contains` modes. Ignore isn't technically possible with + * the current parser implementation. + * + * - The mode does not begin. + * - Content matched by `begin` is ignored. + * - The parser cursor is not moved forward. + */ + function compileMode(mode, parent) { if (mode.compiled) return; mode.compiled = true; + // __onBegin is considered private API, internal use only + mode.__onBegin = null; + mode.keywords = mode.keywords || mode.beginKeywords; if (mode.keywords) mode.keywords = compileKeywords(mode.keywords, language.case_insensitive); @@ -89,6 +215,7 @@ export function compileLanguage(language) { // doesn't allow spaces in keywords anyways and we still check for the boundary // first mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?=\\b|\\s)'; + mode.__onBegin = skipIfhasPrecedingOrTrailingDot; } if (!mode.begin) mode.begin = /\B|\b/; @@ -119,7 +246,7 @@ export function compileLanguage(language) { compileMode(mode.starts, parent); } - mode.terminators = buildModeRegex(mode); + mode.matcher = buildModeRegex(mode); } // self is not valid at the top-level diff --git a/test/api/beginKeywords.js b/test/api/beginKeywords.js new file mode 100644 index 0000000000..53bf85b796 --- /dev/null +++ b/test/api/beginKeywords.js @@ -0,0 +1,51 @@ +'use strict'; + +const hljs = require('../../build'); + +let grammar = function() { + return { + contains: [ + { beginKeywords: "class" } + ] + } +} + +let grammarWithFollowupRule = function() { + return { + contains: [ + { beginKeywords: "class" }, + { begin: "class", className: "found" } + ] + } +} + +describe('beginKeywords', () => { + before( () => { + hljs.registerLanguage("test", grammar); + hljs.registerLanguage("has-followup", grammarWithFollowupRule); + }); + + it("should allow subsequence matches to still succeed", () => { + let content = "A.class = self"; + let res = hljs.highlight("has-followup", content); + res.value.should.equal('A.class = self'); + }); + + it("should ignore a preceeding .", () => { + let content = "A.class = self"; + let res = hljs.highlight("test", content); + res.value.should.equal('A.class = self'); + }); + + it("should ignore a trailing .", () => { + let content = "class.config = true"; + let res = hljs.highlight("test", content); + res.value.should.equal('class.config = true'); + }); + + it('should detect keywords', () => { + let content = "I have a class yes I do."; + let res = hljs.highlight("test", content); + res.value.should.equal('I have a class yes I do.'); + }); +});