From 41bde381edbb97cced2be61117ec6a803aa73fe3 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Fri, 8 Nov 2019 18:20:45 -0500 Subject: [PATCH] cpp: Fix highlighting of unterminated raw strings PR #1897 switched C++ raw strings to use backreferences, however this breaks source files where raw strings are truncated. Like comments, it would be preferable to highlight them. Instead, go back to using separate begin and end regexps, but introduce an endFilter feature to filter out false positive matches. This internally works similarly to endSameAsBegin. See also issue #2259. --- docs/reference.rst | 22 ++++++++++++++++++- src/highlight.js | 16 +++++++++----- src/languages/cpp.js | 11 +++++++++- .../cpp/truncated-block-comment.expect.txt | 3 +++ test/markup/cpp/truncated-block-comment.txt | 2 ++ .../cpp/truncated-raw-string.expect.txt | 5 +++++ test/markup/cpp/truncated-raw-string.txt | 4 ++++ 7 files changed, 55 insertions(+), 8 deletions(-) create mode 100644 test/markup/cpp/truncated-block-comment.expect.txt create mode 100644 test/markup/cpp/truncated-block-comment.txt create mode 100644 test/markup/cpp/truncated-raw-string.expect.txt create mode 100644 test/markup/cpp/truncated-raw-string.txt diff --git a/docs/reference.rst b/docs/reference.rst index dafee4abd2..8b79c3e644 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -186,7 +186,7 @@ endSameAsBegin Acts as ``end`` matching exactly the same string that was found by the corresponding ``begin`` regexp. -For example, in PostgreSQL string constants can uee "dollar quotes", +For example, in PostgreSQL string constants can use "dollar quotes", consisting of a dollar sign, an optional tag of zero or more characters, and another dollar sign. String constant must be ended with the same construct using the same tag. 
It is possible to nest dollar-quoted string @@ -204,6 +204,26 @@ In this case you can't simply specify the same regexp for ``begin`` and ``end`` (say, ``"\\$[a-z]\\$"``), but you can use ``begin: "\\$[a-z]\\$"`` and ``endSameAsBegin: true``. +.. _endFilter: + +endFilter +^^^^^^^^^ + +**type**: function + +Filters ``end`` matches to implement end rules that cannot be expressed as a +standalone regular expression. + +This should be a function which takes two string parameters, the string that +matched the ``begin`` regexp and the string that matched the ``end`` regexp. It +should return true to end the mode and false otherwise. + +For example, C++11 raw string constants use syntax like ``R"tag(.....)tag"``, +where ``tag`` is any zero to sixteen character string that must be repeated at +the end. This could be matched with a single regexp containing backreferences, +but truncated raw strings would not highlight. Instead, ``endFilter`` can be +used to reject ``)tag"`` delimiters which do not match the starting value. + .. _lexemes: lexemes diff --git a/src/highlight.js b/src/highlight.js index bbeb712ad6..bac0a5fbee 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -501,15 +501,19 @@ https://highlightjs.org/ return new RegExp(value.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'), 'm'); } - function endOfMode(mode, lexeme) { - if (testRe(mode.endRe, lexeme)) { + function endOfMode(mode, matchPlusRemainder, lexeme) { + var modeEnded = testRe(mode.endRe, matchPlusRemainder); + if (modeEnded && mode.endFilter) { + modeEnded = mode.endFilter(mode.beginValue, lexeme); + } + if (modeEnded) { while (mode.endsParent && mode.parent) { mode = mode.parent; } return mode; } if (mode.endsWithParent) { - return endOfMode(mode.parent, lexeme); + return endOfMode(mode.parent, matchPlusRemainder, lexeme); } } @@ -585,9 +589,9 @@ https://highlightjs.org/ mode_buffer = ''; } - function startNewMode(mode) { + function startNewMode(mode, lexeme) { result += mode.className? 
buildSpan(mode.className, '', true): ''; - top = Object.create(mode, {parent: {value: top}}); + top = Object.create(mode, {parent: {value: top}, beginValue: {value: lexeme}}); } @@ -617,7 +621,7 @@ https://highlightjs.org/ function doEndMatch(match) { var lexeme = match[0]; var matchPlusRemainder = value.substr(match.index); - var end_mode = endOfMode(top, matchPlusRemainder); + var end_mode = endOfMode(top, matchPlusRemainder, lexeme); if (!end_mode) { return; } var origin = top; diff --git a/src/languages/cpp.js b/src/languages/cpp.js index 29c33263b6..26646d3854 100644 --- a/src/languages/cpp.js +++ b/src/languages/cpp.js @@ -27,7 +27,16 @@ function(hljs) { begin: '(u8?|U|L)?\'(' + CHARACTER_ESCAPES + "|.)", end: '\'', illegal: '.' }, - { begin: /(?:u8?|U|L)?R"([^()\\ ]{0,16})\((?:.|\n)*?\)\1"/ } + { + begin: /(?:u8?|U|L)?R"[^()\\ ]{0,16}\(/, + end: /\)[^()\\ ]{0,16}"/, + endFilter: function(begin, end) { + var quote = begin.indexOf('"'); + var beginDelimiter = begin.substring(quote + 1, begin.length - 1); + var endDelimiter = end.substring(1, end.length - 1); + return beginDelimiter == endDelimiter; + }, + } ] }; diff --git a/test/markup/cpp/truncated-block-comment.expect.txt b/test/markup/cpp/truncated-block-comment.expect.txt new file mode 100644 index 0000000000..a2f5ce048a --- /dev/null +++ b/test/markup/cpp/truncated-block-comment.expect.txt @@ -0,0 +1,3 @@ +/* +Truncated block comment + diff --git a/test/markup/cpp/truncated-block-comment.txt b/test/markup/cpp/truncated-block-comment.txt new file mode 100644 index 0000000000..b266bf0806 --- /dev/null +++ b/test/markup/cpp/truncated-block-comment.txt @@ -0,0 +1,2 @@ +/* +Truncated block comment diff --git a/test/markup/cpp/truncated-raw-string.expect.txt b/test/markup/cpp/truncated-raw-string.expect.txt new file mode 100644 index 0000000000..8d133e8bae --- /dev/null +++ b/test/markup/cpp/truncated-raw-string.expect.txt @@ -0,0 +1,5 @@ +R"foo( +Truncated raw string +)nope" +Still not completed. 
+ diff --git a/test/markup/cpp/truncated-raw-string.txt b/test/markup/cpp/truncated-raw-string.txt new file mode 100644 index 0000000000..b012c82bfe --- /dev/null +++ b/test/markup/cpp/truncated-raw-string.txt @@ -0,0 +1,4 @@ +R"foo( +Truncated raw string +)nope" +Still not completed.