Skip to content

Commit

Permalink
(chore) Clean up all regexes to be UTF-8 compliant/ready (#2759)
Browse files Browse the repository at this point in the history
Work toward #2756.

Cleans up a lot of incorrect (unnecessarily escaped) regexes that would not compile with the `u` flag. After that, makes some rather large performance improvements (with UTF-8 turned on at least) to `yaml` and `mipsasm`. It looks like the `mipsasm` rules have been wrong all along... as far as I can determine, they are intended to match a literal `.` (otherwise they are far too broad) but were matching any character - which seems to terribly slow down the whole grammar in `u` mode.

The changes consisted largely of:

- Escaping most unescaped { and }
- Removing lots of unneeded escapes for -, <, >, and others.
- Converting strings to regex if it made them simpler, easier to read (editor syntax coloring)
  • Loading branch information
joshgoebel committed Nov 2, 2020
1 parent bab94ff commit 0cee2f3
Show file tree
Hide file tree
Showing 88 changed files with 276 additions and 279 deletions.
2 changes: 1 addition & 1 deletion docs/language-guide.rst
Expand Up @@ -130,7 +130,7 @@ This is commonly used to define nested modes:

{
className: 'object',
begin: '{', end: '}',
begin: /\{/, end: /\}/,
contains: [hljs.QUOTE_STRING_MODE, 'self']
}

Expand Down
2 changes: 1 addition & 1 deletion docs/mode-reference.rst
Expand Up @@ -158,7 +158,7 @@ This is when ``endsWithParent`` comes into play:
::

{
className: 'rules', begin: '{', end: '}',
className: 'rules', begin: /\{/, end: /\}/,
contains: [
{className: 'rule', /* ... */ end: ';', endsWithParent: true}
]
Expand Down
4 changes: 2 additions & 2 deletions src/languages/actionscript.js
Expand Up @@ -34,12 +34,12 @@ export default function(hljs) {
hljs.C_NUMBER_MODE,
{
className: 'class',
beginKeywords: 'package', end: '{',
beginKeywords: 'package', end: /\{/,
contains: [hljs.TITLE_MODE]
},
{
className: 'class',
beginKeywords: 'class interface', end: '{', excludeEnd: true,
beginKeywords: 'class interface', end: /\{/, excludeEnd: true,
contains: [
{
beginKeywords: 'extends implements'
Expand Down
2 changes: 1 addition & 1 deletion src/languages/ada.js
Expand Up @@ -37,7 +37,7 @@ export default function(hljs) {
var ID_REGEX = '[A-Za-z](_?[A-Za-z0-9.])*';

// bad chars, only allowed in literals
var BAD_CHARS = `[]{}%#'"`;
var BAD_CHARS = `[]\\{\\}%#'"`;

// Ada doesn't have block comments, only line comments
var COMMENTS = hljs.COMMENT('--', '$');
Expand Down
6 changes: 3 additions & 3 deletions src/languages/angelscript.js
Expand Up @@ -37,7 +37,7 @@ export default function(hljs) {
'abstract|0 try catch protected explicit property',

// avoid close detection with C# and JS
illegal: '(^using\\s+[A-Za-z0-9_\\.]+;$|\\bfunction\s*[^\\(])',
illegal: '(^using\\s+[A-Za-z0-9_\\.]+;$|\\bfunction\\s*[^\\(])',

contains: [
{ // 'strings'
Expand Down Expand Up @@ -71,7 +71,7 @@ export default function(hljs) {
},

{ // interface or namespace declaration
beginKeywords: 'interface namespace', end: '{',
beginKeywords: 'interface namespace', end: /\{/,
illegal: '[;.\\-]',
contains: [
{ // interface or namespace name
Expand All @@ -82,7 +82,7 @@ export default function(hljs) {
},

{ // class declaration
beginKeywords: 'class', end: '{',
beginKeywords: 'class', end: /\{/,
illegal: '[;.\\-]',
contains: [
{ // class name
Expand Down
2 changes: 1 addition & 1 deletion src/languages/asciidoc.js
Expand Up @@ -96,7 +96,7 @@ export default function(hljs) {
// lists (can only capture indicators)
{
className: 'bullet',
begin: '^(\\*+|\\-+|\\.+|[^\\n]+?::)\\s+'
begin: '^(\\*+|-+|\\.+|[^\\n]+?::)\\s+'
},
// admonition
{
Expand Down
2 changes: 1 addition & 1 deletion src/languages/awk.js
Expand Up @@ -11,7 +11,7 @@ export default function(hljs) {
className: 'variable',
variants: [
{begin: /\$[\w\d#@][\w\d_]*/},
{begin: /\$\{(.*?)}/}
{begin: /\$\{(.*?)\}/}
]
};
var KEYWORDS = 'BEGIN END if else while do for in break continue delete next nextfile function func exit|10';
Expand Down
8 changes: 4 additions & 4 deletions src/languages/axapta.js
Expand Up @@ -75,15 +75,15 @@ export default function(hljs) {
'firstonly100',
'firstonly1000',
'flush',
'for',
'for',
'forceliterals',
'forcenestedloop',
'forceplaceholders',
'forceselectorder',
'forupdate',
'from',
'generateonly',
'group',
'group',
'hint',
'if',
'implements',
Expand Down Expand Up @@ -119,7 +119,7 @@ export default function(hljs) {
'select',
'server',
'setting',
'static',
'static',
'sum',
'super',
'switch',
Expand Down Expand Up @@ -160,7 +160,7 @@ export default function(hljs) {
},
{
className: 'class',
beginKeywords: 'class interface', end: '{', excludeEnd: true,
beginKeywords: 'class interface', end: /\{/, excludeEnd: true,
illegal: ':',
contains: [
{beginKeywords: 'extends implements'},
Expand Down
8 changes: 4 additions & 4 deletions src/languages/basic.js
Expand Up @@ -13,7 +13,7 @@ export default function(hljs) {
illegal: '^\.',
// Support explicitly typed variables that end with $%! or #.
keywords: {
$pattern: '[a-zA-Z][a-zA-Z0-9_\$\%\!\#]*',
$pattern: '[a-zA-Z][a-zA-Z0-9_$%!#]*',
keyword:
'ABS ASC AND ATN AUTO|0 BEEP BLOAD|10 BSAVE|10 CALL CALLS CDBL CHAIN CHDIR CHR$|10 CINT CIRCLE ' +
'CLEAR CLOSE CLS COLOR COM COMMON CONT COS CSNG CSRLIN CVD CVI CVS DATA DATE$ ' +
Expand All @@ -35,7 +35,7 @@ export default function(hljs) {
{
// Match line numbers
className: 'symbol',
begin: '^[0-9]+\ ',
begin: '^[0-9]+ ',
relevance: 10
},
{
Expand All @@ -47,12 +47,12 @@ export default function(hljs) {
{
// Match hexadecimal numbers (&Hxxxx)
className: 'number',
begin: '(\&[hH][0-9a-fA-F]{1,4})'
begin: '(&[hH][0-9a-fA-F]{1,4})'
},
{
// Match octal numbers (&Oxxxxxx)
className: 'number',
begin: '(\&[oO][0-7]{1,6})'
begin: '(&[oO][0-7]{1,6})'
}
]
};
Expand Down
4 changes: 2 additions & 2 deletions src/languages/brainfuck.js
Expand Up @@ -8,7 +8,7 @@ Website: https://esolangs.org/wiki/Brainfuck
export default function(hljs) {
var LITERAL = {
className: 'literal',
begin: '[\\+\\-]',
begin: /[+-]/,
relevance: 0
};
return {
Expand All @@ -35,7 +35,7 @@ export default function(hljs) {
},
{
// this mode works as the only relevance counter
begin: /(?:\+\+|\-\-)/,
begin: /(?:\+\+|--)/,
contains: [LITERAL]
},
LITERAL
Expand Down
2 changes: 1 addition & 1 deletion src/languages/ceylon.js
Expand Up @@ -67,7 +67,7 @@ export default function(hljs) {
{
// compiler annotation
className: 'meta',
begin: '@[a-z]\\w*(?:\\:\"[^\"]*\")?'
begin: '@[a-z]\\w*(?::"[^"]*")?'
}
].concat(EXPRESSIONS)
};
Expand Down
2 changes: 1 addition & 1 deletion src/languages/clojure.js
Expand Up @@ -96,7 +96,7 @@ export default function(hljs) {
var GLOBAL = {
beginKeywords: globals,
lexemes: SYMBOL_RE,
end: '(\\[|\\#|\\d|"|:|\\{|\\)|\\(|$)',
end: '(\\[|#|\\d|"|:|\\{|\\)|\\(|$)',
contains: [
{
className: 'title',
Expand Down
2 changes: 1 addition & 1 deletion src/languages/cmake.js
Expand Up @@ -51,7 +51,7 @@ export default function(hljs) {
contains: [
{
className: 'variable',
begin: '\\${', end: '}'
begin: /\$\{/, end: /\}/
},
hljs.HASH_COMMENT_MODE,
hljs.QUOTE_STRING_MODE,
Expand Down
2 changes: 1 addition & 1 deletion src/languages/coffeescript.js
Expand Up @@ -51,7 +51,7 @@ export default function(hljs) {
var JS_IDENT_RE = '[A-Za-z$_][0-9A-Za-z$_]*';
var SUBST = {
className: 'subst',
begin: /#\{/, end: /}/,
begin: /#\{/, end: /\}/,
keywords: KEYWORDS
};
var EXPRESSIONS = [
Expand Down
2 changes: 1 addition & 1 deletion src/languages/crmsh.js
Expand Up @@ -89,7 +89,7 @@ export default function(hljs) {
},
{
className: 'attr',
begin: /([A-Za-z\$_\#][\w_-]+)=/,
begin: /([A-Za-z$_#][\w_-]+)=/,
relevance: 0
},
{
Expand Down
16 changes: 8 additions & 8 deletions src/languages/crystal.js
Expand Up @@ -9,8 +9,8 @@ export default function(hljs) {
var INT_SUFFIX = '(_*[ui](8|16|32|64|128))?';
var FLOAT_SUFFIX = '(_*f(32|64))?';
var CRYSTAL_IDENT_RE = '[a-zA-Z_]\\w*[!?=]?';
var CRYSTAL_METHOD_RE = '[a-zA-Z_]\\w*[!?=]?|[-+~]\\@|<<|>>|[=!]~|===?|<=>|[<>]=?|\\*\\*|[-/+%^&*~|]|//|//=|&[-+*]=?|&\\*\\*|\\[\\][=?]?';
var CRYSTAL_PATH_RE = '[A-Za-z_]\\w*(::\\w+)*(\\?|\\!)?';
var CRYSTAL_METHOD_RE = '[a-zA-Z_]\\w*[!?=]?|[-+~]@|<<|>>|[=!]~|===?|<=>|[<>]=?|\\*\\*|[-/+%^&*~|]|//|//=|&[-+*]=?|&\\*\\*|\\[\\][=?]?';
var CRYSTAL_PATH_RE = '[A-Za-z_]\\w*(::\\w+)*(\\?|!)?';
var CRYSTAL_KEYWORDS = {
$pattern: CRYSTAL_IDENT_RE,
keyword:
Expand All @@ -22,7 +22,7 @@ export default function(hljs) {
};
var SUBST = {
className: 'subst',
begin: '#{', end: '}',
begin: /#\{/, end: /\}/,
keywords: CRYSTAL_KEYWORDS
};
var EXPANSION = {
Expand All @@ -49,7 +49,7 @@ export default function(hljs) {
{begin: /`/, end: /`/},
{begin: '%[Qwi]?\\(', end: '\\)', contains: recursiveParen('\\(', '\\)')},
{begin: '%[Qwi]?\\[', end: '\\]', contains: recursiveParen('\\[', '\\]')},
{begin: '%[Qwi]?{', end: '}', contains: recursiveParen('{', '}')},
{begin: '%[Qwi]?\\{', end: /\}/, contains: recursiveParen(/\{/, /\}/)},
{begin: '%[Qwi]?<', end: '>', contains: recursiveParen('<', '>')},
{begin: '%[Qwi]?\\|', end: '\\|'},
{begin: /<<-\w+$/, end: /^\s*\w+$/},
Expand All @@ -61,15 +61,15 @@ export default function(hljs) {
variants: [
{begin: '%q\\(', end: '\\)', contains: recursiveParen('\\(', '\\)')},
{begin: '%q\\[', end: '\\]', contains: recursiveParen('\\[', '\\]')},
{begin: '%q{', end: '}', contains: recursiveParen('{', '}')},
{begin: '%q\\{', end: /\}/, contains: recursiveParen(/\{/, /\}/)},
{begin: '%q<', end: '>', contains: recursiveParen('<', '>')},
{begin: '%q\\|', end: '\\|'},
{begin: /<<-'\w+'$/, end: /^\s*\w+$/},
],
relevance: 0,
};
var REGEXP = {
begin: '(?!%})(' + hljs.RE_STARTERS_RE + '|\\n|\\b(case|if|select|unless|until|when|while)\\b)\\s*',
begin: '(?!%\\})(' + hljs.RE_STARTERS_RE + '|\\n|\\b(case|if|select|unless|until|when|while)\\b)\\s*',
keywords: 'case if select unless until when while',
contains: [
{
Expand All @@ -89,7 +89,7 @@ export default function(hljs) {
variants: [
{begin: '%r\\(', end: '\\)', contains: recursiveParen('\\(', '\\)')},
{begin: '%r\\[', end: '\\]', contains: recursiveParen('\\[', '\\]')},
{begin: '%r{', end: '}', contains: recursiveParen('{', '}')},
{begin: '%r\\{', end: /\}/, contains: recursiveParen(/\{/, /\}/)},
{begin: '%r<', end: '>', contains: recursiveParen('<', '>')},
{begin: '%r\\|', end: '\\|'},
],
Expand Down Expand Up @@ -162,7 +162,7 @@ export default function(hljs) {
},
{
className: 'symbol',
begin: hljs.UNDERSCORE_IDENT_RE + '(\\!|\\?)?:',
begin: hljs.UNDERSCORE_IDENT_RE + '(!|\\?)?:',
relevance: 0
},
{
Expand Down
12 changes: 6 additions & 6 deletions src/languages/csharp.js
Expand Up @@ -170,24 +170,24 @@ export default function(hljs) {
var VERBATIM_STRING_NO_LF = hljs.inherit(VERBATIM_STRING, {illegal: /\n/});
var SUBST = {
className: 'subst',
begin: '{', end: '}',
begin: /\{/, end: /\}/,
keywords: KEYWORDS
};
var SUBST_NO_LF = hljs.inherit(SUBST, {illegal: /\n/});
var INTERPOLATED_STRING = {
className: 'string',
begin: /\$"/, end: '"',
illegal: /\n/,
contains: [{begin: '{{'}, {begin: '}}'}, hljs.BACKSLASH_ESCAPE, SUBST_NO_LF]
contains: [{begin: /\{\{/}, {begin: /\}\}/}, hljs.BACKSLASH_ESCAPE, SUBST_NO_LF]
};
var INTERPOLATED_VERBATIM_STRING = {
className: 'string',
begin: /\$@"/, end: '"',
contains: [{begin: '{{'}, {begin: '}}'}, {begin: '""'}, SUBST]
contains: [{begin: /\{\{/}, {begin: /\}\}/}, {begin: '""'}, SUBST]
};
var INTERPOLATED_VERBATIM_STRING_NO_LF = hljs.inherit(INTERPOLATED_VERBATIM_STRING, {
illegal: /\n/,
contains: [{begin: '{{'}, {begin: '}}'}, {begin: '""'}, SUBST_NO_LF]
contains: [{begin: /\{\{/}, {begin: /\}\}/}, {begin: '""'}, SUBST_NO_LF]
});
SUBST.contains = [
INTERPOLATED_VERBATIM_STRING,
Expand Down Expand Up @@ -319,14 +319,14 @@ export default function(hljs) {
},
{
className: 'function',
begin: '(' + TYPE_IDENT_RE + '\\s+)+' + hljs.IDENT_RE + '\\s*(\\<.+\\>)?\\s*\\(', returnBegin: true,
begin: '(' + TYPE_IDENT_RE + '\\s+)+' + hljs.IDENT_RE + '\\s*(<.+>)?\\s*\\(', returnBegin: true,
end: /\s*[{;=]/, excludeEnd: true,
keywords: KEYWORDS,
contains: [
// prevents these from being highlighted `title`
{ beginKeywords: FUNCTION_MODIFIERS.join(" ")},
{
begin: hljs.IDENT_RE + '\\s*(\\<.+\\>)?\\s*\\(', returnBegin: true,
begin: hljs.IDENT_RE + '\\s*(<.+>)?\\s*\\(', returnBegin: true,
contains: [
hljs.TITLE_MODE,
GENERIC_MODIFIER
Expand Down
8 changes: 4 additions & 4 deletions src/languages/css.js
Expand Up @@ -46,10 +46,10 @@ export default function(hljs) {
var AT_IDENTIFIER = '@[a-z-]+' // @font-face
var AT_MODIFIERS = "and or not only"
var MEDIA_TYPES = "all print screen speech"
var AT_PROPERTY_RE = /@\-?\w[\w]*(\-\w+)*/ // @-webkit-keyframes
var AT_PROPERTY_RE = /@-?\w[\w]*(-\w+)*/ // @-webkit-keyframes
var IDENT_RE = '[a-zA-Z-][a-zA-Z0-9_-]*';
var RULE = {
begin: /(?:[A-Z\_\.\-]+|--[a-zA-Z0-9_-]+)\s*:/, returnBegin: true, end: ';', endsWithParent: true,
begin: /(?:[A-Z_.-]+|--[a-zA-Z0-9_-]+)\s*:/, returnBegin: true, end: ';', endsWithParent: true,
contains: [
ATTRIBUTE
]
Expand Down Expand Up @@ -78,7 +78,7 @@ export default function(hljs) {
},
{
className: 'selector-pseudo',
begin: /:(:)?[a-zA-Z0-9\_\-\+\(\)"'.]+/
begin: /:(:)?[a-zA-Z0-9_+()"'.-]+/
},
// matching these here allows us to treat them more like regular CSS
// rules so everything between the {} gets regular rule highlighting,
Expand Down Expand Up @@ -121,7 +121,7 @@ export default function(hljs) {
relevance: 0
},
{
begin: '{', end: '}',
begin: /\{/, end: /\}/,
illegal: /\S/,
contains: [
hljs.C_BLOCK_COMMENT_MODE,
Expand Down
6 changes: 3 additions & 3 deletions src/languages/dart.js
Expand Up @@ -18,8 +18,8 @@ export default function(hljs) {
const BRACED_SUBST = {
className: 'subst',
variants: [{
begin: '\\${',
end: '}'
begin: /\$\{/,
end: /\}/
}],
keywords: 'true false null this is new super',
};
Expand Down Expand Up @@ -155,7 +155,7 @@ export default function(hljs) {
{
className: 'class',
beginKeywords: 'class interface',
end: '{',
end: /\{/,
excludeEnd: true,
contains: [{
beginKeywords: 'extends implements'
Expand Down

0 comments on commit 0cee2f3

Please sign in to comment.