/
mode_compiler.js
509 lines (442 loc) · 16.8 KB
/
mode_compiler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
import * as regex from './regex.js';
import { inherit } from './utils.js';
// keywords that should have no default relevance value
var COMMON_KEYWORDS = [
'of',
'and',
'for',
'in',
'not',
'or',
'if',
'then',
'parent', // common variable name
'list', // common variable name
'value' // common variable name
];
// compilation
/**
* Compiles a language definition result
*
* Given the raw result of a language definition (Language), compiles this so
* that it is ready for highlighting code.
* @param {Language} language
* @returns {CompiledLanguage}
*/
export function compileLanguage(language) {
/**
* Builds a regex with the case sensativility of the current language
*
* @param {RegExp | string} value
* @param {boolean} [global]
*/
function langRe(value, global) {
return new RegExp(
regex.source(value),
'm' + (language.case_insensitive ? 'i' : '') + (global ? 'g' : '')
);
}
/**
Stores multiple regular expressions and allows you to quickly search for
them all in a string simultaneously - returning the first match. It does
this by creating a huge (a|b|c) regex - each individual item wrapped with ()
and joined by `|` - using match groups to track position. When a match is
found checking which position in the array has content allows us to figure
out which of the original regexes / match groups triggered the match.
The match object itself (the result of `Regex.exec`) is returned but also
enhanced by merging in any meta-data that was registered with the regex.
This is how we keep track of which mode matched, and what type of rule
(`illegal`, `begin`, end, etc).
*/
class MultiRegex {
constructor() {
this.matchIndexes = {};
// @ts-ignore
this.regexes = [];
this.matchAt = 1;
this.position = 0;
}
// @ts-ignore
addRule(re, opts) {
opts.position = this.position++;
// @ts-ignore
this.matchIndexes[this.matchAt] = opts;
this.regexes.push([opts, re]);
this.matchAt += regex.countMatchGroups(re) + 1;
}
compile() {
if (this.regexes.length === 0) {
// avoids the need to check length every time exec is called
// @ts-ignore
this.exec = () => null;
}
const terminators = this.regexes.map(el => el[1]);
this.matcherRe = langRe(regex.join(terminators), true);
this.lastIndex = 0;
}
/** @param {string} s */
exec(s) {
this.matcherRe.lastIndex = this.lastIndex;
const match = this.matcherRe.exec(s);
if (!match) { return null; }
// eslint-disable-next-line no-undefined
const i = match.findIndex((el, i) => i > 0 && el !== undefined);
// @ts-ignore
const matchData = this.matchIndexes[i];
// trim off any earlier non-relevant match groups (ie, the other regex
// match groups that make up the multi-matcher)
match.splice(0, i);
return Object.assign(match, matchData);
}
}
/*
Created to solve the key deficiently with MultiRegex - there is no way to
test for multiple matches at a single location. Why would we need to do
that? In the future a more dynamic engine will allow certain matches to be
ignored. An example: if we matched say the 3rd regex in a large group but
decided to ignore it - we'd need to started testing again at the 4th
regex... but MultiRegex itself gives us no real way to do that.
So what this class creates MultiRegexs on the fly for whatever search
position they are needed.
NOTE: These additional MultiRegex objects are created dynamically. For most
grammars most of the time we will never actually need anything more than the
first MultiRegex - so this shouldn't have too much overhead.
Say this is our search group, and we match regex3, but wish to ignore it.
regex1 | regex2 | regex3 | regex4 | regex5 ' ie, startAt = 0
What we need is a new MultiRegex that only includes the remaining
possibilities:
regex4 | regex5 ' ie, startAt = 3
This class wraps all that complexity up in a simple API... `startAt` decides
where in the array of expressions to start doing the matching. It
auto-increments, so if a match is found at position 2, then startAt will be
set to 3. If the end is reached startAt will return to 0.
MOST of the time the parser will be setting startAt manually to 0.
*/
class ResumableMultiRegex {
constructor() {
// @ts-ignore
this.rules = [];
// @ts-ignore
this.multiRegexes = [];
this.count = 0;
this.lastIndex = 0;
this.regexIndex = 0;
}
// @ts-ignore
getMatcher(index) {
if (this.multiRegexes[index]) return this.multiRegexes[index];
const matcher = new MultiRegex();
this.rules.slice(index).forEach(([re, opts]) => matcher.addRule(re, opts));
matcher.compile();
this.multiRegexes[index] = matcher;
return matcher;
}
resumingScanAtSamePosition() {
return this.regexIndex !== 0;
}
considerAll() {
this.regexIndex = 0;
}
// @ts-ignore
addRule(re, opts) {
this.rules.push([re, opts]);
if (opts.type === "begin") this.count++;
}
/** @param {string} s */
exec(s) {
const m = this.getMatcher(this.regexIndex);
m.lastIndex = this.lastIndex;
let result = m.exec(s);
// The following is because we have no easy way to say "resume scanning at the
// existing position but also skip the current rule ONLY". What happens is
// all prior rules are also skipped which can result in matching the wrong
// thing. Example of matching "booger":
// our matcher is [string, "booger", number]
//
// ....booger....
// if "booger" is ignored then we'd really need a regex to scan from the
// SAME position for only: [string, number] but ignoring "booger" (if it
// was the first match), a simple resume would scan ahead who knows how
// far looking only for "number", ignoring potential string matches (or
// future "booger" matches that might be valid.)
// So what we do: We execute two matchers, one resuming at the same
// position, but the second full matcher starting at the position after:
// /--- resume first regex match here (for [number])
// |/---- full match here for [string, "booger", number]
// vv
// ....booger....
// Which ever results in a match first is then used. So this 3-4 step
// process essentially allows us to say "match at this position, excluding
// a prior rule that was ignored".
//
// 1. Match "booger" first, ignore. Also proves that [string] does non match.
// 2. Resume matching for [number]
// 3. Match at index + 1 for [string, "booger", number]
// 4. If #2 and #3 result in matches, which came first?
if (this.resumingScanAtSamePosition()) {
if (result && result.index === this.lastIndex) {
// result is position +0 and therefore a valid
// "resume" match so result stays result
} else { // use the second matcher result
const m2 = this.getMatcher(0);
m2.lastIndex = this.lastIndex + 1;
result = m2.exec(s);
}
}
if (result) {
this.regexIndex += result.position + 1;
if (this.regexIndex === this.count) {
// wrap-around to considering all matches again
this.considerAll();
}
}
return result;
}
}
/**
* Given a mode, builds a huge ResumableMultiRegex that can be used to walk
* the content and find matches.
*
* @param {CompiledMode} mode
* @returns {ResumableMultiRegex}
*/
function buildModeRegex(mode) {
const mm = new ResumableMultiRegex();
mode.contains.forEach(term => mm.addRule(term.begin, { rule: term, type: "begin" }));
if (mode.terminator_end) {
mm.addRule(mode.terminator_end, { type: "end" });
}
if (mode.illegal) {
mm.addRule(mode.illegal, { type: "illegal" });
}
return mm;
}
// TODO: We need negative look-behind support to do this properly
/**
* Skip a match if it has a preceding or trailing dot
*
* This is used for `beginKeywords` to prevent matching expressions such as
* `bob.keyword.do()`. The mode compiler automatically wires this up as a
* special _internal_ 'on:begin' callback for modes with `beginKeywords`
* @param {RegExpMatchArray} match
* @param {CallbackResponse} response
*/
function skipIfhasPrecedingOrTrailingDot(match, response) {
const before = match.input[match.index - 1];
const after = match.input[match.index + match[0].length];
if (before === "." || after === ".") {
response.ignoreMatch();
}
}
/** skip vs abort vs ignore
*
* @skip - The mode is still entered and exited normally (and contains rules apply),
* but all content is held and added to the parent buffer rather than being
* output when the mode ends. Mostly used with `sublanguage` to build up
* a single large buffer than can be parsed by sublanguage.
*
* - The mode begin ands ends normally.
* - Content matched is added to the parent mode buffer.
* - The parser cursor is moved forward normally.
*
* @abort - A hack placeholder until we have ignore. Aborts the mode (as if it
* never matched) but DOES NOT continue to match subsequent `contains`
* modes. Abort is bad/suboptimal because it can result in modes
* farther down not getting applied because an earlier rule eats the
* content but then aborts.
*
* - The mode does not begin.
* - Content matched by `begin` is added to the mode buffer.
* - The parser cursor is moved forward accordingly.
*
* @ignore - Ignores the mode (as if it never matched) and continues to match any
* subsequent `contains` modes. Ignore isn't technically possible with
* the current parser implementation.
*
* - The mode does not begin.
* - Content matched by `begin` is ignored.
* - The parser cursor is not moved forward.
*/
/**
* Compiles an individual mode
*
* This can raise an error if the mode contains certain detectable known logic
* issues.
* @param {Mode} mode
* @param {CompiledMode | null} [parent]
* @returns {CompiledMode | never}
*/
function compileMode(mode, parent) {
const cmode = /** @type CompiledMode */ (mode);
if (mode.compiled) return cmode;
mode.compiled = true;
// __beforeBegin is considered private API, internal use only
mode.__beforeBegin = null;
mode.keywords = mode.keywords || mode.beginKeywords;
let keywordPattern = null;
if (typeof mode.keywords === "object") {
keywordPattern = mode.keywords.$pattern;
delete mode.keywords.$pattern;
}
if (mode.keywords) {
mode.keywords = compileKeywords(mode.keywords, language.case_insensitive);
}
// both are not allowed
if (mode.lexemes && keywordPattern) {
throw new Error("ERR: Prefer `keywords.$pattern` to `mode.lexemes`, BOTH are not allowed. (see mode reference) ");
}
// `mode.lexemes` was the old standard before we added and now recommend
// using `keywords.$pattern` to pass the keyword pattern
cmode.keywordPatternRe = langRe(mode.lexemes || keywordPattern || /\w+/, true);
if (parent) {
if (mode.beginKeywords) {
// for languages with keywords that include non-word characters checking for
// a word boundary is not sufficient, so instead we check for a word boundary
// or whitespace - this does no harm in any case since our keyword engine
// doesn't allow spaces in keywords anyways and we still check for the boundary
// first
mode.begin = '\\b(' + mode.beginKeywords.split(' ').join('|') + ')(?=\\b|\\s)';
mode.__beforeBegin = skipIfhasPrecedingOrTrailingDot;
}
if (!mode.begin) mode.begin = /\B|\b/;
cmode.beginRe = langRe(mode.begin);
if (mode.endSameAsBegin) mode.end = mode.begin;
if (!mode.end && !mode.endsWithParent) mode.end = /\B|\b/;
if (mode.end) cmode.endRe = langRe(mode.end);
cmode.terminator_end = regex.source(mode.end) || '';
if (mode.endsWithParent && parent.terminator_end) {
cmode.terminator_end += (mode.end ? '|' : '') + parent.terminator_end;
}
}
if (mode.illegal) cmode.illegalRe = langRe(mode.illegal);
// eslint-disable-next-line no-undefined
if (mode.relevance === undefined) mode.relevance = 1;
if (!mode.contains) mode.contains = [];
mode.contains = [].concat(...mode.contains.map(function(c) {
return expandOrCloneMode(c === 'self' ? mode : c);
}));
mode.contains.forEach(function(c) { compileMode(/** @type Mode */ (c), cmode); });
if (mode.starts) {
compileMode(mode.starts, parent);
}
cmode.matcher = buildModeRegex(cmode);
return cmode;
}
// self is not valid at the top-level
if (language.contains && language.contains.includes('self')) {
throw new Error("ERR: contains `self` is not supported at the top-level of a language. See documentation.");
}
return compileMode(/** @type Mode */ (language));
}
/**
* Determines if a mode has a dependency on it's parent or not
*
* If a mode does have a parent dependency then often we need to clone it if
* it's used in multiple places so that each copy points to the correct parent,
* where-as modes without a parent can often safely be re-used at the bottom of
* a mode chain.
*
* @param {Mode | null} mode
* @returns {boolean} - is there a dependency on the parent?
* */
function dependencyOnParent(mode) {
if (!mode) return false;
return mode.endsWithParent || dependencyOnParent(mode.starts);
}
/**
* Expands a mode or clones it if necessary
*
* This is necessary for modes with parental dependenceis (see notes on
* `dependencyOnParent`) and for nodes that have `variants` - which must then be
* exploded into their own individual modes at compile time.
*
* @param {Mode} mode
* @returns {Mode | Mode[]}
* */
function expandOrCloneMode(mode) {
if (mode.variants && !mode.cached_variants) {
mode.cached_variants = mode.variants.map(function(variant) {
return inherit(mode, { variants: null }, variant);
});
}
// EXPAND
// if we have variants then essentially "replace" the mode with the variants
// this happens in compileMode, where this function is called from
if (mode.cached_variants) {
return mode.cached_variants;
}
// CLONE
// if we have dependencies on parents then we need a unique
// instance of ourselves, so we can be reused with many
// different parents without issue
if (dependencyOnParent(mode)) {
return inherit(mode, { starts: mode.starts ? inherit(mode.starts) : null });
}
if (Object.isFrozen(mode)) {
return inherit(mode);
}
// no special dependency issues, just return ourselves
return mode;
}
/***********************************************
Keywords
***********************************************/
/**
* Given raw keywords from a language definition, compile them.
*
* @param {string | Record<string,string>} rawKeywords
* @param {boolean} caseInsensitive
*/
function compileKeywords(rawKeywords, caseInsensitive) {
/** @type KeywordDict */
var compiledKeywords = {};
if (typeof rawKeywords === 'string') { // string
splitAndCompile('keyword', rawKeywords);
} else {
Object.keys(rawKeywords).forEach(function(className) {
splitAndCompile(className, rawKeywords[className]);
});
}
return compiledKeywords;
// ---
/**
* Compiles an individual list of keywords
*
* Ex: "for if when while|5"
*
* @param {string} className
* @param {string} keywordList
*/
function splitAndCompile(className, keywordList) {
if (caseInsensitive) {
keywordList = keywordList.toLowerCase();
}
keywordList.split(' ').forEach(function(keyword) {
var pair = keyword.split('|');
compiledKeywords[pair[0]] = [className, scoreForKeyword(pair[0], pair[1])];
});
}
}
/**
* Returns the proper score for a given keyword
*
* Also takes into account comment keywords, which will be scored 0 UNLESS
* another score has been manually assigned.
* @param {string} keyword
* @param {string} [providedScore]
*/
function scoreForKeyword(keyword, providedScore) {
// manual scores always win over common keywords
// so you can force a score of 1 if you really insist
if (providedScore) {
return Number(providedScore);
}
return commonKeyword(keyword) ? 0 : 1;
}
/**
* Determines if a given keyword is common or not
*
* @param {string} keyword */
function commonKeyword(keyword) {
return COMMON_KEYWORDS.includes(keyword.toLowerCase());
}