From 0a7fcfcc24e0071a71b58bf5680483fd49adb64f Mon Sep 17 00:00:00 2001 From: Richard Gibson Date: Tue, 19 Oct 2021 17:11:55 -0400 Subject: [PATCH] fix(python) Fix recognition of numeric literals followed by keywords without whitespace Fixes #2985 --- src/languages/python.js | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/languages/python.js b/src/languages/python.js index ce603a86d4..46ed88a6f0 100644 --- a/src/languages/python.js +++ b/src/languages/python.js @@ -255,6 +255,12 @@ export default function(hljs) { // https://docs.python.org/3.9/reference/lexical_analysis.html#numeric-literals const digitpart = '[0-9](_?[0-9])*'; const pointfloat = `(\\b(${digitpart}))?\\.(${digitpart})|\\b(${digitpart})\\.`; + // Whitespace after a number (or any lexical token) is needed only if its absence + // would change the tokenization + // https://docs.python.org/3.9/reference/lexical_analysis.html#whitespace-between-tokens + // We deviate slightly, requiring a word boundary or a keyword + // to avoid accidentally recognizing *prefixes* (e.g., `0` in `0x41` or `08` or `0__1`) + const lookahead = `\\b|${RESERVED_WORDS.join('|')}`; const NUMBER = { className: 'number', relevance: 0, @@ -270,7 +276,7 @@ export default function(hljs) { // because both MUST contain a decimal point and so cannot be confused with // the interior part of an identifier { - begin: `(\\b(${digitpart})|(${pointfloat}))[eE][+-]?(${digitpart})[jJ]?\\b` + begin: `(\\b(${digitpart})|(${pointfloat}))[eE][+-]?(${digitpart})[jJ]?(?=${lookahead})` }, { begin: `(${pointfloat})[jJ]?` @@ -283,22 +289,22 @@ export default function(hljs) { // decinteger is optionally imaginary // https://docs.python.org/3.9/reference/lexical_analysis.html#imaginary-literals { - begin: '\\b([1-9](_?[0-9])*|0+(_?0)*)[lLjJ]?\\b' + begin: `\\b([1-9](_?[0-9])*|0+(_?0)*)[lLjJ]?(?=${lookahead})` }, { - begin: '\\b0[bB](_?[01])+[lL]?\\b' + begin: `\\b0[bB](_?[01])+[lL]?(?=${lookahead})` }, { - begin: '\\b0[oO](_?[0-7])+[lL]?\\b' + begin: `\\b0[oO](_?[0-7])+[lL]?(?=${lookahead})` }, { - begin: '\\b0[xX](_?[0-9a-fA-F])+[lL]?\\b' + begin: `\\b0[xX](_?[0-9a-fA-F])+[lL]?(?=${lookahead})` }, // imagnumber (digitpart-based) // https://docs.python.org/3.9/reference/lexical_analysis.html#imaginary-literals { - begin: `\\b(${digitpart})[jJ]\\b` + begin: `\\b(${digitpart})[jJ](?=${lookahead})` } ] };