From 79bd99c4b085582d18dc6ff2df1fc54d7d6a783b Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sun, 27 Sep 2020 10:44:49 -0500 Subject: [PATCH 1/2] MySQL: Tokenize quoted schema object names, and escape characters, uniquely Changes in this patch: * Name.Quoted and Name.Quoted.Escape are introduced as non-standard tokens * HTML and LaTeX formatters were confirmed to provide default formatting if they encounter these two non-standard tokens. They also add style classes based on the token name, like "n-Quoted" (HTML) or "nQuoted" (LaTeX) so that users can add custom styles for these. * Removed "\`" and "\\" as schema object name escapes. These are relics of the previous regular expression for backtick-quoted names and are not treated as escape sequences. The behavior was confirmed in the MySQL documentation as well as by running queries in MySQL Workbench. * Prevent "123abc" from being treated as an integer followed by a schema object name. MySQL allows leading numbers in schema object names as long as 0-9 are not the only characters in the schema object name. * Add ~10 more unit tests to validate behavior. 
Closes #1551 --- pygments/lexers/sql.py | 18 ++++++++--------- tests/examplefiles/mysql.txt | 2 +- tests/test_mysql.py | 39 ++++++++++++++++++++++++++++++------ 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/pygments/lexers/sql.py b/pygments/lexers/sql.py index 98d53c5ce6..8bea81486d 100644 --- a/pygments/lexers/sql.py +++ b/pygments/lexers/sql.py @@ -620,7 +620,7 @@ class MySqlLexer(RegexLexer): (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float), # Mandatory integer, optional fraction and exponent (r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float), # Mandatory fraction, optional integer and exponent (r'[0-9]+e[+-]?[0-9]+', Number.Float), # Exponents with integer significands are still floats - (r'[0-9]+', Number.Integer), + (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff]|$)', Number.Integer), # Integers that are not in a schema object name # Date literals (r"\{\s*d\s*(?P['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}", @@ -673,7 +673,7 @@ class MySqlLexer(RegexLexer): # numeric literals have already been handled above. # ('[0-9a-z$_\u0080-\uffff]+', Name), - (r'`', Name, 'schema-object-name'), + (r'`', Name.Quoted, 'schema-object-name'), # Punctuation (r'[(),.;]', Punctuation), @@ -737,15 +737,15 @@ class MySqlLexer(RegexLexer): # Schema object name substates # ---------------------------- # - # Backtick-quoted schema object names support escape characters. - # It may be desirable to tokenize escape sequences differently, - # but currently Pygments does not have an obvious token type for - # this unique situation (for example, "Name.Escape"). + # "Name.Quoted" and "Name.Quoted.Escape" are non-standard but + # formatters will style them as "Name" by default but add + # additional styles based on the token name. This gives users + # flexibility to add custom styles as desired. # 'schema-object-name': [ - (r'[^`\\]+', Name), - (r'(?:\\\\|\\`|``)', Name), # This could be an escaped name token type. 
- (r'`', Name, '#pop'), + (r'[^`]+', Name.Quoted), + (r'``', Name.Quoted.Escape), + (r'`', Name.Quoted, '#pop'), ], } diff --git a/tests/examplefiles/mysql.txt b/tests/examplefiles/mysql.txt index 4927abd833..c00b0b9269 100644 --- a/tests/examplefiles/mysql.txt +++ b/tests/examplefiles/mysql.txt @@ -107,7 +107,7 @@ CREATE TABLE basic ( SELECT e1.`apple` AS a, `example2`.b FROM example1 AS e1 JOIN example2 e2 -ON `example1`.`id` = e2.id; +ON `example1`.`a``b` = e2.`123`; -- Operators diff --git a/tests/test_mysql.py b/tests/test_mysql.py index 9b5e2b8cf8..249c3f00b6 100644 --- a/tests/test_mysql.py +++ b/tests/test_mysql.py @@ -29,10 +29,15 @@ def lexer(): @pytest.mark.parametrize('text', ('123',)) -def test_integer_literals(lexer, text): +def test_integer_literals_positive_match(lexer, text): assert list(lexer.get_tokens(text))[0] == (Number.Integer, text) +@pytest.mark.parametrize('text', ('1a', '1A', '1ひ', '1$', '1_', '1\u0080', '1\uffff')) +def test_integer_literals_negative_match(lexer, text): + assert list(lexer.get_tokens(text))[0][0] != Number.Integer + + @pytest.mark.parametrize( 'text', ( @@ -215,18 +220,40 @@ def test_functions(lexer, text): @pytest.mark.parametrize( 'text', ( - 'abc_$123', '上市年限', 'ひらがな', - '`a`', '`上市年限`', '`ひらがな`', '`select`', '`concat(`', - '````', r'`\``', r'`\\`', - '`-- `', '`/*`', '`#`', + 'abc_$123', '上市年限', 'ひらがな', '123_$abc', '123ひらがな', ), ) -def test_schema_object_names(lexer, text): +def test_schema_object_names_unquoted(lexer, text): tokens = list(lexer.get_tokens(text))[:-1] assert all(token[0] == Name for token in tokens) assert ''.join(token[1] for token in tokens) == text +@pytest.mark.parametrize( + 'text', + ( + '`a`', '`1`', '`上市年限`', '`ひらがな`', '`select`', '`concat(`', + '`-- `', '`/*`', '`#`', + ), +) +def test_schema_object_names_quoted(lexer, text): + tokens = list(lexer.get_tokens(text))[:-1] + assert tokens[0] == (Name.Quoted, '`') + assert tokens[1] == (Name.Quoted, text[1:-1]) + assert tokens[2] == 
(Name.Quoted, '`') + assert ''.join(token[1] for token in tokens) == text + + +@pytest.mark.parametrize('text', ('````', )) +def test_schema_object_names_quoted_escaped(lexer, text): + """Test quoted schema object names with escape sequences.""" + tokens = list(lexer.get_tokens(text))[:-1] + assert tokens[0] == (Name.Quoted, '`') + assert tokens[1] == (Name.Quoted.Escape, text[1:-1]) + assert tokens[2] == (Name.Quoted, '`') + assert ''.join(token[1] for token in tokens) == text + + @pytest.mark.parametrize( 'text', ('+', '*', '/', '%', '&&', ':=', '!', '<', '->>', '^', '|', '~'), From 51b21b4e4ba2915e73b88b854ff2937485b8ba0c Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sun, 27 Sep 2020 14:15:48 -0500 Subject: [PATCH 2/2] Remove an end-of-line regex match that triggered a lint warning Also, add tests that confirm correct behavior. No tests failed before or after removing the '$' match in the regex, but now regexlint isn't complaining. The safety of removing the '$' match probably depends on the fact that Pygments adds a newline at the end of the input text, so there is always something after a bare integer literal.
--- pygments/lexers/sql.py | 2 +- tests/test_mysql.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pygments/lexers/sql.py b/pygments/lexers/sql.py index 8bea81486d..e27e0ddb4f 100644 --- a/pygments/lexers/sql.py +++ b/pygments/lexers/sql.py @@ -620,7 +620,7 @@ class MySqlLexer(RegexLexer): (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float), # Mandatory integer, optional fraction and exponent (r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float), # Mandatory fraction, optional integer and exponent (r'[0-9]+e[+-]?[0-9]+', Number.Float), # Exponents with integer significands are still floats - (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff]|$)', Number.Integer), # Integers that are not in a schema object name + (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff])', Number.Integer), # Integers that are not in a schema object name # Date literals (r"\{\s*d\s*(?P['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}", diff --git a/tests/test_mysql.py b/tests/test_mysql.py index 249c3f00b6..207ec822c4 100644 --- a/tests/test_mysql.py +++ b/tests/test_mysql.py @@ -28,13 +28,17 @@ def lexer(): yield MySqlLexer() -@pytest.mark.parametrize('text', ('123',)) +@pytest.mark.parametrize('text', ('1', '22', '22 333', '22 a', '22+', '22)', '22\n333', '22\r\n333')) def test_integer_literals_positive_match(lexer, text): - assert list(lexer.get_tokens(text))[0] == (Number.Integer, text) + """Validate that integer literals are tokenized as integers.""" + token = list(lexer.get_tokens(text))[0] + assert token[0] == Number.Integer + assert token[1] in {'1', '22'} -@pytest.mark.parametrize('text', ('1a', '1A', '1ひ', '1$', '1_', '1\u0080', '1\uffff')) +@pytest.mark.parametrize('text', ('1a', '1A', '1.', '1ひ', '1$', '1_', '1\u0080', '1\uffff')) def test_integer_literals_negative_match(lexer, text): + """Validate that non-integer texts are not matched as integers.""" assert list(lexer.get_tokens(text))[0][0] != Number.Integer