Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MySQL: Tokenize quoted schema object names, and escape characters, uniquely #1555

Merged
Merged 2 commits on Oct 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 9 additions & 9 deletions pygments/lexers/sql.py
Expand Up @@ -620,7 +620,7 @@ class MySqlLexer(RegexLexer):
(r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float), # Mandatory integer, optional fraction and exponent
(r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float), # Mandatory fraction, optional integer and exponent
(r'[0-9]+e[+-]?[0-9]+', Number.Float), # Exponents with integer significands are still floats
(r'[0-9]+', Number.Integer),
(r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff])', Number.Integer), # Integers that are not in a schema object name

# Date literals
(r"\{\s*d\s*(?P<quote>['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}",
Expand Down Expand Up @@ -673,7 +673,7 @@ class MySqlLexer(RegexLexer):
# numeric literals have already been handled above.
#
('[0-9a-z$_\u0080-\uffff]+', Name),
(r'`', Name, 'schema-object-name'),
(r'`', Name.Quoted, 'schema-object-name'),

# Punctuation
(r'[(),.;]', Punctuation),
Expand Down Expand Up @@ -737,15 +737,15 @@ class MySqlLexer(RegexLexer):
# Schema object name substates
# ----------------------------
#
# Backtick-quoted schema object names support escape characters.
# It may be desirable to tokenize escape sequences differently,
# but currently Pygments does not have an obvious token type for
# this unique situation (for example, "Name.Escape").
# "Name.Quoted" and "Name.Quoted.Escape" are non-standard but
# formatters will style them as "Name" by default but add
# additional styles based on the token name. This gives users
# flexibility to add custom styles as desired.
#
'schema-object-name': [
(r'[^`\\]+', Name),
(r'(?:\\\\|\\`|``)', Name), # This could be an escaped name token type.
(r'`', Name, '#pop'),
(r'[^`]+', Name.Quoted),
(r'``', Name.Quoted.Escape),
(r'`', Name.Quoted, '#pop'),
],
}

Expand Down
2 changes: 1 addition & 1 deletion tests/examplefiles/mysql.txt
Expand Up @@ -107,7 +107,7 @@ CREATE TABLE basic (
SELECT e1.`apple` AS a, `example2`.b
FROM example1 AS e1
JOIN example2 e2
ON `example1`.`id` = e2.id;
ON `example1`.`a``b` = e2.`123`;


-- Operators
Expand Down
47 changes: 39 additions & 8 deletions tests/test_mysql.py
Expand Up @@ -28,9 +28,18 @@ def lexer():
yield MySqlLexer()


@pytest.mark.parametrize('text', ('123',))
def test_integer_literals(lexer, text):
    """A plain digit run should lex as a single integer-literal token."""
    first_token = next(iter(lexer.get_tokens(text)))
    assert first_token == (Number.Integer, text)
@pytest.mark.parametrize('text', ('1', '22', '22 333', '22 a', '22+', '22)', '22\n333', '22\r\n333'))
def test_integer_literals_positive_match(lexer, text):
    """Validate that integer literals are tokenized as integers."""
    # Only the first token matters here: each input starts with a
    # digit run ('1' or '22') that must be recognized as an integer.
    first_type, first_value = list(lexer.get_tokens(text))[0]
    assert first_type == Number.Integer
    assert first_value in {'1', '22'}


@pytest.mark.parametrize('text', ('1a', '1A', '1.', '1ひ', '1$', '1_', '1\u0080', '1\uffff'))
def test_integer_literals_negative_match(lexer, text):
    """Validate that non-integer texts are not matched as integers."""
    # A digit immediately followed by a name character (or '.') must not
    # produce an integer token, whatever else the lexer makes of it.
    first_token_type = list(lexer.get_tokens(text))[0][0]
    assert first_token_type != Number.Integer


@pytest.mark.parametrize(
Expand Down Expand Up @@ -215,18 +224,40 @@ def test_functions(lexer, text):
@pytest.mark.parametrize(
    'text',
    (
        'abc_$123', '上市年限', 'ひらがな', '123_$abc', '123ひらがな',
    ),
)
def test_schema_object_names_unquoted(lexer, text):
    """Unquoted schema object names are lexed entirely as ``Name`` tokens.

    The scraped diff fused the removed and added versions of this test
    (two parameter tuples and two ``def`` lines), which is not valid
    Python; this is the added version, with the backtick-quoted cases
    covered by the dedicated quoted-name tests below.
    """
    # Drop the trailing newline token the lexer always appends.
    tokens = list(lexer.get_tokens(text))[:-1]
    assert all(token[0] == Name for token in tokens)
    assert ''.join(token[1] for token in tokens) == text


@pytest.mark.parametrize(
    'text',
    (
        '`a`', '`1`', '`上市年限`', '`ひらがな`', '`select`', '`concat(`',
        '`-- `', '`/*`', '`#`',
    ),
)
def test_schema_object_names_quoted(lexer, text):
    """Backtick-quoted names yield Name.Quoted for the quotes and the body."""
    # Drop the trailing newline token the lexer always appends.
    tokens = list(lexer.get_tokens(text))[:-1]
    assert tokens[0] == (Name.Quoted, '`')
    assert tokens[1] == (Name.Quoted, text[1:-1])
    assert tokens[2] == (Name.Quoted, '`')
    # Round-trip: concatenating every token value reproduces the input.
    assert ''.join(value for _, value in tokens) == text


@pytest.mark.parametrize('text', ('````', ))
def test_schema_object_names_quoted_escaped(lexer, text):
    """Test quoted schema object names with escape sequences."""
    # Drop the trailing newline token the lexer always appends.
    tokens = list(lexer.get_tokens(text))[:-1]
    # The doubled backtick between the quotes must lex as an escape.
    assert tokens[0] == (Name.Quoted, '`')
    assert tokens[1] == (Name.Quoted.Escape, text[1:-1])
    assert tokens[2] == (Name.Quoted, '`')
    # Round-trip: concatenating every token value reproduces the input.
    assert ''.join(value for _, value in tokens) == text


@pytest.mark.parametrize(
'text',
('+', '*', '/', '%', '&&', ':=', '!', '<', '->>', '^', '|', '~'),
Expand Down