Skip to content

Commit

Permalink
MySQL: Tokenize quoted schema object names, and escape characters, uniquely
Browse files Browse the repository at this point in the history

Changes in this patch:

* Name.Quoted and Name.Quoted.Escape are introduced as non-standard tokens
* HTML and LaTeX formatters were confirmed to provide default formatting
  if they encounter these two non-standard tokens. They also add style
  classes based on the token name, like "n-Quoted" (HTML) or "nQuoted"
  (LaTeX) so that users can add custom styles for these.
* Removed "\`" and "\\" as schema object name escapes. These are relics
  of the previous regular expression for backtick-quoted names; MySQL
  does not treat them as escape sequences. The behavior was confirmed in
  the MySQL documentation as well as by running queries in MySQL Workbench.
* Prevent "123abc" from being treated as an integer followed by a schema
  object name. MySQL allows leading digits in schema object names as long
  as the name does not consist solely of the digits 0-9.
* Add ~10 more unit tests to validate behavior.

Closes pygments#1551
  • Loading branch information
kurtmckee committed Sep 27, 2020
1 parent 9fca2a1 commit 79bd99c
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 16 deletions.
18 changes: 9 additions & 9 deletions pygments/lexers/sql.py
Expand Up @@ -620,7 +620,7 @@ class MySqlLexer(RegexLexer):
(r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float), # Mandatory integer, optional fraction and exponent
(r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float), # Mandatory fraction, optional integer and exponent
(r'[0-9]+e[+-]?[0-9]+', Number.Float), # Exponents with integer significands are still floats
(r'[0-9]+', Number.Integer),
(r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff]|$)', Number.Integer), # Integers that are not in a schema object name

# Date literals
(r"\{\s*d\s*(?P<quote>['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}",
Expand Down Expand Up @@ -673,7 +673,7 @@ class MySqlLexer(RegexLexer):
# numeric literals have already been handled above.
#
('[0-9a-z$_\u0080-\uffff]+', Name),
(r'`', Name, 'schema-object-name'),
(r'`', Name.Quoted, 'schema-object-name'),

# Punctuation
(r'[(),.;]', Punctuation),
Expand Down Expand Up @@ -737,15 +737,15 @@ class MySqlLexer(RegexLexer):
# Schema object name substates
# ----------------------------
#
# Backtick-quoted schema object names support escape characters.
# It may be desirable to tokenize escape sequences differently,
# but currently Pygments does not have an obvious token type for
# this unique situation (for example, "Name.Escape").
# "Name.Quoted" and "Name.Quoted.Escape" are non-standard but
# formatters style them as "Name" by default and also add
# style classes based on the token name, so users can
# add custom styles as desired.
#
'schema-object-name': [
(r'[^`\\]+', Name),
(r'(?:\\\\|\\`|``)', Name), # This could be an escaped name token type.
(r'`', Name, '#pop'),
(r'[^`]+', Name.Quoted),
(r'``', Name.Quoted.Escape),
(r'`', Name.Quoted, '#pop'),
],
}

Expand Down
2 changes: 1 addition & 1 deletion tests/examplefiles/mysql.txt
Expand Up @@ -107,7 +107,7 @@ CREATE TABLE basic (
SELECT e1.`apple` AS a, `example2`.b
FROM example1 AS e1
JOIN example2 e2
ON `example1`.`id` = e2.id;
ON `example1`.`a``b` = e2.`123`;


-- Operators
Expand Down
39 changes: 33 additions & 6 deletions tests/test_mysql.py
Expand Up @@ -29,10 +29,15 @@ def lexer():


@pytest.mark.parametrize('text', ('123',))
def test_integer_literals(lexer, text):
def test_integer_literals_positive_match(lexer, text):
assert list(lexer.get_tokens(text))[0] == (Number.Integer, text)


@pytest.mark.parametrize('text', ('1a', '1A', '1ひ', '1$', '1_', '1\u0080', '1\uffff'))
def test_integer_literals_negative_match(lexer, text):
assert list(lexer.get_tokens(text))[0][0] != Number.Integer


@pytest.mark.parametrize(
'text',
(
Expand Down Expand Up @@ -215,18 +220,40 @@ def test_functions(lexer, text):
@pytest.mark.parametrize(
'text',
(
'abc_$123', '上市年限', 'ひらがな',
'`a`', '`上市年限`', '`ひらがな`', '`select`', '`concat(`',
'````', r'`\``', r'`\\`',
'`-- `', '`/*`', '`#`',
'abc_$123', '上市年限', 'ひらがな', '123_$abc', '123ひらがな',
),
)
def test_schema_object_names(lexer, text):
def test_schema_object_names_unquoted(lexer, text):
tokens = list(lexer.get_tokens(text))[:-1]
assert all(token[0] == Name for token in tokens)
assert ''.join(token[1] for token in tokens) == text


@pytest.mark.parametrize(
'text',
(
'`a`', '`1`', '`上市年限`', '`ひらがな`', '`select`', '`concat(`',
'`-- `', '`/*`', '`#`',
),
)
def test_schema_object_names_quoted(lexer, text):
tokens = list(lexer.get_tokens(text))[:-1]
assert tokens[0] == (Name.Quoted, '`')
assert tokens[1] == (Name.Quoted, text[1:-1])
assert tokens[2] == (Name.Quoted, '`')
assert ''.join(token[1] for token in tokens) == text


@pytest.mark.parametrize('text', ('````', ))
def test_schema_object_names_quoted_escaped(lexer, text):
"""Test quoted schema object names with escape sequences."""
tokens = list(lexer.get_tokens(text))[:-1]
assert tokens[0] == (Name.Quoted, '`')
assert tokens[1] == (Name.Quoted.Escape, text[1:-1])
assert tokens[2] == (Name.Quoted, '`')
assert ''.join(token[1] for token in tokens) == text


@pytest.mark.parametrize(
'text',
('+', '*', '/', '%', '&&', ':=', '!', '<', '->>', '^', '|', '~'),
Expand Down

0 comments on commit 79bd99c

Please sign in to comment.