MySQL: Tokenize quoted schema object names, and escape characters, uniquely

Changes in this patch:

* Name.Quoted and Name.Quoted.Escape are introduced as non-standard tokens.
* HTML and LaTeX formatters were confirmed to provide default formatting if they encounter these two non-standard tokens. They also add style classes based on the token name, like "n-Quoted" (HTML) or "nQuoted" (LaTeX), so that users can add custom styles for these.
* Removed "\`" and "\\" as schema object name escapes. These are relics of the previous regular expression for backtick-quoted names and are not treated as escape sequences. The behavior was confirmed in the MySQL documentation as well as by running queries in MySQL Workbench.
* Prevent "123abc" from being treated as an integer followed by a schema object name. MySQL allows leading numbers in schema object names as long as 0-9 are not the only characters in the schema object name.
* Add ~10 more unit tests to validate behavior.

Closes pygments#1551
kurtmckee · Sep 27, 2020 · 79bd99c · 79bd99c
1 parent 9fca2a1
commit 79bd99c
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 16 deletions.
diff --git a/pygments/lexers/sql.py b/pygments/lexers/sql.py
@@ -620,7 +620,7 @@ class MySqlLexer(RegexLexer):
             (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float),  # Mandatory integer, optional fraction and exponent
             (r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float),  # Mandatory fraction, optional integer and exponent
             (r'[0-9]+e[+-]?[0-9]+', Number.Float),  # Exponents with integer significands are still floats
-            (r'[0-9]+', Number.Integer),
+            (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff]|$)', Number.Integer),  # Integers that are not in a schema object name
 
             # Date literals
             (r"\{\s*d\s*(?P<quote>['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}",
@@ -673,7 +673,7 @@ class MySqlLexer(RegexLexer):
             # numeric literals have already been handled above.
             #
             ('[0-9a-z$_\u0080-\uffff]+', Name),
-            (r'`', Name, 'schema-object-name'),
+            (r'`', Name.Quoted, 'schema-object-name'),
 
             # Punctuation
             (r'[(),.;]', Punctuation),
@@ -737,15 +737,15 @@ class MySqlLexer(RegexLexer):
         # Schema object name substates
         # ----------------------------
         #
-        # Backtick-quoted schema object names support escape characters.
-        # It may be desirable to tokenize escape sequences differently,
-        # but currently Pygments does not have an obvious token type for
-        # this unique situation (for example, "Name.Escape").
+        # "Name.Quoted" and "Name.Quoted.Escape" are non-standard tokens.
+        # Formatters style them as "Name" by default, and also add
+        # style classes based on the token name. This gives users
+        # flexibility to add custom styles as desired.
         #
         'schema-object-name': [
-            (r'[^`\\]+', Name),
-            (r'(?:\\\\|\\`|``)', Name),  # This could be an escaped name token type.
-            (r'`', Name, '#pop'),
+            (r'[^`]+', Name.Quoted),
+            (r'``', Name.Quoted.Escape),
+            (r'`', Name.Quoted, '#pop'),
         ],
     }
 

diff --git a/tests/examplefiles/mysql.txt b/tests/examplefiles/mysql.txt
@@ -107,7 +107,7 @@ CREATE TABLE basic (
 SELECT e1.`apple` AS a, `example2`.b
 FROM example1 AS e1
 JOIN example2 e2
-ON `example1`.`id` = e2.id;
+ON `example1`.`a``b` = e2.`123`;
 
 
 -- Operators

diff --git a/tests/test_mysql.py b/tests/test_mysql.py
@@ -29,10 +29,15 @@ def lexer():
 
 
 @pytest.mark.parametrize('text', ('123',))
-def test_integer_literals(lexer, text):
+def test_integer_literals_positive_match(lexer, text):
     assert list(lexer.get_tokens(text))[0] == (Number.Integer, text)
 
 
+@pytest.mark.parametrize('text', ('1a', '1A', '1ひ', '1$', '1_', '1\u0080', '1\uffff'))
+def test_integer_literals_negative_match(lexer, text):
+    assert list(lexer.get_tokens(text))[0][0] != Number.Integer
+
+
 @pytest.mark.parametrize(
     'text',
     (
@@ -215,18 +220,40 @@ def test_functions(lexer, text):
 @pytest.mark.parametrize(
     'text',
     (
-        'abc_$123', '上市年限', 'ひらがな',
-        '`a`', '`上市年限`', '`ひらがな`', '`select`', '`concat(`',
-        '````', r'`\``', r'`\\`',
-        '`-- `', '`/*`', '`#`',
+        'abc_$123', '上市年限', 'ひらがな', '123_$abc', '123ひらがな',
     ),
 )
-def test_schema_object_names(lexer, text):
+def test_schema_object_names_unquoted(lexer, text):
     tokens = list(lexer.get_tokens(text))[:-1]
     assert all(token[0] == Name for token in tokens)
     assert ''.join(token[1] for token in tokens) == text
 
 
+@pytest.mark.parametrize(
+    'text',
+    (
+        '`a`', '`1`', '`上市年限`', '`ひらがな`', '`select`', '`concat(`',
+        '`-- `', '`/*`', '`#`',
+    ),
+)
+def test_schema_object_names_quoted(lexer, text):
+    tokens = list(lexer.get_tokens(text))[:-1]
+    assert tokens[0] == (Name.Quoted, '`')
+    assert tokens[1] == (Name.Quoted, text[1:-1])
+    assert tokens[2] == (Name.Quoted, '`')
+    assert ''.join(token[1] for token in tokens) == text
+
+
+@pytest.mark.parametrize('text', ('````', ))
+def test_schema_object_names_quoted_escaped(lexer, text):
+    """Test quoted schema object names with escape sequences."""
+    tokens = list(lexer.get_tokens(text))[:-1]
+    assert tokens[0] == (Name.Quoted, '`')
+    assert tokens[1] == (Name.Quoted.Escape, text[1:-1])
+    assert tokens[2] == (Name.Quoted, '`')
+    assert ''.join(token[1] for token in tokens) == text
+
+
 @pytest.mark.parametrize(
     'text',
     ('+', '*', '/', '%', '&&', ':=', '!', '<', '->>', '^', '|', '~'),