From 79bd99c4b085582d18dc6ff2df1fc54d7d6a783b Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sun, 27 Sep 2020 10:44:49 -0500 Subject: [PATCH 1/2] MySQL: Tokenize quoted schema object names, and escape characters, uniquely Changes in this patch: * Name.Quoted and Name.Quoted.Escape are introduced as non-standard tokens * HTML and LaTeX formatters were confirmed to provide default formatting if they encounter these two non-standard tokens. They also add style classes based on the token name, like "n-Quoted" (HTML) or "nQuoted" (LaTeX) so that users can add custom styles for these. * Removed "\`" and "\\" as schema object name escapes. These are relics of the previous regular expression for backtick-quoted names and are not treated as escape sequences. The behavior was confirmed in the MySQL documentation as well as by running queries in MySQL Workbench. * Prevent "123abc" from being treated as an integer followed by a schema object name. MySQL allows leading numbers in schema object names as long as 0-9 are not the only characters in the schema object name. * Add ~10 more unit tests to validate behavior. 
Closes #1551 --- pygments/lexers/sql.py | 18 ++++++++--------- tests/examplefiles/mysql.txt | 2 +- tests/test_mysql.py | 39 ++++++++++++++++++++++++++++++------ 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/pygments/lexers/sql.py b/pygments/lexers/sql.py index 98d53c5ce6..8bea81486d 100644 --- a/pygments/lexers/sql.py +++ b/pygments/lexers/sql.py @@ -620,7 +620,7 @@ class MySqlLexer(RegexLexer): (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float), # Mandatory integer, optional fraction and exponent (r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float), # Mandatory fraction, optional integer and exponent (r'[0-9]+e[+-]?[0-9]+', Number.Float), # Exponents with integer significands are still floats - (r'[0-9]+', Number.Integer), + (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff]|$)', Number.Integer), # Integers that are not in a schema object name # Date literals (r"\{\s*d\s*(?P['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}", @@ -673,7 +673,7 @@ class MySqlLexer(RegexLexer): # numeric literals have already been handled above. # ('[0-9a-z$_\u0080-\uffff]+', Name), - (r'`', Name, 'schema-object-name'), + (r'`', Name.Quoted, 'schema-object-name'), # Punctuation (r'[(),.;]', Punctuation), @@ -737,15 +737,15 @@ class MySqlLexer(RegexLexer): # Schema object name substates # ---------------------------- # - # Backtick-quoted schema object names support escape characters. - # It may be desirable to tokenize escape sequences differently, - # but currently Pygments does not have an obvious token type for - # this unique situation (for example, "Name.Escape"). + # "Name.Quoted" and "Name.Quoted.Escape" are non-standard but + # formatters will style them as "Name" by default but add + # additional styles based on the token name. This gives users + # flexibility to add custom styles as desired. # 'schema-object-name': [ - (r'[^`\\]+', Name), - (r'(?:\\\\|\\`|``)', Name), # This could be an escaped name token type. 
- (r'`', Name, '#pop'), + (r'[^`]+', Name.Quoted), + (r'``', Name.Quoted.Escape), + (r'`', Name.Quoted, '#pop'), ], } diff --git a/tests/examplefiles/mysql.txt b/tests/examplefiles/mysql.txt index 4927abd833..c00b0b9269 100644 --- a/tests/examplefiles/mysql.txt +++ b/tests/examplefiles/mysql.txt @@ -107,7 +107,7 @@ CREATE TABLE basic ( SELECT e1.`apple` AS a, `example2`.b FROM example1 AS e1 JOIN example2 e2 -ON `example1`.`id` = e2.id; +ON `example1`.`a``b` = e2.`123`; -- Operators diff --git a/tests/test_mysql.py b/tests/test_mysql.py index 9b5e2b8cf8..249c3f00b6 100644 --- a/tests/test_mysql.py +++ b/tests/test_mysql.py @@ -29,10 +29,15 @@ def lexer(): @pytest.mark.parametrize('text', ('123',)) -def test_integer_literals(lexer, text): +def test_integer_literals_positive_match(lexer, text): assert list(lexer.get_tokens(text))[0] == (Number.Integer, text) +@pytest.mark.parametrize('text', ('1a', '1A', '1ひ', '1$', '1_', '1\u0080', '1\uffff')) +def test_integer_literals_negative_match(lexer, text): + assert list(lexer.get_tokens(text))[0][0] != Number.Integer + + @pytest.mark.parametrize( 'text', ( @@ -215,18 +220,40 @@ def test_functions(lexer, text): @pytest.mark.parametrize( 'text', ( - 'abc_$123', '上市年限', 'ひらがな', - '`a`', '`上市年限`', '`ひらがな`', '`select`', '`concat(`', - '````', r'`\``', r'`\\`', - '`-- `', '`/*`', '`#`', + 'abc_$123', '上市年限', 'ひらがな', '123_$abc', '123ひらがな', ), ) -def test_schema_object_names(lexer, text): +def test_schema_object_names_unquoted(lexer, text): tokens = list(lexer.get_tokens(text))[:-1] assert all(token[0] == Name for token in tokens) assert ''.join(token[1] for token in tokens) == text +@pytest.mark.parametrize( + 'text', + ( + '`a`', '`1`', '`上市年限`', '`ひらがな`', '`select`', '`concat(`', + '`-- `', '`/*`', '`#`', + ), +) +def test_schema_object_names_quoted(lexer, text): + tokens = list(lexer.get_tokens(text))[:-1] + assert tokens[0] == (Name.Quoted, '`') + assert tokens[1] == (Name.Quoted, text[1:-1]) + assert tokens[2] == 
(Name.Quoted, '`') + assert ''.join(token[1] for token in tokens) == text + + +@pytest.mark.parametrize('text', ('````', )) +def test_schema_object_names_quoted_escaped(lexer, text): + """Test quoted schema object names with escape sequences.""" + tokens = list(lexer.get_tokens(text))[:-1] + assert tokens[0] == (Name.Quoted, '`') + assert tokens[1] == (Name.Quoted.Escape, text[1:-1]) + assert tokens[2] == (Name.Quoted, '`') + assert ''.join(token[1] for token in tokens) == text + + @pytest.mark.parametrize( 'text', ('+', '*', '/', '%', '&&', ':=', '!', '<', '->>', '^', '|', '~'), From 51b21b4e4ba2915e73b88b854ff2937485b8ba0c Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sun, 27 Sep 2020 14:15:48 -0500 Subject: [PATCH 2/2] Remove an end-of-line regex match that triggered a lint warning Also, add tests that confirm correct behavior. No tests failed before or after removing the '$' match in the regex, but now regexlint isn't complaining. The safety of removing the '$' match probably depends on the fact that Pygments adds a newline at the end of the input text, so there is always something after a bare integer literal.
--- pygments/lexers/sql.py | 2 +- tests/test_mysql.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pygments/lexers/sql.py b/pygments/lexers/sql.py index 8bea81486d..e27e0ddb4f 100644 --- a/pygments/lexers/sql.py +++ b/pygments/lexers/sql.py @@ -620,7 +620,7 @@ class MySqlLexer(RegexLexer): (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float), # Mandatory integer, optional fraction and exponent (r'[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', Number.Float), # Mandatory fraction, optional integer and exponent (r'[0-9]+e[+-]?[0-9]+', Number.Float), # Exponents with integer significands are still floats - (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff]|$)', Number.Integer), # Integers that are not in a schema object name + (r'[0-9]+(?=[^0-9a-z$_\u0080-\uffff])', Number.Integer), # Integers that are not in a schema object name # Date literals (r"\{\s*d\s*(?P['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}", diff --git a/tests/test_mysql.py b/tests/test_mysql.py index 249c3f00b6..207ec822c4 100644 --- a/tests/test_mysql.py +++ b/tests/test_mysql.py @@ -28,13 +28,17 @@ def lexer(): yield MySqlLexer() -@pytest.mark.parametrize('text', ('123',)) +@pytest.mark.parametrize('text', ('1', '22', '22 333', '22 a', '22+', '22)', '22\n333', '22\r\n333')) def test_integer_literals_positive_match(lexer, text): - assert list(lexer.get_tokens(text))[0] == (Number.Integer, text) + """Validate that integer literals are tokenized as integers.""" + token = list(lexer.get_tokens(text))[0] + assert token[0] == Number.Integer + assert token[1] in {'1', '22'} -@pytest.mark.parametrize('text', ('1a', '1A', '1ひ', '1$', '1_', '1\u0080', '1\uffff')) +@pytest.mark.parametrize('text', ('1a', '1A', '1.', '1ひ', '1$', '1_', '1\u0080', '1\uffff')) def test_integer_literals_negative_match(lexer, text): + """Validate that non-integer texts are not matched as integers.""" assert list(lexer.get_tokens(text))[0][0] != Number.Integer