diff --git a/pygments/formatters/html.py b/pygments/formatters/html.py
index 29cc9fb8b1..cf2e079c37 100644
--- a/pygments/formatters/html.py
+++ b/pygments/formatters/html.py
@@ -9,6 +9,7 @@
:license: BSD, see LICENSE for details.
"""
+import functools
import os
import sys
import os.path
@@ -414,6 +415,7 @@ def __init__(self, **options):
self.tagurlformat = self._decodeifneeded(options.get('tagurlformat', ''))
self.filename = self._decodeifneeded(options.get('filename', ''))
self.wrapcode = get_bool_opt(options, 'wrapcode', False)
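+ # Cache of opening <span> tags keyed by token type; filled lazily in
+ # _format_lines() so repeated token types reuse the same markup.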
+ self.span_element_openers = {}
if self.tagsfile:
if not ctags:
@@ -455,13 +457,20 @@ def _get_css_class(self, ttype):
return ''
def _get_css_classes(self, ttype):
- """Return the css classes of this token type prefixed with
- the classprefix option."""
+ """Generate the opening tag for a given token type using CSS classes."""
cls = self._get_css_class(ttype)
while ttype not in STANDARD_TYPES:
ttype = ttype.parent
cls = self._get_css_class(ttype) + ' ' + cls
- return cls
+ return cls and '<span class="%s">' % cls or ''
+
+ def _get_css_inline_styles(self, ttype):
+ """Generate the opening tag for a given token type using inline CSS styles."""
+ cclass = self.ttype2class.get(ttype)
+ while cclass is None:
+ ttype = ttype.parent
+ cclass = self.ttype2class.get(ttype)
+ return cclass and '<span style="%s">' % self.class2style[cclass][0] or ''
def _create_stylesheet(self):
t2c = self.ttype2class = {Token: ''}
@@ -786,6 +795,11 @@ def _wrap_code(self, inner):
yield from inner
yield 0, '</code>'
+ @functools.lru_cache(maxsize=100)
+ def _translate_parts(self, value):
+ """HTML-escape a value and split it by newlines."""
+ return value.translate(_escape_html_table).split('\n')
+
def _format_lines(self, tokensource):
"""
Just format the tokens, without any wrapping tags.
@@ -793,26 +807,20 @@ def _format_lines(self, tokensource):
"""
nocls = self.noclasses
lsep = self.lineseparator
- # for lookup only
- getcls = self.ttype2class.get
- c2s = self.class2style
- escape_table = _escape_html_table
tagsfile = self.tagsfile
lspan = ''
line = []
for ttype, value in tokensource:
- if nocls:
- cclass = getcls(ttype)
- while cclass is None:
- ttype = ttype.parent
- cclass = getcls(ttype)
- cspan = cclass and '<span style="%s">' % c2s[cclass][0] or ''
- else:
- cls = self._get_css_classes(ttype)
- cspan = cls and '<span class="%s">' % cls or ''
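+ # Look up the opening <span> markup for this token type in the cache;
+ # compute and store it on the first miss.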
+ try:
+ cspan = self.span_element_openers[ttype]
+ except KeyError:
+ if nocls:
+ cspan = self.span_element_openers[ttype] = self._get_css_inline_styles(ttype)
+ else:
+ cspan = self.span_element_openers[ttype] = self._get_css_classes(ttype)
- parts = value.translate(escape_table).split('\n')
+ parts = self._translate_parts(value)
if tagsfile and ttype in Token.Name:
filename, linenumber = self._lookup_ctag(value)
diff --git a/pygments/lexers/data.py b/pygments/lexers/data.py
index fa05b10c67..96594a557d 100644
--- a/pygments/lexers/data.py
+++ b/pygments/lexers/data.py
@@ -11,7 +11,7 @@
import re
-from pygments.lexer import RegexLexer, ExtendedRegexLexer, LexerContext, \
+from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, LexerContext, \
include, bygroups, inherit
from pygments.token import Text, Comment, Keyword, Name, String, Number, \
Punctuation, Literal, Error
@@ -436,7 +436,7 @@ def get_tokens_unprocessed(self, text=None, context=None):
return super().get_tokens_unprocessed(text, context)
-class JsonLexer(RegexLexer):
+class JsonLexer(Lexer):
"""
For JSON data structures.
@@ -448,71 +448,188 @@ class JsonLexer(RegexLexer):
filenames = ['*.json', 'Pipfile.lock']
mimetypes = ['application/json']
- flags = re.DOTALL
+ # No validation of integers, floats, or constants is done.
+ # As long as the characters are members of the following
+ # sets, the token will be considered valid. For example,
+ #
+ # "--1--" is parsed as an integer
+ # "1...eee" is parsed as a float
+ # "trustful" is parsed as a constant
+ #
+ integers = set('-0123456789')
+ floats = set('.eE+')
+ constants = set('truefalsenull') # true|false|null
+ hexadecimals = set('0123456789abcdefABCDEF')
+ punctuations = set('{}[],')
+ whitespaces = {'\u0020', '\u000a', '\u000d', '\u0009'}
+
+ def get_tokens_unprocessed(self, text):
+ """Parse JSON data."""
+
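+ # Scanner state: "start" marks where the current token began, and the
+ # in_* flags record what kind of token is currently being consumed.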
+ in_string = False
+ in_escape = False
+ in_unicode_escape = 0
+ in_whitespace = False
+ in_constant = False
+ in_number = False
+ in_float = False
+ in_punctuation = False
+
+ start = 0
+
+ # The queue is used to store data that may need to be tokenized
+ # differently based on what follows. In particular, JSON object
+ # keys are tokenized differently than string values, but cannot
+ # be distinguished until punctuation is encountered outside the
+ # string.
+ #
+ # A ":" character after the string indicates that the string is
+ # an object key; any other character indicates the string is a
+ # regular string value.
+ #
+ # The queue holds tuples that contain the following data:
+ #
+ # (start_index, token_type, text)
+ #
+ # By default the token type of text in double quotes is
+ # String.Double. The token type will be replaced if a colon
+ # is encountered after the string closes.
+ #
+ queue = []
+
+ for stop, character in enumerate(text):
+ if in_string:
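+ # Inside a string: track backslash escapes and count the four hex
+ # digits of a \uXXXX escape so an escaped quote does not end the string.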
+ if in_unicode_escape:
+ if character in self.hexadecimals:
+ in_unicode_escape -= 1
+ if not in_unicode_escape:
+ in_escape = False
+ else:
+ in_unicode_escape = 0
+ in_escape = False
+
+ elif in_escape:
+ if character == 'u':
+ in_unicode_escape = 4
+ else:
+ in_escape = False
+
+ elif character == '\\':
+ in_escape = True
+
+ elif character == '"':
+ queue.append((start, String.Double, text[start:stop + 1]))
+ in_string = False
+ in_escape = False
+ in_unicode_escape = 0
+
+ continue
+
+ elif in_whitespace:
+ if character in self.whitespaces:
+ continue
+
+ if queue:
+ queue.append((start, Text, text[start:stop]))
+ else:
+ yield start, Text, text[start:stop]
+ in_whitespace = False
+ # Fall through so the new character can be evaluated.
+
+ elif in_constant:
+ if character in self.constants:
+ continue
+
+ yield start, Keyword.Constant, text[start:stop]
+ in_constant = False
+ # Fall through so the new character can be evaluated.
+
+ elif in_number:
+ if character in self.integers:
+ continue
+ elif character in self.floats:
+ in_float = True
+ continue
+
+ if in_float:
+ yield start, Number.Float, text[start:stop]
+ else:
+ yield start, Number.Integer, text[start:stop]
+ in_number = False
+ in_float = False
+ # Fall through so the new character can be evaluated.
+
+ elif in_punctuation:
+ if character in self.punctuations:
+ continue
+
+ yield start, Punctuation, text[start:stop]
+ in_punctuation = False
+ # Fall through so the new character can be evaluated.
+
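+ # No token is in progress here, so classify the current character and
+ # begin a new token at this position.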
+ start = stop
+
+ if character == '"':
+ in_string = True
+
+ elif character in self.whitespaces:
+ in_whitespace = True
+
+ elif character in {'f', 'n', 't'}: # The first letters of true|false|null
+ # Exhaust the queue. Accept the existing token types.
+ yield from queue
+ queue.clear()
+
+ in_constant = True
+
+ elif character in self.integers:
+ # Exhaust the queue. Accept the existing token types.
+ yield from queue
+ queue.clear()
+
+ in_number = True
+
+ elif character == ':':
+ # Yield from the queue. Replace string token types.
+ for _start, _token, _text in queue:
+ if _token is Text:
+ yield _start, _token, _text
+ elif _token is String.Double:
+ yield _start, Name.Tag, _text
+ else:
+ yield _start, Error, _text
+ queue.clear()
+
+ in_punctuation = True
+
+ elif character in self.punctuations:
+ # Exhaust the queue. Accept the existing token types.
+ yield from queue
+ queue.clear()
+
+ in_punctuation = True
- # integer part of a number
- int_part = r'-?(0|[1-9]\d*)'
-
- # fractional part of a number
- frac_part = r'\.\d+'
-
- # exponential part of a number
- exp_part = r'[eE](\+|-)?\d+'
-
- tokens = {
- 'whitespace': [
- (r'\s+', Text),
- ],
-
- # represents a simple terminal value
- 'simplevalue': [
- (r'(true|false|null)\b', Keyword.Constant),
- (('%(int_part)s(%(frac_part)s%(exp_part)s|'
- '%(exp_part)s|%(frac_part)s)') % vars(),
- Number.Float),
- (int_part, Number.Integer),
- (r'"(\\(["\\/bfnrt]|u[a-fA-F0-9]{4})|[^\\"])*"', String.Double),
- ],
-
-
- # the right hand side of an object, after the attribute name
- 'objectattribute': [
- include('value'),
- (r':', Punctuation),
- # comma terminates the attribute but expects more
- (r',', Punctuation, '#pop'),
- # a closing bracket terminates the entire object, so pop twice
- (r'\}', Punctuation, '#pop:2'),
- ],
-
- # a json object - { attr, attr, ... }
- 'objectvalue': [
- include('whitespace'),
- (r'"(\\(["\\/bfnrt]|u[a-fA-F0-9]{4})|[^\\"])*"', Name.Tag, 'objectattribute'),
- (r'\}', Punctuation, '#pop'),
- ],
-
- # json array - [ value, value, ... }
- 'arrayvalue': [
- include('whitespace'),
- include('value'),
- (r',', Punctuation),
- (r'\]', Punctuation, '#pop'),
- ],
-
- # a json value - either a simple value or a complex value (object or array)
- 'value': [
- include('whitespace'),
- include('simplevalue'),
- (r'\{', Punctuation, 'objectvalue'),
- (r'\[', Punctuation, 'arrayvalue'),
- ],
-
- # the root of a json document whould be a value
- 'root': [
- include('value'),
- ],
- }
+ else:
+ # Exhaust the queue. Accept the existing token types.
+ yield from queue
+ queue.clear()
+
+ yield start, Error, character
+
+ # Yield any remaining text.
+ yield from queue
+ if in_string:
+ yield start, Error, text[start:]
+ elif in_float:
+ yield start, Number.Float, text[start:]
+ elif in_number:
+ yield start, Number.Integer, text[start:]
+ elif in_constant:
+ yield start, Keyword.Constant, text[start:]
+ elif in_whitespace:
+ yield start, Text, text[start:]
+ elif in_punctuation:
+ yield start, Punctuation, text[start:]
class JsonBareObjectLexer(JsonLexer):
@@ -527,21 +644,10 @@ class JsonBareObjectLexer(JsonLexer):
filenames = []
mimetypes = ['application/json-object']
- tokens = {
- 'root': [
- (r'\}', Error),
- include('objectvalue'),
- ],
- 'objectattribute': [
- (r'\}', Error),
- inherit,
- ],
- }
-
class JsonLdLexer(JsonLexer):
"""
- For `JSON-LD <http://json-ld.org/>`_ linked data.
+ For `JSON-LD <https://json-ld.org/>`_ linked data.
.. versionadded:: 2.0
"""
@@ -551,11 +657,38 @@ class JsonLdLexer(JsonLexer):
filenames = ['*.jsonld']
mimetypes = ['application/ld+json']
- tokens = {
- 'objectvalue': [
- (r'"@(context|id|value|language|type|container|list|set|'
- r'reverse|index|base|vocab|graph)"', Name.Decorator,
- 'objectattribute'),
- inherit,
- ],
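+ # Object keys (quotes included) that JSON-LD treats as keywords;
+ # compared verbatim against Name.Tag token text below.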
+ json_ld_keywords = {
+ '"@%s"' % keyword
+ for keyword in (
+ 'base',
+ 'container',
+ 'context',
+ 'direction',
+ 'graph',
+ 'id',
+ 'import',
+ 'included',
+ 'index',
+ 'json',
+ 'language',
+ 'list',
+ 'nest',
+ 'none',
+ 'prefix',
+ 'propagate',
+ 'protected',
+ 'reverse',
+ 'set',
+ 'type',
+ 'value',
+ 'version',
+ 'vocab',
+ )
}
+
+ def get_tokens_unprocessed(self, text):
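+ # Delegate to the JSON lexer, then re-tag object keys that are
+ # JSON-LD keywords as Name.Decorator.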
+ for start, token, value in super(JsonLdLexer, self).get_tokens_unprocessed(text):
+ if token is Name.Tag and value in self.json_ld_keywords:
+ yield start, Name.Decorator, value
+ else:
+ yield start, token, value
diff --git a/tests/test_data.py b/tests/test_data.py
index db094d5236..5388910ae5 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -7,10 +7,12 @@
:license: BSD, see LICENSE for details.
"""
+import time
+
import pytest
-from pygments.lexers import JsonLexer, JsonBareObjectLexer, YamlLexer
-from pygments.token import Token
+from pygments.lexers.data import JsonLexer, JsonBareObjectLexer, JsonLdLexer, YamlLexer
+from pygments.token import Token, Punctuation, Text, Number, String, Keyword, Name
@pytest.fixture(scope='module')
@@ -23,11 +25,186 @@ def lexer_bare():
yield JsonBareObjectLexer()
+@pytest.fixture(scope='module')
+def lexer_json_ld():
+ yield JsonLdLexer()
+
+
@pytest.fixture(scope='module')
def lexer_yaml():
yield YamlLexer()
+@pytest.mark.parametrize(
+ 'text, expected_token_types',
+ (
+ # Integers
+ ('0', (Number.Integer,)),
+ ('-1', (Number.Integer,)),
+ ('1234567890', (Number.Integer,)),
+ ('-1234567890', (Number.Integer,)),
+
+ # Floats, including scientific notation
+ ('123456789.0123456789', (Number.Float,)),
+ ('-123456789.0123456789', (Number.Float,)),
+ ('1e10', (Number.Float,)),
+ ('-1E10', (Number.Float,)),
+ ('1e-10', (Number.Float,)),
+ ('-1E+10', (Number.Float,)),
+ ('1.0e10', (Number.Float,)),
+ ('-1.0E10', (Number.Float,)),
+ ('1.0e-10', (Number.Float,)),
+ ('-1.0E+10', (Number.Float,)),
+
+ # Strings (escapes are tested elsewhere)
+ ('""', (String.Double,)),
+ ('"abc"', (String.Double,)),
+ ('"ひらがな"', (String.Double,)),
+ ('"123"', (String.Double,)),
+ ('"[]"', (String.Double,)),
+ ('"{}"', (String.Double,)),
+ ('"true"', (String.Double,)),
+ ('"false"', (String.Double,)),
+ ('"null"', (String.Double,)),
+ ('":,"', (String.Double,)),
+
+ # Constants
+ ('true', (Keyword.Constant, )),
+ ('false', (Keyword.Constant, )),
+ ('null', (Keyword.Constant, )),
+
+ # Whitespace
+ ('\u0020', (Text,)), # space
+ ('\u000a', (Text,)), # newline
+ ('\u000d', (Text,)), # carriage return
+ ('\u0009', (Text,)), # tab
+
+ # Arrays
+ ('[]', (Punctuation,)),
+ ('["a", "b"]', (Punctuation, String.Double, Punctuation, Text, String.Double, Punctuation)),
+
+ # Objects
+ ('{}', (Punctuation,)),
+ ('{"a": "b"}', (Punctuation, Name.Tag, Punctuation, Text, String.Double, Punctuation)),
+ )
+)
+def test_json_literals_positive_match(lexer_json, text, expected_token_types):
+ """Validate that syntactically-correct JSON literals are parsed correctly."""
+
+ tokens = list(lexer_json.get_tokens_unprocessed(text))
+ assert len(tokens) == len(expected_token_types)
+ assert all(token[1] is expected_token for token, expected_token in zip(tokens, expected_token_types))
+ assert ''.join(token[2] for token in tokens) == text
+
+
+@pytest.mark.parametrize(
+ 'text',
+ (
+ '"', '\\', '/', 'b', 'f', 'n', 'r', 't',
+ 'u0123', 'u4567', 'u89ab', 'ucdef', 'uABCD', 'uEF01',
+ )
+)
+def test_json_object_key_escapes_positive_match(lexer_json, text):
+ """Validate that escape sequences in JSON object keys are parsed correctly."""
+
+ tokens = list(lexer_json.get_tokens_unprocessed('{"\\%s": 1}' % text))
+ assert len(tokens) == 6
+ assert tokens[1][1] is Name.Tag
+ assert tokens[1][2] == '"\\%s"' % text
+
+
+@pytest.mark.parametrize(
+ 'text',
+ (
+ '"', '\\', '/', 'b', 'f', 'n', 'r', 't',
+ 'u0123', 'u4567', 'u89ab', 'ucdef', 'uABCD', 'uEF01',
+ )
+)
+def test_json_string_escapes_positive_match(lexer_json, text):
+ """Validate that escape sequences in JSON string values are parsed correctly."""
+
+ text = '"\\%s"' % text
+ tokens = list(lexer_json.get_tokens_unprocessed(text))
+ assert len(tokens) == 1
+ assert tokens[0][1] is String.Double
+ assert tokens[0][2] == text
+
+
+@pytest.mark.parametrize('text', ('+\n', '0\n', '""0\n', 'a\nb\n',))
+def test_json_round_trip_errors(lexer_json, text):
+ """Validate that past round-trip errors never crop up again."""
+
+ tokens = list(lexer_json.get_tokens_unprocessed(text))
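+ # Whatever token types were produced, concatenating their text must
+ # reproduce the input exactly (no characters dropped or duplicated).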
+ assert ''.join(t[2] for t in tokens) == text
+
+
+def test_json_escape_backtracking(lexer_json):
+ """Confirm that there is no catastrophic backtracking in the lexer.
+
+ This no longer applies because the JSON lexer doesn't use regular expressions,
+ but the test is kept to guard against future regressions.
+ """
+
+ fragment = r'{"\u00D0000\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\63CD'
+ start_time = time.time()
+ list(lexer_json.get_tokens(fragment))
+ assert time.time() - start_time < 1, 'The JSON lexer may have catastrophic backtracking'
+
+
+@pytest.mark.parametrize(
+ 'keyword',
+ (
+ 'base',
+ 'container',
+ 'context',
+ 'direction',
+ 'graph',
+ 'id',
+ 'import',
+ 'included',
+ 'index',
+ 'json',
+ 'language',
+ 'list',
+ 'nest',
+ 'none',
+ 'prefix',
+ 'propagate',
+ 'protected',
+ 'reverse',
+ 'set',
+ 'type',
+ 'value',
+ 'version',
+ 'vocab',
+ )
+)
+def test_json_ld_keywords_positive_match(lexer_json_ld, keyword):
+ """Validate that JSON-LD keywords are parsed correctly."""
+
+ tokens = list(lexer_json_ld.get_tokens_unprocessed('{"@%s": ""}' % keyword))
+ assert len(tokens) == 6
+ assert tokens[1][1] is Token.Name.Decorator
+ assert tokens[1][2] == '"@%s"' % keyword
+
+
+@pytest.mark.parametrize(
+ 'keyword',
+ (
+ '@bogus', # "@" does not guarantee a keyword match
+ '@bases', # Begins with the keyword "@base"
+ 'container', # Matches "container" but has no leading "@"
+ )
+)
+def test_json_ld_keywords_negative_match(lexer_json_ld, keyword):
+ """Validate that JSON-LD non-keywords are parsed correctly."""
+
+ tokens = list(lexer_json_ld.get_tokens_unprocessed('{"%s": ""}' % keyword))
+ assert len(tokens) == 6
+ assert tokens[1][1] is Token.Name.Tag
+ assert tokens[1][2] == '"%s"' % keyword
+
+
def test_basic_json(lexer_json):
fragment = '{"foo": "bar", "foo2": [1, 2, 3], "\\u0123": "\\u0123"}\n'
tokens = [
@@ -49,8 +226,7 @@ def test_basic_json(lexer_json):
(Token.Punctuation, ','),
(Token.Text, ' '),
(Token.Literal.Number.Integer, '3'),
- (Token.Punctuation, ']'),
- (Token.Punctuation, ','),
+ (Token.Punctuation, '],'),
(Token.Text, ' '),
(Token.Name.Tag, '"\\u0123"'),
(Token.Punctuation, ':'),
@@ -62,33 +238,6 @@ def test_basic_json(lexer_json):
assert list(lexer_json.get_tokens(fragment)) == tokens
-def test_json_escape_backtracking(lexer_json):
- # This tests that an (invalid) sequence of escapes doesn't cause the lexer
- # to fall into catastrophic backtracking. unfortunately, if it's broken
- # this test will hang and that's how we know it's broken :(
- fragment = r'{"\u00D0000\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\63CD'
- tokens = [
- (Token.Punctuation, '{'),
- (Token.Error, r'"'),
- (Token.Error, '\\'),
- (Token.Error, r'u'),
- (Token.Error, r'0'),
- (Token.Error, r'0'),
- (Token.Error, r'D'),
- (Token.Error, r'0'),
- (Token.Error, r'0'),
- (Token.Error, r'0'),
- (Token.Error, r'0')
- ] + [(Token.Error, '\\')] * 178 + [
- (Token.Error, r'6'),
- (Token.Error, r'3'),
- (Token.Error, r'C'),
- (Token.Error, r'D'),
- (Token.Text, '\n')]
-
- assert list(lexer_json.get_tokens(fragment)) == tokens
-
-
def test_basic_bare(lexer_bare):
# This is the same as testBasic for JsonLexer above, except the
# enclosing curly braces are removed.
@@ -117,31 +266,6 @@ def test_basic_bare(lexer_bare):
assert list(lexer_bare.get_tokens(fragment)) == tokens
-def test_closing_curly(lexer_bare):
- # This can be an Error token, but should not be a can't-pop-from-stack
- # exception.
- fragment = '}"a"\n'
- tokens = [
- (Token.Error, '}'),
- (Token.Name.Tag, '"a"'),
- (Token.Text, '\n'),
- ]
- assert list(lexer_bare.get_tokens(fragment)) == tokens
-
-
-def test_closing_curly_in_value(lexer_bare):
- fragment = '"": ""}\n'
- tokens = [
- (Token.Name.Tag, '""'),
- (Token.Punctuation, ':'),
- (Token.Text, ' '),
- (Token.Literal.String.Double, '""'),
- (Token.Error, '}'),
- (Token.Text, '\n'),
- ]
- assert list(lexer_bare.get_tokens(fragment)) == tokens
-
-
def test_yaml(lexer_yaml):
# Bug #1528: This previously parsed 'token # innocent' as a tag
fragment = 'here: token # innocent: comment\n'