diff --git a/pygments/formatters/html.py b/pygments/formatters/html.py
index 29cc9fb8b1..cf2e079c37 100644
--- a/pygments/formatters/html.py
+++ b/pygments/formatters/html.py
@@ -9,6 +9,7 @@
     :license: BSD, see LICENSE for details.
 """
 
+import functools
 import os
 import sys
 import os.path
@@ -414,6 +415,7 @@ def __init__(self, **options):
         self.tagurlformat = self._decodeifneeded(options.get('tagurlformat', ''))
         self.filename = self._decodeifneeded(options.get('filename', ''))
         self.wrapcode = get_bool_opt(options, 'wrapcode', False)
+        self.span_element_openers = {}
 
         if self.tagsfile:
             if not ctags:
@@ -455,13 +457,20 @@ def _get_css_class(self, ttype):
         return ''
 
     def _get_css_classes(self, ttype):
-        """Return the css classes of this token type prefixed with
-        the classprefix option."""
+        """Generate the opening <span> tag for a given token type using CSS classes."""
         cls = self._get_css_class(ttype)
         while ttype not in STANDARD_TYPES:
             ttype = ttype.parent
             cls = self._get_css_class(ttype) + ' ' + cls
-        return cls
+        return cls and '<span class="%s">' % cls or ''
+
+    def _get_css_inline_styles(self, ttype):
+        """Generate the opening <span> tag for a given token type using inline CSS styles."""
+        cclass = self.ttype2class.get(ttype)
+        while cclass is None:
+            ttype = ttype.parent
+            cclass = self.ttype2class.get(ttype)
+        return cclass and '<span style="%s">' % self.class2style[cclass][0] or ''
 
     def _create_stylesheet(self):
         t2c = self.ttype2class = {Token: ''}
@@ -786,6 +795,11 @@ def _wrap_code(self, inner):
         yield from inner
         yield 0, '</code>'
 
+    @functools.lru_cache(maxsize=100)
+    def _translate_parts(self, value):
+        """HTML-escape a value and split it by newlines."""
+        return value.translate(_escape_html_table).split('\n')
+
     def _format_lines(self, tokensource):
         """
         Just format the tokens, without any wrapping tags.
@@ -793,26 +807,20 @@ def _format_lines(self, tokensource):
         """
         nocls = self.noclasses
         lsep = self.lineseparator
-        # for <span style=""> lookup only
-        getcls = self.ttype2class.get
-        c2s = self.class2style
-        escape_table = _escape_html_table
         tagsfile = self.tagsfile
 
         lspan = ''
         line = []
         for ttype, value in tokensource:
-            if nocls:
-                cclass = getcls(ttype)
-                while cclass is None:
-                    ttype = ttype.parent
-                    cclass = getcls(ttype)
-                cspan = cclass and '<span style="%s">' % c2s[cclass][0] or ''
-            else:
-                cls = self._get_css_classes(ttype)
-                cspan = cls and '<span class="%s">' % cls or ''
+            try:
+                cspan = self.span_element_openers[ttype]
+            except KeyError:
+                if nocls:
+                    cspan = self.span_element_openers[ttype] = self._get_css_inline_styles(ttype)
+                else:
+                    cspan = self.span_element_openers[ttype] = self._get_css_classes(ttype)
 
-            parts = value.translate(escape_table).split('\n')
+            parts = self._translate_parts(value)
 
             if tagsfile and ttype in Token.Name:
                 filename, linenumber = self._lookup_ctag(value)
diff --git a/pygments/lexers/data.py b/pygments/lexers/data.py
index fa05b10c67..96594a557d 100644
--- a/pygments/lexers/data.py
+++ b/pygments/lexers/data.py
@@ -11,7 +11,7 @@
 
 import re
 
-from pygments.lexer import RegexLexer, ExtendedRegexLexer, LexerContext, \
+from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, LexerContext, \
     include, bygroups, inherit
 from pygments.token import Text, Comment, Keyword, Name, String, Number, \
     Punctuation, Literal, Error
@@ -436,7 +436,7 @@ def get_tokens_unprocessed(self, text=None, context=None):
         return super().get_tokens_unprocessed(text, context)
 
 
-class JsonLexer(RegexLexer):
+class JsonLexer(Lexer):
     """
     For JSON data structures.
 
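Note on the pygments/formatters/html.py hunks above: the formatter now caches the opening <span> tag per token type in span_element_openers (a dict probed with try/except) and memoizes the HTML escape-and-split step with functools.lru_cache, so repeated token types and repeated token values skip the string formatting work. A minimal standalone sketch of that caching pattern, with illustrative names that are not part of the Pygments API:

    import functools

    _escape_table = {ord('&'): '&amp;', ord('<'): '&lt;', ord('>'): '&gt;'}

    class OpenerCache:
        """Compute each opening tag once and reuse it; token types are few."""

        def __init__(self, css_class_for):
            self._css_class_for = css_class_for  # callable: token type -> CSS class string
            self._openers = {}                   # token type -> '<span class="...">'

        def opener(self, ttype):
            try:
                return self._openers[ttype]      # hot path: already computed
            except KeyError:
                cls = self._css_class_for(ttype)
                opener = self._openers[ttype] = cls and '<span class="%s">' % cls or ''
                return opener

    @functools.lru_cache(maxsize=100)
    def translate_parts(value):
        """HTML-escape a token value and split it on newlines (memoized)."""
        return value.translate(_escape_table).split('\n')

The bounded lru_cache pays off because token values repeat heavily in real source code, while the size cap keeps memory flat on pathological input.
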
@@ -448,71 +448,188 @@ class JsonLexer(RegexLexer):
     filenames = ['*.json', 'Pipfile.lock']
     mimetypes = ['application/json']
 
-    flags = re.DOTALL
+    # No validation of integers, floats, or constants is done.
+    # As long as the characters are members of the following
+    # sets, the token will be considered valid. For example,
+    #
+    #     "--1--" is parsed as an integer
+    #     "1...eee" is parsed as a float
+    #     "trustful" is parsed as a constant
+    #
+    integers = set('-0123456789')
+    floats = set('.eE+')
+    constants = set('truefalsenull')  # true|false|null
+    hexadecimals = set('0123456789abcdefABCDEF')
+    punctuations = set('{}[],')
+    whitespaces = {'\u0020', '\u000a', '\u000d', '\u0009'}
+
+    def get_tokens_unprocessed(self, text):
+        """Parse JSON data."""
+
+        in_string = False
+        in_escape = False
+        in_unicode_escape = 0
+        in_whitespace = False
+        in_constant = False
+        in_number = False
+        in_float = False
+        in_punctuation = False
+
+        start = 0
+
+        # The queue is used to store data that may need to be tokenized
+        # differently based on what follows. In particular, JSON object
+        # keys are tokenized differently than string values, but cannot
+        # be distinguished until punctuation is encountered outside the
+        # string.
+        #
+        # A ":" character after the string indicates that the string is
+        # an object key; any other character indicates the string is a
+        # regular string value.
+        #
+        # The queue holds tuples that contain the following data:
+        #
+        #     (start_index, token_type, text)
+        #
+        # By default the token type of text in double quotes is
+        # String.Double. The token type will be replaced if a colon
+        # is encountered after the string closes.
+        #
+        queue = []
+
+        for stop, character in enumerate(text):
+            if in_string:
+                if in_unicode_escape:
+                    if character in self.hexadecimals:
+                        in_unicode_escape -= 1
+                        if not in_unicode_escape:
+                            in_escape = False
+                    else:
+                        in_unicode_escape = 0
+                        in_escape = False
+
+                elif in_escape:
+                    if character == 'u':
+                        in_unicode_escape = 4
+                    else:
+                        in_escape = False
+
+                elif character == '\\':
+                    in_escape = True
+
+                elif character == '"':
+                    queue.append((start, String.Double, text[start:stop + 1]))
+                    in_string = False
+                    in_escape = False
+                    in_unicode_escape = 0
+
+                continue
+
+            elif in_whitespace:
+                if character in self.whitespaces:
+                    continue
+
+                if queue:
+                    queue.append((start, Text, text[start:stop]))
+                else:
+                    yield start, Text, text[start:stop]
+                in_whitespace = False
+                # Fall through so the new character can be evaluated.
+
+            elif in_constant:
+                if character in self.constants:
+                    continue
+
+                yield start, Keyword.Constant, text[start:stop]
+                in_constant = False
+                # Fall through so the new character can be evaluated.
+
+            elif in_number:
+                if character in self.integers:
+                    continue
+                elif character in self.floats:
+                    in_float = True
+                    continue
+
+                if in_float:
+                    yield start, Number.Float, text[start:stop]
+                else:
+                    yield start, Number.Integer, text[start:stop]
+                in_number = False
+                in_float = False
+                # Fall through so the new character can be evaluated.
+
+            elif in_punctuation:
+                if character in self.punctuations:
+                    continue
+
+                yield start, Punctuation, text[start:stop]
+                in_punctuation = False
+                # Fall through so the new character can be evaluated.
+
+            start = stop
+
+            if character == '"':
+                in_string = True
+
+            elif character in self.whitespaces:
+                in_whitespace = True
+
+            elif character in {'f', 'n', 't'}:  # The first letters of true|false|null
+                # Exhaust the queue. Accept the existing token types.
+                yield from queue
+                queue.clear()
+
+                in_constant = True
+
+            elif character in self.integers:
+                # Exhaust the queue. Accept the existing token types.
+                yield from queue
+                queue.clear()
+
+                in_number = True
+
+            elif character == ':':
+                # Yield from the queue. Replace string token types.
+                for _start, _token, _text in queue:
+                    if _token is Text:
+                        yield _start, _token, _text
+                    elif _token is String.Double:
+                        yield _start, Name.Tag, _text
+                    else:
+                        yield _start, Error, _text
+                queue.clear()
+
+                in_punctuation = True
+
+            elif character in self.punctuations:
+                # Exhaust the queue. Accept the existing token types.
+                yield from queue
+                queue.clear()
+
+                in_punctuation = True
 
-    # integer part of a number
-    int_part = r'-?(0|[1-9]\d*)'
-
-    # fractional part of a number
-    frac_part = r'\.\d+'
-
-    # exponential part of a number
-    exp_part = r'[eE](\+|-)?\d+'
-
-    tokens = {
-        'whitespace': [
-            (r'\s+', Text),
-        ],
-
-        # represents a simple terminal value
-        'simplevalue': [
-            (r'(true|false|null)\b', Keyword.Constant),
-            (('%(int_part)s(%(frac_part)s%(exp_part)s|'
-              '%(exp_part)s|%(frac_part)s)') % vars(),
-             Number.Float),
-            (int_part, Number.Integer),
-            (r'"(\\(["\\/bfnrt]|u[a-fA-F0-9]{4})|[^\\"])*"', String.Double),
-        ],
-
-
-        # the right hand side of an object, after the attribute name
-        'objectattribute': [
-            include('value'),
-            (r':', Punctuation),
-            # comma terminates the attribute but expects more
-            (r',', Punctuation, '#pop'),
-            # a closing bracket terminates the entire object, so pop twice
-            (r'\}', Punctuation, '#pop:2'),
-        ],
-
-        # a json object - { attr, attr, ... }
-        'objectvalue': [
-            include('whitespace'),
-            (r'"(\\(["\\/bfnrt]|u[a-fA-F0-9]{4})|[^\\"])*"', Name.Tag, 'objectattribute'),
-            (r'\}', Punctuation, '#pop'),
-        ],
-
-        # json array - [ value, value, ... }
-        'arrayvalue': [
-            include('whitespace'),
-            include('value'),
-            (r',', Punctuation),
-            (r'\]', Punctuation, '#pop'),
-        ],
-
-        # a json value - either a simple value or a complex value (object or array)
-        'value': [
-            include('whitespace'),
-            include('simplevalue'),
-            (r'\{', Punctuation, 'objectvalue'),
-            (r'\[', Punctuation, 'arrayvalue'),
-        ],
-
-        # the root of a json document whould be a value
-        'root': [
-            include('value'),
-        ],
-    }
+            else:
+                # Exhaust the queue. Accept the existing token types.
+                yield from queue
+                queue.clear()
+
+                yield start, Error, character
+
+        # Yield any remaining text.
+        yield from queue
+        if in_string:
+            yield start, Error, text[start:]
+        elif in_float:
+            yield start, Number.Float, text[start:]
+        elif in_number:
+            yield start, Number.Integer, text[start:]
+        elif in_constant:
+            yield start, Keyword.Constant, text[start:]
+        elif in_whitespace:
+            yield start, Text, text[start:]
+        elif in_punctuation:
+            yield start, Punctuation, text[start:]
 
 
 class JsonBareObjectLexer(JsonLexer):
@@ -527,21 +644,10 @@ class JsonBareObjectLexer(JsonLexer):
     filenames = []
     mimetypes = ['application/json-object']
 
-    tokens = {
-        'root': [
-            (r'\}', Error),
-            include('objectvalue'),
-        ],
-        'objectattribute': [
-            (r'\}', Error),
-            inherit,
-        ],
-    }
-
 
 class JsonLdLexer(JsonLexer):
     """
-    For `JSON-LD <http://json-ld.org/>`_ linked data.
+    For `JSON-LD <https://json-ld.org/>`_ linked data.
 
     .. versionadded:: 2.0
     """
@@ -551,11 +657,38 @@
     filenames = ['*.jsonld']
     mimetypes = ['application/ld+json']
 
-    tokens = {
-        'objectvalue': [
-            (r'"@(context|id|value|language|type|container|list|set|'
-             r'reverse|index|base|vocab|graph)"', Name.Decorator,
-             'objectattribute'),
-            inherit,
-        ],
+    json_ld_keywords = {
+        '"@%s"' % keyword
+        for keyword in (
+            'base',
+            'container',
+            'context',
+            'direction',
+            'graph',
+            'id',
+            'import',
+            'included',
+            'index',
+            'json',
+            'language',
+            'list',
+            'nest',
+            'none',
+            'prefix',
+            'propagate',
+            'protected',
+            'reverse',
+            'set',
+            'type',
+            'value',
+            'version',
+            'vocab',
+        )
     }
+
+    def get_tokens_unprocessed(self, text):
+        for start, token, value in super(JsonLdLexer, self).get_tokens_unprocessed(text):
+            if token is Name.Tag and value in self.json_ld_keywords:
+                yield start, Name.Decorator, value
+            else:
+                yield start, token, value
diff --git a/tests/test_data.py b/tests/test_data.py
index db094d5236..5388910ae5 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -7,10 +7,12 @@
     :license: BSD, see LICENSE for details.
 """
 
+import time
+
 import pytest
 
-from pygments.lexers import JsonLexer, JsonBareObjectLexer, YamlLexer
-from pygments.token import Token
+from pygments.lexers.data import JsonLexer, JsonBareObjectLexer, JsonLdLexer, YamlLexer
+from pygments.token import Token, Punctuation, Text, Number, String, Keyword, Name
 
 
 @pytest.fixture(scope='module')
@@ -23,11 +25,186 @@ def lexer_bare():
     yield JsonBareObjectLexer()
 
 
+@pytest.fixture(scope='module')
+def lexer_json_ld():
+    yield JsonLdLexer()
+
+
 @pytest.fixture(scope='module')
 def lexer_yaml():
     yield YamlLexer()
 
 
+@pytest.mark.parametrize(
+    'text, expected_token_types',
+    (
+        # Integers
+        ('0', (Number.Integer,)),
+        ('-1', (Number.Integer,)),
+        ('1234567890', (Number.Integer,)),
+        ('-1234567890', (Number.Integer,)),
+
+        # Floats, including scientific notation
+        ('123456789.0123456789', (Number.Float,)),
+        ('-123456789.0123456789', (Number.Float,)),
+        ('1e10', (Number.Float,)),
+        ('-1E10', (Number.Float,)),
+        ('1e-10', (Number.Float,)),
+        ('-1E+10', (Number.Float,)),
+        ('1.0e10', (Number.Float,)),
+        ('-1.0E10', (Number.Float,)),
+        ('1.0e-10', (Number.Float,)),
+        ('-1.0E+10', (Number.Float,)),
+
+        # Strings (escapes are tested elsewhere)
+        ('""', (String.Double,)),
+        ('"abc"', (String.Double,)),
+        ('"ひらがな"', (String.Double,)),
+        ('"123"', (String.Double,)),
+        ('"[]"', (String.Double,)),
+        ('"{}"', (String.Double,)),
+        ('"true"', (String.Double,)),
+        ('"false"', (String.Double,)),
+        ('"null"', (String.Double,)),
+        ('":,"', (String.Double,)),
+
+        # Constants
+        ('true', (Keyword.Constant, )),
+        ('false', (Keyword.Constant, )),
+        ('null', (Keyword.Constant, )),
+
+        # Whitespace
+        ('\u0020', (Text,)),  # space
+        ('\u000a', (Text,)),  # newline
+        ('\u000d', (Text,)),  # carriage return
+        ('\u0009', (Text,)),  # tab
+
+        # Arrays
+        ('[]', (Punctuation,)),
+        ('["a", "b"]', (Punctuation, String.Double, Punctuation, Text, String.Double, Punctuation)),
+
+        # Objects
+        ('{}', (Punctuation,)),
+        ('{"a": "b"}', (Punctuation, Name.Tag, Punctuation, Text, String.Double, Punctuation)),
+    )
+)
+def test_json_literals_positive_match(lexer_json, text, expected_token_types):
+    """Validate that syntactically-correct JSON literals are parsed correctly."""
+
+    tokens = list(lexer_json.get_tokens_unprocessed(text))
+    assert len(tokens) == len(expected_token_types)
+    assert all(token[1] is expected_token for token, expected_token in zip(tokens, expected_token_types))
+    assert ''.join(token[2] for token in tokens) == text
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        '"', '\\', '/', 'b', 'f', 'n', 'r', 't',
+        'u0123', 'u4567', 'u89ab', 'ucdef', 'uABCD', 'uEF01',
+    )
+)
+def test_json_object_key_escapes_positive_match(lexer_json, text):
+    """Validate that escape sequences in JSON object keys are parsed correctly."""
+
+    tokens = list(lexer_json.get_tokens_unprocessed('{"\\%s": 1}' % text))
+    assert len(tokens) == 6
+    assert tokens[1][1] is Name.Tag
+    assert tokens[1][2] == '"\\%s"' % text
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        '"', '\\', '/', 'b', 'f', 'n', 'r', 't',
+        'u0123', 'u4567', 'u89ab', 'ucdef', 'uABCD', 'uEF01',
+    )
+)
+def test_json_string_escapes_positive_match(lexer_json, text):
+    """Validate that escape sequences in JSON string values are parsed correctly."""
+
+    text = '"\\%s"' % text
+    tokens = list(lexer_json.get_tokens_unprocessed(text))
+    assert len(tokens) == 1
+    assert tokens[0][1] is String.Double
+    assert tokens[0][2] == text
+
+
+@pytest.mark.parametrize('text', ('+\n', '0\n', '""0\n', 'a\nb\n',))
+def test_json_round_trip_errors(lexer_json, text):
+    """Validate that past round-trip errors never crop up again."""
+
+    tokens = list(lexer_json.get_tokens_unprocessed(text))
+    assert ''.join(t[2] for t in tokens) == text
+
+
+def test_json_escape_backtracking(lexer_json):
+    """Confirm that there is no catastrophic backtracking in the lexer.
+
+    This no longer applies because the JSON lexer doesn't use regular expressions,
+    but the test is included to ensure no loss of functionality now or in the future.
+    """
+
+    fragment = r'{"\u00D0000\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\63CD'
+    start_time = time.time()
+    list(lexer_json.get_tokens(fragment))
+    assert time.time() - start_time < 1, 'The JSON lexer may have catastrophic backtracking'
+
+
+@pytest.mark.parametrize(
+    'keyword',
+    (
+        'base',
+        'container',
+        'context',
+        'direction',
+        'graph',
+        'id',
+        'import',
+        'included',
+        'index',
+        'json',
+        'language',
+        'list',
+        'nest',
+        'none',
+        'prefix',
+        'propagate',
+        'protected',
+        'reverse',
+        'set',
+        'type',
+        'value',
+        'version',
+        'vocab',
+    )
+)
+def test_json_ld_keywords_positive_match(lexer_json_ld, keyword):
+    """Validate that JSON-LD keywords are parsed correctly."""
+
+    tokens = list(lexer_json_ld.get_tokens_unprocessed('{"@%s": ""}' % keyword))
+    assert len(tokens) == 6
+    assert tokens[1][1] is Token.Name.Decorator
+    assert tokens[1][2] == '"@%s"' % keyword
+
+
+@pytest.mark.parametrize(
+    'keyword',
+    (
+        '@bogus',  # "@" does not guarantee a keyword match
+        '@bases',  # Begins with the keyword "@base"
+        'container',  # Matches "container" but has no leading "@"
+    )
+)
+def test_json_ld_keywords_negative_match(lexer_json_ld, keyword):
+    """Validate that JSON-LD non-keywords are parsed correctly."""
+
+    tokens = list(lexer_json_ld.get_tokens_unprocessed('{"%s": ""}' % keyword))
+    assert len(tokens) == 6
+    assert tokens[1][1] is Token.Name.Tag
+    assert tokens[1][2] == '"%s"' % keyword
+
+
 def test_basic_json(lexer_json):
     fragment = '{"foo": "bar", "foo2": [1, 2, 3], "\\u0123": "\\u0123"}\n'
     tokens = [
@@ -49,8 +226,7 @@ def test_basic_json(lexer_json):
         (Token.Punctuation, ','),
         (Token.Text, ' '),
         (Token.Literal.Number.Integer, '3'),
-        (Token.Punctuation, ']'),
-        (Token.Punctuation, ','),
+        (Token.Punctuation, '],'),
         (Token.Text, ' '),
         (Token.Name.Tag, '"\\u0123"'),
         (Token.Punctuation, ':'),
@@ -62,33 +238,6 @@
     assert list(lexer_json.get_tokens(fragment)) == tokens
 
 
-def test_json_escape_backtracking(lexer_json):
-    # This tests that an (invalid) sequence of escapes doesn't cause the lexer
-    # to fall into catastrophic backtracking. unfortunately, if it's broken
-    # this test will hang and that's how we know it's broken :(
-    fragment = r'{"\u00D0000\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\63CD'
-    tokens = [
-        (Token.Punctuation, '{'),
-        (Token.Error, r'"'),
-        (Token.Error, '\\'),
-        (Token.Error, r'u'),
-        (Token.Error, r'0'),
-        (Token.Error, r'0'),
-        (Token.Error, r'D'),
-        (Token.Error, r'0'),
-        (Token.Error, r'0'),
-        (Token.Error, r'0'),
-        (Token.Error, r'0')
-    ] + [(Token.Error, '\\')] * 178 + [
-        (Token.Error, r'6'),
-        (Token.Error, r'3'),
-        (Token.Error, r'C'),
-        (Token.Error, r'D'),
-        (Token.Text, '\n')]
-
-    assert list(lexer_json.get_tokens(fragment)) == tokens
-
-
 def test_basic_bare(lexer_bare):
     # This is the same as testBasic for JsonLexer above, except the
     # enclosing curly braces are removed.
@@ -117,31 +266,6 @@ def test_basic_bare(lexer_bare):
     assert list(lexer_bare.get_tokens(fragment)) == tokens
 
 
-def test_closing_curly(lexer_bare):
-    # This can be an Error token, but should not be a can't-pop-from-stack
-    # exception.
-    fragment = '}"a"\n'
-    tokens = [
-        (Token.Error, '}'),
-        (Token.Name.Tag, '"a"'),
-        (Token.Text, '\n'),
-    ]
-    assert list(lexer_bare.get_tokens(fragment)) == tokens
-
-
-def test_closing_curly_in_value(lexer_bare):
-    fragment = '"": ""}\n'
-    tokens = [
-        (Token.Name.Tag, '""'),
-        (Token.Punctuation, ':'),
-        (Token.Text, ' '),
-        (Token.Literal.String.Double, '""'),
-        (Token.Error, '}'),
-        (Token.Text, '\n'),
-    ]
-    assert list(lexer_bare.get_tokens(fragment)) == tokens
-
-
 def test_yaml(lexer_yaml):
     # Bug #1528: This previously parsed 'token # innocent' as a tag
     fragment = 'here: token # innocent: comment\n'
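
For reference, a quick usage sketch of the rewritten lexers, mirroring what the tests above assert (it assumes this branch of Pygments is installed, and the token positions assume these exact inputs): object keys come out as Name.Tag, string values as String.Double, and JSON-LD keywords as Name.Decorator. The key/value distinction is exactly what the queue inside get_tokens_unprocessed defers until a ":" is, or is not, seen after the closing quote.

    from pygments.lexers.data import JsonLexer, JsonLdLexer
    from pygments.token import Name, String

    # '{"a": "b"}' tokenizes as: '{', '"a"', ':', ' ', '"b"', '}'
    tokens = list(JsonLexer().get_tokens_unprocessed('{"a": "b"}'))
    assert tokens[1][1:] == (Name.Tag, '"a"')       # buffered, then re-tagged at the ':'
    assert tokens[4][1:] == (String.Double, '"b"')  # ordinary string value

    # JsonLdLexer post-processes the parent lexer's stream and promotes known keywords.
    ld_tokens = list(JsonLdLexer().get_tokens_unprocessed('{"@id": ""}'))
    assert ld_tokens[1][1:] == (Name.Decorator, '"@id"')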