Skip to content

Commit

Permalink
Speed up JSON lexing and reduce HTML formatter memory consumption (#1569)
Browse files Browse the repository at this point in the history
* Update the JSON-LD keyword list to match JSON-LD 1.1

Changes in this patch:

* Update the JSON-LD URL to HTTPS
* Update the list of JSON-LD keywords
* Make the JSON-LD lexer less dependent on the JSON lexer implementation
* Add unit tests for the JSON-LD lexer

* Add unit tests for the JSON lexer

This includes:

* Testing valid literals
* Testing valid string escapes
* Testing that object keys are tokenized differently from string values

* Rewrite the JSON lexer

Related to #1425

Included in this change:

* The JSON lexer is rewritten
* The JSON bare object lexer no longer requires additional code
* `get_tokens_unprocessed()` returns as much as it can to reduce yields
  (for example, side-by-side punctuation is not returned separately)
* The unit tests were updated
* Add unit tests based on Hypothesis test results

* Reduce HTML formatter memory consumption by ~33% and speed it up

Related to #1425

Tested on a 118MB JSON file. Memory consumption tops out at ~3GB before
this patch and drops to only ~2GB with this patch. These were the command
lines used:

python -m pygments -l json -f html -o .\new-code-classes.html .\jc-output.txt
python -m pygments -l json -f html -O "noclasses" -o .\new-code-styles.html .\jc-output.txt

* Add an LRU cache to the HTML formatter's HTML-escaping and line-splitting

For a 118MB JSON input file, this reduces memory consumption by ~500MB
and reduces formatting time by ~15 seconds.

* JSON: Add a catastrophic backtracking test back to the test suite

* JSON: Update the comment that documents the internal queue

* JSON: Document in comments that ints/floats/constants are not validated
  • Loading branch information
kurtmckee committed Oct 26, 2020
1 parent 9c1a078 commit 164dcb5
Show file tree
Hide file tree
Showing 3 changed files with 423 additions and 158 deletions.
42 changes: 25 additions & 17 deletions pygments/formatters/html.py
Expand Up @@ -9,6 +9,7 @@
:license: BSD, see LICENSE for details.
"""

import functools
import os
import sys
import os.path
Expand Down Expand Up @@ -414,6 +415,7 @@ def __init__(self, **options):
self.tagurlformat = self._decodeifneeded(options.get('tagurlformat', ''))
self.filename = self._decodeifneeded(options.get('filename', ''))
self.wrapcode = get_bool_opt(options, 'wrapcode', False)
self.span_element_openers = {}

if self.tagsfile:
if not ctags:
Expand Down Expand Up @@ -455,13 +457,20 @@ def _get_css_class(self, ttype):
return ''

def _get_css_classes(self, ttype):
    """Generate the opening <span> tag for a given token type using CSS classes.

    The class attribute is built from ``self._get_css_class(ttype)``; while
    ``ttype`` is not one of ``STANDARD_TYPES`` the class of each successive
    parent is prepended, separated by a space.

    Returns the ``<span class="...">`` opener, or ``''`` when the resulting
    class string is empty (no wrapping span is needed).
    """
    cls = self._get_css_class(ttype)
    # Climb towards the nearest standard token type, accumulating the
    # classes of every ancestor visited on the way.
    while ttype not in STANDARD_TYPES:
        ttype = ttype.parent
        cls = self._get_css_class(ttype) + ' ' + cls
    return '<span class="%s">' % cls if cls else ''

def _get_css_inline_styles(self, ttype):
"""Generate the opening <span> tag for a given token type using inline CSS styles."""
cclass = self.ttype2class.get(ttype)
while cclass is None:
ttype = ttype.parent
cclass = self.ttype2class.get(ttype)
return cclass and '<span style="%s">' % self.class2style[cclass][0] or ''

def _create_stylesheet(self):
t2c = self.ttype2class = {Token: ''}
Expand Down Expand Up @@ -786,33 +795,32 @@ def _wrap_code(self, inner):
yield from inner
yield 0, '</code>'

@staticmethod
@functools.lru_cache(maxsize=100)
def _escape_and_split(value):
    """HTML-escape ``value`` and split it on newlines (cached by value only)."""
    return value.translate(_escape_html_table).split('\n')

def _translate_parts(self, value):
    """HTML-escape a value and split it by newlines.

    The work is delegated to a static, LRU-cached helper so that the cache
    key is the text alone. Decorating this instance method directly would
    make ``self`` part of every cache key and keep formatter instances
    referenced from the class-level cache.

    The returned list comes from the cache and may be handed out again for
    the same ``value``; treat it as read-only.
    """
    return self._escape_and_split(value)

def _format_lines(self, tokensource):
"""
Just format the tokens, without any wrapping tags.
Yield individual lines.
"""
nocls = self.noclasses
lsep = self.lineseparator
# for <span style=""> lookup only
getcls = self.ttype2class.get
c2s = self.class2style
escape_table = _escape_html_table
tagsfile = self.tagsfile

lspan = ''
line = []
for ttype, value in tokensource:
if nocls:
cclass = getcls(ttype)
while cclass is None:
ttype = ttype.parent
cclass = getcls(ttype)
cspan = cclass and '<span style="%s">' % c2s[cclass][0] or ''
else:
cls = self._get_css_classes(ttype)
cspan = cls and '<span class="%s">' % cls or ''
try:
cspan = self.span_element_openers[ttype]
except KeyError:
if nocls:
cspan = self.span_element_openers[ttype] = self._get_css_inline_styles(ttype)
else:
cspan = self.span_element_openers[ttype] = self._get_css_classes(ttype)

parts = value.translate(escape_table).split('\n')
parts = self._translate_parts(value)

if tagsfile and ttype in Token.Name:
filename, linenumber = self._lookup_ctag(value)
Expand Down

0 comments on commit 164dcb5

Please sign in to comment.