Speed up JSON and reduce HTML formatter consumption (#1569)

* Update the JSON-LD keyword list to match JSON-LD 1.1

  Changes in this patch:

  * Update the JSON-LD URL to HTTPS
  * Update the list of JSON-LD keywords
  * Make the JSON-LD parser less dependent on the JSON lexer implementation
  * Add unit tests for the JSON-LD lexer

* Add unit tests for the JSON parser

  This includes:

  * Testing valid literals
  * Testing valid string escapes
  * Testing that object keys are tokenized differently from string values

* Rewrite the JSON lexer

  Related to #1425

  Included in this change:

  * The JSON parser is rewritten
  * The JSON bare object parser no longer requires additional code
  * `get_tokens_unprocessed()` returns as much as it can to reduce yields
    (for example, side-by-side punctuation is not returned separately)
  * The unit tests were updated

* Add unit tests based on Hypothesis test results

* Reduce HTML formatter memory consumption by ~33% and speed it up

  Related to #1425

  Tested on a 118MB JSON file. Memory consumption tops out at ~3GB before
  this patch and drops to only ~2GB with this patch. These were the command
  lines used:

      python -m pygments -l json -f html -o .\new-code-classes.html .\jc-output.txt
      python -m pygments -l json -f html -O "noclasses" -o .\new-code-styles.html .\jc-output.txt

* Add an LRU cache to the HTML formatter's HTML-escaping and line-splitting

  For a 118MB JSON input file, this reduces memory consumption by ~500MB
  and reduces formatting time by ~15 seconds.

* JSON: Add a catastrophic backtracking test back to the test suite

* JSON: Update the comment that documents the internal queue

* JSON: Document in comments that ints/floats/constants are not validated
pygments · Oct 26, 2020 · 164dcb5 · 164dcb5
1 parent 9c1a078
commit 164dcb5
Show file tree

Hide file tree

Showing 3 changed files with 423 additions and 158 deletions.
diff --git a/pygments/formatters/html.py b/pygments/formatters/html.py
@@ -9,6 +9,7 @@
     :license: BSD, see LICENSE for details.
 """
 
+import functools
 import os
 import sys
 import os.path
@@ -414,6 +415,7 @@ def __init__(self, **options):
         self.tagurlformat = self._decodeifneeded(options.get('tagurlformat', ''))
         self.filename = self._decodeifneeded(options.get('filename', ''))
         self.wrapcode = get_bool_opt(options, 'wrapcode', False)
+        self.span_element_openers = {}
 
         if self.tagsfile:
             if not ctags:
@@ -455,13 +457,20 @@ def _get_css_class(self, ttype):
         return ''
 
     def _get_css_classes(self, ttype):
-        """Return the css classes of this token type prefixed with
-        the classprefix option."""
+        """Generate the opening <span> tag for a given token type using CSS classes."""
         cls = self._get_css_class(ttype)
         while ttype not in STANDARD_TYPES:
             ttype = ttype.parent
             cls = self._get_css_class(ttype) + ' ' + cls
-        return cls
+        return cls and '<span class="%s">' % cls or ''
+
+    def _get_css_inline_styles(self, ttype):
+        """Generate the opening <span> tag for a given token type using inline CSS styles."""
+        cclass = self.ttype2class.get(ttype)
+        while cclass is None:
+            ttype = ttype.parent
+            cclass = self.ttype2class.get(ttype)
+        return cclass and '<span style="%s">' % self.class2style[cclass][0] or ''
 
     def _create_stylesheet(self):
         t2c = self.ttype2class = {Token: ''}
@@ -786,33 +795,32 @@ def _wrap_code(self, inner):
         yield from inner
         yield 0, '</code>'
 
+    @functools.lru_cache(maxsize=100)
+    def _translate_parts(self, value):
+        """HTML-escape a value and split it by newlines."""
+        return value.translate(_escape_html_table).split('\n')
+
     def _format_lines(self, tokensource):
         """
         Just format the tokens, without any wrapping tags.
         Yield individual lines.
         """
         nocls = self.noclasses
         lsep = self.lineseparator
-        # for <span style=""> lookup only
-        getcls = self.ttype2class.get
-        c2s = self.class2style
-        escape_table = _escape_html_table
         tagsfile = self.tagsfile
 
         lspan = ''
         line = []
         for ttype, value in tokensource:
-            if nocls:
-                cclass = getcls(ttype)
-                while cclass is None:
-                    ttype = ttype.parent
-                    cclass = getcls(ttype)
-                cspan = cclass and '<span style="%s">' % c2s[cclass][0] or ''
-            else:
-                cls = self._get_css_classes(ttype)
-                cspan = cls and '<span class="%s">' % cls or ''
+            try:
+                cspan = self.span_element_openers[ttype]
+            except KeyError:
+                if nocls:
+                    cspan = self.span_element_openers[ttype] = self._get_css_inline_styles(ttype)
+                else:
+                    cspan = self.span_element_openers[ttype] = self._get_css_classes(ttype)
 
-            parts = value.translate(escape_table).split('\n')
+            parts = self._translate_parts(value)
 
             if tagsfile and ttype in Token.Name:
                 filename, linenumber = self._lookup_ctag(value)