Unclosed script/style tag handling Fixes #1614
Explicitly handle unclosed <script> and <style> tags, which previously
resulted in O(n^2) work: every character up to the end of the line or the
end of the file (whichever comes first) was lexed as a separate Error token.

Now, when there is no closing script/style tag, we try lexing the rest of
the line as JavaScript/CSS. If there is a newline, we recover into the root
state on the next line; otherwise we just keep lexing as JavaScript/CSS.
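
For example (a minimal sketch that mirrors the regression tests added
below), an unclosed <script> followed by a newline now resumes HTML lexing
on the next line, with no Error tokens:

    from pygments.lexers.html import HtmlLexer
    from pygments.token import Token

    tokens = list(HtmlLexer().get_tokens('<script>alert("hi");\n<div>hi</div>'))
    # the <div> after the newline is lexed as HTML again
    assert (Token.Name.Tag, 'div') in tokens
    # and no character degenerates into an Error token
    assert all(ttype is not Token.Error for ttype, value in tokens)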

This is similar to how the error handling in lexer.py works, except that we
get JavaScript or CSS tokens instead of Error tokens, and we reach the end
of the line much faster since we no longer apply an O(n) regex at every
character in the line.
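
Concretely (a rough timing sketch; the new tests assert a one-second bound
on the same inputs):

    import time
    from pygments.lexers.html import HtmlLexer

    # an unclosed <script> block; the old logic emitted one Error token per
    # character here, doing O(n^2) work overall
    fragment = '<script type="text/javascript">' + 'alert("hi");' * 2000

    start = time.time()
    tokens = list(HtmlLexer().get_tokens(fragment))
    print(len(tokens), 'tokens in', round(time.time() - start, 3), 'seconds')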

I added a new test suite for the HTML lexer (there wasn't one beyond the
coverage in test_examplefiles.py), including a trivial happy-path case and
several cases around <script> and <style> fragments, among them regression
coverage that fails on the old logic.
gerner committed Dec 3, 2020
1 parent fc11c62 commit 4b2326f
Showing 2 changed files with 141 additions and 0 deletions.
12 changes: 12 additions & 0 deletions pygments/lexers/html.py
@@ -77,12 +77,24 @@ class HtmlLexer(RegexLexer):
             bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text,
                      Punctuation), '#pop'),
            (r'.+?(?=<\s*/\s*script\s*>)', using(JavascriptLexer)),
            # fallback cases for when there is no closing script tag
            # first look for newline and then go back into root state
            # if that fails just read the rest of the file
            # this is similar to the error handling logic in lexer.py
            (r'.+?\n', using(JavascriptLexer), '#pop'),
            (r'.+', using(JavascriptLexer), '#pop'),
        ],
        'style-content': [
            (r'(<)(\s*)(/)(\s*)(style)(\s*)(>)',
             bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text,
                      Punctuation), '#pop'),
            (r'.+?(?=<\s*/\s*style\s*>)', using(CssLexer)),
            # fallback cases for when there is no closing style tag
            # first look for newline and then go back into root state
            # if that fails just read the rest of the file
            # this is similar to the error handling logic in lexer.py
            (r'.+?\n', using(CssLexer), '#pop'),
            (r'.+', using(CssLexer), '#pop'),
        ],
        'attr': [
            ('".*?"', String, '#pop'),
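
The rule order above is what makes the fallbacks safe: the lookahead rule
only matches when a closing tag actually exists somewhere ahead, so the two
new fallback rules never shadow the happy path. A quick illustration with
plain re (for exposition only; HtmlLexer compiles its rules with
re.IGNORECASE | re.DOTALL):

    import re

    flags = re.IGNORECASE | re.DOTALL

    # with a closing tag present, the lookahead rule consumes the body up to it
    print(re.match(r'.+?(?=<\s*/\s*script\s*>)', 'alert("hi");</script>', flags).group())
    # -> 'alert("hi");'

    # with no closing tag the lookahead never succeeds ...
    print(re.match(r'.+?(?=<\s*/\s*script\s*>)', 'alert("hi");\nmore()', flags))
    # -> None
    # ... so the newline fallback lexes one line as JavaScript and pops to root
    print(re.match(r'.+?\n', 'alert("hi");\nmore()', flags).group())
    # -> 'alert("hi");\n'
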
129 changes: 129 additions & 0 deletions tests/test_html_lexer.py
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
"""
    HTML Lexer Tests
    ~~~~~~~~~~~~~~~~

    :copyright: Copyright 2020-2020 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import time

import pytest

from pygments.lexers.html import HtmlLexer
from pygments.token import Token

@pytest.fixture(scope='module')
def lexer_html():
    yield HtmlLexer()

def test_simple_html(lexer_html):
    """ extremely basic happy-path case
    more tests are in test_examplefiles """

    fragment = "<html>\n\t<body>\n\t\thello world\n\t</body>\n</html>"
    tokens = list(lexer_html.get_tokens(fragment))
    assert all(x[0] != Token.Error for x in tokens)

def test_happy_javascript_fragment(lexer_html):
    """ valid, even long Javascript fragments should still get parsed ok """

    fragment = "<script type=\"text/javascript\">"+"alert(\"hi\");"*2000+"</script>"
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    assert all(x[0] != Token.Error for x in tokens)
    assert time.time() - start_time < 1, 'The HTML lexer might have an expensive happy-path script case'

def test_happy_css_fragment(lexer_html):
    """ valid, even long CSS fragments should still get parsed ok """

    fragment = "<style>"+".ui-helper-hidden{display:none}"*2000+"</style>"
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    assert all(x[0] != Token.Error for x in tokens)
    assert time.time() - start_time < 1, 'The HTML lexer might have an expensive happy-path style case'

def test_long_unclosed_javascript_fragment(lexer_html):
    """ unclosed, long Javascript fragments should parse quickly """

    reps = 2000
    fragment = "<script type=\"text/javascript\">"+"alert(\"hi\");"*reps
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    assert time.time() - start_time < 1, 'The HTML lexer might have an expensive error script case'
    tokens_intro = [
        (Token.Punctuation, '<'),
        (Token.Name.Tag, 'script'),
        (Token.Text, ' '),
        (Token.Name.Attribute, 'type'),
        (Token.Operator, '='),
        (Token.Literal.String, '"text/javascript"'),
        (Token.Punctuation, '>'),
    ]
    tokens_body = [
        (Token.Name.Other, 'alert'),
        (Token.Punctuation, '('),
        (Token.Literal.String.Double, '"hi"'),
        (Token.Punctuation, ')'),
        (Token.Punctuation, ';'),
    ]

    # make sure we get the right opening tokens
    assert tokens[:len(tokens_intro)] == tokens_intro
    # and make sure we get the right body tokens even though the script is
    # unclosed
    assert tokens[len(tokens_intro):-1] == tokens_body * reps
    # and of course, the newline we get for free from get_tokens
    assert tokens[-1] == (Token.Text, "\n")

def test_long_unclosed_css_fragment(lexer_html):
    """ unclosed, long CSS fragments should parse quickly """

    reps = 2000
    fragment = "<style>"+".ui-helper-hidden{display:none}"*reps
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    assert time.time() - start_time < 1, 'The HTML lexer might have an expensive error style case'

    tokens_intro = [
        (Token.Punctuation, '<'),
        (Token.Name.Tag, 'style'),
        (Token.Punctuation, '>'),
    ]
    tokens_body = [
        (Token.Punctuation, '.'),
        (Token.Name.Class, 'ui-helper-hidden'),
        (Token.Punctuation, '{'),
        (Token.Keyword, 'display'),
        (Token.Punctuation, ':'),
        (Token.Keyword.Constant, 'none'),
        (Token.Punctuation, '}'),
    ]

    # make sure we get the right opening tokens
    assert tokens[:len(tokens_intro)] == tokens_intro
    # and make sure we get the right body tokens even though the style block is
    # unclosed
    assert tokens[len(tokens_intro):-1] == tokens_body * reps
    # and of course, the newline we get for free from get_tokens
    assert tokens[-1] == (Token.Text, "\n")

def test_unclosed_fragment_with_newline_recovery(lexer_html):
    """ unclosed Javascript fragments should recover on the next line """

    fragment = "<script type=\"text/javascript\">"+"alert(\"hi\");"*20+"\n<div>hi</div>"
    tokens = list(lexer_html.get_tokens(fragment))
    recovery_tokens = [
        (Token.Punctuation, '<'),
        (Token.Name.Tag, 'div'),
        (Token.Punctuation, '>'),
        (Token.Text, 'hi'),
        (Token.Punctuation, '<'),
        (Token.Punctuation, '/'),
        (Token.Name.Tag, 'div'),
        (Token.Punctuation, '>'),
        (Token.Text, '\n')]
    assert tokens[-1*len(recovery_tokens):] == recovery_tokens
