Unclosed script/style tag handling Fixes #1614
Explicitly handle unclosed <script> and <style> tags, which previously
resulted in O(n^2) work: every character up to the end of the line or the
end of the file (whichever comes first) was lexed as a separate Error token.

Now, when there is no closing script/style tag, we try lexing the rest of
the line as JavaScript/CSS. If there is a newline, we recover into the root
state on the next line; otherwise we just keep lexing as JavaScript/CSS.
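
For example (a minimal sketch that mirrors the regression tests added
below), an unclosed <script> followed by a newline now resumes HTML lexing
on the next line, with no Error tokens:

    from pygments.lexers.html import HtmlLexer
    from pygments.token import Token

    tokens = list(HtmlLexer().get_tokens('<script>alert("hi");\n<div>hi</div>'))
    # the <div> after the newline is lexed as HTML again
    assert (Token.Name.Tag, 'div') in tokens
    # and no character degenerates into an Error token
    assert all(ttype is not Token.Error for ttype, value in tokens)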

This is similar to how the error handling in lexer.py works, except that we
get JavaScript or CSS tokens instead of Error tokens, and we reach the end
of the line much faster since we no longer apply an O(n) regex at every
character in the line.
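
Concretely (a rough timing sketch; the new tests assert a one-second bound
on the same inputs):

    import time
    from pygments.lexers.html import HtmlLexer

    # an unclosed <script> block; the old logic emitted one Error token per
    # character here, doing O(n^2) work overall
    fragment = '<script type="text/javascript">' + 'alert("hi");' * 2000

    start = time.time()
    tokens = list(HtmlLexer().get_tokens(fragment))
    print(len(tokens), 'tokens in', round(time.time() - start, 3), 'seconds')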

I added a new test suite for the HTML lexer (there wasn't one beyond the
coverage in test_examplefiles.py), including a trivial happy-path case and
several cases around <script> and <style> fragments, among them regression
coverage that fails on the old logic.
gerner committed Dec 3, 2020
1 parent fc11c62 commit 4b2326f
Showing 2 changed files with 141 additions and 0 deletions.
12 changes: 12 additions & 0 deletions pygments/lexers/html.py
@@ -77,12 +77,24 @@ class HtmlLexer(RegexLexer):
             bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text,
                      Punctuation), '#pop'),
            (r'.+?(?=<\s*/\s*script\s*>)', using(JavascriptLexer)),
            # fallback cases for when there is no closing script tag
            # first look for newline and then go back into root state
            # if that fails just read the rest of the file
            # this is similar to the error handling logic in lexer.py
            (r'.+?\n', using(JavascriptLexer), '#pop'),
            (r'.+', using(JavascriptLexer), '#pop'),
        ],
        'style-content': [
            (r'(<)(\s*)(/)(\s*)(style)(\s*)(>)',
             bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text,
                      Punctuation), '#pop'),
            (r'.+?(?=<\s*/\s*style\s*>)', using(CssLexer)),
            # fallback cases for when there is no closing style tag
            # first look for newline and then go back into root state
            # if that fails just read the rest of the file
            # this is similar to the error handling logic in lexer.py
            (r'.+?\n', using(CssLexer), '#pop'),
            (r'.+', using(CssLexer), '#pop'),
        ],
        'attr': [
            ('".*?"', String, '#pop'),
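
The rule order above is what makes the fallbacks safe: the lookahead rule
only matches when a closing tag actually exists somewhere ahead, so the two
new fallback rules never shadow the happy path. A quick illustration with
plain re (for exposition only; HtmlLexer compiles its rules with
re.IGNORECASE | re.DOTALL):

    import re

    flags = re.IGNORECASE | re.DOTALL

    # with a closing tag present, the lookahead rule consumes the body up to it
    print(re.match(r'.+?(?=<\s*/\s*script\s*>)', 'alert("hi");</script>', flags).group())
    # -> 'alert("hi");'

    # with no closing tag the lookahead never succeeds ...
    print(re.match(r'.+?(?=<\s*/\s*script\s*>)', 'alert("hi");\nmore()', flags))
    # -> None
    # ... so the newline fallback lexes one line as JavaScript and pops to root
    print(re.match(r'.+?\n', 'alert("hi");\nmore()', flags).group())
    # -> 'alert("hi");\n'
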
129 changes: 129 additions & 0 deletions tests/test_html_lexer.py
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
"""
    HTML Lexer Tests
    ~~~~~~~~~~~~~~~~

    :copyright: Copyright 2020-2020 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import time

import pytest

from pygments.lexers.html import HtmlLexer
from pygments.token import Token

@pytest.fixture(scope='module')
def lexer_html():
    yield HtmlLexer()

def test_simple_html(lexer_html):
    """ extremely basic happy-path case
    more tests are in test_examplefiles """

    fragment = "<html>\n\t<body>\n\t\thello world\n\t</body>\n</html>"
    tokens = list(lexer_html.get_tokens(fragment))
    assert all(x[0] != Token.Error for x in tokens)

def test_happy_javascript_fragment(lexer_html):
    """ valid, even long Javascript fragments should still get parsed ok """

    fragment = "<script type=\"text/javascript\">"+"alert(\"hi\");"*2000+"</script>"
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    assert all(x[0] != Token.Error for x in tokens)
    assert time.time() - start_time < 1, 'The HTML lexer might have an expensive happy-path script case'

def test_happy_css_fragment(lexer_html):
    """ valid, even long CSS fragments should still get parsed ok """

    fragment = "<style>"+".ui-helper-hidden{display:none}"*2000+"</style>"
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    assert all(x[0] != Token.Error for x in tokens)
    assert time.time() - start_time < 1, 'The HTML lexer might have an expensive happy-path style case'

def test_long_unclosed_javascript_fragment(lexer_html):
    """ unclosed, long Javascript fragments should parse quickly """

    reps = 2000
    fragment = "<script type=\"text/javascript\">"+"alert(\"hi\");"*reps
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    assert time.time() - start_time < 1, 'The HTML lexer might have an expensive error script case'
    tokens_intro = [
        (Token.Punctuation, '<'),
        (Token.Name.Tag, 'script'),
        (Token.Text, ' '),
        (Token.Name.Attribute, 'type'),
        (Token.Operator, '='),
        (Token.Literal.String, '"text/javascript"'),
        (Token.Punctuation, '>'),
    ]
    tokens_body = [
        (Token.Name.Other, 'alert'),
        (Token.Punctuation, '('),
        (Token.Literal.String.Double, '"hi"'),
        (Token.Punctuation, ')'),
        (Token.Punctuation, ';'),
    ]

    # make sure we get the right opening tokens
    assert tokens[:len(tokens_intro)] == tokens_intro
    # and make sure we get the right body tokens even though the script is
    # unclosed
    assert tokens[len(tokens_intro):-1] == tokens_body * reps
    # and of course, the newline we get for free from get_tokens
    assert tokens[-1] == (Token.Text, "\n")

def test_long_unclosed_css_fragment(lexer_html):
    """ unclosed, long CSS fragments should parse quickly """

    reps = 2000
    fragment = "<style>"+".ui-helper-hidden{display:none}"*reps
    start_time = time.time()
    tokens = list(lexer_html.get_tokens(fragment))
    assert time.time() - start_time < 1, 'The HTML lexer might have an expensive error style case'

    tokens_intro = [
        (Token.Punctuation, '<'),
        (Token.Name.Tag, 'style'),
        (Token.Punctuation, '>'),
    ]
    tokens_body = [
        (Token.Punctuation, '.'),
        (Token.Name.Class, 'ui-helper-hidden'),
        (Token.Punctuation, '{'),
        (Token.Keyword, 'display'),
        (Token.Punctuation, ':'),
        (Token.Keyword.Constant, 'none'),
        (Token.Punctuation, '}'),
    ]

    # make sure we get the right opening tokens
    assert tokens[:len(tokens_intro)] == tokens_intro
    # and make sure we get the right body tokens even though the style block is
    # unclosed
    assert tokens[len(tokens_intro):-1] == tokens_body * reps
    # and of course, the newline we get for free from get_tokens
    assert tokens[-1] == (Token.Text, "\n")

def test_unclosed_fragment_with_newline_recovery(lexer_html):
    """ unclosed Javascript fragments should recover on the next line """

    fragment = "<script type=\"text/javascript\">"+"alert(\"hi\");"*20+"\n<div>hi</div>"
    tokens = list(lexer_html.get_tokens(fragment))
    recovery_tokens = [
        (Token.Punctuation, '<'),
        (Token.Name.Tag, 'div'),
        (Token.Punctuation, '>'),
        (Token.Text, 'hi'),
        (Token.Punctuation, '<'),
        (Token.Punctuation, '/'),
        (Token.Name.Tag, 'div'),
        (Token.Punctuation, '>'),
        (Token.Text, '\n')]
    assert tokens[-1*len(recovery_tokens):] == recovery_tokens
