diff --git a/pygments/lexers/html.py b/pygments/lexers/html.py index 277148082f..b711e2feb2 100644 --- a/pygments/lexers/html.py +++ b/pygments/lexers/html.py @@ -77,12 +77,24 @@ class HtmlLexer(RegexLexer): bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text, Punctuation), '#pop'), (r'.+?(?=<\s*/\s*script\s*>)', using(JavascriptLexer)), + # fallback cases for when there is no closing script tag + # first look for newline and then go back into root state + # if that fails just read the rest of the file + # this is similar to the error handling logic in lexer.py + ('.+?\n', using(JavascriptLexer), '#pop'), + (r'.+', using(JavascriptLexer), '#pop'), ], 'style-content': [ (r'(<)(\s*)(/)(\s*)(style)(\s*)(>)', bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text, Punctuation),'#pop'), (r'.+?(?=<\s*/\s*style\s*>)', using(CssLexer)), + # fallback cases for when there is no closing style tag + # first look for newline and then go back into root state + # if that fails just read the rest of the file + # this is similar to the error handling logic in lexer.py + ('.+?\n', using(CssLexer), '#pop'), + (r'.+', using(CssLexer), '#pop'), ], 'attr': [ ('".*?"', String, '#pop'), diff --git a/tests/test_html_lexer.py b/tests/test_html_lexer.py new file mode 100644 index 0000000000..fe3ddf593e --- /dev/null +++ b/tests/test_html_lexer.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +""" + HTML Lexer Tests + ~~~~~~~~~~~~~~~~ + + :copyright: Copyright 2020-2020 by the Pygments team, see AUTHORS. + :license: BSD, see LICENSE for details. 
+""" + +import time + +import pytest + +from pygments.lexers.html import HtmlLexer +from pygments.token import Token + +@pytest.fixture(scope='module') +def lexer_html(): + yield HtmlLexer() + +def test_simple_html(lexer_html): + """ extremely basic happy-path case + + more tests are in test_examplefiles """ + + fragment = "\n\t\n\t\thello world\n\t\n" + tokens = list(lexer_html.get_tokens(fragment)) + assert all(x[1] != Token.Error for x in tokens) + +def test_happy_javascript_fragment(lexer_html): + """ valid, even long Javascript fragments should still get parsed ok """ + + fragment = "" + start_time = time.time() + tokens = list(lexer_html.get_tokens(fragment)) + assert all(x[1] != Token.Error for x in tokens) + assert time.time() - start_time < 1, 'The HTML lexer might have an expensive happy-path script case' + +def test_happy_css_fragment(lexer_html): + """ valid, even long CSS fragments should still get parsed ok """ + + fragment = "" + start_time = time.time() + tokens = list(lexer_html.get_tokens(fragment)) + assert all(x[1] != Token.Error for x in tokens) + assert time.time() - start_time < 1, 'The HTML lexer might have an expensive happy-path style case' + +def test_long_unclosed_javascript_fragment(lexer_html): + """ unclosed, long Javascript fragments should parse quickly """ + + fragment = "