From 3e1b79c82d2df318f63f24984d875fd2a3400808 Mon Sep 17 00:00:00 2001
From: Ken
Date: Wed, 14 Oct 2020 13:18:39 +0800
Subject: [PATCH] TNTLexer: Don't crash on unexpected EOL. (#1570)

* TNTLexer: Don't crash on unexpected EOL

  Catch IndexErrors in each line and mark the rest of the line as an
  error, leaving whatever tokens were already found.

* Write and pass tests for Typographic Number Theory

  pygments/lexers/tnt.py:
  * Fix indentation on the token import
  * Fix: TNTLexer.cur is a class-level reference if it is not
    reinitialized in get_tokens_unprocessed, so initialize it in
    __init__ too
  * Fix: fantasy markers are not allowed as components of other
    formulas, so give them a dedicated check in the body of
    get_tokens_unprocessed, which disables the normal formula handling
    when a marker is present
  * Clarify the TNTLexer.lineno docstring
  * Attempt to discard tokens emitted before an IndexError

  tests/test_tnt.py:
  * Test every method, and test both positive and negative matches for
    most of them
  * The lexer fixture is function-scoped, to reinitialize cur cleanly
    for each test
  * Don't test the actual get_tokens_unprocessed method (besides for
    fantasy markers), because whole-text testing is left to the
    examplefiles

  AUTHORS:
  * Add myself to the credits :)

* Add a TNT test just to make sure nothing crashes
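
For context, a minimal reproduction of the failure mode this patch fixes; the sample line is my own construction, not from the patch. A line referral opens with `(` but the input ends before any digits appear, so `lineno`'s scan (`while text[end] not in self.NUMBERS`) runs past the end of the string:

```python
from pygments.lexers.tnt import TNTLexer

# '(' opens a line referral that is cut off by the end of input.
# Before this patch: get_tokens_unprocessed raised IndexError.
# After this patch: the tokens found so far are kept and the dangling
# remainder of the line is emitted as Error tokens.
tokens = list(TNTLexer().get_tokens('0 a=b premise (\n'))
print(tokens)  # lexes without raising
```
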
---
 AUTHORS                |   1 +
 pygments/lexers/tnt.py | 121 +++++++++++++----------
 tests/test_tnt.py      | 204 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 272 insertions(+), 54 deletions(-)
 create mode 100644 tests/test_tnt.py

diff --git a/AUTHORS b/AUTHORS
index 5058c612d6..f209a8acba 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -90,6 +90,7 @@ Other contributors, listed alphabetically, are:
 * Justin Hendrick -- ParaSail lexer
 * Jordi GutiĆ©rrez Hermoso -- Octave lexer
 * David Hess, Fish Software, Inc. -- Objective-J lexer
+* Ken Hilton -- Typographic Number Theory and Arrow lexers
 * Varun Hiremath -- Debian control lexer
 * Rob Hoelz -- Perl 6 lexer
 * Doug Hogan -- Mscgen lexer
diff --git a/pygments/lexers/tnt.py b/pygments/lexers/tnt.py
index f62f3ab9f0..1d966ac872 100644
--- a/pygments/lexers/tnt.py
+++ b/pygments/lexers/tnt.py
@@ -13,7 +13,7 @@
 
 from pygments.lexer import Lexer
 from pygments.token import Text, Comment, Operator, Keyword, Name, Number, \
-     Punctuation, Error
+    Punctuation, Error
 
 __all__ = ['TNTLexer']
 
@@ -55,6 +55,10 @@ class TNTLexer(Lexer):
     LINENOS = re.compile(r'(?:[0-9]+)(?:(?:, ?|,? and )(?:[0-9]+))*')
     COMMENT = re.compile(r'\[[^\n\]]+\]')
 
+    def __init__(self, *args, **kwargs):
+        Lexer.__init__(self, *args, **kwargs)
+        self.cur = []
+
     def whitespace(self, start, text, required=False):
         """Tokenize whitespace."""
         end = start
@@ -104,9 +108,6 @@ def term(self, start, text):
 
     def formula(self, start, text):
         """Tokenize a formula."""
-        if text[start] in '[]':  # fantasy push or pop
-            self.cur.append((start, Keyword, text[start]))
-            return start+1
         if text[start] in self.NEGATORS:  # ~<...>
             end = start+1
             while text[end] in self.NEGATORS:
@@ -154,7 +155,7 @@ def rule(self, start, text):
         return match.end()
 
     def lineno(self, start, text):
-        """Tokenize a line marker."""
+        """Tokenize a line referral."""
         end = start
         while text[end] not in self.NUMBERS:
             end += 1
@@ -186,65 +187,77 @@ def get_tokens_unprocessed(self, text):
         self.cur = []
         start = end = self.whitespace(0, text)
         while start <= end < len(text):
-            # try line number
-            while text[end] in self.NUMBERS:
-                end += 1
-            if end != start:  # actual number present
-                self.cur.append((start, Number.Integer, text[start:end]))
-            # whitespace is required after a line number
-            orig = len(self.cur)
-            try:
-                start = end = self.whitespace(end, text, True)
-            except AssertionError:
-                del self.cur[orig:]
-                start = end = self.error_till_line_end(end, text)
-                continue
-            # at this point it could be a comment
-            match = self.COMMENT.match(text, start)
-            if match is not None:
-                self.cur.append((start, Comment, text[start:match.end()]))
-                start = end = match.end()
-                # anything after the closing bracket is invalid
-                start = end = self.error_till_line_end(start, text)
-                # do not attempt to process the rest
-                continue
-            del match
-            # one formula, possibly containing subformulae
-            orig = len(self.cur)
-            try:
-                start = end = self.formula(start, text)
-            except AssertionError:  # not well-formed
-                del self.cur[orig:]
-                while text[end] not in self.WHITESPACE:
-                    end += 1
-                self.cur.append((start, Error, text[start:end]))
-                start = end
-            # skip whitespace after formula
-            orig = len(self.cur)
-            try:
-                start = end = self.whitespace(end, text, True)
-            except AssertionError:
-                del self.cur[orig:]
-                start = end = self.error_till_line_end(start, text)
-                continue
-            # rule proving this formula a theorem
-            orig = len(self.cur)
-            try:
-                start = end = self.rule(start, text)
-            except AssertionError:
-                del self.cur[orig:]
-                start = end = self.error_till_line_end(start, text)
-                continue
-            # skip whitespace after rule
-            start = end = self.whitespace(end, text)
-            # line marker
-            if text[start] == '(':
-                orig = len(self.cur)
-                try:
-                    start = end = self.lineno(start, text)
-                except AssertionError:
-                    del self.cur[orig:]
-                    start = end = self.error_till_line_end(start, text)
-                    continue
-                start = end = self.whitespace(start, text)
+            try:
+                # try line number
+                while text[end] in self.NUMBERS:
+                    end += 1
+                if end != start:  # actual number present
+                    self.cur.append((start, Number.Integer, text[start:end]))
+                # whitespace is required after a line number
+                orig = len(self.cur)
+                try:
+                    start = end = self.whitespace(end, text, True)
+                except AssertionError:
+                    del self.cur[orig:]
+                    start = end = self.error_till_line_end(end, text)
+                    continue
+                # at this point it could be a comment
+                match = self.COMMENT.match(text, start)
+                if match is not None:
+                    self.cur.append((start, Comment, text[start:match.end()]))
+                    start = end = match.end()
+                    # anything after the closing bracket is invalid
+                    start = end = self.error_till_line_end(start, text)
+                    # do not attempt to process the rest
+                    continue
+                del match
+                if text[start] in '[]':  # fantasy push or pop
+                    self.cur.append((start, Keyword, text[start]))
+                    start += 1
+                    end += 1
+                else:
+                    # one formula, possibly containing subformulae
+                    orig = len(self.cur)
+                    try:
+                        start = end = self.formula(start, text)
+                    except AssertionError:  # not well-formed
+                        del self.cur[orig:]
+                        while text[end] not in self.WHITESPACE:
+                            end += 1
+                        self.cur.append((start, Error, text[start:end]))
+                        start = end
+                # skip whitespace after formula
+                orig = len(self.cur)
+                try:
+                    start = end = self.whitespace(end, text, True)
+                except AssertionError:
+                    del self.cur[orig:]
+                    start = end = self.error_till_line_end(start, text)
+                    continue
+                # rule proving this formula a theorem
+                orig = len(self.cur)
+                try:
+                    start = end = self.rule(start, text)
+                except AssertionError:
+                    del self.cur[orig:]
+                    start = end = self.error_till_line_end(start, text)
+                    continue
+                # skip whitespace after rule
+                start = end = self.whitespace(end, text)
+                # line marker
+                if text[start] == '(':
+                    orig = len(self.cur)
+                    try:
+                        start = end = self.lineno(start, text)
+                    except AssertionError:
+                        del self.cur[orig:]
+                        start = end = self.error_till_line_end(start, text)
+                        continue
+                    start = end = self.whitespace(start, text)
+            except IndexError:
+                try:
+                    del self.cur[orig:]
+                except NameError:
+                    pass  # if orig was never defined, fine
+                self.error_till_line_end(start, text)
         return self.cur
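
A note on the recovery pattern used throughout the loop above: before each speculative parse the lexer records `orig = len(self.cur)`, and on failure it slice-deletes back to that checkpoint so no half-emitted tokens survive. A standalone sketch of the idiom (the names here are illustrative, not from the lexer):

```python
# Checkpoint-and-rollback over a growing token list, as in
# get_tokens_unprocessed above: remember how many tokens existed
# before a speculative parse, and discard any partial output if
# the parse fails midway.
tokens = []

def parse_pair(text):
    tokens.append(('left', text[0]))
    tokens.append(('right', text[1]))   # IndexError when text is too short

orig = len(tokens)                      # like `orig = len(self.cur)`
try:
    parse_pair('<')                     # emits one token, then blows up
except IndexError:
    del tokens[orig:]                   # like `del self.cur[orig:]`

assert tokens == []                     # the partial emission is gone
```
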
diff --git a/tests/test_tnt.py b/tests/test_tnt.py
new file mode 100644
index 0000000000..e14834acb1
--- /dev/null
+++ b/tests/test_tnt.py
@@ -0,0 +1,204 @@
+# -*- coding: utf-8 -*-
+"""
+    Typographic Number Theory tests
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    :copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import pytest
+
+from pygments.lexers.tnt import TNTLexer
+from pygments.token import Text, Comment, Operator, Keyword, Name, Number, \
+    Punctuation, Error
+
+
+@pytest.fixture(autouse=True)
+def lexer():
+    yield TNTLexer()
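
An aside on the fixture above: it is deliberately function-scoped (and autouse) because, before this patch added `self.cur = []` to `__init__`, `cur` was a class-level list shared by every instance, so tokens could leak between tests. A minimal illustration of that aliasing hazard, using generic classes rather than the lexer itself:

```python
class Shared:
    cur = []                 # class attribute: one list for all instances

a, b = Shared(), Shared()
a.cur.append('token')
assert b.cur == ['token']    # b sees a's state: this is the leak

class Fixed:
    def __init__(self):
        self.cur = []        # per-instance list, as the patched __init__ does

c, d = Fixed(), Fixed()
c.cur.append('token')
assert d.cur == []           # instances are now independent
```
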
+
+
+# whitespace
+
+@pytest.mark.parametrize('text', (' a', ' \t0', '\n\n 3'))
+def test_whitespace_positive_matches(lexer, text):
+    """Test fragments that should be tokenized as whitespace text."""
+    assert lexer.whitespace(0, text) == len(text) - 1
+    assert lexer.whitespace(0, text, True) == len(text) - 1
+    assert lexer.cur[-1] == (0, Text, text[:-1])
+
+
+@pytest.mark.parametrize('text', ('0 a=b premise', 'b=a symmetry'))
+def test_whitespace_negative_matches(lexer, text):
+    """Test statements that do not start with whitespace text."""
+    assert lexer.whitespace(0, text) == 0
+    with pytest.raises(AssertionError):
+        lexer.whitespace(0, text, True)
+    assert not lexer.cur
+
+
+# terms that can go on either side of an = sign
+
+@pytest.mark.parametrize('text', ('a ', "a' ", 'b ', "c' "))
+def test_variable_positive_matches(lexer, text):
+    """Test fragments that should be tokenized as variables."""
+    assert lexer.variable(0, text) == len(text) - 1
+    assert lexer.cur[-1] == (0, Name.Variable, text[:-1])
+
+
+@pytest.mark.parametrize('text', ("' ", 'f ', "f' "))
+def test_variable_negative_matches(lexer, text):
+    """Test fragments that should **not** be tokenized as variables."""
+    with pytest.raises(AssertionError):
+        lexer.variable(0, text)
+    assert not lexer.cur
+
+
+@pytest.mark.parametrize('text', ('0', 'S0', 'SSSSS0'))
+def test_numeral_positive_matches(lexer, text):
+    """Test fragments that should be tokenized as (unary) numerals."""
+    assert lexer.term(0, text) == len(text)
+    assert lexer.cur[-1] == (len(text) - 1, Number.Integer, text[-1])
+    if text != '0':
+        assert lexer.cur[-2] == (0, Number.Integer, text[:-1])
+
+
+@pytest.mark.parametrize('text', (
+    '(a+b)', '(b.a)', '(c+d)'
+))
+def test_multiterm_positive_matches(lexer, text):
+    """Test fragments that should be tokenized as a compound term."""
+    assert lexer.term(0, text) == len(text)
+    assert [t[1] for t in lexer.cur] == [
+        Punctuation, Name.Variable, Operator,
+        Name.Variable, Punctuation
+    ]
+
+
+@pytest.mark.parametrize('text', ('1', '=', 'A'))
+def test_term_negative_matches(lexer, text):
+    """Test fragments that should not be tokenized as terms at all."""
+    with pytest.raises(AssertionError):
+        lexer.term(0, text)
+    assert not lexer.cur
+
+
+# full statements, minus rule
+
+@pytest.mark.parametrize('text', ('~a=b ', '~~~~a=b '))
+def test_negator_positive_matches(lexer, text):
+    """Test statements that start with a negation."""
+    assert lexer.formula(0, text) == len(text) - 1
+    assert lexer.cur[0] == (0, Operator, text[:-4])
+
+
+@pytest.mark.parametrize('text', ('Aa:a=b ', 'Eb:a=b '))
+def test_quantifier_positive_matches(lexer, text):
+    """Test statements that start with a quantifier."""
+    assert lexer.formula(0, text) == len(text) - 1
+    assert lexer.cur[0][1] == Keyword.Declaration
+    assert lexer.cur[1][1] == Name.Variable
+    assert lexer.cur[2] == (2, Punctuation, ':')
+
+
+@pytest.mark.parametrize('text', ('Aaa=b', 'Eba=b'))
+def test_quantifier_negative_matches(lexer, text):
+    """Test quantifiers that are only partially valid."""
+    with pytest.raises(AssertionError):
+        lexer.formula(0, text)
+    # leftovers should still be valid
+    assert lexer.cur[0][1] == Keyword.Declaration
+    assert lexer.cur[1][1] == Name.Variable
+
+
+@pytest.mark.parametrize('text', ('<a=b&b=a>', '<a=b|b=a>', '<a=b]b=a>'))
+def test_compound_positive_matches(lexer, text):
+    """Test statements that consist of multiple formulas compounded."""
+    assert lexer.formula(0, text) == len(text)
+    assert lexer.cur[0] == (0, Punctuation, '<')
+    assert lexer.cur[4][1] == Operator
+    assert lexer.cur[-1] == (len(text)-1, Punctuation, '>')
+
+
+@pytest.mark.parametrize('text', ('', '
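
Finally, a quick smoke check of the patched lexer outside the test suite, echoing the commit's "make sure nothing crashes" test. This is illustrative only: the sample proof lines are mine, and the exact token output depends on the Pygments version.

```python
from pygments.lexers.tnt import TNTLexer

# A well-formed line followed by two truncated ones; with this patch
# the lexer degrades to Error tokens instead of raising IndexError.
sample = '0 a=b premise\n1 b=a symmetry (\n2 ~\n'
for pos, token, value in TNTLexer().get_tokens_unprocessed(sample):
    print(pos, token, repr(value))
```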