TNTLexer: Don't crash on unexpected EOL. (#1570)
* TNTLexer: Don't crash on unexpected EOL

Catch IndexErrors in each line, mark the rest of the line as an error token,
and keep whatever tokens were already found.
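
A simplified sketch of that recovery (not the exact lexer code; parse_line is a
hypothetical stand-in for the per-line tokenizing that get_tokens_unprocessed
really does):

from pygments.token import Error

def lex_line_safely(cur, start, text, parse_line):
    """cur: token list built so far; parse_line: hypothetical tokenizer that
    may raise IndexError when it runs past the end of ``text``."""
    orig = len(cur)                    # tokens found before this attempt are kept
    try:
        return parse_line(cur, start, text)
    except IndexError:
        del cur[orig:]                 # discard partial tokens from the failed attempt
        end = text.find('\n', start)
        if end == -1:
            end = len(text)
        cur.append((start, Error, text[start:end]))  # error the rest of the line
        return end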

* Write and pass tests for Typographic Number Theory

pygments/lexers/tnt.py:
* Fix indentation on import
* Fix: TNTLexer.cur stays a class-level reference unless get_tokens_unprocessed
  is called to initialize it, so initialize it in __init__ too (illustrated
  after this list)
* Fix: fantasy markers are not allowed as components of other formulas,
  so check for them directly in the body of get_tokens_unprocessed,
  which skips the normal formula handling when one is present
* Clarify TNTLexer.lineno docstring
* Attempt to discard tokens before an IndexError
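
A standalone illustration of that pitfall (not the lexer's actual classes): a
mutable attribute that only ever lives on the class is shared by every
instance, whereas initializing it in __init__ gives each instance its own.

class SharedCur:
    cur = []                      # class-level list, shared across instances

class OwnCur:
    def __init__(self):
        self.cur = []             # per-instance list, as the fix above does

a, b = SharedCur(), SharedCur()
a.cur.append('token')
assert b.cur == ['token']         # b sees a's token through the shared list

c, d = OwnCur(), OwnCur()
c.cur.append('token')
assert d.cur == []                # each instance starts with its own empty list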

+tests/test_tnt.py:
* Test every method, with both positive and negative matches for most
  (a hand-run example follows this list)
* The lexer fixture is per-test so that cur starts clean each time
* Don't test the get_tokens_unprocessed method itself (except for fantasy markers)
  because full-text testing is left to the examplefiles
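
For instance, one parametrized case of the formula tests can be reproduced by
hand like this (mirroring test_formula_postive_matches below; not extra test
code from the commit):

from pygments.lexers.tnt import TNTLexer
from pygments.token import Name, Operator

lexer = TNTLexer()                 # fresh instance -> fresh, empty cur
end = lexer.formula(0, 'a=b ')     # trailing space ends the last variable
assert end == 3                    # index where the formula stopped
assert lexer.cur == [(0, Name.Variable, 'a'),
                     (1, Operator, '='),
                     (2, Name.Variable, 'b')]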

AUTHORS:
+ Add myself to credits :)

* Add a TNT test just to make sure nothing crashes
Kenny2github committed Oct 14, 2020
1 parent 5d6b610 commit 3e1b79c
Showing 3 changed files with 272 additions and 54 deletions.
1 change: 1 addition & 0 deletions AUTHORS
@@ -90,6 +90,7 @@ Other contributors, listed alphabetically, are:
* Justin Hendrick -- ParaSail lexer
* Jordi Gutiérrez Hermoso -- Octave lexer
* David Hess, Fish Software, Inc. -- Objective-J lexer
* Ken Hilton -- Typographic Number Theory and Arrow lexers
* Varun Hiremath -- Debian control lexer
* Rob Hoelz -- Perl 6 lexer
* Doug Hogan -- Mscgen lexer
121 changes: 67 additions & 54 deletions pygments/lexers/tnt.py
@@ -13,7 +13,7 @@

from pygments.lexer import Lexer
from pygments.token import Text, Comment, Operator, Keyword, Name, Number, \
Punctuation, Error
Punctuation, Error

__all__ = ['TNTLexer']

@@ -55,6 +55,10 @@ class TNTLexer(Lexer):
LINENOS = re.compile(r'(?:[0-9]+)(?:(?:, ?|,? and )(?:[0-9]+))*')
COMMENT = re.compile(r'\[[^\n\]]+\]')

def __init__(self, *args, **kwargs):
Lexer.__init__(self, *args, **kwargs)
self.cur = []

def whitespace(self, start, text, required=False):
"""Tokenize whitespace."""
end = start
@@ -104,9 +108,6 @@ def term(self, start, text):

def formula(self, start, text):
"""Tokenize a formula."""
if text[start] in '[]': # fantasy push or pop
self.cur.append((start, Keyword, text[start]))
return start+1
if text[start] in self.NEGATORS: # ~<...>
end = start+1
while text[end] in self.NEGATORS:
@@ -154,7 +155,7 @@ def rule(self, start, text):
return match.end()

def lineno(self, start, text):
"""Tokenize a line marker."""
"""Tokenize a line referral."""
end = start
while text[end] not in self.NUMBERS:
end += 1
@@ -186,65 +187,77 @@ def get_tokens_unprocessed(self, text):
self.cur = []
start = end = self.whitespace(0, text)
while start <= end < len(text):
# try line number
while text[end] in self.NUMBERS:
end += 1
if end != start: # actual number present
self.cur.append((start, Number.Integer, text[start:end]))
# whitespace is required after a line number
try:
# try line number
while text[end] in self.NUMBERS:
end += 1
if end != start: # actual number present
self.cur.append((start, Number.Integer, text[start:end]))
# whitespace is required after a line number
orig = len(self.cur)
try:
start = end = self.whitespace(end, text, True)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(end, text)
continue
# at this point it could be a comment
match = self.COMMENT.match(text, start)
if match is not None:
self.cur.append((start, Comment, text[start:match.end()]))
start = end = match.end()
# anything after the closing bracket is invalid
start = end = self.error_till_line_end(start, text)
# do not attempt to process the rest
continue
del match
if text[start] in '[]': # fantasy push or pop
self.cur.append((start, Keyword, text[start]))
start += 1
end += 1
else:
# one formula, possibly containing subformulae
orig = len(self.cur)
try:
start = end = self.formula(start, text)
except AssertionError: # not well-formed
del self.cur[orig:]
while text[end] not in self.WHITESPACE:
end += 1
self.cur.append((start, Error, text[start:end]))
start = end
# skip whitespace after formula
orig = len(self.cur)
try:
start = end = self.whitespace(end, text, True)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(end, text)
start = end = self.error_till_line_end(start, text)
continue
# at this point it could be a comment
match = self.COMMENT.match(text, start)
if match is not None:
self.cur.append((start, Comment, text[start:match.end()]))
start = end = match.end()
# anything after the closing bracket is invalid
start = end = self.error_till_line_end(start, text)
# do not attempt to process the rest
continue
del match
# one formula, possibly containing subformulae
orig = len(self.cur)
try:
start = end = self.formula(start, text)
except AssertionError: # not well-formed
del self.cur[orig:]
while text[end] not in self.WHITESPACE:
end += 1
self.cur.append((start, Error, text[start:end]))
start = end
# skip whitespace after formula
orig = len(self.cur)
try:
start = end = self.whitespace(end, text, True)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(start, text)
continue
# rule proving this formula a theorem
orig = len(self.cur)
try:
start = end = self.rule(start, text)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(start, text)
continue
# skip whitespace after rule
start = end = self.whitespace(end, text)
# line marker
if text[start] == '(':
# rule proving this formula a theorem
orig = len(self.cur)
try:
start = end = self.lineno(start, text)
start = end = self.rule(start, text)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(start, text)
continue
start = end = self.whitespace(start, text)
# skip whitespace after rule
start = end = self.whitespace(end, text)
# line marker
if text[start] == '(':
orig = len(self.cur)
try:
start = end = self.lineno(start, text)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(start, text)
continue
start = end = self.whitespace(start, text)
except IndexError:
try:
del self.cur[orig:]
except NameError:
pass # if orig was never defined, fine
self.error_till_line_end(start, text)
return self.cur
204 changes: 204 additions & 0 deletions tests/test_tnt.py
@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
"""
Typographic Number Theory tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""

import pytest

from pygments.lexers.tnt import TNTLexer
from pygments.token import Text, Comment, Operator, Keyword, Name, Number, \
Punctuation, Error

@pytest.fixture(autouse=True)
def lexer():
yield TNTLexer()

# whitespace

@pytest.mark.parametrize('text', (' a', ' \t0', '\n\n 3'))
def test_whitespace_positive_matches(lexer, text):
"""Test fragments that should be tokenized as whitespace text."""
assert lexer.whitespace(0, text) == len(text) - 1
assert lexer.whitespace(0, text, True) == len(text) - 1
assert lexer.cur[-1] == (0, Text, text[:-1])

@pytest.mark.parametrize('text', ('0 a=b premise', 'b=a symmetry'))
def test_whitespace_negative_matches(lexer, text):
"""Test statements that do not start with whitespace text."""
assert lexer.whitespace(0, text) == 0
with pytest.raises(AssertionError):
lexer.whitespace(0, text, True)
assert not lexer.cur

# terms that can go on either side of an = sign

@pytest.mark.parametrize('text', ('a ', "a' ", 'b ', "c' "))
def test_variable_positive_matches(lexer, text):
"""Test fragments that should be tokenized as variables."""
assert lexer.variable(0, text) == len(text) - 1
assert lexer.cur[-1] == (0, Name.Variable, text[:-1])

@pytest.mark.parametrize('text', ("' ", 'f ', "f' "))
def test_variable_negative_matches(lexer, text):
"""Test fragments that should **not** be tokenized as variables."""
with pytest.raises(AssertionError):
lexer.variable(0, text)
assert not lexer.cur

@pytest.mark.parametrize('text', ('0', 'S0', 'SSSSS0'))
def test_numeral_positive_matches(lexer, text):
"""Test fragments that should be tokenized as (unary) numerals."""
assert lexer.term(0, text) == len(text)
assert lexer.cur[-1] == (len(text) - 1, Number.Integer, text[-1])
if text != '0':
assert lexer.cur[-2] == (0, Number.Integer, text[:-1])

@pytest.mark.parametrize('text', (
'(a+b)', '(b.a)', '(c+d)'
))
def test_multiterm_positive_matches(lexer, text):
"""Test fragments that should be tokenized as a compound term."""
assert lexer.term(0, text) == len(text)
assert [t[1] for t in lexer.cur] == [
Punctuation, Name.Variable, Operator,
Name.Variable, Punctuation
]

@pytest.mark.parametrize('text', ('1', '=', 'A'))
def test_term_negative_matches(lexer, text):
"""Test fragments that should not be tokenized as terms at all."""
with pytest.raises(AssertionError):
lexer.term(0, text)
assert not lexer.cur

# full statements, minus rule

@pytest.mark.parametrize('text', ('~a=b ', '~~~~a=b '))
def test_negator_positive_matches(lexer, text):
"""Test statements that start with a negation."""
assert lexer.formula(0, text) == len(text) - 1
assert lexer.cur[0] == (0, Operator, text[:-4])

@pytest.mark.parametrize('text', ('Aa:a=b ', 'Eb:a=b '))
def test_quantifier_positive_matches(lexer, text):
"""Test statements that start with a quantifier."""
assert lexer.formula(0, text) == len(text) - 1
assert lexer.cur[0][1] == Keyword.Declaration
assert lexer.cur[1][1] == Name.Variable
assert lexer.cur[2] == (2, Punctuation, ':')

@pytest.mark.parametrize('text', ('Aaa=b', 'Eba=b'))
def test_quantifier_negative_matches(lexer, text):
"""Test quantifiers that are only partially valid."""
with pytest.raises(AssertionError):
lexer.formula(0, text)
# leftovers should still be valid
assert lexer.cur[0][1] == Keyword.Declaration
assert lexer.cur[1][1] == Name.Variable

@pytest.mark.parametrize('text', ('<a=b&b=a>', '<a=b|b=a>', '<a=b]b=a>'))
def test_compound_positive_matches(lexer, text):
"""Test statements that consist of multiple formulas compounded."""
assert lexer.formula(0, text) == len(text)
assert lexer.cur[0] == (0, Punctuation, '<')
assert lexer.cur[4][1] == Operator
assert lexer.cur[-1] == (len(text)-1, Punctuation, '>')

@pytest.mark.parametrize('text', ('<a=b/b=a>', '<a=b&b=a '))
def test_compound_negative_matches(lexer, text):
"""Test statements that look like compounds but are invalid."""
with pytest.raises(AssertionError):
lexer.formula(0, text)
assert lexer.cur[0] == (0, Punctuation, '<')

@pytest.mark.parametrize('text', ('a=b ', 'a=0 ', '0=b '))
def test_formula_postive_matches(lexer, text):
"""Test the normal singular formula."""
assert lexer.formula(0, text) == len(text) - 1
assert lexer.cur[0][2] == text[0]
assert lexer.cur[1] == (1, Operator, '=')
assert lexer.cur[2][2] == text[2]

@pytest.mark.parametrize('text', ('a/b', '0+0 '))
def test_formula_negative_matches(lexer, text):
"""Test anything but an equals sign."""
with pytest.raises(AssertionError):
lexer.formula(0, text)

# rules themselves

@pytest.mark.parametrize('text', (
'fantasy rule', 'carry over line 5', 'premise', 'joining',
'double-tilde', 'switcheroo', 'De Morgan', 'specification'
))
def test_rule_positive_matches(lexer, text):
"""Test some valid rules of TNT."""
assert lexer.rule(0, text) == len(text)
assert lexer.cur[0][:2] == (0, Keyword)
if text[-1].isdigit():
assert lexer.cur[1][1] == Number.Integer

@pytest.mark.parametrize('text', (
'fantasy', 'carry over', 'premse', 'unjoining',
'triple-tilde', 'switcheru', 'De-Morgan', 'despecification'
))
def test_rule_negative_matches(lexer, text):
"""Test some invalid rules of TNT."""
with pytest.raises(AssertionError):
lexer.rule(0, text)

# referrals

@pytest.mark.parametrize('text', ('(lines 1, 2, and 4)', '(line 3,5,6)', '(lines 1, 6 and 0)'))
def test_lineno_positive_matches(lexer, text):
"""Test line referrals."""
assert lexer.lineno(0, text) == len(text)
assert lexer.cur[0] == (0, Punctuation, '(')
assert lexer.cur[1][:2] == (1, Text)
assert lexer.cur[2][1] == Number.Integer
assert lexer.cur[3] == (len(text)-1, Punctuation, ')')

@pytest.mark.parametrize('text', (
'(lines one, two, and four)1 ', # to avoid IndexError
'(lines 1 2 and 3)', '(lines 1 2 3)'
))
def test_lineno_negative_matches(lexer, text):
"""Test invalid line referrals."""
with pytest.raises(AssertionError):
lexer.lineno(0, text)

# worst-case: error text

@pytest.mark.parametrize('text', ('asdf', 'fdsa\nasdf', 'asdf\n '))
def test_error_till_line_end(lexer, text):
try:
nl = text.index('\n')
except ValueError:
nl = len(text)
try:
end = text.find(text.split(None, 2)[1])
except IndexError: # split failed
end = len(text)
assert lexer.error_till_line_end(0, text) == end
assert lexer.cur[0] == (0, Error, text[:nl])

# full statement, including rule (because this can't be tested any other way)

@pytest.mark.parametrize('text', ('[ push', '] pop'))
def test_fantasy_positive_matches(lexer, text):
"""Test statements that should be tokenized as push/pop statements."""
assert lexer.get_tokens_unprocessed(text)[0] == (0, Keyword, text[0])

# full text is already done by examplefiles, but here's some exceptions

@pytest.mark.parametrize('text', (
'0', 'a=b', 'premise',
'0 a=b premise', '1 b=a symmetry (line 0)'
))
def test_no_crashing(lexer, text):
"""Test incomplete text fragments that shouldn't crash the whole lexer."""
assert lexer.get_tokens(text)
