TNTLexer: Don't crash on unexpected EOL. (#1570)
* TNTLexer: Don't crash on unexpected EOL

Catch IndexErrors in each line, mark the rest of the line as an error token,
and keep whatever tokens were already found.
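
A simplified sketch of that recovery (not the exact lexer code; parse_line is a
hypothetical stand-in for the per-line tokenizing that get_tokens_unprocessed
really does):

from pygments.token import Error

def lex_line_safely(cur, start, text, parse_line):
    """cur: token list built so far; parse_line: hypothetical tokenizer that
    may raise IndexError when it runs past the end of ``text``."""
    orig = len(cur)                    # tokens found before this attempt are kept
    try:
        return parse_line(cur, start, text)
    except IndexError:
        del cur[orig:]                 # discard partial tokens from the failed attempt
        end = text.find('\n', start)
        if end == -1:
            end = len(text)
        cur.append((start, Error, text[start:end]))  # error the rest of the line
        return end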

* Write and pass tests for Typographic Number Theory

pygments/lexers/tnt.py:
* Fix indentation on import
* Fix: TNTLexer.cur stays a class-level reference unless get_tokens_unprocessed
  is called to initialize it, so initialize it in __init__ too (illustrated
  after this list)
* Fix: fantasy markers are not allowed as components of other formulas,
  so check for them directly in the body of get_tokens_unprocessed,
  which skips the normal formula handling when one is present
* Clarify TNTLexer.lineno docstring
* Attempt to discard tokens before an IndexError
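
A standalone illustration of that pitfall (not the lexer's actual classes): a
mutable attribute that only ever lives on the class is shared by every
instance, whereas initializing it in __init__ gives each instance its own.

class SharedCur:
    cur = []                      # class-level list, shared across instances

class OwnCur:
    def __init__(self):
        self.cur = []             # per-instance list, as the fix above does

a, b = SharedCur(), SharedCur()
a.cur.append('token')
assert b.cur == ['token']         # b sees a's token through the shared list

c, d = OwnCur(), OwnCur()
c.cur.append('token')
assert d.cur == []                # each instance starts with its own empty list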

+tests/test_tnt.py:
* Test every method, with both positive and negative matches for most
  (a hand-run example follows this list)
* The lexer fixture is per-test so that cur starts clean each time
* Don't test the get_tokens_unprocessed method itself (except for fantasy markers)
  because full-text testing is left to the examplefiles
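
For instance, one parametrized case of the formula tests can be reproduced by
hand like this (mirroring test_formula_postive_matches below; not extra test
code from the commit):

from pygments.lexers.tnt import TNTLexer
from pygments.token import Name, Operator

lexer = TNTLexer()                 # fresh instance -> fresh, empty cur
end = lexer.formula(0, 'a=b ')     # trailing space ends the last variable
assert end == 3                    # index where the formula stopped
assert lexer.cur == [(0, Name.Variable, 'a'),
                     (1, Operator, '='),
                     (2, Name.Variable, 'b')]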

AUTHORS:
+ Add myself to credits :)

* Add a TNT test just to make sure nothing crashes
Kenny2github committed Oct 14, 2020
1 parent 5d6b610 commit 3e1b79c
Showing 3 changed files with 272 additions and 54 deletions.
1 change: 1 addition & 0 deletions AUTHORS
@@ -90,6 +90,7 @@ Other contributors, listed alphabetically, are:
* Justin Hendrick -- ParaSail lexer
* Jordi Gutiérrez Hermoso -- Octave lexer
* David Hess, Fish Software, Inc. -- Objective-J lexer
* Ken Hilton -- Typographic Number Theory and Arrow lexers
* Varun Hiremath -- Debian control lexer
* Rob Hoelz -- Perl 6 lexer
* Doug Hogan -- Mscgen lexer
121 changes: 67 additions & 54 deletions pygments/lexers/tnt.py
@@ -13,7 +13,7 @@

from pygments.lexer import Lexer
from pygments.token import Text, Comment, Operator, Keyword, Name, Number, \
Punctuation, Error
Punctuation, Error

__all__ = ['TNTLexer']

@@ -55,6 +55,10 @@ class TNTLexer(Lexer):
LINENOS = re.compile(r'(?:[0-9]+)(?:(?:, ?|,? and )(?:[0-9]+))*')
COMMENT = re.compile(r'\[[^\n\]]+\]')

def __init__(self, *args, **kwargs):
Lexer.__init__(self, *args, **kwargs)
self.cur = []

def whitespace(self, start, text, required=False):
"""Tokenize whitespace."""
end = start
@@ -104,9 +108,6 @@ def term(self, start, text):

def formula(self, start, text):
"""Tokenize a formula."""
if text[start] in '[]': # fantasy push or pop
self.cur.append((start, Keyword, text[start]))
return start+1
if text[start] in self.NEGATORS: # ~<...>
end = start+1
while text[end] in self.NEGATORS:
@@ -154,7 +155,7 @@ def rule(self, start, text):
return match.end()

def lineno(self, start, text):
"""Tokenize a line marker."""
"""Tokenize a line referral."""
end = start
while text[end] not in self.NUMBERS:
end += 1
@@ -186,65 +187,77 @@ def get_tokens_unprocessed(self, text):
self.cur = []
start = end = self.whitespace(0, text)
while start <= end < len(text):
# try line number
while text[end] in self.NUMBERS:
end += 1
if end != start: # actual number present
self.cur.append((start, Number.Integer, text[start:end]))
# whitespace is required after a line number
try:
# try line number
while text[end] in self.NUMBERS:
end += 1
if end != start: # actual number present
self.cur.append((start, Number.Integer, text[start:end]))
# whitespace is required after a line number
orig = len(self.cur)
try:
start = end = self.whitespace(end, text, True)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(end, text)
continue
# at this point it could be a comment
match = self.COMMENT.match(text, start)
if match is not None:
self.cur.append((start, Comment, text[start:match.end()]))
start = end = match.end()
# anything after the closing bracket is invalid
start = end = self.error_till_line_end(start, text)
# do not attempt to process the rest
continue
del match
if text[start] in '[]': # fantasy push or pop
self.cur.append((start, Keyword, text[start]))
start += 1
end += 1
else:
# one formula, possibly containing subformulae
orig = len(self.cur)
try:
start = end = self.formula(start, text)
except AssertionError: # not well-formed
del self.cur[orig:]
while text[end] not in self.WHITESPACE:
end += 1
self.cur.append((start, Error, text[start:end]))
start = end
# skip whitespace after formula
orig = len(self.cur)
try:
start = end = self.whitespace(end, text, True)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(end, text)
start = end = self.error_till_line_end(start, text)
continue
# at this point it could be a comment
match = self.COMMENT.match(text, start)
if match is not None:
self.cur.append((start, Comment, text[start:match.end()]))
start = end = match.end()
# anything after the closing bracket is invalid
start = end = self.error_till_line_end(start, text)
# do not attempt to process the rest
continue
del match
# one formula, possibly containing subformulae
orig = len(self.cur)
try:
start = end = self.formula(start, text)
except AssertionError: # not well-formed
del self.cur[orig:]
while text[end] not in self.WHITESPACE:
end += 1
self.cur.append((start, Error, text[start:end]))
start = end
# skip whitespace after formula
orig = len(self.cur)
try:
start = end = self.whitespace(end, text, True)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(start, text)
continue
# rule proving this formula a theorem
orig = len(self.cur)
try:
start = end = self.rule(start, text)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(start, text)
continue
# skip whitespace after rule
start = end = self.whitespace(end, text)
# line marker
if text[start] == '(':
# rule proving this formula a theorem
orig = len(self.cur)
try:
start = end = self.lineno(start, text)
start = end = self.rule(start, text)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(start, text)
continue
start = end = self.whitespace(start, text)
# skip whitespace after rule
start = end = self.whitespace(end, text)
# line marker
if text[start] == '(':
orig = len(self.cur)
try:
start = end = self.lineno(start, text)
except AssertionError:
del self.cur[orig:]
start = end = self.error_till_line_end(start, text)
continue
start = end = self.whitespace(start, text)
except IndexError:
try:
del self.cur[orig:]
except NameError:
pass # if orig was never defined, fine
self.error_till_line_end(start, text)
return self.cur
204 changes: 204 additions & 0 deletions tests/test_tnt.py
@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
"""
Typographic Number Theory tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""

import pytest

from pygments.lexers.tnt import TNTLexer
from pygments.token import Text, Comment, Operator, Keyword, Name, Number, \
Punctuation, Error

@pytest.fixture(autouse=True)
def lexer():
yield TNTLexer()

# whitespace

@pytest.mark.parametrize('text', (' a', ' \t0', '\n\n 3'))
def test_whitespace_positive_matches(lexer, text):
"""Test fragments that should be tokenized as whitespace text."""
assert lexer.whitespace(0, text) == len(text) - 1
assert lexer.whitespace(0, text, True) == len(text) - 1
assert lexer.cur[-1] == (0, Text, text[:-1])

@pytest.mark.parametrize('text', ('0 a=b premise', 'b=a symmetry'))
def test_whitespace_negative_matches(lexer, text):
"""Test statements that do not start with whitespace text."""
assert lexer.whitespace(0, text) == 0
with pytest.raises(AssertionError):
lexer.whitespace(0, text, True)
assert not lexer.cur

# terms that can go on either side of an = sign

@pytest.mark.parametrize('text', ('a ', "a' ", 'b ', "c' "))
def test_variable_positive_matches(lexer, text):
"""Test fragments that should be tokenized as variables."""
assert lexer.variable(0, text) == len(text) - 1
assert lexer.cur[-1] == (0, Name.Variable, text[:-1])

@pytest.mark.parametrize('text', ("' ", 'f ', "f' "))
def test_variable_negative_matches(lexer, text):
"""Test fragments that should **not** be tokenized as variables."""
with pytest.raises(AssertionError):
lexer.variable(0, text)
assert not lexer.cur

@pytest.mark.parametrize('text', ('0', 'S0', 'SSSSS0'))
def test_numeral_positive_matches(lexer, text):
"""Test fragments that should be tokenized as (unary) numerals."""
assert lexer.term(0, text) == len(text)
assert lexer.cur[-1] == (len(text) - 1, Number.Integer, text[-1])
if text != '0':
assert lexer.cur[-2] == (0, Number.Integer, text[:-1])

@pytest.mark.parametrize('text', (
'(a+b)', '(b.a)', '(c+d)'
))
def test_multiterm_positive_matches(lexer, text):
"""Test fragments that should be tokenized as a compound term."""
assert lexer.term(0, text) == len(text)
assert [t[1] for t in lexer.cur] == [
Punctuation, Name.Variable, Operator,
Name.Variable, Punctuation
]

@pytest.mark.parametrize('text', ('1', '=', 'A'))
def test_term_negative_matches(lexer, text):
"""Test fragments that should not be tokenized as terms at all."""
with pytest.raises(AssertionError):
lexer.term(0, text)
assert not lexer.cur

# full statements, minus rule

@pytest.mark.parametrize('text', ('~a=b ', '~~~~a=b '))
def test_negator_positive_matches(lexer, text):
"""Test statements that start with a negation."""
assert lexer.formula(0, text) == len(text) - 1
assert lexer.cur[0] == (0, Operator, text[:-4])

@pytest.mark.parametrize('text', ('Aa:a=b ', 'Eb:a=b '))
def test_quantifier_positive_matches(lexer, text):
"""Test statements that start with a quantifier."""
assert lexer.formula(0, text) == len(text) - 1
assert lexer.cur[0][1] == Keyword.Declaration
assert lexer.cur[1][1] == Name.Variable
assert lexer.cur[2] == (2, Punctuation, ':')

@pytest.mark.parametrize('text', ('Aaa=b', 'Eba=b'))
def test_quantifier_negative_matches(lexer, text):
"""Test quantifiers that are only partially valid."""
with pytest.raises(AssertionError):
lexer.formula(0, text)
# leftovers should still be valid
assert lexer.cur[0][1] == Keyword.Declaration
assert lexer.cur[1][1] == Name.Variable

@pytest.mark.parametrize('text', ('<a=b&b=a>', '<a=b|b=a>', '<a=b]b=a>'))
def test_compound_positive_matches(lexer, text):
"""Test statements that consist of multiple formulas compounded."""
assert lexer.formula(0, text) == len(text)
assert lexer.cur[0] == (0, Punctuation, '<')
assert lexer.cur[4][1] == Operator
assert lexer.cur[-1] == (len(text)-1, Punctuation, '>')

@pytest.mark.parametrize('text', ('<a=b/b=a>', '<a=b&b=a '))
def test_compound_negative_matches(lexer, text):
"""Test statements that look like compounds but are invalid."""
with pytest.raises(AssertionError):
lexer.formula(0, text)
assert lexer.cur[0] == (0, Punctuation, '<')

@pytest.mark.parametrize('text', ('a=b ', 'a=0 ', '0=b '))
def test_formula_postive_matches(lexer, text):
"""Test the normal singular formula."""
assert lexer.formula(0, text) == len(text) - 1
assert lexer.cur[0][2] == text[0]
assert lexer.cur[1] == (1, Operator, '=')
assert lexer.cur[2][2] == text[2]

@pytest.mark.parametrize('text', ('a/b', '0+0 '))
def test_formula_negative_matches(lexer, text):
"""Test anything but an equals sign."""
with pytest.raises(AssertionError):
lexer.formula(0, text)

# rules themselves

@pytest.mark.parametrize('text', (
'fantasy rule', 'carry over line 5', 'premise', 'joining',
'double-tilde', 'switcheroo', 'De Morgan', 'specification'
))
def test_rule_positive_matches(lexer, text):
"""Test some valid rules of TNT."""
assert lexer.rule(0, text) == len(text)
assert lexer.cur[0][:2] == (0, Keyword)
if text[-1].isdigit():
assert lexer.cur[1][1] == Number.Integer

@pytest.mark.parametrize('text', (
'fantasy', 'carry over', 'premse', 'unjoining',
'triple-tilde', 'switcheru', 'De-Morgan', 'despecification'
))
def test_rule_negative_matches(lexer, text):
"""Test some invalid rules of TNT."""
with pytest.raises(AssertionError):
lexer.rule(0, text)

# referrals

@pytest.mark.parametrize('text', ('(lines 1, 2, and 4)', '(line 3,5,6)', '(lines 1, 6 and 0)'))
def test_lineno_positive_matches(lexer, text):
"""Test line referrals."""
assert lexer.lineno(0, text) == len(text)
assert lexer.cur[0] == (0, Punctuation, '(')
assert lexer.cur[1][:2] == (1, Text)
assert lexer.cur[2][1] == Number.Integer
assert lexer.cur[3] == (len(text)-1, Punctuation, ')')

@pytest.mark.parametrize('text', (
'(lines one, two, and four)1 ', # to avoid IndexError
'(lines 1 2 and 3)', '(lines 1 2 3)'
))
def test_lineno_negative_matches(lexer, text):
"""Test invalid line referrals."""
with pytest.raises(AssertionError):
lexer.lineno(0, text)

# worst-case: error text

@pytest.mark.parametrize('text', ('asdf', 'fdsa\nasdf', 'asdf\n '))
def test_error_till_line_end(lexer, text):
try:
nl = text.index('\n')
except ValueError:
nl = len(text)
try:
end = text.find(text.split(None, 2)[1])
except IndexError: # split failed
end = len(text)
assert lexer.error_till_line_end(0, text) == end
assert lexer.cur[0] == (0, Error, text[:nl])

# full statement, including rule (because this can't be tested any other way)

@pytest.mark.parametrize('text', ('[ push', '] pop'))
def test_fantasy_positive_matches(lexer, text):
"""Test statements that should be tokenized as push/pop statements."""
assert lexer.get_tokens_unprocessed(text)[0] == (0, Keyword, text[0])

# full text is already done by examplefiles, but here's some exceptions

@pytest.mark.parametrize('text', (
'0', 'a=b', 'premise',
'0 a=b premise', '1 b=a symmetry (line 0)'
))
def test_no_crashing(lexer, text):
"""Test incomplete text fragments that shouldn't crash the whole lexer."""
assert lexer.get_tokens(text)
