pygments/lexers/tnt.py

# -*- coding: utf-8 -*-
"""
    pygments.lexers.tnt
    ~~~~~~~~~~~~~~~~~~~

    Lexer for Typographic Number Theory.

    :copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re

from pygments.lexer import Lexer
from pygments.token import Text, Comment, Operator, Keyword, Name, Number, \
     Punctuation, Error

__all__ = ['TNTLexer']


class TNTLexer(Lexer):
    """
    Lexer for Typographic Number Theory, as described in the book
    Gödel, Escher, Bach, by Douglas R. Hofstadter,
    or as summarized here:
    https://github.com/Kenny2github/language-tnt/blob/master/README.md#summary-of-tnt

    .. versionadded:: 2.7
    """

    name = 'Typographic Number Theory'
    aliases = ['tnt']
    filenames = ['*.tnt']

    cur = []

    LOGIC = set('⊃→]&∧^|∨Vv')
    OPERATORS = set('+.⋅*')
    VARIABLES = set('abcde')
    PRIMES = set("'′")
    NEGATORS = set('~!')
    QUANTIFIERS = set('AE∀∃')
    NUMBERS = set('0123456789')
    WHITESPACE = set('\t \v\n')

    RULES = re.compile('''(?xi)
        joining | separation | double-tilde | fantasy\\ rule
        | carry[- ]over(?:\\ of)?(?:\\ line)?\\ ([0-9]+) | detachment
        | contrapositive | De\\ Morgan | switcheroo
        | specification | generalization | interchange
        | existence | symmetry | transitivity
        | add\\ S | drop\\ S | induction
        | axiom\\ ([1-5]) | premise | push | pop
    ''')
    LINENOS = re.compile(r'(?:[0-9]+)(?:(?:, ?|,? and )(?:[0-9]+))*')
    COMMENT = re.compile(r'\[[^\n\]]+\]')

    def __init__(self, *args, **kwargs):
        Lexer.__init__(self, *args, **kwargs)
        self.cur = []

    def whitespace(self, start, text, required=False):
        """Tokenize whitespace."""
        end = start
        try:
            while text[end] in self.WHITESPACE:
                end += 1
        except IndexError:
            end = len(text)
        if required:
            assert end != start
        if end != start:
            self.cur.append((start, Text, text[start:end]))
        return end

    def variable(self, start, text):
        """Tokenize a variable."""
        assert text[start] in self.VARIABLES
        end = start+1
        while text[end] in self.PRIMES:
            end += 1
        self.cur.append((start, Name.Variable, text[start:end]))
        return end

    def term(self, start, text):
        """Tokenize a term."""
        if text[start] == 'S':  # S...S(...) or S...0
            end = start+1
            while text[end] == 'S':
                end += 1
            self.cur.append((start, Number.Integer, text[start:end]))
            return self.term(end, text)
        if text[start] == '0':  # the singleton 0
            self.cur.append((start, Number.Integer, text[start]))
            return start+1
        if text[start] in self.VARIABLES:  # a''...
            return self.variable(start, text)
        if text[start] == '(':  # (...+...)
            self.cur.append((start, Punctuation, text[start]))
            start = self.term(start+1, text)
            assert text[start] in self.OPERATORS
            self.cur.append((start, Operator, text[start]))
            start = self.term(start+1, text)
            assert text[start] == ')'
            self.cur.append((start, Punctuation, text[start]))
            return start+1
        raise AssertionError  # no matches

    def formula(self, start, text):
        """Tokenize a formula."""
        if text[start] in self.NEGATORS:  # ~<...>
            end = start+1
            while text[end] in self.NEGATORS:
                end += 1
            self.cur.append((start, Operator, text[start:end]))
            return self.formula(end, text)
        if text[start] in self.QUANTIFIERS:  # Aa:<...>
            self.cur.append((start, Keyword.Declaration, text[start]))
            start = self.variable(start+1, text)
            assert text[start] == ':'
            self.cur.append((start, Punctuation, text[start]))
            return self.formula(start+1, text)
        if text[start] == '<':  # <...&...>
            self.cur.append((start, Punctuation, text[start]))
            start = self.formula(start+1, text)
            assert text[start] in self.LOGIC
            self.cur.append((start, Operator, text[start]))
            start = self.formula(start+1, text)
            assert text[start] == '>'
            self.cur.append((start, Punctuation, text[start]))
            return start+1
        # ...=...
        start = self.term(start, text)
        assert text[start] == '='
        self.cur.append((start, Operator, text[start]))
        start = self.term(start+1, text)
        return start

    def rule(self, start, text):
        """Tokenize a rule."""
        match = self.RULES.match(text, start)
        assert match is not None
        groups = sorted(match.regs[1:])  # exclude whole match
        for group in groups:
            if group[0] >= 0:  # this group matched
                self.cur.append((start, Keyword, text[start:group[0]]))
                self.cur.append((group[0], Number.Integer,
                                 text[group[0]:group[1]]))
                if group[1] != match.end():
                    self.cur.append((group[1], Keyword,
                                     text[group[1]:match.end()]))
                break
        else:
            self.cur.append((start, Keyword, text[start:match.end()]))
        return match.end()

    def lineno(self, start, text):
        """Tokenize a line referral."""
        end = start
        while text[end] not in self.NUMBERS:
            end += 1
        self.cur.append((start, Punctuation, text[start]))
        self.cur.append((start+1, Text, text[start+1:end]))
        start = end
        match = self.LINENOS.match(text, start)
        assert match is not None
        assert text[match.end()] == ')'
        self.cur.append((match.start(), Number.Integer, match.group(0)))
        self.cur.append((match.end(), Punctuation, text[match.end()]))
        return match.end() + 1

    def error_till_line_end(self, start, text):
        """Mark everything from ``start`` to the end of the line as Error."""
        end = start
        try:
            while text[end] != '\n':  # there's whitespace in rules
                end += 1
        except IndexError:
            end = len(text)
        if end != start:
            self.cur.append((start, Error, text[start:end]))
        end = self.whitespace(end, text)
        return end

    def get_tokens_unprocessed(self, text):
        """Returns a list of TNT tokens."""
        self.cur = []
        start = end = self.whitespace(0, text)
        while start <= end < len(text):
            try:
                # try line number
                while text[end] in self.NUMBERS:
                    end += 1
                if end != start:  # actual number present
                    self.cur.append((start, Number.Integer, text[start:end]))
                    # whitespace is required after a line number
                    orig = len(self.cur)
                    try:
                        start = end = self.whitespace(end, text, True)
                    except AssertionError:
                        del self.cur[orig:]
                        start = end = self.error_till_line_end(end, text)
                        continue
                # at this point it could be a comment
                match = self.COMMENT.match(text, start)
                if match is not None:
                    self.cur.append((start, Comment, text[start:match.end()]))
                    start = end = match.end()
                    # anything after the closing bracket is invalid
                    start = end = self.error_till_line_end(start, text)
                    # do not attempt to process the rest
                    continue
                del match
                if text[start] in '[]':  # fantasy push or pop
                    self.cur.append((start, Keyword, text[start]))
                    start += 1
                    end += 1
                else:
                    # one formula, possibly containing subformulae
                    orig = len(self.cur)
                    try:
                        start = end = self.formula(start, text)
                    except AssertionError:  # not well-formed
                        del self.cur[orig:]
                        while text[end] not in self.WHITESPACE:
                            end += 1
                        self.cur.append((start, Error, text[start:end]))
                        start = end
                # skip whitespace after formula
                orig = len(self.cur)
                try:
                    start = end = self.whitespace(end, text, True)
                except AssertionError:
                    del self.cur[orig:]
                    start = end = self.error_till_line_end(start, text)
                    continue
                # rule proving this formula a theorem
                orig = len(self.cur)
                try:
                    start = end = self.rule(start, text)
                except AssertionError:
                    del self.cur[orig:]
                    start = end = self.error_till_line_end(start, text)
                    continue
                # skip whitespace after rule
                start = end = self.whitespace(end, text)
                # line marker
                if text[start] == '(':
                    orig = len(self.cur)
                    try:
                        start = end = self.lineno(start, text)
                    except AssertionError:
                        del self.cur[orig:]
                        start = end = self.error_till_line_end(start, text)
                        continue
                    start = end = self.whitespace(start, text)
            except IndexError:
                try:
                    del self.cur[orig:]
                except NameError:
                    pass # if orig was never defined, fine
                self.error_till_line_end(start, text)
        return self.cur