Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pygments lexer for Wren #1123

Closed
PureFox48 opened this issue Nov 14, 2022 · 2 comments
Closed

Pygments lexer for Wren #1123

PureFox48 opened this issue Nov 14, 2022 · 2 comments

Comments

@PureFox48
Copy link
Contributor

I wonder if anyone who's familiar with either Pygments or other syntax highlighters can help me with this.

Ideally, I'd like a Pygments lexer for Wren to help with my Rosetta Code efforts and which might also be generally useful for the Wren community. Currently, I use the Javascript lexer which is not bad but far from perfect.

Back in 2018 @munificent wrote a Pygments lexer for Wren which I think was originally used to syntax highlght code in the documentation though it has since been withdrawn. This was fairly simple and based on version 0.2.0:

import re
from pygments import highlight
from pygments.lexers import PythonLexer
from pygments.formatters import HtmlFormatter

from pygments.lexer import RegexLexer
from pygments.token import *

class WrenLexer(RegexLexer):
    name = 'Wren'
    aliases = ['wren']
    filenames = ['*.wren']

    flags = re.MULTILINE | re.DOTALL

    tokens = {
        'root': [
            # Whitespace.
            (r'\s+', Text),
            (r'[,\\\[\]{}]', Punctuation),

            # Push a parenthesized state so that we know the corresponding ')'
            # is for a parenthesized expression and not interpolation.
            (r'\(', Punctuation, ('parenthesized', 'root')),

            # In this state, we don't know whether a closing ')' is for a
            # parenthesized expression or the end of an interpolation. So, do
            # a non-consuming match and let the parent state (either
            # 'parenthesized' or 'interpolation' decide.
            (r'(?=\))', Text, '#pop'),

            # Keywords.
            (r'(break|class|construct|else|for|foreign|if|import|in|is|'
             r'return|static|super|var|while)\b', Keyword),

            (r'(true|false|null)\b', Keyword.Constant),

            (r'this\b', Name.Builtin),

            # Comments.
            (r'/\*', Comment.Multiline, 'comment'),
            (r'//.*?$', Comment.Single),

            # Names and operators.
            (r'[~!$%^&*\-=+\\|/?<>\.:]+', Operator),
            (r'[A-Z][a-zA-Z_0-9]+', Name.Variable.Global),
            (r'__[a-zA-Z_0-9]+', Name.Variable.Class),
            (r'_[a-zA-Z_0-9]+', Name.Variable.Instance),
            (r'[a-z][a-zA-Z_0-9]+', Name),

            # Numbers.
            (r'\d+\.\d+([eE]-?\d+)?', Number.Float),
            (r'0x[0-9a-fA-F]+', Number.Hex),
            (r'\d+', Number.Integer),

            # Strings.
            (r'L?"', String, 'string'),
        ],
        'comment': [
            (r'/\*', Comment.Multiline, '#push'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'.', Comment.Multiline), # All other characters.
        ],
        'string': [
            (r'"', String, '#pop'),
            (r'\\[\\%0abfnrtv"\']', String.Escape), # Escape.
            (r'\\x[a-fA-F0-9]{2}', String.Escape), # Byte escape.
            (r'\\u[a-fA-F0-9]{4}', String.Escape), # Unicode escape.
            (r'\\U[a-fA-F0-9]{8}', String.Escape), # Long Unicode escape.

            (r'%\(', String.Interpol, ('interpolation', 'root')),
            (r'.', String), # All other characters.
        ],
        'parenthesized': [
            # We only get to this state when we're at a ')'.
            (r'\)', Punctuation, '#pop'),
        ],
        'interpolation': [
            # We only get to this state when we're at a ')'.
            (r'\)', String.Interpol, '#pop'),
        ],
    }

Taking this as my starting point, I updated it to version 0.4.0 and, after making various other changes including some suggested by the Pygments team themselves, have ended up with the following wren.py which is still quite simple but does everything I'd like it to:

"""
    pygments.lexers.wren
    ~~~~~~~~~~~~~~~~~~~~
    
    Lexer for Wren.
    
    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re

from pygments.lexer import RegexLexer, words
from pygments.token import Whitespace, Punctuation, Keyword, Name, Comment, \
    Operator, Number, String

__all__ = ['WrenLexer']

class WrenLexer(RegexLexer):
    """
    For Wren source code, version 0.4.0.
    .. versionadded:: 2.14.0
    """
    name = 'Wren'
    url = 'https://wren.io'
    aliases = ['wren']
    filenames = ['*.wren']

    flags = re.MULTILINE | re.DOTALL

    tokens = {
        'root': [
            # Whitespace.
            (r'\s+', Whitespace),
            (r'[,\\\[\]{}]', Punctuation),

            # Push a parenthesized state so that we know the corresponding ')'
            # is for a parenthesized expression and not interpolation.
            (r'\(', Punctuation, ('parenthesized', 'root')),

            # In this state, we don't know whether a closing ')' is for a
            # parenthesized expression or the end of an interpolation. So, do
            # a non-consuming match and let the parent state (either
            # 'parenthesized' or 'interpolation') decide.
            (r'(?=\))', Punctuation, '#pop'),

            # Keywords.
            (words((
                'as', 'break', 'class', 'construct', 'continue', 'else',
                'for', 'foreign', 'if', 'import', 'return', 'static', 'super',
                'var', 'while'), prefix = r'(?<!\.)',
                suffix = r'\b'), Keyword),
            (words((
                'true', 'false', 'null'), prefix = r'(?<!\.)',
                suffix = r'\b'), Keyword.Constant),

            (words((
                'this'), prefix = r'(?<!\.)',
                suffix = r'\b'), Name.Builtin),

            (words((
                'in', 'is'), prefix = r'(?<!\.)',
                suffix = r'\b'), Operator.Word),

            # Comments.
            (r'/\*', Comment.Multiline, 'comment'), # Multiline, can nest.
            (r'//.*?$', Comment.Single),            # Single line.
            (r'#.*?(\(.*?\))?$', Comment.Special),  # Attribute or shebang.

            # Names and operators.
            (r'[!%&*+\-./:<=>?\\^|~]+', Operator),
            (r'[a-z][a-zA-Z_0-9]*', Name),
            (r'[A-Z][a-zA-Z_0-9]*', Name.Class),
            (r'__[a-zA-Z_0-9]*', Name.Variable.Class),
            (r'_[a-zA-Z_0-9]*', Name.Variable.Instance),

            # Numbers.
            (r'0x[0-9a-fA-F]+', Number.Hex),
            (r'\d+(\.\d+)?([eE][-+]?\d+)?', Number.Float),

            # Strings.
            (r'""".*?"""', String),   # Raw string
            (r'"', String, 'string'), # Other string
        ],
        'comment': [
            (r'/\*', Comment.Multiline, '#push'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'([^*/]|\*(?!/)|/(?!\*))+', Comment.Multiline),
        ],
        'string': [
            (r'"', String, '#pop'),
            (r'\\[\\%"0abefnrtv]', String.Escape), # Escape.
            (r'\\x[a-fA-F0-9]{2}', String.Escape), # Byte escape.
            (r'\\u[a-fA-F0-9]{4}', String.Escape), # Unicode escape.
            (r'\\U[a-fA-F0-9]{8}', String.Escape), # Long Unicode escape.

            (r'%\(', String.Interpol, ('interpolation', 'root')),
            (r'[^\\"%]+', String), # All remaining characters.
        ],
        'parenthesized': [
            # We only get to this state when we're at a ')'.
            (r'\)', Punctuation, '#pop'),
        ],
        'interpolation': [
            # We only get to this state when we're at a ')'.
            (r'\)', String.Interpol, '#pop'),
        ],
    }

The instructions for building your own lexer are here and everything appears to be working fine using my own example file which I've designed to test all of Wren's keywords and main constructs.

Now, I submitted a PR to Pygments to hopefully get this included in the next release (2.14) but, as you can see from the link, have run into difficulties with their randomized tests :(

In fact, if I try to run the pytest --randomly-seed=2087311863 command on my own machine (Ubuntu 22.04, Python 3.10.6), I only get to 59% completion (within tests/test_basic_api.py) before it gets stuck.

On the face of it the lexer seems to be generating an infinite number of tokens for the random input but, despite trying various things, I haven't been able to resolve this.

Can anyone see what the problem is here?

Assuming multi-line comments are OK, I suspect that the problem may lie with either interpolation or attributes though these are working fine with my own example file and various other files (up to 2000 lines) which I've tested the lexer against.

@PureFox48
Copy link
Contributor Author

PureFox48 commented Nov 14, 2022

Think I may have solved it, simply by catching arbitrary strings of characters at the end of the 'root' section.

The lexer now looks like this:

"""
    pygments.lexers.wren
    ~~~~~~~~~~~~~~~~~~~~
    
    Lexer for Wren.
    
    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re

from pygments.lexer import RegexLexer, words
from pygments.token import Whitespace, Punctuation, Keyword, Name, Comment, \
    Operator, Number, String, Error

__all__ = ['WrenLexer']

class WrenLexer(RegexLexer):
    """
    For Wren source code, version 0.4.0.

    .. versionadded:: 2.14.0
    """
    name = 'Wren'
    url = 'https://wren.io'
    aliases = ['wren']
    filenames = ['*.wren']

    flags = re.MULTILINE | re.DOTALL

    tokens = {
        'root': [
            # Whitespace.
            (r'\s+', Whitespace),
            (r'[,\\\[\]{}]', Punctuation),

            # Push a parenthesized state so that we know the corresponding ')'
            # is for a parenthesized expression and not interpolation.
            (r'\(', Punctuation, ('parenthesized', 'root')),

            # In this state, we don't know whether a closing ')' is for a
            # parenthesized expression or the end of an interpolation. So, do
            # a non-consuming match and let the parent state (either
            # 'parenthesized' or 'interpolation') decide.
            (r'(?=\))', Punctuation, '#pop'),

            # Keywords.
            (words((
                'as', 'break', 'class', 'construct', 'continue', 'else',
                'for', 'foreign', 'if', 'import', 'return', 'static', 'super',
                'this', 'var', 'while'), prefix = r'(?<!\.)',
                suffix = r'\b'), Keyword),

            (words((
                'true', 'false', 'null'), prefix = r'(?<!\.)',
                suffix = r'\b'), Keyword.Constant),

            (words((
                'in', 'is'), prefix = r'(?<!\.)',
                suffix = r'\b'), Operator.Word),

            # Comments.
            (r'/\*', Comment.Multiline, 'comment'), # Multiline, can nest.
            (r'//.*?$', Comment.Single),            # Single line.
            (r'#.*?(\(.*?\))?$', Comment.Special),  # Attribute or shebang.

            # Names and operators.
            (r'[!%&*+\-./:<=>?\\^|~]+', Operator),
            (r'[a-z][a-zA-Z_0-9]*', Name),
            (r'[A-Z][a-zA-Z_0-9]*', Name.Class),
            (r'__[a-zA-Z_0-9]*', Name.Variable.Class),
            (r'_[a-zA-Z_0-9]*', Name.Variable.Instance),

            # Numbers.
            (r'0x[0-9a-fA-F]+', Number.Hex),
            (r'\d+(\.\d+)?([eE][-+]?\d+)?', Number.Float),

            # Strings.
            (r'""".*?"""', String),   # Raw string
            (r'"', String, 'string'), # Other string

            # Errors.
            (r'.+$', Error), 
        ],
        'comment': [
            (r'/\*', Comment.Multiline, '#push'),
            (r'\*/', Comment.Multiline, '#pop'),
            (r'([^*/]|\*(?!/)|/(?!\*))+', Comment.Multiline),
        ],
        'string': [
            (r'"', String, '#pop'),
            (r'\\[\\%"0abefnrtv]', String.Escape), # Escape.
            (r'\\x[a-fA-F0-9]{2}', String.Escape), # Byte escape.
            (r'\\u[a-fA-F0-9]{4}', String.Escape), # Unicode escape.
            (r'\\U[a-fA-F0-9]{8}', String.Escape), # Long Unicode escape.

            (r'%\(', String.Interpol, ('interpolation', 'root')),
            (r'[^\\"%]+', String), # All remaining characters.
        ],
        'parenthesized': [
            # We only get to this state when we're at a ')'.
            (r'\)', Punctuation, '#pop'),
        ],
        'interpolation': [
            # We only get to this state when we're at a ')'.
            (r'\)', String.Interpol, '#pop'),
        ],
    }

This now survives the random pytest. I''ll submit the change and hope for the best :)

Incidentally, I've reclassified this as an ordinary keyword rather than a built-in name as it wasn't highlighting properly before.

@PureFox48
Copy link
Contributor Author

Just a postscript to this issue.

Although my own fix seemed to work, it turned out that it was masking the real problem which was due to parentheses not being matched in the case of invalid input. Anyway, one of the Pygments maintainers has kindly fixed the problem properly for us and the PR has now been merged.

So the Wren syntax highlighter should be 'officially' ready to go from release 2.14.0 which I believe is imminent :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant