From f8b097c61cf86616012b616b0b873384cf8df236 Mon Sep 17 00:00:00 2001
From: Oleh Prypin
Date: Mon, 4 Jan 2021 18:12:07 +0100
Subject: [PATCH] Update Crystal lexer (#1650)

* crystal: drop all classes from builtins; these aren't normally
  highlighted ("normally" meaning in all other highlighter tools)

* crystal: fix percent-strings, drop Ruby-specific arbitrary delimiters

  It seems that Ruby supports strings such as `%*text*` where `*` can be
  anything, but Crystal never had anything like that. It does, however,
  keep `%|text|`, so add a case for that.

* crystal: update keywords and builtins

* crystal: fix string literals and escape sequences

  Update the list of escapes and support Unicode escape sequences. Also
  remove the Ruby-specific `:@foo` symbol syntax; Crystal doesn't have it.

* crystal: uppercase identifiers aren't always constants

  Make `FOO::Bar` be highlighted like `Foo::Bar` would be, rather than
  like `FOO`.

* crystal: annotations can be namespaced

  Highlight the entire inside part of `@[Foo::Bar]`, not just the `Foo`
  part. (These used to be named 'attributes', but the official name is
  'annotations' now, so change that as well.)

* fixup! crystal: fix percent-strings, drop Ruby-specific arbitrary delimiters
---
 pygments/lexers/crystal.py | 119 +++++++------------
 tests/examplefiles/test.cr |  16 +--
 tests/test_crystal.py      | 237 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 282 insertions(+), 90 deletions(-)

diff --git a/pygments/lexers/crystal.py b/pygments/lexers/crystal.py
index 8c648e3fe7..e16a01f7e4 100644
--- a/pygments/lexers/crystal.py
+++ b/pygments/lexers/crystal.py
@@ -89,26 +89,10 @@ def heredoc_callback(self, match, ctx):
         del heredocstack[:]
 
     def gen_crystalstrings_rules():
-        def intp_regex_callback(self, match, ctx):
-            yield match.start(1), String.Regex, match.group(1)  # begin
-            nctx = LexerContext(match.group(3), 0, ['interpolated-regex'])
-            for i, t, v in self.get_tokens_unprocessed(context=nctx):
-                yield match.start(3)+i, t, v
-            yield match.start(4), String.Regex, match.group(4)  # end[imsx]*
-            ctx.pos = match.end()
-
-        def intp_string_callback(self, match, ctx):
-            yield match.start(1), String.Other, match.group(1)
-            nctx = LexerContext(match.group(3), 0, ['interpolated-string'])
-            for i, t, v in self.get_tokens_unprocessed(context=nctx):
-                yield match.start(3)+i, t, v
-            yield match.start(4), String.Other, match.group(4)  # end
-            ctx.pos = match.end()
-
         states = {}
         states['strings'] = [
-            (r'\:@{0,2}[a-zA-Z_]\w*[!?]?', String.Symbol),
-            (words(CRYSTAL_OPERATORS, prefix=r'\:@{0,2}'), String.Symbol),
+            (r'\:\w+[!?]?', String.Symbol),
+            (words(CRYSTAL_OPERATORS, prefix=r'\:'), String.Symbol),
             (r":'(\\\\|\\[^\\]|[^'\\])*'", String.Symbol),
             # This allows arbitrary text after '\ for simplicity
             (r"'(\\\\|\\'|[^']|\\[^'\\]+)'", String.Char),
@@ -130,35 +114,42 @@ def intp_string_callback(self, match, ctx):
                 (end, ttype, '#pop'),
             ]
 
-        # braced quoted strings
+        # https://crystal-lang.org/docs/syntax_and_semantics/literals/string.html#percent-string-literals
        for lbrace, rbrace, bracecc, name in \
                ('\\{', '\\}', '{}', 'cb'), \
                ('\\[', '\\]', '\\[\\]', 'sb'), \
                ('\\(', '\\)', '()', 'pa'), \
-               ('<', '>', '<>', 'ab'):
+               ('<', '>', '<>', 'ab'), \
+               ('\\|', '\\|', '\\|', 'pi'):
             states[name+'-intp-string'] = [
                 (r'\\' + lbrace, String.Other),
+            ] + (lbrace != rbrace) * [
                 (lbrace, String.Other, '#push'),
+            ] + [
                 (rbrace, String.Other, '#pop'),
                 include('string-intp-escaped'),
                 (r'[\\#' + bracecc + ']', String.Other),
                 (r'[^\\#' + bracecc + ']+', String.Other),
             ]
-            states['strings'].append((r'%' + lbrace, String.Other,
+            states['strings'].append((r'%Q?' + lbrace, String.Other,
                                       name+'-intp-string'))
             states[name+'-string'] = [
                 (r'\\[\\' + bracecc + ']', String.Other),
+            ] + (lbrace != rbrace) * [
                 (lbrace, String.Other, '#push'),
+            ] + [
                 (rbrace, String.Other, '#pop'),
                 (r'[\\#' + bracecc + ']', String.Other),
                 (r'[^\\#' + bracecc + ']+', String.Other),
             ]
-            # http://crystal-lang.org/docs/syntax_and_semantics/literals/array.html
-            states['strings'].append((r'%[wi]' + lbrace, String.Other,
+            # https://crystal-lang.org/docs/syntax_and_semantics/literals/array.html#percent-array-literals
+            states['strings'].append((r'%[qwi]' + lbrace, String.Other,
                                       name+'-string'))
             states[name+'-regex'] = [
                 (r'\\[\\' + bracecc + ']', String.Regex),
+            ] + (lbrace != rbrace) * [
                 (lbrace, String.Regex, '#push'),
+            ] + [
                 (rbrace + '[imsx]*', String.Regex, '#pop'),
                 include('string-intp'),
                 (r'[\\#' + bracecc + ']', String.Regex),
@@ -167,27 +158,6 @@ def intp_string_callback(self, match, ctx):
             states['strings'].append((r'%r' + lbrace, String.Regex,
                                       name+'-regex'))
 
-        # these must come after %<sign>!
-        states['strings'] += [
-            # %r regex
-            (r'(%r([\W_]))((?:\\\2|(?!\2).)*)(\2[imsx]*)',
-             intp_regex_callback),
-            # regular fancy strings with qsw
-            (r'(%[wi]([\W_]))((?:\\\2|(?!\2).)*)(\2)',
-             intp_string_callback),
-            # special forms of fancy strings after operators or
-            # in method calls with braces
-            (r'(?<=[-+/*%=<>&!^|~,(])(\s*)(%([\t ])(?:(?:\\\3|(?!\3).)*)\3)',
-             bygroups(Text, String.Other, None)),
-            # and because of fixed width lookbehinds the whole thing a
-            # second time for line startings...
-            (r'^(\s*)(%([\t ])(?:(?:\\\3|(?!\3).)*)\3)',
-             bygroups(Text, String.Other, None)),
-            # all regular fancy strings without qsw
-            (r'(%([\[{(<]))((?:\\\2|(?!\2).)*)(\2)',
-             intp_string_callback),
-        ]
-
         return states
 
     tokens = {
@@ -195,10 +165,15 @@ def intp_string_callback(self, match, ctx):
             (r'#.*?$', Comment.Single),
             # keywords
             (words('''
-                abstract asm as begin break case do else elsif end ensure extend ifdef if
-                include instance_sizeof next of pointerof private protected rescue return
-                require sizeof super then typeof unless until when while with yield
+                abstract asm begin break case do else elsif end ensure extend if in
+                include next of private protected require rescue return select self super
+                then unless until when while with yield
             '''.split(), suffix=r'\b'), Keyword),
+            (words('''
+                previous_def forall out uninitialized __DIR__ __FILE__ __LINE__
+            '''.split(), prefix=r'(?=])', Keyword, 'funcname'),
-            (r'(class|struct|union|type|alias|enum)(\s+)((?:[a-zA-Z_]\w*::)*)',
+            (r'(annotation|class|struct|union|type|alias|enum)(\s+)((?:[a-zA-Z_]\w*::)*)',
                 bygroups(Keyword, Text, Name.Namespace), 'classname'),
-            (r'(self|out|uninitialized)\b|(is_a|responds_to)\?', Keyword.Pseudo),
-            # macros
+            # https://crystal-lang.org/api/toplevel.html
             (words('''
-                debugger record pp assert_responds_to spawn parallel
-                getter setter property delegate def_hash def_equals def_equals_and_hash
-                forward_missing_to
-            '''.split(), suffix=r'\b'), Name.Builtin.Pseudo),
-            (r'getter[!?]|property[!?]|__(DIR|FILE|LINE)__\b', Name.Builtin.Pseudo),
+                instance_sizeof offsetof pointerof sizeof typeof
+            '''.split(), prefix=r'(?
-%<hello <"world">> # same as "hello <\"world\">"
+%(hello ("world")) # => "hello (\"world\")"
+%[hello ["world"]] # => "hello [\"world\"]"
+%{hello {"world"}} # => "hello {\"world\"}"
+%<hello <"world">> # => "hello <\"world\">"
+%|hello "world"| # => "hello \"world\""
 <<-XML
diff --git a/tests/test_crystal.py b/tests/test_crystal.py
index fb2503aa33..ae0dd7ffaa 100644
--- a/tests/test_crystal.py
+++ b/tests/test_crystal.py
@@ -104,6 +104,19 @@ def test_interpolation_nested_curly(lexer):
     assert list(lexer.get_tokens(fragment)) == tokens
 
 
+def test_escaped_interpolation(lexer):
+    fragment = '"\\#{a + b}"\n'
+    # i.e. no actual interpolation
+    tokens = [
+        (String.Double, '"'),
+        (String.Escape, '\\#'),
+        (String.Double, '{a + b}'),
+        (String.Double, '"'),
+        (Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
 def test_operator_methods(lexer):
     fragment = '([] of Int32).[]?(5)\n'
     tokens = [
@@ -113,7 +126,7 @@ def test_operator_methods(lexer):
         (Text, ' '),
         (Keyword, 'of'),
         (Text, ' '),
-        (Name.Builtin, 'Int32'),
+        (Name, 'Int32'),
         (Punctuation, ')'),
         (Operator, '.'),
         (Name.Operator, '[]?'),
@@ -155,6 +168,21 @@ def test_numbers(lexer):
     assert next(lexer.get_tokens(fragment + '\n'))[0] == Error
 
 
+def test_symbols(lexer):
+    for fragment in [':sym_bol', ':\u3042', ':question?']:
+        assert list(lexer.get_tokens(fragment + '\n')) == \
+            [(String.Symbol, fragment), (Text, '\n')]
+
+    fragment = ':"sym bol"\n'
+    tokens = [
+        (String.Symbol, ':"'),
+        (String.Symbol, 'sym bol'),
+        (String.Symbol, '"'),
+        (Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
 def test_chars(lexer):
     for fragment in ["'a'", "'я'", "'\\u{1234}'", "'\n'"]:
         assert list(lexer.get_tokens(fragment + '\n')) == \
@@ -162,6 +190,187 @@ def test_chars(lexer):
     assert next(lexer.get_tokens("'abc'"))[0] == Error
 
 
+def test_string_escapes(lexer):
+    for body in ['\\n', '\\a', '\\xff', '\\u1234', '\\000', '\\u{0}', '\\u{10AfF9}']:
+        fragment = '"a' + body + 'z"\n'
+        assert list(lexer.get_tokens(fragment)) == [
+            (String.Double, '"'),
+            (String.Double, 'a'),
+            (String.Escape, body),
+            (String.Double, 'z'),
+            (String.Double, '"'),
+            (Text, '\n'),
+        ]
+
+
+def test_empty_percent_strings(lexer):
+    for body in ['%()', '%[]', '%{}', '%<>', '%||']:
+        fragment = '(' + body + ')\n'
+        assert list(lexer.get_tokens(fragment)) == [
+            (Punctuation, '('),
+            (String.Other, body[:-1]),
+            (String.Other, body[-1]),
+            (Punctuation, ')'),
+            (Text, '\n'),
+        ]
+
+
+def test_percent_strings(lexer):
+    fragment = (
+        '%(hello ("world"))\n'
+        '%[hello ["world"]]\n'
+        '%{hello "world"}\n'
+        '%<hello <"world">>\n'
+        '%|hello "world"|\n')
+    tokens = [
+        (String.Other, '%('),
+        (String.Other, 'hello '),
+        (String.Other, '('),
+        (String.Other, '"world"'),
+        (String.Other, ')'),
+        (String.Other, ')'),
+        (Text, '\n'),
+        (String.Other, '%['),
+        (String.Other, 'hello '),
+        (String.Other, '['),
+        (String.Other, '"world"'),
+        (String.Other, ']'),
+        (String.Other, ']'),
+        (Text, '\n'),
+        (String.Other, '%{'),
+        (String.Other, 'hello "world"'),
+        (String.Other, '}'),
+        (Text, '\n'),
+        (String.Other, '%<'),
+        (String.Other, 'hello '),
+        (String.Other, '<'),
+        (String.Other, '"world"'),
+        (String.Other, '>'),
+        (String.Other, '>'),
+        (Text, '\n'),
+        (String.Other, '%|'),
+        (String.Other, 'hello "world"'),
+        (String.Other, '|'),
+        (Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_special_percent_strings(lexer):
+    fragment = '%Q(hello \\n #{name})\n%q(hello \\n #{name})\n%w(foo\\nbar baz)\n'
+    tokens = [
+        (String.Other, '%Q('),
+        (String.Other, 'hello '),
+        (String.Escape, '\\n'),
+        (String.Other, ' '),
+        (String.Interpol, '#{'),
+        (Name, 'name'),
+        (String.Interpol, '}'),
+        (String.Other, ')'),
+        (Text, '\n'),
+        # The ones below have no interpolation.
+        (String.Other, '%q('),
+        (String.Other, 'hello '),
+        (String.Other, '\\'),
+        (String.Other, 'n '),
+        (String.Other, '#'),
+        (String.Other, '{name}'),
+        (String.Other, ')'),
+        (Text, '\n'),
+        (String.Other, '%w('),
+        (String.Other, 'foo'),
+        (String.Other, '\\'),
+        (String.Other, 'nbar baz'),
+        (String.Other, ')'),
+        (Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_pseudo_keywords(lexer):
+    fragment = (
+        'def f(x : T, line = __LINE__) forall T\n'
+        'if x.is_a?(String)\n'
+        'pp! x\n'
+        'end\n'
+        'end\n')
+    tokens = [
+        (Keyword, 'def'),
+        (Text, ' '),
+        (Name.Function, 'f'),
+        (Punctuation, '('),
+        (Name, 'x'),
+        (Text, ' '),
+        (Punctuation, ':'),
+        (Text, ' '),
+        (Name, 'T'),
+        (Punctuation, ','),
+        (Text, ' '),
+        (Name, 'line'),
+        (Text, ' '),
+        (Operator, '='),
+        (Text, ' '),
+        (Keyword.Pseudo, '__LINE__'),
+        (Punctuation, ')'),
+        (Text, ' '),
+        (Keyword.Pseudo, 'forall'),
+        (Text, ' '),
+        (Name, 'T'),
+        (Text, '\n'),
+        (Keyword, 'if'),
+        (Text, ' '),
+        (Name, 'x'),
+        (Keyword.Pseudo, '.is_a?'),
+        (Punctuation, '('),
+        (Name, 'String'),
+        (Punctuation, ')'),
+        (Text, '\n'),
+        (Name.Builtin.Pseudo, 'pp!'),
+        (Text, ' '),
+        (Name, 'x'),
+        (Text, '\n'),
+        (Keyword, 'end'),
+        (Text, '\n'),
+        (Keyword, 'end'),
+        (Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_pseudo_builtins(lexer):
+    fragment = 'record Cls do\ndef_equals s\nend\n'
+    tokens = [
+        (Name.Builtin.Pseudo, 'record'),
+        (Text, ' '),
+        (Name, 'Cls'),
+        (Text, ' '),
+        (Keyword, 'do'),
+        (Text, '\n'),
+        (Name.Builtin.Pseudo, 'def_equals'),
+        (Text, ' '),
+        (Name, 's'),
+        (Text, '\n'),
+        (Keyword, 'end'),
+        (Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_constant_and_module(lexer):
+    fragment = 'HTTP\nHTTP::Server.new\n'
+    tokens = [
+        (Name.Constant, 'HTTP'),
+        (Text, '\n'),
+        (Name, 'HTTP'),
+        (Operator, '::'),
+        (Name, 'Server'),
+        (Operator, '.'),
+        (Name, 'new'),
+        (Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+
 def test_macro(lexer):
     fragment = (
         'def<=>(other : self) : Int\n'
@@ -179,12 +388,12 @@ def test_macro(lexer):
         (Text, ' '),
         (Punctuation, ':'),
         (Text, ' '),
-        (Keyword.Pseudo, 'self'),
+        (Keyword, 'self'),
         (Punctuation, ')'),
         (Text, ' '),
         (Punctuation, ':'),
         (Text, ' '),
-        (Name.Builtin, 'Int'),
+        (Name, 'Int'),
         (Text, '\n'),
         (String.Interpol, '{%'),
         (Keyword, 'for'),
@@ -276,7 +485,7 @@ def test_lib(lexer):
         (Text, ' '),
         (Punctuation, ':'),
         (Text, ' '),
-        (Name.Builtin, 'Void'),
+        (Name, 'Void'),
         (Operator, '*'),
         (Punctuation, ')'),
         (Text, ' '),
@@ -284,7 +493,7 @@ def test_lib(lexer):
         (Text, ' '),
         (Name, 'LibC'),
         (Operator, '::'),
-        (Name.Builtin, 'Int'),
+        (Name, 'Int'),
         (Text, '\n'),
         (Keyword, 'end'),
         (Text, '\n')
@@ -312,3 +521,21 @@ def test_escaped_bracestring(lexer):
         (Text, '\n'),
     ]
     assert list(lexer.get_tokens(fragment)) == tokens
+
+
+def test_annotation(lexer):
+    fragment = '@[FOO::Bar::Baz(opt: "xx")]\n'
+    tokens = [
+        (Operator, '@['),
+        (Name.Decorator, 'FOO::Bar::Baz'),
+        (Punctuation, '('),
+        (String.Symbol, 'opt'),
+        (Punctuation, ':'),
+        (Text, ' '),
+        (String.Double, '"'),
+        (String.Double, 'xx'),
+        (String.Double, '"'),
+        (Punctuation, ')'),
+        (Operator, ']'),
+        (Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
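
A quick way to sanity-check the percent-string change locally (not part of the patch; a sketch that assumes the patch has been applied to your Pygments checkout). The expected pairs simply mirror test_percent_strings above, using only the public Pygments API:

    # Sketch: verify that the new pipe-delimited percent-string is lexed
    # as a String.Other run, per test_percent_strings above.
    from pygments.lexers import CrystalLexer
    from pygments.token import String, Text

    lexer = CrystalLexer()

    # %|...| is handled by the new 'pi' delimiter pair added in the loop
    # above; Ruby-style arbitrary delimiters such as %*text* stay unhandled.
    assert list(lexer.get_tokens('%|hello "world"|\n')) == [
        (String.Other, '%|'),
        (String.Other, 'hello "world"'),
        (String.Other, '|'),
        (Text, '\n'),
    ]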
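
The escape-sequence and annotation changes can be spot-checked the same way; the expected pairs below are copied from test_string_escapes and test_annotation (again a local sketch, not part of the patch):

    from pygments.lexers import CrystalLexer
    from pygments.token import Name, String

    lexer = CrystalLexer()

    # Unicode escapes such as \u{10AfF9} now come out as one String.Escape.
    assert (String.Escape, '\\u{10AfF9}') in lexer.get_tokens('"a\\u{10AfF9}z"\n')

    # The whole namespaced path inside @[...] is one Name.Decorator token.
    assert (Name.Decorator, 'FOO::Bar::Baz') in \
        lexer.get_tokens('@[FOO::Bar::Baz(opt: "xx")]\n')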