From 811f1a6d63291e4cf086e76a909e5de6f25fe9ba Mon Sep 17 00:00:00 2001
From: Leistungsabfall
Date: Wed, 6 Jan 2021 12:08:32 +0100
Subject: [PATCH] Markdown lexer improvements (#1623)

* improve fenced code recognition for markdown lexer

* improve inline code detection

* improve detection of some Markdown keywords

* remove Markdown recognition of code indented by 4 spaces as reliable detection is not possible with regex
---
 pygments/lexers/markup.py    |  20 +++---
 tests/test_markdown_lexer.py | 136 +++++++++++++++++++++++++++--------
 2 files changed, 115 insertions(+), 41 deletions(-)

diff --git a/pygments/lexers/markup.py b/pygments/lexers/markup.py
index a3eb6956dc..88e09ec22f 100644
--- a/pygments/lexers/markup.py
+++ b/pygments/lexers/markup.py
@@ -558,11 +558,9 @@ def _handle_codeblock(self, match):
             # quote
             (r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
             # code block fenced by 3 backticks
-            (r'^(\s*```\n(.+\n)+\s*```$)', String.Backtick),
+            (r'^(\s*```\n[\w\W]*?^\s*```$\n)', String.Backtick),
             # code block with language
-            (r'^(\s*```)(\w+)(\n)([\w\W]*?)(^\s*```$)', _handle_codeblock),
-            # code block indented with 4 spaces or 1 tab
-            (r'(\n\n)((\ {4}|\t)(.+\n)+)', bygroups(Text, String.Backtick)),
+            (r'^(\s*```)(\w+)(\n)([\w\W]*?)(^\s*```$\n)', _handle_codeblock),
 
             include('inline'),
         ],
@@ -570,19 +568,19 @@ def _handle_codeblock(self, match):
             # escape
             (r'\\.', Text),
             # inline code
-            (r'([^`])(`[^`\n]+`)', bygroups(Text, String.Backtick)),
+            (r'([^`]?)(`[^`\n]+`)', bygroups(Text, String.Backtick)),
             # warning: the following rules eat outer tags.
             # eg. **foo _bar_ baz** => foo and baz are not recognized as bold
             # bold fenced by '**'
-            (r'(\*\*[^* \n][^*\n]*\*\*)', bygroups(Generic.Strong)),
-            # # bold fenced by '__'
-            (r'(\_\_[^_ \n][^_\n]*\_\_)', bygroups(Generic.Strong)),
+            (r'([^\*]?)(\*\*[^* \n][^*\n]*\*\*)', bygroups(Text, Generic.Strong)),
+            # bold fenced by '__'
+            (r'([^_]?)(__[^_ \n][^_\n]*__)', bygroups(Text, Generic.Strong)),
             # italics fenced by '*'
-            (r'(\*[^* \n][^*\n]*\*)', bygroups(Generic.Emph)),
+            (r'([^\*]?)(\*[^* \n][^*\n]*\*)', bygroups(Text, Generic.Emph)),
             # italics fenced by '_'
-            (r'(\_[^_ \n][^_\n]*\_)', bygroups(Generic.Emph)),
+            (r'([^_]?)(_[^_ \n][^_\n]*_)', bygroups(Text, Generic.Emph)),
             # strikethrough
-            (r'([^~]*)(~~[^~]+~~)', bygroups(Text, Generic.Deleted)),
+            (r'([^~]?)(~~[^~ \n][^~\n]*~~)', bygroups(Text, Generic.Deleted)),
             # mentions and topics (twitter and github stuff)
             (r'[@#][\w/:]+', Name.Entity),
             # (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
diff --git a/tests/test_markdown_lexer.py b/tests/test_markdown_lexer.py
index 24c187392d..7bffa9ea75 100644
--- a/tests/test_markdown_lexer.py
+++ b/tests/test_markdown_lexer.py
@@ -246,6 +246,20 @@ def test_bulleted_list(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens
 
+    fragment = '```\ncode\n```\n* *foo*\n* bar'
+    tokens = [
+        (String.Backtick, '```\ncode\n```\n'),
+        (Keyword, '*'),
+        (Token.Text, ' '),
+        (Generic.Emph, '*foo*'),
+        (Token.Text, '\n'),
+        (Keyword, '*'),
+        (Token.Text, ' '),
+        (Token.Text, 'bar'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
 
 def test_numbered_list(lexer):
     fragment = '1. foo\n2. bar'
@@ -287,20 +301,19 @@ def test_invalid_code_block(lexer):
 
 def test_code_block_fenced_by_backticks(lexer):
     fragments = (
-        '```\ncode\n```',
-        '```\nmulti\n`line`\ncode\n```',
+        '```\ncode\n```\n',
+        '```\nmulti\n`line`\ncode\n```\n',
     )
     for fragment in fragments:
         tokens = [
             (String.Backtick, fragment),
-            (Token.Text, '\n'),
         ]
         assert list(lexer.get_tokens(fragment)) == tokens
 
 
 def test_code_block_with_language(lexer):
     fragments = (
-        '```python\nimport this\n```',
+        '```python\nimport this\n```\n',
     )
     for fragment in fragments:
         tokens = [
@@ -311,32 +324,7 @@ def test_code_block_with_language(lexer):
             (Token.Text, ' '),
             (Token.Name.Namespace, 'this'),
             (Token.Text, '\n'),
-            (String.Backtick, '```'),
-            (Token.Text, '\n'),
-        ]
-        assert list(lexer.get_tokens(fragment)) == tokens
-
-
-def test_code_indented_with_spaces(lexer):
-    fragments = (
-        'sample:\n\n    code\n',
-    )
-    for fragment in fragments:
-        tokens = [
-            (Token.Text, 'sample:'),
-            (Token.Text, '\n\n'),
-            (String.Backtick, '    code\n'),
-        ]
-        assert list(lexer.get_tokens(fragment)) == tokens
-
-    fragments = (
-        'sample:\n\n\tcode\n',
-    )
-    for fragment in fragments:
-        tokens = [
-            (Token.Text, 'sample:'),
-            (Token.Text, '\n\n'),
-            (String.Backtick, '\tcode\n'),
+            (String.Backtick, '```\n'),
         ]
         assert list(lexer.get_tokens(fragment)) == tokens
 
@@ -368,6 +356,58 @@ def test_inline_code(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens
 
+    fragment = '* `code`'
+    tokens = [
+        (Token.Keyword, '*'),
+        (Token.Text, ' '),
+        (String.Backtick, '`code`'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '```\ncode\n```\n* nocode\n* `code`'
+    tokens = [
+        (String.Backtick, '```\ncode\n```\n'),
+        (Token.Keyword, '*'),
+        (Token.Text, ' '),
+        (Token.Text, 'nocode'),
+        (Token.Text, '\n'),
+        (Token.Keyword, '*'),
+        (Token.Text, ' '),
+        (String.Backtick, '`code`'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '- `code`'
+    tokens = [
+        (Token.Keyword, '-'),
+        (Token.Text, ' '),
+        (String.Backtick, '`code`'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '1. `code`'
+    tokens = [
+        (Token.Keyword, '1.'),
+        (Token.Text, ' '),
+        (String.Backtick, '`code`'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = 'code (`in brackets`)'
+    tokens = [
+        (Token.Text, 'code'),
+        (Token.Text, ' '),
+        (Token.Text, '('),
+        (String.Backtick, '`in brackets`'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
 
 def test_invalid_bold(lexer):
     fragments = (
@@ -390,6 +430,15 @@ def test_bold_fenced_by_asterisk(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens
 
+    fragment = '(**bold**)'
+    tokens = [
+        (Token.Text, '('),
+        (Generic.Strong, '**bold**'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
 
 def test_bold_fenced_by_underscore(lexer):
     fragment = '__bold__'
@@ -399,6 +448,15 @@ def test_bold_fenced_by_underscore(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens
 
+    fragment = '(__bold__)'
+    tokens = [
+        (Token.Text, '('),
+        (Generic.Strong, '__bold__'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
 
 def test_invalid_italics(lexer):
     fragments = (
@@ -421,6 +479,15 @@ def test_italics_fenced_by_asterisk(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens
 
+    fragment = '(*italics*)'
+    tokens = [
+        (Token.Text, '('),
+        (Generic.Emph, '*italics*'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
 
 def test_italics_fenced_by_underscore(lexer):
     fragment = '_italics_'
@@ -430,6 +497,15 @@ def test_italics_fenced_by_underscore(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens
 
+    fragment = '(_italics_)'
+    tokens = [
+        (Token.Text, '('),
+        (Generic.Emph, '_italics_'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
 
 def test_escape_italics(lexer):
     fragments = (
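
Note: a quick way to exercise the new rules outside the test suite is the public get_tokens API that the tests above already use. The snippet below is a minimal sketch (assuming a pygments checkout with this patch applied); the fragment and the expected tokens mirror the new test_inline_code case.

    from pygments.lexers.markup import MarkdownLexer
    from pygments.token import String

    lexer = MarkdownLexer()

    # A fenced block followed by a bullet list: the non-greedy [\w\W]*?
    # in the new fence rule stops at the first closing ``` line, so the
    # list after it is lexed as Markdown instead of being swallowed by
    # the code block.
    fragment = '```\ncode\n```\n* nocode\n* `code`'
    tokens = list(lexer.get_tokens(fragment))

    # The whole fence comes back as a single String.Backtick token, and
    # the trailing `code` is recognized as inline code because the
    # inline-code rule now only requires an optional ([^`]?) prefix.
    assert tokens[0] == (String.Backtick, '```\ncode\n```\n')
    assert (String.Backtick, '`code`') in tokens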