Markdown lexer improvements (#1623)
* improve fenced code recognition for markdown lexer

* improve inline code detection

* improve detection of some Markdown keywords

* remove Markdown recognition of code indented by 4 spaces, as reliable detection is not possible with a regex (a counter-example is sketched next to the removed test below)
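As a rough end-to-end illustration of the first two bullets, a fenced block that contains a blank line should now come back as a single String.Backtick token. A minimal sketch, assuming a Pygments checkout with this commit applied; the fragment and assertion are illustrative, not taken from the test suite:

from pygments.lexers import MarkdownLexer
from pygments.token import String

# Before this commit, the (.+\n)+ body pattern could not span the empty
# line, so the fence was not recognized as one code block.
fragment = '```\nfirst line\n\nsecond line\n```\n'
tokens = list(MarkdownLexer().get_tokens(fragment))
assert tokens[0] == (String.Backtick, fragment)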
Leistungsabfall committed Jan 6, 2021
1 parent 9b0218e commit 811f1a6
Showing 2 changed files with 115 additions and 41 deletions.
20 changes: 9 additions & 11 deletions pygments/lexers/markup.py
@@ -558,31 +558,29 @@ def _handle_codeblock(self, match):
             # quote
             (r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
             # code block fenced by 3 backticks
-            (r'^(\s*```\n(.+\n)+\s*```$)', String.Backtick),
+            (r'^(\s*```\n[\w\W]*?^\s*```$\n)', String.Backtick),
             # code block with language
-            (r'^(\s*```)(\w+)(\n)([\w\W]*?)(^\s*```$)', _handle_codeblock),
-            # code block indented with 4 spaces or 1 tab
-            (r'(\n\n)((\ {4}|\t)(.+\n)+)', bygroups(Text, String.Backtick)),
+            (r'^(\s*```)(\w+)(\n)([\w\W]*?)(^\s*```$\n)', _handle_codeblock),

             include('inline'),
         ],
         'inline': [
             # escape
             (r'\\.', Text),
             # inline code
-            (r'([^`])(`[^`\n]+`)', bygroups(Text, String.Backtick)),
+            (r'([^`]?)(`[^`\n]+`)', bygroups(Text, String.Backtick)),
             # warning: the following rules eat outer tags.
             # eg. **foo _bar_ baz** => foo and baz are not recognized as bold
             # bold fenced by '**'
-            (r'(\*\*[^* \n][^*\n]*\*\*)', bygroups(Generic.Strong)),
-            # # bold fenced by '__'
-            (r'(\_\_[^_ \n][^_\n]*\_\_)', bygroups(Generic.Strong)),
+            (r'([^\*]?)(\*\*[^* \n][^*\n]*\*\*)', bygroups(Text, Generic.Strong)),
+            # bold fenced by '__'
+            (r'([^_]?)(__[^_ \n][^_\n]*__)', bygroups(Text, Generic.Strong)),
             # italics fenced by '*'
-            (r'(\*[^* \n][^*\n]*\*)', bygroups(Generic.Emph)),
+            (r'([^\*]?)(\*[^* \n][^*\n]*\*)', bygroups(Text, Generic.Emph)),
             # italics fenced by '_'
-            (r'(\_[^_ \n][^_\n]*\_)', bygroups(Generic.Emph)),
+            (r'([^_]?)(_[^_ \n][^_\n]*_)', bygroups(Text, Generic.Emph)),
             # strikethrough
-            (r'([^~]*)(~~[^~]+~~)', bygroups(Text, Generic.Deleted)),
+            (r'([^~]?)(~~[^~ \n][^~\n]*~~)', bygroups(Text, Generic.Deleted)),
             # mentions and topics (twitter and github stuff)
             (r'[@#][\w/:]+', Name.Entity),
             # (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
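The fenced-code change above swaps the line-by-line body pattern (.+\n)+ for a lazy [\w\W]*? that runs up to the closing fence, and it also consumes the fence's trailing newline, so line-anchored rules (list bullets, headings) can match right after the block. A quick comparison of the two patterns with the re module (illustrative only; the lexer compiles its rules with re.MULTILINE, mirrored here with re.M):

import re

# Old rule: every body line must be non-empty, so a blank line inside
# the fence breaks the match.
old = re.compile(r'^(\s*```\n(.+\n)+\s*```$)', re.M)
# New rule: [\w\W]*? lazily spans anything, blank lines included, up to
# the closing fence, and the final \n consumes the fence's line ending.
new = re.compile(r'^(\s*```\n[\w\W]*?^\s*```$\n)', re.M)

text = '```\nfirst\n\nsecond\n```\n'
print(bool(old.match(text)))  # False
print(bool(new.match(text)))  # True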
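The inline rules all follow one shared idea: the leading "any character that is not the delimiter" group becomes optional ([^`]?, [^\*]?, [^_]?, [^~]?), so inline code, bold, italics, and strikethrough now match at the start of a line or right after punctuation such as an opening bracket. Sketched with the inline-code pair (again illustrative; re.match is used because the lexer applies rules at the current position):

import re

# Old rule: one mandatory non-backtick character before the span.
old = re.compile(r'([^`])(`[^`\n]+`)')
# New rule: the prefix character is optional.
new = re.compile(r'([^`]?)(`[^`\n]+`)')

line = '`code` first on the line'
print(old.match(line))           # None: nothing can fill the prefix group
print(new.match(line).group(2))  # '`code`'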
136 changes: 106 additions & 30 deletions tests/test_markdown_lexer.py
@@ -246,6 +246,20 @@ def test_bulleted_list(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens

+    fragment = '```\ncode\n```\n* *foo*\n* bar'
+    tokens = [
+        (String.Backtick, '```\ncode\n```\n'),
+        (Keyword, '*'),
+        (Token.Text, ' '),
+        (Generic.Emph, '*foo*'),
+        (Token.Text, '\n'),
+        (Keyword, '*'),
+        (Token.Text, ' '),
+        (Token.Text, 'bar'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+

 def test_numbered_list(lexer):
     fragment = '1. foo\n2. bar'
@@ -287,20 +301,19 @@ def test_invalid_code_block(lexer):

 def test_code_block_fenced_by_backticks(lexer):
     fragments = (
-        '```\ncode\n```',
-        '```\nmulti\n`line`\ncode\n```',
+        '```\ncode\n```\n',
+        '```\nmulti\n`line`\ncode\n```\n',
     )
     for fragment in fragments:
         tokens = [
             (String.Backtick, fragment),
-            (Token.Text, '\n'),
         ]
         assert list(lexer.get_tokens(fragment)) == tokens


 def test_code_block_with_language(lexer):
     fragments = (
-        '```python\nimport this\n```',
+        '```python\nimport this\n```\n',
     )
     for fragment in fragments:
         tokens = [
@@ -311,32 +324,7 @@ def test_code_block_with_language(lexer):
             (Token.Text, ' '),
             (Token.Name.Namespace, 'this'),
             (Token.Text, '\n'),
-            (String.Backtick, '```'),
-            (Token.Text, '\n'),
+            (String.Backtick, '```\n'),
         ]
         assert list(lexer.get_tokens(fragment)) == tokens


-def test_code_indented_with_spaces(lexer):
-    fragments = (
-        'sample:\n\n    code\n',
-    )
-    for fragment in fragments:
-        tokens = [
-            (Token.Text, 'sample:'),
-            (Token.Text, '\n\n'),
-            (String.Backtick, '    code\n'),
-        ]
-        assert list(lexer.get_tokens(fragment)) == tokens
-
-    fragments = (
-        'sample:\n\n\tcode\n',
-    )
-    for fragment in fragments:
-        tokens = [
-            (Token.Text, 'sample:'),
-            (Token.Text, '\n\n'),
-            (String.Backtick, '\tcode\n'),
-        ]
-        assert list(lexer.get_tokens(fragment)) == tokens
-

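The removed test above documents why the 4-space rule had to go: from a regex alone there is no way to tell whether indented text is a code block or the continuation of an enclosing construct. One counter-example, illustrative, using the rule that this commit deletes from markup.py:

import re

# The removed rule: anything indented by 4 spaces or a tab after a
# blank line was highlighted as code.
indented = re.compile(r'(\n\n)((\ {4}|\t)(.+\n)+)')

# In Markdown, this indented line continues the list item above; it is
# not a code block, but the pattern cannot tell the difference.
text = '* item\n\n    still part of the list item\n'
print(bool(indented.search(text)))  # True -> would be mis-highlighted as code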
@@ -368,6 +356,58 @@ def test_inline_code(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens

+    fragment = '* `code`'
+    tokens = [
+        (Token.Keyword, '*'),
+        (Token.Text, ' '),
+        (String.Backtick, '`code`'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '```\ncode\n```\n* nocode\n* `code`'
+    tokens = [
+        (String.Backtick, '```\ncode\n```\n'),
+        (Token.Keyword, '*'),
+        (Token.Text, ' '),
+        (Token.Text, 'nocode'),
+        (Token.Text, '\n'),
+        (Token.Keyword, '*'),
+        (Token.Text, ' '),
+        (String.Backtick, '`code`'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '- `code`'
+    tokens = [
+        (Token.Keyword, '-'),
+        (Token.Text, ' '),
+        (String.Backtick, '`code`'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = '1. `code`'
+    tokens = [
+        (Token.Keyword, '1.'),
+        (Token.Text, ' '),
+        (String.Backtick, '`code`'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+
+    fragment = 'code (`in brackets`)'
+    tokens = [
+        (Token.Text, 'code'),
+        (Token.Text, ' '),
+        (Token.Text, '('),
+        (String.Backtick, '`in brackets`'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+

 def test_invalid_bold(lexer):
     fragments = (
@@ -390,6 +430,15 @@ def test_bold_fenced_by_asterisk(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens

+    fragment = '(**bold**)'
+    tokens = [
+        (Token.Text, '('),
+        (Generic.Strong, '**bold**'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+

 def test_bold_fenced_by_underscore(lexer):
     fragment = '__bold__'
@@ -399,6 +448,15 @@ def test_bold_fenced_by_underscore(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens

+    fragment = '(__bold__)'
+    tokens = [
+        (Token.Text, '('),
+        (Generic.Strong, '__bold__'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+

 def test_invalid_italics(lexer):
     fragments = (
@@ -421,6 +479,15 @@ def test_italics_fenced_by_asterisk(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens

+    fragment = '(*italics*)'
+    tokens = [
+        (Token.Text, '('),
+        (Generic.Emph, '*italics*'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+

 def test_italics_fenced_by_underscore(lexer):
     fragment = '_italics_'
@@ -430,6 +497,15 @@ def test_italics_fenced_by_underscore(lexer):
     ]
     assert list(lexer.get_tokens(fragment)) == tokens

+    fragment = '(_italics_)'
+    tokens = [
+        (Token.Text, '('),
+        (Generic.Emph, '_italics_'),
+        (Token.Text, ')'),
+        (Token.Text, '\n'),
+    ]
+    assert list(lexer.get_tokens(fragment)) == tokens
+

 def test_escape_italics(lexer):
     fragments = (
