Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Markdown lexer improvements #1623

Merged
merged 5 commits on Jan 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
20 changes: 9 additions & 11 deletions pygments/lexers/markup.py
Expand Up @@ -558,31 +558,29 @@ def _handle_codeblock(self, match):
# quote
(r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
# code block fenced by 3 backticks
(r'^(\s*```\n(.+\n)+\s*```$)', String.Backtick),
(r'^(\s*```\n[\w\W]*?^\s*```$\n)', String.Backtick),
# code block with language
(r'^(\s*```)(\w+)(\n)([\w\W]*?)(^\s*```$)', _handle_codeblock),
# code block indented with 4 spaces or 1 tab
(r'(\n\n)((\ {4}|\t)(.+\n)+)', bygroups(Text, String.Backtick)),
(r'^(\s*```)(\w+)(\n)([\w\W]*?)(^\s*```$\n)', _handle_codeblock),

include('inline'),
],
'inline': [
# escape
(r'\\.', Text),
# inline code
(r'([^`])(`[^`\n]+`)', bygroups(Text, String.Backtick)),
(r'([^`]?)(`[^`\n]+`)', bygroups(Text, String.Backtick)),
# warning: the following rules eat outer tags.
# eg. **foo _bar_ baz** => foo and baz are not recognized as bold
# bold fenced by '**'
(r'(\*\*[^* \n][^*\n]*\*\*)', bygroups(Generic.Strong)),
# # bold fenced by '__'
(r'(\_\_[^_ \n][^_\n]*\_\_)', bygroups(Generic.Strong)),
(r'([^\*]?)(\*\*[^* \n][^*\n]*\*\*)', bygroups(Text, Generic.Strong)),
# bold fenced by '__'
(r'([^_]?)(__[^_ \n][^_\n]*__)', bygroups(Text, Generic.Strong)),
# italics fenced by '*'
(r'(\*[^* \n][^*\n]*\*)', bygroups(Generic.Emph)),
(r'([^\*]?)(\*[^* \n][^*\n]*\*)', bygroups(Text, Generic.Emph)),
# italics fenced by '_'
(r'(\_[^_ \n][^_\n]*\_)', bygroups(Generic.Emph)),
(r'([^_]?)(_[^_ \n][^_\n]*_)', bygroups(Text, Generic.Emph)),
# strikethrough
(r'([^~]*)(~~[^~]+~~)', bygroups(Text, Generic.Deleted)),
(r'([^~]?)(~~[^~ \n][^~\n]*~~)', bygroups(Text, Generic.Deleted)),
# mentions and topics (twitter and github stuff)
(r'[@#][\w/:]+', Name.Entity),
# (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
Expand Down
136 changes: 106 additions & 30 deletions tests/test_markdown_lexer.py
Expand Up @@ -246,6 +246,20 @@ def test_bulleted_list(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

fragment = '```\ncode\n```\n* *foo*\n* bar'
tokens = [
(String.Backtick, '```\ncode\n```\n'),
(Keyword, '*'),
(Token.Text, ' '),
(Generic.Emph, '*foo*'),
(Token.Text, '\n'),
(Keyword, '*'),
(Token.Text, ' '),
(Token.Text, 'bar'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_numbered_list(lexer):
fragment = '1. foo\n2. bar'
Expand Down Expand Up @@ -287,20 +301,19 @@ def test_invalid_code_block(lexer):

def test_code_block_fenced_by_backticks(lexer):
fragments = (
'```\ncode\n```',
'```\nmulti\n`line`\ncode\n```',
'```\ncode\n```\n',
'```\nmulti\n`line`\ncode\n```\n',
)
for fragment in fragments:
tokens = [
(String.Backtick, fragment),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_code_block_with_language(lexer):
fragments = (
'```python\nimport this\n```',
'```python\nimport this\n```\n',
)
for fragment in fragments:
tokens = [
Expand All @@ -311,32 +324,7 @@ def test_code_block_with_language(lexer):
(Token.Text, ' '),
(Token.Name.Namespace, 'this'),
(Token.Text, '\n'),
(String.Backtick, '```'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_code_indented_with_spaces(lexer):
fragments = (
'sample:\n\n code\n',
)
for fragment in fragments:
tokens = [
(Token.Text, 'sample:'),
(Token.Text, '\n\n'),
(String.Backtick, ' code\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens

fragments = (
'sample:\n\n\tcode\n',
)
for fragment in fragments:
tokens = [
(Token.Text, 'sample:'),
(Token.Text, '\n\n'),
(String.Backtick, '\tcode\n'),
(String.Backtick, '```\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens

Expand Down Expand Up @@ -368,6 +356,58 @@ def test_inline_code(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

fragment = '* `code`'
tokens = [
(Token.Keyword, '*'),
(Token.Text, ' '),
(String.Backtick, '`code`'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens

fragment = '```\ncode\n```\n* nocode\n* `code`'
tokens = [
(String.Backtick, '```\ncode\n```\n'),
(Token.Keyword, '*'),
(Token.Text, ' '),
(Token.Text, 'nocode'),
(Token.Text, '\n'),
(Token.Keyword, '*'),
(Token.Text, ' '),
(String.Backtick, '`code`'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens

fragment = '- `code`'
tokens = [
(Token.Keyword, '-'),
(Token.Text, ' '),
(String.Backtick, '`code`'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens

fragment = '1. `code`'
tokens = [
(Token.Keyword, '1.'),
(Token.Text, ' '),
(String.Backtick, '`code`'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens

fragment = 'code (`in brackets`)'
tokens = [
(Token.Text, 'code'),
(Token.Text, ' '),
(Token.Text, '('),
(String.Backtick, '`in brackets`'),
(Token.Text, ')'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_invalid_bold(lexer):
fragments = (
Expand All @@ -390,6 +430,15 @@ def test_bold_fenced_by_asterisk(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

fragment = '(**bold**)'
tokens = [
(Token.Text, '('),
(Generic.Strong, '**bold**'),
(Token.Text, ')'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_bold_fenced_by_underscore(lexer):
fragment = '__bold__'
Expand All @@ -399,6 +448,15 @@ def test_bold_fenced_by_underscore(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

fragment = '(__bold__)'
tokens = [
(Token.Text, '('),
(Generic.Strong, '__bold__'),
(Token.Text, ')'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_invalid_italics(lexer):
fragments = (
Expand All @@ -421,6 +479,15 @@ def test_italics_fenced_by_asterisk(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

fragment = '(*italics*)'
tokens = [
(Token.Text, '('),
(Generic.Emph, '*italics*'),
(Token.Text, ')'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_italics_fenced_by_underscore(lexer):
fragment = '_italics_'
Expand All @@ -430,6 +497,15 @@ def test_italics_fenced_by_underscore(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

fragment = '(_italics_)'
tokens = [
(Token.Text, '('),
(Generic.Emph, '_italics_'),
(Token.Text, ')'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_escape_italics(lexer):
fragments = (
Expand Down