Simplify markdown strikethrough regex and leave text parsing to the lower catch-all rules #1618

Closed · wants to merge 3 commits
4 changes: 2 additions & 2 deletions pygments/lexers/markup.py
@@ -582,7 +582,7 @@ def _handle_codeblock(self, match):
# italics fenced by '_'
(r'(\_[^_ \n][^_\n]*\_)', bygroups(Generic.Emph)),
# strikethrough
-(r'([^~]*)(~~[^~]+~~)', bygroups(Text, Generic.Deleted)),
+(r'(~~[^~]+~~)', bygroups(Generic.Deleted)),
# mentions and topics (twitter and github stuff)
(r'[@#][\w/:]+', Name.Entity),
# (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
@@ -597,7 +597,7 @@ def _handle_codeblock(self, match):
bygroups(Text, Name.Label, Text, Name.Attribute)),

# general text, must come last!
-(r'[^\\\s]+', Text),
+(r'[^\\\s`~*]+', Text),
(r'.', Text),
],
}
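The motivation is easiest to see outside the lexer. Below is a minimal standalone sketch (hypothetical, using plain `re` rather than Pygments' RegexLexer machinery) contrasting the two strategies: the old rule scanned and captured the leading text itself, while the new pair lets the narrowed catch-all consume plain words and leaves `~~...~~` to match exactly at a fence.

```python
import re

# Hypothetical standalone demo; the real rules live inside Pygments'
# RegexLexer, which always matches at the current position.
OLD_STRIKE = re.compile(r'([^~]*)(~~[^~]+~~)')   # old: captures the prefix too
NEW_STRIKE = re.compile(r'(~~[^~]+~~)')          # new: matches only at a fence
NEW_TEXT = re.compile(r'[^\\\s`~*]+')            # new catch-all: stops at markup chars

line = 'not struck ~~struck through~~ tail'

# Old rule: a single match swallows the prefix and the strikethrough span,
# so the prefix gets rescanned on long lines that contain no '~~' at all.
print(OLD_STRIKE.match(line).groups())
# -> ('not struck ', '~~struck through~~')

# New rules: the catch-all consumes words until it hits '~', then the
# strikethrough rule fires exactly at the '~~' fence.
pos = 0
while pos < len(line):
    if m := NEW_STRIKE.match(line, pos):
        print('Deleted:', m.group())
    elif m := NEW_TEXT.match(line, pos):
        print('Text:', m.group())
    else:
        pos += 1          # whitespace and lone markup chars fall through
        continue
    pos = m.end()
```

Keeping each rule anchored at the current position is what the RegexLexer model expects, and it avoids rescanning the prefix for every candidate token.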
83 changes: 83 additions & 0 deletions tests/test_markdown_lexer.py
@@ -7,6 +7,8 @@
:license: BSD, see LICENSE for details.
"""

import time

import pytest
from pygments.token import Generic, Token, String, Keyword, Name

@@ -390,6 +392,18 @@ def test_bold_fenced_by_asterisk(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

def test_embedded_bold(lexer):
fragment = 'embedded**bold**in text'
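# '*' is excluded from the general text rule ([^\\\s`~*]+), so the run of
# plain text ends right before '**' and the bold rule can match mid-word.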
tokens = [
(Token.Text, 'embedded'),
(Generic.Strong, '**bold**'),
(Token.Text, 'in'),
(Token.Text, ' '),
(Token.Text, 'text'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_bold_fenced_by_underscore(lexer):
fragment = '__bold__'
@@ -399,6 +413,16 @@ def test_bold_fenced_by_underscore(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

def test_embedded_underscore_no_bold(lexer):
fragment = 'embedded__bold__in text'
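# '_' is deliberately absent from the text rule's excluded characters, so
# the whole run is consumed as one Text token and no bold is emitted
# (intra-word '__' does not open emphasis in CommonMark/GFM).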
tokens = [
(Token.Text, 'embedded__bold__in'),
(Token.Text, ' '),
(Token.Text, 'text'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_invalid_italics(lexer):
fragments = (
@@ -421,6 +445,19 @@ def test_italics_fenced_by_asterisk(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

def test_embedded_italics(lexer):
fragment = 'embedded*italics*in text'
tokens = [
(Token.Text, 'embedded'),
(Generic.Emph, '*italics*'),
(Token.Text, 'in'),
(Token.Text, ' '),
(Token.Text, 'text'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_italics_fenced_by_underscore(lexer):
fragment = '_italics_'
@@ -430,6 +467,16 @@ def test_italics_fenced_by_underscore(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

def test_embedded_underscore_no_italics(lexer):
fragment = 'embedded_italics_in text'
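# Same reasoning as test_embedded_underscore_no_bold above: '_' stays
# inside the plain-text run, so no emphasis token is produced.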
tokens = [
(Token.Text, 'embedded_italics_in'),
(Token.Text, ' '),
(Token.Text, 'text'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_escape_italics(lexer):
fragments = (
@@ -484,6 +531,20 @@ def test_strikethrough(lexer):
]
assert list(lexer.get_tokens(fragment)) == tokens

def test_embedded_strikethrough(lexer):
fragment = 'not striked~~striked through~~not striked'
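# With the simplified rule the prefix is no longer captured by the
# strikethrough regex itself; the general text rule emits it word by word.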
tokens = [
(Token.Text, 'not'),
(Token.Text, ' '),
(Token.Text, 'striked'),
(Generic.Deleted, '~~striked through~~'),
(Token.Text, 'not'),
(Token.Text, ' '),
(Token.Text, 'striked'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens


def test_mentions(lexer):
fragment = 'note for @me:'
@@ -559,3 +620,25 @@ def test_reference_style_links(lexer):
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens

def test_simple_text(lexer):
fragment = 'this is simple text'
tokens = [
(Token.Text, 'this'),
(Token.Text, ' '),
(Token.Text, 'is'),
(Token.Text, ' '),
(Token.Text, 'simple'),
(Token.Text, ' '),
(Token.Text, 'text'),
(Token.Text, '\n'),
]
assert list(lexer.get_tokens(fragment)) == tokens

def test_long_line_perf(lexer):
# Note: the 5-second budget is loose enough to pass even under PyPy,
# so under CPython it provides only weak perf-regression coverage.
fragment = "this is text\n"*1024
start_time = time.time()
assert all(x[0] == Token.Text for x in lexer.get_tokens(fragment))
assert time.time() - start_time < 5
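
For context, a rough way to eyeball the scaling locally (a hypothetical helper, not part of the test suite; exact numbers depend on the machine):

```python
import time
from pygments.lexers.markup import MarkdownLexer

def time_lex(fragment, lexer=MarkdownLexer()):
    """Crude wall-clock timing of one full tokenization pass."""
    start = time.time()
    list(lexer.get_tokens(fragment))
    return time.time() - start

# Pre-patch, doubling the input size roughly quadruples the time
# (the ([^~]*) prefix scan made plain text quadratic); post-patch
# the growth is roughly linear.
for n in (1024, 2048, 4096):
    print(n, time_lex('this is text\n' * n))
```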