Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backport urlize speedup #1343

Merged
merged 1 commit into from Jan 31, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGES.rst
@@ -1,5 +1,15 @@
.. currentmodule:: jinja2

Version 2.11.3
--------------

Unreleased

- Improve the speed of the ``urlize`` filter by reducing regex
backtracking. Email matching requires a word character at the start
of the domain part, and only word characters in the TLD. :pr:`1343`


Version 2.11.2
--------------

Expand Down
107 changes: 56 additions & 51 deletions src/jinja2/utils.py
Expand Up @@ -6,6 +6,8 @@
from collections import deque
from random import choice
from random import randrange
from string import ascii_letters as _letters
from string import digits as _digits
from threading import Lock

from markupsafe import escape
Expand All @@ -16,20 +18,6 @@
from ._compat import text_type
from ._compat import url_quote

_word_split_re = re.compile(r"(\s+)")
_punctuation_re = re.compile(
"^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$"
% (
"|".join(map(re.escape, ("(", "<", "&lt;"))),
"|".join(map(re.escape, (".", ",", ")", ">", "\n", "&gt;"))),
)
)
_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
_striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)")
_entity_re = re.compile(r"&([^;]+);")
_letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
_digits = "0123456789"

# special singleton representing missing values for the runtime
missing = type("MissingType", (), {"__repr__": lambda x: "missing"})()

Expand Down Expand Up @@ -210,48 +198,65 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
and (x[:limit] + (len(x) >= limit and "..." or ""))
or x
)
words = _word_split_re.split(text_type(escape(text)))
words = re.split(r"(\s+)", text_type(escape(text)))
rel_attr = rel and ' rel="%s"' % text_type(escape(rel)) or ""
target_attr = target and ' target="%s"' % escape(target) or ""

for i, word in enumerate(words):
match = _punctuation_re.match(word)
head, middle, tail = "", word, ""
match = re.match(r"^([(<]|&lt;)+", middle)

if match:
lead, middle, trail = match.groups()
if middle.startswith("www.") or (
"@" not in middle
and not middle.startswith("http://")
and not middle.startswith("https://")
and len(middle) > 0
and middle[0] in _letters + _digits
and (
middle.endswith(".org")
or middle.endswith(".net")
or middle.endswith(".com")
)
):
middle = '<a href="http://%s"%s%s>%s</a>' % (
middle,
rel_attr,
target_attr,
trim_url(middle),
)
if middle.startswith("http://") or middle.startswith("https://"):
middle = '<a href="%s"%s%s>%s</a>' % (
middle,
rel_attr,
target_attr,
trim_url(middle),
)
if (
"@" in middle
and not middle.startswith("www.")
and ":" not in middle
and _simple_email_re.match(middle)
):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
if lead + middle + trail != word:
words[i] = lead + middle + trail
head = match.group()
middle = middle[match.end() :]

# Unlike lead, which is anchored to the start of the string,
# need to check that the string ends with any of the characters
# before trying to match all of them, to avoid backtracking.
if middle.endswith((")", ">", ".", ",", "\n", "&gt;")):
match = re.search(r"([)>.,\n]|&gt;)+$", middle)

if match:
tail = match.group()
middle = middle[: match.start()]

if middle.startswith("www.") or (
"@" not in middle
and not middle.startswith("http://")
and not middle.startswith("https://")
and len(middle) > 0
and middle[0] in _letters + _digits
and (
middle.endswith(".org")
or middle.endswith(".net")
or middle.endswith(".com")
)
):
middle = '<a href="http://%s"%s%s>%s</a>' % (
middle,
rel_attr,
target_attr,
trim_url(middle),
)

if middle.startswith("http://") or middle.startswith("https://"):
middle = '<a href="%s"%s%s>%s</a>' % (
middle,
rel_attr,
target_attr,
trim_url(middle),
)

if (
"@" in middle
and not middle.startswith("www.")
and ":" not in middle
and re.match(r"^\S+@\w[\w.-]*\.\w+$", middle)
):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)

words[i] = head + middle + tail

return u"".join(words)


Expand Down