diff --git a/lib/errors.py b/lib/errors.py
new file mode 100644
index 00000000..65f53087
--- /dev/null
+++ b/lib/errors.py
@@ -0,0 +1,2 @@
+class MarkdownError(Exception):
+    pass
diff --git a/lib/markdown2.py b/lib/markdown2.py
index 397a832a..2a3f364f 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -111,6 +111,22 @@
import codecs
from collections import defaultdict
+from lib.errors import MarkdownError
+from lib.utils import (
+ slugify,
+ calculate_toc_html,
+ curry,
+ regex_from_encoded_pattern,
+ dedentlines,
+ dedent,
+ memoized,
+ xml_oneliner_re_from_tab_width,
+ hr_tag_re_from_tab_width,
+ xml_escape_attr,
+ xml_encode_email_char_at_random,
+ html_escape_url,
+)
+
# ---- globals
DEBUG = False
@@ -133,12 +149,6 @@ def _hash_text(s):
# http://bumppo.net/projects/amputator/
_AMPERSAND_RE = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
-
-# ---- exceptions
-class MarkdownError(Exception):
- pass
-
-
# ---- public api
def markdown_path(path, encoding="utf-8",
@@ -517,7 +527,7 @@ def parse_structured_value(value):
# Multiline value
if v[:3] == " >\n":
- self.metadata[k.strip()] = _dedent(v[3:]).strip()
+ self.metadata[k.strip()] = dedent(v[3:]).strip()
# Empty value
elif v == "\n":
@@ -760,7 +770,7 @@ def _hash_html_blocks(self, text, raw=False):
return text
# Pass `raw` value into our calls to self._hash_html_block_sub.
- hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)
+ hash_html_block_sub = curry(self._hash_html_block_sub, raw=raw)
# First, look for nested blocks, e.g.:
#
@@ -781,7 +791,7 @@ def _hash_html_blocks(self, text, raw=False):
# Special case just for
. It was easier to make a special
# case than to make the other regex more complicated.
if "
#
#
- _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
+ _xml_oneliner_re = xml_oneliner_re_from_tab_width(self.tab_width)
text = _xml_oneliner_re.sub(hash_html_block_sub, text)
return text
@@ -947,7 +957,7 @@ def _do_numbering(self, text):
def _extract_footnote_def_sub(self, match):
id, text = match.groups()
- text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
+ text = dedent(text, skip_first_line=not text.startswith('\n')).strip()
normed_id = re.sub(r'\W', '-', id)
# Ensure footnote text ends with a couple newlines (for some
# block gamut matches).
@@ -1034,10 +1044,10 @@ def _run_block_gamut(self, text):
def _pyshell_block_sub(self, match):
if "fenced-code-blocks" in self.extras:
- dedented = _dedent(match.group(0))
+ dedented = dedent(match.group(0))
return self._do_fenced_code_blocks("```pycon\n" + dedented + "```\n")
lines = match.group(0).splitlines(0)
- _dedentlines(lines)
+ dedentlines(lines)
indent = ' ' * self.tab_width
s = ('\n' # separate from possible cuddled paragraph
+ indent + ('\n'+indent).join(lines)
@@ -1494,7 +1504,7 @@ def _do_links(self, text):
.replace('_', self._escape_table['_'])
if title:
title_str = ' title="%s"' % (
- _xml_escape_attr(title)
+ xml_escape_attr(_AMPERSAND_RE, title)
.replace('*', self._escape_table['*'])
.replace('_', self._escape_table['_']))
else:
@@ -1502,8 +1512,8 @@ def _do_links(self, text):
if is_img:
img_class_str = self._html_class_str_from_tag("img")
result = '
- chars = [_xml_encode_email_char_at_random(ch)
+ chars = [xml_encode_email_char_at_random(ch)
for ch in "mailto:" + addr]
# Strip the mailto: from the visible part.
addr = '
%s' \
@@ -2465,41 +2475,6 @@ class MarkdownWithExtras(Markdown):
# ---- internal support functions
-
-def calculate_toc_html(toc):
-    """Return the HTML for the current TOC.
-
-    This expects the `_toc` attribute to have been set on this instance.
-    """
-    if toc is None:
-        return None
-
-    def indent():
-        return '  ' * (len(h_stack) - 1)
-    lines = []
-    h_stack = [0]  # stack of header-level numbers
-    for level, id, name in toc:
-        if level > h_stack[-1]:
-            lines.append("%s<ul>" % indent())
-            h_stack.append(level)
-        elif level == h_stack[-1]:
-            lines[-1] += "</li>"
-        else:
-            while level < h_stack[-1]:
-                h_stack.pop()
-                if not lines[-1].endswith("</li>"):
-                    lines[-1] += "</li>"
-                lines.append("%s</ul></li>" % indent())
-        lines.append('%s<li><a href="#%s">%s</a>' % (
-            indent(), id, name))
-    while len(h_stack) > 1:
-        h_stack.pop()
-        if not lines[-1].endswith("</li>"):
-            lines[-1] += "</li>"
-        lines.append("%s</ul>" % indent())
-    return '\n'.join(lines) + '\n'
-
-
class UnicodeWithAttrs(str):
"""A subclass of unicode used for the return value of conversion to
possibly attach some attributes. E.g. the "toc_html" attribute when
@@ -2508,260 +2483,6 @@ class UnicodeWithAttrs(str):
metadata = None
toc_html = None
-## {{{ http://code.activestate.com/recipes/577257/ (r1)
-_slugify_strip_re = re.compile(r'[^\w\s-]')
-_slugify_hyphenate_re = re.compile(r'[-\s]+')
-def _slugify(value):
- """
- Normalizes string, converts to lowercase, removes non-alpha characters,
- and converts spaces to hyphens.
-
- From Django's "django/template/defaultfilters.py".
- """
- import unicodedata
- value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
- value = _slugify_strip_re.sub('', value).strip().lower()
- return _slugify_hyphenate_re.sub('-', value)
-## end of http://code.activestate.com/recipes/577257/ }}}
-
-
-# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
-def _curry(*args, **kwargs):
- function, args = args[0], args[1:]
- def result(*rest, **kwrest):
- combined = kwargs.copy()
- combined.update(kwrest)
- return function(*args + rest, **combined)
- return result
-
-
-# Recipe: regex_from_encoded_pattern (1.0)
-def _regex_from_encoded_pattern(s):
- """'foo' -> re.compile(re.escape('foo'))
- '/foo/' -> re.compile('foo')
- '/foo/i' -> re.compile('foo', re.I)
- """
- if s.startswith('/') and s.rfind('/') != 0:
- # Parse it: /PATTERN/FLAGS
- idx = s.rfind('/')
- _, flags_str = s[1:idx], s[idx+1:]
- flag_from_char = {
- "i": re.IGNORECASE,
- "l": re.LOCALE,
- "s": re.DOTALL,
- "m": re.MULTILINE,
- "u": re.UNICODE,
- }
- flags = 0
- for char in flags_str:
- try:
- flags |= flag_from_char[char]
- except KeyError:
- raise ValueError("unsupported regex flag: '%s' in '%s' "
- "(must be one of '%s')"
- % (char, s, ''.join(list(flag_from_char.keys()))))
- return re.compile(s[1:idx], flags)
- else: # not an encoded regex
- return re.compile(re.escape(s))
-
-
-# Recipe: dedent (0.1.2)
-def _dedentlines(lines, tabsize=8, skip_first_line=False):
- """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
-
- "lines" is a list of lines to dedent.
- "tabsize" is the tab width to use for indent width calculations.
- "skip_first_line" is a boolean indicating if the first line should
- be skipped for calculating the indent width and for dedenting.
- This is sometimes useful for docstrings and similar.
-
- Same as dedent() except operates on a sequence of lines. Note: the
- lines list is modified **in-place**.
- """
- DEBUG = False
- if DEBUG:
- print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
- % (tabsize, skip_first_line))
- margin = None
- for i, line in enumerate(lines):
- if i == 0 and skip_first_line: continue
- indent = 0
- for ch in line:
- if ch == ' ':
- indent += 1
- elif ch == '\t':
- indent += tabsize - (indent % tabsize)
- elif ch in '\r\n':
- continue # skip all-whitespace lines
- else:
- break
- else:
- continue # skip all-whitespace lines
- if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
- if margin is None:
- margin = indent
- else:
- margin = min(margin, indent)
- if DEBUG: print("dedent: margin=%r" % margin)
-
- if margin is not None and margin > 0:
- for i, line in enumerate(lines):
- if i == 0 and skip_first_line: continue
- removed = 0
- for j, ch in enumerate(line):
- if ch == ' ':
- removed += 1
- elif ch == '\t':
- removed += tabsize - (removed % tabsize)
- elif ch in '\r\n':
- if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
- lines[i] = lines[i][j:]
- break
- else:
- raise ValueError("unexpected non-whitespace char %r in "
- "line %r while removing %d-space margin"
- % (ch, line, margin))
- if DEBUG:
- print("dedent: %r: %r -> removed %d/%d"\
- % (line, ch, removed, margin))
- if removed == margin:
- lines[i] = lines[i][j+1:]
- break
- elif removed > margin:
- lines[i] = ' '*(removed-margin) + lines[i][j+1:]
- break
- else:
- if removed:
- lines[i] = lines[i][removed:]
- return lines
-
-
-def _dedent(text, tabsize=8, skip_first_line=False):
- """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
-
- "text" is the text to dedent.
- "tabsize" is the tab width to use for indent width calculations.
- "skip_first_line" is a boolean indicating if the first line should
- be skipped for calculating the indent width and for dedenting.
- This is sometimes useful for docstrings and similar.
-
- textwrap.dedent(s), but don't expand tabs to spaces
- """
- lines = text.splitlines(1)
- _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
- return ''.join(lines)
-
-
-class _memoized(object):
- """Decorator that caches a function's return value each time it is called.
- If called later with the same arguments, the cached value is returned, and
- not re-evaluated.
-
- http://wiki.python.org/moin/PythonDecoratorLibrary
- """
- def __init__(self, func):
- self.func = func
- self.cache = {}
-
- def __call__(self, *args):
- try:
- return self.cache[args]
- except KeyError:
- self.cache[args] = value = self.func(*args)
- return value
- except TypeError:
- # uncachable -- for instance, passing a list as an argument.
- # Better to not cache than to blow up entirely.
- return self.func(*args)
-
- def __repr__(self):
- """Return the function's docstring."""
- return self.func.__doc__
-
-
-def _xml_oneliner_re_from_tab_width(tab_width):
- """Standalone XML processing instruction regex."""
- return re.compile(r"""
- (?:
- (?<=\n\n) # Starting after a blank line
- | # or
- \A\n? # the beginning of the doc
- )
- ( # save in $1
- [ ]{0,%d}
- (?:
- <\?\w+\b\s+.*?\?> # XML processing instruction
- |
- <\w+:\w+\b\s+.*?/> # namespaced single tag
- )
- [ \t]*
- (?=\n{2,}|\Z) # followed by a blank line or end of document
- )
- """ % (tab_width - 1), re.X)
-_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
-
-
-def _hr_tag_re_from_tab_width(tab_width):
- return re.compile(r"""
- (?:
- (?<=\n\n) # Starting after a blank line
- | # or
- \A\n? # the beginning of the doc
- )
- ( # save in \1
- [ ]{0,%d}
- <(hr) # start tag = \2
- \b # word break
- ([^<>])*? #
- /?> # the matching end tag
- [ \t]*
- (?=\n{2,}|\Z) # followed by a blank line or end of document
- )
- """ % (tab_width - 1), re.X)
-_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
-
-
-def _xml_escape_attr(attr, skip_single_quote=True):
-    """Escape the given string for use in an HTML/XML tag attribute.
-
-    By default this doesn't bother with escaping `'` to `&#39;`, presuming that
-    the tag attribute is surrounded by double quotes.
-    """
-    escaped = _AMPERSAND_RE.sub('&amp;', attr)
-
-    escaped = (attr
-        .replace('"', '&quot;')
-        .replace('<', '&lt;')
-        .replace('>', '&gt;'))
-    if not skip_single_quote:
-        escaped = escaped.replace("'", "&#39;")
-    return escaped
-
-
-def _xml_encode_email_char_at_random(ch):
-    r = random()
-    # Roughly 10% raw, 45% hex, 45% dec.
-    # '@' *must* be encoded. I [John Gruber] insist.
-    # Issue 26: '_' must be encoded.
-    if r > 0.9 and ch not in "@_":
-        return ch
-    elif r < 0.45:
-        # The [1:] is to drop leading '0': 0x63 -> x63
-        return '&#%s;' % hex(ord(ch))[1:]
-    else:
-        return '&#%s;' % ord(ch)
-
-
-def _html_escape_url(attr, safe_mode=False):
-    """Replace special characters that are potentially malicious in url string."""
-    escaped = (attr
-        .replace('"', '&quot;')
-        .replace('<', '&lt;')
-        .replace('>', '&gt;'))
-    if safe_mode:
-        escaped = escaped.replace('+', ' ')
-        escaped = escaped.replace("'", "&#39;")
-    return escaped
# ---- mainline
@@ -2850,7 +2571,7 @@ def main(argv=None):
raise MarkdownError("%s:%d: invalid link pattern line: %r"
% (opts.link_patterns_file, i+1, line))
link_patterns.append(
- (_regex_from_encoded_pattern(pat), href))
+ (regex_from_encoded_pattern(pat), href))
finally:
f.close()
else:
diff --git a/lib/utils.py b/lib/utils.py
new file mode 100644
index 00000000..ce0087db
--- /dev/null
+++ b/lib/utils.py
@@ -0,0 +1,321 @@
+from random import random
+import re
+
+
+## {{{ http://code.activestate.com/recipes/577257/ (r1)
+
+_slugify_strip_re = re.compile(r"[^\w\s-]")
+_slugify_hyphenate_re = re.compile(r"[-\s]+")
+
+
+def slugify(value):
+ """
+ Normalizes string, converts to lowercase, removes non-alpha characters,
+ and converts spaces to hyphens.
+
+ From Django's "django/template/defaultfilters.py".
+ """
+ import unicodedata
+
+ value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode()
+ value = _slugify_strip_re.sub("", value).strip().lower()
+ return _slugify_hyphenate_re.sub("-", value)
+
+
+## end of http://code.activestate.com/recipes/577257/ }}}
+
+
+def calculate_toc_html(toc):
+    """Return the HTML for the current TOC.
+
+    This expects the `_toc` attribute to have been set on this instance.
+    """
+    if toc is None:
+        return None
+
+    def indent():
+        return "  " * (len(h_stack) - 1)
+
+    lines = []
+    h_stack = [0]  # stack of header-level numbers
+    for level, id, name in toc:
+        if level > h_stack[-1]:
+            lines.append("%s<ul>" % indent())
+            h_stack.append(level)
+        elif level == h_stack[-1]:
+            lines[-1] += "</li>"
+        else:
+            while level < h_stack[-1]:
+                h_stack.pop()
+                if not lines[-1].endswith("</li>"):
+                    lines[-1] += "</li>"
+                lines.append("%s</ul></li>" % indent())
+        lines.append('%s<li><a href="#%s">%s</a>' % (indent(), id, name))
+    while len(h_stack) > 1:
+        h_stack.pop()
+        if not lines[-1].endswith("</li>"):
+            lines[-1] += "</li>"
+        lines.append("%s</ul>" % indent())
+    return "\n".join(lines) + "\n"
+
+
+# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
+def curry(*args, **kwargs):
+ function, args = args[0], args[1:]
+
+ def result(*rest, **kwrest):
+ combined = kwargs.copy()
+ combined.update(kwrest)
+ return function(*args + rest, **combined)
+
+ return result
+
+
+# Recipe: regex_from_encoded_pattern (1.0)
+def regex_from_encoded_pattern(s):
+ """'foo' -> re.compile(re.escape('foo'))
+ '/foo/' -> re.compile('foo')
+ '/foo/i' -> re.compile('foo', re.I)
+ """
+ if s.startswith("/") and s.rfind("/") != 0:
+ # Parse it: /PATTERN/FLAGS
+ idx = s.rfind("/")
+ _, flags_str = s[1:idx], s[idx + 1 :]
+ flag_from_char = {
+ "i": re.IGNORECASE,
+ "l": re.LOCALE,
+ "s": re.DOTALL,
+ "m": re.MULTILINE,
+ "u": re.UNICODE,
+ }
+ flags = 0
+ for char in flags_str:
+ try:
+ flags |= flag_from_char[char]
+ except KeyError:
+ raise ValueError(
+ "unsupported regex flag: '%s' in '%s' "
+ "(must be one of '%s')"
+ % (char, s, "".join(list(flag_from_char.keys())))
+ )
+ return re.compile(s[1:idx], flags)
+ else: # not an encoded regex
+ return re.compile(re.escape(s))
+
+
+# Recipe: dedent (0.1.2)
+def dedentlines(lines, tabsize=8, skip_first_line=False):
+ """dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
+
+ "lines" is a list of lines to dedent.
+ "tabsize" is the tab width to use for indent width calculations.
+ "skip_first_line" is a boolean indicating if the first line should
+ be skipped for calculating the indent width and for dedenting.
+ This is sometimes useful for docstrings and similar.
+
+ Same as dedent() except operates on a sequence of lines. Note: the
+ lines list is modified **in-place**.
+ """
+ DEBUG = False
+ if DEBUG:
+ print(
+ "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"
+ % (tabsize, skip_first_line)
+ )
+ margin = None
+ for i, line in enumerate(lines):
+ if i == 0 and skip_first_line:
+ continue
+ indent = 0
+ for ch in line:
+ if ch == " ":
+ indent += 1
+ elif ch == "\t":
+ indent += tabsize - (indent % tabsize)
+ elif ch in "\r\n":
+ continue # skip all-whitespace lines
+ else:
+ break
+ else:
+ continue # skip all-whitespace lines
+ if DEBUG:
+ print("dedent: indent=%d: %r" % (indent, line))
+ if margin is None:
+ margin = indent
+ else:
+ margin = min(margin, indent)
+ if DEBUG:
+ print("dedent: margin=%r" % margin)
+
+ if margin is not None and margin > 0:
+ for i, line in enumerate(lines):
+ if i == 0 and skip_first_line:
+ continue
+ removed = 0
+ for j, ch in enumerate(line):
+ if ch == " ":
+ removed += 1
+ elif ch == "\t":
+ removed += tabsize - (removed % tabsize)
+ elif ch in "\r\n":
+ if DEBUG:
+ print("dedent: %r: EOL -> strip up to EOL" % line)
+ lines[i] = lines[i][j:]
+ break
+ else:
+ raise ValueError(
+ "unexpected non-whitespace char %r in "
+ "line %r while removing %d-space margin" % (ch, line, margin)
+ )
+ if DEBUG:
+ print(
+ "dedent: %r: %r -> removed %d/%d" % (line, ch, removed, margin)
+ )
+ if removed == margin:
+ lines[i] = lines[i][j + 1 :]
+ break
+ elif removed > margin:
+ lines[i] = " " * (removed - margin) + lines[i][j + 1 :]
+ break
+ else:
+ if removed:
+ lines[i] = lines[i][removed:]
+ return lines
+
+
+def dedent(text, tabsize=8, skip_first_line=False):
+ """dedent(text, tabsize=8, skip_first_line=False) -> dedented text
+
+ "text" is the text to dedent.
+ "tabsize" is the tab width to use for indent width calculations.
+ "skip_first_line" is a boolean indicating if the first line should
+ be skipped for calculating the indent width and for dedenting.
+ This is sometimes useful for docstrings and similar.
+
+ textwrap.dedent(s), but don't expand tabs to spaces
+ """
+ lines = text.splitlines(1)
+ dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
+ return "".join(lines)
+
+
+class memoized(object):
+ """Decorator that caches a function's return value each time it is called.
+ If called later with the same arguments, the cached value is returned, and
+ not re-evaluated.
+
+ http://wiki.python.org/moin/PythonDecoratorLibrary
+ """
+
+ def __init__(self, func):
+ self.func = func
+ self.cache = {}
+
+ def __call__(self, *args):
+ try:
+ return self.cache[args]
+ except KeyError:
+ self.cache[args] = value = self.func(*args)
+ return value
+ except TypeError:
+ # uncachable -- for instance, passing a list as an argument.
+ # Better to not cache than to blow up entirely.
+ return self.func(*args)
+
+ def __repr__(self):
+ """Return the function's docstring."""
+ return self.func.__doc__
+
+
+def xml_oneliner_re_from_tab_width(tab_width):
+ """Standalone XML processing instruction regex."""
+ return re.compile(
+ r"""
+ (?:
+ (?<=\n\n) # Starting after a blank line
+ | # or
+ \A\n? # the beginning of the doc
+ )
+ ( # save in $1
+ [ ]{0,%d}
+ (?:
+ <\?\w+\b\s+.*?\?> # XML processing instruction
+ |
+ <\w+:\w+\b\s+.*?/> # namespaced single tag
+ )
+ [ \t]*
+ (?=\n{2,}|\Z) # followed by a blank line or end of document
+ )
+ """
+ % (tab_width - 1),
+ re.X,
+ )
+
+
+xml_oneliner_re_from_tab_width = memoized(xml_oneliner_re_from_tab_width)
+
+
+def hr_tag_re_from_tab_width(tab_width):
+ return re.compile(
+ r"""
+ (?:
+ (?<=\n\n) # Starting after a blank line
+ | # or
+ \A\n? # the beginning of the doc
+ )
+ ( # save in \1
+ [ ]{0,%d}
+ <(hr) # start tag = \2
+ \b # word break
+ ([^<>])*? #
+ /?> # the matching end tag
+ [ \t]*
+ (?=\n{2,}|\Z) # followed by a blank line or end of document
+ )
+ """
+ % (tab_width - 1),
+ re.X,
+ )
+
+
+hr_tag_re_from_tab_width = memoized(hr_tag_re_from_tab_width)
+
+
+def xml_escape_attr(ampersand_re, attr, skip_single_quote=True):
+    """Escape the given string for use in an HTML/XML tag attribute.
+
+    By default this doesn't bother with escaping `'` to `&#39;`, presuming that
+    the tag attribute is surrounded by double quotes.
+    """
+    escaped = ampersand_re.sub("&amp;", attr)
+    # NOTE: chain from `escaped`, not `attr` — restarting from `attr` would
+    # silently discard the `&` -> `&amp;` substitution above.
+    escaped = escaped.replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")
+    if not skip_single_quote:
+        escaped = escaped.replace("'", "&#39;")
+    return escaped
+
+
+def xml_encode_email_char_at_random(ch):
+    r = random()
+    # Roughly 10% raw, 45% hex, 45% dec.
+    # '@' *must* be encoded. I [John Gruber] insist.
+    # Issue 26: '_' must be encoded.
+    if r > 0.9 and ch not in "@_":
+        return ch
+    elif r < 0.45:
+        # The [1:] is to drop leading '0': 0x63 -> x63
+        return "&#%s;" % hex(ord(ch))[1:]
+    else:
+        return "&#%s;" % ord(ch)
+
+
+def html_escape_url(attr, safe_mode=False):
+    """Replace special characters that are potentially malicious in url string."""
+    escaped = attr.replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")
+    if safe_mode:
+        escaped = escaped.replace("+", " ")
+        escaped = escaped.replace("'", "&#39;")
+    return escaped