diff --git a/lib/errors.py b/lib/errors.py new file mode 100644 index 00000000..65f53087 --- /dev/null +++ b/lib/errors.py @@ -0,0 +1,2 @@ +class MarkdownError(Exception): + pass diff --git a/lib/markdown2.py b/lib/markdown2.py index 397a832a..2a3f364f 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -111,6 +111,22 @@ import codecs from collections import defaultdict +from lib.errors import MarkdownError +from lib.utils import ( + slugify, + calculate_toc_html, + curry, + regex_from_encoded_pattern, + dedentlines, + dedent, + memoized, + xml_oneliner_re_from_tab_width, + hr_tag_re_from_tab_width, + xml_escape_attr, + xml_encode_email_char_at_random, + html_escape_url, +) + # ---- globals DEBUG = False @@ -133,12 +149,6 @@ def _hash_text(s): # http://bumppo.net/projects/amputator/ _AMPERSAND_RE = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)') - -# ---- exceptions -class MarkdownError(Exception): - pass - - # ---- public api def markdown_path(path, encoding="utf-8", @@ -517,7 +527,7 @@ def parse_structured_value(value): # Multiline value if v[:3] == " >\n": - self.metadata[k.strip()] = _dedent(v[3:]).strip() + self.metadata[k.strip()] = dedent(v[3:]).strip() # Empty value elif v == "\n": @@ -760,7 +770,7 @@ def _hash_html_blocks(self, text, raw=False): return text # Pass `raw` value into our calls to self._hash_html_block_sub. - hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw) + hash_html_block_sub = curry(self._hash_html_block_sub, raw=raw) # First, look for nested blocks, e.g.: #
@@ -781,7 +791,7 @@ def _hash_html_blocks(self, text, raw=False):
         # Special case just for <hr />
. It was easier to make a special # case than to make the other regex more complicated. if " # # - _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width) + _xml_oneliner_re = xml_oneliner_re_from_tab_width(self.tab_width) text = _xml_oneliner_re.sub(hash_html_block_sub, text) return text @@ -947,7 +957,7 @@ def _do_numbering(self, text): def _extract_footnote_def_sub(self, match): id, text = match.groups() - text = _dedent(text, skip_first_line=not text.startswith('\n')).strip() + text = dedent(text, skip_first_line=not text.startswith('\n')).strip() normed_id = re.sub(r'\W', '-', id) # Ensure footnote text ends with a couple newlines (for some # block gamut matches). @@ -1034,10 +1044,10 @@ def _run_block_gamut(self, text): def _pyshell_block_sub(self, match): if "fenced-code-blocks" in self.extras: - dedented = _dedent(match.group(0)) + dedented = dedent(match.group(0)) return self._do_fenced_code_blocks("```pycon\n" + dedented + "```\n") lines = match.group(0).splitlines(0) - _dedentlines(lines) + dedentlines(lines) indent = ' ' * self.tab_width s = ('\n' # separate from possible cuddled paragraph + indent + ('\n'+indent).join(lines) @@ -1494,7 +1504,7 @@ def _do_links(self, text): .replace('_', self._escape_table['_']) if title: title_str = ' title="%s"' % ( - _xml_escape_attr(title) + xml_escape_attr(_AMPERSAND_RE, title) .replace('*', self._escape_table['*']) .replace('_', self._escape_table['_'])) else: @@ -1502,8 +1512,8 @@ def _do_links(self, text): if is_img: img_class_str = self._html_class_str_from_tag("img") result = '%s - chars = [_xml_encode_email_char_at_random(ch) + chars = [xml_encode_email_char_at_random(ch) for ch in "mailto:" + addr] # Strip the mailto: from the visible part. addr = '%s' \ @@ -2465,41 +2475,6 @@ class MarkdownWithExtras(Markdown): # ---- internal support functions - -def calculate_toc_html(toc): - """Return the HTML for the current TOC. - - This expects the `_toc` attribute to have been set on this instance. - """ - if toc is None: - return None - - def indent(): - return ' ' * (len(h_stack) - 1) - lines = [] - h_stack = [0] # stack of header-level numbers - for level, id, name in toc: - if level > h_stack[-1]: - lines.append("%s
    " % indent()) - h_stack.append(level) - elif level == h_stack[-1]: - lines[-1] += "" - else: - while level < h_stack[-1]: - h_stack.pop() - if not lines[-1].endswith(""): - lines[-1] += "" - lines.append("%s
" % indent()) - lines.append('%s
  • %s' % ( - indent(), id, name)) - while len(h_stack) > 1: - h_stack.pop() - if not lines[-1].endswith("
  • "): - lines[-1] += "" - lines.append("%s" % indent()) - return '\n'.join(lines) + '\n' - - class UnicodeWithAttrs(str): """A subclass of unicode used for the return value of conversion to possibly attach some attributes. E.g. the "toc_html" attribute when @@ -2508,260 +2483,6 @@ class UnicodeWithAttrs(str): metadata = None toc_html = None -## {{{ http://code.activestate.com/recipes/577257/ (r1) -_slugify_strip_re = re.compile(r'[^\w\s-]') -_slugify_hyphenate_re = re.compile(r'[-\s]+') -def _slugify(value): - """ - Normalizes string, converts to lowercase, removes non-alpha characters, - and converts spaces to hyphens. - - From Django's "django/template/defaultfilters.py". - """ - import unicodedata - value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode() - value = _slugify_strip_re.sub('', value).strip().lower() - return _slugify_hyphenate_re.sub('-', value) -## end of http://code.activestate.com/recipes/577257/ }}} - - -# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549 -def _curry(*args, **kwargs): - function, args = args[0], args[1:] - def result(*rest, **kwrest): - combined = kwargs.copy() - combined.update(kwrest) - return function(*args + rest, **combined) - return result - - -# Recipe: regex_from_encoded_pattern (1.0) -def _regex_from_encoded_pattern(s): - """'foo' -> re.compile(re.escape('foo')) - '/foo/' -> re.compile('foo') - '/foo/i' -> re.compile('foo', re.I) - """ - if s.startswith('/') and s.rfind('/') != 0: - # Parse it: /PATTERN/FLAGS - idx = s.rfind('/') - _, flags_str = s[1:idx], s[idx+1:] - flag_from_char = { - "i": re.IGNORECASE, - "l": re.LOCALE, - "s": re.DOTALL, - "m": re.MULTILINE, - "u": re.UNICODE, - } - flags = 0 - for char in flags_str: - try: - flags |= flag_from_char[char] - except KeyError: - raise ValueError("unsupported regex flag: '%s' in '%s' " - "(must be one of '%s')" - % (char, s, ''.join(list(flag_from_char.keys())))) - return re.compile(s[1:idx], flags) - else: # not an encoded regex - return re.compile(re.escape(s)) - - -# Recipe: dedent (0.1.2) -def _dedentlines(lines, tabsize=8, skip_first_line=False): - """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines - - "lines" is a list of lines to dedent. - "tabsize" is the tab width to use for indent width calculations. - "skip_first_line" is a boolean indicating if the first line should - be skipped for calculating the indent width and for dedenting. - This is sometimes useful for docstrings and similar. - - Same as dedent() except operates on a sequence of lines. Note: the - lines list is modified **in-place**. 
- """ - DEBUG = False - if DEBUG: - print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ - % (tabsize, skip_first_line)) - margin = None - for i, line in enumerate(lines): - if i == 0 and skip_first_line: continue - indent = 0 - for ch in line: - if ch == ' ': - indent += 1 - elif ch == '\t': - indent += tabsize - (indent % tabsize) - elif ch in '\r\n': - continue # skip all-whitespace lines - else: - break - else: - continue # skip all-whitespace lines - if DEBUG: print("dedent: indent=%d: %r" % (indent, line)) - if margin is None: - margin = indent - else: - margin = min(margin, indent) - if DEBUG: print("dedent: margin=%r" % margin) - - if margin is not None and margin > 0: - for i, line in enumerate(lines): - if i == 0 and skip_first_line: continue - removed = 0 - for j, ch in enumerate(line): - if ch == ' ': - removed += 1 - elif ch == '\t': - removed += tabsize - (removed % tabsize) - elif ch in '\r\n': - if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line) - lines[i] = lines[i][j:] - break - else: - raise ValueError("unexpected non-whitespace char %r in " - "line %r while removing %d-space margin" - % (ch, line, margin)) - if DEBUG: - print("dedent: %r: %r -> removed %d/%d"\ - % (line, ch, removed, margin)) - if removed == margin: - lines[i] = lines[i][j+1:] - break - elif removed > margin: - lines[i] = ' '*(removed-margin) + lines[i][j+1:] - break - else: - if removed: - lines[i] = lines[i][removed:] - return lines - - -def _dedent(text, tabsize=8, skip_first_line=False): - """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text - - "text" is the text to dedent. - "tabsize" is the tab width to use for indent width calculations. - "skip_first_line" is a boolean indicating if the first line should - be skipped for calculating the indent width and for dedenting. - This is sometimes useful for docstrings and similar. - - textwrap.dedent(s), but don't expand tabs to spaces - """ - lines = text.splitlines(1) - _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line) - return ''.join(lines) - - -class _memoized(object): - """Decorator that caches a function's return value each time it is called. - If called later with the same arguments, the cached value is returned, and - not re-evaluated. - - http://wiki.python.org/moin/PythonDecoratorLibrary - """ - def __init__(self, func): - self.func = func - self.cache = {} - - def __call__(self, *args): - try: - return self.cache[args] - except KeyError: - self.cache[args] = value = self.func(*args) - return value - except TypeError: - # uncachable -- for instance, passing a list as an argument. - # Better to not cache than to blow up entirely. - return self.func(*args) - - def __repr__(self): - """Return the function's docstring.""" - return self.func.__doc__ - - -def _xml_oneliner_re_from_tab_width(tab_width): - """Standalone XML processing instruction regex.""" - return re.compile(r""" - (?: - (?<=\n\n) # Starting after a blank line - | # or - \A\n? # the beginning of the doc - ) - ( # save in $1 - [ ]{0,%d} - (?: - <\?\w+\b\s+.*?\?> # XML processing instruction - | - <\w+:\w+\b\s+.*?/> # namespaced single tag - ) - [ \t]* - (?=\n{2,}|\Z) # followed by a blank line or end of document - ) - """ % (tab_width - 1), re.X) -_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width) - - -def _hr_tag_re_from_tab_width(tab_width): - return re.compile(r""" - (?: - (?<=\n\n) # Starting after a blank line - | # or - \A\n? 
# the beginning of the doc - ) - ( # save in \1 - [ ]{0,%d} - <(hr) # start tag = \2 - \b # word break - ([^<>])*? # - /?> # the matching end tag - [ \t]* - (?=\n{2,}|\Z) # followed by a blank line or end of document - ) - """ % (tab_width - 1), re.X) -_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width) - - -def _xml_escape_attr(attr, skip_single_quote=True): - """Escape the given string for use in an HTML/XML tag attribute. - - By default this doesn't bother with escaping `'` to `'`, presuming that - the tag attribute is surrounded by double quotes. - """ - escaped = _AMPERSAND_RE.sub('&', attr) - - escaped = (attr - .replace('"', '"') - .replace('<', '<') - .replace('>', '>')) - if not skip_single_quote: - escaped = escaped.replace("'", "'") - return escaped - - -def _xml_encode_email_char_at_random(ch): - r = random() - # Roughly 10% raw, 45% hex, 45% dec. - # '@' *must* be encoded. I [John Gruber] insist. - # Issue 26: '_' must be encoded. - if r > 0.9 and ch not in "@_": - return ch - elif r < 0.45: - # The [1:] is to drop leading '0': 0x63 -> x63 - return '&#%s;' % hex(ord(ch))[1:] - else: - return '&#%s;' % ord(ch) - - -def _html_escape_url(attr, safe_mode=False): - """Replace special characters that are potentially malicious in url string.""" - escaped = (attr - .replace('"', '"') - .replace('<', '<') - .replace('>', '>')) - if safe_mode: - escaped = escaped.replace('+', ' ') - escaped = escaped.replace("'", "'") - return escaped # ---- mainline @@ -2850,7 +2571,7 @@ def main(argv=None): raise MarkdownError("%s:%d: invalid link pattern line: %r" % (opts.link_patterns_file, i+1, line)) link_patterns.append( - (_regex_from_encoded_pattern(pat), href)) + (regex_from_encoded_pattern(pat), href)) finally: f.close() else: diff --git a/lib/utils.py b/lib/utils.py new file mode 100644 index 00000000..ce0087db --- /dev/null +++ b/lib/utils.py @@ -0,0 +1,321 @@ +from random import random +import re + + +## {{{ http://code.activestate.com/recipes/577257/ (r1) + +_slugify_strip_re = re.compile(r"[^\w\s-]") +_slugify_hyphenate_re = re.compile(r"[-\s]+") + + +def slugify(value): + """ + Normalizes string, converts to lowercase, removes non-alpha characters, + and converts spaces to hyphens. + + From Django's "django/template/defaultfilters.py". + """ + import unicodedata + + value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode() + value = _slugify_strip_re.sub("", value).strip().lower() + return _slugify_hyphenate_re.sub("-", value) + + +## end of http://code.activestate.com/recipes/577257/ }}} + + +def calculate_toc_html(toc): + """Return the HTML for the current TOC. + + This expects the `_toc` attribute to have been set on this instance. + """ + if toc is None: + return None + + def indent(): + return " " * (len(h_stack) - 1) + + lines = [] + h_stack = [0] # stack of header-level numbers + for level, id, name in toc: + if level > h_stack[-1]: + lines.append("%s
      " % indent()) + h_stack.append(level) + elif level == h_stack[-1]: + lines[-1] += "" + else: + while level < h_stack[-1]: + h_stack.pop() + if not lines[-1].endswith(""): + lines[-1] += "" + lines.append("%s
    " % indent()) + lines.append('%s
  • %s' % (indent(), id, name)) + while len(h_stack) > 1: + h_stack.pop() + if not lines[-1].endswith("
  • "): + lines[-1] += "" + lines.append("%s" % indent()) + return "\n".join(lines) + "\n" + + +# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549 +def curry(*args, **kwargs): + function, args = args[0], args[1:] + + def result(*rest, **kwrest): + combined = kwargs.copy() + combined.update(kwrest) + return function(*args + rest, **combined) + + return result + + +# Recipe: regex_from_encoded_pattern (1.0) +def regex_from_encoded_pattern(s): + """'foo' -> re.compile(re.escape('foo')) + '/foo/' -> re.compile('foo') + '/foo/i' -> re.compile('foo', re.I) + """ + if s.startswith("/") and s.rfind("/") != 0: + # Parse it: /PATTERN/FLAGS + idx = s.rfind("/") + _, flags_str = s[1:idx], s[idx + 1 :] + flag_from_char = { + "i": re.IGNORECASE, + "l": re.LOCALE, + "s": re.DOTALL, + "m": re.MULTILINE, + "u": re.UNICODE, + } + flags = 0 + for char in flags_str: + try: + flags |= flag_from_char[char] + except KeyError: + raise ValueError( + "unsupported regex flag: '%s' in '%s' " + "(must be one of '%s')" + % (char, s, "".join(list(flag_from_char.keys()))) + ) + return re.compile(s[1:idx], flags) + else: # not an encoded regex + return re.compile(re.escape(s)) + + +# Recipe: dedent (0.1.2) +def dedentlines(lines, tabsize=8, skip_first_line=False): + """dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines + + "lines" is a list of lines to dedent. + "tabsize" is the tab width to use for indent width calculations. + "skip_first_line" is a boolean indicating if the first line should + be skipped for calculating the indent width and for dedenting. + This is sometimes useful for docstrings and similar. + + Same as dedent() except operates on a sequence of lines. Note: the + lines list is modified **in-place**. + """ + DEBUG = False + if DEBUG: + print( + "dedent: dedent(..., tabsize=%d, skip_first_line=%r)" + % (tabsize, skip_first_line) + ) + margin = None + for i, line in enumerate(lines): + if i == 0 and skip_first_line: + continue + indent = 0 + for ch in line: + if ch == " ": + indent += 1 + elif ch == "\t": + indent += tabsize - (indent % tabsize) + elif ch in "\r\n": + continue # skip all-whitespace lines + else: + break + else: + continue # skip all-whitespace lines + if DEBUG: + print("dedent: indent=%d: %r" % (indent, line)) + if margin is None: + margin = indent + else: + margin = min(margin, indent) + if DEBUG: + print("dedent: margin=%r" % margin) + + if margin is not None and margin > 0: + for i, line in enumerate(lines): + if i == 0 and skip_first_line: + continue + removed = 0 + for j, ch in enumerate(line): + if ch == " ": + removed += 1 + elif ch == "\t": + removed += tabsize - (removed % tabsize) + elif ch in "\r\n": + if DEBUG: + print("dedent: %r: EOL -> strip up to EOL" % line) + lines[i] = lines[i][j:] + break + else: + raise ValueError( + "unexpected non-whitespace char %r in " + "line %r while removing %d-space margin" % (ch, line, margin) + ) + if DEBUG: + print( + "dedent: %r: %r -> removed %d/%d" % (line, ch, removed, margin) + ) + if removed == margin: + lines[i] = lines[i][j + 1 :] + break + elif removed > margin: + lines[i] = " " * (removed - margin) + lines[i][j + 1 :] + break + else: + if removed: + lines[i] = lines[i][removed:] + return lines + + +def dedent(text, tabsize=8, skip_first_line=False): + """dedent(text, tabsize=8, skip_first_line=False) -> dedented text + + "text" is the text to dedent. + "tabsize" is the tab width to use for indent width calculations. 
+ "skip_first_line" is a boolean indicating if the first line should + be skipped for calculating the indent width and for dedenting. + This is sometimes useful for docstrings and similar. + + textwrap.dedent(s), but don't expand tabs to spaces + """ + lines = text.splitlines(1) + dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line) + return "".join(lines) + + +class memoized(object): + """Decorator that caches a function's return value each time it is called. + If called later with the same arguments, the cached value is returned, and + not re-evaluated. + + http://wiki.python.org/moin/PythonDecoratorLibrary + """ + + def __init__(self, func): + self.func = func + self.cache = {} + + def __call__(self, *args): + try: + return self.cache[args] + except KeyError: + self.cache[args] = value = self.func(*args) + return value + except TypeError: + # uncachable -- for instance, passing a list as an argument. + # Better to not cache than to blow up entirely. + return self.func(*args) + + def __repr__(self): + """Return the function's docstring.""" + return self.func.__doc__ + + +def xml_oneliner_re_from_tab_width(tab_width): + """Standalone XML processing instruction regex.""" + return re.compile( + r""" + (?: + (?<=\n\n) # Starting after a blank line + | # or + \A\n? # the beginning of the doc + ) + ( # save in $1 + [ ]{0,%d} + (?: + <\?\w+\b\s+.*?\?> # XML processing instruction + | + <\w+:\w+\b\s+.*?/> # namespaced single tag + ) + [ \t]* + (?=\n{2,}|\Z) # followed by a blank line or end of document + ) + """ + % (tab_width - 1), + re.X, + ) + + +xml_oneliner_re_from_tab_width = memoized(xml_oneliner_re_from_tab_width) + + +def hr_tag_re_from_tab_width(tab_width): + return re.compile( + r""" + (?: + (?<=\n\n) # Starting after a blank line + | # or + \A\n? # the beginning of the doc + ) + ( # save in \1 + [ ]{0,%d} + <(hr) # start tag = \2 + \b # word break + ([^<>])*? # + /?> # the matching end tag + [ \t]* + (?=\n{2,}|\Z) # followed by a blank line or end of document + ) + """ + % (tab_width - 1), + re.X, + ) + + +hr_tag_re_from_tab_width = memoized(hr_tag_re_from_tab_width) + + +def xml_escape_attr(ampersand_re, attr, skip_single_quote=True): + """Escape the given string for use in an HTML/XML tag attribute. + + By default this doesn't bother with escaping `'` to `'`, presuming that + the tag attribute is surrounded by double quotes. + """ + escaped = ampersand_re.sub("&", attr) + + escaped = attr.replace('"', """).replace("<", "<").replace(">", ">") + if not skip_single_quote: + escaped = escaped.replace("'", "'") + return escaped + + +def xml_encode_email_char_at_random(ch): + r = random() + # Roughly 10% raw, 45% hex, 45% dec. + # '@' *must* be encoded. I [John Gruber] insist. + # Issue 26: '_' must be encoded. + if r > 0.9 and ch not in "@_": + return ch + elif r < 0.45: + # The [1:] is to drop leading '0': 0x63 -> x63 + return "&#%s;" % hex(ord(ch))[1:] + else: + return "&#%s;" % ord(ch) + + +def html_escape_url(attr, safe_mode=False): + """Replace special characters that are potentially malicious in url string.""" + escaped = (attr + .replace('"', '"') + .replace('<', '<') + .replace('>', '>')) + if safe_mode: + escaped = escaped.replace('+', ' ') + escaped = escaped.replace("'", "'") + return escaped \ No newline at end of file
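

# --- Illustrative sketch (not part of the patch): a quick smoke test for the
# helpers relocated into lib/utils.py. It assumes the repository root is on
# sys.path so that ``lib`` resolves as a package; the file name and the chosen
# inputs are hypothetical.
from lib.utils import (
    calculate_toc_html,
    dedent,
    regex_from_encoded_pattern,
    slugify,
)


def _smoke_test():
    # slugify: lowercase, drop punctuation, hyphenate whitespace.
    assert slugify("Hello, World!") == "hello-world"

    # dedent: strip the common leading margin without expanding tabs.
    assert dedent("    a\n    b\n") == "a\nb\n"

    # regex_from_encoded_pattern: "/PATTERN/FLAGS" becomes a compiled regex.
    assert regex_from_encoded_pattern("/foo/i").match("FOO")

    # calculate_toc_html: (level, id, name) tuples become a nested <ul>/<li>
    # list with anchors pointing at the header ids.
    toc_html = calculate_toc_html([(1, "intro", "Introduction")])
    assert '<a href="#intro">Introduction</a>' in toc_html


if __name__ == "__main__":
    _smoke_test()
    print("lib.utils helpers behave as expected")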
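

# --- Illustrative sketch (not part of the patch): the moved xml_escape_attr no
# longer closes over the module-level _AMPERSAND_RE; callers now pass the
# ampersand pattern in explicitly, as the updated call site in _do_links does
# with xml_escape_attr(_AMPERSAND_RE, title). The pattern and inputs below are
# reproduced/invented only for this example.
import re

from lib.utils import xml_escape_attr

# An '&' that is not already the start of a character entity; markdown2 keeps
# its _AMPERSAND_RE in markdown2.py, so a standalone caller supplies its own.
AMPERSAND_RE = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')

escaped = xml_escape_attr(AMPERSAND_RE, 'a "quoted" <value>')
# Assuming the usual &quot;/&lt;/&gt; replacements carried over from the
# original _xml_escape_attr. Note that, as before the move, the second
# assignment in the helper rebuilds its result from `attr`, so the ampersand
# substitution itself is not reflected in the return value; the checks below
# therefore only rely on the quote and angle-bracket replacements.
assert "&quot;quoted&quot;" in escaped and "&lt;value&gt;" in escaped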