From fb3cebd1f212c17ad626c87cc102cd8b422472e7 Mon Sep 17 00:00:00 2001 From: Tiago de Paula Date: Fri, 22 Apr 2022 12:59:47 -0300 Subject: [PATCH 1/6] Update markdown to html converter to mistune 2.0.2 --- nbconvert/filters/markdown_mistune.py | 128 ++++++++++++-------------- 1 file changed, 57 insertions(+), 71 deletions(-) diff --git a/nbconvert/filters/markdown_mistune.py b/nbconvert/filters/markdown_mistune.py index 382a53882..19bf4b255 100644 --- a/nbconvert/filters/markdown_mistune.py +++ b/nbconvert/filters/markdown_mistune.py @@ -21,7 +21,7 @@ from cgi import escape as html_escape import bs4 -import mistune +from mistune import BlockParser, HTMLRenderer, InlineParser, Markdown from pygments import highlight from pygments.formatters import HtmlFormatter from pygments.lexers import get_lexer_by_name @@ -34,99 +34,83 @@ class InvalidNotebook(Exception): pass -class MathBlockGrammar(mistune.BlockGrammar): - """This defines a single regex comprised of the different patterns that - identify math content spanning multiple lines. These are used by the - MathBlockLexer. +class MathBlockParser(BlockParser): + """This acts as a pass-through to the MathInlineParser. It is needed in + order to avoid other block level rules splitting math sections apart. """ - multi_math_str = "|".join( - [r"^\$\$.*?\$\$", r"^\\\\\[.*?\\\\\]", r"^\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}"] + MULTILINE_MATH = re.compile( + r"(?=2.0.0`, where the pattern is passed + to the undocumented `re.Scanner`. 
""" + return f"(?s:{pattern})" - inline_math = re.compile(r"^\$(.+?)\$|^\\\\\((.+?)\\\\\)", re.DOTALL) - block_math = re.compile(r"^\$\$(.*?)\$\$|^\\\\\[(.*?)\\\\\]", re.DOTALL) - latex_environment = re.compile(r"^\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}", re.DOTALL) - text = re.compile(r"^[\s\S]+?(?=[\\%s\n" % mistune.escape(code) + return super().block_code(code) formatter = HtmlFormatter() return highlight(code, lexer, formatter) @@ -147,8 +131,8 @@ def inline_html(self, html): return super().inline_html(html) - def header(self, text, level, raw=None): - html = super().header(text, level, raw=raw) + def heading(self, text, level): + html = super().heading(text, level) if self.options.get("exclude_anchor_links"): return html anchor_link_text = self.options.get("anchor_link_text", "¶") @@ -157,23 +141,25 @@ def header(self, text, level, raw=None): def escape_html(self, text): return html_escape(text) + def multiline_math(self, text): + return text + def block_math(self, text): - return "$$%s$$" % self.escape_html(text) + return f"$${self.escape_html(text)}$$" def latex_environment(self, name, text): - name = self.escape_html(name) - text = self.escape_html(text) - return rf"\begin{{{name}}}{text}\end{{{name}}}" + name, text = self.escape_html(name), self.escape_html(text) + return f"\\begin{{{name}}}{text}\\end{{{name}}}" def inline_math(self, text): - return "$%s$" % self.escape_html(text) + return f"${self.escape_html(text)}$" - def image(self, src, title, text): + def image(self, src, text, title): """Rendering a image with title and text. :param src: source link of the image. - :param title: title text of the image. :param text: alt text of the image. + :param title: title text of the image. 
""" attachments = self.options.get("attachments", {}) attachment_prefix = "attachment:" From c6f55c37fdde421e81b1f7962f9fde6cd0800155 Mon Sep 17 00:00:00 2001 From: Tiago de Paula Date: Fri, 22 Apr 2022 13:22:59 -0300 Subject: [PATCH 2/6] Compatibility options for IPythonRenderer on mistune ver2 --- nbconvert/filters/markdown_mistune.py | 48 ++++++++++++++++++--------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/nbconvert/filters/markdown_mistune.py b/nbconvert/filters/markdown_mistune.py index 19bf4b255..bba701760 100644 --- a/nbconvert/filters/markdown_mistune.py +++ b/nbconvert/filters/markdown_mistune.py @@ -98,8 +98,32 @@ def __init__(self, renderer, block=None, inline=None, plugins=None): inline = MathInlineParser(renderer, hard_wrap=False) super().__init__(renderer, block, inline, plugins) + def render(self, s): + """Compatibility method with `mistune==0.8.4`.""" + return self.parse(s) + class IPythonRenderer(HTMLRenderer): + def __init__( + self, + escape=True, + allow_harmful_protocols=None, + embed_images=False, + exclude_anchor_links=False, + anchor_link_text="¶", + path="", + attachments=None, + ): + super().__init__(escape, allow_harmful_protocols) + self.embed_images = embed_images + self.exclude_anchor_links = exclude_anchor_links + self.anchor_link_text = anchor_link_text + self.path = path + if attachments is not None: + self.attachments = attachments + else: + self.attachments = {} + def block_code(self, code, info=None): if info: try: @@ -116,27 +140,22 @@ def block_code(self, code, info=None): return highlight(code, lexer, formatter) def block_html(self, html): - embed_images = self.options.get("embed_images", False) - - if embed_images: + if self.embed_images: html = self._html_embed_images(html) return super().block_html(html) def inline_html(self, html): - embed_images = self.options.get("embed_images", False) - - if embed_images: + if self.embed_images: html = self._html_embed_images(html) return super().inline_html(html) 
def heading(self, text, level): html = super().heading(text, level) - if self.options.get("exclude_anchor_links"): + if self.exclude_anchor_links: return html - anchor_link_text = self.options.get("anchor_link_text", "¶") - return add_anchor(html, anchor_link_text=anchor_link_text) + return add_anchor(html, anchor_link_text=self.anchor_link_text) def escape_html(self, text): return html_escape(text) @@ -161,17 +180,15 @@ def image(self, src, text, title): :param text: alt text of the image. :param title: title text of the image. """ - attachments = self.options.get("attachments", {}) attachment_prefix = "attachment:" - embed_images = self.options.get("embed_images", False) if src.startswith(attachment_prefix): name = src[len(attachment_prefix) :] - if name not in attachments: + if name not in self.attachments: raise InvalidNotebook(f"missing attachment: {name}") - attachment = attachments[name] + attachment = self.attachments[name] # we choose vector over raster, and lossless over lossy preferred_mime_types = ["image/svg+xml", "image/png", "image/jpeg"] for preferred_mime_type in preferred_mime_types: @@ -183,7 +200,7 @@ def image(self, src, text, title): data = attachment[mime_type] src = "data:" + mime_type + ";base64," + data - elif embed_images: + elif self.embed_images: base64_url = self._src_to_base64(src) if base64_url is not None: @@ -197,8 +214,7 @@ def _src_to_base64(self, src): :param src: source link of the file. :return: the base64 url or None if the file was not found. 
""" - path = self.options.get("path", "") - src_path = os.path.join(path, src) + src_path = os.path.join(self.path, src) if not os.path.exists(src_path): return None From 16189b9782f2f0af4c32b4c9e6dda68149a0d0a9 Mon Sep 17 00:00:00 2001 From: Tiago de Paula Date: Sat, 23 Apr 2022 12:14:45 -0300 Subject: [PATCH 3/6] Fix some discrepancies with mistune version 2 - 'AXT_HEADING' is now requiring whitespace after the '#', fixed with a new regex - 're.Scanner' is not able to extract the correct group, the text needs trimming before use - 'javascript:...' links considered harmful by 'HTMLRenderer', should we disable it too? --- nbconvert/filters/markdown_mistune.py | 32 +++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/nbconvert/filters/markdown_mistune.py b/nbconvert/filters/markdown_mistune.py index bba701760..0f3cd60ef 100644 --- a/nbconvert/filters/markdown_mistune.py +++ b/nbconvert/filters/markdown_mistune.py @@ -48,6 +48,9 @@ class MathBlockParser(BlockParser): RULE_NAMES = ("multiline_math",) + BlockParser.RULE_NAMES + # Regex for header that doesn't require space after '#' + AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)\s*([^\n]*?)$") + def parse_multiline_math(self, m, state): """Pass token through mutiline math.""" return {"type": "multiline_math", "text": m.group(0)} @@ -63,6 +66,19 @@ def _dotall(pattern): return f"(?s:{pattern})" +def _strip(text, *, prefix, suffix): + """Remove prefix and suffix from text, if present. + + `InlineParser` sometimes return these affixes, even though it shouldn't. + """ + np, ns = len(prefix), len(suffix) + if text[:np] == prefix: + text = text[np:] + if text[-ns:] == suffix: + text = text[:-ns] + return text + + class MathInlineParser(InlineParser): r"""This interprets the content of LaTeX style math objects. 
@@ -78,11 +94,19 @@ class MathInlineParser(InlineParser): RULE_NAMES = ("block_math", "inline_math", "latex_environment") + InlineParser.RULE_NAMES def parse_inline_math(self, m, state): - text = m.group(1) or m.group(2) + text = m.group(1) + if text: + text = _strip(text, prefix="$", suffix="$") + else: + text = _strip(m.group(2), prefix="\\\\(", suffix="\\\\)") return "inline_math", text def parse_block_math(self, m, state): - text = m.group(1) or m.group(2) + text = m.group(1) + if text: + text = _strip(text, prefix="$$", suffix="$$") + else: + text = _strip(m.group(2), prefix="\\\\[", suffix="\\\\]") return "block_math", text def parse_latex_environment(self, m, state): @@ -107,7 +131,7 @@ class IPythonRenderer(HTMLRenderer): def __init__( self, escape=True, - allow_harmful_protocols=None, + allow_harmful_protocols=True, embed_images=False, exclude_anchor_links=False, anchor_link_text="¶", @@ -206,7 +230,7 @@ def image(self, src, text, title): if base64_url is not None: src = base64_url - return super().image(src, title, text) + return super().image(src, text, title) def _src_to_base64(self, src): """Turn the source file into a base64 url. From 7f0959c8f2445ce5aabf8b05c1e62fb8fb231436 Mon Sep 17 00:00:00 2001 From: Tiago de Paula Date: Wed, 27 Apr 2022 02:09:42 -0300 Subject: [PATCH 4/6] Split BLOCK_MATH and INLINE_MATH into four regexes This is important to avoid problems with TeX-style block math, where the trailing '$$' was not removed. --- nbconvert/filters/markdown_mistune.py | 52 +++++++++++++-------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/nbconvert/filters/markdown_mistune.py b/nbconvert/filters/markdown_mistune.py index 0f3cd60ef..636e1e8cc 100644 --- a/nbconvert/filters/markdown_mistune.py +++ b/nbconvert/filters/markdown_mistune.py @@ -66,19 +66,6 @@ def _dotall(pattern): return f"(?s:{pattern})" -def _strip(text, *, prefix, suffix): - """Remove prefix and suffix from text, if present. 
- - `InlineParser` sometimes return these affixes, even though it shouldn't. - """ - np, ns = len(prefix), len(suffix) - if text[:np] == prefix: - text = text[np:] - if text[-ns:] == suffix: - text = text[:-ns] - return text - - class MathInlineParser(InlineParser): r"""This interprets the content of LaTeX style math objects. @@ -87,27 +74,38 @@ class MathInlineParser(InlineParser): delimiters from all these varieties, and extracts the type of environment in the last case (``foo`` in this example). """ - INLINE_MATH = _dotall(r"(? Date: Mon, 9 May 2022 06:12:48 -0500 Subject: [PATCH 5/6] update mistune dep --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 060bda2c7..25f12e7c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "jupyter_core>=4.7", "jupyterlab_pygments", "MarkupSafe>=2.0", - "mistune>=0.8.1,<2", + "mistune>=2.02", "nbclient>=0.5.0", "nbformat>=5.1", "packaging", From 4f65730003f599ae90b5cb54b177fa5711b44db9 Mon Sep 17 00:00:00 2001 From: Steven Silvester Date: Mon, 9 May 2022 06:14:57 -0500 Subject: [PATCH 6/6] fix mistune version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 25f12e7c5..ae51a7970 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "jupyter_core>=4.7", "jupyterlab_pygments", "MarkupSafe>=2.0", - "mistune>=2.02", + "mistune>=2.0.2", "nbclient>=0.5.0", "nbformat>=5.1", "packaging",