forked from jupyter/nbconvert
-
Notifications
You must be signed in to change notification settings - Fork 0
/
markdown_mistune.py
271 lines (205 loc) · 8.43 KB
/
markdown_mistune.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
"""Markdown filters with mistune
Used from markdown.py
"""
# Copyright (c) IPython Development Team.
# Distributed under the terms of the Modified BSD License.
import base64
import mimetypes
import os
import re
from functools import partial
try:
from html import escape
html_escape = partial(escape, quote=False)
except ImportError:
# Python 2
from cgi import escape as html_escape
import bs4
from mistune import BlockParser, HTMLRenderer, InlineParser, Markdown
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import get_lexer_by_name
from pygments.util import ClassNotFound
from nbconvert.filters.strings import add_anchor
class InvalidNotebook(Exception):
    """Raised when a notebook is malformed, e.g. it references a missing attachment."""
class MathBlockParser(BlockParser):
    """Block parser that keeps multi-line math sections in one piece.

    It only forwards math on to the MathInlineParser; the extra rule is needed
    so that the other block-level rules do not split a math section apart.
    """

    # Alternatives, in order: $$...$$, \\[...\\] and \begin{env}...\end{env}.
    # re.DOTALL lets every alternative span several lines.
    MULTILINE_MATH = re.compile(
        r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}|"
        r"\\\\\[.*?\\\\\]|"
        r"\\begin\{([a-z]*\*?)\}.*?\\end\{\1\}",
        re.DOTALL,
    )

    # The math rule is listed ahead of the inherited block rules.
    RULE_NAMES = ("multiline_math", *BlockParser.RULE_NAMES)

    # Regex for header that doesn't require space after '#'
    AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)\s*([^\n]*?)$")

    def parse_multiline_math(self, m, state):
        """Pass the whole matched math section through as a multiline math token."""
        matched_text = m.group(0)
        return {"type": "multiline_math", "text": matched_text}
def _dotall(pattern):
    """Wrap *pattern* so that '.' matches any character, newlines included.

    The wrapping uses the inline flag group `(?s:...)`, which is equivalent to
    compiling with `re.DOTALL` when it is the only pattern used. It is needed
    since `mistune>=2.0.0`, where the pattern is handed to the undocumented
    `re.Scanner`.
    """
    return "(?s:{})".format(pattern)
class MathInlineParser(InlineParser):
    r"""Inline parser that understands LaTeX style math objects.

    The recognised forms are ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``,
    ``$...$`` and ``\begin{foo}...\end{foo}``. The delimiters are stripped in
    every case, and for the last form the environment name (``foo`` in this
    example) is extracted as well.
    """

    BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(.*?)(?<!\\)\$\$")
    BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(.*?)(?<!\\)\\\\\]")
    INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(.+?)(?<![$\\])\$")
    INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((.*?)(?<!\\)\\\\\)")
    LATEX_ENVIRONMENT = _dotall(r"\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}")

    # The order is important here
    RULE_NAMES = (
        "block_math_tex",
        "block_math_latex",
        "inline_math_tex",
        "inline_math_latex",
        "latex_environment",
        *InlineParser.RULE_NAMES,
    )

    def parse_block_math_tex(self, m, state):
        """Return a ``block_math`` token for a ``$$...$$`` match."""
        # sometimes the Scanner keeps the final '$$', so we use the
        # full matched string and remove the math markers
        whole_match = m.group(0)
        return "block_math", whole_match[2:-2]

    def parse_block_math_latex(self, m, state):
        """Return a ``block_math`` token holding the first captured group."""
        return "block_math", m.group(1)

    def parse_inline_math_tex(self, m, state):
        """Return an ``inline_math`` token for a ``$...$`` match."""
        return "inline_math", m.group(1)

    def parse_inline_math_latex(self, m, state):
        """Return an ``inline_math`` token holding the first captured group."""
        return "inline_math", m.group(1)

    def parse_latex_environment(self, m, state):
        """Return a ``latex_environment`` token with the environment name and body."""
        env_name = m.group(1)
        env_body = m.group(2)
        return "latex_environment", env_name, env_body
class MarkdownWithMath(Markdown):
    """Markdown parser that defaults to the math-aware block and inline parsers."""

    def __init__(self, renderer, block=None, inline=None, plugins=None):
        """Set up the parser, filling in math-aware parsers where none are given.

        :param renderer: renderer passed to the base class and to the default
            inline parser.
        :param block: block parser; a ``MathBlockParser`` when ``None``.
        :param inline: inline parser; a ``MathInlineParser`` (with
            ``hard_wrap=False``) when ``None``.
        :param plugins: forwarded unchanged to the base class.
        """
        block = MathBlockParser() if block is None else block
        inline = MathInlineParser(renderer, hard_wrap=False) if inline is None else inline
        super().__init__(renderer, block, inline, plugins)

    def render(self, s):
        """Compatibility method with `mistune==0.8.4`."""
        rendered = self.parse(s)
        return rendered
class IPythonRenderer(HTMLRenderer):
    """HTML renderer adding code highlighting, math pass-through, heading
    anchors, notebook attachments and optional image embedding.
    """

    def __init__(
        self,
        escape=True,
        allow_harmful_protocols=True,
        embed_images=False,
        exclude_anchor_links=False,
        anchor_link_text="¶",
        path="",
        attachments=None,
    ):
        """Create the renderer.

        :param escape: forwarded to ``HTMLRenderer.__init__``.
        :param allow_harmful_protocols: forwarded to ``HTMLRenderer.__init__``.
        :param embed_images: when true, image sources that resolve to local
            files are replaced by base64 data URLs.
        :param exclude_anchor_links: when true, headings are returned without
            an anchor link.
        :param anchor_link_text: text passed to ``add_anchor`` for headings.
        :param path: directory that image sources are joined onto.
        :param attachments: mapping of attachment name to a mapping of
            mime type -> base64 data; an empty dict is used when ``None``.
        """
        super().__init__(escape, allow_harmful_protocols)
        self.embed_images = embed_images
        self.exclude_anchor_links = exclude_anchor_links
        self.anchor_link_text = anchor_link_text
        self.path = path
        # Build a fresh dict per instance rather than sharing a default.
        self.attachments = attachments if attachments is not None else {}

    def block_code(self, code, info=None):
        """Render a code block, highlighted with pygments when possible.

        :param code: the source text of the code block.
        :param info: the info string of the block; its first word is used as
            the language name. May be ``None`` or empty.
        :return: highlighted HTML when a lexer is found for the language,
            otherwise the base class rendering of the block.
        """
        # Both names must exist even when ``info`` is missing; otherwise the
        # fallback check below would hit an unbound local.
        lang = None
        lexer = None
        if info:
            # A whitespace-only info string yields no words at all.
            words = info.strip().split(None, 1)
            if words:
                lang = words[0]
                try:
                    lexer = get_lexer_by_name(lang, stripall=True)
                except ClassNotFound:
                    # Unknown language: keep the word as the first line of
                    # the code and render the block without highlighting.
                    code = lang + "\n" + code
                    lang = None

        if lexer is None:
            return super().block_code(code)

        formatter = HtmlFormatter()
        return highlight(code, lexer, formatter)

    def block_html(self, html):
        """Render block-level HTML, embedding its images first if enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().block_html(html)

    def inline_html(self, html):
        """Render inline HTML, embedding its images first if enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().inline_html(html)

    def heading(self, text, level):
        """Render a heading, adding an anchor link unless they are excluded."""
        html = super().heading(text, level)
        if self.exclude_anchor_links:
            return html
        return add_anchor(html, anchor_link_text=self.anchor_link_text)

    def escape_html(self, text):
        """Escape *text* with the module-level ``html_escape`` helper."""
        return html_escape(text)

    def multiline_math(self, text):
        """Return a multi-line math section unchanged."""
        return text

    def block_math(self, text):
        """Return HTML-escaped block math wrapped in ``$$`` delimiters."""
        return f"$${self.escape_html(text)}$$"

    def latex_environment(self, name, text):
        """Return an HTML-escaped ``\\begin{name}...\\end{name}`` environment."""
        name, text = self.escape_html(name), self.escape_html(text)
        return f"\\begin{{{name}}}{text}\\end{{{name}}}"

    def inline_math(self, text):
        """Return HTML-escaped inline math wrapped in ``$`` delimiters."""
        return f"${self.escape_html(text)}$"

    def image(self, src, text, title):
        """Rendering a image with title and text.

        :param src: source link of the image.
        :param text: alt text of the image.
        :param title: title text of the image.
        :raises InvalidNotebook: if ``src`` names an attachment that is not
            in ``self.attachments``.
        """
        attachment_prefix = "attachment:"

        if src.startswith(attachment_prefix):
            name = src[len(attachment_prefix):]

            if name not in self.attachments:
                raise InvalidNotebook(f"missing attachment: {name}")

            attachment = self.attachments[name]
            # we choose vector over raster, and lossless over lossy
            preferred_mime_types = ["image/svg+xml", "image/png", "image/jpeg"]
            for mime_type in preferred_mime_types:
                if mime_type in attachment:
                    break
            else:  # otherwise we choose the first mimetype we can find
                mime_type = list(attachment.keys())[0]
            data = attachment[mime_type]
            src = "data:" + mime_type + ";base64," + data

        elif self.embed_images:
            base64_url = self._src_to_base64(src)
            # Keep the original source when the file could not be read.
            if base64_url is not None:
                src = base64_url

        return super().image(src, text, title)

    def _src_to_base64(self, src):
        """Turn the source file into a base64 url.

        :param src: source link of the file.
        :return: the base64 url or None if the file was not found.
        """
        src_path = os.path.join(self.path, src)

        # isfile (rather than exists) so that a source resolving to a
        # directory is reported as "not found" instead of failing in open().
        if not os.path.isfile(src_path):
            return None

        with open(src_path, "rb") as fobj:
            mime_type = mimetypes.guess_type(src_path)[0]

            base64_data = base64.b64encode(fobj.read())
            base64_data = base64_data.replace(b"\n", b"").decode("ascii")

            return f"data:{mime_type};base64,{base64_data}"

    def _html_embed_images(self, html):
        """Return *html* with embeddable ``<img>`` sources replaced by data URLs.

        :param html: an HTML fragment.
        :return: the fragment re-serialised after parsing with BeautifulSoup.
        """
        parsed_html = bs4.BeautifulSoup(html, features="html.parser")
        imgs = parsed_html.find_all("img")

        # Replace img tags's sources by base64 dataurls
        for img in imgs:
            if "src" not in img.attrs:
                continue

            base64_url = self._src_to_base64(img.attrs["src"])

            if base64_url is not None:
                img.attrs["src"] = base64_url

        return str(parsed_html)
def markdown2html_mistune(source):
    """Convert a markdown string to HTML using mistune.

    The conversion uses ``MarkdownWithMath`` together with an
    ``IPythonRenderer`` created with ``escape=False``.
    """
    renderer = IPythonRenderer(escape=False)
    converter = MarkdownWithMath(renderer=renderer)
    return converter.render(source)