Replace lxml.html.clean_html with bleach; drop lxml dependency

jupyter · Sep 6, 2022 · 5ecd89d
1 parent 765285e
commit 5ecd89d
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 4 deletions.
diff --git a/nbconvert/exporters/templateexporter.py b/nbconvert/exporters/templateexporter.py
@@ -22,7 +22,6 @@
     TemplateNotFound,
 )
 from jupyter_core.paths import jupyter_path
-from lxml.html.clean import clean_html
 from traitlets import Bool, Dict, HasTraits, List, Unicode, default, observe, validate
 from traitlets.config import Config
 from traitlets.utils.importstring import import_item
@@ -72,7 +71,7 @@
     "escape_html": lambda s: html.escape(str(s)),
     "escape_html_keep_quotes": lambda s: html.escape(str(s), quote=False),
     # For sanitizing HTML for any XSS
-    "clean_html": clean_html,
+    "clean_html": filters.clean_html,
     "strip_trailing_newline": filters.strip_trailing_newline,
     "text_base64": filters.text_base64,
 }

diff --git a/nbconvert/filters/strings.py b/nbconvert/filters/strings.py
@@ -8,6 +8,7 @@
 # Distributed under the terms of the Modified BSD License.
 
 import base64
+import bleach
 import os
 import re
 import textwrap
@@ -21,6 +22,7 @@
 __all__ = [
     "wrap_text",
     "html2text",
+    "clean_html",
     "add_anchor",
     "strip_dollars",
     "strip_files_prefix",
@@ -75,6 +77,21 @@ def html2text(element):
     return text
 
 
+def clean_html(element):  # Sanitize an HTML fragment via bleach; takes over the "clean_html" filter from lxml.
+    if isinstance(element, bytes):
+        element = element.decode()  # bytes -> str with decode()'s default codec (UTF-8)
+    else:
+        element = str(element)  # anything else is coerced to text with str()
+    return bleach.clean(
+        element,
+        tags=[*bleach.ALLOWED_TAGS, "div", "pre", "code", "span"],  # bleach's default tags plus these four
+        attributes={
+            **bleach.ALLOWED_ATTRIBUTES,  # start from bleach's default attribute allowances
+            "*": ["class", "id"],  # presumably "*" applies to every tag -- confirm against bleach docs
+        },
+    )
+
+
 def _convert_header_id(header_contents):
     """Convert header contents to valid id value. Takes string as input, returns string.
 

diff --git a/nbconvert/tests/test_nbconvertapp.py b/nbconvert/tests/test_nbconvertapp.py
@@ -367,7 +367,7 @@ def test_no_input(self):
             '<span class="o">=</span> '
             '<span class="n">symbols</span>'
             '<span class="p">(</span>'
-            "<span class=\"s1\">'x y z'</span>"
+            '<span class="s1">&#39;x y z&#39;</span>'
             '<span class="p">)</span>'
         )
         for no_input_flag in (False, True):

diff --git a/pyproject.toml b/pyproject.toml
@@ -20,7 +20,6 @@ classifiers = [
 urls = {Homepage = "https://jupyter.org"}
 requires-python = ">=3.7"
 dependencies = [
-    "lxml",
     "beautifulsoup4",
     "bleach",
     "defusedxml",