
Rename stuff #270

Merged
merged 10 commits on Jul 14, 2016
14 changes: 14 additions & 0 deletions CHANGES.rst
@@ -50,6 +50,20 @@ Released on XXX
with a set of keyword arguments: override_encoding, transport_encoding,
same_origin_parent_encoding, likely_encoding, and default_encoding.**

* **Move filters._base, treebuilder._base, and treewalkers._base to .base
to clarify their status as public.**

* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
sanitizer.htmlsanitizer module and move that to sanitizer. This means
anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
code changes.**

* **Rename treewalkers.lxmletree to .etree_lxml and
treewalkers.genshistream to .genshi to have a consistent API.**

* Move a number of modules (inputstream, ihatexml, trie, tokenizer,
utils) to be underscore-prefixed to clarify their status as private.
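
The renames listed above amount to a straightforward old-to-new module mapping. A minimal sketch of that mapping, which downstream code could use when updating import paths (the dict and helper are illustrative, not part of html5lib; paths are taken from the changelog entries above):

```python
# Old -> new module paths introduced by this release (from the changelog above).
RENAMED_MODULES = {
    "html5lib.filters._base": "html5lib.filters.base",
    "html5lib.treebuilders._base": "html5lib.treebuilders.base",
    "html5lib.treewalkers._base": "html5lib.treewalkers.base",
    "html5lib.treewalkers.lxmletree": "html5lib.treewalkers.etree_lxml",
    "html5lib.treewalkers.genshistream": "html5lib.treewalkers.genshi",
    "html5lib.inputstream": "html5lib._inputstream",
    "html5lib.ihatexml": "html5lib._ihatexml",
    "html5lib.trie": "html5lib._trie",
    "html5lib.tokenizer": "html5lib._tokenizer",
    "html5lib.utils": "html5lib._utils",
}


def updated_path(module_path):
    """Return the post-rename path for a module; unchanged if it was not renamed."""
    return RENAMED_MODULES.get(module_path, module_path)
```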


0.9999999/1.0b8
~~~~~~~~~~~~~~~
4 changes: 2 additions & 2 deletions doc/html5lib.filters.rst
@@ -1,10 +1,10 @@
filters Package
===============

:mod:`_base` Module
:mod:`base` Module
-------------------

.. automodule:: html5lib.filters._base
.. automodule:: html5lib.filters.base
:members:
:undoc-members:
:show-inheritance:
37 changes: 2 additions & 35 deletions doc/html5lib.rst
@@ -25,42 +25,10 @@ html5lib Package
:undoc-members:
:show-inheritance:

:mod:`ihatexml` Module
:mod:`serializer` Module
------------------------

.. automodule:: html5lib.ihatexml
:members:
:undoc-members:
:show-inheritance:

:mod:`inputstream` Module
-------------------------

.. automodule:: html5lib.inputstream
:members:
:undoc-members:
:show-inheritance:

:mod:`sanitizer` Module
-----------------------

.. automodule:: html5lib.sanitizer
:members:
:undoc-members:
:show-inheritance:

:mod:`tokenizer` Module
-----------------------

.. automodule:: html5lib.tokenizer
:members:
:undoc-members:
:show-inheritance:

:mod:`utils` Module
-------------------

.. automodule:: html5lib.utils
.. automodule:: html5lib.serializer
:members:
:undoc-members:
:show-inheritance:
@@ -71,7 +39,6 @@ Subpackages
.. toctree::

html5lib.filters
html5lib.serializer
html5lib.treebuilders
html5lib.treewalkers

19 changes: 0 additions & 19 deletions doc/html5lib.serializer.rst

This file was deleted.

4 changes: 2 additions & 2 deletions doc/html5lib.treebuilders.rst
@@ -9,10 +9,10 @@ treebuilders Package
:undoc-members:
:show-inheritance:

:mod:`_base` Module
:mod:`base` Module
-------------------

.. automodule:: html5lib.treebuilders._base
.. automodule:: html5lib.treebuilders.base
:members:
:undoc-members:
:show-inheritance:
19 changes: 10 additions & 9 deletions doc/html5lib.treewalkers.rst
@@ -9,10 +9,10 @@ treewalkers Package
:undoc-members:
:show-inheritance:

:mod:`_base` Module
:mod:`base` Module
-------------------

.. automodule:: html5lib.treewalkers._base
.. automodule:: html5lib.treewalkers.base
:members:
:undoc-members:
:show-inheritance:
@@ -33,18 +33,19 @@ treewalkers Package
:undoc-members:
:show-inheritance:

:mod:`genshistream` Module
--------------------------
:mod:`etree_lxml` Module
------------------------

.. automodule:: html5lib.treewalkers.genshistream
.. automodule:: html5lib.treewalkers.etree_lxml
:members:
:undoc-members:
:show-inheritance:

:mod:`lxmletree` Module
-----------------------

.. automodule:: html5lib.treewalkers.lxmletree
:mod:`genshi` Module
--------------------

.. automodule:: html5lib.treewalkers.genshi
:members:
:undoc-members:
:show-inheritance:
File renamed without changes.
10 changes: 5 additions & 5 deletions html5lib/inputstream.py → html5lib/_inputstream.py
@@ -10,7 +10,7 @@

from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import ReparseException
from . import utils
from . import _utils

from io import StringIO

@@ -28,7 +28,7 @@

invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa

if utils.supports_lone_surrogates:
if _utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# eval. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
@@ -176,7 +176,7 @@ def __init__(self, source):
"""

if not utils.supports_lone_surrogates:
if not _utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
@@ -304,9 +304,9 @@ def characterErrorsUCS2(self, data):
codepoint = ord(match.group())
pos = match.start()
# Pretty sure there should be endianness issues here
if utils.isSurrogatePair(data[pos:pos + 2]):
if _utils.isSurrogatePair(data[pos:pos + 2]):
# We have a surrogate pair!
char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
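The characterErrorsUCS2 hunk above leans on the renamed _utils helpers to detect and decode UTF-16 surrogate pairs on narrow builds. A stdlib-only sketch of that logic (function names mirror the helpers for readability, but this is an illustration, not the library's code):

```python
def is_surrogate_pair(data):
    # A high surrogate (U+D800-U+DBFF) followed by a low surrogate (U+DC00-U+DFFF).
    return (len(data) == 2 and
            0xD800 <= ord(data[0]) <= 0xDBFF and
            0xDC00 <= ord(data[1]) <= 0xDFFF)


def surrogate_pair_to_codepoint(data):
    # Standard UTF-16 decoding: each valid pair maps into the supplementary planes.
    high, low = ord(data[0]), ord(data[1])
    return 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
```

For example, the pair U+D83D U+DE00 decodes to U+1F600.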
4 changes: 2 additions & 2 deletions html5lib/tokenizer.py → html5lib/_tokenizer.py
@@ -11,9 +11,9 @@
from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters

from .inputstream import HTMLInputStream
from ._inputstream import HTMLInputStream

from .trie import Trie
from ._trie import Trie

entitiesTrie = Trie(entities)

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
6 changes: 3 additions & 3 deletions html5lib/filters/alphabeticalattributes.py
@@ -1,16 +1,16 @@
from __future__ import absolute_import, division, unicode_literals

from . import _base
from . import base

try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict


class Filter(_base.Filter):
class Filter(base.Filter):
def __iter__(self):
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
if token["type"] in ("StartTag", "EmptyTag"):
attrs = OrderedDict()
for name, value in sorted(token["data"].items(),
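The filter modules touched above all follow the same pattern: subclass base.Filter and re-yield (possibly transformed) tokens from the parent's __iter__. A minimal stdlib-only sketch of that token-stream pattern (class names and token shapes are simplified for illustration):

```python
class Filter(object):
    """Base filter: wraps a token source and yields it unchanged."""
    def __init__(self, source):
        self.source = source

    def __iter__(self):
        return iter(self.source)


class UppercaseCharacters(Filter):
    """Example subclass: transforms Characters tokens, passes the rest through."""
    def __iter__(self):
        for token in Filter.__iter__(self):
            if token["type"] == "Characters":
                token = dict(token, data=token["data"].upper())
            yield token
```

Because filters take any token iterable as their source, they compose by simple nesting, which is exactly why the rename from filters._base to filters.base matters to downstream users.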
File renamed without changes.
8 changes: 4 additions & 4 deletions html5lib/filters/inject_meta_charset.py
@@ -1,19 +1,19 @@
from __future__ import absolute_import, division, unicode_literals

from . import _base
from . import base


class Filter(_base.Filter):
class Filter(base.Filter):
def __init__(self, source, encoding):
_base.Filter.__init__(self, source)
base.Filter.__init__(self, source)
self.encoding = encoding

def __iter__(self):
state = "pre_head"
meta_found = (self.encoding is None)
pending = []

for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag":
if token["name"].lower() == "head":
6 changes: 3 additions & 3 deletions html5lib/filters/lint.py
@@ -2,21 +2,21 @@

from six import text_type

from . import _base
from . import base
from ..constants import namespaces, voidElements

from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)


class Filter(_base.Filter):
class Filter(base.Filter):
def __init__(self, source, require_matching_tags=True):
super(Filter, self).__init__(source)
self.require_matching_tags = require_matching_tags

def __iter__(self):
open_elements = []
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
namespace = token["namespace"]
4 changes: 2 additions & 2 deletions html5lib/filters/optionaltags.py
@@ -1,9 +1,9 @@
from __future__ import absolute_import, division, unicode_literals

from . import _base
from . import base


class Filter(_base.Filter):
class Filter(base.Filter):
def slider(self):
previous1 = previous2 = None
for token in self.source:
28 changes: 10 additions & 18 deletions html5lib/filters/sanitizer.py
@@ -5,13 +5,13 @@

from six.moves import urllib_parse as urlparse

from . import _base
from . import base
from ..constants import namespaces, prefixes

__all__ = ["Filter"]


acceptable_elements = frozenset((
allowed_elements = frozenset((
(namespaces['html'], 'a'),
(namespaces['html'], 'abbr'),
(namespaces['html'], 'acronym'),
@@ -175,7 +175,7 @@
(namespaces['svg'], 'use'),
))

acceptable_attributes = frozenset((
allowed_attributes = frozenset((
# HTML attributes
(None, 'abbr'),
(None, 'accept'),
@@ -552,7 +552,7 @@
(None, 'use')
))

acceptable_css_properties = frozenset((
allowed_css_properties = frozenset((
'azimuth',
'background-color',
'border-bottom-color',
@@ -601,7 +601,7 @@
'width',
))

acceptable_css_keywords = frozenset((
allowed_css_keywords = frozenset((
'auto',
'aqua',
'black',
@@ -643,7 +643,7 @@
'yellow',
))

acceptable_svg_properties = frozenset((
allowed_svg_properties = frozenset((
'fill',
'fill-opacity',
'fill-rule',
@@ -654,7 +654,7 @@
'stroke-opacity',
))

acceptable_protocols = frozenset((
allowed_protocols = frozenset((
'ed2k',
'ftp',
'http',
@@ -680,7 +680,7 @@
'data',
))

acceptable_content_types = frozenset((
allowed_content_types = frozenset((
'image/png',
'image/jpeg',
'image/gif',
@@ -689,14 +689,6 @@
'text/plain',
))

allowed_elements = acceptable_elements
allowed_attributes = acceptable_attributes
allowed_css_properties = acceptable_css_properties
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
allowed_content_types = acceptable_content_types


data_content_type = re.compile(r'''
^
@@ -712,7 +704,7 @@
re.VERBOSE)


class Filter(_base.Filter):
class Filter(base.Filter):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
def __init__(self,
source,
@@ -739,7 +731,7 @@ def __init__(self,
self.svg_allow_local_href = svg_allow_local_href

def __iter__(self):
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
token = self.sanitize_token(token)
if token:
yield token
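The acceptable_* aliases are removed outright rather than deprecated, so code that may also run against much older releases (where the standalone html5lib.sanitizer used acceptable_* names) can resolve the constant at runtime. A hedged sketch under that assumption (the helper name is hypothetical, not part of html5lib):

```python
def allowed_elements_for(sanitizer_module):
    """Fetch the allowed-elements set under either naming scheme.

    Tries the post-rename allowed_elements name first, then the older
    acceptable_elements name used by earlier sanitizer modules.
    """
    for name in ("allowed_elements", "acceptable_elements"):
        value = getattr(sanitizer_module, name, None)
        if value is not None:
            return value
    raise AttributeError("no allowed/acceptable elements set found")
```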