diff --git a/CHANGES.rst b/CHANGES.rst index fe07f1ec..c6cbe78f 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -50,6 +50,20 @@ Released on XXX with a set of keyword arguments: override_encoding, transport_encoding, same_origin_parent_encoding, likely_encoding, and default_encoding.** +* **Move filters._base, treebuilders._base, and treewalkers._base to .base + to clarify their status as public.** + +* **Get rid of the serializer package. Merge serializer.serialize into the + serializer.htmlserializer module and move that to serializer. This means + anyone who used serializer.serialize or serializer.HTMLSerializer needs no + code changes.** + +* **Rename treewalkers.lxmletree to .etree_lxml and + treewalkers.genshistream to .genshi to have a consistent API.** + +* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer, + utils) to be underscore prefixed to clarify their status as private. + 0.9999999/1.0b8 ~~~~~~~~~~~~~~~ diff --git a/doc/html5lib.filters.rst b/doc/html5lib.filters.rst index 1fda38a7..38d4a956 100644 --- a/doc/html5lib.filters.rst +++ b/doc/html5lib.filters.rst @@ -1,10 +1,10 @@ filters Package =============== -:mod:`_base` Module +:mod:`base` Module ------------------- -.. automodule:: html5lib.filters._base +.. automodule:: html5lib.filters.base :members: :undoc-members: :show-inheritance: diff --git a/doc/html5lib.rst b/doc/html5lib.rst index d4ed12b4..f0646aac 100644 --- a/doc/html5lib.rst +++ b/doc/html5lib.rst @@ -25,42 +25,10 @@ html5lib Package :undoc-members: :show-inheritance: -:mod:`ihatexml` Module +:mod:`serializer` Module ---------------------- -.. automodule:: html5lib.ihatexml - :members: - :undoc-members: - :show-inheritance: - -:mod:`inputstream` Module -------------------------- - -.. automodule:: html5lib.inputstream - :members: - :undoc-members: - :show-inheritance: - -:mod:`sanitizer` Module ------------------------ - -.. 
automodule:: html5lib.sanitizer - :members: - :undoc-members: - :show-inheritance: - -:mod:`tokenizer` Module ------------------------ - -.. automodule:: html5lib.tokenizer - :members: - :undoc-members: - :show-inheritance: - -:mod:`utils` Module -------------------- - -.. automodule:: html5lib.utils +.. automodule:: html5lib.serializer :members: :undoc-members: :show-inheritance: @@ -71,7 +39,6 @@ Subpackages .. toctree:: html5lib.filters - html5lib.serializer html5lib.treebuilders html5lib.treewalkers diff --git a/doc/html5lib.serializer.rst b/doc/html5lib.serializer.rst deleted file mode 100644 index fa954742..00000000 --- a/doc/html5lib.serializer.rst +++ /dev/null @@ -1,19 +0,0 @@ -serializer Package -================== - -:mod:`serializer` Package -------------------------- - -.. automodule:: html5lib.serializer - :members: - :undoc-members: - :show-inheritance: - -:mod:`htmlserializer` Module ----------------------------- - -.. automodule:: html5lib.serializer.htmlserializer - :members: - :undoc-members: - :show-inheritance: - diff --git a/doc/html5lib.treebuilders.rst b/doc/html5lib.treebuilders.rst index 99119839..aee82142 100644 --- a/doc/html5lib.treebuilders.rst +++ b/doc/html5lib.treebuilders.rst @@ -9,10 +9,10 @@ treebuilders Package :undoc-members: :show-inheritance: -:mod:`_base` Module +:mod:`base` Module ------------------- -.. automodule:: html5lib.treebuilders._base +.. automodule:: html5lib.treebuilders.base :members: :undoc-members: :show-inheritance: diff --git a/doc/html5lib.treewalkers.rst b/doc/html5lib.treewalkers.rst index 694c8194..46501258 100644 --- a/doc/html5lib.treewalkers.rst +++ b/doc/html5lib.treewalkers.rst @@ -9,10 +9,10 @@ treewalkers Package :undoc-members: :show-inheritance: -:mod:`_base` Module +:mod:`base` Module ------------------- -.. automodule:: html5lib.treewalkers._base +.. 
automodule:: html5lib.treewalkers.base :members: :undoc-members: :show-inheritance: @@ -33,18 +33,19 @@ treewalkers Package :undoc-members: :show-inheritance: -:mod:`genshistream` Module --------------------------- +:mod:`etree_lxml` Module +----------------------- -.. automodule:: html5lib.treewalkers.genshistream +.. automodule:: html5lib.treewalkers.etree_lxml :members: :undoc-members: :show-inheritance: -:mod:`lxmletree` Module ------------------------ -.. automodule:: html5lib.treewalkers.lxmletree +:mod:`genshi` Module +-------------------------- + +.. automodule:: html5lib.treewalkers.genshi :members: :undoc-members: - :show-inheritance: + :show-inheritance: \ No newline at end of file diff --git a/html5lib/ihatexml.py b/html5lib/_ihatexml.py similarity index 100% rename from html5lib/ihatexml.py rename to html5lib/_ihatexml.py diff --git a/html5lib/inputstream.py b/html5lib/_inputstream.py similarity index 99% rename from html5lib/inputstream.py rename to html5lib/_inputstream.py index dafe33ca..79f2331e 100644 --- a/html5lib/inputstream.py +++ b/html5lib/_inputstream.py @@ -10,7 +10,7 @@ from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase from .constants import ReparseException -from . import utils +from . import _utils from io import StringIO @@ -28,7 +28,7 @@ invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa -if utils.supports_lone_surrogates: +if _utils.supports_lone_surrogates: # Use one extra step of indirection and create surrogates with # eval. 
Not using this indirection would introduce an illegal # unicode literal on platforms not supporting such lone @@ -176,7 +176,7 @@ def __init__(self, source): """ - if not utils.supports_lone_surrogates: + if not _utils.supports_lone_surrogates: # Such platforms will have already checked for such # surrogate errors, so no need to do this checking. self.reportCharacterErrors = None @@ -304,9 +304,9 @@ def characterErrorsUCS2(self, data): codepoint = ord(match.group()) pos = match.start() # Pretty sure there should be endianness issues here - if utils.isSurrogatePair(data[pos:pos + 2]): + if _utils.isSurrogatePair(data[pos:pos + 2]): # We have a surrogate pair! - char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2]) + char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2]) if char_val in non_bmp_invalid_codepoints: self.errors.append("invalid-codepoint") skip = True diff --git a/html5lib/tokenizer.py b/html5lib/_tokenizer.py similarity index 99% rename from html5lib/tokenizer.py rename to html5lib/_tokenizer.py index 3f10c01f..6078f66a 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/_tokenizer.py @@ -11,9 +11,9 @@ from .constants import tokenTypes, tagTokenTypes from .constants import replacementCharacters -from .inputstream import HTMLInputStream +from ._inputstream import HTMLInputStream -from .trie import Trie +from ._trie import Trie entitiesTrie = Trie(entities) diff --git a/html5lib/trie/__init__.py b/html5lib/_trie/__init__.py similarity index 100% rename from html5lib/trie/__init__.py rename to html5lib/_trie/__init__.py diff --git a/html5lib/trie/_base.py b/html5lib/_trie/_base.py similarity index 100% rename from html5lib/trie/_base.py rename to html5lib/_trie/_base.py diff --git a/html5lib/trie/datrie.py b/html5lib/_trie/datrie.py similarity index 100% rename from html5lib/trie/datrie.py rename to html5lib/_trie/datrie.py diff --git a/html5lib/trie/py.py b/html5lib/_trie/py.py similarity index 100% rename from html5lib/trie/py.py rename to 
html5lib/_trie/py.py diff --git a/html5lib/utils.py b/html5lib/_utils.py similarity index 100% rename from html5lib/utils.py rename to html5lib/_utils.py diff --git a/html5lib/filters/alphabeticalattributes.py b/html5lib/filters/alphabeticalattributes.py index fed6996c..4795baec 100644 --- a/html5lib/filters/alphabeticalattributes.py +++ b/html5lib/filters/alphabeticalattributes.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -from . import _base +from . import base try: from collections import OrderedDict @@ -8,9 +8,9 @@ from ordereddict import OrderedDict -class Filter(_base.Filter): +class Filter(base.Filter): def __iter__(self): - for token in _base.Filter.__iter__(self): + for token in base.Filter.__iter__(self): if token["type"] in ("StartTag", "EmptyTag"): attrs = OrderedDict() for name, value in sorted(token["data"].items(), diff --git a/html5lib/filters/_base.py b/html5lib/filters/base.py similarity index 100% rename from html5lib/filters/_base.py rename to html5lib/filters/base.py diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py index ca33b70b..2059ec86 100644 --- a/html5lib/filters/inject_meta_charset.py +++ b/html5lib/filters/inject_meta_charset.py @@ -1,11 +1,11 @@ from __future__ import absolute_import, division, unicode_literals -from . import _base +from . 
import base -class Filter(_base.Filter): +class Filter(base.Filter): def __init__(self, source, encoding): - _base.Filter.__init__(self, source) + base.Filter.__init__(self, source) self.encoding = encoding def __iter__(self): @@ -13,7 +13,7 @@ def __iter__(self): meta_found = (self.encoding is None) pending = [] - for token in _base.Filter.__iter__(self): + for token in base.Filter.__iter__(self): type = token["type"] if type == "StartTag": if token["name"].lower() == "head": diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index af231d8e..a9c0831a 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -2,21 +2,21 @@ from six import text_type -from . import _base +from . import base from ..constants import namespaces, voidElements from ..constants import spaceCharacters spaceCharacters = "".join(spaceCharacters) -class Filter(_base.Filter): +class Filter(base.Filter): def __init__(self, source, require_matching_tags=True): super(Filter, self).__init__(source) self.require_matching_tags = require_matching_tags def __iter__(self): open_elements = [] - for token in _base.Filter.__iter__(self): + for token in base.Filter.__iter__(self): type = token["type"] if type in ("StartTag", "EmptyTag"): namespace = token["namespace"] diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py index 8f11fff4..f6edb734 100644 --- a/html5lib/filters/optionaltags.py +++ b/html5lib/filters/optionaltags.py @@ -1,9 +1,9 @@ from __future__ import absolute_import, division, unicode_literals -from . import _base +from . import base -class Filter(_base.Filter): +class Filter(base.Filter): def slider(self): previous1 = previous2 = None for token in self.source: diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index 7f81c0d1..b5ddcb93 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -5,13 +5,13 @@ from six.moves import urllib_parse as urlparse -from . import _base +from . 
import base from ..constants import namespaces, prefixes __all__ = ["Filter"] -acceptable_elements = frozenset(( +allowed_elements = frozenset(( (namespaces['html'], 'a'), (namespaces['html'], 'abbr'), (namespaces['html'], 'acronym'), @@ -175,7 +175,7 @@ (namespaces['svg'], 'use'), )) -acceptable_attributes = frozenset(( +allowed_attributes = frozenset(( # HTML attributes (None, 'abbr'), (None, 'accept'), @@ -552,7 +552,7 @@ (None, 'use') )) -acceptable_css_properties = frozenset(( +allowed_css_properties = frozenset(( 'azimuth', 'background-color', 'border-bottom-color', @@ -601,7 +601,7 @@ 'width', )) -acceptable_css_keywords = frozenset(( +allowed_css_keywords = frozenset(( 'auto', 'aqua', 'black', @@ -643,7 +643,7 @@ 'yellow', )) -acceptable_svg_properties = frozenset(( +allowed_svg_properties = frozenset(( 'fill', 'fill-opacity', 'fill-rule', @@ -654,7 +654,7 @@ 'stroke-opacity', )) -acceptable_protocols = frozenset(( +allowed_protocols = frozenset(( 'ed2k', 'ftp', 'http', @@ -680,7 +680,7 @@ 'data', )) -acceptable_content_types = frozenset(( +allowed_content_types = frozenset(( 'image/png', 'image/jpeg', 'image/gif', @@ -689,14 +689,6 @@ 'text/plain', )) -allowed_elements = acceptable_elements -allowed_attributes = acceptable_attributes -allowed_css_properties = acceptable_css_properties -allowed_css_keywords = acceptable_css_keywords -allowed_svg_properties = acceptable_svg_properties -allowed_protocols = acceptable_protocols -allowed_content_types = acceptable_content_types - data_content_type = re.compile(r''' ^ @@ -712,7 +704,7 @@ re.VERBOSE) -class Filter(_base.Filter): +class Filter(base.Filter): """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" def __init__(self, source, @@ -739,7 +731,7 @@ def __init__(self, self.svg_allow_local_href = svg_allow_local_href def __iter__(self): - for token in _base.Filter.__iter__(self): + for token in base.Filter.__iter__(self): token = self.sanitize_token(token) if token: yield token diff --git 
a/html5lib/filters/whitespace.py b/html5lib/filters/whitespace.py index dfc60eeb..89210528 100644 --- a/html5lib/filters/whitespace.py +++ b/html5lib/filters/whitespace.py @@ -2,20 +2,20 @@ import re -from . import _base +from . import base from ..constants import rcdataElements, spaceCharacters spaceCharacters = "".join(spaceCharacters) SPACES_REGEX = re.compile("[%s]+" % spaceCharacters) -class Filter(_base.Filter): +class Filter(base.Filter): spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) def __iter__(self): preserve = 0 - for token in _base.Filter.__iter__(self): + for token in base.Filter.__iter__(self): type = token["type"] if type == "StartTag" \ and (preserve or token["name"] in self.spacePreserveElements): diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 6a5c8bcb..470c8a7d 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -8,13 +8,13 @@ except ImportError: from ordereddict import OrderedDict -from . import inputstream -from . import tokenizer +from . import _inputstream +from . import _tokenizer from . import treebuilders -from .treebuilders._base import Marker +from .treebuilders.base import Marker -from . import utils +from . 
import _utils from .constants import ( spaceCharacters, asciiUpper2Lower, specialElements, headingElements, cdataElements, rcdataElements, @@ -82,7 +82,7 @@ def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kw self.innerHTMLMode = innerHTML self.container = container self.scripting = scripting - self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) + self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) self.reset() try: @@ -344,7 +344,7 @@ def parseRCDataRawtext(self, token, contentType): self.phase = self.phases["text"] -@utils.memoize +@_utils.memoize def getPhases(debug): def log(function): """Logger that records which phase processes each token""" @@ -586,13 +586,13 @@ class BeforeHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ (("head", "body", "html", "br"), self.endTagImplyHead) ]) self.endTagHandler.default = self.endTagOther @@ -632,7 +632,7 @@ class InHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("title", self.startTagTitle), (("noframes", "style"), self.startTagNoFramesStyle), @@ -645,7 +645,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("head", self.endTagHead), (("br", "html", "body"), self.endTagHtmlBodyBr) ]) @@ -687,8 +687,8 @@ def startTagMeta(self, token): # the abstract Unicode string, and just use the # ContentAttrParser on that, 
but using UTF-8 allows all chars # to be encoded and as a ASCII-superset works. - data = inputstream.EncodingBytes(attributes["content"].encode("utf-8")) - parser = inputstream.ContentAttrParser(data) + data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) + parser = _inputstream.ContentAttrParser(data) codec = parser.parse() self.parser.tokenizer.stream.changeEncoding(codec) @@ -735,14 +735,14 @@ class InHeadNoscriptPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand), (("head", "noscript"), self.startTagHeadNoscript), ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("noscript", self.endTagNoscript), ("br", self.endTagBr), ]) @@ -799,7 +799,7 @@ class AfterHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("body", self.startTagBody), ("frameset", self.startTagFrameset), @@ -809,8 +809,8 @@ def __init__(self, parser, tree): ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"), - self.endTagHtmlBodyBr)]) + self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), + self.endTagHtmlBodyBr)]) self.endTagHandler.default = self.endTagOther def processEOF(self): @@ -871,7 +871,7 @@ def __init__(self, parser, tree): # Set this to the default handler self.processSpaceCharacters = self.processSpaceCharactersNonPre - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", 
self.startTagHtml), (("base", "basefont", "bgsound", "command", "link", "meta", "script", "style", "title"), @@ -918,7 +918,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("body", self.endTagBody), ("html", self.endTagHtml), (("address", "article", "aside", "blockquote", "button", "center", @@ -1588,9 +1588,9 @@ def endTagOther(self, token): class TextPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([]) + self.startTagHandler = _utils.MethodDispatcher([]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("script", self.endTagScript)]) self.endTagHandler.default = self.endTagOther @@ -1622,7 +1622,7 @@ class InTablePhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("caption", self.startTagCaption), ("colgroup", self.startTagColgroup), @@ -1636,7 +1636,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("table", self.endTagTable), (("body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"), self.endTagIgnore) @@ -1813,14 +1813,14 @@ class InCaptionPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"), self.startTagTableElement) ]) 
self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("caption", self.endTagCaption), ("table", self.endTagTable), (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", @@ -1885,13 +1885,13 @@ class InColumnGroupPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("col", self.startTagCol) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("colgroup", self.endTagColgroup), ("col", self.endTagCol) ]) @@ -1949,7 +1949,7 @@ class InTableBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("tr", self.startTagTr), (("td", "th"), self.startTagTableCell), @@ -1958,7 +1958,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), ("table", self.endTagTable), (("body", "caption", "col", "colgroup", "html", "td", "th", @@ -2047,7 +2047,7 @@ class InRowPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-row def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), (("td", "th"), self.startTagTableCell), (("caption", "col", "colgroup", "tbody", "tfoot", "thead", @@ -2055,7 +2055,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = 
self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("tr", self.endTagTr), ("table", self.endTagTable), (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), @@ -2136,14 +2136,14 @@ class InCellPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-cell def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"), self.startTagTableOther) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ (("td", "th"), self.endTagTableCell), (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore), (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply) @@ -2212,7 +2212,7 @@ class InSelectPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("option", self.startTagOption), ("optgroup", self.startTagOptgroup), @@ -2222,7 +2222,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("option", self.endTagOption), ("optgroup", self.endTagOptgroup), ("select", self.endTagSelect) @@ -2312,13 +2312,13 @@ class InSelectInTablePhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), self.startTagTable) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + 
self.endTagHandler = _utils.MethodDispatcher([ (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), self.endTagTable) ]) @@ -2466,12 +2466,12 @@ class AfterBodyPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)]) + self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)]) self.endTagHandler.default = self.endTagOther def processEOF(self): @@ -2514,7 +2514,7 @@ class InFramesetPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("frameset", self.startTagFrameset), ("frame", self.startTagFrame), @@ -2522,7 +2522,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("frameset", self.endTagFrameset) ]) self.endTagHandler.default = self.endTagOther @@ -2571,13 +2571,13 @@ class AfterFramesetPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("noframes", self.startTagNoframes) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("html", self.endTagHtml) ]) self.endTagHandler.default = self.endTagOther @@ -2607,7 +2607,7 @@ class AfterAfterBodyPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = 
_utils.MethodDispatcher([ ("html", self.startTagHtml) ]) self.startTagHandler.default = self.startTagOther @@ -2645,7 +2645,7 @@ class AfterAfterFramesetPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("noframes", self.startTagNoFrames) ]) @@ -2707,7 +2707,7 @@ def processEndTag(self, token): def adjust_attributes(token, replacements): - if PY3 or utils.PY27: + if PY3 or _utils.PY27: needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) else: needs_adjustment = frozenset(token['data']) & frozenset(replacements) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer.py similarity index 85% rename from html5lib/serializer/htmlserializer.py rename to html5lib/serializer.py index 8a9439df..103dd206 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer.py @@ -5,40 +5,38 @@ from codecs import register_error, xmlcharrefreplace_errors -from ..constants import voidElements, booleanAttributes, spaceCharacters -from ..constants import rcdataElements, entities, xmlEntities -from .. import utils +from .constants import voidElements, booleanAttributes, spaceCharacters +from .constants import rcdataElements, entities, xmlEntities +from . 
import treewalkers, _utils from xml.sax.saxutils import escape -spaceCharacters = "".join(spaceCharacters) +_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`" +_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]") +_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars + + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n" + "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15" + "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000" + "\u2001\u2002\u2003\u2004\u2005\u2006\u2007" + "\u2008\u2009\u200a\u2028\u2029\u202f\u205f" + "\u3000]") -quoteAttributeSpecChars = spaceCharacters + "\"'=<>`" -quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]") -quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars + - "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n" - "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15" - "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" - "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000" - "\u2001\u2002\u2003\u2004\u2005\u2006\u2007" - "\u2008\u2009\u200a\u2028\u2029\u202f\u205f" - "\u3000]") - -encode_entity_map = {} -is_ucs4 = len("\U0010FFFF") == 1 +_encode_entity_map = {} +_is_ucs4 = len("\U0010FFFF") == 1 for k, v in list(entities.items()): # skip multi-character entities - if ((is_ucs4 and len(v) > 1) or - (not is_ucs4 and len(v) > 2)): + if ((_is_ucs4 and len(v) > 1) or + (not _is_ucs4 and len(v) > 2)): continue if v != "&": if len(v) == 2: - v = utils.surrogatePairToCodepoint(v) + v = _utils.surrogatePairToCodepoint(v) else: v = ord(v) - if v not in encode_entity_map or k.islower(): + if v not in _encode_entity_map or k.islower(): # prefer < over < and similarly for &, >, etc. 
- encode_entity_map[v] = k + _encode_entity_map[v] = k def htmlentityreplace_errors(exc): @@ -51,14 +49,14 @@ def htmlentityreplace_errors(exc): skip = False continue index = i + exc.start - if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): - codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) + if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): + codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2]) skip = True else: codepoint = ord(c) codepoints.append(codepoint) for cp in codepoints: - e = encode_entity_map.get(cp) + e = _encode_entity_map.get(cp) if e: res.append("&") res.append(e) @@ -73,6 +71,13 @@ def htmlentityreplace_errors(exc): register_error("htmlentityreplace", htmlentityreplace_errors) +def serialize(input, tree="etree", encoding=None, **serializer_opts): + # XXX: Should we cache this? + walker = treewalkers.getTreeWalker(tree) + s = HTMLSerializer(**serializer_opts) + return s.render(walker(input), encoding) + + class HTMLSerializer(object): # attribute quoting options @@ -181,24 +186,24 @@ def serialize(self, treewalker, encoding=None): self.errors = [] if encoding and self.inject_meta_charset: - from ..filters.inject_meta_charset import Filter + from .filters.inject_meta_charset import Filter treewalker = Filter(treewalker, encoding) # Alphabetical attributes is here under the assumption that none of # the later filters add or change order of attributes; it needs to be # before the sanitizer so escaped elements come out correctly if self.alphabetical_attributes: - from ..filters.alphabeticalattributes import Filter + from .filters.alphabeticalattributes import Filter treewalker = Filter(treewalker) # WhitespaceFilter should be used before OptionalTagFilter # for maximum efficiently of this latter filter if self.strip_whitespace: - from ..filters.whitespace import Filter + from .filters.whitespace import Filter treewalker = Filter(treewalker) if self.sanitize: - from 
..filters.sanitizer import Filter + from .filters.sanitizer import Filter treewalker = Filter(treewalker) if self.omit_optional_tags: - from ..filters.optionaltags import Filter + from .filters.optionaltags import Filter treewalker = Filter(treewalker) for token in treewalker: @@ -251,9 +256,9 @@ def serialize(self, treewalker, encoding=None): if self.quote_attr_values == "always" or len(v) == 0: quote_attr = True elif self.quote_attr_values == "spec": - quote_attr = quoteAttributeSpec.search(v) is not None + quote_attr = _quoteAttributeSpec.search(v) is not None elif self.quote_attr_values == "legacy": - quote_attr = quoteAttributeLegacy.search(v) is not None + quote_attr = _quoteAttributeLegacy.search(v) is not None else: raise ValueError("quote_attr_values must be one of: " "'always', 'spec', or 'legacy'") diff --git a/html5lib/serializer/__init__.py b/html5lib/serializer/__init__.py deleted file mode 100644 index 8380839a..00000000 --- a/html5lib/serializer/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from __future__ import absolute_import, division, unicode_literals - -from .. import treewalkers - -from .htmlserializer import HTMLSerializer - - -def serialize(input, tree="etree", format="html", encoding=None, - **serializer_opts): - # XXX: Should we cache this? 
- walker = treewalkers.getTreeWalker(tree) - if format == "html": - s = HTMLSerializer(**serializer_opts) - else: - raise ValueError("type must be html") - return s.render(walker(input), encoding) diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index b6d20f24..9a411c77 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -5,7 +5,7 @@ import pytest from .support import get_data_files, test_dir, errorMessage, TestData as _TestData -from html5lib import HTMLParser, inputstream +from html5lib import HTMLParser, _inputstream def test_basic_prescan_length(): @@ -13,7 +13,7 @@ def test_basic_prescan_length(): pad = 1024 - len(data) + 1 data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") assert len(data) == 1024 # Sanity - stream = inputstream.HTMLBinaryInputStream(data, useChardet=False) + stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False) assert 'utf-8' == stream.charEncoding[0].name @@ -22,7 +22,7 @@ def test_parser_reparse(): pad = 10240 - len(data) + 1 data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") assert len(data) == 10240 # Sanity - stream = inputstream.HTMLBinaryInputStream(data, useChardet=False) + stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False) assert 'windows-1252' == stream.charEncoding[0].name p = HTMLParser(namespaceHTMLElements=False) doc = p.parse(data, useChardet=False) @@ -47,7 +47,7 @@ def test_parser_reparse(): ("windows-1252", b"", {}), ]) def test_parser_args(expected, data, kwargs): - stream = inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs) + stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs) assert expected == stream.charEncoding[0].name p = HTMLParser() p.parse(data, useChardet=False, **kwargs) @@ -85,7 +85,7 @@ def runParserEncodingTest(data, encoding): def runPreScanEncodingTest(data, encoding): - stream = inputstream.HTMLBinaryInputStream(data, useChardet=False) + stream = 
_inputstream.HTMLBinaryInputStream(data, useChardet=False) encoding = encoding.lower().decode("ascii") # Very crude way to ignore irrelevant tests @@ -111,6 +111,6 @@ def test_encoding(): else: def test_chardet(): with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp: - encoding = inputstream.HTMLInputStream(fp.read()).charEncoding + encoding = _inputstream.HTMLInputStream(fp.read()).charEncoding assert encoding[0].name == "big5" # pylint:enable=wrong-import-position diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py index b3cda7d7..9333286e 100644 --- a/html5lib/tests/test_serializer.py +++ b/html5lib/tests/test_serializer.py @@ -10,7 +10,7 @@ from html5lib import constants from html5lib.filters.lint import Filter as Lint from html5lib.serializer import HTMLSerializer, serialize -from html5lib.treewalkers._base import TreeWalker +from html5lib.treewalkers.base import TreeWalker # pylint:disable=wrong-import-position optionals_loaded = [] diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index e8d9fd86..27c39538 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -11,9 +11,9 @@ import six from six.moves import http_client, urllib -from html5lib.inputstream import (BufferedStream, HTMLInputStream, - HTMLUnicodeInputStream, HTMLBinaryInputStream) -from html5lib.utils import supports_lone_surrogates +from html5lib._inputstream import (BufferedStream, HTMLInputStream, + HTMLUnicodeInputStream, HTMLBinaryInputStream) +from html5lib._utils import supports_lone_surrogates def test_basic(): diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py index 255c1859..1440a722 100644 --- a/html5lib/tests/tokenizer.py +++ b/html5lib/tests/tokenizer.py @@ -8,8 +8,8 @@ import pytest from six import unichr -from html5lib.tokenizer import HTMLTokenizer -from html5lib import constants, utils +from html5lib._tokenizer import HTMLTokenizer +from html5lib 
import constants, _utils class TokenizerTestParser(object): @@ -156,7 +156,7 @@ def repl(m): except ValueError: # This occurs when unichr throws ValueError, which should # only be for a lone-surrogate. - if utils.supports_lone_surrogates: + if _utils.supports_lone_surrogates: raise return None diff --git a/html5lib/treebuilders/__init__.py b/html5lib/treebuilders/__init__.py index 6a6b2a4c..e2328847 100644 --- a/html5lib/treebuilders/__init__.py +++ b/html5lib/treebuilders/__init__.py @@ -28,7 +28,7 @@ from __future__ import absolute_import, division, unicode_literals -from ..utils import default_etree +from .._utils import default_etree treeBuilderCache = {} diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/base.py similarity index 100% rename from html5lib/treebuilders/_base.py rename to html5lib/treebuilders/base.py diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py index 9d7f4824..dcfac220 100644 --- a/html5lib/treebuilders/dom.py +++ b/html5lib/treebuilders/dom.py @@ -5,10 +5,10 @@ from xml.dom import minidom, Node import weakref -from . import _base +from . import base from .. 
import constants from ..constants import namespaces -from ..utils import moduleFactoryFactory +from .._utils import moduleFactoryFactory def getDomBuilder(DomImplementation): @@ -50,9 +50,9 @@ def __delitem__(self, name): else: del self.element.attributes[name] - class NodeBuilder(_base.Node): + class NodeBuilder(base.Node): def __init__(self, element): - _base.Node.__init__(self, element.nodeName) + base.Node.__init__(self, element.nodeName) self.element = element namespace = property(lambda self: hasattr(self.element, "namespaceURI") and @@ -117,7 +117,7 @@ def getNameTuple(self): nameTuple = property(getNameTuple) - class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable + class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable def documentClass(self): self.dom = Dom.getDOMImplementation().createDocument(None, None, None) return weakref.proxy(self) @@ -157,12 +157,12 @@ def getDocument(self): return self.dom def getFragment(self): - return _base.TreeBuilder.getFragment(self).element + return base.TreeBuilder.getFragment(self).element def insertText(self, data, parent=None): data = data if parent != self: - _base.TreeBuilder.insertText(self, data, parent) + base.TreeBuilder.insertText(self, data, parent) else: # HACK: allow text nodes as children of the document node if hasattr(self.dom, '_child_node_types'): diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 4d12bd45..cb1d4aef 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -5,11 +5,11 @@ import re -from . import _base -from .. import ihatexml +from . import base +from .. import _ihatexml from .. 
import constants from ..constants import namespaces -from ..utils import moduleFactoryFactory +from .._utils import moduleFactoryFactory tag_regexp = re.compile("{([^}]*)}(.*)") @@ -18,7 +18,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): ElementTree = ElementTreeImplementation ElementTreeCommentType = ElementTree.Comment("asd").tag - class Element(_base.Node): + class Element(base.Node): def __init__(self, name, namespace=None): self._name = name self._namespace = namespace @@ -142,7 +142,7 @@ def reparentChildren(self, newParent): if self._element.text is not None: newParent._element.text += self._element.text self._element.text = "" - _base.Node.reparentChildren(self, newParent) + base.Node.reparentChildren(self, newParent) class Comment(Element): def __init__(self, data): @@ -259,7 +259,7 @@ def serializeElement(element, indent=0): def tostring(element): # pylint:disable=unused-variable """Serialize an element and its child nodes to a string""" rv = [] - filter = ihatexml.InfosetFilter() + filter = _ihatexml.InfosetFilter() def serializeElement(element): if isinstance(element, ElementTree.ElementTree): @@ -310,7 +310,7 @@ def serializeElement(element): return "".join(rv) - class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable + class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable documentClass = Document doctypeClass = DocumentType elementClass = Element @@ -332,7 +332,7 @@ def getDocument(self): return self.document._element.find("html") def getFragment(self): - return _base.TreeBuilder.getFragment(self)._element + return base.TreeBuilder.getFragment(self)._element return locals() diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index 2a69769b..908820c0 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -16,11 +16,11 @@ import re import sys -from . import _base +from . 
import base from ..constants import DataLossWarning from .. import constants from . import etree as etree_builders -from .. import ihatexml +from .. import _ihatexml import lxml.etree as etree @@ -54,7 +54,7 @@ def _getChildNodes(self): def testSerializer(element): rv = [] - infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) + infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True) def serializeElement(element, indent=0): if not hasattr(element, "tag"): @@ -172,7 +172,7 @@ def serializeElement(element): return "".join(rv) -class TreeBuilder(_base.TreeBuilder): +class TreeBuilder(base.TreeBuilder): documentClass = Document doctypeClass = DocumentType elementClass = None @@ -182,7 +182,7 @@ class TreeBuilder(_base.TreeBuilder): def __init__(self, namespaceHTMLElements, fullTree=False): builder = etree_builders.getETreeModule(etree, fullTree=fullTree) - infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) + infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True) self.namespaceHTMLElements = namespaceHTMLElements class Attributes(dict): @@ -254,10 +254,10 @@ def _getData(self): self.elementClass = Element self.commentClass = Comment # self.fragmentClass = builder.DocumentFragment - _base.TreeBuilder.__init__(self, namespaceHTMLElements) + base.TreeBuilder.__init__(self, namespaceHTMLElements) def reset(self): - _base.TreeBuilder.reset(self) + base.TreeBuilder.reset(self) self.insertComment = self.insertCommentInitial self.initial_comments = [] self.doctype = None diff --git a/html5lib/treewalkers/__init__.py b/html5lib/treewalkers/__init__.py index 00ae2804..9e19a559 100644 --- a/html5lib/treewalkers/__init__.py +++ b/html5lib/treewalkers/__init__.py @@ -11,9 +11,9 @@ from __future__ import absolute_import, division, unicode_literals from .. 
import constants -from ..utils import default_etree +from .._utils import default_etree -__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"] +__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"] treeWalkerCache = {} @@ -43,11 +43,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs): from . import dom treeWalkerCache[treeType] = dom.TreeWalker elif treeType == "genshi": - from . import genshistream - treeWalkerCache[treeType] = genshistream.TreeWalker + from . import genshi + treeWalkerCache[treeType] = genshi.TreeWalker elif treeType == "lxml": - from . import lxmletree - treeWalkerCache[treeType] = lxmletree.TreeWalker + from . import etree_lxml + treeWalkerCache[treeType] = etree_lxml.TreeWalker elif treeType == "etree": from . import etree if implementation is None: diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/base.py similarity index 100% rename from html5lib/treewalkers/_base.py rename to html5lib/treewalkers/base.py diff --git a/html5lib/treewalkers/dom.py b/html5lib/treewalkers/dom.py index ac4dcf31..b0c89b00 100644 --- a/html5lib/treewalkers/dom.py +++ b/html5lib/treewalkers/dom.py @@ -2,16 +2,16 @@ from xml.dom import Node -from . import _base +from . 
import base -class TreeWalker(_base.NonRecursiveTreeWalker): +class TreeWalker(base.NonRecursiveTreeWalker): def getNodeDetails(self, node): if node.nodeType == Node.DOCUMENT_TYPE_NODE: - return _base.DOCTYPE, node.name, node.publicId, node.systemId + return base.DOCTYPE, node.name, node.publicId, node.systemId elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE): - return _base.TEXT, node.nodeValue + return base.TEXT, node.nodeValue elif node.nodeType == Node.ELEMENT_NODE: attrs = {} @@ -21,17 +21,17 @@ def getNodeDetails(self, node): attrs[(attr.namespaceURI, attr.localName)] = attr.value else: attrs[(None, attr.name)] = attr.value - return (_base.ELEMENT, node.namespaceURI, node.nodeName, + return (base.ELEMENT, node.namespaceURI, node.nodeName, attrs, node.hasChildNodes()) elif node.nodeType == Node.COMMENT_NODE: - return _base.COMMENT, node.nodeValue + return base.COMMENT, node.nodeValue elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE): - return (_base.DOCUMENT,) + return (base.DOCUMENT,) else: - return _base.UNKNOWN, node.nodeType + return base.UNKNOWN, node.nodeType def getFirstChild(self, node): return node.firstChild diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index d3b0c50e..8f30f078 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -12,8 +12,8 @@ from six import string_types -from . import _base -from ..utils import moduleFactoryFactory +from . 
import base +from .._utils import moduleFactoryFactory tag_regexp = re.compile("{([^}]*)}(.*)") @@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation): ElementTree = ElementTreeImplementation ElementTreeCommentType = ElementTree.Comment("asd").tag - class TreeWalker(_base.NonRecursiveTreeWalker): # pylint:disable=unused-variable + class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable """Given the particular ElementTree representation, this implementation, to avoid using recursion, returns "nodes" as tuples with the following content: @@ -40,7 +40,7 @@ def getNodeDetails(self, node): if isinstance(node, tuple): # It might be the root Element elt, _, _, flag = node if flag in ("text", "tail"): - return _base.TEXT, getattr(elt, flag) + return base.TEXT, getattr(elt, flag) else: node = elt @@ -48,14 +48,14 @@ def getNodeDetails(self, node): node = node.getroot() if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"): - return (_base.DOCUMENT,) + return (base.DOCUMENT,) elif node.tag == "": - return (_base.DOCTYPE, node.text, + return (base.DOCTYPE, node.text, node.get("publicId"), node.get("systemId")) elif node.tag == ElementTreeCommentType: - return _base.COMMENT, node.text + return base.COMMENT, node.text else: assert isinstance(node.tag, string_types), type(node.tag) @@ -73,7 +73,7 @@ def getNodeDetails(self, node): attrs[(match.group(1), match.group(2))] = value else: attrs[(None, name)] = value - return (_base.ELEMENT, namespace, tag, + return (base.ELEMENT, namespace, tag, attrs, len(node) or node.text) def getFirstChild(self, node): diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/etree_lxml.py similarity index 90% rename from html5lib/treewalkers/lxmletree.py rename to html5lib/treewalkers/etree_lxml.py index ff31a44e..fb236311 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/etree_lxml.py @@ -4,9 +4,9 @@ from lxml import etree from ..treebuilders.etree import tag_regexp -from . 
import _base +from . import base -from .. import ihatexml +from .. import _ihatexml def ensure_str(s): @@ -122,7 +122,7 @@ def __len__(self): return len(self.obj) -class TreeWalker(_base.NonRecursiveTreeWalker): +class TreeWalker(base.NonRecursiveTreeWalker): def __init__(self, tree): # pylint:disable=redefined-variable-type if isinstance(tree, list): @@ -131,29 +131,29 @@ def __init__(self, tree): else: self.fragmentChildren = set() tree = Root(tree) - _base.NonRecursiveTreeWalker.__init__(self, tree) - self.filter = ihatexml.InfosetFilter() + base.NonRecursiveTreeWalker.__init__(self, tree) + self.filter = _ihatexml.InfosetFilter() def getNodeDetails(self, node): if isinstance(node, tuple): # Text node node, key = node assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key - return _base.TEXT, ensure_str(getattr(node, key)) + return base.TEXT, ensure_str(getattr(node, key)) elif isinstance(node, Root): - return (_base.DOCUMENT,) + return (base.DOCUMENT,) elif isinstance(node, Doctype): - return _base.DOCTYPE, node.name, node.public_id, node.system_id + return base.DOCTYPE, node.name, node.public_id, node.system_id elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"): - return _base.TEXT, ensure_str(node.obj) + return base.TEXT, ensure_str(node.obj) elif node.tag == etree.Comment: - return _base.COMMENT, ensure_str(node.text) + return base.COMMENT, ensure_str(node.text) elif node.tag == etree.Entity: - return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &; + return base.ENTITY, ensure_str(node.text)[1:-1] # strip &; else: # This is assumed to be an ordinary element @@ -172,7 +172,7 @@ def getNodeDetails(self, node): attrs[(match.group(1), match.group(2))] = value else: attrs[(None, name)] = value - return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag), + return (base.ELEMENT, namespace, self.filter.fromXmlName(tag), attrs, len(node) > 0 or node.text) def getFirstChild(self, node): diff --git 
a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshi.py similarity index 97% rename from html5lib/treewalkers/genshistream.py rename to html5lib/treewalkers/genshi.py index 61cbfede..7483be27 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshi.py @@ -4,12 +4,12 @@ from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT -from . import _base +from . import base from ..constants import voidElements, namespaces -class TreeWalker(_base.TreeWalker): +class TreeWalker(base.TreeWalker): def __iter__(self): # Buffer the events so we can pass in the following one previous = None diff --git a/parse.py b/parse.py index d5087fb8..3e65c330 100755 --- a/parse.py +++ b/parse.py @@ -11,7 +11,7 @@ from html5lib import html5parser from html5lib import treebuilders, serializer, treewalkers from html5lib import constants -from html5lib import utils +from html5lib import _utils def parse(): @@ -116,7 +116,7 @@ def printOutput(parser, document, opts): import lxml.etree sys.stdout.write(lxml.etree.tostring(document, encoding="unicode")) elif tb == "etree": - sys.stdout.write(utils.default_etree.tostring(document, encoding="unicode")) + sys.stdout.write(_utils.default_etree.tostring(document, encoding="unicode")) elif opts.tree: if not hasattr(document, '__getitem__'): document = [document]