Merge pull request #257 from gsnedders/det_encoding
Update encoding detection; r=nobody!
gsnedders committed Jul 11, 2016
2 parents dce9d62 + fc9f63b commit 699276b
Showing 7 changed files with 137 additions and 83 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
@@ -46,6 +46,10 @@ Released on XXX

* **Drop support of charade, now that chardet is supported once more.**

* **Replace the charset keyword argument on parse and related methods
with a set of keyword arguments: override_encoding, transport_encoding,
same_origin_parent_encoding, likely_encoding, and default_encoding.**
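As a rough sketch of what a call with the new keyword arguments looks like (the input bytes and encoding values here are purely illustrative):

import html5lib

data = b"<!doctype html><title>Caf\xe9</title>"  # bytes, not str
# transport_encoding: what the transport layer (e.g. an HTTP charset parameter) claimed;
# default_encoding: used only when no BOM, override, transport or <meta> encoding is found.
doc = html5lib.parse(data, transport_encoding="iso-8859-1",
                     default_encoding="windows-1252")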


0.9999999/1.0b8
~~~~~~~~~~~~~~~
4 changes: 2 additions & 2 deletions README.rst
@@ -51,7 +51,7 @@ pass into html5lib as follows:
import html5lib
with closing(urlopen("http://example.com/")) as f:
document = html5lib.parse(f, encoding=f.info().getparam("charset"))
document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
When using with ``urllib.request`` (Python 3), the charset from HTTP
should be pass into html5lib as follows:
@@ -62,7 +62,7 @@ should be pass into html5lib as follows:
import html5lib
with urlopen("http://example.com/") as f:
document = html5lib.parse(f, encoding=f.info().get_content_charset())
document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
To have more control over the parser, create a parser object explicitly.
For instance, to make the parser raise exceptions on parse errors, use:
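A minimal sketch of such a strict parser, assuming nothing beyond the strict flag documented in the HTMLParser signature further down in this commit:

import html5lib

parser = html5lib.HTMLParser(strict=True)  # strict: raise on the first parse error
# parser.parse(...) and parser.parseFragment(...) now raise instead of recovering.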
39 changes: 11 additions & 28 deletions html5lib/html5parser.py
@@ -28,19 +28,17 @@
)


def parse(doc, treebuilder="etree", encoding=None,
namespaceHTMLElements=True, scripting=False):
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse a string or file-like object into a tree"""
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, encoding=encoding, scripting=scripting)
return p.parse(doc, **kwargs)


def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
namespaceHTMLElements=True, scripting=False):
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parseFragment(doc, container=container, encoding=encoding, scripting=scripting)
return p.parseFragment(doc, container=container, **kwargs)


def method_decorator_metaclass(function):
@@ -59,18 +57,13 @@ class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""

def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
strict=False, namespaceHTMLElements=True, debug=False):
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
"""
strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
tokenizer - a class that provides a stream of tokens to the treebuilder.
This may be replaced for e.g. a sanitizer which converts some tags to
text
"""

# Raise an exception on the first error encountered
@@ -79,22 +72,17 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
if tree is None:
tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements)
self.tokenizer_class = tokenizer
self.errors = []

self.phases = dict([(name, cls(self, self.tree)) for name, cls in
getPhases(debug).items()])

def _parse(self, stream, innerHTML=False, container="div", encoding=None,
parseMeta=True, useChardet=True, scripting=False, **kwargs):
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):

self.innerHTMLMode = innerHTML
self.container = container
self.scripting = scripting
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
parseMeta=parseMeta,
useChardet=useChardet,
parser=self, **kwargs)
self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
self.reset()

try:
@@ -232,8 +220,7 @@ def normalizedTokens(self):
for token in self.tokenizer:
yield self.normalizeToken(token)

def parse(self, stream, encoding=None, parseMeta=True,
useChardet=True, scripting=False):
def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
@@ -245,13 +232,10 @@ def parse(self, stream, encoding=None, parseMeta=True,
scripting - treat noscript elements as if javascript was turned on
"""
self._parse(stream, innerHTML=False, encoding=encoding,
parseMeta=parseMeta, useChardet=useChardet, scripting=scripting)
self._parse(stream, False, None, *args, **kwargs)
return self.tree.getDocument()

def parseFragment(self, stream, container="div", encoding=None,
parseMeta=False, useChardet=True, scripting=False):
# pylint:disable=unused-argument
def parseFragment(self, stream, *args, **kwargs):
"""Parse a HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property
@@ -266,8 +250,7 @@ def parseFragment(self, stream, container="div", encoding=None,
scripting - treat noscript elements as if javascript was turned on
"""
self._parse(stream, True, container=container,
encoding=encoding, scripting=scripting)
self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment()

def parseError(self, errorcode="XXX-undefined-error", datavars=None):
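The upshot of the html5parser.py changes is that parse() and parseFragment() now forward arbitrary keyword arguments through HTMLParser down to the input stream instead of naming each one. An illustrative sketch (argument values are made up):

import html5lib

# Document: scripting and the *_encoding keywords are simply passed through.
doc = html5lib.parse(b"<!doctype html><p>hi",
                     likely_encoding="utf-8", scripting=True)

# Fragment: container stays an explicit keyword, everything else is **kwargs.
frag = html5lib.parseFragment(b"<b>bold</b> text", container="div",
                              transport_encoding="utf-8")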
98 changes: 62 additions & 36 deletions html5lib/inputstream.py
@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
return b"".join(rv)


def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or
@@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
isUnicode = isinstance(source, text_type)

if isUnicode:
if encoding is not None:
raise TypeError("Cannot explicitly set an encoding with a unicode string")
encodings = [x for x in kwargs if x.endswith("_encoding")]
if encodings:
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)

return HTMLUnicodeInputStream(source)
return HTMLUnicodeInputStream(source, **kwargs)
else:
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
return HTMLBinaryInputStream(source, **kwargs)
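A quick sketch of the dispatch above: byte input accepts the new *_encoding keywords, while unicode input, being already decoded, rejects any of them:

from html5lib import inputstream

# Bytes: encoding hints are honoured.
inputstream.HTMLInputStream(b"<p>hi", transport_encoding="utf-8")

# Unicode: any keyword ending in _encoding raises.
try:
    inputstream.HTMLInputStream(u"<p>hi", transport_encoding="utf-8")
except TypeError as e:
    print(e)  # e.g. Cannot set an encoding with a unicode input, set ['transport_encoding']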


class HTMLUnicodeInputStream(object):
@@ -173,8 +174,6 @@ def __init__(self, source):
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""

if not utils.supports_lone_surrogates:
@@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
"""

def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +404,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
self.rawStream = self.openStream(source)

HTMLUnicodeInputStream.__init__(self, self.rawStream)

self.charEncoding = (lookupEncoding(encoding), "certain")

# Encoding Information
# Number of bytes to use when looking for a meta element with
# encoding information
self.numBytesMeta = 1024
# Number of bytes to use when using detecting encoding using chardet
self.numBytesChardet = 100
# Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
# Things from args
self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding

# Detect encoding iff no explicit "transport level" encoding is supplied
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
assert self.charEncoding[0] is not None
# Determine encoding
self.charEncoding = self.determineEncoding(useChardet)
assert self.charEncoding[0] is not None

# Call superclass
self.reset()
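For reference, the binary stream records its decision as an (encoding, confidence) pair on charEncoding, which the tests below inspect; a minimal sketch:

from html5lib import inputstream

stream = inputstream.HTMLBinaryInputStream(b"<meta charset=utf-8>",
                                           useChardet=False)
encoding, confidence = stream.charEncoding
assert encoding.name == "utf-8" and confidence == "tentative"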
@@ -454,21 +454,45 @@ def openStream(self, source):

return stream

def detectEncoding(self, parseMeta=True, chardet=True):
# First look for a BOM
def determineEncoding(self, chardet=True):
# BOMs take precedence over everything
# This will also read past the BOM if present
encoding = self.detectBOM()
confidence = "certain"
# If there is no BOM need to look for meta elements with encoding
# information
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
confidence = "tentative"
charEncoding = self.detectBOM(), "certain"
if charEncoding[0] is not None:
return charEncoding

# If we've been overriden, we've been overriden
charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding

# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding

# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding

# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding

# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding

# Guess with chardet, if available
if encoding is None and chardet:
confidence = "tentative"
if chardet:
try:
from chardet.universaldetector import UniversalDetector
except ImportError:
pass
else:
buffers = []
detector = UniversalDetector()
while not detector.done:
@@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
detector.close()
encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
confidence = "tentative"
encoding = lookupEncoding(self.defaultEncoding)
if encoding is not None:
return encoding, "tentative"

# Try the default encoding
charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding

return encoding, confidence
# Fallback to html5lib's default if even that hasn't worked
return lookupEncoding("windows-1252"), "tentative"

def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain"
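Taken together, determineEncoding() applies this precedence: BOM, then override_encoding, then transport_encoding, then the <meta> prescan, then same_origin_parent_encoding (ignored when it is a UTF-16 flavour), then likely_encoding, then default_encoding, with windows-1252 as the final fallback. A short sketch of a few of those rungs, mirroring the parametrized tests in the next file:

from html5lib import inputstream

# A BOM wins over an explicit override.
s = inputstream.HTMLBinaryInputStream(b"\xef\xbb\xbf",
                                      override_encoding="iso-8859-2",
                                      useChardet=False)
assert s.charEncoding[0].name == "utf-8"

# The transport layer wins over a <meta> declaration.
s = inputstream.HTMLBinaryInputStream(b"<meta charset=iso-8859-3>",
                                      transport_encoding="iso-8859-2",
                                      useChardet=False)
assert s.charEncoding[0].name == "iso-8859-2"

# With nothing else to go on, default_encoding is used.
s = inputstream.HTMLBinaryInputStream(b"", default_encoding="iso-8859-2",
                                      useChardet=False)
assert s.charEncoding[0].name == "iso-8859-2"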
54 changes: 51 additions & 3 deletions html5lib/tests/test_encoding.py
@@ -2,6 +2,8 @@

import os

import pytest

from .support import get_data_files, test_dir, errorMessage, TestData as _TestData
from html5lib import HTMLParser, inputstream

@@ -11,7 +13,7 @@ def test_basic_prescan_length():
pad = 1024 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 1024 # Sanity
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'utf-8' == stream.charEncoding[0].name


@@ -20,14 +22,59 @@ def test_parser_reparse():
pad = 10240 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 10240 # Sanity
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'windows-1252' == stream.charEncoding[0].name
p = HTMLParser(namespaceHTMLElements=False)
doc = p.parse(data, useChardet=False)
assert 'utf-8' == p.documentEncoding
assert doc.find(".//title").text == "Caf\u00E9"


@pytest.mark.parametrize("expected,data,kwargs", [
("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}),
("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}),
("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}),
("iso-8859-2", b"<meta charset=iso-8859-3>", {"transport_encoding": "iso-8859-2"}),
("iso-8859-2", b"<meta charset=iso-8859-2>", {"same_origin_parent_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}),
("windows-1252", b"", {"default_encoding": "totally-bogus-string"}),
("windows-1252", b"", {}),
])
def test_parser_args(expected, data, kwargs):
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
assert expected == stream.charEncoding[0].name
p = HTMLParser()
p.parse(data, useChardet=False, **kwargs)
assert expected == p.documentEncoding


@pytest.mark.parametrize("kwargs", [
{"override_encoding": "iso-8859-2"},
{"override_encoding": None},
{"transport_encoding": "iso-8859-2"},
{"transport_encoding": None},
{"same_origin_parent_encoding": "iso-8859-2"},
{"same_origin_parent_encoding": None},
{"likely_encoding": "iso-8859-2"},
{"likely_encoding": None},
{"default_encoding": "iso-8859-2"},
{"default_encoding": None},
{"foo_encoding": "iso-8859-2"},
{"foo_encoding": None},
])
def test_parser_args_raises(kwargs):
with pytest.raises(TypeError) as exc_info:
p = HTMLParser()
p.parse("", useChardet=False, **kwargs)
assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")


def runParserEncodingTest(data, encoding):
p = HTMLParser()
assert p.documentEncoding is None
@@ -38,7 +85,7 @@ def runParserEncodingTest(data, encoding):


def runPreScanEncodingTest(data, encoding):
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
encoding = encoding.lower().decode("ascii")

# Very crude way to ignore irrelevant tests
@@ -55,6 +102,7 @@ def test_encoding():
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])


# pylint:disable=wrong-import-position
try:
import chardet # noqa
