TYP: type read_xml and deprecate passing positional arguments (#45133)

pandas-dev · Jan 4, 2022 · a2aa477 · a2aa477
1 parent 490b189
commit a2aa477
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 31 deletions.
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -611,6 +611,7 @@ Other Deprecations
 - Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`)
 - Deprecated passing non boolean argument to sort in :func:`concat` (:issue:`41518`)
 - Deprecated passing arguments as positional for :func:`read_fwf` other than ``filepath_or_buffer`` (:issue:`41485`):
+- Deprecated passing arguments as positional for :func:`read_xml` other than ``path_or_buffer`` (:issue:`45133`):
 - Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`)
 - Deprecated the behavior of :func:`to_datetime` with the string "now" with ``utc=False``; in a future version this will match ``Timestamp("now")``, which in turn matches :meth:`Timestamp.now` returning the local time (:issue:`18705`)
 - Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`)

diff --git a/pandas/_typing.py b/pandas/_typing.py
@@ -246,6 +246,7 @@ def closed(self) -> bool:
 CompressionOptions = Optional[
     Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
 ]
+XMLParsers = Literal["lxml", "etree"]
 
 
 # types in DataFrameFormatter

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -5,19 +5,24 @@
 from __future__ import annotations
 
 import io
+from typing import Sequence
 
 from pandas._typing import (
     CompressionOptions,
     FilePath,
     ReadBuffer,
     StorageOptions,
+    XMLParsers,
 )
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
     AbstractMethodError,
     ParserError,
 )
-from pandas.util._decorators import doc
+from pandas.util._decorators import (
+    deprecate_nonkeyword_arguments,
+    doc,
+)
 
 from pandas.core.dtypes.common import is_list_like
 
@@ -98,17 +103,17 @@ class _XMLFrameParser:
 
     def __init__(
         self,
-        path_or_buffer,
-        xpath,
-        namespaces,
-        elems_only,
-        attrs_only,
-        names,
-        encoding,
-        stylesheet,
+        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
+        xpath: str,
+        namespaces: dict[str, str] | None,
+        elems_only: bool,
+        attrs_only: bool,
+        names: Sequence[str] | None,
+        encoding: str | None,
+        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
         compression: CompressionOptions,
         storage_options: StorageOptions,
-    ) -> None:
+    ):
         self.path_or_buffer = path_or_buffer
         self.xpath = xpath
         self.namespaces = namespaces
@@ -371,9 +376,6 @@ class _LxmlFrameParser(_XMLFrameParser):
     XPath 1.0 and XSLT 1.0.
     """
 
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-
     def parse_data(self) -> list[dict[str, str | None]]:
         """
         Parse xml data.
@@ -544,6 +546,11 @@ def _parse_doc(self, raw_doc) -> bytes:
             curr_parser = XMLParser(encoding=self.encoding)
 
             if isinstance(xml_data, io.StringIO):
+                if self.encoding is None:
+                    raise TypeError(
+                        "Can not pass encoding None when input is StringIO."
+                    )
+
                 doc = fromstring(
                     xml_data.getvalue().encode(self.encoding), parser=curr_parser
                 )
@@ -570,7 +577,7 @@ def _transform_doc(self) -> bytes:
 
 def get_data_from_filepath(
     filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
-    encoding,
+    encoding: str | None,
     compression: CompressionOptions,
     storage_options: StorageOptions,
 ) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
@@ -658,15 +665,15 @@ class that build Data Frame and infers specific dtypes.
 
 
 def _parse(
-    path_or_buffer,
-    xpath,
-    namespaces,
-    elems_only,
-    attrs_only,
-    names,
-    encoding,
-    parser,
-    stylesheet,
+    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
+    xpath: str,
+    namespaces: dict[str, str] | None,
+    elems_only: bool,
+    attrs_only: bool,
+    names: Sequence[str] | None,
+    encoding: str | None,
+    parser: XMLParsers,
+    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
     compression: CompressionOptions,
     storage_options: StorageOptions,
     **kwargs,
@@ -686,11 +693,11 @@ def _parse(
         * If parser is not lxml or etree.
     """
 
-    lxml = import_optional_dependency("lxml.etree", errors="ignore")
-
     p: _EtreeFrameParser | _LxmlFrameParser
 
     if parser == "lxml":
+        lxml = import_optional_dependency("lxml.etree", errors="ignore")
+
         if lxml is not None:
             p = _LxmlFrameParser(
                 path_or_buffer,
@@ -728,19 +735,23 @@ def _parse(
     return _data_to_frame(data=data_dicts, **kwargs)
 
 
+@deprecate_nonkeyword_arguments(
+    version=None, allowed_args=["path_or_buffer"], stacklevel=2
+)
 @doc(
     storage_options=_shared_docs["storage_options"],
     decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
 )
 def read_xml(
     path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
-    xpath: str | None = "./*",
-    namespaces: dict | list[dict] | None = None,
-    elems_only: bool | None = False,
-    attrs_only: bool | None = False,
-    names: list[str] | None = None,
+    xpath: str = "./*",
+    namespaces: dict[str, str] | None = None,
+    elems_only: bool = False,
+    attrs_only: bool = False,
+    names: Sequence[str] | None = None,
+    # encoding can not be None for lxml and StringIO input
     encoding: str | None = "utf-8",
-    parser: str | None = "lxml",
+    parser: XMLParsers = "lxml",
     stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
     compression: CompressionOptions = "infer",
     storage_options: StorageOptions = None,

diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
@@ -729,6 +729,32 @@ def test_parser_consistency_with_encoding(datapath):
     tm.assert_frame_equal(df_lxml, df_etree)
 
 
+@td.skip_if_no("lxml")
+def test_wrong_encoding_for_lxml():
+    # GH#45133
+    data = """<data>
+  <row>
+    <a>c</a>
+  </row>
+</data>
+"""
+    with pytest.raises(TypeError, match="encoding None"):
+        read_xml(StringIO(data), parser="lxml", encoding=None)
+
+
+def test_none_encoding_etree():
+    # GH#45133
+    data = """<data>
+  <row>
+    <a>c</a>
+  </row>
+</data>
+"""
+    result = read_xml(StringIO(data), parser="etree", encoding=None)
+    expected = DataFrame({"a": ["c"]})
+    tm.assert_frame_equal(result, expected)
+
+
 # PARSER
 
 
@@ -769,6 +795,19 @@ def test_stylesheet_file(datapath):
     tm.assert_frame_equal(df_kml, df_style)
 
 
+def test_read_xml_passing_as_positional_deprecated(datapath, parser):
+    # GH#45133
+    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
+
+    with tm.assert_produces_warning(FutureWarning, match="keyword-only"):
+        read_xml(
+            kml,
+            ".//k:Placemark",
+            namespaces={"k": "http://www.opengis.net/kml/2.2"},
+            parser=parser,
+        )
+
+
 @td.skip_if_no("lxml")
 def test_stylesheet_file_like(datapath, mode):
     kml = datapath("io", "data", "xml", "cta_rail_lines.kml")