Commit
Merge remote-tracking branch 'upstream/master' into non-hashable-series-or-frame
MarcoGorelli authored and committed May 12, 2021
2 parents aefea78 + 4ec6925 commit 55a9dc8
Showing 109 changed files with 2,706 additions and 1,660 deletions.
17 changes: 11 additions & 6 deletions asv_bench/benchmarks/strings.py
@@ -230,16 +230,21 @@ def time_contains(self, dtype, regex):

 class Split:
 
-    params = [True, False]
-    param_names = ["expand"]
+    params = (["str", "string", "arrow_string"], [True, False])
+    param_names = ["dtype", "expand"]
 
-    def setup(self, expand):
-        self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--")
+    def setup(self, dtype, expand):
+        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
+
+        try:
+            self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("--")
+        except ImportError:
+            raise NotImplementedError
 
-    def time_split(self, expand):
+    def time_split(self, dtype, expand):
         self.s.str.split("--", expand=expand)
 
-    def time_rsplit(self, expand):
+    def time_rsplit(self, dtype, expand):
         self.s.str.rsplit("--", expand=expand)


29 changes: 29 additions & 0 deletions doc/source/ecosystem.rst
@@ -405,6 +405,35 @@ Blaze provides a standard API for doing computations with various
in-memory and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables,
PySpark.

`Cylon <https://cylondata.org/>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Cylon is a fast, scalable, distributed-memory parallel runtime with a
pandas-like Python DataFrame API. "Core Cylon" is implemented in C++ using the
Apache Arrow format to represent the data in-memory. The Cylon DataFrame API
implements most of the core operators of pandas, such as merge, filter, join,
concat, group-by, drop_duplicates, etc. These operators are designed to work
across thousands of cores to scale applications. Cylon can interoperate with
pandas DataFrames by reading data from pandas or converting data to pandas, so
users can selectively scale parts of their pandas DataFrame applications.

.. code:: python

    from pycylon import read_csv, DataFrame, CylonEnv
    from pycylon.net import MPIConfig

    # Initialize the Cylon distributed environment
    config: MPIConfig = MPIConfig()
    env: CylonEnv = CylonEnv(config=config, distributed=True)

    df1: DataFrame = read_csv('/tmp/csv1.csv')
    df2: DataFrame = read_csv('/tmp/csv2.csv')

    # Using 1000s of cores across the cluster to compute the join
    df3: DataFrame = df1.join(other=df2, on=[0], algorithm="hash", env=env)
    print(df3)

`Dask <https://dask.readthedocs.io/en/latest/>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

2 changes: 0 additions & 2 deletions doc/source/user_guide/basics.rst
@@ -1184,11 +1184,9 @@ a single value and returning a single value. For example:
df4

def f(x):
    return len(str(x))

df4["one"].map(f)
df4.applymap(f)
13 changes: 0 additions & 13 deletions doc/source/user_guide/cookbook.rst
@@ -494,15 +494,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
S = pd.Series([i / 100.0 for i in range(1, 11)])

def cum_ret(x, y):
    return x * (1 + y)

def red(x):
    return functools.reduce(cum_ret, x, 1.0)

S.expanding().apply(red, raw=True)
@@ -514,12 +511,10 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]})
gb = df.groupby("A")

def replace(g):
    mask = g < 0
    return g.where(mask, g[~mask].mean())

gb.transform(replace)
`Sort groups by aggregated data
@@ -551,13 +546,11 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
rng = pd.date_range(start="2014-10-07", periods=10, freq="2min")
ts = pd.Series(data=list(range(10)), index=rng)

def MyCust(x):
    if len(x) > 2:
        return x[1] * 1.234
    return pd.NaT

mhc = {"Mean": np.mean, "Max": np.max, "Custom": MyCust}
ts.resample("5min").apply(mhc)
ts
@@ -803,11 +796,9 @@ Apply
    index=["I", "II", "III"],
)

def SeriesFromSubList(aList):
    return pd.Series(aList)

df_orgz = pd.concat(
    {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}
)
@@ -827,12 +818,10 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc
)
df

def gm(df, const):
    v = ((((df["A"] + df["B"]) + 1).cumprod()) - 1) * const
    return v.iloc[-1]

s = pd.Series(
    {
        df.index[i]: gm(df.iloc[i: min(i + 51, len(df) - 1)], 5)
@@ -859,11 +848,9 @@ Rolling Apply to multiple columns where function returns a Scalar (Volume Weight
)
df

def vwap(bars):
    return (bars.Close * bars.Volume).sum() / bars.Volume.sum()

window = 5
s = pd.concat(
    [
2 changes: 0 additions & 2 deletions doc/source/user_guide/groupby.rst
@@ -1617,12 +1617,10 @@ column index name will be used as the name of the inserted column:
    }
)

def compute_metrics(x):
    result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()}
    return pd.Series(result, name="metrics")

result = df.groupby("a").apply(compute_metrics)
result
2 changes: 0 additions & 2 deletions doc/source/user_guide/io.rst
@@ -4648,11 +4648,9 @@ chunks.
store.append("dfeq", dfeq, data_columns=["number"])

def chunks(l, n):
    return [l[i: i + n] for i in range(0, len(l), n)]

evens = [2, 4, 6, 8, 10]
coordinates = store.select_as_coordinates("dfeq", "number=evens")
for c in chunks(coordinates, 2):
1 change: 1 addition & 0 deletions doc/source/user_guide/merging.rst
@@ -1578,4 +1578,5 @@ to ``True``.
You may also keep all the original values even if they are equal.

.. ipython:: python

    df.compare(df2, keep_shape=True, keep_equal=True)

2 changes: 0 additions & 2 deletions doc/source/user_guide/reshaping.rst
@@ -18,7 +18,6 @@ Reshaping by pivoting DataFrame objects
import pandas._testing as tm

def unpivot(frame):
    N, K = frame.shape
    data = {
@@ -29,7 +28,6 @@ Reshaping by pivoting DataFrame objects
    columns = ["date", "variable", "value"]
    return pd.DataFrame(data, columns=columns)

df = unpivot(tm.makeTimeDataFrame(3))
Data is often stored in so-called "stacked" or "record" format:
1 change: 1 addition & 0 deletions doc/source/user_guide/scale.rst
@@ -345,6 +345,7 @@ we need to supply the divisions manually.
Now we can do things like fast random access with ``.loc``.

.. ipython:: python
    :okwarning:

    ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute()
1 change: 0 additions & 1 deletion doc/source/user_guide/sparse.rst
@@ -325,7 +325,6 @@ In the example below, we transform the ``Series`` to a sparse representation of
    row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
)

A
A.todense()
rows
5 changes: 0 additions & 5 deletions doc/source/user_guide/text.rst
@@ -297,24 +297,19 @@ positional argument (a regex object) and return a string.
# Reverse every lowercase alphabetic word
pat = r"[a-z]+"

def repl(m):
    return m.group(0)[::-1]

pd.Series(["foo 123", "bar baz", np.nan], dtype="string").str.replace(
    pat, repl, regex=True
)

# Using regex groups
pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"

def repl(m):
    return m.group("two").swapcase()

pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace(
    pat, repl, regex=True
)
6 changes: 0 additions & 6 deletions doc/source/user_guide/timeseries.rst
@@ -1422,7 +1422,6 @@ An example of how holidays and holiday calendars are defined:
    MO,
)

class ExampleCalendar(AbstractHolidayCalendar):
    rules = [
        USMemorialDay,
@@ -1435,7 +1434,6 @@ An example of how holidays and holiday calendars are defined:
        ),
    ]

cal = ExampleCalendar()
cal.holidays(datetime.datetime(2012, 1, 1), datetime.datetime(2012, 12, 31))
@@ -1707,13 +1705,11 @@ We can instead only resample those groups where we have points as follows:
from functools import partial
from pandas.tseries.frequencies import to_offset

def round(t, freq):
    # round a Timestamp to a specified freq
    freq = to_offset(freq)
    return pd.Timestamp((t.value // freq.delta.value) * freq.delta.value)

ts.groupby(partial(round, freq="3T")).sum()
.. _timeseries.aggregate:
@@ -2255,11 +2251,9 @@ To convert from an ``int64`` based YYYYMMDD representation.
s = pd.Series([20121231, 20141130, 99991231])
s

def conv(x):
    return pd.Period(year=x // 10000, month=x // 100 % 100, day=x % 100, freq="D")

s.apply(conv)
s.apply(conv)[2]
5 changes: 1 addition & 4 deletions doc/source/user_guide/window.rst
@@ -212,7 +212,6 @@ from present information back to past information. This allows the rolling windo
df
.. _window.custom_rolling_window:

Custom window rolling
@@ -294,13 +293,12 @@ conditions. In these cases it can be useful to perform forward-looking rolling w
This :func:`BaseIndexer <pandas.api.indexers.BaseIndexer>` subclass implements a closed fixed-width
forward-looking rolling window, and we can use it as follows:

-.. ipython:: ipython
+.. ipython:: python
 
     from pandas.api.indexers import FixedForwardWindowIndexer
 
     indexer = FixedForwardWindowIndexer(window_size=2)
     df.rolling(indexer, min_periods=1).sum()

.. _window.rolling_apply:

Rolling apply
@@ -319,7 +317,6 @@ the windows are cast as :class:`Series` objects (``raw=False``) or ndarray objec
s = pd.Series(range(10))
s.rolling(window=4).apply(mad, raw=True)
.. _window.numba_engine:

Numba engine
11 changes: 10 additions & 1 deletion doc/source/whatsnew/v1.3.0.rst
@@ -226,6 +226,7 @@ Other enhancements
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`); see the sketch below
-

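A minimal sketch of the new ``dropna`` keyword, using illustrative data (not taken from the source):

.. code:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, None], "b": [2, 2, np.nan]})
    # By default (dropna=True), rows containing NA are excluded from the counts
    df.value_counts()
    # dropna=False also counts rows that include NA values
    df.value_counts(dropna=False)
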
.. ---------------------------------------------------------------------------
@@ -644,6 +645,7 @@ Deprecations
- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`)
- The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` is deprecated and will be removed in a future version (:issue:`37643`)
- Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`); sketched below
- Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`)

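A minimal sketch of the deprecated ``suffixes`` behaviour, with hypothetical frames chosen so that a suffixed name collides with an existing column:

.. code:: python

    import pandas as pd

    left = pd.DataFrame({"a": [1, 2], "b": [3, 4], "b_y": [5, 6]})
    right = pd.DataFrame({"a": [1, 2], "b": [7, 8]})
    # The overlapping column "b" is renamed to "b_x"/"b_y", but "b_y"
    # already exists in `left`, so the result has duplicated columns;
    # this now emits a deprecation warning.
    pd.merge(left, right, on="a", suffixes=("_x", "_y"))
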
.. ---------------------------------------------------------------------------
@@ -748,7 +750,7 @@ Strings
^^^^^^^

- Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`)
-
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` ignoring replacements with ``regex=True`` for ``StringDtype`` data (:issue:`41333`, :issue:`35977`); illustrated below

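A minimal sketch of the ``regex=True`` fix for ``StringDtype`` data, using illustrative values:

.. code:: python

    import pandas as pd

    s = pd.Series(["apple", "banana"], dtype="string")
    # Regex replacements were previously ignored for StringDtype data;
    # this now replaces the matching prefix, giving ["APple", "banana"].
    s.replace(r"^ap", "AP", regex=True)
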
Interval
^^^^^^^^
@@ -788,9 +790,11 @@ Indexing
- Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`)
- Bug in setting numeric values into a boolean-dtype :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`)
- Bug in :meth:`DataFrame.__setitem__` and :meth:`DataFrame.iloc.__setitem__` raising ``ValueError`` when trying to index with a row-slice and setting a list as values (:issue:`40440`)
- Bug in :meth:`DataFrame.loc` not raising ``KeyError`` when key was not found in :class:`MultiIndex` when levels contain more values than used (:issue:`41170`)
- Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`)
- Bug in :meth:`DataFrame.loc` incorrectly matching non-boolean index elements (:issue:`20432`)
- Bug in :meth:`Series.__delitem__` with ``ExtensionDtype`` incorrectly casting to ``ndarray`` (:issue:`40386`)
- Bug in :meth:`DataFrame.__setitem__` raising ``TypeError`` when using a str subclass as the column name with a :class:`DatetimeIndex` (:issue:`37366`)

Missing
^^^^^^^
@@ -807,6 +811,7 @@ MultiIndex
- Bug in :meth:`MultiIndex.intersection` duplicating ``NaN`` in result (:issue:`38623`)
- Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when :class:`MultiIndex` containing ``NaN`` even when they are differently ordered (:issue:`38439`)
- Bug in :meth:`MultiIndex.intersection` always returning empty when intersecting with :class:`CategoricalIndex` (:issue:`38653`)
- Bug in :meth:`MultiIndex.reindex` raising ``ValueError`` with empty MultiIndex and indexing only a specific level (:issue:`41170`); a sketch follows below

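A minimal sketch of the :meth:`MultiIndex.reindex` fix, with a hypothetical empty index (the exact trigger may differ from the reported case):

.. code:: python

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([[], []], names=["a", "b"])
    # Reindexing only a specific level of an empty MultiIndex
    # previously raised ValueError; it now succeeds.
    mi.reindex([1, 2, 3], level="a")
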
I/O
^^^
@@ -836,6 +841,7 @@ I/O
- Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`)
- Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
- Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`)
- Bug in :func:`read_csv` and :func:`read_excel` not respecting dtype for duplicated column name when ``mangle_dupe_cols`` is set to ``True`` (:issue:`35211`)
- Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`)
- Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`)

@@ -852,6 +858,7 @@ Plotting
- Prevent warnings when matplotlib's ``constrained_layout`` is enabled (:issue:`25261`)
- Bug in :func:`DataFrame.plot` was showing the wrong colors in the legend if the function was called repeatedly and some calls used ``yerr`` while others didn't (partial fix of :issue:`39522`)
- Bug in :func:`DataFrame.plot` was showing the wrong colors in the legend if the function was called repeatedly and some calls used ``secondary_y`` and others use ``legend=False`` (:issue:`40044`)
- Bug in :meth:`DataFrame.plot.box` where, when the ``dark_background`` theme was selected, caps or min/max markers for the plot were not visible (:issue:`40769`)


Groupby/resample/rolling
@@ -893,6 +900,8 @@ Groupby/resample/rolling
- Bug in :meth:`SeriesGroupBy.agg` failing to retain ordered :class:`CategoricalDtype` on order-preserving aggregations (:issue:`41147`)
- Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:`41111`)
- Bug in :meth:`DataFrameGroupBy.rank` with the GroupBy object's ``axis=0`` and the ``rank`` method's keyword ``axis=1`` (:issue:`41320`)
- Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`)
- Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`); a sketch of both ``GroupBy`` fixes follows below

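A minimal sketch of the two non-unique-column fixes, with a hypothetical frame containing a duplicated column label:

.. code:: python

    import pandas as pd

    df = pd.DataFrame([[1, 2, 3], [1, 4, 5]], columns=["a", "b", "b"])
    gb = df.groupby("a")
    # Selecting a duplicated column previously returned a malformed
    # SeriesGroupBy; it now returns a DataFrameGroupBy.
    gb["b"].sum()
    # transform previously raised AttributeError with non-unique columns.
    gb.transform("sum")
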
Reshaping
^^^^^^^^^
