From 08fa73f683c4a57aa7fe605b448c55c393d9d81a Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 10 Oct 2022 13:31:42 -0500 Subject: [PATCH 1/6] Update frame.py --- doc/source/whatsnew/v1.5.1.rst | 1 + pandas/core/frame.py | 28 +++++++++++------ pandas/tests/frame/methods/test_cov_corr.py | 35 ++++++++++++++++++++- 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index 4518c6f544e48..d3a804ff9f400 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -88,6 +88,7 @@ Fixed regressions - Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`) - Fixed regression in :meth:`DataFrame.to_parquet` raising when file name was specified as ``bytes`` (:issue:`48944`) - Fixed regression in :class:`ExcelWriter` where the ``book`` attribute could no longer be set; however setting this attribute is now deprecated and this ability will be removed in a future version of pandas (:issue:`48780`) +- Fixed regression in :meth:`DataFrame.corrwith` when computing correlation on tied data with ``method="spearman"`` (:issue:`48826`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8b6235374bed0..0f2619dd2122f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -161,6 +161,7 @@ from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( + BaseMaskedArray, DatetimeArray, ExtensionArray, PeriodArray, @@ -10590,23 +10591,30 @@ def corrwith( corrs = {} if numeric_only: cols = self.select_dtypes(include=np.number).columns - ndf = self[cols].values.transpose() else: cols = self.columns - ndf = self.values.transpose() k = other.values + k_mask = ~other.isna() + if isinstance(k, BaseMaskedArray): + k = k._data if method == "pearson": - for i, r in enumerate(ndf): - nonnull_mask = ~np.isnan(r) & ~np.isnan(k) - corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[ + for col in cols: + val = self[col].values + nonnull_mask = ~self[col].isna() & k_mask + if isinstance(val, BaseMaskedArray): + val = val._data + corrs[col] = np.corrcoef(val[nonnull_mask], k[nonnull_mask])[ 0, 1 ] else: - for i, r in enumerate(ndf): - nonnull_mask = ~np.isnan(r) & ~np.isnan(k) - corrs[cols[i]] = np.corrcoef( - r[nonnull_mask].argsort().argsort(), - k[nonnull_mask].argsort().argsort(), + for col in cols: + val = self[col].values + nonnull_mask = ~self[col].isna() & k_mask + if isinstance(val, BaseMaskedArray): + val = val._data + corrs[col] = np.corrcoef( + libalgos.rank_1d(val[nonnull_mask]), + libalgos.rank_1d(k[nonnull_mask]), )[0, 1] return Series(corrs) else: diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index ee9af3f436943..2d3cc6ff815cf 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -355,7 +355,10 @@ def test_corrwith_mixed_dtypes(self, numeric_only): expected = Series(data=corrs, index=["a", "b"]) tm.assert_series_equal(result, expected) else: - with pytest.raises(TypeError, match="not supported for the input types"): + with pytest.raises( + TypeError, + match=r"unsupported operand type\(s\) for /: 'str' and 'int'", + ): df.corrwith(s, numeric_only=numeric_only) def test_corrwith_index_intersection(self): @@ -406,3 +409,33 @@ def test_corrwith_kendall(self): result = df.corrwith(df**2, method="kendall") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) + + def test_corrwith_spearman_with_tied_data(self): + # GH#21925 + df = DataFrame( + { + "A": [2, 5, 8, 9], + "B": [2, np.nan, 8, 9], + "C": Series([2, np.nan, 8, 9], dtype="Int64"), + "D": [0, 1, 1, 0], + "E": [0, np.nan, 1, 0], + "F": Series([0, np.nan, 1, 0], dtype="Float64"), + "G": [False, True, True, False], + "H": Series([False, pd.NA, True, False], dtype="boolean"), + }, + ) + ser_list = [ + Series([0, 1, 1, 0]), + Series([0.0, 1.0, 1.0, 0.0]), + Series([False, True, True, False]), + Series([0, pd.NA, 1, 0], dtype="Int64"), + Series([0, pd.NA, 1, 0], dtype="Float64"), + Series([False, pd.NA, True, False], dtype="boolean"), + ] + expected = Series( + [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0], + index=["A", "B", "C", "D", "E", "F", "G", "H"], + ) + for ser in ser_list: + result = df.corrwith(ser, method="spearman", numeric_only=False) + tm.assert_series_equal(result, expected) From 7a219127ff6e47f9f1feb105dcfa456063d82f7d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 17 Oct 2022 09:21:27 +0100 Subject: [PATCH 2/6] Revert "PERF: faster corrwith method for pearson and spearman correlation when other is a Series and axis = 0 (column-wise) (#46174)" This reverts commit 5efb570ec3de616dfeb036d0ee622275955b7888. --- pandas/core/frame.py | 42 +----------------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0f2619dd2122f..95de613fd2752 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -161,7 +161,6 @@ from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( - BaseMaskedArray, DatetimeArray, ExtensionArray, PeriodArray, @@ -10578,47 +10577,8 @@ def corrwith( if numeric_only is lib.no_default and len(this.columns) < len(self.columns): com.deprecate_numeric_only_default(type(self), "corrwith") - # GH46174: when other is a Series object and axis=0, we achieve a speedup over - # passing .corr() to .apply() by taking the columns as ndarrays and iterating - # over the transposition row-wise. Then we delegate the correlation coefficient - # computation and null-masking to np.corrcoef and np.isnan respectively, - # which are much faster. We exploit the fact that the Spearman correlation - # of two vectors is equal to the Pearson correlation of their ranks to use - # substantially the same method for Pearson and Spearman, - # just with intermediate argsorts on the latter. if isinstance(other, Series): - if axis == 0 and method in ["pearson", "spearman"]: - corrs = {} - if numeric_only: - cols = self.select_dtypes(include=np.number).columns - else: - cols = self.columns - k = other.values - k_mask = ~other.isna() - if isinstance(k, BaseMaskedArray): - k = k._data - if method == "pearson": - for col in cols: - val = self[col].values - nonnull_mask = ~self[col].isna() & k_mask - if isinstance(val, BaseMaskedArray): - val = val._data - corrs[col] = np.corrcoef(val[nonnull_mask], k[nonnull_mask])[ - 0, 1 - ] - else: - for col in cols: - val = self[col].values - nonnull_mask = ~self[col].isna() & k_mask - if isinstance(val, BaseMaskedArray): - val = val._data - corrs[col] = np.corrcoef( - libalgos.rank_1d(val[nonnull_mask]), - libalgos.rank_1d(k[nonnull_mask]), - )[0, 1] - return Series(corrs) - else: - return this.apply(lambda x: other.corr(x, method=method), axis=axis) + return this.apply(lambda x: other.corr(x, method=method), axis=axis) if numeric_only_bool: other = other._get_numeric_data() From f60a87b1a8bd7abdc8487e857989b2ad2a8beb07 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 17 Oct 2022 09:27:42 +0100 Subject: [PATCH 3/6] fix GH issue number in test --- pandas/tests/frame/methods/test_cov_corr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 2d3cc6ff815cf..ac9066e06c913 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -411,7 +411,7 @@ def test_corrwith_kendall(self): tm.assert_series_equal(result, expected) def test_corrwith_spearman_with_tied_data(self): - # GH#21925 + # GH#48826 df = DataFrame( { "A": [2, 5, 8, 9], From b6bd6178f3b1628af093c347750ea849476f450c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 17 Oct 2022 09:31:59 +0100 Subject: [PATCH 4/6] add test from original issue --- pandas/tests/frame/methods/test_cov_corr.py | 34 ++++++--------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index ac9066e06c913..d8372e74e682a 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -412,30 +412,14 @@ def test_corrwith_kendall(self): def test_corrwith_spearman_with_tied_data(self): # GH#48826 - df = DataFrame( + df1 = DataFrame( { - "A": [2, 5, 8, 9], - "B": [2, np.nan, 8, 9], - "C": Series([2, np.nan, 8, 9], dtype="Int64"), - "D": [0, 1, 1, 0], - "E": [0, np.nan, 1, 0], - "F": Series([0, np.nan, 1, 0], dtype="Float64"), - "G": [False, True, True, False], - "H": Series([False, pd.NA, True, False], dtype="boolean"), - }, - ) - ser_list = [ - Series([0, 1, 1, 0]), - Series([0.0, 1.0, 1.0, 0.0]), - Series([False, True, True, False]), - Series([0, pd.NA, 1, 0], dtype="Int64"), - Series([0, pd.NA, 1, 0], dtype="Float64"), - Series([False, pd.NA, True, False], dtype="boolean"), - ] - expected = Series( - [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0], - index=["A", "B", "C", "D", "E", "F", "G", "H"], + "A": [1, np.nan, 7, 8], + "B": [False, True, True, False], + "C": [10, 4, 9, 3], + } ) - for ser in ser_list: - result = df.corrwith(ser, method="spearman", numeric_only=False) - tm.assert_series_equal(result, expected) + df2 = df1[["B", "C"]] + result = (df1 + 1).corrwith(df2.B, method="spearman") + expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) From 5bb438dc6c099451b427ff3457f288ec5c0f715a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 17 Oct 2022 10:25:18 +0100 Subject: [PATCH 5/6] skip if no scipy --- pandas/tests/frame/methods/test_cov_corr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index d8372e74e682a..a5070aef0fa6b 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -410,6 +410,7 @@ def test_corrwith_kendall(self): expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) + @td.skip_if_no_scipy def test_corrwith_spearman_with_tied_data(self): # GH#48826 df1 = DataFrame( From 3f33d3aef511e54eed93d6a5b4e36de92e70af61 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 17 Oct 2022 10:29:27 +0100 Subject: [PATCH 6/6] add extra test case --- pandas/tests/frame/methods/test_cov_corr.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index a5070aef0fa6b..25ef49718fbe7 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -424,3 +424,11 @@ def test_corrwith_spearman_with_tied_data(self): result = (df1 + 1).corrwith(df2.B, method="spearman") expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) + + df_bool = DataFrame( + {"A": [True, True, False, False], "B": [True, False, False, True]} + ) + ser_bool = Series([True, True, False, True]) + result = df_bool.corrwith(ser_bool) + expected = Series([0.57735, 0.57735], index=["A", "B"]) + tm.assert_series_equal(result, expected)