Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "PERF: faster corrwith method for pearson and spearman correlation when other is a Series and axis = 0 (column-wise) (#46174)" #49140

Merged
merged 6 commits into from Oct 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.1.rst
Expand Up @@ -88,6 +88,7 @@ Fixed regressions
- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
- Fixed regression in :meth:`DataFrame.to_parquet` raising when file name was specified as ``bytes`` (:issue:`48944`)
- Fixed regression in :class:`ExcelWriter` where the ``book`` attribute could no longer be set; however setting this attribute is now deprecated and this ability will be removed in a future version of pandas (:issue:`48780`)
- Fixed regression in :meth:`DataFrame.corrwith` when computing correlation on tied data with ``method="spearman"`` (:issue:`48826`)

.. ---------------------------------------------------------------------------

Expand Down
34 changes: 1 addition & 33 deletions pandas/core/frame.py
Expand Up @@ -10577,40 +10577,8 @@ def corrwith(
if numeric_only is lib.no_default and len(this.columns) < len(self.columns):
com.deprecate_numeric_only_default(type(self), "corrwith")

# GH46174: when other is a Series object and axis=0, we achieve a speedup over
# passing .corr() to .apply() by taking the columns as ndarrays and iterating
# over the transposition row-wise. Then we delegate the correlation coefficient
# computation and null-masking to np.corrcoef and np.isnan respectively,
# which are much faster. We exploit the fact that the Spearman correlation
# of two vectors is equal to the Pearson correlation of their ranks to use
# substantially the same method for Pearson and Spearman,
# just with intermediate argsorts on the latter.
if isinstance(other, Series):
if axis == 0 and method in ["pearson", "spearman"]:
corrs = {}
if numeric_only:
cols = self.select_dtypes(include=np.number).columns
ndf = self[cols].values.transpose()
else:
cols = self.columns
ndf = self.values.transpose()
k = other.values
if method == "pearson":
for i, r in enumerate(ndf):
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[
0, 1
]
else:
for i, r in enumerate(ndf):
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
corrs[cols[i]] = np.corrcoef(
r[nonnull_mask].argsort().argsort(),
k[nonnull_mask].argsort().argsort(),
)[0, 1]
return Series(corrs)
else:
return this.apply(lambda x: other.corr(x, method=method), axis=axis)
return this.apply(lambda x: other.corr(x, method=method), axis=axis)

if numeric_only_bool:
other = other._get_numeric_data()
Expand Down
28 changes: 27 additions & 1 deletion pandas/tests/frame/methods/test_cov_corr.py
Expand Up @@ -355,7 +355,10 @@ def test_corrwith_mixed_dtypes(self, numeric_only):
expected = Series(data=corrs, index=["a", "b"])
tm.assert_series_equal(result, expected)
else:
with pytest.raises(TypeError, match="not supported for the input types"):
with pytest.raises(
TypeError,
match=r"unsupported operand type\(s\) for /: 'str' and 'int'",
):
df.corrwith(s, numeric_only=numeric_only)

def test_corrwith_index_intersection(self):
Expand Down Expand Up @@ -406,3 +409,26 @@ def test_corrwith_kendall(self):
result = df.corrwith(df**2, method="kendall")
expected = Series(np.ones(len(result)))
tm.assert_series_equal(result, expected)

# Skipped when scipy is unavailable (presumably because the spearman path
# needs scipy -- confirm against pandas' correlation implementation).
@td.skip_if_no_scipy
def test_corrwith_spearman_with_tied_data(self):
# GH#48826
# Regression test: DataFrame.corrwith with method="spearman" must give
# correct results when the data contain tied values.
df1 = DataFrame(
{
# Column A carries a NaN, so that row has to be excluded pairwise.
"A": [1, np.nan, 7, 8],
# Column B has only two distinct values, i.e. it is heavily tied.
"B": [False, True, True, False],
"C": [10, 4, 9, 3],
}
)
df2 = df1[["B", "C"]]
# Correlate every column of (df1 + 1) against the boolean Series B.
result = (df1 + 1).corrwith(df2.B, method="spearman")
# B + 1 is a monotonic transform of B, so its rank correlation with B is 1.0;
# A and C are expected to be rank-uncorrelated with B.
expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"])
tm.assert_series_equal(result, expected)

# Second scenario: all-boolean frame and Series. No ``method`` is passed,
# so the default correlation method of corrwith is exercised here.
df_bool = DataFrame(
{"A": [True, True, False, False], "B": [True, False, False, True]}
)
ser_bool = Series([True, True, False, True])
result = df_bool.corrwith(ser_bool)
# 0.57735 is 1/sqrt(3) rounded to five decimals; the comparison relies on
# assert_series_equal's tolerance (NOTE(review): default rtol assumed).
expected = Series([0.57735, 0.57735], index=["A", "B"])
tm.assert_series_equal(result, expected)