From 1074defbc37bf96e41cbb003769dae6af9ff8792 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 23 Nov 2022 16:44:15 -0500 Subject: [PATCH 1/6] ENH Improves error message for mixed types for feature names --- sklearn/utils/tests/test_validation.py | 5 +++-- sklearn/utils/validation.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index cc1ac47a42615..530bd08b20fec 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1675,8 +1675,9 @@ def test_get_feature_names_invalid_dtypes(names, dtypes): X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=names) msg = re.escape( - "Feature names only support names that are all strings. " - f"Got feature names with dtypes: {dtypes}." + "Feature names only support names that are all strings. Got feature names with" + f" dtypes: {dtypes}. Please convert to a common type, for example using" + " X.columns = X.columns.astype(str)" ) with pytest.raises(TypeError, match=msg): names = _get_feature_names(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index aeb3a8814be22..9b560bbf08037 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1884,8 +1884,9 @@ def _get_feature_names(X): # mixed type of string and non-string is not supported if len(types) > 1 and "str" in types: raise TypeError( - "Feature names only support names that are all strings. " - f"Got feature names with dtypes: {types}." + "Feature names only support names that are all strings. Got feature names" + f" with dtypes: {types}. Please convert to a common type, for example using" + " X.columns = X.columns.astype(str)" ) # Only feature names of all strings are supported From 51e447fe03a7603da480cc195ec2e88b0881e493 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 24 Nov 2022 22:08:54 -0500 Subject: [PATCH 2/6] CLN Improves error message --- sklearn/tests/test_base.py | 5 +++-- sklearn/utils/tests/test_validation.py | 6 +++--- sklearn/utils/validation.py | 7 ++++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 934bf7719163c..bc39881a2d70d 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -651,8 +651,9 @@ def transform(self, X): df_mixed = pd.DataFrame(X_np, columns=["a", "b", 1, 2]) trans = NoOpTransformer() msg = re.escape( - "Feature names only support names that are all strings. " - "Got feature names with dtypes: ['int', 'str']" + "Feature names only support column names that are all strings, but got dtypes:" + " ['int', 'str']. If you want support for feature names, convert the" + " columns to strings." ) with pytest.raises(TypeError, match=msg): trans.fit(df_mixed) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 530bd08b20fec..ea46c2c80c703 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1675,9 +1675,9 @@ def test_get_feature_names_invalid_dtypes(names, dtypes): X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=names) msg = re.escape( - "Feature names only support names that are all strings. Got feature names with" - f" dtypes: {dtypes}. Please convert to a common type, for example using" - " X.columns = X.columns.astype(str)" + "Feature names only support column names that are all strings, but got dtypes:" + f" {dtypes}. If you want support for feature names, convert the" + " columns to strings." 
) with pytest.raises(TypeError, match=msg): names = _get_feature_names(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 9b560bbf08037..2b896b3676e23 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1884,9 +1884,10 @@ def _get_feature_names(X): # mixed type of string and non-string is not supported if len(types) > 1 and "str" in types: raise TypeError( - "Feature names only support names that are all strings. Got feature names" - f" with dtypes: {types}. Please convert to a common type, for example using" - " X.columns = X.columns.astype(str)" + "Feature names only support column names that are all strings, but got" + f" dtypes: {types}. If you want support for feature names, convert the" + " columns to strings. For example, by using X.columns =" + " X.columns.astype(str)." ) # Only feature names of all strings are supported From ed509cfdc4c1a3cca49c40c2f166cfd0ee738b27 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 24 Nov 2022 22:09:40 -0500 Subject: [PATCH 3/6] CLN Use column names --- sklearn/tests/test_base.py | 2 +- sklearn/utils/tests/test_validation.py | 2 +- sklearn/utils/validation.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index bc39881a2d70d..5ef35dcdf5a18 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -653,7 +653,7 @@ def transform(self, X): msg = re.escape( "Feature names only support column names that are all strings, but got dtypes:" " ['int', 'str']. If you want support for feature names, convert the" - " columns to strings." + " column names to strings." 
) with pytest.raises(TypeError, match=msg): trans.fit(df_mixed) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index ea46c2c80c703..7786fdefef0ec 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1677,7 +1677,7 @@ def test_get_feature_names_invalid_dtypes(names, dtypes): msg = re.escape( "Feature names only support column names that are all strings, but got dtypes:" f" {dtypes}. If you want support for feature names, convert the" - " columns to strings." + " column names to strings." ) with pytest.raises(TypeError, match=msg): names = _get_feature_names(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2b896b3676e23..2418903b98751 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1886,7 +1886,7 @@ def _get_feature_names(X): raise TypeError( "Feature names only support column names that are all strings, but got" f" dtypes: {types}. If you want support for feature names, convert the" - " columns to strings. For example, by using X.columns =" + " column names to strings. For example, by using X.columns =" " X.columns.astype(str)." 
) From 97c7da61d2e31b5580645ec0d5d46e70aafb4429 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 25 Nov 2022 11:20:56 +0100 Subject: [PATCH 4/6] improve warning message --- sklearn/tests/test_base.py | 9 ++++++--- sklearn/utils/tests/test_validation.py | 9 ++++++--- sklearn/utils/validation.py | 10 ++++++---- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 5ef35dcdf5a18..2df274873bc53 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -651,9 +651,12 @@ def transform(self, X): df_mixed = pd.DataFrame(X_np, columns=["a", "b", 1, 2]) trans = NoOpTransformer() msg = re.escape( - "Feature names only support column names that are all strings, but got dtypes:" - " ['int', 'str']. If you want support for feature names, convert the" - " column names to strings." + "Feature names are only supported if all input features have string names, " + "but your input has ['int', 'str'] as feature name / column name types. " + "If you want feature names to be stored and validated, you must convert " + "them all to strings. If you'd like to silence this warning, you can " + "remove the feature / column names from your input data, or convert them " + "all to a non-string data type." ) with pytest.raises(TypeError, match=msg): trans.fit(df_mixed) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 7786fdefef0ec..4aa45fd3a4394 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1675,9 +1675,12 @@ def test_get_feature_names_invalid_dtypes(names, dtypes): X = pd.DataFrame([[1, 2], [4, 5], [5, 6]], columns=names) msg = re.escape( - "Feature names only support column names that are all strings, but got dtypes:" - f" {dtypes}. If you want support for feature names, convert the" - " column names to strings." 
+ "Feature names are only supported if all input features have string names, " + f"but your input has {dtypes} as feature name / column name types. " + "If you want feature names to be stored and validated, you must convert " + "them all to strings. If you'd like to silence this warning, you can " + "remove the feature / column names from your input data, or convert them " + "all to a non-string data type." ) with pytest.raises(TypeError, match=msg): names = _get_feature_names(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2418903b98751..311378657abbc 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1884,10 +1884,12 @@ def _get_feature_names(X): # mixed type of string and non-string is not supported if len(types) > 1 and "str" in types: raise TypeError( - "Feature names only support column names that are all strings, but got" - f" dtypes: {types}. If you want support for feature names, convert the" - " column names to strings. For example, by using X.columns =" - " X.columns.astype(str)." + "Feature names are only supported if all input features have string names, " + f"but your input has {types} as feature name / column name types. " + "If you want feature names to be stored and validated, you must convert " + "them all to strings. If you'd like to silence this warning, you can " + "remove the feature / column names from your input data, or convert them " + "all to a non-string data type." 
) # Only feature names of all strings are supported From db8d6859396e9c2f2a42ce8960e874c7f37966c8 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 25 Nov 2022 12:00:19 +0100 Subject: [PATCH 5/6] remove the warning part --- sklearn/tests/test_base.py | 5 ++--- sklearn/utils/tests/test_validation.py | 5 ++--- sklearn/utils/validation.py | 5 ++--- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 2df274873bc53..7715df9794e19 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -654,9 +654,8 @@ def transform(self, X): "Feature names are only supported if all input features have string names, " "but your input has ['int', 'str'] as feature name / column name types. " "If you want feature names to be stored and validated, you must convert " - "them all to strings. If you'd like to silence this warning, you can " - "remove the feature / column names from your input data, or convert them " - "all to a non-string data type." + "them all to strings, by using X.columns = X.columns.astype(str) for " + "example." ) with pytest.raises(TypeError, match=msg): trans.fit(df_mixed) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 4aa45fd3a4394..ed21c04f5886f 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1678,9 +1678,8 @@ def test_get_feature_names_invalid_dtypes(names, dtypes): "Feature names are only supported if all input features have string names, " f"but your input has {dtypes} as feature name / column name types. " "If you want feature names to be stored and validated, you must convert " - "them all to strings. If you'd like to silence this warning, you can " - "remove the feature / column names from your input data, or convert them " - "all to a non-string data type." + "them all to strings, by using X.columns = X.columns.astype(str) for " + "example." 
) with pytest.raises(TypeError, match=msg): names = _get_feature_names(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 311378657abbc..74e69f40a42f9 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1887,9 +1887,8 @@ def _get_feature_names(X): "Feature names are only supported if all input features have string names, " f"but your input has {types} as feature name / column name types. " "If you want feature names to be stored and validated, you must convert " - "them all to strings. If you'd like to silence this warning, you can " - "remove the feature / column names from your input data, or convert them " - "all to a non-string data type." + "them all to strings, by using X.columns = X.columns.astype(str) for " + "example." ) # Only feature names of all strings are supported From 77d44d5e52649d14ee3b417f8111767614807210 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 25 Nov 2022 12:57:24 +0100 Subject: [PATCH 6/6] roll back --- sklearn/tests/test_base.py | 3 ++- sklearn/utils/tests/test_validation.py | 3 ++- sklearn/utils/validation.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 7715df9794e19..a0e2f6fd1f273 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -655,7 +655,8 @@ def transform(self, X): "but your input has ['int', 'str'] as feature name / column name types. " "If you want feature names to be stored and validated, you must convert " "them all to strings, by using X.columns = X.columns.astype(str) for " - "example." + "example. Otherwise you can remove feature / column names from your input " + "data, or convert them all to a non-string data type." 
) with pytest.raises(TypeError, match=msg): trans.fit(df_mixed) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index ed21c04f5886f..78f26a988dc60 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1679,7 +1679,8 @@ def test_get_feature_names_invalid_dtypes(names, dtypes): f"but your input has {dtypes} as feature name / column name types. " "If you want feature names to be stored and validated, you must convert " "them all to strings, by using X.columns = X.columns.astype(str) for " - "example." + "example. Otherwise you can remove feature / column names from your input " + "data, or convert them all to a non-string data type." ) with pytest.raises(TypeError, match=msg): names = _get_feature_names(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 74e69f40a42f9..7de0fe200607b 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1888,7 +1888,8 @@ def _get_feature_names(X): f"but your input has {types} as feature name / column name types. " "If you want feature names to be stored and validated, you must convert " "them all to strings, by using X.columns = X.columns.astype(str) for " - "example." + "example. Otherwise you can remove feature / column names from your input " + "data, or convert them all to a non-string data type." ) # Only feature names of all strings are supported