Issue with GridSearchCV and "set_output" API for Pandas format #25528

s1sriniv · 2023-02-01T20:49:15Z

s1sriniv
Feb 1, 2023

I have an example Pipeline that works just fine with the "set_output" API when I fit it on its own. However, when I pass this Pipeline object to "GridSearchCV", the output is no longer being preserved as a Pandas dataframe, causing bugs within the pipeline.

Here is some example code (note the custom Transformer that is causing the bug):

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Seed NumPy's global RNG so the random data generated further down is reproducible.
np.random.seed(0)

from sklearn import set_config
# Leave the global transform-output setting at "default"; pandas output is
# requested only on the pipeline itself via set_output() further down.
set_config(transform_output="default")

class CustomMissingImputer(BaseEstimator, TransformerMixin):
    """Impute missing values with a ``SimpleImputer`` after normalizing placeholders.

    Every value listed in ``missing_value_placeholders`` is replaced by
    ``np.nan`` before the data is handed to a ``SimpleImputer`` built with
    ``strategy``, so several different placeholders can stand for "missing".

    Note: per the original author, ``np.nan`` is treated as a missing value
    even when it is not in the placeholder list (such cells are simply left
    unchanged by the replacement step).

    ``fit`` and ``transform`` call ``X.copy()`` and ``df.isin(...)``, and
    ``transform`` reads ``.index`` / ``.isna()`` on its result, so the class
    is written for pandas DataFrame input and output.
    """
    def __init__(self, missing_value_placeholders, strategy):
      """Store the configuration and initialize the internal state.

      Parameters
      ----------
      missing_value_placeholders : list
          Values to be treated as missing; must be a ``list`` (asserted below).
      strategy : str
          Passed unchanged to ``SimpleImputer(strategy=...)`` in ``fit``.
      """
      self.missing_value_placeholders = missing_value_placeholders
      self.strategy = strategy
      # Filled in by fit() with the fitted SimpleImputer.
      self.simple_imputer_obj = None
      # Filled in by set_output(); read back in transform().
      # NOTE(review): this attribute is not a constructor parameter, so it starts
      # out as None on every new instance. Presumably GridSearchCV fits clones
      # that are rebuilt through __init__ (confirm against sklearn.base.clone),
      # which would explain the "VAL FROM WITHIN transform: None" lines in the
      # output reported after this snippet.
      self.set_output_api_val = None

      # Type check done with a plain assert, so it is skipped under `python -O`.
      assert(type(missing_value_placeholders) is list)

    def fit(self, X, y=None):
      """Fit a ``SimpleImputer`` on a copy of ``X`` with placeholders set to NaN.

      ``y`` is accepted but not used. Returns ``self``.
      """
      # Work on a copy so the caller's X is not modified by the replacement below.
      df = X.copy()
      missing_value_placeholders = self.missing_value_placeholders
      strategy = self.strategy

      # Turn every placeholder into NaN so the imputer sees them as missing.
      df[df.isin(missing_value_placeholders)] = np.nan
      simple_imputer = SimpleImputer(strategy=strategy)
      simple_imputer.fit(df)
      self.simple_imputer_obj = simple_imputer
      return(self)

    def transform(self, X, y=None):
      """Impute a copy of ``X`` with the imputer stored by ``fit``.

      ``y`` is accepted but not used. Returns whatever the inner imputer's
      ``transform`` returns, after asserting that its index matches ``X`` and
      that it contains no nulls.
      """
      # Work on a copy so the caller's X is not modified by the replacement below.
      df = X.copy()
      missing_value_placeholders = self.missing_value_placeholders
      simple_imputer_obj = self.simple_imputer_obj

      # Debug trace of the value recorded by set_output().
      print(f'VAL FROM WITHIN transform: {self.set_output_api_val}')
      set_output_api_val = self.set_output_api_val

      # Same placeholder -> NaN normalization as in fit().
      df[df.isin(missing_value_placeholders)] = np.nan
      # Forward the recorded output setting to the inner imputer before transforming.
      features = simple_imputer_obj.set_output(transform=set_output_api_val).transform(df)

      # Asserting that indices of input & output match
      # (this needs `features` to have an `.index`; with a NumPy array it raises
      # the AttributeError shown in the traceback quoted after this snippet)
      assert((X.index != features.index).sum() == 0)

      # Asserting that no nulls are left
      assert(features.isna().sum().sum() == 0)

      return(features)

    def set_output(self, transform=None):
      """Record the requested output setting on the instance and return ``self``.

      The value is stored as-is (including ``None``) in ``set_output_api_val``
      and echoed with a debug print.
      """
      self.set_output_api_val = transform
      print(f'VAL FROM WITHIN set_output: {self.set_output_api_val}')

      return(self)

    
# Two-step pipeline: the custom imputer followed by a gradient-boosting regressor.
test_pipeline = Pipeline([
  ('transform', CustomMissingImputer(missing_value_placeholders=[np.nan], strategy='mean')),
  ('model', GradientBoostingRegressor())
])

# Request pandas output on the pipeline; the single
# "VAL FROM WITHIN set_output: pandas" line in the reported output comes from here.
test_pipeline = test_pipeline.set_output(transform='pandas')

# Synthetic data: 500 random normal draws reshaped to 100 rows x 5 columns, plus 100 targets.
X_df = pd.DataFrame(np.random.randn(500).reshape(100,5))
y = pd.Series(np.random.randn(100))

# Only the regressor's n_estimators is searched (two candidate values).
param_grid = {
  'model__n_estimators':[5,10]
}

# 2 candidates x 2 CV folds = the "4 fits" mentioned in the error report that follows.
gscv = GridSearchCV(test_pipeline, param_grid, scoring='neg_mean_squared_error', cv=2)
gscv.fit(X=X_df, y=y)

This is the print output (suggesting that my configuration from ".set_output()" was not being stored at ".transform()" time):

VAL FROM WITHIN set_output: pandas
VAL FROM WITHIN transform: None
VAL FROM WITHIN transform: None
VAL FROM WITHIN transform: None
VAL FROM WITHIN transform: None

Here is a truncated traceback (confirming that the final output in ".transform()" was a Numpy array rather than a Pandas dataframe):

4 fits failed with the following error:
Traceback (most recent call last):
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-eecc0cc4-c2cb-41a7-9c7c-e8a18df18257/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/databricks/python_shell/dbruntime/MLWorkloadsInstrumentation/_sklearn.py", line 29, in patch_function
original_result = original(self, *args, **kwargs)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-eecc0cc4-c2cb-41a7-9c7c-e8a18df18257/lib/python3.9/site-packages/sklearn/pipeline.py", line 402, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-eecc0cc4-c2cb-41a7-9c7c-e8a18df18257/lib/python3.9/site-packages/sklearn/pipeline.py", line 360, in _fit
X, fitted_transformer = fit_transform_one_cached(
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-eecc0cc4-c2cb-41a7-9c7c-e8a18df18257/lib/python3.9/site-packages/joblib/memory.py", line 349, in call
return self.func(*args, **kwargs)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-eecc0cc4-c2cb-41a7-9c7c-e8a18df18257/lib/python3.9/site-packages/sklearn/pipeline.py", line 894, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-eecc0cc4-c2cb-41a7-9c7c-e8a18df18257/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 142, in wrapped
data_to_wrap = f(self, X, *args, **kwargs)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-eecc0cc4-c2cb-41a7-9c7c-e8a18df18257/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 142, in wrapped
data_to_wrap = f(self, X, *args, **kwargs)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-eecc0cc4-c2cb-41a7-9c7c-e8a18df18257/lib/python3.9/site-packages/sklearn/base.py", line 851, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-eecc0cc4-c2cb-41a7-9c7c-e8a18df18257/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 142, in wrapped
data_to_wrap = f(self, X, *args, **kwargs)
File "", line 49, in transform
assert((X.index != features.index).sum() == 0)
AttributeError: 'numpy.ndarray' object has no attribute 'index'

Interestingly, I don't get the error when I set the output to Pandas using the global "set_config()" function.

My question is why does my code work just fine with the standalone Pipeline, but not when I pass this Pipeline to a GridSearchCV object?

thomasjpfan · 2023-02-02T17:09:46Z

thomasjpfan
Feb 2, 2023
Maintainer

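We do not really have great developer support for defining a custom set_output or getting the current output configuration. GridSearchCV fits clones of the pipeline rather than the object you called set_output on; a clone is rebuilt from the constructor parameters, so a value stored in an extra attribute such as set_output_api_val falls back to the None assigned in __init__, whereas the _sklearn_output_config attribute used by the built-in set_output is copied over to the clone. If you want to get it working, you can copy the implementation: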

scikit-learn/sklearn/utils/_set_output.py

Lines 231 to 238 in 681e8e2

    
        if transform is None:
            return self

        if not hasattr(self, "_sklearn_output_config"):
            self._sklearn_output_config = {}

        self._sklearn_output_config["transform"] = transform
        return self

Concretely, in your case:

    def transform(self, X, y=None):
      ...
      # Get the configuration for estimator
      config = getattr(self, "_sklearn_output_config", {})
      set_output_api_val = config.get("transform", None)
            
      df[df.isin(missing_value_placeholders)] = np.nan
      features = simple_imputer_obj.set_output(transform=set_output_api_val).transform(df)
      ...

    def set_output(self, transform=None):
      """Record the requested transform output container on the estimator.

      A ``transform`` of ``None`` leaves any existing configuration untouched.
      Otherwise the value is stored under the ``"transform"`` key of
      ``self._sklearn_output_config``, creating that dict on first use.
      Always returns ``self`` so calls can be chained.
      """
      if transform is not None:
        try:
          output_config = self._sklearn_output_config
        except AttributeError:
          # First call on this instance: start with an empty configuration.
          output_config = self._sklearn_output_config = {}
        output_config["transform"] = transform
      return self

Another way to do it is to define get_feature_names_out instead of a custom set_output; scikit-learn then defines set_output for you:

    def transform(...):
        # same as above

    def get_feature_names_out(self, input_features=None):
      """Return the output feature names reported by the wrapped imputer.

      ``input_features`` is passed through unchanged to
      ``self.simple_imputer_obj.get_feature_names_out``.
      """
      fitted_imputer = self.simple_imputer_obj
      return fitted_imputer.get_feature_names_out(input_features)

Edit: If CustomMissingImputer is pandas specific and does not require NumPy output at all, then I would consider setting simple_imputer_obj.set_output(transform="pandas") all the time. This way CustomMissingImputer.set_output is not required. Scikit-learn will work correctly with a transformer that always outputs a pandas DataFrame.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Issue with GridSearchCV and "set_output" API for Pandas format #25528

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 1 comment

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Select a reply

Issue with GridSearchCV and "set_output" API for Pandas format #25528

s1sriniv Feb 1, 2023

Replies: 1 comment

thomasjpfan Feb 2, 2023 Maintainer

s1sriniv
Feb 1, 2023

thomasjpfan
Feb 2, 2023
Maintainer