
[pyspark] Add param validation for "objective" and "eval_metric" param, and remove invalid booster params #8173

Merged · 12 commits merged on Aug 24, 2022
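In short, after this change only plain-string values are accepted for these params; a hypothetical usage sketch (the check runs when params are validated, e.g. at fit time):

from xgboost.spark import SparkXGBRegressor

SparkXGBRegressor(objective="reg:squarederror", eval_metric="rmse")  # fine: both are strings
SparkXGBRegressor(eval_metric=["rmse", "mae"])  # rejected with ValueError once params are validated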
40 changes: 30 additions & 10 deletions python-package/xgboost/spark/core.py
@@ -101,10 +101,10 @@

_unsupported_fit_params = {
    "sample_weight",  # Supported by spark param weightCol
-   # Supported by spark param weightCol # and validationIndicatorCol
-   "eval_set",
-   "sample_weight_eval_set",
+   "eval_set",  # Supported by spark param validation_indicator_col
+   "sample_weight_eval_set",  # Supported by spark param weight_col + validation_indicator_col
    "base_margin",  # Supported by spark param base_margin_col
+   "base_margin_eval_set",  # Supported by spark param base_margin_col + validation_indicator_col
}
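For reference, the spark params named in these comments replace the unsupported sklearn fit kwargs; a hypothetical constructor call (column names are illustrative) could look like:

from xgboost.spark import SparkXGBClassifier

clf = SparkXGBClassifier(
    weight_col="weight",                       # instead of sample_weight / sample_weight_eval_set
    validation_indicator_col="is_validation",  # instead of eval_set
    base_margin_col="margin",                  # instead of base_margin / base_margin_eval_set
)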

_unsupported_predict_params = {
@@ -267,6 +267,18 @@ def _validate_params(self):
"If features_cols param set, then features_col param is ignored."
)

if self.getOrDefault(self.objective) is not None:
wbo4958 (Contributor) — Aug 23, 2022:

It does not make sense (to me) to check the type of a parameter that already has a typeConverter.

PR author (Contributor):

The "objective" param is automatically generated; currently it does not have a type converter.

Contributor:

I still hope we can find a common solution for the type converter in a follow-up PR.
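For context, the common solution discussed here would be to declare the param with a pyspark typeConverter, which rejects or coerces non-string values up front; a minimal sketch using the standard pyspark.ml.param API (not code from this PR):

from pyspark.ml.param import Param, Params, TypeConverters

objective = Param(
    Params._dummy(),
    "objective",
    "Specify the learning task and the corresponding learning objective.",
    typeConverter=TypeConverters.toString,
)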

            if not isinstance(self.getOrDefault(self.objective), str):
                raise ValueError(
                    "Only string type 'objective' param is allowed."
                )

        if self.getOrDefault(self.eval_metric) is not None:
            if not isinstance(self.getOrDefault(self.eval_metric), str):
                raise ValueError(
                    "Only string type 'eval_metric' param is allowed."
                )

        if self.getOrDefault(self.use_gpu):
            tree_method = self.getParam("tree_method")
            if (
@@ -492,14 +504,17 @@ def _get_distributed_train_params(self, dataset):
        params.update(fit_params)
        params["verbose_eval"] = verbose_eval
        classification = self._xgb_cls() == XGBClassifier
-       num_classes = int(dataset.select(countDistinct(alias.label)).collect()[0][0])
-       if classification and num_classes == 2:
-           params["objective"] = "binary:logistic"
-       elif classification and num_classes > 2:
-           params["objective"] = "multi:softprob"
-           params["num_class"] = num_classes
-       else:
-           params["objective"] = "reg:squarederror"
+       if classification:
+           num_classes = int(dataset.select(countDistinct(alias.label)).collect()[0][0])
+           if num_classes <= 2:
+               params["objective"] = "binary:logistic"
+           else:
+               params["objective"] = "multi:softprob"
+               params["num_class"] = num_classes
+       else:
+           # use user specified objective or default objective.
+           # e.g., the default objective for Regressor is 'reg:squarederror'
+           params["objective"] = self.getOrDefault(self.objective)

        # TODO: support "num_parallel_tree" for random forest
        params["num_boost_round"] = self.getOrDefault(self.n_estimators)
@@ -518,6 +533,11 @@ def _get_xgb_train_call_args(cls, train_params):
                kwargs_params[key] = value
            else:
                booster_params[key] = value

        booster_params = {
            k: v
            for k, v in booster_params.items() if k not in _non_booster_params
        }
        return booster_params, kwargs_params

    def _fit(self, dataset):
12 changes: 12 additions & 0 deletions python-package/xgboost/spark/estimator.py
@@ -203,6 +203,11 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction

    def __init__(self, **kwargs):
        super().__init__()
        # The default 'objective' param value comes from the sklearn `XGBClassifier` ctor,
        # but in pyspark we automatically set the objective param depending on whether
        # the input dataset is binary or multinomial, so we need to remove the fixed
        # default param value to avoid ambiguity.
        self._setDefault(objective=None)
        self.setParams(**kwargs)

    @classmethod
@@ -213,5 +218,12 @@ def _xgb_cls(cls):
    def _pyspark_model_cls(cls):
        return SparkXGBClassifierModel

    def _validate_params(self):
        super(SparkXGBClassifier, self)._validate_params()
        if self.getOrDefault(self.objective):
            raise ValueError(
                "Setting custom 'objective' param is not allowed in 'SparkXGBClassifier'."
            )


_set_pyspark_xgb_cls_param_attrs(SparkXGBClassifier, SparkXGBClassifierModel)
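A usage sketch of the classifier/regressor asymmetry introduced above (the ValueError is raised when _validate_params() runs, typically during fit()):

from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor

SparkXGBRegressor(objective="reg:squaredlogerror")  # allowed: the regressor keeps a user-set objective
SparkXGBClassifier(objective="multi:softprob")      # rejected once params are validated; the classifier
                                                    # derives its objective from the label column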
2 changes: 2 additions & 0 deletions tests/python/test_spark/test_spark_local.py
@@ -390,6 +390,7 @@ def test_regressor_params_basic(self):
        self.assertEqual(py_reg.n_estimators.parent, py_reg.uid)
        self.assertFalse(hasattr(py_reg, "gpu_id"))
        self.assertEqual(py_reg.getOrDefault(py_reg.n_estimators), 100)
        self.assertEqual(py_reg.getOrDefault(py_reg.objective), "reg:squarederror")
        py_reg2 = SparkXGBRegressor(n_estimators=200)
        self.assertEqual(py_reg2.getOrDefault(py_reg2.n_estimators), 200)
        py_reg3 = py_reg2.copy({py_reg2.max_depth: 10})
@@ -402,6 +403,7 @@ def test_classifier_params_basic(self):
        self.assertEqual(py_cls.n_estimators.parent, py_cls.uid)
        self.assertFalse(hasattr(py_cls, "gpu_id"))
        self.assertEqual(py_cls.getOrDefault(py_cls.n_estimators), 100)
        self.assertEqual(py_cls.getOrDefault(py_cls.objective), None)
        py_cls2 = SparkXGBClassifier(n_estimators=200)
        self.assertEqual(py_cls2.getOrDefault(py_cls2.n_estimators), 200)
        py_cls3 = py_cls2.copy({py_cls2.max_depth: 10})