[pyspark] Make Xgboost estimator support using sparse matrix as optimization #8145
@@ -44,7 +44,9 @@
    SparkXGBReader,
    SparkXGBWriter,
)
from .params import HasArbitraryParamsDict, HasBaseMarginCol, HasFeaturesCols
from .params import (
    HasArbitraryParamsDict, HasBaseMarginCol, HasFeaturesCols, HasEnableSparseDataOptim
)
from .utils import (
    RabitContext,
    _get_args_from_message_list,

@@ -124,6 +126,7 @@ class _SparkXGBParams(
    HasArbitraryParamsDict,
    HasBaseMarginCol,
    HasFeaturesCols,
    HasEnableSparseDataOptim,
):
    num_workers = Param(
        Params._dummy(),

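The HasEnableSparseDataOptim mixin added above is defined in .params and is not shown in this diff. A hypothetical sketch of its shape, following the usual pyspark Param mixin pattern (the doc string and default below are assumptions, not the PR's actual definition):

# Hypothetical sketch; the real HasEnableSparseDataOptim lives in .params.
from pyspark.ml.param import Param, Params, TypeConverters


class HasEnableSparseDataOptim(Params):
    enable_sparse_data_optim = Param(
        Params._dummy(),
        "enable_sparse_data_optim",
        "If True, the features column must contain pyspark.ml.linalg.Vector values "
        "and the data is fed to DMatrix as a scipy csr_matrix.",
        typeConverter=TypeConverters.toBoolean,
    )

    def __init__(self):
        super().__init__()
        # Assumed default: the sparse path is opt-in.
        self._setDefault(enable_sparse_data_optim=False)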
@@ -363,6 +366,23 @@ def _validate_and_convert_feature_col_as_array_col(dataset, features_col_name):
    return features_array_col


def _get_unwrap_udt_fn():
    try:
        from pyspark.sql.functions import unwrap_udt
        return unwrap_udt
    except ImportError:
        pass

    try:
        from pyspark.databricks.sql.functions import unwrap_udt
        return unwrap_udt
    except ImportError:
        raise RuntimeError(
            "Cannot import pyspark `unwrap_udt` function. Please install pyspark>=3.4 "
            "or run on Databricks Runtime."
        )


class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
    def __init__(self):
        super().__init__()

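For context, a minimal sketch (not part of the diff) of what unwrap_udt yields for a VectorUDT column, assuming pyspark >= 3.4. The struct field layout follows the description in the _fit changes below; the values shown in the trailing comments are indicative:

# Minimal sketch, assuming pyspark >= 3.4 where pyspark.sql.functions.unwrap_udt exists.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unwrap_udt
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [
        (Vectors.sparse(4, [1, 3], [5.0, 7.0]),),   # sparse vector
        (Vectors.dense([1.0, 0.0, 2.0, 0.0]),),     # dense vector
    ],
    ["features"],
)
unwrapped = unwrap_udt(col("features"))
df.select(
    unwrapped.type.alias("type"),
    unwrapped.size.alias("size"),
    unwrapped.indices.alias("indices"),
    unwrapped.values.alias("values"),
).show(truncate=False)
# Sparse row: type=0, size=4,    indices=[1, 3], values=[5.0, 7.0]
# Dense row:  type=1, size=null, indices=null,   values=[1.0, 0.0, 2.0, 0.0]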
@@ -527,17 +547,69 @@ def _fit(self, dataset):

        select_cols = [label_col]
        features_cols_names = None
        if self.getOrDefault(self.features_cols):
            features_cols_names = self.getOrDefault(self.features_cols)
            features_cols = _validate_and_convert_feature_col_as_float_col_list(
                dataset, features_cols_names
            )
            select_cols.extend(features_cols)
        enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)
        if enable_sparse_data_optim:
            from pyspark.ml.linalg import VectorUDT

            if self.getOrDefault(self.missing) != 0.0:

Review comment: could we put this checking into the _validate_params?
Review comment: @trivialfis
Review comment: I think we need the restriction that
Review comment: Just to be sure that we are on the same page. For the

                # If a DMatrix is constructed from a csr / csc matrix, the inactive
                # elements of the matrix are treated as missing values. In pyspark it
                # is hard to control which elements of a sparse vector column are
                # active or inactive: some spark transformers such as VectorAssembler
                # may automatically compress vectors to dense or sparse format, and
                # when a spark ML vector is compressed to a sparse vector, all
                # zero-valued elements become inactive.
                # So we force the missing param to be 0 when the enable_sparse_data_optim
                # config is True.
                raise ValueError(
                    "If enable_sparse_data_optim is True, missing param != 0 is not supported."
                )

            if self.getOrDefault(self.features_cols):

Review comment: @wbo4958 Would you like to take a look into this?

                raise ValueError(
                    "If enable_sparse_data_optim is True, you cannot set multiple feature columns "
                    "but you should set one feature column with values of "
                    "`pyspark.ml.linalg.Vector` type."
                )
            features_col_name = self.getOrDefault(self.featuresCol)
            features_col_datatype = dataset.schema[features_col_name].dataType
            if not isinstance(features_col_datatype, VectorUDT):
                raise ValueError(
                    "If enable_sparse_data_optim is True, the feature column values must be "
                    "`pyspark.ml.linalg.Vector` type."
                )

            unwrap_udt = _get_unwrap_udt_fn()
            features_unwrapped_vec_col = unwrap_udt(col(features_col_name))

            # After a `pyspark.ml.linalg.VectorUDT` column is unwrapped, it becomes

Review comment: Thank you for the detailed comments!

            # a pyspark struct type column whose fields are:
            # - `type`: byte
            # - `size`: int
            # - `indices`: array<int>
            # - `values`: array<double>
            # For a sparse vector, the `type` field is 0, the `size` field is the vector
            # length, the `indices` field is the array of active element indices, and
            # the `values` field is the array of active element values.
            # For a dense vector, the `type` field is 1, the `size` and `indices` fields
            # are None, and the `values` field is the array of all vector element values.
            select_cols.extend([
                features_unwrapped_vec_col.type.alias("featureVectorType"),
                features_unwrapped_vec_col.size.alias("featureVectorSize"),
                features_unwrapped_vec_col.indices.alias("featureVectorIndices"),
                features_unwrapped_vec_col.values.alias("featureVectorValues"),
            ])
        else:
            features_array_col = _validate_and_convert_feature_col_as_array_col(
                dataset, self.getOrDefault(self.featuresCol)
            )
            select_cols.append(features_array_col)
            if self.getOrDefault(self.features_cols):
                features_cols_names = self.getOrDefault(self.features_cols)
                features_cols = _validate_and_convert_feature_col_as_float_col_list(
                    dataset, features_cols_names
                )
                select_cols.extend(features_cols)
            else:
                features_array_col = _validate_and_convert_feature_col_as_array_col(
                    dataset, self.getOrDefault(self.featuresCol)
                )
                select_cols.append(features_array_col)

        if self.isDefined(self.weightCol) and self.getOrDefault(self.weightCol):
            select_cols.append(

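As background for the missing-value check above, a minimal standalone sketch (not part of the diff) of the combination the sparse path relies on: building a DMatrix from a scipy csr_matrix with missing set to 0.0, so that inactive entries and explicit zeros carry the same meaning:

import numpy as np
from scipy.sparse import csr_matrix
import xgboost as xgb

# One row with active entries at columns 1 and 3; columns 0 and 2 are inactive.
X = csr_matrix(
    (np.array([5.0, 7.0]), np.array([1, 3]), np.array([0, 2])),
    shape=(1, 4),
)
y = np.array([1.0])

# missing=0.0 matches the restriction enforced in _fit: spark ML sparse vectors
# drop zero-valued elements, so "zero" and "missing" must coincide.
dtrain = xgb.DMatrix(X, label=y, missing=0.0)
print(dtrain.num_row(), dtrain.num_col())  # -> 1 4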
@@ -589,7 +661,7 @@ def _fit(self, dataset):
            "feature_types": self.getOrDefault(self.feature_types),
            "feature_names": self.getOrDefault(self.feature_names),
            "feature_weights": self.getOrDefault(self.feature_weights),
            "missing": self.getOrDefault(self.missing),
            "missing": float(self.getOrDefault(self.missing)),
        }
        booster_params["nthread"] = cpu_per_task
        use_gpu = self.getOrDefault(self.use_gpu)

@@ -627,7 +699,8 @@ def _train_booster(pandas_df_iter):
            evals_result = {}
            with RabitContext(_rabit_args, context):
                dtrain, dvalid = create_dmatrix_from_partitions(
                    pandas_df_iter, features_cols_names, gpu_id, dmatrix_kwargs
                    pandas_df_iter, features_cols_names, gpu_id, dmatrix_kwargs,
                    enable_sparse_data_optim=enable_sparse_data_optim,
                )
                if dvalid is not None:
                    dval = [(dtrain, "training"), (dvalid, "validation")]

@@ -106,6 +106,7 @@ def create_dmatrix_from_partitions(
    feature_cols: Optional[Sequence[str]],
    gpu_id: Optional[int],
    kwargs: Dict[str, Any],  # use dict to make sure this parameter is passed.
    enable_sparse_data_optim: bool,
) -> Tuple[DMatrix, Optional[DMatrix]]:
    """Create DMatrix from spark data partitions. This is not particularly efficient as
    we need to convert the pandas series format to numpy then concatenate all the data.

@@ -139,6 +140,76 @@ def append_m(part: pd.DataFrame, name: str, is_valid: bool) -> None:
            else:
                train_data[name].append(array)

    def append_m_sparse(part: pd.DataFrame, name: str, is_valid: bool) -> None:
        from scipy.sparse import csr_matrix
        nonlocal n_features

        if name == alias.data or name in part.columns:
            if name == alias.data:
                # variables for constructing csr_matrix
                csr_indices_list, csr_indptr_list, csr_values_list = [], [0], []

                for vec_type, vec_size_, vec_indices, vec_values in zip(
                    part.featureVectorType,
                    part.featureVectorSize,
                    part.featureVectorIndices,
                    part.featureVectorValues
                ):
                    if vec_type == 0:
                        # sparse vector
                        vec_size = int(vec_size_)
                        csr_indices = vec_indices
                        csr_values = vec_values
                    else:
                        # dense vector
                        # Note: according to the spark ML VectorUDT format, when the
                        # type field is 1, the size field is also empty, so we take
                        # the length of the values field as the vector length.
                        vec_size = len(vec_values)
                        csr_indices = np.arange(vec_size, dtype=np.int32)
                        csr_values = vec_values

                    if n_features == 0:
                        n_features = vec_size
                    assert n_features == vec_size

                    # remove zero elements from csr_indices / csr_values

Review comment: Is this necessary? When
Review comment: I remember in the DMatrix ctor, if the data argument is a csc/csr matrix, then it ignores the "missing" argument and regards all inactive elements in the sparse matrix as missing values. (Ref: #341 (comment)) If so, then keeping zero elements or removing them represents 2 different semantics. Is my understanding correct?
Review comment: The ctor for CSR matrix should be able to handle missing values (but not for the CSC, which would raise a warning). xgboost/python-package/xgboost/data.py Line 35 in 20d1bba
Review comment: which is a good reminder that I should clear the difference.
Review comment: @trivialfis
Review comment: I will update the CSC implementation instead.
Review comment: @trivialfis what does "updating CSC implementation" mean? So for the current implementation, does it indeed remove the whole instance if one element in the instance is a missing value?

                    n_actives = len(csr_indices)
                    nz_csr_indices = np.empty(n_actives, dtype=np.int32)
                    nz_csr_values = np.empty(n_actives, dtype=np.float64)

                    active_i = 0
                    nz_i = 0
                    while active_i < n_actives:
                        if csr_values[active_i] != 0.0:
                            nz_csr_indices[nz_i] = csr_indices[active_i]
                            nz_csr_values[nz_i] = csr_values[active_i]
                            nz_i += 1
                        active_i += 1

                    nz_csr_indices = nz_csr_indices[:nz_i]
                    nz_csr_values = nz_csr_values[:nz_i]

                    csr_indices_list.append(nz_csr_indices)
                    csr_indptr_list.append(csr_indptr_list[-1] + len(nz_csr_indices))
                    csr_values_list.append(nz_csr_values)

                csr_indptr_arr = np.array(csr_indptr_list)
                csr_indices_arr = np.concatenate(csr_indices_list)
                csr_values_arr = np.concatenate(csr_values_list)

                array = csr_matrix(
                    (csr_values_arr, csr_indices_arr, csr_indptr_arr),
                    shape=(len(part), n_features)
                )
            else:
                array = part[name]

            if is_valid:
                valid_data[name].append(array)
            else:
                train_data[name].append(array)

    def append_dqm(part: pd.DataFrame, name: str, is_valid: bool) -> None:
        """Preprocessing for DeviceQuantileDMatrix"""
        nonlocal n_features

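For reference, a minimal standalone sketch (not part of the diff) of the csr assembly that append_m_sparse performs: per-row index/value arrays are filtered for explicit zeros and stitched together with a running indptr before being passed to the csr_matrix constructor. The sample rows and n_features below are made up for illustration:

import numpy as np
from scipy.sparse import csr_matrix

rows = [
    (np.array([1, 3], dtype=np.int32), np.array([5.0, 7.0])),           # sparse row
    (np.arange(4, dtype=np.int32), np.array([1.0, 0.0, 2.0, 0.0])),     # dense row
]
n_features = 4

indices_list, values_list, indptr = [], [], [0]
for idx, vals in rows:
    nz = vals != 0.0                      # drop explicit zeros, as the PR does
    indices_list.append(idx[nz])
    values_list.append(vals[nz])
    indptr.append(indptr[-1] + int(nz.sum()))

X = csr_matrix(
    (np.concatenate(values_list), np.concatenate(indices_list), np.array(indptr)),
    shape=(len(rows), n_features),
)
print(X.toarray())
# [[0. 5. 0. 7.]
#  [1. 0. 2. 0.]]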
@@ -164,13 +235,18 @@ def make(values: Dict[str, List[np.ndarray]], kwargs: Dict[str, Any]) -> DMatrix
        label = concat_or_none(values.get(alias.label, None))
        weight = concat_or_none(values.get(alias.weight, None))
        margin = concat_or_none(values.get(alias.margin, None))

        return DMatrix(
            data=data, label=label, weight=weight, base_margin=margin, **kwargs
        )

    is_dmatrix = feature_cols is None
    if is_dmatrix:
        cache_partitions(iterator, append_m)
        if enable_sparse_data_optim:
            append_fn = append_m_sparse
        else:
            append_fn = append_m
        cache_partitions(iterator, append_fn)
        dtrain = make(train_data, kwargs)
    else:
        cache_partitions(iterator, append_dqm)

Review comment: the _fit function is almost 200 lines, which is super huge; could we split this function?
Review comment: I am working on another Ranker estimator PR. We can do this refactor after these feature PRs are merged. Otherwise fixing conflicts is annoying.
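Finally, a hypothetical end-to-end usage sketch (not part of the PR): it assumes this PR's enable_sparse_data_optim parameter is accepted by SparkXGBClassifier, that pyspark >= 3.4 provides unwrap_udt, and the toy data is made up.

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from xgboost.spark import SparkXGBClassifier

spark = SparkSession.builder.getOrCreate()
raw_df = spark.createDataFrame(
    [(1.0, 0.0, 2.0, 1.0), (0.0, 3.0, 0.0, 0.0)],
    ["f0", "f1", "f2", "label"],
)

# VectorAssembler produces a single VectorUDT column; it may emit dense or
# sparse vectors, both of which the sparse code path accepts.
assembler = VectorAssembler(inputCols=["f0", "f1", "f2"], outputCol="features")
train_df = assembler.transform(raw_df)

clf = SparkXGBClassifier(
    features_col="features",
    label_col="label",
    enable_sparse_data_optim=True,  # new param introduced by this PR
    missing=0.0,                    # required when the sparse path is enabled
)
model = clf.fit(train_df)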