Skip to content

Commit

Permalink
Handle missing values in dataframe with category dtype. (#7331)
Browse files: browse the repository at this point in the history.
* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.
  • Loading branch information
trivialfis committed Oct 27, 2021
1 parent 2eee874 commit ac9bfaa
Show file tree
Hide file tree
Showing 13 changed files with 301 additions and 103 deletions.
30 changes: 19 additions & 11 deletions python-package/xgboost/core.py
Expand Up @@ -373,7 +373,7 @@ def _reraise(self) -> None:
raise exc # pylint: disable=raising-bad-type

def __del__(self) -> None:
assert self._temporary_data is None, self._temporary_data
assert self._temporary_data is None
assert self._exception is None

def _reset_wrapper(self, this: None) -> None: # pylint: disable=unused-argument
Expand All @@ -397,19 +397,19 @@ def data_handle(
feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None,
**kwargs: Any,
):
) -> None:
from .data import dispatch_proxy_set_data
from .data import _proxy_transform

transformed, feature_names, feature_types = _proxy_transform(
new, cat_codes, feature_names, feature_types = _proxy_transform(
data,
feature_names,
feature_types,
self._enable_categorical,
)
# Stage the data; the meta info is copied inside C++ MetaInfo.
self._temporary_data = transformed
dispatch_proxy_set_data(self.proxy, transformed, self._allow_host)
self._temporary_data = (new, cat_codes)
dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
self.proxy.set_info(
feature_names=feature_names,
feature_types=feature_types,
Expand Down Expand Up @@ -1090,19 +1090,19 @@ def __init__(self): # pylint: disable=super-init-not-called
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))

def _set_data_from_cuda_interface(self, data):
def _set_data_from_cuda_interface(self, data) -> None:
"""Set data from CUDA array interface."""
interface = data.__cuda_array_interface__
interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
_check_call(
_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
)

def _set_data_from_cuda_columnar(self, data):
def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
"""Set data from CUDA columnar format."""
from .data import _cudf_array_interfaces

_, interfaces_str = _cudf_array_interfaces(data)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
_check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))

def _set_data_from_array(self, data: np.ndarray):
Expand Down Expand Up @@ -2009,13 +2009,18 @@ def inplace_predict(

from .data import _is_pandas_df, _transform_pandas_df
from .data import _array_interface
if _is_pandas_df(data):
if (
_is_pandas_df(data)
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
):
ft = self.feature_types
if ft is None:
enable_categorical = False
else:
enable_categorical = any(f == "c" for f in ft)
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, enable_categorical)

if isinstance(data, np.ndarray):
from .data import _ensure_np_dtype
data, _ = _ensure_np_dtype(data, data.dtype)
Expand Down Expand Up @@ -2068,8 +2073,11 @@ def inplace_predict(
)
return _prediction_output(shape, dims, preds, True)
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
from .data import _cudf_array_interfaces
_, interfaces_str = _cudf_array_interfaces(data)
from .data import _cudf_array_interfaces, _transform_cudf_df
data, cat_codes, _, _ = _transform_cudf_df(
data, None, None, enable_categorical
)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
_check_call(
_LIB.XGBoosterPredictFromCudaColumnar(
self.handle,
Expand Down

0 comments on commit ac9bfaa

Please sign in to comment.