Refactor pandas dataframe handling. (#7843)
trivialfis committed Apr 26, 2022
1 parent bef1f93 commit ad06172
Showing 1 changed file with 80 additions and 53 deletions.
133 changes: 80 additions & 53 deletions python-package/xgboost/data.py
@@ -6,7 +6,7 @@
import json
import warnings
import os
-from typing import Any, Tuple, Callable, Optional, List, Union, Iterator
+from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type

import numpy as np

@@ -21,8 +21,6 @@
CAT_T = "c"

# meta info that can be a matrix instead of vector.
-# For now it's base_margin for multi-class, but it can be extended to label once we have
-# multi-output.
_matrix_meta = {"base_margin", "label"}


@@ -253,41 +251,19 @@ def _invalid_dataframe_dtype(data: Any) -> None:
    raise ValueError(msg)


-# pylint: disable=too-many-locals
-def _transform_pandas_df(
+def _pandas_feature_info(
    data: DataFrame,
+    meta: Optional[str],
+    feature_names: FeatureNames,
+    feature_types: FeatureTypes,
    enable_categorical: bool,
-    feature_names: FeatureNames = None,
-    feature_types: Optional[List[str]] = None,
-    meta: Optional[str] = None,
-    meta_type: Optional[str] = None,
-) -> Tuple[np.ndarray, FeatureNames, Optional[List[str]]]:
+) -> Tuple[FeatureNames, FeatureTypes]:
    import pandas as pd
    from pandas.api.types import (
        is_sparse,
        is_categorical_dtype,
-        is_integer_dtype,
-        is_bool_dtype,
    )

-    nullable_alias = {"Int16", "Int32", "Int64"}

-    # dtype: pd.core.arrays.numeric.NumericDtype
-    def is_nullable_dtype(dtype: Any) -> bool:
-        is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
-        # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
-        is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
-        return is_int or is_bool

-    if not all(
-        dtype.name in _pandas_dtype_mapper
-        or is_sparse(dtype)
-        or is_nullable_dtype(dtype)
-        or (is_categorical_dtype(dtype) and enable_categorical)
-        for dtype in data.dtypes
-    ):
-        _invalid_dataframe_dtype(data)

    # handle feature names
    if feature_names is None and meta is None:
        if isinstance(data.columns, pd.MultiIndex):
@@ -300,43 +276,94 @@ def is_nullable_dtype(dtype: Any) -> bool:
    # handle feature types
    if feature_types is None and meta is None:
        feature_types = []
-        for i, dtype in enumerate(data.dtypes):
+        for dtype in data.dtypes:
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif is_categorical_dtype(dtype) and enable_categorical:
                feature_types.append(CAT_T)
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])
+    return feature_names, feature_types

-    # handle category codes.
-    transformed = pd.DataFrame()
-    # Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented
-    if (
-        enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes)
-    ) or any(is_nullable_dtype(dtype) for dtype in data.dtypes):
-        for i, dtype in enumerate(data.dtypes):
-            if is_categorical_dtype(dtype):
-                # pandas uses -1 as default missing value for categorical data
-                transformed[data.columns[i]] = (
-                    data[data.columns[i]]
-                    .cat.codes.astype(np.float32)
-                    .replace(-1.0, np.NaN)
-                )
-            elif is_nullable_dtype(dtype):
-                # Converts integer <NA> to float NaN
-                transformed[data.columns[i]] = data[data.columns[i]].astype(np.float32)
-            else:
-                transformed[data.columns[i]] = data[data.columns[i]]

+def is_nullable_dtype(dtype: Any) -> bool:
+    """Whether dtype is a pandas nullable type."""
+    from pandas.api.types import is_integer_dtype, is_bool_dtype
+    # dtype: pd.core.arrays.numeric.NumericDtype
+    nullable_alias = {"Int16", "Int32", "Int64"}
+    is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
+    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
+    is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
+    return is_int or is_bool


+def _pandas_cat_null(data: DataFrame) -> DataFrame:
+    from pandas.api.types import is_categorical_dtype
+    # handle category codes and nullable.
+    cat_columns = [
+        col
+        for col, dtype in zip(data.columns, data.dtypes)
+        if is_categorical_dtype(dtype)
+    ]
+    nul_columns = [
+        col for col, dtype in zip(data.columns, data.dtypes) if is_nullable_dtype(dtype)
+    ]
+    if cat_columns or nul_columns:
+        # Avoid transformation due to: PerformanceWarning: DataFrame is highly
+        # fragmented
+        transformed = data.copy()
+    else:
+        transformed = data

+    if cat_columns:
+        # DF doesn't have the cat attribute, so we use apply here
+        transformed[cat_columns] = (
+            transformed[cat_columns]
+            .apply(lambda x: x.cat.codes)
+            .astype(np.float32)
+            .replace(-1.0, np.NaN)
+        )
+    if nul_columns:
+        transformed[nul_columns] = transformed[nul_columns].astype(np.float32)

+    return transformed


+def _transform_pandas_df(
+    data: DataFrame,
+    enable_categorical: bool,
+    feature_names: FeatureNames = None,
+    feature_types: FeatureTypes = None,
+    meta: Optional[str] = None,
+    meta_type: Optional[str] = None,
+) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
+    from pandas.api.types import (
+        is_sparse,
+        is_categorical_dtype,
+    )

+    if not all(
+        dtype.name in _pandas_dtype_mapper
+        or is_sparse(dtype)
+        or is_nullable_dtype(dtype)
+        or (is_categorical_dtype(dtype) and enable_categorical)
+        for dtype in data.dtypes
+    ):
+        _invalid_dataframe_dtype(data)

+    feature_names, feature_types = _pandas_feature_info(
+        data, meta, feature_names, feature_types, enable_categorical
+    )

+    transformed = _pandas_cat_null(data)

    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

-    dtype = meta_type if meta_type else np.float32
-    arr = transformed.values
+    dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32
+    arr: np.ndarray = transformed.values
    if meta_type:
-        arr = arr.astype(meta_type)
+        arr = arr.astype(dtype)
    return arr, feature_names, feature_types
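
For readers skimming the diff, the net effect of the new _pandas_cat_null helper can be shown with a small, hypothetical sketch (plain pandas/NumPy, not part of the commit; the column names and values are made up): categorical columns become their float32 category codes with the -1 missing sentinel mapped to NaN, and nullable integer/boolean columns are cast to float32 so that <NA> becomes NaN.

    import numpy as np
    import pandas as pd

    # Hypothetical frame: one categorical column and one nullable Int64 column.
    df = pd.DataFrame(
        {
            "color": pd.Categorical(["red", None, "blue"]),
            "clicks": pd.array([1, None, 3], dtype="Int64"),
        }
    )

    # Categorical -> float32 category codes; pandas uses -1 for missing, so map it to NaN.
    color = df["color"].cat.codes.astype(np.float32).replace(-1.0, np.nan)

    # Nullable Int64 -> float32; pandas' <NA> becomes NaN in the cast.
    clicks = df["clicks"].astype(np.float32)

    print(color.tolist())   # [1.0, nan, 0.0]  (categories are sorted: "blue" -> 0, "red" -> 1)
    print(clicks.tolist())  # [1.0, nan, 3.0]

Selecting the affected columns up front and converting them in bulk on a single copy, rather than rebuilding a fresh DataFrame column by column as the removed code did, is what avoids pandas' "DataFrame is highly fragmented" PerformanceWarning noted in the comments.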


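The user-facing path that exercises this refactor is DMatrix construction from a pandas DataFrame. A hedged usage sketch follows (the frame and labels are made up; it assumes an XGBoost build of roughly this vintage, ~1.6, with the then-experimental categorical support):

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    X = pd.DataFrame(
        {
            "color": pd.Categorical(["red", "blue", "red", None]),
            "price": [1.0, 2.5, 3.0, 4.2],
        }
    )
    y = np.array([0.0, 1.0, 1.0, 0.0])

    # enable_categorical=True lets the categorical dtype pass the dtype check;
    # internally the frame is routed through _transform_pandas_df.
    dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)
    print(dtrain.feature_names)  # ['color', 'price']
    print(dtrain.feature_types)  # ['c', 'float'] -- 'c' is the CAT_T marker used above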
